;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; ;;; Ling443 ;;; ;;; Program for lemmatizing nouns using suffix replacement rules. ;;; ;;; Author: Philip Resnik ;;; Date: September 16, 1998 ;;; ;;; Usage: ;;; ;;; (lemmatize-noun noun) ;;; ;;; where noun is a word represented as a list of one-character symbols, ;;; e.g. (c a t s). ;;; ;;; Examples: ;;; ;;; (lemmatize-noun '(c a t s)) => (c a t) ;;; (lemmatize-noun '(g l a s s e s)) => (g l a s s) ;;; (lemmatize-noun '(b r i l l i g)) => (b r i l l i g) ;;; ;;; Algorithm: ;;; ;;; The program uses a set of ten suffix-replacement rules. ;;; It tries each rule in turn. If replacing the old suffix ;;; with the new suffix results in a word that is in the noun ;;; dictionary, then that word is returned. If all the rules ;;; are tried and none results in finding a singular form in ;;; the dictionary, then the word itself is returned. Note, ;;; therefore, that irregular plurals are not handled. ;;; ;;; The suffix replacement rules are as follows: ;;; ;;; Old New ;;; suffix suffix Example ;;; ------------------------------------- ;;; ss ss (g l a s s) -> (g l a s s) ;;; s nil (b u c k s) -> (b u c k) ;;; ses s (g l a s s e s) -> (g l a s s) ;;; xes x (b o x e s) -> (b o x) ;;; zes z (q u i z z e s) -> (q u i z) ;;; ches ch (m a t c h e s) -> (m a t c h) ;;; shes sh (w i s h e s) -> (w i s h) ;;; ies y (b o d i e s) -> (b o d y) ;;; es e (v a s e s) -> (v a s e) ;;; es nil (t o m a t o e s) -> (t o m a t o) ;;; ;;; ;;; Example: ;;; ;;; Given input (b o x e s), the algorithm first tries the rule replacing ;;; suffix ss with ss. Since the word, represnted as a list, does not end ;;; with the suffix (s s), this rule fails. Next the algorithm tries the ;;; rule replacing s with NIL. In this case, it finds that the input does ;;; have (s) as a suffix, so it replaces the suffix (s) with the suffix ;;; NIL, producing (b o x e). However, since (b o x e) is not on the ;;; dictionary list, this rule also fails to produce a lemma, and the ;;; algorithm continues down the rule list. The third rule, attempting to ;;; replace ses with s, fails for the same reason as the first rule. When ;;; the fourth rule is applies, the suffix (x e s) is replaced with (x) to ;;; produce (b o x), and since (b o x) is found on the dictionary list, ;;; that value is returned by the function. ;;; ;;; Global variables: ;;; ;;; The program uses the global variable +noun-list+, which ;;; is defined in file noun-list.lisp. This is the dictionary ;;; containing known nouns represesented in list form, as above. ;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (defun onlistp (word dictlist) "Returns T if word is on dictionary list, NIL otherwise" (and (member word dictlist :test #'equal) t)) (defun defined-noun (word) "Returns the word itself if word is on global list +noun-list+, and returns NIL otherwise. For example, (defined-noun '(c a t)) => (c a t) (defined-noun '(f b r z)) => NIL " (if (onlistp word +noun-list+) word)) (defun suffix (sfxlen list) " Returns the last sfxlen elements of list. If sfxlen is greater than the length of the list, returns nil. E.g., (suffix 2 '(r a c e s)) => (e s) (suffix 0 '(r a c e s)) => NIL (suffix 3 '(i n)) => NIL This function makes use of the 'nthcdr' function, which returns the result of calling CDR on the list n times. For example, (nthcdr 2 '(a b c d e f)) => (c d e f) (nthcdr 4 '(a b c d e f)) => (e f) (nthcdr 8 '(a b c d e f)) => NIL " (if (<= sfxlen (length list)) (nthcdr (- (length list) sfxlen) list))) (defun prefix (pfxlen list) " Returns the first pfxlen elements of list. If pfxlen is greater than the length of the list, returns the entire list. E.g., (prefix 4 '(r a c e s)) => (r a c e) (prefix 0 '(r a c e s)) => NIL (prefix 3 '(t h e)) => (t h e) (prefix 3 '(i n)) => (i n) This function makes use of the 'reverse' function, which take a list and returns a list of the elements in reverse order. E.g. (reverse '(a b c d)) => (d c b a) " (if (> pfxlen (length list)) list (reverse (suffix pfxlen (reverse list))))) (defun has-suffix (list sfx) "Returns true if and only if sfx is a suffix of list, e.g. (has-suffix '(r a c e d) '(e d)) => T (has-suffix '(r a c e d) '(e s)) => NIL (has-suffix '(a t) '(i n g)) => NIL (has-suffix '(r a c e d) nil) => T " (equal (suffix (length sfx) list) sfx)) (defun concat (list1 list2) "Concatenates two lists. E.g. (concat '(a b c) '(d e)) => (a b c d e) (concat '(a b c) NIL) => (a b c) " (append list1 list2)) (defun replace-suffix (list old-sfx new-sfx) " Assuming that list has suffix old-sfx, this function returns a version of list in which old-sfx has been replaced with new-sfx. If list does NOT have old-sfx as a suffix, the function returns NIL. E.g., (replace-suffix '(r a c e s) '(e s) '(i n g)) => (r a c i n g) (replace-suffix '(r a c e s) '(e d) '(i n g)) => NIL (replace-suffix '(r a c i n g) '(e d) '(i n g)) => NIL (replace-suffix '(r a c i n g) '(i n g) '(e d)) => (r a c e d) (replace-suffix '(r a c i n g) '(i n g) nil) => (r a c) " (if (has-suffix list old-sfx) (concat (prefix (- (length list) (length old-sfx)) list) new-sfx))) (defun lemmatize-noun (noun) " Returns the singular form of a plural noun if a suffix-replacement rule can be applied that produces a noun known to be in the dictionary. If no such rule can be applied, e.g. if the noun is an irregular plural or the singular form is not in the dictionary, this function returns the noun itself. " (or ;; glass -> glass (defined-noun (replace-suffix noun '(s s) '(s s))) ;; bucks -> buck (defined-noun (replace-suffix noun '(s) nil)) ;; glasses -> glass (defined-noun (replace-suffix noun '(s e s) '(s))) ;; boxes -> box (defined-noun (replace-suffix noun '(x e s) '(x))) ;; quizzes -> quiz (defined-noun (replace-suffix noun '(z e s) '(z))) ;; matches -> match (defined-noun (replace-suffix noun '(c h e s) '(c h))) ;; wishes -> wish (defined-noun (replace-suffix noun '(s h e s) '(s h))) ;; bodies -> body (defined-noun (replace-suffix noun '(i e s) '(y))) ;; vases -> vase (defined-noun (replace-suffix noun '(e s) '(e))) ;; tomatoes -> tomato (defined-noun (replace-suffix noun '(e s) nil)) ;; Unknown; return the noun itself noun)) (defun lemmatize-noun-cond (noun) " [Alternative solution to lemmatize-noun, using COND. Also ok, though note the duplicate calls to replace-suffix.] Returns the singular form of a plural noun if a suffix-replacement rule can be applied that produces a noun known to be in the dictionary. If no such rule can be applied, e.g. if the noun is an irregular plural or the singular form is not in the dictionary, this function returns the noun itself. " (cond ;; glass -> glass ((defined-noun (replace-suffix noun '(s s) '(s s))) (replace-suffix noun '(s s) '(s s))) ;; bucks -> buck ((defined-noun (replace-suffix noun '(s) nil)) (replace-suffix noun '(s) nil)) ;; glasses -> glass ((defined-noun (replace-suffix noun '(s e s) '(s))) (replace-suffix noun '(s e s) '(s))) ;; boxes -> box ((defined-noun (replace-suffix noun '(x e s) '(x))) (replace-suffix noun '(x e s) '(x))) ;; quizzes -> quiz ((defined-noun (replace-suffix noun '(z e s) '(z))) (replace-suffix noun '(z e s) '(z))) ;; matches -> match ((defined-noun (replace-suffix noun '(c h e s) '(c h))) (replace-suffix noun '(c h e s) '(c h))) ;; wishes -> wish ((defined-noun (replace-suffix noun '(s h e s) '(s h))) (replace-suffix noun '(s h e s) '(s h))) ;; bodies -> body ((defined-noun (replace-suffix noun '(i e s) '(y))) (replace-suffix noun '(i e s) '(y))) ;; vases -> vase ((defined-noun (replace-suffix noun '(e s) '(e))) (replace-suffix noun '(e s) '(e))) ;; tomatoes -> tomato ((defined-noun (replace-suffix noun '(e s) nil)) (replace-suffix noun '(e s) nil)) ;; Unknown; return the noun itself (t noun))) (defun test () (mapcar #'print (list (lemmatize-noun '(c l a s s)) (lemmatize-noun '(c a t s)) (lemmatize-noun '(c l a s s e s)) (lemmatize-noun '(b o x e s)) (lemmatize-noun '(b u z z e s)) (lemmatize-noun '(c a t c h e s)) (lemmatize-noun '(c a t c h)) (lemmatize-noun '(a s h e s)) (lemmatize-noun '(b o d i e s)) (lemmatize-noun '(b a s e s)) (lemmatize-noun '(a l b i n o s)) (lemmatize-noun '(a l b i n o e s)))) t)