"""Part-of-speech tag classification on the Brown corpus.

Each word is encoded as a space-separated feature string (the word itself,
context words, WordNet dictionary hints, and character n-grams).  An Analyzer
selects which feature families are active, and an SGD logistic-regression
classifier predicts a coarse tag from kTAGSET.
"""

import argparse
import operator
import string
from collections import defaultdict

from numpy import array
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from nltk.corpus import brown
from nltk.corpus import wordnet as wn
from nltk.util import ngrams


def normalize_tags(tag):
    """Collapse a Brown tag to its first two characters; anything that does
    not start with an uppercase letter is treated as punctuation."""
    if not tag or tag[0] not in string.ascii_uppercase:
        return "PUNC"
    else:
        return tag[:2]


# Coarse tags we try to predict (index 0 is an unused placeholder).
kTAGSET = ["", "JJ", "NN", "PP", "RB", "VB"]


class Analyzer:
    """Callable analyzer for HashingVectorizer: keeps only the feature
    families enabled on the command line."""

    def __init__(self, word, before, after, prev, next_word, char, dictionary):
        self.word = word
        self.after = after
        self.before = before
        self.prev = prev
        self.next_word = next_word
        self.dictionary = dictionary
        self.char = char

    def __call__(self, feature_string):
        feats = feature_string.split()
        if self.word:                       # the target word itself
            yield feats[0]
        if self.after:                      # A: all words after the target
            for ii in [x for x in feats if x.startswith("A:")]:
                yield ii
        if self.before:                     # B: all words before the target
            for ii in [x for x in feats if x.startswith("B:")]:
                yield ii
        if self.prev:                       # P: the immediately preceding word
            for ii in [x for x in feats if x.startswith("P:")]:
                yield ii
        if self.next_word:                  # N: the immediately following word
            for ii in [x for x in feats if x.startswith("N:")]:
                yield ii
        if self.dictionary:                 # D: WordNet part-of-speech hints
            for ii in [x for x in feats if x.startswith("D:")]:
                yield ii
        if self.char:                       # C: character n-grams
            for ii in [x for x in feats if x.startswith("C:")]:
                yield ii


def example(sentence, position):
    """Build the feature string and target index for the word at `position`
    in a tagged sentence; the target is None for tags outside kTAGSET."""
    word = sentence[position][0]
    ex = word
    tag = normalize_tags(sentence[position][1])
    if tag in kTAGSET:
        target = kTAGSET.index(tag)
    else:
        target = None

    if position > 0:
        prev = " P:%s" % sentence[position - 1][0]
    else:
        prev = ""

    if position < len(sentence) - 1:
        next_word = " N:%s" % sentence[position + 1][0]
    else:
        next_word = ""

    all_before = " " + " ".join("B:%s" % x[0] for x in sentence[:position])
    all_after = " " + " ".join("A:%s" % x[0] for x in sentence[(position + 1):])

    # One dictionary feature per WordNet synset, so a word's feature count
    # reflects how strongly WordNet associates it with each part of speech.
    dictionary = ["D:ADJ"] * len(wn.synsets(word, wn.ADJ)) + \
        ["D:ADV"] * len(wn.synsets(word, wn.ADV)) + \
        ["D:VERB"] * len(wn.synsets(word, wn.VERB)) + \
        ["D:NOUN"] * len(wn.synsets(word, wn.NOUN))
    dictionary = " " + " ".join(dictionary)

    # Character 2- to 4-grams of the word, padded with ~ and ^ so that
    # prefixes and suffixes are distinguishable from word-internal n-grams.
    char = " "
    padded_word = "~%s^" % sentence[position][0]
    for ngram_length in range(2, 5):
        char += " " + " ".join("C:%s" % "".join(x)
                               for x in ngrams(padded_word, ngram_length))

    ex += char
    ex += prev
    ex += next_word
    ex += all_after
    ex += all_before
    ex += dictionary

    return ex, target


def all_examples(limit, train=True):
    """Yield (feature string, target) pairs from the Brown corpus; every
    fifth sentence is held out as test data."""
    sent_num = 0
    for ii in brown.tagged_sents():
        sent_num += 1
        if limit > 0 and sent_num > limit:
            break
        for jj in range(len(ii)):
            ex, tgt = example(ii, jj)
            if tgt is not None:
                if train and sent_num % 5 != 0:
                    yield ex, tgt
                if not train and sent_num % 5 == 0:
                    yield ex, tgt


def accuracy(classifier, x, y, examples):
    """Print accuracy, a confusion matrix, and the ten most frequent
    (word, predicted tag) errors."""
    predictions = classifier.predict(x)
    # Fix the label order so the matrix rows line up with the printed header
    # even when a class is missing from the data.
    cm = confusion_matrix(y, predictions, labels=list(range(1, len(kTAGSET))))

    print("Accuracy: %f" % accuracy_score(y, predictions))

    print("\t".join(kTAGSET[1:]))
    for ii in cm:
        print("\t".join(str(x) for x in ii))

    errors = defaultdict(int)
    for ii, ex_tuple in enumerate(examples):
        ex, tgt = ex_tuple
        if tgt != predictions[ii]:
            errors[(ex.split()[0], kTAGSET[predictions[ii]])] += 1

    for ww, cc in sorted(errors.items(), key=operator.itemgetter(1),
                         reverse=True)[:10]:
        print("%s\t%i" % (ww, cc))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--word', default=False, action='store_true',
                        help="Use word features")
    parser.add_argument('--all_before', default=False, action='store_true',
                        help="Use all words before the target as features")
    parser.add_argument('--all_after', default=False, action='store_true',
                        help="Use all words after the target as features")
    parser.add_argument('--one_before', default=False, action='store_true',
                        help="Use the word before the target as a feature")
    parser.add_argument('--one_after', default=False, action='store_true',
                        help="Use the word after the target as a feature")
    parser.add_argument('--characters', default=False, action='store_true',
                        help="Use character features")
    parser.add_argument('--dictionary', default=False, action='store_true',
                        help="Use dictionary features")
    parser.add_argument('--limit', default=-1, type=int,
                        help="How many sentences to use (-1 for all)")

    flags = parser.parse_args()

    analyzer = Analyzer(flags.word, flags.all_before, flags.all_after,
                        flags.one_before, flags.one_after, flags.characters,
                        flags.dictionary)

    vectorizer = HashingVectorizer(analyzer=analyzer)

    x_train = vectorizer.fit_transform(ex for ex, tgt in
                                       all_examples(flags.limit))
    # HashingVectorizer is stateless, so transform is all the test set needs.
    x_test = vectorizer.transform(ex for ex, tgt in
                                  all_examples(flags.limit, train=False))

    # Sanity check: show the active features for the first sentence.
    for ex, tgt in all_examples(1):
        print(" ".join(analyzer(ex)))

    y_train = array(list(tgt for ex, tgt in all_examples(flags.limit)))
    y_test = array(list(tgt for ex, tgt in
                        all_examples(flags.limit, train=False)))

    # Logistic regression trained with SGD (loss='log' in scikit-learn < 1.1).
    lr = SGDClassifier(loss='log_loss', penalty='l2', shuffle=True)
    lr.fit(x_train, y_train)

    print("TRAIN\n-------------------------")
    accuracy(lr, x_train, y_train, all_examples(flags.limit))
    print("TEST\n-------------------------")
    accuracy(lr, x_test, y_test, all_examples(flags.limit, train=False))