# NLTK walkthrough: exploring the book texts, the Gutenberg and Reuters
# corpora, and training a Naive Bayes document classifier on Reuters.
#
# The bare expressions below (e.g. ``len(text1)``) only display a value when
# the lines are entered in an interactive interpreter; run as a script they
# are evaluated and discarded.

import random

import nltk

# --------
# nltk.book is designed for star-import: it provides text1..text9, texts()
# and FreqDist.
from nltk.book import *

texts()
len(text1)
len(set(text1))
sorted(set(text1))
text1.collocations()
text1.concordance("love")
text1.similar("love")

f1 = FreqDist(text1)
# The 50 most frequent tokens. Sorting explicitly by count instead of slicing
# keys(): in NLTK 3 FreqDist is a Counter, so keys() is neither ordered by
# frequency nor sliceable on Python 3.
sorted(f1, key=f1.get, reverse=True)[0:50]

# ----------
from nltk.corpus import gutenberg

gutenberg.fileids()
gutenberg.sents('shakespeare-macbeth.txt')

# ----------
from nltk.corpus import reuters

reuters.fileids()
reuters.categories()
reuters.words('training/9971')
reuters.categories('training/9971')

# Word distribution conditioned on the Reuters category label.
cfd = nltk.ConditionalFreqDist(
    (label, word)
    for label in reuters.categories()
    for word in reuters.words(categories=label)
)
list(cfd['acq'])[0:100]
list(cfd['housing'])[0:100]

len(reuters.fileids())

# One (word list, label) pair per (document, category) combination; a document
# filed under several categories therefore appears once per category.
pairs = [
    (list(reuters.words(fileid)), label)
    for label in reuters.categories()
    for fileid in reuters.fileids(label)
]
random.shuffle(pairs)
pairs[0]
pairs[2]

# Feature vocabulary: the 1000 most frequent tokens of the whole corpus.
reuters_freq = FreqDist(reuters.words())
f2 = sorted(reuters_freq, key=reuters_freq.get, reverse=True)[0:1000]


def features(document):
    """Return the bag-of-words feature dict for *document*.

    *document* is an iterable of tokens. The result maps
    ``'contains(<word>)'`` to a boolean for every word in the module-level
    vocabulary ``f2``, telling whether that word occurs in the document.
    """
    # Build the set once so each membership test is O(1).
    words = set(document)
    return {'contains(%s)' % word: (word in words) for word in f2}


print(features(reuters.words('training/9971')))

# Named to avoid shadowing the builtin all().
labeled_featuresets = [(features(d), c) for (d, c) in pairs]
train_set = labeled_featuresets[0:1000]
test_set = labeled_featuresets[1000:1100]

classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(15)
print(nltk.classify.accuracy(classifier, test_set))