I've been using the Maxent classifier from NLTK in Python, and it's failing; I don't understand why.
I'm using the movie reviews corpus. (total noob)
import nltk.classify.util
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def word_feats(words):
    """Return a bag-of-words feature dict for *words*.

    Every distinct token in ``words`` becomes a key mapped to ``True``;
    duplicate tokens collapse into a single entry.
    """
    return {word: True for word in words}
# Corpus file ids for each sentiment label.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One (featureset, label) pair per review file.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Floor division keeps the cutoffs integral so they are valid slice indices;
# plain `/` produces a float on Python 3, which cannot be used to slice.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

# Train on the first three quarters of each class.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
classifier = MaxentClassifier.train(trainfeats)
This is the error. (I know I'm doing this wrong; please link to an explanation of how Maxent works.)
Warning (from warnings module): File "C:\Python27\lib\site-packages\nltk\classify\maxent.py", line 1334 sum1 = numpy.sum(exp_nf_delta * A, axis=0) RuntimeWarning: invalid value encountered in multiply
Warning (from warnings module): File "C:\Python27\lib\site-packages\nltk\classify\maxent.py", line 1335 sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0) RuntimeWarning: invalid value encountered in multiply
Warning (from warnings module): File "C:\Python27\lib\site-packages\nltk\classify\maxent.py", line 1341 deltas -= (ffreq_empirical - sum1) / -sum2 RuntimeWarning: invalid value encountered in divide
I changed and updated the code a bit.
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews
def word_feats(words):
    """Return a bag-of-words feature dict for *words*.

    Every distinct token in ``words`` becomes a key mapped to ``True``;
    duplicate tokens collapse into a single entry.
    """
    return {word: True for word in words}
# Corpus file ids for each sentiment label.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One (featureset, label) pair per review file.
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

# Floor division keeps the cutoffs integral so they are valid slice indices;
# plain `/` produces a float on Python 3, which cannot be used to slice.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

# Train on the first three quarters of each class.
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

# Pick the training algorithm explicitly (first entry of the classifier's
# ALGORITHMS list) and cap the number of iterations, instead of calling
# train() with its defaults as the earlier attempt did.
algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm, max_iter=3)
classifier.show_most_informative_features(10)
# Frequency of every token across the whole movie_reviews corpus.
all_words = nltk.FreqDist(movie_reviews.words())

# Keep the 300 most frequent tokens. Rank explicitly by count rather than
# slicing keys(): on Python 3 keys() is a view that cannot be indexed, and
# its order is not guaranteed to be by frequency.
top_words = set(sorted(all_words, key=all_words.get, reverse=True)[:300])
def word_feats(words):
    """Return a bag-of-words feature dict restricted to ``top_words``.

    Tokens from ``words`` that are members of the module-level ``top_words``
    set become keys mapped to ``True``; all other tokens are dropped.

    NOTE: this definition only affects feature sets built after it runs, so
    it has to be in place before ``negfeats``/``posfeats`` are computed.
    """
    return {word: True for word in words if word in top_words}