Natural Language Processing with NLTK – Part 2

Text Classification

In [26]:
import nltk
import random
from nltk.corpus import movie_reviews
In [27]:
# Build (word-list, label) pairs: one entry per review file, labelled with
# its category ('pos' or 'neg') from the movie_reviews corpus.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

Shuffling the documents, since the corpus lists them grouped by category (all negative reviews first, then all positive)

In [28]:
random.shuffle(documents)

Lowercasing all words and converting the list to a frequency distribution

In [29]:
all_words = []
In [30]:
# Normalise case: append every corpus token to all_words in lowercase.
all_words.extend(token.lower() for token in movie_reviews.words())
In [31]:
all_words = nltk.FreqDist(all_words)

Top 15 common words

In [32]:
print(all_words.most_common(15))
[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]

How many times does the word ‘stupid’ pop up?

In [33]:
print(all_words['stupid'])
253

Words as Features for learning

Top 3000 words

In [34]:
word_features = list(all_words.keys())[:3000]
In [35]:
def find_features(document):
    """Return a {word: bool} feature dict for one review.

    Each of the 3000 words in `word_features` maps to True if it occurs
    anywhere in `document` (an iterable of tokens), else False.
    """
    document_words = set(document)  # set membership is O(1) per lookup
    return {feature_word: (feature_word in document_words)
            for feature_word in word_features}
In [36]:
#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
In [37]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

Naive Bayes

ML Classification to classify whether a movie review is positive/negative

posterior = (prior × likelihood) / evidence

In [38]:
# 1900 train / 100 test out of the 2000 shuffled reviews; named constant
# instead of a repeated magic number so the split is tuned in one place.
TRAIN_SIZE = 1900
train = featuresets[:TRAIN_SIZE]
test = featuresets[TRAIN_SIZE:]
In [39]:
classifier = nltk.NaiveBayesClassifier.train(train)

Accuracy of the classifier

In [40]:
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test))*100)
Naive Bayes Algo accuracy percent: 84.0

Top 15 most informative features

In [41]:
classifier.show_most_informative_features(15)
Most Informative Features
              schumacher = True              neg : pos    =     11.8 : 1.0
                   sucks = True              neg : pos    =     10.7 : 1.0
                  annual = True              pos : neg    =      9.6 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                  welles = True              neg : pos    =      7.7 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                 idiotic = True              neg : pos    =      7.1 : 1.0
                    olds = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
               atrocious = True              neg : pos    =      6.7 : 1.0
                  regard = True              pos : neg    =      6.5 : 1.0
                 kidding = True              neg : pos    =      6.4 : 1.0

Save Classifier with Pickle

How to save your trained algorithm

In [42]:
import pickle

‘wb’ stands for write binary

In [43]:
# 'wb' = write binary; pickle.dump ("dump" = save) serialises the trained
# classifier to disk. A `with` block guarantees the file handle is closed
# even if dump raises, instead of a bare open()/close() pair.
with open('naivebayes.pickle', 'wb') as save_classifier:
    pickle.dump(classifier, save_classifier)

‘rb’ stands for read binary

In [46]:
# 'rb' = read binary; reload the pickled classifier. The `with` block closes
# the handle automatically even on error.
# SECURITY NOTE: pickle.load can execute arbitrary code — only ever load
# pickle files you created yourself or fully trust.
with open('naivebayes.pickle', 'rb') as classifier_f:
    classifier_2 = pickle.load(classifier_f)
In [49]:
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier_2, test))*100)
Naive Bayes Algo accuracy percent: 84.0
In [50]:
classifier_2.show_most_informative_features(15)
Most Informative Features
              schumacher = True              neg : pos    =     11.8 : 1.0
                   sucks = True              neg : pos    =     10.7 : 1.0
                  annual = True              pos : neg    =      9.6 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                  welles = True              neg : pos    =      7.7 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                 idiotic = True              neg : pos    =      7.1 : 1.0
                    olds = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
               atrocious = True              neg : pos    =      6.7 : 1.0
                  regard = True              pos : neg    =      6.5 : 1.0
                 kidding = True              neg : pos    =      6.4 : 1.0

Scikit-Learn incorporation

In [62]:
from nltk.classify.scikitlearn import SklearnClassifier
In [63]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
In [64]:
# SklearnClassifier adapts a scikit-learn estimator to NLTK's classifier
# interface, so it trains on the same {word: bool} feature dicts.
MNB_classifier = SklearnClassifier(MultinomialNB())
In [65]:
MNB_classifier.train(train)
Out[65]:
<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>
In [66]:
print("MNB Algo accuracy percent:", (nltk.classify.accuracy(MNB_classifier, test))*100)
MNB Algo accuracy percent: 85.0
In [67]:
# BernoulliNB models binary (present/absent) features, which matches the
# True/False dicts produced by find_features.
Bernoulli_classifier = SklearnClassifier(BernoulliNB())
Bernoulli_classifier.train(train)
print("Bernoulli Algo accuracy percent:", (nltk.classify.accuracy(Bernoulli_classifier, test))*100)
Bernoulli Algo accuracy percent: 84.0

Other ML algos to try!

In [68]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
In [69]:
def _train_and_report(name, sk_model):
    """Wrap `sk_model` for NLTK, train it on `train`, print its held-out
    accuracy on `test`, and return the trained wrapper.

    Replaces five copy-pasted wrap/train/print triples with one helper;
    the printed strings are identical to the originals.
    """
    clf = SklearnClassifier(sk_model)
    clf.train(train)
    print(name + " Algo accuracy percent:", (nltk.classify.accuracy(clf, test))*100)
    return clf

# Same five classifiers (and the same variable names, reused by the voting
# ensemble below), now built through the shared helper.
Logistic_classifier = _train_and_report("Logistic", LogisticRegression())
SGD_classifier = _train_and_report("SGD", SGDClassifier())
SVC_classifier = _train_and_report("SVC", SVC())
LinearSVC_classifier = _train_and_report("Linear SVC", LinearSVC())
NuSVC_classifier = _train_and_report("NuSVC", NuSVC())
Logistic Algo accuracy percent: 80.0
/Users/ryanong/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.
"and default tol will be 1e-3." % type(self), FutureWarning)
SGD Algo accuracy percent: 77.0
SVC Algo accuracy percent: 81.0
Linear SVC Algo accuracy percent: 79.0
NuSVC Algo accuracy percent: 83.0

Combining Algos with a Vote

In [71]:
from nltk.classify import ClassifierI
from statistics import mode
In [72]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several already-trained NLTK classifiers.

    (Indentation restored: the exported source had lost all leading
    whitespace, which is a syntax error in Python.)
    """

    def __init__(self, *classifiers):
        # Trained classifiers, each exposing .classify(features).
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label chosen by the most classifiers.

        NOTE(review): statistics.mode raises StatisticsError on a tie in
        Python < 3.8 — use an odd number of classifiers, as this notebook does.
        """
        votes = [c.classify(features) for c in self._classifiers]
        return mode(votes)

    def confidence(self, features):
        """Return the winning label's vote share as a fraction in (0, 1]."""
        votes = [c.classify(features) for c in self._classifiers]
        choice_votes = votes.count(mode(votes))
        return choice_votes / len(votes)
In [73]:
# Ensemble of 7 classifiers (SVC_classifier is excluded; an odd count
# avoids 50/50 ties in the majority vote).
voted_classifier = VoteClassifier(classifier,MNB_classifier,Bernoulli_classifier,Logistic_classifier,SGD_classifier,
LinearSVC_classifier,NuSVC_classifier)
In [76]:
print("Voted classifier Algo accuracy percent:", (nltk.classify.accuracy(voted_classifier, test))*100)
print("Classification:", voted_classifier.classify(test[0][0]), "Confidence %:", voted_classifier.confidence(test[0][0])*100)
Voted classifier Algo accuracy percent: 85.0
Classification: neg Confidence %: 100.0
In [77]:
print("Classification:", voted_classifier.classify(test[1][0]), "Confidence %:", voted_classifier.confidence(test[1][0])*100)
print("Classification:", voted_classifier.classify(test[2][0]), "Confidence %:", voted_classifier.confidence(test[2][0])*100)
print("Classification:", voted_classifier.classify(test[3][0]), "Confidence %:", voted_classifier.confidence(test[3][0])*100)
print("Classification:", voted_classifier.classify(test[4][0]), "Confidence %:", voted_classifier.confidence(test[4][0])*100)
print("Classification:", voted_classifier.classify(test[5][0]), "Confidence %:", voted_classifier.confidence(test[5][0])*100)
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571
Classification: pos Confidence %: 85.71428571428571
Classification: neg Confidence %: 85.71428571428571
Classification: pos Confidence %: 100.0

Sentiment Analysis module

In [78]:
def sentiment(text):
    """Classify `text` (an iterable of word tokens) with the voting ensemble.

    Returns a (label, confidence) tuple, where label is 'pos' or 'neg' and
    confidence is the winning vote share in (0, 1].
    (Indentation restored: the exported source had lost the function body's
    leading whitespace.)
    """
    feats = find_features(text)
    return voted_classifier.classify(feats), voted_classifier.confidence(feats)

You can save all of this code as your own sentiment-analysis module. That lets you import it and combine it with other APIs, such as Twitter's!

Leave a Reply

Your email address will not be published. Required fields are marked *