source: internals/2016/aptoideimagesdetector/trunk/Source Code/Linguage Extractor/Initial tests/nltk test2.py @ 16325

Last change on this file since 16325 was 16325, checked in by dferreira, 3 years ago

Initial tests updated.

File size: 5.2 KB
Line 
1
2from __future__ import division
3import nltk
4import random
5from nltk.corpus import movie_reviews
6from nltk.classify.scikitlearn import SklearnClassifier
7import pickle
8
9from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
10from sklearn.linear_model import LogisticRegression, SGDClassifier
11from sklearn.svm import SVC, LinearSVC, NuSVC
12
13from nltk.classify import ClassifierI
14from statistics import mode
15
class VoteClassifier(ClassifierI):
        """Majority-vote ensemble over already-trained classifiers.

        Every wrapped classifier must expose ``classify(features)``.  The
        ensemble's label is the one predicted by the most members; when two
        or more labels tie, the tied label predicted by the earliest
        classifier (in constructor order) wins.
        """

        def __init__(self, *classifiers):
                """Store the classifiers whose votes will be combined."""
                self._classifiers = classifiers

        def _winning_vote(self, features):
                """Return ``(label, votes_for_label, total_votes)`` for *features*.

                Raises:
                        ValueError: if the ensemble was built with no classifiers.
                """
                votes = [c.classify(features) for c in self._classifiers]
                if not votes:
                        raise ValueError("VoteClassifier has no classifiers to vote with")
                # max() returns the first maximal element, so ties resolve
                # deterministically instead of raising the way
                # statistics.mode does on multimodal data (Python < 3.8).
                winner = max(votes, key=votes.count)
                return winner, votes.count(winner), len(votes)

        def classify(self, features):
                """Return the label chosen by the most classifiers."""
                return self._winning_vote(features)[0]

        def confidence(self, features):
                """Return the fraction (0-1) of classifiers that voted for the winning label."""
                _, agreeing, total = self._winning_vote(features)
                # float() keeps this a true division even without
                # ``from __future__ import division``.
                return float(agreeing) / total
36
# One (list of tokens, category) pair per review file, taken category by
# category from the movie_reviews corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle so that the later train/test split does not follow corpus order.
random.shuffle(documents)

# Frequency distribution over every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
#print all_words.most_common(15)

# 3000 most common words.  most_common() is required here: the order of
# FreqDist.keys() is not frequency order, so slicing the keys would pick an
# arbitrary 3000-word vocabulary.
word_features = [w for (w, _count) in all_words.most_common(3000)]
57
58# Check if it finds features in words
def find_features(document, vocabulary=None):
        """Map each vocabulary word to whether it occurs in *document*.

        Args:
                document: iterable of tokens (hashable) making up one text.
                vocabulary: iterable of words to test for.  Defaults to the
                        module-level ``word_features`` when omitted, which is
                        what existing one-argument calls rely on.

        Returns:
                dict of ``{word: bool}`` with one entry per vocabulary word.
        """
        if vocabulary is None:
                vocabulary = word_features
        # Build the set once so each membership test is O(1).
        words = set(document)
        return {w: (w in words) for w in vocabulary}
66
67#print find_features(movie_reviews.words('neg/cv000_29416.txt'))
68
69featuresets = [(find_features(rev), category) for (rev, category) in documents]
70
71training_set = featuresets[:1900]
72testing_set = featuresets[1900:]
73
74# posterior = prior ocurrences * likelihood/evidence
75
76#classifier = nltk.NaiveBayesClassifier.train(training_set)
77
78classifier_f = open("naivebayes.pickle", "rb")
79classifier = pickle.load(classifier_f)
80classifier_f.close()
81
82print "Original Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
83
84classifier.show_most_informative_features(15)
85
86#save_classifier = open("naivebayes.pickle", "wb")
87#pickle.dump(classifier, save_classifier)
88#save_classifier.close()
89
90MNB_classifier = SklearnClassifier(MultinomialNB())
91MNB_classifier.train(training_set)
92print "MNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(MNB_classifier, testing_set))*100
93
94#GNB_classifier = SklearnClassifier(GaussianNB())
95#GNB_classifier.train(training_set)
96#print "GNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(GNB_classifier, testing_set))*100
97
98BNB_classifier = SklearnClassifier(BernoulliNB())
99BNB_classifier.train(training_set)
100print "MNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(BNB_classifier, testing_set))*100
101
102LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
103LogisticRegression_classifier.train(training_set)
104print "LogisticRegression_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100
105
106SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
107SGDClassifier_classifier.train(training_set)
108print "SGDClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100
109
110SVCClassifier_classifier = SklearnClassifier(SVC())
111SVCClassifier_classifier.train(training_set)
112print "SVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(SVCClassifier_classifier, testing_set))*100
113
114LinearSVCClassifier_classifier = SklearnClassifier(LinearSVC())
115LinearSVCClassifier_classifier.train(training_set)
116print "LinearSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(LinearSVCClassifier_classifier, testing_set))*100
117
118NuSVCClassifier_classifier = SklearnClassifier(NuSVC())
119NuSVCClassifier_classifier.train(training_set)
120print "LinearSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(NuSVCClassifier_classifier, testing_set))*100
121
122voted_classifier = VoteClassifier(classifier, 
123        MNB_classifier, 
124        BNB_classifier, 
125        LogisticRegression_classifier, 
126        SGDClassifier_classifier, 
127        LinearSVCClassifier_classifier, 
128        NuSVCClassifier_classifier)
129print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
130
131print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
132print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
133print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
134print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
135print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
136print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
Note: See TracBrowser for help on using the repository browser.