source: internals/2016/aptoideimagesdetector/trunk/Source Code/Linguage Extractor/Initial tests/nltk test2.py @ 16331

Last change on this file since 16331 was 16331, checked in by dferreira, 3 years ago

Initial tests updated

File size: 5.9 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Diogo Ferreira
4# Aptoide, 2016
5# Initial tests for nltk library
6
7from __future__ import division
8import sys
9import nltk
10import random
11from nltk.corpus import movie_reviews
12from nltk.classify.scikitlearn import SklearnClassifier
13from nltk.tokenize import word_tokenize
14import pickle
15from unidecode import unidecode
16
17from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
18from sklearn.linear_model import LogisticRegression, SGDClassifier
19from sklearn.svm import SVC, LinearSVC, NuSVC
20
21from nltk.classify import ClassifierI
22from statistics import mode
23
24
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed group of classifiers.

    Each wrapped object only needs a ``classify(features)`` method; the
    label returned by the largest number of classifiers wins.
    """

    def __init__(self, *classifiers):
        """Remember the classifiers whose votes will be combined."""
        self._classifiers = classifiers

    def _collect_votes(self, features):
        """Return one predicted label per wrapped classifier, in order.

        Raises:
            ValueError: if the ensemble was created without classifiers.
        """
        if not self._classifiers:
            raise ValueError("VoteClassifier needs at least one classifier")
        return [c.classify(features) for c in self._classifiers]

    @staticmethod
    def _winner(votes):
        """Return the most frequent label in *votes*.

        max() keeps the first maximal element, so a tie resolves to the
        label voted by the earliest classifier instead of raising, which
        statistics.mode does on ties before Python 3.8.
        """
        return max(votes, key=votes.count)

    def classify(self, features):
        """Return the label chosen by most of the wrapped classifiers."""
        return self._winner(self._collect_votes(features))

    def confidence(self, features):
        """Return the fraction (0.0-1.0) of classifiers backing the winner."""
        votes = self._collect_votes(features)
        # float() keeps this a true division even without
        # ``from __future__ import division`` on Python 2.
        return votes.count(self._winner(votes)) / float(len(votes))
45
def _read_reviews(path):
    """Return the whole review file at *path*, decoded from UTF-8."""
    # The context manager closes the handle even if reading/decoding fails.
    with open(path, "r") as handle:
        return handle.read().decode('utf8')


# Decoding once up front keeps the documents and the vocabulary in the same
# (unicode) string type, so non-ASCII words can actually match each other.
short_pos = _read_reviews("short_reviews/positive.txt")
short_neg = _read_reviews("short_reviews/negative.txt")

# One (review text, label) pair per line; blank lines (e.g. the one produced
# by a trailing newline) are not reviews and are skipped.
documents = []
for text, label in ((short_pos, "pos"), (short_neg, "neg")):
    for review in text.split('\n'):
        if review.strip():
            documents.append((review, label))

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

# Frequency of every lower-cased token over both review files.
all_words = nltk.FreqDist(
    w.lower() for w in short_pos_words + short_neg_words
)

# The 5000 most frequent tokens form the feature vocabulary.
# NOTE: slicing ``all_words.keys()`` would pick arbitrary words rather than
# the frequent ones. A classifier pickled against a vocabulary chosen that
# way should be retrained after this change.
word_features = [word for word, _count in all_words.most_common(5000)]
84
# Build the feature dictionary of one document
def find_features(document):
    """Map every word in ``word_features`` to whether *document* contains it.

    Args:
        document: text of a single review; it is tokenized with
            ``word_tokenize``.

    Returns:
        dict of ``{word: bool}`` with one entry per vocabulary word.
    """
    # A set makes each membership test O(1) instead of rescanning the token
    # list once per vocabulary word.
    words = set(word_tokenize(document))
    # NOTE(review): the vocabulary is built from lower-cased tokens while the
    # document tokens are matched as-is, so capitalised occurrences do not
    # count - confirm whether that is intended.
    return {w: (w in words) for w in word_features}
93
featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)

# The first 10000 shuffled documents train the models, the rest test them.
training_set = featuresets[:10000]
testing_set = featuresets[10000:]


def _report_accuracy(label, model):
    """Print *label* followed by the accuracy of *model* on testing_set (%)."""
    percent = nltk.classify.accuracy(model, testing_set) * 100
    # "%s %s" yields the same text as ``print label, percent`` and is valid
    # syntax on both Python 2 and Python 3.
    print("%s %s" % (label, percent))


def _train_and_report(label, estimator):
    """Wrap a scikit-learn *estimator*, train it on training_set, report it.

    Returns the trained SklearnClassifier so it can take part in the vote.
    """
    model = SklearnClassifier(estimator)
    model.train(training_set)
    _report_accuracy(label, model)
    return model


# Naive Bayes: posterior = prior occurrences * likelihood / evidence

# naivebayes.pickle is expected to hold an NLTK Naive Bayes classifier saved
# earlier with pickle.dump.
# NOTE: pickle.load can execute arbitrary code - only load a trusted file.
# NOTE(review): the pickled model was presumably trained on a different
# shuffle, so testing_set may contain documents it has already seen and the
# accuracy below may be optimistic - confirm.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

_report_accuracy("Original Naive Bayes Algo Accuracy percent: ", classifier)

classifier.show_most_informative_features(15)

MNB_classifier = _train_and_report(
    "MNB_classifier Naive Bayes Algo Accuracy percent: ", MultinomialNB())

# GaussianNB is imported but not part of this comparison.

# Label fixed: this result used to be printed under the MNB_classifier name.
BNB_classifier = _train_and_report(
    "BNB_classifier Naive Bayes Algo Accuracy percent: ", BernoulliNB())

LogisticRegression_classifier = _train_and_report(
    "LogisticRegression_classifier Naive Bayes Algo Accuracy percent: ",
    LogisticRegression())

SGDClassifier_classifier = _train_and_report(
    "SGDClassifier_classifier Naive Bayes Algo Accuracy percent: ",
    SGDClassifier())

SVCClassifier_classifier = _train_and_report(
    "SVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", SVC())

LinearSVCClassifier_classifier = _train_and_report(
    "LinearSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ",
    LinearSVC())

# Label fixed: this result used to be printed under the LinearSVC name.
NuSVCClassifier_classifier = _train_and_report(
    "NuSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ",
    NuSVC())

# SVCClassifier_classifier is trained above but does not take part in the
# vote; the seven voters below give an odd number of votes.
voted_classifier = VoteClassifier(
    classifier,
    MNB_classifier,
    BNB_classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
    LinearSVCClassifier_classifier,
    NuSVCClassifier_classifier,
)
_report_accuracy("Voted_classifier accuracy percent:", voted_classifier)

# To inspect a single prediction of the ensemble:
#   voted_classifier.classify(testing_set[i][0])
#   voted_classifier.confidence(testing_set[i][0]) * 100
Note: See TracBrowser for help on using the repository browser.