source: internals/2016/aptoideimagesdetector/trunk/Source Code/Linguage Extractor/Initial tests/nltk test2.py @ 16335

Last change on this file since 16335 was 16335, checked in by dferreira, 3 years ago

Changes to linguage extractor test

File size: 5.9 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# Diogo Ferreira
4# Aptoide, 2016
5# Initial tests for nltk library
6
7from __future__ import division
8import sys
9import nltk
10import random
11from nltk.corpus import movie_reviews
12from nltk.classify.scikitlearn import SklearnClassifier
13from nltk.tokenize import word_tokenize
14import pickle
15from unidecode import unidecode
16
17from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
18from sklearn.linear_model import LogisticRegression, SGDClassifier
19from sklearn.svm import SVC, LinearSVC, NuSVC
20
21from nltk.classify import ClassifierI
22from statistics import mode
23
24
class VoteClassifier(ClassifierI):
        """Ensemble classifier that labels by majority vote.

        Wraps any number of classifier objects; each one must expose a
        ``classify(features)`` method returning a hashable label.
        """

        def __init__(self, *classifiers):
                """Store the classifiers whose votes will be combined."""
                self._classifiers = classifiers

        def _tally(self, features):
                """Collect one vote per classifier and pick the winner.

                Votes are counted by hand instead of with ``statistics.mode``,
                which raises ``StatisticsError`` on tied votes in Python
                versions before 3.8. On a tie, the label that was voted first
                (in classifier order) wins.

                Returns:
                        tuple ``(winning_label, winning_count, total_votes)``.

                Raises:
                        ValueError: if the ensemble holds no classifiers.
                """
                votes = [c.classify(features) for c in self._classifiers]
                if not votes:
                        raise ValueError("VoteClassifier needs at least one classifier")

                counts = {}
                for v in votes:
                        counts[v] = counts.get(v, 0) + 1

                winner = votes[0]
                for v in votes:
                        # Strict '>' keeps the earliest-voted label when counts tie.
                        if counts[v] > counts[winner]:
                                winner = v
                return winner, counts[winner], len(votes)

        def classify(self, features):
                """Return the label chosen by the most classifiers."""
                winner, _, _ = self._tally(features)
                return winner

        def confidence(self, features):
                """Return the fraction of classifiers that voted for the winner."""
                _, winning_count, total = self._tally(features)
                # float() keeps true division even without `from __future__ import division`.
                return float(winning_count) / total
45
# Read both review corpora in full. The `with` blocks close the file handles
# as soon as the text is in memory (the original never closed them).
with open("short_reviews/positive.txt", "r") as pos_file:
        short_pos = pos_file.read()
with open("short_reviews/negative.txt", "r") as neg_file:
        short_neg = neg_file.read()

# List of (review text, label) pairs; every '\n'-separated line is one review.
# NOTE(review): if a file ends with a newline, split('\n') also yields one
# empty trailing review; kept as-is so the data set is unchanged.
documents = []

for r in short_pos.split('\n'):
        documents.append((r, "pos"))

for r in short_neg.split('\n'):
        documents.append((r, "neg"))
56
57# Saves a list of (words in movie_reviews, category(positive or negative))
58#for category in movie_reviews.categories():
59#       for fileid in movie_reviews.fileids(category):
60#               documents.append((list(movie_reviews.words(fileid)), category))
61
62#random.shuffle(documents)
63
# Tokenize each corpus as a whole; these tokens build the vocabulary.
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

# Lower-cased tokens, positive corpus first, then negative.
all_words = [w.lower() for w in short_pos_words]
all_words.extend(w.lower() for w in short_neg_words)

# Alternative source: every word of the movie_reviews corpus
#for w in movie_reviews.words():
#       all_words.append(w.lower())

# Replace the token list with its frequency distribution (token -> count).
all_words = nltk.FreqDist(all_words)
#print all_words.most_common(15)

# Feature vocabulary: the first 5000 keys of the distribution.
# NOTE(review): the earlier comment said "3000 most common words", but 5000
# keys are taken and keys() is not guaranteed to be frequency-ordered --
# confirm whether all_words.most_common(5000) was intended.
word_features = list(all_words.keys())[:5000]
84
85# Check if it finds features in words
def find_features(document):
        """Turn a review text into a word-presence feature dict.

        Args:
                document: raw review text; it is tokenized with ``word_tokenize``.

        Returns:
                dict mapping every word of the module-level ``word_features``
                to ``True`` if that word is among the document's tokens,
                otherwise ``False``.
        """
        # A set gives O(1) membership tests; the token list used before was
        # rescanned once for every feature word.
        tokens = set(word_tokenize(document))
        # NOTE(review): word_features are lower-cased while these tokens are
        # not, so capitalized occurrences never match -- confirm intent.
        return {w: (w in tokens) for w in word_features}
93
94#print find_features(movie_reviews.words('neg/cv000_29416.txt'))
95
# One (feature dict, label) pair per document.
featuresets = []
for text, label in documents:
        featuresets.append((find_features(text), label))

# Shuffle in place so the split below mixes both labels.
random.shuffle(featuresets)

# First 10000 shuffled examples train the models; the remainder is held out.
training_set, testing_set = featuresets[:10000], featuresets[10000:]
102
# posterior = prior occurrences * likelihood / evidence
104
105#classifier = nltk.NaiveBayesClassifier.train(training_set)
106
# Load a previously saved classifier instead of retraining it
# (presumably the NLTK Naive Bayes model written by the commented-out
# pickle.dump code in this script -- confirm).
# NOTE(review): pickle.load can execute arbitrary code contained in the
# file; only load pickles that were produced locally and are trusted.
# The `with` block closes the handle even if unpickling fails.
with open("naivebayes.pickle", "rb") as classifier_f:
        classifier = pickle.load(classifier_f)

# Single-argument print works as both the Python 2 statement and the
# Python 3 function; the two spaces reproduce the original output exactly.
print("Original Naive Bayes Algo Accuracy percent:  %s" % (nltk.classify.accuracy(classifier, testing_set) * 100))

classifier.show_most_informative_features(15)
114
115#save_classifier = open("naivebayes.pickle", "wb")
116#pickle.dump(classifier, save_classifier)
117#save_classifier.close()
118
def _train_and_report(name, estimator):
        """Wrap a scikit-learn estimator, train it, and print its accuracy.

        Args:
                name: label printed in front of the accuracy figure.
                estimator: scikit-learn estimator instance to wrap in
                        ``SklearnClassifier``.

        Returns:
                The ``SklearnClassifier`` after training on ``training_set``.
        """
        wrapped = SklearnClassifier(estimator)
        wrapped.train(training_set)
        accuracy = nltk.classify.accuracy(wrapped, testing_set) * 100
        # Single-argument print works as both the Python 2 statement and the
        # Python 3 function.
        print("%s accuracy percent: %s" % (name, accuracy))
        return wrapped


# Each model is reported under its own name. Previously BernoulliNB was
# printed as "MNB_classifier", NuSVC as "LinearSVCClassifier_classifier",
# and every model was described as a "Naive Bayes Algo".
MNB_classifier = _train_and_report("MNB_classifier", MultinomialNB())

# GaussianNB stays disabled, as before.
#GNB_classifier = _train_and_report("GNB_classifier", GaussianNB())

BNB_classifier = _train_and_report("BNB_classifier", BernoulliNB())

LogisticRegression_classifier = _train_and_report("LogisticRegression_classifier", LogisticRegression())

SGDClassifier_classifier = _train_and_report("SGDClassifier_classifier", SGDClassifier())

# SVC is trained and reported but, as before, takes no part in the vote.
SVCClassifier_classifier = _train_and_report("SVCClassifier_classifier", SVC())

LinearSVCClassifier_classifier = _train_and_report("LinearSVCClassifier_classifier", LinearSVC())

NuSVCClassifier_classifier = _train_and_report("NuSVCClassifier_classifier", NuSVC())

# Majority vote over the loaded classifier plus six of the models above.
voted_classifier = VoteClassifier(
        classifier,
        MNB_classifier,
        BNB_classifier,
        LogisticRegression_classifier,
        SGDClassifier_classifier,
        LinearSVCClassifier_classifier,
        NuSVCClassifier_classifier)
print("Voted_classifier accuracy percent: %s" % (nltk.classify.accuracy(voted_classifier, testing_set) * 100))
159
160#print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
161#print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
162#print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
163#print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
164#print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
165#print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
Note: See TracBrowser for help on using the repository browser.