source: internals/2016/aptoideimagesdetector/trunk/Source Code/Language Extractor/language_extractor.py @ 16368

Last change on this file since 16368 was 16368, checked in by dferreira, 3 years ago

Changes to language extractor tests and excel with the results.

File size: 7.8 KB
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Diogo Ferreira
# Aptoide, 2016
# Initial language extractor


from __future__ import division
import sqlite3
from nltk.tokenize import word_tokenize
import nltk
import random
import collections
from nltk.metrics import precision, recall, f_measure
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from statistics import mode
import string
from datetime import datetime
class VoteClassifier(ClassifierI):
        def __init__(self, *classifiers):
                self._classifiers = classifiers

        def classify(self, features):
                # Majority vote: return the label predicted by most of the
                # wrapped classifiers.
                votes = [c.classify(features) for c in self._classifiers]
                return mode(votes)

        def confidence(self, features):
                # Fraction of classifiers that agree with the winning label.
                votes = [c.classify(features) for c in self._classifiers]
                return votes.count(mode(votes)) / len(votes)


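# Usage sketch (hypothetical names, mirroring the commented-out block at the
# bottom of this file):
#   voted = VoteClassifier(clf_a, clf_b, clf_c)
#   label = voted.classify(some_features)            # majority label
#   percent = voted.confidence(some_features) * 100  # % of agreeing votes
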
# Step 0: Define parameters

# Length of the test set
number_testing = 500
# Number of most common words used for the classifier
n_common_words = 5000
# Number of most informative features
n_most_informative_features = 25

# Stemmer applied to all words
stemmer = PorterStemmer()

# Cache the English stop words in a set: membership tests are O(1) and the
# list is not rebuilt on every token.
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.append("''")
punctuations.append("--")
# Step 1: Get the content and label it

db = sqlite3.connect('../API to download database/app_info_non_explicit.db')
c = db.cursor()

db2 = sqlite3.connect('../API to download database/app_info_explicit.db')
c2 = db2.cursor()

explicit_content = []
non_explicit_content = []
documents = []
exp_size = 0
non_size = 0

c2.execute(''' SELECT description FROM app_data WHERE majority=1 ''')

for d in c2.fetchall():
        explicit_content.append(d[0])
        documents.append((d[0], 'exp'))
        exp_size += 1

c2.execute(''' SELECT description FROM app_data WHERE majority=0 ''')

for d in c2.fetchall():
        non_explicit_content.append(d[0])
        documents.append((d[0], 'non'))
        non_size += 1

c.execute(''' SELECT description FROM app_data WHERE majority=1 ''')

for d in c.fetchall():
        explicit_content.append(d[0])
        documents.append((d[0], 'exp'))
        exp_size += 1

c.execute(''' SELECT description FROM app_data WHERE majority=0 ''')

for d in c.fetchall():
        non_explicit_content.append(d[0])
        documents.append((d[0], 'non'))
        non_size += 1

print "Explicit descriptions: " + str(exp_size)
print "Non-Explicit descriptions: " + str(non_size)

db.close()
db2.close()

print "Pre-Processing..."

# Step 2: Tokenize words
print "Tokenizing..."
now = datetime.now()
explicit_content_words = [word_tokenize(w) for w in explicit_content]
non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
print str(datetime.now()-now)

# Step 3: Append all words (lower)

all_words = []
print "Appending all words..."
now = datetime.now()
for w in explicit_content_words:
        for x in w:
                # Lower-case before the stop word test, otherwise capitalised
                # stop words such as "The" slip through.
                if x.lower() not in stop_words and x not in punctuations:
                        all_words.append(stemmer.stem(x.lower()))

for w in non_explicit_content_words:
        for x in w:
                if x.lower() not in stop_words and x not in punctuations:
                        all_words.append(stemmer.stem(x.lower()))
print str(datetime.now()-now)
132print "Creating a frequency distribution..."
133now = datetime.now()
134# Step 4: Get FreqDist
135all_words = nltk.FreqDist(all_words)
136print str(datetime.now()-now)
137print all_words.most_common(25)
138
139print "Get the n most common features..."
140now = datetime.now()
141# Step 5: Get n common words as features
142word_features = list(all_words.keys())[:n_common_words]
143#print word_features
144
# Step 6: Check if it finds features in words
def find_features(document):
        words = word_tokenize(document)

        words = [stemmer.stem(w.lower()) for w in words
                 if w.lower() not in stop_words and w not in punctuations]

        # Later try to add the size of the description, the category and the
        # title. Also try to add bigrams and trigrams, with and without stop
        # words (see the sketch below). Careful so it doesn't overfit.
        features = {}
        for w in word_features:
                features[w] = (w in words)

        return features

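# A hedged sketch of the bigram idea from the comment above (an assumption,
# not part of the original pipeline): flag the presence of common stemmed
# bigrams per document with nltk.util.ngrams. The bigram_features parameter
# is hypothetical and would be built beforehand from the corpus, e.g.:
#   bigram_fd = nltk.FreqDist(ngrams(stemmed_corpus_tokens, 2))
#   bigram_features = [bg for (bg, count) in bigram_fd.most_common(500)]
def find_features_with_bigrams(document, bigram_features):
        words = word_tokenize(document)
        words = [stemmer.stem(w.lower()) for w in words
                 if w.lower() not in stop_words and w not in punctuations]
        document_bigrams = set(ngrams(words, 2))
        features = {}
        for w in word_features:
                features[w] = (w in words)
        for bg in bigram_features:
                features[bg] = (bg in document_bigrams)
        return features
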
print str(datetime.now()-now)

print "Create a feature set..."
now = datetime.now()
# Step 7: Get the feature set
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Step 8: Shuffle the feature sets
random.shuffle(featuresets)

# Step 9: Split the feature sets into a training set and a testing set
training_set = featuresets[:exp_size+non_size-number_testing]
#print training_set
testing_set = featuresets[exp_size+non_size-number_testing:]
#print testing_set
print str(datetime.now()-now)
# Step 10: With the original Naive Bayes, print the classification. Then try
# the other classifiers.

print "Training..."

def results(classifier, testing_set, training_set):
        now = datetime.now()
        classifier = classifier.train(training_set)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        for i, (features, label) in enumerate(testing_set):
                refsets[label].add(i)
                observed = classifier.classify(features)
                testsets[observed].add(i)

        print "Time training: " + str(datetime.now()-now)
        print 'Explicit Precision: ', precision(refsets['exp'], testsets['exp'])
        print 'Explicit Recall: ', recall(refsets['exp'], testsets['exp'])
        print 'Explicit F-Score: ', f_measure(refsets['exp'], testsets['exp'])
        print 'Non-Explicit Precision: ', precision(refsets['non'], testsets['non'])
        print 'Non-Explicit Recall: ', recall(refsets['non'], testsets['non'])
        print 'Non-Explicit F-Score: ', f_measure(refsets['non'], testsets['non'])

        print "Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100

        # Return the trained model so a caller can reuse it (e.g. to pickle
        # it later, as Step 13 intends).
        return classifier

201print "****** NAIVE BAYES ************"
202results(nltk.NaiveBayesClassifier, testing_set, training_set)
203print "****** Random Forest ************"
204results(SklearnClassifier(RandomForestClassifier()), testing_set, training_set)
205print "****** ADA BOOST ************"
206results(SklearnClassifier(AdaBoostClassifier()), testing_set, training_set)
207print "****** MULTINOMIAL ************"
208results(SklearnClassifier(MultinomialNB()), testing_set, training_set)
209print "****** DECISION TREE ************"
210results(SklearnClassifier(tree.DecisionTreeClassifier()), testing_set, training_set)
211print "****** BERNOULLI ************"
212results(SklearnClassifier(BernoulliNB()), testing_set, training_set)
213print "****** LOGISTIC REGRESSION ************"
214results(SklearnClassifier(LogisticRegression()), testing_set, training_set)
215print "****** SGD CLASSIFIER ************"
216results(SklearnClassifier(SGDClassifier()), testing_set, training_set)
217print "****** SVC ************"
218results(SklearnClassifier(SVC()), testing_set, training_set)
219print "****** LINEAR SVC ************"
220results(SklearnClassifier(LinearSVC()), testing_set, training_set)
221
222#classifier.show_most_informative_features(n_most_informative_features)
'''

voted_classifier = VoteClassifier(classifier,
        MNB_classifier,
        ADA_classifier,
        TREE_classifier,
        RFC_classifier,
        BNB_classifier,
        LogisticRegression_classifier,
        SGDClassifier_classifier,
        SVCClassifier_classifier,
#       NuSVCClassifier_classifier,
        LinearSVCClassifier_classifier)
print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100

print "Confidence:", voted_classifier.confidence(testing_set[0][0])*100
print "Confidence:", voted_classifier.confidence(testing_set[1][0])*100
print "Confidence:", voted_classifier.confidence(testing_set[2][0])*100
print "Confidence:", voted_classifier.confidence(testing_set[3][0])*100
print "Confidence:", voted_classifier.confidence(testing_set[4][0])*100
print "Confidence:", voted_classifier.confidence(testing_set[5][0])*100


# Step 11: Create a Classifier class and decide which of the classifiers is
# more accurate.

# Step 12: Research the classifier parameters and decide which are better.

# Step 13: Save the classifier with pickle'''

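# A minimal, hedged sketch of Steps 12 and 13 (assumptions, not the author's
# implementation). Step 12: scikit-learn parameters can be passed straight
# through the SklearnClassifier wrapper, e.g.
#   results(SklearnClassifier(LinearSVC(C=0.1)), testing_set, training_set)
# Step 13: persist the trained Naive Bayes model with pickle; the file name
# "nb_classifier.pickle" is illustrative.
import pickle

with open("nb_classifier.pickle", "wb") as save_file:
        pickle.dump(nb_classifier, save_file)

# Later, the saved model can be restored without retraining:
# with open("nb_classifier.pickle", "rb") as load_file:
#         nb_classifier = pickle.load(load_file)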