source: internals/2016/aptoideimagesdetector/trunk/Source Code/Language Extractor/language_extractor.py @ 16372

Last change on this file since 16372 was 16372, checked in by dferreira, 3 years ago

Results updated

File size: 7.0 KB
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Diogo Ferreira
# Aptoide, 2016
# Initial language extractor


from __future__ import division
import sqlite3
from nltk.tokenize import word_tokenize
import nltk
import random
import collections
from nltk.metrics import precision, recall, f_measure
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from statistics import mode
import string
from datetime import datetime


class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        # Ask every wrapped classifier for a label and return the majority
        # vote (use an odd number of classifiers to avoid ties)
        votes = [c.classify(features) for c in self._classifiers]
        return mode(votes)
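
# Illustrative use of VoteClassifier (it is not exercised in this script;
# clf_a, clf_b and clf_c stand for hypothetical already-trained classifiers):
# voted = VoteClassifier(clf_a, clf_b, clf_c)
# print voted.classify(find_features(description))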


# Step 0: Define parameters

# Size of the held-out test set
number_testing = 500
# Number of most common words used as classifier features
n_common_words = 5000
# Number of most informative features to display
n_most_informative_features = 25

# Stemmer applied to all words
stemmer = PorterStemmer()

# Load the stopword list once into a set: membership tests against a set are
# much cheaper than re-reading the corpus for every token
stop_words = set(stopwords.words('english'))

punctuations = list(string.punctuation)
punctuations.append("''")
punctuations.append("--")

# Step 1: Get the content and label it

db = sqlite3.connect('../API to download database/app_info_non_explicit.db')
c = db.cursor()

db2 = sqlite3.connect('../API to download database/app_info_explicit.db')
c2 = db2.cursor()

explicit_content = []
non_explicit_content = []
documents = []
exp_size = 0
non_size = 0

c2.execute(''' SELECT description FROM app_data WHERE majority=1 ''')

for d in c2.fetchall():
    explicit_content.append(d[0])
    documents.append((d[0], 'exp'))
    exp_size += 1

# Descriptions in the non-explicit database whose majority annotation is
# explicit are also labelled 'exp'
c.execute(''' SELECT description FROM app_data WHERE majority=1''')

for d in c.fetchall():
    explicit_content.append(d[0])
    documents.append((d[0], 'exp'))
    exp_size += 1


c.execute(''' SELECT description FROM app_data WHERE majority=0''')

for d in c.fetchall():
    non_explicit_content.append(d[0])
    documents.append((d[0], 'non'))
    non_size += 1

    # Stop once the classes are balanced (non_size == exp_size)
    if non_size == exp_size:
        break

#c2.execute(''' SELECT description FROM app_data WHERE majority=0 ''')
'''
for d in c2.fetchall():
    non_explicit_content.append(d[0])
    documents.append((d[0], 'non'))
    non_size += 1
'''

print "Explicit descriptions: " + str(exp_size)
print "Non-Explicit descriptions: " + str(non_size)

db.close()
db2.close()

print "Pre-Processing..."

# Step 2: Tokenize words
print "Tokenizing..."
now = datetime.now()
explicit_content_words = [word_tokenize(w) for w in explicit_content]
non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
print str(datetime.now()-now)
# Step 3: Append all words (lowercased and stemmed, skipping stopwords
# and punctuation)

all_words = []
print "Appending all words..."
now = datetime.now()
for w in explicit_content_words:
    for x in w:
        if x not in stop_words and x not in punctuations:
            all_words.append(stemmer.stem(x.lower()))

for w in non_explicit_content_words:
    for x in w:
        if x not in stop_words and x not in punctuations:
            all_words.append(stemmer.stem(x.lower()))
print str(datetime.now()-now)

print "Creating a frequency distribution..."
now = datetime.now()
# Step 4: Get FreqDist
all_words = nltk.FreqDist(all_words)
print str(datetime.now()-now)
print all_words.most_common(25)

print "Get the n most common features..."
now = datetime.now()
# Step 5: Use the n most common words as features
# (in NLTK 3, FreqDist.keys() is not ordered by frequency, so most_common()
# is needed to actually pick the most frequent words)
word_features = [w for (w, _) in all_words.most_common(n_common_words)]
#print word_features

# Step 6: Check which of the feature words occur in a document
def find_features(document):
    words = word_tokenize(document)

    words = [stemmer.stem(w.lower()) for w in words if w not in stop_words and w not in punctuations]

    # Later try to add size of description, category and title
    # Also try to add bigrams and trigrams, with and without stop words
    # (a sketch of the bigram variant follows after this function)
    # Careful so it doesn't overfit
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

print str(datetime.now()-now)
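
# A possible extension of find_features, sketched here only (it is not used
# below): add bigram-presence features on top of the unigram ones, as the
# comments above suggest. The helper name and the 'bg:' key prefix are
# illustrative choices, not part of the original pipeline.
def find_features_with_bigrams(document):
    words = word_tokenize(document)
    words = [stemmer.stem(w.lower()) for w in words if w not in stop_words and w not in punctuations]

    features = {}
    for w in word_features:
        features[w] = (w in words)
    # Mark every bigram seen in the filtered, stemmed token sequence
    for bg in ngrams(words, 2):
        features['bg:' + ' '.join(bg)] = True
    return features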

print "Create a feature set..."
now = datetime.now()
# Step 7: Get feature set
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Step 8: Shuffle feature sets
random.shuffle(featuresets)

# Step 9: Create training set and testing set from feature sets
training_set = featuresets[:exp_size+non_size-number_testing]
#print training_set
testing_set = featuresets[exp_size+non_size-number_testing:]
#print testing_set
print str(datetime.now()-now)

# Step 10: With the original Naive Bayes, print the classification results,
# then try the other classifiers

print "Training..."

def results(classifier, testing_set, training_set):
    now = datetime.now()
    classifier = classifier.train(training_set)
    # refsets collects the true labels, testsets the predicted ones, both as
    # sets of example indices, so precision/recall/F-score can be computed
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (features, label) in enumerate(testing_set):
        refsets[label].add(i)
        observed = classifier.classify(features)
        testsets[observed].add(i)

    print "Time training: " + str(datetime.now()-now)
    print 'Explicit Precision: ', precision(refsets['exp'], testsets['exp'])
    print 'Explicit Recall: ', recall(refsets['exp'], testsets['exp'])
    print 'Explicit F-Score: ', f_measure(refsets['exp'], testsets['exp'])
    print 'Non-Explicit Precision: ', precision(refsets['non'], testsets['non'])
    print 'Non-Explicit Recall: ', recall(refsets['non'], testsets['non'])
    print 'Non-Explicit F-Score: ', f_measure(refsets['non'], testsets['non'])

    print "Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100

print "\n****** NAIVE BAYES ************"
results(nltk.NaiveBayesClassifier, testing_set, training_set)
print "\n****** RANDOM FOREST ************"
results(SklearnClassifier(RandomForestClassifier()), testing_set, training_set)
print "\n****** ADA BOOST ************"
results(SklearnClassifier(AdaBoostClassifier()), testing_set, training_set)
print "\n****** MULTINOMIAL ************"
results(SklearnClassifier(MultinomialNB()), testing_set, training_set)
print "\n****** DECISION TREE ************"
results(SklearnClassifier(tree.DecisionTreeClassifier()), testing_set, training_set)
print "\n****** BERNOULLI ************"
results(SklearnClassifier(BernoulliNB()), testing_set, training_set)
print "\n****** LOGISTIC REGRESSION ************"
results(SklearnClassifier(LogisticRegression()), testing_set, training_set)
print "\n****** SGD CLASSIFIER ************"
results(SklearnClassifier(SGDClassifier()), testing_set, training_set)
print "\n****** SVC ************"
results(SklearnClassifier(SVC()), testing_set, training_set)
print "\n****** LINEAR SVC ************"
results(SklearnClassifier(LinearSVC()), testing_set, training_set)

#classifier.show_most_informative_features(n_most_informative_features)

# Step 11: Decide which of the classifiers is the most accurate.

# Step 12: Research the classifiers' parameters and decide which work best.

# Step 13: Save the chosen classifier with pickle (a sketch follows below).
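
# Sketch for Step 13, under the assumption that one trained classifier is
# kept around (results() above trains and then discards each model). The
# helper names and the file name are illustrative, not part of this script.
import pickle

def save_classifier(trained_classifier, path='classifier.pickle'):
    # Serialize the trained model so it can be reloaded without retraining
    with open(path, 'wb') as f:
        pickle.dump(trained_classifier, f)

def load_classifier(path='classifier.pickle'):
    with open(path, 'rb') as f:
        return pickle.load(f)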