source: internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization-tests.py @ 16391

Last change on this file since 16391 was 16391, checked in by dferreira, 3 years ago

Database updated

File size: 7.0 KB
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Diogo Ferreira
# Aptoide, 2016
# Initial text categorization tests


from __future__ import division
import collections
import pickle
import random
import sqlite3
import string
from datetime import datetime

import nltk
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import stopwords
from nltk.metrics import precision, recall, f_measure
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC


# Step 0: Define parameters

# Length of the test set
number_testing = 500
# Number of most common words used as classifier features
n_common_words = 5000
# Number of most informative features to show (not used below)
n_most_informative_features = 25

# Stemmer applied to all words
stemmer = PorterStemmer()

# Punctuation tokens to filter out (word_tokenize also emits "''" and "--")
punctuations = list(string.punctuation)
punctuations.append("''")
punctuations.append("--")
# Step 1: Get the content and label it
db = sqlite3.connect('../API to download database/app_info_non_explicit.db')
c = db.cursor()

db2 = sqlite3.connect('../API to download database/app_info_explicit.db')
c2 = db2.cursor()

explicit_content = []
non_explicit_content = []
documents = []
exp_size = 0
non_size = 0

# Descriptions labelled explicit by majority vote, from both databases
c2.execute(''' SELECT description FROM app_data WHERE majority=1 ''')

for d in c2.fetchall():
        explicit_content.append(d[0])
        documents.append((d[0], 'exp'))
        exp_size += 1

c.execute(''' SELECT description FROM app_data WHERE majority=1 ''')

for d in c.fetchall():
        explicit_content.append(d[0])
        documents.append((d[0], 'exp'))
        exp_size += 1

# Descriptions labelled non-explicit by majority vote, from both databases
c.execute(''' SELECT description FROM app_data WHERE majority=0 ''')

for d in c.fetchall():
        non_explicit_content.append(d[0])
        documents.append((d[0], 'non'))
        non_size += 1

        # Uncomment to cap the non-explicit class at the size of the explicit one:
        #if non_size == exp_size:
        #        break

c2.execute(''' SELECT description FROM app_data WHERE majority=0 ''')

for d in c2.fetchall():
        non_explicit_content.append(d[0])
        documents.append((d[0], 'non'))
        non_size += 1


print "Explicit descriptions: " + str(exp_size)
print "Non-Explicit descriptions: " + str(non_size)

db.close()
db2.close()

print "Pre-Processing..."

# Step 2: Tokenize words
print "Tokenizing..."
now = datetime.now()
explicit_content_words = [word_tokenize(w) for w in explicit_content]
non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
print str(datetime.now()-now)

# Step 3: Append all words (lowercased and stemmed, skipping stop words and punctuation)

stop_words = set(stopwords.words('english'))

all_words = []
print "Appending all words..."
now = datetime.now()
for w in explicit_content_words:
        for x in w:
                if x.lower() not in stop_words and x not in punctuations:
                        all_words.append(stemmer.stem(x.lower()))

for w in non_explicit_content_words:
        for x in w:
                if x.lower() not in stop_words and x not in punctuations:
                        all_words.append(stemmer.stem(x.lower()))
print str(datetime.now()-now)

print "Creating a frequency distribution..."
now = datetime.now()
# Step 4: Get FreqDist
all_words = nltk.FreqDist(all_words)
print str(datetime.now()-now)
print all_words.most_common(25)

print "Get the n most common features..."
now = datetime.now()
# Step 5: Use the n most common words as features
# (FreqDist.keys() is not ordered by frequency, so take most_common() explicitly)
word_features = [w for (w, _) in all_words.most_common(n_common_words)]
#print word_features

# Step 6: Turn a description into a feature dictionary
def find_features(document):
        words = word_tokenize(document)

        # Same normalization as in Step 3; a set makes the membership tests below fast
        words = set(stemmer.stem(w.lower()) for w in words
                    if w.lower() not in stop_words and w not in punctuations)

        # Later try to add size of description, category and title
        # Also try to add bigrams and trigrams with and without stop words
        # Careful so it doesn't overfit
        features = {}
        for w in word_features:
                features[w] = (w in words)

        return features

print str(datetime.now()-now)

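# Illustration only (hypothetical description): find_features maps each of the
# n_common_words feature words to a boolean marking its presence, e.g.
# find_features("Free poker and casino game") would yield something like
# {'poker': True, 'casino': True, 'weather': False, ...}, keys being stemmed words.
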
print "Create a feature set..."
now = datetime.now()
# Step 7: Get feature set
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Step 8: Shuffle the feature sets, so both classes are mixed before splitting
random.shuffle(featuresets)

# Step 9: Split the feature sets into a training set and a testing set
training_set = featuresets[:exp_size + non_size - number_testing]
#print training_set
testing_set = featuresets[exp_size + non_size - number_testing:]
#print testing_set
print str(datetime.now()-now)

# Step 10: Train and evaluate the original Naive Bayes classifier, then try other classifiers

print "Training..."

def results(classifier, testing_set, training_set):
        """Train the given classifier, then print the confusion-matrix counts,
        per-class precision/recall/F-score and accuracy on the testing set.
        (The reported time covers training plus classifying the testing set.)"""
        now = datetime.now()
        classifier = classifier.train(training_set)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        tp = 0
        fp = 0
        tn = 0
        fn = 0

        for i, (features, label) in enumerate(testing_set):
                refsets[label].add(i)
                observed = classifier.classify(features)
                testsets[observed].add(i)
                if label == 'exp' and observed == 'exp':
                        tp += 1
                elif label == 'non' and observed == 'non':
                        tn += 1
                elif label == 'exp' and observed == 'non':
                        fn += 1
                else:
                        fp += 1

        print "Time training: " + str(datetime.now()-now)
        print "True Positives: " + str(tp)
        print "False Positives: " + str(fp)
        print "True Negatives: " + str(tn)
        print "False Negatives: " + str(fn)
        print 'Explicit Precision: ', precision(refsets['exp'], testsets['exp'])
        print 'Explicit Recall: ', recall(refsets['exp'], testsets['exp'])
        print 'Explicit F-Score: ', f_measure(refsets['exp'], testsets['exp'])
        print 'Non-Explicit Precision: ', precision(refsets['non'], testsets['non'])
        print 'Non-Explicit Recall: ', recall(refsets['non'], testsets['non'])
        print 'Non-Explicit F-Score: ', f_measure(refsets['non'], testsets['non'])

        print "Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set)) * 100
        return classifier

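# For reference, the nltk.metrics values above reduce to the manual counts:
# Explicit Precision = tp / (tp + fp), Explicit Recall = tp / (tp + fn), and the
# F-Score is their harmonic mean; the Non-Explicit metrics are the same formulas
# built from tn and the corresponding error counts.
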
print "\n****** NAIVE BAYES ************"
results(nltk.NaiveBayesClassifier, testing_set, training_set)
print "\n****** RANDOM FOREST ************"
results(SklearnClassifier(RandomForestClassifier()), testing_set, training_set)
print "\n****** ADA BOOST ************"
results(SklearnClassifier(AdaBoostClassifier()), testing_set, training_set)
print "\n****** MULTINOMIAL ************"
results(SklearnClassifier(MultinomialNB()), testing_set, training_set)
print "\n****** DECISION TREE ************"
results(SklearnClassifier(tree.DecisionTreeClassifier()), testing_set, training_set)
print "\n****** BERNOULLI ************"
results(SklearnClassifier(BernoulliNB()), testing_set, training_set)
print "\n****** LOGISTIC REGRESSION ************"
saving_model = results(SklearnClassifier(LogisticRegression()), testing_set, training_set)
print "\n****** SGD CLASSIFIER ************"
results(SklearnClassifier(SGDClassifier()), testing_set, training_set)
print "\n****** SVC ************"
results(SklearnClassifier(SVC()), testing_set, training_set)
print "\n****** LINEAR SVC ************"
results(SklearnClassifier(LinearSVC()), testing_set, training_set)

# Persist the trained logistic regression classifier (the original code shadowed
# it with the file handle and then pickled an undefined name)
save_model = open("./training_info.pickle", "wb")
pickle.dump(saving_model, save_model)
save_model.close()
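
# A minimal sketch (not part of the original tests) of how the pickled classifier
# could be loaded back and applied to a new description, assuming the file written
# above and the find_features() defined in this script:
#
#       f = open("./training_info.pickle", "rb")
#       loaded_classifier = pickle.load(f)
#       f.close()
#       print loaded_classifier.classify(find_features("some app description"))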