source: internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization-tests.py @ 16387

Last change on this file since 16387 was 16387, checked in by dferreira, 3 years ago

File with text categorization added

File size: 6.8 KB
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Diogo Ferreira
# Aptoide, 2016
# Initial text categorization tests


from __future__ import division
import sqlite3
from nltk.tokenize import word_tokenize
import nltk
import random
import collections
from nltk.metrics import precision, recall, f_measure
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.util import ngrams
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
import string
from datetime import datetime
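
# NLTK data dependencies (one-off setup, if not already present):
#   nltk.download('punkt')      # tokenizer model used by word_tokenize
#   nltk.download('stopwords')  # English stop word list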


# Step 0: Define parameters

# Length of the test set
number_testing = 500
# Number of most common words used as classifier features
n_common_words = 5000
# Number of most informative features to print
n_most_informative_features = 25

# Stemmer applied to all words
stemmer = PorterStemmer()
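# e.g. stemmer.stem("running") -> "run", so inflected forms of a word
# collapse into a single feature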

punctuations = list(string.punctuation)
punctuations.append("''")
punctuations.append("--")

# Step 1: Get the content and label it
db = sqlite3.connect('../API to download database/app_info_non_explicit.db')
c = db.cursor()

db2 = sqlite3.connect('../API to download database/app_info_explicit.db')
c2 = db2.cursor()

explicit_content = []
non_explicit_content = []
documents = []
exp_size = 0
non_size = 0

c2.execute(''' SELECT description FROM app_data WHERE majority=1 ''')

for d in c2.fetchall():
        explicit_content.append(d[0])
        documents.append((d[0], 'exp'))
        exp_size += 1

c.execute(''' SELECT description FROM app_data WHERE majority=1 ''')

for d in c.fetchall():
        explicit_content.append(d[0])
        documents.append((d[0], 'exp'))
        exp_size += 1


c.execute(''' SELECT description FROM app_data WHERE majority=0 ''')

for d in c.fetchall():
        non_explicit_content.append(d[0])
        documents.append((d[0], 'non'))
        non_size += 1

        # Uncomment to balance the two classes:
        #if non_size == exp_size:
        #       break

c2.execute(''' SELECT description FROM app_data WHERE majority=0 ''')

for d in c2.fetchall():
        non_explicit_content.append(d[0])
        documents.append((d[0], 'non'))
        non_size += 1


print "Explicit descriptions: " + str(exp_size)
print "Non-Explicit descriptions: " + str(non_size)

db.close()
db2.close()
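
# At this point `documents` holds (description, label) pairs, where the
# label is 'exp' for explicit and 'non' for non-explicit content; both
# databases contribute to both classes via their majority vote column.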

print "Pre-Processing..."

# Step 2: Tokenize words
print "Tokenizing..."
now = datetime.now()
explicit_content_words = [word_tokenize(w) for w in explicit_content]
non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
print str(datetime.now() - now)

# Step 3: Append all words (lowercased and stemmed, with stop words
# and punctuation removed)

all_words = []
stop_words = set(stopwords.words('english'))
print "Appending all words..."
now = datetime.now()
for w in explicit_content_words:
        for x in w:
                # Lowercase before the stop word test so that capitalised
                # stop words ("The", "And") are filtered out too
                if x.lower() not in stop_words and x not in punctuations:
                        all_words.append(stemmer.stem(x.lower()))

for w in non_explicit_content_words:
        for x in w:
                if x.lower() not in stop_words and x not in punctuations:
                        all_words.append(stemmer.stem(x.lower()))
print str(datetime.now() - now)

print "Creating a frequency distribution..."
now = datetime.now()
# Step 4: Build a frequency distribution over all words
all_words = nltk.FreqDist(all_words)
print str(datetime.now() - now)
print all_words.most_common(n_most_informative_features)
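# FreqDist behaves like a counting dict; an illustrative (not actual) run:
#   fd = nltk.FreqDist(['app', 'game', 'app'])
#   fd.most_common(1)  # -> [('app', 2)]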

print "Get the n most common features..."
now = datetime.now()
# Step 5: Use the n most common words as features.
# FreqDist.keys() is not guaranteed to be ordered by frequency,
# so take most_common() explicitly.
word_features = [w for (w, count) in all_words.most_common(n_common_words)]
#print word_features

# Step 6: Turn a document into a feature dict of word presence flags
def find_features(document):
        words = word_tokenize(document)

        # Same pre-processing as the training vocabulary; a set makes the
        # membership tests below O(1)
        words = set(stemmer.stem(w.lower()) for w in words
                    if w.lower() not in stop_words and w not in punctuations)

        # Later try to add size of description, category and title
        # Also try to add bigrams and trigrams with and without stop words
        # Careful so it doesn't overfit
        features = {}
        for w in word_features:
                features[w] = (w in words)

        return features
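
# Illustrative use (hypothetical, assuming these stems are among the
# n_common_words features):
#   find_features("Sexy girls wallpaper")
#   -> {u'sexi': True, u'girl': True, ..., u'game': False, ...}
# i.e. one boolean per entry of word_features.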

print str(datetime.now() - now)

print "Create a feature set..."
now = datetime.now()
# Step 7: Build the feature set: one (features, label) pair per document
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Step 8: Shuffle the feature sets so the split below is random
random.shuffle(featuresets)

# Step 9: Split the feature sets into a training set and a testing set
training_set = featuresets[:exp_size + non_size - number_testing]
#print training_set
testing_set = featuresets[exp_size + non_size - number_testing:]
#print testing_set
print str(datetime.now() - now)
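
# Note: the split keeps the last number_testing (500) shuffled examples
# for evaluation, so len(training_set) + len(testing_set) equals
# exp_size + non_size.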

# Step 10: Train and evaluate, starting with the original Naive Bayes,
# then try other classifiers

print "Training..."

def results(classifier, testing_set, training_set):
        now = datetime.now()
        classifier = classifier.train(training_set)
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set)

        tp = 0
        fp = 0
        tn = 0
        fn = 0

        for i, (features, label) in enumerate(testing_set):
                refsets[label].add(i)
                observed = classifier.classify(features)
                testsets[observed].add(i)
                if label == 'exp' and observed == 'exp':
                        tp += 1
                elif label == 'non' and observed == 'non':
                        tn += 1
                elif label == 'exp' and observed == 'non':
                        fn += 1
                else:
                        fp += 1

        # The timer covers both training and the evaluation loop above
        print "Time training and testing: " + str(datetime.now() - now)
        print "True Positives: " + str(tp)
        print "False Positives: " + str(fp)
        print "True Negatives: " + str(tn)
        print "False Negatives: " + str(fn)
        print 'Explicit Precision: ', precision(refsets['exp'], testsets['exp'])
        print 'Explicit Recall: ', recall(refsets['exp'], testsets['exp'])
        print 'Explicit F-Score: ', f_measure(refsets['exp'], testsets['exp'])
        print 'Non-Explicit Precision: ', precision(refsets['non'], testsets['non'])
        print 'Non-Explicit Recall: ', recall(refsets['non'], testsets['non'])
        print 'Non-Explicit F-Score: ', f_measure(refsets['non'], testsets['non'])

        print "Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set)) * 100

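# For reference: precision = |ref ∩ test| / |test| and
# recall = |ref ∩ test| / |ref|, where ref holds the gold indices for a
# label and test holds the predicted indices; the F-score is their
# harmonic mean. SklearnClassifier (used below) wraps each scikit-learn
# estimator behind NLTK's train()/classify() interface, so results()
# works unchanged for every classifier.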
print "\n****** NAIVE BAYES ************"
results(nltk.NaiveBayesClassifier, testing_set, training_set)
print "\n****** RANDOM FOREST ************"
results(SklearnClassifier(RandomForestClassifier()), testing_set, training_set)
print "\n****** ADA BOOST ************"
results(SklearnClassifier(AdaBoostClassifier()), testing_set, training_set)
print "\n****** MULTINOMIAL NB ************"
results(SklearnClassifier(MultinomialNB()), testing_set, training_set)
print "\n****** DECISION TREE ************"
results(SklearnClassifier(tree.DecisionTreeClassifier()), testing_set, training_set)
print "\n****** BERNOULLI NB ************"
results(SklearnClassifier(BernoulliNB()), testing_set, training_set)
print "\n****** LOGISTIC REGRESSION ************"
results(SklearnClassifier(LogisticRegression()), testing_set, training_set)
print "\n****** SGD CLASSIFIER ************"
results(SklearnClassifier(SGDClassifier()), testing_set, training_set)
print "\n****** SVC ************"
results(SklearnClassifier(SVC()), testing_set, training_set)
print "\n****** LINEAR SVC ************"
results(SklearnClassifier(LinearSVC()), testing_set, training_set)