Changeset 16366


Ignore:
Timestamp:
Jul 22, 2016, 11:19:37 AM (3 years ago)
Author:
dferreira
Message:

Changes to the language extractor

Location:
internals/2016/aptoideimagesdetector/trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Language Extractor/language_extractor.py

    r16365 r16366  
    55# Aptoide, 2016
    66# Initial language extractor
    7 
    8 # LEFT:
    9 # Change database to add one more field - comment to change db
    10 # Change categorization of text and images
    117
    128
     
    1612import nltk
    1713import random
     14import collections
     15from nltk.metrics import precision, recall, f_measure
    1816from sklearn.naive_bayes import MultinomialNB, BernoulliNB
    1917from sklearn.linear_model import LogisticRegression, SGDClassifier
     
    2826from statistics import mode
    2927import string
     28from datetime import datetime, time
    3029
    3130
     
    3635        def classify(self, features):
    3736                votes = []
     37                sumv = 0
    3838                for c in self._classifiers:
    39                         v = c.classify(features)
    40                         votes.append(v)
    41                 return mode(votes)
    42 
    43         def confidence(self, features):
    44                 votes = []
    45                 for c in self._classifiers:
    46                         v = c.classify(features)
    47                         votes.append(v)
    48                
    49                 choice_votes = votes.count(mode(votes))
    50                 conf = float(choice_votes)/float(len(votes))
    51                 return conf
     39                        sumv += c.classify(features)
     40                return sumv/len(self._classifiers)
     41
    5242
    5343# Step 0: Define parameters
     
    5848n_common_words = 5000
    5949# Number of most informative features
    60 n_most_informative_features = 15
     50n_most_informative_features = 25
    6151
    6252# Stemmer to all words
     
    6555punctuations = list(string.punctuation)
    6656punctuations.append("''")
     57punctuations.append("--")
    6758
    6859# Step 1: Get the Content and label it
     
    7162c = db.cursor()
    7263
    73 db2 = sqlite3.connect('../API to download database/app_info.db')
    74 c2 = db.cursor()
     64db2 = sqlite3.connect('../API to download database/app_info_explicit.db')
     65c2 = db2.cursor()
    7566
    7667explicit_content = []
    7768non_explicit_content = []
    7869documents = []
    79 total_size = 0
    80 
     70exp_size = 0
     71non_size = 0
    8172
    8273c2.execute(''' SELECT description FROM app_data WHERE majority=1 ''')
     
    8576        explicit_content.append(d[0])
    8677        documents.append((d[0],'exp'))
    87         total_size += 1
     78        exp_size+=1
    8879
    8980c2.execute(''' SELECT description FROM app_data WHERE majority=0 ''')
     
    9283        non_explicit_content.append(d[0])
    9384        documents.append((d[0],'non'))
    94         total_size += 1
     85        non_size += 1
    9586
    9687c.execute(''' SELECT description FROM app_data WHERE majority=1''')
     
    9990        explicit_content.append(d[0])
    10091        documents.append((d[0],'exp'))
    101         total_size += 1
     92        exp_size += 1
    10293
    10394c.execute(''' SELECT description FROM app_data WHERE majority=0''')
     
    10697        non_explicit_content.append(d[0])
    10798        documents.append((d[0],'non'))
    108         total_size += 1
     99        non_size += 1
    109100       
    110 print total_size
     101print "Explicit descriptions: "+str(exp_size)
     102print "Non-Explicit descriptions: "+str(non_size)
    111103
    112104db.close()
     
    117109# Step 2: Tokenize words
    118110print "Tokenizing..."
     111now = datetime.now()
    119112explicit_content_words = [word_tokenize(w) for w in explicit_content]
    120113non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
    121 
     114print str(datetime.now()-now)
    122115
    123116# Step 3: Append all words (lower)
     
    125118all_words = []
    126119print "Appending all words..."
     120now = datetime.now()
    127121for w in explicit_content_words:
    128122        for x in w:
     
    134128                if x not in stopwords.words('english') and x not in punctuations:
    135129                        all_words.append(stemmer.stem(x.lower()))
     130print str(datetime.now()-now)
    136131
    137132print "Creating a frequency distribution..."
     133now = datetime.now()
    138134# Step 4: Get FreqDist
    139135all_words = nltk.FreqDist(all_words)
    140 print all_words.most_common(15)
     136print str(datetime.now()-now)
     137print all_words.most_common(25)
    141138
    142139print "Get the n most common features..."
     140now = datetime.now()
    143141# Step 5: Get n common words as features
    144142word_features = list(all_words.keys())[:n_common_words]
     
    160158        return features
    161159
     160print str(datetime.now()-now)
     161
    162162print "Create a feature set..."
     163now = datetime.now()
    163164# Step 7: Get feature set
    164165featuresets = [(find_features(rev), category) for (rev, category) in documents]
     
    168169
    169170# Step 9: Create training set and testing set from feature sets
    170 training_set = featuresets[:total_size-number_testing]
     171training_set = featuresets[:exp_size+non_size-number_testing]
    171172#print training_set
    172 testing_set = featuresets[total_size-number_testing:]
     173testing_set = featuresets[exp_size+non_size-number_testing:]
    173174#print testing_set
     175print str(datetime.now()-now)
    174176
    175177print "Training..."
    176178# Step 10: With the original Naive Bayes, print Classification. Try with others classifiers
    177179classifier = nltk.NaiveBayesClassifier.train(training_set)
     180refsets = collections.defaultdict(set)
     181testsets = collections.defaultdict(set)
     182
     183for i, (features, label) in enumerate(testing_set):
     184        refsets[label].add(i)
     185        observed = classifier.classify(features)
     186        testsets[observed].add(i)
     187
     188print 'Explicit Precision: ', precision(refsets['exp'], testsets['exp'])
     189print 'Explicit recall: ', recall(refsets['exp'], testsets['exp'])
     190print 'Explicit F-Score: ', f_measure(refsets['exp'], testsets['exp'])
     191print 'Non-Explicit Precision: ', precision(refsets['non'], testsets['non'])
     192print 'Non-Explicit Recall: ', recall(refsets['non'], testsets['non'])
     193print 'Non-Explicit F-Score: ', f_measure(refsets['non'], testsets['non'])
     194
    178195print "Original Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
    179196
     
    233250print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
    234251
    235 print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
    236 print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
    237 print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
    238 print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
    239 print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
    240 print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
     252print "Classification:", voted_classifier.classify(testing_set[0][0])*100
     253print "Classification:", voted_classifier.classify(testing_set[1][0])*100
     254print "Classification:", voted_classifier.classify(testing_set[2][0])*100
     255print "Classification:", voted_classifier.classify(testing_set[3][0])*100
     256print "Classification:", voted_classifier.classify(testing_set[4][0])*100
     257print "Classification:", voted_classifier.classify(testing_set[5][0])*100
    241258
    242259
Note: See TracChangeset for help on using the changeset viewer.