Changeset 16358


Timestamp: Jul 21, 2016, 11:15:40 AM
Author: dferreira
Message: Database started running
Location: internals/2016/aptoideimagesdetector/trunk/Source Code
Files: 4 edited

  • internals/2016/aptoideimagesdetector/trunk/Source Code/API to download database/get_list_id.py

    r16353 → r16358

      c = db.cursor()
    - c.execute(''' SELECT * FROM crawl_list ''')
    - print c.fetchall()
    + c.execute(''' SELECT id FROM crawl_list ''')
    + print (len(c.fetchall()))
    + #print c.fetchall()

      db.close()
  • internals/2016/aptoideimagesdetector/trunk/Source Code/API to download database/get_store_info.py

    r16353 → r16358

      else:
              get_store_info(db)
    - #c = db.cursor()
    - #c.execute(''' SELECT * FROM app_data ''')
    + c = db.cursor()
    + c.execute(''' SELECT id FROM app_data ''')
    + print len(c.fetchall())
      #print c.fetchall()
      #c.execute(''' SELECT * FROM crawl_list ''')
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Language Extractor/language_extractor.py

    r16357 → r16358

      # Initial language extractor
      
    + # LEFT:
    + # Add field to ?? in web service
    + # Change database to add one more field
    + # Change categorization of text
    + 
    + 
    + from __future__ import division
      import sqlite3
      from nltk.tokenize import word_tokenize
      import nltk
      import random
    - from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
    + from sklearn.naive_bayes import MultinomialNB, BernoulliNB
      from sklearn.linear_model import LogisticRegression, SGDClassifier
      from nltk.classify.scikitlearn import SklearnClassifier
      from sklearn.svm import SVC, LinearSVC, NuSVC
      from nltk.classify import ClassifierI
    + from nltk.corpus import stopwords
    + from nltk.stem.porter import *
    + from nltk.util import ngrams
    + from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    + from sklearn import tree
      from statistics import mode
    + 
      
      class VoteClassifier(ClassifierI):

    …

                            return conf
      
    - # Step 1: Get the Content and label
    + # Step 0: Define parameters
    + 
    + # Length of test set
    + number_testing = 10
    + # Number of most common words used for classifier
    + n_common_words = 5000
    + # Number of most informative features
    + n_most_informative_features = 15
    + 
    + # Stemmer to all words
    + stemmer = PorterStemmer()
    + 
    + 
    + # Step 1: Get the Content and label it
      
      db = sqlite3.connect('../API to download database/app_info.db')

    …

      non_explicit_content = []
      documents = []
    + total_size = 0
      
      c.execute(''' SELECT description FROM app_data WHERE age>=18 ''')

    …

              explicit_content.append(d[0])
              documents.append((d[0],'exp'))
    +         total_size += 1
      
      c.execute(''' SELECT description FROM app_data WHERE age<18 ''')

    …

              non_explicit_content.append(d[0])
              documents.append((d[0],'non'))
    +         total_size += 1
      
      

    …

      for w in explicit_content_words:
              for x in w:
    -                 all_words.append(x)
    +                 if x not in stopwords.words('english'):
    +                         all_words.append(stemmer.stem(x.lower()))
      
      for w in non_explicit_content_words:
              for x in w:
    -                 all_words.append(x)
    +                 if x not in stopwords.words('english'):
    +                         all_words.append(stemmer.stem(x.lower()))
      
      

    …

      
      # Step 5: Get n common words as features
    - word_features = list(all_words.keys())[:5000]
    + word_features = list(all_words.keys())[:n_common_words]
      
      # Step 6: Check if it finds features in words
      def find_features(document):
              words = word_tokenize(document)
    + 
    +         words = [stemmer.stem(w.lower()) for w in words if not w in stopwords.words('english')]
    + 
    +         # Later try to add size of description, category and title
    +         # Also try to add bigrams and trigrams with and without stop words
    +         # Careful so it doesn't overfit
              features = {}
              for w in word_features:

    …

      
      # Step 8: Shuffle feature sets
    - random.shuffle(featuresets)
    + #random.shuffle(featuresets)
      
      # Step 9: Create training set and testing set from feature sets
    - number_training = 10
    - training_set = featuresets[:number_training]
    - testing_set = featuresets[number_training:]
    - 
    - 
    - 
    - # Step 11: With the original Naive Bayes, print Classification.
    + training_set = featuresets[:total_size-number_testing]
    + testing_set = featuresets[total_size-number_testing:]
    + 
    + 
    + # Step 10: With the original Naive Bayes, print Classification. Try with others classifiers
      classifier = nltk.NaiveBayesClassifier.train(training_set)
      print "Original Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
      
    - classifier.show_most_informative_features(15)
    + classifier.show_most_informative_features(n_most_informative_features)
    + 
    + RFC_classifier = SklearnClassifier(RandomForestClassifier())
    + RFC_classifier.train(training_set)
    + print "RFC_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(RFC_classifier, testing_set))*100
    + 
    + ADA_classifier = SklearnClassifier(AdaBoostClassifier())
    + ADA_classifier.train(training_set)
    + print "ADA_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(ADA_classifier, testing_set))*100
      
      MNB_classifier = SklearnClassifier(MultinomialNB())
      MNB_classifier.train(training_set)
    - print "MNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(MNB_classifier, testing_set))*100
    - 
    - #GNB_classifier = SklearnClassifier(GaussianNB())
    - #GNB_classifier.train(training_set)
    - #print "GNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(GNB_classifier, testing_set))*100
    + print "MNB_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(MNB_classifier, testing_set))*100
    + 
    + TREE_classifier = SklearnClassifier(tree.DecisionTreeClassifier())
    + TREE_classifier.train(training_set)
    + print "TREE_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(TREE_classifier, testing_set))*100
      
      BNB_classifier = SklearnClassifier(BernoulliNB())
      BNB_classifier.train(training_set)
    - print "MNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(BNB_classifier, testing_set))*100
    + print "BNB_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(BNB_classifier, testing_set))*100
      
      LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
      LogisticRegression_classifier.train(training_set)
    - print "LogisticRegression_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100
    + print "LogisticRegression_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100
      
      SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
      SGDClassifier_classifier.train(training_set)
    - print "SGDClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100
    + print "SGDClassifier_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100
      
      SVCClassifier_classifier = SklearnClassifier(SVC())
      SVCClassifier_classifier.train(training_set)
    - print "SVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(SVCClassifier_classifier, testing_set))*100
    + print "SVCClassifier_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(SVCClassifier_classifier, testing_set))*100
      
      LinearSVCClassifier_classifier = SklearnClassifier(LinearSVC())
      LinearSVCClassifier_classifier.train(training_set)
    - print "LinearSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(LinearSVCClassifier_classifier, testing_set))*100
    - 
    - #NuSVCClassifier_classifier = SklearnClassifier(NuSVC())
    - #NuSVCClassifier_classifier.train(training_set)
    - #print "LinearSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(NuSVCClassifier_classifier, testing_set))*100
    - 
    - voted_classifier = VoteClassifier(classifier,
    -         MNB_classifier,
    -         BNB_classifier,
    -         LogisticRegression_classifier,
    -         SGDClassifier_classifier,
    -         LinearSVCClassifier_classifier)
    - print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
    - 
    - print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
    - print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
    - print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
    - print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
    - print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
    - print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
    - 
    - 
    - # Step 12: Create Classifier class and try to decide which of the classifiers is more accurate.
    - 
    - # Step 13: Research about classifier parameters and decide which is better.
    - 
    - # Step 14: Save classifier with pickle
    - 
    - # Step 15: Try to add more information such as title of app'''
    + print "LinearSVCClassifier_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(LinearSVCClassifier_classifier, testing_set))*100
    + 
    + NuSVCClassifier_classifier = SklearnClassifier(NuSVC())
    + NuSVCClassifier_classifier.train(training_set)
    + print "LinearSVCClassifier_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(NuSVCClassifier_classifier, testing_set))*100
    + 
    + #voted_classifier = VoteClassifier(classifier,
    + #       MNB_classifier,
    + #       ADA_classifier,
    + #       TREE_classifier,
    + #       RFC_classifier,
    + #       BNB_classifier,
    + #       LogisticRegression_classifier,
    + #       SGDClassifier_classifier,
    + #       SVCClassifier_classifier,
    + #       NuSVCClassifier_classifier,
    + #       LinearSVCClassifier_classifier)
    + #print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
    + 
    + #print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
    + #print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
    + #print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
    + #print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
    + #print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
    + #print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
    + 
    + 
    + # Step 11: Create Classifier class and try to decide which of the classifiers is more accurate.
    + 
    + # Step 12: Research about classifier parameters and decide which is better.
    + 
    + # Step 13: Save classifier with pickle
    + 
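
Note on the get_list_id.py and get_store_info.py changes above: both count rows by selecting every id and taking len() of fetchall(), which pulls the whole column into Python. A minimal sketch of letting SQLite do the counting instead; the database filename here is an assumption, and the real scripts already hold an open connection called db:

    import sqlite3

    # Assumed filename; the scripts in this changeset open app_info.db themselves.
    db = sqlite3.connect('app_info.db')
    c = db.cursor()
    c.execute(''' SELECT COUNT(*) FROM crawl_list ''')
    print(c.fetchone()[0])   # row count without materialising every id
    c.execute(''' SELECT COUNT(*) FROM app_data ''')
    print(c.fetchone()[0])
    db.close()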
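
Note on the vocabulary and find_features hunks in language_extractor.py: the changeset now drops English stop words, lower-cases each remaining token, and keeps its Porter stem. A self-contained sketch of that preprocessing, assuming the NLTK punkt and stopwords data are installed; the sample strings are invented, the script itself feeds in app descriptions and its n_common_words vocabulary:

    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    from nltk.tokenize import word_tokenize

    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))

    def normalise(text):
            # Tokenise, drop English stop words, lower-case and stem what remains.
            return [stemmer.stem(w.lower()) for w in word_tokenize(text) if w not in stop]

    def find_features(document, word_features):
            # One boolean feature per vocabulary word: is that stem present in the document?
            present = set(normalise(document))
            return {w: (w in present) for w in word_features}

    word_features = normalise("Violent zombie shooter with blood")   # stands in for the real vocabulary
    print(find_features("A calm puzzle game for kids", word_features))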
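
Note on the classifier section: the diff wraps each scikit-learn estimator in NLTK's SklearnClassifier, prints its accuracy on testing_set, and for now comments out the VoteClassifier calls. The body of VoteClassifier is not shown in this changeset; the sketch below shows how such a hard majority vote is typically written and how the wrap-train-score loop fits together, assuming featuresets of (feature dict, label) pairs like the ones the script builds:

    import nltk
    from nltk.classify import ClassifierI
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.naive_bayes import MultinomialNB, BernoulliNB
    from sklearn.linear_model import LogisticRegression
    from statistics import mode

    class VoteClassifier(ClassifierI):
            # Hard majority vote over already-trained NLTK-style classifiers.
            def __init__(self, *classifiers):
                    self._classifiers = classifiers

            def classify(self, features):
                    votes = [c.classify(features) for c in self._classifiers]
                    return mode(votes)   # note: mode() raises on ties

            def confidence(self, features):
                    votes = [c.classify(features) for c in self._classifiers]
                    return votes.count(mode(votes)) / float(len(votes))

    def compare(training_set, testing_set):
            # Wrap each estimator so it accepts NLTK-style feature dicts, then score it.
            nb = nltk.NaiveBayesClassifier.train(training_set)
            wrapped = [SklearnClassifier(est).train(training_set)
                       for est in (MultinomialNB(), BernoulliNB(), LogisticRegression())]
            for clf in [nb] + wrapped:
                    print(nltk.classify.accuracy(clf, testing_set) * 100)
            voted = VoteClassifier(nb, *wrapped)
            print(nltk.classify.accuracy(voted, testing_set) * 100)
            return voted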