Changeset 16363


Ignore:
Timestamp:
Jul 21, 2016, 3:30:51 PM (3 years ago)
Author:
dferreira
Message:

Images keep local

Location:
internals/2016/aptoideimagesdetector/trunk/Source Code
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Language Extractor/language_extractor.py

    r16359 r16363  
    2727from sklearn import tree
    2828from statistics import mode
     29import string
    2930
    3031
     
    4748               
    4849                choice_votes = votes.count(mode(votes))
    49                 conf = choice_votes/len(votes)
     50                conf = float(choice_votes)/float(len(votes))
    5051                return conf
    5152
     
    5354
    5455# Length of test set
    55 number_testing = 10
     56number_testing = 500
    5657# Number of most common words used for classifier
    5758n_common_words = 5000
     
    6263stemmer = PorterStemmer()
    6364
     65punctuations = list(string.punctuation)
     66punctuations.append("''")
    6467
    6568# Step 1: Get the Content and label it
    6669
    67 db = sqlite3.connect('../API to download database/app_info.db')
     70db = sqlite3.connect('../API to download database/app_info_non_explicit.db')
    6871c = db.cursor()
     72
     73db2 = sqlite3.connect('../API to download database/app_info.db')
     74c2 = db.cursor()
    6975
    7076explicit_content = []
     
    7379total_size = 0
    7480
    75 c.execute(''' SELECT description FROM app_data WHERE age>=18 ''')
    76 
    77 for d in c.fetchall():
     81#WHERE age>=18
     82c2.execute(''' SELECT description FROM app_data ''')
     83
     84for d in c2.fetchall():
    7885        explicit_content.append(d[0])
    7986        documents.append((d[0],'exp'))
    8087        total_size += 1
    81 
    82 c.execute(''' SELECT description FROM app_data WHERE age<18 ''')
     88       
     89#WHERE age<18
     90c.execute(''' SELECT description FROM app_data ''')
    8391
    8492for d in c.fetchall():
     
    8694        documents.append((d[0],'non'))
    8795        total_size += 1
    88 
     96print total_size
    8997
    9098db.close()
    91 
     99db2.close()
     100
     101print "Pre-Processing..."
    92102
    93103# Step 2: Tokenize words
    94 
     104print "Tokenizing..."
    95105explicit_content_words = [word_tokenize(w) for w in explicit_content]
    96106non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
     
    100110
    101111all_words = []
    102 
     112print "Appending all words..."
    103113for w in explicit_content_words:
    104114        for x in w:
    105                 if x not in stopwords.words('english'):
     115                if x not in stopwords.words('english') and x not in punctuations:
    106116                        all_words.append(stemmer.stem(x.lower()))
    107117
    108118for w in non_explicit_content_words:
    109119        for x in w:
    110                 if x not in stopwords.words('english'):
     120                if x not in stopwords.words('english') and x not in punctuations:
    111121                        all_words.append(stemmer.stem(x.lower()))
    112122
    113 
    114 
     123print "Creating a frequency distribution..."
    115124# Step 4: Get FreqDist
    116125all_words = nltk.FreqDist(all_words)
    117 
     126print all_words.most_common(15)
     127
     128print "Get the n most common features..."
    118129# Step 5: Get n common words as features
    119130word_features = list(all_words.keys())[:n_common_words]
     131#print word_features
    120132
    121133# Step 6: Check if it finds features in words
     
    123135        words = word_tokenize(document)
    124136
    125         words = [stemmer.stem(w.lower()) for w in words if not w in stopwords.words('english')]
     137        words = [stemmer.stem(w.lower()) for w in words if not w in stopwords.words('english') and w not in punctuations]
    126138
    127139        # Later try to add size of description, category and title
     
    134146        return features
    135147
    136 
     148print "Create a feature set..."
    137149# Step 7: Get feature set
    138150featuresets = [(find_features(rev), category) for (rev, category) in documents]
    139151
    140 
    141152# Step 8: Shuffle feature sets
    142 #random.shuffle(featuresets)
     153random.shuffle(featuresets)
    143154
    144155# Step 9: Create training set and testing set from feature sets
    145156training_set = featuresets[:total_size-number_testing]
     157#print training_set
    146158testing_set = featuresets[total_size-number_testing:]
    147 
    148 
     159#print testing_set
     160
     161print "Training..."
    149162# Step 10: With the original Naive Bayes, print Classification. Try with other classifiers
    150163classifier = nltk.NaiveBayesClassifier.train(training_set)
     
    189202print "LinearSVCClassifier_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(LinearSVCClassifier_classifier, testing_set))*100
    190203
    191 NuSVCClassifier_classifier = SklearnClassifier(NuSVC())
    192 NuSVCClassifier_classifier.train(training_set)
    193 print "LinearSVCClassifier_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(NuSVCClassifier_classifier, testing_set))*100
    194 
    195 #voted_classifier = VoteClassifier(classifier,
    196 #       MNB_classifier,
    197 #       ADA_classifier,
    198 #       TREE_classifier,
    199 #       RFC_classifier,
    200 #       BNB_classifier,
    201 #       LogisticRegression_classifier,
    202 #       SGDClassifier_classifier,
    203 #       SVCClassifier_classifier,
     204#NuSVCClassifier_classifier = SklearnClassifier(NuSVC())
     205#NuSVCClassifier_classifier.train(training_set)
     206#print "LinearSVCClassifier_classifier Algo Accuracy percent: ", (nltk.classify.accuracy(NuSVCClassifier_classifier, testing_set))*100
     207
     208voted_classifier = VoteClassifier(classifier,
     209        MNB_classifier,
     210        ADA_classifier,
     211        TREE_classifier,
     212        RFC_classifier,
     213        BNB_classifier,
     214        LogisticRegression_classifier,
     215        SGDClassifier_classifier,
     216        SVCClassifier_classifier,
    204217#       NuSVCClassifier_classifier,
    205 #       LinearSVCClassifier_classifier)
    206 #print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
    207 
    208 #print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
    209 #print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
    210 #print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
    211 #print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
    212 #print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
    213 #print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
     218        LinearSVCClassifier_classifier)
     219print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
     220
     221print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
     222print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
     223print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
     224print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
     225print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
     226print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
    214227
    215228
Note: See TracChangeset for help on using the changeset viewer.