Changeset 16357


Ignore:
Timestamp:
Jul 21, 2016, 7:59:48 AM (3 years ago)
Author:
dferreira
Message:

Start of language extractor

Location:
internals/2016/aptoideimagesdetector/trunk/Source Code
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Language Extractor/language_extractor.py

    r16356 r16357  
    77
    88import sqlite3
     9from nltk.tokenize import word_tokenize
     10import nltk
     11import random
     12from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
     13from sklearn.linear_model import LogisticRegression, SGDClassifier
     14from nltk.classify.scikitlearn import SklearnClassifier
     15from sklearn.svm import SVC, LinearSVC, NuSVC
     16from nltk.classify import ClassifierI
     17from statistics import mode
     18
     19class VoteClassifier(ClassifierI):
     20        def __init__(self, *classifiers):
     21                self._classifiers = classifiers
     22
     23        def classify(self, features):
     24                votes = []
     25                for c in self._classifiers:
     26                        v = c.classify(features)
     27                        votes.append(v)
     28                return mode(votes)
     29
     30        def confidence(self, features):
     31                votes = []
     32                for c in self._classifiers:
     33                        v = c.classify(features)
     34                        votes.append(v)
     35               
     36                choice_votes = votes.count(mode(votes))
     37                conf = choice_votes/len(votes)
     38                return conf
    939
    1040# Step 1: Get the Content and label
     
    1343c = db.cursor()
    1444
    15 c.execute(''' SELECT description FROM app_data WHERE age>=18 ''')
    16 
    17 
    18 
    1945explicit_content = []
    2046non_explicit_content = []
    2147documents = []
    2248
     49c.execute(''' SELECT description FROM app_data WHERE age>=18 ''')
     50
     51for d in c.fetchall():
     52        explicit_content.append(d[0])
     53        documents.append((d[0],'exp'))
     54
     55c.execute(''' SELECT description FROM app_data WHERE age<18 ''')
     56
     57for d in c.fetchall():
     58        non_explicit_content.append(d[0])
     59        documents.append((d[0],'non'))
     60
     61
    2362db.close()
    2463
    25 '''
     64
    2665# Step 2: Tokenize words
    27 explicit_content_words = word_tokenize(explicit_content)
    28 non_explicit_content_words = word_tokenize(non_explicit_content)
     66
     67explicit_content_words = [word_tokenize(w) for w in explicit_content]
     68non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
     69
    2970
    3071# Step 3: Append all words (lower)
     72
    3173all_words = []
    3274
    3375for w in explicit_content_words:
    34         all_words.append(w.lower())
     76        for x in w:
     77                all_words.append(x)
    3578
    3679for w in non_explicit_content_words:
    37         all_words.append(w.lower())
     80        for x in w:
     81                all_words.append(x)
     82
     83
    3884
    3985# Step 4: Get FreqDist
     
    61107
    62108# Step 9: Create training set and testing set from feature sets
    63 training_set = featuresets[:]
    64 testing_set = featuresets[:]
     109number_training = 10
     110training_set = featuresets[:number_training]
     111testing_set = featuresets[number_training:]
     112
     113
    65114
    66115# Step 11: With the original Naive Bayes, print Classification.
    67116classifier = nltk.NaiveBayesClassifier.train(training_set)
    68117print "Original Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
     118
     119classifier.show_most_informative_features(15)
     120
     121MNB_classifier = SklearnClassifier(MultinomialNB())
     122MNB_classifier.train(training_set)
     123print "MNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(MNB_classifier, testing_set))*100
     124
     125#GNB_classifier = SklearnClassifier(GaussianNB())
     126#GNB_classifier.train(training_set)
     127#print "GNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(GNB_classifier, testing_set))*100
     128
     129BNB_classifier = SklearnClassifier(BernoulliNB())
     130BNB_classifier.train(training_set)
     131print "MNB_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(BNB_classifier, testing_set))*100
     132
     133LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
     134LogisticRegression_classifier.train(training_set)
     135print "LogisticRegression_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100
     136
     137SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
     138SGDClassifier_classifier.train(training_set)
     139print "SGDClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100
     140
     141SVCClassifier_classifier = SklearnClassifier(SVC())
     142SVCClassifier_classifier.train(training_set)
     143print "SVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(SVCClassifier_classifier, testing_set))*100
     144
     145LinearSVCClassifier_classifier = SklearnClassifier(LinearSVC())
     146LinearSVCClassifier_classifier.train(training_set)
     147print "LinearSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(LinearSVCClassifier_classifier, testing_set))*100
     148
     149#NuSVCClassifier_classifier = SklearnClassifier(NuSVC())
     150#NuSVCClassifier_classifier.train(training_set)
     151#print "LinearSVCClassifier_classifier Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(NuSVCClassifier_classifier, testing_set))*100
     152
     153voted_classifier = VoteClassifier(classifier,
     154        MNB_classifier,
     155        BNB_classifier,
     156        LogisticRegression_classifier,
     157        SGDClassifier_classifier,
     158        LinearSVCClassifier_classifier)
     159print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
     160
     161print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
     162print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
     163print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
     164print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
     165print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
     166print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
     167
    69168
    70169# Step 12: Create Classifier class and try to decide which of the classifiers is more accurate.
Note: See TracChangeset for help on using the changeset viewer.