Changeset 16406


Ignore:
Timestamp:
Jul 26, 2016, 1:46:05 PM (3 years ago)
Author:
dferreira
Message:

Text categorization improved.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization-tests.py

    r16405 r16406  
    3535
    3636# Length of test set
    37 number_testing = 500
     37#number_testing = 500
    3838# Number of most common words used for classifier
    39 n_common_words = 5000
     39# With NuSVC:
     40# 3000 Words Accuracy: 94.8
     41# 5000 Words Accuracy: 97.2
     42# 6000 Words Accuracy: 97.4
     43# 7000 Words Accuracy: 96.8
     44n_common_words = 6000
    4045# Number of most informative features
    41 n_most_informative_features = 25
     46#n_most_informative_features = 25
    4247
    4348# Stemmer to all words
     
    125130print "Creating a frequency distribution..."
    126131now = datetime.now()
     132
    127133# Step 4: Get FreqDist
    128134all_words = nltk.FreqDist(all_words)
     
    141147        words = word_tokenize(document)
    142148        words = [stemmer.stem(w.lower()) for w in words if not w in stopwords.words('english') and w not in punctuations]
     149       
    143150        # Features:
    144151        # Title: Not Included
     
    147154        # Description: Included
    148155
    149         # Also try to add bigrams and trigrams
    150156       
    151157        features = {}
     
    153159                features[w] = (w in words)
    154160        #features["size"] = size
    155         #features["category"] = cat
     161        features["category"] = cat
    156162        features["age"] = age
    157163
     
    166172
    167173# Step 8: Shuffle feature sets
    168 random.shuffle(featuresets)
     174#random.shuffle(featuresets)
    169175
    170176# Step 9: Create training set and testing set from feature sets
    171 training_set = featuresets[:exp_size+non_size-number_testing]
     177#training_set = featuresets[:exp_size+non_size-number_testing]
    172178#print training_set
    173 testing_set = featuresets[exp_size+non_size-number_testing:]
     179#testing_set = featuresets[exp_size+non_size-number_testing:]
    174180#print testing_set
    175 print str(datetime.now()-now)
    176 
    177 # Step 10: With the original Naive Bayes, print Classification. Try with other classifiers
    178 
    179 print "Training..."
    180 
    181 def results(classifier, testing_set, training_set):
    182         now = datetime.now()
    183         classifier = classifier.train(training_set)
    184         refsets = collections.defaultdict(set)
    185         testsets = collections.defaultdict(set)
    186 
    187         tp=0
    188         fp=0
    189         tn=0
    190         fn=0
    191 
    192         for i, (features, label) in enumerate(testing_set):
    193                 refsets[label].add(i)
    194                 observed = classifier.classify(features)
    195                 testsets[observed].add(i)
    196                 if label =='exp' and observed =='exp':
    197                         tp += 1
    198                 elif label=='non' and observed=='non':
    199                         tn += 1
    200                 elif label=='exp' and observed=='non':
    201                         fn += 1
    202                 else:
    203                         fp += 1
    204 
    205         print "Time training: " + str(datetime.now()-now)
    206         print "True Positives: " + str(tp)
    207         print "False Positives: " + str(fp)
    208         print "True Negatives: " + str(tn)
    209         print "False Negatives: " + str(fn)
    210         print 'Explicit Precision: ', precision(refsets['exp'], testsets['exp'])
    211         print 'Explicit recall: ', recall(refsets['exp'], testsets['exp'])
    212         print 'Explicit F-Score: ', f_measure(refsets['exp'], testsets['exp'])
    213         print 'Non-Explicit Precision: ', precision(refsets['non'], testsets['non'])
    214         print 'Non-Explicit Recall: ', recall(refsets['non'], testsets['non'])
    215         print 'Non-Explicit F-Score: ', f_measure(refsets['non'], testsets['non'])
    216 
    217         print "Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
    218         return classifier
     181#print str(datetime.now()-now)
     182
     183explicit_feat = [feature for feature in featuresets if feature[4]=='exp']
     184non_explicit_feat = [feature for feature in featuresets if feature[4]=='non']
     185
     186i=0
     187while i<5:
     188        # Step 10: With the original Naive Bayes, print Classification. Try with others classifiers
     189
     190        print "Training..."
     191
     192        testing_set = explicit_feat[(i*len(explicit_feat)/5):((i+1)*len(explicit_feat)/5)]+non_explicit_feat[(i*len(non_explicit_feat)/5):((i+1)*len(non_explicit_feat)/5)]
     193        print "Testing: "+str(len(testing_set))
     194        training_set = [x for j,x in enumerate(explicit_feat) if j<(i*len(explicit_feat)/5) or j>((i+1)*len(explicit_feat)/5)]
     195        training_set += [x for j,x in enumerate(non_explicit_feat) if j<(i*len(non_explicit_feat)/5) or j>((i+1)*len(non_explicit_feat)/5)]
     196        print "Training: "+str(len(training_set))
     197
     198        random.shuffle(training_set)
     199        random.shuffle(testing_set)
     200
     201        def results(classifier, testing_set, training_set):
     202                now = datetime.now()
     203                classifier = classifier.train(training_set)
     204                refsets = collections.defaultdict(set)
     205                testsets = collections.defaultdict(set)
     206
     207                tp=0
     208                fp=0
     209                tn=0
     210                fn=0
     211
     212                for i, (features, label) in enumerate(testing_set):
     213                        refsets[label].add(i)
     214                        observed = classifier.classify(features)
     215                        testsets[observed].add(i)
     216                        if label =='exp' and observed =='exp':
     217                                tp += 1
     218                        elif label=='non' and observed=='non':
     219                                tn += 1
     220                        elif label=='exp' and observed=='non':
     221                                fn += 1
     222                        else:
     223                                fp += 1
     224
     225                print "Time training: " + str(datetime.now()-now)
     226                print "True Positives: " + str(tp)
     227                print "False Positives: " + str(fp)
     228                print "True Negatives: " + str(tn)
     229                print "False Negatives: " + str(fn)
     230                print 'Explicit Precision: ', precision(refsets['exp'], testsets['exp'])
     231                print 'Explicit recall: ', recall(refsets['exp'], testsets['exp'])
     232                print 'Explicit F-Score: ', f_measure(refsets['exp'], testsets['exp'])
     233                print 'Non-Explicit Precision: ', precision(refsets['non'], testsets['non'])
     234                print 'Non-Explicit Recall: ', recall(refsets['non'], testsets['non'])
     235                print 'Non-Explicit F-Score: ', f_measure(refsets['non'], testsets['non'])
     236
     237                print "Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
     238                return classifier
     239
     240
     241
     242        try:
     243                print "\n****** NU SVC ************"
     244                saving_model=results(SklearnClassifier(NuSVC(nu=0.1)), testing_set, training_set)
     245        except:
     246                pass
     247        i+=1
     248'''
    219249
    220250try:
     
    242272
    243273try:
    244         print "\n****** NU SVC ************"
    245         results(SklearnClassifier(NuSVC(nu=0.1)), testing_set, training_set)
    246 except:
    247         pass
    248 
    249 try:
    250274        print "\n****** Extra Trees ************"
    251275        results(SklearnClassifier(ExtraTreesClassifier()), testing_set, training_set)
     
    253277        pass
    254278
    255 '''
    256279try:
    257280        print "\n****** NAIVE BAYES ************"
Note: See TracChangeset for help on using the changeset viewer.