Ignore:
Timestamp:
Jul 18, 2016, 1:40:27 PM (3 years ago)
Author:
dferreira
Message:

Changes to language extractor test

File:
1 edited

Legend:

Unmodified
Added
Removed
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Linguage Extractor/Initial language extractor/language_extractor.py

    r16334 r16335  
    66# Initial language extractor
    77
     8
     9# Step 1: Get the Content and label
     10explicit_content = []
     11non_explicit_content = []
    812documents = []
    913
     14# Step 2: Tokenize words
     15explicit_content_words = word_tokenize(explicit_content)
     16non_explicit_content_words = word_tokenize(non_explicit_content)
     17
     18# Step 3: Append all words (lower)
     19all_words = []
     20
     21for w in explicit_content_words:
     22        all_words.append(w.lower())
     23
     24for w in non_explicit_content_words:
     25        all_words.append(w.lower())
     26
     27# Step 4: Get FreqDist
     28all_words = nltk.FreqDist(all_words)
     29
     30# Step 5: Get n common words as features
     31word_features = list(all_words.keys())[:5000]
     32
     33# Step 6: Check if it finds features in words
     34def find_features(document):
     35        words = word_tokenize(document)
     36        features = {}
     37        for w in word_features:
     38                features[w] = (w in words)
     39
     40        return features
     41
     42
     43# Step 7: Get feature set
     44featuresets = [(find_features(rev), category) for (rev, category) in documents]
     45
     46
     47# Step 8: Shuffle feature sets
     48random.shuffle(featuresets)
     49
     50# Step 9: Create training set and testing set from feature sets
     51training_set = featuresets[:]
     52testing_set = featuresets[:]
     53
     54# Step 11: With the original Naive Bayes, print Classification.
     55classifier = nltk.NaiveBayesClassifier.train(training_set)
     56print "Original Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
     57
     58# Step 12: Create Classifier class and try to decide which of the classifiers is more accurate.
     59
     60# Step 13: Research about classifier parameters and decide which is better.
     61
     62# Step 14: Save classifier with pickle
     63
     64# Step 15: Try to add more information such as title of app
Note: See TracChangeset for help on using the changeset viewer.