source: internals/2016/aptoideimagesdetector/trunk/Source Code/Linguage Extractor/Initial language extractor/language_extractor.py @ 16335

Last change on this file since 16335 was 16335, checked in by dferreira, 3 years ago

Changes to language extractor test

File size: 1.6 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3
4# Diogo Ferreira
5# Aptoide, 2016
6# Initial language extractor

import random

import nltk
from nltk.tokenize import word_tokenize

# Step 1: Get the Content and label
# NOTE(review): the two content lists are assumed to hold raw text strings
# (one per app description) -- confirm once the data loading is added.
# `documents` holds (text, category) pairs, as unpacked in Step 7.
explicit_content = []
non_explicit_content = []
documents = []

# Step 2: Tokenize words
# word_tokenize() works on a single string, so tokenize every text on its
# own and flatten the result into one token list per class.
explicit_content_words = [
    token for text in explicit_content for token in word_tokenize(text)
]
non_explicit_content_words = [
    token for text in non_explicit_content for token in word_tokenize(text)
]

# Step 3: Append all words (lower)
all_words = [
    token.lower()
    for token in explicit_content_words + non_explicit_content_words
]

# Step 4: Get FreqDist
all_words = nltk.FreqDist(all_words)

# Step 5: Get n common words as features
# most_common() is ordered by descending frequency; slicing keys() is not
# guaranteed to return the most frequent words.
word_features = [word for (word, _count) in all_words.most_common(5000)]
32
33# Step 6: Check if it finds features in words
def find_features(document):
    """Build the bag-of-words feature dict for one document.

    Args:
        document: Raw text of the document, as a single string.

    Returns:
        A dict mapping every word in the module-level ``word_features``
        list to ``True`` if that word occurs in ``document`` and ``False``
        otherwise.
    """
    # The feature vocabulary is built from lower-cased tokens, so the
    # document tokens are lower-cased too; otherwise capitalised words
    # could never match.  A set makes each membership test O(1).
    tokens = {token.lower() for token in word_tokenize(document)}
    return {word: (word in tokens) for word in word_features}
41
42
# Step 7: Get feature set
featuresets = [(find_features(rev), category) for (rev, category) in documents]


# Step 8: Shuffle feature sets
random.shuffle(featuresets)

# Step 9: Create training set and testing set from feature sets
# Hold out the tail of the shuffled data for testing so that accuracy is
# measured on examples the classifier has not seen during training.
# NOTE(review): the 80/20 ratio is a conventional default -- tune as needed.
split_index = int(len(featuresets) * 0.8)
training_set = featuresets[:split_index]
testing_set = featuresets[split_index:]

# Step 11: With the original Naive Bayes, print Classification.
classifier = nltk.NaiveBayesClassifier.train(training_set)
accuracy = nltk.classify.accuracy(classifier, testing_set)
# Single-argument print() behaves the same on Python 2 and Python 3; the
# double space reproduces the output of the former `print a, b` statement.
print("Original Naive Bayes Algo Accuracy percent:  %s" % (accuracy * 100))
57
58# Step 12: Create Classifier class and try to decide which of the classifiers is more accurate.
59
60# Step 13: Research about classifier parameters and decide which is better.
61
62# Step 14: Save classifier with pickle
63
64# Step 15: Try to add more information such as title of app
Note: See TracBrowser for help on using the repository browser.