source: internals/2016/aptoideimagesdetector/trunk/Source Code/Language Extractor/language_extractor.py @ 16356

Last change on this file since 16356 was 16356, checked in by dferreira, 3 years ago

Starting tests on Text Mining with the description of apps

File size: 1.8 KB
Line 
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3
4# Diogo Ferreira
5# Aptoide, 2016
6# Initial language extractor
7
8import sqlite3
9
10# Step 1: Get the Content and label
11
# Open the app-metadata database that the download step writes next door.
db = sqlite3.connect('../API to download database/app_info.db')
try:
    c = db.cursor()

    # Select the descriptions of rows whose age column is at least 18.
    # NOTE(review): the result of this query is never fetched, so the lists
    # below stay empty -- presumably the loading step is still to be written.
    c.execute(''' SELECT description FROM app_data WHERE age>=18 ''')
finally:
    # Release the database handle even when the cursor or query raises.
    db.close()

# Containers for the later text-mining steps; nothing fills them yet.
explicit_content = []
non_explicit_content = []
documents = []
24
# Disabled draft of the remaining pipeline (tokenize, build word features,
# train and score an NLTK Naive Bayes classifier). It is kept inside a bare
# string literal, so none of it runs.
# NOTE(review): before enabling it, note that it uses nltk, word_tokenize and
# random, none of which are imported in the visible part of this file; that
# training_set and testing_set are both full copies of featuresets, so the
# reported accuracy is measured on the training data; and that the print on
# the accuracy line is a Python 2 statement.
25'''
26# Step 2: Tokenize words
27explicit_content_words = word_tokenize(explicit_content)
28non_explicit_content_words = word_tokenize(non_explicit_content)
29
30# Step 3: Append all words (lower)
31all_words = []
32
33for w in explicit_content_words:
34        all_words.append(w.lower())
35
36for w in non_explicit_content_words:
37        all_words.append(w.lower())
38
39# Step 4: Get FreqDist
40all_words = nltk.FreqDist(all_words)
41
42# Step 5: Get n common words as features
43word_features = list(all_words.keys())[:5000]
44
45# Step 6: Check if it finds features in words
46def find_features(document):
47        words = word_tokenize(document)
48        features = {}
49        for w in word_features:
50                features[w] = (w in words)
51
52        return features
53
54
55# Step 7: Get feature set
56featuresets = [(find_features(rev), category) for (rev, category) in documents]
57
58
59# Step 8: Shuffle feature sets
60random.shuffle(featuresets)
61
62# Step 9: Create training set and testing set from feature sets
63training_set = featuresets[:]
64testing_set = featuresets[:]
65
66# Step 11: With the original Naive Bayes, print Classification.
67classifier = nltk.NaiveBayesClassifier.train(training_set)
68print "Original Naive Bayes Algo Accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set))*100
69
70# Step 12: Create Classifier class and try to decide which of the classifiers is more accurate.
71
72# Step 13: Research about classifier parameters and decide which is better.
73
74# Step 14: Save classifier with pickle
75
76# Step 15: Try to add more information such as title of app'''
Note: See TracBrowser for help on using the repository browser.