source: internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization.py @ 16414

Last change on this file since 16414 was 16414, checked in by dferreira, 3 years ago

Black box scripts improved. Ready to run.

File size: 2.1 KB
Line 
1# Aptoide, 2016
2# Diogo Ferreira
3# Text categorization simple API
4
5
from __future__ import division
import sqlite3
import os
from nltk.tokenize import word_tokenize
import nltk
import random
import pickle
# NOTE: a duplicate `import os` was removed here (os is already imported above).
import collections
from nltk.metrics import precision, recall, f_measure
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.util import ngrams
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
import string
from datetime import datetime, time
28
29
def text_cat(description, size, cat, age):
        """Classify an app description and return a label probability distribution.

        Loads a pickled classifier and word-feature list from the
        Text_categorization/ directory next to this package, builds a
        bag-of-words feature dict from the description (plus the store
        category), and runs the classifier on it.

        :param description: app description text to classify
        :param size: description size -- accepted but currently not used
                     as a feature (see feature notes below)
        :param cat: store category name, included as the "category" feature
        :param age: age rating -- accepted but currently not used as a
                    feature (it can overfit the model)
        :return: result of ``classifier.prob_classify`` (an NLTK
                 probability distribution over labels)
        """
        # Locate the sibling Text_categorization/ directory relative to
        # this file, one level above the package directory.
        filename, file_extension = os.path.splitext(os.path.realpath(__file__))
        fn = os.path.dirname(os.path.abspath(os.path.join(filename, os.pardir)))

        try:
                # Context managers close the files even if unpickling fails
                # (the original leaked the handle on error).
                with open(fn + "/Text_categorization/model_info.pickle", "rb") as f:
                        classifier = pickle.load(f)
                with open(fn + "/Text_categorization/word_features.pickle", "rb") as f:
                        word_features = pickle.load(f)
        except (IOError, OSError, EOFError, pickle.UnpicklingError):
                # Narrowed from a bare `except:` so genuine bugs (NameError,
                # KeyboardInterrupt, ...) are no longer silently swallowed.
                # Single-argument print() is valid on both Python 2 and 3.
                print("Serialized objects not found")
                # NOTE(review): exits with status 0 on failure; callers of the
                # black-box scripts may depend on this code -- confirm before
                # changing it to a non-zero status.
                exit(0)

        stemmer = PorterStemmer()
        punctuations = list(string.punctuation)
        punctuations.append("''")
        punctuations.append("--")
        # Build the stop-word set once: the original re-read the stopword
        # corpus list for every single token.
        stop_words = set(stopwords.words('english'))

        def find_features(document, size, cat, age):
                # Tokenize, drop stop words and punctuation (tested on the
                # raw token, as in the trained model), then lowercase + stem.
                tokens = word_tokenize(document)
                words = set(stemmer.stem(w.lower()) for w in tokens
                            if w not in stop_words and w not in punctuations)

                # Features:
                # Title: Not Included
                # Size of description: Not Included
                # Category: Included
                # Description: Included
                # Age: Not Included (it can overfit the model)
                features = {}
                for w in word_features:
                        # Set membership is O(1) vs. the original list scan.
                        features[w] = (w in words)
                #features["size"] = size
                features["category"] = cat
                #features["age"] = age

                return features

        d = find_features(description, size, cat, age)
        return classifier.prob_classify(d)
77
if __name__ == "__main__":
        # Smoke test: classify a one-word description and print the
        # probability of the 'exp' (explicit) label.
        # Single-argument print() works on both Python 2 and 3, unlike the
        # original Python-2-only print statement.
        dist = text_cat("sex", 21, "ApplicationsEntertainment", 18)
        print(dist.prob('exp'))
Note: See TracBrowser for help on using the repository browser.