source: internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization.py @ 16413

Last change on this file since 16413 was 16413, checked in by dferreira, 3 years ago

Text Categorization improved.

File size: 2.2 KB
Line 
# Aptoide, 2016
# Diogo Ferreira
# Text categorization simple API
4
5
6from __future__ import division
7import sqlite3
8import os
9from nltk.tokenize import word_tokenize
10import nltk
11import random
12import pickle
13import os
14import collections
15from nltk.metrics import precision, recall, f_measure
16from sklearn.naive_bayes import MultinomialNB, BernoulliNB
17from sklearn.linear_model import LogisticRegression, SGDClassifier
18from nltk.classify.scikitlearn import SklearnClassifier
19from sklearn.svm import SVC, LinearSVC, NuSVC
20from nltk.classify import ClassifierI
21from nltk.corpus import stopwords
22from nltk.stem.porter import *
23from nltk.util import ngrams
24from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
25from sklearn import tree
26import string
27from datetime import datetime, time
28
29
def text_cat(description, size, cat, age):
    """Classify an application description with the serialized classifier.

    The classifier is unpickled from ``Text_categorization/model_info.pickle``,
    looked up under the parent of the directory that contains this module.
    A feature dictionary is then built from the arguments and handed to the
    classifier's ``prob_classify`` method.

    Parameters:
        description: Text of the application description; it is tokenized,
            filtered and stemmed.
        size: Accepted for interface compatibility; not used as a feature.
        cat: Category value, stored under the ``"category"`` feature key.
        age: Accepted for interface compatibility; not used as a feature.

    Returns:
        The result of ``classifier.prob_classify(features)``.

    Raises:
        SystemExit: With status 0, after printing
            "Serialized objects not found", when the model cannot be loaded.
    """
    # NOTE(review): this list is never filled in, so the loop over it in
    # find_features emits no per-word feature and the classifier only ever
    # receives "category". Presumably the vocabulary was meant to be loaded
    # alongside the classifier -- confirm against the training script.
    word_features = []

    try:
        # Parent of the directory holding this file (same location the
        # previous splitext/join/abspath/dirname chain resolved to).
        module_dir = os.path.dirname(os.path.realpath(__file__))
        base_dir = os.path.dirname(module_dir)
        model_path = os.path.join(
            base_dir, "Text_categorization", "model_info.pickle")
        # NOTE(review): unpickling can execute arbitrary code; the model file
        # must come from a trusted source.
        with open(model_path, "rb") as model_file:
            classifier = pickle.load(model_file)
    except Exception:
        # Exception (not a bare except) so Ctrl-C and SystemExit still
        # propagate untouched.
        print("Serialized objects not found")
        # NOTE(review): status 0 is kept for backward compatibility even
        # though this is a failure path -- consider a non-zero status.
        raise SystemExit(0)

    stemmer = PorterStemmer()
    punctuations = set(string.punctuation)
    punctuations.update(("''", "--"))
    # Read the stop-word list once instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    def find_features(document, size, cat, age):
        """Build the feature dictionary for one document.

        ``size`` and ``age`` are accepted but deliberately left out of the
        feature set (see the feature list below).
        """
        tokens = word_tokenize(document)
        # Filtering is applied to the raw token, before lower-casing and
        # stemming, exactly as the original comprehension did.
        stems = set(
            stemmer.stem(token.lower())
            for token in tokens
            if token not in english_stopwords and token not in punctuations
        )

        # Features:
        # Title: Not Included
        # Size of description: Not Included
        # Category: Included
        # Description: Included
        # Age : Not Included (It can overfit the model)

        features = {}
        for w in word_features:
            features[w] = (w in stems)
        features["category"] = cat

        return features

    d = find_features(description, size, cat, age)

    return classifier.prob_classify(d)
73
if __name__ == "__main__":
    # Manual smoke run: classify one hard-coded sample description and print
    # the value the returned distribution reports for the 'exp' label.
    sample_description = "Are you looking for Porn Movies ? If yes, you are at right place. Enjoy porn movies. You can download porn movies also to enjoy later. Tag: Porn, Porno, Sex, Adult, Downloader, sex, xxx, xx"
    sample_size = 20
    sample_category = "ApplicationsEntertainment"
    sample_age = 18
    distribution = text_cat(sample_description, sample_size, sample_category, sample_age)
    print(distribution.prob('exp'))
Note: See TracBrowser for help on using the repository browser.