source: internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization.py @ 16392

Last change on this file since 16392 was 16392, checked in by dferreira, 4 years ago

Black box v1 done with API

File size: 2.0 KB
Line 
1
2# Initial Text Categorization tests
3
4
5from __future__ import division
6import sqlite3
7import os
8from nltk.tokenize import word_tokenize
9import nltk
10import random
11import pickle
12import os
13import collections
14from nltk.metrics import precision, recall, f_measure
15from sklearn.naive_bayes import MultinomialNB, BernoulliNB
16from sklearn.linear_model import LogisticRegression, SGDClassifier
17from nltk.classify.scikitlearn import SklearnClassifier
18from sklearn.svm import SVC, LinearSVC, NuSVC
19from nltk.classify import ClassifierI
20from nltk.corpus import stopwords
21from nltk.stem.porter import *
22from nltk.util import ngrams
23from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
24from sklearn import tree
25import string
26from datetime import datetime, time
27
28
def text_cat(description):
    """Classify a description string with the pickled text classifier.

    The classifier (``model_info.pickle``) and the list of feature words
    (``word_features.pickle``) are loaded from the ``Text_categorization``
    directory that sits beside this file's parent directory.  The
    description is tokenized, filtered, stemmed and turned into a
    ``{feature_word: bool}`` dict which is handed to the classifier.

    Args:
        description: Text to classify.

    Returns:
        Whatever ``classifier.prob_classify(features)`` returns; the
        script entry point of this file queries it with ``.prob(label)``.

    Raises:
        SystemExit: With exit status 1 (after printing a message) when the
            serialized objects cannot be loaded.
    """
    try:
        # Two directory levels up from this file, then back down into
        # "Text_categorization" -- the same location the original string
        # concatenation resolved to.
        base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        model_dir = os.path.join(base_dir, "Text_categorization")

        # NOTE(review): unpickling runs arbitrary code; these files must
        # only ever come from a trusted source.
        with open(os.path.join(model_dir, "model_info.pickle"), "rb") as handle:
            classifier = pickle.load(handle)
        with open(os.path.join(model_dir, "word_features.pickle"), "rb") as handle:
            word_features = pickle.load(handle)
    except Exception:
        # Any load failure still terminates the program as before, but
        # KeyboardInterrupt/SystemExit are no longer swallowed and the
        # process now reports failure instead of exit status 0.
        print("Serialized objects not found")
        raise SystemExit(1)

    stemmer = PorterStemmer()

    # Tokens to discard: every single punctuation character plus the two
    # multi-character tokens the tokenizer can emit.
    punctuations = set(string.punctuation)
    punctuations.update(("''", "--"))

    # Read the stop-word list once instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    def find_features(document):
        """Map every known feature word to whether it occurs in *document*."""
        tokens = word_tokenize(document)

        # NOTE(review): the stop-word test runs on the token *before* it is
        # lower-cased, so capitalised stop words are kept.  Left unchanged
        # on purpose -- presumably the pickled model was trained on
        # features built the same way; confirm before altering.
        stems = set(
            stemmer.stem(token.lower())
            for token in tokens
            if token not in english_stopwords and token not in punctuations
        )

        return {word: (word in stems) for word in word_features}

    return classifier.prob_classify(find_features(description))
66
if __name__ == "__main__":
    # Manual smoke test: classify one hard-coded sample description and
    # print the probability the classifier assigns to the 'exp' label.
    sample_description = (
        "Are you looking for Porn Movies ? If yes, you are at right place. "
        "Enjoy porn movies. You can download porn movies also to enjoy later. "
        "Tag: Porn, Porno, Sex, Adult, Downloader, sex, xxx, xx"
    )
    probabilities = text_cat(sample_description)
    print(probabilities.prob('exp'))
Note: See TracBrowser for help on using the repository browser.