source: internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization.py @ 16397

Last change on this file since 16397 was 16397, checked in by dferreira, 4 years ago

Changes to text categorization

File size: 1.8 KB
Line 
1
# Initial text categorization tests
3
4
5from __future__ import division
6import sqlite3
7import os
8from nltk.tokenize import word_tokenize
9import nltk
10import random
11import pickle
12import os
13import collections
14from nltk.metrics import precision, recall, f_measure
15from sklearn.naive_bayes import MultinomialNB, BernoulliNB
16from sklearn.linear_model import LogisticRegression, SGDClassifier
17from nltk.classify.scikitlearn import SklearnClassifier
18from sklearn.svm import SVC, LinearSVC, NuSVC
19from nltk.classify import ClassifierI
20from nltk.corpus import stopwords
21from nltk.stem.porter import *
22from nltk.util import ngrams
23from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
24from sklearn import tree
25import string
26from datetime import datetime, time
27
28
def text_cat(description):
    """Classify a description with the pickled text classifier.

    The classifier is unpickled on every call from
    ``Text_categorization/model_info.pickle``, resolved relative to the
    directory that contains this file's own directory.

    Args:
        description: Text to tokenize and classify.

    Returns:
        The result of ``classifier.prob_classify`` for the extracted
        feature dict (the ``__main__`` demo calls ``.prob('exp')`` on it).

    Raises:
        SystemExit: With status 0, after printing a message, when the
            classifier cannot be loaded.
    """
    # Same directory the original splitext/join/abspath chain produced:
    # the parent of the directory holding this file.
    base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    model_path = base_dir + "/Text_categorization/model_info.pickle"

    try:
        # NOTE: unpickling can execute arbitrary code; the model file must
        # come from a trusted source.
        with open(model_path, "rb") as model_file:
            classifier = pickle.load(model_file)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit are not mistaken for a missing model.
        print("Serialized objects not found")
        # NOTE(review): status 0 is kept for backward compatibility even
        # though this is a failure path -- confirm whether callers rely on it.
        raise SystemExit(0)

    # NOTE(review): word_features is never populated in this function, so
    # the feature dict built below is always empty. Presumably the training
    # vocabulary should be loaded here -- confirm against the training code.
    word_features = []

    stemmer = PorterStemmer()
    # Build the lookup sets once instead of re-reading the stop-word corpus
    # for every token.
    stop_words = set(stopwords.words('english'))
    ignored_tokens = set(string.punctuation) | {"''", "--"}

    def find_features(document):
        """Map each entry of word_features to whether it is among the stems."""
        # Filtering is applied to the raw token (before lower-casing and
        # stemming), exactly as the original did.
        stems = {
            stemmer.stem(token.lower())
            for token in word_tokenize(document)
            if token not in stop_words and token not in ignored_tokens
        }
        return {word: (word in stems) for word in word_features}

    return classifier.prob_classify(find_features(description))
63
if __name__ == "__main__":
    # Manual smoke test: classify one sample description and print the
    # probability assigned to the 'exp' label.
    sample_description = "Are you looking for Porn Movies ? If yes, you are at right place. Enjoy porn movies. You can download porn movies also to enjoy later. Tag: Porn, Porno, Sex, Adult, Downloader, sex, xxx, xx"
    distribution = text_cat(sample_description)
    print(distribution.prob('exp'))
Note: See TracBrowser for help on using the repository browser.