Changeset 16392


Timestamp:
Jul 25, 2016, 4:05:33 PM
Author:
dferreira
Message:

Black box v1 done with API

Location:
internals/2016/aptoideimagesdetector/trunk/Source Code
Files:
3 edited

  • internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization-tests.py

r16391 r16392

@@ -140,7 +140,8 @@
 # Step 6: Check if it finds features in words
 def find_features(document):
+
         words = word_tokenize(document)
-
         words = [stemmer.stem(w.lower()) for w in words if not w in stopwords.words('english') and w not in punctuations]
+
 
         # Later try to add size of description, category and title

@@ -158,5 +159,5 @@
 now = datetime.now()
 # Step 7: Get feature set
-featuresets = [(find_features(rev), category) for (rev, category) in documents]
+featuresets = [(find_features(desc), category) for (desc, category) in documents]
 
 # Step 8: Shuffle feature sets

@@ -226,5 +227,5 @@
 results(SklearnClassifier(BernoulliNB()), testing_set, training_set)
 print "\n****** LOGISTIC REGRESSION ************"
-save_model = results(SklearnClassifier(LogisticRegression()), testing_set, training_set)
+saving_model = results(SklearnClassifier(LogisticRegression()), testing_set, training_set)
 print "\n****** SGD CLASSIFIER ************"
 results(SklearnClassifier(SGDClassifier()), testing_set, training_set)

@@ -234,5 +235,9 @@
 results(SklearnClassifier(LinearSVC()), testing_set, training_set)
 
-save_model = open("./training_info.pickle", "wb")
+save_model = open("./model_info.pickle", "wb")
 pickle.dump(saving_model, save_model)
 save_model.close()
+
+save_model = open("./word_features.pickle", "wb")
+pickle.dump(word_features, save_model)
+save_model.close()
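
The tests script now persists the trained model and the feature vocabulary instead of only printing scores. A minimal round-trip sketch of the two pickles it writes (illustrative only, not part of the changeset; it assumes results() returns the trained SklearnClassifier, so that saving_model is the classifier object):

    import pickle

    # Reload the two artifacts written by Text_categorization-tests.py above.
    f = open("./model_info.pickle", "rb")
    classifier = pickle.load(f)      # trained SklearnClassifier(LogisticRegression())
    f.close()

    f = open("./word_features.pickle", "rb")
    word_features = pickle.load(f)   # feature vocabulary consumed by find_features()
    f.close()

    print type(classifier)
    print len(word_features)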
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Text_categorization/Text_categorization.py

r16391 r16392

@@ -9,4 +9,6 @@
 import nltk
 import random
+import pickle
+import os
 import collections
 from nltk.metrics import precision, recall, f_measure

@@ -21,5 +23,4 @@
 from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
 from sklearn import tree
-from statistics import mode
 import string
 from datetime import datetime, time

@@ -27,117 +28,26 @@
 
 def text_cat(description):
-        # Step 0: Define parameters
+        classifier = ""
+        words = []
+        word_features = []
 
-        # Length of test set
-        number_testing = 500
-        # Number of most common words used for classifier
-        n_common_words = 5000
-        # Number of most informative features
-        n_most_informative_features = 25
+        try:
+                filename, file_extension = os.path.splitext(os.path.realpath(__file__))
+                fn = os.path.dirname(os.path.abspath(os.path.join(filename, os.pardir)))
 
-        # Stemmer to all words
+                f = open(fn+"/Text_categorization/model_info.pickle", "rb")
+                classifier = pickle.load(f)
+                f.close()
+                f = open(fn+"/Text_categorization/word_features.pickle", "rb")
+                word_features = pickle.load(f)
+                f.close()
+        except:
+                print "Serialized objects not found"
+                exit(0)
         stemmer = PorterStemmer()
-
         punctuations = list(string.punctuation)
         punctuations.append("''")
         punctuations.append("--")
 
-        # Step 1: Get the Content and label it
-        filename, file_extension = os.path.splitext(os.path.realpath(__file__))
-        fn = os.path.dirname(os.path.abspath(os.path.join(filename, os.pardir)))
-        print fn
-        db = sqlite3.connect(fn+'/API to download database/app_info_non_explicit.db')
-        c = db.cursor()
-
-        db2 = sqlite3.connect(fn+'/API to download database/app_info_explicit.db')
-        c2 = db2.cursor()
-
-        explicit_content = []
-        non_explicit_content = []
-        documents = []
-        exp_size = 0
-        non_size = 0
-
-        c2.execute(''' SELECT description FROM app_data WHERE majority=1 ''')
-
-        for d in c2.fetchall():
-                explicit_content.append(d[0])
-                documents.append((d[0],'exp'))
-                exp_size+=1
-
-        c.execute(''' SELECT description FROM app_data WHERE majority=1''')
-
-        for d in c.fetchall():
-                explicit_content.append(d[0])
-                documents.append((d[0],'exp'))
-                exp_size += 1
-
-
-        c.execute(''' SELECT description FROM app_data WHERE majority=0''')
-
-        for d in c.fetchall():
-                non_explicit_content.append(d[0])
-                documents.append((d[0],'non'))
-                non_size += 1
-
-                #if non_size==exp_size:
-                #       break
-
-        c2.execute(''' SELECT description FROM app_data WHERE majority=0 ''')
-
-        for d in c2.fetchall():
-                non_explicit_content.append(d[0])
-                documents.append((d[0],'non'))
-                non_size += 1
-
-        print "Explicit descriptions: "+str(exp_size)
-        print "Non-Explicit descriptions: "+str(non_size)
-
-        db.close()
-        db2.close()
-
-        print "Pre-Processing..."
-
-        # Step 2: Tokenize words
-        print "Tokenizing..."
-        now = datetime.now()
-        explicit_content_words = [word_tokenize(w) for w in explicit_content]
-        non_explicit_content_words = [word_tokenize(w) for w in non_explicit_content]
-
-        print str(datetime.now()-now)
-
-        # Step 3: Append all words (lower)
-
-        all_words = []
-        d = []
-        print "Appending all words..."
-        now = datetime.now()
-        for w in explicit_content_words:
-                for x in w:
-                        if x not in stopwords.words('english') and x not in punctuations:
-                                all_words.append(stemmer.stem(x.lower()))
-
-        for w in non_explicit_content_words:
-                for x in w:
-                        if x not in stopwords.words('english') and x not in punctuations:
-                                all_words.append(stemmer.stem(x.lower()))
-
-
-        print str(datetime.now()-now)
-
-        print "Creating a frequency distribution..."
-        now = datetime.now()
-        # Step 4: Get FreqDist
-        all_words = nltk.FreqDist(all_words)
-        print str(datetime.now()-now)
-        print all_words.most_common(25)
-
-        print "Get the n most common features..."
-        now = datetime.now()
-        # Step 5: Get n common words as features
-        word_features = list(all_words.keys())[:n_common_words]
-        #print word_features
-
-        # Step 6: Check if it finds features in words
         def find_features(document):
                 words = word_tokenize(document)

@@ -145,7 +55,4 @@
                 words = [stemmer.stem(w.lower()) for w in words if not w in stopwords.words('english') and w not in punctuations]
 
-                # Later try to add size of description, category and title
-                # Also try to add bigrams and trigrams with and without stop words
-                # Careful so it doesn't overfit
                 features = {}
                 for w in word_features:

@@ -154,33 +61,9 @@
                 return features
 
-        print str(datetime.now()-now)
-
-        print "Create a feature set..."
-        now = datetime.now()
-        # Step 7: Get feature set
-        featuresets = [(find_features(rev), category) for (rev, category) in documents]
-
-        # Step 8: Shuffle feature sets
-        random.shuffle(featuresets)
-
-        # Step 9: Create training set and testing set from feature sets
-        training_set = featuresets[:exp_size+non_size-number_testing]
-        #print training_set
-        testing_set = featuresets[exp_size+non_size-number_testing:]
-        #print testing_set
-        print str(datetime.now()-now)
-
         d = find_features(description)
-
-
-        # Step 10: With the original Naive Bayes, print Classification. Try with others classifiers
-
-        print "Training..."
-
-        classifier = SklearnClassifier(LogisticRegression()).train(training_set)
+
         return classifier.prob_classify(d)
 
-
-        #classifier.show_most_informative_features(n_most_informative_features)
 if __name__=="__main__":
-        text_cat("Are you looking for Porn Movies ? If yes, you are at right place. Enjoy porn movies. You can download porn movies also to enjoy later. Tag: Porn, Porno, Sex, Adult, Downloader, sex, xxx, xx")
+        dist = text_cat("Are you looking for Porn Movies ? If yes, you are at right place. Enjoy porn movies. You can download porn movies also to enjoy later. Tag: Porn, Porno, Sex, Adult, Downloader, sex, xxx, xx")
+        print dist.prob('exp')
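
With this change text_cat() no longer trains at call time: it unpickles the model and vocabulary once, builds a feature dict for the single description, and returns the distribution from prob_classify. A usage sketch (the import path is an assumption; the 'exp'/'non' labels come from the removed training code above):

    # Assumed import path; adjust to how the package sits on sys.path.
    from Text_categorization import text_cat

    dist = text_cat("some app store description")  # NLTK probability distribution
    print dist.max()         # most likely label: 'exp' or 'non'
    print dist.prob('exp')   # probability the description is explicit
    print dist.prob('non')   # probability it is safe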
  • internals/2016/aptoideimagesdetector/trunk/Source Code/analyse_app.py

r16391 r16392

@@ -12,27 +12,64 @@
 def analyse_app(illust2vec, icons, screenshots, description):
 
-        images = []
+        try:
+                f = open("./model_apps_info.pickle", "rb")
+                classifier = pickle.load(f)
+                f.close()
+        except:
+                print "Serialized objects not found"
+                exit(0)
 
+        icons = []
+        screens = []
         for icon in icons:
-                images.append(analyse_explicit(illust2vec, icon))
+                icons.append(analyse_explicit(illust2vec, icon))
 
         for scr in screenshots:
-                images.append(analyse_explicit(illust2vec, scr))
+                screens.append(analyse_explicit(illust2vec, scr))
 
-        if len(images)==0:
-                return 0
-        else:
-                print images
+        description = text_cat(description)
+
+        def find_features(icon_list, scr_list, description_result):
+                features = {}
                 maximum = 0
-                for im in images:
-                        if im[1]>maximum:
-                                maximum = im[1]
+                flag = 0
+                safe = 0
+                if len(icon_list)==0:
+                        maximum = 0.5
+                        safe = 0.5
+                else:
+                        for icon in icon_list:
+                                for data in icon:
+                                        if data[0]=='explicit' and data[1]>maximum:
+                                                maximum = data[1]
+                                                flag = 1
+                                        if data[0]=='safe' and flag==1:
+                                                safe = data[1]
+                                                flag = 0
+                maximum_s = 0
+                flag = 0
+                safe_s = 0
 
-                print maximum
+                if len(scr_list)==0:
+                        maximum_s = 0.5
+                        safe_s = 0.5
+                else:
+                        for scr in scr_list:
+                                for data in scr:
+                                        if data[0]=='explicit' and data[1]>maximum_s:
+                                                maximum_s = data[1]
+                                                flag = 1
+                                        if data[0]=='safe' and flag==1:
+                                                safe_s = data[1]
+                                                flag = 0
 
-                prob2 = text_cat(description)
-                print prob2.prob('exp')
-                return 0.7*prob2+0.3*prob1
-
+                features['ic_exp'] = maximum
+                features['ic_non'] = safe
+                features['sc_exp'] = maximum_s
+                features['sc_non'] = safe_s
+                features['desc_exp'] = description_result.prob('exp')
+                return features
+        features = find_features(icons, screens, description)
+        return classifier.prob_classify(features)
 
 if __name__=='__main__':

@@ -53,7 +90,8 @@
         description = ""
 
-        icons.append("./API to download database/images/Explicit search/icons_explicit/7883823.png")
-        screenshots.append("./API to download database/images/Explicit search/screenshot_explicit/18966461_3_hd.jpg")
+        icons.append('./API to download database/images/Explicit search/icons_explicit/7677797.png')
+        screenshots.append('./API to download database/images/Explicit search/screenshot_explicit/4694725_8_hd.png')
         description = "Are you looking for Porn Movies ? If yes, you are at right place. Enjoy porn movies. You can download porn movies also to enjoy later. Tag: Porn, Porno, Sex, Adult, Downloader, sex, xxx, xx"
-
-        print analyse_app(illust2vec, icons, screenshots, description)
+        dist = analyse_app(illust2vec, icons, screenshots, description)
+        print dist.prob('exp')
+
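
analyse_app() now reduces everything to a flat feature dict (the highest icon and screenshot 'explicit' scores with their paired 'safe' scores, plus the text probability) and delegates to the meta-classifier unpickled from model_apps_info.pickle. Note that the committed body rebinds the icons and screenshots parameters to [] before iterating over them, so the two image loops can never run; the sketch below uses non-shadowing names. Feature values here are invented for illustration:

    import pickle

    # Load the meta-classifier persisted elsewhere as model_apps_info.pickle.
    f = open("./model_apps_info.pickle", "rb")
    classifier = pickle.load(f)
    f.close()

    # Shape of the dict expected by classifier.prob_classify(); values are made up.
    features = {
        'ic_exp': 0.91,    # highest 'explicit' score over all icons
        'ic_non': 0.09,    # 'safe' score paired with that maximum
        'sc_exp': 0.87,    # highest 'explicit' score over all screenshots
        'sc_non': 0.13,    # paired 'safe' score
        'desc_exp': 0.95,  # text_cat() probability of the 'exp' label
    }

    dist = classifier.prob_classify(features)
    print dist.prob('exp')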