Changeset 16331


Timestamp: Jul 18, 2016, 11:45:57 AM
Author: dferreira
Message: Initial tests updated

File: 1 edited

Legend:

  ' '  Unmodified
  '+'  Added
  '-'  Removed
  • internals/2016/aptoideimagesdetector/trunk/Source Code/Linguage Extractor/Initial tests/nltk test2.py

--- nltk test2.py (r16325)
+++ nltk test2.py (r16331)

@@ -1,9 +1,17 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Diogo Ferreira
+# Aptoide, 2016
+# Initial tests for nltk library
 
 from __future__ import division
+import sys
 import nltk
 import random
 from nltk.corpus import movie_reviews
 from nltk.classify.scikitlearn import SklearnClassifier
+from nltk.tokenize import word_tokenize
 import pickle
+from unidecode import unidecode
 
 from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
@@ -13,4 +21,5 @@
 from nltk.classify import ClassifierI
 from statistics import mode
+
 
 class VoteClassifier(ClassifierI):
@@ -35,18 +44,36 @@
                 return conf
 
+short_pos = open("short_reviews/positive.txt","r").read()
+short_neg = open("short_reviews/negative.txt","r").read()
+
 documents = []
 
+for r in short_pos.split('\n'):
+        documents.append((r,"pos"))
+
+for r in short_neg.split('\n'):
+        documents.append((r,"neg"))
+
 # Saves a list of (words in movie_reviews, category(positive or negative))
-for category in movie_reviews.categories():
-        for fileid in movie_reviews.fileids(category):
-                documents.append((list(movie_reviews.words(fileid)), category))
+#for category in movie_reviews.categories():
+#       for fileid in movie_reviews.fileids(category):
+#               documents.append((list(movie_reviews.words(fileid)), category))
 
-random.shuffle(documents)
+#random.shuffle(documents)
 
 all_words = []
 
+short_pos_words = word_tokenize(short_pos.decode('utf8'))
+short_neg_words = word_tokenize(short_neg.decode('utf8'))
+
+for w in short_pos_words:
+        all_words.append(w.lower())
+
+for w in short_neg_words:
+        all_words.append(w.lower())
+
 # Saves all words in reviews
-for w in movie_reviews.words():
-        all_words.append(w.lower())
+#for w in movie_reviews.words():
+#       all_words.append(w.lower())
 
 all_words = nltk.FreqDist(all_words)
@@ -54,9 +81,9 @@
 
 # 3000 most common words
-word_features = list(all_words.keys())[:3000]
+word_features = list(all_words.keys())[:5000]
 
 # Check if it finds features in words
 def find_features(document):
-        words = set(document)
+        words = word_tokenize(document)
         features = {}
         for w in word_features:
@@ -69,6 +96,10 @@
 featuresets = [(find_features(rev), category) for (rev, category) in documents]
 
-training_set = featuresets[:1900]
-testing_set = featuresets[1900:]
+random.shuffle(featuresets)
+
+training_set = featuresets[:10000]
+testing_set = featuresets[10000:]
+
+
 
 # posterior = prior ocurrences * likelihood/evidence
@@ -129,8 +160,8 @@
 print "Voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100
 
-print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
-print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
-print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
-print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
-print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
-print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
+#print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100
+#print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100
+#print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100
+#print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100
+#print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100
+#print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100
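
The body of VoteClassifier is elided from the hunks above; only its final "return conf" appears at line 44. A minimal sketch of how such a mode-based voting wrapper is commonly written around the imports that do appear in the diff (ClassifierI, statistics.mode); the method bodies below are an illustration, not the file's actual code.

from __future__ import division

from nltk.classify import ClassifierI
from statistics import mode


class VoteClassifier(ClassifierI):
        # Holds any number of already-trained classifiers and lets them vote.
        def __init__(self, *classifiers):
                self._classifiers = classifiers

        # The label is the most common vote among the wrapped classifiers.
        def classify(self, features):
                votes = [c.classify(features) for c in self._classifiers]
                return mode(votes)

        # Confidence is the fraction of classifiers that voted for the winner.
        def confidence(self, features):
                votes = [c.classify(features) for c in self._classifiers]
                choice_votes = votes.count(mode(votes))
                conf = choice_votes / len(votes)
                return conf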
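
The loop body of find_features is likewise cut off after "for w in word_features:" (line 89). The usual completion marks, for each of the 5000 candidate words, whether it occurs in the tokenized review; the boolean-membership line below is an assumption.

def find_features(document):
        words = word_tokenize(document)
        features = {}
        for w in word_features:
                # Assumed completion: boolean presence of each candidate word.
                features[w] = (w in words)
        return features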
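
The classifier-training code between the train/test split and the accuracy print (new lines 106-159) is outside the changed hunks. A sketch of how the imported SklearnClassifier wrappers are typically trained and combined into voted_classifier; the particular choice of MultinomialNB and BernoulliNB here is an illustrative assumption, since only the imports and voted_classifier are visible in this changeset.

# Illustrative only: the classifiers actually trained in the elided part of
# the file are not shown in this changeset.
classifier = nltk.NaiveBayesClassifier.train(training_set)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)

# Combine the trained classifiers; voted_classifier is what the accuracy
# print at the end of the file evaluates on testing_set.
voted_classifier = VoteClassifier(classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier)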