source: internals/2016/aptoideimagesdetector/trunk/Source Code/Linguage Extractor/Initial tests/NLTK tests.py @ 16323

Last change on this file since 16323 was 16323, checked in by dferreira, 3 years ago

Initial language tests

File size: 4.7 KB
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords, state_union, wordnet, movie_reviews
from nltk.stem import PorterStemmer, WordNetLemmatizer
import random


# Video 1 - Tokenization

# tokenizing - word tokenizers... sentence tokenizers
# lexicon and corpora
# corpora - bodies of text, e.g. medical journals, presidential speeches, the English language
# lexicon - words and their meanings

example_text = "Hello Mr. Smith, how are you doing today? The weather is great and Python is awesome. It is great to work at Aptoide. You should not eat cardboard."

print(sent_tokenize(example_text))

print(word_tokenize(example_text))

###########################################################
# Video 2 - Stop Words

example_sentence = "This is an example showing off stop word filtration."
# Language can be changed
stop_words = set(stopwords.words("english"))

words = word_tokenize(example_sentence)

filtered_sentence = []

for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

# or: filtered_sentence = [w for w in words if w not in stop_words]

print(filtered_sentence)
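
# The list above is English; the NLTK stopwords corpus bundles lists for
# other languages as well ("portuguese" is among them), so switching
# language is a one-argument change:
print(stopwords.words("portuguese")[:10])
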
#############################################################
# Video 3 - Stemming

ps = PorterStemmer()

example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

for w in example_words:
    print(ps.stem(w))

new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

#############################################################
# Video 4/5/6 - Part of Speech Tagging/Chunking/Chinking

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            # print(tagged)

            # chunkGram = r"""Chunk: {<RB.?>*<VB.*>*<NNP>+<NN>?}"""
            # chunkGram = r"""Chunk: {<.*>+}
            #                        }<VB.?|IN|DT|TO>+{"""
            # chunkParser = nltk.RegexpParser(chunkGram)
            # chunked = chunkParser.parse(tagged)

            # chunked.draw()

    except Exception as e:
        print(str(e))

#process_content()
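
# A runnable sketch of the chunking step commented out above, using the
# first chunkGram from those comments. Printing the parse tree avoids the
# GUI window that chunked.draw() would open. (chunk_demo is a new helper
# added for illustration, not part of the original tutorial code.)
def chunk_demo():
    chunk_gram = r"""Chunk: {<RB.?>*<VB.*>*<NNP>+<NN>?}"""
    chunk_parser = nltk.RegexpParser(chunk_gram)
    for i in tokenized[:3]:
        tagged = nltk.pos_tag(nltk.word_tokenize(i))
        print(chunk_parser.parse(tagged))

#chunk_demo()
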
#############################################################
# Video 7 - Named Entity Recognition

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            namedEnt = nltk.ne_chunk(tagged, binary=True)

            namedEnt.draw()
    except Exception:
        pass

#process_content()
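
# namedEnt.draw() opens a Tk window per sentence; a console-friendly sketch
# instead (print_named_entities is a new helper, not in the original): with
# binary=True, ne_chunk labels every entity subtree "NE", so the entities
# can simply be printed.
def print_named_entities():
    for i in tokenized[5:8]:
        tagged = nltk.pos_tag(nltk.word_tokenize(i))
        named_ent = nltk.ne_chunk(tagged, binary=True)
        for subtree in named_ent.subtrees(lambda t: t.label() == "NE"):
            print(" ".join(word for word, tag in subtree.leaves()))

#print_named_entities()
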
#############################################################
# Video 8 - Lemmatizing

lemmatizer = WordNetLemmatizer()
# print(lemmatizer.lemmatize("better", pos="a"))
# print(lemmatizer.lemmatize("cats"))
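
# The same calls, active, with expected outputs noted. lemmatize() defaults
# to pos="n" (noun), which is why "better" needs pos="a" to map to "good".
print(lemmatizer.lemmatize("cats"))             # cat
print(lemmatizer.lemmatize("better", pos="a"))  # good
print(lemmatizer.lemmatize("ran", pos="v"))     # run
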
#############################################################
# Video 10 - WordNet

syns = wordnet.synsets("program")

# synset
print(syns[0].name())

# just the word
print(syns[0].lemmas()[0].name())

# definition
print(syns[0].definition())

# examples
print(syns[0].examples())

synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))

w1 = wordnet.synset("boy.n.01")
w2 = wordnet.synset("girl.n.01")
print(w1.wup_similarity(w2))
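
# wup_similarity (Wu-Palmer) is only one of WordNet's similarity measures;
# as a point of comparison, path_similarity scores the same pair by the
# shortest hypernym/hyponym path between the two synsets:
print(w1.path_similarity(w2))
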
#############################################################
# Video 11/12 - Text Classification/Words as Features for Learning

# documents = [(list(movie_reviews.words(fileid)), category)
#              for category in movie_reviews.categories()
#              for fileid in movie_reviews.fileids(category)]

documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

random.shuffle(documents)

# print(documents[1])

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
# print(all_words.most_common(15))

# FreqDist.keys() is not sorted by frequency, so slice most_common() to get
# the 3000 most frequent words as features
word_features = [w for w, count in all_words.most_common(3000)]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
featuresets = [(find_features(rev), category) for (rev, category) in documents]
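
# featuresets is built but never used above. A minimal sketch of the usual
# next step (a plain train/test split with NLTK's built-in Naive Bayes
# classifier; the 1900-document split is an assumption, not from this file):
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes accuracy:", nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(15)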