
Commit c720d55

Adding a simple webservice
1 parent 8682dc2 commit c720d55

7 files changed: +213 -3 lines changed


ldaops.py

Lines changed: 172 additions & 0 deletions

@@ -0,0 +1,172 @@
#!/usr/bin/python
__author__ = 'Ashwin'
__email__ = 'gashwin1@umbc.edu'

"""
CMSC 691 - Final Project
Visualizing Topics using LDA

AUTHORS: Ashwinkumar Ganesan,
         Kiante Branley

Latent Dirichlet Allocation is a topic modelling method that calculates
the joint probabilities of words across documents and generates two
distributions:
1. The distribution of topics for each document.
2. The distribution of words for each topic.

This project tries to visualize the topics, the documents and the
clusters that are generated.
"""

"""
E.g. for other file types:

# Read the directory.
reader = FileReader()
reader.read_file(data_loc)
reader.read_dir(data_dir_loc)
reader.read_text_sections(data_sects)
"""

# File operations.
from processdata.fileops import FileReader
from processdata import preprocess

# LDA operations.
from processdata.lda import LDAVisualModel
from processdata.fileops import write_rank_to_file
from processdata.fileops import write_prob_to_file
from processdata.fileops import write_top_hier_to_file
from processdata.fileops import write_top_word_to_file
from processdata.fileops import write_top_word_matrix_to_file


def perform_lda():

    # Define the parameters.
    num_of_words = 20       # 5792 words in total in our dataset.
    num_of_topics = 20
    para_alpha = 1.0 / num_of_topics  # Alternatives: 'symmetric' (default), 'asymmetric', 'auto'.
    num_of_passes = 1       # Default is 1; 10 is much worse.
    num_of_updates = 1      # Default is 1 in LDA; 50 gives too many repeated and one-document words after re-ranking.
    num_of_trunks = 100     # Default is 2000; 200 is good. Large --> too many repeated and one-document words after re-ranking; small --> fewer repeats, but the words become too general.
    num_of_iterations = 10  # Default is 50.
    num_of_eval = 10        # Default is 10.
    para_gamma_t = 0.001    # Default is 0.001.
    para_kappa = 0.5        # Default is 0.5.
    para_tao = 1.0          # Default is 1.0.
    para_eta = None         # Default is None.
    min_df_cutoff = 0.3

    # Location of the data.
    data_loc = '20_newsgroups/alt.temp/9976'
    # data_dir_loc = '../20news/20_newsgroups/alt.atheism'  # To use a directory, swap in the commented read calls below.
    data_sects = 'server/data/TVCG_abstract.csv'
    title_list = 'server/data/TVCG_title.csv'

    # List of files that are written to.
    prob_data_file = 'server/data/TVCG_prob.csv'
    rank_data_file = 'server/data/TVCG_rank.csv'
    top_hier_data_file = 'server/data/top_hier_data.json'
    top_word_data_file = 'server/data/TVCG_top_words.txt'
    re_top_word_data_file = 'server/data/TVCG_re_rank_top_words.txt'
    top_word_matrix_file = 'server/data/TVCG_top_words_matrix.csv'
    # s_words = ['data', 'visualization', 'visual', 'approach', 'analysis', 'study', 'techniques',
    #            'interactive', 'results', 'design', 'paper', 'user', 'information', 'based', 'system',
    #            'present', 'time', 'different', 'use', 'using', 'used']

    # Read the file and build the TF-IDF-based stop-word lists.
    tfidf_tokenizer = FileReader()
    tfidf_tokenizer.read_file_text(data_sects)
    # tfidf_tokenizer.read_data_dir_loc(data_dir_loc)

    # Get the token list.
    raw_text = tfidf_tokenizer.get_token_list()
    [tfidf, tfs] = preprocess.vectorize(raw_text, min_df_cutoff)
    s_words = preprocess.build_stop_word_list(tfidf, tfs)
    e_words = preprocess.get_stop_words(tfidf)

    # Get the set of words.
    # Read the file again for LDA tokens.
    lda_tokenizer = FileReader()
    lda_tokenizer.read_text_sections(data_sects, s_words, e_words)
    # lda_tokenizer.read_text_sections_dir(data_dir_loc, s_words, e_words)
    word_corpus = lda_tokenizer.get_token_list()  # Token list with repeated words.

    # word_corpus = preprocess.singularize(word_corpus)
    [word_corpus, stem_relation] = preprocess.stem(word_corpus)
    stem_map = preprocess.gen_stem_map(stem_relation)  # Dictionary that maps stems back to words.

    # Perform LDA.
    lda = LDAVisualModel(word_corpus)    # Builds token2id.
    lda.create_word_corpus(word_corpus)  # Creates mm: one list of (token_id, token_count) pairs per document.

    # Train the LDA model for a specific number of topics and iterations.
    lda.train_lda(num_of_topics, para_alpha, num_of_passes, num_of_updates,
                  num_of_trunks, num_of_iterations, num_of_eval,
                  para_gamma_t, para_kappa, para_tao, para_eta)
    # lda.lda.expElogbeta is a num_topics x num_corpus_words matrix.
    # lda.id2word.id2token is available because lda.id2word is passed in during training.

    # C-LDA
    # must_link = tfidf_tokenizer.read_must_link(must_link_file)
    # cannot_link = tfidf_tokenizer.read_cannot_link(cannot_link_file)
    # lda.train_clda(num_of_topics, must_link, cannot_link, num_of_passes, num_of_updates,
    #                num_of_trunks, num_of_iterations, num_of_eval,
    #                para_gamma_t, para_kappa, para_tao, para_eta)

    lda.id2word.id2token = preprocess.inverse_stem(lda.id2word.id2token, stem_map)
    # Only lda.id2word.id2token needs cleaning; lda.id2word.token2id and
    # lda.lda.id2word do not.

    # Generate the matrices.
    matrix_doc_top = lda.generate_doc_topic()
    doc_top_rank = lda.generate_doc_topic_rank()
    # Uses: for idx, doc in enumerate(self.lda[self.mm])
    # to generate num_doc lists, each holding pairs of
    # (top_idx, top_prob), e.g. (2, 0.0182), (6, 0.0155).

    matrix_top_word = lda.lda.expElogbeta
    matrix_doc_word = lda.generate_doc_word(matrix_doc_top)
    [top_word, top_word_index] = lda.generate_top_words(num_of_words)

    # Get the topic corpus.
    topics = lda.get_lda_corpus(num_of_topics, num_of_words)
    # The top words for each topic; each topic has
    # num_of_words (20) pairs like ('0.023', 'network').

    # Isolate top words for documents.
    doc_to_word = lda.gen_doc_top_words(topics, matrix_doc_top)
    # The top words for each document; each document has
    # num_of_words (20) pairs like ('0.023', 'network').

    # Generate the topic hierarchy.
    top_hier = lda.gen_topic_hierarchy(topics)
    top_word_o = lda.gen_topic_words(topics)

    # Evaluation.
    doc_word_index_set = lda.doc_word_index_set()
    top_coherence = lda.topic_coherence(top_word_index, num_of_words, doc_word_index_set)
    # Needs the indices of the first 20 words, which select the
    # corresponding columns of the doc_word_times matrix.
    coherence = sum(top_coherence) / num_of_topics
    perplexity = lda.perplexity(matrix_doc_word, doc_word_index_set)

    # Print the topic information to the output files.
    write_prob_to_file(doc_to_word, matrix_doc_top, num_of_words, num_of_topics, title_list, prob_data_file)
    write_rank_to_file(doc_to_word, doc_top_rank, num_of_words, num_of_topics, title_list, rank_data_file)
    write_top_hier_to_file(top_hier, top_hier_data_file)
    write_top_word_to_file(top_word_o, top_word_data_file, top_coherence, coherence, perplexity)
    write_top_word_matrix_to_file(lda.id2word.id2token, matrix_top_word, top_word_matrix_file)

    # Re-rank the topic keywords (second argument: 1 = SUM, 2 = LOG).
    re_rank_top_word = lda.top_word_re_rank(num_of_words, 2)

    # Print the re-ranked topic keywords.
    write_top_word_to_file(re_rank_top_word, re_top_word_data_file, top_coherence, coherence, perplexity)


if __name__ == "__main__":
    perform_lda()
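The two distributions described in the module docstring map directly onto gensim's LdaModel API. A minimal sketch, assuming gensim is installed and using a toy corpus in place of the project's TVCG abstracts:

# Sketch: the two LDA distributions on a toy corpus (not the project data).
from gensim import corpora, models

docs = [['graph', 'layout', 'network'],
        ['volume', 'rendering', 'transfer'],
        ['graph', 'network', 'cluster']]
id2word = corpora.Dictionary(docs)
mm = [id2word.doc2bow(doc) for doc in docs]
toy_lda = models.LdaModel(corpus=mm, id2word=id2word, num_topics=2, passes=5)

# Distribution 1: topics per document, as (topic_id, probability) pairs.
print(toy_lda.get_document_topics(mm[0]))

# Distribution 2: words per topic, as (word, probability) pairs.
print(toy_lda.show_topic(0, topn=3))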

mainTVCG.py

File mode changed: 100644 -> 100755
Lines changed: 2 additions & 1 deletion

@@ -103,9 +103,10 @@
 # and iterations.
 lda.train_lda(num_of_topics, para_alpha, num_of_passes, num_of_updates,
               num_of_trunks, num_of_iterations, num_of_eval,
-              para_gamma_t, para_kappa, para_tao, para_eta) # id2token, lda.lda
+              para_gamma_t, para_kappa, para_eta) # id2token, lda.lda
 # Get lda.lda.expElogbeta: (matrix) num_topic * num_corpus_word
 # Get lda.id2word.id2token because we feed in lda.id2word when training
+#TODO: para_tao has been removed from the 2nd last position temporarily. Add it back later.

 # C-LDA
 # must_link = tfidf_tokenizer.read_must_link(must_link_file)

processdata/fileops.py

Lines changed: 5 additions & 1 deletion

@@ -163,6 +163,7 @@ def read_dir(self, file_dir_name):
     def get_token_list(self):
         return self.token_list

+
 # This function writes to a CSV file.
 # The file contains the probability of each topic.
 def write_prob_to_file(doc_to_word, doc_top, num_of_words, num_topics, t_file, filename):
@@ -178,7 +179,7 @@ def write_prob_to_file(doc_to_word, doc_top, num_of_words, num_topics, t_file, f
     for idx, doc in enumerate(doc_top):

         # This is the name of the document.
-        doc_string = csvreader.__next__()[0]
+        doc_string = csvreader.next()[0]

         # Write the title to the document.
         col_string += "\"TITLE: " + doc_string + " WORDS: "
@@ -238,6 +239,7 @@ def write_top_hier_to_file(top_hier, filename):
     with open(filename, "w") as file_ptr:
         json.dump(top_hier, file_ptr)

+
 def write_top_word_to_file(top_word, filename, top_coherence, coherence, perplexity):
     write_string = "Coherence: %f Perplexity: %f \n" % (coherence, perplexity)
@@ -250,6 +252,7 @@ def write_top_word_to_file(top_word, filename, top_coherence, coherence, perplex
     with open(filename, "w") as file_ptr:
         file_ptr.write(write_string)

+
 def write_top_word_matrix_to_file(word_dict, matrix, filename):
     write_string = ","
     for idx in word_dict:
@@ -265,6 +268,7 @@ def write_top_word_matrix_to_file(word_dict, matrix, filename):
     with open(filename, "w") as file_ptr:
         file_ptr.write(write_string)

+
 def write_evaluation_matrix_to_file(coherences, perplexities, filename):
     write_string = ""
     for coherence in coherences:
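Changing __next__() to next() pins the CSV iteration to Python 2, matching the project's #!/usr/bin/python shebang. A version-agnostic alternative is the builtin next(); a minimal sketch, with a hypothetical 'titles.csv' standing in for the t_file argument:

# Sketch: reading one CSV row in a way that runs on Python 2 and 3.
# 'titles.csv' is a hypothetical stand-in for the t_file argument.
import csv

with open('titles.csv') as t_ptr:
    csvreader = csv.reader(t_ptr)
    # The builtin next() dispatches to .next() on Python 2 and
    # .__next__() on Python 3, so the same line works under both.
    doc_string = next(csvreader)[0]
    print(doc_string)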

processdata/fileops.pyc

2.37 KB
Binary file not shown.

processdata/lda.py

Lines changed: 4 additions & 1 deletion

@@ -10,6 +10,7 @@
 import numpy  # for top_word re-rank
 import scipy  # for gmean

+
 class LDAVisualModel:
     def __init__(self, word_corpus, scaling_factor=100000):
         """
@@ -47,12 +48,14 @@ def train_lda(self, num_top=10, alpha1='symmetric', num_pass=1, update_t=1, chun
                                passes=num_pass, update_every=update_t,
                                chunksize=chunks, iterations=num_iter,
                                eval_every=num_eval, gamma_threshold=gamma_t,
-                               decay=kappa, offset=tao, eta=eta)
+                               decay=kappa, eta=eta)
         # (corpus=None, num_topics=100, id2word=None, distributed=False,
         #  chunksize=2000, passes=1, update_every=1, alpha='symmetric',
         #  eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
         #  gamma_threshold=0.001)

+    # TODO: offset=tao has been removed temporarily. Restore it when the error is resolved.
+
     def get_lda_corpus(self, num_of_topics=10, num_of_words=10):
         """
         Get the topic associated with each document.
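For context, in gensim's online LDA (whose default signature is quoted in the comment above) decay and offset correspond to the kappa and tau_0 of Hoffman et al.'s update weight rho_t = (tau_0 + t)^(-kappa). A sketch of the call with the offset restored, assuming the attributes and argument names visible in train_lda (self.mm, self.id2word, etc.):

# Sketch: the gensim call with offset=tao restored, reusing train_lda's
# parameter names. Each online update is weighted roughly by
# rho_t = (tao + t) ** (-kappa), so a larger offset damps early updates.
import gensim

lda_model = gensim.models.LdaModel(corpus=self.mm, num_topics=num_top,
                                   id2word=self.id2word, alpha=alpha1,
                                   passes=num_pass, update_every=update_t,
                                   chunksize=chunks, iterations=num_iter,
                                   eval_every=num_eval, gamma_threshold=gamma_t,
                                   decay=kappa, offset=tao, eta=eta)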

processdata/lda.pyc

4.9 KB
Binary file not shown.

webservice.py

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
#!/usr/bin/env python
"""
A simple web service that handles requests from the Javascript
front end and performs LDA.

Uses the Flask package.
"""
__author__ = 'ashwin'

from flask import Flask
from ldaops import perform_lda

app = Flask(__name__)

welcome_message = ("<html><body><p>REST API for LDAExplore</p>\n"
                   "Use: /lda to perform LDA.</body></html>")


# A standard welcome message.
@app.route('/')
def api_root():
    return welcome_message


# API to perform LDA.
@app.route('/lda')
def api_lda():
    perform_lda()
    # A Flask view must return a response; report completion once LDA finishes.
    return "LDA complete."


if __name__ == "__main__":
    app.run()
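A quick way to exercise both routes without starting a server process is Flask's built-in test client; a minimal sketch, assuming webservice.py is importable and the data files that perform_lda() reads exist under server/data/:

# Sketch: exercising the two routes with Flask's test client.
# Assumes webservice.py imports cleanly and server/data/ holds the
# CSV inputs that perform_lda() reads.
from webservice import app

with app.test_client() as client:
    print(client.get('/').data)     # The welcome message.
    print(client.get('/lda').data)  # Runs the LDA pipeline, then responds.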
