#!/usr/bin/python
__author__ = 'Ashwin'
__email__ = 'gashwin1@umbc.edu'

"""
CMSC 691 - Final Project
Visualizing Topics using LDA

AUTHORS: Ashwinkumar Ganesan
         Kiante Branley

Latent Dirichlet Allocation is a topic modelling method that estimates
the joint probabilities of words across documents and generates two
distributions:
1. The distribution of topics for each document.
2. The distribution of words for each topic.

This project visualizes the topics, the documents and the clusters
that are generated.
"""
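
# As a sketch of those two outputs: with D documents, T topics and a
# V-word vocabulary, LDA yields a D x T document-topic matrix and a
# T x V topic-word matrix, each row of which sums to one.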

"""
Example usage for other file types:

# Read a single file, a directory, or CSV text sections.
reader = FileReader()
reader.read_file(data_loc)
reader.read_dir(data_dir_loc)
reader.read_text_sections(data_sects)
"""
| 31 | + |
| 32 | +# File Operations |
| 33 | +from processdata.fileops import FileReader |
| 34 | +from processdata import preprocess |
| 35 | + |
| 36 | +# LDA Operations. |
| 37 | +from processdata.lda import LDAVisualModel |
| 38 | +from processdata.fileops import write_rank_to_file |
| 39 | +from processdata.fileops import write_prob_to_file |
| 40 | +from processdata.fileops import write_top_hier_to_file |
| 41 | +from processdata.fileops import write_top_word_to_file |
| 42 | +from processdata.fileops import write_top_word_matrix_to_file |
| 43 | + |
| 44 | + |
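# Pipeline: read the abstracts, derive corpus-specific stop words via
# tf-idf, tokenize and stem, build the bag-of-words corpus, train LDA,
# derive the document/topic/word matrices and the topic hierarchy,
# evaluate (coherence and perplexity), and write the results for the
# visualization server.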
def perform_lda():

    # Define the parameters.
    num_of_words = 20       # The dataset has 5792 words in total.
    num_of_topics = 20
    para_alpha = 1.0 / num_of_topics  # Float, so alpha is not floored to 0 under Python 2. Alternatives: 'symmetric' (the default), 'asymmetric', 'auto'.
    num_of_passes = 1       # Default is 1; 10 performs much worse.
    num_of_updates = 1      # Default is 1 in LDA; 50 gives too many repeated words, and one-document words after re-ranking.
    num_of_trunks = 100     # Default is 2000; 200 is good. Large values give too many repeated and one-document words after re-ranking; small values give fewer repeats, but words that are too general in meaning.
    num_of_iterations = 10  # Default is 50.
    num_of_eval = 10        # Default is 10.
    para_gamma_t = 0.001    # Default is 0.001.
    para_kappa = 0.5        # Default is 0.5.
    para_tao = 1.0          # Default is 1.0.
    para_eta = None         # Default is None.
    min_df_cutoff = 0.3
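    # The defaults quoted above match gensim's LdaModel, so the assumed
    # mapping is: num_of_passes -> passes, num_of_updates -> update_every,
    # num_of_trunks -> chunksize, num_of_iterations -> iterations,
    # num_of_eval -> eval_every, para_gamma_t -> gamma_threshold,
    # para_kappa -> decay, para_tao -> offset, para_eta -> eta.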

    # Location of the data.
    data_loc = '20_newsgroups/alt.temp/9976'
    # data_dir_loc = '../20news/20_newsgroups/alt.atheism'
    # (to read a directory instead, also switch to the commented-out
    #  read_data_dir_loc and read_text_sections_dir calls below)
    data_sects = 'server/data/TVCG_abstract.csv'
    title_list = 'server/data/TVCG_title.csv'

    # Files that are written to.
    prob_data_file = 'server/data/TVCG_prob.csv'
    rank_data_file = 'server/data/TVCG_rank.csv'
    top_hier_data_file = 'server/data/top_hier_data.json'
    top_word_data_file = 'server/data/TVCG_top_words.txt'
    re_top_word_data_file = 'server/data/TVCG_re_rank_top_words.txt'
    top_word_matrix_file = 'server/data/TVCG_top_words_matrix.csv'
    # s_words = ['data', 'visualization', 'visual', 'approach', 'analysis', 'study', 'techniques',
    #            'interactive', 'results', 'design', 'paper', 'user', 'information', 'based', 'system',
    #            'present', 'time', 'different', 'use', 'using', 'used']

    # Read the abstracts for the tf-idf pass.
    tfidf_tokenizer = FileReader()
    tfidf_tokenizer.read_file_text(data_sects)
    # tfidf_tokenizer.read_data_dir_loc(data_dir_loc)

    # Get the token list and build the stop-word lists from it.
    raw_text = tfidf_tokenizer.get_token_list()
    [tfidf, tfs] = preprocess.vectorize(raw_text, min_df_cutoff)
    s_words = preprocess.build_stop_word_list(tfidf, tfs)
    e_words = preprocess.get_stop_words(tfidf)
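    # Assumptions about the preprocess helpers: vectorize() fits a tf-idf
    # model over the raw text with min_df=min_df_cutoff,
    # build_stop_word_list() flags words whose document frequency is too
    # high to be discriminative (corpus-specific stop words, s_words), and
    # get_stop_words() returns the vectorizer's built-in English list
    # (e_words).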

    # Read the file again to produce the LDA tokens, this time with the
    # stop words removed.
    lda_tokenizer = FileReader()
    lda_tokenizer.read_text_sections(data_sects, s_words, e_words)
    # lda_tokenizer.read_text_sections_dir(data_dir_loc, s_words, e_words)
    word_corpus = lda_tokenizer.get_token_list()  # Token list, with repeated words.

    # word_corpus = preprocess.singularize(word_corpus)
    [word_corpus, stem_relation] = preprocess.stem(word_corpus)
    stem_map = preprocess.gen_stem_map(stem_relation)  # Dictionary that maps stems back.
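    # As an illustration (assuming a standard stemmer such as Porter's):
    # 'networks' and 'networking' both stem to 'network', and stem_map
    # keeps a readable original per stem so displayed words stay legible.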

    # Perform LDA: build the token2id dictionary, then the bag-of-words
    # corpus lda.mm (one list of (token_id, token_count) pairs per document).
    lda = LDAVisualModel(word_corpus)
    lda.create_word_corpus(word_corpus)
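    # For example, a document tokenized as ['network', 'data', 'network']
    # becomes [(token2id['network'], 2), (token2id['data'], 1)].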

    # Train the LDA model for a specific number of topics and iterations.
    lda.train_lda(num_of_topics, para_alpha, num_of_passes, num_of_updates,
                  num_of_trunks, num_of_iterations, num_of_eval,
                  para_gamma_t, para_kappa, para_tao, para_eta)
    # After training, lda.lda.expElogbeta holds a num_topics x vocab_size
    # matrix, and lda.id2word.id2token is populated because lda.id2word is
    # the dictionary fed in during training.

    # C-LDA: constrained LDA with must-link/cannot-link word pairs.
    # must_link = tfidf_tokenizer.read_must_link(must_link_file)
    # cannot_link = tfidf_tokenizer.read_cannot_link(cannot_link_file)
    # lda.train_clda(num_of_topics, must_link, cannot_link, num_of_passes, num_of_updates,
    #                num_of_trunks, num_of_iterations, num_of_eval,
    #                para_gamma_t, para_kappa, para_tao, para_eta)
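    # The must-link/cannot-link constraint scheme here presumably follows
    # Andrzejewski et al. (2009), "Incorporating Domain Knowledge into
    # Topic Modeling via Dirichlet Forest Priors".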

    lda.id2word.id2token = preprocess.inverse_stem(lda.id2word.id2token, stem_map)
    # Only lda.id2word.id2token needs this cleanup; lda.id2word.token2id
    # and lda.lda.id2word do not.

    # Generate the document-topic matrices.
    matrix_doc_top = lda.generate_doc_topic()
    doc_top_rank = lda.generate_doc_topic_rank()
    # Internally this iterates `for idx, doc in enumerate(self.lda[self.mm])`
    # to build one list per document, each holding (topic_idx, topic_prob)
    # pairs, e.g. (2, 0.0182), (6, 0.0155).

    # The topic-word matrix (num_topics x vocab_size) and its projections.
    matrix_top_word = lda.lda.expElogbeta
    matrix_doc_word = lda.generate_doc_word(matrix_doc_top)
    [top_word, top_word_index] = lda.generate_top_words(num_of_words)
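    # matrix_doc_word presumably marginalizes out the topics:
    # P(w | d) = sum_t P(w | t) * P(t | d), i.e. a (row-normalized)
    # product of matrix_doc_top and matrix_top_word.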

    # Get the topic corpus.
    topics = lda.get_lda_corpus(num_of_topics, num_of_words)
    # One entry per topic; each holds num_of_words (20) pairs such as
    # ('0.023', 'network').

    # Isolate the top words for each document.
    doc_to_word = lda.gen_doc_top_words(topics, matrix_doc_top)
    # One entry per document; each holds num_of_words (20) pairs such as
    # ('0.023', 'network').

    # Generate the topic hierarchy and the per-topic word lists.
    top_hier = lda.gen_topic_hierarchy(topics)
    top_word_o = lda.gen_topic_words(topics)

    # Evaluation. Coherence needs the indices of each topic's top
    # num_of_words words so that it can look up those columns in the
    # document-word count matrix (lda.mm is available on the model).
    doc_word_index_set = lda.doc_word_index_set()
    top_coherence = lda.topic_coherence(top_word_index, num_of_words, doc_word_index_set)
    coherence = sum(top_coherence) / num_of_topics
    perplexity = lda.perplexity(matrix_doc_word, doc_word_index_set)
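    # Assumed definitions: topic_coherence follows the UMass measure of
    # Mimno et al. (2011), C(t) = sum_{m=2..M} sum_{l<m}
    # log((D(w_m, w_l) + 1) / D(w_l)), with D(.) the (co-)document
    # frequencies of a topic's top M words; perplexity is
    # exp(-mean per-word log-likelihood), lower being better.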

    # Write the topic information to the output files.
    write_prob_to_file(doc_to_word, matrix_doc_top, num_of_words, num_of_topics, title_list, prob_data_file)
    write_rank_to_file(doc_to_word, doc_top_rank, num_of_words, num_of_topics, title_list, rank_data_file)
    write_top_hier_to_file(top_hier, top_hier_data_file)
    write_top_word_to_file(top_word_o, top_word_data_file, top_coherence, coherence, perplexity)
    write_top_word_matrix_to_file(lda.id2word.id2token, matrix_top_word, top_word_matrix_file)

    # Re-rank the topic keywords. The second argument selects the scoring
    # scheme (1: SUM, 2: LOG); it is set to 2 here for debugging.
    re_rank_top_word = lda.top_word_re_rank(num_of_words, 2)

    # Write the re-ranked topic keywords.
    write_top_word_to_file(re_rank_top_word, re_top_word_data_file, top_coherence, coherence, perplexity)


if __name__ == "__main__":
    perform_lda()