"""Export cached LDA results for a fixed set of test documents to JSON.

Reads the pickled ``ids``, ``corpus``, ``doc_topics`` and
``top_25_words_scores`` objects from ``lda_cache/`` and writes one record per
selected document to ``test_lda.json``.
"""
import json
import pickle

import numpy as np

CACHE_DIR = 'lda_cache'
OUTPUT_PATH = 'test_lda.json'

# Documents whose cached LDA output is exported.
TEST_IDS = [
    "mj_qa_section_2499344",
    "mj_qa_section_1464855",
    "mj_qa_section_695777",
    "mj_qa_section_837222",
    "mj_qa_section_2584285",
    "mj_qa_section_878324",
    "mj_qa_section_903789",
    "mj_qa_section_2750472",
    "mj_qa_section_2521317",
    "mj_qa_section_1041965",
    "mj_qa_section_1176837",
]


class NumpyFloatValuesEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to plain Python values.

    NOTE: the stock encoder cannot serialize NumPy scalars such as
    ``np.float32`` because they are not subclasses of the Python builtins.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        Raises:
            TypeError: if *obj* is not a NumPy float, NumPy integer or
                NumPy array (delegated to ``json.JSONEncoder.default``).
        """
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


def load_pickle(name):
    """Load and return the pickled object stored at ``lda_cache/<name>``."""
    # SECURITY: pickle executes arbitrary code while loading; only use this
    # on cache files produced by a trusted run of the LDA pipeline.
    with open(f'{CACHE_DIR}/{name}', 'rb') as fp:
        return pickle.load(fp)


def build_records(ids, corpus, doc_topics, top_words_scores, wanted_ids):
    """Build one export record per document whose id is in *wanted_ids*.

    Args:
        ids: document ids; position ``i`` corresponds to ``corpus[i]`` and
            ``doc_topics[i]``.
        corpus: per-document content, indexed like *ids*.
        doc_topics: per-document iterable of ``(topic_id, probability)``
            pairs, indexed like *ids*.
        top_words_scores: container indexed by topic id.
        wanted_ids: ids of the documents to export.

    Returns:
        A list of dicts with keys ``id``, ``content`` and ``topics``, in the
        order the documents appear in *ids*. Ids in *wanted_ids* that do not
        occur in *ids* are skipped.
    """
    wanted = set(wanted_ids)
    records = []
    for index, doc_id in enumerate(ids):
        if doc_id not in wanted:
            continue
        # Label each record with the id stored at the same position as its
        # content; the order of *wanted_ids* need not match the order of *ids*.
        records.append({
            'id': doc_id,
            'content': corpus[index],
            'topics': [
                {
                    'id': topic_id,
                    'probability': probability,
                    'topic_top_25_words': top_words_scores[topic_id],
                }
                for topic_id, probability in doc_topics[index]
            ],
        })
    return records


def main():
    """Load the LDA cache, select the test documents and write the JSON file."""
    print('start')

    ids = load_pickle('ids')
    corpus = load_pickle('corpus')
    doc_topics = load_pickle('doc_topics')
    top_25_words_scores = load_pickle('top_25_words_scores')

    keywords_array = build_records(
        ids, corpus, doc_topics, top_25_words_scores, TEST_IDS,
    )

    # Serialize before opening the output so a serialization error does not
    # leave behind a truncated file.
    payload = json.dumps(
        keywords_array,
        ensure_ascii=False,
        indent=4,
        cls=NumpyFloatValuesEncoder,
    )
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as keywords_file:
        keywords_file.write(payload)

    print('end')


if __name__ == '__main__':
    main()