78 lines
2.1 KiB
Python
78 lines
2.1 KiB
Python
import pickle
|
|
import json
|
|
import numpy as np
|
|
|
|
print('start')


def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    # NOTE: unpickling can execute arbitrary code; only point this at cache
    # files produced by a trusted run.
    with open(path, 'rb') as fp:
        return pickle.load(fp)


# ids, corpus and doc_topics are indexed with the same positions further
# down, so they are expected to be parallel sequences (one entry per document).
ids = _load_pickle('lda_cache/ids')
corpus = _load_pickle('lda_cache/corpus')
doc_topics = _load_pickle('lda_cache/doc_topics')
top_25_words_scores = _load_pickle('lda_cache/top_25_words_scores')

# Documents to extract from the cache.
test_ids = [
    "mj_qa_section_2499344",
    "mj_qa_section_1464855",
    "mj_qa_section_695777",
    "mj_qa_section_837222",
    "mj_qa_section_2584285",
    "mj_qa_section_878324",
    "mj_qa_section_903789",
    "mj_qa_section_2750472",
    "mj_qa_section_2521317",
    "mj_qa_section_1041965",
    "mj_qa_section_1176837",
]

# A set makes each membership test O(1) instead of scanning test_ids for
# every cached id (assumes the cached ids are hashable, e.g. strings like
# the ones in test_ids).
_wanted_ids = set(test_ids)

# Positions are collected in the order the documents appear in ids, which is
# not necessarily the order of test_ids.
ids_indices = [i for i, doc_id in enumerate(ids) if doc_id in _wanted_ids]

test_corpus = [corpus[i] for i in ids_indices]
test_doc_topics = [doc_topics[i] for i in ids_indices]
|
|
#---
|
|
# NOTE: json cannot serialize numpy float32 values out of the box, so a custom encoder converts them.
|
|
class NumpyFloatValuesEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values into plain Python equivalents.

    Pass it as ``cls`` to ``json.dumps`` / ``json.dump`` when the data may
    contain NumPy scalars or arrays, which the stock encoder rejects.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        NumPy floating scalars become ``float``, NumPy integer scalars become
        ``int`` and NumPy arrays become (nested) lists.  Anything else is
        delegated to the base class, which raises ``TypeError``.
        """
        # np.floating covers float32 (the original case) as well as the
        # other widths such as float16.
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
|
|
#json.dumps(d, cls=NumpyFloatValuesEncoder)
|
|
#----
|
|
# Build one JSON record per selected document.  ids_indices, test_corpus and
# test_doc_topics were all derived from the same positions in ids, so zipping
# them keeps every document id paired with its own content and topics.
# (Labelling records with test_ids[i] would attach the wrong id whenever the
# order of test_ids differs from the order of ids, or when a test id is
# missing from the cache.)
keywords_array = []
for doc_index, content, doc_topic in zip(ids_indices, test_corpus, test_doc_topics):
    keywords_array.append({
        'id': ids[doc_index],
        'content': content,
        # Each entry of doc_topic is unpacked as a (topic id, probability)
        # pair; the topic id also indexes top_25_words_scores.
        'topics': [
            {
                'id': topic_id,
                'probability': probability,
                'topic_top_25_words': top_25_words_scores[topic_id],
            }
            for topic_id, probability in doc_topic
        ],
    })

# Serialize before opening the output so a serialization error does not
# leave a truncated test_lda.json behind.
keywords_json = json.dumps(
    keywords_array,
    ensure_ascii=False,
    indent=4,
    cls=NumpyFloatValuesEncoder,
)
with open('test_lda.json', "w", encoding='utf-8') as keywords_file:
    keywords_file.write(keywords_json)

print('end')