# RD_relation/test.py
# (78 lines, 2.1 KiB, Python; repository snapshot 2025-01-19 15:46:21 +00:00)
import pickle
import json
import numpy as np
print('start')


def _read_pickle(cache_path):
    """Return the object unpickled from the file at *cache_path*."""
    with open(cache_path, 'rb') as handle:
        return pickle.load(handle)


# Cached LDA artefacts. `ids`, `corpus` and `doc_topics` are indexed with the
# same positions further down; `top_25_words_scores` is indexed by topic id.
# NOTE(review): pickle can run arbitrary code while loading -- only point this
# at cache files produced by a trusted run.
ids = _read_pickle('lda_cache/ids')
corpus = _read_pickle('lda_cache/corpus')
doc_topics = _read_pickle('lda_cache/doc_topics')
top_25_words_scores = _read_pickle('lda_cache/top_25_words_scores')

# Spot checks that are handy while debugging the cache contents:
# print(ids[0])
# print(corpus[0])
# print(doc_topics[0])
# print(top_25_words_scores[39])
# print(top_25_words_scores[46])
# Ids of the sections whose LDA output is exported for manual inspection.
test_ids = [
    "mj_qa_section_2499344",
    "mj_qa_section_1464855",
    "mj_qa_section_695777",
    "mj_qa_section_837222",
    "mj_qa_section_2584285",
    "mj_qa_section_878324",
    "mj_qa_section_903789",
    "mj_qa_section_2750472",
    "mj_qa_section_2521317",
    "mj_qa_section_1041965",
    "mj_qa_section_1176837",
]

# Cache positions of every cached id that is one of the test ids. The
# positions follow the order of `ids`, not the order of `test_ids`.
ids_indices = []
for position, cached_id in enumerate(ids):
    if cached_id in test_ids:
        ids_indices.append(position)

# Pull the matching documents and their topic assignments by position.
test_corpus = [corpus[position] for position in ids_indices]
test_doc_topics = [doc_topics[position] for position in ids_indices]
#---
# NOTE: works around json failing to dump NumPy values (e.g. np.float32)
class NumpyFloatValuesEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    The standard encoder raises ``TypeError`` for NumPy scalars that do not
    subclass a Python number (``np.float32``, ``np.float16``, ``np.int64``,
    ``np.bool_``, ...) and for ``np.ndarray``. This encoder converts them to
    ``float`` / ``int`` / ``bool`` / ``list`` before serialization.

    Usage::

        json.dumps(d, cls=NumpyFloatValuesEncoder)
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        Called by ``json`` only for objects it cannot serialize natively.

        Raises:
            TypeError: if *obj* is not a supported NumPy value (raised by the
                base class implementation).
        """
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            # tolist() also converts the elements to built-in Python types.
            return obj.tolist()
        return super().default(obj)
#----
# Build one record per selected document: its id, its content and, for every
# topic assigned to it, the topic id, its probability and the topic's entry
# from `top_25_words_scores`.
keywords_array = []
for cache_index, content, topic in zip(ids_indices, test_corpus, test_doc_topics):
    keywords_array.append({
        # Take the id from the cache at the same position as the document.
        # `ids_indices` follows the order of `ids`, so pairing the i-th
        # document with `test_ids[i]` would mislabel records whenever the two
        # orders differ or a test id is missing from the cache.
        # (assumes `ids` is a sequence aligned with `corpus` -- TODO confirm)
        'id': ids[cache_index],
        'content': content,
        'topics': [
            {
                'id': ti,
                'probability': tp,
                'topic_top_25_words': top_25_words_scores[ti],
            }
            for ti, tp in topic
        ],
    })

# Serialize before opening the output so a serialization error does not leave
# a truncated file behind; the context manager guarantees the file is closed.
keywords_json = json.dumps(
    keywords_array,
    ensure_ascii=False,
    indent=4,
    cls=NumpyFloatValuesEncoder,
)
with open('test_lda.json', "w", encoding='utf-8') as keywords_file:
    keywords_file.write(keywords_json)
print('end')