from elastic_helper import ElasticHelper
import datetime
import json

# Load the sections that have already been classified.
with open('./data/sections_classes.json', 'r', encoding='utf-8') as file:
    extracted_class = json.load(file)

print(len(extracted_class))

# A set gives O(1) membership checks in the loop below (a list would be O(n) per lookup).
extracted_class_ids = {item['id'] for item in extracted_class}

eh_obj = ElasticHelper()

path = "/home/gpu/data_11/mj_qa_section.zip"

data = eh_obj.iterateJsonFile(path, True)
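
# Note: `elastic_helper` is a project-local module, so the exact contract of
# `iterateJsonFile(path, ...)` is an assumption here: it is expected to stream
# the zipped JSON dump and yield dicts shaped like
#   {"id": <section id>, "source": {"content": <section text>, ...}},
# which is the shape the loop below relies on.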

print(f'start: {datetime.datetime.now()}')

count = 1
total = 282671  # expected number of sections in the dump ("all" would shadow the built-in)
large_sections = []

for item in data:
    source = item['source']
    content = source['content']
    section_id = item['id']

    # Collect only the sections that were not classified in the previous step.
    if section_id not in extracted_class_ids:
        section = {
            "id": section_id,
            "len_content": len(content.split()),
            "content": content
        }
        large_sections.append(section)
        print(f'###### --> id: {section_id} ######')

    print(f'from {total} --> count: {count} --> id: {section_id}')
    count += 1
    # if count == 10:
    #     break

# Write the collected sections to disk.
with open('./data/large_sections.json', 'w', encoding='utf-8') as file:
    json_data = json.dumps(large_sections, ensure_ascii=False, indent=4)
    file.write(json_data)

print(f'end: {datetime.datetime.now()}')
print('finished!')