Classifier/find_large_sections.py
2025-07-13 17:32:37 +03:30

50 lines
1.2 KiB
Python

"""Collect sections that have not yet been classified and dump them to JSON.

Reads the ids of already-classified sections from sections_classes.json,
iterates every section in the mj_qa_section.zip dump via ElasticHelper, and
writes the not-yet-classified ones (with a word-count) to large_sections.json.
"""
from elastic_helper import ElasticHelper
import datetime
import json

# Ids of sections that already have a class assigned.
with open('./data/sections_classes.json', 'r', encoding='utf-8') as file:
    extracted_class = json.load(file)
print(len(extracted_class))

# Set, not list: membership is tested once per section below, and a list
# would make the loop O(n*m) over ~282k items.
extracted_class_ids = {item['id'] for item in extracted_class}

eh_obj = ElasticHelper()
path = "/home/gpu/data_11/mj_qa_section.zip"
data = eh_obj.iterateJsonFile(path, True)

print(f'start: {datetime.datetime.now()}')
count = 1
total = 282671  # expected section count (renamed from `all`, which shadowed the builtin)
large_sections = []
# NOTE(review): original indentation was lost; assumed the dict build/append
# belong inside the `if` and the progress print runs every iteration — confirm.
for item in data:
    source = item['source']
    content = source['content']
    section_id = item['id']
    if section_id not in extracted_class_ids:
        section = {
            "id": section_id,
            "len_content": len(content.split()),  # word count, not char count
            "content": content,
        }
        large_sections.append(section)
        print(f'###### --> id: {section_id} ######')
    print(f'from {total} --> count: {count} --> id: {section_id}')
    count += 1
    # if count == 10:
    #     break

# Stream straight to the file instead of building one big string first.
with open('./data/large_sections.json', 'w', encoding='utf-8') as file:
    json.dump(large_sections, file, ensure_ascii=False, indent=4)

print(f'end: {datetime.datetime.now()}')
print('finished!')