# ai_dataset/main_qa_data/main_sections_temp.py
from elasticsearch7 import Elasticsearch
from general_functions import normalize_content
from funcs import read_from_json, write_to_json, save_to_file_by_address, read_file_by_address
import os
import json
# Resolve data file paths relative to the current working directory.
# NOTE(review): assumes the script is launched from the parent of
# main_qa_data/ — confirm against how this script is invoked.
base_address = os.getcwd() + "/main_qa_data"
json_address_85sections = base_address + "/data/qa_85_sections.json"
topiced_sections_15k_address = base_address + "/data/topiced_sections_15k.txt"

# Load the 85-section QA list (JSON array) and the 15k topic-tagged
# sections file (one JSON object per line).
datalist_85 = read_from_json(json_address_85sections)
data_15k_text = read_file_by_address(topiced_sections_15k_address)
data_15k = data_15k_text.splitlines()

sections_85 = []  # matched {"id", "content"} records, in data_15k order
c = 1             # 1-based progress counter for console output
# Collect the content of every 15k section whose id also appears in the
# 85-section QA list. Building the id set once gives O(1) membership tests
# instead of rescanning datalist_85 for each of the 15k lines; this is
# behaviorally identical to the original inner loop, which broke after the
# first id match (at most one append per section).
ids_85 = {line["id"] for line in datalist_85}
for sec in data_15k:
    section = json.loads(sec)
    id1 = section['id']
    if id1 in ids_85:
        print(c)  # progress indicator on the console
        c += 1
        sections_85.append({
            "id": id1,
            "content": section['content'],
        })
# Persist the matched sections under the new file name and signal completion.
output_path = base_address + "/data/qa_sections_85.json"
write_to_json(sections_85, output_path)
print('finished')