# ai_dataset/main_qa_data/main_sections.py

from elasticsearch7 import Elasticsearch
from general_functions import normalize_content
from funcs import read_from_json, write_to_json, save_to_file_by_address, read_file_by_address
import os
import json
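
# Purpose (inferred from the logic below, not from separate documentation):
# select a small evaluation sample (~100 sections) from a 15K collection of
# topic-less sections, write their concatenated text to a .txt file, and
# write their ids to a JSON file.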
base_address = os.getcwd() + "/main_qa_data"
dest_address = base_address + "/sections_100_test.txt"
json_address = base_address + "/qasections1.json"
json_address_85sections = base_address + "/qa_85_sections.json"
topiced_sections_15k_address = base_address + "/topiced_sections_15k.txt"
# dataids = read_from_json(dest_address)
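
# Load the previously selected ~3K sections (JSON) and the 15K
# topic-annotated sections (one JSON object per line).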
address = base_address + "/main_sections2.json"
datalist_3K = read_from_json(address)
data_15k_text = read_file_by_address(topiced_sections_15k_address)
data_15k = data_15k_text.splitlines()
final_text = ''
'''
# selected_sections = []
# for item in datalist:
#     text = normalize_content(item["content"])
#     current_id = item["id"]
#     tokens = text.split()
#     # drop sections that are not full sentences
#     if len(tokens) <= 20:
#         continue
#     # drop table-like sections
#     if current_id == 'qs2158325' or current_id == 'qs2158326' or current_id == 'qs2160910' or current_id == 'qs1416208' or current_id == 'qs1419763':
#         continue
#     # drop sections that are incomplete and need their following sections to be complete
#     if text[len(text)-1] == ":":
#         continue
#     final_text += text + "\n\n"
#     selected_sections.append({
#         "id": current_id,
#         "content": text,
#     })
# final_text = final_text.strip()
# save_to_file_by_address(dest_address, final_text)
# write_to_json(selected_sections, json_address)
'''
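
# Collect the ids of 15K sections that have no assigned topics, then report
# any overlap with the 3K set (ids already present in main_sections2.json).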
all_sections = []
for sec in data_15k:
    section = json.loads(sec)
    id1 = section['id']
    topicss = section['topics']
    if len(topicss) != 0:
        continue
    all_sections.append(id1)
sections_3k = [section["id"] for section in datalist_3K]
for data in all_sections:
    if data in sections_3k:
        print(data)
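
# Sample roughly one of every 100 lines from the 15K file, apply the same
# length/completeness filters as the commented-out pass above, and stop once
# about 100 sections have been collected.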
count = 0
selected_ids = []
for index, item in enumerate(data_15k):
    # take every 100th line as a candidate
    if index % 100 != 0:
        continue
    section = json.loads(item)
    text = section["content"].strip()
    # skip sections containing double tabs (likely table-like content)
    if "\t\t" in text:
        continue
    # skip sections containing the Arabic phrase "ارسلناک" (presumably verse content)
    if "ارسلناک" in text:
        continue
    current_id = section["id"]
    topics = section["topics"]
    # keep only sections with no assigned topics
    if len(topics) != 0:
        continue
    tokens = text.split()
    # drop sections that are not full sentences
    if len(tokens) <= 20:
        continue
    # drop sections that are incomplete and need their following sections to be complete
    if text[-1] == ":":
        continue
    # append a "*section*" marker (the Persian word) after each section's text
    final_text += text + " *سکشن*" + "\n\n"
    selected_ids.append({"id": current_id})
    if count == 100:
        break
    count += 1
final_text = final_text.strip()
save_to_file_by_address(dest_address, final_text)
write_to_json(selected_ids, json_address_85sections)
print('finished')