86 lines
2.9 KiB
Python
86 lines
2.9 KiB
Python
|
from elasticsearch7 import Elasticsearch
|
||
|
from general_functions import normalize_content
|
||
|
from funcs import read_from_json, write_to_json, save_to_file_by_address, read_file_by_address
|
||
|
import os
|
||
|
import json
|
||
|
|
||
|
# All inputs and outputs live under ./main_qa_data relative to the
# directory the script is launched from.
base_address = f"{os.getcwd()}/main_qa_data"

# Plain-text destination for the sampled sections.
dest_address = f"{base_address}/sections_100_test.txt"
# Ids written by an earlier (now commented-out) selection pass.
json_address = f"{base_address}/qasections1.json"
# Destination for the ids selected by this run.
json_address_85sections = f"{base_address}/qa_85_sections.json"
# JSON-lines dump: one section object per line (id / content / topics).
topiced_sections_15k_address = f"{base_address}/topiced_sections_15k.txt"

# The "3k" main-section list, loaded as a list of section dicts.
address = f"{base_address}/main_sections2.json"
datalist_3K = read_from_json(address)

# Load the 15k dump and split it into individual JSON-line records.
data_15k_text = read_file_by_address(topiced_sections_15k_address)
data_15k = data_15k_text.splitlines()

final_text = ''
|
||
|
# (removed) A stale, fully commented-out first-pass filtering loop over
# `datalist` lived here, wrapped in an unused triple-quoted string.
# Deleted as dead code; recover it from version control if ever needed.
|
||
|
# Counter for the selection loop further below.
count = 0

# Collect the id of every section in the 15k dump that has no topics
# assigned yet.
all_sections = []
for line in data_15k:
    section = json.loads(line)
    if section['topics']:
        continue
    all_sections.append(section['id'])

# Build the 3k id set once so each membership test below is O(1)
# (the original scanned a list per lookup, O(n) each time).
sections_3k = {section["id"] for section in datalist_3K}

# Report any topic-less 15k section that also appears in the 3k set.
for section_id in all_sections:
    if section_id in sections_3k:
        print(section_id)
|
||
|
|
||
|
# Sample every 100th line of the 15k dump, keep only clean,
# self-contained, topic-less sections, and persist both the text and
# the selected ids.
selected_ids = []
selected_texts = []
for index, item in enumerate(data_15k):
    # Only every 100th line is a sampling candidate.
    if index % 100 != 0:
        continue
    section = json.loads(item)
    text = section["content"].strip()

    # Skip table-like sections (double tabs) and a known-bad marker.
    if "\t\t" in text:
        continue
    if "ارسلناک" in text:
        continue
    current_id = section["id"]
    # Skip sections that already have topics assigned.
    if section["topics"]:
        continue
    # Skip sections that are not real sentences (20 tokens or fewer).
    if len(text.split()) <= 20:
        continue
    # Skip incomplete sections that need the following section to be
    # complete (they end with a colon).
    if text.endswith(":"):
        continue

    selected_texts.append(text + " *سکشن*")
    selected_ids.append({"id": current_id})
    # NOTE(review): the break test runs before the increment, so up to
    # 101 sections can be accepted before stopping; preserved as-is.
    if count == 100:
        break
    count += 1

# Join once instead of the original quadratic `final_text +=` loop;
# equivalent because each piece is already stripped and non-empty.
final_text = "\n\n".join(selected_texts)
save_to_file_by_address(dest_address, final_text)
write_to_json(selected_ids, json_address_85sections)

print('finished')
|