"""Sample un-topiced sections from the 15k section dump for QA test data.

The script does two things when run:

1. Prints the id of every section in the 15k dump that has no topics and
   whose id also appears in the 3K section list.
2. Walks every ``SAMPLE_STRIDE``-th line of the 15k dump, keeps the sections
   that pass the content filters, and writes the kept texts to
   ``dest_address`` and the kept ids to ``json_address_85sections``.

Each line of the 15k dump is parsed as a JSON object that is read through
its ``"id"``, ``"topics"`` and ``"content"`` keys.
"""
import json
import os

# NOTE(review): Elasticsearch and normalize_content are not referenced by the
# active code below; the imports are kept in case something else relies on them.
from elasticsearch7 import Elasticsearch

from funcs import (
    read_file_by_address,
    read_from_json,
    save_to_file_by_address,
    write_to_json,
)
from general_functions import normalize_content

base_address = os.getcwd() + "/main_qa_data"
dest_address = base_address + "/sections_100_test.txt"
# Only referenced by the disabled legacy pass further down.
json_address = base_address + "/qasections1.json"
json_address_85sections = base_address + "/qa_85_sections.json"
topiced_sections_15k_address = base_address + "/topiced_sections_15k.txt"
# dataids = read_from_json(dest_address)
address = base_address + "/main_sections2.json"

# Only every SAMPLE_STRIDE-th line of the 15k dump is considered for selection.
SAMPLE_STRIDE = 100
# Upper bound on the number of sections written to the output files.
MAX_SELECTED = 100
# A section must have MORE than this many whitespace-separated tokens.
MIN_TOKENS = 20
# Sections whose text contains any of these substrings are skipped.
EXCLUDED_SUBSTRINGS = ("\t\t", "ارسلناک")
# Marker appended after every selected section in the text output.
SECTION_MARKER = " *سکشن*"

# Earlier selection pass, disabled and kept for reference only.
# selected_sections = []
# for item in datalist:
#     text = normalize_content(item["content"])
#     current_id = item["id"]
#     tokens = text.split()
#     # Drop sections that are not sentences.
#     if len(tokens) <= 20:
#         continue
#     # Drop tabular sections.
#     if current_id == 'qs2158325' or current_id =='qs2158326' or current_id =='qs2160910' or current_id =='qs1416208' or current_id =='qs1419763':
#         continue
#     # Drop sections that are incomplete and need the sections that follow
#     # them in order to be complete.
#     if text[len(text)-1] == ":":
#         continue
#     final_text += text + "\n\n"
#     selected_sections.append({
#         "id" : current_id,
#         "content": text,
#     })
# final_text = final_text.strip()
# save_to_file_by_address(dest_address, final_text)
# write_to_json(selected_sections, json_address)


def _selectable_text(section):
    """Return the stripped content of *section*, or ``None`` if it is filtered out.

    A section is rejected when its text contains one of
    ``EXCLUDED_SUBSTRINGS``, when it has any topics, when it has
    ``MIN_TOKENS`` or fewer whitespace-separated tokens, or when it ends
    with a colon.
    """
    text = section["content"].strip()
    if any(marker in text for marker in EXCLUDED_SUBSTRINGS):
        return None
    if section["topics"]:
        return None
    # Drop sections that are not sentences.
    if len(text.split()) <= MIN_TOKENS:
        return None
    # Drop sections that are incomplete and need the sections that follow
    # them in order to be complete.
    if text.endswith(":"):
        return None
    return text


datalist_3K = read_from_json(address)
data_15k_text = read_file_by_address(topiced_sections_15k_address)
data_15k = data_15k_text.splitlines()

# Parse every line once; both passes below work on the parsed objects.
parsed_sections = [json.loads(line) for line in data_15k]

# Ids of the 15k sections that carry no topics.
all_sections = [
    section["id"] for section in parsed_sections if not section["topics"]
]

# A set makes the membership test O(1) per id instead of a list scan.
sections_3k = {section["id"] for section in datalist_3K}
for section_id in all_sections:
    if section_id in sections_3k:
        print(section_id)

selected_texts = []
selected_ids = []
for section in parsed_sections[::SAMPLE_STRIDE]:
    text = _selectable_text(section)
    if text is None:
        continue
    selected_texts.append(text + SECTION_MARKER)
    selected_ids.append({"id": section["id"]})
    # Stop at exactly MAX_SELECTED entries. The previous counter was compared
    # before being incremented, which let one extra section through.
    if len(selected_ids) >= MAX_SELECTED:
        break

# Every entry is already stripped and ends with the marker, so joining with a
# blank line needs no trailing-separator cleanup afterwards.
final_text = "\n\n".join(selected_texts)

save_to_file_by_address(dest_address, final_text)
write_to_json(selected_ids, json_address_85sections)
print('finished')