import json
import orjson
import tiktoken
from pathlib import Path


def load_orjson(path: str | Path):
    path = Path(path)
    with path.open("rb") as f:  # must be opened in binary mode for orjson
        return orjson.loads(f.read())


def save_orjson(path, data):
    with open(path, "wb") as f:
        f.write(
            orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS)
        )


def merge_json_dir(input_path, output_path):
    directory = Path(input_path)
    if not directory.is_dir():
        raise ValueError(f"Not a valid directory: {input_path}")

    seen_ids = set()  # tracks already-seen ids (fast membership test)
    unique_data = []  # only the unique items
    failed_files = []

    json_files = list(directory.glob("*.json"))
    if not json_files:
        print("⚠️ No JSON files found in this path")
        return

    for json_file in json_files:
        try:
            data = load_orjson(json_file)
            if not data:  # empty or None
                failed_files.append(json_file.name)
                continue

            # if isinstance(data, dict):
            #     unique_data.append(data)
            if isinstance(data, list) and isinstance(data[0], dict):
                for item in data:
                    item_id = item.get("id")
                    if item_id is None:
                        # If an item has no id you could keep or drop it;
                        # here we assume only items with a valid id matter.
                        continue
                    if item_id not in seen_ids:
                        seen_ids.add(item_id)
                        unique_data.append(item)
            else:
                raise ValueError(f"No list of objects in this JSON -> {json_file}")

        except (json.JSONDecodeError, ValueError, OSError, KeyError, TypeError) as e:
            print(f"❌ Failed to process '{json_file.name}': {e}")
            failed_files.append(json_file.name)

    # error report
    if failed_files:
        print("\n❌ These files were skipped:")
        for name in failed_files:
            print(f"   - {name}")
    else:
        print("\n✅ All JSON files merged")

    # save the output
    try:
        save_orjson(data=unique_data, path=output_path)
        print(
            f"\n💾 Final file saved: {output_path} (total unique items: {len(unique_data)})"
        )
    except Exception as e:
        print(f"❌ Error while saving the final file: {e}")


def make_new_proccessed_ids_from_file(json_in, out_path):
    data = load_orjson(json_in)
    finall_data = []
    for d in data:
        if d["id"]:
            finall_data.append(d["id"])
    finall_data = list(set(finall_data))  # deduplicate the ids
    print(f"-- len ids {len(finall_data)}")
    save_orjson(data=finall_data, path=out_path)


def __try_parse_json(value):
    """If value is a string that can be parsed as JSON, return the parsed version."""
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
            # only accept real JSON containers (not a bare number or word)
            if isinstance(parsed, (dict, list)):
                return parsed
            else:
                return value  # e.g. the string "123" should not become a number
        except (json.JSONDecodeError, TypeError):
            pass
    return value


def __deep_parse_json_strings(obj):
    """Recursively parse every string value that contains JSON."""
    if isinstance(obj, dict):
        return {
            key: __deep_parse_json_strings(__try_parse_json(value))
            for key, value in obj.items()
        }
    elif isinstance(obj, list):
        return [__deep_parse_json_strings(__try_parse_json(item)) for item in obj]
    else:
        return obj


def serialize_json_from_str_fields(json_in, out_path=None):
    if out_path is None:
        out_path = json_in

    # load the data
    data = load_orjson(json_in)

    # deep-parse every JSON string
    cleaned_data = __deep_parse_json_strings(data)

    # save the result
    save_orjson(data=cleaned_data, path=out_path)
    print(f"✅ All done: '{out_path}'")
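
# A minimal sketch of what the deep-parse step does, on a hypothetical record
# (the field names below are illustrative only, not taken from a real dataset):
#
#     raw = {"id": 1, "ai_code": '{"result": ["a", "b"], "score": "0.9"}'}
#     parsed = __deep_parse_json_strings(raw)
#     # -> {"id": 1, "ai_code": {"result": ["a", "b"], "score": "0.9"}}
#
# Plain strings such as "0.9" stay strings: __try_parse_json only replaces a
# value when it parses into a dict or a list.
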
i["ai_code"]["ai_code_version"] f_data.append(form) save_orjson(data=f_data, path=out_path) def count_tokens(model_name, system_prompt, user_prompt): """ شمارش دقیق توکن‌های ورودی برای مدل‌های مختلف """ text = f"<|system|>\n{system_prompt}\n<|user|>\n{user_prompt}" # --- انتخاب tokenizer --- if "openai" in model_name.lower() or "oss" in model_name.lower(): # مدل‌های مشابه GPT یا OSS از tiktoken استفاده می‌کنند enc = tiktoken.get_encoding("cl100k_base") tokens = enc.encode(text) return len(tokens) elif "gemma" in model_name.lower(): from transformers import AutoTokenizer # SentencePiece tokenizer (Gemma از HuggingFace) tokenizer = AutoTokenizer.from_pretrained(model_name) tokens = tokenizer.encode(text) return len(tokens) elif "magistral" in model_name.lower() or "mistral" in model_name.lower(): from transformers import AutoTokenizer # Mistral / Magistral tokenizer (BPE) tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) tokens = tokenizer.encode(text) return len(tokens) else: raise ValueError(f"Model {model_name} not recognized.") # --- نحوه استفاده --- if __name__ == "__main__": # ##### یکی کردن تمام بچ های خروجی در یک فایل # merge_json_dir( # input_path= '/home1/ava3/project/aiDataParser/task/keyword_extractor/output/batch_data', # output_path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/merged_1.json' # ) ###### ساخت یک proccessed id از فایل نهایی # make_new_proccessed_ids_from_file( # json_in ='/home1/ava3/word_bank_proccess/oss_120b_v1/merged_finall.json', # out_path='/home1/ava3/word_bank_proccess/oss_120b_v1/proccessed_id.json', # ) # جیسونی کردن تمام فیلد ها # serialize_json_from_str_fields( # json_in='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/merged_1.json', # out_path='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/merged_2.json' # ) # format finallize # make_format( # in_path='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/data.json', # out_path='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/data_f1.json' # ) # input_path = "/home1/ava3/word_bank_proccess/oss_120b_v1/merged_finall.json" # data = load_orjson(input_path) # print(f"------{len(data)}-----") # Truley = [] # for i in data: # if i['ai_code']['ai_code']['is_correct'] is True : #and i['ai_code']['ai_code']['is_proper_noun'] is True # Truley.append(i) # ---------------- filter حقوقی word # input_path = "/home1/ava3/word_bank_proccess/oss_120b_v1/merged_finall.json" # data = load_orjson(input_path) # Truley = [] # for item in data: # if "حقوقی" in item['ai_code']['ai_code']['scope'] and item['ai_code']['ai_code']["is_correct"] is True: # Truley.append(item) # print(f'Truley {len(Truley)}') # save_orjson( # data=Truley, # path='/home1/ava3/project/aidataparser_test/motaradef_dataset/motaradef.json' # ) ######################################################################## # raw_data = '/home1/ava3/data/mj_qa_section.json' # raw_data = load_orjson(raw_data) # ignore_data = '/home1/ava3/data/ignore_mj_qa_sections.json' # ignore_data = load_orjson(ignore_data) # ignore_data = set(ignore_data) # f_data = [] # for item in raw_data: # if item['id'] not in ignore_data: # f_data.append(item) # print(f'f_data {len(f_data)}') # save_orjson( # data=f_data, # path='/home1/ava3/data/valid_mj_qa_sections.json' # ) # ########################################################################## # raw_data = load_orjson('/home1/ava3/data/valid_mj_qa_sections.json') # f_data = [] # for item in raw_data: # f_data.append( # { # 'id':item['id'], # 
    ##################################### work with tree #####################################
    # fr_tree = '/home1/ava3/project/aiDataParser/task/match_code_fr_per/prompt_ir_def.json'
    # fr_tree = load_orjson(fr_tree)
    # code_title = []
    # for k, v in fr_tree.items():
    #     code_title.append(
    #         k
    #     )
    # save_orjson(
    #     data=code_title,
    #     path='/home1/ava3/project/aiDataParser/task/match_code_fr_per/all_code_title_persian.json'
    # )

    ##################################### make tree #####################################
    # fr_tree = '/home1/ava3/franc_legal_codes/translate/all_tree_franc.json'
    # fr_tree = load_orjson(fr_tree)
    # pr_data = '/home1/ava3/project/aiDataParser/task/france_translate/all_pr_fr_title_cleaned_1.json'
    # pr_data = load_orjson(pr_data)

    # def build_lookup(flat_list):
    #     lookup = {}
    #     for item in flat_list:
    #         key = item.get("france")
    #         key = key.strip()
    #         lookup[key] = item
    #     return lookup

    # def clean_title(title: str) -> str:
    #     import re
    #     title = re.sub(r'\n+', '', title)  # drop newlines
    #     if r'-\s+' in title:
    #         title = title.replace(r'-\s+', '-')
    #     cleaned = re.sub(r'\s+', ' ', title)  # \s+ = any run of whitespace (space, \t, \n, \r, ...)
    #     return cleaned.strip()

    # def enrich_tree(node, flatted_list, lvl=0):
    #     title = clean_title(node["title"])
    #     # find the match in the flat list
    #     enriched = flatted_list.get(title, {})
    #     persian = enriched.get("persian")
    #     if persian == None:
    #         unlist = {
    #             'fr_title': title,
    #             'pr_title': enriched.get('france')
    #         }
    #         print(unlist)
    #         return (unlist, 1)
    #     # build the new node
    #     new_node = {
    #         "france": title,
    #         "persian": enriched.get("persian"),  # default: keep the French title if there is no translation
    #         "id": enriched.get("id"),
    #         "level": lvl
    #         # "ai_code_version": enriched.get("ai_code_version"),
    #     }
    #     lvl += 1
    #     # attach the sub-sections, if any
    #     if "sections" in node and node["sections"]:
    #         new_node["sections"] = [
    #             enrich_tree(child, flatted_list, lvl) for child in node["sections"]
    #         ]
    #     # return new_node
    #     return new_node

    # f_data = []
    # lookup = build_lookup(pr_data)
    # for node in fr_tree:
    #     # f_data.append(enrich_tree(node, lookup))
    #     res = enrich_tree(node, lookup)
    #     if isinstance(res, tuple):
    #         res, _ = res
    #     f_data.append(
    #         res
    #     )
    # save_orjson(
    #     path='/home1/ava3/project/aiDataParser/task/france_translate/tree_test1_title.json',
    #     data=f_data
    # )

    ##########################################################################
    # input_per = '/home1/ava3/project/aiDataParser/task/france_translate/all_pr_fr_title.json'
    # input_per = load_orjson(input_per)
    # for i in input_per:
    #     # ) °
    #     title = i['persian']
    #     # remove °
    #     if '°' in title:
    #         title = title.replace('°', '')
    #     if '.' in title:
    #         title = title.replace('.', '')
    #     if ':' in title:
    #         title = title.split(':', 1)[1].strip()
    #     if ')' in title:  # and '(' not in title:
    #         colon_count1 = title.count(')')
    #         colon_count2 = title.count('(')
    #         if colon_count1 == colon_count2:
    #             continue
    #         else:
    #             # print(colon_count1)
    #             # print(title)
    #             title = title.split(')', 1)[1].strip()
    #             # print(title)
    #             # break
    #     i['persian'] = title
    # save_orjson(
    #     data=input_per,
    #     path='/home1/ava3/project/aiDataParser/task/france_translate/all_pr_fr_title_cleaned_1.json'
    # )

    ##########################################################################
    # input_path1 = "/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/temp1.json"
    # data_ = load_orjson(input_path1)
    # data_1 = []
    # unvalid = []
    # for i in data_:
    #     # if len(i['persian']) > 0 and isinstance(i['persian'], list) and bool(i['persian'][0] != None) and bool(i['persian'][0] != ''):
    #     data_1.append(
    #         {
    #             "id": str(i["id"]),
    #             # "ai_code_version": str(i["ai_code_version"]),
    #             'persian': i['fa']
    #         }
    #     )
    #     # else:
    #     #     unvalid.append(str(i["id"]))

    # input_path2 = "/home1/ava3/project/aiDataParser/task/france_translate/input_fr_title.json"
    # data_2 = load_orjson(input_path2)
    # data_2 = [
    #     {
    #         "id": str(i["id"]),
    #         "fr": str(i["fr"])
    #     }
    #     for i in data_2
    #     if str(i['id']) in unvalid
    # ]
    # # print(
    # #     f'-- data_1 {len(data_1)}\n',
    # #     f'-- data_2 {len(data_2)}\n',
    # # )
    # f_data = []
    # for i in data_1:
    #     form = {}
    #     for j in data_2:
    #         if i['id'] == j['id'] and i['id'] not in unvalid:
    #             form['id'] = j['id']
    #             form['france'] = j['fr']
    #             form['persian'] = i['persian']
    #             # form['ai_code_version'] = i['ai_code_version']
    #             f_data.append(
    #                 form
    #             )
    #             break
    # save_orjson(
    #     data=f_data,
    #     path="/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/temp2.json",
    # )
    # save_orjson(
    #     data=data_2,
    #     path="/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/unvalid_step1.json",
    # )

    ##########################################################################
    # input_path1 = '/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/setp2_per.json'
    # data_1 = load_orjson(input_path1)
    # input_path2 = '/home1/ava3/project/aiDataParser/task/france_translate/input_fr_title.json'
    # data_2 = load_orjson(input_path2)
    # undata = []
    # proccess_id = []
    # for i in data_2:
    #     proccess_id.append(str(i['id']))
    # for i in data_1:
    #     if isinstance(i, dict):
    #         for key in i.keys():
    #             if str(key) not in proccess_id:
    #                 undata.append(i)
    # print(f'--- translated qwen {len(undata)}')
    # # print(f'--- NOT translated {len(undata)}')
    # # save_orjson(
    # #     data=undata,
    # #     path='/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/step2_trnsalte_unfinished.json'
    # # )

    ######################################### make the final keyword dataset #################################
    # data1 = load_orjson(
    #     "/home1/ava3/project/aiDataParser/task/keyword_extractor/input/valid_mj_qa_sections.json"
    # )
    # data2 = load_orjson(
    #     "/home1/ava3/project/aiDataParser/task/keyword_extractor/output/merged_1.json"
    # )
    # data2_map = {item["id"]: item for item in data2}
    # clean_data = []
    # error_data = []
    # for i1 in data1:
    #     i2 = data2_map.get(i1["id"])
    #     if i2 is None:
    #         error_data.append(i1)
    #         continue
    #     try:
    #         i1["keyword_list"] = i2["keyword_list"]
    #         i1["ai_code_version"] = i2["ai_code_version"]
    #         clean_data.append(i1)
    #     except Exception:
    #         error_data.append(i2)
    # print(
    #     f'data1 {len(data1)}\n',
    #     f'data2 {len(data2)}\n',
    #     f'clean_data {len(clean_data)}\n',
    #     f'error_data {len(error_data)}\n',
    # )
    # save_orjson(
    #     path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/finall_dataset.json',
    #     data=clean_data
    # )
    # save_orjson(
    #     path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/error_data.json',
    #     data=error_data
    # )

    ##########################################################################
    print(":D")
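
    # A minimal, commented-out sketch of how count_tokens can be called; the model
    # name and prompts below are placeholders, not values used elsewhere in this
    # project. Any name containing "openai" or "oss" is routed to the tiktoken
    # (cl100k_base) branch, so no HuggingFace tokenizer download is needed here.
    # n_tokens = count_tokens(
    #     model_name="openai/gpt-oss-120b",
    #     system_prompt="You are a legal keyword extractor.",
    #     user_prompt="Extract the keywords from this section ...",
    # )
    # print(f"prompt tokens: {n_tokens}")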