# aiDataParser/core/data_normalizer.py
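"""Utility helpers for normalizing AI-generated JSON data.

Provides orjson-based load/save helpers, merging of batch output files with
id-based deduplication, recursive parsing of stringified JSON fields,
reformatting of AI results, and token counting for different model families.
"""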
import os
import json
import orjson
import tiktoken
from pathlib import Path


def load_orjson(path: str | Path):
    path = Path(path)
    with path.open("rb") as f:  # orjson works on bytes, so open in binary mode
        return orjson.loads(f.read())


def save_orjson(path, data):
    with open(path, "wb") as f:
        f.write(
            orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS)
        )
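
# Illustrative round trip (hypothetical path), in the same spirit as the
# commented examples under __main__ below. OPT_INDENT_2 pretty-prints the
# output and OPT_NON_STR_KEYS allows non-string dict keys:
# save_orjson("sample.json", {1: "one", "two": 2})
# print(load_orjson("sample.json"))  # -> {'1': 'one', 'two': 2} (keys become strings in JSON)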


def merge_json_dir(input_path, output_path):
    directory = Path(input_path)
    if not directory.is_dir():
        raise ValueError(f"Not a valid directory: {input_path}")
    seen_ids = set()  # ids already seen (fast membership checks)
    unique_data = []  # unique records only
    failed_files = []
    json_files = list(directory.glob("*.json"))
    if not json_files:
        print("⚠️ No JSON files found in this path")
        return
    for json_file in json_files:
        try:
            data = load_orjson(json_file)
            if not data:  # empty or None
                failed_files.append(json_file.name)
                continue
            # if isinstance(data, dict):
            #     unique_data.append(data)
            if isinstance(data, list) and isinstance(data[0], dict):
                for item in data:
                    item_id = item.get("id")
                    if item_id is None:
                        # No id: you could keep or drop the item here.
                        # We assume only items with a valid id matter.
                        continue
                    if item_id not in seen_ids:
                        seen_ids.add(item_id)
                        unique_data.append(item)
            else:
                raise ValueError(f"no list of dicts in this json -> {json_file}")
        except (json.JSONDecodeError, ValueError, OSError, KeyError, TypeError) as e:
            print(f"❌ Failed to process '{json_file.name}': {e}")
            failed_files.append(json_file.name)
    # Report failures
    if failed_files:
        print("\n❌ These files were skipped:")
        for name in failed_files:
            print(f" - {name}")
    else:
        print("\n✅ All JSON files merged")
    # Save the output
    try:
        save_orjson(data=unique_data, path=output_path)
        print(
            f"\n💾 Final file saved: {output_path} (Total unique items: {len(unique_data)})"
        )
    except Exception as e:
        print(f"❌ Error saving final file: {e}")


def make_new_proccessed_ids_from_file(json_in, out_path):
    data = load_orjson(json_in)
    finall_data = []
    for d in data:
        if d.get("id"):
            finall_data.append(d["id"])
    finall_data = list(set(finall_data))  # deduplicate ids
    print(f"-- len ids {len(finall_data)}")
    save_orjson(data=finall_data, path=out_path)


def __try_parse_json(value):
    """If value is a string that parses as JSON, return the parsed version."""
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
            # Only accept real JSON containers (not a bare number or word)
            if isinstance(parsed, (dict, list)):
                return parsed
            else:
                return value  # e.g. the string "123" should not be turned into a number
        except (json.JSONDecodeError, TypeError):
            pass
    return value


def __deep_parse_json_strings(obj):
    """Recursively parse every string value that contains JSON."""
    if isinstance(obj, dict):
        return {
            key: __deep_parse_json_strings(__try_parse_json(value))
            for key, value in obj.items()
        }
    elif isinstance(obj, list):
        return [__deep_parse_json_strings(__try_parse_json(item)) for item in obj]
    else:
        return obj
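
# Illustrative example with made-up data, showing what the deep parse does:
# __deep_parse_json_strings({"a": '{"b": [1, 2]}', "c": "plain text"})
# -> {"a": {"b": [1, 2]}, "c": "plain text"}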


def serialize_json_from_str_fields(json_in, out_path=None):
    if out_path is None:
        out_path = json_in
    # Load the data
    data = load_orjson(json_in)
    # Deep-parse every stringified JSON field
    cleaned_data = __deep_parse_json_strings(data)
    # Save the result
    save_orjson(data=cleaned_data, path=out_path)
    print(f"✅ all done '{out_path}'")


def make_format(in_path, out_path):
    data = load_orjson(in_path)
    f_data = []
    for i in data:
        form = {
            "id": None,
            "word": None,
            "ai_code_version": None,
            "ai_result": [],
        }
        form["id"] = i["id"]
        form["word"] = i["word"]
        form["ai_result"] = i["ai_code"]["ai_code"]["result"]
        form["ai_code_version"] = i["ai_code"]["ai_code_version"]
        f_data.append(form)
    save_orjson(data=f_data, path=out_path)


def count_tokens(model_name, system_prompt, user_prompt):
    """
    Accurately count the input tokens for different model families.
    """
    text = f"<|system|>\n{system_prompt}\n<|user|>\n{user_prompt}"
    # --- pick the tokenizer ---
    if "openai" in model_name.lower() or "oss" in model_name.lower():
        # GPT-like / OSS models use tiktoken
        enc = tiktoken.get_encoding("cl100k_base")
        tokens = enc.encode(text)
        return len(tokens)
    elif "gemma" in model_name.lower():
        from transformers import AutoTokenizer

        # SentencePiece tokenizer (Gemma via HuggingFace)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokens = tokenizer.encode(text)
        return len(tokens)
    elif "magistral" in model_name.lower() or "mistral" in model_name.lower():
        from transformers import AutoTokenizer

        # Mistral / Magistral tokenizer (BPE)
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        tokens = tokenizer.encode(text)
        return len(tokens)
    else:
        raise ValueError(f"Model {model_name} not recognized.")
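
# Illustrative usage (hypothetical model name and prompts); the "openai"/"oss"
# branch only needs tiktoken, the other branches download a HuggingFace tokenizer:
# n = count_tokens(
#     model_name="openai/gpt-oss-120b",
#     system_prompt="You are a legal keyword extractor.",
#     user_prompt="Extract the keywords from this section ...",
# )
# print(f"input tokens: {n}")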


# --- usage examples ---
if __name__ == "__main__":
    # ##### merge all output batch files into a single file
    # merge_json_dir(
    #     input_path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/batch_data',
    #     output_path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/merged_1.json'
    # )
    ###### build a processed-id file from the final output
    # make_new_proccessed_ids_from_file(
    #     json_in='/home1/ava3/word_bank_proccess/oss_120b_v1/merged_finall.json',
    #     out_path='/home1/ava3/word_bank_proccess/oss_120b_v1/proccessed_id.json',
    # )
    # parse all stringified JSON fields
    # serialize_json_from_str_fields(
    #     json_in='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/merged_1.json',
    #     out_path='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/merged_2.json'
    # )
    # finalize the format
    # make_format(
    #     in_path='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/data.json',
    #     out_path='/home1/ava3/keyword_simpify_proccess/data_keyword_gemma27/data_f1.json'
    # )
    # input_path = "/home1/ava3/word_bank_proccess/oss_120b_v1/merged_finall.json"
    # data = load_orjson(input_path)
    # print(f"------{len(data)}-----")
    # Truley = []
    # for i in data:
    #     if i['ai_code']['ai_code']['is_correct'] is True:  # and i['ai_code']['ai_code']['is_proper_noun'] is True
    #         Truley.append(i)
    # ---------------- filter words whose scope contains "حقوقی" (legal)
    # input_path = "/home1/ava3/word_bank_proccess/oss_120b_v1/merged_finall.json"
    # data = load_orjson(input_path)
    # Truley = []
    # for item in data:
    #     if "حقوقی" in item['ai_code']['ai_code']['scope'] and item['ai_code']['ai_code']["is_correct"] is True:
    #         Truley.append(item)
    # print(f'Truley {len(Truley)}')
    # save_orjson(
    #     data=Truley,
    #     path='/home1/ava3/project/aidataparser_test/motaradef_dataset/motaradef.json'
    # )
    ########################################################################
    # raw_data = '/home1/ava3/data/mj_qa_section.json'
    # raw_data = load_orjson(raw_data)
    # ignore_data = '/home1/ava3/data/ignore_mj_qa_sections.json'
    # ignore_data = load_orjson(ignore_data)
    # ignore_data = set(ignore_data)
    # f_data = []
    # for item in raw_data:
    #     if item['id'] not in ignore_data:
    #         f_data.append(item)
    # print(f'f_data {len(f_data)}')
    # save_orjson(
    #     data=f_data,
    #     path='/home1/ava3/data/valid_mj_qa_sections.json'
    # )
    # ##########################################################################
    # raw_data = load_orjson('/home1/ava3/data/valid_mj_qa_sections.json')
    # f_data = []
    # for item in raw_data:
    #     f_data.append(
    #         {
    #             'id': item['id'],
    #             'content': item['content']
    #         }
    #     )
    # save_orjson(
    #     data=f_data,
    #     path='/home1/ava3/data/valid_mj_qa_sections_light.json')
    ##################################### work with tree #####################################
    # fr_tree = '/home1/ava3/project/aiDataParser/task/match_code_fr_per/prompt_ir_def.json'
    # fr_tree = load_orjson(fr_tree)
    # code_title = []
    # for k, v in fr_tree.items():
    #     code_title.append(k)
    # save_orjson(
    #     data=code_title,
    #     path='/home1/ava3/project/aiDataParser/task/match_code_fr_per/all_code_title_persian.json'
    # )
    ##################################### make tree #####################################
    # fr_tree = '/home1/ava3/franc_legal_codes/translate/all_tree_franc.json'
    # fr_tree = load_orjson(fr_tree)
    # pr_data = '/home1/ava3/project/aiDataParser/task/france_translate/all_pr_fr_title_cleaned_1.json'
    # pr_data = load_orjson(pr_data)
    # def build_lookup(flat_list):
    #     lookup = {}
    #     for item in flat_list:
    #         key = item.get("france")
    #         key = key.strip()
    #         lookup[key] = item
    #     return lookup
    # def clean_title(title: str) -> str:
    #     import re
    #     title = re.sub(r'\n+', '', title)
    #     if r'-\s+' in title:
    #         title = title.replace(r'-\s+', '-')
    #     cleaned = re.sub(r'\s+', ' ', title)  # \s+ = any run of whitespace (space, \t, \n, \r, ...)
    #     return cleaned.strip()
    # def enrich_tree(node, flatted_list, lvl=0):
    #     title = clean_title(node["title"])
    #     # find the matching entry in the flat lookup
    #     enriched = flatted_list.get(title, {})
    #     persian = enriched.get("persian")
    #     if persian is None:
    #         unlist = {
    #             'fr_title': title,
    #             'pr_title': enriched.get('france')
    #         }
    #         print(unlist)
    #         return (unlist, 1)
    #     # build the new node
    #     new_node = {
    #         "france": title,
    #         "persian": enriched.get("persian"),  # default: keep the French title if no translation
    #         "id": enriched.get("id"),
    #         "level": lvl
    #         # "ai_code_version": enriched.get("ai_code_version"),
    #     }
    #     lvl += 1
    #     # add subsections if present
    #     if "sections" in node and node["sections"]:
    #         new_node["sections"] = [
    #             enrich_tree(child, flatted_list, lvl) for child in node["sections"]
    #         ]
    #     return new_node
    # f_data = []
    # lookup = build_lookup(pr_data)
    # for node in fr_tree:
    #     res = enrich_tree(node, lookup)
    #     if isinstance(res, tuple):
    #         res, _ = res
    #     f_data.append(res)
    # save_orjson(
    #     path='/home1/ava3/project/aiDataParser/task/france_translate/tree_test1_title.json',
    #     data=f_data
    # )
    ##########################################################################
    # input_per = '/home1/ava3/project/aiDataParser/task/france_translate/all_pr_fr_title.json'
    # input_per = load_orjson(input_per)
    # for i in input_per:
    #     title = i['persian']
    #     # strip the degree sign
    #     if '°' in title:
    #         title = title.replace('°', '')
    #     if '.' in title:
    #         title = title.replace('.', '')
    #     if ':' in title:
    #         title = title.split(':', 1)[1].strip()
    #     if ')' in title:  # and '(' not in title:
    #         colon_count1 = title.count(')')
    #         colon_count2 = title.count('(')
    #         if colon_count1 == colon_count2:
    #             continue
    #         else:
    #             # print(colon_count1)
    #             # print(title)
    #             title = title.split(')', 1)[1].strip()
    #             # print(title)
    #             # break
    #     i['persian'] = title
    # save_orjson(
    #     data=input_per,
    #     path='/home1/ava3/project/aiDataParser/task/france_translate/all_pr_fr_title_cleaned_1.json'
    # )
    ##########################################################################
    # input_path1 = "/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/temp1.json"
    # data_ = load_orjson(input_path1)
    # data_1 = []
    # unvalid = []
    # for i in data_:
    #     # if len(i['persian']) > 0 and isinstance(i['persian'], list) and bool(i['persian'][0] != None) and bool(i['persian'][0] != ''):
    #     data_1.append(
    #         {
    #             "id": str(i["id"]),
    #             # "ai_code_version": str(i["ai_code_version"]),
    #             'persian': i['fa']
    #         }
    #     )
    #     # else:
    #     #     unvalid.append(str(i["id"]))
    # input_path2 = "/home1/ava3/project/aiDataParser/task/france_translate/input_fr_title.json"
    # data_2 = load_orjson(input_path2)
    # data_2 = [
    #     {
    #         "id": str(i["id"]),
    #         "fr": str(i["fr"])
    #     }
    #     for i in data_2  # if str(i['id']) in unvalid
    # ]
    # # print(
    # #     f'-- data_1 {len(data_1)}\n',
    # #     f'-- data_2 {len(data_2)}\n',
    # # )
    # f_data = []
    # for i in data_1:
    #     form = {}
    #     for j in data_2:
    #         if i['id'] == j['id'] and i['id'] not in unvalid:
    #             form['id'] = j['id']
    #             form['france'] = j['fr']
    #             form['persian'] = i['persian']
    #             # form['ai_code_version'] = i['ai_code_version']
    #             f_data.append(form)
    #             break
    # save_orjson(
    #     data=f_data,
    #     path="/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/temp2.json",
    # )
    # save_orjson(
    #     data=data_2,
    #     path="/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/unvalid_step1.json",
    # )
    ##########################################################################
    # input_path1 = '/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/setp2_per.json'
    # data_1 = load_orjson(input_path1)
    # input_path2 = '/home1/ava3/project/aiDataParser/task/france_translate/input_fr_title.json'
    # data_2 = load_orjson(input_path2)
    # undata = []
    # proccess_id = []
    # for i in data_2:
    #     proccess_id.append(str(i['id']))
    # for i in data_1:
    #     if isinstance(i, dict):
    #         for key in i.keys():
    #             if str(key) not in proccess_id:
    #                 undata.append(i)
    # print(f'--- translated qwen {len(undata)}')
    # # print(f'--- NOT translated {len(undata)}')
    # # save_orjson(
    # #     data=undata,
    # #     path='/home1/ava3/project/aiDataParser/task/france_translate/all_title_persian/step2_trnsalte_unfinished.json'
    # # )
    ######################################### make final keyword dataset #################################
    # data1 = load_orjson(
    #     "/home1/ava3/project/aiDataParser/task/keyword_extractor/input/valid_mj_qa_sections.json"
    # )
    # data2 = load_orjson(
    #     "/home1/ava3/project/aiDataParser/task/keyword_extractor/output/merged_1.json"
    # )
    # data2_map = {item["id"]: item for item in data2}
    # clean_data = []
    # error_data = []
    # for i1 in data1:
    #     i2 = data2_map.get(i1["id"])
    #     if i2 is None:
    #         error_data.append(i1)
    #         continue
    #     try:
    #         i1["keyword_list"] = i2["keyword_list"]
    #         i1["ai_code_version"] = i2["ai_code_version"]
    #         clean_data.append(i1)
    #     except Exception:
    #         error_data.append(i2)
    # print(
    #     f'data1 {len(data1)}\n',
    #     f'data2 {len(data2)}\n',
    #     f'clean_data {len(clean_data)}\n',
    #     f'error_data {len(error_data)}\n',
    # )
    # save_orjson(
    #     path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/finall_dataset.json',
    #     data=clean_data
    # )
    # save_orjson(
    #     path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/error_data.json',
    #     data=error_data
    # )
    ##########################################################################
    print(":D")