SmartMultiServerCore added

This commit is contained in:
init_mahdi 2026-03-04 21:17:45 +00:00
parent 7b3611e9cc
commit 826ee8fd35
7 changed files with 1787 additions and 187 deletions

View File

@ -1 +1,3 @@
from aiDataParser.core import AsyncCore
from aiDataParser.core.ai_parser import AsyncCore
from aiDataParser.core.ai_stream_parser import StreamAsyncCore
from aiDataParser.core.data_normalizer import load_orjson, save_orjson

View File

@ -1 +0,0 @@
from aiDataParser.core.ai_parser import AsyncCore

File diff suppressed because it is too large Load Diff

534
core/ai_stream_parser.py Normal file
View File

@ -0,0 +1,534 @@
from typing import List
from pathlib import Path
import os, orjson, time, json, re, asyncio, traceback
from openai import AsyncOpenAI, LengthFinishReasonError
# ------------------------------ پردازش API ------------------------------
class StreamAsyncCore:
def __init__(
self,
model_name,
task_name,
api_url,
output_schema=None,
data_path=None,
reasoning_effort="low",
top_p=1,
temperature=0.0,
max_token=128000,
output_path=None,
ai_code_version=None,
request_timeout=30, # ثانیه
api_key="EMPTY",
save_number=2,
semaphore_number=5,
):
self.unvalid_chunk = []
self.sample_chunk = []
self.save_number = save_number
# json file of data
self.data_path = data_path
self.semaphore_number = semaphore_number
self.task_name = task_name
if output_path is None:
output_path = f"./{task_name}"
self.output_path = Path(output_path)
self._temp_path = self.output_path / "batch_data"
self._temp_processed_id_path = self._temp_path / "processed_id.json"
# Create output directory and subdirectories if they don't exist
self.output_path.mkdir(parents=True, exist_ok=True)
self._temp_path.mkdir(parents=True, exist_ok=True)
# self._temp_processed_id_path.mkdir(parents=True, exist_ok=True)
self.request_timeout = request_timeout
self.model_name = model_name
self.api_key = api_key
self.output_schema = output_schema
self.api_url = api_url
self.reasoning_effort = reasoning_effort
self.top_p = top_p
self.temperature = temperature
self.max_token = max_token
if ai_code_version is None:
ai_code_version = f"{model_name}_{reasoning_effort}"
self.ai_code_version = ai_code_version
self.PRIMARY_KEY = {"system_prompt", "user_prompt", "id"}
if data_path != None:
try:
self.data = self.__data_process()
print(f"📦 Loaded {len(self.data)} words")
except Exception as e:
raise ValueError(
f"Data loading/validation failed: {e}\n{traceback.format_exc()}"
)
def __validate_item(self, item, idx):
# Mandatory fields
for key in self.PRIMARY_KEY:
if key not in item:
raise ValueError(f"Missing mandatory key '{key}' in item #{idx}")
if not isinstance(item[key], str):
raise TypeError(
f"Item #{idx}: '{key}' must be a string, got {type(item[key]).__name__}"
)
# Optional field: assistant_prompt
if "assistant_prompt" not in item or item["assistant_prompt"] is None:
item["assistant_prompt"] = None
else:
if not isinstance(item["assistant_prompt"], str):
raise TypeError(
f"Item #{idx}: 'assistant_prompt' must be a string or absent, got {type(item['assistant_prompt']).__name__}"
)
return item # now normalized
def __data_process(self):
raw_data = self.__load_orjson(self.data_path)
if not isinstance(raw_data, list):
raise ValueError("Data must be a list of dictionaries.")
processed_data = []
for idx, item in enumerate(raw_data):
if not isinstance(item, dict):
raise ValueError(f"Item #{idx} is not a dictionary.")
validated_item = self.__validate_item(item, idx)
processed_data.append(validated_item)
return processed_data
def __get_max_number_file(self, directory):
# Pattern to match filenames like out_1.json, out_25.json, etc.
pattern = re.compile(r"output_(\d+)\.json$")
max_num = 0
for filename in os.listdir(directory):
match = pattern.match(filename)
if match:
num = int(match.group(1))
if num > max_num:
max_num = num
return max_num + 1
def __load_orjson(self, path: str | Path):
path = Path(path)
with path.open("rb") as f: # باید باینری باز بشه برای orjson
return orjson.loads(f.read())
def __save_orjson(self, path, data):
with open(path, "wb") as f:
f.write(
orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_NON_STR_KEYS)
)
def merge_json_dir(self, input_path, output_path):
directory = Path(input_path)
if not directory.is_dir():
raise ValueError(f"Not valid PATH: {input_path}")
seen_ids = set() # برای ردیابی idهای دیده‌شده (سریع!)
unique_data = [] # فقط داده‌های یکتا
failed_files = []
json_files = list(directory.glob("*.json"))
if not json_files:
print("⚠️ NO JSON File Found In This PATH")
return
for json_file in json_files:
try:
data = self.__load_orjson(json_file)
if not data: # خالی یا None
failed_files.append(json_file.name)
continue
if isinstance(data, list) and isinstance(data[0], dict):
for item in data:
item_id = item.get("id")
if item_id is None:
# اگر id نداشت، می‌تونی تصمیم بگیری: نگه داری یا ردش کنی
# اینجا فرض می‌کنیم فقط مواردی با id معتبر مهم هستند
continue
if item_id not in seen_ids:
seen_ids.add(item_id)
unique_data.append(item)
else:
raise ValueError(f"no list available in this json -> {json_file}")
except (
json.JSONDecodeError,
ValueError,
OSError,
KeyError,
TypeError,
) as e:
# print(f"❌ Failed in process '{json_file.name}': {e}")
failed_files.append(json_file.name)
# گزارش خطاها
if failed_files:
print("\n❌ We lose this file:")
for name in failed_files:
print(f" - {name}")
else:
print("\n✅ All JSON added")
# ذخیره خروجی
try:
self.__save_orjson(data=unique_data, path=output_path)
print(
f"\n💾 Final file saved: {output_path} (Total unique items: {len(unique_data)})"
)
except Exception as e:
print(f"❌ Error in saving final file: {e}")
def make_new_proccessed_ids_from_file(self, json_in, out_path):
data = self.__load_orjson(json_in)
finall_data = []
for d in data:
if d["id"]:
finall_data.append(d["id"])
finall_data = set(finall_data)
finall_data = list(finall_data)
print(f"-- len ids {len(finall_data)}")
self.__save_orjson(data=finall_data, path=out_path)
# ------------------------------ Main ------------------------------
async def __process_item(self, client, item):
try:
messages = [
{"role": "user", "content": item["user_prompt"]},
]
if item.get("system_prompt"):
messages.append({"role": "system", "content": item["system_prompt"]})
if item.get("assistant_prompt"):
messages.append(
{"role": "assistant", "content": item["assistant_prompt"]}
)
response = await client.chat.completions.parse(
model=self.model_name,
messages=messages,
temperature=self.temperature,
top_p=self.top_p,
reasoning_effort=self.reasoning_effort,
max_tokens=self.max_token,
stop=None,
response_format=self.output_schema,
)
parsed = (
response.choices[0].message.parsed
if response and response.choices and response.choices[0].message.parsed
else {"raw_text": str(response)}
)
parsed = self.output_schema.model_validate(parsed)
parsed = parsed.model_dump()
parsed = dict(parsed)
parsed["ai_code_version"] = self.ai_code_version
parsed["id"] = item["id"]
# parsed["item"] = item
return parsed, 200
except asyncio.TimeoutError:
print(f"⏳ Timeout on item {item['id']}")
return None, 408
except Exception as e:
print(f"⚠️ Error __process_item {item['id']}: {traceback.print_exc()}")
return None, 400
def async_eval(self, processed_id: List = []):
try:
asyncio.run(self.__async_eval(processed_id))
except KeyboardInterrupt:
print("\n🛑 Interrupted by user.")
traceback.print_exc()
async def __async_eval(self, processed_id: List):
"""
اجرای اصلی تکهستهای و async برای تولید خروجی نهایی.
"""
print("🔹 Starting async data processing...")
# ------------------ مرحله ۱: بازیابی شناسه‌های قبلاً پردازش‌شده ------------------
if not processed_id:
try:
processed_id = self.__load_orjson(self._temp_processed_id_path)
print(
f"📂 Loaded existing processed_id from {self._temp_processed_id_path}"
)
except Exception:
print("⚠️ No valid processed_id found. Starting fresh.")
processed_id = []
# ------------------ مرحله ۲: آماده‌سازی داده‌ها ------------------
all_processed_id = set(processed_id)
all_results = []
total_time = []
data = [item for item in self.data if item.get("id") not in all_processed_id]
print(
f" Total items: {len(self.data)} - {len(all_processed_id)} = {len(data)}"
)
# اگر چیزی برای پردازش نیست
if not data:
print("✅ Nothing new to process. All items are already done.")
return
# ------------------ مرحله ۳: شروع پردازش ------------------
print(f"🤖 Model: {self.model_name} | Reasoning: {self.reasoning_effort}")
async with AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) as client:
semaphore = asyncio.Semaphore(5)
async def limited_process(item):
async with semaphore:
return await self.__process_item(client, item)
tasks = [asyncio.create_task(limited_process(item)) for item in data]
total_i = 0
# ✅ پردازش به ترتیب تکمیل (نه ترتیب لیست)
for i, task in enumerate(asyncio.as_completed(tasks), start=1):
start = time.time()
try:
parsed, status_code = await asyncio.wait_for(
task, timeout=self.request_timeout
) # ⏱ حداکثر 2 دقیقه
except asyncio.TimeoutError:
print(f"⏳ Task {i} timed out completely")
parsed, status_code = None, 408
total_time.append(time.time() - start)
if status_code == 200:
all_results.append(parsed)
all_processed_id.add(parsed.get("id"))
else:
print(f"⚠️ Skipped item (status={status_code})")
total_i += 1
# ✅ ذخیره‌ی موقت هر n مورد
if total_i >= self.save_number:
print(f"total_i {total_i}")
print(f"self.save_number {self.save_number}")
total_i = 0
self.__save_orjson(
data=list(all_processed_id),
path=self._temp_processed_id_path,
)
print(f"💾 Auto-saved processed ids: {len(all_processed_id)}")
number = self.__get_max_number_file(self._temp_path)
print(f"number {number}")
temp_output_path = self._temp_path / f"output_{number}.json"
self.__save_orjson(data=list(all_results), path=temp_output_path)
print(f"💾 Auto-saved partial data: {len(all_results)}")
all_results.clear()
# ✅ بعد از پایان تمام تسک‌ها، ذخیره نهایی برای داده‌های باقیمانده
if total_i > 0 or len(all_results) > 0:
print("💾 Final save of remaining data...")
self.__save_orjson(
data=list(all_processed_id),
path=self._temp_processed_id_path,
)
print(f"💾 Auto-saved processed ids: {len(all_processed_id)}")
number = self.__get_max_number_file(self._temp_path)
print(f"number {number}")
temp_output_path = self._temp_path / f"output_{number}.json"
self.__save_orjson(data=list(all_results), path=temp_output_path)
print(f"💾 Auto-saved partial data: {len(all_results)}")
all_results.clear()
# ------------------ مرحله ۴: ذخیره خروجی ------------------
final_data_path = self.output_path / f"final_data_{self.task_name}.json"
processed_id_path = self.output_path / "processed_id.json"
self.merge_json_dir(input_path=self._temp_path, output_path=final_data_path)
all_results = self.__load_orjson(final_data_path)
# make_new_proccessed_ids_from_file()
self.__save_orjson(data=list(all_processed_id), path=processed_id_path)
self.__save_orjson(data=all_results, path=final_data_path)
avg_time = (sum(total_time) / len(total_time)) if total_time else 0
print(
f"\n✅ Processing completed!\n"
f"📊 Total-Data: {len(data)} | "
f"⭕ Ignored-Data: {len(processed_id)} | "
f"📦 Proccessed-Data: {len(all_results)} | "
f"❌ Loss-Data: {len(data)-len(all_results)} | "
f"🕒 Avg Time: {avg_time:.2f}'s per item | "
f"🕒 Total Time: {sum(total_time):.4f}'s | "
f"💾 Results saved to: {final_data_path}"
)
async def single_async_item(
self, item, reasoning_effort, temperature, top_p, output_schema=None, max_token=4096,
stream=False, print_logs=False, return_reason=False, stop=None, return_used_token=False,
):
try:
async with AsyncOpenAI(base_url=self.api_url, api_key=self.api_key) as client:
semaphore = asyncio.Semaphore(self.semaphore_number)
async with semaphore:
messages = [{"role": "user", "content": item["user_prompt"]}]
if item.get("system_prompt"):
messages.insert(0, {"role": "system", "content": item["system_prompt"]})
if item.get("assistant_prompt"):
messages.append({"role": "assistant", "content": item["assistant_prompt"]})
# output_schema =None
if output_schema is not None:
# Use .parse for structured output
response = await client.chat.completions.parse(
model=self.model_name,
messages=messages,
temperature=temperature,
top_p=top_p,
max_tokens=max_token,
stop=stop,
response_format=output_schema,
reasoning_effort=reasoning_effort
)
if print_logs:
print(f'parse response ---- {response}')
parsed_obj = response.choices[0].message.parsed
# print(f'parsed_obj {parsed_obj}')
if parsed_obj is None:
return {"error": "Failed to parse response", "raw": str(response)}
# Validate just in case (optional, چون .parse already does it)
if return_reason:
reasoning_content = response.choices[0].message.reasoning_content
if return_used_token:
_total_token = response.usage.total_tokens
return output_schema.model_validate(parsed_obj), str(reasoning_content), int(_total_token)
return output_schema.model_validate(parsed_obj), str(reasoning_content)
return output_schema.model_validate(parsed_obj)
else:
# Use .create for raw text
response = await client.chat.completions.create(
model=self.model_name,
messages=messages,
temperature=temperature,
top_p=top_p,
max_tokens=max_token,
stop=None,
stream=stream,
reasoning_effort=reasoning_effort
# No response_format
)
# print(f'response-stream {stream}-> {response}')
content = response.choices[0].message.content
if return_reason:
reasoning_content = response.choices[0].message.reasoning_content
if return_used_token:
_total_token = response.usage.total_tokens
return content, str(reasoning_content), int(_total_token)
return content, str(reasoning_content)
if not content:
return {"error": "Empty response", "raw": str(response)}
return content
except LengthFinishReasonError as le:
# اینجاست که جادو اتفاق می‌افته
if max_token >= 50000:
return {
"error": "MAX_TOKEN_LIMIT_REACHED",
"max_token": max_token
}
new_max = min(max_token * 2, 50000)
return await self.single_async_item(
item=item,
reasoning_effort=reasoning_effort,
temperature=temperature,
top_p=top_p,
output_schema=output_schema,
max_token=new_max,
stream=stream,
print_logs=print_logs,
return_reason=return_reason,
return_used_token=return_used_token,
stop=stop
)
except asyncio.TimeoutError:
print(f"⏳ Timeout on item {item}")
return None
except Exception as e:
print(f"⚠️ Error __process_item {item}: {traceback.print_exc()}")
return None
async def single_async_item_stream(
self,
item,
reasoning_effort,
temperature,
top_p,
max_token=None,
):
async with AsyncOpenAI(
base_url=self.api_url,
api_key=self.api_key
) as client:
messages = [{"role": "user", "content": item["user_prompt"]}]
if item.get("system_prompt"):
messages.insert(0, {"role": "system", "content": item["system_prompt"]})
stream = await client.chat.completions.create(
model=self.model_name,
messages=messages,
temperature=temperature,
top_p=top_p,
max_tokens=max_token,
reasoning_effort=reasoning_effort,
stream=True, # ⭐ مهم
# stream_options=
)
# c = 0
# v = 0
# v1 = 0
# q = 0
async for chunk in stream:
# if c == 0:
# self.sample_chunk.append(chunk)
# c += 1
# print(f'c {c}')
if not chunk.choices:
continue
delta = chunk.choices[0].delta
if delta and delta.content:
# if v1 == 0:
# self.sample_chunk.append(chunk)
# v += 1
# v1 += 1
# print(f'v {v}')
yield delta.content
# if v != c:
# if q == 0:
# self.sample_chunk.append(chunk)
# q +=1
# print('add-unvalid_chunk')
# self.unvalid_chunk.append(chunk)
# v = c

View File

@ -35,6 +35,7 @@ def merge_json_dir(input_path, output_path):
for json_file in json_files:
try:
data = load_orjson(json_file)
# print(f'data {type(data)}')
if not data: # خالی یا None
failed_files.append(json_file.name)
continue
@ -44,6 +45,9 @@ def merge_json_dir(input_path, output_path):
if isinstance(data, list) and isinstance(data[0], dict):
for item in data:
item_id = item.get("id")
if not item_id:
item_id = item.get("_id")
if item_id is None:
# اگر id نداشت، می‌تونی تصمیم بگیری: نگه داری یا ردش کنی
# اینجا فرض می‌کنیم فقط مواردی با id معتبر مهم هستند
@ -188,10 +192,10 @@ def count_tokens(model_name, system_prompt, user_prompt):
# --- نحوه استفاده ---
if __name__ == "__main__":
# ##### یکی کردن تمام بچ های خروجی در یک فایل
# merge_json_dir(
# input_path= '/home1/ava3/project/aiDataParser/task/keyword_extractor/output/batch_data',
# output_path='/home1/ava3/project/aiDataParser/task/keyword_extractor/output/merged_1.json'
# )
merge_json_dir(
input_path= '/home1/ava3/init_mahdi/project/rule_extractor/im_based/data/sequential/subject_unity/output_6152/batch_data',
output_path='/home1/ava3/init_mahdi/project/rule_extractor/im_based/data/sequential/subject_unity/output_6152/final_data_subject_unity.json'
)
###### ساخت یک proccessed id از فایل نهایی
# make_new_proccessed_ids_from_file(
@ -507,3 +511,4 @@ if __name__ == "__main__":
##########################################################################
print(":D")
# python3 -m aiDataParser.core.data_normalizer

View File

@ -1,43 +1,72 @@
######################################################################1
class Output(BaseModel):
simplify_list : List[str]
# ######################################################################
# import asyncio, time
# from pydantic import BaseModel
if __name__ == '__main__':
ruuner = AsyncCore(
model_name='gpt-oss-120b',
data_path='/home1/ava3/project/aiDataParser/task/simplify/input/prompt.json',
output_path='/home1/ava3/project/aiDataParser/task/simplify/output',
api_url="http://172.16.29.102:8001/v1/",
task_name='simplify-all-v1-oss-120b-med',
output_schema=Output,
reasoning_effort='medium',
ai_code_version='oss120b_med',
request_timeout=60,
save_number=2,
max_token=50000,
)
ruuner.async_eval()
######################################################################2
# class Response(BaseModel):
# result: int
class Output(BaseModel):
simplify_list : List[str]
if __name__ == '__main__':
ruuner = AsyncCore(
model_name='gpt-oss-120b',
data_path='/home1/ava3/project/aiDataParser/task/simplify/input/prompt.json',
output_path='/home1/ava3/project/aiDataParser/task/simplify/output',
api_url="http://172.16.29.102:8001/v1/",
task_name='simplify-all-v1-oss-120b-med',
output_schema=Output,
reasoning_effort='medium',
ai_code_version='oss120b_med',
request_timeout=60,
save_number=2,
max_token=50000,
)
# async def main():
# runner = SmartMultiServerCore()
# start = time.time()
# # res = await runner.single_item(user_prompt="سلام جواب این عبارت ریاضی چی میشه ؟ 145/5*2")
# res = await runner.single_item(
# user_prompt="سلام جواب این عبارت ریاضی چی میشه ؟ 145/5*2",
# # output_schema=Response,
# )
# end = time.time()
# print(f"res {res}", f"{end - start:.2f}", sep="\n")
llm_answer, _ = await RUNNER_PROMPT.single_simple_async_proccess_item(
item={"user_prompt": prompt, "system_prompt": SYSTEM_PROPMT2},
)
# asyncio.run(main())
# ######################################################################
# class Output(BaseModel):
# simplify_list : List[str]
# if __name__ == '__main__':
# ruuner = AsyncCore(
# model_name='gpt-oss-120b',
# data_path='/home1/ava3/project/aiDataParser/task/simplify/input/prompt.json',
# output_path='/home1/ava3/project/aiDataParser/task/simplify/output',
# api_url="http://172.16.29.102:8001/v1/",
# task_name='simplify-all-v1-oss-120b-med',
# output_schema=Output,
# reasoning_effort='medium',
# ai_code_version='oss120b_med',
# request_timeout=60,
# save_number=2,
# max_token=50000,
# )
# ruuner.async_eval()
# ######################################################################2
# class Output(BaseModel):
# simplify_list : List[str]
# if __name__ == '__main__':
# ruuner = AsyncCore(
# model_name='gpt-oss-120b',
# data_path='/home1/ava3/project/aiDataParser/task/simplify/input/prompt.json',
# output_path='/home1/ava3/project/aiDataParser/task/simplify/output',
# api_url="http://172.16.29.102:8001/v1/",
# task_name='simplify-all-v1-oss-120b-med',
# output_schema=Output,
# reasoning_effort='medium',
# ai_code_version='oss120b_med',
# request_timeout=60,
# save_number=2,
# max_token=50000,
# )
# llm_answer, _ = await RUNNER_PROMPT.single_simple_async_proccess_item(
# item={"user_prompt": prompt, "system_prompt": SYSTEM_PROPMT2},
# )
# ######################################################################2
# from aiDataParser.core.data_normalizer import merge_json_dir
# merge_json_dir(input_path="/home1/ava3/init_mahdi/data/mj_qa_section", output_path="/home1/ava3/init_mahdi/data/mj_qa_section_28_11_1404.json")
# python3 -m aiDataParser.core.example

View File

@ -1 +1,4 @@
orjson
orjson
openpyxl
beautifulsoup4
sacrebleu