This commit is contained in:
mdorstkar 2026-05-14 20:04:55 +03:30
parent c9b2ed340f
commit 39d014781d
8 changed files with 4407 additions and 78 deletions

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,6 @@
sn1
sn1
sn1
sn1
sn1
sn1

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -11,6 +11,11 @@ import asyncio
import traceback
from openai import AsyncOpenAI
import copy, asyncio, traceback
from openai import OpenAI, AsyncOpenAI, LengthFinishReasonError
from typing import List, Union
from pydantic import BaseModel
today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}'
SYSTEM_PROMPT = """
@ -148,6 +153,100 @@ async def single_simple_async_proccess_item(
traceback.print_exc()
raise RuntimeError(f"⚠️ Error in API call: {str(e)}")
# Structured-output schema handed to single_async_item as `output_schema`
# (see oss_test); the parsed model reply is expected to carry one string field.
# NOTE: documented with comments rather than a docstring on purpose -- pydantic
# copies a class docstring into the generated JSON schema ("description"),
# which would change the schema sent to the server.
class Result(BaseModel):
    # The model's answer as plain text.
    result : str
async def single_async_item(
    api_url,
    api_key,
    item,
    reasoning_effort,
    temperature,
    top_p,
    semaphore_number,
    model_name,
    priority=1,
    output_schema=None,
    max_token=4096,
    print_logs=False,
    return_reason=False,
    stop=None,
    return_used_token=False,
    timeout=300,
):
    """Send one structured chat-completion request and store the answer on ``item``.

    Args:
        api_url: Base URL of the OpenAI-compatible endpoint.
        api_key: API key passed to the client.
        item: Dict with a required ``"user_prompt"`` and an optional
            ``"system_prompt"``. It is mutated: ``item["llm_output"]`` is set.
        reasoning_effort: Forwarded as ``reasoning_effort`` to the request.
        temperature: Sampling temperature forwarded to the request.
        top_p: Nucleus-sampling value forwarded to the request.
        semaphore_number: Size of the semaphore created for this call.
        model_name: Model identifier forwarded to the request.
        priority: Sent in ``extra_body`` as ``{"priority": priority}``
            (presumably a server-side scheduling hint -- confirm with the
            serving backend).
        output_schema: Pydantic model class used as ``response_format`` and to
            re-validate the parsed object.
        max_token: Forwarded as ``max_tokens``.
        print_logs: When true, print the raw response object.
        return_reason: When true, ``llm_output`` becomes a tuple that also
            carries the reasoning text.
        stop: Stop sequence(s) forwarded to the request.
        return_used_token: When true *and* ``return_reason`` is true, the tuple
            also carries ``usage.total_tokens``. It has no effect when
            ``return_reason`` is false.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        * ``item`` with ``item["llm_output"]`` set to the dumped model, or to
          ``(dumped_model, reasoning)`` / ``(dumped_model, reasoning, tokens)``
          depending on the flags above;
        * ``{"error": ..., "raw": ...}`` when the response has no parsed object;
        * ``None`` on timeout or on any other exception (the error is printed).
    """
    try:
        async with AsyncOpenAI(base_url=api_url, api_key=api_key) as client:
            # NOTE(review): the semaphore is created inside this call, so it
            # only guards this single request and cannot throttle concurrent
            # calls. Sharing one semaphore across calls would need a signature
            # change, so the original behavior is kept.
            semaphore = asyncio.Semaphore(semaphore_number)
            async with semaphore:
                messages = [{"role": "user", "content": item["user_prompt"]}]
                if item.get("system_prompt"):
                    messages.insert(
                        0, {"role": "system", "content": item["system_prompt"]}
                    )

                coro = client.chat.completions.parse(
                    model=model_name,
                    messages=messages,
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_token,
                    stop=stop,
                    response_format=output_schema,
                    reasoning_effort=reasoning_effort,
                    extra_body={"priority": priority},
                )
                response = await asyncio.wait_for(coro, timeout=timeout)

                if print_logs:
                    print(f"parse response ---- {response}")

                message = response.choices[0].message
                parsed_obj = message.parsed
                if parsed_obj is None:
                    return {
                        "error": "Failed to parse response",
                        "raw": str(response),
                    }

                # Validate just in case (optional, since .parse already does it).
                parsed_obj = output_schema.model_validate(parsed_obj)

                if not return_reason:
                    item["llm_output"] = parsed_obj.model_dump()
                    return item

                # `reasoning_content` is a server-specific extra field; read it
                # defensively so a missing attribute does not discard an
                # otherwise valid answer.
                reasoning_content = getattr(message, "reasoning_content", None)
                if return_used_token:
                    _total_token = response.usage.total_tokens
                    item["llm_output"] = (
                        parsed_obj.model_dump(),
                        str(reasoning_content),
                        int(_total_token),
                    )
                else:
                    item["llm_output"] = (
                        parsed_obj.model_dump(),
                        str(reasoning_content),
                    )
                return item

    except asyncio.TimeoutError:
        print(f"⏳ Timeout on item {item}")
        return None
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns None,
        # which made the original message always end in "None".
        print(f"⚠️ Error __process_item {item}: {traceback.format_exc()}")
        return None
async def main():
with open('./leader_data/khamenei_messages_4.json', 'r', encoding='utf-8') as file:
data = json.load(file)
@ -221,9 +320,33 @@ async def main():
print(f'all_paragraphs: {all_paragraphs}')
print('---------------------------------------------')
async def oss_test(SYSTEM_PROMPT,USER_PROMPT,Dictt):
    """Run one structured request against the gpt-oss endpoint and return its output.

    The user prompt is ``USER_PROMPT`` followed by a newline and the string form
    of ``Dictt``; ``SYSTEM_PROMPT`` is sent as the system message.

    Returns:
        The ``llm_output`` value stored by ``single_async_item``. Because the
        call uses ``return_reason=True`` and ``return_used_token=True`` this is
        a ``(parsed_dict, reasoning_text, total_tokens)`` tuple.

    Raises:
        RuntimeError: when the request timed out, failed, or returned a
            response that could not be parsed (no ``llm_output`` available).
    """
    item = {
        'system_prompt': SYSTEM_PROMPT,
        'user_prompt': f"{USER_PROMPT}\n{Dictt}",
    }

    response = await single_async_item(
        api_url="http://2.188.15.102:8001/v1/",
        api_key="EMPTY",
        item=item,
        reasoning_effort="medium",
        temperature=0.1,
        top_p=1,
        semaphore_number=1,
        model_name="gpt-oss-120b",
        priority=1,
        output_schema=Result,
        max_token=None,
        return_reason=True,
        return_used_token=True,
        timeout=300
    )

    # single_async_item returns None on timeout/exception and an error dict
    # (without 'llm_output') on a parse failure; fail with a clear message
    # instead of an opaque TypeError/KeyError on the subscript below.
    if not isinstance(response, dict) or 'llm_output' not in response:
        raise RuntimeError(f"LLM request failed, no 'llm_output' returned: {response}")

    print(response['llm_output'])
    return response['llm_output']
if __name__ == "__main__":
asyncio.run(main())
# asyncio.run(main())
asyncio.run(oss_test())

View File

@ -6,6 +6,12 @@ import time
import datetime
from openai import OpenAI
from langchain_openai import ChatOpenAI
from nahj_get_metadata_oss import oss_test
import asyncio
import sqlite3
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}'
@ -271,16 +277,15 @@ paragraph_effect
مقادیر بین این دو (مثلاً 0.2 ، -0.4 ، 0.75) مجاز و نشاندهنده شدت نسبی هستند
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد:
{
"paragraph_id": str,
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (خیلی مهم) به صورت زیر باشد:
[{
"central_concepts": [
{
"concept": str,
"paragraph_effect": float
}
]
}
},...]
"""
SYSTEM_PROMPT_person = """
@ -321,11 +326,10 @@ paragraphs : لیستی از پاراگراف‌ها که هرکدام شامل:
شخصیتهای فرضی، نمادین یا کلی وارد نشوند
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد:
{
"paragraph_id": str,
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (بسیار مهم) به صورت زیر باشد:
[{
"persons": [str]
}
},...]
"""
SYSTEM_PROMPT_rules = """
@ -377,16 +381,15 @@ paragraphs : لیستی از پاراگراف‌ها که هرکدام شامل:
باید، لازم است، ضروری است، نیازمند است، واجب است، حیاتی است و مانند آن
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد:
{
"paragraph_id": str,
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (بسیار مهم) به صورت زیر باشد:
[{
"rules": [
{
"rule": str,
"type": "توصیفی" | "هنجاری"
}
]
}
},...]
"""
SYSTEM_PROMPT = SYSTEM_PROMPT_rules
@ -396,7 +399,7 @@ prompts = [SYSTEM_PROMPT_title,
SYSTEM_PROMPT_person,
SYSTEM_PROMPT_rules]
outs = ["-title","-central","-person","-rules"]
outs = ["title","central","person","rules"]
USER_PROMPT = '''
متن زیر را بر اساس دستورالعملهای سیستمی تحلیل کن و خروجی را در قالب دیکشنری پایتون ارائه بده:
@ -475,7 +478,7 @@ if __name__ == "__main__":
file_path = './nahj_data/nahj-metadata-jsonline.json'
for path in outs:
# for path in outs:
# 1. حذف فایل اگر وجود داشته باشد
if os.path.exists(file_path):
os.remove(file_path)
@ -484,7 +487,7 @@ if __name__ == "__main__":
with open(file_path, 'w') as f:
pass
output_metadata_json_path = f'./nahj_data/nahj-metadata{path}-TEST.json'
output_metadata_json_path = f'./nahj_data/nahj-metadata-TEST.json'
with open(input_data_path, 'r', encoding='utf-8') as file:
data = json.load(file)
SYSTEM_PROMPT = prompts[o]
@ -505,15 +508,15 @@ if __name__ == "__main__":
error_ids = []
test_enteries = []
all_paragraphs = 0
NN=-1 # این عدد صرفا برای آیدی استفاده میشود
period = 1
end = False
while True:
print(f"******* PERIOD :: {period} *******")
for index ,entery in enumerate(data, 1):
if index > 5:
NN+=1
if index > 799:
end = True
break
id = entery['id']
@ -531,6 +534,17 @@ if __name__ == "__main__":
print(f'id: {id} - record: {index}/{len(data)} - period: {period}')
for path in outs :
if path == "title":
SYSTEM_PROMPT = SYSTEM_PROMPT_title
elif path == "central":
SYSTEM_PROMPT = SYSTEM_PROMPT_central
elif path == "person":
SYSTEM_PROMPT = SYSTEM_PROMPT_person
elif path == "rules":
SYSTEM_PROMPT = SYSTEM_PROMPT_rules
llm_answer_data = ''
new_entry = {}
new_paragraphs = []
@ -545,21 +559,109 @@ if __name__ == "__main__":
# 'text': f"بخشی از {large_title} : {p['text'] }"
})
new_entry['paragraphs'] = new_paragraphs
try:
result_data = llm_request(new_entry)#gpt-4o
result_data = asyncio.run(oss_test(SYSTEM_PROMPT,USER_PROMPT,new_entry))#gpt-4o
# result_data = llm_request(new_entry)
llm_answer_data = text_to_dict(result_data)
if path == "title" :
entery['paragraph_metadata'] = []
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'].append({
'paragraph_id': sec['paragraph_id'],
'title': sec['title'],
'paragraph_type': sec['paragraph_type']
})
if len(llm_answer_data) != len(entery['paragraph_metadata']) :
print("error!!!!!!!!!!")
if entery['id'] not in error_ids:
error_ids.append(entery['id'])
if path == "central" :
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'][num]['central_concepts'] = sec['central_concepts']
if path == "person" :
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'][num]['persons'] = sec['persons']
if path == "rules" :
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'][num]['rules'] = sec['rules']
except Exception as e:
print(f'error id: {id} - {e} >> llm result: {result_data}')
# error_ids.append(id)
if id not in error_ids:
error_ids.append(id)
with open(current_peroid_errors_path, "a", encoding="utf-8") as f:
f.write(f"{id}\n")
continue
entery['paragraph_metadata'] = llm_answer_data
# entery['paragraph_metadata'] = llm_answer_data
context_id = id
title = entery['title']
large_title = entery['large_title']
url = entery['url']
typee = entery['type']
i_link = entery['interpretation_link']
N=0
for part in paragraphs :
id_ = f"num{NN}{N}"
text = part['text']
part_id = part['paragraph_id']
arabic_text = part['arabic_text']
ai_title = entery['paragraph_metadata'][N]['title']
paragraph_type = entery['paragraph_metadata'][N]['paragraph_type']
cursor.execute("INSERT INTO speeches (id, context_id, part_id, title, large_title, normalized_sentence, url, types, arabic_text, interpretation_links, ai_title, ai_paragraph_type) \
VALUES (?, ?, ?, ? ,? ,? ,? ,? ,? ,? ,? ,?)",
(id_, context_id, part_id, title, large_title, text, url, typee, arabic_text, i_link, ai_title, paragraph_type))
conn.commit()
central_concepts = entery['paragraph_metadata'][N]['central_concepts']
persons = entery['paragraph_metadata'][N]['persons']
rules = entery['paragraph_metadata'][N]['rules']
k=0
for row in central_concepts:
k+=1
c_id = context_id+part_id+f"c{k}"
concept = row['concept']
paragraph_effect = row['paragraph_effect']
cursor.execute("""INSERT INTO central_concepts (id, concept, paragraph_effect, part_id)
VALUES(?, ?, ?, ?)""",(c_id, concept, paragraph_effect, part_id))
conn.commit()
k=0
for row in persons:
k+=1
c_id = context_id+part_id+f"p{k}"
person = row
cursor.execute("""INSERT INTO persons (id, person, part_id)
VALUES(?, ?, ?)""",(c_id, person, part_id))
conn.commit()
k=0
for row in rules:
k+=1
c_id = context_id+part_id+f"r{k}"
rule = row['rule']
rule_type = row['type']
cursor.execute("""INSERT INTO rules (id, rule, type, part_id)
VALUES(?, ?, ?, ?)""",(c_id, rule, rule_type, part_id))
conn.commit()
N+=1
test_enteries.append(entery)
with open(output_metadata_jsonl_path, 'a', encoding='utf-8') as f:
json.dump(entery, f, ensure_ascii=False)
f.write('\n')
time.sleep(1)
passed_data_ids = find_passed_data_ids(output_metadata_jsonl_path)
@ -573,6 +675,8 @@ if __name__ == "__main__":
# with open(f'./leader_data/leader-metadata-bayanat-{id}.json', mode='w', encoding='utf-8') as file:
with open(output_metadata_json_path, mode='w', encoding='utf-8') as file:
result_message = json.dump(test_enteries, file, ensure_ascii=False, indent=2)
with open("./nahj_data/error_ids_TEST.json", mode='w', encoding='utf-8') as file:
result_message = json.dump(error_ids, file, ensure_ascii=False, indent=2)
print('all done!')
print('---------------------------------------------')