This commit is contained in:
mdorstkar 2026-05-14 20:04:55 +03:30
parent c9b2ed340f
commit 39d014781d
8 changed files with 4407 additions and 78 deletions

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,6 @@
sn1
sn1
sn1
sn1
sn1
sn1

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -11,6 +11,11 @@ import asyncio
import traceback import traceback
from openai import AsyncOpenAI from openai import AsyncOpenAI
import copy, asyncio, traceback
from openai import OpenAI, AsyncOpenAI, LengthFinishReasonError
from typing import List, Union
from pydantic import BaseModel
today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}' today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}'
SYSTEM_PROMPT = """ SYSTEM_PROMPT = """
@ -148,6 +153,100 @@ async def single_simple_async_proccess_item(
traceback.print_exc() traceback.print_exc()
raise RuntimeError(f"⚠️ Error in API call: {str(e)}") raise RuntimeError(f"⚠️ Error in API call: {str(e)}")
# Structured-output schema passed to the LLM call as ``output_schema``.
# Kept as ``#`` comments on purpose: a class docstring would be emitted as the
# schema "description" and change what is sent to the model.
class Result(BaseModel):
    # The whole answer is carried as one string field.
    result: str
def _build_chat_messages(item):
    """Return the chat turns for *item*: an optional system turn, then the user turn."""
    messages = [{"role": "user", "content": item["user_prompt"]}]
    if item.get("system_prompt"):
        messages.insert(0, {"role": "system", "content": item["system_prompt"]})
    return messages


async def single_async_item(
    api_url,
    api_key,
    item,
    reasoning_effort,
    temperature,
    top_p,
    semaphore_number,
    model_name,
    priority=1,
    output_schema=None,
    max_token=4096,
    print_logs=False,
    return_reason=False,
    stop=None,
    return_used_token=False,
    timeout=300,
):
    """Send one prompt to an OpenAI-compatible endpoint and parse a structured reply.

    Args:
        api_url: Base URL of the OpenAI-compatible server.
        api_key: API key passed to the client.
        item: Dict holding ``"user_prompt"`` and, optionally, ``"system_prompt"``.
            It is mutated in place: the result is stored under ``"llm_output"``.
        reasoning_effort: Forwarded as ``reasoning_effort`` to the request.
        temperature: Sampling temperature forwarded to the request.
        top_p: Nucleus-sampling value forwarded to the request.
        semaphore_number: Size of the semaphore created for this call.
        model_name: Model identifier forwarded to the request.
        priority: Sent to the server as ``extra_body={"priority": priority}``.
        output_schema: Pydantic model class used as ``response_format`` and to
            re-validate the parsed object.
        max_token: Forwarded as ``max_tokens``.
        print_logs: When true, print the raw response object.
        return_reason: When true, ``"llm_output"`` becomes a tuple that also
            carries the reasoning text.
        stop: Stop sequence(s) forwarded to the request.
        return_used_token: When true *and* ``return_reason`` is true, the tuple
            additionally carries ``response.usage.total_tokens``. It is ignored
            when ``return_reason`` is false.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        * ``item`` with ``item["llm_output"]`` set to the dumped model, or to
          ``(dump, reasoning)`` / ``(dump, reasoning, total_tokens)`` depending
          on the flags above;
        * ``{"error": ..., "raw": ...}`` when the server reply carried no parsed
          object;
        * ``None`` on timeout or on any other exception (the error is printed).
    """
    try:
        async with AsyncOpenAI(base_url=api_url, api_key=api_key) as client:
            # NOTE(review): this semaphore is created per call, so it cannot
            # throttle concurrent callers; a shared semaphore would be needed
            # for that. Kept to preserve the existing interface.
            semaphore = asyncio.Semaphore(semaphore_number)
            async with semaphore:
                coro = client.chat.completions.parse(
                    model=model_name,
                    messages=_build_chat_messages(item),
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_token,
                    stop=stop,
                    response_format=output_schema,
                    reasoning_effort=reasoning_effort,
                    extra_body={"priority": priority},
                )
                response = await asyncio.wait_for(coro, timeout=timeout)
                if print_logs:
                    print(f"parse response ---- {response}")

                message = response.choices[0].message
                parsed_obj = message.parsed
                if parsed_obj is None:
                    return {
                        "error": "Failed to parse response",
                        "raw": str(response),
                    }

                # Validate just in case (optional, since .parse already does it).
                parsed_obj = output_schema.model_validate(parsed_obj)
                payload = parsed_obj.model_dump()

                if not return_reason:
                    item["llm_output"] = payload
                    return item

                # Not every server returns reasoning text; a missing attribute
                # must not discard an otherwise valid parsed result.
                reasoning_content = str(getattr(message, "reasoning_content", None))
                if return_used_token:
                    item["llm_output"] = (
                        payload,
                        reasoning_content,
                        int(response.usage.total_tokens),
                    )
                else:
                    item["llm_output"] = (payload, reasoning_content)
                return item
    except asyncio.TimeoutError:
        print(f"⏳ Timeout on item {item}")
        return None
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns None and
        # would make this message end in ": None".
        print(f"⚠️ Error __process_item {item}: {traceback.format_exc()}")
        return None
async def main(): async def main():
with open('./leader_data/khamenei_messages_4.json', 'r', encoding='utf-8') as file: with open('./leader_data/khamenei_messages_4.json', 'r', encoding='utf-8') as file:
data = json.load(file) data = json.load(file)
@ -221,9 +320,33 @@ async def main():
print(f'all_paragraphs: {all_paragraphs}') print(f'all_paragraphs: {all_paragraphs}')
print('---------------------------------------------') print('---------------------------------------------')
async def oss_test(SYSTEM_PROMPT, USER_PROMPT, Dictt):
    """Run one structured-output request against the gpt-oss-120b endpoint.

    Args:
        SYSTEM_PROMPT: Text sent as the system turn.
        USER_PROMPT: Instruction text placed at the start of the user turn.
        Dictt: Payload appended to the user turn on a new line (formatted with
            ``str()`` via the f-string).

    Returns:
        The ``"llm_output"`` value produced by ``single_async_item``. Because
        both ``return_reason`` and ``return_used_token`` are enabled here, that
        is a ``(parsed_dict, reasoning_text, total_tokens)`` tuple.

    Raises:
        RuntimeError: If the request failed, timed out, or the reply could not
            be parsed. Previously these cases surfaced as an opaque
            ``TypeError`` / ``KeyError`` from subscripting the response.
    """
    item = {
        "system_prompt": SYSTEM_PROMPT,
        "user_prompt": f"{USER_PROMPT}\n{Dictt}",
    }
    response = await single_async_item(
        api_url="http://2.188.15.102:8001/v1/",
        api_key="EMPTY",
        item=item,
        reasoning_effort="medium",
        temperature=0.1,
        top_p=1,
        semaphore_number=1,
        model_name="gpt-oss-120b",
        priority=1,
        output_schema=Result,
        max_token=None,
        return_reason=True,
        return_used_token=True,
        timeout=300,
    )

    # single_async_item returns None on timeout or any request error.
    if response is None:
        raise RuntimeError(
            "oss_test: LLM request failed or timed out "
            "(single_async_item returned None)"
        )
    # On a parse failure it returns an {"error", "raw"} dict instead of the item.
    if "llm_output" not in response:
        raise RuntimeError(
            f"oss_test: LLM reply could not be parsed: {response.get('error')}"
        )

    print(response['llm_output'])
    return response['llm_output']
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) # asyncio.run(main())
asyncio.run(oss_test())

View File

@ -6,6 +6,12 @@ import time
import datetime import datetime
from openai import OpenAI from openai import OpenAI
from langchain_openai import ChatOpenAI from langchain_openai import ChatOpenAI
from nahj_get_metadata_oss import oss_test
import asyncio
import sqlite3
conn = sqlite3.connect('./db/nahj.db')
cursor = conn.cursor()
today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}' today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}'
@ -271,16 +277,15 @@ paragraph_effect
مقادیر بین این دو (مثلاً 0.2 ، -0.4 ، 0.75) مجاز و نشاندهنده شدت نسبی هستند مقادیر بین این دو (مثلاً 0.2 ، -0.4 ، 0.75) مجاز و نشاندهنده شدت نسبی هستند
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد: ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (خیلی مهم) به صورت زیر باشد:
{ [{
"paragraph_id": str,
"central_concepts": [ "central_concepts": [
{ {
"concept": str, "concept": str,
"paragraph_effect": float "paragraph_effect": float
} }
] ]
} },...]
""" """
SYSTEM_PROMPT_person = """ SYSTEM_PROMPT_person = """
@ -321,11 +326,10 @@ paragraphs : لیستی از پاراگراف‌ها که هرکدام شامل:
شخصیتهای فرضی، نمادین یا کلی وارد نشوند شخصیتهای فرضی، نمادین یا کلی وارد نشوند
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد: ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (بسیار مهم) به صورت زیر باشد:
{ [{
"paragraph_id": str,
"persons": [str] "persons": [str]
} },...]
""" """
SYSTEM_PROMPT_rules = """ SYSTEM_PROMPT_rules = """
@ -377,16 +381,15 @@ paragraphs : لیستی از پاراگراف‌ها که هرکدام شامل:
باید، لازم است، ضروری است، نیازمند است، واجب است، حیاتی است و مانند آن باید، لازم است، ضروری است، نیازمند است، واجب است، حیاتی است و مانند آن
ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد: ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (بسیار مهم) به صورت زیر باشد:
{ [{
"paragraph_id": str,
"rules": [ "rules": [
{ {
"rule": str, "rule": str,
"type": "توصیفی" | "هنجاری" "type": "توصیفی" | "هنجاری"
} }
] ]
} },...]
""" """
SYSTEM_PROMPT = SYSTEM_PROMPT_rules SYSTEM_PROMPT = SYSTEM_PROMPT_rules
@ -396,7 +399,7 @@ prompts = [SYSTEM_PROMPT_title,
SYSTEM_PROMPT_person, SYSTEM_PROMPT_person,
SYSTEM_PROMPT_rules] SYSTEM_PROMPT_rules]
outs = ["-title","-central","-person","-rules"] outs = ["title","central","person","rules"]
USER_PROMPT = ''' USER_PROMPT = '''
متن زیر را بر اساس دستورالعملهای سیستمی تحلیل کن و خروجی را در قالب دیکشنری پایتون ارائه بده: متن زیر را بر اساس دستورالعملهای سیستمی تحلیل کن و خروجی را در قالب دیکشنری پایتون ارائه بده:
@ -475,7 +478,7 @@ if __name__ == "__main__":
file_path = './nahj_data/nahj-metadata-jsonline.json' file_path = './nahj_data/nahj-metadata-jsonline.json'
for path in outs: # for path in outs:
# 1. حذف فایل اگر وجود داشته باشد # 1. حذف فایل اگر وجود داشته باشد
if os.path.exists(file_path): if os.path.exists(file_path):
os.remove(file_path) os.remove(file_path)
@ -484,7 +487,7 @@ if __name__ == "__main__":
with open(file_path, 'w') as f: with open(file_path, 'w') as f:
pass pass
output_metadata_json_path = f'./nahj_data/nahj-metadata{path}-TEST.json' output_metadata_json_path = f'./nahj_data/nahj-metadata-TEST.json'
with open(input_data_path, 'r', encoding='utf-8') as file: with open(input_data_path, 'r', encoding='utf-8') as file:
data = json.load(file) data = json.load(file)
SYSTEM_PROMPT = prompts[o] SYSTEM_PROMPT = prompts[o]
@ -505,15 +508,15 @@ if __name__ == "__main__":
error_ids = [] error_ids = []
test_enteries = [] test_enteries = []
all_paragraphs = 0 all_paragraphs = 0
NN=-1 # این عدد صرفا برای آیدی استفاده میشود
period = 1 period = 1
end = False end = False
while True: while True:
print(f"******* PERIOD :: {period} *******") print(f"******* PERIOD :: {period} *******")
for index ,entery in enumerate(data, 1): for index ,entery in enumerate(data, 1):
NN+=1
if index > 5: if index > 799:
end = True end = True
break break
id = entery['id'] id = entery['id']
@ -531,6 +534,17 @@ if __name__ == "__main__":
print(f'id: {id} - record: {index}/{len(data)} - period: {period}') print(f'id: {id} - record: {index}/{len(data)} - period: {period}')
for path in outs :
if path == "title":
SYSTEM_PROMPT = SYSTEM_PROMPT_title
elif path == "central":
SYSTEM_PROMPT = SYSTEM_PROMPT_central
elif path == "person":
SYSTEM_PROMPT = SYSTEM_PROMPT_person
elif path == "rules":
SYSTEM_PROMPT = SYSTEM_PROMPT_rules
llm_answer_data = '' llm_answer_data = ''
new_entry = {} new_entry = {}
new_paragraphs = [] new_paragraphs = []
@ -545,21 +559,109 @@ if __name__ == "__main__":
# 'text': f"بخشی از {large_title} : {p['text'] }" # 'text': f"بخشی از {large_title} : {p['text'] }"
}) })
new_entry['paragraphs'] = new_paragraphs new_entry['paragraphs'] = new_paragraphs
try: try:
result_data = llm_request(new_entry)#gpt-4o result_data = asyncio.run(oss_test(SYSTEM_PROMPT,USER_PROMPT,new_entry))#gpt-4o
# result_data = llm_request(new_entry)
llm_answer_data = text_to_dict(result_data) llm_answer_data = text_to_dict(result_data)
if path == "title" :
entery['paragraph_metadata'] = []
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'].append({
'paragraph_id': sec['paragraph_id'],
'title': sec['title'],
'paragraph_type': sec['paragraph_type']
})
if len(llm_answer_data) != len(entery['paragraph_metadata']) :
print("error!!!!!!!!!!")
if entery['id'] not in error_ids:
error_ids.append(entery['id'])
if path == "central" :
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'][num]['central_concepts'] = sec['central_concepts']
if path == "person" :
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'][num]['persons'] = sec['persons']
if path == "rules" :
for num ,sec in enumerate(llm_answer_data):
entery['paragraph_metadata'][num]['rules'] = sec['rules']
except Exception as e: except Exception as e:
print(f'error id: {id} - {e} >> llm result: {result_data}') print(f'error id: {id} - {e} >> llm result: {result_data}')
# error_ids.append(id) if id not in error_ids:
error_ids.append(id)
with open(current_peroid_errors_path, "a", encoding="utf-8") as f: with open(current_peroid_errors_path, "a", encoding="utf-8") as f:
f.write(f"{id}\n") f.write(f"{id}\n")
continue continue
entery['paragraph_metadata'] = llm_answer_data # entery['paragraph_metadata'] = llm_answer_data
context_id = id
title = entery['title']
large_title = entery['large_title']
url = entery['url']
typee = entery['type']
i_link = entery['interpretation_link']
N=0
for part in paragraphs :
id_ = f"num{NN}{N}"
text = part['text']
part_id = part['paragraph_id']
arabic_text = part['arabic_text']
ai_title = entery['paragraph_metadata'][N]['title']
paragraph_type = entery['paragraph_metadata'][N]['paragraph_type']
cursor.execute("INSERT INTO speeches (id, context_id, part_id, title, large_title, normalized_sentence, url, types, arabic_text, interpretation_links, ai_title, ai_paragraph_type) \
VALUES (?, ?, ?, ? ,? ,? ,? ,? ,? ,? ,? ,?)",
(id_, context_id, part_id, title, large_title, text, url, typee, arabic_text, i_link, ai_title, paragraph_type))
conn.commit()
central_concepts = entery['paragraph_metadata'][N]['central_concepts']
persons = entery['paragraph_metadata'][N]['persons']
rules = entery['paragraph_metadata'][N]['rules']
k=0
for row in central_concepts:
k+=1
c_id = context_id+part_id+f"c{k}"
concept = row['concept']
paragraph_effect = row['paragraph_effect']
cursor.execute("""INSERT INTO central_concepts (id, concept, paragraph_effect, part_id)
VALUES(?, ?, ?, ?)""",(c_id, concept, paragraph_effect, part_id))
conn.commit()
k=0
for row in persons:
k+=1
c_id = context_id+part_id+f"p{k}"
person = row
cursor.execute("""INSERT INTO persons (id, person, part_id)
VALUES(?, ?, ?)""",(c_id, person, part_id))
conn.commit()
k=0
for row in rules:
k+=1
c_id = context_id+part_id+f"r{k}"
rule = row['rule']
rule_type = row['type']
cursor.execute("""INSERT INTO rules (id, rule, type, part_id)
VALUES(?, ?, ?, ?)""",(c_id, rule, rule_type, part_id))
conn.commit()
N+=1
test_enteries.append(entery) test_enteries.append(entery)
with open(output_metadata_jsonl_path, 'a', encoding='utf-8') as f: with open(output_metadata_jsonl_path, 'a', encoding='utf-8') as f:
json.dump(entery, f, ensure_ascii=False) json.dump(entery, f, ensure_ascii=False)
f.write('\n') f.write('\n')
time.sleep(1) time.sleep(1)
passed_data_ids = find_passed_data_ids(output_metadata_jsonl_path) passed_data_ids = find_passed_data_ids(output_metadata_jsonl_path)
@ -573,6 +675,8 @@ if __name__ == "__main__":
# with open(f'./leader_data/leader-metadata-bayanat-{id}.json', mode='w', encoding='utf-8') as file: # with open(f'./leader_data/leader-metadata-bayanat-{id}.json', mode='w', encoding='utf-8') as file:
with open(output_metadata_json_path, mode='w', encoding='utf-8') as file: with open(output_metadata_json_path, mode='w', encoding='utf-8') as file:
result_message = json.dump(test_enteries, file, ensure_ascii=False, indent=2) result_message = json.dump(test_enteries, file, ensure_ascii=False, indent=2)
with open("./nahj_data/error_ids_TEST.json", mode='w', encoding='utf-8') as file:
result_message = json.dump(error_ids, file, ensure_ascii=False, indent=2)
print('all done!') print('all done!')
print('---------------------------------------------') print('---------------------------------------------')