.

2026-05-14 20:04:55 +03:30 · 2026-05-14 20:04:55 +03:30 · 39d014781d
commit 39d014781d
parent c9b2ed340f
8 changed files with 4407 additions and 78 deletions
--- a/pycache/nahj_get_metadata_oss.cpython-312.pyc
+++ b/pycache/nahj_get_metadata_oss.cpython-312.pyc
--- a/db/nahj.db
+++ b/db/nahj.db
--- a/nahj-answer/error-in-getting-metadata-Final.txt
+++ b/nahj-answer/error-in-getting-metadata-Final.txt
--- a/nahj_data/error-ids3-Final.txt
+++ b/nahj_data/error-ids3-Final.txt
@ -0,0 +1,6 @@
 sn1
 sn1
 sn1
 sn1
 sn1
 sn1
--- a/nahj_data/nahj-metadata-TEST.json
+++ b/nahj_data/nahj-metadata-TEST.json
--- a/nahj_data/nahj-metadata-jsonline.json
+++ b/nahj_data/nahj-metadata-jsonline.json
--- a/nahj_get_metadata_oss.py
+++ b/nahj_get_metadata_oss.py
@ -11,6 +11,11 @@ import asyncio
 import traceback
 from openai import AsyncOpenAI 
 import copy, asyncio, traceback
 from openai import OpenAI, AsyncOpenAI, LengthFinishReasonError
 from typing import List, Union
 from pydantic import BaseModel
 today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}'
 SYSTEM_PROMPT = """
@ -148,6 +153,100 @@ async def single_simple_async_proccess_item(
            traceback.print_exc()
            raise RuntimeError(f"⚠️ Error in API call: {str(e)}")
 class Result(BaseModel):
    # Structured-output schema handed to the LLM call as `output_schema`.
    # Kept as `#` comments (not a docstring) so the generated JSON schema
    # of the model is not altered.
    # `result` holds the model's answer as a single string.
    result: str
 async def single_async_item(
    api_url,
    api_key,
    item,
    reasoning_effort,
    temperature,
    top_p,
    semaphore_number,
    model_name,
    priority=1,
    output_schema=None,
    max_token=4096,
    print_logs=False,
    return_reason=False,
    stop=None,
    return_used_token=False,
    timeout=300,
    ):
    """Send one structured chat-completion request and attach the result to ``item``.

    Args:
        api_url: Base URL passed to ``AsyncOpenAI``.
        api_key: API key passed to ``AsyncOpenAI``.
        item: Dict with a required ``"user_prompt"`` and an optional
            ``"system_prompt"``. It is mutated in place: on success the key
            ``"llm_output"`` is set.
        reasoning_effort: Forwarded to the completion call.
        temperature: Forwarded to the completion call.
        top_p: Forwarded to the completion call.
        semaphore_number: Size of the semaphore created inside this call.
        model_name: Forwarded as ``model``.
        priority: Sent to the server as ``extra_body={"priority": priority}``.
        output_schema: Passed as ``response_format`` and used to validate the
            parsed message via ``model_validate``.
        max_token: Forwarded as ``max_tokens``.
        print_logs: When true, the raw response object is printed.
        return_reason: When true, ``llm_output`` becomes a tuple that also
            carries the stringified ``reasoning_content`` of the message.
        stop: Forwarded to the completion call.
        return_used_token: Only honoured together with ``return_reason``;
            appends ``response.usage.total_tokens`` to the tuple.
        timeout: Seconds given to ``asyncio.wait_for`` for the request.

    Returns:
        ``item`` (with ``"llm_output"`` set) on success; a dict with the keys
        ``"error"`` and ``"raw"`` when the response carries no parsed object;
        ``None`` on timeout or on any other exception (the error is printed).
    """
    try:
        async with AsyncOpenAI(base_url=api_url, api_key=api_key) as client:
            # NOTE(review): a new Semaphore is created on every call, so it
            # cannot limit concurrency across separate calls of this function.
            # Sharing one semaphore would require an interface change.
            semaphore = asyncio.Semaphore(semaphore_number)
            async with semaphore:
                messages = [{"role": "user", "content": item["user_prompt"]}]
                if item.get("system_prompt"):
                    # The system message must precede the user message.
                    messages.insert(
                        0, {"role": "system", "content": item["system_prompt"]}
                    )

                request = client.chat.completions.parse(
                    model=model_name,
                    messages=messages,
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_token,
                    stop=stop,
                    response_format=output_schema,
                    reasoning_effort=reasoning_effort,
                    extra_body={"priority": priority},
                )
                response = await asyncio.wait_for(request, timeout=timeout)
                if print_logs:
                    print(f"parse response ----  {response}")

                message = response.choices[0].message
                parsed_obj = message.parsed
                if parsed_obj is None:
                    return {
                        "error": "Failed to parse response",
                        "raw": str(response),
                    }

                # Re-validate against the schema (optional safety net on top
                # of what `.parse` already produced).
                parsed_obj = output_schema.model_validate(parsed_obj)
                payload = parsed_obj.model_dump()

                if not return_reason:
                    item["llm_output"] = payload
                    return item

                # Use getattr so a backend that omits `reasoning_content`
                # does not discard an otherwise valid parsed result.
                reasoning_text = str(getattr(message, "reasoning_content", None))
                if return_used_token:
                    item["llm_output"] = (
                        payload,
                        reasoning_text,
                        int(response.usage.total_tokens),
                    )
                else:
                    item["llm_output"] = (payload, reasoning_text)
                return item
    except asyncio.TimeoutError:
        print(f"⏳ Timeout on item {item}")
        return None
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns None,
        # which previously made this message always end in "None".
        print(f"⚠️ Error __process_item {item}: {traceback.format_exc()}")
        return None
 async def main():
    with open('./leader_data/khamenei_messages_4.json', 'r', encoding='utf-8') as file:
            data = json.load(file)
@ -221,9 +320,33 @@ async def main():
    print(f'all_paragraphs: {all_paragraphs}')
    print('---------------------------------------------')
 async def oss_test(SYSTEM_PROMPT,USER_PROMPT,Dictt):
    """Run one request against the gpt-oss endpoint and return its ``llm_output``.

    Args:
        SYSTEM_PROMPT: Text sent as the system message.
        USER_PROMPT: Text placed at the start of the user message.
        Dictt: Object appended (via f-string formatting) to the user message
            on a new line after ``USER_PROMPT``.

    Returns:
        The ``llm_output`` value set by ``single_async_item``. Because the
        call uses ``return_reason=True`` and ``return_used_token=True`` this
        is a tuple of (parsed dict, reasoning text, total tokens).

    Raises:
        RuntimeError: If ``single_async_item`` returned ``None`` (timeout or
            error) or a result without ``llm_output`` (parse failure).
    """
    item = {
        'system_prompt': SYSTEM_PROMPT,
        'user_prompt': f"{USER_PROMPT}\n{Dictt}",
    }
    response = await single_async_item(
        api_url="http://2.188.15.102:8001/v1/",
        api_key="EMPTY",
        item=item,
        reasoning_effort="medium",
        temperature=0.1,
        top_p=1,
        semaphore_number=1,
        model_name="gpt-oss-120b",
        priority=1,
        output_schema=Result,
        max_token=None,
        return_reason=True,
        return_used_token=True,
        timeout=300,
    )
    # single_async_item returns None on timeout/exception and an error dict
    # without 'llm_output' on parse failure; fail with a clear message
    # instead of an opaque TypeError/KeyError on the subscript below.
    if response is None or 'llm_output' not in response:
        raise RuntimeError(f"LLM request produced no parsed output: {response}")
    llm_output = response['llm_output']
    print(llm_output)
    return llm_output
 if __name__ == "__main__":
-    asyncio.run(main())
+    # asyncio.run(main())
    asyncio.run(oss_test())
--- a/nahj_get_metadata_v2.py
+++ b/nahj_get_metadata_v2.py
@ -6,6 +6,12 @@ import time
 import datetime
 from openai import OpenAI
 from langchain_openai import ChatOpenAI
 from nahj_get_metadata_oss import oss_test
 import asyncio
 import sqlite3
 conn = sqlite3.connect('./db/nahj.db')
 cursor = conn.cursor()
 today = f'{datetime.datetime.now().year}{datetime.datetime.now().month}{datetime.datetime.now().day}'
@ -271,16 +277,15 @@ paragraph_effect
 مقادیر بین این دو (مثلاً 0.2 ، -0.4 ، 0.75) مجاز و نشان‌دهنده شدت نسبی هستند
-ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد:
+ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (خیلی مهم) به صورت زیر باشد:
-{
+[{
    "paragraph_id": str,
    "central_concepts": [
      {
        "concept": str,
        "paragraph_effect": float
      }
    ]
-}
+},...]
 """
 SYSTEM_PROMPT_person = """
@ -321,11 +326,10 @@ paragraphs : لیستی از پاراگراف‌ها که هرکدام شامل:
 شخصیت‌های فرضی، نمادین یا کلی وارد نشوند
-ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد:
+ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (بسیار مهم) به صورت زیر باشد:
-{
+[{
    "paragraph_id": str,
    "persons": [str]
-}
+},...]
 """
 SYSTEM_PROMPT_rules = """
@ -377,16 +381,15 @@ paragraphs : لیستی از پاراگراف‌ها که هرکدام شامل:
 باید، لازم است، ضروری است، نیازمند است، واجب است، حیاتی است و مانند آن
-ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و به صورت زیر باشد:
+ساختار دقیق خروجی مورد انتظار باید لیستی از دیکشنری ها باشد که بازای هر پاراگراف تولید شده باشد و دقیقا (بسیار مهم) به صورت زیر باشد:
-{
+[{
    "paragraph_id": str,
    "rules": [
      {
        "rule": str,
        "type": "توصیفی" | "هنجاری"
      }
    ]
-}
+},...]
 """
 SYSTEM_PROMPT = SYSTEM_PROMPT_rules
@ -396,7 +399,7 @@ prompts = [SYSTEM_PROMPT_title,
           SYSTEM_PROMPT_person,
           SYSTEM_PROMPT_rules]
-outs = ["-title","-central","-person","-rules"]
+outs = ["title","central","person","rules"]
 USER_PROMPT = '''
 متن زیر را بر اساس دستورالعمل‌های سیستمی تحلیل کن و خروجی را در قالب دیکشنری پایتون ارائه بده:
@ -475,7 +478,7 @@ if __name__ == "__main__":
        file_path = './nahj_data/nahj-metadata-jsonline.json'
-        for path in outs:
+        # for path in outs:
        # 1. حذف فایل اگر وجود داشته باشد
        if os.path.exists(file_path):
            os.remove(file_path)
@ -484,7 +487,7 @@ if __name__ == "__main__":
        with open(file_path, 'w') as f:
            pass
-            output_metadata_json_path = f'./nahj_data/nahj-metadata{path}-TEST.json'
+        output_metadata_json_path = f'./nahj_data/nahj-metadata-TEST.json'
        with open(input_data_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        SYSTEM_PROMPT = prompts[o]
@ -505,15 +508,15 @@ if __name__ == "__main__":
        error_ids = []
        test_enteries = []
        all_paragraphs = 0
-
+        NN=-1 # این عدد صرفا برای آیدی استفاده میشود
        period = 1
        end = False
        while True:
                print(f"******* PERIOD :: {period} *******")
                for index ,entery in enumerate(data, 1):
-                    
+                    NN+=1
-                    if index > 5:
+                    if index > 799:
                        end = True
                        break
                    id = entery['id']
@ -531,6 +534,17 @@ if __name__ == "__main__":
                    print(f'id: {id} - record: {index}/{len(data)} - period: {period}')
                    for path in outs :
                        if path == "title":
                              SYSTEM_PROMPT = SYSTEM_PROMPT_title
                        elif path == "central":
                              SYSTEM_PROMPT = SYSTEM_PROMPT_central
                        elif path == "person":
                              SYSTEM_PROMPT = SYSTEM_PROMPT_person
                        elif path == "rules":
                              SYSTEM_PROMPT = SYSTEM_PROMPT_rules
                        llm_answer_data = ''
                        new_entry = {}
                        new_paragraphs = []
@ -545,21 +559,109 @@ if __name__ == "__main__":
                                # 'text': f"بخشی از {large_title} : {p['text'] }"
                            })
                        new_entry['paragraphs'] = new_paragraphs
                        try:
-                        result_data = llm_request(new_entry)#gpt-4o
+                            result_data = asyncio.run(oss_test(SYSTEM_PROMPT,USER_PROMPT,new_entry))#gpt-4o
                            # result_data = llm_request(new_entry)
                            llm_answer_data = text_to_dict(result_data)
                            if path == "title" :
                                entery['paragraph_metadata'] = []
                                for num ,sec in enumerate(llm_answer_data):
                                    entery['paragraph_metadata'].append({
                                         'paragraph_id': sec['paragraph_id'],
                                         'title': sec['title'],
                                         'paragraph_type': sec['paragraph_type']
                                    })
                            if len(llm_answer_data) != len(entery['paragraph_metadata']) :
                                print("error!!!!!!!!!!")
                                if entery['id'] not in error_ids:
                                    error_ids.append(entery['id'])
                            if path == "central" : 
                                for num ,sec in enumerate(llm_answer_data):
                                    entery['paragraph_metadata'][num]['central_concepts'] = sec['central_concepts']
                            if path == "person" : 
                                for num ,sec in enumerate(llm_answer_data):
                                    entery['paragraph_metadata'][num]['persons'] = sec['persons']
                            if path == "rules" : 
                                for num ,sec in enumerate(llm_answer_data):
                                    entery['paragraph_metadata'][num]['rules'] = sec['rules']
                        except Exception as e:
                            print(f'error id: {id} - {e} >> llm result: {result_data}')
-                        #  error_ids.append(id)
+                            if id not in error_ids:
                                error_ids.append(id)
                            with open(current_peroid_errors_path, "a", encoding="utf-8") as f:
                                f.write(f"{id}\n")
                            continue
-                    entery['paragraph_metadata'] = llm_answer_data
+                        # entery['paragraph_metadata'] = llm_answer_data
                    context_id = id
                    title = entery['title']
                    large_title = entery['large_title']
                    url = entery['url']
                    typee = entery['type']
                    i_link = entery['interpretation_link']
                    N=0
                    for part in paragraphs :
                        id_ = f"num{NN}{N}"
                        text = part['text']
                        part_id = part['paragraph_id']
                        arabic_text = part['arabic_text']
                        ai_title = entery['paragraph_metadata'][N]['title']
                        paragraph_type = entery['paragraph_metadata'][N]['paragraph_type']
                        cursor.execute("INSERT INTO speeches (id, context_id, part_id, title, large_title, normalized_sentence, url, types, arabic_text, interpretation_links, ai_title, ai_paragraph_type) \
                                       VALUES (?, ?, ?, ? ,? ,? ,? ,? ,? ,? ,? ,?)",
                                        (id_, context_id, part_id, title, large_title, text, url, typee, arabic_text, i_link, ai_title, paragraph_type))
                        conn.commit()
                        central_concepts = entery['paragraph_metadata'][N]['central_concepts']
                        persons = entery['paragraph_metadata'][N]['persons']
                        rules = entery['paragraph_metadata'][N]['rules']
                        k=0
                        for row in central_concepts:
                            k+=1
                            c_id = context_id+part_id+f"c{k}"
                            concept = row['concept']
                            paragraph_effect = row['paragraph_effect']
                            cursor.execute("""INSERT INTO central_concepts (id, concept, paragraph_effect, part_id)
                        VALUES(?, ?, ?, ?)""",(c_id, concept, paragraph_effect, part_id))
                        conn.commit()
                        k=0
                        for row in persons:
                            k+=1
                            c_id = context_id+part_id+f"p{k}"
                            person = row
                            cursor.execute("""INSERT INTO persons (id, person, part_id)
                        VALUES(?, ?, ?)""",(c_id, person, part_id))
                        conn.commit()
                        k=0
                        for row in rules:
                            k+=1
                            c_id = context_id+part_id+f"r{k}"
                            rule = row['rule']
                            rule_type = row['type']
                            cursor.execute("""INSERT INTO rules (id, rule, type, part_id)
                        VALUES(?, ?, ?, ?)""",(c_id, rule, rule_type, part_id))
                        conn.commit()
                        N+=1
                    test_enteries.append(entery)
                    with open(output_metadata_jsonl_path, 'a', encoding='utf-8') as f:
                            json.dump(entery, f, ensure_ascii=False)
                            f.write('\n')
                    time.sleep(1)
                passed_data_ids = find_passed_data_ids(output_metadata_jsonl_path)
@ -573,6 +675,8 @@ if __name__ == "__main__":
            # with open(f'./leader_data/leader-metadata-bayanat-{id}.json', mode='w', encoding='utf-8') as file:
        with open(output_metadata_json_path, mode='w', encoding='utf-8') as file:
                    result_message = json.dump(test_enteries, file, ensure_ascii=False, indent=2)
        with open("./nahj_data/error_ids_TEST.json", mode='w', encoding='utf-8') as file:
                    result_message = json.dump(error_ids, file, ensure_ascii=False, indent=2)
        print('all done!')
        print('---------------------------------------------')