دیتاست کلاسیفیکیشن

2025-07-14 16:36:21 +03:30 · 2025-07-14 16:36:21 +03:30 · 420979be7d
commit 420979be7d
parent 889cd4ed07
1 changed files with 127 additions and 1 deletions
--- a/llama3_classification_ds.py
+++ b/llama3_classification_ds.py
@ -1,3 +1,6 @@
+"""
+این فایل با نرمالایزر هضم کار می کند
+"""
 import json

 with open('./data/classes51.txt', 'r') as file:
@ -8,4 +11,127 @@ with open('./data/classification_ds.json', 'r') as file:
    
 # send content of some sections and classes to llama chat 
 # and ask about the best class
-    
+
+
+from html import escape
+from lxml import etree
+from datetime import datetime
+from elasticsearch import Elasticsearch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
+from threading import Thread
+import torch
+import time
+from concurrent.futures import ThreadPoolExecutor
+import concurrent
+import threading
+import json 
+import os.path
+import os
+import normalizer
+from funcs import write_to_json, read_from_json
+#lock = threading.Lock()
+#lock1 = threading.Lock()
+#from cleantext import clean
+#import re
+
+
+# Load the chat model and its tokenizer, but only when a CUDA device is present.
+# NOTE(review): without CUDA, `model` and `tokenizer` are never bound by this
+# block, yet command() references both - confirm CPU-only runs are not expected.
+if torch.cuda.is_available():
+    model_id = "PartAI/Dorna-Llama3-8B-Instruct"
+    # Weights are requested in bfloat16 with device_map="auto".
+    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Module-level bookkeeping values.
+# `counter` is rebound to 1 right before the __main__ block further down.
+# `total`, `remained` and `keywords_count` are not read in the visible part of
+# this file - presumably leftovers from a sibling script; TODO confirm.
+counter = 0
+total = 0
+remained = 0
+# NOTE(review): this name shadows the builtin id() for the whole module.
+id = ''
+keywords_count = 15
+        
+
+def command(text):
+    global remained
+    try:    
+        
+        messages =  [{"role": "system", "content": "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را به خوبی تفسیر کنی. " },
+                     
+                     {"role": "user", "content": 
+                    '''با توجه به 51 کلاسی که در ادامه می آید، بهترین کلاس که از نظر محتوایی با متن زیر مطابقت دارد را از میان کلاس های ارائه شده انتخاب کن. تاکید می کنم که فقط اجازه داری یک کلاس را انتخاب کنی.
+                    نام کلاس باید دقیقا مطابق با عنوان های کلاس های 51 گانه باشد.
+                    هیچ توضیح اضافه ای پیش یا پس از عنوان کلاس ننویس.
+                    "متن": {}
+                    '''.format(text)
+        },
+        {"role": "user", "content": 
+                    '''کلاس های 51 گانه عبارت اند از: {}
+                    '''.format(classes)
+        },             
+                     ]
+       
+        input_ids = tokenizer.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            return_tensors="pt"
+        ).to(model.device)
+
+        terminators = [
+                    tokenizer.eos_token_id,
+                    tokenizer.convert_tokens_to_ids("<|eot_id|>")
+                ]
+        model.generation_config.pad_token_id = tokenizer.pad_token_id
+
+        outputs = model.generate(
+            input_ids,
+            max_new_tokens=256,
+            eos_token_id=terminators,
+            do_sample=True,
+            temperature=0.1,
+            top_p=0.85,
+        )
+
+        response = outputs[0][input_ids.shape[-1]:]
+        result = tokenizer.decode(response, skip_special_tokens=True)
+
+        return result
+
+    except Exception as inst:
+        print(type(inst))    # the exception type
+        print(inst.args)     # arguments stored in .args
+        print("Exception: " + str(inst))
+
+counter = 1
+if __name__ == "__main__":
+    start_time = time.time()
+    try:
+        classes_dict = []
+        count = 1
+        for content_item in sections:
+            
+            id = sections[counter]['id']
+       
+            prev_class = sections[counter]['domain_name']
+            content = sections[counter]['content']
+           
+            new_class = command(content) 
+            print("section " + str(count) + "/" + str(len(sections)) + " class extracting ... ")
+            classes_dict.append({
+                    'content':content,
+                    'prev-class': prev_class,
+                    'new-class': new_class
+                })
+            count+= 1
+            counter+= 500 
+            
+            if counter > 49387:
+                break
+        write_to_json(classes_dict, "./data/result.json")
+          
+    except Exception as inst:
+            print(type(inst))    # the exception type
+            print(inst.args)     # arguments stored in .args
+            
+
+
+    end_time = time.time()
+    print(end_time)
+    operation_time = (int(end_time-start_time)/60)/60
+    print(f"elapsed time: {operation_time} hours")
+    print(f" Finished!!! ")