kw source added

2025-08-07 16:57:27 +03:30 · 2025-08-07 16:57:27 +03:30 · b2b1be0a21
commit b2b1be0a21
parent 63673bc648
3 changed files with 1560 additions and 0 deletions
--- a/copy.py
+++ b/copy.py
--- a/p4_keyword_extractor.py
+++ b/p4_keyword_extractor.py
@ -1,4 +1,135 @@


+"""
+استخراج واژگان کلیدی از اجزاء قانونی
+"""
+
+from datetime import datetime
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import torch
+import time
+
+import os
+
+from normalizer import Normalizer
+from tokenizer import *
+
# Working directory captured once at import time.
address = os.getcwd()

# Nothing visible in this module populates this list; it is kept because it
# is a public module-level name.
not_have_two_token_kw_list = []

# Hugging Face model identifier used for keyword generation.
# Alternative that was tried: "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_id = "PartAI/Dorna-Llama3-8B-Instruct"

if torch.cuda.is_available():
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
else:
    # The model is only loaded on CUDA hosts.  Bind the names anyway so that
    # importing modules (and the functions below) can test for "not loaded"
    # instead of tripping over a NameError.
    model = None
    tokenizer = None

index_name_i = 'semantic_search-v10'


# Bookkeeping globals; none of them is updated by the code visible here.
counter = 0
total = 0
remained = 0
# NOTE: shadows the builtin id(); kept only because it is an existing
# module-level name that other code may import.
id = ''
keywords_count = 15
+   
# System prompt sent to the model (runtime text, reproduced verbatim).
_SYSTEM_PROMPT = "تو یک وکیل حقوق دان هستی و باید بتوانی متن های قانونی و حقوقی را بدون تغییر اصطلاحات فنی، به صورتی توضیح دهی که افراد غیر حقوق دان، معنای متن را درک کنند. "

# The user prompt was originally an indented triple-quoted literal, so the
# 20-space continuation indent is part of the text the model receives.
_PROMPT_INDENT = " " * 20
_USER_PROMPT_TEMPLATE = (
    'از "متن" حداقل {} عبارت های کلیدی مهم و پراهمیت را استخراج کن و عبارت های کلیدی را در قالب لیست به زبان فارسی چاپ کن و هر کلید عبارت کلیدی را در یک خط جدید قرار بده و هیچ گونه توضیحی در ابتدا یا انتهای پاسخ، اضافه نکن.\n'
    + _PROMPT_INDENT
    + 'هر عبارت کلیدی دارای یک شماره ترتیبی در ابتدای آن باشد. عبارت های کلیدی، دقیقا در متن موجود باشد. بسیار مهم و ضروری است که طول هر عبارت کلیدی حداقل دو توکن داشته باشد و عبارت کلیدی یک توکنی قابل قبول نیست. تاکید می کنم که هیچ عبارت کلیدی نباید فقط یک توکن داشته باشد. نام سازمان ها و نهادها و اشخاص حقوقی، حتما به عنوان عبارت کلیدی درنظر گرفته شود. هیچ عبارت کلیدی، فعل یا حرف اضافه نباشد و فقط شامل اسم هایی باشد که به هم اضافه شده اند. هیچ عبارت کلیدی نباید با حرف اضافه یا حرف «و» تمام شود. ضروری است که عبارت های کلیدی شامل ماده، بند، تبصره یا تاریخ ها نباشند.\n'
    + _PROMPT_INDENT
    + '"متن": {}\n'
    + _PROMPT_INDENT
)


def _keyword_budget(text):
    """Return how many key phrases to request: 15 per 1000 characters, at least 1."""
    return max(1, int((len(text) / 1000) * 15))


def generateKeywords(text, max_new_tokens=256):
    """Ask the chat model for a numbered list of key phrases found in ``text``.

    The number of phrases requested scales with the character length of
    ``text`` (15 per 1000 characters, minimum 1).  Generation uses sampling
    (temperature 0.6, top_p 0.85), so the output is not deterministic.

    Args:
        text: The section text to extract key phrases from.
        max_new_tokens: Upper bound on generated tokens (default 256, the
            value that was previously hard-coded).

    Returns:
        The decoded model response as a string, ``""`` when ``text`` is empty
        or whitespace-only (nothing to extract, so the model is not called),
        or ``None`` when anything fails; failures are printed, not raised.
    """
    try:
        if text is None or not text.strip():
            return ""

        messages = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": _USER_PROMPT_TEMPLATE.format(_keyword_budget(text), text)},
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # Stop on the regular EOS token or on the end-of-turn marker.
        terminators = [
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>")
        ]

        # The tokenizer may not define a pad token; fall back to EOS rather
        # than writing None into the generation config.
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = tokenizer.eos_token_id
        model.generation_config.pad_token_id = pad_token_id

        outputs = model.generate(
            input_ids,
            # Single unpadded prompt: every position is a real token.
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=max_new_tokens,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.85,
        )

        # Keep only the newly generated tokens (drop the echoed prompt).
        response = outputs[0][input_ids.shape[-1]:]
        return tokenizer.decode(response, skip_special_tokens=True)

    except Exception as inst:
        # Best-effort by design: report the failure and let the caller go on.
        print(type(inst))    # the exception type
        print(inst.args)     # arguments stored in .args
        print("Exception: " + str(inst))
        return None
+
+
 def do_keyword_extract(sections):
+    start_time = time.time()
+    print("start_time: "+str(datetime.now()))
+    
+    try:
+        keywords_dict = []
+        
+        count = 1
+        for index, content_item in enumerate(sections):
+            id = content_item
+            # if not id in not_have_two_token_kw_list:
+            #     continue
+            content = content_item['content']
+            content_len = len(content.split())
+            # کنارگذاشتن محتواهای با حجم زیاد
+            if content_len > 2000:
+                print("too long content " + str(id))
+                continue
+
+            keywords = generateKeywords(content) 
+            print("section " + str(count) + "/" + str(len(not_have_two_token_kw_list)) + " keyword extracting ... ")
+            content_item['keywords'] = keywords
+            keywords_dict.append({
+                    'id':id,
+                    'keywords':keywords
+                })
+            count+=1
+            
+        # write_to_json(keywords_dict, "../data/sections_kw_110_llama_main.json")
+          
+    except Exception as inst:
+            print(type(inst))    # the exception type
+            print(inst.args)     # arguments stored in .args
+            
+    end_time = time.time()
+    print("end_time: "+ str(datetime.now()))
+    operation_time = (int(end_time-start_time)/60)/60
+    print(f"elapsed time: {operation_time} hours")
+    print(f"Finished!!!")
+    
+    return sections
+
+    
# Running this module directly performs no work; it is used via import.
if __name__ == "__main__":
    pass
--- a/tokenizer.py
+++ b/tokenizer.py
@ -0,0 +1,59 @@
+import re
+
+
class Tokenizer():
    """Whitespace word tokenizer and punctuation/newline sentence splitter."""

    # Invisible marks trimmed from token edges: ZWNJ (U+200C) and RLM (U+200F).
    _EDGE_MARKS = "\u200c\u200f"

    # Integers and decimals; masked while splitting so that the dot inside
    # "3.5" is not mistaken for a sentence end.
    _NUMBER_PATTERN = r"[-+]?\d*\.\d+|\d+"
    _NUMBER_PLACEHOLDER = 'floatingpointnumber'

    # Sentence boundaries, applied in this order: runs of ! . ? ؟ (plus any
    # following newlines), then ':' ';' '؛' at end of line, then bare newlines.
    _BOUNDARY_PATTERNS = (
        r'([!\.\?؟]+)[\n]*',
        r':\n',
        r';\n',
        r'؛\n',
        r'[\n]+',
    )

    # Marker that add_tab() appends after every boundary.
    _SENTENCE_SEPARATOR = '\t\t'

    def __init__(self):
        pass

    def tokenize_words(self, doc_string):
        """Split ``doc_string`` on whitespace and clean each token's edges.

        ZWNJ and RLM characters are stripped from both ends of every token
        (marks inside a token are kept).  Tokens that become empty are
        dropped; previously such a token caused the preceding token to be
        emitted a second time.

        Returns:
            List of non-empty token strings, in input order.
        """
        stripped = (token.strip(self._EDGE_MARKS) for token in doc_string.split())
        return [token for token in stripped if token]

    def tokenize_sentences(self, doc_string):
        """Split ``doc_string`` into sentences.

        Numbers are temporarily replaced by a placeholder, every boundary
        pattern is rewritten by ``add_tab`` (space + punctuation + two tabs),
        the numbers are put back in their original order, and the text is
        split on the two-tab marker.

        Returns:
            List of non-empty sentence strings.  Terminal punctuation stays
            attached, preceded by a space; surrounding spaces are not trimmed.
        """
        numbers = re.findall(self._NUMBER_PATTERN, doc_string)
        doc_string = re.sub(self._NUMBER_PATTERN, self._NUMBER_PLACEHOLDER, doc_string)

        for boundary in self._BOUNDARY_PATTERNS:
            doc_string = re.sub(boundary, self.add_tab, doc_string)

        # Restore numbers left to right in a single pass.  If the input itself
        # contained the placeholder word, surplus occurrences stay unchanged.
        pending = iter(numbers)
        doc_string = re.sub(
            self._NUMBER_PLACEHOLDER,
            lambda match: next(pending, match.group()),
            doc_string,
        )

        return [part for part in doc_string.split(self._SENTENCE_SEPARATOR) if part]

    def add_tab(self, mystring):
        """``re.sub`` callback: normalise a matched boundary.

        Args:
            mystring: The ``re.Match`` object for the boundary.

        Returns:
            The matched text with surrounding spaces and newlines removed,
            prefixed by one space and followed by two tab characters.
        """
        matched = mystring.group()
        matched = matched.strip(' ')   # drop spaces around the punctuation
        matched = matched.strip('\n')  # drop newlines around the punctuation
        return " " + matched + "\t\t"