Compare commits
1 Commit
Author | SHA1 | Date
---|---|---
 | d831e3ee52 | 

512_tokens_finder.py (new file, 135 lines added)
@@ -0,0 +1,135 @@
# In the name of God

from transformers import AutoTokenizer
from typing import List
import unicodedata
import json
import re


# -----------------------------
# Normalization Utilities
# -----------------------------
_ARABIC_TO_PERSIAN = {
    "ي": "ی",
    "ك": "ک",
    "ۀ": "ه",
    "ة": "ه",
    "ؤ": "و",
    "إ": "ا",
    "أ": "ا",
    "ٱ": "ا",
    "آ": "ا",
}

# Translation table that deletes every non-spacing mark (Unicode category 'Mn'),
# i.e. Arabic/Persian diacritics.
sys_max = 0x110000
_DIACRITICS = dict.fromkeys(
    i for i in range(sys_max) if unicodedata.category(chr(i)) == 'Mn'
)

_ZWNJ = "\u200c"     # zero-width non-joiner (half-space)
_TATWEEL = "\u0640"  # tatweel (kashida elongation)

# Regex pattern for finding words (letters, digits and underscore)
_TOKEN_RE = re.compile(r"[\w\u0600-\u06FF]+", re.UNICODE)


def nfkc(text: str) -> str:
    """Convert the text to the standard Unicode NFKC form."""
    return unicodedata.normalize("NFKC", text)


def strip_diacritics(text: str) -> str:
    """Remove diacritics and other non-spacing marks from the text."""
    return text.translate(_DIACRITICS)


def unify_persian_arabic(text: str) -> str:
    """Map Arabic characters to their Persian equivalents."""
    return text.translate(str.maketrans(_ARABIC_TO_PERSIAN))


def normalize_spaces(text: str) -> str:
    """Drop ZWNJ and tatweel, and collapse extra whitespace."""
    text = text.replace(_ZWNJ, " ")
    text = text.replace(_TATWEEL, "")
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def normalize_text(text: str) -> str:
    """Main normalization entry point; applies all steps to the text in order."""
    if not isinstance(text, str):
        t = str(text)
    else:
        t = text

    t = nfkc(t)
    t = unify_persian_arabic(t)
    t = strip_diacritics(t)
    t = t.lower()
    t = normalize_spaces(t)
    return t


def tokenize(text: str) -> List[str]:
    """Split the text into tokens (words)."""
    return _TOKEN_RE.findall(text)


model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

longs_count = 0
shorts_count = 0
all_sections_count = 0

file_path = "ALL_SECTIONS.json"
with open(file_path, 'r', encoding='utf-8') as file:
    ALL_SECTIONS = json.load(file)

NEW_ALL_SECTIONS = []
for section in ALL_SECTIONS:
    all_sections_count += 1
    sentence = section["sentence-content"]
    normal_txt = normalize_text(sentence.strip())
    tokens = tokenizer.tokenize(normal_txt)

    if len(tokens) > 512:
        longs_count += 1
        section["tokens_len"] = len(tokens)
        section["is_long"] = True
        section["is_short"] = False
    elif len(tokens) < 10:
        shorts_count += 1
        section["tokens_len"] = len(tokens)
        section["is_long"] = False
        section["is_short"] = True
    else:
        section["tokens_len"] = len(tokens)
        section["is_long"] = False
        section["is_short"] = False

    NEW_ALL_SECTIONS.append(section)

with open('512t_ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
    json.dump(NEW_ALL_SECTIONS, f, indent=4, ensure_ascii=False)

print(f"All Sections : {all_sections_count}")
print(f"Long Sections : {longs_count}")
print(f"Short Sections : {shorts_count}")
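For reference, a minimal smoke test of the pipeline above; it is not part of the diff, and the sample string and the expected outputs in the comments are assumptions based on the functions as written:

# Hypothetical check of the normalization and token-length logic in 512_tokens_finder.py.
sample = "كِتاب" + "\u200c" + "هاي مـــن"      # Arabic kaf/yeh, a kasra, a ZWNJ and tatweel
clean = normalize_text(sample)
print(clean)                                   # expected: "کتاب های من" (Persian letters, plain spaces, no diacritics)
print(tokenize(clean))                         # expected: ['کتاب', 'های', 'من']
print(len(tokenizer.tokenize(clean)) > 512)    # the same subword-length check the script applies per section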
@@ -14,16 +14,16 @@ class ElasticHelper():

     def __init__(self, es_url="http://127.0.0.1:6900", es_pass="", es_user="elastic", path_mappings = ""):

-        if path_mappings :
-            self.path_mappings = path_mappings
+        # if path_mappings :
+        #     self.path_mappings = path_mappings

-        if es_pass == '' :
-            self.es = Elasticsearch(es_url)
-        else:
-            self.es = Elasticsearch(
-                es_url,
-                http_auth=(es_user, es_pass),
-            )
+        # if es_pass == '' :
+        #     self.es = Elasticsearch(es_url)
+        # else:
+        #     self.es = Elasticsearch(
+        #         es_url,
+        #         http_auth=(es_user, es_pass),
+        #     )

         # print(es_url)
         # print(self.es)
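For context, a sketch of how this helper is consumed elsewhere in this commit (split_content_to_sentences.py); with the connection block commented out above, the constructor presumably no longer needs a reachable Elasticsearch instance and only the file-based iteration is used:

# Usage as it appears in split_content_to_sentences.py; the archive path is that file's value.
eh_obj = ElasticHelper()
sections_elastic = eh_obj.iterateJsonFile(r".\data\mj_qa_section-v02.zip", True)
for index, item in enumerate(sections_elastic):
    source = item['source']   # each item carries 'id' and a 'source' dict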
@@ -30,7 +30,7 @@ from transformers import AutoTokenizer
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_similarity
-#from normalizer import cleaning
+from normalizer import cleaning
 try:
     from elastic_helper import ElasticHelper
 except Exception as error:

@@ -44,7 +44,7 @@ except Exception as error:

 # Persian text processing
 # import hazm
-# from hazm import Normalizer, word_tokenize, POSTagger
+from hazm import Normalizer, word_tokenize, POSTagger

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
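Since this hunk activates the hazm import, here is a minimal, illustrative sketch of how those names are typically called (the sample string is an assumption; POSTagger is left out because it needs a trained model file):

from hazm import Normalizer, word_tokenize
normalizer = Normalizer()
text = normalizer.normalize("كتاب هاي من")   # hazm's own Persian normalization
print(word_tokenize(text))                   # splits the normalized text into word tokens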
split_content_to_sentences.py (new file, 105 lines added)

@@ -0,0 +1,105 @@
# In the name of God

import json
from normalizer import cleaning
# try:
from elastic_helper import ElasticHelper
# except Exception as error:
#     eee = error
#     pass


def full_path_text_maker(full_path):
    """
    Take the path of a section and rebuild a text from its parts, ordered from the
    most specific element back to the most general one, and return it.

    Args:
        full_path(list): list of elements describing the tree path of this section
    Returns:
        full_path_text(str): the text rebuilt from the section's path
    """
    full_path_text = ""
    for i, path_item in enumerate(reversed(full_path)):
        if i == len(full_path) - 1:
            full_path_text += ''.join(f'{path_item}')
            break
        full_path_text += ''.join(f'{path_item} از ')  # " از " means "of"
    full_path_text = full_path_text.strip()
    return full_path_text


if __name__ == "__main__":
    eh_obj = ElasticHelper()
    path = r".\data\mj_qa_section-v02.zip"  # raw string so the backslashes stay literal
    sections_elastic = eh_obj.iterateJsonFile(path, True)
    all_count = 0
    dont_cares = []
    ALL_SECTIONS = []
    n = 0
    for index, item in enumerate(sections_elastic):

        source = item['source']
        section_path = source['other_info']['full_path']
        id = item['id']

        # Section titles to skip: فصل (chapter), موخره (closing), امضاء (signature), عنوان (title)
        filtered_keys = ['فصل','موخره','امضاء','عنوان']
        flag = False
        if '>' in section_path:
            path_parts = section_path.split('>')
            for key in filtered_keys:
                if key in path_parts[-1]:
                    dont_cares.append(id)
                    flag = True
                    break
            if flag:
                continue
        else:
            for key in filtered_keys:
                if key in section_path:
                    dont_cares.append(id)
                    flag = True
                    break
            if flag:
                continue

        qanon_title = source['qanon_title']
        full_path_text = full_path_text_maker(section_path.split('>'))
        # Persian template: "The content of <full_path_text> of <qanon_title> is: "
        section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "

        try:
            content = cleaning(item['source']['content'])
            sentences = content.split(".")
            # # Skip very small sections that effectively have no content
            # if len(content.split()) <= 10:
            #     continue
        except Exception as error:
            print(error)
            continue

        for sentence in sentences:
            if sentence == "":
                continue
            all_count += 1
            sentence_id = f"sn{n}"
            n += 1

            data = {
                'id': id,
                "sentence_id": sentence_id,
                'fullpath': section_path,
                'qanon-title': qanon_title,
                'section-prefix': section_prefix,
                'sentence-content': sentence
            }

            ALL_SECTIONS.append(data)

    with open('ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
        json.dump(ALL_SECTIONS, f, indent=4, ensure_ascii=False)
    print(f'all_count: {all_count}')
    print(f'dont_cares: {len(dont_cares)}')
    print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')
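As a worked example of full_path_text_maker above (the path items are hypothetical; the output follows directly from the loop over reversed(full_path)):

parts = ["قانون بودجه", "ماده 1", "تبصره 2"]   # hypothetical tree path, already split on '>'
print(full_path_text_maker(parts))
# -> "تبصره 2 از ماده 1 از قانون بودجه"
# i.e. "clause 2 of article 1 of the budget law": items are emitted from the most specific
# back to the most general, joined with " از " ("of").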