From d831e3ee5249849fa252620f6f8997f9a3130126 Mon Sep 17 00:00:00 2001
From: mdorstkar
Date: Thu, 9 Oct 2025 15:08:13 +0330
Subject: [PATCH] Add 512-token finder and sentence-splitting scripts

---
 512_tokens_finder.py           | 135 +++++++++++++++++++++++++++++++++
 elastic_helper.py              |  18 ++---
 embedder_sbert_qavanin_285k.py |   4 +-
 split_content_to_sentences.py  | 105 +++++++++++++++++++++++++
 4 files changed, 251 insertions(+), 11 deletions(-)
 create mode 100644 512_tokens_finder.py
 create mode 100644 split_content_to_sentences.py

diff --git a/512_tokens_finder.py b/512_tokens_finder.py
new file mode 100644
index 0000000..b047635
--- /dev/null
+++ b/512_tokens_finder.py
@@ -0,0 +1,135 @@
+# In the name of God
+
+
+from transformers import AutoTokenizer
+from typing import List
+import unicodedata
+import json
+import re
+
+
+
+# -----------------------------
+# Normalization Utilities
+# -----------------------------
+_ARABIC_TO_PERSIAN = {
+    "ي": "ی",
+    "ك": "ک",
+    "ۀ": "ه",
+    "ة": "ه",
+    "ؤ": "و",
+    "إ": "ا",
+    "أ": "ا",
+    "ٱ": "ا",
+    "آ": "ا",
+}
+
+sys_max = 0x110000  # sys.maxunicode + 1: exclusive upper bound of the Unicode range
+_DIACRITICS = dict.fromkeys(
+    i for i in range(sys_max) if unicodedata.category(chr(i)) == 'Mn'
+)  # every non-spacing mark (category 'Mn') mapped to None so str.translate() removes it
+
+_ZWNJ = "\u200c"  # zero-width non-joiner (half-space)
+_TATWEEL = "\u0640"  # tatweel (kashida elongation)
+
+# Regex pattern for finding words (letters, digits and underscore)
+_TOKEN_RE = re.compile(r"[\w\u0600-\u06FF]+", re.UNICODE)
+
+def nfkc(text: str) -> str:
+    """Convert the text to the Unicode NFKC normal form."""
+    return unicodedata.normalize("NFKC", text)
+
+
+def strip_diacritics(text: str) -> str:
+    """Remove diacritics and other non-spacing marks from the text."""
+    return text.translate(_DIACRITICS)
+
+
+def unify_persian_arabic(text: str) -> str:
+    """Map Arabic characters to their Persian equivalents."""
+    return text.translate(str.maketrans(_ARABIC_TO_PERSIAN))
+
+
+def normalize_spaces(text: str) -> str:
+    """Remove ZWNJ and tatweel, and collapse extra whitespace."""
+    text = text.replace(_ZWNJ, " ")
+    text = text.replace(_TATWEEL, "")
+    text = re.sub(r"\s+", " ", text)
+    return text.strip()
+
+
+def normalize_text(text: str) -> str:
+    """
+    Main normalization function; applies all of the steps above to the text in order.
+    """
+    if not isinstance(text, str):
+        t = str(text)
+    else:
+        t = text
+
+    t = nfkc(t)
+    t = unify_persian_arabic(t)
+    t = strip_diacritics(t)
+    t = t.lower()
+    t = normalize_spaces(t)
+    return t
+
+
+def tokenize(text: str) -> List[str]:
+    """Split the text into tokens (words)."""
+    return _TOKEN_RE.findall(text)
+
+
+
+
+
+
+
+model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+longs_count = 0
+shorts_count = 0
+all_sections_count = 0
+file_path = "ALL_SECTIONS.json"
+with open(file_path, 'r', encoding='utf-8') as file:
+    ALL_SECTIONS = json.load(file)
+
+
+NEW_ALL_SECTIONS = []
+for section in ALL_SECTIONS:
+    all_sections_count += 1
+    sentence = section["sentence-content"]
+    normal_txt = normalize_text(sentence.strip())
+    tokens = tokenizer.tokenize(normal_txt)
+
+    if len(tokens) > 512:  # longer than the model's 512-token input limit
+        longs_count += 1
+        section["tokens_len"] = len(tokens)
+        section["is_long"] = True
+        section["is_short"] = False
+
+
+    elif len(tokens) < 10:  # too short to carry useful content
+        shorts_count += 1
+        section["tokens_len"] = len(tokens)
+        section["is_long"] = False
+        section["is_short"] = True
+
+
+    else:
+        section["tokens_len"] = len(tokens)
+        section["is_long"] = False
+        section["is_short"] = False
+
+
+    NEW_ALL_SECTIONS.append(section)
+
+
+
+with open('512t_ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
+    json.dump(NEW_ALL_SECTIONS, f, indent=4, ensure_ascii=False)
+
+print(f"All Sections : {all_sections_count}")
+print(f"Long Sections : {longs_count}")
+print(f"Short Sections : {shorts_count}")
+
+
diff --git a/elastic_helper.py b/elastic_helper.py
index 57797bd..9774e16 100644
--- a/elastic_helper.py
+++ b/elastic_helper.py
@@ -14,16 +14,16 @@ class ElasticHelper():
 
     def __init__(self, es_url="http://127.0.0.1:6900", es_pass="", es_user="elastic", path_mappings = ""):
 
-        if path_mappings :
-            self.path_mappings = path_mappings
+        # if path_mappings :
+        #     self.path_mappings = path_mappings
 
-        if es_pass == '' :
-            self.es = Elasticsearch(es_url)
-        else:
-            self.es = Elasticsearch(
-                es_url,
-                http_auth=(es_user, es_pass),
-            )
+        # if es_pass == '' :
+        #     self.es = Elasticsearch(es_url)
+        # else:
+        #     self.es = Elasticsearch(
+        #         es_url,
+        #         http_auth=(es_user, es_pass),
+        #     )
 
         # print(es_url)
         # print(self.es)
diff --git a/embedder_sbert_qavanin_285k.py b/embedder_sbert_qavanin_285k.py
index a063d73..0308d79 100644
--- a/embedder_sbert_qavanin_285k.py
+++ b/embedder_sbert_qavanin_285k.py
@@ -30,7 +30,7 @@ from transformers import AutoTokenizer
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_similarity
-#from normalizer import cleaning
+from normalizer import cleaning
 try:
     from elastic_helper import ElasticHelper
 except Exception as error:
@@ -44,7 +44,7 @@ except Exception as error:
 
 # Persian text processing
 # import hazm
-# from hazm import Normalizer, word_tokenize, POSTagger
+from hazm import Normalizer, word_tokenize, POSTagger
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
diff --git a/split_content_to_sentences.py b/split_content_to_sentences.py
new file mode 100644
index 0000000..bbb40ea
--- /dev/null
+++ b/split_content_to_sentences.py
@@ -0,0 +1,105 @@
+# In the name of God
+
+import json
+from normalizer import cleaning
+# try:
+from elastic_helper import ElasticHelper
+# except Exception as error:
+#     eee = error
+#     pass
+
+
+
+
+def full_path_text_maker(full_path):
+    """
+    Takes the tree path of a section and rebuilds a descriptive text from its parts, ordered from the most specific to the most general, then returns it.
+
+    Args:
+        full_path (list): list of elements describing the tree path of this section.
+    Returns:
+        full_path_text (str): the text reconstructed from the section's path.
+    """
+    full_path_text = ""
+    for i, path_item in enumerate(reversed(full_path)):
+        if i == len(full_path) - 1:
+            full_path_text += f'{path_item}'
+            break
+        full_path_text += f'{path_item} از '  # ' از ' means 'of', linking each part to the next
+    full_path_text = full_path_text.strip()
+    return full_path_text
+
+
+
+if __name__ == "__main__":
+    eh_obj = ElasticHelper()
+    path = r".\data\mj_qa_section-v02.zip"  # raw string so the backslashes are not treated as escapes
+    sections_elastic = eh_obj.iterateJsonFile(path, True)
+    all_count = 0
+    dont_cares = []
+    ALL_SECTIONS = []
+    n = 0
+    for index, item in enumerate(sections_elastic):
+
+        source = item['source']
+        section_path = source['other_info']['full_path']
+        id = item['id']
+
+        # Headings to skip: chapter, epilogue, signature and title sections.
+        filtered_keys = ['فصل', 'موخره', 'امضاء', 'عنوان']
+        flag = False
+        if '>' in section_path:
+            path_parts = section_path.split('>')
+            for key in filtered_keys:
+                if key in path_parts[-1]:
+                    dont_cares.append(id)
+                    flag = True
+                    break
+            if flag:
+                continue
+        else:
+            for key in filtered_keys:
+                if key in section_path:
+                    dont_cares.append(id)
+                    flag = True
+                    break
+            if flag:
+                continue
+
+        qanon_title = source['qanon_title']
+        full_path_text = full_path_text_maker(section_path.split('>'))
+        section_prefix = f"محتوای {full_path_text} {cleaning(qanon_title)} عبارت است از: "  # Persian: "The content of <path> <law title> is: "
+
+        try:
+            content = cleaning(item['source']['content'])
+            sentences = content.split(".")  # naive sentence split on '.'
+            # # Discard very small sections that effectively have no content
+            # if len(content.split()) <= 10:
+            #     continue
+        except Exception as error:
+            print(error)
+            continue
+
+        for sentence in sentences:
+            if sentence == "":
+                continue
+            all_count += 1
+            sentence_id = f"sn{n}"
+            n += 1
+
+            data = {
+                'id': id,
+                "sentence_id": sentence_id,
+                'fullpath': section_path,
+                'qanon-title': qanon_title,
+                'section-prefix': section_prefix,
+                'sentence-content': sentence
+            }
+
+            ALL_SECTIONS.append(data)
+
+    with open('ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
+        json.dump(ALL_SECTIONS, f, indent=4, ensure_ascii=False)
+    print(f'all_count: {all_count}')
+    print(f'dont_cares: {len(dont_cares)}')
+    print(f'ALL_SECTIONS without dont-cares: {len(ALL_SECTIONS)}')
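
Note (not part of the patch): a minimal downstream sketch, assuming the is_long / is_short flags written by 512_tokens_finder.py are meant to select which sentences go on to embedding (presumably in embedder_sbert_qavanin_285k.py). It only uses fields that the two new scripts actually write; the filtering step itself is an assumption, not something this patch implements.

# Sketch only: load 512t_ALL_SECTIONS.json and keep the sentences that fit the
# 512-token limit and are not flagged as too short.
import json

with open("512t_ALL_SECTIONS.json", "r", encoding="utf-8") as f:
    sections = json.load(f)

usable = [s for s in sections if not s["is_long"] and not s["is_short"]]

print(f"usable sentences: {len(usable)} / {len(sections)}")
for s in usable[:3]:
    print(s["sentence_id"], s["tokens_len"], s["sentence-content"][:60])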