# In the name of God

from transformers import AutoTokenizer
from typing import List
import unicodedata
import json
import re


# -----------------------------
# Normalization Utilities
# -----------------------------
_ARABIC_TO_PERSIAN = {
    "ي": "ی",
    "ك": "ک",
    "ۀ": "ه",
    "ة": "ه",
    "ؤ": "و",
    "إ": "ا",
    "أ": "ا",
    "ٱ": "ا",
    "آ": "ا",
}

sys_max = 0x110000  # number of Unicode code points (sys.maxunicode + 1)

# Translation table that deletes every combining mark (Unicode category 'Mn').
_DIACRITICS = dict.fromkeys(
    i for i in range(sys_max) if unicodedata.category(chr(i)) == 'Mn'
)

_ZWNJ = "\u200c"     # zero-width non-joiner (half-space)
_TATWEEL = "\u0640"  # tatweel (kashida / elongation)

# Regex pattern for finding words (letters, digits and underscores)
_TOKEN_RE = re.compile(r"[\w\u0600-\u06FF]+", re.UNICODE)

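# Illustrative behaviour of the constants above (the sample strings are
# assumptions for demonstration, not taken from the project's data):
#   "عَلَم".translate(_DIACRITICS)      -> "علم"  (combining marks removed)
#   _TOKEN_RE.findall("متن test 123")  -> ["متن", "test", "123"]
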
def nfkc(text: str) -> str:
    """Normalize the text to the standard Unicode NFKC form."""
    return unicodedata.normalize("NFKC", text)


def strip_diacritics(text: str) -> str:
    """Remove diacritics and other non-spacing marks from the text."""
    return text.translate(_DIACRITICS)


def unify_persian_arabic(text: str) -> str:
    """Convert Arabic letters to their Persian equivalents."""
    return text.translate(str.maketrans(_ARABIC_TO_PERSIAN))


def normalize_spaces(text: str) -> str:
    """Remove ZWNJ and tatweel and collapse extra whitespace."""
    text = text.replace(_ZWNJ, " ")
    text = text.replace(_TATWEEL, "")
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def normalize_text(text: str) -> str:
    """Main normalization function that applies all steps to the text in order."""
    if not isinstance(text, str):
        t = str(text)
    else:
        t = text

    t = nfkc(t)
    t = unify_persian_arabic(t)
    t = strip_diacritics(t)
    t = t.lower()
    t = normalize_spaces(t)
    return t


def tokenize(text: str) -> List[str]:
    """Split the text into tokens (words)."""
    return _TOKEN_RE.findall(text)


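# A minimal usage sketch of the helpers above (the sample string is an
# assumption for illustration, not taken from the project's data):
#
#   raw = "كتاب\u200cهاي  خوب"       # Arabic kaf/yeh, a ZWNJ and doubled spaces
#   normalize_text(raw)              # -> "کتاب های خوب"
#   tokenize(normalize_text(raw))    # -> ["کتاب", "های", "خوب"]
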
# -----------------------------
# Token-length tagging
# -----------------------------
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

longs_count = 0
shorts_count = 0
all_sections_count = 0

file_path = "ALL_SECTIONS.json"
with open(file_path, 'r', encoding='utf-8') as file:
    ALL_SECTIONS = json.load(file)

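# Each entry of ALL_SECTIONS is assumed to carry at least a "sentence-content"
# string (the only field read below); all other fields are passed through
# untouched, e.g. (illustrative):
#   [{"sentence-content": "..."}, ...]
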
NEW_ALL_SECTIONS = []
for section in ALL_SECTIONS:
    all_sections_count += 1
    sentence = section["sentence-content"]
    normal_txt = normalize_text(sentence.strip())
    tokens = tokenizer.tokenize(normal_txt)

    section["tokens_len"] = len(tokens)
    # Flag sections that would overflow a typical 512-token transformer input,
    # and sections too short to carry much content.
    if len(tokens) > 512:
        longs_count += 1
        section["is_long"] = True
        section["is_short"] = False
    elif len(tokens) < 10:
        shorts_count += 1
        section["is_long"] = False
        section["is_short"] = True
    else:
        section["is_long"] = False
        section["is_short"] = False

    NEW_ALL_SECTIONS.append(section)

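# Each record in the written JSON keeps its original fields plus the three
# flags added in the loop above, e.g. (illustrative values):
#   {"sentence-content": "...", "tokens_len": 87, "is_long": false, "is_short": false}
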
with open('512t_ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
    json.dump(NEW_ALL_SECTIONS, f, indent=4, ensure_ascii=False)

print(f"All Sections : {all_sections_count}")
print(f"Long Sections : {longs_count}")
print(f"Short Sections : {shorts_count}")