rag_qavanin_api/512_tokens_finder.py

# In the name of God
from transformers import AutoTokenizer
from typing import List
import unicodedata
import json
import re
# -----------------------------
# Normalization Utilities
# -----------------------------
_ARABIC_TO_PERSIAN = {
    "ي": "ی",
    "ك": "ک",
    "ۀ": "ه",
    "ة": "ه",
    "ؤ": "و",
    "إ": "ا",
    "أ": "ا",
    "ٱ": "ا",
    "آ": "ا",
}
# Translation table mapping every Unicode combining mark (category "Mn") to
# None, so that str.translate() strips diacritics such as fatha, kasra and shadda.
_MAX_CODEPOINT = 0x110000  # one past sys.maxunicode
_DIACRITICS = dict.fromkeys(
    i for i in range(_MAX_CODEPOINT) if unicodedata.category(chr(i)) == 'Mn'
)
_ZWNJ = "\u200c"     # zero-width non-joiner (Persian half-space)
_TATWEEL = "\u0640"  # Arabic tatweel (kashida elongation)
# Regex pattern for finding word tokens (letters, digits and underscore,
# plus the Arabic Unicode block)
_TOKEN_RE = re.compile(r"[\w\u0600-\u06FF]+", re.UNICODE)
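# Illustrative only (assumed sample input, not from the original script):
# _TOKEN_RE keeps letters and digits and drops punctuation and dashes, e.g.
#   _TOKEN_RE.findall("ماده ۱ - این قانون")  ->  ['ماده', '۱', 'این', 'قانون']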
def nfkc(text: str) -> str:
    """Normalize the text to the Unicode NFKC canonical form."""
    return unicodedata.normalize("NFKC", text)


def strip_diacritics(text: str) -> str:
    """Remove diacritics and other non-spacing marks from the text."""
    return text.translate(_DIACRITICS)


def unify_persian_arabic(text: str) -> str:
    """Replace Arabic letter variants with their Persian equivalents."""
    return text.translate(str.maketrans(_ARABIC_TO_PERSIAN))


def normalize_spaces(text: str) -> str:
    """Remove half-spaces and tatweel, and collapse repeated whitespace."""
    text = text.replace(_ZWNJ, " ")
    text = text.replace(_TATWEEL, "")
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def normalize_text(text: str) -> str:
    """Main normalization routine that applies all steps to the text in order."""
    if not isinstance(text, str):
        t = str(text)
    else:
        t = text
    t = nfkc(t)
    t = unify_persian_arabic(t)
    t = strip_diacritics(t)
    t = t.lower()
    t = normalize_spaces(t)
    return t


def tokenize(text: str) -> List[str]:
    """Split the text into word tokens."""
    return _TOKEN_RE.findall(text)
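# Illustrative only (assumed sample, not from the original data): the full
# pipeline maps Arabic letter forms to Persian, strips diacritics, replaces
# half-spaces with spaces and collapses extra whitespace, e.g.
#   normalize_text("كتاب‌هاي  جديدِ")  ->  "کتاب های جدید"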
# Tokenizer of the multilingual sentence-embedding model used downstream;
# its subword token count (not the whitespace word count) is what the
# 512-token check below is measured against.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

longs_count = 0
shorts_count = 0
all_sections_count = 0

file_path = "ALL_SECTIONS.json"
with open(file_path, 'r', encoding='utf-8') as file:
    ALL_SECTIONS = json.load(file)
NEW_ALL_SECTIONS = []
for section in ALL_SECTIONS:
    all_sections_count += 1
    sentence = section["sentence-content"]
    normal_txt = normalize_text(sentence.strip())
    tokens = tokenizer.tokenize(normal_txt)
    # Flag sections that exceed the 512-token limit or are too short
    # (fewer than 10 tokens) to be useful on their own.
    section["tokens_len"] = len(tokens)
    if len(tokens) > 512:
        longs_count += 1
        section["is_long"] = True
        section["is_short"] = False
    elif len(tokens) < 10:
        shorts_count += 1
        section["is_long"] = False
        section["is_short"] = True
    else:
        section["is_long"] = False
        section["is_short"] = False
    NEW_ALL_SECTIONS.append(section)
with open('512t_ALL_SECTIONS.json', 'w', encoding='utf-8') as f:
    json.dump(NEW_ALL_SECTIONS, f, indent=4, ensure_ascii=False)

print(f"All Sections : {all_sections_count}")
print(f"Long Sections : {longs_count}")
print(f"Short Sections : {shorts_count}")
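# A possible follow-up step (sketch only; the file name matches the output
# written above, but the downstream logic is an assumption, not part of this
# script): sections flagged as long could be pulled out for further splitting
# before embedding, e.g.
#
#   with open('512t_ALL_SECTIONS.json', 'r', encoding='utf-8') as f:
#       flagged = json.load(f)
#   long_sections = [s for s in flagged if s["is_long"]]
#   print(f"Sections needing further splitting: {len(long_sections)}")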