split speech sentences process

This commit is contained in:
ajokar 2025-08-20 19:56:31 +03:30
parent 3d78d3623d
commit 0402193403


@@ -27,6 +27,7 @@ from pathlib import Path
 # NLP and ML libraries
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_similarity
@@ -108,7 +109,24 @@ class PersianVectorAnalyzer:
         except Exception as e:
             logger.error(f"Error loading model: {e}")
             raise
+
+    def split_sentence(self, sentence: str) -> List[str]:
+        """Split a sentence into chunks that fit the model's 512-token limit."""
+        sentences = []
+        sentence_len = len(self.tokenize_sentence(sentence))
+        if sentence_len < 512:
+            sentences.append(sentence)
+        else:
+            # Split on '.'; re-split any piece that is still too long
+            # on the Persian comma ('،').
+            temp_sentences = str(sentence).split('.')
+            for sent in temp_sentences:
+                sent_len = len(self.tokenize_sentence(sent))
+                if sent_len > 512:
+                    temp_sentences_2 = str(sent).split('،')
+                    for snt in temp_sentences_2:
+                        sentences.append(snt)
+                else:
+                    sentences.append(sent)
+        return sentences
+
     def load_json_data(self, file_path: str) -> List[str]:
         """
         Load Persian sentences from JSON file.
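
For context, the splitting strategy added above can be exercised standalone. A minimal sketch, assuming a Hugging Face tokenizer is available; the ParsBERT checkpoint name is an assumption for illustration, not taken from this repository:

from transformers import AutoTokenizer

# Assumed checkpoint for illustration; substitute the project's model_name.
MODEL_NAME = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def split_long(text: str, limit: int = 512) -> list[str]:
    # Keep the text whole if it already fits the token limit.
    if len(tokenizer.tokenize(text)) < limit:
        return [text]
    chunks: list[str] = []
    for piece in text.split('.'):
        # Re-split oversized pieces on the Persian comma '،'.
        if len(tokenizer.tokenize(piece)) > limit:
            chunks.extend(piece.split('،'))
        else:
            chunks.append(piece)
    return chunks

Every returned chunk can then be re-checked with len(tokenizer.tokenize(chunk)) before embedding, which is the same guard the pipeline applies after loading.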
@@ -136,11 +154,11 @@ class PersianVectorAnalyzer:
                     # Extract sentences from different possible keys
                     for key in ['persian_translate']:
                         if key in item and item[key]:
-                            splited_sentences = str(item[key]).split('.')
+                            splited_sentences = self.split_sentence(item[key])
                             for sent in splited_sentences:
                                 sentences.append(sent)
                 elif isinstance(item, str):
-                    splited_sentences = str(item).split('.')
+                    splited_sentences = self.split_sentence(item)
                     for sent in splited_sentences:
                         sentences.append(sent)
         elif isinstance(data, dict):
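
For reference, the loader handles both shapes shown above: dict items carrying a persian_translate key and bare strings. A hypothetical input file (the Persian sentences are illustrative placeholders):

[
  {"persian_translate": "جمله اول. جمله دوم"},
  "یک جمله تنها"
]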
@@ -181,6 +199,18 @@ class PersianVectorAnalyzer:
         return text.strip()
+
+    def tokenize_sentence(self, sentence: str) -> List[str]:
+        """Tokenize a sentence with the model's tokenizer."""
+        try:
+            # Load the tokenizer once and reuse it across calls.
+            if not hasattr(self, "_tokenizer"):
+                self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            return self._tokenizer.tokenize(sentence)
+        except Exception as e:
+            logger.error(f"An exception occurred in tokenizer '{self.model_name}': {e}")
+            return []
+
     def extract_words(self, sentences: List[str]) -> List[str]:
         """
         Extract all words from sentences.
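
A quick way to exercise the new helper from a REPL, assuming PersianVectorAnalyzer() is constructible with its defaults as in main():

analyzer = PersianVectorAnalyzer()
tokens = analyzer.tokenize_sentence("این یک جمله آزمایشی است.")  # "This is a test sentence."
print(len(tokens))  # the token count that split_sentence compares against 512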
@@ -464,8 +494,8 @@ class PersianVectorAnalyzer:
         Run the complete processing pipeline.
         Args:
-            input_file: Path to input JSON file
-            output_dir: Output directory for results
+            input_file (str): Path to input JSON file
+            output_dir (str): Output directory for results
         """
         # Create output directory
         Path(output_dir).mkdir(exist_ok=True)
@@ -475,6 +505,10 @@ class PersianVectorAnalyzer:
         # Step 1: Load data
         sentences = self.load_json_data(input_file)
+        # Sanity check: report any sentence that still exceeds the 512-token limit
+        for s in sentences:
+            s_len = len(self.tokenize_sentence(s))
+            if s_len > 512:
+                logger.warning(f"Sentence longer than 512 tokens: {s}")
         # Step 2: Extract words
         # all_words = self.extract_words(sentences)
@@ -530,8 +564,8 @@ def main():
     analyzer = PersianVectorAnalyzer()
     # Define input and output paths
-    input_file = "./data/final_wisdom.json"
-    output_dir = "output"
+    input_file = "./out/nahj_speeches.json"
+    output_dir = "output-speechs"
     # Run the complete pipeline
     analyzer.process_pipeline(input_file, output_dir)
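
The same pipeline can be pointed at another corpus by swapping the two paths; a minimal sketch, with hypothetical paths and a hypothetical run_for helper:

def run_for(corpus_json: str, out_dir: str) -> None:
    # Both arguments mirror main()'s hard-coded paths.
    analyzer = PersianVectorAnalyzer()
    analyzer.process_pipeline(corpus_json, out_dir)

run_for("./out/nahj_letters.json", "output-letters")  # hypothetical paths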