split speech sentences process
This commit is contained in:
parent 3d78d3623d
commit 0402193403

embedder.py (46 changes)
@@ -27,6 +27,7 @@ from pathlib import Path
 
 # NLP and ML libraries
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_similarity
@@ -108,6 +109,23 @@ class PersianVectorAnalyzer:
         except Exception as e:
             logger.error(f"Error loading model: {e}")
             raise
+
+    def split_sentence(self, sentence:str):
+        sentences = []
+        sentence_len = len(self.tokenize_sentence(sentence))
+        if sentence_len < 512:
+            sentences.append(sentence)
+        else:
+            temp_sentences = str(sentence).split('.')
+            for sent in temp_sentences:
+                sent_len = len(self.tokenize_sentence(sent))
+                if sent_len > 512:
+                    temp_sentences_2 = str(sentence).split('،')
+                    for snt in temp_sentences_2:
+                        sentences.append(snt)
+                else:
+                    sentences.append(sent)
+        return sentences
 
     def load_json_data(self, file_path: str) -> List[str]:
         """
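Note on the new split_sentence helper above: below is a minimal, self-contained sketch of the same splitting strategy, assuming the intent is to keep every chunk under the model's 512-token limit and to re-split only the oversized piece (the committed line splits the whole sentence on '،' again). token_count is a hypothetical stand-in for self.tokenize_sentence.

from typing import Callable, List

def split_long_sentence(sentence: str,
                        token_count: Callable[[str], int],
                        max_tokens: int = 512) -> List[str]:
    # Keep the sentence whole if it already fits the limit.
    if token_count(sentence) < max_tokens:
        return [sentence]
    chunks: List[str] = []
    for part in sentence.split('.'):
        if token_count(part) > max_tokens:
            # Re-split only the oversized part, on the Persian comma '،'.
            chunks.extend(part.split('،'))
        else:
            chunks.append(part)
    return chunks

# Example with a crude whitespace token count, so the sketch runs offline.
print(split_long_sentence("جمله اول. جمله دوم، بخش دیگر.", lambda s: len(s.split())))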
@@ -136,11 +154,11 @@ class PersianVectorAnalyzer:
                     # Extract sentences from different possible keys
                     for key in ['persian_translate']:
                         if key in item and item[key]:
-                            splited_sentences = str(item[key]).split('.')
+                            splited_sentences = self.split_sentence(item[key])
                             for sent in splited_sentences:
                                 sentences.append(sent)
                 elif isinstance(item, str):
-                    splited_sentences = str(item).split('.')
+                    splited_sentences = self.split_sentence(item[key])
                     for sent in splited_sentences:
                         sentences.append(sent)
             elif isinstance(data, dict):
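On the second replacement in this hunk: in the elif isinstance(item, str) branch, item is already a plain string, so indexing it with key (left over from the dict branch) would fail at runtime. A hedged sketch of the loop as presumably intended, with split_sentence stubbed so it runs standalone:

from typing import List

def split_sentence(text: str) -> List[str]:
    # Stand-in for PersianVectorAnalyzer.split_sentence.
    return [p.strip() for p in text.split('.') if p.strip()]

def collect_sentences(data: list) -> List[str]:
    sentences: List[str] = []
    for item in data:
        if isinstance(item, dict):
            for key in ['persian_translate']:
                if key in item and item[key]:
                    sentences.extend(split_sentence(item[key]))
        elif isinstance(item, str):
            # Pass the string itself, not item[key].
            sentences.extend(split_sentence(item))
    return sentences

print(collect_sentences([{'persian_translate': 'جمله اول. جمله دوم'}, 'جمله سوم.']))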
@@ -181,6 +199,18 @@ class PersianVectorAnalyzer:
 
         return text.strip()
 
+    def tokenize_sentence(self, sentence:str):
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            print(self.model_name)
+            tokens = tokenizer.tokenize(sentence)
+            return tokens
+        except:
+            error = "An exception occurred in tokenizer : " + model_checkpoint
+            #file.write( error + '\n' )
+            return []
+
     def extract_words(self, sentences: List[str]) -> List[str]:
         """
         Extract all words from sentences.
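tokenize_sentence above calls AutoTokenizer.from_pretrained on every invocation, and its except branch references an undefined model_checkpoint. A sketch of a variant that loads the tokenizer once and logs the actual exception; the model name in the usage comment is only an assumption for illustration, the class reads it from self.model_name.

from transformers import AutoTokenizer

class CachedTokenizer:
    def __init__(self, model_name: str):
        # Load the Hugging Face tokenizer a single time and reuse it.
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_sentence(self, sentence: str):
        try:
            return self.tokenizer.tokenize(sentence)
        except Exception as e:
            print(f"An exception occurred in tokenizer ({self.model_name}): {e}")
            return []

# Example usage (model name is an assumption, not taken from the commit):
# counter = CachedTokenizer("HooshvareLab/bert-fa-base-uncased")
# print(len(counter.tokenize_sentence("جمله نمونه")))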
@@ -464,8 +494,8 @@ class PersianVectorAnalyzer:
         Run the complete processing pipeline.
 
         Args:
-            input_file: Path to input JSON file
-            output_dir: Output directory for results
+            input_file(str): Path to input JSON file
+            output_dir(str): Output directory for results
         """
         # Create output directory
         Path(output_dir).mkdir(exist_ok=True)
@@ -475,6 +505,10 @@ class PersianVectorAnalyzer:
         # Step 1: Load data
         sentences = self.load_json_data(input_file)
 
+        for s in sentences:
+            s_len = len(self.tokenize_sentence(s))
+            if s_len > 512:
+                print(s)
         # Step 2: Extract words
         # all_words = self.extract_words(sentences)
 
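The loop added in this hunk only prints sentences that still exceed 512 tokens after loading. A small sketch of the same check as a reusable helper, assuming the goal is simply to flag anything the embedding model would truncate; tokenize stands in for self.tokenize_sentence.

from typing import Callable, List

def report_overlong(sentences: List[str],
                    tokenize: Callable[[str], list],
                    max_tokens: int = 512) -> List[str]:
    # Collect and print every sentence whose token count exceeds the limit.
    overlong = []
    for s in sentences:
        n = len(tokenize(s))
        if n > max_tokens:
            print(f"[{n} tokens] {s[:60]}")
            overlong.append(s)
    return overlong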
@@ -530,8 +564,8 @@ def main():
     analyzer = PersianVectorAnalyzer()
 
     # Define input and output paths
-    input_file = "./data/final_wisdom.json"
-    output_dir = "output"
+    input_file = "./out/nahj_speeches.json"
+    output_dir = "output-speechs"
 
     # Run the complete pipeline
     analyzer.process_pipeline(input_file, output_dir)