split speech sentences process
This commit is contained in:
parent 3d78d3623d
commit 0402193403

embedder.py (46 changes)
@@ -27,6 +27,7 @@ from pathlib import Path
 
 # NLP and ML libraries
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_similarity
@@ -108,6 +109,23 @@ class PersianVectorAnalyzer:
         except Exception as e:
             logger.error(f"Error loading model: {e}")
             raise
+
+    def split_sentence(self, sentence:str):
+        sentences = []
+        sentence_len = len(self.tokenize_sentence(sentence))
+        if sentence_len < 512:
+            sentences.append(sentence)
+        else:
+            temp_sentences = str(sentence).split('.')
+            for sent in temp_sentences:
+                sent_len = len(self.tokenize_sentence(sent))
+                if sent_len > 512:
+                    temp_sentences_2 = str(sentence).split('،')
+                    for snt in temp_sentences_2:
+                        sentences.append(snt)
+                else:
+                    sentences.append(sent)
+        return sentences
 
     def load_json_data(self, file_path: str) -> List[str]:
         """
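Note on the new split_sentence helper above: below is a minimal, self-contained sketch of the same splitting strategy, assuming the intent is to keep every chunk under the model's 512-token limit and to re-split only the oversized piece (the committed line splits the whole sentence on '،' again). token_count is a hypothetical stand-in for self.tokenize_sentence.

from typing import Callable, List

def split_long_sentence(sentence: str,
                        token_count: Callable[[str], int],
                        max_tokens: int = 512) -> List[str]:
    # Keep the sentence whole if it already fits the limit.
    if token_count(sentence) < max_tokens:
        return [sentence]
    chunks: List[str] = []
    for part in sentence.split('.'):
        if token_count(part) > max_tokens:
            # Re-split only the oversized part, on the Persian comma '،'.
            chunks.extend(part.split('،'))
        else:
            chunks.append(part)
    return chunks

# Example with a crude whitespace token count, so the sketch runs offline.
print(split_long_sentence("جمله اول. جمله دوم، بخش دیگر.", lambda s: len(s.split())))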
@@ -136,11 +154,11 @@ class PersianVectorAnalyzer:
                     # Extract sentences from different possible keys
                     for key in ['persian_translate']:
                         if key in item and item[key]:
-                            splited_sentences = str(item[key]).split('.')
+                            splited_sentences = self.split_sentence(item[key])
                             for sent in splited_sentences:
                                 sentences.append(sent)
                 elif isinstance(item, str):
-                    splited_sentences = str(item).split('.')
+                    splited_sentences = self.split_sentence(item[key])
                     for sent in splited_sentences:
                         sentences.append(sent)
             elif isinstance(data, dict):
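On the second replacement in this hunk: in the elif isinstance(item, str) branch, item is already a plain string, so indexing it with key (left over from the dict branch) would fail at runtime. A hedged sketch of the loop as presumably intended, with split_sentence stubbed so it runs standalone:

from typing import List

def split_sentence(text: str) -> List[str]:
    # Stand-in for PersianVectorAnalyzer.split_sentence.
    return [p.strip() for p in text.split('.') if p.strip()]

def collect_sentences(data: list) -> List[str]:
    sentences: List[str] = []
    for item in data:
        if isinstance(item, dict):
            for key in ['persian_translate']:
                if key in item and item[key]:
                    sentences.extend(split_sentence(item[key]))
        elif isinstance(item, str):
            # Pass the string itself, not item[key].
            sentences.extend(split_sentence(item))
    return sentences

print(collect_sentences([{'persian_translate': 'جمله اول. جمله دوم'}, 'جمله سوم.']))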
@@ -181,6 +199,18 @@ class PersianVectorAnalyzer:
 
         return text.strip()
 
+    def tokenize_sentence(self, sentence:str):
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            print(self.model_name)
+            tokens = tokenizer.tokenize(sentence)
+            return tokens
+        except:
+            error = "An exception occurred in tokenizer : " + model_checkpoint
+            #file.write( error + '\n' )
+            return []
+
     def extract_words(self, sentences: List[str]) -> List[str]:
         """
         Extract all words from sentences.
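tokenize_sentence above calls AutoTokenizer.from_pretrained on every invocation, and its except branch references an undefined model_checkpoint. A sketch of a variant that loads the tokenizer once and logs the actual exception; the model name in the usage comment is only an assumption for illustration, the class reads it from self.model_name.

from transformers import AutoTokenizer

class CachedTokenizer:
    def __init__(self, model_name: str):
        # Load the Hugging Face tokenizer a single time and reuse it.
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_sentence(self, sentence: str):
        try:
            return self.tokenizer.tokenize(sentence)
        except Exception as e:
            print(f"An exception occurred in tokenizer ({self.model_name}): {e}")
            return []

# Example usage (model name is an assumption, not taken from the commit):
# counter = CachedTokenizer("HooshvareLab/bert-fa-base-uncased")
# print(len(counter.tokenize_sentence("جمله نمونه")))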
@@ -464,8 +494,8 @@ class PersianVectorAnalyzer:
         Run the complete processing pipeline.
 
         Args:
-            input_file: Path to input JSON file
-            output_dir: Output directory for results
+            input_file(str): Path to input JSON file
+            output_dir(str): Output directory for results
         """
         # Create output directory
         Path(output_dir).mkdir(exist_ok=True)
@@ -475,6 +505,10 @@ class PersianVectorAnalyzer:
         # Step 1: Load data
         sentences = self.load_json_data(input_file)
 
+        for s in sentences:
+            s_len = len(self.tokenize_sentence(s))
+            if s_len > 512:
+                print(s)
         # Step 2: Extract words
         # all_words = self.extract_words(sentences)
 
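The loop added in this hunk only prints sentences that still exceed 512 tokens after loading. A small sketch of the same check as a reusable helper, assuming the goal is simply to flag anything the embedding model would truncate; tokenize stands in for self.tokenize_sentence.

from typing import Callable, List

def report_overlong(sentences: List[str],
                    tokenize: Callable[[str], list],
                    max_tokens: int = 512) -> List[str]:
    # Collect and print every sentence whose token count exceeds the limit.
    overlong = []
    for s in sentences:
        n = len(tokenize(s))
        if n > max_tokens:
            print(f"[{n} tokens] {s[:60]}")
            overlong.append(s)
    return overlong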
@@ -530,8 +564,8 @@ def main():
     analyzer = PersianVectorAnalyzer()
 
     # Define input and output paths
-    input_file = "./data/final_wisdom.json"
-    output_dir = "output"
+    input_file = "./out/nahj_speeches.json"
+    output_dir = "output-speechs"
 
     # Run the complete pipeline
     analyzer.process_pipeline(input_file, output_dir)