split speech sentences process
This commit is contained in:
parent 3d78d3623d
commit 0402193403

embedder.py | 46
@@ -27,6 +27,7 @@ from pathlib import Path
 
 # NLP and ML libraries
 from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer
 from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 from sklearn.metrics.pairwise import cosine_similarity
@@ -108,6 +109,23 @@ class PersianVectorAnalyzer:
         except Exception as e:
             logger.error(f"Error loading model: {e}")
             raise
+    def split_sentence(self, sentence: str):
+        sentences = []
+        sentence_len = len(self.tokenize_sentence(sentence))
+        if sentence_len < 512:
+            sentences.append(sentence)
+        else:
+            temp_sentences = str(sentence).split('.')
+            for sent in temp_sentences:
+                sent_len = len(self.tokenize_sentence(sent))
+                if sent_len > 512:
+                    temp_sentences_2 = str(sentence).split('،')
+                    for snt in temp_sentences_2:
+                        sentences.append(snt)
+                else:
+                    sentences.append(sent)
+
+        return sentences
 
     def load_json_data(self, file_path: str) -> List[str]:
         """
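Note on the hunk above: when a period-split chunk is still over the limit, the inner branch splits the whole original sentence on the Persian comma '،' rather than the over-long chunk sent, so the same comma-split pieces are appended once per over-long chunk and sent itself is never shortened. A minimal sketch of the apparent intent (a hypothetical rewrite, not what the commit contains), reusing self.tokenize_sentence, the 512-token limit from this diff, and the module's existing List import:

    def split_sentence(self, sentence: str) -> List[str]:
        """Split a sentence into pieces that stay under the 512-token limit."""
        sentences = []
        if len(self.tokenize_sentence(sentence)) < 512:
            sentences.append(sentence)
            return sentences
        for sent in str(sentence).split('.'):  # first pass: split on periods
            if len(self.tokenize_sentence(sent)) > 512:
                # second pass: split the still-too-long chunk (not the whole
                # sentence) on the Persian comma '،'
                sentences.extend(sent.split('،'))
            else:
                sentences.append(sent)
        return sentences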
@@ -136,11 +154,11 @@ class PersianVectorAnalyzer:
                     # Extract sentences from different possible keys
                     for key in ['persian_translate']:
                         if key in item and item[key]:
-                            splited_sentences = str(item[key]).split('.')
+                            splited_sentences = self.split_sentence(item[key])
                             for sent in splited_sentences:
                                 sentences.append(sent)
                 elif isinstance(item, str):
-                    splited_sentences = str(item).split('.')
+                    splited_sentences = self.split_sentence(item[key])
                     for sent in splited_sentences:
                         sentences.append(sent)
         elif isinstance(data, dict):
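In the elif isinstance(item, str) branch the new call still passes item[key], but item is a plain string there and indexing it with a string key raises a TypeError; the old line split item itself. The intended call is presumably the following (a hypothetical fix, not part of the commit):

                elif isinstance(item, str):
                    # item is already the sentence text, so split it directly
                    splited_sentences = self.split_sentence(item)
                    for sent in splited_sentences:
                        sentences.append(sent)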
@@ -181,6 +199,18 @@ class PersianVectorAnalyzer:
 
         return text.strip()
 
+    def tokenize_sentence(self, sentence: str):
+
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            print(self.model_name)
+            tokens = tokenizer.tokenize(sentence)
+            return tokens
+        except:
+            error = "An exception occurred in tokenizer : " + model_checkpoint
+            #file.write( error + '\n' )
+            return []
+
     def extract_words(self, sentences: List[str]) -> List[str]:
         """
         Extract all words from sentences.
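As written, tokenize_sentence calls AutoTokenizer.from_pretrained on every invocation, which re-reads (and may re-download) the tokenizer files each time it is hit from the loops in split_sentence and the pipeline, and the bare except references model_checkpoint, which is not defined in this scope. A sketch of a cached variant (hypothetical, assuming the module-level logger and self.model_name seen elsewhere in this file; _tokenizer is an invented cache attribute):

    def tokenize_sentence(self, sentence: str):
        # Load the tokenizer once and reuse it across calls.
        if not hasattr(self, "_tokenizer"):
            self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        try:
            return self._tokenizer.tokenize(sentence)
        except Exception as e:
            logger.error(f"An exception occurred in tokenizer ({self.model_name}): {e}")
            return []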
@@ -464,8 +494,8 @@ class PersianVectorAnalyzer:
         Run the complete processing pipeline.
 
         Args:
-            input_file: Path to input JSON file
-            output_dir: Output directory for results
+            input_file(str): Path to input JSON file
+            output_dir(str): Output directory for results
         """
         # Create output directory
         Path(output_dir).mkdir(exist_ok=True)
@@ -475,6 +505,10 @@ class PersianVectorAnalyzer:
         # Step 1: Load data
         sentences = self.load_json_data(input_file)
 
+        for s in sentences:
+            s_len = len(self.tokenize_sentence(s))
+            if s_len > 512:
+                print(s)
         # Step 2: Extract words
         # all_words = self.extract_words(sentences)
 
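The new check in the pipeline re-tokenizes every sentence and prints any that are still over 512 tokens to stdout. A hedged alternative that reports a summary through the module's logger instead (illustrative only, not part of the commit):

        # Count and report sentences that still exceed the 512-token limit
        too_long = [s for s in sentences if len(self.tokenize_sentence(s)) > 512]
        if too_long:
            logger.warning(f"{len(too_long)} sentence(s) still exceed 512 tokens after splitting")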
@@ -530,8 +564,8 @@ def main():
     analyzer = PersianVectorAnalyzer()
 
     # Define input and output paths
-    input_file = "./data/final_wisdom.json"
-    output_dir = "output"
+    input_file = "./out/nahj_speeches.json"
+    output_dir = "output-speechs"
 
     # Run the complete pipeline
     analyzer.process_pipeline(input_file, output_dir)