first process

init_mahdi 2025-08-17 20:55:00 +03:30
parent 8eb3f5e5ed
commit 7e456568e5
6 changed files with 1845122 additions and 3938 deletions

18
.vscode/launch.json vendored Normal file

@@ -0,0 +1,18 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
//"console": "integratedTerminal",
"console": "internalConsole",
"justMyCode": false,
"python": "/home/gpu/NLP/.env/bin/python"
}
]
}

File diff suppressed because one or more lines are too long

530
embedder.py Normal file

@@ -0,0 +1,530 @@
# !pip install hazm
# !pip install transformers==4.26.0
# !pip install --upgrade numpy
# !pip install --upgrade sentence-transformers
"""
Persian Sentence Processing and Vector Analysis
==============================================
This script processes Persian sentences from a JSON file and performs:
1. Word extraction and preprocessing
2. Vector representation using multilingual transformer
3. Similarity analysis for key words
4. Dimensionality reduction to 3D
5. 3D visualization with Persian labels
Author: NLP Expert Assistant
"""
import json
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Set
from collections import Counter
import logging
from pathlib import Path
# NLP and ML libraries
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
# Visualization libraries
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots
# Persian text processing
import hazm
from hazm import Normalizer, word_tokenize, POSTagger
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class PersianVectorAnalyzer:
"""
A comprehensive class for Persian text processing and vector analysis.
"""
def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
"""
Initialize the analyzer with the specified model.
Args:
model_name: The sentence transformer model to use
"""
self.model_name = model_name
self.model = None
self.normalizer = Normalizer()
self.stop_words = self._load_persian_stop_words()
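# Key religious terms to analyze: God, servant, jihad, guardian, zakat,
# prayer, patience, worship, guardianship, caliphate, prophet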
self.key_words = [
"خدا", "بنده", "جهاد", "ولی", "زکات",
"نماز", "صبر", "عبادت", "ولایت", "خلافت","پیامبر"
]
logger.info(f"Initializing Persian Vector Analyzer with model: {model_name}")
def _load_persian_stop_words(self) -> Set[str]:
"""
Load Persian stop words.
Returns:
Set of Persian stop words
"""
# Common Persian stop words
stop_words = {
'و', 'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'برای', 'تا',
'را', 'هم', 'یا', 'اما', 'اگر', 'چون', 'چرا', 'چگونه', 'کجا',
'چه', 'کی', 'چند', 'چقدر', 'همه', 'هیچ', 'بعضی', 'هر',
'خود', 'خویش', 'من', 'تو', 'او', 'ما', 'شما', 'آنها', 'ایشان', 'اینها',
'است', 'هست', 'بود', 'شد', 'کرد', 'می', 'باید', 'خواهد', 'دارد', 'می‌شود', 'می‌کند',
'یک', 'دو', 'سه', 'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده',
'اول', 'دوم', 'سوم', 'چهارم', 'پنجم', 'ششم', 'هفتم', 'هشتم', 'نهم', 'دهم',
'سال', 'ماه', 'روز', 'هفته', 'ساعت', 'دقیقه', 'ثانیه', 'پس',
'بله', 'نه', 'آری', 'خیر', 'بلی', 'نخیر',
'حالا', 'الان', 'امروز', 'دیروز', 'فردا',
'بالا', 'پایین', 'چپ', 'راست', 'جلو', 'عقب', 'داخل', 'خارج',
'بزرگ', 'کوچک', 'بلند', 'کوتاه', 'پهن', 'باریک', 'ضخیم', 'نازک',
}
return stop_words
def load_model(self):
"""
Load the sentence transformer model.
"""
try:
logger.info("Loading sentence transformer model...")
self.model = SentenceTransformer(self.model_name)
logger.info("Model loaded successfully!")
except Exception as e:
logger.error(f"Error loading model: {e}")
raise
def load_json_data(self, file_path: str) -> List[str]:
"""
Load Persian sentences from JSON file.
Args:
file_path: Path to the JSON file
Returns:
List of Persian sentences
"""
try:
logger.info(f"Loading data from {file_path}")
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
if isinstance(data, dict):
# A top-level dict is flattened to a list of its values
data = list(data.values())
sentences = []
if isinstance(data, list):
for item in data:
if isinstance(item, dict):
# Extract the sentence text from the expected translation key
for key in ['persian_translate']:
if key in item and item[key]:
sentences.append(str(item[key]))
elif isinstance(item, str):
sentences.append(item)
elif isinstance(data, dict):
# If it's a single object, extract all string values
for value in data.values():
if isinstance(value, str):
sentences.append(value)
logger.info(f"Loaded {len(sentences)} sentences")
return sentences
except Exception as e:
logger.error(f"Error loading JSON data: {e}")
raise
def preprocess_text(self, text: str) -> str:
"""
Preprocess Persian text.
Args:
text: Raw Persian text
Returns:
Preprocessed text
"""
# Normalize text
text = self.normalizer.normalize(text)
# Remove extra whitespace
text = re.sub(r'\s+', ' ', text)
# Remove special characters but keep Persian characters
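# The kept ranges are the Arabic Unicode block plus its supplement and presentation
# forms, which cover Persian letters; non-Arabic-script characters (Latin letters,
# ASCII digits, punctuation) are dropped.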
text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)
return text.strip()
def extract_words(self, sentences: List[str]) -> List[str]:
"""
Extract all words from sentences.
Args:
sentences: List of Persian sentences
Returns:
List of all words
"""
all_words = []
for sentence in sentences:
# Preprocess sentence
processed_sentence = self.preprocess_text(sentence)
# Tokenize
words = word_tokenize(processed_sentence)
# words = processed_sentence.split()
# Drop empty strings and single-character tokens
words = [word for word in words if len(word) > 1]
all_words.extend(words)
logger.info(f"Extracted {len(all_words)} words from {len(sentences)} sentences")
return all_words
def remove_stop_words(self, words: List[str]) -> List[str]:
"""
Remove stop words from the word list.
Args:
words: List of words
Returns:
List of words without stop words
"""
filtered_words = [word for word in words if word not in self.stop_words]
logger.info(f"Removed {len(words) - len(filtered_words)} stop words")
return filtered_words
def get_unique_words(self, words: List[str]) -> List[str]:
"""
Get unique words from the list.
Args:
words: List of words
Returns:
List of unique words
"""
unique_words = list(set(words))
logger.info(f"Found {len(unique_words)} unique words from {len(words)} total words")
return unique_words
def compute_word_vectors(self, words: List[str]) -> Dict[str, List[float]]:
"""
Compute vector representations for words.
Args:
words: List of unique words
Returns:
Dictionary mapping words to their vector representations
"""
if self.model is None:
self.load_model()
logger.info(f"Computing vectors for {len(words)} words...")
# Compute embeddings
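# With the default paraphrase-multilingual-MiniLM-L12-v2 model, each embedding is 384-dimensional.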
embeddings = self.model.encode(words, show_progress_bar=True)
# Create dictionary
word_vectors = {}
for i, word in enumerate(words):
word_vectors[word] = embeddings[i].tolist()
logger.info("Word vectors computed successfully!")
return word_vectors
def find_closest_words(self, word_vectors: Dict[str, List[float]],
key_words: List[str], top_k: int = 20) -> Dict[str, List[str]]:
"""
Find the closest words to each key word.
Args:
word_vectors: Dictionary of word vectors
key_words: List of key words to find neighbors for
top_k: Number of closest words to find
Returns:
Dictionary mapping key words to their closest neighbors
"""
logger.info(f"Finding {top_k} closest words for {len(key_words)} key words...")
# Convert to numpy arrays for faster computation
words = list(word_vectors.keys())
vectors = np.array(list(word_vectors.values()))
closest_words = {}
for key_word in key_words:
if key_word in word_vectors:
# Get the key word vector
key_vector = np.array(word_vectors[key_word]).reshape(1, -1)
# Compute cosine similarities
similarities = cosine_similarity(key_vector, vectors)[0]
# Get indices of top k similar words (excluding the key word itself)
word_indices = np.argsort(similarities)[::-1]
# Filter out the key word itself and get top k
closest_indices = []
for idx in word_indices:
if words[idx] != key_word and len(closest_indices) < top_k:
closest_indices.append(idx)
# Get the closest words
closest_words[key_word] = [words[idx] for idx in closest_indices]
logger.info(f"Found {len(closest_words[key_word])} closest words for '{key_word}'")
else:
logger.warning(f"Key word '{key_word}' not found in word vectors")
closest_words[key_word] = []
return closest_words
def reduce_to_3d(self, word_vectors: Dict[str, List[float]],
method: str = 'tsne') -> Dict[str, List[float]]:
"""
Reduce word vectors to 3D coordinates.
Args:
word_vectors: Dictionary of word vectors
method: Dimensionality reduction method ('pca' or 'tsne')
Returns:
Dictionary mapping words to their 3D coordinates
"""
logger.info(f"Reducing dimensions to 3D using {method.upper()}...")
words = list(word_vectors.keys())
vectors = np.array(list(word_vectors.values()))
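# PCA is deterministic and fast; t-SNE preserves local neighborhoods better but its
# coordinates vary between runs. t-SNE's perplexity must be below the sample count,
# hence the min(30, len(vectors) - 1) cap.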
if method.lower() == 'pca':
reducer = PCA(n_components=3, random_state=42)
elif method.lower() == 'tsne':
reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors)-1))
else:
raise ValueError("Method must be 'pca' or 'tsne'")
# Reduce dimensions
reduced_vectors = reducer.fit_transform(vectors)
# Create dictionary
word_vectors_3d = {}
for i, word in enumerate(words):
word_vectors_3d[word] = reduced_vectors[i].tolist()
logger.info("Dimensionality reduction completed!")
return word_vectors_3d
def save_json(self, data: dict, file_path: str):
"""
Save data to JSON file.
Args:
data: Data to save
file_path: Output file path
"""
try:
with open(file_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
logger.info(f"Data saved to {file_path}")
except Exception as e:
logger.error(f"Error saving to {file_path}: {e}")
raise
# def create_3d_visualization(self, word_vectors_3d: Dict[str, List[float]],
# selected_words: Dict[str, List[str]],
# output_path: str = "persian_words_3d.html"):
# """
# Create 3D visualization of words.
# Args:
# word_vectors_3d: Dictionary of 3D word coordinates
# selected_words: Dictionary of selected words for each key word
# output_path: Output file path for the visualization
# """
# logger.info("Creating 3D visualization...")
# # Prepare data for plotting
# words = list(word_vectors_3d.keys())
# coords = np.array(list(word_vectors_3d.values()))
# # Create color mapping for key words and their neighbors
# colors = []
# sizes = []
# hover_texts = []
# for word in words:
# # Check if word is a key word
# is_key_word = word in self.key_words
# # Check if word is in selected words
# in_selected = False
# key_word_group = None
# for key_word, selected_list in selected_words.items():
# if word in selected_list:
# in_selected = True
# key_word_group = key_word
# break
# if is_key_word:
# colors.append('red')
# sizes.append(15)
# hover_texts.append(f"کلیدواژه: {word}")
# elif in_selected:
# colors.append('blue')
# sizes.append(10)
# hover_texts.append(f"کلمه مرتبط با '{key_word_group}': {word}")
# else:
# colors.append('lightgray')
# sizes.append(5)
# hover_texts.append(f"کلمه: {word}")
# # Create 3D scatter plot
# fig = go.Figure()
# # Add scatter plot
# fig.add_trace(go.Scatter3d(
# x=coords[:, 0],
# y=coords[:, 1],
# z=coords[:, 2],
# mode='markers+text',
# marker=dict(
# size=sizes,
# color=colors,
# opacity=0.8
# ),
# text=words,
# textposition="middle center",
# hovertext=hover_texts,
# hoverinfo='text'
# ))
# # Update layout
# fig.update_layout(
# title={
# 'text': 'نمایش سه‌بعدی کلمات فارسی',
# 'x': 0.5,
# 'xanchor': 'center',
# 'font': {'size': 20}
# },
# scene=dict(
# xaxis_title='محور X',
# yaxis_title='محور Y',
# zaxis_title='محور Z',
# camera=dict(
# eye=dict(x=1.5, y=1.5, z=1.5)
# )
# ),
# width=1000,
# height=800,
# showlegend=False
# )
# # Save the plot
# fig.write_html(output_path)
# logger.info(f"3D visualization saved to {output_path}")
# return fig
def process_pipeline(self, input_file: str, output_dir: str = "output"):
"""
Run the complete processing pipeline.
Args:
input_file: Path to input JSON file
output_dir: Output directory for results
"""
# Create output directory
Path(output_dir).mkdir(exist_ok=True)
logger.info("Starting Persian Vector Analysis Pipeline...")
# Step 1: Load data
sentences = self.load_json_data(input_file)
# Step 2: Extract words
all_words = self.extract_words(sentences)
# Step 3: Remove stop words
# filtered_words = self.remove_stop_words(all_words)
filtered_words = all_words
# Step 4: Get unique words
unique_words = self.get_unique_words(filtered_words)
# Step 5: Compute word vectors
word_vectors = self.compute_word_vectors(unique_words)
# Step 6: Save word vectors
self.save_json(word_vectors, f"{output_dir}/words_vector.json")
# Step 7: Find closest words to key words
selected_words = self.find_closest_words(word_vectors, self.key_words)
# Step 8: Save selected words
self.save_json(selected_words, f"{output_dir}/selected_words.json")
# Step 9: Reduce to 3D
word_vectors_3d = self.reduce_to_3d(word_vectors, method='tsne')
# Step 10: Save 3D vectors
self.save_json(word_vectors_3d, f"{output_dir}/words_vector_3d.json")
# Step 11: Create visualization
# self.create_3d_visualization(word_vectors_3d, selected_words,
# f"{output_dir}/persian_words_3d.html")
logger.info("Pipeline completed successfully!")
# Print summary
print("\n" + "="*50)
print("PIPELINE SUMMARY")
print("="*50)
print(f"Input sentences: {len(sentences)}")
print(f"Total words extracted: {len(all_words)}")
print(f"Unique words after preprocessing: {len(unique_words)}")
print(f"Word vectors computed: {len(word_vectors)}")
print(f"Key words processed: {len(self.key_words)}")
print(f"Output files saved to: {output_dir}/")
print("="*50)
def main():
"""
Main function to run the Persian Vector Analysis.
"""
# Initialize analyzer
analyzer = PersianVectorAnalyzer()
# Define input and output paths
input_file = "./data/final_wisdom.json"
output_dir = "output"
# Run the complete pipeline
analyzer.process_pipeline(input_file, output_dir)
if __name__ == "__main__":
main()

223
output/selected_words.json Normal file

@@ -0,0 +1,223 @@
{
"خدا": [
"بالله",
"خدای",
"خداوند",
"خدایی",
"الله",
"خدایا",
"الهی",
"لله",
"آله",
"خداییم",
"الرب",
"خداوندا",
"خدایش",
"حضرت",
"یاسر",
"آیه",
"بهشتش",
"تعالی",
"باطنم",
"وعید"
],
"بنده": [
"مالک",
"پیشگاه",
"قربانگاه",
"فرمانروایی",
"کوچ",
"مالکی",
"قربانگاههای",
"خزانهدار",
"پیشوای",
"جانشین",
"همنشین",
"مأمور",
"مستولی",
"منکرات",
"بندهاش",
"اختیار",
"منکری",
"حاکم",
"عبد",
"زمامداران"
],
"جهاد": [
"مجاهد",
"اسلام",
"مسلم",
"شامیان",
"علیهالسلام",
"مسلمانان",
"قرآن",
"طلبان",
"صلیالله",
"عبیدالله",
"امان",
"عبدالله",
"شامی",
"خلافت",
"پیغمبر",
"مسلمین",
"سپاه",
"سید",
"علی",
"پیامبر"
],
"ولی": [
"اما",
"مگر",
"وإنما",
"إلا",
"اگرچه",
"برخلاف",
"خلافی",
"درحالیکه",
"بلکه",
"إلیها",
"غیرش",
"لان",
"وگرنه",
"بخلاف",
"ورزند",
"چنانچه",
"وگروه",
"بس",
"وبالش",
"واگر"
],
"زکات": [
"گلوگاه",
"غنائمی",
"مینگرند",
"غبن",
"دراز",
"نزند",
"میافکند",
"گرچه",
"زبیر",
"تابی",
"طغیان",
"بلاغت",
"توفیق",
"ضبائی",
"قیمة",
"فریفتند",
"آمیز",
"پوشی",
"طویلة",
"سوگشان"
],
"نماز": [
"دعا",
"صلوات",
"دعای",
"دعایی",
"عبادت",
"مومنان",
"مؤمنان",
"ایمانی",
"مؤمنی",
"مؤمن",
"مومن",
"برکت",
"ایمان",
"المؤمنین",
"ایمانش",
"رحمت",
"مؤمنانم",
"دینی",
"ایمانتان",
"معنوی"
],
"صبر": [
"انتظار",
"یصبر",
"لایصبر",
"صبور",
"پروا",
"متکبر",
"تعویذ",
"دعائم",
"سکونت",
"رکاب",
"إرواد",
"ماند",
"پرخوری",
"دنبال",
"استهزاء",
"میپیچید",
"دوشید",
"بیندیشید",
"تقوای",
"نفرماید"
],
"عبادت": [
"دعایی",
"دعای",
"صلوات",
"نماز",
"دعا",
"خدای",
"مومن",
"خداوند",
"بالله",
"خدا",
"برکت",
"مؤمنانم",
"الهی",
"خدایا",
"الرب",
"لله",
"آله",
"ایمانی",
"الله",
"خدایی"
],
"ولایت": [],
"خلافت": [
"سپاه",
"حاکم",
"امت",
"فرمانروایی",
"لشکر",
"قصار",
"امان",
"برترند",
"نهاد",
"زمامداران",
"وحکمة",
"ستمگری",
"الإبل",
"بالإبل",
"مسلط",
"سرکش",
"اختیار",
"امانی",
"مأموریت",
"حکومتی"
],
"پیامبر": [
"پیغمبر",
"پیامبرش",
"پیامبران",
"پیامبرتان",
"قرآن",
"رسولالله",
"مجاهد",
"عبیدالله",
"الله",
"مسلم",
"ربانی",
"اسلام",
"خدای",
"ایمانی",
"یاسر",
"شهید",
"خدایی",
"بالله",
"صلیالله",
"خدا"
]
}

1820764
output/words_vector.json Normal file

File diff suppressed because it is too large

23587
output/words_vector_3d.json Normal file

File diff suppressed because it is too large