# !pip install hazm
# !pip install transformers==4.26.0
# !pip install --upgrade numpy
# !pip install --upgrade sentence-transformers
"""
|
|
Persian Sentence Processing and Vector Analysis
|
|
==============================================
|
|
|
|
This script processes Persian sentences from a JSON file and performs:
|
|
1. Word extraction and preprocessing
|
|
2. Vector representation using multilingual transformer
|
|
3. Similarity analysis for key words
|
|
4. Dimensionality reduction to 3D
|
|
5. 3D visualization with Persian labels
|
|
|
|
Author: NLP Expert Assistant
|
|
"""
|
|
|
|
import json
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Set
from collections import Counter
import logging
from pathlib import Path

# NLP and ML libraries
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Visualization libraries
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots

# Persian text processing
import hazm
from hazm import Normalizer, word_tokenize, POSTagger

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class PersianVectorAnalyzer:
    """
    A comprehensive class for Persian text processing and vector analysis.
    """

    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """
        Initialize the analyzer with the specified model.

        Args:
            model_name: The sentence transformer model to use
        """
        self.model_name = model_name
        self.model = None
        self.normalizer = Normalizer()
        self.stop_words = self._load_persian_stop_words()
        self.key_words = [
            "خدا", "بنده", "جهاد", "ولی", "زکات",
            "نماز", "صبر", "عبادت", "ولایت", "خلافت", "پیامبر"
        ]

        logger.info(f"Initializing Persian Vector Analyzer with model: {model_name}")

    def _load_persian_stop_words(self) -> Set[str]:
        """
        Load Persian stop words.

        Returns:
            Set of Persian stop words
        """
        # Common Persian stop words
        stop_words = {
            'و', 'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'برای', 'تا',
            'را', 'هم', 'یا', 'اما', 'اگر', 'چون', 'چرا', 'چگونه', 'کجا',
            'چه', 'کی', 'چند', 'چقدر', 'همه', 'هیچ', 'بعضی', 'هر', 'همه',
            'خود', 'خویش', 'ما', 'شما', 'آنها', 'ایشان', 'اینها', 'آنها',
            'من', 'تو', 'او', 'ما', 'شما', 'آنها', 'ایشان', 'اینها',
            'است', 'هست', 'بود', 'شد', 'می', 'باید', 'خواهد', 'دارد',
            'کرد', 'شد', 'بود', 'هست', 'است', 'میشود', 'میکند',
            'یک', 'دو', 'سه', 'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده',
            'اول', 'دوم', 'سوم', 'چهارم', 'پنجم', 'ششم', 'هفتم', 'هشتم', 'نهم', 'دهم',
            'سال', 'ماه', 'روز', 'هفته', 'ساعت', 'دقیقه', 'ثانیه', 'پس',
            'بله', 'نه', 'آری', 'خیر', 'بلی', 'نخیر',
            'حالا', 'الان', 'امروز', 'دیروز', 'فردا', 'هفته', 'ماه', 'سال',
            'بالا', 'پایین', 'چپ', 'راست', 'جلو', 'عقب', 'داخل', 'خارج',
            'بزرگ', 'کوچک', 'بلند', 'کوتاه', 'پهن', 'باریک', 'ضخیم', 'نازک',
        }
        return stop_words

    def load_model(self):
        """
        Load the sentence transformer model.
        """
        try:
            logger.info("Loading sentence transformer model...")
            self.model = SentenceTransformer(self.model_name)
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def load_json_data(self, file_path: str) -> List[str]:
        """
        Load Persian sentences from JSON file.

        Args:
            file_path: Path to the JSON file

        Returns:
            List of Persian sentences
        """
        try:
            logger.info(f"Loading data from {file_path}")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # If the top level is a dict, flatten it to a list of its values
            if isinstance(data, dict):
                data = list(data.values())

            sentences = []
            for item in data:
                if isinstance(item, dict):
                    # Extract sentences from the expected key(s)
                    for key in ['persian_translate']:
                        if key in item and item[key]:
                            sentences.append(str(item[key]))
                elif isinstance(item, str):
                    sentences.append(item)

            logger.info(f"Loaded {len(sentences)} sentences")
            return sentences

        except Exception as e:
            logger.error(f"Error loading JSON data: {e}")
            raise

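    # Note: the exact schema of the input JSON is not specified here; based on the
    # keys used above, a plausible (hypothetical) input shape is either a list of
    # records or a dict of records, e.g.:
    #   [{"persian_translate": "..."}, ...]
    #   {"1": {"persian_translate": "..."}, "2": {...}}
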
    def preprocess_text(self, text: str) -> str:
        """
        Preprocess Persian text.

        Args:
            text: Raw Persian text

        Returns:
            Preprocessed text
        """
        # Normalize text
        text = self.normalizer.normalize(text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep Persian characters
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)

        return text.strip()

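    # Rough (hypothetical) example of the cleanup above: ASCII punctuation, Latin
    # letters and digits fall outside the listed Unicode ranges and are removed,
    # so something like "نماز ستون دین است! (hadith 12)" becomes "نماز ستون دین است".
    # Arabic-script punctuation such as "،" lies inside U+0600-U+06FF and is kept.
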
    def extract_words(self, sentences: List[str]) -> List[str]:
        """
        Extract all words from sentences.

        Args:
            sentences: List of Persian sentences

        Returns:
            List of all words
        """
        all_words = []

        for sentence in sentences:
            # Preprocess sentence
            processed_sentence = self.preprocess_text(sentence)

            # Tokenize
            words = word_tokenize(processed_sentence)
            # words = processed_sentence.split()

            # Filter out empty strings and very short words
            words = [word for word in words if len(word) > 1]

            all_words.extend(words)

        logger.info(f"Extracted {len(all_words)} words from {len(sentences)} sentences")
        return all_words

    def remove_stop_words(self, words: List[str]) -> List[str]:
        """
        Remove stop words from the word list.

        Args:
            words: List of words

        Returns:
            List of words without stop words
        """
        filtered_words = [word for word in words if word not in self.stop_words]
        logger.info(f"Removed {len(words) - len(filtered_words)} stop words")
        return filtered_words

    def get_unique_words(self, words: List[str]) -> List[str]:
        """
        Get unique words from the list.

        Args:
            words: List of words

        Returns:
            List of unique words
        """
        unique_words = list(set(words))
        logger.info(f"Found {len(unique_words)} unique words from {len(words)} total words")
        return unique_words

    def compute_word_vectors(self, words: List[str]) -> Dict[str, List[float]]:
        """
        Compute vector representations for words.

        Args:
            words: List of unique words

        Returns:
            Dictionary mapping words to their vector representations
        """
        if self.model is None:
            self.load_model()

        logger.info(f"Computing vectors for {len(words)} words...")

        # Compute embeddings
        embeddings = self.model.encode(words, show_progress_bar=True)

        # Create dictionary
        word_vectors = {}
        for i, word in enumerate(words):
            word_vectors[word] = embeddings[i].tolist()

        logger.info("Word vectors computed successfully!")
        return word_vectors

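    # Output sketch (illustrative values): {"خدا": [0.12, -0.08, ...], ...}.
    # With the default paraphrase-multilingual-MiniLM-L12-v2 model each vector
    # should have 384 dimensions.
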
    def find_closest_words(self, word_vectors: Dict[str, List[float]],
                           key_words: List[str], top_k: int = 20) -> Dict[str, List[str]]:
        """
        Find the closest words to each key word.

        Args:
            word_vectors: Dictionary of word vectors
            key_words: List of key words to find neighbors for
            top_k: Number of closest words to find

        Returns:
            Dictionary mapping key words to their closest neighbors
        """
        logger.info(f"Finding {top_k} closest words for {len(key_words)} key words...")

        # Convert to numpy arrays for faster computation
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        closest_words = {}

        for key_word in key_words:
            if key_word in word_vectors:
                # Get the key word vector
                key_vector = np.array(word_vectors[key_word]).reshape(1, -1)

                # Compute cosine similarities
                similarities = cosine_similarity(key_vector, vectors)[0]

                # Get indices sorted by similarity, highest first
                word_indices = np.argsort(similarities)[::-1]

                # Filter out the key word itself and keep the top k
                closest_indices = []
                for idx in word_indices:
                    if words[idx] != key_word and len(closest_indices) < top_k:
                        closest_indices.append(idx)

                # Get the closest words
                closest_words[key_word] = [words[idx] for idx in closest_indices]
                logger.info(f"Found {len(closest_words[key_word])} closest words for '{key_word}'")
            else:
                logger.warning(f"Key word '{key_word}' not found in word vectors")
                closest_words[key_word] = []

        return closest_words

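    # Usage sketch (hypothetical values):
    #   neighbors = analyzer.find_closest_words(word_vectors, ["نماز"], top_k=5)
    #   # -> {"نماز": ["عبادت", "روزه", ...]}
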
    def reduce_to_3d(self, word_vectors: Dict[str, List[float]],
                     method: str = 'tsne') -> Dict[str, List[float]]:
        """
        Reduce word vectors to 3D coordinates.

        Args:
            word_vectors: Dictionary of word vectors
            method: Dimensionality reduction method ('pca' or 'tsne')

        Returns:
            Dictionary mapping words to their 3D coordinates
        """
        logger.info(f"Reducing dimensions to 3D using {method.upper()}...")

        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        if method.lower() == 'pca':
            reducer = PCA(n_components=3, random_state=42)
        elif method.lower() == 'tsne':
            # t-SNE requires perplexity to be smaller than the number of samples
            reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors) - 1))
        else:
            raise ValueError("Method must be 'pca' or 'tsne'")

        # Reduce dimensions
        reduced_vectors = reducer.fit_transform(vectors)

        # Create dictionary
        word_vectors_3d = {}
        for i, word in enumerate(words):
            word_vectors_3d[word] = reduced_vectors[i].tolist()

        logger.info("Dimensionality reduction completed!")
        return word_vectors_3d

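    # The pipeline below calls this with method='tsne'; switching to
    # reduce_to_3d(word_vectors, method='pca') gives a deterministic and typically
    # faster projection, at the cost of usually weaker visual cluster separation.
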
    def save_json(self, data: dict, file_path: str):
        """
        Save data to JSON file.

        Args:
            data: Data to save
            file_path: Output file path
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Data saved to {file_path}")
        except Exception as e:
            logger.error(f"Error saving to {file_path}: {e}")
            raise

    # def create_3d_visualization(self, word_vectors_3d: Dict[str, List[float]],
    #                             selected_words: Dict[str, List[str]],
    #                             output_path: str = "persian_words_3d.html"):
    #     """
    #     Create 3D visualization of words.
    #
    #     Args:
    #         word_vectors_3d: Dictionary of 3D word coordinates
    #         selected_words: Dictionary of selected words for each key word
    #         output_path: Output file path for the visualization
    #     """
    #     logger.info("Creating 3D visualization...")
    #
    #     # Prepare data for plotting
    #     words = list(word_vectors_3d.keys())
    #     coords = np.array(list(word_vectors_3d.values()))
    #
    #     # Create color mapping for key words and their neighbors
    #     colors = []
    #     sizes = []
    #     hover_texts = []
    #
    #     for word in words:
    #         # Check if word is a key word
    #         is_key_word = word in self.key_words
    #
    #         # Check if word is in selected words
    #         in_selected = False
    #         key_word_group = None
    #         for key_word, selected_list in selected_words.items():
    #             if word in selected_list:
    #                 in_selected = True
    #                 key_word_group = key_word
    #                 break
    #
    #         if is_key_word:
    #             colors.append('red')
    #             sizes.append(15)
    #             hover_texts.append(f"کلیدواژه: {word}")
    #         elif in_selected:
    #             colors.append('blue')
    #             sizes.append(10)
    #             hover_texts.append(f"کلمه مرتبط با '{key_word_group}': {word}")
    #         else:
    #             colors.append('lightgray')
    #             sizes.append(5)
    #             hover_texts.append(f"کلمه: {word}")
    #
    #     # Create 3D scatter plot
    #     fig = go.Figure()
    #
    #     # Add scatter plot
    #     fig.add_trace(go.Scatter3d(
    #         x=coords[:, 0],
    #         y=coords[:, 1],
    #         z=coords[:, 2],
    #         mode='markers+text',
    #         marker=dict(
    #             size=sizes,
    #             color=colors,
    #             opacity=0.8
    #         ),
    #         text=words,
    #         textposition="middle center",
    #         hovertext=hover_texts,
    #         hoverinfo='text'
    #     ))
    #
    #     # Update layout
    #     fig.update_layout(
    #         title={
    #             'text': 'نمایش سه‌بعدی کلمات فارسی',
    #             'x': 0.5,
    #             'xanchor': 'center',
    #             'font': {'size': 20}
    #         },
    #         scene=dict(
    #             xaxis_title='محور X',
    #             yaxis_title='محور Y',
    #             zaxis_title='محور Z',
    #             camera=dict(
    #                 eye=dict(x=1.5, y=1.5, z=1.5)
    #             )
    #         ),
    #         width=1000,
    #         height=800,
    #         showlegend=False
    #     )
    #
    #     # Save the plot
    #     fig.write_html(output_path)
    #     logger.info(f"3D visualization saved to {output_path}")
    #
    #     return fig

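    # To re-enable this visualization, uncomment the plotly imports at the top of
    # the file and the create_3d_visualization call in process_pipeline below.
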
def process_pipeline(self, input_file: str, output_dir: str = "output"):
|
|
"""
|
|
Run the complete processing pipeline.
|
|
|
|
Args:
|
|
input_file: Path to input JSON file
|
|
output_dir: Output directory for results
|
|
"""
|
|
# Create output directory
|
|
Path(output_dir).mkdir(exist_ok=True)
|
|
|
|
logger.info("Starting Persian Vector Analysis Pipeline...")
|
|
|
|
# Step 1: Load data
|
|
sentences = self.load_json_data(input_file)
|
|
|
|
# Step 2: Extract words
|
|
all_words = self.extract_words(sentences)
|
|
|
|
# Step 3: Remove stop words
|
|
# filtered_words = self.remove_stop_words(all_words)
|
|
filtered_words = all_words
|
|
|
|
# Step 4: Get unique words
|
|
unique_words = self.get_unique_words(filtered_words)
|
|
|
|
# Step 5: Compute word vectors
|
|
word_vectors = self.compute_word_vectors(unique_words)
|
|
|
|
# Step 6: Save word vectors
|
|
self.save_json(word_vectors, f"{output_dir}/words_vector.json")
|
|
|
|
# Step 7: Find closest words to key words
|
|
selected_words = self.find_closest_words(word_vectors, self.key_words)
|
|
|
|
# Step 8: Save selected words
|
|
self.save_json(selected_words, f"{output_dir}/selected_words.json")
|
|
|
|
# Step 9: Reduce to 3D
|
|
word_vectors_3d = self.reduce_to_3d(word_vectors, method='tsne')
|
|
|
|
# Step 10: Save 3D vectors
|
|
self.save_json(word_vectors_3d, f"{output_dir}/words_vector_3d.json")
|
|
|
|
# Step 11: Create visualization
|
|
# self.create_3d_visualization(word_vectors_3d, selected_words,
|
|
# f"{output_dir}/persian_words_3d.html")
|
|
|
|
logger.info("Pipeline completed successfully!")
|
|
|
|
# Print summary
|
|
print("\n" + "="*50)
|
|
print("PIPELINE SUMMARY")
|
|
print("="*50)
|
|
print(f"Input sentences: {len(sentences)}")
|
|
print(f"Total words extracted: {len(all_words)}")
|
|
print(f"Unique words after preprocessing: {len(unique_words)}")
|
|
print(f"Word vectors computed: {len(word_vectors)}")
|
|
print(f"Key words processed: {len(self.key_words)}")
|
|
print(f"Output files saved to: {output_dir}/")
|
|
print("="*50)
|
|
|
|
|
|
def main():
    """
    Main function to run the Persian Vector Analysis.
    """
    # Initialize analyzer
    analyzer = PersianVectorAnalyzer()

    # Define input and output paths
    input_file = "./data/final_wisdom.json"
    output_dir = "output"

    # Run the complete pipeline
    analyzer.process_pipeline(input_file, output_dir)


if __name__ == "__main__":
    main()
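
# Ad-hoc usage sketch (illustrative only; hypothetical word list and output path):
#   analyzer = PersianVectorAnalyzer()
#   vectors = analyzer.compute_word_vectors(["خدا", "نماز", "صبر", "عبادت"])
#   coords = analyzer.reduce_to_3d(vectors, method='pca')
#   analyzer.save_json(coords, "sample_3d.json")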