nahj_rag/2_embedder_fasttext.py

# بسم الله
# !pip install hazm
# !pip install transformers==4.26.0
# !pip install --upgrade numpy
# !pip install --upgrade sentence-transformers
"""
Persian Sentence Processing and Vector Analysis
==============================================

This script processes Persian sentences from a JSON file. It provides:
1. Word extraction and preprocessing
2. Sentence vector representation using a fastText model
3. Similarity analysis for key words
4. Dimensionality reduction to 3D (PCA or t-SNE)
5. 3D visualization with Persian labels (not implemented in this file;
   the plotting imports below are commented out)

Author: NLP Expert Assistant
"""

import json
import re
import csv
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Set
from collections import Counter
import logging
from pathlib import Path
from fasttext import tokenize
# NLP and ML libraries
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Visualization libraries
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots

# Persian text processing
# import hazm
# from hazm import Normalizer, word_tokenize, POSTagger
from normalizer import normalize_persian
from datetime import datetime

import fasttext.util
from gensim.models.fasttext import FastText
import logging
from typing import List, Dict, Union

# Module-level logger setup.
# Configure root logging once, then create this module's logger.
# (The logger used to be created twice, before and after basicConfig;
# both calls returned the same object, so a single assignment is enough.)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# To fetch the pretrained Persian fastText model instead of using a local one:
# fasttext.util.download_model('fa', if_exists='ignore')

# Wall-clock time at which the module started executing, as a formatted string.
strt_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

class FasttextPersianVectorAnalyzer:
    """
    Persian text processing and fastText sentence-embedding helper.

    The class loads a fastText binary model, turns sentences into vectors and
    offers a few utilities around them (word extraction, nearest-neighbour
    lookup, 3D reduction, JSON/CSV helpers). ``process_pipeline`` is the
    entry point used by ``main``.
    """

    def __init__(self, model_name: str = './models/khamenei_all_180_clean_model.bin'):
        """
        Initialize the analyzer with the specified model.

        The model itself is loaded lazily (see ``load_model``).

        Args:
            model_name: Path of the fastText ``.bin`` model to load.
        """
        self.model_name = model_name
        self.model = None
        self.stop_words = self._load_persian_stop_words()
        # Key words for similarity analysis; empty until filled by the caller.
        self.key_words = []

        logger.info(f"Initializing Persian Vector Analyzer with model: {model_name}")

    def _load_persian_stop_words(self) -> Set[str]:
        """
        Return the set of Persian stop words.

        The list is currently empty. An explicit ``set()`` is returned because
        the literal ``{}`` would create a dict, not a set.
        """
        return set()

    def load_model(self):
        """
        Load the fastText model found at ``self.model_name`` into ``self.model``.

        Raises:
            Exception: Whatever ``fasttext.load_model`` raises is logged and
                re-raised unchanged.
        """
        try:
            logger.info("Loading sentence transformer model...")
            self.model = fasttext.load_model(self.model_name)
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def split_sentence(self, sentence: str, max_tokens: int = 512):
        """
        Split an over-long text into smaller pieces.

        Length is measured in tokens as returned by fastText's ``tokenize``.
        A text shorter than ``max_tokens`` is returned as-is. Otherwise it is
        split on '.', and any part that is still longer than ``max_tokens``
        is split again on the Persian comma. Pieces are not stripped and may
        be empty strings; callers are expected to filter them.

        Args:
            sentence: Text to split.
            max_tokens: Token budget per piece (defaults to 512).

        Returns:
            List of text pieces in their original order.
        """
        text = str(sentence)
        if len(tokenize(text)) < max_tokens:
            return [sentence]

        pieces = []
        for part in text.split('.'):
            if len(tokenize(part)) > max_tokens:
                # Still too long: fall back to splitting on the Persian comma.
                pieces.extend(part.split('،'))
            else:
                pieces.append(part)
        return pieces

    def load_json_data(self, file_path: str) -> List[str]:
        """
        Load sentences from a JSON file.

        A top-level JSON object is treated as a collection of records: only
        its values are used. Each record may be a dict, whose
        ``'sentece_text'`` value is read, or a plain string. Every text goes
        through ``split_sentence`` and empty pieces are dropped.

        Args:
            file_path: Path of the UTF-8 encoded JSON file.

        Returns:
            Flat list of non-empty sentence pieces.

        Raises:
            Exception: Any error while reading or parsing is logged and re-raised.
        """
        try:
            logger.info(f"Loading data from {file_path}")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, dict):
                data = list(data.values())

            sentences = []
            if isinstance(data, list):
                for index, item in enumerate(data):
                    print(f'split sentence {index}')
                    if isinstance(item, dict):
                        # The key spelling matches the input data; do not "fix" it here.
                        text = item.get('sentece_text')
                        if text:
                            sentences.extend(self.split_sentence(text))
                    elif isinstance(item, str):
                        # Plain string records are split directly.
                        sentences.extend(self.split_sentence(item))

            sentences = [piece for piece in sentences if piece]
            logger.info(f"Loaded {len(sentences)} sentences")
            return sentences

        except Exception as e:
            logger.error(f"Error loading JSON data: {e}")
            raise

    def preprocess_text(self, text: str) -> str:
        """
        Normalize a Persian text and strip everything that is not Arabic-script.

        Args:
            text: Raw input text.

        Returns:
            The normalized text with whitespace collapsed, characters outside
            the Arabic Unicode blocks removed, and both ends stripped.
        """
        # Normalize text
        text = normalize_persian(text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep Persian characters
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)

        return text.strip()

    def extract_words(self, sentences: List[str]) -> List[str]:
        """
        Extract words from the given sentences.

        Each sentence is preprocessed and split on whitespace. The hazm
        ``word_tokenize`` import is commented out in this file, so plain
        splitting is used instead. Words of length 1 are discarded.

        Args:
            sentences: Sentences to tokenize.

        Returns:
            All extracted words, in order, duplicates included.
        """
        all_words = []

        for sentence in sentences:
            processed_sentence = self.preprocess_text(sentence)
            all_words.extend(
                word for word in processed_sentence.split() if len(word) > 1
            )

        logger.info(f"Extracted {len(all_words)} words from {len(sentences)} sentences")
        return all_words

    def remove_stop_words(self, words: List[str]) -> List[str]:
        """
        Return ``words`` without the entries found in ``self.stop_words``.

        Args:
            words: Words to filter.

        Returns:
            Filtered list, original order preserved.
        """
        filtered_words = [word for word in words if word not in self.stop_words]
        logger.info(f"Removed {len(words) - len(filtered_words)} stop words")
        return filtered_words

    def get_unique_words(self, words: List[str]) -> List[str]:
        """
        Return the distinct words of ``words`` in first-occurrence order.

        Args:
            words: Words, possibly with duplicates.

        Returns:
            List of unique words. ``dict.fromkeys`` is used so the order is
            deterministic.
        """
        unique_words = list(dict.fromkeys(words))
        logger.info(f"Found {len(unique_words)} unique words from {len(words)} total words")
        return unique_words

    def compute_word_vectors(self, ids, dates, sentences: List[str], titles, urls) -> Dict[str, Dict[str, object]]:
        """
        Compute sentence vectors using the fastText model.

        The five inputs are parallel sequences describing one record each.
        Records whose text is empty after whitespace cleanup, is not a string,
        or fails to embed are skipped together with their metadata, so every
        output entry keeps the id/date/title/url that belongs to its sentence.

        Args:
            ids: Record identifiers.
            dates: Record dates.
            sentences: Record texts to embed.
            titles: Record titles.
            urls: Record URLs.

        Returns:
            Dict keyed ``'sentence-1'``, ``'sentence-2'``, ... (numbered over
            the embedded records only). Each value holds ``id``, ``date``,
            ``title``, ``sentence`` (the cleaned text), ``url`` and
            ``embeddings`` (list of floats).
        """
        if self.model is None:
            self.load_model()

        logger.info(f"Computing vectors for {len(sentences)} sentences...")

        sentences_vectors = {}
        records = zip(ids, dates, sentences, titles, urls)
        for index, (record_id, date, sent, title, url) in enumerate(records):
            if not isinstance(sent, str):
                logger.warning(f"Error processing sentence: expected str, got {type(sent).__name__}")
                continue

            # Collapse all whitespace (newlines included) into single spaces.
            cleaned_sent = re.sub(r'\s+', ' ', sent).strip()

            # Skip records that are empty after cleanup.
            if not cleaned_sent:
                continue

            try:
                vector = self.model.get_sentence_vector(cleaned_sent)
            except Exception as e:
                # Best effort: one bad sentence must not abort the whole run.
                logger.warning(f"Error processing sentence: {e}")
                continue

            sentences_vectors[f'sentence-{len(sentences_vectors) + 1}'] = {
                'id': record_id,
                'date': date,
                'title': title,
                'sentence': cleaned_sent,
                'url': url,
                'embeddings': np.asarray(vector).tolist(),
            }
            print(f'sentence {index} embedded: {cleaned_sent[:10]}...')

        logger.info(f"Successfully computed vectors for {len(sentences_vectors)} sentences!")
        return sentences_vectors

    def find_closest_words(self, word_vectors: Dict[str, List[float]],
                          key_words: List[str], top_k: int = 20) -> Dict[str, List[str]]:
        """
        Find the words most similar to each key word by cosine similarity.

        Args:
            word_vectors: Mapping of word to vector.
            key_words: Words to look up.
            top_k: Maximum number of neighbours per key word.

        Returns:
            Mapping of key word to its neighbours, most similar first. The key
            word itself is excluded. Key words missing from ``word_vectors``
            map to an empty list.
        """
        logger.info(f"Finding {top_k} closest words for {len(key_words)} key words...")

        # Convert to numpy arrays for faster computation
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))
        limit = max(top_k, 0)

        closest_words = {}

        for key_word in key_words:
            if key_word not in word_vectors:
                logger.warning(f"Key word '{key_word}' not found in word vectors")
                closest_words[key_word] = []
                continue

            key_vector = np.array(word_vectors[key_word]).reshape(1, -1)
            similarities = cosine_similarity(key_vector, vectors)[0]

            # Highest similarity first; drop the key word itself.
            ranked = np.argsort(similarities)[::-1]
            neighbours = [words[idx] for idx in ranked if words[idx] != key_word]

            closest_words[key_word] = neighbours[:limit]
            logger.info(f"Found {len(closest_words[key_word])} closest words for '{key_word}'")

        return closest_words

    def reduce_to_3d(self, word_vectors: Dict[str, List[float]],
                    method: str = 'tsne') -> Dict[str, List[float]]:
        """
        Reduce the given vectors to three dimensions.

        Args:
            word_vectors: Mapping of word to vector.
            method: ``'pca'`` or ``'tsne'`` (case-insensitive).

        Returns:
            Mapping of word to its 3-element reduced vector.

        Raises:
            ValueError: If ``method`` is neither ``'pca'`` nor ``'tsne'``.
        """
        logger.info(f"Reducing dimensions to 3D using {method.upper()}...")

        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        chosen = method.lower()
        if chosen == 'pca':
            reducer = PCA(n_components=3, random_state=42)
        elif chosen == 'tsne':
            # Perplexity is capped below the sample count for small inputs.
            reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors)-1))
        else:
            raise ValueError("Method must be 'pca' or 'tsne'")

        reduced_vectors = reducer.fit_transform(vectors)

        word_vectors_3d = {
            word: reduced_vectors[i].tolist() for i, word in enumerate(words)
        }

        logger.info("Dimensionality reduction completed!")
        return word_vectors_3d

    def save_json(self, data: dict, file_path: str):
        """
        Write ``data`` to ``file_path`` as pretty-printed UTF-8 JSON.

        Args:
            data: JSON-serializable object.
            file_path: Destination path.

        Raises:
            Exception: Any write or serialization error is logged and re-raised.
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Data saved to {file_path}")
        except Exception as e:
            logger.error(f"Error saving to {file_path}: {e}")
            raise

    @staticmethod
    def convert_csv_to_json(csv_path, json_path):
        """
        Convert a CSV file to a JSON file holding a list of rows.

        Declared as a static method so it works both on the class and on an
        instance. Without the decorator an instance call would pass ``self``
        as ``csv_path``.

        Args:
            csv_path: Source CSV path (UTF-8).
            json_path: Destination JSON path.
        """
        # newline='' is what the csv module expects for correct quoted-newline handling.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file:
            data = list(csv.reader(csv_file))

        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=2)

    def process_pipeline(self, input_file: str, output_dir: str = "output"):
        """
        Run the embedding pipeline: read records, embed them, save the result.

        The input JSON must be a list of objects with the keys ``id``,
        ``date``, ``text``, ``title`` and ``url``. The result is written to
        ``<output_dir>/embedding_FastText_khamenei.json``.

        Args:
            input_file: Path of the input JSON file.
            output_dir: Directory for the output file; created if missing,
                including any missing parent directories.
        """
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        logger.info("Starting Persian Vector Analysis Pipeline...")

        # Load the records and split them into parallel columns.
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        ids = []
        dates = []
        sentences = []
        titles = []
        urls = []
        for sec in data:
            ids.append(sec['id'])
            dates.append(sec['date'])
            sentences.append(sec['text'])
            titles.append(sec['title'])
            urls.append(sec['url'])

        print(f'len sentences: {len(sentences)}')

        # Embed every record and store the vectors with their metadata.
        sentences_vectors = self.compute_word_vectors(ids, dates, sentences, titles, urls)
        self.save_json(sentences_vectors, f"{output_dir}/embedding_FastText_khamenei.json")

        logger.info("Pipeline completed successfully!")

        # Print summary
        print("\n" + "="*50)
        print("PIPELINE SUMMARY")
        print("="*50)
        print(f"Input sentences: {len(sentences)}")

        print(f"Output files saved to: {output_dir}/")
        print("="*50)


def main(input_file: str = "./data/raw-khamenei.json",
         output_dir: str = "output-khamenei") -> None:
    """
    Run the fastText embedding pipeline on one input file.

    Called without arguments it processes the same paths that were
    previously hard-coded. Earlier runs of this script used
    ``./output/512_final_final_wisdom.json`` and
    ``./output/512_final_nahj_letters.json`` as input.

    Args:
        input_file: Path of the JSON file holding the records to embed.
        output_dir: Directory that receives the embedding JSON.
    """
    analyzer = FasttextPersianVectorAnalyzer()
    analyzer.process_pipeline(input_file, output_dir)


# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

# Report the start and end timestamps of the run.
# NOTE(review): these two statements sit outside the guard above, so they also
# execute (and print) when the module is merely imported — confirm whether
# they should be moved under the `__main__` guard.
end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"start time : {strt_time}\nend time : {end_time}")