# nahj_rag/2_embedder_fasttext.py
# 2026-02-17 16:52:37 +00:00
#
# 416 lines
# 14 KiB
# Python

# بسم الله
# !pip install hazm
# !pip install transformers==4.26.0
# !pip install --upgrade numpy
# !pip install --upgrade sentence-transformers
"""
Persian Sentence Processing and Vector Analysis
==============================================
This script processes Persian sentences from a JSON file. The analyzer class provides:
1. Word extraction and preprocessing
2. Sentence vector representation using a FastText model
3. Similarity analysis for key words
4. Dimensionality reduction to 3D
5. 3D visualization with Persian labels (no visualization code is present in this file; the plotting imports are commented out)
Author: NLP Expert Assistant
"""
import json
import re
import csv
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Set
from collections import Counter
import logging
from pathlib import Path
from fasttext import tokenize
# NLP and ML libraries
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
# Visualization libraries
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots
# Persian text processing
# import hazm
# from hazm import Normalizer, word_tokenize, POSTagger
from normalizer import normalize_persian
from datetime import datetime
import fasttext.util
from gensim.models.fasttext import FastText
import logging
from typing import List, Dict, Union
# fasttext.util.download_model('fa', if_exists='ignore')  # downloads the pretrained Persian model
# Wall-clock timestamp taken at import time; printed next to the end time when the script finishes.
strt_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
# Configure logging once, then create the single module-level logger used by the rest of the file.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class FasttextPersianVectorAnalyzer:
    """
    Persian text processing and FastText sentence-embedding utilities.

    The class loads a FastText binary model, splits and cleans Persian text,
    embeds sentences together with their metadata, and offers helpers for
    nearest-neighbour lookup and 3D reduction over a ``{label: vector}`` mapping.
    """

    def __init__(self, model_name: str = './models/khamenei_all_180_clean_model.bin'):
        """
        Initialize the analyzer. The model itself is loaded lazily.

        Args:
            model_name: Path of the FastText binary model passed to
                ``fasttext.load_model``.
        """
        self.model_name = model_name
        self.model = None  # populated by load_model()
        self.stop_words = self._load_persian_stop_words()
        self.key_words = []
        logger.info(f"Initializing Persian Vector Analyzer with model: {model_name}")

    def _load_persian_stop_words(self) -> Set[str]:
        """
        Return the stop-word set used by ``remove_stop_words``.

        The list is currently empty, so no word is filtered out.
        """
        # An empty ``{}`` literal is a dict; return a real set to match the annotation.
        return set()

    def load_model(self):
        """
        Load the FastText model at ``self.model_name`` into ``self.model``.

        Raises:
            Exception: Any error raised while loading is logged and re-raised.
        """
        try:
            logger.info("Loading FastText model...")
            self.model = fasttext.load_model(self.model_name)
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def split_sentence(self, sentence: str, max_tokens: int = 512):
        """
        Split an over-long text into smaller pieces.

        Text with fewer than ``max_tokens`` tokens (counted with FastText's
        ``tokenize``) is returned unchanged as a one-item list. Otherwise the
        text is split on '.', and every piece that still has more than
        ``max_tokens`` tokens is split again on the Persian comma '،'.

        Args:
            sentence: The text to split.
            max_tokens: Token budget per piece (default 512).

        Returns:
            A list of text pieces. Pieces may be empty strings, and a piece
            split on '،' is not re-checked against the budget.
        """
        if len(tokenize(sentence)) < max_tokens:
            return [sentence]

        pieces = []
        for part in str(sentence).split('.'):
            if len(tokenize(part)) > max_tokens:
                pieces.extend(part.split('،'))
            else:
                pieces.append(part)
        return pieces

    def load_json_data(self, file_path: str) -> List[str]:
        """
        Load sentences from a JSON file and split the over-long ones.

        A top-level JSON object is treated as the list of its values. Each
        list item may be a dict (its ``'sentece_text'`` value is used) or a
        plain string. Every text goes through ``split_sentence`` and empty
        pieces are dropped.

        Args:
            file_path: Path of the UTF-8 encoded JSON file.

        Returns:
            The non-empty sentence pieces, in file order.

        Raises:
            Exception: Any read/parse error is logged and re-raised.
        """
        try:
            logger.info(f"Loading data from {file_path}")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, dict):
                data = list(data.values())

            sentences = []
            if isinstance(data, list):
                for index, item in enumerate(data):
                    print(f'split sentence {index}')
                    if isinstance(item, dict):
                        # NOTE(review): the key spelling 'sentece_text' is kept as-is
                        # because it must match the input data; confirm against the files.
                        for key in ['sentece_text']:
                            if key in item and item[key]:
                                sentences.extend(self.split_sentence(item[key]))
                    elif isinstance(item, str):
                        # Split the string itself (the old code indexed it with a stale key).
                        sentences.extend(self.split_sentence(item))

            sentences = [piece for piece in sentences if piece]
            logger.info(f"Loaded {len(sentences)} sentences")
            return sentences
        except Exception as e:
            logger.error(f"Error loading JSON data: {e}")
            raise

    def preprocess_text(self, text: str) -> str:
        """
        Normalize a text and strip everything outside the Arabic-script ranges.

        Args:
            text: Raw input text.

        Returns:
            The text after ``normalize_persian``, with whitespace runs collapsed
            to one space, characters outside the Unicode ranges listed in the
            pattern (and whitespace) removed, and the ends trimmed.
        """
        text = normalize_persian(text)
        # Collapse whitespace runs.
        text = re.sub(r'\s+', ' ', text)
        # Keep only Arabic-script code points and whitespace.
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)
        return text.strip()

    def extract_words(self, sentences: List[str]) -> List[str]:
        """
        Preprocess each sentence and collect its words.

        Args:
            sentences: Raw sentences.

        Returns:
            All words longer than one character, in order of appearance
            (duplicates kept).
        """
        all_words = []
        for sentence in sentences:
            processed_sentence = self.preprocess_text(sentence)
            # hazm's word_tokenize is not imported in this file, so split on
            # whitespace; preprocess_text has already collapsed whitespace runs.
            words = processed_sentence.split()
            all_words.extend(word for word in words if len(word) > 1)
        logger.info(f"Extracted {len(all_words)} words from {len(sentences)} sentences")
        return all_words

    def remove_stop_words(self, words: List[str]) -> List[str]:
        """
        Drop every word contained in ``self.stop_words``.

        Args:
            words: Input words.

        Returns:
            The words that are not stop words, in their original order.
        """
        filtered_words = [word for word in words if word not in self.stop_words]
        logger.info(f"Removed {len(words) - len(filtered_words)} stop words")
        return filtered_words

    def get_unique_words(self, words: List[str]) -> List[str]:
        """
        Remove duplicate words.

        Args:
            words: Input words, possibly with repeats.

        Returns:
            Each distinct word once, in first-seen order.
        """
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        unique_words = list(dict.fromkeys(words))
        logger.info(f"Found {len(unique_words)} unique words from {len(words)} total words")
        return unique_words

    def compute_word_vectors(self, ids, dates, sentences: List[str], titles, urls) -> Dict[str, dict]:
        """
        Compute FastText sentence vectors and attach each record's metadata.

        The five inputs are parallel sequences (one entry per record) and are
        walked in lockstep. A record whose text is not a string, is empty after
        cleaning, or fails to embed is skipped together with its own metadata,
        so the remaining records keep the correct id/date/title/url.

        Args:
            ids: Record identifiers.
            dates: Record dates.
            sentences: Record texts to embed.
            titles: Record titles.
            urls: Record URLs.

        Returns:
            A dict keyed ``'sentence-1'``, ``'sentence-2'``, ... (numbered over
            the embedded records only). Each value holds ``id``, ``date``,
            ``title``, ``sentence`` (cleaned text), ``url`` and ``embeddings``
            (the vector as a list of floats).
        """
        if self.model is None:
            self.load_model()
        logger.info(f"Computing vectors for {len(sentences)} sentences...")

        sentences_vectors = {}
        records = zip(ids, dates, sentences, titles, urls)
        for index, (sent_id, date, sent, title, url) in enumerate(records):
            if not isinstance(sent, str):
                logger.warning(f"Error processing sentence: expected str, got {type(sent).__name__}")
                continue

            # Collapse newlines, tabs and whitespace runs so the model receives
            # one single-line string.
            cleaned_sent = re.sub(r'\s+', ' ', sent).strip()
            if not cleaned_sent:
                continue

            try:
                # get_sentence_vector returns one vector for the whole sentence.
                vector = self.model.get_sentence_vector(cleaned_sent)
            except Exception as e:
                logger.warning(f"Error processing sentence: {e}")
                continue

            print(f'sentence {index} embedded: {cleaned_sent[:10]}...')
            sentences_vectors[f'sentence-{len(sentences_vectors) + 1}'] = {
                'id': sent_id,
                'date': date,
                'title': title,
                'sentence': cleaned_sent,
                'url': url,
                'embeddings': np.asarray(vector).tolist(),
            }

        logger.info(f"Successfully computed vectors for {len(sentences_vectors)} sentences!")
        return sentences_vectors

    def find_closest_words(self, word_vectors: Dict[str, List[float]],
                           key_words: List[str], top_k: int = 20) -> Dict[str, List[str]]:
        """
        Find the most cosine-similar entries for each key word.

        Args:
            word_vectors: Mapping from word to its vector.
            key_words: Words to look up.
            top_k: Maximum number of neighbours per key word.

        Returns:
            A mapping from each key word to its neighbours, most similar first.
            Key words missing from ``word_vectors`` map to an empty list.
        """
        logger.info(f"Finding {top_k} closest words for {len(key_words)} key words...")
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        closest_words = {}
        for key_word in key_words:
            if key_word not in word_vectors:
                logger.warning(f"Key word '{key_word}' not found in word vectors")
                closest_words[key_word] = []
                continue

            key_vector = np.array(word_vectors[key_word]).reshape(1, -1)
            similarities = cosine_similarity(key_vector, vectors)[0]
            # Rank by descending similarity, then drop the key word itself.
            ranked = np.argsort(similarities)[::-1]
            neighbours = [words[idx] for idx in ranked if words[idx] != key_word]
            closest_words[key_word] = neighbours[:top_k]
            logger.info(f"Found {len(closest_words[key_word])} closest words for '{key_word}'")
        return closest_words

    def reduce_to_3d(self, word_vectors: Dict[str, List[float]],
                     method: str = 'tsne') -> Dict[str, List[float]]:
        """
        Project vectors to three dimensions with PCA or t-SNE.

        Args:
            word_vectors: Mapping from word to its vector.
            method: ``'pca'`` or ``'tsne'`` (case-insensitive).

        Returns:
            A mapping from each word to its 3-component coordinates.

        Raises:
            ValueError: If ``method`` is neither ``'pca'`` nor ``'tsne'``.
        """
        logger.info(f"Reducing dimensions to 3D using {method.upper()}...")
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        chosen = method.lower()
        if chosen == 'pca':
            reducer = PCA(n_components=3, random_state=42)
        elif chosen == 'tsne':
            # Cap perplexity below the sample count for small inputs.
            reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors)-1))
        else:
            raise ValueError("Method must be 'pca' or 'tsne'")

        reduced_vectors = reducer.fit_transform(vectors)
        word_vectors_3d = {word: reduced_vectors[i].tolist() for i, word in enumerate(words)}
        logger.info("Dimensionality reduction completed!")
        return word_vectors_3d

    def save_json(self, data: dict, file_path: str):
        """
        Write ``data`` to ``file_path`` as indented UTF-8 JSON.

        Args:
            data: JSON-serializable mapping.
            file_path: Destination path.

        Raises:
            Exception: Any write/serialization error is logged and re-raised.
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Data saved to {file_path}")
        except Exception as e:
            logger.error(f"Error saving to {file_path}: {e}")
            raise

    @staticmethod
    def convert_csv_to_json(csv_path, json_path):
        """
        Convert a CSV file into a JSON array of rows.

        Declared static because it takes no ``self``; it can be called on the
        class or on an instance.

        Args:
            csv_path: Source CSV file (UTF-8).
            json_path: Destination JSON file; each CSV row becomes a list of strings.
        """
        # newline='' lets the csv module handle embedded newlines itself.
        with open(csv_path, 'r', encoding='utf-8', newline='') as csv_file:
            data = list(csv.reader(csv_file))
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=2)

    def process_pipeline(self, input_file: str, output_dir: str = "output"):
        """
        Embed every record of ``input_file`` and save the result as JSON.

        The input must be a JSON array of objects with the keys ``id``,
        ``date``, ``text``, ``title`` and ``url``. The embeddings are written to
        ``<output_dir>/embedding_FastText_khamenei.json``.

        Args:
            input_file: Path of the input JSON file.
            output_dir: Directory for the output file; created if missing.
        """
        # parents=True so nested output directories are created as well.
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        logger.info("Starting Persian Vector Analysis Pipeline...")

        # Load the records and split them into parallel columns.
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        ids = []
        dates = []
        sentences = []
        titles = []
        urls = []
        for sec in data:
            ids.append(sec['id'])
            dates.append(sec['date'])
            sentences.append(sec['text'])
            titles.append(sec['title'])
            urls.append(sec['url'])
        print(f'len sentences: {len(sentences)}')

        # Embed the sentences and persist them with their metadata.
        sentences_vectors = self.compute_word_vectors(ids, dates, sentences, titles, urls)
        self.save_json(sentences_vectors, f"{output_dir}/embedding_FastText_khamenei.json")
        logger.info("Pipeline completed successfully!")

        # Print summary
        print("\n" + "="*50)
        print("PIPELINE SUMMARY")
        print("="*50)
        print(f"Input sentences: {len(sentences)}")
        print(f"Output files saved to: {output_dir}/")
        print("="*50)
def main():
    """
    Run the embedding pipeline on the configured input with the default model.
    """
    # Alternative inputs, currently disabled:
    #   "./output/512_final_final_wisdom.json"
    #   "./output/512_final_nahj_letters.json"
    source_path = "./data/raw-khamenei.json"
    target_dir = "output-khamenei"

    # Build the analyzer with its default model path and run the whole pipeline.
    FasttextPersianVectorAnalyzer().process_pipeline(source_path, target_dir)
if __name__ == "__main__":
    main()
    # Report the start/end timestamps of the run. Kept inside the guard so that
    # importing this module does not print anything.
    # NOTE(review): SOURCE lost its indentation; confirm these two lines were
    # meant to run only on script execution.
    end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"start time : {strt_time}\nend time : {end_time}")