first process

parent 8eb3f5e5ed
commit 7e456568e5

.vscode/launch.json  (vendored, 18 lines added)

@@ -0,0 +1,18 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            //"console": "integratedTerminal",
            "console": "internalConsole",
            "justMyCode": false,
            "python": "/home/gpu/NLP/.env/bin/python"
        }
    ]
}
File diff suppressed because one or more lines are too long

embedder.py  (530 lines added)

@@ -0,0 +1,530 @@
# !pip install hazm
# !pip install transformers==4.26.0
# !pip install --upgrade numpy
# !pip install --upgrade sentence-transformers
"""
Persian Sentence Processing and Vector Analysis
===============================================

This script processes Persian sentences from a JSON file and performs:
1. Word extraction and preprocessing
2. Vector representation using a multilingual transformer
3. Similarity analysis for key words
4. Dimensionality reduction to 3D
5. 3D visualization with Persian labels

Author: NLP Expert Assistant
"""

import json
import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Set
from collections import Counter
import logging
from pathlib import Path

# NLP and ML libraries
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# Visualization libraries
# import matplotlib.pyplot as plt
# import plotly.graph_objects as go
# import plotly.express as px
# from plotly.subplots import make_subplots

# Persian text processing
import hazm
from hazm import Normalizer, word_tokenize, POSTagger

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class PersianVectorAnalyzer:
    """
    A comprehensive class for Persian text processing and vector analysis.
    """

    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """
        Initialize the analyzer with the specified model.

        Args:
            model_name: The sentence transformer model to use
        """
        self.model_name = model_name
        self.model = None
        self.normalizer = Normalizer()
        self.stop_words = self._load_persian_stop_words()
        self.key_words = [
            "خدا", "بنده", "جهاد", "ولی", "زکات",
            "نماز", "صبر", "عبادت", "ولایت", "خلافت", "پیامبر"
        ]

        logger.info(f"Initializing Persian Vector Analyzer with model: {model_name}")

    def _load_persian_stop_words(self) -> Set[str]:
        """
        Load Persian stop words.

        Returns:
            Set of Persian stop words
        """
        # Common Persian stop words
        stop_words = {
            'و', 'در', 'به', 'از', 'که', 'این', 'آن', 'با', 'برای', 'تا',
            'را', 'هم', 'یا', 'اما', 'اگر', 'چون', 'چرا', 'چگونه', 'کجا',
            'چه', 'کی', 'چند', 'چقدر', 'همه', 'هیچ', 'بعضی', 'هر', 'همه',
            'خود', 'خویش', 'ما', 'شما', 'آنها', 'ایشان', 'اینها', 'آنها',
            'من', 'تو', 'او', 'ما', 'شما', 'آنها', 'ایشان', 'اینها',
            'است', 'هست', 'بود', 'شد', 'می', 'باید', 'خواهد', 'دارد',
            'کرد', 'شد', 'بود', 'هست', 'است', 'میشود', 'میکند',
            'یک', 'دو', 'سه', 'چهار', 'پنج', 'شش', 'هفت', 'هشت', 'نه', 'ده',
            'اول', 'دوم', 'سوم', 'چهارم', 'پنجم', 'ششم', 'هفتم', 'هشتم', 'نهم', 'دهم',
            'سال', 'ماه', 'روز', 'هفته', 'ساعت', 'دقیقه', 'ثانیه', 'پس',
            'بله', 'نه', 'آری', 'خیر', 'بلی', 'نخیر',
            'حالا', 'الان', 'امروز', 'دیروز', 'فردا', 'هفته', 'ماه', 'سال',
            'بالا', 'پایین', 'چپ', 'راست', 'جلو', 'عقب', 'داخل', 'خارج',
            'بزرگ', 'کوچک', 'بلند', 'کوتاه', 'پهن', 'باریک', 'ضخیم', 'نازک',
        }
        return stop_words
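
    # Note: process_pipeline() currently skips remove_stop_words() (its Step 3
    # is commented out), so this list only takes effect if that step is re-enabled.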

    def load_model(self):
        """
        Load the sentence transformer model.
        """
        try:
            logger.info("Loading sentence transformer model...")
            self.model = SentenceTransformer(self.model_name)
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
            raise

    def load_json_data(self, file_path: str) -> List[str]:
        """
        Load Persian sentences from JSON file.

        Args:
            file_path: Path to the JSON file

        Returns:
            List of Persian sentences
        """
        try:
            logger.info(f"Loading data from {file_path}")
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, dict):
                temp_data = []
                for item in data.items():
                    temp_data.append(item[1])
                data = temp_data

            sentences = []
            if isinstance(data, list):
                for item in data:
                    if isinstance(item, dict):
                        # Extract sentences from different possible keys
                        for key in ['persian_translate']:
                            if key in item and item[key]:
                                sentences.append(str(item[key]))
                    elif isinstance(item, str):
                        sentences.append(item)
            elif isinstance(data, dict):
                # If it's a single object, extract all string values
                for value in data.values():
                    if isinstance(value, str):
                        sentences.append(value)

            logger.info(f"Loaded {len(sentences)} sentences")
            return sentences

        except Exception as e:
            logger.error(f"Error loading JSON data: {e}")
            raise
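
    # Illustrative input shapes this loader accepts (the actual
    # ./data/final_wisdom.json is not part of this diff):
    #   [{"persian_translate": "..."}, ...]   -> one sentence per list item
    #   {"id1": "...", "id2": "..."}          -> the dict's values are used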

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess Persian text.

        Args:
            text: Raw Persian text

        Returns:
            Preprocessed text
        """

        # Normalize text
        text = self.normalizer.normalize(text)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep Persian characters (Arabic,
        # Arabic Supplement, Arabic Extended-A, and the two Arabic
        # Presentation Forms Unicode blocks) plus whitespace
        text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]', '', text)

        return text.strip()

    def extract_words(self, sentences: List[str]) -> List[str]:
        """
        Extract all words from sentences.

        Args:
            sentences: List of Persian sentences

        Returns:
            List of all words
        """
        all_words = []

        for sentence in sentences:
            # Preprocess sentence
            processed_sentence = self.preprocess_text(sentence)

            # Tokenize
            words = word_tokenize(processed_sentence)
            # words = processed_sentence.split()
            # Filter out empty strings and very short words
            words = [word for word in words if len(word) > 1]

            all_words.extend(words)

        logger.info(f"Extracted {len(all_words)} words from {len(sentences)} sentences")
        return all_words

    def remove_stop_words(self, words: List[str]) -> List[str]:
        """
        Remove stop words from the word list.

        Args:
            words: List of words

        Returns:
            List of words without stop words
        """
        filtered_words = [word for word in words if word not in self.stop_words]
        logger.info(f"Removed {len(words) - len(filtered_words)} stop words")
        return filtered_words

    def get_unique_words(self, words: List[str]) -> List[str]:
        """
        Get unique words from the list.

        Args:
            words: List of words

        Returns:
            List of unique words
        """
        unique_words = list(set(words))
        logger.info(f"Found {len(unique_words)} unique words from {len(words)} total words")
        return unique_words

    def compute_word_vectors(self, words: List[str]) -> Dict[str, List[float]]:
        """
        Compute vector representations for words.

        Args:
            words: List of unique words

        Returns:
            Dictionary mapping words to their vector representations
        """
        if self.model is None:
            self.load_model()

        logger.info(f"Computing vectors for {len(words)} words...")

        # Compute embeddings
        embeddings = self.model.encode(words, show_progress_bar=True)

        # Create dictionary
        word_vectors = {}
        for i, word in enumerate(words):
            word_vectors[word] = embeddings[i].tolist()

        logger.info("Word vectors computed successfully!")
        return word_vectors
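
    # Each value is the model's full embedding (384 floats for the default
    # paraphrase-multilingual-MiniLM-L12-v2), stored as a plain Python list so
    # save_json() can serialize it directly.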

    def find_closest_words(self, word_vectors: Dict[str, List[float]],
                           key_words: List[str], top_k: int = 20) -> Dict[str, List[str]]:
        """
        Find the closest words to each key word.

        Args:
            word_vectors: Dictionary of word vectors
            key_words: List of key words to find neighbors for
            top_k: Number of closest words to find

        Returns:
            Dictionary mapping key words to their closest neighbors
        """
        logger.info(f"Finding {top_k} closest words for {len(key_words)} key words...")

        # Convert to numpy arrays for faster computation
        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        closest_words = {}

        for key_word in key_words:
            if key_word in word_vectors:
                # Get the key word vector
                key_vector = np.array(word_vectors[key_word]).reshape(1, -1)

                # Compute cosine similarities
                similarities = cosine_similarity(key_vector, vectors)[0]

                # Get indices of top k similar words (excluding the key word itself)
                word_indices = np.argsort(similarities)[::-1]

                # Filter out the key word itself and get top k
                closest_indices = []
                for idx in word_indices:
                    if words[idx] != key_word and len(closest_indices) < top_k:
                        closest_indices.append(idx)

                # Get the closest words
                closest_words[key_word] = [words[idx] for idx in closest_indices]
                logger.info(f"Found {len(closest_words[key_word])} closest words for '{key_word}'")
            else:
                logger.warning(f"Key word '{key_word}' not found in word vectors")
                closest_words[key_word] = []

        return closest_words
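
    # Shape of the returned mapping, as saved to output/selected_words.json in
    # this commit: {"خدا": ["بالله", "خدای", "خداوند", ...], "بنده": [...], ...}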

    def reduce_to_3d(self, word_vectors: Dict[str, List[float]],
                     method: str = 'tsne') -> Dict[str, List[float]]:
        """
        Reduce word vectors to 3D coordinates.

        Args:
            word_vectors: Dictionary of word vectors
            method: Dimensionality reduction method ('pca' or 'tsne')

        Returns:
            Dictionary mapping words to their 3D coordinates
        """
        logger.info(f"Reducing dimensions to 3D using {method.upper()}...")

        words = list(word_vectors.keys())
        vectors = np.array(list(word_vectors.values()))

        if method.lower() == 'pca':
            reducer = PCA(n_components=3, random_state=42)
        elif method.lower() == 'tsne':
            reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(vectors)-1))
        else:
            raise ValueError("Method must be 'pca' or 'tsne'")

        # Reduce dimensions
        reduced_vectors = reducer.fit_transform(vectors)

        # Create dictionary
        word_vectors_3d = {}
        for i, word in enumerate(words):
            word_vectors_3d[word] = reduced_vectors[i].tolist()

        logger.info("Dimensionality reduction completed!")
        return word_vectors_3d
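
    # t-SNE over a large vocabulary of 384-dimensional vectors can be slow;
    # passing method='pca' selects the faster, deterministic reducer above.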

    def save_json(self, data: dict, file_path: str):
        """
        Save data to JSON file.

        Args:
            data: Data to save
            file_path: Output file path
        """
        try:
            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            logger.info(f"Data saved to {file_path}")
        except Exception as e:
            logger.error(f"Error saving to {file_path}: {e}")
            raise

    # def create_3d_visualization(self, word_vectors_3d: Dict[str, List[float]],
    #                             selected_words: Dict[str, List[str]],
    #                             output_path: str = "persian_words_3d.html"):
    #     """
    #     Create 3D visualization of words.
    #
    #     Args:
    #         word_vectors_3d: Dictionary of 3D word coordinates
    #         selected_words: Dictionary of selected words for each key word
    #         output_path: Output file path for the visualization
    #     """
    #     logger.info("Creating 3D visualization...")
    #
    #     # Prepare data for plotting
    #     words = list(word_vectors_3d.keys())
    #     coords = np.array(list(word_vectors_3d.values()))
    #
    #     # Create color mapping for key words and their neighbors
    #     colors = []
    #     sizes = []
    #     hover_texts = []
    #
    #     for word in words:
    #         # Check if word is a key word
    #         is_key_word = word in self.key_words
    #
    #         # Check if word is in selected words
    #         in_selected = False
    #         key_word_group = None
    #         for key_word, selected_list in selected_words.items():
    #             if word in selected_list:
    #                 in_selected = True
    #                 key_word_group = key_word
    #                 break
    #
    #         if is_key_word:
    #             colors.append('red')
    #             sizes.append(15)
    #             hover_texts.append(f"کلیدواژه: {word}")
    #         elif in_selected:
    #             colors.append('blue')
    #             sizes.append(10)
    #             hover_texts.append(f"کلمه مرتبط با '{key_word_group}': {word}")
    #         else:
    #             colors.append('lightgray')
    #             sizes.append(5)
    #             hover_texts.append(f"کلمه: {word}")
    #
    #     # Create 3D scatter plot
    #     fig = go.Figure()
    #
    #     # Add scatter plot
    #     fig.add_trace(go.Scatter3d(
    #         x=coords[:, 0],
    #         y=coords[:, 1],
    #         z=coords[:, 2],
    #         mode='markers+text',
    #         marker=dict(
    #             size=sizes,
    #             color=colors,
    #             opacity=0.8
    #         ),
    #         text=words,
    #         textposition="middle center",
    #         hovertext=hover_texts,
    #         hoverinfo='text'
    #     ))
    #
    #     # Update layout
    #     fig.update_layout(
    #         title={
    #             'text': 'نمایش سهبعدی کلمات فارسی',
    #             'x': 0.5,
    #             'xanchor': 'center',
    #             'font': {'size': 20}
    #         },
    #         scene=dict(
    #             xaxis_title='محور X',
    #             yaxis_title='محور Y',
    #             zaxis_title='محور Z',
    #             camera=dict(
    #                 eye=dict(x=1.5, y=1.5, z=1.5)
    #             )
    #         ),
    #         width=1000,
    #         height=800,
    #         showlegend=False
    #     )
    #
    #     # Save the plot
    #     fig.write_html(output_path)
    #     logger.info(f"3D visualization saved to {output_path}")
    #
    #     return fig

    def process_pipeline(self, input_file: str, output_dir: str = "output"):
        """
        Run the complete processing pipeline.

        Args:
            input_file: Path to input JSON file
            output_dir: Output directory for results
        """
        # Create output directory
        Path(output_dir).mkdir(exist_ok=True)

        logger.info("Starting Persian Vector Analysis Pipeline...")

        # Step 1: Load data
        sentences = self.load_json_data(input_file)

        # Step 2: Extract words
        all_words = self.extract_words(sentences)

        # Step 3: Remove stop words
        # filtered_words = self.remove_stop_words(all_words)
        filtered_words = all_words

        # Step 4: Get unique words
        unique_words = self.get_unique_words(filtered_words)

        # Step 5: Compute word vectors
        word_vectors = self.compute_word_vectors(unique_words)

        # Step 6: Save word vectors
        self.save_json(word_vectors, f"{output_dir}/words_vector.json")

        # Step 7: Find closest words to key words
        selected_words = self.find_closest_words(word_vectors, self.key_words)

        # Step 8: Save selected words
        self.save_json(selected_words, f"{output_dir}/selected_words.json")

        # Step 9: Reduce to 3D
        word_vectors_3d = self.reduce_to_3d(word_vectors, method='tsne')

        # Step 10: Save 3D vectors
        self.save_json(word_vectors_3d, f"{output_dir}/words_vector_3d.json")

        # Step 11: Create visualization
        # self.create_3d_visualization(word_vectors_3d, selected_words,
        #                              f"{output_dir}/persian_words_3d.html")

        logger.info("Pipeline completed successfully!")

        # Print summary
        print("\n" + "="*50)
        print("PIPELINE SUMMARY")
        print("="*50)
        print(f"Input sentences: {len(sentences)}")
        print(f"Total words extracted: {len(all_words)}")
        print(f"Unique words after preprocessing: {len(unique_words)}")
        print(f"Word vectors computed: {len(word_vectors)}")
        print(f"Key words processed: {len(self.key_words)}")
        print(f"Output files saved to: {output_dir}/")
        print("="*50)


def main():
    """
    Main function to run the Persian Vector Analysis.
    """
    # Initialize analyzer
    analyzer = PersianVectorAnalyzer()

    # Define input and output paths
    input_file = "./data/final_wisdom.json"
    output_dir = "output"

    # Run the complete pipeline
    analyzer.process_pipeline(input_file, output_dir)


if __name__ == "__main__":
    main()
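
# Typical invocation, assuming the dependencies from the !pip lines at the top
# are installed and ./data/final_wisdom.json exists:
#   $ python embedder.py
# Results land in ./output/: words_vector.json, selected_words.json, and
# words_vector_3d.json (the three data files added later in this commit).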

output/selected_words.json  (223 lines added)

@@ -0,0 +1,223 @@
{
  "خدا": ["بالله", "خدای", "خداوند", "خدایی", "الله", "خدایا", "الهی", "لله", "آله", "خداییم", "الرب", "خداوندا", "خدایش", "حضرت", "یاسر", "آیه", "بهشتش", "تعالی", "باطنم", "وعید"],
  "بنده": ["مالک", "پیشگاه", "قربانگاه", "فرمانروایی", "کوچ", "مالکی", "قربانگاههای", "خزانهدار", "پیشوای", "جانشین", "همنشین", "مأمور", "مستولی", "منکرات", "بندهاش", "اختیار", "منکری", "حاکم", "عبد", "زمامداران"],
  "جهاد": ["مجاهد", "اسلام", "مسلم", "شامیان", "علیهالسلام", "مسلمانان", "قرآن", "طلبان", "صلیالله", "عبیدالله", "امان", "عبدالله", "شامی", "خلافت", "پیغمبر", "مسلمین", "سپاه", "سید", "علی", "پیامبر"],
  "ولی": ["اما", "مگر", "وإنما", "إلا", "اگرچه", "برخلاف", "خلافی", "درحالیکه", "بلکه", "إلیها", "غیرش", "لان", "وگرنه", "بخلاف", "ورزند", "چنانچه", "وگروه", "بس", "وبالش", "واگر"],
  "زکات": ["گلوگاه", "غنائمی", "مینگرند", "غبن", "دراز", "نزند", "میافکند", "گرچه", "زبیر", "تابی", "طغیان", "بلاغت", "توفیق", "ضبائی", "قیمة", "فریفتند", "آمیز", "پوشی", "طویلة", "سوگشان"],
  "نماز": ["دعا", "صلوات", "دعای", "دعایی", "عبادت", "مومنان", "مؤمنان", "ایمانی", "مؤمنی", "مؤمن", "مومن", "برکت", "ایمان", "المؤمنین", "ایمانش", "رحمت", "مؤمنانم", "دینی", "ایمانتان", "معنوی"],
  "صبر": ["انتظار", "یصبر", "لایصبر", "صبور", "پروا", "متکبر", "تعویذ", "دعائم", "سکونت", "رکاب", "إرواد", "ماند", "پرخوری", "دنبال", "استهزاء", "میپیچید", "دوشید", "بیندیشید", "تقوای", "نفرماید"],
  "عبادت": ["دعایی", "دعای", "صلوات", "نماز", "دعا", "خدای", "مومن", "خداوند", "بالله", "خدا", "برکت", "مؤمنانم", "الهی", "خدایا", "الرب", "لله", "آله", "ایمانی", "الله", "خدایی"],
  "ولایت": [],
  "خلافت": ["سپاه", "حاکم", "امت", "فرمانروایی", "لشکر", "قصار", "امان", "برترند", "نهاد", "زمامداران", "وحکمة", "ستمگری", "الإبل", "بالإبل", "مسلط", "سرکش", "اختیار", "امانی", "مأموریت", "حکومتی"],
  "پیامبر": ["پیغمبر", "پیامبرش", "پیامبران", "پیامبرتان", "قرآن", "رسولالله", "مجاهد", "عبیدالله", "الله", "مسلم", "ربانی", "اسلام", "خدای", "ایمانی", "یاسر", "شهید", "خدایی", "بالله", "صلیالله", "خدا"]
}

output/words_vector.json  (1820764 lines added)
File diff suppressed because it is too large

output/words_vector_3d.json  (23587 lines added)
File diff suppressed because it is too large