add some readme files

ajokar 2025-08-16 14:24:11 +03:30
parent f137ba54d2
commit b7f15bd846
9 changed files with 218 additions and 1408 deletions

View File

@@ -2,19 +2,30 @@
 Source for running various processing tasks on legal (law) sections.
 Includes: classification, named entity recognition, word embedding extraction, keyword extraction, and text simplification (representation).
 """
 from p1_classifier import do_classify
 from p2_ner_recognizer import do_ner_recognize
 from p3_words_embedder import do_word_embedder
-# from p4_keyword_extractor import do_keyword_extract
-# from p5_simplifier import do_simplify
+from p4_keyword_extractor import do_keyword_extract
+from p5_simplifier import do_representation
 from elastic_helper import ElasticHelper
+import json

 def get_sections():
-    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
-    eh_obj = ElasticHelper()
-    sections = eh_obj.iterateJsonFile(sections_path, True)
-    sections = convert_to_dict(sections)
+    # region read all sections from the JSON file
+    # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
+    # eh_obj = ElasticHelper()
+    # sections = eh_obj.iterateJsonFile(sections_path, True)
+    # sections = convert_to_dict(sections)
+    # endregion
+    # region read a limited number of sections from a JSON file
+    with open('./data/recent_sections.json', 'r', encoding='utf-8') as file:
+        sections = json.load(file)
+    # endregion
     return sections
def convert_to_dict(sections): def convert_to_dict(sections):
@@ -31,6 +42,15 @@ def main():
     # get sections to do nlp processes
     sections = get_sections()

+    # temp_sections = {}
+    # for i, item in enumerate(sections):
+    #     if i>3:
+    #         break
+    #     temp_sections[item] = sections[item]
+    # sections = temp_sections

     # dictsections = {}
     # for item in sections:
     #     if not item['id'] == 'qs2180272':
@@ -46,13 +66,21 @@ def main():
     sections = do_ner_recognize(sections)

     # 3. word embedder
-    #sections = do_word_embedder(sections)
+    sections = do_word_embedder(sections)

     # 4. keyword extract
-    # result_kw = do_keyword_extract(sections)
+    # keyword_extract_result, sections = do_keyword_extract(sections)
+    # if keyword_extract_result:
+    #     print(f'keyword extraction finished successfully!')

     # 5. simpify
-    # result_simp = do_simplify(sections)
+    # representation_result, sections = do_representation(sections)
+    # if representation_result:
+    #     print(f'representation finished successfully!')

+    with open(f'./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
+        data = json.dumps(sections, ensure_ascii=False, indent=2)
+        output_file.write(data)

     print('all nlp processes finished successfully!')

File diff suppressed because it is too large.

View File

@@ -4,20 +4,18 @@
 """
 from transformers import pipeline
 from normalizer import cleaning
-from elastic_helper import ElasticHelper
 import transformers
 import json
 import datetime
 import pandas as pd
 from transformers import AutoTokenizer

-print(transformers.__version__)
+print(f'transformers version: {transformers.__version__}')

 # finetuned model for classification path
 model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680'
 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+print(f'Classification Model Loaded: {model_checkpoint}')

-# window_size = tokenizer.model_max_length#512#200
 window_size = 512
 """
 (i.e., the window size) because the number of tokens fed into the model is limited by the variable above
@@ -27,8 +25,8 @@ window_size = 512
 step_size = 350#100
 # number of classes requested from the model for each section
 Top_k = 4

-classifier = pipeline("text-classification", model_checkpoint, framework="pt")
+# set device = 0 => to use GPU
+classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0)

 def get_class(sentences, top_k:int=4):
     # sentences = cleaning(sentences)
@@ -177,14 +175,15 @@ def do_classify(sections):
     for index, id in enumerate(sections):
-        source = sections[id]['source']
+        source = sections[id]

         classification_result, classification_status, desc = single_section_classification(id, source)
         if not classification_status:
             print(f'id: {id} classification error. error description: {desc}')

         # organize the predicted classes into the best class and the other classes, based on the model's estimated scores, and store the result in the dictionary
-        new_sections_dict[id] = classification_result
+        # new_sections_dict[id] = classification_result
+        sections[id]['ai_codes'] = classification_result

         """ In test mode, where we want to evaluate the classifier's performance, the law titles are stored so that a variety of different laws is checked and repeated evaluation of multiple sections of the same law is avoided """
         # qanon_title = source['qanon_title']
@@ -198,8 +197,8 @@ def do_classify(sections):
     print(f'end: {datetime.datetime.now()}')
     print('classification finished!')

-    classified_sections_dict = new_sections_dict
-    return classified_sections_dict
+    # classified_sections_dict = new_sections_dict
+    return sections

View File

@@ -15,7 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n
 tagger = SequenceTagger.load(model)
 print('model read and tagger initialized')

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

 def prepare_data(ner_obj_list):
     ner_data_list = []

@@ -208,7 +208,7 @@ def do_ner_recognize(sections):
         ner_data_list = prepare_data(ner_obj_list)
         sections[id]['ners_v2'] = ner_data_list
         print(f'ner process: {index+1}/{len_sections}')
-    print(f'len_sections ner recognization finished!')
+    print(f'{len_sections} ner recognization finished!')

     return sections

View File

@@ -6,7 +6,8 @@ import json
 import datetime
 import numpy as np

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+date = datetime.datetime.now()
+today = f'{date.year}-{date.month}-{date.day}-{date.hour}'

 model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
 # model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30

@@ -19,14 +20,16 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
 model = SentenceTransformer(model_name)

 def do_word_embedder(sections):
-    for index, item in enumerate(sections):
-        embeddings = single_section_embedder(sections[item]['content'])
-        sections[item]['embeddings'] = embeddings
+    for index, id in enumerate(sections):
+        embeddings = single_section_embedder(sections[id]['content'])
+        sections[id]['embeddings'] = embeddings.tolist()

     with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file:
         data = json.dumps(sections, ensure_ascii=False)
         output_file.write(data)
+    return sections

 def single_section_embedder(sentence):
     """
     This method converts the input text into its corresponding vector

View File

@@ -15,7 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME"
 model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 #model_id = "meta-llama/Llama-3.1-70B-Instruct"

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

 bnb_config = BitsAndBytesConfig(
     load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16

@@ -61,7 +61,7 @@ def generate(formatted_prompt):
 # Gemini Prompt
 USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
 یک لیست فارسی
-شماره ترتیبی در ابتدای هر عبارت
+هیچ عدد یا علامت و نمادی در ابتدا یا انتهای هر عبارت کلیدی قرار نگیرد
 هر عبارت در یک خط جداگانه
 بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
 موارد زیر در استخراج عبارات کلیدی الزامی است:

@@ -94,7 +94,7 @@ def generate(formatted_prompt):
         max_new_tokens=2048,
         eos_token_id=terminators,
         do_sample=True,
-        temperature=0.6,
+        temperature=0.1,
         top_p=0.9,
     )
     response = outputs[0][input_ids.shape[-1]:]

@@ -168,9 +168,9 @@ def do_keyword_extract(sections):
         period_ids_text += f"{id} \n"
-        print(f"section: {counter}-id: {id}")
+        print(f"section kw extracting: {counter} - id: {id}")
         # temp_dict.append(item)
-        if counter % 1000 == 0:
+        if counter % 5000 == 0:
             outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8')
             outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2))
             outputfile.close()

@@ -196,12 +196,13 @@ def do_keyword_extract(sections):
     print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ")
     print("end")

-    return True
+    operation_result = True
+    return operation_result, sections

 if __name__ == "__main__":
     print(f'start: {datetime.datetime.now()}')
     sections = get_sections()
-    sections = do_keyword_extract(sections)
+    operation_result = do_keyword_extract(sections)
     print(f'end: {datetime.datetime.now()}')

View File

@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import json

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

 if torch.cuda.is_available():
     model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

@@ -56,17 +56,20 @@ def single_section_representation(content):
         outputs = model.generate(
             input_ids,
-            max_new_tokens=500,
+            max_new_tokens=2048,
             eos_token_id=terminators,
             do_sample=True,
-            temperature=0.7,
+            temperature=0.1,
             top_p=0.85,
         )
         response = outputs[0][input_ids.shape[-1]:]
         sentences = tokenizer.decode(response, skip_special_tokens=True)
+        # remove duplicate sentences
+        sentences = list(set(sentences))

         result = True
-        desc = 'operation successful'
+        desc = 'Operation successful'
         return result, desc, sentences

     except Exception as error:

@@ -77,7 +80,7 @@ def single_section_representation(content):
 def do_representation(sections):
     print(f"start time: {datetime.datetime.now()}")
-    for index, id in sections:
+    for index, id in enumerate(sections):
         result, desc, sentences = single_section_representation(sections[id]['content'])
         if not result:
             error_content = f'id: {id} - error: {desc}\n'

@@ -85,11 +88,16 @@ def do_representation(sections):
             file.write(error_content)
         sections[id]['represented_sentences'] = sentences
+        print(f'representation process. section {index+1}/{len(sections)} - id: {id}')

+    with open(f'./data/represent/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
+        outputfile.write(json.dumps(sections, ensure_ascii=False, indent = 4))

     print(f"end time: {datetime.datetime.now()}")
     print(" *** finished! *** ")
+    operation_result = True
+    return operation_result, sections

 if __name__ == "__main__":
     pass

View File

@@ -0,0 +1,71 @@
# Section Classification Script
This project provides a Python script (`classification.py`) for classifying text sections using a fine-tuned transformer model. The script is designed to suggest the most relevant classes for each section of text, which is useful for legal documents, content categorization, and similar NLP tasks.
## Requirements
Before using this script, please install the required libraries:
```bash
pip install transformers pandas
```
You also need a fine-tuned classification model and its tokenizer. Update the `model_checkpoint` path in the script to point to your model.
## How It Works
- The script loads a fine-tuned transformer model for text classification.
- It processes each section of text, splitting long texts into overlapping windows so they fit the model's input size (see the sketch after this list).
- For each section, it predicts the top classes and saves the results.
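
The windowing step can be illustrated with a short sketch. This is a simplified illustration only, not the script's exact code: `window_size` and `step_size` mirror the constants in `classification.py`, while the `tokenizer` and `classifier` arguments are assumed to be the objects the script loads.

```python
from collections import defaultdict

# Illustration only: split a long token sequence into overlapping windows,
# classify each window, then average the per-class scores over all windows.
window_size = 512
step_size = 350

def classify_long_text(text, tokenizer, classifier, top_k=4):
    tokens = tokenizer.tokenize(text)
    # build overlapping windows of at most window_size tokens
    starts = range(0, max(len(tokens) - window_size + step_size, 1), step_size)
    windows = [tokenizer.convert_tokens_to_string(tokens[s:s + window_size]) for s in starts]
    # average each label's score over all windows
    scores = defaultdict(float)
    for window in windows:
        for prediction in classifier(window, top_k=top_k):
            scores[prediction["label"]] += prediction["score"] / len(windows)
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
```

Because the step is smaller than the window, consecutive windows overlap, so sentences that straddle a window boundary are still seen whole in at least one pass.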
## Main Functions
- `get_class(sentences, top_k=4)`: Classifies a sentence or text and returns the top `k` classes.
- `mean_classes(input_classes)`: Aggregates class results from multiple windows of a long text.
- `get_window_classes(text)`: Handles splitting long texts into windows and aggregates their classification results.
- `single_section_classification(id, section_source)`: Classifies a single section and returns the best and other suggested classes.
- `do_classify(sections)`: Classifies all sections in a dictionary and saves the results to a JSON file.
## Usage Example
Suppose you have your sections data as a dictionary:
```python
sections = {
"1": {"content": "First section text", "other_info": {"full_path": "..."}, "qanon_title": "..."},
"2": {"content": "Second section text", "other_info": {"full_path": "..."}, "qanon_title": "..."}
}
```
You can classify all sections as follows:
```python
from classification import do_classify
result = do_classify(sections)
```
After running, the results will be saved in a JSON file in the `./data/classification/` directory.
## Output Structure
Each section will have a new field `ai_codes` with the classification results:
```json
"1": {
"content": "First section text",
"ai_codes": {
"best-class": {"label": "ClassA", "score": 0.85},
"other-classes": [
{"label": "ClassB", "score": 0.10},
{"label": "ClassC", "score": 0.05}
]
}
}
```
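
If you want to post-process the saved results, they can be loaded back with the standard `json` module. This is a minimal sketch; the file name below is a placeholder, so point it at the file actually produced in `./data/classification/`:

```python
import json

# Placeholder path: replace with the file actually produced in ./data/classification/
results_path = './data/classification/classified_sections.json'

with open(results_path, 'r', encoding='utf-8') as f:
    classified = json.load(f)

# Collect the best predicted label for every section that was classified
best_labels = {
    section_id: section['ai_codes']['best-class']['label']
    for section_id, section in classified.items()
    if 'ai_codes' in section
}
print(f'{len(best_labels)} sections classified')
```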
## Notes
- Make sure the model path in `model_checkpoint` is correct and the model files are available.
- The script supports Persian and other languages, depending on your model.
- The output JSON file will be saved in `./data/classification/`.

View File

@@ -0,0 +1,70 @@
# Sentence Embedding Generator
This project provides a Python script (`embedding.py`) for generating sentence embeddings using the [Sentence Transformers](https://www.sbert.net) library.
## Requirements
Before using this script, please install the required libraries:
```bash
pip install sentence-transformers numpy
```
## How It Works
- The script uses the pre-trained model: `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
- There are two main functions:
- `single_section_embedder(sentence)`: Takes a sentence (string) and returns its embedding as a vector.
- `do_word_embedder(sections)`: Takes a dictionary of sections (each with a `content` field), generates embeddings for each section, and saves the results as a JSON file.
## Usage
### 1. Get Embedding for a Single Sentence
```python
from embedding import single_section_embedder
sentence = "This is a sample sentence."
embedding = single_section_embedder(sentence)
print(embedding)
```
### 2. Generate Embeddings for Multiple Sections and Save to File
Suppose your data is structured like this:
```python
sections = {
"1": {"content": "First section text"},
"2": {"content": "Second section text"}
}
```
You can generate and save embeddings as follows:
```python
from embedding import do_word_embedder
result = do_word_embedder(sections)
```
After running, a file named like `sections_embeddings_YEAR-MONTH-DAY-HOUR.json` will be created in the `./data/embeddings/` directory, containing the embeddings for each section.
## Output Structure
The output is a JSON file where each section has its embedding added:
```json
{
"1": {
"content": "First section text",
"embeddings": [0.123, 0.456, ...]
},
...
}
```
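
To reuse the saved embeddings, the file can be loaded back and the stored lists converted to numpy vectors. A minimal sketch, assuming you substitute the file name actually produced on your machine:

```python
import json
import numpy as np

# Placeholder path: use the file name actually produced by do_word_embedder
embeddings_path = './data/embeddings/sections_embeddings_2025-1-1-0.json'

with open(embeddings_path, 'r', encoding='utf-8') as f:
    sections = json.load(f)

# Convert the stored lists back into numpy vectors for downstream use
vectors = {sec_id: np.array(sec['embeddings']) for sec_id, sec in sections.items()}
first = next(iter(vectors.values()))
print(f'{len(vectors)} sections, embedding dimension: {first.shape[0]}')
```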
## Notes
- Make sure the folder `./data/embeddings/` exists before running the script.
- The script supports Persian, since the default model is multilingual.