add some readme files

This commit is contained in:
ajokar 2025-08-16 14:24:11 +03:30
parent f137ba54d2
commit b7f15bd846
9 changed files with 218 additions and 1408 deletions

View File

@ -2,19 +2,30 @@
Source for running the various NLP processes on legal document sections
Includes: classification, named-entity recognition, word-embedding extraction, keyword extraction, and text simplification (representation)
"""
from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder
# from p4_keyword_extractor import do_keyword_extract
# from p5_simplifier import do_simplify
from p4_keyword_extractor import do_keyword_extract
from p5_simplifier import do_representation
from elastic_helper import ElasticHelper
import json
def get_sections():
sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
eh_obj = ElasticHelper()
sections = eh_obj.iterateJsonFile(sections_path, True)
sections = convert_to_dict(sections)
# region read all sections from the JSON file
# sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
# eh_obj = ElasticHelper()
# sections = eh_obj.iterateJsonFile(sections_path, True)
# sections = convert_to_dict(sections)
# endregion
# region read a limited number of sections from the JSON file
with open('./data/recent_sections.json', 'r', encoding='utf-8') as file:
sections = json.load(file)
# endregion
return sections
def convert_to_dict(sections):
@ -31,6 +42,15 @@ def main():
# get sections to do nlp processes
sections = get_sections()
# temp_sections = {}
# for i, item in enumerate(sections):
# if i>3:
# break
# temp_sections[item] = sections[item]
# sections = temp_sections
# dictsections = {}
# for item in sections:
# if not item['id'] == 'qs2180272':
@ -46,13 +66,21 @@ def main():
sections = do_ner_recognize(sections)
# 3. word embedder
#sections = do_word_embedder(sections)
sections = do_word_embedder(sections)
# 4. keyword extract
# result_kw = do_keyword_extract(sections)
# keyword_extract_result, sections = do_keyword_extract(sections)
# if keyword_extract_result:
# print(f'keyword extraction finished successfully!')
# 5. simplify
# result_simp = do_simplify(sections)
# representation_result, sections = do_representation(sections)
# if representation_result:
# print(f'representation finished successfully!')
with open(f'./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
data = json.dumps(sections, ensure_ascii=False, indent=2)
output_file.write(data)
print('all nlp processes finished successfully!')

File diff suppressed because it is too large

View File

@ -4,20 +4,18 @@
"""
from transformers import pipeline
from normalizer import cleaning
from elastic_helper import ElasticHelper
import transformers
import json
import datetime
import pandas as pd
from transformers import AutoTokenizer
print(transformers.__version__)
print(f'transformers version: {transformers.__version__}')
# finetuned model for classification path
model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# window_size = tokenizer.model_max_length#512#200
print(f'Classification Model Loaded: {model_checkpoint}')
window_size = 512
"""
(i.e., the window size), because the number of tokens fed into the model is limited by the variable above
@ -27,8 +25,8 @@ window_size = 512
step_size = 350#100
# number of classes we request from the model for each section
Top_k = 4
classifier = pipeline("text-classification", model_checkpoint, framework="pt")
# set device = 0 => to use GPU
classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0)
def get_class(sentences, top_k:int=4):
# sentences = cleaning(sentences)
@ -177,14 +175,15 @@ def do_classify(sections):
for index, id in enumerate(sections):
source = sections[id]['source']
source = sections[id]
classification_result, classification_status, desc = single_section_classification(id, source)
if not classification_status:
print(f'id: {id} classification error. error description: {desc}')
# organize the predicted classes into the best class and the other classes, based on the model's scores, and store them in a dictionary
new_sections_dict[id] = classification_result
# new_sections_dict[id] = classification_result
sections[id]['ai_codes'] = classification_result
""" برای حالت تست که می خواهیم عملکرد مدل کلاسیفایر را ارزیابی کنیم، بدین جهت که تنوعی از قوانین مختلف را بررسی کنیم، عنوان قوانین را ذخیره می کنیم تا از تکرار بررسی سکشن های متعدد از یک قانون پرهیز شود"""
# qanon_title = source['qanon_title']
@ -198,8 +197,8 @@ def do_classify(sections):
print(f'end: {datetime.datetime.now()}')
print('classification finished!')
classified_sections_dict = new_sections_dict
# classified_sections_dict = new_sections_dict
return classified_sections_dict
return sections

View File

@ -15,7 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n
tagger = SequenceTagger.load(model)
print('model read and tagger initialized')
today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'
def prepare_data(ner_obj_list):
ner_data_list = []
@ -208,7 +208,7 @@ def do_ner_recognize(sections):
ner_data_list = prepare_data(ner_obj_list)
sections[id]['ners_v2'] = ner_data_list
print(f'ner process: {index+1}/{len_sections}')
print(f'len_sections ner recognization finished!')
print(f'{len_sections} ner recognition finished!')
return sections

View File

@ -6,7 +6,8 @@ import json
import datetime
import numpy as np
today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
date = datetime.datetime.now()
today = f'{date.year}-{date.month}-{date.day}-{date.hour}'
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
# model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30
@ -19,14 +20,16 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
model = SentenceTransformer(model_name)
def do_word_embedder(sections):
for index, item in enumerate(sections):
embeddings = single_section_embedder(sections[item]['content'])
sections[item]['embeddings'] = embeddings
for index, id in enumerate(sections):
embeddings = single_section_embedder(sections[id]['content'])
sections[id]['embeddings'] = embeddings.tolist()
with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file:
data = json.dumps(sections, ensure_ascii=False)
output_file.write(data)
return sections
def single_section_embedder(sentence):
"""
This method converts the input text into its corresponding vector

View File

@ -15,7 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME"
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#model_id = "meta-llama/Llama-3.1-70B-Instruct"
today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'
bnb_config = BitsAndBytesConfig(
load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16
@ -61,7 +61,7 @@ def generate(formatted_prompt):
# Gemini Prompt
USER_PROMPT = f"""از متن ارائه شده، حداقل {keywords_count} عبارت کلیدی مهم و معنادار را استخراج کنید. خروجی باید به صورت زیر باشد:
یک لیست فارسی
شماره ترتیبی در ابتدای هر عبارت
هیچ عدد یا علامت و نمادی در ابتدا یا انتهای هر عبارت کلیدی قرار نگیرد
هر عبارت در یک خط جداگانه
بدون هیچ توضیح اضافی در ابتدا یا انتهای پاسخ
موارد زیر در استخراج عبارات کلیدی الزامی است:
@ -94,7 +94,7 @@ def generate(formatted_prompt):
max_new_tokens=2048,
eos_token_id=terminators,
do_sample=True,
temperature=0.6,
temperature=0.1,
top_p=0.9,
)
response = outputs[0][input_ids.shape[-1]:]
@ -168,9 +168,9 @@ def do_keyword_extract(sections):
period_ids_text += f"{id} \n"
print(f"section: {counter}-id: {id}")
print(f"section kw extracting: {counter} - id: {id}")
# temp_dict.append(item)
if counter % 1000 == 0:
if counter % 5000 == 0:
outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8')
outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2))
outputfile.close()
@ -196,12 +196,13 @@ def do_keyword_extract(sections):
print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ")
print("end")
return True
operation_result = True
return operation_result, sections
if __name__ == "__main__":
print(f'start: {datetime.datetime.now()}')
sections = get_sections()
sections = do_keyword_extract(sections)
operation_result = do_keyword_extract(sections)
print(f'end: {datetime.datetime.now()}')

View File

@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'
if torch.cuda.is_available():
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
@ -56,17 +56,20 @@ def single_section_representation(content):
outputs = model.generate(
input_ids,
max_new_tokens=500,
max_new_tokens=2048,
eos_token_id=terminators,
do_sample=True,
temperature=0.7,
temperature=0.1,
top_p=0.85,
)
response = outputs[0][input_ids.shape[-1]:]
sentences = tokenizer.decode(response, skip_special_tokens=True)
# remove duplicate sentences
sentences = list(set(sentences))
result = True
desc = 'operation successful'
desc = 'Operation successful'
return result, desc, sentences
except Exception as error:
@ -77,7 +80,7 @@ def single_section_representation(content):
def do_representation(sections):
print(f"start time: {datetime.datetime.now()}")
for index, id in sections:
for index, id in enumerate(sections):
result, desc, sentences = single_section_representation(sections[id]['content'])
if not result:
error_content = f'id: {id} - error: {desc}\n'
@ -85,11 +88,16 @@ def do_representation(sections):
file.write(error_content)
sections[id]['represented_sentences'] = sentences
print(f'representation process. section {index+1}/{len(sections)} - id: {id}')
with open(f'./data/represent/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
outputfile.write(json.dumps(sections, ensure_ascii=False, indent = 4))
print(f"end time: {datetime.datetime.now()}")
print(" *** finished! *** ")
operation_result = True
return operation_result, sections
if __name__ == "__main__":
pass

View File

@ -0,0 +1,71 @@
# Section Classification Script
This project provides a Python script (`classification.py`) for classifying text sections using a fine-tuned transformer model. The script is designed to suggest the most relevant classes for each section of text, which is useful for legal documents, content categorization, and similar NLP tasks.
## Requirements
Before using this script, please install the required libraries:
```bash
pip install transformers pandas
```
You also need a fine-tuned classification model and its tokenizer. Update the `model_checkpoint` path in the script to point to your model.
## How It Works
- The script loads a fine-tuned transformer model for text classification.
- It processes each section of text, possibly splitting long texts into windows to fit the model's input size (see the sketch after this list).
- For each section, it predicts the top classes and saves the results.
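For intuition, here is a minimal sketch of that window splitting and score aggregation. It assumes the module-level `tokenizer` and `classifier` objects the script creates at import time and a recent `transformers` version whose text-classification pipeline accepts `top_k`; the actual `get_window_classes`/`mean_classes` implementation may differ in detail.

```python
from collections import defaultdict

def classify_long_text(text, window_size=512, step_size=350, top_k=4):
    # Tokenize once, then slide a fixed-size window over the token ids.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    scores = defaultdict(list)
    for start in range(0, max(len(tokens) - window_size, 0) + 1, step_size):
        window_text = tokenizer.decode(tokens[start:start + window_size])
        # With top_k set, the pipeline returns a list of {"label", "score"} dicts.
        for pred in classifier(window_text, top_k=top_k, truncation=True):
            scores[pred["label"]].append(pred["score"])
    # Average each label's scores across windows and keep the best top_k.
    averaged = [{"label": lbl, "score": sum(s) / len(s)} for lbl, s in scores.items()]
    return sorted(averaged, key=lambda c: c["score"], reverse=True)[:top_k]
```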
## Main Functions
- `get_class(sentences, top_k=4)`: Classifies a sentence or text and returns the top `k` classes.
- `mean_classes(input_classes)`: Aggregates class results from multiple windows of a long text.
- `get_window_classes(text)`: Handles splitting long texts into windows and aggregates their classification results.
- `single_section_classification(id, section_source)`: Classifies a single section and returns the best and other suggested classes.
- `do_classify(sections)`: Classifies all sections in a dictionary and saves the results to a JSON file.
## Usage Example
Suppose you have your sections data as a dictionary:
```python
sections = {
"1": {"content": "First section text", "other_info": {"full_path": "..."}, "qanon_title": "..."},
"2": {"content": "Second section text", "other_info": {"full_path": "..."}, "qanon_title": "..."}
}
```
You can classify all sections as follows:
```python
from classification import do_classify
result = do_classify(sections)
```
After running, the results will be saved in a JSON file in the `./data/classification/` directory.
## Output Structure
Each section will have a new field `ai_codes` with the classification results:
```json
"1": {
"content": "First section text",
"ai_codes": {
"best-class": {"label": "ClassA", "score": 0.85},
"other-classes": [
{"label": "ClassB", "score": 0.10},
{"label": "ClassC", "score": 0.05}
]
}
}
```
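As a quick check, the best predicted class can be read back from the saved file roughly like this (the file name below is only an illustration, not the script's actual output name):

```python
import json

# Hypothetical file name; check ./data/classification/ for the file actually produced.
with open('./data/classification/classified_sections.json', encoding='utf-8') as f:
    results = json.load(f)

best = results["1"]["ai_codes"]["best-class"]
print(best["label"], best["score"])
```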
## Notes
- Make sure the model path in `model_checkpoint` is correct and the model files are available.
- The script supports Persian and other languages, depending on your model.
- The output JSON file will be saved in `./data/classification/`.

View File

@ -0,0 +1,70 @@
# Sentence Embedding Generator
This project provides a Python script (`embedding.py`) for generating sentence embeddings using the [Sentence Transformers](https://www.sbert.net) library.
## Requirements
Before using this script, please install the required libraries:
```bash
pip install sentence-transformers numpy
```
## How It Works
- The script uses the pre-trained model: `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
- There are two main functions:
- `single_section_embedder(sentence)`: Takes a sentence (string) and returns its embedding as a vector (see the sketch after this list).
- `do_word_embedder(sections)`: Takes a dictionary of sections (each with a `content` field), generates embeddings for each section, and saves the results as a JSON file.
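Under the hood this is essentially a thin wrapper around `SentenceTransformer.encode`; a minimal sketch of the idea (the script's own implementation may differ slightly):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

def single_section_embedder(sentence):
    # encode() returns a numpy array (384 dimensions for this MiniLM model).
    return model.encode(sentence)
```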
## Usage
### 1. Get Embedding for a Single Sentence
```python
from embedding import single_section_embedder
sentence = "This is a sample sentence."
embedding = single_section_embedder(sentence)
print(embedding)
```
### 2. Generate Embeddings for Multiple Sections and Save to File
Suppose your data is structured like this:
```python
sections = {
"1": {"content": "First section text"},
"2": {"content": "Second section text"}
}
```
You can generate and save embeddings as follows:
```python
from embedding import do_word_embedder
result = do_word_embedder(sections)
```
After running, a file named like `sections_embeddings_YEAR-MONTH-DAY-HOUR.json` will be created in the `./data/embeddings/` directory, containing the embeddings for each section.
## Output Structure
The output is a JSON file where each section has its embedding added:
```json
{
"1": {
"content": "First section text",
"embeddings": [0.123, 0.456, ...]
},
...
}
```
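To reuse the stored vectors later, the file can be loaded back and each list converted to a numpy array, for example:

```python
import json
import numpy as np

# Replace the file name with the one actually produced for your run.
with open('./data/embeddings/sections_embeddings_YEAR-MONTH-DAY-HOUR.json', encoding='utf-8') as f:
    sections = json.load(f)

# Convert the stored lists back into numpy vectors, keyed by section id.
vectors = {sec_id: np.array(sec['embeddings']) for sec_id, sec in sections.items()}
```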
## Notes
- Make sure the folder `./data/embeddings/` exists before running the script.
- The script supports the Persian language (the underlying model is multilingual).