add some readme files
parent f137ba54d2
commit b7f15bd846
@@ -2,19 +2,30 @@
Source for running various NLP processes on the sections of legal documents.
Includes: classification, named-entity recognition, word-embedding extraction, keyword extraction, and text simplification (representation).
"""

from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder
# from p4_keyword_extractor import do_keyword_extract
# from p5_simplifier import do_simplify
from p4_keyword_extractor import do_keyword_extract
from p5_simplifier import do_representation

from elastic_helper import ElasticHelper
import json

def get_sections():
    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    eh_obj = ElasticHelper()
    sections = eh_obj.iterateJsonFile(sections_path, True)
    sections = convert_to_dict(sections)

    # region read all sections from the JSON file
    # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
    # eh_obj = ElasticHelper()
    # sections = eh_obj.iterateJsonFile(sections_path, True)
    # sections = convert_to_dict(sections)
    # endregion

    # region read a limited number of sections from a JSON file
    with open('./data/recent_sections.json', 'r', encoding='utf-8') as file:
        sections = json.load(file)
    # endregion

    return sections

def convert_to_dict(sections):

@@ -31,6 +42,15 @@ def main():
    # get sections to do nlp processes
    sections = get_sections()

    # temp_sections = {}
    # for i, item in enumerate(sections):
    #     if i>3:
    #         break
    #     temp_sections[item] = sections[item]

    # sections = temp_sections

    # dictsections = {}
    # for item in sections:
    #     if not item['id'] == 'qs2180272':

@@ -46,13 +66,21 @@ def main():
    sections = do_ner_recognize(sections)

    # 3. word embedder
    # sections = do_word_embedder(sections)
    sections = do_word_embedder(sections)

    # 4. keyword extract
    # result_kw = do_keyword_extract(sections)
    # keyword_extract_result, sections = do_keyword_extract(sections)
    # if keyword_extract_result:
    #     print(f'keyword extraction finished successfully!')

    # 5. simplify
    # result_simp = do_simplify(sections)
    # representation_result, sections = do_representation(sections)
    # if representation_result:
    #     print(f'representation finished successfully!')

    with open(f'./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
        data = json.dumps(sections, ensure_ascii=False, indent=2)
        output_file.write(data)

    print('all nlp processes finished successfully!')
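Taken together, these hunks enable the word-embedding step in the live pipeline and switch the keyword and representation steps to their new tuple-returning entry points (still commented out). A minimal sketch of the enabled flow, assuming the imports and signatures shown above; the wrapper name below is only illustrative:

```python
# Sketch only: the enabled portion of main() as this diff leaves it.
import json
from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder

def run_enabled_steps(sections: dict) -> dict:
    sections = do_classify(sections)        # 1. adds 'ai_codes' to each section
    sections = do_ner_recognize(sections)   # 2. adds 'ners_v2' to each section
    sections = do_word_embedder(sections)   # 3. adds 'embeddings' to each section
    with open('./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
        output_file.write(json.dumps(sections, ensure_ascii=False, indent=2))
    return sections
```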
normalizer copy.py (1370 changed lines; file diff suppressed because it is too large)
@@ -4,20 +4,18 @@
"""
from transformers import pipeline
from normalizer import cleaning
from elastic_helper import ElasticHelper
import transformers
import json
import datetime
import pandas as pd
from transformers import AutoTokenizer
print(transformers.__version__)
print(f'transformers version: {transformers.__version__}')

# finetuned model for classification path
model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# window_size = tokenizer.model_max_length#512#200
print(f'Classification Model Loaded: {model_checkpoint}')
window_size = 512
"""
(i.e., the window size) because the number of tokens fed to the model is limited by the variable above

@@ -27,8 +25,8 @@ window_size = 512
step_size = 350#100
# number of classes requested from the model for each section
Top_k = 4

classifier = pipeline("text-classification", model_checkpoint, framework="pt")
# set device = 0 => to use GPU
classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0)

def get_class(sentences, top_k:int=4):
    # sentences = cleaning(sentences)

@@ -177,14 +175,15 @@ def do_classify(sections):

    for index, id in enumerate(sections):

        source = sections[id]['source']
        source = sections[id]
        classification_result, classification_status, desc = single_section_classification(id, source)

        if not classification_status:
            print(f'id: {id} classification error. error description: {desc}')

        # arrange the predicted classes into a best class and other classes, based on the model's score, and store them in the dictionary
        new_sections_dict[id] = classification_result
        # new_sections_dict[id] = classification_result
        sections[id]['ai_codes'] = classification_result

        """For test runs that evaluate the classifier, the law titles are stored so that a variety of different laws is covered and repeated checks of many sections from the same law are avoided."""
        # qanon_title = source['qanon_title']

@@ -198,8 +197,8 @@ def do_classify(sections):
    print(f'end: {datetime.datetime.now()}')
    print('classification finished!')

    classified_sections_dict = new_sections_dict
    # classified_sections_dict = new_sections_dict

    return classified_sections_dict
    return sections
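The second hunk moves the classifier onto the GPU. A hedged sketch of how the resulting pipeline object is typically queried for the top classes; the call pattern is standard transformers usage, and the exact labels depend on the fine-tuned checkpoint:

```python
from transformers import pipeline

# fine-tuned classifier path taken from the hunk above
model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680'

classifier = pipeline("text-classification", model_checkpoint,
                      framework="pt", device=0)  # device=0 -> first GPU

# top_k asks the pipeline for the k highest-scoring labels
preds = classifier("متن نمونه از یک ماده قانونی", top_k=4)  # a sample Persian legal clause
# preds looks like: [{'label': 'ClassA', 'score': 0.85}, {'label': 'ClassB', 'score': 0.10}, ...]
```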
@@ -15,7 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n
tagger = SequenceTagger.load(model)
print('model read and tagger initialized')

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

def prepare_data(ner_obj_list):
    ner_data_list = []

@@ -208,7 +208,7 @@ def do_ner_recognize(sections):
        ner_data_list = prepare_data(ner_obj_list)
        sections[id]['ners_v2'] = ner_data_list
        print(f'ner process: {index+1}/{len_sections}')
    print(f'len_sections ner recognization finished!')
    print(f'{len_sections} ner recognization finished!')

    return sections
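The same timestamp fix recurs in several of the files in this commit. A small illustration of why the old pattern was wrong: attributes accessed on the `datetime.datetime` class are descriptors, not today's values.

```python
import datetime

# Old pattern: formats the class attribute itself, not the current year
broken = f'{datetime.datetime.year}'
# -> something like "<attribute 'year' of 'datetime.date' objects>"

# Fixed pattern from the hunk above: take the values from now()
now = datetime.datetime.now()
today = f'{now.year}-{now.month}-{now.day}-{now.hour}'  # e.g. "2025-7-23-14"
```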
@@ -6,7 +6,8 @@ import json
import datetime
import numpy as np

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
date = datetime.datetime.now()
today = f'{date.year}-{date.month}-{date.day}-{date.hour}'

model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
# model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30

@@ -19,14 +20,16 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
model = SentenceTransformer(model_name)

def do_word_embedder(sections):
    for index, item in enumerate(sections):
        embeddings = single_section_embedder(sections[item]['content'])
        sections[item]['embeddings'] = embeddings
    for index, id in enumerate(sections):
        embeddings = single_section_embedder(sections[id]['content'])
        sections[id]['embeddings'] = embeddings.tolist()

    with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file:
        data = json.dumps(sections, ensure_ascii=False)
        output_file.write(data)

    return sections

def single_section_embedder(sentence):
    """
    This method converts the input text into its corresponding embedding vector.
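The added `.tolist()` matters because the encoder returns a NumPy array, which `json.dumps` cannot serialize. A minimal illustration:

```python
import json
import numpy as np

embedding = np.array([0.123, 0.456, 0.789])

# json.dumps({"embeddings": embedding})               # TypeError: ndarray is not JSON serializable
print(json.dumps({"embeddings": embedding.tolist()})) # works: plain Python floats
```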
@@ -15,7 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME"
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#model_id = "meta-llama/Llama-3.1-70B-Instruct"

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16

@@ -61,7 +61,7 @@ def generate(formatted_prompt):
# Gemini Prompt
USER_PROMPT = f"""From the provided text, extract at least {keywords_count} important, meaningful key phrases. The output must be:
• a Persian list
• with a sequence number at the start of each phrase
• with no digits, marks, or symbols at the beginning or end of any key phrase
• each phrase on its own line
• with no extra explanation before or after the answer
The following requirements apply when extracting the key phrases:

@@ -94,7 +94,7 @@ def generate(formatted_prompt):
        max_new_tokens=2048,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        temperature=0.1,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]

@@ -168,9 +168,9 @@ def do_keyword_extract(sections):

        period_ids_text += f"{id} \n"

        print(f"section: {counter}-id: {id}")
        print(f"section kw extracting: {counter} - id: {id}")
        # temp_dict.append(item)
        if counter % 1000 == 0:
        if counter % 5000 == 0:
            outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8')
            outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2))
            outputfile.close()

@@ -196,12 +196,13 @@ def do_keyword_extract(sections):
    print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ")
    print("end")

    return True
    operation_result = True
    return operation_result, sections

if __name__ == "__main__":
    print(f'start: {datetime.datetime.now()}')
    sections = get_sections()

    sections = do_keyword_extract(sections)
    operation_result = do_keyword_extract(sections)

    print(f'end: {datetime.datetime.now()}')
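The last hunk changes the return contract of `do_keyword_extract` from a bare boolean to a `(status, sections)` tuple, matching the still-commented call site in the main script. A caller-side sketch under that assumption; the input dictionary here is hypothetical:

```python
# Sketch: consuming the new (status, sections) return of do_keyword_extract
from p4_keyword_extractor import do_keyword_extract

sections = {"qs1": {"content": "..."}}  # hypothetical minimal input
keyword_extract_result, sections = do_keyword_extract(sections)
if keyword_extract_result:
    print('keyword extraction finished successfully!')
```

Note that the `__main__` block shown above still assigns the whole tuple to `operation_result` rather than unpacking it.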
@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json

today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

if torch.cuda.is_available():
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

@@ -56,17 +56,20 @@ def single_section_representation(content):

        outputs = model.generate(
            input_ids,
            max_new_tokens=500,
            max_new_tokens=2048,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.7,
            temperature=0.1,
            top_p=0.85,
        )

        response = outputs[0][input_ids.shape[-1]:]
        sentences = tokenizer.decode(response, skip_special_tokens=True)
        # remove duplicate sentences
        sentences = list(set(sentences))

        result = True
        desc = 'operation successful'
        desc = 'Operation successful'
        return result, desc, sentences

    except Exception as error:

@@ -77,7 +80,7 @@ def single_section_representation(content):
def do_representation(sections):
    print(f"start time: {datetime.datetime.now()}")

    for index, id in sections:
    for index, id in enumerate(sections):
        result, desc, sentences = single_section_representation(sections[id]['content'])
        if not result:
            error_content = f'id: {id} - error: {desc}\n'

@@ -85,11 +88,16 @@ def do_representation(sections):
            file.write(error_content)

        sections[id]['represented_sentences'] = sentences
        print(f'representation process. section {index+1}/{len(sections)} - id: {id}')

    with open(f'./data/represent/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
        outputfile.write(json.dumps(sections, ensure_ascii=False, indent = 4))

    print(f"end time: {datetime.datetime.now()}")
    print(" *** finished! *** ")

    operation_result = True
    return operation_result, sections

if __name__ == "__main__":
    pass
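The loop-header fix matters because iterating a dict directly yields only its keys, so the old two-variable unpack fails on the first key. A tiny illustration:

```python
sections = {"qs1": {"content": "..."}, "qs2": {"content": "..."}}

# for index, id in sections:    # ValueError: tries to unpack the key string "qs1"
#     ...

for index, id in enumerate(sections):  # enumerate supplies the running index
    print(index, id)                   # 0 qs1 / 1 qs2
```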
readme/readme-classifier.md (new file, 71 lines)

@@ -0,0 +1,71 @@
# Section Classification Script

This project provides a Python script (`classification.py`) for classifying text sections with a fine-tuned transformer model. The script suggests the most relevant classes for each section of text, which is useful for legal documents, content categorization, and similar NLP tasks.

## Requirements

Before using this script, install the required libraries:

```bash
pip install transformers pandas
```

You also need a fine-tuned classification model and its tokenizer. Update the `model_checkpoint` path in the script to point to your model.

## How It Works

- The script loads a fine-tuned transformer model for text classification.
- It processes each section of text, splitting long texts into overlapping windows so they fit the model's input size (see the sketch after the function list below).
- For each section, it predicts the top classes and saves the results.

## Main Functions

- `get_class(sentences, top_k=4)`: Classifies a sentence or text and returns the top `k` classes.
- `mean_classes(input_classes)`: Aggregates class results from multiple windows of a long text.
- `get_window_classes(text)`: Splits long texts into windows and aggregates their classification results.
- `single_section_classification(id, section_source)`: Classifies a single section and returns the best class and the other suggested classes.
- `do_classify(sections)`: Classifies all sections in a dictionary and saves the results to a JSON file.
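For long sections, the script slides a fixed-size window over the tokenized text (the companion diff in this commit uses `window_size = 512` and `step_size = 350`, so consecutive windows overlap) and averages the per-window scores. A simplified sketch of that idea, not the exact implementation:

```python
def get_window_classes_sketch(text, window_size=512, step_size=350, top_k=4):
    """Classify overlapping token windows of a long text and aggregate the scores."""
    # assumes the module-level `tokenizer` and the helpers `get_class` /
    # `mean_classes` defined in classification.py
    token_ids = tokenizer(text)["input_ids"]
    if len(token_ids) <= window_size:
        return get_class(text, top_k=top_k)

    per_window = []
    for start in range(0, len(token_ids), step_size):
        window_text = tokenizer.decode(token_ids[start:start + window_size],
                                       skip_special_tokens=True)
        per_window.append(get_class(window_text, top_k=top_k))
        if start + window_size >= len(token_ids):
            break
    return mean_classes(per_window)  # average the scores across windows
```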
## Usage Example

Suppose you have your sections data as a dictionary:

```python
sections = {
    "1": {"content": "First section text", "other_info": {"full_path": "..."}, "qanon_title": "..."},
    "2": {"content": "Second section text", "other_info": {"full_path": "..."}, "qanon_title": "..."}
}
```

You can classify all sections as follows:

```python
from classification import do_classify

result = do_classify(sections)
```

After running, the results will be saved in a JSON file in the `./data/classification/` directory.

## Output Structure

Each section will have a new field `ai_codes` with the classification results:

```json
"1": {
    "content": "First section text",
    "ai_codes": {
        "best-class": {"label": "ClassA", "score": 0.85},
        "other-classes": [
            {"label": "ClassB", "score": 0.10},
            {"label": "ClassC", "score": 0.05}
        ]
    }
}
```

## Notes

- Make sure the model path in `model_checkpoint` is correct and the model files are available.
- The script supports Persian and other languages, depending on your model.
- The output JSON file will be saved in `./data/classification/`.
readme/readme-words-embedder.md (new file, 70 lines)

@@ -0,0 +1,70 @@
# Sentence Embedding Generator

This project provides a Python script (`embedding.py`) for generating sentence embeddings using the Sentence Transformers library.

## Requirements

Before using this script, install the required libraries:

```bash
pip install sentence-transformers numpy
```

## How It Works

- The script uses the pre-trained model `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
- There are two main functions:
  - `single_section_embedder(sentence)`: Takes a sentence (string) and returns its embedding as a vector (see the sketch after this list).
  - `do_word_embedder(sections)`: Takes a dictionary of sections (each with a `content` field), generates embeddings for each section, and saves the results as a JSON file.
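For orientation, the embedding returned by the encoder is a NumPy vector (384 dimensions for this MiniLM model), which is why the script converts it with `.tolist()` before writing JSON. A short sketch of that behaviour:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

vector = model.encode("این یک جمله نمونه است.")  # a Persian sample sentence
print(type(vector), vector.shape)                # <class 'numpy.ndarray'> (384,)
print(vector.tolist()[:3])                       # JSON-friendly plain floats
```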
## Usage

### 1. Get Embedding for a Single Sentence

```python
from embedding import single_section_embedder

sentence = "This is a sample sentence."
embedding = single_section_embedder(sentence)
print(embedding)
```

### 2. Generate Embeddings for Multiple Sections and Save to File

Suppose your data is structured like this:

```python
sections = {
    "1": {"content": "First section text"},
    "2": {"content": "Second section text"}
}
```

You can generate and save embeddings as follows:

```python
from embedding import do_word_embedder

result = do_word_embedder(sections)
```

After running, a file named like `sections_embeddings_YEAR-MONTH-DAY-HOUR.json` will be created in the `./data/embeddings/` directory, containing the embeddings for each section.

## Output Structure

The output is a JSON file where each section has its embedding added:

```json
{
    "1": {
        "content": "First section text",
        "embeddings": [0.123, 0.456, ...]
    },
    ...
}
```

## Notes

- Make sure the folder `./data/embeddings/` exists before running the script.
- The script supports the Persian language.
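If you later want to work with the saved vectors as arrays again, they can be loaded back from the output file. A short sketch; the timestamped file name below is only an example of the pattern described above:

```python
import json
import numpy as np

with open('./data/embeddings/sections_embeddings_2025-7-23-14.json', encoding='utf-8') as f:
    sections = json.load(f)

# One row per section, in dictionary insertion order
matrix = np.array([sec["embeddings"] for sec in sections.values()])
print(matrix.shape)  # (number_of_sections, 384)
```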