add some readme files
This commit is contained in: parent f137ba54d2, commit b7f15bd846
@@ -2,19 +2,30 @@
 Source for running various processing tasks on legal text components.
 Includes: classification, named-entity recognition, word-vector extraction, keyword extraction, and text simplification (representation).
 """

 from p1_classifier import do_classify
 from p2_ner_recognizer import do_ner_recognize
 from p3_words_embedder import do_word_embedder
-# from p4_keyword_extractor import do_keyword_extract
+from p4_keyword_extractor import do_keyword_extract
-# from p5_simplifier import do_simplify
+from p5_simplifier import do_representation

 from elastic_helper import ElasticHelper
+import json


 def get_sections():
-    sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
-    eh_obj = ElasticHelper()
-    sections = eh_obj.iterateJsonFile(sections_path, True)
-    sections = convert_to_dict(sections)
+    # region reading all sections from the JSON file
+    # sections_path = "/home/gpu/data_11/14040423/mj_qa_section.zip"
+    # eh_obj = ElasticHelper()
+    # sections = eh_obj.iterateJsonFile(sections_path, True)
+    # sections = convert_to_dict(sections)
+    # endregion
+
+    # region reading a limited number of sections from a JSON file
+    with open('./data/recent_sections.json', 'r', encoding='utf-8') as file:
+        sections = json.load(file)
+    # endregion

     return sections


 def convert_to_dict(sections):
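Elsewhere in this commit the pipeline indexes `sections` by id (for example `sections[id]['content']` and `sections[id]['ai_codes']`), so `convert_to_dict` presumably turns the `iterateJsonFile` iterator into an id-keyed dictionary. A hypothetical sketch consistent with that usage; the helper's real field names and return shape are not shown in this diff:

```python
def convert_to_dict_sketch(hits):
    """Hypothetical helper: key each record by its id so later stages can do sections[id][...]."""
    sections = {}
    for hit in hits:
        # assumed record shape: {'id': ..., 'source': {...}}; the actual shape is not shown in this diff
        sections[hit['id']] = hit.get('source', hit)
    return sections
```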
@@ -31,6 +42,15 @@ def main():
     # get sections to do nlp processes
     sections = get_sections()

+    # temp_sections = {}
+    # for i, item in enumerate(sections):
+    #     if i>3:
+    #         break
+    #     temp_sections[item] = sections[item]
+
+    # sections = temp_sections
+
     # dictsections = {}
     # for item in sections:
     #     if not item['id'] == 'qs2180272':
@@ -46,13 +66,21 @@ def main():
     sections = do_ner_recognize(sections)

     # 3. word embedder
-    #sections = do_word_embedder(sections)
+    sections = do_word_embedder(sections)

     # 4. keyword extract
-    # result_kw = do_keyword_extract(sections)
+    # keyword_extract_result, sections = do_keyword_extract(sections)
+    # if keyword_extract_result:
+    #     print(f'keyword extraction finished successfully!')

     # 5. simplify
-    # result_simp = do_simplify(sections)
+    # representation_result, sections = do_representation(sections)
+    # if representation_result:
+    #     print(f'representation finished successfully!')

+    with open(f'./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
+        data = json.dumps(sections, ensure_ascii=False, indent=2)
+        output_file.write(data)

     print('all nlp processes finished successfully!')
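Pieced together, these hunks describe a `main()` that runs the enabled stages in order and then dumps the enriched sections to disk. A sketch of that flow, assuming it lives in the same module as `get_sections` above and that the classification call (not fully shown in this diff) follows the same `sections = do_x(sections)` pattern as the other stages:

```python
import json

from p1_classifier import do_classify
from p2_ner_recognizer import do_ner_recognize
from p3_words_embedder import do_word_embedder


def main():
    # 0. load the sections to process (get_sections is shown in the first hunk)
    sections = get_sections()

    # 1. classification -> sections[id]['ai_codes']
    sections = do_classify(sections)

    # 2. named-entity recognition -> sections[id]['ners_v2']
    sections = do_ner_recognize(sections)

    # 3. word embeddings -> sections[id]['embeddings']
    sections = do_word_embedder(sections)

    # 4./5. keyword extraction and representation remain commented out in this commit

    # persist everything that was attached to the sections
    with open('./data/sections_full_metadata.json', 'w', encoding='utf-8') as output_file:
        output_file.write(json.dumps(sections, ensure_ascii=False, indent=2))

    print('all nlp processes finished successfully!')
```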
normalizer copy.py (1370 lines changed): file diff suppressed because it is too large.
@@ -4,20 +4,18 @@
 """
 from transformers import pipeline
 from normalizer import cleaning
-from elastic_helper import ElasticHelper
 import transformers
 import json
 import datetime
 import pandas as pd
 from transformers import AutoTokenizer
-print(transformers.__version__)
+print(f'transformers version: {transformers.__version__}')

 # finetuned model for classification path
 model_checkpoint = './models/classifier/findtuned_classification_hoosh_with_path_v2__30/checkpoint-1680'

 tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+print(f'Classification Model Loaded: {model_checkpoint}')
-# window_size = tokenizer.model_max_length#512#200
 window_size = 512
 """
 (i.e. the window size) because the number of input tokens to the model is limited to the variable above

@@ -27,8 +25,8 @@ window_size = 512
 step_size = 350#100
 # the number of classes we request from the model for each section
 Top_k = 4
-classifier = pipeline("text-classification", model_checkpoint, framework="pt")
+# set device = 0 => to use GPU
+classifier = pipeline("text-classification", model_checkpoint, framework="pt", device=0)

 def get_class(sentences, top_k:int=4):
     # sentences = cleaning(sentences)

@@ -177,14 +175,15 @@ def do_classify(sections):

     for index, id in enumerate(sections):

-        source = sections[id]['source']
+        source = sections[id]
         classification_result, classification_status, desc = single_section_classification(id, source)

         if not classification_status:
             print(f'id: {id} classification error. error description: {desc}')

         # organize the predicted classes into the best class and the other classes, based on the model's estimated score, and store them in a dictionary
-        new_sections_dict[id] = classification_result
+        # new_sections_dict[id] = classification_result
+        sections[id]['ai_codes'] = classification_result

         """ For test mode, when we want to evaluate the classifier model's performance, we store the law titles so that a variety of different laws is examined and repeated checks of many sections from the same law are avoided. """
         # qanon_title = source['qanon_title']

@@ -198,8 +197,8 @@ def do_classify(sections):
     print(f'end: {datetime.datetime.now()}')
     print('classification finished!')

-    classified_sections_dict = new_sections_dict
+    # classified_sections_dict = new_sections_dict

-    return classified_sections_dict
+    return sections
@@ -15,7 +15,7 @@ model = "./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n
 tagger = SequenceTagger.load(model)
 print('model read and tagger initialized')

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

 def prepare_data(ner_obj_list):
     ner_data_list = []

@@ -208,7 +208,7 @@ def do_ner_recognize(sections):
         ner_data_list = prepare_data(ner_obj_list)
         sections[id]['ners_v2'] = ner_data_list
         print(f'ner process: {index+1}/{len_sections}')
-    print(f'len_sections ner recognization finished!')
+    print(f'{len_sections} ner recognization finished!')

     return sections
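For readers unfamiliar with flair, the `SequenceTagger` loaded above is typically queried one sentence at a time. A minimal sketch of that usage; the project's own `ner_obj_list` construction and the output format of `prepare_data` are not shown in this diff, so the field names below are assumptions:

```python
from flair.data import Sentence
from flair.models import SequenceTagger

# model path copied from the hunk header above (it is truncated there, left as-is)
tagger = SequenceTagger.load("./models/ner/2025-07-22--20-44-37--HooshvareLab--bert-fa-base-uncased-n")


def tag_text(text: str) -> list[dict]:
    """Run the tagger on one section and return simple span dicts (illustrative shape only)."""
    sentence = Sentence(text)
    tagger.predict(sentence)
    return [
        {
            "text": span.text,
            "tag": span.get_label("ner").value,
            "score": span.get_label("ner").score,
        }
        for span in sentence.get_spans("ner")
    ]
```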
@@ -6,7 +6,8 @@ import json
 import datetime
 import numpy as np

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+date = datetime.datetime.now()
+today = f'{date.year}-{date.month}-{date.day}-{date.hour}'

 model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
 # model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'#87-30

@@ -19,14 +20,16 @@ model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'#89-25
 model = SentenceTransformer(model_name)

 def do_word_embedder(sections):
-    for index, item in enumerate(sections):
-        embeddings = single_section_embedder(sections[item]['content'])
-        sections[item]['embeddings'] = embeddings
+    for index, id in enumerate(sections):
+        embeddings = single_section_embedder(sections[id]['content'])
+        sections[id]['embeddings'] = embeddings.tolist()

     with open(f'./data/embeddings/sections_embeddings_{today}.json', 'w', encoding='utf-8') as output_file:
         data = json.dumps(sections, ensure_ascii=False)
         output_file.write(data)

+    return sections

 def single_section_embedder(sentence):
     """
     This method converts the input text into its corresponding vector
@@ -15,7 +15,7 @@ os.environ['HF_HOME'] = "/home/admin/HFHOME"
 model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 #model_id = "meta-llama/Llama-3.1-70B-Instruct"

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

 bnb_config = BitsAndBytesConfig(
     load_in_8bit=True, bnb_8bit_use_double_quant=True, bnb_8bit_quant_type="nf8", bnb_8bit_compute_dtype=torch.bfloat16

@@ -61,7 +61,7 @@ def generate(formatted_prompt):
 # Gemini Prompt
 USER_PROMPT = f"""From the provided text, extract at least {keywords_count} important and meaningful key phrases. The output must be as follows:
 • a Persian list
-• a sequence number at the beginning of each phrase
+• no digit, mark, or symbol at the beginning or end of any key phrase
 • each phrase on a separate line
 • no extra explanation at the beginning or end of the response
 The following are required when extracting key phrases:

@@ -94,7 +94,7 @@ def generate(formatted_prompt):
         max_new_tokens=2048,
         eos_token_id=terminators,
         do_sample=True,
-        temperature=0.6,
+        temperature=0.1,
         top_p=0.9,
     )
     response = outputs[0][input_ids.shape[-1]:]
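These generation parameters follow the standard transformers chat pattern for Llama-3 models in 8-bit quantization. A self-contained sketch of how such a pipeline is typically assembled; the prompt variables and the `terminators` construction are assumptions, since the surrounding code is not part of this diff:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto"
)

# the project's actual SYS_PROMPT / USER_PROMPT wiring is not shown in this diff
messages = [
    {"role": "system", "content": "You extract key phrases from legal text."},
    {"role": "user", "content": "..."},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Llama-3 uses <|eot_id|> as its end-of-turn marker
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = model.generate(
    input_ids,
    max_new_tokens=2048,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.1,  # value after this commit
    top_p=0.9,
)
# keep only the newly generated tokens, as in the hunk above
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))
```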
@@ -168,9 +168,9 @@ def do_keyword_extract(sections):

         period_ids_text += f"{id} \n"

-        print(f"section: {counter}-id: {id}")
+        print(f"section kw extracting: {counter} - id: {id}")
         # temp_dict.append(item)
-        if counter % 1000 == 0:
+        if counter % 5000 == 0:
             outputfile = open(f'./data/keyword/sections_kw_llama8b_{str(file_counter)}_{today}.json', "a+", encoding='utf-8')
             outputfile.write(json.dumps(period_sections, ensure_ascii=False, indent=2))
             outputfile.close()

@@ -196,12 +196,13 @@ def do_keyword_extract(sections):
     print(f"elapsed time: {(end_time-start_time)/86400} Days!!! ")
     print("end")

-    return True
+    operation_result = True
+    return operation_result, sections


 if __name__ == "__main__":
     print(f'start: {datetime.datetime.now()}')
     sections = get_sections()

-    sections = do_keyword_extract(sections)
+    operation_result = do_keyword_extract(sections)

     print(f'end: {datetime.datetime.now()}')
@@ -6,7 +6,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 import json

-today = f'{datetime.datetime.year}-{datetime.datetime.month}-{datetime.datetime.day}-{datetime.datetime.hour}'
+today = f'{datetime.datetime.now().year}-{datetime.datetime.now().month}-{datetime.datetime.now().day}-{datetime.datetime.now().hour}'

 if torch.cuda.is_available():
     model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

@@ -56,17 +56,20 @@ def single_section_representation(content):

         outputs = model.generate(
             input_ids,
-            max_new_tokens=500,
+            max_new_tokens=2048,
             eos_token_id=terminators,
             do_sample=True,
-            temperature=0.7,
+            temperature=0.1,
             top_p=0.85,
         )

         response = outputs[0][input_ids.shape[-1]:]
         sentences = tokenizer.decode(response, skip_special_tokens=True)
+        # remove duplicate sentences
+        sentences = list(set(sentences))

         result = True
-        desc = 'operation successful'
+        desc = 'Operation successful'
         return result, desc, sentences

     except Exception as error:
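One note on the new de-duplication lines: `tokenizer.decode` returns a single string, so `set(sentences)` would iterate over characters rather than sentences. If the intent of the new comment is to drop repeated sentences, a line-based variant along these lines would match it (an illustrative sketch, not the committed code):

```python
def dedupe_sentences(decoded_text: str) -> list[str]:
    """Split the decoded model output into lines and drop duplicates while keeping order."""
    lines = [line.strip() for line in decoded_text.splitlines() if line.strip()]
    seen = set()
    unique = []
    for line in lines:
        if line not in seen:
            seen.add(line)
            unique.append(line)
    return unique
```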
@@ -77,7 +80,7 @@ def single_section_representation(content):
 def do_representation(sections):
     print(f"start time: {datetime.datetime.now()}")

-    for index, id in sections:
+    for index, id in enumerate(sections):
         result, desc, sentences = single_section_representation(sections[id]['content'])
         if not result:
             error_content = f'id: {id} - error: {desc}\n'

@@ -85,11 +88,16 @@ def do_representation(sections):
             file.write(error_content)

         sections[id]['represented_sentences'] = sentences
+        print(f'representation process. section {index+1}/{len(sections)} - id: {id}')

+    with open(f'./data/represent/sections_represent_llama8b_{today}.json', "w", encoding='utf-8') as outputfile:
+        outputfile.write(json.dumps(sections, ensure_ascii=False, indent = 4))

     print(f"end time: {datetime.datetime.now()}")
     print(" *** finished! *** ")

+    operation_result = True
+    return operation_result, sections

 if __name__ == "__main__":
     pass
readme/readme-classifier.md (new file, 71 lines)
@@ -0,0 +1,71 @@
# Section Classification Script

This project provides a Python script (`classification.py`) for classifying text sections using a fine-tuned transformer model. The script is designed to suggest the most relevant classes for each section of text, which is useful for legal documents, content categorization, and similar NLP tasks.

## Requirements

Before using this script, please install the required libraries:

```bash
pip install transformers pandas
```

You also need a fine-tuned classification model and its tokenizer. Update the `model_checkpoint` path in the script to point to your model.

## How It Works

- The script loads a fine-tuned transformer model for text classification.
- It processes each section of text, possibly splitting long texts into windows to fit the model's input size.
- For each section, it predicts the top classes and saves the results.

## Main Functions

- `get_class(sentences, top_k=4)`: Classifies a sentence or text and returns the top `k` classes.
- `mean_classes(input_classes)`: Aggregates class results from multiple windows of a long text.
- `get_window_classes(text)`: Handles splitting long texts into windows and aggregates their classification results (see the sketch after this list).
- `single_section_classification(id, section_source)`: Classifies a single section and returns the best and other suggested classes.
- `do_classify(sections)`: Classifies all sections in a dictionary and saves the results to a JSON file.
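The windowing step is easiest to see in code. The sketch below illustrates the approach with the `window_size`, `step_size`, and `top_k` values used in the script; it is an illustration of the idea, not the script's exact implementation:

```python
from collections import defaultdict

window_size = 512  # maximum tokens per window, as in the script
step_size = 350    # stride between window starts, as in the script
top_k = 4          # classes requested per window, as in the script


def get_window_classes_sketch(text, tokenizer, classifier):
    """Classify overlapping token windows of a long text and average the per-class scores."""
    tokens = tokenizer.tokenize(text)
    windows = [
        tokenizer.convert_tokens_to_string(tokens[start:start + window_size])
        for start in range(0, max(len(tokens) - window_size, 0) + 1, step_size)
    ]

    scores = defaultdict(list)
    for window in windows:
        for result in classifier(window, top_k=top_k, truncation=True):
            scores[result["label"]].append(result["score"])

    averaged = [
        {"label": label, "score": sum(vals) / len(vals)}
        for label, vals in scores.items()
    ]
    return sorted(averaged, key=lambda c: c["score"], reverse=True)[:top_k]
```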
## Usage Example

Suppose you have your sections data as a dictionary:

```python
sections = {
    "1": {"content": "First section text", "other_info": {"full_path": "..."}, "qanon_title": "..."},
    "2": {"content": "Second section text", "other_info": {"full_path": "..."}, "qanon_title": "..."}
}
```

You can classify all sections as follows:

```python
from classification import do_classify

result = do_classify(sections)
```

After running, the results will be saved in a JSON file in the `./data/classification/` directory.

## Output Structure

Each section will have a new field `ai_codes` with the classification results:

```json
"1": {
    "content": "First section text",
    "ai_codes": {
        "best-class": {"label": "ClassA", "score": 0.85},
        "other-classes": [
            {"label": "ClassB", "score": 0.10},
            {"label": "ClassC", "score": 0.05}
        ]
    }
}
```

## Notes

- Make sure the model path in `model_checkpoint` is correct and the model files are available.
- The script supports Persian and other languages, depending on your model.
- The output JSON file will be saved in `./data/classification/`.
readme/readme-words-embedder.md (new file, 70 lines)
@@ -0,0 +1,70 @@
# Sentence Embedding Generator

This project provides a Python script (`embedding.py`) for generating sentence embeddings using the Sentence Transformers library.

## Requirements

Before using this script, please install the required libraries:

```bash
pip install sentence-transformers numpy
```

## How It Works

- The script uses the pre-trained model: `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`.
- There are two main functions:
  - `single_section_embedder(sentence)`: Takes a sentence (string) and returns its embedding as a vector (see the sketch after this list).
  - `do_word_embedder(sections)`: Takes a dictionary of sections (each with a `content` field), generates embeddings for each section, and saves the results as a JSON file.
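A minimal sketch of the embedding step, assumed to mirror `single_section_embedder` (whose full body is not shown in this diff):

```python
from sentence_transformers import SentenceTransformer

# pre-trained multilingual paraphrase model used by the script
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


def embed_section(content: str) -> list[float]:
    # encode() returns a numpy array; .tolist() makes it JSON-serializable,
    # matching the embeddings.tolist() change introduced in this commit
    return model.encode(content).tolist()
```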
## Usage

### 1. Get Embedding for a Single Sentence

```python
from embedding import single_section_embedder

sentence = "This is a sample sentence."
embedding = single_section_embedder(sentence)
print(embedding)
```

### 2. Generate Embeddings for Multiple Sections and Save to File

Suppose your data is structured like this:

```python
sections = {
    "1": {"content": "First section text"},
    "2": {"content": "Second section text"}
}
```

You can generate and save embeddings as follows:

```python
from embedding import do_word_embedder

result = do_word_embedder(sections)
```

After running, a file named like `sections_embeddings_YEAR-MONTH-DAY-HOUR.json` will be created in the `./data/embeddings/` directory, containing the embeddings for each section.

## Output Structure

The output is a JSON file where each section has its embedding added:

```json
{
    "1": {
        "content": "First section text",
        "embeddings": [0.123, 0.456, ...]
    },
    ...
}
```

## Notes

- Make sure the folder `./data/embeddings/` exists before running the script.
- The script supports the Persian language.