import flair
from flair.data import Corpus
from flair.datasets import ColumnCorpus, ColumnDataset
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
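
# Optional sketch (an addition, not in the original script): pin Flair to the
# GPU when one is available; flair.device is Flair's documented device switch.
import torch
flair.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')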

# backbone transformer model and dataset paths
model = 'zedfum/arman-longformer-8k-finetuned-ensani'
dataset = './jokar/Flair_NER/data/DATASET.txt'

# define columns
columns = {0: 'text', 1: 'ner'}

# directory where the data resides
data_folder = './data/'

# initializing the corpus
corpus = ColumnCorpus(data_folder, columns,
                      train_file='DATASET_3.txt')
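
# Expected layout of DATASET_3.txt (a sketch of Flair's standard column format,
# assuming the file follows it): one token and its NER tag per line, separated
# by whitespace, with a blank line between sentences, e.g.
#
#   Tehran   B-LOC
#   is       O
#   large    O
#
#   ...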

# Alternative: load the single dataset file directly (left disabled; the
# ColumnCorpus above is used instead)
#corpus = ColumnDataset(dataset, column_format={0: 'text', 1: 'ner'})

# Create a Flair embedding layer from the pretrained model
# (SequenceTagger needs token-level embeddings, so TransformerWordEmbeddings is
# used rather than TransformerDocumentEmbeddings)
embeddings = TransformerWordEmbeddings(model, layers='-1', subtoken_pooling='mean')

# tag to predict
tag_type = 'ner'

# make tag dictionary from the corpus
tag_dictionary = corpus.make_label_dictionary(label_type=tag_type)
print(tag_dictionary)
print('#' * 50)
print('#' * 50)
print('#' * 50)

# Create a SequenceTagger model for NER, reusing the tag dictionary built above
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=tag_type)
print('SequenceTagger')
print('#' * 50)
print('#' * 50)
print('#' * 50)

# Initialize the trainer
trainer = ModelTrainer(tagger, corpus)
print('ModelTrainer')
print('#' * 50)
print('#' * 50)
print('#' * 50)

# Train the model
trainer.train('./jokar/Flair_NER/trained',
              learning_rate=0.65e-4,
              mini_batch_size=8,
              max_epochs=10)
print('trained!')
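
# A minimal inference sketch (an addition; it assumes Flair's default behaviour
# of saving 'final-model.pt' in the base_path passed to trainer.train, and the
# example sentence is made up).
from flair.data import Sentence

loaded_tagger = SequenceTagger.load('./jokar/Flair_NER/trained/final-model.pt')
example = Sentence('تهران پایتخت ایران است')  # hypothetical Persian test sentence
loaded_tagger.predict(example)
for entity in example.get_spans('ner'):
    print(entity)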