hazm_docker/app/hazm_api.py


from fastapi import APIRouter
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
import warnings
import os
from typing import Optional

# Suppress the UserWarnings that NLTK (a hazm dependency) emits at load time.
warnings.filterwarnings("ignore", category=UserWarning, module="nltk")

import plot_dependency_graph

router = APIRouter()

class TextData(BaseModel):
    text: str


class Token(BaseModel):
    """One token of a dependency parse, mirroring the 10-column CoNLL format."""
    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    head: Optional[int] = None  # None when the CoNLL HEAD column is "_"
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None

path_base_model = '/MODELS'

# Load the hazm models once at import time so each request doesn't pay the
# model-loading cost again.
word_tokenizer = WordTokenizer()
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
parser = DependencyParser(
    tagger=postagger,
    lemmatizer=lemmatizer,
    working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'),
)
chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
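
# Illustrative sanity check for the models loaded above (not part of the API;
# the outputs shown are made up for illustration, and the /MODELS paths are
# assumed to be mounted into the container):
#
#     tokens = word_tokenizer.tokenize('او به مدرسه رفت')  # ['او', 'به', 'مدرسه', 'رفت']
#     tags = postagger.tag(tokens)                          # e.g. [('او', 'PRON'), ...]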

def dependency_parse(data: TextData):
    tokens = word_tokenizer.tokenize(data.text)
    dependency_graph = parser.parse(tokens)
    conll = dependency_graph.to_conll(10)
    # Convert the 10-column CoNLL output into a list of dicts.
    parsed_lines = conll.splitlines()
    parsed_dict = []
    for line in parsed_lines:
        if line.strip():  # skip empty lines
            cols = line.split('\t')
            token_dict = {
                "id": int(cols[0]),
                "text": cols[1],
                "lemma": cols[2],
                "upos": cols[3],
                "xpos": cols[4],
                "feats": cols[5] if cols[5] != "_" else None,
                "head": int(cols[6]) if cols[6] != "_" else None,
                "deprel": cols[7],
                "deps": cols[8] if cols[8] != "_" else None,
                "misc": cols[9] if cols[9] != "_" else None,
            }
            parsed_dict.append(token_dict)
    return [parsed_dict]
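
# For reference, the return value has this shape (values are illustrative,
# not actual model output):
#
#     [[{"id": 1, "text": "او", "lemma": "او", "upos": "PRON", "xpos": "PRO",
#        "feats": None, "head": 4, "deprel": "nsubj", "deps": None, "misc": None},
#       ...]]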
@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
return dependency_parse(data)
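
# Example request, assuming the app runs on uvicorn's default port and this
# router is mounted at the application root (both assumptions about the
# surrounding app, not guaranteed by this module):
#
#     curl -X POST http://localhost:8000/dep \
#          -H 'Content-Type: application/json' \
#          -d '{"text": "او به مدرسه رفت"}'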

@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    """Render the dependency parse of the given text as an HTML visualization."""
    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))
@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
word_tokenizer = WordTokenizer()
postagger = POSTagger(model=path_base_model + 'hazm/pos_tagger.model')
chunker = Chunker(model=path_base_model + 'hazm/chunker.model')
tokens = word_tokenizer.tokenize(data.text)
pos_tags = postagger.tag(tokens)
result = chunker.parse(pos_tags)
brackets = tree2brackets(result)
return {"chunk_tree": brackets}