hazm_docker/app/hazm_api.py

80 lines
2.5 KiB
Python
Raw Permalink Normal View History

2025-02-02 14:04:55 +00:00
from fastapi import APIRouter
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
import warnings
import os
from typing import Optional
warnings.filterwarnings("ignore", category=UserWarning, module="nltk")
from fastapi.responses import HTMLResponse
import plot_dependency_graph
# Shared router; mounted by the main FastAPI application.
router = APIRouter()
class TextData(BaseModel):
    """Request body shared by all endpoints: the raw input text to analyse."""
    text: str
class Token(BaseModel):
    """One token row of the 10-column CoNLL dependency-parse output.

    Fields mirror the CoNLL columns produced by ``dependency_parse``;
    columns holding the placeholder ``"_"`` are mapped to ``None``.
    """
    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    # dependency_parse emits head=None when the CoNLL head column is "_",
    # so this must be optional or response validation would reject it.
    head: Optional[int] = None
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None
# Base directory where the pretrained hazm model files are mounted
# (Docker volume — see the repository name hazm_docker).
path_base_model = '/MODELS'
word_tokenizer = WordTokenizer()
# POS tagger used both directly (/chunks) and by the dependency parser.
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
# Dependency parser; working_dir points at the universal-dependency model files.
parser = DependencyParser(tagger=postagger, lemmatizer=lemmatizer, working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'))
2025-02-04 06:03:37 +00:00
# Shallow parser (chunker) backing the /chunks endpoint.
chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
2025-02-02 14:04:55 +00:00
def dependency_parse(data):
    """Tokenize ``data.text``, run the dependency parser, and return the
    parse as a list of sentences, each a list of token dicts matching the
    ``Token`` schema.

    The parser's 10-column CoNLL output is converted line by line;
    placeholder ``"_"`` columns become ``None``.
    """
    tokens = word_tokenizer.tokenize(data.text)
    dependency_graph = parser.parse(tokens)
    conll = dependency_graph.to_conll(10)
    # Convert the CoNLL text into a list of per-token dictionaries.
    parsed_dict = []
    for line in conll.splitlines():
        if not line.strip():
            continue  # blank separator line
        cols = line.split('\t')
        if len(cols) < 10:
            # Malformed/truncated line — skip instead of raising IndexError.
            continue
        parsed_dict.append({
            "id": int(cols[0]),
            "text": cols[1],
            "lemma": cols[2],
            "upos": cols[3],
            "xpos": cols[4],
            "feats": cols[5] if cols[5] != "_" else None,
            "head": int(cols[6]) if cols[6] != "_" else None,
            "deprel": cols[7],
            "deps": cols[8] if cols[8] != "_" else None,
            "misc": cols[9] if cols[9] != "_" else None,
        })
    # Single-sentence response wrapped to match list[list[Token]].
    return [parsed_dict]
@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
    """Dependency-parse the input text; one list of Token rows per sentence."""
    return dependency_parse(data)
@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    """Return an HTML rendering of the dependency parse (via the local
    plot_dependency_graph module)."""
    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))
@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
    """Shallow-parse the input text and return its chunk tree in
    bracket notation."""
    tagged = postagger.tag(word_tokenizer.tokenize(data.text))
    chunk_tree = chunker.parse(tagged)
    return {"chunk_tree": tree2brackets(chunk_tree)}