"""FastAPI routes exposing Hazm-based Persian NLP: dependency parsing,
dependency-graph plotting, and chunking."""
import os
import warnings
from typing import Optional

from fastapi import APIRouter
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets

import plot_dependency_graph

# Hazm's NLTK-based components emit noisy UserWarnings; silence them.
warnings.filterwarnings("ignore", category=UserWarning, module="nltk")
# Shared router for all Hazm endpoints; mounted by the application entry point.
router = APIRouter()
class TextData(BaseModel):
    """Request body for every endpoint in this module: the raw input text."""

    text: str
class Token(BaseModel):
    """One parsed token, with fields mirroring the 10 CoNLL columns
    produced by ``dependency_parse`` (id, form, lemma, UPOS, XPOS,
    feats, head, deprel, deps, misc).
    """

    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    # FIX: dependency_parse emits None for head when the CoNLL column is "_";
    # a plain `int` here would make FastAPI's response validation reject
    # such tokens. Optional[int] matches what the parser actually produces.
    head: Optional[int] = None
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None
# Base directory that holds the pre-trained Hazm model files.
path_base_model = '/MODELS'

# Heavy models are loaded once at import time and shared by all requests.
word_tokenizer = WordTokenizer()
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
parser = DependencyParser(
    tagger=postagger,
    lemmatizer=lemmatizer,
    working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'),
)
def dependency_parse(data):
    """Tokenize ``data.text``, run the Hazm dependency parser, and return
    the result as a list holding one sentence: a list of per-token dicts
    keyed like CoNLL-U columns ("_" placeholders become None for the
    optional fields feats/head/deps/misc).
    """
    words = word_tokenizer.tokenize(data.text)
    graph = parser.parse(words)
    conll_output = graph.to_conll(10)

    # Convert the tab-separated CoNLL rows into dictionaries.
    sentence = []
    for row in conll_output.splitlines():
        # Skip blank separator lines.
        if not row.strip():
            continue
        cols = row.split('\t')
        sentence.append({
            "id": int(cols[0]),
            "text": cols[1],
            "lemma": cols[2],
            "upos": cols[3],
            "xpos": cols[4],
            "feats": cols[5] if cols[5] != "_" else None,
            "head": int(cols[6]) if cols[6] != "_" else None,
            "deprel": cols[7],
            "deps": cols[8] if cols[8] != "_" else None,
            "misc": cols[9] if cols[9] != "_" else None,
        })

    return [sentence]
@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
    """POST /dep — dependency-parse the submitted text and return the
    tokens of the (single) parsed sentence in CoNLL-like form."""
    parsed = dependency_parse(data)
    return parsed
@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    """POST /plot_dep — dependency-parse the text and render the parse as
    an HTML dependency graph via the plot_dependency_graph helper."""
    parsed = dependency_parse(data)
    return plot_dependency_graph.plot_dependency_graph(parsed)
@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
    """POST /chunks — shallow-parse (chunk) the submitted text and return
    the chunk tree in bracket notation, e.g. "[NP ...] [VP ...]".
    """
    # BUG FIX: the original built the model path with plain string
    # concatenation ('/MODELS' + 'hazm/chunker.model' ->
    # '/MODELShazm/chunker.model', missing separator). Use os.path.join,
    # consistent with the module-level model loaders.
    # NOTE(review): the chunker model is still loaded per request; hoisting
    # it to module level (like postagger) would avoid repeated disk reads.
    chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))

    # Reuse the module-level tokenizer and POS tagger instead of reloading
    # the tagger model on every request (same models, same paths).
    tokens = word_tokenizer.tokenize(data.text)
    pos_tags = postagger.tag(tokens)

    tree = chunker.parse(pos_tags)
    return {"chunk_tree": tree2brackets(tree)}