import os
import warnings
from typing import Optional

from fastapi import APIRouter
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets

import plot_dependency_graph

# Hazm pulls in nltk, which can emit noisy UserWarnings at import time.
warnings.filterwarnings("ignore", category=UserWarning, module="nltk")

router = APIRouter()


class TextData(BaseModel):
    text: str


class Token(BaseModel):
    """One token in CoNLL-U style (10 columns)."""
    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    # Optional, since dependency_parse() maps a "_" head column to None.
    head: Optional[int] = None
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None


path_base_model = '/MODELS'

word_tokenizer = WordTokenizer()
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
parser = DependencyParser(
    tagger=postagger,
    lemmatizer=lemmatizer,
    working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'),
)
chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))


def dependency_parse(data: TextData):
    tokens = word_tokenizer.tokenize(data.text)
    dependency_graph = parser.parse(tokens)
    conll = dependency_graph.to_conll(10)

    # Convert the 10-column CoNLL output into a list of token dictionaries.
    parsed_dict = []
    for line in conll.splitlines():
        if line.strip():  # skip blank lines
            cols = line.split('\t')
            token_dict = {
                "id": int(cols[0]),
                "text": cols[1],
                "lemma": cols[2],
                "upos": cols[3],
                "xpos": cols[4],
                "feats": cols[5] if cols[5] != "_" else None,
                "head": int(cols[6]) if cols[6] != "_" else None,
                "deprel": cols[7],
                "deps": cols[8] if cols[8] != "_" else None,
                "misc": cols[9] if cols[9] != "_" else None,
            }
            parsed_dict.append(token_dict)
    # Wrapped in an outer list so the response is a list of sentences.
    return [parsed_dict]


@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
    return dependency_parse(data)


@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))


@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
    tokens = word_tokenizer.tokenize(data.text)
    pos_tags = postagger.tag(tokens)
    result = chunker.parse(pos_tags)
    brackets = tree2brackets(result)
    return {"chunk_tree": brackets}
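
# A minimal smoke test (a sketch, not part of the service): mounts this router
# on a throwaway FastAPI app and exercises /dep and /chunks in-process via
# fastapi.testclient. Assumes the Hazm model files under /MODELS loaded
# successfully; the sample sentence is an arbitrary Persian example.
if __name__ == "__main__":
    from fastapi import FastAPI
    from fastapi.testclient import TestClient

    app = FastAPI()
    app.include_router(router)
    client = TestClient(app)

    sample = {"text": "او به مدرسه رفت"}  # "She went to school."
    print(client.post("/dep", json=sample).json())     # list of sentences, each a list of Token dicts
    print(client.post("/chunks", json=sample).json())  # {"chunk_tree": "[... bracketed chunk tree ...]"}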