add hazm docker files

HamidReza Montazer 2025-02-02 14:04:55 +00:00
parent 8ef628ea64
commit 96bf9b7290
10 changed files with 351 additions and 0 deletions

.dockerignore Normal file

@@ -0,0 +1 @@
.env

.gitignore vendored Normal file

@@ -0,0 +1 @@
.env

Dockerfile Normal file

@@ -0,0 +1,9 @@
FROM python:3.10-slim
WORKDIR /app
COPY . .
RUN pip install -r requirements.txt
EXPOSE 5110
CMD ["python", "app/main.py"]

Binary file not shown.

Binary file not shown.

app/hazm_api.py Normal file

@@ -0,0 +1,82 @@
from fastapi import APIRouter
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
import warnings
import os
from typing import Optional

warnings.filterwarnings("ignore", category=UserWarning, module="nltk")

import plot_dependency_graph

router = APIRouter()


class TextData(BaseModel):
    text: str


class Token(BaseModel):
    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    head: int
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None


path_base_model = '/MODELS'

word_tokenizer = WordTokenizer()
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
parser = DependencyParser(
    tagger=postagger,
    lemmatizer=lemmatizer,
    working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'),
)


def dependency_parse(data):
    tokens = word_tokenizer.tokenize(data.text)
    dependency_graph = parser.parse(tokens)
    conll = dependency_graph.to_conll(10)

    # Convert the CoNLL output to a list of token dictionaries
    parsed_dict = []
    for line in conll.splitlines():
        if line.strip():  # skip empty lines
            cols = line.split('\t')
            parsed_dict.append({
                "id": int(cols[0]),
                "text": cols[1],
                "lemma": cols[2],
                "upos": cols[3],
                "xpos": cols[4],
                "feats": cols[5] if cols[5] != "_" else None,
                "head": int(cols[6]) if cols[6] != "_" else None,
                "deprel": cols[7],
                "deps": cols[8] if cols[8] != "_" else None,
                "misc": cols[9] if cols[9] != "_" else None,
            })
    return [parsed_dict]


@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
    return dependency_parse(data)


@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))


@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
    # Reuse the module-level tokenizer and tagger; only the chunker model is loaded here.
    chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
    tokens = word_tokenizer.tokenize(data.text)
    pos_tags = postagger.tag(tokens)
    result = chunker.parse(pos_tags)
    brackets = tree2brackets(result)
    return {"chunk_tree": brackets}

app/main.py Normal file

@@ -0,0 +1,16 @@
import uvicorn
from fastapi import FastAPI

import hazm_api

app = FastAPI()
app.include_router(hazm_api.router, prefix="/hazm")


@app.get("/")
async def read_root():
    return "Hazm is OK!"


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=5110)

app/plot_dependency_graph.py Normal file

@@ -0,0 +1,225 @@
import spacy
from spacy import displacy
ud_tags_translation = {
# UD Standard tags for Hazm and Dadmatools
"acl": "بند موصولی",
"advcl": "بند قیدی",
"advmod": "وابسته قیدی",
"amod": "وابسته وصفی",
"appos": "متمم تشریحی",
"aux": "فعل کمکی",
"case": "نشانه یا حرف اضافه",
"cc": "حرف ربط همپایه‌ساز",
"ccomp": "مکمل فعل",
"compound": "ترکیب کلمات",
"compound:lvc": "فعل‌یار",
"conj": "همپایه",
"cop": "فعل ربطی",
"csubj": "فاعل بند",
"dep": "وابسته نامشخص",
"det": "معرف",
"fixed": "ترکیب ثابت",
"flat:name": "نام",
"flat:num": "عدد",
"goeswith": "جزء ناقص",
"iobj": "مفعول غیرمستقیم",
"mark": "نشانه بند",
"nmod": "متمم اسمی",
"nmod:poss": 'مالکیت',
"nsubj": "فاعل اسمی",
"nsubj:pass": "فاعل جمله مجهول",
"nummod": "وابسته عددی",
"obj": "مفعول مستقیم",
"obl": "متمم قیدی",
"obl:arg": "وابسته قیدی مرتبط با فعل",
"parataxis": "عبارت موازی",
"punct": "علائم نگارشی",
"root": "ریشه جمله",
"vocative": "حالت ندا",
"xcomp": "متمم باز",
# Penn Treebank Standard for Parsivar
"SBJ": "فاعل",
"OBJ": "مفعول",
"NVE": "فعل‌یار",
"ENC": "فعل‌یار پی‌بستی",
"VPP": "مفعول حرف‌اضافه‌ای",
"OBJ2": "مفعول دوم",
"TAM": "تمییز",
"MOS": "مسند",
"PROG": "مستمرساز",
"ADVC": "متمم قیدی",
"VCL": "بند متممی فعل",
"VPRT": "حرف اضافه فعلی",
"LVP": "جزء همکرد",
"PARCL": "بند وصفی",
"ADV": "قید",
"AJUCL": "بند افزوده فعل",
"PART": "افزوده پرسشی فعل",
"VCONJ": "همپایه فعل",
"NPREMOD": "صفت پیشین اسم",
"NPOSTMOD": "صفت پسین اسم",
"NPP": "حرف اضافه اسم",
"NCL": "بند اسم",
"NE": "اسم‌یار",
"MESU": "ممیز",
"NPRT": "جزء اسمی",
"MOZ": "مضاف الیه",
"APP": "بدل",
"NCONJ": "همپایه اسم",
"NADV": "قید اسم",
"COMPPP": "حرف اضافه تفضیلی",
"ADJADV": "قید صفت",
"ACL": "متمم بندی صفت",
"AJPP": "متمم حرف اضافه‌ای صفت",
"NEZ": "متمم نشانه اضافه‌ای صفت",
"AJCONJ": "همپایه صفت",
"APREMOD": "وابسته پیشین صفت",
"APOSTMOD": "وابسته پسین صفت",
"PREDEP": "وابسته پیشین",
"POSDEP": "وابسته پسین",
"PCONJ": "همپایه حرف اضافه",
"AVCONJ": "همپایه قید",
"PRD": "گزاره",
"ROOT": "ریشه جمله",
"PUNC": "علامت نگارشی"
}
def json_to_conllu(json_data):
    """
    Convert a parsed sentence (list of token dictionaries) to CoNLL-U format.

    Args:
        json_data (list): Tokens of one sentence as dictionaries.

    Returns:
        str: CoNLL-U formatted string.
    """
    conllu_output = []

    # Reconstruct the sentence text by concatenating the token texts
    sentence_text = " ".join(token.get("text", "_") for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")

    # Add tokens in CoNLL-U format
    for token in json_data:
        id_ = token.get("id", "_")
        form = token.get("text", "_")
        lemma = token.get("lemma") or "_"
        upos = token.get("upos", "_")
        xpos = token.get("xpos", "_")
        feats = token.get("feats") or "_"
        head = token.get("head", "_")
        deprel = token.get("deprel", "_")
        deps = "_"
        misc = "_"
        conllu_output.append(
            f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
        )

    # Add a blank line after the sentence
    conllu_output.append("")

    # Join all lines into a single string
    return "\n".join(conllu_output)


# Parse CoNLL-U data into a format that spaCy can use
def conllu_to_spacy(conllu_data):
    lines = conllu_data.strip().split("\n")
    words = []
    for line in lines:
        if line.startswith("#"):  # Skip comment lines
            continue
        parts = line.split("\t")
        if len(parts) != 10:  # Skip invalid lines
            continue
        index, word, _, _, _, _, head, dep, _, _ = parts
        words.append((int(index), word, int(head), dep))
    return words


# Create a Doc object in spaCy
def create_spacy_doc(conllu_data, model):
    words = conllu_to_spacy(conllu_data)
    num = len(words)

    # Re-index tokens in reverse order (and translate the dependency labels)
    # so the right-to-left Persian sentence renders correctly in displaCy.
    for index, word, head, dep in words:
        dep_name = ud_tags_translation.get(dep, dep)
        words[index - 1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
    words = list(reversed(words))

    # Create a doc object
    doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])

    # Add syntactic dependencies (the root token points to itself)
    for word, (index, _, head, dep) in zip(doc, words):
        word.head = doc[head - 1] if head > 0 else doc[index - 1]
        word.dep_ = dep
    return doc


# Load the spaCy model (use a blank model since the language is Persian and we don't need any pre-trained components)
model = spacy.blank("xx")  # Use 'xx' for a multilingual model


def plot_dependency_graph(graph_data):
    result = '<div>\n'
    for sent_graph in graph_data:
        conllu_data = json_to_conllu(sent_graph)

        # Create a doc from CoNLL-U data
        doc = create_spacy_doc(conllu_data, model)

        # Visualize the dependency tree
        result += displacy.render(doc, style="dep")
    result += '\n</div>'
    return result


if __name__ == "__main__":
    test = [
        [
            {
                "id": 1,
                "text": "هوا",
                "lemma": "هوا",
                "upos": "NOUN",
                "xpos": "NOUN",
                "feats": None,
                "head": 2,
                "deprel": "nsubj",
                "deps": None,
                "misc": None
            },
            {
                "id": 2,
                "text": "خوب",
                "lemma": "خوب",
                "upos": "ADJ",
                "xpos": "ADJ",
                "feats": None,
                "head": 0,
                "deprel": "root",
                "deps": None,
                "misc": None
            },
            {
                "id": 3,
                "text": "است",
                "lemma": "است",
                "upos": "VERB",
                "xpos": "VERB",
                "feats": None,
                "head": 2,
                "deprel": "cop",
                "deps": None,
                "misc": None
            }
        ]
    ]
    print(plot_dependency_graph(test))

docker-compose.yml Normal file

@@ -0,0 +1,12 @@
services:
  hazm_api:
    image: hazm_api
    container_name: hazm_container
    ports:
      - "5110:5110"
    volumes:
      - models:/MODELS/
volumes:
  models:
    external: true
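
The compose file mounts an external Docker volume named models at /MODELS, and hazm_api.py expects the hazm models inside it under hazm/ (pos_tagger.model, chunker.model, and the universal_dependency_parser directory). A small sanity check along the following lines could confirm the mount before serving requests; it is a hypothetical helper, not part of this commit:

# Hypothetical startup check for the mounted model volume
import os
import sys

REQUIRED = [
    "hazm/pos_tagger.model",
    "hazm/chunker.model",
    "hazm/universal_dependency_parser",
]

missing = [p for p in REQUIRED if not os.path.exists(os.path.join("/MODELS", p))]
if missing:
    sys.exit(f"Missing hazm model files under /MODELS: {missing}")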

requirements.txt Normal file

@@ -0,0 +1,5 @@
spacy
fastapi
pydantic
hazm
uvicorn