add hazm docker files
This commit is contained in:
parent 8ef628ea64
commit 96bf9b7290
1  .dockerignore  Normal file
@@ -0,0 +1 @@
.env
1  .gitignore  vendored  Normal file
@@ -0,0 +1 @@
.env
9  Dockerfile  Normal file
@@ -0,0 +1,9 @@
FROM python:3.10-slim
WORKDIR /app

COPY . .

RUN pip install -r requirements.txt

EXPOSE 5110
CMD ["python", "app/main.py"]
BIN  app/__pycache__/hazm_api.cpython-310.pyc  Normal file
Binary file not shown.
BIN  app/__pycache__/plot_dependency_graph.cpython-310.pyc  Normal file
Binary file not shown.
82  app/hazm_api.py  Normal file
@@ -0,0 +1,82 @@
from fastapi import APIRouter
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
import warnings
import os
from typing import Optional
warnings.filterwarnings("ignore", category=UserWarning, module="nltk")
from fastapi.responses import HTMLResponse

import plot_dependency_graph

router = APIRouter()

class TextData(BaseModel):
    text: str

class Token(BaseModel):
    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    head: int
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None

path_base_model = '/MODELS'

word_tokenizer = WordTokenizer()
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
parser = DependencyParser(tagger=postagger, lemmatizer=lemmatizer, working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'))

def dependency_parse(data):
    tokens = word_tokenizer.tokenize(data.text)
    dependency_graph = parser.parse(tokens)
    conll = dependency_graph.to_conll(10)

    # Convert the CoNLL output into a list of dictionaries
    parsed_lines = conll.splitlines()
    parsed_dict = []
    for line in parsed_lines:
        if line.strip():  # skip empty lines
            cols = line.split('\t')
            token_dict = {
                "id": int(cols[0]),
                "text": cols[1],
                "lemma": cols[2],
                "upos": cols[3],
                "xpos": cols[4],
                "feats": cols[5] if cols[5] != "_" else None,
                "head": int(cols[6]) if cols[6] != "_" else None,
                "deprel": cols[7],
                "deps": cols[8] if cols[8] != "_" else None,
                "misc": cols[9] if cols[9] != "_" else None,
            }
            parsed_dict.append(token_dict)

    return [parsed_dict]

@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
    return dependency_parse(data)


@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))


@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
    word_tokenizer = WordTokenizer()
    postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
    chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
    tokens = word_tokenizer.tokenize(data.text)
    pos_tags = postagger.tag(tokens)
    result = chunker.parse(pos_tags)
    brackets = tree2brackets(result)
    return {"chunk_tree": brackets}
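The router above exposes /dep, /plot_dep and /chunks under the /hazm prefix (wired up in app/main.py below). As a quick smoke test once the container is running, something like the following can be used. This is a minimal sketch: it assumes the API is reachable on localhost:5110 as published in docker-compose.yml, it uses only the standard library, and the sample sentence is illustrative only.

import json
from urllib.request import Request, urlopen

BASE_URL = "http://localhost:5110/hazm"  # assumption: the compose file maps 5110:5110

def post_json(path: str, payload: dict):
    # POST a JSON body to the given route and decode the JSON response.
    req = Request(
        BASE_URL + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urlopen(req) as resp:
        return json.loads(resp.read())

if __name__ == "__main__":
    body = {"text": "هوا خوب است"}
    print(post_json("/dep", body))     # one list of token dicts per sentence
    print(post_json("/chunks", body))  # {"chunk_tree": "..."}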
16  app/main.py  Normal file
@@ -0,0 +1,16 @@
import uvicorn
from fastapi import FastAPI
import hazm_api


app = FastAPI()
app.include_router(hazm_api.router, prefix="/hazm")


@app.get("/")
async def read_root():
    return "Hazm is OK!"


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=5110)
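For an in-process check of the app wiring without starting uvicorn, FastAPI's TestClient can be used. This is only a sketch: TestClient needs the httpx package (not listed in requirements.txt), and importing main also imports hazm_api, which loads the POS tagger and parser from /MODELS at import time, so it only runs where those model files exist (e.g. inside the container).

from fastapi.testclient import TestClient  # requires httpx as a dev-only extra

import main  # assumes the working directory is app/, as in the image

client = TestClient(main.app)

resp = client.get("/")
assert resp.status_code == 200
assert resp.json() == "Hazm is OK!"  # the health message returned by read_root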
225  app/plot_dependency_graph.py  Normal file
@@ -0,0 +1,225 @@
import spacy
from spacy import displacy

ud_tags_translation = {
    # UD Standard tags for Hazm and Dadmatools
    "acl": "بند موصولی",
    "advcl": "بند قیدی",
    "advmod": "وابسته قیدی",
    "amod": "وابسته وصفی",
    "appos": "متمم تشریحی",
    "aux": "فعل کمکی",
    "case": "نشانه یا حرف اضافه",
    "cc": "حرف ربط همپایهساز",
    "ccomp": "مکمل فعل",
    "compound": "ترکیب کلمات",
    "compound:lvc": "فعلیار",
    "conj": "همپایه",
    "cop": "فعل ربطی",
    "csubj": "فاعل بند",
    "dep": "وابسته نامشخص",
    "det": "معرف",
    "fixed": "ترکیب ثابت",
    "flat:name": "نام",
    "flat:num": "عدد",
    "goeswith": "جزء ناقص",
    "iobj": "مفعول غیرمستقیم",
    "mark": "نشانه بند",
    "nmod": "متمم اسمی",
    "nmod:poss": 'مالکیت',
    "nsubj": "فاعل اسمی",
    "nsubj:pass": "فاعل جمله مجهول",
    "nummod": "وابسته عددی",
    "obj": "مفعول مستقیم",
    "obl": "متمم قیدی",
    "obl:arg": "وابسته قیدی مرتبط با فعل",
    "parataxis": "عبارت موازی",
    "punct": "علائم نگارشی",
    "root": "ریشه جمله",
    "vocative": "حالت ندا",
    "xcomp": "متمم باز",

    # Penn Treebank Standard for Parsivar
    "SBJ": "فاعل",
    "OBJ": "مفعول",
    "NVE": "فعلیار",
    "ENC": "فعلیار پیبستی",
    "VPP": "مفعول حرفاضافهای",
    "OBJ2": "مفعول دوم",
    "TAM": "تمییز",
    "MOS": "مسند",
    "PROG": "مستمرساز",
    "ADVC": "متمم قیدی",
    "VCL": "بند متممی فعل",
    "VPRT": "حرف اضافه فعلی",
    "LVP": "جزء همکرد",
    "PARCL": "بند وصفی",
    "ADV": "قید",
    "AJUCL": "بند افزوده فعل",
    "PART": "افزوده پرسشی فعل",
    "VCONJ": "همپایه فعل",
    "NPREMOD": "صفت پیشین اسم",
    "NPOSTMOD": "صفت پسین اسم",
    "NPP": "حرف اضافه اسم",
    "NCL": "بند اسم",
    "NE": "اسمیار",
    "MESU": "ممیز",
    "NPRT": "جزء اسمی",
    "MOZ": "مضاف الیه",
    "APP": "بدل",
    "NCONJ": "همپایه اسم",
    "NADV": "قید اسم",
    "COMPPP": "حرف اضافه تفضیلی",
    "ADJADV": "قید صفت",
    "ACL": "متمم بندی صفت",
    "AJPP": "متمم حرف اضافهای صفت",
    "NEZ": "متمم نشانه اضافهای صفت",
    "AJCONJ": "همپایه صفت",
    "APREMOD": "وابسته پیشین صفت",
    "APOSTMOD": "وابسته پسین صفت",
    "PREDEP": "وابسته پیشین",
    "POSDEP": "وابسته پسین",
    "PCONJ": "همپایه حرف اضافه",
    "AVCONJ": "همپایه قید",
    "PRD": "گزاره",
    "ROOT": "ریشه جمله",
    "PUNC": "علامت نگارشی"
}


def json_to_conllu(json_data):
    """
    Convert JSON in the given format to CoNLL-U format.

    Args:
        json_data (list): token dictionaries for a single sentence.

    Returns:
        str: CoNLL-U formatted string.
    """
    conllu_output = []

    # If sentence text is available, concatenate token texts
    sentence_text = " ".join(token.get("text", "_") for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")

    # Add tokens in CoNLL-U format
    for token in json_data:
        id_ = token.get("id", "_")
        form = token.get("text", "_")
        lemma = "_"  # Lemma not provided
        upos = token.get("upos", "_")
        xpos = token.get("xpos", "_")
        feats = token.get("feats", "_")
        head = token.get("head", "_")
        deprel = token.get("deprel", "_")
        deps = "_"
        misc = "_"

        conllu_output.append(
            f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
        )

    # Add a blank line after each sentence
    conllu_output.append("")

    # Join all lines into a single string
    return "\n".join(conllu_output)


# Parse CoNLL-U data into a format that spaCy can use
def conllu_to_spacy(conllu_data):
    lines = conllu_data.strip().split("\n")
    words = []
    for line in lines:
        if line.startswith("#"):  # Skip comment lines
            continue
        parts = line.split("\t")
        if len(parts) != 10:  # Skip invalid lines
            continue
        index, word, _, _, _, _, head, dep, _, _ = parts
        index = int(index)
        head = int(head)
        words.append((index, word, head, dep))
    return words


# Create a Doc object in spaCy
def create_spacy_doc(conllu_data, model):
    words = conllu_to_spacy(conllu_data)
    num = len(words)
    for index, word, head, dep in words:
        dep_name = ud_tags_translation.get(dep, dep)
        words[index - 1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
    words = list(reversed(words))

    # Create a doc object
    doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])
    # Add syntactic dependencies
    for word, (index, _, head, dep) in zip(doc, words):
        word.head = doc[head - 1] if head > 0 else doc[index - 1]
        word.dep_ = dep
    return doc


# Load the spacy model (use a blank model since the language is Persian and we don't need any pre-trained components)
model = spacy.blank("xx")  # Use 'xx' for a multilingual model


def plot_dependency_graph(graph_data):
    result = '<div>\n'
    for sent_graph in graph_data:

        conllu_data = json_to_conllu(sent_graph)

        # Create a doc from CoNLL-U data
        doc = create_spacy_doc(conllu_data, model)

        # Visualize the dependency tree
        result += displacy.render(doc, style="dep")

    result += '\n</div>'
    return result


if __name__ == "__main__":
    test = [
        [
            {
                "id": 1,
                "text": "هوا",
                "lemma": "هوا",
                "upos": "NOUN",
                "xpos": "NOUN",
                "feats": None,
                "head": 2,
                "deprel": "nsubj",
                "deps": None,
                "misc": None
            },
            {
                "id": 2,
                "text": "خوب",
                "lemma": "خوب",
                "upos": "ADJ",
                "xpos": "ADJ",
                "feats": None,
                "head": 0,
                "deprel": "root",
                "deps": None,
                "misc": None
            },
            {
                "id": 3,
                "text": "است",
                "lemma": "است",
                "upos": "VERB",
                "xpos": "VERB",
                "feats": None,
                "head": 2,
                "deprel": "cop",
                "deps": None,
                "misc": None
            }
        ]
    ]
    print(plot_dependency_graph(test))
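The module's own __main__ block above exercises the full rendering path. As a smaller illustration of just the JSON-to-CoNLL-U step, the sketch below prints the intermediate string; it assumes it is run from the app/ directory so plot_dependency_graph imports cleanly, and that spacy is installed (the blank "xx" pipeline created at import time needs no downloads).

from plot_dependency_graph import json_to_conllu

tokens = [
    {"id": 1, "text": "هوا", "upos": "NOUN", "xpos": "NOUN", "head": 2, "deprel": "nsubj"},
    {"id": 2, "text": "خوب", "upos": "ADJ", "xpos": "ADJ", "head": 0, "deprel": "root"},
    {"id": 3, "text": "است", "upos": "VERB", "xpos": "VERB", "head": 2, "deprel": "cop"},
]

print(json_to_conllu(tokens))
# Expected shape (tab-separated, 10 columns per token), e.g. for the first token:
# # text = هوا خوب است
# 1	هوا	_	NOUN	NOUN	_	2	nsubj	_	_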
12  docker-compose.yml  Normal file
@@ -0,0 +1,12 @@
services:
  hazm_api:
    image: hazm_api
    container_name: hazm_container
    ports:
      - "5110:5110"
    volumes:
      - models:/MODELS/

volumes:
  models:
    external: true
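The compose file assumes the hazm_api image has already been built from the Dockerfile above and that the external "models" volume already holds the hazm model files under /MODELS. Once the stack is up, a small readiness probe like the following can poll the health route from main.py; this is a sketch assuming the service is published on localhost:5110.

import time
from urllib.request import urlopen

for attempt in range(30):
    try:
        with urlopen("http://localhost:5110/") as resp:  # root route from app/main.py
            if resp.status == 200:
                print("Hazm API is up")
                break
    except OSError:
        time.sleep(1)  # not listening yet; retry
else:
    raise SystemExit("Hazm API did not become ready in 30 seconds")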
5  requirements.txt  Normal file
@@ -0,0 +1,5 @@
spacy
fastapi
pydantic
hazm
uvicorn