add hazm docker files
This commit is contained in:
parent 8ef628ea64
commit 96bf9b7290
1  .dockerignore  Normal file
@@ -0,0 +1 @@
.env
1  .gitignore  vendored  Normal file
@@ -0,0 +1 @@
.env
9  Dockerfile  Normal file
@@ -0,0 +1,9 @@
FROM python:3.10-slim
WORKDIR /app

COPY . .

RUN pip install -r requirements.txt

EXPOSE 5110
CMD ["python", "app/main.py"]
BIN  app/__pycache__/hazm_api.cpython-310.pyc  Normal file
Binary file not shown.

BIN  app/__pycache__/plot_dependency_graph.cpython-310.pyc  Normal file
Binary file not shown.
82  app/hazm_api.py  Normal file
@@ -0,0 +1,82 @@
from fastapi import APIRouter
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
import warnings
import os
from typing import Optional
warnings.filterwarnings("ignore", category=UserWarning, module="nltk")
from fastapi.responses import HTMLResponse

import plot_dependency_graph

router = APIRouter()

class TextData(BaseModel):
    text: str

class Token(BaseModel):
    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    head: int
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None

path_base_model = '/MODELS'

word_tokenizer = WordTokenizer()
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
parser = DependencyParser(tagger=postagger, lemmatizer=lemmatizer, working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'))

def dependency_parse(data):
    tokens = word_tokenizer.tokenize(data.text)
    dependency_graph = parser.parse(tokens)
    conll = dependency_graph.to_conll(10)

    # Convert the CoNLL output to a list of token dictionaries
    parsed_lines = conll.splitlines()
    parsed_dict = []
    for line in parsed_lines:
        if line.strip():  # if the line is not empty
            cols = line.split('\t')
            token_dict = {
                "id": int(cols[0]),
                "text": cols[1],
                "lemma": cols[2],
                "upos": cols[3],
                "xpos": cols[4],
                "feats": cols[5] if cols[5] != "_" else None,
                "head": int(cols[6]) if cols[6] != "_" else None,
                "deprel": cols[7],
                "deps": cols[8] if cols[8] != "_" else None,
                "misc": cols[9] if cols[9] != "_" else None,
            }
            parsed_dict.append(token_dict)

    return [parsed_dict]

@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
    return dependency_parse(data)


@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))


@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
    word_tokenizer = WordTokenizer()
    postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
    chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
    tokens = word_tokenizer.tokenize(data.text)
    pos_tags = postagger.tag(tokens)
    result = chunker.parse(pos_tags)
    brackets = tree2brackets(result)
    return {"chunk_tree": brackets}
16  app/main.py  Normal file
@@ -0,0 +1,16 @@
import uvicorn
from fastapi import FastAPI
import hazm_api


app = FastAPI()
app.include_router(hazm_api.router, prefix="/hazm")


@app.get("/")
async def read_root():
    return "Hazm is OK!"


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=5110)
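For reference, a minimal client sketch against the service above (assumptions: the container from this commit is running with port 5110 published, so the dependency endpoint is reachable at /hazm/dep; only the Python standard library is used, and the sample sentence matches the test data in plot_dependency_graph.py):

import json
import urllib.request

# Send a Persian sentence to the dependency-parsing endpoint.
payload = json.dumps({"text": "هوا خوب است"}).encode("utf-8")
req = urllib.request.Request(
    "http://localhost:5110/hazm/dep",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    # The response is a list of sentences, each a list of CoNLL-U-style token dicts.
    sentences = json.load(resp)
print(sentences[0][0]["deprel"])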
225  app/plot_dependency_graph.py  Normal file
@@ -0,0 +1,225 @@
import spacy
from spacy import displacy

ud_tags_translation = {
    # UD Standard tags for Hazm and Dadmatools
    "acl": "بند موصولی",
    "advcl": "بند قیدی",
    "advmod": "وابسته قیدی",
    "amod": "وابسته وصفی",
    "appos": "متمم تشریحی",
    "aux": "فعل کمکی",
    "case": "نشانه یا حرف اضافه",
    "cc": "حرف ربط همپایهساز",
    "ccomp": "مکمل فعل",
    "compound": "ترکیب کلمات",
    "compound:lvc": "فعلیار",
    "conj": "همپایه",
    "cop": "فعل ربطی",
    "csubj": "فاعل بند",
    "dep": "وابسته نامشخص",
    "det": "معرف",
    "fixed": "ترکیب ثابت",
    "flat:name": "نام",
    "flat:num": "عدد",
    "goeswith": "جزء ناقص",
    "iobj": "مفعول غیرمستقیم",
    "mark": "نشانه بند",
    "nmod": "متمم اسمی",
    "nmod:poss": "مالکیت",
    "nsubj": "فاعل اسمی",
    "nsubj:pass": "فاعل جمله مجهول",
    "nummod": "وابسته عددی",
    "obj": "مفعول مستقیم",
    "obl": "متمم قیدی",
    "obl:arg": "وابسته قیدی مرتبط با فعل",
    "parataxis": "عبارت موازی",
    "punct": "علائم نگارشی",
    "root": "ریشه جمله",
    "vocative": "حالت ندا",
    "xcomp": "متمم باز",

    # Penn Treebank Standard for Parsivar
    "SBJ": "فاعل",
    "OBJ": "مفعول",
    "NVE": "فعلیار",
    "ENC": "فعلیار پیبستی",
    "VPP": "مفعول حرفاضافهای",
    "OBJ2": "مفعول دوم",
    "TAM": "تمییز",
    "MOS": "مسند",
    "PROG": "مستمرساز",
    "ADVC": "متمم قیدی",
    "VCL": "بند متممی فعل",
    "VPRT": "حرف اضافه فعلی",
    "LVP": "جزء همکرد",
    "PARCL": "بند وصفی",
    "ADV": "قید",
    "AJUCL": "بند افزوده فعل",
    "PART": "افزوده پرسشی فعل",
    "VCONJ": "همپایه فعل",
    "NPREMOD": "صفت پیشین اسم",
    "NPOSTMOD": "صفت پسین اسم",
    "NPP": "حرف اضافه اسم",
    "NCL": "بند اسم",
    "NE": "اسمیار",
    "MESU": "ممیز",
    "NPRT": "جزء اسمی",
    "MOZ": "مضاف الیه",
    "APP": "بدل",
    "NCONJ": "همپایه اسم",
    "NADV": "قید اسم",
    "COMPPP": "حرف اضافه تفضیلی",
    "ADJADV": "قید صفت",
    "ACL": "متمم بندی صفت",
    "AJPP": "متمم حرف اضافهای صفت",
    "NEZ": "متمم نشانه اضافهای صفت",
    "AJCONJ": "همپایه صفت",
    "APREMOD": "وابسته پیشین صفت",
    "APOSTMOD": "وابسته پسین صفت",
    "PREDEP": "وابسته پیشین",
    "POSDEP": "وابسته پسین",
    "PCONJ": "همپایه حرف اضافه",
    "AVCONJ": "همپایه قید",
    "PRD": "گزاره",
    "ROOT": "ریشه جمله",
    "PUNC": "علامت نگارشی"
}


def json_to_conllu(json_data):
    """
    Convert JSON in the given format to CoNLL-U format.
    Args:
        json_data (list): token dictionaries for one sentence.

    Returns:
        str: CoNLL-U formatted string.
    """
    conllu_output = []

    # If sentence text is available, concatenate token texts
    sentence_text = " ".join(token.get("text", "_") for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")

    # Add tokens in CoNLL-U format
    for token in json_data:
        id_ = token.get("id", "_")
        form = token.get("text", "_")
        lemma = "_"  # Lemma not provided
        upos = token.get("upos", "_")
        xpos = token.get("xpos", "_")
        feats = token.get("feats") or "_"  # missing/None features become "_"
        head = token.get("head", "_")
        deprel = token.get("deprel", "_")
        deps = "_"
        misc = "_"

        conllu_output.append(
            f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
        )

    # Add a blank line after each sentence
    conllu_output.append("")

    # Join all lines into a single string
    return "\n".join(conllu_output)


# Parse CoNLL-U data into a format that spaCy can use
def conllu_to_spacy(conllu_data):
    lines = conllu_data.strip().split("\n")
    words = []
    for line in lines:
        if line.startswith("#"):  # Skip comment lines
            continue
        parts = line.split("\t")
        if len(parts) != 10:  # Skip invalid lines
            continue
        index, word, _, _, _, _, head, dep, _, _ = parts
        index = int(index)
        head = int(head)
        words.append((index, word, head, dep))
    return words


# Create a Doc object in spaCy
def create_spacy_doc(conllu_data, model):
    words = conllu_to_spacy(conllu_data)
    num = len(words)
    # Remap token and head indices and reverse the order, so the Persian
    # sentence is laid out right to left in the rendered tree.
    for index, word, head, dep in words:
        dep_name = ud_tags_translation.get(dep, dep)
        words[index - 1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
    words = list(reversed(words))

    # Create a doc object
    doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])
    # Add syntactic dependencies
    for word, (index, _, head, dep) in zip(doc, words):
        word.head = doc[head - 1] if head > 0 else doc[index - 1]
        word.dep_ = dep
    return doc


# Load the spaCy model (use a blank model since the language is Persian and we don't need any pre-trained components)
model = spacy.blank("xx")  # Use 'xx' for a multilingual model


def plot_dependency_graph(graph_data):
    result = '<div>\n'
    for sent_graph in graph_data:
        conllu_data = json_to_conllu(sent_graph)

        # Create a doc from CoNLL-U data
        doc = create_spacy_doc(conllu_data, model)

        # Visualize the dependency tree
        result += displacy.render(doc, style="dep")

    result += '\n</div>'
    return result


if __name__ == "__main__":
    test = [
        [
            {
                "id": 1,
                "text": "هوا",
                "lemma": "هوا",
                "upos": "NOUN",
                "xpos": "NOUN",
                "feats": None,
                "head": 2,
                "deprel": "nsubj",
                "deps": None,
                "misc": None
            },
            {
                "id": 2,
                "text": "خوب",
                "lemma": "خوب",
                "upos": "ADJ",
                "xpos": "ADJ",
                "feats": None,
                "head": 0,
                "deprel": "root",
                "deps": None,
                "misc": None
            },
            {
                "id": 3,
                "text": "است",
                "lemma": "است",
                "upos": "VERB",
                "xpos": "VERB",
                "feats": None,
                "head": 2,
                "deprel": "cop",
                "deps": None,
                "misc": None
            }
        ]
    ]
    print(plot_dependency_graph(test))
12  docker-compose.yml  Normal file
@@ -0,0 +1,12 @@
services:
  hazm_api:
    image: hazm_api
    container_name: hazm_container
    ports:
      - "5110:5110"
    volumes:
      - models:/MODELS/

volumes:
  models:
    external: true
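One possible way to bring the stack up locally, sketched under assumptions not stated in the commit (the image is built from the Dockerfile above with the tag hazm_api used by the compose file, and the external models volume is pre-populated with hazm/pos_tagger.model, hazm/chunker.model, and hazm/universal_dependency_parser before the first request):

docker volume create models
# ...copy the hazm model files into the volume, e.g. via a temporary container...
docker build -t hazm_api .
docker compose up -d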
5  requirements.txt  Normal file
@@ -0,0 +1,5 @@
spacy
fastapi
pydantic
hazm
uvicorn