add hazm docker files

HamidReza Montazer 2025-02-02 14:04:55 +00:00
parent 8ef628ea64
commit 96bf9b7290
10 changed files with 351 additions and 0 deletions

.dockerignore Normal file

@@ -0,0 +1 @@
.env

.gitignore vendored Normal file

@@ -0,0 +1 @@
.env

Dockerfile Normal file

@@ -0,0 +1,9 @@
FROM python:3.10-slim
WORKDIR /app
COPY . .
RUN pip install -r requirements.txt
EXPOSE 5110
CMD ["python", "app/main.py"]

Binary file not shown.

Binary file not shown.

app/hazm_api.py Normal file

@@ -0,0 +1,82 @@
from fastapi import APIRouter
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
import warnings
import os
from typing import Optional

warnings.filterwarnings("ignore", category=UserWarning, module="nltk")

import plot_dependency_graph

router = APIRouter()


class TextData(BaseModel):
    text: str


class Token(BaseModel):
    id: int
    text: str
    lemma: str
    upos: str
    xpos: str
    feats: Optional[str] = None
    head: int
    deprel: str
    deps: Optional[str] = None
    misc: Optional[str] = None


path_base_model = '/MODELS'

word_tokenizer = WordTokenizer()
postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
lemmatizer = Lemmatizer()
parser = DependencyParser(
    tagger=postagger,
    lemmatizer=lemmatizer,
    working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'),
)


def dependency_parse(data):
    tokens = word_tokenizer.tokenize(data.text)
    dependency_graph = parser.parse(tokens)
    conll = dependency_graph.to_conll(10)

    # Convert the CoNLL output to a list of token dictionaries
    parsed_dict = []
    for line in conll.splitlines():
        if line.strip():  # skip empty lines
            cols = line.split('\t')
            parsed_dict.append({
                "id": int(cols[0]),
                "text": cols[1],
                "lemma": cols[2],
                "upos": cols[3],
                "xpos": cols[4],
                "feats": cols[5] if cols[5] != "_" else None,
                "head": int(cols[6]) if cols[6] != "_" else None,
                "deprel": cols[7],
                "deps": cols[8] if cols[8] != "_" else None,
                "misc": cols[9] if cols[9] != "_" else None,
            })
    return [parsed_dict]


@router.post("/dep", tags=['Hazm'])
async def parse_dependencies(data: TextData) -> list[list[Token]]:
    return dependency_parse(data)


@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
async def plot_dependency_tree(data: TextData):
    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))


@router.post("/chunks", tags=['Hazm'])
async def chunks(data: TextData):
    # Reuse the module-level tokenizer and tagger; only the chunker model is loaded here.
    chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
    tokens = word_tokenizer.tokenize(data.text)
    pos_tags = postagger.tag(tokens)
    result = chunker.parse(pos_tags)
    brackets = tree2brackets(result)
    return {"chunk_tree": brackets}

app/main.py Normal file

@@ -0,0 +1,16 @@
import uvicorn
from fastapi import FastAPI

import hazm_api

app = FastAPI()
app.include_router(hazm_api.router, prefix="/hazm")


@app.get("/")
async def read_root():
    return "Hazm is OK!"


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=5110)

app/plot_dependency_graph.py Normal file

@@ -0,0 +1,225 @@
import spacy
from spacy import displacy
ud_tags_translation = {
# UD Standard tags for Hazm and Dadmatools
"acl": "بند موصولی",
"advcl": "بند قیدی",
"advmod": "وابسته قیدی",
"amod": "وابسته وصفی",
"appos": "متمم تشریحی",
"aux": "فعل کمکی",
"case": "نشانه یا حرف اضافه",
"cc": "حرف ربط همپایه‌ساز",
"ccomp": "مکمل فعل",
"compound": "ترکیب کلمات",
"compound:lvc": "فعل‌یار",
"conj": "همپایه",
"cop": "فعل ربطی",
"csubj": "فاعل بند",
"dep": "وابسته نامشخص",
"det": "معرف",
"fixed": "ترکیب ثابت",
"flat:name": "نام",
"flat:num": "عدد",
"goeswith": "جزء ناقص",
"iobj": "مفعول غیرمستقیم",
"mark": "نشانه بند",
"nmod": "متمم اسمی",
"nmod:poss": 'مالکیت',
"nsubj": "فاعل اسمی",
"nsubj:pass": "فاعل جمله مجهول",
"nummod": "وابسته عددی",
"obj": "مفعول مستقیم",
"obl": "متمم قیدی",
"obl:arg": "وابسته قیدی مرتبط با فعل",
"parataxis": "عبارت موازی",
"punct": "علائم نگارشی",
"root": "ریشه جمله",
"vocative": "حالت ندا",
"xcomp": "متمم باز",
# Penn Treebank Standard for Parsivar
"SBJ": "فاعل",
"OBJ": "مفعول",
"NVE": "فعل‌یار",
"ENC": "فعل‌یار پی‌بستی",
"VPP": "مفعول حرف‌اضافه‌ای",
"OBJ2": "مفعول دوم",
"TAM": "تمییز",
"MOS": "مسند",
"PROG": "مستمرساز",
"ADVC": "متمم قیدی",
"VCL": "بند متممی فعل",
"VPRT": "حرف اضافه فعلی",
"LVP": "جزء همکرد",
"PARCL": "بند وصفی",
"ADV": "قید",
"AJUCL": "بند افزوده فعل",
"PART": "افزوده پرسشی فعل",
"VCONJ": "همپایه فعل",
"NPREMOD": "صفت پیشین اسم",
"NPOSTMOD": "صفت پسین اسم",
"NPP": "حرف اضافه اسم",
"NCL": "بند اسم",
"NE": "اسم‌یار",
"MESU": "ممیز",
"NPRT": "جزء اسمی",
"MOZ": "مضاف الیه",
"APP": "بدل",
"NCONJ": "همپایه اسم",
"NADV": "قید اسم",
"COMPPP": "حرف اضافه تفضیلی",
"ADJADV": "قید صفت",
"ACL": "متمم بندی صفت",
"AJPP": "متمم حرف اضافه‌ای صفت",
"NEZ": "متمم نشانه اضافه‌ای صفت",
"AJCONJ": "همپایه صفت",
"APREMOD": "وابسته پیشین صفت",
"APOSTMOD": "وابسته پسین صفت",
"PREDEP": "وابسته پیشین",
"POSDEP": "وابسته پسین",
"PCONJ": "همپایه حرف اضافه",
"AVCONJ": "همپایه قید",
"PRD": "گزاره",
"ROOT": "ریشه جمله",
"PUNC": "علامت نگارشی"
}
def json_to_conllu(json_data):
    """
    Convert a parsed sentence (list of token dictionaries) to CoNLL-U format.

    Args:
        json_data (list): Tokens of one sentence as dictionaries.

    Returns:
        str: CoNLL-U formatted string.
    """
    conllu_output = []

    # Reconstruct the sentence text by concatenating the token texts
    sentence_text = " ".join(token.get("text", "_") for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")

    # Add tokens in CoNLL-U format
    for token in json_data:
        id_ = token.get("id", "_")
        form = token.get("text", "_")
        lemma = token.get("lemma") or "_"
        upos = token.get("upos", "_")
        xpos = token.get("xpos", "_")
        feats = token.get("feats") or "_"
        head = token.get("head", "_")
        deprel = token.get("deprel", "_")
        deps = "_"
        misc = "_"
        conllu_output.append(
            f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
        )

    # Add a blank line after the sentence
    conllu_output.append("")

    # Join all lines into a single string
    return "\n".join(conllu_output)


# Parse CoNLL-U data into a format that spaCy can use
def conllu_to_spacy(conllu_data):
    lines = conllu_data.strip().split("\n")
    words = []
    for line in lines:
        if line.startswith("#"):  # Skip comment lines
            continue
        parts = line.split("\t")
        if len(parts) != 10:  # Skip invalid lines
            continue
        index, word, _, _, _, _, head, dep, _, _ = parts
        words.append((int(index), word, int(head), dep))
    return words


# Create a Doc object in spaCy
def create_spacy_doc(conllu_data, model):
    words = conllu_to_spacy(conllu_data)
    num = len(words)

    # Re-index tokens in reverse order (and translate the dependency labels)
    # so the right-to-left Persian sentence renders correctly in displaCy.
    for index, word, head, dep in words:
        dep_name = ud_tags_translation.get(dep, dep)
        words[index - 1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
    words = list(reversed(words))

    # Create a doc object
    doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])

    # Add syntactic dependencies (the root token points to itself)
    for word, (index, _, head, dep) in zip(doc, words):
        word.head = doc[head - 1] if head > 0 else doc[index - 1]
        word.dep_ = dep
    return doc


# Load the spaCy model (use a blank model since the language is Persian and we don't need any pre-trained components)
model = spacy.blank("xx")  # Use 'xx' for a multilingual model


def plot_dependency_graph(graph_data):
    result = '<div>\n'
    for sent_graph in graph_data:
        conllu_data = json_to_conllu(sent_graph)

        # Create a doc from CoNLL-U data
        doc = create_spacy_doc(conllu_data, model)

        # Visualize the dependency tree
        result += displacy.render(doc, style="dep")
    result += '\n</div>'
    return result


if __name__ == "__main__":
    test = [
        [
            {
                "id": 1,
                "text": "هوا",
                "lemma": "هوا",
                "upos": "NOUN",
                "xpos": "NOUN",
                "feats": None,
                "head": 2,
                "deprel": "nsubj",
                "deps": None,
                "misc": None
            },
            {
                "id": 2,
                "text": "خوب",
                "lemma": "خوب",
                "upos": "ADJ",
                "xpos": "ADJ",
                "feats": None,
                "head": 0,
                "deprel": "root",
                "deps": None,
                "misc": None
            },
            {
                "id": 3,
                "text": "است",
                "lemma": "است",
                "upos": "VERB",
                "xpos": "VERB",
                "feats": None,
                "head": 2,
                "deprel": "cop",
                "deps": None,
                "misc": None
            }
        ]
    ]
    print(plot_dependency_graph(test))

docker-compose.yml Normal file

@@ -0,0 +1,12 @@
services:
  hazm_api:
    image: hazm_api
    container_name: hazm_container
    ports:
      - "5110:5110"
    volumes:
      - models:/MODELS/
volumes:
  models:
    external: true
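
The compose file mounts an external Docker volume named models at /MODELS, and hazm_api.py expects the hazm models inside it under hazm/ (pos_tagger.model, chunker.model, and the universal_dependency_parser directory). A small sanity check along the following lines could confirm the mount before serving requests; it is a hypothetical helper, not part of this commit:

# Hypothetical startup check for the mounted model volume
import os
import sys

REQUIRED = [
    "hazm/pos_tagger.model",
    "hazm/chunker.model",
    "hazm/universal_dependency_parser",
]

missing = [p for p in REQUIRED if not os.path.exists(os.path.join("/MODELS", p))]
if missing:
    sys.exit(f"Missing hazm model files under /MODELS: {missing}")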

requirements.txt Normal file

@@ -0,0 +1,5 @@
spacy
fastapi
pydantic
hazm
uvicorn