From 96bf9b7290bc88cfd815b4b3396735eb97454ae4 Mon Sep 17 00:00:00 2001
From: HamidRezaMontazer
Date: Sun, 2 Feb 2025 14:04:55 +0000
Subject: [PATCH] add hazm docker files

---
 .dockerignore                                  |   1 +
 .gitignore                                     |   1 +
 Dockerfile                                     |   9 +
 app/__pycache__/hazm_api.cpython-310.pyc       | Bin 0 -> 2868 bytes
 .../plot_dependency_graph.cpython-310.pyc      | Bin 0 -> 5946 bytes
 app/hazm_api.py                                |  82 +++++++
 app/main.py                                    |  16 ++
 app/plot_dependency_graph.py                   | 225 ++++++++++++++++++
 docker-compose.yml                             |  12 +
 requirements.txt                               |   5 +
 10 files changed, 351 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 app/__pycache__/hazm_api.cpython-310.pyc
 create mode 100644 app/__pycache__/plot_dependency_graph.cpython-310.pyc
 create mode 100644 app/hazm_api.py
 create mode 100644 app/main.py
 create mode 100644 app/plot_dependency_graph.py
 create mode 100644 docker-compose.yml
 create mode 100644 requirements.txt

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1ce7c80
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3.10-slim
+WORKDIR /app
+
+COPY . .
+
+RUN pip install -r requirements.txt
+
+EXPOSE 5110
+CMD ["python", "app/main.py"]
\ No newline at end of file
diff --git a/app/__pycache__/hazm_api.cpython-310.pyc b/app/__pycache__/hazm_api.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0c74339952e5b9bd72f030ec8108f0c546309ea
GIT binary patch
literal 2868
[base85 binary payload omitted]

literal 0
HcmV?d00001

diff --git a/app/__pycache__/plot_dependency_graph.cpython-310.pyc b/app/__pycache__/plot_dependency_graph.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff91cb1e1d88c3864bb044ca41845f1a0833abec
GIT binary patch
literal 5946
[base85 binary payload omitted]

literal 0
HcmV?d00001

diff --git a/app/hazm_api.py b/app/hazm_api.py
new file mode 100644
index 0000000..4d03e5f
--- /dev/null
+++ b/app/hazm_api.py
@@ -0,0 +1,82 @@
+from fastapi import APIRouter
+from pydantic import BaseModel
+from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
+import warnings
+import os
+from typing import Optional
+warnings.filterwarnings("ignore", category=UserWarning, module="nltk")
+from fastapi.responses import HTMLResponse
+
+import plot_dependency_graph
+
+router = APIRouter()
+
+class TextData(BaseModel):
+    text: str
+
+class Token(BaseModel):
+    id: int
+    text: str
+    lemma: str
+    upos: str
+    xpos: str
+    feats: Optional[str] = None
+    head: int
+    deprel: str
+    deps: Optional[str] = None
+    misc: Optional[str] = None
+
+path_base_model = '/MODELS'
+
+word_tokenizer = WordTokenizer()
+postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
+lemmatizer = Lemmatizer()
+parser = DependencyParser(tagger=postagger, lemmatizer=lemmatizer, working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'))
+
+def dependency_parse(data):
+    tokens = word_tokenizer.tokenize(data.text)
+    dependency_graph = parser.parse(tokens)
+    conll = dependency_graph.to_conll(10)
+
+    # Convert the CoNLL output into a list of token dictionaries
+    parsed_lines = conll.splitlines()
+    parsed_dict = []
+    for line in parsed_lines:
+        if line.strip():  # if the line is not empty
+            cols = line.split('\t')
+            token_dict = {
+                "id": int(cols[0]),
+                "text": cols[1],
+                "lemma": cols[2],
+                "upos": cols[3],
+                "xpos": cols[4],
+                "feats": cols[5] if cols[5] != "_" else None,
+                "head": int(cols[6]) if cols[6] != "_" else None,
+                "deprel": cols[7],
+                "deps": cols[8] if cols[8] != "_" else None,
+                "misc": cols[9] if cols[9] != "_" else None,
+            }
+            parsed_dict.append(token_dict)
+
+    return [parsed_dict]
+
+@router.post("/dep", tags=['Hazm'])
+async def parse_dependencies(data: TextData) -> list[list[Token]]:
+    return dependency_parse(data)
+
+
+@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
+async def plot_dependency_tree(data: TextData):
+    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))
+
+
+@router.post("/chunks", tags=['Hazm'])
+async def chunks(data: TextData):
+    word_tokenizer = WordTokenizer()
+    postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
+    chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
+    tokens = word_tokenizer.tokenize(data.text)
+    pos_tags = postagger.tag(tokens)
+    result = chunker.parse(pos_tags)
+    brackets = tree2brackets(result)
+    return {"chunk_tree": brackets}
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..074769e
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,16 @@
+import uvicorn
+from fastapi import FastAPI
+import hazm_api
+
+
+app = FastAPI()
+app.include_router(hazm_api.router, prefix="/hazm")
+
+
+@app.get("/")
+async def read_root():
+    return "Hazm is OK!"
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=5110)
\ No newline at end of file
diff --git a/app/plot_dependency_graph.py b/app/plot_dependency_graph.py
new file mode 100644
index 0000000..7520798
--- /dev/null
+++ b/app/plot_dependency_graph.py
@@ -0,0 +1,225 @@
+import spacy
+from spacy import displacy
+
+ud_tags_translation = {
+    # UD Standard tags for Hazm and Dadmatools
+    "acl": "بند موصولی",
+    "advcl": "بند قیدی",
+    "advmod": "وابسته قیدی",
+    "amod": "وابسته وصفی",
+    "appos": "متمم تشریحی",
+    "aux": "فعل کمکی",
+    "case": "نشانه یا حرف اضافه",
+    "cc": "حرف ربط همپایه‌ساز",
+    "ccomp": "مکمل فعل",
+    "compound": "ترکیب کلمات",
+    "compound:lvc": "فعل‌یار",
+    "conj": "همپایه",
+    "cop": "فعل ربطی",
+    "csubj": "فاعل بند",
+    "dep": "وابسته نامشخص",
+    "det": "معرف",
+    "fixed": "ترکیب ثابت",
+    "flat:name": "نام",
+    "flat:num": "عدد",
+    "goeswith": "جزء ناقص",
+    "iobj": "مفعول غیرمستقیم",
+    "mark": "نشانه بند",
+    "nmod": "متمم اسمی",
+    "nmod:poss": 'مالکیت',
+    "nsubj": "فاعل اسمی",
+    "nsubj:pass": "فاعل جمله مجهول",
+    "nummod": "وابسته عددی",
+    "obj": "مفعول مستقیم",
+    "obl": "متمم قیدی",
+    "obl:arg": "وابسته قیدی مرتبط با فعل",
+    "parataxis": "عبارت موازی",
+    "punct": "علائم نگارشی",
+    "root": "ریشه جمله",
+    "vocative": "حالت ندا",
+    "xcomp": "متمم باز",
+
+    # Penn Treebank Standard for Parsivar
+    "SBJ": "فاعل",
+    "OBJ": "مفعول",
+    "NVE": "فعل‌یار",
+    "ENC": "فعل‌یار پی‌بستی",
+    "VPP": "مفعول حرف‌اضافه‌ای",
+    "OBJ2": "مفعول دوم",
+    "TAM": "تمییز",
+    "MOS": "مسند",
+    "PROG": "مستمرساز",
+    "ADVC": "متمم قیدی",
+    "VCL": "بند متممی فعل",
+    "VPRT": "حرف اضافه فعلی",
+    "LVP": "جزء همکرد",
+    "PARCL": "بند وصفی",
+    "ADV": "قید",
+    "AJUCL": "بند افزوده فعل",
+    "PART": "افزوده پرسشی فعل",
+    "VCONJ": "همپایه فعل",
+    "NPREMOD": "صفت پیشین اسم",
+    "NPOSTMOD": "صفت پسین اسم",
+    "NPP": "حرف اضافه اسم",
+    "NCL": "بند اسم",
+    "NE": "اسم‌یار",
+    "MESU": "ممیز",
+    "NPRT": "جزء اسمی",
+    "MOZ": "مضاف الیه",
+    "APP": "بدل",
+    "NCONJ": "همپایه اسم",
+    "NADV": "قید اسم",
+    "COMPPP": "حرف اضافه تفضیلی",
+    "ADJADV": "قید صفت",
+    "ACL": "متمم بندی صفت",
+    "AJPP": "متمم حرف اضافه‌ای صفت",
+    "NEZ": "متمم نشانه اضافه‌ای صفت",
+    "AJCONJ": "همپایه صفت",
+    "APREMOD": "وابسته پیشین صفت",
+    "APOSTMOD": "وابسته پسین صفت",
+    "PREDEP": "وابسته پیشین",
+    "POSDEP": "وابسته پسین",
+    "PCONJ": "همپایه حرف اضافه",
+    "AVCONJ": "همپایه قید",
+    "PRD": "گزاره",
+    "ROOT": "ریشه جمله",
+    "PUNC": "علامت نگارشی"
+}
+
+
+def json_to_conllu(json_data):
+    """
+    Convert JSON in the given format to CoNLL-U format.
+    Args:
+        json_data (list): list of token dictionaries for one sentence.
+
+    Returns:
+        str: CoNLL-U formatted string.
+ """ + conllu_output = [] + + # If sentence text is available, concatenate token texts + sentence_text = " ".join(token.get("text", "_") for token in json_data) + conllu_output.append(f"# text = {sentence_text}") + + # Add tokens in CoNLL-U format + for token in json_data: + id_ = token.get("id", "_") + form = token.get("text", "_") + lemma = "_" # Lemma not provided + upos = token.get("upos", "_") + xpos = token.get("xpos", "_") + feats = token.get("feats", "_") + head = token.get("head", "_") + deprel = token.get("deprel", "_") + deps = "_" + misc = "_" + + conllu_output.append( + f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}" + ) + + # Add a blank line after each sentence + conllu_output.append("") + + # Join all lines into a single string + return "\n".join(conllu_output) + + +# Parse CoNLL-U data into a format that spaCy can use +def conllu_to_spacy(conllu_data): + lines = conllu_data.strip().split("\n") + words = [] + for line in lines: + if line.startswith("#"): # Skip comment lines + continue + parts = line.split("\t") + if len(parts) != 10: # Skip invalid lines + continue + index, word, _, _, _, _, head, dep, _, _ = parts + index = int(index) + head = int(head) + words.append((index, word, head, dep)) + return words + + +# Create a Doc object in spaCy +def create_spacy_doc(conllu_data, model): + words = conllu_to_spacy(conllu_data) + num = len(words) + for index, word, head, dep in words: + dep_name = ud_tags_translation.get(dep, dep) + words[index-1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name) + words = list(reversed(words)) + + # Create a doc object + doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words]) + # Add syntactic dependencies + for word, (index, _, head, dep) in zip(doc, words): + word.head = doc[head - 1] if head > 0 else doc[index - 1] + word.dep_ = dep + return doc + + +# Load the spacy model (use a blank model since the language is Persian and we don't need any pre-trained components) +model = spacy.blank("xx") # Use 'xx' for a multilingual model + + +def plot_dependency_graph(graph_data): + result = '
\n' + for sent_graph in graph_data: + + conllu_data = json_to_conllu(sent_graph) + + # Create a doc from CoNLL-U data + doc = create_spacy_doc(conllu_data, model) + + # Visualize the dependency tree + result += displacy.render(doc, style="dep") + + result += '\n
+    return result
+
+
+if __name__ == "__main__":
+    test = [
+        [
+            {
+                "id": 1,
+                "text": "هوا",
+                "lemma": "هوا",
+                "upos": "NOUN",
+                "xpos": "NOUN",
+                "feats": None,
+                "head": 2,
+                "deprel": "nsubj",
+                "deps": None,
+                "misc": None
+            },
+            {
+                "id": 2,
+                "text": "خوب",
+                "lemma": "خوب",
+                "upos": "ADJ",
+                "xpos": "ADJ",
+                "feats": None,
+                "head": 0,
+                "deprel": "root",
+                "deps": None,
+                "misc": None
+            },
+            {
+                "id": 3,
+                "text": "است",
+                "lemma": "است",
+                "upos": "VERB",
+                "xpos": "VERB",
+                "feats": None,
+                "head": 2,
+                "deprel": "cop",
+                "deps": None,
+                "misc": None
+            }
+        ]
+    ]
+    print(plot_dependency_graph(test))
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4c466e5
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,12 @@
+services:
+  hazm_api:
+    image: hazm_api
+    container_name: hazm_container
+    ports:
+      - "5110:5110"
+    volumes:
+      - models:/MODELS/
+
+volumes:
+  models:
+    external: true
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fa8a8cb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+spacy
+fastapi
+pydantic
+hazm
+uvicorn
\ No newline at end of file
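
For reference, the sketch below shows one way to exercise the routes this patch adds once the image has been built and the service started with the Dockerfile and docker-compose.yml above (with the external models volume providing the Hazm models under /MODELS). It is an untested client example, not part of the patch: the base URL, the sample sentence, and the choice of the standard-library urllib client are assumptions; only the route paths (/hazm/dep, /hazm/plot_dep, /hazm/chunks), the {"text": ...} request body, and port 5110 come from the code itself.

import json
from urllib import request

BASE_URL = "http://localhost:5110"  # assumed host; port 5110 is the one published in docker-compose.yml

def post_json(path, payload):
    # POST a JSON body to the service and return the raw response bytes.
    req = request.Request(
        BASE_URL + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with request.urlopen(req) as resp:
        return resp.read()

sentence = {"text": "هوا خوب است"}  # sample sentence taken from the module's own test data

# /hazm/dep returns a list of sentences, each a list of token dicts (id, text, lemma, upos, head, deprel, ...).
print(json.loads(post_json("/hazm/dep", sentence)))

# /hazm/chunks returns the chunk tree in bracket notation, e.g. {"chunk_tree": "..."}.
print(json.loads(post_json("/hazm/chunks", sentence)))

# /hazm/plot_dep returns an HTML/SVG rendering of the dependency graph; save it and open it in a browser.
with open("dependency_graph.html", "w", encoding="utf-8") as f:
    f.write(post_json("/hazm/plot_dep", sentence).decode("utf-8"))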