diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+.env
+__pycache__/
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.env
+__pycache__/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1ce7c80
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.10-slim
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 5110
+CMD ["python", "app/main.py"]
\ No newline at end of file
diff --git a/app/hazm_api.py b/app/hazm_api.py
new file mode 100644
index 0000000..4d03e5f
--- /dev/null
+++ b/app/hazm_api.py
@@ -0,0 +1,83 @@
+from fastapi import APIRouter
+from pydantic import BaseModel
+from hazm import WordTokenizer, POSTagger, Lemmatizer, DependencyParser, Chunker, tree2brackets
+import warnings
+import os
+from typing import Optional
+warnings.filterwarnings("ignore", category=UserWarning, module="nltk")
+from fastapi.responses import HTMLResponse
+
+import plot_dependency_graph
+
+router = APIRouter()
+
+class TextData(BaseModel):
+    text: str
+
+class Token(BaseModel):
+    id: int
+    text: str
+    lemma: str
+    upos: str
+    xpos: str
+    feats: Optional[str] = None
+    head: Optional[int] = None  # None when the parser emits "_" for the head column
+    deprel: str
+    deps: Optional[str] = None
+    misc: Optional[str] = None
+
+path_base_model = '/MODELS'
+
+# Load all models once at import time; per-request loading would be prohibitively slow.
+word_tokenizer = WordTokenizer()
+postagger = POSTagger(model=os.path.join(path_base_model, 'hazm/pos_tagger.model'))
+lemmatizer = Lemmatizer()
+parser = DependencyParser(tagger=postagger, lemmatizer=lemmatizer, working_dir=os.path.join(path_base_model, 'hazm/universal_dependency_parser'))
+chunker = Chunker(model=os.path.join(path_base_model, 'hazm/chunker.model'))
+
+def dependency_parse(data):
+    """Tokenize data.text, run the dependency parser and return one list
+    of per-token CoNLL-field dicts per sentence (wrapped in an outer list)."""
+    tokens = word_tokenizer.tokenize(data.text)
+    dependency_graph = parser.parse(tokens)
+    conll = dependency_graph.to_conll(10)
+
+    # Convert the 10-column CoNLL output into a list of dicts
+    parsed_lines = conll.splitlines()
+    parsed_dict = []
+    for line in parsed_lines:
+        if line.strip():  # skip blank separator lines
+            cols = line.split('\t')
+            token_dict = {
+                "id": int(cols[0]),
+                "text": cols[1],
+                "lemma": cols[2],
+                "upos": cols[3],
+                "xpos": cols[4],
+                "feats": cols[5] if cols[5] != "_" else None,
+                "head": int(cols[6]) if cols[6] != "_" else None,
+                "deprel": cols[7],
+                "deps": cols[8] if cols[8] != "_" else None,
+                "misc": cols[9] if cols[9] != "_" else None,
+            }
+            parsed_dict.append(token_dict)
+
+    return [parsed_dict]
+
+@router.post("/dep", tags=['Hazm'])
+async def parse_dependencies(data: TextData) -> list[list[Token]]:
+    return dependency_parse(data)
+
+
+@router.post('/plot_dep', tags=['Hazm'], response_class=HTMLResponse)
+async def plot_dependency_tree(data: TextData):
+    return plot_dependency_graph.plot_dependency_graph(dependency_parse(data))
+
+
+@router.post("/chunks", tags=['Hazm'])
+async def chunks(data: TextData):
+    tokens = word_tokenizer.tokenize(data.text)
+    pos_tags = postagger.tag(tokens)
+    result = chunker.parse(pos_tags)
+    brackets = tree2brackets(result)
+    return {"chunk_tree": brackets}
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
new file mode 100644
index 0000000..074769e
--- /dev/null
+++ b/app/main.py
@@ -0,0 +1,16 @@
+import uvicorn
+from fastapi import FastAPI
+import hazm_api
+
+
+app = FastAPI()
+app.include_router(hazm_api.router, prefix="/hazm")
+
+
+@app.get("/")
+async def read_root():
+    return "Hazm is OK!"
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=5110)
\ No newline at end of file
diff --git a/app/plot_dependency_graph.py b/app/plot_dependency_graph.py
new file mode 100644
index 0000000..7520798
--- /dev/null
+++ b/app/plot_dependency_graph.py
@@ -0,0 +1,225 @@
+import spacy
+from spacy import displacy
+
+ud_tags_translation = {
+    # UD Standard tags for Hazm and Dadmatools
+    "acl": "بند موصولی",
+    "advcl": "بند قیدی",
+    "advmod": "وابسته قیدی",
+    "amod": "وابسته وصفی",
+    "appos": "متمم تشریحی",
+    "aux": "فعل کمکی",
+    "case": "نشانه یا حرف اضافه",
+    "cc": "حرف ربط همپایه‌ساز",
+    "ccomp": "مکمل فعل",
+    "compound": "ترکیب کلمات",
+    "compound:lvc": "فعل‌یار",
+    "conj": "همپایه",
+    "cop": "فعل ربطی",
+    "csubj": "فاعل بند",
+    "dep": "وابسته نامشخص",
+    "det": "معرف",
+    "fixed": "ترکیب ثابت",
+    "flat:name": "نام",
+    "flat:num": "عدد",
+    "goeswith": "جزء ناقص",
+    "iobj": "مفعول غیرمستقیم",
+    "mark": "نشانه بند",
+    "nmod": "متمم اسمی",
+    "nmod:poss": "مالکیت",
+    "nsubj": "فاعل اسمی",
+    "nsubj:pass": "فاعل جمله مجهول",
+    "nummod": "وابسته عددی",
+    "obj": "مفعول مستقیم",
+    "obl": "متمم قیدی",
+    "obl:arg": "وابسته قیدی مرتبط با فعل",
+    "parataxis": "عبارت موازی",
+    "punct": "علائم نگارشی",
+    "root": "ریشه جمله",
+    "vocative": "حالت ندا",
+    "xcomp": "متمم باز",
+
+    # Penn Treebank Standard for Parsivar
+    "SBJ": "فاعل",
+    "OBJ": "مفعول",
+    "NVE": "فعل‌یار",
+    "ENC": "فعل‌یار پی‌بستی",
+    "VPP": "مفعول حرف‌اضافه‌ای",
+    "OBJ2": "مفعول دوم",
+    "TAM": "تمییز",
+    "MOS": "مسند",
+    "PROG": "مستمرساز",
+    "ADVC": "متمم قیدی",
+    "VCL": "بند متممی فعل",
+    "VPRT": "حرف اضافه فعلی",
+    "LVP": "جزء همکرد",
+    "PARCL": "بند وصفی",
+    "ADV": "قید",
+    "AJUCL": "بند افزوده فعل",
+    "PART": "افزوده پرسشی فعل",
+    "VCONJ": "همپایه فعل",
+    "NPREMOD": "صفت پیشین اسم",
+    "NPOSTMOD": "صفت پسین اسم",
+    "NPP": "حرف اضافه اسم",
+    "NCL": "بند اسم",
+    "NE": "اسم‌یار",
+    "MESU": "ممیز",
+    "NPRT": "جزء اسمی",
+    "MOZ": "مضاف الیه",
+    "APP": "بدل",
+    "NCONJ": "همپایه اسم",
+    "NADV": "قید اسم",
+    "COMPPP": "حرف اضافه تفضیلی",
+    "ADJADV": "قید صفت",
+    "ACL": "متمم بندی صفت",
+    "AJPP": "متمم حرف اضافه‌ای صفت",
+    "NEZ": "متمم نشانه اضافه‌ای صفت",
+    "AJCONJ": "همپایه صفت",
+    "APREMOD": "وابسته پیشین صفت",
+    "APOSTMOD": "وابسته پسین صفت",
+    "PREDEP": "وابسته پیشین",
+    "POSDEP": "وابسته پسین",
+    "PCONJ": "همپایه حرف اضافه",
+    "AVCONJ": "همپایه قید",
+    "PRD": "گزاره",
+    "ROOT": "ریشه جمله",
+    "PUNC": "علامت نگارشی"
+}
+
+
+def json_to_conllu(json_data):
+    """
+    Convert a parsed sentence (list of token dicts) to CoNLL-U format.
+    Args:
+        json_data (list[dict]): one sentence as a list of token dicts.
+
+    Returns:
+        str: CoNLL-U formatted string.
+    """
+    conllu_output = []
+
+    # Reconstruct the sentence text by concatenating token texts
+    sentence_text = " ".join(token.get("text", "_") for token in json_data)
+    conllu_output.append(f"# text = {sentence_text}")
+
+    # Add tokens in CoNLL-U format
+    for token in json_data:
+        id_ = token.get("id", "_")
+        form = token.get("text", "_")
+        lemma = token.get("lemma") or "_"  # use the provided lemma; "_" when missing/None
+        upos = token.get("upos", "_")
+        xpos = token.get("xpos", "_")
+        feats = token.get("feats") or "_"  # None must render as "_", not the string "None"
+        head = "_" if token.get("head") is None else token.get("head")  # head may be 0 (root), so no `or`
+        deprel = token.get("deprel", "_")
+        deps = "_"
+        misc = "_"
+
+        conllu_output.append(
+            f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
+        )
+
+    # Add a blank line after each sentence
+    conllu_output.append("")
+
+    # Join all lines into a single string
+    return "\n".join(conllu_output)
+
+
+# Parse CoNLL-U data into (index, word, head, deprel) tuples for spaCy
+def conllu_to_spacy(conllu_data):
+    lines = conllu_data.strip().split("\n")
+    words = []
+    for line in lines:
+        if line.startswith("#"):  # Skip comment lines
+            continue
+        parts = line.split("\t")
+        if len(parts) != 10:  # Skip invalid lines
+            continue
+        index, word, _, _, _, _, head, dep, _, _ = parts
+        index = int(index)
+        head = int(head)
+        words.append((index, word, head, dep))
+    return words
+
+
+# Create a Doc object in spaCy
+def create_spacy_doc(conllu_data, model):
+    words = conllu_to_spacy(conllu_data)
+    num = len(words)
+    # Mirror indices and reverse token order — presumably for right-to-left (Persian) rendering
+    for index, word, head, dep in words:
+        dep_name = ud_tags_translation.get(dep, dep)
+        words[index-1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
+    words = list(reversed(words))
+
+    # Create a doc object
+    doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])
+    # Add syntactic dependencies
+    for word, (index, _, head, dep) in zip(doc, words):
+        word.head = doc[head - 1] if head > 0 else doc[index - 1]
+        word.dep_ = dep
+    return doc
+
+
+# Load the spacy model (use a blank model since the language is Persian and we don't need any pre-trained components)
+model = spacy.blank("xx")  # Use 'xx' for a multilingual model
+
+
+def plot_dependency_graph(graph_data):
+    # NOTE(review): the original wrapper markup was lost to tag-stripping; a minimal
+    # HTML shell around displacy's SVG output is reconstructed here — confirm intent.
+    result = '<html>\n'
+    for sent_graph in graph_data:
+
+        conllu_data = json_to_conllu(sent_graph)
+
+        # Create a doc from CoNLL-U data
+        doc = create_spacy_doc(conllu_data, model)
+
+        # Visualize the dependency tree
+        result += displacy.render(doc, style="dep")
+
+    result += '\n</html>'
+    return result
+
+
+if __name__ == "__main__":
+    test = [
+        [
+            {
+                "id": 1,
+                "text": "هوا",
+                "lemma": "هوا",
+                "upos": "NOUN",
+                "xpos": "NOUN",
+                "feats": None,
+                "head": 2,
+                "deprel": "nsubj",
+                "deps": None,
+                "misc": None
+            },
+            {
+                "id": 2,
+                "text": "خوب",
+                "lemma": "خوب",
+                "upos": "ADJ",
+                "xpos": "ADJ",
+                "feats": None,
+                "head": 0,
+                "deprel": "root",
+                "deps": None,
+                "misc": None
+            },
+            {
+                "id": 3,
+                "text": "است",
+                "lemma": "است",
+                "upos": "VERB",
+                "xpos": "VERB",
+                "feats": None,
+                "head": 2,
+                "deprel": "cop",
+                "deps": None,
+                "misc": None
+            }
+        ]
+    ]
+    print(plot_dependency_graph(test))
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..4c466e5
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,12 @@
+services:
+  hazm_api:
+    image: hazm_api
+    container_name: hazm_container
+    ports:
+      - "5110:5110"
+    volumes:
+      - models:/MODELS/
+
+volumes:
+  models:
+    external: true
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..fa8a8cb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+spacy
+fastapi
+pydantic
+hazm
+uvicorn
\ No newline at end of file