# hazm_docker/app/plot_dependency_graph.py
# Renders Persian dependency parses (Hazm / Dadmatools / Parsivar output)
# as displacy HTML.
import spacy
from spacy import displacy
# Maps dependency-relation labels produced by the parsers to Persian display
# names, used as arc labels in the rendered graph. Two label inventories are
# covered: Universal Dependencies relations (Hazm and Dadmatools) and
# Penn-Treebank-style Persian relations (Parsivar). Lookups elsewhere use
# .get(tag, tag), so unknown labels pass through unchanged.
ud_tags_translation = {
    # UD Standard tags for Hazm and Dadmatools
    "acl": "بند موصولی",
    "advcl": "بند قیدی",
    "advmod": "وابسته قیدی",
    "amod": "وابسته وصفی",
    "appos": "متمم تشریحی",
    "aux": "فعل کمکی",
    "case": "نشانه یا حرف اضافه",
    "cc": "حرف ربط همپایه‌ساز",
    "ccomp": "مکمل فعل",
    "compound": "ترکیب کلمات",
    "compound:lvc": "فعل‌یار",
    "conj": "همپایه",
    "cop": "فعل ربطی",
    "csubj": "فاعل بند",
    "dep": "وابسته نامشخص",
    "det": "معرف",
    "fixed": "ترکیب ثابت",
    "flat:name": "نام",
    "flat:num": "عدد",
    "goeswith": "جزء ناقص",
    "iobj": "مفعول غیرمستقیم",
    "mark": "نشانه بند",
    "nmod": "متمم اسمی",
    "nmod:poss": 'مالکیت',
    "nsubj": "فاعل اسمی",
    "nsubj:pass": "فاعل جمله مجهول",
    "nummod": "وابسته عددی",
    "obj": "مفعول مستقیم",
    "obl": "متمم قیدی",
    "obl:arg": "وابسته قیدی مرتبط با فعل",
    "parataxis": "عبارت موازی",
    "punct": "علائم نگارشی",
    "root": "ریشه جمله",
    "vocative": "حالت ندا",
    "xcomp": "متمم باز",
    # Penn Treebank Standard for Parsivar
    "SBJ": "فاعل",
    "OBJ": "مفعول",
    "NVE": "فعل‌یار",
    "ENC": "فعل‌یار پی‌بستی",
    "VPP": "مفعول حرف‌اضافه‌ای",
    "OBJ2": "مفعول دوم",
    "TAM": "تمییز",
    "MOS": "مسند",
    "PROG": "مستمرساز",
    "ADVC": "متمم قیدی",
    "VCL": "بند متممی فعل",
    "VPRT": "حرف اضافه فعلی",
    "LVP": "جزء همکرد",
    "PARCL": "بند وصفی",
    "ADV": "قید",
    "AJUCL": "بند افزوده فعل",
    "PART": "افزوده پرسشی فعل",
    "VCONJ": "همپایه فعل",
    "NPREMOD": "صفت پیشین اسم",
    "NPOSTMOD": "صفت پسین اسم",
    "NPP": "حرف اضافه اسم",
    "NCL": "بند اسم",
    "NE": "اسم‌یار",
    "MESU": "ممیز",
    "NPRT": "جزء اسمی",
    "MOZ": "مضاف الیه",
    "APP": "بدل",
    "NCONJ": "همپایه اسم",
    "NADV": "قید اسم",
    "COMPPP": "حرف اضافه تفضیلی",
    "ADJADV": "قید صفت",
    "ACL": "متمم بندی صفت",
    "AJPP": "متمم حرف اضافه‌ای صفت",
    "NEZ": "متمم نشانه اضافه‌ای صفت",
    "AJCONJ": "همپایه صفت",
    "APREMOD": "وابسته پیشین صفت",
    "APOSTMOD": "وابسته پسین صفت",
    "PREDEP": "وابسته پیشین",
    "POSDEP": "وابسته پسین",
    "PCONJ": "همپایه حرف اضافه",
    "AVCONJ": "همپایه قید",
    "PRD": "گزاره",
    "ROOT": "ریشه جمله",
    "PUNC": "علامت نگارشی"
}
def json_to_conllu(json_data):
    """Convert one parsed sentence (a list of token dicts) to CoNLL-U format.

    Args:
        json_data (list[dict]): Tokens of a single sentence. Recognized keys:
            "id", "text", "upos", "xpos", "feats", "head", "deprel". Missing
            keys and keys whose value is None both map to the CoNLL-U
            placeholder "_". ("lemma", "deps" and "misc" are always emitted
            as "_", matching the upstream parsers, which do not supply them.)

    Returns:
        str: The sentence in CoNLL-U format, ending with the blank line that
        terminates a CoNLL-U sentence block.
    """
    def _col(value):
        # CoNLL-U uses "_" for absent values. dict.get(key, "_") alone is not
        # enough: the parsers emit keys with an explicit None (e.g.
        # "feats": None), which would otherwise serialize as the string
        # "None". An explicit None check also keeps head == 0 intact.
        return "_" if value is None else value

    conllu_output = []
    # Sentence text metadata line: concatenation of the token surface forms.
    sentence_text = " ".join(str(_col(token.get("text"))) for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")
    # One 10-column tab-separated line per token.
    for token in json_data:
        columns = (
            _col(token.get("id")),
            _col(token.get("text")),
            "_",  # lemma: not carried through
            _col(token.get("upos")),
            _col(token.get("xpos")),
            _col(token.get("feats")),
            _col(token.get("head")),
            _col(token.get("deprel")),
            "_",  # deps: not provided
            "_",  # misc: not provided
        )
        conllu_output.append("\t".join(str(c) for c in columns))
    # Blank line terminates the sentence block.
    conllu_output.append("")
    return "\n".join(conllu_output)
# Parse CoNLL-U data into a format that spaCy can use
# Parse CoNLL-U text into the minimal token tuples needed for rendering.
def conllu_to_spacy(conllu_data):
    """Extract (index, form, head, deprel) tuples from a CoNLL-U string.

    Comment lines (starting with "#") and any row that does not have exactly
    the 10 CoNLL-U columns are silently skipped. Only columns 1 (ID),
    2 (FORM), 7 (HEAD) and 8 (DEPREL) are kept; ID and HEAD are converted
    to int.
    """
    tokens = []
    for row in conllu_data.strip().split("\n"):
        # Metadata lines such as "# text = ..." carry no token.
        if row.startswith("#"):
            continue
        fields = row.split("\t")
        if len(fields) != 10:
            continue
        tokens.append((int(fields[0]), fields[1], int(fields[6]), fields[7]))
    return tokens
# Create a Doc object in spaCy
def create_spacy_doc(conllu_data, model):
words = conllu_to_spacy(conllu_data)
num = len(words)
for index, word, head, dep in words:
dep_name = ud_tags_translation.get(dep, dep)
words[index-1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
words = list(reversed(words))
# Create a doc object
doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])
# Add syntactic dependencies
for word, (index, _, head, dep) in zip(doc, words):
word.head = doc[head - 1] if head > 0 else doc[index - 1]
word.dep_ = dep
return doc
# Load the spacy model (use a blank model since the language is Persian and we don't need any pre-trained components)
model = spacy.blank("xx")  # Use 'xx' for a multilingual model
def plot_dependency_graph(graph_data):
    """Render dependency graphs for a list of parsed sentences as HTML.

    Args:
        graph_data: A list of sentences, each a list of token dicts in the
            format accepted by json_to_conllu.

    Returns:
        str: The displacy SVG markup for every sentence, concatenated inside
        a single wrapping <div>.
    """
    parts = ['<div>\n']
    for sentence in graph_data:
        # JSON -> CoNLL-U -> spaCy Doc -> displacy SVG markup.
        doc = create_spacy_doc(json_to_conllu(sentence), model)
        parts.append(displacy.render(doc, style="dep"))
    parts.append('\n</div>')
    return ''.join(parts)
if __name__ == "__main__":
    # Smoke test: one parsed three-token sentence ("هوا خوب است") in the
    # upstream parsers' JSON format; prints the rendered HTML to stdout.
    test = [
        [
            {
                "id": 1,
                "text": "هوا",
                "lemma": "هوا",
                "upos": "NOUN",
                "xpos": "NOUN",
                "feats": None,
                "head": 2,
                "deprel": "nsubj",
                "deps": None,
                "misc": None
            },
            {
                "id": 2,
                "text": "خوب",
                "lemma": "خوب",
                "upos": "ADJ",
                "xpos": "ADJ",
                "feats": None,
                "head": 0,
                "deprel": "root",
                "deps": None,
                "misc": None
            },
            {
                "id": 3,
                "text": "است",
                "lemma": "است",
                "upos": "VERB",
                "xpos": "VERB",
                "feats": None,
                "head": 2,
                "deprel": "cop",
                "deps": None,
                "misc": None
            }
        ]
    ]
    print(plot_dependency_graph(test))