import spacy
from spacy import displacy

ud_tags_translation = {
    # UD standard tags for Hazm and Dadmatools
    "acl": "بند موصولی",
    "advcl": "بند قیدی",
    "advmod": "وابسته قیدی",
    "amod": "وابسته وصفی",
    "appos": "متمم تشریحی",
    "aux": "فعل کمکی",
    "case": "نشانه یا حرف اضافه",
    "cc": "حرف ربط همپایه‌ساز",
    "ccomp": "مکمل فعل",
    "compound": "ترکیب کلمات",
    "compound:lvc": "فعل‌یار",
    "conj": "همپایه",
    "cop": "فعل ربطی",
    "csubj": "فاعل بند",
    "dep": "وابسته نامشخص",
    "det": "معرف",
    "fixed": "ترکیب ثابت",
    "flat:name": "نام",
    "flat:num": "عدد",
    "goeswith": "جزء ناقص",
    "iobj": "مفعول غیرمستقیم",
    "mark": "نشانه بند",
    "nmod": "متمم اسمی",
    "nmod:poss": "مالکیت",
    "nsubj": "فاعل اسمی",
    "nsubj:pass": "فاعل جمله مجهول",
    "nummod": "وابسته عددی",
    "obj": "مفعول مستقیم",
    "obl": "متمم قیدی",
    "obl:arg": "وابسته قیدی مرتبط با فعل",
    "parataxis": "عبارت موازی",
    "punct": "علائم نگارشی",
    "root": "ریشه جمله",
    "vocative": "حالت ندا",
    "xcomp": "متمم باز",

    # Penn Treebank standard tags for Parsivar
    "SBJ": "فاعل",
    "OBJ": "مفعول",
    "NVE": "فعل‌یار",
    "ENC": "فعل‌یار پی‌بستی",
    "VPP": "مفعول حرف‌اضافه‌ای",
    "OBJ2": "مفعول دوم",
    "TAM": "تمییز",
    "MOS": "مسند",
    "PROG": "مستمرساز",
    "ADVC": "متمم قیدی",
    "VCL": "بند متممی فعل",
    "VPRT": "حرف اضافه فعلی",
    "LVP": "جزء همکرد",
    "PARCL": "بند وصفی",
    "ADV": "قید",
    "AJUCL": "بند افزوده فعل",
    "PART": "افزوده پرسشی فعل",
    "VCONJ": "همپایه فعل",
    "NPREMOD": "صفت پیشین اسم",
    "NPOSTMOD": "صفت پسین اسم",
    "NPP": "حرف اضافه اسم",
    "NCL": "بند اسم",
    "NE": "اسم‌یار",
    "MESU": "ممیز",
    "NPRT": "جزء اسمی",
    "MOZ": "مضاف‌الیه",
    "APP": "بدل",
    "NCONJ": "همپایه اسم",
    "NADV": "قید اسم",
    "COMPPP": "حرف اضافه تفضیلی",
    "ADJADV": "قید صفت",
    "ACL": "متمم بندی صفت",
    "AJPP": "متمم حرف اضافه‌ای صفت",
    "NEZ": "متمم نشانه اضافه‌ای صفت",
    "AJCONJ": "همپایه صفت",
    "APREMOD": "وابسته پیشین صفت",
    "APOSTMOD": "وابسته پسین صفت",
    "PREDEP": "وابسته پیشین",
    "POSDEP": "وابسته پسین",
    "PCONJ": "همپایه حرف اضافه",
    "AVCONJ": "همپایه قید",
    "PRD": "گزاره",
    "ROOT": "ریشه جمله",
    "PUNC": "علامت نگارشی",
}

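# How the table above is consumed (an illustrative lookup, not part of the
# pipeline itself): create_spacy_doc below falls back to the raw tag whenever
# a label has no Persian translation:
#
#     >>> ud_tags_translation.get("nsubj", "nsubj")
#     'فاعل اسمی'
#     >>> ud_tags_translation.get("UNKNOWN", "UNKNOWN")
#     'UNKNOWN'
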
def json_to_conllu(json_data):
    """
    Convert a sentence in the given JSON format to CoNLL-U format.

    Args:
        json_data (list): Tokens of one sentence, each a dictionary.

    Returns:
        str: CoNLL-U formatted string.
    """
    conllu_output = []

    # Reconstruct the sentence text by joining the token forms
    sentence_text = " ".join(token.get("text", "_") for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")

    # Add tokens in the ten-column CoNLL-U format; the "or" guards map both
    # missing and None fields to the CoNLL-U placeholder "_"
    for token in json_data:
        id_ = token.get("id", "_")
        form = token.get("text", "_")
        lemma = token.get("lemma") or "_"
        upos = token.get("upos") or "_"
        xpos = token.get("xpos") or "_"
        feats = token.get("feats") or "_"
        head = token.get("head", "_")  # head may legitimately be 0 (root), so no "or" here
        deprel = token.get("deprel") or "_"
        deps = "_"
        misc = "_"

        conllu_output.append(
            f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
        )

    # Add a blank line after each sentence
    conllu_output.append("")

    # Join all lines into a single string
    return "\n".join(conllu_output)

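# A quick sanity check for json_to_conllu (illustrative sketch; the token dict
# is a trimmed-down version of the test data at the bottom of this file):
#
#     >>> tokens = [{"id": 1, "text": "هوا", "upos": "NOUN", "xpos": "NOUN",
#     ...            "feats": None, "head": 0, "deprel": "root"}]
#     >>> print(json_to_conllu(tokens))
#     # text = هوا
#     1	هوا	_	NOUN	NOUN	_	0	root	_	_
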
# Parse CoNLL-U data into (index, form, head, deprel) tuples that spaCy can use
def conllu_to_spacy(conllu_data):
    lines = conllu_data.strip().split("\n")
    words = []
    for line in lines:
        if line.startswith("#"):  # Skip comment lines
            continue
        parts = line.split("\t")
        if len(parts) != 10:  # Skip malformed lines
            continue
        index, word, _, _, _, _, head, dep, _, _ = parts
        words.append((int(index), word, int(head), dep))
    return words

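# Example of the intermediate representation (built from the CoNLL-U line in
# the json_to_conllu sketch above): each token becomes an
# (index, form, head, deprel) tuple, with head 0 marking the root:
#
#     >>> conllu_to_spacy("# text = هوا\n1\tهوا\t_\tNOUN\tNOUN\t_\t0\troot\t_\t_")
#     [(1, 'هوا', 0, 'root')]
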
# Create a spaCy Doc object from the CoNLL-U data. The token order is reversed
# because displacy draws arcs left to right while Persian reads right to left;
# mirroring the positions (and the head indices with them) makes the rendered
# tree follow the natural reading order.
def create_spacy_doc(conllu_data, model):
    words = conllu_to_spacy(conllu_data)
    num = len(words)
    # Move token i to mirrored position num - i + 1, remap its head the same
    # way, and translate the dependency label (falling back to the raw tag)
    for index, word, head, dep in words:
        dep_name = ud_tags_translation.get(dep, dep)
        words[index - 1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
    words = list(reversed(words))

    # Create a doc object from the reversed token forms
    doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])

    # Add syntactic dependencies; a token whose head is 0 points to itself,
    # which is how spaCy encodes the sentence root
    for word, (index, _, head, dep) in zip(doc, words):
        word.head = doc[head - 1] if head > 0 else doc[index - 1]
        word.dep_ = dep
    return doc

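# Note on the reversal above: for the test sentence "هوا خوب است" the returned
# Doc holds the tokens as ["است", "خوب", "هوا"], so displacy's left-to-right
# arcs line up with Persian right-to-left reading order.
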
# Load a blank spaCy model: the language is Persian and we don't need any
# pre-trained components, only the vocab for building Doc objects
model = spacy.blank("xx")  # 'xx' is spaCy's multilingual fallback

def plot_dependency_graph(graph_data):
    """Render each sentence's dependency tree and wrap the SVGs in a <div>."""
    result = '<div>\n'
    for sent_graph in graph_data:
        conllu_data = json_to_conllu(sent_graph)

        # Create a doc from the CoNLL-U data
        doc = create_spacy_doc(conllu_data, model)

        # Visualize the dependency tree as inline SVG markup
        result += displacy.render(doc, style="dep")

    result += '\n</div>'
    return result

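# Illustrative usage (parsed_sentences and the file name are placeholders):
# the returned string is an HTML fragment, so it can be written to a file and
# opened in a browser:
#
#     html = plot_dependency_graph(parsed_sentences)
#     with open("trees.html", "w", encoding="utf-8") as f:
#         f.write(html)
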
if __name__ == "__main__":
|
|
test = [
|
|
[
|
|
{
|
|
"id": 1,
|
|
"text": "هوا",
|
|
"lemma": "هوا",
|
|
"upos": "NOUN",
|
|
"xpos": "NOUN",
|
|
"feats": None,
|
|
"head": 2,
|
|
"deprel": "nsubj",
|
|
"deps": None,
|
|
"misc": None
|
|
},
|
|
{
|
|
"id": 2,
|
|
"text": "خوب",
|
|
"lemma": "خوب",
|
|
"upos": "ADJ",
|
|
"xpos": "ADJ",
|
|
"feats": None,
|
|
"head": 0,
|
|
"deprel": "root",
|
|
"deps": None,
|
|
"misc": None
|
|
},
|
|
{
|
|
"id": 3,
|
|
"text": "است",
|
|
"lemma": "است",
|
|
"upos": "VERB",
|
|
"xpos": "VERB",
|
|
"feats": None,
|
|
"head": 2,
|
|
"deprel": "cop",
|
|
"deps": None,
|
|
"misc": None
|
|
}
|
|
]
|
|
]
|
|
print(plot_dependency_graph(test)) |