# hazm_docker/app/plot_dependency_graph.py
# Renders Persian dependency parses (Hazm / Dadmatools / Parsivar output)
# as displacy HTML.
import spacy
from spacy import displacy
# Maps dependency-relation labels produced by the parsers to Persian display
# names, used as arc labels in the rendered graph. Two label inventories are
# covered: Universal Dependencies relations (Hazm and Dadmatools) and
# Penn-Treebank-style Persian relations (Parsivar). Lookups elsewhere use
# .get(tag, tag), so unknown labels pass through unchanged.
ud_tags_translation = {
    # UD Standard tags for Hazm and Dadmatools
    "acl": "بند موصولی",
    "advcl": "بند قیدی",
    "advmod": "وابسته قیدی",
    "amod": "وابسته وصفی",
    "appos": "متمم تشریحی",
    "aux": "فعل کمکی",
    "case": "نشانه یا حرف اضافه",
    "cc": "حرف ربط همپایه‌ساز",
    "ccomp": "مکمل فعل",
    "compound": "ترکیب کلمات",
    "compound:lvc": "فعل‌یار",
    "conj": "همپایه",
    "cop": "فعل ربطی",
    "csubj": "فاعل بند",
    "dep": "وابسته نامشخص",
    "det": "معرف",
    "fixed": "ترکیب ثابت",
    "flat:name": "نام",
    "flat:num": "عدد",
    "goeswith": "جزء ناقص",
    "iobj": "مفعول غیرمستقیم",
    "mark": "نشانه بند",
    "nmod": "متمم اسمی",
    "nmod:poss": 'مالکیت',
    "nsubj": "فاعل اسمی",
    "nsubj:pass": "فاعل جمله مجهول",
    "nummod": "وابسته عددی",
    "obj": "مفعول مستقیم",
    "obl": "متمم قیدی",
    "obl:arg": "وابسته قیدی مرتبط با فعل",
    "parataxis": "عبارت موازی",
    "punct": "علائم نگارشی",
    "root": "ریشه جمله",
    "vocative": "حالت ندا",
    "xcomp": "متمم باز",
    # Penn Treebank Standard for Parsivar
    "SBJ": "فاعل",
    "OBJ": "مفعول",
    "NVE": "فعل‌یار",
    "ENC": "فعل‌یار پی‌بستی",
    "VPP": "مفعول حرف‌اضافه‌ای",
    "OBJ2": "مفعول دوم",
    "TAM": "تمییز",
    "MOS": "مسند",
    "PROG": "مستمرساز",
    "ADVC": "متمم قیدی",
    "VCL": "بند متممی فعل",
    "VPRT": "حرف اضافه فعلی",
    "LVP": "جزء همکرد",
    "PARCL": "بند وصفی",
    "ADV": "قید",
    "AJUCL": "بند افزوده فعل",
    "PART": "افزوده پرسشی فعل",
    "VCONJ": "همپایه فعل",
    "NPREMOD": "صفت پیشین اسم",
    "NPOSTMOD": "صفت پسین اسم",
    "NPP": "حرف اضافه اسم",
    "NCL": "بند اسم",
    "NE": "اسم‌یار",
    "MESU": "ممیز",
    "NPRT": "جزء اسمی",
    "MOZ": "مضاف الیه",
    "APP": "بدل",
    "NCONJ": "همپایه اسم",
    "NADV": "قید اسم",
    "COMPPP": "حرف اضافه تفضیلی",
    "ADJADV": "قید صفت",
    "ACL": "متمم بندی صفت",
    "AJPP": "متمم حرف اضافه‌ای صفت",
    "NEZ": "متمم نشانه اضافه‌ای صفت",
    "AJCONJ": "همپایه صفت",
    "APREMOD": "وابسته پیشین صفت",
    "APOSTMOD": "وابسته پسین صفت",
    "PREDEP": "وابسته پیشین",
    "POSDEP": "وابسته پسین",
    "PCONJ": "همپایه حرف اضافه",
    "AVCONJ": "همپایه قید",
    "PRD": "گزاره",
    "ROOT": "ریشه جمله",
    "PUNC": "علامت نگارشی"
}
def json_to_conllu(json_data):
    """Convert one parsed sentence (a list of token dicts) to CoNLL-U format.

    Args:
        json_data (list[dict]): Tokens of a single sentence. Recognized keys:
            "id", "text", "upos", "xpos", "feats", "head", "deprel". Missing
            keys and keys whose value is None both map to the CoNLL-U
            placeholder "_". ("lemma", "deps" and "misc" are always emitted
            as "_", matching the upstream parsers, which do not supply them.)

    Returns:
        str: The sentence in CoNLL-U format, ending with the blank line that
        terminates a CoNLL-U sentence block.
    """
    def _col(value):
        # CoNLL-U uses "_" for absent values. dict.get(key, "_") alone is not
        # enough: the parsers emit keys with an explicit None (e.g.
        # "feats": None), which would otherwise serialize as the string
        # "None". An explicit None check also keeps head == 0 intact.
        return "_" if value is None else value

    conllu_output = []
    # Sentence text metadata line: concatenation of the token surface forms.
    sentence_text = " ".join(str(_col(token.get("text"))) for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")
    # One 10-column tab-separated line per token.
    for token in json_data:
        columns = (
            _col(token.get("id")),
            _col(token.get("text")),
            "_",  # lemma: not carried through
            _col(token.get("upos")),
            _col(token.get("xpos")),
            _col(token.get("feats")),
            _col(token.get("head")),
            _col(token.get("deprel")),
            "_",  # deps: not provided
            "_",  # misc: not provided
        )
        conllu_output.append("\t".join(str(c) for c in columns))
    # Blank line terminates the sentence block.
    conllu_output.append("")
    return "\n".join(conllu_output)
# Parse CoNLL-U data into a format that spaCy can use
# Parse CoNLL-U text into the minimal token tuples needed for rendering.
def conllu_to_spacy(conllu_data):
    """Extract (index, form, head, deprel) tuples from a CoNLL-U string.

    Comment lines (starting with "#") and any row that does not have exactly
    the 10 CoNLL-U columns are silently skipped. Only columns 1 (ID),
    2 (FORM), 7 (HEAD) and 8 (DEPREL) are kept; ID and HEAD are converted
    to int.
    """
    tokens = []
    for row in conllu_data.strip().split("\n"):
        # Metadata lines such as "# text = ..." carry no token.
        if row.startswith("#"):
            continue
        fields = row.split("\t")
        if len(fields) != 10:
            continue
        tokens.append((int(fields[0]), fields[1], int(fields[6]), fields[7]))
    return tokens
# Create a Doc object in spaCy
def create_spacy_doc(conllu_data, model):
words = conllu_to_spacy(conllu_data)
num = len(words)
for index, word, head, dep in words:
dep_name = ud_tags_translation.get(dep, dep)
words[index-1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
words = list(reversed(words))
# Create a doc object
doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])
# Add syntactic dependencies
for word, (index, _, head, dep) in zip(doc, words):
word.head = doc[head - 1] if head > 0 else doc[index - 1]
word.dep_ = dep
return doc
# Load the spacy model (use a blank model since the language is Persian and we don't need any pre-trained components)
model = spacy.blank("xx")  # Use 'xx' for a multilingual model
def plot_dependency_graph(graph_data):
    """Render dependency graphs for a list of parsed sentences as HTML.

    Args:
        graph_data: A list of sentences, each a list of token dicts in the
            format accepted by json_to_conllu.

    Returns:
        str: The displacy SVG markup for every sentence, concatenated inside
        a single wrapping <div>.
    """
    parts = ['<div>\n']
    for sentence in graph_data:
        # JSON -> CoNLL-U -> spaCy Doc -> displacy SVG markup.
        doc = create_spacy_doc(json_to_conllu(sentence), model)
        parts.append(displacy.render(doc, style="dep"))
    parts.append('\n</div>')
    return ''.join(parts)
if __name__ == "__main__":
    # Smoke test: one parsed three-token sentence ("هوا خوب است") in the
    # upstream parsers' JSON format; prints the rendered HTML to stdout.
    test = [
        [
            {
                "id": 1,
                "text": "هوا",
                "lemma": "هوا",
                "upos": "NOUN",
                "xpos": "NOUN",
                "feats": None,
                "head": 2,
                "deprel": "nsubj",
                "deps": None,
                "misc": None
            },
            {
                "id": 2,
                "text": "خوب",
                "lemma": "خوب",
                "upos": "ADJ",
                "xpos": "ADJ",
                "feats": None,
                "head": 0,
                "deprel": "root",
                "deps": None,
                "misc": None
            },
            {
                "id": 3,
                "text": "است",
                "lemma": "است",
                "upos": "VERB",
                "xpos": "VERB",
                "feats": None,
                "head": 2,
                "deprel": "cop",
                "deps": None,
                "misc": None
            }
        ]
    ]
    print(plot_dependency_graph(test))