import spacy
from spacy import displacy

ud_tags_translation = {
    # UD standard tags for Hazm and Dadmatools
    "acl": "بند موصولی",
    "advcl": "بند قیدی",
    "advmod": "وابسته قیدی",
    "amod": "وابسته وصفی",
    "appos": "متمم تشریحی",
    "aux": "فعل کمکی",
    "case": "نشانه یا حرف اضافه",
    "cc": "حرف ربط همپایه‌ساز",
    "ccomp": "مکمل فعل",
    "compound": "ترکیب کلمات",
    "compound:lvc": "فعل‌یار",
    "conj": "همپایه",
    "cop": "فعل ربطی",
    "csubj": "فاعل بند",
    "dep": "وابسته نامشخص",
    "det": "معرف",
    "fixed": "ترکیب ثابت",
    "flat:name": "نام",
    "flat:num": "عدد",
    "goeswith": "جزء ناقص",
    "iobj": "مفعول غیرمستقیم",
    "mark": "نشانه بند",
    "nmod": "متمم اسمی",
    "nmod:poss": "مالکیت",
    "nsubj": "فاعل اسمی",
    "nsubj:pass": "فاعل جمله مجهول",
    "nummod": "وابسته عددی",
    "obj": "مفعول مستقیم",
    "obl": "متمم قیدی",
    "obl:arg": "وابسته قیدی مرتبط با فعل",
    "parataxis": "عبارت موازی",
    "punct": "علائم نگارشی",
    "root": "ریشه جمله",
    "vocative": "حالت ندا",
    "xcomp": "متمم باز",

    # Penn Treebank standard tags for Parsivar
    "SBJ": "فاعل",
    "OBJ": "مفعول",
    "NVE": "فعل‌یار",
    "ENC": "فعل‌یار پی‌بستی",
    "VPP": "مفعول حرف‌اضافه‌ای",
    "OBJ2": "مفعول دوم",
    "TAM": "تمییز",
    "MOS": "مسند",
    "PROG": "مستمرساز",
    "ADVC": "متمم قیدی",
    "VCL": "بند متممی فعل",
    "VPRT": "حرف اضافه فعلی",
    "LVP": "جزء همکرد",
    "PARCL": "بند وصفی",
    "ADV": "قید",
    "AJUCL": "بند افزوده فعل",
    "PART": "افزوده پرسشی فعل",
    "VCONJ": "همپایه فعل",
    "NPREMOD": "صفت پیشین اسم",
    "NPOSTMOD": "صفت پسین اسم",
    "NPP": "حرف اضافه اسم",
    "NCL": "بند اسم",
    "NE": "اسم‌یار",
    "MESU": "ممیز",
    "NPRT": "جزء اسمی",
    "MOZ": "مضاف‌الیه",
    "APP": "بدل",
    "NCONJ": "همپایه اسم",
    "NADV": "قید اسم",
    "COMPPP": "حرف اضافه تفضیلی",
    "ADJADV": "قید صفت",
    "ACL": "متمم بندی صفت",
    "AJPP": "متمم حرف اضافه‌ای صفت",
    "NEZ": "متمم نشانه اضافه‌ای صفت",
    "AJCONJ": "همپایه صفت",
    "APREMOD": "وابسته پیشین صفت",
    "APOSTMOD": "وابسته پسین صفت",
    "PREDEP": "وابسته پیشین",
    "POSDEP": "وابسته پسین",
    "PCONJ": "همپایه حرف اضافه",
    "AVCONJ": "همپایه قید",
    "PRD": "گزاره",
    "ROOT": "ریشه جمله",
    "PUNC": "علامت نگارشی",
}

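# How the table above is consumed (an illustrative lookup, not part of the
# pipeline itself): create_spacy_doc below falls back to the raw tag whenever
# a label has no Persian translation:
#
#     >>> ud_tags_translation.get("nsubj", "nsubj")
#     'فاعل اسمی'
#     >>> ud_tags_translation.get("UNKNOWN", "UNKNOWN")
#     'UNKNOWN'
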
def json_to_conllu(json_data):
    """
    Convert a sentence in the given JSON format to CoNLL-U format.

    Args:
        json_data (list): Tokens of one sentence, each a dictionary.

    Returns:
        str: CoNLL-U formatted string.
    """
    conllu_output = []

    # Reconstruct the sentence text by joining the token forms
    sentence_text = " ".join(token.get("text", "_") for token in json_data)
    conllu_output.append(f"# text = {sentence_text}")

    # Add tokens in the ten-column CoNLL-U format; the "or" guards map both
    # missing and None fields to the CoNLL-U placeholder "_"
    for token in json_data:
        id_ = token.get("id", "_")
        form = token.get("text", "_")
        lemma = token.get("lemma") or "_"
        upos = token.get("upos") or "_"
        xpos = token.get("xpos") or "_"
        feats = token.get("feats") or "_"
        head = token.get("head", "_")  # head may legitimately be 0 (root), so no "or" here
        deprel = token.get("deprel") or "_"
        deps = "_"
        misc = "_"

        conllu_output.append(
            f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
        )

    # Add a blank line after each sentence
    conllu_output.append("")

    # Join all lines into a single string
    return "\n".join(conllu_output)

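# A quick sanity check for json_to_conllu (illustrative sketch; the token dict
# is a trimmed-down version of the test data at the bottom of this file):
#
#     >>> tokens = [{"id": 1, "text": "هوا", "upos": "NOUN", "xpos": "NOUN",
#     ...            "feats": None, "head": 0, "deprel": "root"}]
#     >>> print(json_to_conllu(tokens))
#     # text = هوا
#     1	هوا	_	NOUN	NOUN	_	0	root	_	_
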
# Parse CoNLL-U data into (index, form, head, deprel) tuples that spaCy can use
def conllu_to_spacy(conllu_data):
    lines = conllu_data.strip().split("\n")
    words = []
    for line in lines:
        if line.startswith("#"):  # Skip comment lines
            continue
        parts = line.split("\t")
        if len(parts) != 10:  # Skip malformed lines
            continue
        index, word, _, _, _, _, head, dep, _, _ = parts
        words.append((int(index), word, int(head), dep))
    return words

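# Example of the intermediate representation (built from the CoNLL-U line in
# the json_to_conllu sketch above): each token becomes an
# (index, form, head, deprel) tuple, with head 0 marking the root:
#
#     >>> conllu_to_spacy("# text = هوا\n1\tهوا\t_\tNOUN\tNOUN\t_\t0\troot\t_\t_")
#     [(1, 'هوا', 0, 'root')]
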
# Create a spaCy Doc object from the CoNLL-U data. The token order is reversed
# because displacy draws arcs left to right while Persian reads right to left;
# mirroring the positions (and the head indices with them) makes the rendered
# tree follow the natural reading order.
def create_spacy_doc(conllu_data, model):
    words = conllu_to_spacy(conllu_data)
    num = len(words)
    # Move token i to mirrored position num - i + 1, remap its head the same
    # way, and translate the dependency label (falling back to the raw tag)
    for index, word, head, dep in words:
        dep_name = ud_tags_translation.get(dep, dep)
        words[index - 1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name)
    words = list(reversed(words))

    # Create a doc object from the reversed token forms
    doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words])

    # Add syntactic dependencies; a token whose head is 0 points to itself,
    # which is how spaCy encodes the sentence root
    for word, (index, _, head, dep) in zip(doc, words):
        word.head = doc[head - 1] if head > 0 else doc[index - 1]
        word.dep_ = dep
    return doc

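# Note on the reversal above: for the test sentence "هوا خوب است" the returned
# Doc holds the tokens as ["است", "خوب", "هوا"], so displacy's left-to-right
# arcs line up with Persian right-to-left reading order.
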
# Load a blank spaCy model: the language is Persian and we don't need any
# pre-trained components, only the vocab for building Doc objects
model = spacy.blank("xx")  # 'xx' is spaCy's multilingual fallback

def plot_dependency_graph(graph_data):
    """Render each sentence's dependency tree and wrap the SVGs in a <div>."""
    result = '<div>\n'
    for sent_graph in graph_data:
        conllu_data = json_to_conllu(sent_graph)

        # Create a doc from the CoNLL-U data
        doc = create_spacy_doc(conllu_data, model)

        # Visualize the dependency tree as inline SVG markup
        result += displacy.render(doc, style="dep")

    result += '\n</div>'
    return result

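# Illustrative usage (parsed_sentences and the file name are placeholders):
# the returned string is an HTML fragment, so it can be written to a file and
# opened in a browser:
#
#     html = plot_dependency_graph(parsed_sentences)
#     with open("trees.html", "w", encoding="utf-8") as f:
#         f.write(html)
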
if __name__ == "__main__":
|
|
test = [
|
|
[
|
|
{
|
|
"id": 1,
|
|
"text": "هوا",
|
|
"lemma": "هوا",
|
|
"upos": "NOUN",
|
|
"xpos": "NOUN",
|
|
"feats": None,
|
|
"head": 2,
|
|
"deprel": "nsubj",
|
|
"deps": None,
|
|
"misc": None
|
|
},
|
|
{
|
|
"id": 2,
|
|
"text": "خوب",
|
|
"lemma": "خوب",
|
|
"upos": "ADJ",
|
|
"xpos": "ADJ",
|
|
"feats": None,
|
|
"head": 0,
|
|
"deprel": "root",
|
|
"deps": None,
|
|
"misc": None
|
|
},
|
|
{
|
|
"id": 3,
|
|
"text": "است",
|
|
"lemma": "است",
|
|
"upos": "VERB",
|
|
"xpos": "VERB",
|
|
"feats": None,
|
|
"head": 2,
|
|
"deprel": "cop",
|
|
"deps": None,
|
|
"misc": None
|
|
}
|
|
]
|
|
]
|
|
print(plot_dependency_graph(test)) |