import spacy from spacy import displacy ud_tags_translation = { # UD Standard tags for Hazm and Dadmatools "acl": "بند موصولی", "advcl": "بند قیدی", "advmod": "وابسته قیدی", "amod": "وابسته وصفی", "appos": "متمم تشریحی", "aux": "فعل کمکی", "case": "نشانه یا حرف اضافه", "cc": "حرف ربط همپایه‌ساز", "ccomp": "مکمل فعل", "compound": "ترکیب کلمات", "compound:lvc": "فعل‌یار", "conj": "همپایه", "cop": "فعل ربطی", "csubj": "فاعل بند", "dep": "وابسته نامشخص", "det": "معرف", "fixed": "ترکیب ثابت", "flat:name": "نام", "flat:num": "عدد", "goeswith": "جزء ناقص", "iobj": "مفعول غیرمستقیم", "mark": "نشانه بند", "nmod": "متمم اسمی", "nmod:poss": 'مالکیت', "nsubj": "فاعل اسمی", "nsubj:pass": "فاعل جمله مجهول", "nummod": "وابسته عددی", "obj": "مفعول مستقیم", "obl": "متمم قیدی", "obl:arg": "وابسته قیدی مرتبط با فعل", "parataxis": "عبارت موازی", "punct": "علائم نگارشی", "root": "ریشه جمله", "vocative": "حالت ندا", "xcomp": "متمم باز", # Penn Treebank Standard for Parsivar "SBJ": "فاعل", "OBJ": "مفعول", "NVE": "فعل‌یار", "ENC": "فعل‌یار پی‌بستی", "VPP": "مفعول حرف‌اضافه‌ای", "OBJ2": "مفعول دوم", "TAM": "تمییز", "MOS": "مسند", "PROG": "مستمرساز", "ADVC": "متمم قیدی", "VCL": "بند متممی فعل", "VPRT": "حرف اضافه فعلی", "LVP": "جزء همکرد", "PARCL": "بند وصفی", "ADV": "قید", "AJUCL": "بند افزوده فعل", "PART": "افزوده پرسشی فعل", "VCONJ": "همپایه فعل", "NPREMOD": "صفت پیشین اسم", "NPOSTMOD": "صفت پسین اسم", "NPP": "حرف اضافه اسم", "NCL": "بند اسم", "NE": "اسم‌یار", "MESU": "ممیز", "NPRT": "جزء اسمی", "MOZ": "مضاف الیه", "APP": "بدل", "NCONJ": "همپایه اسم", "NADV": "قید اسم", "COMPPP": "حرف اضافه تفضیلی", "ADJADV": "قید صفت", "ACL": "متمم بندی صفت", "AJPP": "متمم حرف اضافه‌ای صفت", "NEZ": "متمم نشانه اضافه‌ای صفت", "AJCONJ": "همپایه صفت", "APREMOD": "وابسته پیشین صفت", "APOSTMOD": "وابسته پسین صفت", "PREDEP": "وابسته پیشین", "POSDEP": "وابسته پسین", "PCONJ": "همپایه حرف اضافه", "AVCONJ": "همپایه قید", "PRD": "گزاره", "ROOT": "ریشه جمله", "PUNC": "علامت نگارشی" } def json_to_conllu(json_data): """ Convert JSON in the given format to CoNLL-U format. Args: json_data (dict): JSON input as a Python dictionary. Returns: str: CoNLL-U formatted string. """ conllu_output = [] # If sentence text is available, concatenate token texts sentence_text = " ".join(token.get("text", "_") for token in json_data) conllu_output.append(f"# text = {sentence_text}") # Add tokens in CoNLL-U format for token in json_data: id_ = token.get("id", "_") form = token.get("text", "_") lemma = "_" # Lemma not provided upos = token.get("upos", "_") xpos = token.get("xpos", "_") feats = token.get("feats", "_") head = token.get("head", "_") deprel = token.get("deprel", "_") deps = "_" misc = "_" conllu_output.append( f"{id_}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}" ) # Add a blank line after each sentence conllu_output.append("") # Join all lines into a single string return "\n".join(conllu_output) # Parse CoNLL-U data into a format that spaCy can use def conllu_to_spacy(conllu_data): lines = conllu_data.strip().split("\n") words = [] for line in lines: if line.startswith("#"): # Skip comment lines continue parts = line.split("\t") if len(parts) != 10: # Skip invalid lines continue index, word, _, _, _, _, head, dep, _, _ = parts index = int(index) head = int(head) words.append((index, word, head, dep)) return words # Create a Doc object in spaCy def create_spacy_doc(conllu_data, model): words = conllu_to_spacy(conllu_data) num = len(words) for index, word, head, dep in words: dep_name = ud_tags_translation.get(dep, dep) words[index-1] = (num - index + 1, word, 0 if head == 0 else num - head + 1, dep_name) words = list(reversed(words)) # Create a doc object doc = spacy.tokens.Doc(model.vocab, words=[word[1] for word in words]) # Add syntactic dependencies for word, (index, _, head, dep) in zip(doc, words): word.head = doc[head - 1] if head > 0 else doc[index - 1] word.dep_ = dep return doc # Load the spacy model (use a blank model since the language is Persian and we don't need any pre-trained components) model = spacy.blank("xx") # Use 'xx' for a multilingual model def plot_dependency_graph(graph_data): result = '
\n' for sent_graph in graph_data: conllu_data = json_to_conllu(sent_graph) # Create a doc from CoNLL-U data doc = create_spacy_doc(conllu_data, model) # Visualize the dependency tree result += displacy.render(doc, style="dep") result += '\n
' return result if __name__ == "__main__": test = [ [ { "id": 1, "text": "هوا", "lemma": "هوا", "upos": "NOUN", "xpos": "NOUN", "feats": None, "head": 2, "deprel": "nsubj", "deps": None, "misc": None }, { "id": 2, "text": "خوب", "lemma": "خوب", "upos": "ADJ", "xpos": "ADJ", "feats": None, "head": 0, "deprel": "root", "deps": None, "misc": None }, { "id": 3, "text": "است", "lemma": "است", "upos": "VERB", "xpos": "VERB", "feats": None, "head": 2, "deprel": "cop", "deps": None, "misc": None } ] ] print(plot_dependency_graph(test))