33 lines
1001 B
Python
33 lines
1001 B
Python
import parsivar
|
|
import hazm
|
|
import json
|
|
|
|
# Annotate every question in the dataset with POS tags from two taggers
# (parsivar and hazm) and write the enriched dataset to a new JSON file.
#
# Input : main_classes_dataset_POS_03.json -- a mapping of class name to a
#         list of question dicts, each carrying its text under "content".
# Output: hazm_parsivar_added.json -- the same structure, with a "parsivar"
#         and a "hazm" key added to every question dict.

# Read explicitly as UTF-8: without `encoding=` Python falls back to the
# locale's preferred encoding, which is not UTF-8 on every platform. The
# output below is written as UTF-8 too, so both sides now agree.
with open("main_classes_dataset_POS_03.json", "r", encoding="utf-8") as f:
    qs = json.load(f)

parsivar_normalizer = parsivar.Normalizer()
parsivar_tokenizer = parsivar.Tokenizer()
parsivar_tagger = parsivar.POSTagger(
    tagging_model="wapiti"
)  # tagging_model = "wapiti" or "stanford". "wapiti" is faster than "stanford"

hazm_normalizer = hazm.Normalizer()
hazm_tagger = hazm.POSTagger(model="pos_tagger.model", universal_tag=False)

for cls_list in qs.values():
    for q in cls_list:
        text = q["content"]

        # `q` is the very dict stored inside `qs`, so assigning to it updates
        # the dataset in place; no need to re-index through qs[cls][i].
        parsivar_tokens = parsivar_tokenizer.tokenize_words(
            parsivar_normalizer.normalize(text)
        )
        # The tagger output is stored as its str() form so that it is
        # JSON-serialisable regardless of the concrete return type.
        q["parsivar"] = str(parsivar_tagger.parse(parsivar_tokens))

        hazm_tokens = hazm.word_tokenize(hazm_normalizer.normalize(text))
        q["hazm"] = str(hazm_tagger.tag(tokens=hazm_tokens))

# ensure_ascii=False keeps non-ASCII characters readable in the output file
# instead of escaping them as \uXXXX sequences.
with open("hazm_parsivar_added.json", "w", encoding="utf-8") as f:
    json.dump(qs, f, indent=4, ensure_ascii=False)