{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract first-person verb forms from the PVC dataset\n",
    "\n",
    "Reads `verb.txt` from the PVC dataset and writes two JSON files, `verbs_past_PVC.json` and `verbs_present_PVC.json`. Each file maps a verb id to that verb's active, positive, first-person-singular, simple indicative form (past or present) with its final character removed.\n",
    "\n",
    "## Getting the data\n",
    "\n",
    "You can download the PVC data from <https://www.peykaregan.ir/dataset/%D9%85%D8%AC%D9%85%D9%88%D8%B9%D9%87-%D8%A7%D9%81%D8%B9%D8%A7%D9%84-%D8%AA%D8%B5%D8%B1%DB%8C%D9%81%E2%80%8C%D8%B4%D8%AF%D9%87-%D9%81%D8%A7%D8%B1%D8%B3%DB%8C>.\n",
    "\n",
    "After downloading, unzip the data and place it in a directory called `Data` inside the directory this notebook is run from, so that the verb list is found at `Data/PVC/Data/TXT/verb.txt`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All paths are relative to the current working directory of the kernel.\n",
    "this_dir_path = Path().resolve()\n",
    "data_path = this_dir_path / \"Data\"\n",
    "verb_path = data_path / \"PVC\" / \"Data\" / \"TXT\" / \"verb.txt\"\n",
    "processed_past_verb_path = data_path / \"verbs_past_PVC.json\"\n",
    "processed_present_verb_path = data_path / \"verbs_present_PVC.json\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers\n",
    "\n",
    "Each line of `verb.txt` is treated as `@`-separated fields: field 0 is the verb form, field 1 is the verb id, and the remaining fields are grammatical features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "FIELD_SEPARATOR = \"@\"\n",
    "\n",
    "\n",
    "def split_record(line: str) -> list[str]:\n",
    "    \"\"\"Return the '@'-separated fields of one verb.txt line, without its line ending.\"\"\"\n",
    "    return line.rstrip(\"\\r\\n\").split(FIELD_SEPARATOR)\n",
    "\n",
    "\n",
    "def first_person_features(tense: str) -> list[str]:\n",
    "    \"\"\"Return the feature fields (everything after form and verb id) to select.\n",
    "\n",
    "    Args:\n",
    "        tense: Value expected in the tense field, e.g. \"PAST\" or \"PRESENT\".\n",
    "    \"\"\"\n",
    "    return [\n",
    "        \"ACTIVE\",\n",
    "        \"SINGULAR\",\n",
    "        \"FIRST_PERSON\",\n",
    "        \"POSITIVE\",\n",
    "        \"م\",\n",
    "        tense,\n",
    "        \"INDICATIVE\",\n",
    "        \"SIMPLE\",\n",
    "    ]\n",
    "\n",
    "\n",
    "def group_forms_by_verb_id(lines: list[str]) -> dict[str, list[str]]:\n",
    "    \"\"\"Collect every form (field 0) under its verb id (field 1), in input order.\n",
    "\n",
    "    Lines with fewer than two fields, such as blank lines, are skipped.\n",
    "    \"\"\"\n",
    "    grouped: dict[str, list[str]] = {}\n",
    "    for line in lines:\n",
    "        fields = split_record(line)\n",
    "        if len(fields) < 2:\n",
    "            continue\n",
    "        grouped.setdefault(fields[1], []).append(fields[0])\n",
    "    return grouped\n",
    "\n",
    "\n",
    "def select_first_person_forms(lines: list[str], tense: str) -> dict[str, str]:\n",
    "    \"\"\"Map verb id -> form for records whose features equal ``first_person_features(tense)``.\n",
    "\n",
    "    If several matching records share a verb id, the last one in ``lines`` wins.\n",
    "    \"\"\"\n",
    "    wanted = first_person_features(tense)\n",
    "    selected: dict[str, str] = {}\n",
    "    for line in lines:\n",
    "        fields = split_record(line)\n",
    "        # A match implies at least ten fields, so indices 0 and 1 exist.\n",
    "        if fields[2:] == wanted:\n",
    "            selected[fields[1]] = fields[0]\n",
    "    return selected\n",
    "\n",
    "\n",
    "def strip_last_char_sorted(forms: dict[str, str]) -> dict[str, str]:\n",
    "    \"\"\"Return ``forms`` ordered by the original form, each value minus its final character.\n",
    "\n",
    "    NOTE(review): the dropped character is presumably the first-person ending\n",
    "    \"م\" listed among the record's features -- confirm against the dataset.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        verb_id: form[:-1]\n",
    "        for verb_id, form in sorted(forms.items(), key=lambda item: item[1])\n",
    "    }\n",
    "\n",
    "\n",
    "def write_json(data: dict[str, str], path: Path) -> None:\n",
    "    \"\"\"Write ``data`` to ``path`` as indented UTF-8 JSON without escaping non-ASCII text.\"\"\"\n",
    "    with open(path, \"w\", encoding=\"utf-8\") as f:\n",
    "        json.dump(data, f, ensure_ascii=False, indent=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the verb list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The encoding is given explicitly so that reading the Persian text does not\n",
    "# depend on the platform default. \"utf-8-sig\" reads plain UTF-8 as well and\n",
    "# drops a leading byte-order mark if there is one.\n",
    "# NOTE(review): assumes verb.txt is UTF-8 encoded -- confirm against the download.\n",
    "with open(verb_path, \"r\", encoding=\"utf-8-sig\") as f:\n",
    "    lines = f.readlines()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## All forms per verb id\n",
    "\n",
    "Not written to disk; kept for interactive inspection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result: dict[str, list[str]] = group_forms_by_verb_id(lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple past"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_simple_past: dict[str, str] = select_first_person_forms(lines, \"PAST\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_simple_past_list = strip_last_char_sorted(result_simple_past)\n",
    "write_json(result_simple_past_list, processed_past_verb_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Simple present"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_simple_present: dict[str, str] = select_first_person_forms(lines, \"PRESENT\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_simple_present_list = strip_last_char_sorted(result_simple_present)\n",
    "write_json(result_simple_present_list, processed_present_verb_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}