From 600536794db347c29e86e2c865d6936c95d2d414 Mon Sep 17 00:00:00 2001
From: HamidReza Montazer <montazerh82@gmail.com>
Date: Thu, 19 Dec 2024 09:58:45 +0330
Subject: [PATCH] add files from Kafi

---
 .gitignore                             |   2 +
 1.extract_verbs_from_PVC.ipynb         | 151 ++++++++++++++++++++++
 2.conjugate_PVC_verbs_using_hazm.ipynb | 168 +++++++++++++++++++++++++
 3.dadegan_noor.ipynb                   | 118 +++++++++++++++++
 4 files changed, 439 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 1.extract_verbs_from_PVC.ipynb
 create mode 100644 2.conjugate_PVC_verbs_using_hazm.ipynb
 create mode 100644 3.dadegan_noor.ipynb

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e5d7cfc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+Data/
+.env/
\ No newline at end of file
diff --git a/1.extract_verbs_from_PVC.ipynb b/1.extract_verbs_from_PVC.ipynb
new file mode 100644
index 0000000..4021501
--- /dev/null
+++ b/1.extract_verbs_from_PVC.ipynb
@@ -0,0 +1,151 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "this_dir_path = Path().resolve()\n",
+    "data_path = this_dir_path.joinpath(\"Data\")\n",
+    "verb_path = data_path.joinpath(\"PVC\", \"Data\", \"TXT\", \"verb.txt\")\n",
+    "processed_past_verb_path = data_path.joinpath(\"verbs_past_PVC.json\")\n",
+    "processed_present_verb_path = data_path.joinpath(\"verbs_present_PVC.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(verb_path, \"r\", encoding=\"utf-8\") as f:  # assumes verb.txt is UTF-8\n",
+    "    lines = f.readlines()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bucket every verb form under the value of its second \"@\"-separated field.\n",
+    "result: dict[str, list] = {}\n",
+    "for line in lines:\n",
+    "    fields = line.split(\"@\")\n",
+    "    verb = fields[0]\n",
+    "    idx = fields[1]\n",
+    "    # setdefault starts a new bucket the first time a key is seen.\n",
+    "    result.setdefault(idx, []).append(verb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pick, per verb key, the form tagged as the active, positive,\n",
+    "# first-person-singular simple past.\n",
+    "result_simple_past: dict[str, str] = {}\n",
+    "\n",
+    "# Strip the newline first so an unterminated final line still matches.\n",
+    "for fields in (line.rstrip(\"\\n\").split(\"@\") for line in lines):\n",
+    "    # fields[0] is the form, fields[1] the key, the rest are tags.\n",
+    "    if fields[2:] == [\n",
+    "        \"ACTIVE\", \"SINGULAR\", \"FIRST_PERSON\", \"POSITIVE\",\n",
+    "        \"م\",\n",
+    "        \"PAST\", \"INDICATIVE\", \"SIMPLE\",\n",
+    "    ]:\n",
+    "        result_simple_past[fields[1]] = fields[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Order the entries by verb form, then store each form without its final\n",
+    "# character, keyed as before.\n",
+    "past_by_form = sorted(result_simple_past.items(), key=lambda item: item[1])\n",
+    "result_simple_past_list = {idx: form[:-1] for idx, form in past_by_form}\n",
+    "with open(processed_past_verb_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "    json.dump(result_simple_past_list, f, ensure_ascii=False, indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pick, per verb key, the form tagged as the active, positive,\n",
+    "# first-person-singular simple present.\n",
+    "result_simple_present: dict[str, str] = {}\n",
+    "\n",
+    "# Strip the newline first so an unterminated final line still matches.\n",
+    "for fields in (line.rstrip(\"\\n\").split(\"@\") for line in lines):\n",
+    "    # fields[0] is the form, fields[1] the key, the rest are tags.\n",
+    "    if fields[2:] == [\n",
+    "        \"ACTIVE\", \"SINGULAR\", \"FIRST_PERSON\", \"POSITIVE\",\n",
+    "        \"م\",\n",
+    "        \"PRESENT\", \"INDICATIVE\", \"SIMPLE\",\n",
+    "    ]:\n",
+    "        result_simple_present[fields[1]] = fields[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Order the entries by verb form, then store each form without its final\n",
+    "# character, keyed as before.\n",
+    "present_by_form = sorted(result_simple_present.items(), key=lambda item: item[1])\n",
+    "result_simple_present_list = {idx: form[:-1] for idx, form in present_by_form}\n",
+    "with open(processed_present_verb_path, \"w\", encoding=\"utf-8\") as f:\n",
+    "    json.dump(result_simple_present_list, f, ensure_ascii=False, indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/2.conjugate_PVC_verbs_using_hazm.ipynb b/2.conjugate_PVC_verbs_using_hazm.ipynb
new file mode 100644
index 0000000..8817c6d
--- /dev/null
+++ b/2.conjugate_PVC_verbs_using_hazm.ipynb
@@ -0,0 +1,168 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hazm\n",
+    "import json\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "this_dir_path = Path().resolve()\n",
+    "data_path = this_dir_path.joinpath(\"Data\")\n",
+    "past_verb_path = data_path.joinpath(\"verbs_past_PVC.json\")\n",
+    "present_verb_path = data_path.joinpath(\"verbs_present_PVC.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(past_verb_path, \"r\", encoding=\"utf-8\") as f:  # file is written as UTF-8\n",
+    "    past_verbs = json.load(f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "conj = hazm.Conjugation()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['آجیدم', 'آجیدی', 'آجید', 'آجیدیم', 'آجیدید', 'آجیدند']"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "conj.perfective_past(past_verbs[\"1\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['نآجیدم', 'نآجیدی', 'نآجید', 'نآجیدیم', 'نآجیدید', 'نآجیدند']\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "['نیاجیدم', 'نیاجیدی', 'نیاجید', 'نیاجیدیم', 'نیاجیدید', 'نیاجیدند']"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(conj.negative_perfective_past(past_verbs[\"1\"]))\n",
+    "\n",
+    "\n",
+    "def negation(verb: str) -> str:\n",
+    "    \"\"\"Negate ``verb`` by prefixing its last whitespace-separated word.\"\"\"\n",
+    "    words = verb.split()\n",
+    "    if words[-1].startswith(\"آ\"):\n",
+    "        words[-1] = \"نیا\" + words[-1][1:]\n",
+    "    else:\n",
+    "        words[-1] = \"ن\" + words[-1]\n",
+    "    return \" \".join(words)\n",
+    "\n",
+    "\n",
+    "[negation(verb) for verb in conj.perfective_past(past_verbs[\"1\"])]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['آجیده شدم',\n",
+       " 'آجیده شدی',\n",
+       " 'آجیده شد',\n",
+       " 'آجیده شدیم',\n",
+       " 'آجیده شدید',\n",
+       " 'آجیده شدند']"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "conj.passive_perfective_past(past_verbs[\"1\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Transitive verbs can be made passive"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/3.dadegan_noor.ipynb b/3.dadegan_noor.ipynb
new file mode 100644
index 0000000..8194ff6
--- /dev/null
+++ b/3.dadegan_noor.ipynb
@@ -0,0 +1,118 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from bs4 import BeautifulSoup\n",
+    "from pathlib import Path\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "this_dir_path = Path().resolve()\n",
+    "data_path = this_dir_path.joinpath(\"Data\")\n",
+    "input_verb_path = data_path.joinpath(\"verbs_noor.txt\")\n",
+    "htmls_path = data_path.joinpath(\"htmls_noor\")\n",
+    "output_verb_path = data_path.joinpath(\"verbs_noor.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(input_verb_path, \"r\", encoding=\"utf-8\") as f:  # assumes a UTF-8 list\n",
+    "    verbs = [verb.rstrip(\"\\n\") for verb in f]  # tolerate a missing final newline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 3/3 [00:01<00:00,  2.62it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "for verb in tqdm(verbs):\n",
+    "    response = requests.get(\"http://search.dadegan.ir/\", params={\"q\": verb}, timeout=30)\n",
+    "    response.raise_for_status()  # fail loudly instead of caching an error page\n",
+    "    (htmls_path / (verb + \".html\")).write_text(response.text, encoding=\"utf-8\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 4167/4167 [00:22<00:00, 186.02it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "result = {}\n",
+    "\n",
+    "for verb in tqdm(verbs):\n",
+    "    # Parse the page cached for this verb; the cache is written as UTF-8.\n",
+    "    html_file = htmls_path / (verb + \".html\")\n",
+    "    soup = BeautifulSoup(html_file.read_text(encoding=\"utf-8\"), \"html.parser\")\n",
+    "\n",
+    "    # The first \"c3\" cell is skipped; the next two are read as the stems.\n",
+    "    stems = soup.find_all(\"td\", {\"class\": \"c3\"})[1:]\n",
+    "    past_stem = stems[0].text\n",
+    "    present_stem = stems[1].text\n",
+    "    result[verb] = {\n",
+    "        \"past_stem\": past_stem,\n",
+    "        \"present_stem\": present_stem,\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}