From 600536794db347c29e86e2c865d6936c95d2d414 Mon Sep 17 00:00:00 2001 From: HamidReza Montazer Date: Thu, 19 Dec 2024 09:58:45 +0330 Subject: [PATCH] add files from Kafi --- .gitignore | 2 + 1.extract_verbs_from_PVC.ipynb | 151 ++++++++++++++++++++++ 2.conjugate_PVC_verbs_using_hazm.ipynb | 168 +++++++++++++++++++++++++ 3.dadegan_noor.ipynb | 118 +++++++++++++++++ 4 files changed, 439 insertions(+) create mode 100644 .gitignore create mode 100644 1.extract_verbs_from_PVC.ipynb create mode 100644 2.conjugate_PVC_verbs_using_hazm.ipynb create mode 100644 3.dadegan_noor.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e5d7cfc --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +Data/ +.env/ \ No newline at end of file diff --git a/1.extract_verbs_from_PVC.ipynb b/1.extract_verbs_from_PVC.ipynb new file mode 100644 index 0000000..4021501 --- /dev/null +++ b/1.extract_verbs_from_PVC.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "this_dir_path = Path().resolve()\n", + "data_path = this_dir_path / \"Data\"\n", + "verb_path = data_path / \"PVC\" / \"Data\" / \"TXT\" / \"verb.txt\"\n", + "processed_past_verb_path = data_path / \"verbs_past_PVC.json\"\n", + "processed_present_verb_path = data_path / \"verbs_present_PVC.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open(verb_path, \"r\") as f:\n", + " lines = f.readlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "result: dict[str, list] = {}\n", + "for line in lines:\n", + " verb = line.split(\"@\")[0]\n", + " idx = line.split(\"@\")[1]\n", + " if isinstance(result.get(idx), list):\n", + " result[idx].append(verb)\n", + " else:\n", + " result[idx] = [verb]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "result_simple_past: dict[str, str] = {}\n", + "for line in lines:\n", + " if line.split(\"@\")[2:] == [\n", + " \"ACTIVE\",\n", + " \"SINGULAR\",\n", + " \"FIRST_PERSON\",\n", + " \"POSITIVE\",\n", + " \"م\",\n", + " \"PAST\",\n", + " \"INDICATIVE\",\n", + " \"SIMPLE\\n\",\n", + " ]:\n", + " result_simple_past[line.split(\"@\")[1]] = line.split(\"@\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "result_simple_past_list = {\n", + " key: value[:-1]\n", + " for key, value in sorted(result_simple_past.items(), key=lambda x: x[1])\n", + "}\n", + "with open(processed_past_verb_path, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(result_simple_past_list, f, ensure_ascii=False, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "result_simple_present: dict[str, str] = {}\n", + "for line in lines:\n", + " if line.split(\"@\")[2:] == [\n", + " \"ACTIVE\",\n", + " \"SINGULAR\",\n", + " \"FIRST_PERSON\",\n", + " \"POSITIVE\",\n", + " \"م\",\n", + " \"PRESENT\",\n", + " \"INDICATIVE\",\n", + " \"SIMPLE\\n\",\n", + " ]:\n", + " result_simple_present[line.split(\"@\")[1]] = line.split(\"@\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "result_simple_present_list = {\n", + " key: value[:-1]\n", + " for key, value in sorted(result_simple_present.items(), key=lambda x: x[1])\n", + "}\n", + "with open(processed_present_verb_path, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(result_simple_present_list, f, ensure_ascii=False, indent=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2.conjugate_PVC_verbs_using_hazm.ipynb b/2.conjugate_PVC_verbs_using_hazm.ipynb new file mode 100644 index 0000000..8817c6d --- /dev/null +++ b/2.conjugate_PVC_verbs_using_hazm.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import hazm\n", + "import json\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "this_dir_path = Path().resolve()\n", + "data_path = this_dir_path / \"Data\"\n", + "past_verb_path = data_path / \"verbs_past_PVC.json\"\n", + "present_verb_path = data_path / \"verbs_present_PVC.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "with open(past_verb_path, \"r\") as f:\n", + " past_verbs = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "conj = hazm.Conjugation()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['آجیدم', 'آجیدی', 'آجید', 'آجیدیم', 'آجیدید', 'آجیدند']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conj.perfective_past(past_verbs[\"1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['نآجیدم', 'نآجیدی', 'نآجید', 'نآجیدیم', 'نآجیدید', 'نآجیدند']\n" + ] + }, + { + "data": { + "text/plain": [ + "['نیاجیدم', 'نیاجیدی', 'نیاجید', 'نیاجیدیم', 'نیاجیدید', 'نیاجیدند']" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(conj.negative_perfective_past(past_verbs[\"1\"]))\n", + "\n", + "\n", + "def negation(verb: str):\n", + " verb = verb.split()\n", + " if verb[-1].startswith(\"آ\"):\n", + " verb[-1] = \"نیا\" + verb[-1][1:]\n", + " else:\n", + " verb[-1] = \"ن\" + verb[-1]\n", + " verb = \" \".join(verb)\n", + " return verb\n", + "\n", + "\n", + "[negation(verb) for verb in conj.perfective_past(past_verbs[\"1\"])]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['آجیده شدم',\n", + " 'آجیده شدی',\n", + " 'آجیده شد',\n", + " 'آجیده شدیم',\n", + " 'آجیده شدید',\n", + " 'آجیده شدند']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conj.passive_perfective_past(past_verbs[\"1\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# فعل‌های متعدی می‌توانند مجهول شوند" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/3.dadegan_noor.ipynb b/3.dadegan_noor.ipynb new file mode 100644 index 0000000..8194ff6 --- /dev/null +++ b/3.dadegan_noor.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "from pathlib import Path\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "this_dir_path = Path().resolve()\n", + "data_path = this_dir_path / \"Data\"\n", + "input_verb_path = data_path / \"verbs_noor.txt\"\n", + "htmls_path = data_path / \"htmls_noor\"\n", + "output_verb_path = data_path / \"verbs_noor.json\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open(input_verb_path, \"r\") as f:\n", + " verbs = [verb[:-1] for verb in f.readlines()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 3/3 [00:01<00:00, 2.62it/s]\n" + ] + } + ], + "source": [ + "for verb in tqdm(verbs):\n", + " response = requests.get(\"http://search.dadegan.ir/\", params={\"q\": verb})\n", + " with open(htmls_path / (verb + \".html\"), \"w\") as f:\n", + " f.write(response.text)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4167/4167 [00:22<00:00, 186.02it/s]\n" + ] + } + ], + "source": [ + "result = {}\n", + "\n", + "for verb in tqdm(verbs):\n", + " with open(htmls_path / (verb + \".html\"), \"r\") as f:\n", + " text = f.read()\n", + " soup = BeautifulSoup(text, \"html.parser\")\n", + "\n", + " stems = soup.findAll(\"td\", {\"class\": \"c3\"})[1:]\n", + " past_stem = stems[0].text\n", + " present_stem = stems[1].text\n", + "\n", + " result[verb] = {}\n", + "\n", + " result[verb][\"past_stem\"] = past_stem\n", + " result[verb][\"present_stem\"] = present_stem" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}