{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths: raw verb list in, scraped HTML pages and the parsed JSON out.\n",
    "this_dir_path = Path().resolve()\n",
    "data_path = this_dir_path / \"Data\"\n",
    "input_verb_path = data_path / \"verbs_noor.txt\"\n",
    "htmls_path = data_path / \"htmls_noor\"\n",
    "output_verb_path = data_path / \"verbs_noor.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# splitlines() drops the line ending without eating the last character of\n",
    "# the final verb when the file has no trailing newline (verb[:-1] would).\n",
    "# Blank lines are skipped so they do not become empty queries later.\n",
    "with open(input_verb_path, \"r\", encoding=\"utf-8\") as f:\n",
    "    verbs = [line for line in f.read().splitlines() if line]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download one search-result page per verb.\n",
    "# Create the target directory first so a fresh checkout does not crash\n",
    "# on the first write. NOTE(review): verbs are used verbatim as file\n",
    "# names — assumed to contain no path separators; confirm for new inputs.\n",
    "htmls_path.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "for verb in tqdm(verbs):\n",
    "    response = requests.get(\"http://search.dadegan.ir/\", params={\"q\": verb})\n",
    "    response.raise_for_status()  # fail loudly instead of saving an error page\n",
    "    with open(htmls_path / (verb + \".html\"), \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(response.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_start_and_end(phrase, subphrase):\n",
    "    \"\"\"Locate `subphrase` inside `phrase` as a run of whitespace-split words.\n",
    "\n",
    "    Returns an inclusive (start, end) pair of word indices, or\n",
    "    (None, None) when the word sequence does not occur. Returning a\n",
    "    2-tuple in every case lets callers unpack the result unconditionally\n",
    "    (the previous \"404\" string return broke tuple unpacking).\n",
    "    \"\"\"\n",
    "    words = phrase.split()\n",
    "    target = subphrase.split()\n",
    "\n",
    "    for i in range(len(words) - len(target) + 1):\n",
    "        if words[i : i + len(target)] == target:\n",
    "            return i, i + len(target) - 1\n",
    "\n",
    "    return None, None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse every saved page into one record per example sentence.\n",
    "# Assumes the dadegan.ir markup seen at scrape time: stems in td.c3\n",
    "# (first cell is a header), one valency block per structure section.\n",
    "results = []\n",
    "\n",
    "for verb in tqdm(verbs):\n",
    "    with open(htmls_path / (verb + \".html\"), \"r\", encoding=\"utf-8\") as f:\n",
    "        text = f.read()\n",
    "    soup = BeautifulSoup(text, \"html.parser\")\n",
    "\n",
    "    stems = soup.find_all(\"td\", {\"class\": \"c3\"})[1:]\n",
    "    past_stem = stems[0].text\n",
    "    present_stem = stems[1].text\n",
    "\n",
    "    structures = soup.select(\"div.section-wrapper.more-examples\")\n",
    "    for structure in structures:\n",
    "        examples = structure.find_all(\"div\", {\"class\": \"example\"})\n",
    "        valency = structure.find_all(\"div\", {\"class\": \"valency\"})[0]\n",
    "        for example in examples:\n",
    "            result = {}\n",
    "            result[\"verb\"] = verb\n",
    "            result[\"past_stem\"] = past_stem\n",
    "            result[\"present_stem\"] = present_stem\n",
    "            result[\"structure_key\"] = [\n",
    "                e.get(\"class\")[0] for e in valency.find_all(\"span\", recursive=False)\n",
    "            ]\n",
    "            result[\"structure_fa\"] = [\n",
    "                e.text for e in valency.find_all(\"span\", recursive=False)\n",
    "            ]\n",
    "            result[\"html\"] = str(example)\n",
    "            result[\"text\"] = example.text\n",
    "\n",
    "            result[\"tags\"] = []\n",
    "            spans = example.find_all(\"span\", recursive=False)\n",
    "            for span in spans:\n",
    "                span_text = span.get_text(strip=True)\n",
    "                # (None, None) when the stripped span text does not align\n",
    "                # with the example's word sequence; keep the tag anyway so\n",
    "                # downstream code can see (and count) the misses.\n",
    "                start_index, end_index = find_start_and_end(example.text, span_text)\n",
    "\n",
    "                result[\"tags\"].append(\n",
    "                    {\n",
    "                        \"text\": span_text,\n",
    "                        \"start\": start_index,\n",
    "                        \"end\": end_index,\n",
    "                        \"class\": span[\"class\"][0],\n",
    "                    }\n",
    "                )\n",
    "\n",
    "            results.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ensure_ascii=False keeps the Persian text human-readable in the file.\n",
    "with open(output_verb_path, \"w\", encoding=\"utf-8\") as f:\n",
    "    json.dump(results, f, ensure_ascii=False, indent=4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}