diff --git a/3.dadegan_noor.ipynb b/3.dadegan_noor.ipynb index 8194ff6..f6bae53 100644 --- a/3.dadegan_noor.ipynb +++ b/3.dadegan_noor.ipynb @@ -2,19 +2,20 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "from pathlib import Path\n", - "from tqdm import tqdm" + "from tqdm import tqdm\n", + "import json" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -27,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -57,19 +58,19 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4167/4167 [00:22<00:00, 186.02it/s]\n" + "100%|██████████| 4167/4167 [00:33<00:00, 122.76it/s]\n" ] } ], "source": [ - "result = {}\n", + "results = []\n", "\n", "for verb in tqdm(verbs):\n", " with open(htmls_path / (verb + \".html\"), \"r\") as f:\n", @@ -80,10 +81,27 @@ " past_stem = stems[0].text\n", " present_stem = stems[1].text\n", "\n", - " result[verb] = {}\n", - "\n", - " result[verb][\"past_stem\"] = past_stem\n", - " result[verb][\"present_stem\"] = present_stem" + " structures = soup.select(\"div.section-wrapper.more-examples\")\n", + " for structure in structures:\n", + " examples = structure.findAll(\"div\", {\"class\": \"example\"})\n", + " for example in examples:\n", + " result = {}\n", + " result[\"verb\"] = verb\n", + " result[\"past_stem\"] = past_stem\n", + " result[\"present_stem\"] = present_stem\n", + " result[\"structure\"] = str(structure.findAll(\"div\", {\"class\": \"valency\"})[0])\n", + " result[\"text\"] = example.text\n", + " results.append(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "with open(output_verb_path, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(results, f, ensure_ascii=False, indent=4)" ] }, {