def find_start_and_end(phrase, subphrase):
    """Locate ``subphrase`` inside ``phrase`` at whitespace-token granularity.

    Both strings are split on whitespace, and the first position at which
    the tokens of ``subphrase`` occur contiguously in ``phrase`` is reported.

    Args:
        phrase: The text to search in.
        subphrase: The text to look for.

    Returns:
        A ``(start, end)`` tuple of inclusive word indices for the first
        match, or ``(None, None)`` when there is no match or ``subphrase``
        contains no tokens. The result is always a 2-tuple so callers can
        unpack it unconditionally.
    """
    words = phrase.split()
    target = subphrase.split()

    # An empty token list would compare equal to the empty slice at index 0
    # and produce the meaningless range (0, -1); report "no match" instead.
    if not target:
        return None, None

    width = len(target)
    # Stop once fewer than `width` words remain: no later slice can match.
    for start in range(len(words) - width + 1):
        if words[start : start + width] == target:
            return start, start + width - 1

    # A string sentinel here would break `start, end = find_start_and_end(...)`
    # with "too many values to unpack"; a pair of Nones keeps unpacking safe.
    return None, None
result[\"verb\"] = verb\n", " result[\"past_stem\"] = past_stem\n", " result[\"present_stem\"] = present_stem\n", - " result[\"structure\"] = str(structure.findAll(\"div\", {\"class\": \"valency\"})[0])\n", + " result[\"structure_key\"] = [\n", + " e.get(\"class\")[0] for e in valency.findAll(\"span\", recursive=False)\n", + " ]\n", + " result[\"structure_fa\"] = [\n", + " e.text for e in valency.findAll(\"span\", recursive=False)\n", + " ]\n", + " result[\"html\"] = str(example)\n", " result[\"text\"] = example.text\n", + "\n", + " result[\"tags\"] = []\n", + " spans = example.findAll(\"span\", recursive=False)\n", + " for span in spans:\n", + " span_text = span.get_text(strip=True)\n", + " start_index, end_index = find_start_and_end(example.text, span_text)\n", + "\n", + " result[\"tags\"].append(\n", + " {\n", + " \"text\": span_text,\n", + " \"start\": start_index,\n", + " \"end\": end_index,\n", + " \"class\": span[\"class\"][0],\n", + " }\n", + " )\n", + "\n", " results.append(result)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [