format output

2024-12-25 11:21:06 +03:30 · 2024-12-25 11:21:06 +03:30 · 65f4c0c380
commit 65f4c0c380
parent 8752b6225c
1 changed files with 50 additions and 16 deletions
--- a/3.dadegan_noor.ipynb
+++ b/3.dadegan_noor.ipynb
@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -15,7 +15,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -28,7 +28,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -40,15 +40,7 @@
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "100%|██████████| 3/3 [00:01<00:00,  2.62it/s]\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "for verb in tqdm(verbs):\n",
    "    response = requests.get(\"http://search.dadegan.ir/\", params={\"q\": verb})\n",
@ -58,14 +50,33 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_start_and_end(phrase, subphrase):\n",
+    "    words1 = phrase.split()\n",
+    "    words2 = subphrase.split()\n",
+    "\n",
+    "    for i in range(len(words1)):\n",
+    "        if words1[i : i + len(words2)] == words2:\n",
+    "            start = i\n",
+    "            end = i + len(words2) - 1\n",
+    "            return start, end\n",
+    "\n",
+    "    return \"404 - Not found!\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "100%|██████████| 4167/4167 [00:33<00:00, 122.76it/s]\n"
+      "100%|██████████| 4167/4167 [00:34<00:00, 122.03it/s]\n"
     ]
    }
   ],
@ -84,19 +95,42 @@
    "    structures = soup.select(\"div.section-wrapper.more-examples\")\n",
    "    for structure in structures:\n",
    "        examples = structure.findAll(\"div\", {\"class\": \"example\"})\n",
+    "        valency = structure.findAll(\"div\", {\"class\": \"valency\"})[0]\n",
    "        for example in examples:\n",
    "            result = {}\n",
    "            result[\"verb\"] = verb\n",
    "            result[\"past_stem\"] = past_stem\n",
    "            result[\"present_stem\"] = present_stem\n",
-    "            result[\"structure\"] = str(structure.findAll(\"div\", {\"class\": \"valency\"})[0])\n",
+    "            result[\"structure_key\"] = [\n",
+    "                e.get(\"class\")[0] for e in valency.findAll(\"span\", recursive=False)\n",
+    "            ]\n",
+    "            result[\"structure_fa\"] = [\n",
+    "                e.text for e in valency.findAll(\"span\", recursive=False)\n",
+    "            ]\n",
+    "            result[\"html\"] = str(example)\n",
    "            result[\"text\"] = example.text\n",
+    "\n",
+    "            result[\"tags\"] = []\n",
+    "            spans = example.findAll(\"span\", recursive=False)\n",
+    "            for span in spans:\n",
+    "                span_text = span.get_text(strip=True)\n",
+    "                start_index, end_index = find_start_and_end(example.text, span_text)\n",
+    "\n",
+    "                result[\"tags\"].append(\n",
+    "                    {\n",
+    "                        \"text\": span_text,\n",
+    "                        \"start\": start_index,\n",
+    "                        \"end\": end_index,\n",
+    "                        \"class\": span[\"class\"][0],\n",
+    "                    }\n",
+    "                )\n",
+    "\n",
    "            results.append(result)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [