In [1]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from tqdm import tqdm
import json

In [2]:
# All locations are derived from the directory the notebook is executed in.
this_dir_path = Path(".").resolve()
data_path = this_dir_path.joinpath("Data")

# Input: plain-text list of verbs, one per line.
input_verb_path = data_path.joinpath("verbs_noor.txt")
# Cache: one downloaded HTML page per verb.
htmls_path = data_path.joinpath("htmls_noor")
# Output: the extracted records serialized as JSON.
output_verb_path = data_path.joinpath("verbs_noor.json")

In [3]:
# Load the verb list, one verb per line.
# - The encoding is pinned so the result does not depend on the platform
#   default (assumes the list is stored as UTF-8 -- TODO confirm).
# - Only the line terminator is removed: slicing off the last character
#   would truncate the final verb when the file has no trailing newline.
# - Blank lines are skipped so they do not turn into empty verbs.
with open(input_verb_path, "r", encoding="utf-8") as f:
    verbs = [line.rstrip("\n") for line in f if line.rstrip("\n")]

In [None]:
# Download the search result page of every verb into the HTML cache.

# Create the cache directory up front so the first write cannot fail on a
# fresh checkout.
htmls_path.mkdir(parents=True, exist_ok=True)

for verb in tqdm(verbs):
    # requests never times out by default; bound each call (in seconds) so a
    # stalled connection cannot hang the whole loop.
    response = requests.get(
        "http://search.dadegan.ir/", params={"q": verb}, timeout=30
    )
    # Fail loudly on HTTP errors instead of caching an error page as data.
    response.raise_for_status()
    # NOTE(review): the file is written with the platform default encoding,
    # which matches how the parsing cell reads it back; pin both to the same
    # explicit encoding together if portability matters.
    with open(htmls_path / (verb + ".html"), "w") as f:
        f.write(response.text)

In [17]:
def find_start_and_end(phrase, subphrase):
    """Locate ``subphrase`` inside ``phrase`` as a run of whole words.

    Both strings are split on whitespace and the first position where the
    words of ``subphrase`` appear consecutively in ``phrase`` is reported.

    Args:
        phrase: Text to search in.
        subphrase: Text to search for.

    Returns:
        A ``(start, end)`` tuple of inclusive word indices into
        ``phrase.split()`` for the first match, or the string
        ``"404 - Not found!"`` when there is no match. Callers must check
        for the string before unpacking the result.
    """
    haystack = phrase.split()
    needle = subphrase.split()
    width = len(needle)

    first = next(
        (
            pos
            for pos in range(len(haystack))
            if haystack[pos : pos + width] == needle
        ),
        None,
    )
    if first is None:
        return "404 - Not found!"

    return first, first + width - 1

In [19]:
# Parse every cached page into flat records, one record per example sentence.
results = []

for verb in tqdm(verbs):
    # NOTE(review): the page is read with the platform default encoding,
    # matching how the download cell writes it; pin both to the same explicit
    # encoding together if portability matters.
    with open(htmls_path / (verb + ".html"), "r") as f:
        text = f.read()
    soup = BeautifulSoup(text, "html.parser")

    # The first "c3" cell is skipped (presumably a header -- verify against
    # the page layout); the next two are read as the past and present stems.
    stems = soup.find_all("td", {"class": "c3"})[1:]
    past_stem = stems[0].text
    present_stem = stems[1].text

    structures = soup.select("div.section-wrapper.more-examples")
    for structure in structures:
        examples = structure.find_all("div", {"class": "example"})
        valency = structure.find_all("div", {"class": "valency"})[0]
        # The valency spans are the same for every example of this structure,
        # so look them up once instead of twice per example.
        valency_spans = valency.find_all("span", recursive=False)

        for example in examples:
            example_text = example.text

            tags = []
            for span in example.find_all("span", recursive=False):
                span_text = span.get_text(strip=True)
                span_bounds = find_start_and_end(example_text, span_text)
                if isinstance(span_bounds, str):
                    # The helper returns a message string when the span is not
                    # found as a run of whole words; unpacking that string
                    # would raise an unrelated ValueError. Record the tag
                    # without a position instead.
                    start_index = end_index = None
                else:
                    start_index, end_index = span_bounds

                tags.append(
                    {
                        "text": span_text,
                        # Inclusive word indices into the example text.
                        "start": start_index,
                        "end": end_index,
                        "class": span["class"][0],
                    }
                )

            results.append(
                {
                    "verb": verb,
                    "past_stem": past_stem,
                    "present_stem": present_stem,
                    # First CSS class of each top-level valency span.
                    "structure_key": [e.get("class")[0] for e in valency_spans],
                    # Displayed text of each top-level valency span.
                    "structure_fa": [e.text for e in valency_spans],
                    "html": str(example),
                    "text": example_text,
                    "tags": tags,
                }
            )

100%|██████████| 4167/4167 [00:34<00:00, 122.03it/s]


In [20]:
# Persist the extracted records. ensure_ascii=False keeps non-ASCII text
# readable in the file instead of \uXXXX escapes, so the file is written as
# UTF-8 explicitly.
with output_verb_path.open("w", encoding="utf-8") as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4)