lear 0.2.0 - Docs.rs

"""
borrowed with love from http://shakespeare.mit.edu/lear/full.html
"""
from __future__ import annotations

import json
import os
from dataclasses import dataclass
from os import path
from typing import Any, Dict, List

from bs4 import BeautifulSoup
from bs4.element import Tag
from dataclasses_json import DataClassJsonMixin
from more_itertools import peekable


@dataclass(frozen=True)
class Line(DataClassJsonMixin):
    """A single entry inside a block of dialogue.

    Declares no fields of its own; it is the common base of ``Text`` and
    ``Direction`` so that both can be stored in ``Dialogue.lines``.
    """


@dataclass(frozen=True)
class Text(Line):
    """A line of spoken text.

    ``Dialogue.from_html`` creates one of these for every ``<a>`` child of a
    speech's blockquote.
    """

    # The text content of the line.
    text: str


@dataclass(frozen=True)
class Direction(Line):
    """A stage direction, kept apart from spoken text so it can be formatted
    differently.

    ``Dialogue.from_html`` creates one of these for every ``<p>`` child of a
    speech's blockquote.
    """

    # The text content of the stage direction.
    direction: str


@dataclass(frozen=True)
class TextBlock(DataClassJsonMixin):
    """Base element for display.

    Declares no fields of its own; it is the common base of ``Heading`` and
    ``Dialogue``, the two kinds of element a scene is made of.
    """


@dataclass(frozen=True)
class Dialogue(TextBlock):
    """One speech: a run of lines delivered by a single character.

    ``act``, ``scene``, ``start`` and ``end`` together give the citation for
    the speech (act.scene.start-end).
    """

    character: str
    act: int
    scene: int
    start: int
    end: int
    lines: List[Line]

    @staticmethod
    def from_html(a: Tag, blockquote: Tag) -> Dialogue:
        """Build a ``Dialogue`` from a speaker tag and its blockquote.

        Args:
            a: Tag whose text is used as the character name.
            blockquote: Tag whose direct children make up the speech. ``<a>``
                children become ``Text`` entries and ``<p>`` children become
                ``Direction`` entries; every other child is ignored.

        Returns:
            The parsed speech, with its citation numbers taken from the
            ``name`` attribute of the first and last ``<a>`` children.

        Raises:
            IndexError: If the blockquote has no ``<a>`` children.
            KeyError: If the first or last ``<a>`` child has no ``name``.
            ValueError: If such a ``name`` is not exactly three
                dot-separated integers.
        """
        spoken: List[Tag] = []
        contents: List[Line] = []
        # Single pass: classify every child and remember the spoken lines,
        # since only those carry the numbering used for the citation.
        for child in blockquote:
            kind = child.name
            if kind == "a":
                spoken.append(child)
                contents.append(Text(text=child.text))
            elif kind == "p":
                contents.append(Direction(direction=child.text))

        opening, closing = spoken[0], spoken[-1]
        # Each name is read as "act.scene.line"; the first spoken line gives
        # act, scene and start, the last one only contributes the end.
        act, scene, start = tuple(int(part) for part in opening["name"].split("."))
        _, _, end = tuple(int(part) for part in closing["name"].split("."))

        return Dialogue(
            character=a.text,
            act=act,
            scene=scene,
            start=start,
            end=end,
            lines=contents,
        )

    def to_dict(self, encode_json=False) -> Dict[str, Any]:
        """Serialize to a one-key dict that wraps the fields under ``"Dialogue"``."""
        payload = super().to_dict(encode_json=encode_json)
        return {"Dialogue": payload}


@dataclass(frozen=True)
class Heading(TextBlock):
    """The heading that opens a scene: act, scene label, setting and staging."""

    act: str
    scene: str
    setting: str
    # NOTE(review): annotated as Direction, but from_html stores a plain
    # stripped string here -- confirm which of the two is intended.
    staging: Direction

    @staticmethod
    def from_html(act: str, setting: Tag, staging: Tag) -> Heading:
        """Build a ``Heading`` from the scene heading tag and the staging tag.

        The heading text is printed as-is, then split at its first ``"."``:
        the part before it becomes ``scene`` and the rest becomes ``setting``.

        Args:
            act: Label of the act this scene belongs to, stored unchanged.
            setting: Tag whose text holds the scene label and the setting.
            staging: Tag whose stripped text is stored as ``staging``.

        Raises:
            ValueError: If the heading text contains no ``"."``.
        """
        heading_text = setting.text
        print(heading_text)
        scene_label, place = heading_text.strip().split(".", 1)
        return Heading(
            act=act,
            scene=scene_label.strip(),
            setting=place.strip(),
            staging=staging.text.strip(),
        )

    def to_dict(self, encode_json=False) -> Dict[str, Any]:
        """Serialize to a one-key dict that wraps the fields under ``"Heading"``."""
        payload = super().to_dict(encode_json=encode_json)
        return {"Heading": payload}


# Read the play's HTML, which must sit in the same directory as this script.
working_dir = path.dirname(__file__)
with open(path.join(working_dir, "lear.html"), "r", encoding="utf-8") as f:
    lear_html = f.read()

# Walk the direct children of <body>, keeping only those that have a tag
# name (presumably this drops the bare text nodes between tags).
soup = BeautifulSoup(lear_html, "html.parser")
lear = peekable(tag for tag in soup.body if tag.name is not None)

scenes: List[List[TextBlock]] = []
current_scene: List[TextBlock] = []
act = "0"

while lear.peek(None):
    tag: Tag = next(lear)
    if tag.name == "h3":
        # A heading closes the scene collected so far and opens a new one.
        # This also runs for the very first heading, so entry 0 of `scenes`
        # holds whatever came before it (an empty list if nothing did).
        scenes.append(current_scene)
        current_scene = []
        if tag.text.startswith("Act"):
            # Act heading: remember its label and move on to the next tag,
            # which is then treated as the scene heading.
            # NOTE(review): assumes a scene heading always follows -- confirm.
            act = tag.text
            tag = next(lear)
        # The tag right after the scene heading is used as the staging text.
        staging = next(lear)
        scene_info = Heading.from_html(act, tag, staging)
        current_scene.append(scene_info)
    else:
        # Anything else is treated as the speaker tag of a speech; the tag
        # that follows it is taken as the blockquote holding the lines.
        blockquote: Tag = next(lear)
        dialogue = Dialogue.from_html(tag, blockquote)
        current_scene.append(dialogue)

# The last scene has no following heading to flush it, so add it here.
scenes.append(current_scene)

# Make sure the output directory exists; without this the first open() below
# fails with FileNotFoundError on a checkout that has no "scenes" directory.
os.makedirs(path.join(working_dir, "scenes"), exist_ok=True)

# Write one JSON file per collected scene, named by its zero-padded index.
for index, scene in enumerate(scenes):
    print(f"{index:02d}.json")
    file = path.join(working_dir, "scenes", f"{index:02d}.json")
    with open(file, "w", encoding="utf-8") as f:
        data = [block.to_dict(encode_json=True) for block in scene]
        json.dump(data, f, indent=4)