// story-dl 0.1.0
//
// Story web scraping
// Documentation
use {
    html5ever::{
        driver::ParseOpts,
        parse_document,
        rcdom::{Handle, NodeData, RcDom},
        tendril::TendrilSink,
    },
    std::io,
};

/// Parses `body` as an HTML document and renders it as Markdown-style text.
///
/// The document is parsed into an `RcDom` tree, which is then walked by
/// `convert` to build the output string.
///
/// # Errors
///
/// Returns the `io::Error` reported while feeding `body` to the HTML parser.
pub fn parse(body: impl AsRef<str>) -> io::Result<String> {
    let mut bytes = body.as_ref().as_bytes();

    // Take the size hint *before* parsing: reading from a `&[u8]` advances the
    // slice past the consumed bytes, so its length afterwards no longer
    // reflects the size of the input (it is 0 once everything has been read).
    // The divisor is only a rough guess at how much of the markup is text.
    let capacity = bytes.len() / 6;

    let dom = parse_document(RcDom::default(), ParseOpts::default())
        .from_utf8()
        .read_from(&mut bytes)?;

    let mut buf = String::with_capacity(capacity);

    convert(&mut buf, &dom.document);

    Ok(buf)
}

/// Recursively appends a Markdown rendering of `document` and its descendants
/// to `buf`.
///
/// * Comments, doctypes and processing instructions produce no output.
/// * Text is appended with every run of ASCII whitespace collapsed into a
///   single space; whitespace is dropped entirely while `buf` is empty or
///   already ends in a space or newline.
/// * `head`, `style` and `script` elements are skipped together with their
///   children.
/// * `a`, `b`/`strong`, `i`/`em`, `p`/`div`, `h1`-`h6`, `hr`, `br` and `img`
///   are mapped to Markdown syntax; every other element only contributes its
///   children.
fn convert(buf: &mut String, document: &Handle) {
    match &document.data {
        NodeData::Comment { .. }
        | NodeData::Doctype { .. }
        | NodeData::ProcessingInstruction { .. } => {}
        NodeData::Document => {
            for child in document.children.borrow().iter() {
                convert(buf, child);
            }
        }
        NodeData::Text { contents } => {
            // `prev` is true while the last thing in `buf` already separates
            // words, so that no leading or doubled spaces are emitted.
            let mut prev = buf.is_empty() || buf.ends_with(' ') || buf.ends_with('\n');

            for c in contents.borrow().chars() {
                // All ASCII whitespace (space, tab, LF, FF, CR) collapses to a
                // single space; other characters, including non-breaking
                // spaces, are copied verbatim.
                if c.is_ascii_whitespace() {
                    if !prev {
                        prev = true;

                        buf.push(' ');
                    }
                } else {
                    prev = false;

                    buf.push(c);
                }
            }
        }
        NodeData::Element { name, attrs, .. } => {
            // A single ASCII lowercase pass is enough to match tag names.
            let tag = name.local.to_ascii_lowercase();
            let tag: &str = &tag;
            let attrs = attrs.borrow();

            // Non-content elements: skip them and everything below them.
            if matches!(tag, "head" | "style" | "script") {
                return;
            }

            // Opening syntax, emitted before the element's children.
            match tag {
                "a" => buf.push_str("["),
                "b" | "strong" => buf.push_str("**"),
                "i" | "em" => buf.push_str("*"),
                "p" | "div" | "br" => double_newline(buf),
                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                    // A heading marker is only recognised at the start of a
                    // line, so make sure the heading opens its own paragraph.
                    double_newline(buf);

                    buf.push_str(match tag {
                        "h1" => "# ",
                        "h2" => "## ",
                        "h3" => "### ",
                        "h4" => "#### ",
                        "h5" => "##### ",
                        "h6" => "###### ",
                        _ => unreachable!(),
                    });
                }
                "hr" => {
                    // A blank line is needed before the rule: directly below a
                    // line of text, `---` would be read as a setext heading
                    // underline instead of a thematic break.
                    double_newline(buf);
                    buf.push_str("---");
                    newline(buf);
                }
                "img" => {
                    let mut src = "";
                    let mut alt = "no alt text";

                    for attr in attrs.iter() {
                        match &*attr.name.local.to_ascii_lowercase() {
                            "alt" => {
                                alt = &attr.value;
                            }
                            "src" => {
                                src = &attr.value;
                            }
                            _ => {}
                        }
                    }

                    buf.push_str("![");
                    buf.push_str(alt);
                    buf.push_str("](");
                    buf.push_str(src);
                    buf.push_str(")");
                }
                _ => {}
            }

            for child in document.children.borrow().iter() {
                convert(buf, child);
            }

            // Closing syntax, emitted after the element's children.
            match tag {
                "a" => {
                    let mut url = "";

                    for attr in attrs.iter() {
                        if &*attr.name.local.to_ascii_lowercase() == "href" {
                            url = &attr.value;
                        }
                    }

                    buf.push_str("](");
                    buf.push_str(url);
                    buf.push_str(")");
                }
                "b" | "strong" => buf.push_str("**"),
                "i" | "em" => buf.push_str("*"),
                // Terminate the heading line so that whatever follows does not
                // become part of the heading text.
                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => double_newline(buf),
                _ => {}
            }
        }
    }
}

/// Strips every trailing space and tab from `buf` in place.
///
/// Newlines are not touched, so trimming stops at the first trailing
/// character that is neither a space nor a tab.
fn trim_ending_whitespace(buf: &mut String) {
    let kept = buf
        .trim_end_matches(|c: char| c == ' ' || c == '\t')
        .len();

    buf.truncate(kept);
}

/// Ends the current paragraph: after dropping trailing spaces and tabs, pads
/// `buf` so that it ends with a blank line (`"\n\n"`).
///
/// An empty buffer is left empty, and a buffer that already ends with a blank
/// line is left unchanged.
fn double_newline(buf: &mut String) {
    trim_ending_whitespace(buf);

    if buf.is_empty() || buf.ends_with("\n\n") {
        return;
    }

    // Only add as many newlines as are still missing.
    let missing = if buf.ends_with('\n') { "\n" } else { "\n\n" };

    buf.push_str(missing);
}

/// Ends the current line: after dropping trailing spaces and tabs, appends a
/// single `'\n'` unless `buf` is empty or already ends with one.
fn newline(buf: &mut String) {
    trim_ending_whitespace(buf);

    let needs_break = !(buf.is_empty() || buf.ends_with('\n'));

    if needs_break {
        buf.push('\n');
    }
}