1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
use regex::Regex;
use select::document::Document;
use select::node::Node;
use select::predicate::{Class, Name};

/// Converts an HTML page into Markdown.
///
/// The first element carrying the `doc-content` class is located and each of
/// its direct children is rendered with `handle_node`. Children that
/// `handle_node` does not support (it returns `None`) are skipped. The rendered
/// blocks are then joined by `insert_newline`.
///
/// NOTE(review): the `doc-content` class and the Google redirect handling in
/// `handle_span` suggest the input is a published Google Docs page; confirm
/// against the caller.
///
/// # Errors
///
/// Returns an error message when no element with class `doc-content` exists.
pub fn parse(content: &str) -> Result<String, String> {
    let document = Document::from(content);
    let found_content = document
        .find(Class("doc-content"))
        .next()
        .ok_or("Not found `doc-content`")?;

    // `filter_map` drops unsupported nodes and unwraps the rest in one step.
    let blocks: Vec<String> = found_content.children().filter_map(handle_node).collect();

    Ok(insert_newline(blocks))
}

/// Renders a single block-level node as Markdown.
///
/// Supported elements are `h1`-`h6`, `p`, `ul` and `table`. Returns `None` for
/// nodes without an element name and for any other element.
fn handle_node(node: Node) -> Option<String> {
    let tag = node.name()?;

    let markdown = match tag {
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
            // The matched names are exactly two ASCII bytes, so the second
            // byte is the heading level digit.
            let level = usize::from(tag.as_bytes()[1] - b'0');
            format!("{} {}", "#".repeat(level), node.text())
        }
        // A paragraph is the concatenation of its rendered children.
        "p" => node.children().map(handle_span).collect::<String>(),
        "ul" => {
            let items: Vec<String> = node
                .children()
                .map(|item| format!("- {}", item.text()))
                .collect();
            items.join("\n")
        }
        "table" => handle_table(node),
        _ => return None,
    };

    Some(markdown)
}

fn handle_span(node: Node) -> String {
    node.children()
        .into_iter()
        .map(|inner| {
            let Some(inner_name) = inner.name() else { return node.text(); };
            match inner_name {
                "a" => {
                    let Some(href) = inner.attr("href") else { return "".to_string();};
                    // Remove Google redirect link
                    let extract_link_regex =
                        Regex::new(r"(?m)https://www\.google\.com/url\?q=(.*?)&sa.*$").unwrap();
                    let href = extract_link_regex.replace_all(href, "$1");
                    return format!("[{}]({})", inner.text(), href);
                }
                "img" => format!("![]({})", inner.attr("src").unwrap()),
                _ => inner.text(),
            }
        })
        .collect::<Vec<String>>()
        .join("")
}

/// Renders a `<table>` node as a Markdown pipe table.
///
/// Every `<tr>` found under `table` becomes one `|cell|cell|` line. The first
/// row is followed by a `|-|-|` separator with one `-|` per cell of that row.
/// Only the first child of each cell is rendered (via `handle_node`); a cell
/// whose first child is missing or unsupported is emitted as an empty cell so
/// that the remaining cells stay in their columns.
fn handle_table(table: Node) -> String {
    let mut table_md = String::new();

    for (row_idx, row) in table.find(Name("tr")).enumerate() {
        table_md.push('|');

        // Only real cells count as columns; other children of the row (such
        // as whitespace text nodes) must neither emit nor be counted.
        let cells = row
            .children()
            .filter(|child| matches!(child.name(), Some("td" | "th")));

        let mut column_count = 0;
        for cell in cells {
            let text = cell
                .first_child()
                .and_then(handle_node)
                .unwrap_or_default();
            table_md.push_str(&text);
            table_md.push('|');
            column_count += 1;
        }
        table_md.push('\n');

        if row_idx == 0 {
            table_md.push('|');
            table_md.push_str(&"-|".repeat(column_count));
            table_md.push('\n');
        }
    }

    table_md
}

/// Joins rendered blocks into one string.
///
/// Blocks are separated by a single `\n`. A block containing "```" flips the
/// "inside fence" state before it is emitted. Every block emitted while
/// outside a fence additionally gets a trailing `\n`, so ordinary blocks end up
/// separated by a blank line while lines inside a fence stay adjacent.
fn insert_newline(s: Vec<String>) -> String {
    let mut joined = String::new();
    let mut inside_fence = false;

    for (index, block) in s.into_iter().enumerate() {
        if index > 0 {
            joined.push('\n');
        }
        if block.contains("```") {
            inside_fence = !inside_fence;
        }
        joined.push_str(&block);
        if !inside_fence {
            joined.push('\n');
        }
    }

    joined
}