docs_to_markdown/
lib.rs

1use regex::Regex;
2use select::document::Document;
3use select::node::Node;
4use select::predicate::{Class, Name};
5
6pub fn parse(content: &str) -> Result<String, String> {
7    let document = Document::from(content);
8    let found_content = document
9        .find(Class("doc-content"))
10        .next()
11        .ok_or("Not found `doc-content`")?;
12
13    let result: Vec<String> = found_content
14        .children()
15        .into_iter()
16        .map(handle_node)
17        .filter(|s| s.is_some())
18        .map(|s| match s {
19            Some(s) => s,
20            None => "".to_string(),
21        })
22        .collect();
23
24    let result = insert_newline(result);
25
26    Ok(result)
27}
28
29fn handle_node(node: Node) -> Option<String> {
30    let Some(name) = node.name() else { return None; };
31    let header_format = |n: usize| Some(format!("{} {}", "#".repeat(n), node.text()));
32    match name {
33        "h1" => header_format(1),
34        "h2" => header_format(2),
35        "h3" => header_format(3),
36        "h4" => header_format(4),
37        "h5" => header_format(5),
38        "h6" => header_format(6),
39        "p" => Some(
40            node.children()
41                .into_iter()
42                .map(handle_span)
43                .collect::<Vec<String>>()
44                .join(""),
45        ),
46        "ul" => Some(
47            node.children()
48                .map(|c| format!("- {}", c.text()))
49                .collect::<Vec<String>>()
50                .join("\n"),
51        ),
52        "table" => Some(handle_table(node)),
53        _ => None,
54    }
55}
56
57fn handle_span(node: Node) -> String {
58    node.children()
59        .into_iter()
60        .map(|inner| {
61            let Some(inner_name) = inner.name() else { return node.text(); };
62            match inner_name {
63                "a" => {
64                    let Some(href) = inner.attr("href") else { return "".to_string();};
65                    // Remove Google redirect link
66                    let extract_link_regex =
67                        Regex::new(r"(?m)https://www\.google\.com/url\?q=(.*?)&sa.*$").unwrap();
68                    let href = extract_link_regex.replace_all(href, "$1");
69                    return format!("[{}]({})", inner.text(), href);
70                }
71                "img" => format!("![]({})", inner.attr("src").unwrap()),
72                _ => inner.text(),
73            }
74        })
75        .collect::<Vec<String>>()
76        .join("")
77}
78
79fn handle_table(table: Node) -> String {
80    let mut table_md = String::new();
81    table.find(Name("tr")).enumerate().for_each(|(i, row)| {
82        table_md.push('|');
83        let mut child_len = 0;
84        row.children().enumerate().for_each(|(j, col)| {
85            let text = col
86                .first_child()
87                .and_then(handle_node)
88                .map_or("".to_string(), |s| s + "|");
89            table_md.push_str(&text);
90            child_len = j + 1;
91        });
92        table_md.push('\n');
93        if i == 0 {
94            let header_sep = format!("|{}\n", "-|".repeat(child_len));
95            table_md.push_str(&header_sep);
96        }
97    });
98    table_md
99}
100
/// Joins Markdown fragments into one document.
///
/// Fragments are joined with `\n`. Outside fenced code blocks each fragment
/// additionally gets a trailing `\n`, so consecutive fragments end up
/// separated by a blank line. Inside a fenced code block no extra newline is
/// added, keeping the code lines adjacent.
///
/// Code-block state flips on fragments that contain an odd number of
/// ``` markers. A fragment that both opens and closes a fence (an even
/// count) therefore leaves the state untouched instead of switching the rest
/// of the document into code mode.
fn insert_newline(s: Vec<String>) -> String {
    let mut in_code = false;
    let result: Vec<String> = s
        .into_iter()
        .map(|mut fragment| {
            if fragment.matches("```").count() % 2 == 1 {
                in_code = !in_code;
            }
            // The fragment is owned, so it is extended in place rather than
            // cloned.
            if !in_code {
                fragment.push('\n');
            }
            fragment
        })
        .collect();

    result.join("\n")
}