1use regex::Regex;
2use select::document::Document;
3use select::node::Node;
4use select::predicate::{Class, Name};
5
6pub fn parse(content: &str) -> Result<String, String> {
7 let document = Document::from(content);
8 let found_content = document
9 .find(Class("doc-content"))
10 .next()
11 .ok_or("Not found `doc-content`")?;
12
13 let result: Vec<String> = found_content
14 .children()
15 .into_iter()
16 .map(handle_node)
17 .filter(|s| s.is_some())
18 .map(|s| match s {
19 Some(s) => s,
20 None => "".to_string(),
21 })
22 .collect();
23
24 let result = insert_newline(result);
25
26 Ok(result)
27}
28
29fn handle_node(node: Node) -> Option<String> {
30 let Some(name) = node.name() else { return None; };
31 let header_format = |n: usize| Some(format!("{} {}", "#".repeat(n), node.text()));
32 match name {
33 "h1" => header_format(1),
34 "h2" => header_format(2),
35 "h3" => header_format(3),
36 "h4" => header_format(4),
37 "h5" => header_format(5),
38 "h6" => header_format(6),
39 "p" => Some(
40 node.children()
41 .into_iter()
42 .map(handle_span)
43 .collect::<Vec<String>>()
44 .join(""),
45 ),
46 "ul" => Some(
47 node.children()
48 .map(|c| format!("- {}", c.text()))
49 .collect::<Vec<String>>()
50 .join("\n"),
51 ),
52 "table" => Some(handle_table(node)),
53 _ => None,
54 }
55}
56
57fn handle_span(node: Node) -> String {
58 node.children()
59 .into_iter()
60 .map(|inner| {
61 let Some(inner_name) = inner.name() else { return node.text(); };
62 match inner_name {
63 "a" => {
64 let Some(href) = inner.attr("href") else { return "".to_string();};
65 let extract_link_regex =
67 Regex::new(r"(?m)https://www\.google\.com/url\?q=(.*?)&sa.*$").unwrap();
68 let href = extract_link_regex.replace_all(href, "$1");
69 return format!("[{}]({})", inner.text(), href);
70 }
71 "img" => format!("", inner.attr("src").unwrap()),
72 _ => inner.text(),
73 }
74 })
75 .collect::<Vec<String>>()
76 .join("")
77}
78
79fn handle_table(table: Node) -> String {
80 let mut table_md = String::new();
81 table.find(Name("tr")).enumerate().for_each(|(i, row)| {
82 table_md.push('|');
83 let mut child_len = 0;
84 row.children().enumerate().for_each(|(j, col)| {
85 let text = col
86 .first_child()
87 .and_then(handle_node)
88 .map_or("".to_string(), |s| s + "|");
89 table_md.push_str(&text);
90 child_len = j + 1;
91 });
92 table_md.push('\n');
93 if i == 0 {
94 let header_sep = format!("|{}\n", "-|".repeat(child_len));
95 table_md.push_str(&header_sep);
96 }
97 });
98 table_md
99}
100
/// Joins Markdown fragments into one document, spacing them with newlines.
///
/// Fragments are separated by a single `\n`. In addition, every fragment that
/// lies outside a fenced code block gets a trailing `\n`, which yields a blank
/// line between ordinary fragments. A fragment containing "```" flips the
/// "inside a fence" state *before* that decision is taken, so an opening fence
/// line is treated as code (no extra newline) and a closing fence line as
/// ordinary text (extra newline).
fn insert_newline(s: Vec<String>) -> String {
    let mut joined = String::new();
    let mut fenced = false;

    for (idx, fragment) in s.iter().enumerate() {
        // Separator between consecutive fragments.
        if idx > 0 {
            joined.push('\n');
        }

        if fragment.contains("```") {
            fenced = !fenced;
        }

        joined.push_str(fragment);
        if !fenced {
            joined.push('\n');
        }
    }

    joined
}
118}