1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
use regex::Regex;
use select::document::Document;
use select::node::Node;
use select::predicate::{Class, Name};
/// Converts an HTML document into Markdown-style text.
///
/// The first element carrying the class `doc-content` is located and each of
/// its direct children is rendered with `handle_node`; children that render
/// to nothing are skipped. The rendered pieces are then combined by
/// `insert_newline`.
///
/// NOTE(review): the `doc-content` class suggests the input is HTML published
/// from Google Docs — confirm against the caller.
///
/// # Errors
///
/// Returns `Err` with a message when no `doc-content` element exists.
pub fn parse(content: &str) -> Result<String, String> {
    let document = Document::from(content);
    let root = match document.find(Class("doc-content")).next() {
        Some(node) => node,
        None => return Err(String::from("Not found `doc-content`")),
    };
    // `filter_map` keeps only the children that produced some output.
    let blocks: Vec<String> = root.children().filter_map(handle_node).collect();
    Ok(insert_newline(blocks))
}
/// Renders one block-level node as Markdown-style text.
///
/// * `h1`–`h6` become `#`-prefixed headings using the node's text.
/// * `p` is the concatenation of `handle_span` over its children.
/// * `ul` yields one `- ` line per child, joined with newlines.
/// * `table` is delegated to `handle_table`.
///
/// Returns `None` when the node has no element name or its tag is not one
/// of the above.
fn handle_node(node: Node) -> Option<String> {
    let tag = node.name()?;

    // Headings share one format; only the number of `#` differs.
    let heading_level: Option<usize> = match tag {
        "h1" => Some(1),
        "h2" => Some(2),
        "h3" => Some(3),
        "h4" => Some(4),
        "h5" => Some(5),
        "h6" => Some(6),
        _ => None,
    };
    if let Some(level) = heading_level {
        return Some(format!("{} {}", "#".repeat(level), node.text()));
    }

    let rendered = match tag {
        "p" => node.children().map(handle_span).collect::<String>(),
        "ul" => {
            let items: Vec<String> = node
                .children()
                .map(|item| format!("- {}", item.text()))
                .collect();
            items.join("\n")
        }
        "table" => handle_table(node),
        _ => return None,
    };
    Some(rendered)
}
fn handle_span(node: Node) -> String {
node.children()
.into_iter()
.map(|inner| {
let Some(inner_name) = inner.name() else { return node.text(); };
match inner_name {
"a" => {
let Some(href) = inner.attr("href") else { return "".to_string();};
let extract_link_regex =
Regex::new(r"(?m)https://www\.google\.com/url\?q=(.*?)&sa.*$").unwrap();
let href = extract_link_regex.replace_all(href, "$1");
return format!("[{}]({})", inner.text(), href);
}
"img" => format!("![]({})", inner.attr("src").unwrap()),
_ => inner.text(),
}
})
.collect::<Vec<String>>()
.join("")
}
/// Renders a table node as a pipe-delimited Markdown-style table.
///
/// Every `tr` found under `table` becomes one `|cell|cell|…` line. A cell's
/// content is whatever `handle_node` produces for the cell's first child;
/// a cell with no first child, or whose first child renders to nothing,
/// is emitted as an empty cell so the column count stays intact. After the
/// first row a `|-|-|…` separator with one `-|` per cell is inserted.
///
/// Only element children of a row are treated as cells; children without an
/// element name (such as whitespace text between cells) are ignored so they
/// neither add columns nor widen the separator.
fn handle_table(table: Node) -> String {
    let mut table_md = String::new();
    for (row_idx, row) in table.find(Name("tr")).enumerate() {
        table_md.push('|');
        let mut cell_count = 0;
        for cell in row.children().filter(|child| child.name().is_some()) {
            let text = cell
                .first_child()
                .and_then(handle_node)
                .unwrap_or_default();
            table_md.push_str(&text);
            // Always close the cell, even when it is empty, so every row has
            // the same number of delimiters as the header separator.
            table_md.push('|');
            cell_count += 1;
        }
        table_md.push('\n');
        if row_idx == 0 {
            table_md.push('|');
            table_md.push_str(&"-|".repeat(cell_count));
            table_md.push('\n');
        }
    }
    table_md
}
/// Joins rendered blocks into one string, spacing them with newlines.
///
/// Blocks are separated by a single `\n`. In addition, every block that is
/// not inside a fenced region gets an extra trailing `\n`, which yields a
/// blank line between ordinary blocks. A block containing "```" toggles the
/// fenced state before it is emitted, so the opening fence and the lines
/// after it are kept tight while the closing fence is followed by the extra
/// newline again.
fn insert_newline(s: Vec<String>) -> String {
    let mut inside_fence = false;
    let mut joined = String::new();
    for (idx, block) in s.iter().enumerate() {
        if idx > 0 {
            joined.push('\n');
        }
        if block.contains("```") {
            inside_fence = !inside_fence;
        }
        joined.push_str(block);
        if !inside_fence {
            joined.push('\n');
        }
    }
    joined
}