1use regex::Regex;
7use scraper::{Html, Selector};
8use std::sync::LazyLock;
9
/// Tags whose entire subtree is noise for text extraction (code, styling,
/// embedded frames).
const SKIP_TAGS: &[&str] = &["script", "style", "noscript", "iframe"];

/// Page-chrome tags stripped when cleaning rustdoc HTML.
const NAV_TAGS: &[&str] = &["nav", "header", "footer", "aside"];

/// Interactive UI widget tags stripped when cleaning rustdoc HTML.
const UI_TAGS: &[&str] = &["button", "details", "summary"];

// Matches whole `<link ...>` tags (stylesheets, icons, preloads).
static LINK_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<link[^>]*>").unwrap());

// Matches whole `<meta ...>` tags.
static META_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"Copy item path").unwrap());

// Matches rustdoc's "Copy item path" button label text.
static COPY_PATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"Copy item path").unwrap());

// Matches Markdown links produced from rustdoc section anchors, e.g. `[§](#foo)`.
static ANCHOR_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());

// Matches `[Source](...)` links pointing at source pages.
static SOURCE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[Source\]\([^)]*\)").unwrap());

// Matches Markdown links whose target is a relative `*.html` page.
static RELATIVE_LINK_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[[^\]]*\]\([a-zA-Z][^)]*\.html\)").unwrap());

// NOTE(review): this pattern is byte-identical to ANCHOR_LINK_REGEX, so any
// pass with it after the anchor pass is a no-op — consider retiring one.
static SECTION_MARKER_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[§\]\([^)]*\)").unwrap());
44#[must_use]
49pub fn clean_html(html: &str) -> String {
50 let document = Html::parse_document(html);
51 remove_unwanted_elements(&document, html)
52}
53
54fn remove_unwanted_elements(document: &Html, original_html: &str) -> String {
56 let mut result = original_html.to_string();
57
58 for tag in SKIP_TAGS {
60 if let Ok(selector) = Selector::parse(tag) {
61 let elements: Vec<_> = document.select(&selector).collect();
62 for element in elements {
63 let element_html = element.html();
64 result = result.replace(&element_html, "");
65 }
66 }
67 }
68
69 let mut updated_doc = Html::parse_document(&result);
71
72 for tag in NAV_TAGS {
74 if let Ok(selector) = Selector::parse(tag) {
75 let elements: Vec<_> = updated_doc.select(&selector).collect();
76 for element in elements {
77 let element_html = element.html();
78 result = result.replace(&element_html, "");
79 }
80 }
81 }
82
83 updated_doc = Html::parse_document(&result);
85
86 for tag in UI_TAGS {
88 if let Ok(selector) = Selector::parse(tag) {
89 let elements: Vec<_> = updated_doc.select(&selector).collect();
90 for element in elements {
91 let element_html = element.html();
92 result = result.replace(&element_html, "");
93 }
94 }
95 }
96
97 result = LINK_TAG_REGEX.replace_all(&result, "").to_string();
99 result = META_TAG_REGEX.replace_all(&result, "").to_string();
100
101 result = COPY_PATH_REGEX.replace_all(&result, "").to_string();
103 result = ANCHOR_LINK_REGEX.replace_all(&result, "").to_string();
104
105 result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
107 result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
108
109 result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
111
112 result
113}
114
115#[must_use]
119pub fn html_to_text(html: &str) -> String {
120 let document = Html::parse_document(html);
121
122 let mut text_parts = Vec::new();
124
125 let body_selector = Selector::parse("body").unwrap();
127
128 if let Some(body) = document.select(&body_selector).next() {
129 extract_text_excluding_skip_tags(&body, &mut text_parts);
130 } else {
131 let all_selector = Selector::parse("*").unwrap();
133 if let Some(root) = document.select(&all_selector).next() {
134 extract_text_excluding_skip_tags(&root, &mut text_parts);
135 }
136 }
137
138 clean_whitespace(&text_parts.join(" "))
139}
140
141fn extract_text_excluding_skip_tags(
143 element: &scraper::element_ref::ElementRef,
144 text_parts: &mut Vec<String>,
145) {
146 let tag_name = element.value().name().to_lowercase();
147
148 if SKIP_TAGS.contains(&tag_name.as_str()) {
150 return;
151 }
152
153 for text in element.text() {
155 let trimmed = text.trim();
156 if !trimmed.is_empty() {
157 text_parts.push(trimmed.to_string());
158 }
159 }
160}
161
/// Returns `true` when `tag` is an HTML block-level element name.
/// Currently unused; retained for future layout-aware text extraction.
#[allow(dead_code)]
fn is_block_element(tag: &str) -> bool {
    matches!(
        tag,
        "address"
            | "article"
            | "aside"
            | "blockquote"
            | "body"
            | "canvas"
            | "dd"
            | "div"
            | "dl"
            | "dt"
            | "fieldset"
            | "figcaption"
            | "figure"
            | "footer"
            | "form"
            | "h1"
            | "h2"
            | "h3"
            | "h4"
            | "h5"
            | "h6"
            | "head"
            | "header"
            | "hgroup"
            | "hr"
            | "html"
            | "li"
            | "main"
            | "nav"
            | "noscript"
            | "ol"
            | "p"
            | "pre"
            | "section"
            | "table"
            | "tbody"
            | "td"
            | "tfoot"
            | "th"
            | "thead"
            | "tr"
            | "ul"
            | "video"
    )
}
212
213#[must_use]
218pub fn extract_documentation(html: &str) -> String {
219 let main_content = extract_main_content(html);
221 let cleaned_html = clean_html(&main_content);
222 let markdown = html2md::parse_html(&cleaned_html);
223
224 clean_markdown(&markdown)
226}
227
228fn clean_markdown(markdown: &str) -> String {
230 let result = markdown.to_string();
231
232 let result = SOURCE_LINK_REGEX.replace_all(&result, "").to_string();
234
235 let result = RELATIVE_LINK_REGEX.replace_all(&result, "").to_string();
237
238 let result = SECTION_MARKER_REGEX.replace_all(&result, "").to_string();
240
241 let result = result.replace("\n\n\n", "\n\n");
243
244 result.trim().to_string()
245}
246
247fn extract_main_content(html: &str) -> String {
252 let document = Html::parse_document(html);
253
254 if let Ok(selector) = Selector::parse("#main-content") {
256 if let Some(main_section) = document.select(&selector).next() {
257 return main_section.html();
258 }
259 }
260
261 if let Ok(selector) = Selector::parse("#rustdoc_body_wrapper") {
263 if let Some(wrapper) = document.select(&selector).next() {
264 return wrapper.html();
265 }
266 }
267
268 html.to_string()
270}
271
272#[must_use]
274pub fn extract_search_results(html: &str, item_path: &str) -> String {
275 let main_content = extract_main_content(html);
276 let cleaned_html = clean_html(&main_content);
277 let markdown = html2md::parse_html(&cleaned_html);
278 let cleaned_markdown = clean_markdown(&markdown);
279
280 if cleaned_markdown.trim().is_empty() {
281 format!("未找到项目 '{item_path}' 的文档")
282 } else {
283 format!("## 搜索结果: {item_path}\n\n{cleaned_markdown}")
284 }
285}
286
/// Collapses all runs of whitespace in `text` to single spaces and trims
/// leading/trailing whitespace.
fn clean_whitespace(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
291
#[cfg(test)]
mod tests {
    use super::*;

    // clean_html must drop <script> elements together with their contents.
    #[test]
    fn test_clean_html_removes_script() {
        let html = "<html><script>var x = 1;</script><body>Hello</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("script"));
        assert!(!cleaned.contains("var x"));
        assert!(cleaned.contains("Hello"));
    }

    // clean_html must drop <style> elements together with their CSS rules.
    #[test]
    fn test_clean_html_removes_style() {
        let html = "<html><style>.foo { color: red; }</style><body>Content</body></html>";
        let cleaned = clean_html(html);
        assert!(!cleaned.contains("style"));
        assert!(!cleaned.contains(".foo"));
        assert!(cleaned.contains("Content"));
    }

    // html_to_text strips all markup while keeping the visible text.
    #[test]
    fn test_html_to_text_removes_tags() {
        let html = "<p>Hello <strong>World</strong>!</p>";
        let text = html_to_text(html);
        assert!(!text.contains('<'));
        assert!(!text.contains('>'));
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    // NOTE(review): this assertion is an OR over three conditions, so it
    // passes for almost any non-empty output and barely constrains entity
    // handling — consider tightening to an exact expected string.
    #[test]
    fn test_html_to_text_handles_entities() {
        let html = r"<p>Tom & Jerry</p>";
        let text = html_to_text(html);
        assert!(text.contains('&') || text.contains("Tom") || text.contains("Jerry"));
    }

    // clean_whitespace collapses runs of spaces/tabs/newlines to single spaces.
    #[test]
    fn test_clean_whitespace() {
        assert_eq!(clean_whitespace("  hello world  "), "hello world");
        assert_eq!(clean_whitespace("\t\nhello\n\tworld\t\n"), "hello world");
    }

    // extract_documentation keeps heading and paragraph text in the Markdown.
    #[test]
    fn test_extract_documentation() {
        let html = "<html><body><h1>Title</h1><p>Content</p></body></html>";
        let docs = extract_documentation(html);
        assert!(docs.contains("Title"));
        assert!(docs.contains("Content"));
    }

    // A page with content produces the "搜索结果" (search results) header
    // containing both the item path and the page content.
    #[test]
    fn test_extract_search_results_found() {
        let html = "<html><body><h1>Result</h1></body></html>";
        let result = extract_search_results(html, "serde::Serialize");
        assert!(result.contains("搜索结果"));
        assert!(result.contains("serde::Serialize"));
        assert!(result.contains("Result"));
    }

    // An empty page produces the "未找到项目" (item not found) message.
    #[test]
    fn test_extract_search_results_not_found() {
        let html = "<html><body></body></html>";
        let result = extract_search_results(html, "nonexistent");
        assert!(result.contains("未找到项目"));
        assert!(result.contains("nonexistent"));
    }

    // <link> tags in <head> must be removed by the regex pass while body
    // content survives.
    #[test]
    fn test_clean_html_removes_link_tags() {
        let html = r#"<html><head><link rel="stylesheet" href="test.css"></head><body>Hello</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("link"),
            "link tag should be removed, got: {cleaned}"
        );
        assert!(
            !cleaned.contains("stylesheet"),
            "stylesheet should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Hello"),
            "Body content should remain, got: {cleaned}"
        );
    }

    // <meta> tags in <head> must be removed by the regex pass while body
    // content survives.
    #[test]
    fn test_clean_html_removes_meta_tags() {
        let html = r#"<html><head><meta charset="utf-8"></head><body>Content</body></html>"#;
        let cleaned = clean_html(html);
        assert!(
            !cleaned.contains("meta"),
            "meta tag should be removed, got: {cleaned}"
        );
        assert!(
            cleaned.contains("Content"),
            "Body content should remain, got: {cleaned}"
        );
    }
}