1use scraper::{Html, ElementRef};
13use url::Url;
14
15use crate::selector::{SELECTORS, try_parse_selector, heading_selector};
16use crate::types::{
17 Heading, Image, ImageLoading, ListContent, ListType, ListItem,
18 TableContent, TableRow, TableCell, CodeBlock, Quote,
19 ParserConfig, ParserResult,
20};
21
22pub fn extract_headings(document: &Html) -> ParserResult<Vec<Heading>> {
28 let mut headings = Vec::new();
29
30 for level in 1..=6 {
31 let selector = heading_selector(level);
32
33 for element in document.select(selector) {
34 let text = element.text().collect::<String>().trim().to_string();
35
36 if text.is_empty() {
37 continue;
38 }
39
40 let mut heading = Heading::new(level, &text);
41
42 if let Some(id) = element.value().attr("id") {
44 heading.id = Some(id.to_string());
45 }
46
47 heading.classes = element.value().classes()
49 .map(|c| c.to_string())
50 .collect();
51
52 headings.push(heading);
53 }
54 }
55
56 Ok(headings)
57}
58
59pub fn get_main_heading(document: &Html) -> Option<String> {
61 document.select(&SELECTORS.h1)
62 .next()
63 .map(|el| el.text().collect::<String>().trim().to_string())
64 .filter(|s| !s.is_empty())
65}
66
67pub fn build_outline(headings: &[Heading]) -> Vec<OutlineItem> {
69 let mut outline = Vec::new();
70 let mut stack: Vec<(u8, usize)> = Vec::new(); for heading in headings {
73 let item = OutlineItem {
74 level: heading.level,
75 text: heading.text.clone(),
76 id: heading.id.clone(),
77 children: Vec::new(),
78 };
79
80 while let Some((level, _)) = stack.last() {
82 if *level >= heading.level {
83 stack.pop();
84 } else {
85 break;
86 }
87 }
88
89 outline.push(item);
90 stack.push((heading.level, outline.len() - 1));
91 }
92
93 outline
94}
95
/// A single entry of the outline returned by `build_outline`.
#[derive(Debug, Clone)]
pub struct OutlineItem {
    /// Heading level, copied from the source heading.
    pub level: u8,
    /// Heading text, copied from the source heading.
    pub text: String,
    /// The source heading's `id`, if it had one.
    pub id: Option<String>,
    /// Nested outline entries. `build_outline` in this file creates every item
    /// with an empty list here.
    pub children: Vec<OutlineItem>,
}
104
105pub fn extract_paragraphs(document: &Html, config: &ParserConfig) -> ParserResult<Vec<String>> {
111 let mut paragraphs = Vec::new();
112
113 for element in document.select(&SELECTORS.p) {
114 let text = element.text().collect::<String>().trim().to_string();
115
116 if text.len() >= config.min_paragraph_length {
118 paragraphs.push(text);
119 }
120 }
121
122 Ok(paragraphs)
123}
124
125pub fn extract_lists(document: &Html) -> ParserResult<Vec<ListContent>> {
131 let mut lists = Vec::new();
132
133 for ol in document.select(&SELECTORS.ol) {
135 if let Some(list) = extract_list(&ol, ListType::Ordered) {
136 lists.push(list);
137 }
138 }
139
140 for ul in document.select(&SELECTORS.ul) {
142 if let Some(list) = extract_list(&ul, ListType::Unordered) {
143 lists.push(list);
144 }
145 }
146
147 for dl in document.select(&SELECTORS.dl) {
149 if let Some(list) = extract_definition_list(&dl) {
150 lists.push(list);
151 }
152 }
153
154 Ok(lists)
155}
156
157fn extract_list(element: &ElementRef, list_type: ListType) -> Option<ListContent> {
159 let mut list = ListContent::new(list_type);
160
161 for child in element.children() {
163 if let Some(li) = ElementRef::wrap(child) {
164 if li.value().name() == "li" {
165 let item = extract_list_item(&li);
166 list.add_item(item);
167 }
168 }
169 }
170
171 if list.is_empty() {
172 None
173 } else {
174 Some(list)
175 }
176}
177
178fn extract_list_item(element: &ElementRef) -> ListItem {
180 let mut text = String::new();
182 let mut nested: Option<ListContent> = None;
183
184 for child in element.children() {
185 match child.value() {
186 scraper::Node::Text(t) => {
187 text.push_str(t.text.trim());
188 }
189 scraper::Node::Element(el) => {
190 if let Some(child_el) = ElementRef::wrap(child) {
191 match el.name() {
192 "ul" => {
193 nested = extract_list(&child_el, ListType::Unordered);
194 }
195 "ol" => {
196 nested = extract_list(&child_el, ListType::Ordered);
197 }
198 _ => {
199 text.push_str(&child_el.text().collect::<String>());
201 }
202 }
203 }
204 }
205 _ => {}
206 }
207 }
208
209 if let Some(nested_list) = nested {
210 ListItem::with_nested(text.trim(), nested_list)
211 } else {
212 ListItem::new(text.trim())
213 }
214}
215
216fn extract_definition_list(element: &ElementRef) -> Option<ListContent> {
218 let mut list = ListContent::new(ListType::Definition);
219
220 let mut current_term: Option<String> = None;
221
222 for child in element.children() {
223 if let Some(el) = ElementRef::wrap(child) {
224 match el.value().name() {
225 "dt" => {
226 current_term = Some(el.text().collect::<String>().trim().to_string());
227 }
228 "dd" => {
229 let definition = el.text().collect::<String>().trim().to_string();
230 let item_text = if let Some(term) = current_term.take() {
231 format!("{}: {}", term, definition)
232 } else {
233 definition
234 };
235 list.add_item(ListItem::new(item_text));
236 }
237 _ => {}
238 }
239 }
240 }
241
242 if list.is_empty() {
243 None
244 } else {
245 Some(list)
246 }
247}
248
249pub fn extract_tables(document: &Html) -> ParserResult<Vec<TableContent>> {
255 let mut tables = Vec::new();
256
257 for table_el in document.select(&SELECTORS.table) {
258 if let Some(table) = extract_table(&table_el) {
259 tables.push(table);
260 }
261 }
262
263 Ok(tables)
264}
265
266fn extract_table(element: &ElementRef) -> Option<TableContent> {
268 let mut table = TableContent::new();
269
270 if let Some(caption) = element.select(&SELECTORS.caption).next() {
272 table.caption = Some(caption.text().collect::<String>().trim().to_string());
273 }
274
275 table.summary = element.value().attr("summary").map(|s| s.to_string());
277
278 if let Some(thead) = element.select(&SELECTORS.thead).next() {
280 for tr in thead.select(&SELECTORS.tr) {
281 let row = extract_table_row(&tr, true);
282 if !row.cells.is_empty() {
283 table.headers.push(row);
284 }
285 }
286 } else {
287 if let Some(first_tr) = element.select(&SELECTORS.tr).next() {
289 let cells: Vec<_> = first_tr.select(&SELECTORS.th).collect();
290 if !cells.is_empty() {
291 let row = extract_table_row(&first_tr, true);
292 table.headers.push(row);
293 }
294 }
295 }
296
297 let tbody_selector = &SELECTORS.tbody;
299 let rows_to_process: Vec<ElementRef> = if let Some(tbody) = element.select(tbody_selector).next() {
300 tbody.select(&SELECTORS.tr).collect()
301 } else {
302 let all_rows: Vec<_> = element.select(&SELECTORS.tr).collect();
304 if !table.headers.is_empty() && !all_rows.is_empty() {
305 all_rows.into_iter().skip(1).collect()
306 } else {
307 all_rows
308 }
309 };
310
311 for tr in rows_to_process {
312 let row = extract_table_row(&tr, false);
313 if !row.cells.is_empty() {
314 if row.cells.len() > table.column_count {
316 table.column_count = row.cells.len();
317 }
318 table.rows.push(row);
319 }
320 }
321
322 if table.is_empty() {
323 None
324 } else {
325 Some(table)
326 }
327}
328
329fn extract_table_row(element: &ElementRef, is_header: bool) -> TableRow {
331 let mut cells = Vec::new();
332
333 for child in element.children() {
335 if let Some(cell_el) = ElementRef::wrap(child) {
336 let tag = cell_el.value().name();
337 if tag == "th" || tag == "td" {
338 let cell = extract_table_cell(&cell_el, tag == "th");
339 cells.push(cell);
340 }
341 }
342 }
343
344 TableRow {
345 cells,
346 is_header_row: is_header,
347 }
348}
349
350fn extract_table_cell(element: &ElementRef, is_header: bool) -> TableCell {
352 let content = element.text().collect::<String>().trim().to_string();
353
354 let colspan = element.value().attr("colspan")
355 .and_then(|s| s.parse().ok())
356 .unwrap_or(1);
357
358 let rowspan = element.value().attr("rowspan")
359 .and_then(|s| s.parse().ok())
360 .unwrap_or(1);
361
362 TableCell {
363 content,
364 is_header,
365 colspan,
366 rowspan,
367 }
368}
369
370pub fn extract_code_blocks(document: &Html) -> ParserResult<Vec<CodeBlock>> {
376 let mut code_blocks = Vec::new();
377 let mut seen_codes: std::collections::HashSet<String> = std::collections::HashSet::new();
378
379 for pre in document.select(&SELECTORS.pre) {
381 let code = if let Some(code_el) = pre.select(&SELECTORS.code).next() {
382 extract_code_block(&code_el, false)
383 } else {
384 extract_code_block(&pre, false)
385 };
386
387 if !code.code.trim().is_empty() && !seen_codes.contains(&code.code) {
389 seen_codes.insert(code.code.clone());
390 code_blocks.push(code);
391 }
392 }
393
394 for code_el in document.select(&SELECTORS.code) {
396 let in_pre = code_el.ancestors()
398 .any(|ancestor| {
399 ancestor.value().as_element()
400 .map(|e| e.name() == "pre")
401 .unwrap_or(false)
402 });
403
404 if !in_pre {
405 let code = extract_code_block(&code_el, true);
406 if !code.code.trim().is_empty() && !seen_codes.contains(&code.code) {
407 seen_codes.insert(code.code.clone());
408 code_blocks.push(code);
409 }
410 }
411 }
412
413 Ok(code_blocks)
414}
415
416fn extract_code_block(element: &ElementRef, is_inline: bool) -> CodeBlock {
418 let code = element.text().collect::<String>();
419
420 let language = element.value().classes()
422 .find(|c| {
423 c.starts_with("language-") ||
424 c.starts_with("lang-") ||
425 c.starts_with("hljs-") ||
426 is_known_language(c)
427 })
428 .map(|c| {
429 c.trim_start_matches("language-")
430 .trim_start_matches("lang-")
431 .trim_start_matches("hljs-")
432 .to_string()
433 });
434
435 let language = language.or_else(|| {
437 element.value().attr("data-language")
438 .or_else(|| element.value().attr("data-lang"))
439 .map(|s| s.to_string())
440 });
441
442 let mut block = CodeBlock::new(&code);
443 block.language = language;
444 block.is_inline = is_inline;
445
446 block
447}
448
/// Returns `true` when `class` is one of the recognised language names.
///
/// The comparison ignores ASCII case, so `"Python"` and `"JAVASCRIPT"` match.
/// It works on the borrowed string directly, so no lowercased copy is
/// allocated on each call.
fn is_known_language(class: &str) -> bool {
    const KNOWN_LANGUAGES: &[&str] = &[
        "rust", "python", "javascript", "typescript", "java", "c", "cpp",
        "csharp", "go", "ruby", "php", "swift", "kotlin", "scala", "html",
        "css", "sql", "bash", "shell", "json", "yaml", "xml", "markdown",
    ];

    KNOWN_LANGUAGES
        .iter()
        .any(|known| known.eq_ignore_ascii_case(class))
}
458
459pub fn extract_quotes(document: &Html) -> ParserResult<Vec<Quote>> {
465 let mut quotes = Vec::new();
466
467 for blockquote in document.select(&SELECTORS.blockquote) {
468 let text = blockquote.text().collect::<String>().trim().to_string();
469
470 if text.is_empty() {
471 continue;
472 }
473
474 let mut quote = Quote::new(&text);
475
476 quote.cite_url = blockquote.value().attr("cite").map(|s| s.to_string());
478
479 if let Some(sel) = try_parse_selector("footer, cite") {
481 if let Some(cite_el) = blockquote.select(&sel).next() {
482 quote.cite = Some(cite_el.text().collect::<String>().trim().to_string());
483 }
484 }
485
486 quotes.push(quote);
487 }
488
489 Ok(quotes)
490}
491
492pub fn extract_images(document: &Html, base_url: Option<&Url>) -> ParserResult<Vec<Image>> {
498 let mut images = Vec::new();
499
500 for img in document.select(&SELECTORS.img) {
501 if let Some(image) = extract_image(&img, base_url) {
502 images.push(image);
503 }
504 }
505
506 Ok(images)
507}
508
509fn extract_image(element: &ElementRef, base_url: Option<&Url>) -> Option<Image> {
511 let src = element.value().attr("src")
512 .or_else(|| element.value().attr("data-src"))
513 .or_else(|| element.value().attr("data-lazy-src"))?;
514
515 let alt = element.value().attr("alt").unwrap_or("").to_string();
516
517 let mut image = Image::new(src, &alt);
518
519 image.url = resolve_image_url(src, base_url);
521
522 image.width = element.value().attr("width")
524 .and_then(|s| s.trim_end_matches("px").parse().ok());
525 image.height = element.value().attr("height")
526 .and_then(|s| s.trim_end_matches("px").parse().ok());
527
528 image.srcset = element.value().attr("srcset").map(|s| s.to_string());
530 image.sizes = element.value().attr("sizes").map(|s| s.to_string());
531
532 image.loading = match element.value().attr("loading") {
534 Some("lazy") => ImageLoading::Lazy,
535 _ => ImageLoading::Eager,
536 };
537
538 image.title = element.value().attr("title").map(|s| s.to_string());
540
541 Some(image)
542}
543
544fn resolve_image_url(src: &str, base_url: Option<&Url>) -> Option<String> {
546 let trimmed = src.trim();
547
548 if trimmed.is_empty() || trimmed.starts_with("data:") {
549 return None;
550 }
551
552 if trimmed.starts_with("http://") || trimmed.starts_with("https://") {
553 return Some(trimmed.to_string());
554 }
555
556 if trimmed.starts_with("//") {
557 return Some(format!("https:{}", trimmed));
558 }
559
560 base_url
561 .and_then(|base| base.join(trimmed).ok())
562 .map(|u| u.to_string())
563}
564
// Unit tests for the extractors in this file. Each test parses a small HTML
// snippet and checks the extracted structure.
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` as a full document.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // Four headings are found and the first carries level, text and id of the <h1>.
    #[test]
    fn test_extract_headings() {
        let doc = parse_html(r#"
            <html><body>
                <h1 id="main">Main Title</h1>
                <h2>Section 1</h2>
                <h2>Section 2</h2>
                <h3>Subsection</h3>
            </body></html>
        "#);

        let headings = extract_headings(&doc).unwrap();
        assert_eq!(headings.len(), 4);
        assert_eq!(headings[0].level, 1);
        assert_eq!(headings[0].text, "Main Title");
        assert_eq!(headings[0].id, Some("main".to_string()));
    }

    // The text of the first <h1> is returned.
    #[test]
    fn test_get_main_heading() {
        let doc = parse_html("<html><body><h1>Main Title</h1></body></html>");
        assert_eq!(get_main_heading(&doc), Some("Main Title".to_string()));
    }

    // With the default config, the "Short" paragraph is expected to be filtered out.
    #[test]
    fn test_extract_paragraphs() {
        let doc = parse_html(r#"
            <html><body>
                <p>This is a long enough paragraph to be included.</p>
                <p>Short</p>
                <p>Another paragraph that should be extracted.</p>
            </body></html>
        "#);

        let config = ParserConfig::default();
        let paragraphs = extract_paragraphs(&doc, &config).unwrap();
        assert_eq!(paragraphs.len(), 2);
    }

    // A single <ol> yields one ordered list with three items.
    #[test]
    fn test_extract_ordered_list() {
        let doc = parse_html(r#"
            <ol>
                <li>First item</li>
                <li>Second item</li>
                <li>Third item</li>
            </ol>
        "#);

        let lists = extract_lists(&doc).unwrap();
        assert_eq!(lists.len(), 1);
        assert_eq!(lists[0].list_type, ListType::Ordered);
        assert_eq!(lists[0].items.len(), 3);
    }

    // The first item of the outer list carries the inner <ul> as its nested list.
    #[test]
    fn test_extract_nested_list() {
        let doc = parse_html(r#"
            <ul>
                <li>Item 1
                    <ul>
                        <li>Nested 1</li>
                        <li>Nested 2</li>
                    </ul>
                </li>
                <li>Item 2</li>
            </ul>
        "#);

        let lists = extract_lists(&doc).unwrap();
        assert!(!lists.is_empty());
        assert!(lists[0].items[0].nested.is_some());
    }

    // Caption, one header row, two body rows and a column count of two.
    #[test]
    fn test_extract_table() {
        let doc = parse_html(r#"
            <table>
                <caption>Test Table</caption>
                <thead>
                    <tr><th>Header 1</th><th>Header 2</th></tr>
                </thead>
                <tbody>
                    <tr><td>Cell 1</td><td>Cell 2</td></tr>
                    <tr><td>Cell 3</td><td>Cell 4</td></tr>
                </tbody>
            </table>
        "#);

        let tables = extract_tables(&doc).unwrap();
        assert_eq!(tables.len(), 1);
        assert_eq!(tables[0].caption, Some("Test Table".to_string()));
        assert_eq!(tables[0].headers.len(), 1);
        assert_eq!(tables[0].rows.len(), 2);
        assert_eq!(tables[0].column_count, 2);
    }

    // <pre><code class="language-rust"> gives one non-inline block tagged "rust".
    #[test]
    fn test_extract_code_block() {
        let doc = parse_html(r#"
            <pre><code class="language-rust">
            fn main() {
                println!("Hello");
            }
            </code></pre>
        "#);

        let code_blocks = extract_code_blocks(&doc).unwrap();
        assert_eq!(code_blocks.len(), 1);
        assert_eq!(code_blocks[0].language, Some("rust".to_string()));
        assert!(!code_blocks[0].is_inline);
    }

    // A <code> outside any <pre> is reported as inline.
    #[test]
    fn test_extract_inline_code() {
        let doc = parse_html(r#"<p>Use the <code>println!</code> macro.</p>"#);

        let code_blocks = extract_code_blocks(&doc).unwrap();
        assert_eq!(code_blocks.len(), 1);
        assert!(code_blocks[0].is_inline);
    }

    // The quote text and the `cite` attribute are both captured.
    #[test]
    fn test_extract_quotes() {
        let doc = parse_html(r#"
            <blockquote cite="https://example.com">
                <p>This is a quote.</p>
                <footer>— Author Name</footer>
            </blockquote>
        "#);

        let quotes = extract_quotes(&doc).unwrap();
        assert_eq!(quotes.len(), 1);
        assert!(quotes[0].text.contains("This is a quote"));
        assert_eq!(quotes[0].cite_url, Some("https://example.com".to_string()));
    }

    // All basic attributes are read and the relative src is resolved against the base URL.
    #[test]
    fn test_extract_images() {
        let doc = parse_html(r#"
            <img src="/images/photo.jpg"
                 alt="A photo"
                 title="Photo title"
                 width="800"
                 height="600"
                 loading="lazy">
        "#);

        let base = Url::parse("https://example.com").unwrap();
        let images = extract_images(&doc, Some(&base)).unwrap();

        assert_eq!(images.len(), 1);
        assert_eq!(images[0].alt, "A photo");
        assert_eq!(images[0].title, Some("Photo title".to_string()));
        assert_eq!(images[0].width, Some(800));
        assert_eq!(images[0].height, Some(600));
        assert_eq!(images[0].loading, ImageLoading::Lazy);
        assert_eq!(images[0].url, Some("https://example.com/images/photo.jpg".to_string()));
    }

    // An image with an empty alt text is expected to be flagged as decorative.
    #[test]
    fn test_image_decorative() {
        let doc = parse_html(r#"<img src="/spacer.gif" alt="">"#);
        let images = extract_images(&doc, None).unwrap();
        assert!(images[0].is_decorative);
    }

    // The colspan attribute of a body cell is parsed.
    #[test]
    fn test_table_with_colspan() {
        let doc = parse_html(r#"
            <table>
                <tr><td colspan="2">Spanning cell</td></tr>
                <tr><td>Cell 1</td><td>Cell 2</td></tr>
            </table>
        "#);

        let tables = extract_tables(&doc).unwrap();
        assert_eq!(tables[0].rows[0].cells[0].colspan, 2);
    }

    // Two <dt>/<dd> pairs yield one definition list with two items.
    #[test]
    fn test_definition_list() {
        let doc = parse_html(r#"
            <dl>
                <dt>Term 1</dt>
                <dd>Definition 1</dd>
                <dt>Term 2</dt>
                <dd>Definition 2</dd>
            </dl>
        "#);

        let lists = extract_lists(&doc).unwrap();
        assert_eq!(lists.len(), 1);
        assert_eq!(lists[0].list_type, ListType::Definition);
        assert_eq!(lists[0].items.len(), 2);
    }

    // The outline has one entry per heading (it is flat: four headings, four entries).
    #[test]
    fn test_build_outline() {
        let headings = vec![
            Heading::new(1, "Main"),
            Heading::new(2, "Section 1"),
            Heading::new(3, "Subsection 1.1"),
            Heading::new(2, "Section 2"),
        ];

        let outline = build_outline(&headings);
        assert_eq!(outline.len(), 4);
    }

    // Language names match regardless of letter case; unknown names do not.
    #[test]
    fn test_is_known_language() {
        assert!(is_known_language("rust"));
        assert!(is_known_language("Python"));
        assert!(is_known_language("JAVASCRIPT"));
        assert!(!is_known_language("unknown-lang"));
    }

    // srcset and sizes are captured and the image reports itself as responsive.
    #[test]
    fn test_responsive_image() {
        let doc = parse_html(r#"
            <img src="/img.jpg"
                 srcset="/img-320.jpg 320w, /img-640.jpg 640w"
                 sizes="(max-width: 320px) 280px, 640px"
                 alt="Responsive">
        "#);

        let images = extract_images(&doc, None).unwrap();
        assert!(images[0].is_responsive());
        assert!(images[0].srcset.is_some());
        assert!(images[0].sizes.is_some());
    }
}