Skip to main content

fetchkit/
convert.rs

1//! HTML conversion utilities
2
3use crate::types::{PageLink, PageMetadata};
4
/// Report whether a `Content-Type` header value names the `text/markdown` media type.
///
/// Only the part before the first `;` is considered, so parameters such as
/// `charset=utf-8` are ignored. Surrounding whitespace is trimmed and the
/// comparison is ASCII case-insensitive. A missing header yields `false`.
pub fn is_markdown_content_type(content_type: &Option<String>) -> bool {
    match content_type {
        Some(header) => {
            // `split` always yields at least one piece; the fallback is never hit.
            let media_type = header.split(';').next().unwrap_or(header);
            media_type.trim().eq_ignore_ascii_case("text/markdown")
        }
        None => false,
    }
}
13
/// Report whether a `Content-Type` header value names the `text/plain` media type.
///
/// Only the part before the first `;` is considered, so parameters such as
/// `charset=utf-8` are ignored. Surrounding whitespace is trimmed and the
/// comparison is ASCII case-insensitive. A missing header yields `false`.
pub fn is_plain_text_content_type(content_type: &Option<String>) -> bool {
    match content_type {
        Some(header) => {
            // `split` always yields at least one piece; the fallback is never hit.
            let media_type = header.split(';').next().unwrap_or(header);
            media_type.trim().eq_ignore_ascii_case("text/plain")
        }
        None => false,
    }
}
22
/// Check if content is HTML based on content type and body
///
/// Returns `true` if the content type contains `text/html` or `application/xhtml`
/// (case-insensitively), or if the body — after skipping a leading byte-order
/// mark and whitespace — starts with `<!DOCTYPE` or `<html` in any letter case.
pub fn is_html(content_type: &Option<String>, body: &str) -> bool {
    // Check Content-Type
    if let Some(ct) = content_type {
        let ct_lower = ct.to_lowercase();
        if ct_lower.contains("text/html") || ct_lower.contains("application/xhtml") {
            return true;
        }
    }

    // Check body start. A BOM is not whitespace for `trim_start`, so strip it
    // explicitly before trimming.
    let trimmed = body.trim_start_matches('\u{feff}').trim_start();

    // Doctype and tag names are case-insensitive in HTML, so compare the prefix
    // ignoring ASCII case. `get` returns `None` when the body is shorter than
    // the prefix or the cut would split a multi-byte character.
    let has_prefix = |prefix: &str| {
        trimmed
            .get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    };

    has_prefix("<!doctype") || has_prefix("<html")
}
40
41/// Convert HTML to markdown
42///
43/// Converts common HTML elements (headings, lists, emphasis, code blocks, links,
44/// blockquotes) to their Markdown equivalents. Strips script, style, noscript,
45/// iframe, and svg elements. Decodes HTML entities.
46///
47/// # Examples
48///
49/// ```
50/// use fetchkit::html_to_markdown;
51///
52/// let html = "<h1>Title</h1><p><strong>Bold</strong> text</p>";
53/// let md = html_to_markdown(html);
54/// assert!(md.contains("# Title"));
55/// assert!(md.contains("**Bold**"));
56/// ```
57pub fn html_to_markdown(html: &str) -> String {
58    let mut output = String::new();
59    let mut in_skip_element = 0;
60    let mut skip_elements: Vec<String> = Vec::new();
61    let mut in_pre = false;
62    let mut in_blockquote = false;
63
64    // Link tracking: when we see <a href="...">, save href and record the output
65    // position. On </a>, wrap the text collected since then in [text](href).
66    let mut link_href: Option<String> = None;
67    let mut link_start: usize = 0;
68
69    // List tracking: stack of list types (true=ordered, false=unordered) with item counter
70    let mut list_stack: Vec<(bool, usize)> = Vec::new();
71
72    // Table tracking
73    let mut in_table = false;
74    let mut table_rows: Vec<Vec<String>> = Vec::new();
75    let mut current_row: Vec<String> = Vec::new();
76    let mut in_cell = false;
77    let mut cell_buf = String::new();
78    let mut is_header_row = false;
79
80    let mut chars = html.chars().peekable();
81
82    while let Some(c) = chars.next() {
83        if c == '<' {
84            // Parse tag
85            let mut tag = String::new();
86            while let Some(&next) = chars.peek() {
87                if next == '>' {
88                    chars.next();
89                    break;
90                }
91                tag.push(chars.next().unwrap());
92            }
93
94            let tag_lower = tag.to_lowercase();
95            let is_closing = tag_lower.starts_with('/');
96            let tag_name = if is_closing {
97                tag_lower[1..].split_whitespace().next().unwrap_or("")
98            } else {
99                tag_lower.split_whitespace().next().unwrap_or("")
100            };
101
102            // THREAT[TM-CONV-001]: Strip script/style/iframe/svg to prevent injection
103            let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
104            if skip_tags.contains(&tag_name) {
105                if is_closing {
106                    if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
107                        skip_elements.remove(pos);
108                        in_skip_element = skip_elements.len();
109                    }
110                } else if !tag.ends_with('/') {
111                    skip_elements.push(tag_name.to_string());
112                    in_skip_element = skip_elements.len();
113                }
114                continue;
115            }
116
117            if in_skip_element > 0 {
118                continue;
119            }
120
121            // Handle markdown conversion
122            match tag_name {
123                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
124                    let level = tag_name[1..].parse::<usize>().unwrap_or(1);
125                    if !is_closing {
126                        output.push('\n');
127                        for _ in 0..level {
128                            output.push('#');
129                        }
130                        output.push(' ');
131                    } else {
132                        output.push_str("\n\n");
133                    }
134                }
135                "p" | "div" | "section" | "article" | "main" | "header" | "footer"
136                    if is_closing =>
137                {
138                    output.push_str("\n\n");
139                }
140                "br" => {
141                    output.push('\n');
142                }
143                "hr" => {
144                    output.push_str("\n---\n");
145                }
146                "ul" => {
147                    if is_closing {
148                        list_stack.pop();
149                        if list_stack.is_empty() {
150                            output.push('\n');
151                        }
152                    } else {
153                        list_stack.push((false, 0));
154                    }
155                }
156                "ol" => {
157                    if is_closing {
158                        list_stack.pop();
159                        if list_stack.is_empty() {
160                            output.push('\n');
161                        }
162                    } else {
163                        list_stack.push((true, 0));
164                    }
165                }
166                "li" if !is_closing => {
167                    output.push('\n');
168                    let depth = list_stack.len().saturating_sub(1);
169                    for _ in 0..depth {
170                        output.push_str("  ");
171                    }
172                    if let Some((is_ordered, counter)) = list_stack.last_mut() {
173                        if *is_ordered {
174                            *counter += 1;
175                            output.push_str(&format!("{}. ", *counter));
176                        } else {
177                            output.push_str("- ");
178                        }
179                    } else {
180                        output.push_str("- ");
181                    }
182                }
183                "strong" | "b" => {
184                    output.push_str("**");
185                }
186                "em" | "i" => {
187                    output.push('*');
188                }
189                "pre" => {
190                    if !is_closing {
191                        output.push_str("\n```\n");
192                        in_pre = true;
193                    } else {
194                        output.push_str("\n```\n");
195                        in_pre = false;
196                    }
197                }
198                "code" if !in_pre => {
199                    output.push('`');
200                }
201                "blockquote" => {
202                    if !is_closing {
203                        in_blockquote = true;
204                        output.push_str("\n> ");
205                    } else {
206                        in_blockquote = false;
207                        output.push('\n');
208                    }
209                }
210                "a" => {
211                    if !is_closing {
212                        if let Some(href) = extract_attribute(&tag, "href") {
213                            if !href.is_empty() {
214                                link_href = Some(href);
215                                link_start = output.len();
216                            }
217                        }
218                    } else if let Some(href) = link_href.take() {
219                        let text = output[link_start..].trim().to_string();
220                        output.truncate(link_start);
221                        if text.is_empty() {
222                            output.push_str(&format!("<{}>", href));
223                        } else {
224                            output.push_str(&format!("[{}]({})", text, href));
225                        }
226                    }
227                }
228                "img" if !is_closing => {
229                    let alt = extract_attribute(&tag, "alt").unwrap_or_default();
230                    if let Some(src) = extract_attribute(&tag, "src") {
231                        output.push_str(&format!("![{}]({})", alt, src));
232                    }
233                }
234                // Table handling
235                "table" => {
236                    if !is_closing {
237                        in_table = true;
238                        table_rows.clear();
239                    } else {
240                        in_table = false;
241                        render_table(&table_rows, &mut output);
242                        table_rows.clear();
243                    }
244                }
245                "tr" => {
246                    if !is_closing {
247                        current_row.clear();
248                        is_header_row = false;
249                    } else if in_table {
250                        table_rows.push(current_row.clone());
251                        if is_header_row && table_rows.len() == 1 {
252                            let sep: Vec<String> =
253                                current_row.iter().map(|_| "---".to_string()).collect();
254                            table_rows.push(sep);
255                        }
256                        current_row.clear();
257                    }
258                }
259                "th" => {
260                    if !is_closing {
261                        in_cell = true;
262                        cell_buf.clear();
263                        is_header_row = true;
264                    } else {
265                        in_cell = false;
266                        current_row.push(cell_buf.trim().to_string());
267                        cell_buf.clear();
268                    }
269                }
270                "td" => {
271                    if !is_closing {
272                        in_cell = true;
273                        cell_buf.clear();
274                    } else {
275                        in_cell = false;
276                        current_row.push(cell_buf.trim().to_string());
277                        cell_buf.clear();
278                    }
279                }
280                // Definition lists
281                "dl" if is_closing => {
282                    output.push_str("\n\n");
283                }
284                "dt" => {
285                    if !is_closing {
286                        output.push_str("\n**");
287                    } else {
288                        output.push_str("**\n");
289                    }
290                }
291                "dd" => {
292                    if !is_closing {
293                        output.push_str(": ");
294                    } else {
295                        output.push('\n');
296                    }
297                }
298                _ => {}
299            }
300        } else if in_skip_element == 0 {
301            // Text content
302            let decoded = decode_entity(c, &mut chars);
303
304            if in_cell {
305                cell_buf.push(decoded);
306            } else if in_table {
307                // Ignore text outside cells but inside table
308            } else if in_blockquote && decoded == '\n' {
309                output.push_str("\n> ");
310            } else {
311                output.push(decoded);
312            }
313        }
314    }
315
316    clean_whitespace(&output)
317}
318
/// Render collected table rows as a markdown table.
///
/// Appends a leading newline followed by one `| a | b |` line per row to
/// `output`. Nothing is written when `rows` is empty.
///
/// Cell text is normalised so it cannot break the table layout: runs of
/// whitespace (including newlines) collapse to a single space, and literal
/// `|` characters are escaped as `\|`.
fn render_table(rows: &[Vec<String>], output: &mut String) {
    if rows.is_empty() {
        return;
    }

    output.push('\n');
    for row in rows {
        let cells: Vec<String> = row
            .iter()
            .map(|cell| {
                cell.split_whitespace()
                    .collect::<Vec<_>>()
                    .join(" ")
                    .replace('|', "\\|")
            })
            .collect();

        output.push_str("| ");
        output.push_str(&cells.join(" | "));
        output.push_str(" |\n");
    }
}
332
333/// Convert HTML to plain text
334///
335/// Strips all HTML tags and returns plain text content. Handles newlines
336/// for block elements (p, div, headings). Decodes HTML entities.
337///
338/// # Examples
339///
340/// ```
341/// use fetchkit::html_to_text;
342///
343/// let html = "<h1>Title</h1><p>Paragraph with &amp; entity</p>";
344/// let text = html_to_text(html);
345/// assert!(text.contains("Title"));
346/// assert!(text.contains("Paragraph with & entity"));
347/// ```
348pub fn html_to_text(html: &str) -> String {
349    let mut output = String::new();
350    let mut in_skip_element = 0;
351    let mut skip_elements: Vec<String> = Vec::new();
352
353    let mut chars = html.chars().peekable();
354
355    while let Some(c) = chars.next() {
356        if c == '<' {
357            // Parse tag
358            let mut tag = String::new();
359            while let Some(&next) = chars.peek() {
360                if next == '>' {
361                    chars.next();
362                    break;
363                }
364                tag.push(chars.next().unwrap());
365            }
366
367            let tag_lower = tag.to_lowercase();
368            let is_closing = tag_lower.starts_with('/');
369            let tag_name = if is_closing {
370                tag_lower[1..].split_whitespace().next().unwrap_or("")
371            } else {
372                tag_lower.split_whitespace().next().unwrap_or("")
373            };
374
375            // THREAT[TM-CONV-001]: Strip script/style/iframe/svg to prevent injection
376            let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
377            if skip_tags.contains(&tag_name) {
378                if is_closing {
379                    if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
380                        skip_elements.remove(pos);
381                        in_skip_element = skip_elements.len();
382                    }
383                } else if !tag.ends_with('/') {
384                    skip_elements.push(tag_name.to_string());
385                    in_skip_element = skip_elements.len();
386                }
387                continue;
388            }
389
390            if in_skip_element > 0 {
391                continue;
392            }
393
394            // Handle newline-inducing elements
395            let newline_tags = [
396                "p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr",
397            ];
398            if newline_tags.contains(&tag_name) && (is_closing || tag_name == "br") {
399                output.push('\n');
400            } else if newline_tags.contains(&tag_name) && !is_closing {
401                // Opening tags like h1-h6, p, etc. also add newline
402                if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "p") {
403                    output.push('\n');
404                }
405            }
406        } else if in_skip_element == 0 {
407            // Text content
408            let decoded = decode_entity(c, &mut chars);
409            output.push(decoded);
410        }
411    }
412
413    clean_whitespace(&output)
414}
415
/// Extract attribute value from tag
///
/// `tag` is the text between `<` and `>` (tag name included or not). The tag is
/// tokenised into `name`, `name=value`, `name="value"` and `name='value'`
/// items, and the value of the first item whose name equals `attr`
/// (ASCII case-insensitively) is returned.
///
/// Matching on whole attribute names means `src` is not confused with
/// `data-src`, and text inside another attribute's quoted value is never
/// mistaken for an attribute. Whitespace around `=` is accepted.
///
/// Returns `None` when the attribute is absent, has no value, or a quoted
/// value is missing its closing quote.
fn extract_attribute(tag: &str, attr: &str) -> Option<String> {
    let is_name_end = |c: char| c.is_whitespace() || matches!(c, '=' | '/' | '>');

    let mut rest = tag;
    loop {
        // Skip separators between items.
        rest = rest.trim_start_matches(|c: char| c.is_whitespace() || c == '/');
        if rest.is_empty() {
            return None;
        }

        let name_end = rest.find(is_name_end).unwrap_or(rest.len());
        if name_end == 0 {
            // Stray '=' or '>' (both one byte): step over it to guarantee progress.
            rest = &rest[1..];
            continue;
        }

        let (name, after_name) = rest.split_at(name_end);
        let after_name = after_name.trim_start();

        let value_src = match after_name.strip_prefix('=') {
            Some(v) => v.trim_start(),
            None => {
                // Bare name without a value (this also covers the tag name itself).
                rest = after_name;
                continue;
            }
        };

        let (value, remainder) = if let Some(quoted) = value_src.strip_prefix('"') {
            let end = quoted.find('"')?;
            (&quoted[..end], &quoted[end + 1..])
        } else if let Some(quoted) = value_src.strip_prefix('\'') {
            let end = quoted.find('\'')?;
            (&quoted[..end], &quoted[end + 1..])
        } else {
            // Unquoted value runs until whitespace or '>'.
            let end = value_src
                .find(|c: char| c.is_whitespace() || c == '>')
                .unwrap_or(value_src.len());
            value_src.split_at(end)
        };

        if name.eq_ignore_ascii_case(attr) {
            return Some(value.to_string());
        }
        rest = remainder;
    }
}
446
/// Decode HTML entity starting from ampersand
///
/// `c` is the character just taken from `chars`. Any character other than `&`
/// is returned unchanged. For `&`, the following characters are inspected
/// without consuming them; only when they form a recognised named entity or a
/// valid numeric reference (`&#NN;`, `&#xHH;`, `&#XHH;`) are they consumed and
/// the decoded character returned. Otherwise a literal `&` is returned and the
/// iterator is left untouched, so text such as `AT&T` or `&unknown;` keeps its
/// trailing characters instead of losing them.
///
/// A candidate is abandoned when whitespace is met before `;` or when the name
/// grows past the length cap. End of input is accepted in place of `;`.
// THREAT[TM-CONV-004]: Limited named-entity set; rejects long/unknown sequences
fn decode_entity(c: char, chars: &mut std::iter::Peekable<std::str::Chars>) -> char {
    if c != '&' {
        return c;
    }

    // Look ahead on a clone so a failed match consumes nothing.
    let mut entity = String::new();
    let mut has_semicolon = false;
    for next in (*chars).clone() {
        if next == ';' {
            has_semicolon = true;
            break;
        }
        if next.is_whitespace() || entity.len() > 10 {
            // Not a valid entity
            return '&';
        }
        entity.push(next);
    }

    let decoded = match entity.as_str() {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "nbsp" => Some(' '),
        "mdash" => Some('—'),
        "ndash" => Some('–'),
        "copy" => Some('©'),
        "reg" => Some('®'),
        "trade" => Some('™'),
        "bull" => Some('•'),
        "hellip" => Some('…'),
        "laquo" => Some('«'),
        "raquo" => Some('»'),
        "lsquo" => Some('\u{2018}'),
        "rsquo" => Some('\u{2019}'),
        "ldquo" => Some('\u{201C}'),
        "rdquo" => Some('\u{201D}'),
        "euro" => Some('€'),
        "pound" => Some('£'),
        "yen" => Some('¥'),
        "cent" => Some('¢'),
        "deg" => Some('°'),
        "micro" => Some('µ'),
        "para" => Some('¶'),
        "sect" => Some('§'),
        "middot" => Some('·'),
        "times" => Some('×'),
        "divide" => Some('÷'),
        "plusmn" => Some('±'),
        "frac12" => Some('½'),
        "frac14" => Some('¼'),
        "frac34" => Some('¾'),
        "larr" => Some('←'),
        "rarr" => Some('→'),
        "uarr" => Some('↑'),
        "darr" => Some('↓'),
        // Check for numeric entities (decimal, or hex with an x/X prefix).
        other => other.strip_prefix('#').and_then(|num| {
            let code = match num.strip_prefix('x').or_else(|| num.strip_prefix('X')) {
                Some(hex) => u32::from_str_radix(hex, 16),
                None => num.parse::<u32>(),
            };
            code.ok().and_then(char::from_u32)
        }),
    };

    match decoded {
        Some(ch) => {
            // Commit: consume the entity name and its terminating ';' if present.
            let consumed = entity.chars().count() + usize::from(has_semicolon);
            for _ in 0..consumed {
                chars.next();
            }
            ch
        }
        // Unknown entity - emit the ampersand and leave the rest as plain text.
        None => '&',
    }
}
527
/// Normalise whitespace in converted text.
///
/// - At most two consecutive newlines are kept.
/// - A single space immediately before a newline is removed.
/// - Spaces and tabs at the start of a line are kept as-is, so list
///   indentation survives.
/// - Any other run of whitespace becomes a single space.
/// - The final result is trimmed at both ends.
pub fn clean_whitespace(s: &str) -> String {
    let mut cleaned = String::new();
    let mut prev_was_space = false;
    let mut newline_run = 0;
    let mut line_start = true;

    for ch in s.chars() {
        match ch {
            '\n' => {
                // Drop a trailing space left at the end of the line.
                if prev_was_space && cleaned.ends_with(' ') {
                    cleaned.pop();
                }
                newline_run += 1;
                prev_was_space = true;
                line_start = true;
                if newline_run <= 2 {
                    cleaned.push(ch);
                }
            }
            // Indentation at the start of a line is copied through untouched.
            ' ' | '\t' if line_start => cleaned.push(ch),
            // Every other whitespace character collapses into one space.
            _ if ch.is_whitespace() => {
                newline_run = 0;
                if !prev_was_space {
                    cleaned.push(' ');
                    prev_was_space = true;
                }
            }
            _ => {
                newline_run = 0;
                prev_was_space = false;
                line_start = false;
                cleaned.push(ch);
            }
        }
    }

    cleaned.trim().to_string()
}
575
/// Limit every run of consecutive `\n` characters to at most two.
///
/// All other characters are copied through unchanged and reset the run.
pub fn filter_excessive_newlines(s: &str) -> String {
    let mut run = 0usize;
    s.chars()
        .filter(|&ch| {
            if ch == '\n' {
                run += 1;
                run <= 2
            } else {
                run = 0;
                true
            }
        })
        .collect()
}
595
596/// Extract structured metadata from HTML in a single pass.
597///
598/// Extracts title, description, language, canonical URL, author,
599/// published/modified dates, links, and heading outline from HTML.
600///
601/// # Examples
602///
603/// ```
604/// use fetchkit::{extract_metadata, extract_headings};
605///
606/// let html = r#"<html lang="en"><head><title>Hello</title></head><body><h1>World</h1></body></html>"#;
607/// let mut meta = extract_metadata(html);
608/// meta.headings = extract_headings(html);
609/// assert_eq!(meta.title.as_deref(), Some("Hello"));
610/// assert_eq!(meta.language.as_deref(), Some("en"));
611/// assert_eq!(meta.headings, vec!["# World"]);
612/// ```
613pub fn extract_metadata(html: &str) -> PageMetadata {
614    let mut meta = PageMetadata::default();
615    let mut chars = html.chars().peekable();
616    let mut in_title = false;
617    let mut title_buf = String::new();
618    let mut in_skip_element = 0;
619    let mut skip_elements: Vec<String> = Vec::new();
620    // Track current <a> href for link extraction
621    let mut current_link_href: Option<String> = None;
622    let mut current_link_text = String::new();
623
624    while let Some(c) = chars.next() {
625        if c == '<' {
626            let mut tag = String::new();
627            while let Some(&next) = chars.peek() {
628                if next == '>' {
629                    chars.next();
630                    break;
631                }
632                tag.push(chars.next().unwrap());
633            }
634
635            let tag_lower = tag.to_lowercase();
636            let is_closing = tag_lower.starts_with('/');
637            let tag_name = if is_closing {
638                tag_lower[1..].split_whitespace().next().unwrap_or("")
639            } else {
640                tag_lower.split_whitespace().next().unwrap_or("")
641            };
642
643            // Skip dangerous elements
644            let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
645            if skip_tags.contains(&tag_name) {
646                if is_closing {
647                    if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
648                        skip_elements.remove(pos);
649                        in_skip_element = skip_elements.len();
650                    }
651                } else if !tag.ends_with('/') {
652                    skip_elements.push(tag_name.to_string());
653                    in_skip_element = skip_elements.len();
654                }
655                continue;
656            }
657
658            if in_skip_element > 0 {
659                continue;
660            }
661
662            match tag_name {
663                "html" if !is_closing => {
664                    if let Some(lang) = extract_attribute(&tag, "lang") {
665                        if meta.language.is_none() && !lang.is_empty() {
666                            meta.language = Some(lang);
667                        }
668                    }
669                }
670                "title" => {
671                    if !is_closing {
672                        in_title = true;
673                        title_buf.clear();
674                    } else {
675                        in_title = false;
676                        let title = title_buf.trim().to_string();
677                        if meta.title.is_none() && !title.is_empty() {
678                            meta.title = Some(title);
679                        }
680                    }
681                }
682                "meta" if !is_closing => {
683                    extract_meta_tag(&tag, &mut meta);
684                }
685                "link" if !is_closing => {
686                    if let Some(rel) = extract_attribute(&tag, "rel") {
687                        if rel == "canonical" {
688                            if let Some(href) = extract_attribute(&tag, "href") {
689                                if meta.canonical_url.is_none() && !href.is_empty() {
690                                    meta.canonical_url = Some(href);
691                                }
692                            }
693                        }
694                    }
695                }
696                "time" if !is_closing => {
697                    if let Some(datetime) = extract_attribute(&tag, "datetime") {
698                        if meta.published_date.is_none() && !datetime.is_empty() {
699                            meta.published_date = Some(datetime);
700                        }
701                    }
702                }
703                "a" => {
704                    if !is_closing {
705                        if let Some(href) = extract_attribute(&tag, "href") {
706                            if !href.is_empty() {
707                                current_link_href = Some(href);
708                                current_link_text.clear();
709                            }
710                        }
711                    } else if let Some(href) = current_link_href.take() {
712                        let text = current_link_text.trim().to_string();
713                        // Cap links at 500 to prevent DoS on link-heavy pages
714                        if meta.links.len() < 500 {
715                            meta.links.push(PageLink { text, href });
716                        }
717                        current_link_text.clear();
718                    }
719                }
720                _ => {}
721            }
722        } else if in_skip_element == 0 {
723            let decoded = decode_entity(c, &mut chars);
724            if in_title {
725                title_buf.push(decoded);
726            }
727            if current_link_href.is_some() {
728                current_link_text.push(decoded);
729            }
730        }
731    }
732
733    meta
734}
735
736/// Second pass specifically for heading extraction (cheap — headings are sparse).
737/// Called after the main metadata extraction to keep the main function clean.
738pub fn extract_headings(html: &str) -> Vec<String> {
739    let mut headings = Vec::new();
740    let mut chars = html.chars().peekable();
741    let mut in_heading: Option<u8> = None; // heading level 1-6
742    let mut heading_buf = String::new();
743    let mut in_skip_element = 0;
744    let mut skip_elements: Vec<String> = Vec::new();
745
746    while let Some(c) = chars.next() {
747        if c == '<' {
748            let mut tag = String::new();
749            while let Some(&next) = chars.peek() {
750                if next == '>' {
751                    chars.next();
752                    break;
753                }
754                tag.push(chars.next().unwrap());
755            }
756
757            let tag_lower = tag.to_lowercase();
758            let is_closing = tag_lower.starts_with('/');
759            let tag_name = if is_closing {
760                tag_lower[1..].split_whitespace().next().unwrap_or("")
761            } else {
762                tag_lower.split_whitespace().next().unwrap_or("")
763            };
764
765            let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
766            if skip_tags.contains(&tag_name) {
767                if is_closing {
768                    if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
769                        skip_elements.remove(pos);
770                        in_skip_element = skip_elements.len();
771                    }
772                } else if !tag.ends_with('/') {
773                    skip_elements.push(tag_name.to_string());
774                    in_skip_element = skip_elements.len();
775                }
776                continue;
777            }
778
779            if in_skip_element > 0 {
780                continue;
781            }
782
783            if let Some(level) = heading_level(tag_name) {
784                if is_closing {
785                    if in_heading == Some(level) {
786                        let text = heading_buf.trim().to_string();
787                        if !text.is_empty() && headings.len() < 200 {
788                            let prefix = "#".repeat(level as usize);
789                            headings.push(format!("{} {}", prefix, text));
790                        }
791                        in_heading = None;
792                        heading_buf.clear();
793                    }
794                } else {
795                    in_heading = Some(level);
796                    heading_buf.clear();
797                }
798            }
799        } else if in_skip_element == 0 {
800            let decoded = decode_entity(c, &mut chars);
801            if in_heading.is_some() {
802                heading_buf.push(decoded);
803            }
804        }
805    }
806
807    headings
808}
809
/// Map a lowercase heading tag name (`"h1"`..`"h6"`) to its level.
///
/// Returns `None` for any other string, including `"h0"`, `"h7"`, and
/// uppercase spellings.
fn heading_level(tag_name: &str) -> Option<u8> {
    let digits = tag_name.strip_prefix('h')?;
    match digits.as_bytes() {
        // Exactly one ASCII digit in the range 1-6.
        [digit @ b'1'..=b'6'] => Some(digit - b'0'),
        _ => None,
    }
}
821
822/// Extract metadata from a `<meta>` tag.
823fn extract_meta_tag(tag: &str, meta: &mut PageMetadata) {
824    // <meta name="..." content="...">
825    if let Some(content) = extract_attribute(tag, "content") {
826        if content.is_empty() {
827            return;
828        }
829        // Check name attribute
830        if let Some(name) = extract_attribute(tag, "name") {
831            match name.to_lowercase().as_str() {
832                "description" if meta.description.is_none() => {
833                    meta.description = Some(content.clone());
834                }
835                "author" if meta.author.is_none() => {
836                    meta.author = Some(content.clone());
837                }
838                _ => {}
839            }
840        }
841        // Check property attribute (Open Graph)
842        if let Some(property) = extract_attribute(tag, "property") {
843            match property.to_lowercase().as_str() {
844                "og:title" => {
845                    // og:title overrides <title>
846                    meta.title = Some(content.clone());
847                }
848                "og:description" => {
849                    // og:description overrides <meta description>
850                    meta.description = Some(content.clone());
851                }
852                "article:published_time" if meta.published_date.is_none() => {
853                    meta.published_date = Some(content.clone());
854                }
855                "article:modified_time" if meta.modified_date.is_none() => {
856                    meta.modified_date = Some(content);
857                }
858                _ => {}
859            }
860        }
861    }
862}
863
864/// Strip boilerplate elements from HTML, keeping only main content.
865///
866/// Removes `<nav>`, `<footer>`, `<aside>`, and elements with
867/// `role="navigation"`, `role="banner"`, `role="contentinfo"`.
868/// If `<main>` or `<article>` is present, extracts only their content.
869///
870/// # Examples
871///
872/// ```
873/// use fetchkit::strip_boilerplate;
874///
875/// let html = r#"<nav>Menu</nav><main><p>Content</p></main><footer>Footer</footer>"#;
876/// let result = strip_boilerplate(html);
877/// assert!(result.contains("Content"));
878/// assert!(!result.contains("Menu"));
879/// assert!(!result.contains("Footer"));
880/// ```
881pub fn strip_boilerplate(html: &str) -> String {
882    // Strategy: if <main> or <article> exists, extract just that content.
883    // Otherwise, strip known boilerplate elements.
884
885    // Check if there's a <main> or <article> to focus on
886    if let Some(focused) = extract_main_content(html) {
887        return focused;
888    }
889
890    // Fallback: strip boilerplate elements
891    strip_boilerplate_elements(html)
892}
893
894/// Extract content from `<main>` or `<article>` tag if present.
895fn extract_main_content(html: &str) -> Option<String> {
896    // Try <main> first, then <article>
897    for target_tag in &["main", "article"] {
898        if let Some(content) = extract_tag_content(html, target_tag) {
899            return Some(content);
900        }
901    }
902
903    // Try role="main"
904    extract_role_content(html, "main")
905}
906
/// Extract the inner content of the first occurrence of a given tag.
///
/// Scans `html` for the first `<target ...>` opening tag and returns everything
/// between it and its matching `</target>`, with nested `<target>` elements
/// kept verbatim in the output.
///
/// * `target` must be lowercase; tag names in `html` are lowercased before
///   comparison, so `<MAIN>` matches `"main"`.
/// * Self-closing forms (tag text ending in `/`) never open a capture.
/// * Closing tags seen while no `target` element is open are ignored, so a
///   stray `</main>` earlier in the document cannot prevent a later
///   `<main>...</main>` from being captured.
///
/// Returns `None` if no opening tag is found or if the element is never closed.
fn extract_tag_content(html: &str, target: &str) -> Option<String> {
    let mut chars = html.chars();
    // Number of currently open `target` elements; capture is active while > 0.
    let mut depth = 0usize;
    let mut output = String::new();

    while let Some(c) = chars.next() {
        if c != '<' {
            if depth > 0 {
                output.push(c);
            }
            continue;
        }

        // Raw tag text up to (and consuming) the next '>'.
        let tag: String = chars.by_ref().take_while(|&ch| ch != '>').collect();

        let tag_lower = tag.to_lowercase();
        let (is_closing, name_part) = match tag_lower.strip_prefix('/') {
            Some(rest) => (true, rest),
            None => (false, tag_lower.as_str()),
        };
        let tag_name = name_part.split_whitespace().next().unwrap_or("");

        if tag_name == target {
            if is_closing {
                // Only a closing tag that matches an open element counts;
                // unmatched ones must not push the depth below zero.
                if depth > 0 {
                    depth -= 1;
                    if depth == 0 {
                        return Some(output);
                    }
                }
            } else if !tag.ends_with('/') {
                depth += 1;
                if depth == 1 {
                    // The outermost opening tag is not part of the content.
                    continue;
                }
            }
        }

        if depth > 0 {
            output.push('<');
            output.push_str(&tag);
            output.push('>');
        }
    }

    None
}
960
961/// Extract content of the first element with a given role attribute.
962fn extract_role_content(html: &str, role: &str) -> Option<String> {
963    let mut chars = html.chars().peekable();
964    let mut capture_tag: Option<String> = None;
965    let mut depth = 0i32;
966    let mut output = String::new();
967
968    while let Some(c) = chars.next() {
969        if c == '<' {
970            let mut tag = String::new();
971            while let Some(&next) = chars.peek() {
972                if next == '>' {
973                    chars.next();
974                    break;
975                }
976                tag.push(chars.next().unwrap());
977            }
978
979            let tag_lower = tag.to_lowercase();
980            let is_closing = tag_lower.starts_with('/');
981            let tag_name = if is_closing {
982                tag_lower[1..].split_whitespace().next().unwrap_or("")
983            } else {
984                tag_lower.split_whitespace().next().unwrap_or("")
985            };
986
987            if let Some(ref target) = capture_tag {
988                if tag_name == target.as_str() {
989                    if is_closing {
990                        depth -= 1;
991                        if depth == 0 {
992                            return Some(output);
993                        }
994                    } else if !tag.ends_with('/') {
995                        depth += 1;
996                    }
997                }
998
999                if depth > 0 {
1000                    output.push('<');
1001                    output.push_str(&tag);
1002                    output.push('>');
1003                }
1004            } else if !is_closing {
1005                // Check for role attribute
1006                if let Some(attr_role) = extract_attribute(&tag, "role") {
1007                    if attr_role.eq_ignore_ascii_case(role) && !tag.ends_with('/') {
1008                        capture_tag = Some(tag_name.to_string());
1009                        depth = 1;
1010                        continue;
1011                    }
1012                }
1013            }
1014        } else if capture_tag.is_some() && depth > 0 {
1015            output.push(c);
1016        }
1017    }
1018
1019    None
1020}
1021
/// Boilerplate tags to strip when no `<main>`/`<article>` is found.
///
/// `strip_boilerplate_elements` compares these against the lowercased tag
/// name, so entries must be lowercase. A matching element is dropped together
/// with everything nested inside it.
const BOILERPLATE_TAGS: &[&str] = &["nav", "footer", "aside", "header"];
1024
/// Roles that indicate boilerplate.
///
/// `strip_boilerplate_elements` drops any element whose `role` attribute
/// matches one of these values (ASCII case-insensitive), regardless of its
/// tag name.
const BOILERPLATE_ROLES: &[&str] = &["navigation", "banner", "contentinfo", "complementary"];
1027
1028/// Strip known boilerplate elements from HTML.
1029fn strip_boilerplate_elements(html: &str) -> String {
1030    let mut output = String::new();
1031    let mut chars = html.chars().peekable();
1032    let mut skip_depth = 0i32;
1033    let mut skip_tag: Option<String> = None;
1034
1035    while let Some(c) = chars.next() {
1036        if c == '<' {
1037            let mut tag = String::new();
1038            while let Some(&next) = chars.peek() {
1039                if next == '>' {
1040                    chars.next();
1041                    break;
1042                }
1043                tag.push(chars.next().unwrap());
1044            }
1045
1046            let tag_lower = tag.to_lowercase();
1047            let is_closing = tag_lower.starts_with('/');
1048            let tag_name = if is_closing {
1049                tag_lower[1..].split_whitespace().next().unwrap_or("")
1050            } else {
1051                tag_lower.split_whitespace().next().unwrap_or("")
1052            };
1053
1054            // Track skip state
1055            if let Some(ref target) = skip_tag {
1056                if tag_name == target.as_str() {
1057                    if is_closing {
1058                        skip_depth -= 1;
1059                        if skip_depth == 0 {
1060                            skip_tag = None;
1061                            continue;
1062                        }
1063                    } else if !tag.ends_with('/') {
1064                        skip_depth += 1;
1065                    }
1066                }
1067                continue; // Skip everything inside boilerplate
1068            }
1069
1070            // Check if this tag should be skipped
1071            if !is_closing && !tag.ends_with('/') {
1072                let is_boilerplate_tag = BOILERPLATE_TAGS.contains(&tag_name);
1073                let is_boilerplate_role = extract_attribute(&tag, "role")
1074                    .map(|r| {
1075                        BOILERPLATE_ROLES
1076                            .iter()
1077                            .any(|br| r.eq_ignore_ascii_case(br))
1078                    })
1079                    .unwrap_or(false);
1080
1081                if is_boilerplate_tag || is_boilerplate_role {
1082                    skip_tag = Some(tag_name.to_string());
1083                    skip_depth = 1;
1084                    continue;
1085                }
1086            }
1087
1088            output.push('<');
1089            output.push_str(&tag);
1090            output.push('>');
1091        } else if skip_tag.is_none() {
1092            output.push(c);
1093        }
1094    }
1095
1096    output
1097}
1098
// Unit tests for the conversion helpers in this module: content-type
// detection, HTML -> Markdown/text conversion, whitespace cleanup, attribute
// and metadata extraction, and boilerplate stripping.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Content detection -------------------------------------------------

    #[test]
    fn test_is_html_by_content_type() {
        assert!(is_html(&Some("text/html".to_string()), ""));
        assert!(is_html(&Some("text/html; charset=utf-8".to_string()), ""));
        assert!(is_html(&Some("application/xhtml+xml".to_string()), ""));
        assert!(!is_html(&Some("text/plain".to_string()), ""));
        assert!(!is_html(&Some("application/json".to_string()), ""));
    }

    #[test]
    fn test_is_html_by_body() {
        assert!(is_html(&None, "<!DOCTYPE html><html>"));
        // Leading whitespace before the doctype is tolerated.
        assert!(is_html(&None, "  <!DOCTYPE html>"));
        assert!(is_html(&None, "<html><body>"));
        assert!(!is_html(&None, "Hello world"));
        assert!(!is_html(&None, "{\"json\": true}"));
    }

    // ---- HTML -> Markdown / text: basic elements ---------------------------

    #[test]
    fn test_html_to_markdown_headers() {
        let html = "<h1>Title</h1><h2>Subtitle</h2>";
        let md = html_to_markdown(html);
        assert!(md.contains("# Title"));
        assert!(md.contains("## Subtitle"));
    }

    #[test]
    fn test_html_to_markdown_paragraphs() {
        let html = "<p>First paragraph</p><p>Second paragraph</p>";
        let md = html_to_markdown(html);
        assert!(md.contains("First paragraph"));
        assert!(md.contains("Second paragraph"));
    }

    #[test]
    fn test_html_to_markdown_lists() {
        let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
        let md = html_to_markdown(html);
        assert!(md.contains("- Item 1"));
        assert!(md.contains("- Item 2"));
    }

    #[test]
    fn test_html_to_markdown_emphasis() {
        let html = "<p><strong>bold</strong> and <em>italic</em></p>";
        let md = html_to_markdown(html);
        assert!(md.contains("**bold**"));
        assert!(md.contains("*italic*"));
    }

    #[test]
    fn test_html_to_markdown_code() {
        let html = "<pre>code block</pre>";
        let md = html_to_markdown(html);
        assert!(md.contains("```"));
        assert!(md.contains("code block"));
    }

    #[test]
    fn test_html_to_markdown_skip_script() {
        let html = "<p>Before</p><script>alert('bad');</script><p>After</p>";
        let md = html_to_markdown(html);
        assert!(md.contains("Before"));
        assert!(md.contains("After"));
        // Script bodies must not leak into the output.
        assert!(!md.contains("alert"));
    }

    #[test]
    fn test_html_to_text_simple() {
        let html = "<p>Hello</p><p>World</p>";
        let text = html_to_text(html);
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn test_html_to_text_skip_script() {
        let html = "<p>Before</p><script>alert('bad');</script><p>After</p>";
        let text = html_to_text(html);
        assert!(text.contains("Before"));
        assert!(text.contains("After"));
        assert!(!text.contains("alert"));
    }

    #[test]
    fn test_entity_decoding() {
        let html = "<p>&amp; &lt; &gt; &quot; &apos; &nbsp; &mdash; &ndash; &copy; &reg;</p>";
        let text = html_to_text(html);
        assert!(text.contains('&'));
        assert!(text.contains('<'));
        assert!(text.contains('>'));
        assert!(text.contains('"'));
        assert!(text.contains('\''));
        assert!(text.contains('—'));
        assert!(text.contains('–'));
        assert!(text.contains('©'));
        assert!(text.contains('®'));
    }

    // ---- Whitespace cleanup ------------------------------------------------

    #[test]
    fn test_filter_excessive_newlines() {
        let input = "line1\n\n\n\n\nline2";
        let output = filter_excessive_newlines(input);
        assert_eq!(output, "line1\n\nline2");
    }

    #[test]
    fn test_clean_whitespace() {
        let input = "  hello   world  \n\n\n\n  test  ";
        let output = clean_whitespace(input);
        assert_eq!(output, "hello world\n\n  test");
    }

    #[test]
    fn test_clean_whitespace_preserves_indentation() {
        let input = "top\n  indented\n    deeper";
        let output = clean_whitespace(input);
        assert_eq!(output, "top\n  indented\n    deeper");
    }

    // ---- Content-type helpers ----------------------------------------------

    #[test]
    fn test_is_markdown_content_type() {
        assert!(is_markdown_content_type(&Some("text/markdown".to_string())));
        assert!(is_markdown_content_type(&Some(
            "text/markdown; charset=utf-8".to_string()
        )));
        assert!(is_markdown_content_type(&Some("Text/Markdown".to_string())));
        // The media type must match; a parameter mentioning it does not count.
        assert!(!is_markdown_content_type(&Some(
            "text/html; profile=\"text/markdown\"".to_string()
        )));
        assert!(!is_markdown_content_type(&Some("text/html".to_string())));
        assert!(!is_markdown_content_type(&Some("text/plain".to_string())));
        assert!(!is_markdown_content_type(&None));
    }

    #[test]
    fn test_is_plain_text_content_type() {
        assert!(is_plain_text_content_type(&Some("text/plain".to_string())));
        assert!(is_plain_text_content_type(&Some(
            "text/plain; charset=utf-8".to_string()
        )));
        assert!(is_plain_text_content_type(&Some("Text/Plain".to_string())));
        // The media type must match; a parameter mentioning it does not count.
        assert!(!is_plain_text_content_type(&Some(
            "text/html; profile=\"text/plain\"".to_string()
        )));
        assert!(!is_plain_text_content_type(&Some("text/html".to_string())));
        assert!(!is_plain_text_content_type(&Some(
            "text/markdown".to_string()
        )));
        assert!(!is_plain_text_content_type(&None));
    }

    // ---- Attribute and metadata extraction ---------------------------------

    #[test]
    fn test_extract_attribute() {
        // Double-quoted, single-quoted, and unquoted values.
        assert_eq!(
            extract_attribute("a href=\"https://example.com\" class=\"link\"", "href"),
            Some("https://example.com".to_string())
        );
        assert_eq!(
            extract_attribute("img src='image.png'", "src"),
            Some("image.png".to_string())
        );
        assert_eq!(
            extract_attribute("div class=test", "class"),
            Some("test".to_string())
        );
        // Non-ASCII text before the target attribute. NOTE(review): presumably
        // guards against byte-offset drift when lowercasing changes the byte
        // length of the tag text — confirm against `extract_attribute`.
        assert_eq!(
            extract_attribute("a title=\"İİ\" href=x", "href"),
            Some("x".to_string())
        );
    }

    #[test]
    fn test_extract_metadata_title() {
        let html = "<html><head><title>My Page</title></head><body></body></html>";
        let meta = extract_metadata(html);
        assert_eq!(meta.title.as_deref(), Some("My Page"));
    }

    #[test]
    fn test_extract_metadata_og_title_overrides() {
        let html = r#"<html><head>
            <title>Basic Title</title>
            <meta property="og:title" content="OG Title">
        </head></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.title.as_deref(), Some("OG Title"));
    }

    #[test]
    fn test_extract_metadata_description() {
        let html = r#"<html><head>
            <meta name="description" content="A page about things">
        </head></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.description.as_deref(), Some("A page about things"));
    }

    #[test]
    fn test_extract_metadata_og_description_overrides() {
        let html = r#"<html><head>
            <meta name="description" content="Basic desc">
            <meta property="og:description" content="OG desc">
        </head></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.description.as_deref(), Some("OG desc"));
    }

    #[test]
    fn test_extract_metadata_language() {
        let html = r#"<html lang="en-US"><head><title>Test</title></head></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.language.as_deref(), Some("en-US"));
    }

    #[test]
    fn test_extract_metadata_canonical_url() {
        let html = r#"<html><head>
            <link rel="canonical" href="https://example.com/page">
        </head></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(
            meta.canonical_url.as_deref(),
            Some("https://example.com/page")
        );
    }

    #[test]
    fn test_extract_metadata_author() {
        let html = r#"<html><head>
            <meta name="author" content="Jane Doe">
        </head></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.author.as_deref(), Some("Jane Doe"));
    }

    #[test]
    fn test_extract_metadata_dates() {
        let html = r#"<html><head>
            <meta property="article:published_time" content="2024-01-15T10:00:00Z">
            <meta property="article:modified_time" content="2024-02-20T12:00:00Z">
        </head></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.published_date.as_deref(), Some("2024-01-15T10:00:00Z"));
        assert_eq!(meta.modified_date.as_deref(), Some("2024-02-20T12:00:00Z"));
    }

    #[test]
    fn test_extract_metadata_time_element() {
        // With no article:published_time meta tag, the <time datetime> value
        // is expected as the published date.
        let html = r#"<html><body>
            <time datetime="2024-03-01">March 1, 2024</time>
        </body></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.published_date.as_deref(), Some("2024-03-01"));
    }

    #[test]
    fn test_extract_metadata_links() {
        let html = r#"<html><body>
            <a href="https://example.com">Example</a>
            <a href="/about">About Us</a>
        </body></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.links.len(), 2);
        assert_eq!(meta.links[0].href, "https://example.com");
        assert_eq!(meta.links[0].text, "Example");
        assert_eq!(meta.links[1].href, "/about");
        assert_eq!(meta.links[1].text, "About Us");
    }

    #[test]
    fn test_extract_headings() {
        let html = "<h1>Title</h1><h2>Section 1</h2><h3>Subsection</h3><h2>Section 2</h2>";
        let headings = extract_headings(html);
        assert_eq!(
            headings,
            vec!["# Title", "## Section 1", "### Subsection", "## Section 2"]
        );
    }

    #[test]
    fn test_extract_metadata_skips_script_content() {
        // Markup inside <script> must not contribute a title or links.
        let html = r#"<html><head>
            <title>Real Title</title>
            <script>document.title = "Fake";</script>
        </head><body>
            <a href="/real">Real Link</a>
            <script><a href="/fake">Fake</a></script>
        </body></html>"#;
        let meta = extract_metadata(html);
        assert_eq!(meta.title.as_deref(), Some("Real Title"));
        assert_eq!(meta.links.len(), 1);
        assert_eq!(meta.links[0].href, "/real");
    }

    #[test]
    fn test_extract_metadata_empty_html() {
        let meta = extract_metadata("");
        assert!(meta.is_empty());
    }

    #[test]
    fn test_extract_metadata_full_page() {
        let html = r#"<!DOCTYPE html>
<html lang="en">
<head>
    <title>Article Title</title>
    <meta name="description" content="An interesting article">
    <meta name="author" content="John Smith">
    <meta property="og:title" content="OG Article Title">
    <meta property="article:published_time" content="2024-06-15">
    <link rel="canonical" href="https://example.com/article">
</head>
<body>
    <h1>Article Title</h1>
    <p>Some content with a <a href="https://link.example.com">link</a>.</p>
    <h2>Section One</h2>
    <p>More content.</p>
</body>
</html>"#;
        // Headings are filled in separately via `extract_headings` here.
        let mut meta = extract_metadata(html);
        meta.headings = extract_headings(html);

        assert_eq!(meta.title.as_deref(), Some("OG Article Title"));
        assert_eq!(meta.description.as_deref(), Some("An interesting article"));
        assert_eq!(meta.author.as_deref(), Some("John Smith"));
        assert_eq!(meta.language.as_deref(), Some("en"));
        assert_eq!(
            meta.canonical_url.as_deref(),
            Some("https://example.com/article")
        );
        assert_eq!(meta.published_date.as_deref(), Some("2024-06-15"));
        assert_eq!(meta.links.len(), 1);
        assert_eq!(meta.links[0].text, "link");
        assert_eq!(meta.headings, vec!["# Article Title", "## Section One"]);
        assert!(!meta.is_empty());
    }

    #[test]
    fn test_page_metadata_is_empty() {
        let meta = PageMetadata::default();
        assert!(meta.is_empty());

        let meta = PageMetadata {
            title: Some("test".to_string()),
            ..Default::default()
        };
        assert!(!meta.is_empty());
    }

    // ---- Boilerplate stripping ---------------------------------------------

    #[test]
    fn test_strip_boilerplate_extracts_main() {
        let html = r#"<nav><a href="/">Home</a></nav>
            <main><p>Important content</p></main>
            <footer>Copyright 2024</footer>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Important content"));
        assert!(!result.contains("Home"));
        assert!(!result.contains("Copyright"));
    }

    #[test]
    fn test_strip_boilerplate_extracts_article() {
        let html = r#"<nav>Menu</nav>
            <article><h1>Title</h1><p>Body text</p></article>
            <aside>Sidebar</aside>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Title"));
        assert!(result.contains("Body text"));
        assert!(!result.contains("Menu"));
        assert!(!result.contains("Sidebar"));
    }

    #[test]
    fn test_strip_boilerplate_main_takes_precedence_over_article() {
        let html = r#"<main><p>Main content</p></main>
            <article><p>Article content</p></article>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Main content"));
        // Article is outside main, so not included
        assert!(!result.contains("Article content"));
    }

    #[test]
    fn test_strip_boilerplate_fallback_strips_nav_footer_aside() {
        // No <main>/<article>/role="main": exercises the element-stripping fallback.
        let html = r#"<div>
            <nav>Navigation links</nav>
            <p>Content paragraph</p>
            <footer>Footer info</footer>
            <aside>Sidebar widget</aside>
        </div>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Content paragraph"));
        assert!(!result.contains("Navigation links"));
        assert!(!result.contains("Footer info"));
        assert!(!result.contains("Sidebar widget"));
    }

    #[test]
    fn test_strip_boilerplate_role_navigation() {
        let html = r#"<div role="navigation">Nav menu</div>
            <p>Content</p>
            <div role="contentinfo">Footer stuff</div>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Content"));
        assert!(!result.contains("Nav menu"));
        assert!(!result.contains("Footer stuff"));
    }

    #[test]
    fn test_strip_boilerplate_role_main() {
        let html = r#"<nav>Nav</nav>
            <div role="main"><p>Main content here</p></div>
            <footer>Foot</footer>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Main content here"));
        assert!(!result.contains("Nav"));
        assert!(!result.contains("Foot"));
    }

    #[test]
    fn test_strip_boilerplate_nested_nav() {
        let html = r#"<nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav>
            <p>Page content</p>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Page content"));
        assert!(!result.contains("Home"));
        assert!(!result.contains("About"));
    }

    #[test]
    fn test_strip_boilerplate_no_semantic_html() {
        // No main/article/nav/footer — returns everything
        let html = "<div><p>Content 1</p></div><div><p>Content 2</p></div>";
        let result = strip_boilerplate(html);
        assert!(result.contains("Content 1"));
        assert!(result.contains("Content 2"));
    }

    #[test]
    fn test_strip_boilerplate_preserves_header_inside_main() {
        // A <header> nested in <main> is content; only the site-level one goes.
        let html = r#"<header>Site header</header>
            <main><header><h1>Article header</h1></header><p>Body</p></main>"#;
        let result = strip_boilerplate(html);
        assert!(result.contains("Article header"));
        assert!(result.contains("Body"));
        assert!(!result.contains("Site header"));
    }

    // ---- HTML -> Markdown: links, images, lists, tables, entities ----------

    #[test]
    fn test_html_to_markdown_links() {
        let html = r#"<p>Visit <a href="https://example.com">Example Site</a> today.</p>"#;
        let md = html_to_markdown(html);
        assert!(
            md.contains("[Example Site](https://example.com)"),
            "Got: {}",
            md
        );
    }

    #[test]
    fn test_html_to_markdown_link_no_text() {
        // A link without text is expected in autolink form.
        let html = r#"<a href="https://example.com"></a>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("<https://example.com>"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_images() {
        let html = r#"<img src="photo.jpg" alt="A photo">"#;
        let md = html_to_markdown(html);
        assert!(md.contains("![A photo](photo.jpg)"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_image_no_alt() {
        let html = r#"<img src="photo.jpg">"#;
        let md = html_to_markdown(html);
        assert!(md.contains("![](photo.jpg)"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_ordered_list() {
        let html = "<ol><li>First</li><li>Second</li><li>Third</li></ol>";
        let md = html_to_markdown(html);
        assert!(md.contains("1. First"), "Got: {}", md);
        assert!(md.contains("2. Second"), "Got: {}", md);
        assert!(md.contains("3. Third"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_nested_lists() {
        let html = "<ul><li>Top<ul><li>Nested</li></ul></li></ul>";
        let md = html_to_markdown(html);
        assert!(md.contains("- Top"), "Got: {}", md);
        // Nested items are indented by two spaces.
        assert!(md.contains("  - Nested"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_table() {
        let html = r#"<table>
            <tr><th>Name</th><th>Age</th></tr>
            <tr><td>Alice</td><td>30</td></tr>
            <tr><td>Bob</td><td>25</td></tr>
        </table>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("| Name | Age |"), "Got: {}", md);
        assert!(md.contains("| --- | --- |"), "Got: {}", md);
        assert!(md.contains("| Alice | 30 |"), "Got: {}", md);
        assert!(md.contains("| Bob | 25 |"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_table_no_header() {
        let html = r#"<table>
            <tr><td>A</td><td>B</td></tr>
            <tr><td>C</td><td>D</td></tr>
        </table>"#;
        let md = html_to_markdown(html);
        assert!(md.contains("| A | B |"), "Got: {}", md);
        assert!(md.contains("| C | D |"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_definition_list() {
        let html = "<dl><dt>Term</dt><dd>Definition</dd></dl>";
        let md = html_to_markdown(html);
        assert!(md.contains("**Term**"), "Got: {}", md);
        assert!(md.contains(": Definition"), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_expanded_entities() {
        let html = "<p>&trade; &bull; &hellip; &euro; &pound; &larr; &rarr;</p>";
        let md = html_to_markdown(html);
        assert!(md.contains('™'), "Got: {}", md);
        assert!(md.contains('•'), "Got: {}", md);
        assert!(md.contains('…'), "Got: {}", md);
        assert!(md.contains('€'), "Got: {}", md);
        assert!(md.contains('£'), "Got: {}", md);
        assert!(md.contains('←'), "Got: {}", md);
        assert!(md.contains('→'), "Got: {}", md);
    }

    #[test]
    fn test_html_to_markdown_smart_quotes() {
        let html = "<p>&ldquo;Hello&rdquo; &lsquo;World&rsquo;</p>";
        let md = html_to_markdown(html);
        assert!(md.contains('\u{201C}'), "Got: {}", md);
        assert!(md.contains('\u{201D}'), "Got: {}", md);
        assert!(md.contains('\u{2018}'), "Got: {}", md);
        assert!(md.contains('\u{2019}'), "Got: {}", md);
    }
}