1use crate::types::{PageLink, PageMetadata};
4
/// Returns `true` when the media type of a `Content-Type` value is
/// `text/markdown`.
///
/// Only the portion before the first `;` is inspected, so parameters such as
/// `charset=utf-8` are ignored. The comparison is ASCII case-insensitive and
/// tolerates surrounding whitespace. A missing header yields `false`.
pub fn is_markdown_content_type(content_type: &Option<String>) -> bool {
    match content_type {
        Some(header) => {
            // `split` always yields at least one piece, so the fallback is never used.
            let media_type = header.split(';').next().unwrap_or("");
            media_type.trim().eq_ignore_ascii_case("text/markdown")
        }
        None => false,
    }
}
13
/// Returns `true` when the media type of a `Content-Type` value is
/// `text/plain`.
///
/// Anything from the first `;` onwards (e.g. `charset=...`) is ignored; the
/// remaining media type is trimmed and compared ASCII case-insensitively.
/// A missing header yields `false`.
pub fn is_plain_text_content_type(content_type: &Option<String>) -> bool {
    if let Some(header) = content_type {
        // Cut the header at the parameter separator, if there is one.
        let media_type = match header.find(';') {
            Some(idx) => &header[..idx],
            None => header.as_str(),
        };
        return media_type.trim().eq_ignore_ascii_case("text/plain");
    }
    false
}
22
/// Heuristically decides whether a response should be treated as HTML.
///
/// The `Content-Type` value wins when it mentions `text/html` or
/// `application/xhtml` (case-insensitively). Otherwise the body is sniffed:
/// after skipping leading whitespace it must start with `<!DOCTYPE` or
/// `<html`. The sniffing is ASCII case-insensitive, so the very common
/// lowercase `<!doctype html>` and uppercase `<HTML>` are recognised too.
pub fn is_html(content_type: &Option<String>, body: &str) -> bool {
    if let Some(ct) = content_type {
        let ct_lower = ct.to_lowercase();
        if ct_lower.contains("text/html") || ct_lower.contains("application/xhtml") {
            return true;
        }
    }

    let trimmed = body.trim_start();
    ["<!doctype", "<html"].iter().any(|marker| {
        // `get` returns `None` for a too-short body or a cut that would land
        // inside a multi-byte character, so this never panics.
        trimmed
            .get(..marker.len())
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(marker))
    })
}
40
41pub fn html_to_markdown(html: &str) -> String {
58 let mut output = String::new();
59 let mut in_skip_element = 0;
60 let mut skip_elements: Vec<String> = Vec::new();
61 let mut in_pre = false;
62 let mut in_blockquote = false;
63
64 let mut link_href: Option<String> = None;
67 let mut link_start: usize = 0;
68
69 let mut list_stack: Vec<(bool, usize)> = Vec::new();
71
72 let mut in_table = false;
74 let mut table_rows: Vec<Vec<String>> = Vec::new();
75 let mut current_row: Vec<String> = Vec::new();
76 let mut in_cell = false;
77 let mut cell_buf = String::new();
78 let mut is_header_row = false;
79
80 let mut chars = html.chars().peekable();
81
82 while let Some(c) = chars.next() {
83 if c == '<' {
84 let mut tag = String::new();
86 while let Some(&next) = chars.peek() {
87 if next == '>' {
88 chars.next();
89 break;
90 }
91 tag.push(chars.next().unwrap());
92 }
93
94 let tag_lower = tag.to_lowercase();
95 let is_closing = tag_lower.starts_with('/');
96 let tag_name = if is_closing {
97 tag_lower[1..].split_whitespace().next().unwrap_or("")
98 } else {
99 tag_lower.split_whitespace().next().unwrap_or("")
100 };
101
102 let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
104 if skip_tags.contains(&tag_name) {
105 if is_closing {
106 if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
107 skip_elements.remove(pos);
108 in_skip_element = skip_elements.len();
109 }
110 } else if !tag.ends_with('/') {
111 skip_elements.push(tag_name.to_string());
112 in_skip_element = skip_elements.len();
113 }
114 continue;
115 }
116
117 if in_skip_element > 0 {
118 continue;
119 }
120
121 match tag_name {
123 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
124 let level = tag_name[1..].parse::<usize>().unwrap_or(1);
125 if !is_closing {
126 output.push('\n');
127 for _ in 0..level {
128 output.push('#');
129 }
130 output.push(' ');
131 } else {
132 output.push_str("\n\n");
133 }
134 }
135 "p" | "div" | "section" | "article" | "main" | "header" | "footer"
136 if is_closing =>
137 {
138 output.push_str("\n\n");
139 }
140 "br" => {
141 output.push('\n');
142 }
143 "hr" => {
144 output.push_str("\n---\n");
145 }
146 "ul" => {
147 if is_closing {
148 list_stack.pop();
149 if list_stack.is_empty() {
150 output.push('\n');
151 }
152 } else {
153 list_stack.push((false, 0));
154 }
155 }
156 "ol" => {
157 if is_closing {
158 list_stack.pop();
159 if list_stack.is_empty() {
160 output.push('\n');
161 }
162 } else {
163 list_stack.push((true, 0));
164 }
165 }
166 "li" if !is_closing => {
167 output.push('\n');
168 let depth = list_stack.len().saturating_sub(1);
169 for _ in 0..depth {
170 output.push_str(" ");
171 }
172 if let Some((is_ordered, counter)) = list_stack.last_mut() {
173 if *is_ordered {
174 *counter += 1;
175 output.push_str(&format!("{}. ", *counter));
176 } else {
177 output.push_str("- ");
178 }
179 } else {
180 output.push_str("- ");
181 }
182 }
183 "strong" | "b" => {
184 output.push_str("**");
185 }
186 "em" | "i" => {
187 output.push('*');
188 }
189 "pre" => {
190 if !is_closing {
191 output.push_str("\n```\n");
192 in_pre = true;
193 } else {
194 output.push_str("\n```\n");
195 in_pre = false;
196 }
197 }
198 "code" if !in_pre => {
199 output.push('`');
200 }
201 "blockquote" => {
202 if !is_closing {
203 in_blockquote = true;
204 output.push_str("\n> ");
205 } else {
206 in_blockquote = false;
207 output.push('\n');
208 }
209 }
210 "a" => {
211 if !is_closing {
212 if let Some(href) = extract_attribute(&tag, "href") {
213 if !href.is_empty() {
214 link_href = Some(href);
215 link_start = output.len();
216 }
217 }
218 } else if let Some(href) = link_href.take() {
219 let text = output[link_start..].trim().to_string();
220 output.truncate(link_start);
221 if text.is_empty() {
222 output.push_str(&format!("<{}>", href));
223 } else {
224 output.push_str(&format!("[{}]({})", text, href));
225 }
226 }
227 }
228 "img" if !is_closing => {
229 let alt = extract_attribute(&tag, "alt").unwrap_or_default();
230 if let Some(src) = extract_attribute(&tag, "src") {
231 output.push_str(&format!("", alt, src));
232 }
233 }
234 "table" => {
236 if !is_closing {
237 in_table = true;
238 table_rows.clear();
239 } else {
240 in_table = false;
241 render_table(&table_rows, &mut output);
242 table_rows.clear();
243 }
244 }
245 "tr" => {
246 if !is_closing {
247 current_row.clear();
248 is_header_row = false;
249 } else if in_table {
250 table_rows.push(current_row.clone());
251 if is_header_row && table_rows.len() == 1 {
252 let sep: Vec<String> =
253 current_row.iter().map(|_| "---".to_string()).collect();
254 table_rows.push(sep);
255 }
256 current_row.clear();
257 }
258 }
259 "th" => {
260 if !is_closing {
261 in_cell = true;
262 cell_buf.clear();
263 is_header_row = true;
264 } else {
265 in_cell = false;
266 current_row.push(cell_buf.trim().to_string());
267 cell_buf.clear();
268 }
269 }
270 "td" => {
271 if !is_closing {
272 in_cell = true;
273 cell_buf.clear();
274 } else {
275 in_cell = false;
276 current_row.push(cell_buf.trim().to_string());
277 cell_buf.clear();
278 }
279 }
280 "dl" if is_closing => {
282 output.push_str("\n\n");
283 }
284 "dt" => {
285 if !is_closing {
286 output.push_str("\n**");
287 } else {
288 output.push_str("**\n");
289 }
290 }
291 "dd" => {
292 if !is_closing {
293 output.push_str(": ");
294 } else {
295 output.push('\n');
296 }
297 }
298 _ => {}
299 }
300 } else if in_skip_element == 0 {
301 let decoded = decode_entity(c, &mut chars);
303
304 if in_cell {
305 cell_buf.push(decoded);
306 } else if in_table {
307 } else if in_blockquote && decoded == '\n' {
309 output.push_str("\n> ");
310 } else {
311 output.push(decoded);
312 }
313 }
314 }
315
316 clean_whitespace(&output)
317}
318
/// Appends `rows` to `output` as a Markdown pipe table.
///
/// Nothing is written when `rows` is empty. Otherwise a leading newline is
/// emitted, followed by one `| a | b |` line per row. Cell text is sanitised
/// so it cannot break the table: line breaks become spaces (a table row must
/// stay on one line) and literal `|` characters are escaped as `\|`.
fn render_table(rows: &[Vec<String>], output: &mut String) {
    if rows.is_empty() {
        return;
    }

    output.push('\n');
    for row in rows {
        let cells: Vec<String> = row
            .iter()
            .map(|cell| {
                cell.replace(|ch: char| ch == '\n' || ch == '\r', " ")
                    .replace('|', "\\|")
            })
            .collect();

        output.push_str("| ");
        output.push_str(&cells.join(" | "));
        output.push_str(" |\n");
    }
}
332
333pub fn html_to_text(html: &str) -> String {
349 let mut output = String::new();
350 let mut in_skip_element = 0;
351 let mut skip_elements: Vec<String> = Vec::new();
352
353 let mut chars = html.chars().peekable();
354
355 while let Some(c) = chars.next() {
356 if c == '<' {
357 let mut tag = String::new();
359 while let Some(&next) = chars.peek() {
360 if next == '>' {
361 chars.next();
362 break;
363 }
364 tag.push(chars.next().unwrap());
365 }
366
367 let tag_lower = tag.to_lowercase();
368 let is_closing = tag_lower.starts_with('/');
369 let tag_name = if is_closing {
370 tag_lower[1..].split_whitespace().next().unwrap_or("")
371 } else {
372 tag_lower.split_whitespace().next().unwrap_or("")
373 };
374
375 let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
377 if skip_tags.contains(&tag_name) {
378 if is_closing {
379 if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
380 skip_elements.remove(pos);
381 in_skip_element = skip_elements.len();
382 }
383 } else if !tag.ends_with('/') {
384 skip_elements.push(tag_name.to_string());
385 in_skip_element = skip_elements.len();
386 }
387 continue;
388 }
389
390 if in_skip_element > 0 {
391 continue;
392 }
393
394 let newline_tags = [
396 "p", "div", "br", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr",
397 ];
398 if newline_tags.contains(&tag_name) && (is_closing || tag_name == "br") {
399 output.push('\n');
400 } else if newline_tags.contains(&tag_name) && !is_closing {
401 if matches!(tag_name, "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "p") {
403 output.push('\n');
404 }
405 }
406 } else if in_skip_element == 0 {
407 let decoded = decode_entity(c, &mut chars);
409 output.push(decoded);
410 }
411 }
412
413 clean_whitespace(&output)
414}
415
/// Returns the value of attribute `attr` from the raw text of a tag (the
/// part between `<` and `>`), or `None` when the attribute is absent, has no
/// value, or its quoted value is unterminated.
///
/// The tag text is walked token by token as `name`, optional `=`, optional
/// value. Names are compared ASCII case-insensitively and must match in
/// full, so `src` is not found inside `data-src`, and text inside another
/// attribute's quoted value is never mistaken for an attribute. Whitespace
/// is allowed on either side of `=`. Values may be double-quoted,
/// single-quoted, or unquoted (ending at whitespace or `>`). The element
/// name itself is parsed as a value-less token and therefore never matches.
fn extract_attribute(tag: &str, attr: &str) -> Option<String> {
    let mut rest = tag;

    loop {
        // Skip separators between tokens (whitespace and stray slashes).
        rest = rest.trim_start_matches(|c: char| c.is_whitespace() || c == '/');
        if rest.is_empty() {
            return None;
        }

        let name_len = rest
            .find(|c: char| c.is_whitespace() || c == '=' || c == '/')
            .unwrap_or(rest.len());
        let (name, after_name) = rest.split_at(name_len);

        let value_part = match after_name.trim_start().strip_prefix('=') {
            Some(after_eq) => after_eq.trim_start(),
            None => {
                // Value-less token (element name or boolean attribute).
                rest = after_name;
                continue;
            }
        };

        let (value, remainder) = match value_part.chars().next() {
            Some(quote) if quote == '"' || quote == '\'' => {
                // The quote is ASCII, so skipping one byte is safe.
                let quoted = &value_part[1..];
                let end = quoted.find(quote)?;
                (&quoted[..end], &quoted[end + 1..])
            }
            _ => {
                let end = value_part
                    .find(|c: char| c.is_whitespace() || c == '>')
                    .unwrap_or(value_part.len());
                value_part.split_at(end)
            }
        };

        if name.eq_ignore_ascii_case(attr) {
            return Some(value.to_string());
        }
        rest = remainder;
    }
}
446
/// Decodes an HTML character reference that starts at `c`.
///
/// When `c` is not `&` it is returned unchanged and `chars` is not touched.
/// Otherwise the characters following the ampersand are inspected *without*
/// consuming them. Only if they form a recognised reference terminated by
/// `;` (a supported named entity, `&#NNN;`, or `&#xHH;` / `&#XHH;`) is the
/// reference consumed from `chars` and its character returned. In every
/// other case a literal `&` is returned and `chars` is left untouched, so
/// text such as `AT&T`, `R&D</p>` or `&unknown;` passes through intact
/// instead of losing the characters that were probed.
fn decode_entity(c: char, chars: &mut std::iter::Peekable<std::str::Chars>) -> char {
    if c != '&' {
        return c;
    }

    // Probe on a copy of the iterator so nothing is consumed on failure.
    let mut entity = String::new();
    let mut terminated = false;
    for next in chars.clone() {
        if next == ';' {
            terminated = true;
            break;
        }
        // Reference names consist of ASCII letters/digits and `#`; anything
        // else (whitespace, `<`, another `&`, ...) means this is a bare `&`.
        if !(next.is_ascii_alphanumeric() || next == '#') || entity.len() > 10 {
            break;
        }
        entity.push(next);
    }
    if !terminated {
        return '&';
    }

    let decoded = match entity.as_str() {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        "nbsp" => Some(' '),
        "mdash" => Some('—'),
        "ndash" => Some('–'),
        "copy" => Some('©'),
        "reg" => Some('®'),
        "trade" => Some('™'),
        "bull" => Some('•'),
        "hellip" => Some('…'),
        "laquo" => Some('«'),
        "raquo" => Some('»'),
        "lsquo" => Some('\u{2018}'),
        "rsquo" => Some('\u{2019}'),
        "ldquo" => Some('\u{201C}'),
        "rdquo" => Some('\u{201D}'),
        "euro" => Some('€'),
        "pound" => Some('£'),
        "yen" => Some('¥'),
        "cent" => Some('¢'),
        "deg" => Some('°'),
        "micro" => Some('µ'),
        "para" => Some('¶'),
        "sect" => Some('§'),
        "middot" => Some('·'),
        "times" => Some('×'),
        "divide" => Some('÷'),
        "plusmn" => Some('±'),
        "frac12" => Some('½'),
        "frac14" => Some('¼'),
        "frac34" => Some('¾'),
        "larr" => Some('←'),
        "rarr" => Some('→'),
        "uarr" => Some('↑'),
        "darr" => Some('↓'),
        other => other.strip_prefix('#').and_then(|num| {
            let code = match num.strip_prefix('x').or_else(|| num.strip_prefix('X')) {
                Some(hex) => u32::from_str_radix(hex, 16),
                None => num.parse::<u32>(),
            };
            // `from_u32` rejects surrogates and out-of-range code points.
            code.ok().and_then(char::from_u32)
        }),
    };

    match decoded {
        Some(ch) => {
            // Consume the reference: its (ASCII) name plus the closing `;`.
            for _ in 0..=entity.len() {
                chars.next();
            }
            ch
        }
        None => '&',
    }
}
527
/// Normalises whitespace in converted text.
///
/// * Runs of spaces/tabs/other whitespace inside a line collapse to a single
///   space.
/// * Spaces and tabs at the start of a line (indentation) are preserved.
/// * Trailing spaces and tabs before a newline are removed entirely, so
///   whitespace-only lines become empty and cannot leak their indentation
///   onto the next line of content.
/// * At most two consecutive newlines are kept; lines that contain only
///   whitespace do not interrupt a run of newlines.
/// * The result is trimmed at both ends.
pub fn clean_whitespace(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut last_was_space = false;
    let mut newline_count = 0usize;
    let mut at_line_start = true;

    for c in s.chars() {
        match c {
            '\n' => {
                // Drop whatever blank padding precedes the line break.
                let kept = result
                    .trim_end_matches(|ch: char| ch == ' ' || ch == '\t')
                    .len();
                result.truncate(kept);

                newline_count += 1;
                last_was_space = true;
                at_line_start = true;
                if newline_count <= 2 {
                    result.push('\n');
                }
            }
            // Indentation is kept verbatim and does not reset the newline run.
            ' ' | '\t' if at_line_start => result.push(c),
            _ if c.is_whitespace() => {
                newline_count = 0;
                if !last_was_space {
                    result.push(' ');
                    last_was_space = true;
                }
            }
            _ => {
                newline_count = 0;
                last_was_space = false;
                at_line_start = false;
                result.push(c);
            }
        }
    }

    result.trim().to_string()
}
575
/// Limits every run of consecutive `\n` characters in `s` to two.
///
/// All other characters are copied through unchanged; any non-newline
/// character (including spaces) ends the current run.
pub fn filter_excessive_newlines(s: &str) -> String {
    // Length of the newline run that ends at the current character.
    let mut run = 0usize;

    s.chars()
        .filter(|&ch| {
            if ch == '\n' {
                run += 1;
                run <= 2
            } else {
                run = 0;
                true
            }
        })
        .collect()
}
595
596pub fn extract_metadata(html: &str) -> PageMetadata {
614 let mut meta = PageMetadata::default();
615 let mut chars = html.chars().peekable();
616 let mut in_title = false;
617 let mut title_buf = String::new();
618 let mut in_skip_element = 0;
619 let mut skip_elements: Vec<String> = Vec::new();
620 let mut current_link_href: Option<String> = None;
622 let mut current_link_text = String::new();
623
624 while let Some(c) = chars.next() {
625 if c == '<' {
626 let mut tag = String::new();
627 while let Some(&next) = chars.peek() {
628 if next == '>' {
629 chars.next();
630 break;
631 }
632 tag.push(chars.next().unwrap());
633 }
634
635 let tag_lower = tag.to_lowercase();
636 let is_closing = tag_lower.starts_with('/');
637 let tag_name = if is_closing {
638 tag_lower[1..].split_whitespace().next().unwrap_or("")
639 } else {
640 tag_lower.split_whitespace().next().unwrap_or("")
641 };
642
643 let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
645 if skip_tags.contains(&tag_name) {
646 if is_closing {
647 if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
648 skip_elements.remove(pos);
649 in_skip_element = skip_elements.len();
650 }
651 } else if !tag.ends_with('/') {
652 skip_elements.push(tag_name.to_string());
653 in_skip_element = skip_elements.len();
654 }
655 continue;
656 }
657
658 if in_skip_element > 0 {
659 continue;
660 }
661
662 match tag_name {
663 "html" if !is_closing => {
664 if let Some(lang) = extract_attribute(&tag, "lang") {
665 if meta.language.is_none() && !lang.is_empty() {
666 meta.language = Some(lang);
667 }
668 }
669 }
670 "title" => {
671 if !is_closing {
672 in_title = true;
673 title_buf.clear();
674 } else {
675 in_title = false;
676 let title = title_buf.trim().to_string();
677 if meta.title.is_none() && !title.is_empty() {
678 meta.title = Some(title);
679 }
680 }
681 }
682 "meta" if !is_closing => {
683 extract_meta_tag(&tag, &mut meta);
684 }
685 "link" if !is_closing => {
686 if let Some(rel) = extract_attribute(&tag, "rel") {
687 if rel == "canonical" {
688 if let Some(href) = extract_attribute(&tag, "href") {
689 if meta.canonical_url.is_none() && !href.is_empty() {
690 meta.canonical_url = Some(href);
691 }
692 }
693 }
694 }
695 }
696 "time" if !is_closing => {
697 if let Some(datetime) = extract_attribute(&tag, "datetime") {
698 if meta.published_date.is_none() && !datetime.is_empty() {
699 meta.published_date = Some(datetime);
700 }
701 }
702 }
703 "a" => {
704 if !is_closing {
705 if let Some(href) = extract_attribute(&tag, "href") {
706 if !href.is_empty() {
707 current_link_href = Some(href);
708 current_link_text.clear();
709 }
710 }
711 } else if let Some(href) = current_link_href.take() {
712 let text = current_link_text.trim().to_string();
713 if meta.links.len() < 500 {
715 meta.links.push(PageLink { text, href });
716 }
717 current_link_text.clear();
718 }
719 }
720 _ => {}
721 }
722 } else if in_skip_element == 0 {
723 let decoded = decode_entity(c, &mut chars);
724 if in_title {
725 title_buf.push(decoded);
726 }
727 if current_link_href.is_some() {
728 current_link_text.push(decoded);
729 }
730 }
731 }
732
733 meta
734}
735
736pub fn extract_headings(html: &str) -> Vec<String> {
739 let mut headings = Vec::new();
740 let mut chars = html.chars().peekable();
741 let mut in_heading: Option<u8> = None; let mut heading_buf = String::new();
743 let mut in_skip_element = 0;
744 let mut skip_elements: Vec<String> = Vec::new();
745
746 while let Some(c) = chars.next() {
747 if c == '<' {
748 let mut tag = String::new();
749 while let Some(&next) = chars.peek() {
750 if next == '>' {
751 chars.next();
752 break;
753 }
754 tag.push(chars.next().unwrap());
755 }
756
757 let tag_lower = tag.to_lowercase();
758 let is_closing = tag_lower.starts_with('/');
759 let tag_name = if is_closing {
760 tag_lower[1..].split_whitespace().next().unwrap_or("")
761 } else {
762 tag_lower.split_whitespace().next().unwrap_or("")
763 };
764
765 let skip_tags = ["script", "style", "noscript", "iframe", "svg"];
766 if skip_tags.contains(&tag_name) {
767 if is_closing {
768 if let Some(pos) = skip_elements.iter().rposition(|t| t == tag_name) {
769 skip_elements.remove(pos);
770 in_skip_element = skip_elements.len();
771 }
772 } else if !tag.ends_with('/') {
773 skip_elements.push(tag_name.to_string());
774 in_skip_element = skip_elements.len();
775 }
776 continue;
777 }
778
779 if in_skip_element > 0 {
780 continue;
781 }
782
783 if let Some(level) = heading_level(tag_name) {
784 if is_closing {
785 if in_heading == Some(level) {
786 let text = heading_buf.trim().to_string();
787 if !text.is_empty() && headings.len() < 200 {
788 let prefix = "#".repeat(level as usize);
789 headings.push(format!("{} {}", prefix, text));
790 }
791 in_heading = None;
792 heading_buf.clear();
793 }
794 } else {
795 in_heading = Some(level);
796 heading_buf.clear();
797 }
798 }
799 } else if in_skip_element == 0 {
800 let decoded = decode_entity(c, &mut chars);
801 if in_heading.is_some() {
802 heading_buf.push(decoded);
803 }
804 }
805 }
806
807 headings
808}
809
/// Maps a lowercase heading tag name (`"h1"` … `"h6"`) to its level.
///
/// Returns `None` for every other string, including uppercase names and
/// out-of-range levels such as `"h0"` or `"h7"`.
fn heading_level(tag_name: &str) -> Option<u8> {
    match tag_name.as_bytes() {
        // Exactly two bytes: `h` followed by a digit from 1 to 6.
        [b'h', digit @ b'1'..=b'6'] => Some(digit - b'0'),
        _ => None,
    }
}
821
822fn extract_meta_tag(tag: &str, meta: &mut PageMetadata) {
824 if let Some(content) = extract_attribute(tag, "content") {
826 if content.is_empty() {
827 return;
828 }
829 if let Some(name) = extract_attribute(tag, "name") {
831 match name.to_lowercase().as_str() {
832 "description" if meta.description.is_none() => {
833 meta.description = Some(content.clone());
834 }
835 "author" if meta.author.is_none() => {
836 meta.author = Some(content.clone());
837 }
838 _ => {}
839 }
840 }
841 if let Some(property) = extract_attribute(tag, "property") {
843 match property.to_lowercase().as_str() {
844 "og:title" => {
845 meta.title = Some(content.clone());
847 }
848 "og:description" => {
849 meta.description = Some(content.clone());
851 }
852 "article:published_time" if meta.published_date.is_none() => {
853 meta.published_date = Some(content.clone());
854 }
855 "article:modified_time" if meta.modified_date.is_none() => {
856 meta.modified_date = Some(content);
857 }
858 _ => {}
859 }
860 }
861 }
862}
863
864pub fn strip_boilerplate(html: &str) -> String {
882 if let Some(focused) = extract_main_content(html) {
887 return focused;
888 }
889
890 strip_boilerplate_elements(html)
892}
893
894fn extract_main_content(html: &str) -> Option<String> {
896 for target_tag in &["main", "article"] {
898 if let Some(content) = extract_tag_content(html, target_tag) {
899 return Some(content);
900 }
901 }
902
903 extract_role_content(html, "main")
905}
906
/// Returns the inner HTML of the first complete `<target>` element in
/// `html`, or `None` when no such element is opened and closed.
///
/// `target` must be a lowercase element name; tag names from the input are
/// lowercased before comparison. Nested elements of the same name are
/// tracked so the capture ends at the matching closing tag, and nested tags
/// are re-emitted as `<` + raw tag text + `>`. Self-closing `<target/>` tags
/// are ignored, and a stray `</target>` that appears before any opening tag
/// is skipped instead of unbalancing the nesting depth (which previously
/// prevented the real element from ever being captured).
fn extract_tag_content(html: &str, target: &str) -> Option<String> {
    let mut chars = html.chars();
    // Number of currently open `target` elements; > 0 means "capturing".
    let mut depth = 0usize;
    let mut output = String::new();

    while let Some(c) = chars.next() {
        if c != '<' {
            if depth > 0 {
                output.push(c);
            }
            continue;
        }

        // Raw tag text up to (not including) the first `>`.
        let mut tag = String::new();
        for next in chars.by_ref() {
            if next == '>' {
                break;
            }
            tag.push(next);
        }

        let tag_lower = tag.to_lowercase();
        let is_closing = tag_lower.starts_with('/');
        let tag_name = if is_closing {
            tag_lower[1..].split_whitespace().next().unwrap_or("")
        } else {
            tag_lower.split_whitespace().next().unwrap_or("")
        };

        if tag_name == target {
            if is_closing {
                // Closing tags seen before the element opens are ignored.
                if depth > 0 {
                    depth -= 1;
                    if depth == 0 {
                        return Some(output);
                    }
                }
            } else if !tag.ends_with('/') {
                depth += 1;
                if depth == 1 {
                    // The outermost opening tag itself is not part of the result.
                    continue;
                }
            }
        }

        if depth > 0 {
            output.push('<');
            output.push_str(&tag);
            output.push('>');
        }
    }

    None
}
960
961fn extract_role_content(html: &str, role: &str) -> Option<String> {
963 let mut chars = html.chars().peekable();
964 let mut capture_tag: Option<String> = None;
965 let mut depth = 0i32;
966 let mut output = String::new();
967
968 while let Some(c) = chars.next() {
969 if c == '<' {
970 let mut tag = String::new();
971 while let Some(&next) = chars.peek() {
972 if next == '>' {
973 chars.next();
974 break;
975 }
976 tag.push(chars.next().unwrap());
977 }
978
979 let tag_lower = tag.to_lowercase();
980 let is_closing = tag_lower.starts_with('/');
981 let tag_name = if is_closing {
982 tag_lower[1..].split_whitespace().next().unwrap_or("")
983 } else {
984 tag_lower.split_whitespace().next().unwrap_or("")
985 };
986
987 if let Some(ref target) = capture_tag {
988 if tag_name == target.as_str() {
989 if is_closing {
990 depth -= 1;
991 if depth == 0 {
992 return Some(output);
993 }
994 } else if !tag.ends_with('/') {
995 depth += 1;
996 }
997 }
998
999 if depth > 0 {
1000 output.push('<');
1001 output.push_str(&tag);
1002 output.push('>');
1003 }
1004 } else if !is_closing {
1005 if let Some(attr_role) = extract_attribute(&tag, "role") {
1007 if attr_role.eq_ignore_ascii_case(role) && !tag.ends_with('/') {
1008 capture_tag = Some(tag_name.to_string());
1009 depth = 1;
1010 continue;
1011 }
1012 }
1013 }
1014 } else if capture_tag.is_some() && depth > 0 {
1015 output.push(c);
1016 }
1017 }
1018
1019 None
1020}
1021
/// Lowercase element names that `strip_boilerplate_elements` removes together
/// with everything nested inside them.
const BOILERPLATE_TAGS: &[&str] = &[
    "nav",
    "footer",
    "aside",
    "header",
];

/// `role` attribute values (matched ASCII case-insensitively) that mark an
/// element for removal by `strip_boilerplate_elements`.
const BOILERPLATE_ROLES: &[&str] = &[
    "navigation",
    "banner",
    "contentinfo",
    "complementary",
];
1027
1028fn strip_boilerplate_elements(html: &str) -> String {
1030 let mut output = String::new();
1031 let mut chars = html.chars().peekable();
1032 let mut skip_depth = 0i32;
1033 let mut skip_tag: Option<String> = None;
1034
1035 while let Some(c) = chars.next() {
1036 if c == '<' {
1037 let mut tag = String::new();
1038 while let Some(&next) = chars.peek() {
1039 if next == '>' {
1040 chars.next();
1041 break;
1042 }
1043 tag.push(chars.next().unwrap());
1044 }
1045
1046 let tag_lower = tag.to_lowercase();
1047 let is_closing = tag_lower.starts_with('/');
1048 let tag_name = if is_closing {
1049 tag_lower[1..].split_whitespace().next().unwrap_or("")
1050 } else {
1051 tag_lower.split_whitespace().next().unwrap_or("")
1052 };
1053
1054 if let Some(ref target) = skip_tag {
1056 if tag_name == target.as_str() {
1057 if is_closing {
1058 skip_depth -= 1;
1059 if skip_depth == 0 {
1060 skip_tag = None;
1061 continue;
1062 }
1063 } else if !tag.ends_with('/') {
1064 skip_depth += 1;
1065 }
1066 }
1067 continue; }
1069
1070 if !is_closing && !tag.ends_with('/') {
1072 let is_boilerplate_tag = BOILERPLATE_TAGS.contains(&tag_name);
1073 let is_boilerplate_role = extract_attribute(&tag, "role")
1074 .map(|r| {
1075 BOILERPLATE_ROLES
1076 .iter()
1077 .any(|br| r.eq_ignore_ascii_case(br))
1078 })
1079 .unwrap_or(false);
1080
1081 if is_boilerplate_tag || is_boilerplate_role {
1082 skip_tag = Some(tag_name.to_string());
1083 skip_depth = 1;
1084 continue;
1085 }
1086 }
1087
1088 output.push('<');
1089 output.push_str(&tag);
1090 output.push('>');
1091 } else if skip_tag.is_none() {
1092 output.push(c);
1093 }
1094 }
1095
1096 output
1097}
1098
1099#[cfg(test)]
1100mod tests {
1101 use super::*;
1102
1103 #[test]
1104 fn test_is_html_by_content_type() {
1105 assert!(is_html(&Some("text/html".to_string()), ""));
1106 assert!(is_html(&Some("text/html; charset=utf-8".to_string()), ""));
1107 assert!(is_html(&Some("application/xhtml+xml".to_string()), ""));
1108 assert!(!is_html(&Some("text/plain".to_string()), ""));
1109 assert!(!is_html(&Some("application/json".to_string()), ""));
1110 }
1111
1112 #[test]
1113 fn test_is_html_by_body() {
1114 assert!(is_html(&None, "<!DOCTYPE html><html>"));
1115 assert!(is_html(&None, " <!DOCTYPE html>"));
1116 assert!(is_html(&None, "<html><body>"));
1117 assert!(!is_html(&None, "Hello world"));
1118 assert!(!is_html(&None, "{\"json\": true}"));
1119 }
1120
1121 #[test]
1122 fn test_html_to_markdown_headers() {
1123 let html = "<h1>Title</h1><h2>Subtitle</h2>";
1124 let md = html_to_markdown(html);
1125 assert!(md.contains("# Title"));
1126 assert!(md.contains("## Subtitle"));
1127 }
1128
1129 #[test]
1130 fn test_html_to_markdown_paragraphs() {
1131 let html = "<p>First paragraph</p><p>Second paragraph</p>";
1132 let md = html_to_markdown(html);
1133 assert!(md.contains("First paragraph"));
1134 assert!(md.contains("Second paragraph"));
1135 }
1136
1137 #[test]
1138 fn test_html_to_markdown_lists() {
1139 let html = "<ul><li>Item 1</li><li>Item 2</li></ul>";
1140 let md = html_to_markdown(html);
1141 assert!(md.contains("- Item 1"));
1142 assert!(md.contains("- Item 2"));
1143 }
1144
1145 #[test]
1146 fn test_html_to_markdown_emphasis() {
1147 let html = "<p><strong>bold</strong> and <em>italic</em></p>";
1148 let md = html_to_markdown(html);
1149 assert!(md.contains("**bold**"));
1150 assert!(md.contains("*italic*"));
1151 }
1152
1153 #[test]
1154 fn test_html_to_markdown_code() {
1155 let html = "<pre>code block</pre>";
1156 let md = html_to_markdown(html);
1157 assert!(md.contains("```"));
1158 assert!(md.contains("code block"));
1159 }
1160
1161 #[test]
1162 fn test_html_to_markdown_skip_script() {
1163 let html = "<p>Before</p><script>alert('bad');</script><p>After</p>";
1164 let md = html_to_markdown(html);
1165 assert!(md.contains("Before"));
1166 assert!(md.contains("After"));
1167 assert!(!md.contains("alert"));
1168 }
1169
1170 #[test]
1171 fn test_html_to_text_simple() {
1172 let html = "<p>Hello</p><p>World</p>";
1173 let text = html_to_text(html);
1174 assert!(text.contains("Hello"));
1175 assert!(text.contains("World"));
1176 }
1177
1178 #[test]
1179 fn test_html_to_text_skip_script() {
1180 let html = "<p>Before</p><script>alert('bad');</script><p>After</p>";
1181 let text = html_to_text(html);
1182 assert!(text.contains("Before"));
1183 assert!(text.contains("After"));
1184 assert!(!text.contains("alert"));
1185 }
1186
1187 #[test]
1188 fn test_entity_decoding() {
1189 let html = "<p>& < > " ' — – © ®</p>";
1190 let text = html_to_text(html);
1191 assert!(text.contains('&'));
1192 assert!(text.contains('<'));
1193 assert!(text.contains('>'));
1194 assert!(text.contains('"'));
1195 assert!(text.contains('\''));
1196 assert!(text.contains('—'));
1197 assert!(text.contains('–'));
1198 assert!(text.contains('©'));
1199 assert!(text.contains('®'));
1200 }
1201
1202 #[test]
1203 fn test_filter_excessive_newlines() {
1204 let input = "line1\n\n\n\n\nline2";
1205 let output = filter_excessive_newlines(input);
1206 assert_eq!(output, "line1\n\nline2");
1207 }
1208
1209 #[test]
1210 fn test_clean_whitespace() {
1211 let input = " hello world \n\n\n\n test ";
1212 let output = clean_whitespace(input);
1213 assert_eq!(output, "hello world\n\n test");
1214 }
1215
1216 #[test]
1217 fn test_clean_whitespace_preserves_indentation() {
1218 let input = "top\n indented\n deeper";
1219 let output = clean_whitespace(input);
1220 assert_eq!(output, "top\n indented\n deeper");
1221 }
1222
1223 #[test]
1224 fn test_is_markdown_content_type() {
1225 assert!(is_markdown_content_type(&Some("text/markdown".to_string())));
1226 assert!(is_markdown_content_type(&Some(
1227 "text/markdown; charset=utf-8".to_string()
1228 )));
1229 assert!(is_markdown_content_type(&Some("Text/Markdown".to_string())));
1230 assert!(!is_markdown_content_type(&Some(
1231 "text/html; profile=\"text/markdown\"".to_string()
1232 )));
1233 assert!(!is_markdown_content_type(&Some("text/html".to_string())));
1234 assert!(!is_markdown_content_type(&Some("text/plain".to_string())));
1235 assert!(!is_markdown_content_type(&None));
1236 }
1237
1238 #[test]
1239 fn test_is_plain_text_content_type() {
1240 assert!(is_plain_text_content_type(&Some("text/plain".to_string())));
1241 assert!(is_plain_text_content_type(&Some(
1242 "text/plain; charset=utf-8".to_string()
1243 )));
1244 assert!(is_plain_text_content_type(&Some("Text/Plain".to_string())));
1245 assert!(!is_plain_text_content_type(&Some(
1246 "text/html; profile=\"text/plain\"".to_string()
1247 )));
1248 assert!(!is_plain_text_content_type(&Some("text/html".to_string())));
1249 assert!(!is_plain_text_content_type(&Some(
1250 "text/markdown".to_string()
1251 )));
1252 assert!(!is_plain_text_content_type(&None));
1253 }
1254
1255 #[test]
1256 fn test_extract_attribute() {
1257 assert_eq!(
1258 extract_attribute("a href=\"https://example.com\" class=\"link\"", "href"),
1259 Some("https://example.com".to_string())
1260 );
1261 assert_eq!(
1262 extract_attribute("img src='image.png'", "src"),
1263 Some("image.png".to_string())
1264 );
1265 assert_eq!(
1266 extract_attribute("div class=test", "class"),
1267 Some("test".to_string())
1268 );
1269 assert_eq!(
1270 extract_attribute("a title=\"İİ\" href=x", "href"),
1271 Some("x".to_string())
1272 );
1273 }
1274
1275 #[test]
1276 fn test_extract_metadata_title() {
1277 let html = "<html><head><title>My Page</title></head><body></body></html>";
1278 let meta = extract_metadata(html);
1279 assert_eq!(meta.title.as_deref(), Some("My Page"));
1280 }
1281
1282 #[test]
1283 fn test_extract_metadata_og_title_overrides() {
1284 let html = r#"<html><head>
1285 <title>Basic Title</title>
1286 <meta property="og:title" content="OG Title">
1287 </head></html>"#;
1288 let meta = extract_metadata(html);
1289 assert_eq!(meta.title.as_deref(), Some("OG Title"));
1290 }
1291
1292 #[test]
1293 fn test_extract_metadata_description() {
1294 let html = r#"<html><head>
1295 <meta name="description" content="A page about things">
1296 </head></html>"#;
1297 let meta = extract_metadata(html);
1298 assert_eq!(meta.description.as_deref(), Some("A page about things"));
1299 }
1300
1301 #[test]
1302 fn test_extract_metadata_og_description_overrides() {
1303 let html = r#"<html><head>
1304 <meta name="description" content="Basic desc">
1305 <meta property="og:description" content="OG desc">
1306 </head></html>"#;
1307 let meta = extract_metadata(html);
1308 assert_eq!(meta.description.as_deref(), Some("OG desc"));
1309 }
1310
1311 #[test]
1312 fn test_extract_metadata_language() {
1313 let html = r#"<html lang="en-US"><head><title>Test</title></head></html>"#;
1314 let meta = extract_metadata(html);
1315 assert_eq!(meta.language.as_deref(), Some("en-US"));
1316 }
1317
1318 #[test]
1319 fn test_extract_metadata_canonical_url() {
1320 let html = r#"<html><head>
1321 <link rel="canonical" href="https://example.com/page">
1322 </head></html>"#;
1323 let meta = extract_metadata(html);
1324 assert_eq!(
1325 meta.canonical_url.as_deref(),
1326 Some("https://example.com/page")
1327 );
1328 }
1329
1330 #[test]
1331 fn test_extract_metadata_author() {
1332 let html = r#"<html><head>
1333 <meta name="author" content="Jane Doe">
1334 </head></html>"#;
1335 let meta = extract_metadata(html);
1336 assert_eq!(meta.author.as_deref(), Some("Jane Doe"));
1337 }
1338
1339 #[test]
1340 fn test_extract_metadata_dates() {
1341 let html = r#"<html><head>
1342 <meta property="article:published_time" content="2024-01-15T10:00:00Z">
1343 <meta property="article:modified_time" content="2024-02-20T12:00:00Z">
1344 </head></html>"#;
1345 let meta = extract_metadata(html);
1346 assert_eq!(meta.published_date.as_deref(), Some("2024-01-15T10:00:00Z"));
1347 assert_eq!(meta.modified_date.as_deref(), Some("2024-02-20T12:00:00Z"));
1348 }
1349
/// Without any date meta tags, the `datetime` attribute of a `<time>`
/// element is expected as the published date.
#[test]
fn test_extract_metadata_time_element() {
    let page = r#"<html><body>
        <time datetime="2024-03-01">March 1, 2024</time>
    </body></html>"#;
    let parsed = extract_metadata(page);
    let published = parsed.published_date.as_deref();
    assert_eq!(published, Some("2024-03-01"));
}
1358
/// Anchors in the body are collected in document order with their `href`
/// and visible text.
#[test]
fn test_extract_metadata_links() {
    let page = r#"<html><body>
        <a href="https://example.com">Example</a>
        <a href="/about">About Us</a>
    </body></html>"#;
    let parsed = extract_metadata(page);

    // (href, text) pairs in the order the anchors appear above.
    let expected = [("https://example.com", "Example"), ("/about", "About Us")];
    assert_eq!(parsed.links.len(), expected.len());
    for (link, &(href, text)) in parsed.links.iter().zip(expected.iter()) {
        assert_eq!(link.href, href);
        assert_eq!(link.text, text);
    }
}
1372
/// Headings come back in document order, each prefixed with one `#` per
/// heading level.
#[test]
fn test_extract_headings() {
    let expected = vec!["# Title", "## Section 1", "### Subsection", "## Section 2"];
    let found =
        extract_headings("<h1>Title</h1><h2>Section 1</h2><h3>Subsection</h3><h2>Section 2</h2>");
    assert_eq!(found, expected);
}
1382
/// Text and markup inside `<script>` elements must not influence the
/// extracted title or link list.
#[test]
fn test_extract_metadata_skips_script_content() {
    let page = r#"<html><head>
        <title>Real Title</title>
        <script>document.title = "Fake";</script>
    </head><body>
        <a href="/real">Real Link</a>
        <script><a href="/fake">Fake</a></script>
    </body></html>"#;
    let parsed = extract_metadata(page);

    let title = parsed.title.as_deref();
    assert_eq!(title, Some("Real Title"));

    // Only the anchor outside the script element should be recorded.
    let links = &parsed.links;
    assert_eq!(links.len(), 1);
    assert_eq!(links[0].href, "/real");
}
1397
/// An empty document yields metadata that reports itself as empty.
#[test]
fn test_extract_metadata_empty_html() {
    let parsed = extract_metadata("");
    let nothing_found = parsed.is_empty();
    assert!(nothing_found);
}
1403
/// End-to-end check on a small but complete page: head metadata, body
/// links and headings are all extracted together.
#[test]
fn test_extract_metadata_full_page() {
    let page = r#"<!DOCTYPE html>
<html lang="en">
<head>
    <title>Article Title</title>
    <meta name="description" content="An interesting article">
    <meta name="author" content="John Smith">
    <meta property="og:title" content="OG Article Title">
    <meta property="article:published_time" content="2024-06-15">
    <link rel="canonical" href="https://example.com/article">
</head>
<body>
    <h1>Article Title</h1>
    <p>Some content with a <a href="https://link.example.com">link</a>.</p>
    <h2>Section One</h2>
    <p>More content.</p>
</body>
</html>"#;

    // Headings are produced by a separate pass and attached afterwards.
    let mut parsed = extract_metadata(page);
    let headings = extract_headings(page);
    parsed.headings = headings;

    // Head metadata; the expected title is the `og:title` value.
    let title = parsed.title.as_deref();
    let description = parsed.description.as_deref();
    let author = parsed.author.as_deref();
    let language = parsed.language.as_deref();
    let canonical = parsed.canonical_url.as_deref();
    let published = parsed.published_date.as_deref();
    assert_eq!(title, Some("OG Article Title"));
    assert_eq!(description, Some("An interesting article"));
    assert_eq!(author, Some("John Smith"));
    assert_eq!(language, Some("en"));
    assert_eq!(canonical, Some("https://example.com/article"));
    assert_eq!(published, Some("2024-06-15"));

    // Body content.
    let links = &parsed.links;
    assert_eq!(links.len(), 1);
    assert_eq!(links[0].text, "link");
    assert_eq!(parsed.headings, vec!["# Article Title", "## Section One"]);

    let is_blank = parsed.is_empty();
    assert!(!is_blank);
}
1440
/// `is_empty` is true for default metadata and false once a title is set.
#[test]
fn test_page_metadata_is_empty() {
    let blank = PageMetadata::default();
    assert!(blank.is_empty());

    let titled = PageMetadata {
        title: Some(String::from("test")),
        ..PageMetadata::default()
    };
    assert!(!titled.is_empty());
}
1452
/// When a `<main>` element exists, its content is kept and the sibling
/// nav/footer text is dropped.
#[test]
fn test_strip_boilerplate_extracts_main() {
    let page = r#"<nav><a href="/">Home</a></nav>
        <main><p>Important content</p></main>
        <footer>Copyright 2024</footer>"#;
    let cleaned = strip_boilerplate(page);

    assert!(cleaned.contains("Important content"));
    for dropped in ["Home", "Copyright"].iter() {
        assert!(!cleaned.contains(*dropped));
    }
}
1463
/// An `<article>` element is kept while surrounding nav/aside text is
/// removed.
#[test]
fn test_strip_boilerplate_extracts_article() {
    let page = r#"<nav>Menu</nav>
        <article><h1>Title</h1><p>Body text</p></article>
        <aside>Sidebar</aside>"#;
    let cleaned = strip_boilerplate(page);

    for kept in ["Title", "Body text"].iter() {
        assert!(cleaned.contains(*kept));
    }
    for dropped in ["Menu", "Sidebar"].iter() {
        assert!(!cleaned.contains(*dropped));
    }
}
1475
/// If both `<main>` and `<article>` are present as siblings, only the
/// `<main>` content is expected in the output.
#[test]
fn test_strip_boilerplate_main_takes_precedence_over_article() {
    let page = r#"<main><p>Main content</p></main>
        <article><p>Article content</p></article>"#;
    let cleaned = strip_boilerplate(page);

    let has_main = cleaned.contains("Main content");
    let has_article = cleaned.contains("Article content");
    assert!(has_main);
    assert!(!has_article);
}
1485
/// With no `<main>`/`<article>` container, nav, footer and aside elements
/// are removed and the remaining paragraph is kept.
#[test]
fn test_strip_boilerplate_fallback_strips_nav_footer_aside() {
    let page = r#"<div>
        <nav>Navigation links</nav>
        <p>Content paragraph</p>
        <footer>Footer info</footer>
        <aside>Sidebar widget</aside>
    </div>"#;
    let cleaned = strip_boilerplate(page);

    assert!(cleaned.contains("Content paragraph"));
    for dropped in ["Navigation links", "Footer info", "Sidebar widget"].iter() {
        assert!(!cleaned.contains(*dropped));
    }
}
1500
/// Elements marked with `role="navigation"` or `role="contentinfo"` are
/// treated as boilerplate and removed.
#[test]
fn test_strip_boilerplate_role_navigation() {
    let page = r#"<div role="navigation">Nav menu</div>
        <p>Content</p>
        <div role="contentinfo">Footer stuff</div>"#;
    let cleaned = strip_boilerplate(page);

    assert!(cleaned.contains("Content"));
    for dropped in ["Nav menu", "Footer stuff"].iter() {
        assert!(!cleaned.contains(*dropped));
    }
}
1511
/// A `<div role="main">` is used as the content container, so the nav and
/// footer text around it must not appear in the output.
#[test]
fn test_strip_boilerplate_role_main() {
    let page = r#"<nav>Nav</nav>
        <div role="main"><p>Main content here</p></div>
        <footer>Foot</footer>"#;
    let cleaned = strip_boilerplate(page);

    assert!(cleaned.contains("Main content here"));
    for dropped in ["Nav", "Foot"].iter() {
        assert!(!cleaned.contains(*dropped));
    }
}
1522
/// Everything nested inside a `<nav>` (lists, anchors) is removed along
/// with the nav itself.
#[test]
fn test_strip_boilerplate_nested_nav() {
    let page = r#"<nav><ul><li><a href="/">Home</a></li><li><a href="/about">About</a></li></ul></nav>
        <p>Page content</p>"#;
    let cleaned = strip_boilerplate(page);

    assert!(cleaned.contains("Page content"));
    for dropped in ["Home", "About"].iter() {
        assert!(!cleaned.contains(*dropped));
    }
}
1532
/// Markup made only of plain `<div>`/`<p>` elements keeps all of its text.
#[test]
fn test_strip_boilerplate_no_semantic_html() {
    let cleaned = strip_boilerplate("<div><p>Content 1</p></div><div><p>Content 2</p></div>");
    for kept in ["Content 1", "Content 2"].iter() {
        assert!(cleaned.contains(*kept));
    }
}
1541
/// A `<header>` nested inside `<main>` is content and is kept, while the
/// top-level site header outside `<main>` is dropped.
#[test]
fn test_strip_boilerplate_preserves_header_inside_main() {
    let page = r#"<header>Site header</header>
        <main><header><h1>Article header</h1></header><p>Body</p></main>"#;
    let cleaned = strip_boilerplate(page);

    for kept in ["Article header", "Body"].iter() {
        assert!(cleaned.contains(*kept));
    }
    assert!(!cleaned.contains("Site header"));
}
1551
/// An anchor with text is rendered as an inline markdown link.
#[test]
fn test_html_to_markdown_links() {
    let expected = "[Example Site](https://example.com)";
    let rendered =
        html_to_markdown(r#"<p>Visit <a href="https://example.com">Example Site</a> today.</p>"#);
    assert!(rendered.contains(expected), "Got: {}", rendered);
}
1562
/// An anchor with no text is rendered in angle-bracket autolink form.
#[test]
fn test_html_to_markdown_link_no_text() {
    let rendered = html_to_markdown(r#"<a href="https://example.com"></a>"#);
    let has_autolink = rendered.contains("<https://example.com>");
    assert!(has_autolink, "Got: {}", rendered);
}
1569
/// An `<img>` with both `src` and `alt` is rendered as a markdown image.
#[test]
fn test_html_to_markdown_images() {
    let html = r#"<img src="photo.jpg" alt="A photo">"#;
    let md = html_to_markdown(html);
    // Match the complete image syntax: an empty needle is contained in every
    // string and would let this test pass regardless of the output.
    assert!(md.contains("![A photo](photo.jpg)"), "Got: {}", md);
}
1576
/// An `<img>` without an `alt` attribute is rendered as a markdown image
/// with empty alt text.
#[test]
fn test_html_to_markdown_image_no_alt() {
    let html = r#"<img src="photo.jpg">"#;
    let md = html_to_markdown(html);
    // Match the complete image syntax: an empty needle is contained in every
    // string and would let this test pass regardless of the output.
    assert!(md.contains("![](photo.jpg)"), "Got: {}", md);
}
1583
/// Items of an `<ol>` are numbered sequentially starting at 1.
#[test]
fn test_html_to_markdown_ordered_list() {
    let rendered = html_to_markdown("<ol><li>First</li><li>Second</li><li>Third</li></ol>");
    for expected in ["1. First", "2. Second", "3. Third"].iter() {
        assert!(rendered.contains(*expected), "Got: {}", rendered);
    }
}
1592
/// A list nested inside a list item produces a bullet preceded by
/// whitespace, after the parent item's bullet.
#[test]
fn test_html_to_markdown_nested_lists() {
    let rendered = html_to_markdown("<ul><li>Top<ul><li>Nested</li></ul></li></ul>");
    for expected in ["- Top", " - Nested"].iter() {
        assert!(rendered.contains(*expected), "Got: {}", rendered);
    }
}
1600
/// A table with a `<th>` row becomes a pipe table with a header row, a
/// `---` separator row, and one row per data `<tr>`.
#[test]
fn test_html_to_markdown_table() {
    let markup = r#"<table>
        <tr><th>Name</th><th>Age</th></tr>
        <tr><td>Alice</td><td>30</td></tr>
        <tr><td>Bob</td><td>25</td></tr>
    </table>"#;
    let rendered = html_to_markdown(markup);

    let expected_rows = [
        "| Name | Age |",
        "| --- | --- |",
        "| Alice | 30 |",
        "| Bob | 25 |",
    ];
    for row in expected_rows.iter() {
        assert!(rendered.contains(*row), "Got: {}", rendered);
    }
}
1614
/// A table made only of `<td>` cells still renders each row as a pipe row.
#[test]
fn test_html_to_markdown_table_no_header() {
    let markup = r#"<table>
        <tr><td>A</td><td>B</td></tr>
        <tr><td>C</td><td>D</td></tr>
    </table>"#;
    let rendered = html_to_markdown(markup);

    for row in ["| A | B |", "| C | D |"].iter() {
        assert!(rendered.contains(*row), "Got: {}", rendered);
    }
}
1625
/// In a `<dl>`, the term is rendered bold and the definition is prefixed
/// with `: `.
#[test]
fn test_html_to_markdown_definition_list() {
    let rendered = html_to_markdown("<dl><dt>Term</dt><dd>Definition</dd></dl>");
    for expected in ["**Term**", ": Definition"].iter() {
        assert!(rendered.contains(*expected), "Got: {}", rendered);
    }
}
1633
/// Named character references for common symbols are decoded to the
/// corresponding Unicode characters in the markdown output.
#[test]
fn test_html_to_markdown_expanded_entities() {
    // The input uses entity references rather than the literal characters, so
    // the assertions below only pass if the converter actually decodes them.
    let html = "<p>&trade; &bull; &hellip; &euro; &pound; &larr; &rarr;</p>";
    let md = html_to_markdown(html);
    for symbol in ['™', '•', '…', '€', '£', '←', '→'].iter() {
        assert!(md.contains(*symbol), "Got: {}", md);
    }
}
1646
/// The curly-quote entities (`&ldquo;`, `&rdquo;`, `&lsquo;`, `&rsquo;`) are
/// decoded to their Unicode quotation marks.
#[test]
fn test_html_to_markdown_smart_quotes() {
    // The input uses entity references rather than the literal quote
    // characters, so the assertions only pass if the entities are decoded.
    let html = "<p>&ldquo;Hello&rdquo; &lsquo;World&rsquo;</p>";
    let md = html_to_markdown(html);
    assert!(md.contains('\u{201C}'), "Got: {}", md);
    assert!(md.contains('\u{201D}'), "Got: {}", md);
    assert!(md.contains('\u{2018}'), "Got: {}", md);
    assert!(md.contains('\u{2019}'), "Got: {}", md);
}
1656}