1#![allow(clippy::collapsible_match)]
8#[cfg(feature = "inline-images")]
45use std::cell::RefCell;
46use std::collections::{BTreeMap, HashMap};
47#[cfg(feature = "inline-images")]
48use std::rc::Rc;
49
50use std::borrow::Cow;
51use std::str;
52
53use crate::error::Result;
54#[cfg(feature = "inline-images")]
55use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
56use crate::options::{ConversionOptions, HeadingStyle, ListIndentType};
57use crate::text;
58
/// Handle through which conversion code reaches the inline-image collector.
///
/// With the `inline-images` feature enabled this is a reference-counted,
/// interior-mutable [`InlineImageCollector`].
#[cfg(feature = "inline-images")]
type InlineCollectorHandle = Rc<RefCell<InlineImageCollector>>;
/// Unit stand-in for the collector handle when the `inline-images` feature is
/// disabled.
#[cfg(not(feature = "inline-images"))]
type InlineCollectorHandle = ();
63
/// Splits inline text into `(prefix, suffix, core)`.
///
/// * `prefix` is `" "` when the text starts with a space or tab, otherwise `""`.
/// * `suffix` is the trailing hard line break (`"  \n"` or `"\\\n"`) when the
///   text ends with one, `" "` when it ends with a space or tab, otherwise `""`.
/// * `core` is the text with the hard break (if any) removed and all
///   surrounding whitespace trimmed.
///
/// An empty input yields three empty strings.
fn chomp_inline(text: &str) -> (&str, &str, &str) {
    if text.is_empty() {
        return ("", "", "");
    }

    let prefix = match text.chars().next() {
        Some(' ' | '\t') => " ",
        _ => "",
    };

    // A Markdown hard line break is two (or more) spaces, or a backslash,
    // directly before the newline. A single space before the newline is NOT a
    // hard break, so it must not be treated as one here.
    let (suffix, body) = if let Some(rest) = text.strip_suffix("  \n") {
        ("  \n", rest)
    } else if let Some(rest) = text.strip_suffix("\\\n") {
        ("\\\n", rest)
    } else if text.ends_with([' ', '\t']) {
        (" ", text)
    } else {
        ("", text)
    };

    (prefix, suffix, body.trim())
}
100
/// Removes every trailing space and tab from `output` in place.
///
/// Newlines and any other characters are left untouched.
fn trim_trailing_whitespace(output: &mut String) {
    let kept_len = output.trim_end_matches([' ', '\t']).len();
    output.truncate(kept_len);
}
110
/// Strips trailing spaces and tabs from every line of `output`, in place.
///
/// A line that ends with two spaces is a Markdown hard line break; such a line
/// keeps exactly two trailing spaces after trimming. All other lines lose
/// their trailing spaces and tabs entirely. A single `'\n'` is appended after
/// the last line. An empty `output` is left untouched.
fn trim_line_end_whitespace(output: &mut String) {
    if output.is_empty() {
        return;
    }

    let mut cleaned = String::with_capacity(output.len() + 1);
    for (idx, line) in output.split('\n').enumerate() {
        if idx > 0 {
            cleaned.push('\n');
        }

        cleaned.push_str(line.trim_end_matches([' ', '\t']));

        // Two trailing spaces mark a hard break; one trailing space does not,
        // so it is dropped with the rest of the trailing whitespace.
        if line.ends_with("  ") {
            cleaned.push_str("  ");
        }
    }

    cleaned.push('\n');
    *output = cleaned;
}
137
/// Shortens `value` to at most `max_len` bytes without splitting a character.
///
/// When `max_len` falls inside a multi-byte character the cut moves back to
/// the start of that character. Strings already within the limit are left
/// unchanged.
fn truncate_at_char_boundary(value: &mut String, max_len: usize) {
    if max_len >= value.len() {
        return;
    }

    // Offset 0 is always a character boundary, so the search always succeeds.
    let cut = (0..=max_len)
        .rev()
        .find(|&offset| value.is_char_boundary(offset))
        .unwrap_or(0);
    value.truncate(cut);
}
150
/// Removes the common leading indentation from a block of code.
///
/// The indentation to strip is the smallest leading-whitespace width (in
/// bytes) among the non-blank lines. Blank lines are kept exactly as they are.
/// Lines are re-joined with `'\n'`; empty input yields an empty string.
///
/// The cut never lands inside a multi-byte whitespace character: if the common
/// width falls in the middle of one, the cut moves back to the start of that
/// character instead of panicking.
fn dedent_code_block(content: &str) -> String {
    /// Byte length of the leading whitespace of `line`.
    fn leading_whitespace_len(line: &str) -> usize {
        line.len() - line.trim_start().len()
    }

    let min_indent = content
        .lines()
        .filter(|line| !line.trim().is_empty())
        .map(leading_whitespace_len)
        .min()
        .unwrap_or(0);

    content
        .lines()
        .map(|line| {
            if line.trim().is_empty() {
                return line;
            }

            // Indentation may mix whitespace characters of different byte
            // widths, so `min_indent` is not guaranteed to be a boundary here.
            let mut cut = min_indent.min(line.len());
            while !line.is_char_boundary(cut) {
                cut -= 1;
            }
            &line[cut..]
        })
        .collect::<Vec<_>>()
        .join("\n")
}
186
/// Number of indent units used for continuation content at list `depth`.
///
/// Depth 0 needs no indentation; every deeper level yields `2 * depth - 1`.
fn calculate_list_continuation_indent(depth: usize) -> usize {
    match depth {
        0 => 0,
        nested => 2 * nested - 1,
    }
}
210
211fn is_loose_list(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
233 if let Some(node) = node_handle.get(parser) {
234 if let tl::Node::Tag(tag) = node {
235 let children = tag.children();
236 {
237 for child_handle in children.top().iter() {
238 if let Some(child_node) = child_handle.get(parser) {
239 if let tl::Node::Tag(child_tag) = child_node {
240 if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
241 let li_children = child_tag.children();
242 {
243 for li_child_handle in li_children.top().iter() {
244 if let Some(li_child_node) = li_child_handle.get(parser) {
245 if let tl::Node::Tag(li_child_tag) = li_child_node {
246 if tag_name_eq(li_child_tag.name().as_utf8_str(), "p") {
247 return true;
248 }
249 }
250 }
251 }
252 }
253 }
254 }
255 }
256 }
257 }
258 }
259 }
260 false
261}
262
263fn add_list_continuation_indent(output: &mut String, list_depth: usize, blank_line: bool, options: &ConversionOptions) {
287 trim_trailing_whitespace(output);
288
289 if blank_line {
290 if !output.ends_with("\n\n") {
291 if output.ends_with('\n') {
292 output.push('\n');
293 } else {
294 output.push_str("\n\n");
295 }
296 }
297 } else if !output.ends_with('\n') {
298 output.push('\n');
299 }
300
301 let indent_level = calculate_list_continuation_indent(list_depth);
302 let indent_char = match options.list_indent_type {
303 ListIndentType::Tabs => "\t",
304 ListIndentType::Spaces => &" ".repeat(options.list_indent_width),
305 };
306 output.push_str(&indent_char.repeat(indent_level));
307}
308
309fn continuation_indent_string(list_depth: usize, options: &ConversionOptions) -> Option<String> {
311 let indent_level = calculate_list_continuation_indent(list_depth);
312 if indent_level == 0 {
313 return None;
314 }
315
316 let indent = match options.list_indent_type {
317 ListIndentType::Tabs => "\t".repeat(indent_level),
318 ListIndentType::Spaces => " ".repeat(options.list_indent_width * indent_level),
319 };
320 Some(indent)
321}
322
323fn add_list_leading_separator(output: &mut String, ctx: &Context) {
330 if ctx.in_table_cell {
331 let is_table_continuation =
332 !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
333 if is_table_continuation {
334 output.push_str("<br>");
335 }
336 return;
337 }
338
339 if !output.is_empty() && !ctx.in_list {
340 let needs_newline =
341 !output.ends_with("\n\n") && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
342 if needs_newline {
343 output.push_str("\n\n");
344 }
345 return;
346 }
347
348 if ctx.in_list_item && !output.is_empty() {
349 let needs_newline =
350 !output.ends_with('\n') && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
351 if needs_newline {
352 trim_trailing_whitespace(output);
353 output.push('\n');
354 }
355 }
356}
357
358fn add_nested_list_trailing_separator(output: &mut String, ctx: &Context) {
363 if !ctx.in_list_item {
364 return;
365 }
366
367 if ctx.loose_list {
368 if !output.ends_with("\n\n") {
369 if !output.ends_with('\n') {
370 output.push('\n');
371 }
372 output.push('\n');
373 }
374 } else if !output.ends_with('\n') {
375 output.push('\n');
376 }
377}
378
379fn calculate_list_nesting_depth(ctx: &Context) -> usize {
385 if ctx.in_list && !ctx.in_list_item {
386 ctx.list_depth + 1
387 } else {
388 ctx.list_depth
389 }
390}
391
392#[allow(clippy::too_many_arguments)]
397fn process_list_children(
398 node_handle: &tl::NodeHandle,
399 parser: &tl::Parser,
400 output: &mut String,
401 options: &ConversionOptions,
402 ctx: &Context,
403 depth: usize,
404 is_ordered: bool,
405 is_loose: bool,
406 nested_depth: usize,
407 start_counter: usize,
408 dom_ctx: &DomContext,
409) {
410 let mut counter = start_counter;
411
412 if let Some(node) = node_handle.get(parser) {
413 if let tl::Node::Tag(tag) = node {
414 let children = tag.children();
415 {
416 for child_handle in children.top().iter() {
417 if let Some(child_node) = child_handle.get(parser) {
418 if let tl::Node::Raw(bytes) = child_node {
419 if bytes.as_utf8_str().trim().is_empty() {
420 continue;
421 }
422 }
423 }
424
425 let list_ctx = Context {
426 in_ordered_list: is_ordered,
427 list_counter: if is_ordered { counter } else { 0 },
428 in_list: true,
429 list_depth: nested_depth,
430 ul_depth: if is_ordered { ctx.ul_depth } else { ctx.ul_depth + 1 },
431 loose_list: is_loose,
432 prev_item_had_blocks: false,
433 ..ctx.clone()
434 };
435
436 walk_node(child_handle, parser, output, options, &list_ctx, depth, dom_ctx);
437
438 if is_ordered {
439 if let Some(child_node) = child_handle.get(parser) {
440 if let tl::Node::Tag(child_tag) = child_node {
441 if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
442 counter += 1;
443 }
444 }
445 }
446 }
447 }
448 }
449 }
450 }
451}
452
/// Conversion state handed down while walking the DOM.
///
/// Cloned and selectively overridden whenever a child needs a different
/// context (see `process_list_children`).
#[derive(Debug, Clone)]
struct Context {
    /// Presumably set while inside code content (usage not visible here).
    in_code: bool,
    /// Number assigned to the current ordered-list item (0 for unordered lists).
    list_counter: usize,
    /// Whether the enclosing list is ordered.
    in_ordered_list: bool,
    /// Presumably whether the previous sibling was a `<dt>` (usage not visible here).
    last_was_dt: bool,
    /// Blockquote nesting depth; a non-zero value shortens the heading suffix.
    blockquote_depth: usize,
    /// Inside a table cell, where blocks are separated by `<br>`.
    in_table_cell: bool,
    /// Emit content as inline text only (headings become plain text).
    convert_as_inline: bool,
    /// Presumably the nesting depth of inline elements (usage not visible here).
    inline_depth: usize,
    /// Inside a list item.
    in_list_item: bool,
    /// Nesting depth of the current list.
    list_depth: usize,
    /// Nesting depth counting unordered lists only.
    ul_depth: usize,
    /// Inside a list element.
    in_list: bool,
    /// The enclosing list is loose (see `is_loose_list`).
    loose_list: bool,
    /// Reset to `false` for each list child by `process_list_children`.
    prev_item_had_blocks: bool,
    /// Presumably set while inside a heading (usage not visible here).
    in_heading: bool,
    /// Presumably the tag name of the enclosing heading (usage not visible here).
    heading_tag: Option<String>,
    /// Presumably set while inside a paragraph (usage not visible here).
    in_paragraph: bool,
    /// Presumably set while inside a `<ruby>` element (usage not visible here).
    in_ruby: bool,
    /// Presumably set while inside strong emphasis (usage not visible here).
    in_strong: bool,
    /// Collector receiving inline images, when one was supplied.
    #[cfg(feature = "inline-images")]
    inline_collector: Option<InlineCollectorHandle>,
    /// Collector receiving document metadata, when one was supplied.
    #[cfg(feature = "metadata")]
    metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
}
501
502struct DomContext {
503 parent_map: HashMap<u32, Option<u32>>,
504 children_map: HashMap<u32, Vec<tl::NodeHandle>>,
505 root_children: Vec<tl::NodeHandle>,
506 node_map: HashMap<u32, tl::NodeHandle>,
507}
508
/// Escapes square brackets so `text` can be used as a Markdown link label.
///
/// Brackets that are already backslash-escaped and brackets that form a
/// matched pair are left alone. Every unmatched bracket gets a backslash in
/// front of it: a `]` with no open `[` before it, and a `[` that is never
/// closed. Either kind would otherwise end or break the surrounding
/// `[label](destination)` construct. All other characters are copied
/// unchanged.
fn escape_link_label(text: &str) -> String {
    if text.is_empty() {
        return String::new();
    }

    let mut result = String::with_capacity(text.len());
    let mut backslash_run = 0usize;
    // Byte offsets in `result` of unescaped '[' that are still waiting for
    // their closing bracket.
    let mut open_brackets: Vec<usize> = Vec::new();

    for ch in text.chars() {
        if ch == '\\' {
            result.push('\\');
            backslash_run += 1;
            continue;
        }

        // An odd number of preceding backslashes escapes this character.
        let is_escaped = backslash_run % 2 == 1;
        backslash_run = 0;

        match ch {
            '[' if !is_escaped => {
                open_brackets.push(result.len());
                result.push('[');
            }
            ']' if !is_escaped => {
                if open_brackets.pop().is_none() {
                    result.push('\\');
                }
                result.push(']');
            }
            _ => result.push(ch),
        }
    }

    // Escape the brackets that never got closed, back to front so the
    // remaining recorded offsets stay valid.
    for offset in open_brackets.into_iter().rev() {
        result.insert(offset, '\\');
    }

    result
}
547
548fn append_markdown_link(
549 output: &mut String,
550 label: &str,
551 href: &str,
552 title: Option<&str>,
553 raw_text: &str,
554 options: &ConversionOptions,
555) {
556 output.push('[');
557 output.push_str(label);
558 output.push_str("](");
559
560 if href.is_empty() {
561 output.push_str("<>");
562 } else if href.contains(' ') || href.contains('\n') {
563 output.push('<');
564 output.push_str(href);
565 output.push('>');
566 } else {
567 let open_count = href.chars().filter(|&c| c == '(').count();
568 let close_count = href.chars().filter(|&c| c == ')').count();
569
570 if open_count == close_count {
571 output.push_str(href);
572 } else {
573 let escaped_href = href.replace("(", "\\(").replace(")", "\\)");
574 output.push_str(&escaped_href);
575 }
576 }
577
578 if let Some(title_text) = title {
579 output.push_str(" \"");
580 if title_text.contains('"') {
581 let escaped_title = title_text.replace('"', "\\\"");
582 output.push_str(&escaped_title);
583 } else {
584 output.push_str(title_text);
585 }
586 output.push('"');
587 } else if options.default_title && raw_text == href {
588 output.push_str(" \"");
589 if href.contains('"') {
590 let escaped_href = href.replace('"', "\\\"");
591 output.push_str(&escaped_href);
592 } else {
593 output.push_str(href);
594 }
595 output.push('"');
596 }
597
598 output.push(')');
599}
600
/// Maps a tag name `h1`..`h6` to its heading level.
///
/// The match is exact and case-sensitive; any other name yields `None`.
fn heading_level_from_name(name: &str) -> Option<usize> {
    match name.as_bytes() {
        [b'h', digit @ b'1'..=b'6'] => Some(usize::from(digit - b'0')),
        _ => None,
    }
}
612
613fn find_single_heading_child(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<(usize, tl::NodeHandle)> {
614 let node = node_handle.get(parser)?;
615
616 let tl::Node::Tag(tag) = node else {
617 return None;
618 };
619
620 let children = tag.children();
621 let mut heading_data: Option<(usize, tl::NodeHandle)> = None;
622
623 for child_handle in children.top().iter() {
624 let Some(child_node) = child_handle.get(parser) else {
625 continue;
626 };
627
628 match child_node {
629 tl::Node::Raw(bytes) => {
630 if !bytes.as_utf8_str().trim().is_empty() {
631 return None;
632 }
633 }
634 tl::Node::Tag(child_tag) => {
635 let name = normalized_tag_name(child_tag.name().as_utf8_str());
636 if let Some(level) = heading_level_from_name(name.as_ref()) {
637 if heading_data.is_some() {
638 return None;
639 }
640 heading_data = Some((level, *child_handle));
641 } else {
642 return None;
643 }
644 }
645 _ => return None,
646 }
647 }
648
649 heading_data
650}
651
652fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {
653 if text.is_empty() {
654 return;
655 }
656
657 if ctx.convert_as_inline {
658 output.push_str(text);
659 return;
660 }
661
662 if ctx.in_table_cell {
663 let is_table_continuation =
664 !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
665 if is_table_continuation {
666 output.push_str("<br>");
667 }
668 output.push_str(text);
669 return;
670 }
671
672 if ctx.in_list_item {
673 if output.ends_with('\n') {
674 if let Some(indent) = continuation_indent_string(ctx.list_depth, options) {
675 output.push_str(&indent);
676 }
677 } else if !output.ends_with(' ') && !output.is_empty() {
678 output.push(' ');
679 }
680 } else if !output.is_empty() && !output.ends_with("\n\n") {
681 if output.ends_with('\n') {
682 output.push('\n');
683 } else {
684 trim_trailing_whitespace(output);
685 output.push_str("\n\n");
686 }
687 }
688
689 let heading_suffix = if ctx.in_list_item || ctx.blockquote_depth > 0 {
690 "\n"
691 } else {
692 "\n\n"
693 };
694
695 match options.heading_style {
696 HeadingStyle::Underlined => {
697 if level == 1 {
698 output.push_str(text);
699 output.push('\n');
700 output.push_str(&"=".repeat(text.len()));
701 output.push_str(heading_suffix);
702 } else if level == 2 {
703 output.push_str(text);
704 output.push('\n');
705 output.push_str(&"-".repeat(text.len()));
706 output.push_str(heading_suffix);
707 } else {
708 output.push_str(&"#".repeat(level));
709 output.push(' ');
710 output.push_str(text);
711 output.push_str(heading_suffix);
712 }
713 }
714 HeadingStyle::Atx => {
715 output.push_str(&"#".repeat(level));
716 output.push(' ');
717 output.push_str(text);
718 output.push_str(heading_suffix);
719 }
720 HeadingStyle::AtxClosed => {
721 output.push_str(&"#".repeat(level));
722 output.push(' ');
723 output.push_str(text);
724 output.push(' ');
725 output.push_str(&"#".repeat(level));
726 output.push_str(heading_suffix);
727 }
728 }
729}
730
/// Collapses line breaks in heading text into single spaces.
///
/// Text without `'\n'` or `'\r'` is returned borrowed and unchanged.
/// Otherwise each run of line breaks, together with the spaces and tabs that
/// follow it, becomes one space. No space is added at the very start, at the
/// very end, or after text that already ends with a space.
fn normalize_heading_text<'a>(text: &'a str) -> Cow<'a, str> {
    if !text.contains(['\n', '\r']) {
        return Cow::Borrowed(text);
    }

    let mut normalized = String::with_capacity(text.len());
    // Set after a line break; the space is only written once real content
    // follows, which keeps trailing breaks from producing a trailing space.
    let mut pending_space = false;

    for ch in text.chars() {
        if matches!(ch, '\n' | '\r') {
            pending_space = pending_space || !normalized.is_empty();
            continue;
        }

        if pending_space {
            if ch == ' ' || ch == '\t' {
                continue;
            }
            if !normalized.ends_with(' ') {
                normalized.push(' ');
            }
            pending_space = false;
        }

        normalized.push(ch);
    }

    Cow::Owned(normalized)
}
761
762fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser) -> DomContext {
763 let mut ctx = DomContext {
764 parent_map: HashMap::new(),
765 children_map: HashMap::new(),
766 root_children: dom.children().to_vec(),
767 node_map: HashMap::new(),
768 };
769
770 for child_handle in dom.children().iter() {
771 record_node_hierarchy(child_handle, None, parser, &mut ctx);
772 }
773
774 ctx
775}
776
777fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
779 for handle in dom_ctx.node_map.values() {
780 if let Some(tl::Node::Tag(tag)) = handle.get(parser) {
781 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
782 if is_block_level_element(tag_name.as_ref()) {
783 let mut current = dom_ctx.parent_map.get(&handle.get_inner()).and_then(|p| *p);
784 while let Some(parent_id) = current {
785 if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
786 if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
787 let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
788 if is_inline_element(parent_name.as_ref()) {
789 return true;
790 }
791 }
792 }
793 current = dom_ctx.parent_map.get(&parent_id).and_then(|p| *p);
794 }
795 }
796 }
797 }
798
799 false
800}
801
802fn repair_with_html5ever(input: &str) -> Option<String> {
804 use html5ever::serialize::{SerializeOpts, serialize};
805 use html5ever::tendril::TendrilSink;
806 use markup5ever_rcdom::{RcDom, SerializableHandle};
807
808 let dom = html5ever::parse_document(RcDom::default(), Default::default())
809 .from_utf8()
810 .read_from(&mut input.as_bytes())
811 .ok()?;
812
813 let mut buf = Vec::with_capacity(input.len());
814 let handle = SerializableHandle::from(dom.document.clone());
815 serialize(&mut buf, &handle, SerializeOpts::default()).ok()?;
816 String::from_utf8(buf).ok()
817}
818
819fn record_node_hierarchy(node_handle: &tl::NodeHandle, parent: Option<u32>, parser: &tl::Parser, ctx: &mut DomContext) {
820 let id = node_handle.get_inner();
821 ctx.parent_map.insert(id, parent);
822 ctx.node_map.insert(id, *node_handle);
823
824 if let Some(node) = node_handle.get(parser) {
825 if let tl::Node::Tag(tag) = node {
826 let children: Vec<_> = tag.children().top().iter().copied().collect();
827 ctx.children_map.insert(id, children.clone());
828 for child in children {
829 record_node_hierarchy(&child, Some(id), parser, ctx);
830 }
831 }
832 }
833}
834
835fn is_hocr_document(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
844 fn check_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
845 if let Some(node) = node_handle.get(parser) {
846 match node {
847 tl::Node::Tag(tag) => {
848 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
849
850 if tag_name == "meta" {
851 if let Some(name_attr) = tag.attributes().get("name") {
852 if let Some(name_bytes) = name_attr {
853 let name_value = name_bytes.as_utf8_str();
854 if name_value == "ocr-system" || name_value == "ocr-capabilities" {
855 return true;
856 }
857 }
858 }
859 }
860
861 if let Some(class_attr) = tag.attributes().get("class") {
862 if let Some(class_bytes) = class_attr {
863 let class_value = class_bytes.as_utf8_str();
864 if class_value.contains("ocr_page")
865 || class_value.contains("ocrx_word")
866 || class_value.contains("ocr_carea")
867 || class_value.contains("ocr_par")
868 || class_value.contains("ocr_line")
869 {
870 return true;
871 }
872 }
873 }
874
875 let children = tag.children();
876 {
877 for child_handle in children.top().iter() {
878 if check_node(child_handle, parser) {
879 return true;
880 }
881 }
882 }
883 false
884 }
885 _ => false,
886 }
887 } else {
888 false
889 }
890 }
891
892 check_node(node_handle, parser)
893}
894
895fn extract_metadata(
906 node_handle: &tl::NodeHandle,
907 parser: &tl::Parser,
908 options: &ConversionOptions,
909) -> BTreeMap<String, String> {
910 let mut metadata = BTreeMap::new();
911
912 fn find_head(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<tl::NodeHandle> {
913 if let Some(node) = node_handle.get(parser) {
914 if let tl::Node::Tag(tag) = node {
915 if tag_name_eq(tag.name().as_utf8_str(), "head") {
916 return Some(*node_handle);
917 }
918 let children = tag.children();
919 {
920 for child_handle in children.top().iter() {
921 if let Some(result) = find_head(child_handle, parser) {
922 return Some(result);
923 }
924 }
925 }
926 }
927 }
928 None
929 }
930
931 let head_handle = match find_head(node_handle, parser) {
932 Some(h) => h,
933 None => return metadata,
934 };
935
936 if let Some(head_node) = head_handle.get(parser) {
937 if let tl::Node::Tag(head_tag) = head_node {
938 let children = head_tag.children();
939 {
940 for child_handle in children.top().iter() {
941 if let Some(child_node) = child_handle.get(parser) {
942 if let tl::Node::Tag(child_tag) = child_node {
943 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
944
945 match tag_name.as_ref() {
946 "title" => {
947 if options.strip_tags.contains(&"title".to_string())
948 || options.preserve_tags.contains(&"title".to_string())
949 {
950 } else {
951 let title_children = child_tag.children();
952 {
953 if let Some(first_child) = title_children.top().iter().next() {
954 if let Some(text_node) = first_child.get(parser) {
955 if let tl::Node::Raw(bytes) = text_node {
956 let title = text::normalize_whitespace(&bytes.as_utf8_str())
957 .trim()
958 .to_string();
959 if !title.is_empty() {
960 metadata.insert("title".to_string(), title);
961 }
962 }
963 }
964 }
965 }
966 }
967 }
968 "base" => {
969 if let Some(href_attr) = child_tag.attributes().get("href") {
970 if let Some(href_bytes) = href_attr {
971 let href = href_bytes.as_utf8_str().to_string();
972 if !href.is_empty() {
973 metadata.insert("base-href".to_string(), href);
974 }
975 }
976 }
977 }
978 "meta" => {
979 if !options.strip_tags.contains(&"meta".to_string())
980 && !options.preserve_tags.contains(&"meta".to_string())
981 {
982 let mut name_attr = None;
983 let mut property_attr = None;
984 let mut http_equiv_attr = None;
985 let mut content_attr = None;
986
987 if let Some(attr) = child_tag.attributes().get("name") {
988 if let Some(bytes) = attr {
989 name_attr = Some(bytes.as_utf8_str().to_string());
990 }
991 }
992 if let Some(attr) = child_tag.attributes().get("property") {
993 if let Some(bytes) = attr {
994 property_attr = Some(bytes.as_utf8_str().to_string());
995 }
996 }
997 if let Some(attr) = child_tag.attributes().get("http-equiv") {
998 if let Some(bytes) = attr {
999 http_equiv_attr = Some(bytes.as_utf8_str().to_string());
1000 }
1001 }
1002 if let Some(attr) = child_tag.attributes().get("content") {
1003 if let Some(bytes) = attr {
1004 content_attr = Some(bytes.as_utf8_str().to_string());
1005 }
1006 }
1007
1008 if let Some(content) = content_attr {
1009 if let Some(name) = name_attr {
1010 let key = format!("meta-{}", name.to_lowercase());
1011 metadata.insert(key, content);
1012 } else if let Some(property) = property_attr {
1013 let key = format!("meta-{}", property.to_lowercase().replace(':', "-"));
1014 metadata.insert(key, content);
1015 } else if let Some(http_equiv) = http_equiv_attr {
1016 let key = format!("meta-{}", http_equiv.to_lowercase());
1017 metadata.insert(key, content);
1018 }
1019 }
1020 }
1021 }
1022 "link" => {
1023 let mut rel_attr = None;
1024 let mut href_attr = None;
1025
1026 if let Some(attr) = child_tag.attributes().get("rel") {
1027 if let Some(bytes) = attr {
1028 rel_attr = Some(bytes.as_utf8_str().to_string());
1029 }
1030 }
1031 if let Some(attr) = child_tag.attributes().get("href") {
1032 if let Some(bytes) = attr {
1033 href_attr = Some(bytes.as_utf8_str().to_string());
1034 }
1035 }
1036
1037 if let (Some(rel), Some(href)) = (rel_attr, href_attr) {
1038 let rel_lower = rel.to_lowercase();
1039 match rel_lower.as_str() {
1040 "canonical" => {
1041 metadata.insert("canonical".to_string(), href);
1042 }
1043 "author" | "license" | "alternate" => {
1044 metadata.insert(format!("link-{}", rel_lower), href);
1045 }
1046 _ => {}
1047 }
1048 }
1049 }
1050 _ => {}
1051 }
1052 }
1053 }
1054 }
1055 }
1056 }
1057 }
1058
1059 metadata
1060}
1061
/// Renders metadata as a YAML front-matter block.
///
/// Returns an empty string for empty metadata. Otherwise the result is
/// `---`, one `key: value` line per entry (in map order), `---`, and a
/// trailing blank line.
///
/// Values are written as double-quoted YAML scalars when they could not be
/// read back as plain scalars: when they contain `:`, `#`, `[`, `]` or a line
/// break, or when they start with a YAML indicator character. Inside quotes,
/// backslashes, double quotes and line breaks are escaped so each entry stays
/// on a single line.
fn format_metadata_frontmatter(metadata: &BTreeMap<String, String>) -> String {
    /// Characters that force quoting wherever they appear in a value.
    const UNSAFE_ANYWHERE: [char; 6] = [':', '#', '[', ']', '\n', '\r'];
    /// Indicator characters that must not start a plain scalar.
    const UNSAFE_LEADING: [char; 12] = ['"', '\'', '{', '}', '&', '*', '!', '|', '>', '%', '@', '`'];

    if metadata.is_empty() {
        return String::new();
    }

    let mut frontmatter = String::from("---\n");
    for (key, value) in metadata {
        frontmatter.push_str(key);
        frontmatter.push_str(": ");

        let needs_quotes = value.contains(UNSAFE_ANYWHERE) || value.starts_with(UNSAFE_LEADING);
        if needs_quotes {
            frontmatter.push('"');
            for ch in value.chars() {
                match ch {
                    '\\' => frontmatter.push_str("\\\\"),
                    '"' => frontmatter.push_str("\\\""),
                    // A raw line break would terminate the entry early.
                    '\n' => frontmatter.push_str("\\n"),
                    '\r' => frontmatter.push_str("\\r"),
                    other => frontmatter.push(other),
                }
            }
            frontmatter.push('"');
        } else {
            frontmatter.push_str(value);
        }
        frontmatter.push('\n');
    }
    frontmatter.push_str("---\n\n");

    frontmatter
}
1082
1083fn is_empty_inline_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
1085 const EMPTY_WHEN_NO_CONTENT_TAGS: &[&str] = &[
1086 "abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u",
1087 ];
1088
1089 if let Some(node) = node_handle.get(parser) {
1090 if let tl::Node::Tag(tag) = node {
1091 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1092 if EMPTY_WHEN_NO_CONTENT_TAGS.contains(&tag_name.as_ref()) {
1093 return get_text_content(node_handle, parser).trim().is_empty();
1094 }
1095 }
1096 }
1097 false
1098}
1099
1100fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1102 let mut text = String::with_capacity(64);
1103 if let Some(node) = node_handle.get(parser) {
1104 match node {
1105 tl::Node::Raw(bytes) => {
1106 text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1107 }
1108 tl::Node::Tag(tag) => {
1109 let children = tag.children();
1110 {
1111 for child_handle in children.top().iter() {
1112 text.push_str(&get_text_content(child_handle, parser));
1113 }
1114 }
1115 }
1116 _ => {}
1117 }
1118 }
1119 text
1120}
1121
1122fn collect_link_label_text(children: &[tl::NodeHandle], parser: &tl::Parser) -> (String, Vec<tl::NodeHandle>, bool) {
1124 let mut text = String::new();
1125 let mut saw_block = false;
1126 let mut block_nodes = Vec::new();
1127 let mut stack: Vec<_> = children.iter().rev().copied().collect();
1128
1129 while let Some(handle) = stack.pop() {
1130 if let Some(node) = handle.get(parser) {
1131 match node {
1132 tl::Node::Raw(bytes) => {
1133 text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1134 }
1135 tl::Node::Tag(tag) => {
1136 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1137 if is_block_level_element(tag_name.as_ref()) {
1138 saw_block = true;
1139 block_nodes.push(handle);
1140 continue;
1141 }
1142
1143 let tag_children = tag.children();
1144 {
1145 let mut child_nodes: Vec<_> = tag_children.top().iter().copied().collect();
1146 child_nodes.reverse();
1147 for child in child_nodes {
1148 stack.push(child);
1149 }
1150 }
1151 }
1152 _ => {}
1153 }
1154 }
1155 }
1156
1157 (text, block_nodes, saw_block)
1158}
1159
1160fn normalize_link_label(label: &str) -> String {
1161 let collapsed = label
1162 .chars()
1163 .map(|ch| if ch == '\n' || ch == '\r' { ' ' } else { ch })
1164 .collect::<String>();
1165 text::normalize_whitespace(&collapsed).trim().to_string()
1166}
1167
1168fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1170 if let Some(node) = node_handle.get(parser) {
1171 if let tl::Node::Tag(tag) = node {
1172 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1173 let mut html = String::with_capacity(256);
1174 html.push('<');
1175 html.push_str(&tag_name);
1176
1177 for (key, value_opt) in tag.attributes().iter() {
1178 html.push(' ');
1179 html.push_str(&key);
1180 if let Some(value) = value_opt {
1181 html.push_str("=\"");
1182 html.push_str(&value);
1183 html.push('"');
1184 }
1185 }
1186
1187 let has_children = !tag.children().top().is_empty();
1188 if !has_children {
1189 html.push_str(" />");
1190 } else {
1191 html.push('>');
1192 let children = tag.children();
1193 {
1194 for child_handle in children.top().iter() {
1195 html.push_str(&serialize_node(child_handle, parser));
1196 }
1197 }
1198 html.push_str("</");
1199 html.push_str(&tag_name);
1200 html.push('>');
1201 }
1202 return html;
1203 }
1204 }
1205 String::new()
1206}
1207
1208#[cfg(feature = "inline-images")]
/// Trims `value` and returns it as an owned string, or `None` when nothing
/// but whitespace remains.
fn non_empty_trimmed(value: &str) -> Option<String> {
    Some(value.trim())
        .filter(|trimmed| !trimmed.is_empty())
        .map(str::to_owned)
}
1217
/// Decodes a base64 `data:` image URI and hands it to the image collector.
///
/// Sources that do not start with `data:` are ignored silently. Every other
/// problem (missing separator, empty payload, missing or non-image MIME type,
/// missing base64 marker, undecodable or oversized payload) is reported to the
/// collector via `warn_skip` and the image is skipped.
///
/// The description comes from `alt`, falling back to `title`. The filename
/// candidate comes from the `data-filename`, `filename` or `data-name`
/// attribute, falling back to a `name=` / `filename=` parameter of the URI.
#[cfg(feature = "inline-images")]
fn handle_inline_data_image(
    collector_ref: &InlineCollectorHandle,
    src: &str,
    alt: &str,
    title: Option<&str>,
    attributes: BTreeMap<String, String>,
) {
    use base64::{Engine as _, engine::general_purpose::STANDARD};

    let data_uri = src.trim();
    if !data_uri.starts_with("data:") {
        return;
    }

    let mut collector = collector_ref.borrow_mut();
    let index = collector.next_index();

    let Some((meta, payload)) = data_uri.split_once(',') else {
        collector.warn_skip(index, "missing data URI separator");
        return;
    };

    let payload = payload.trim();
    if payload.is_empty() {
        collector.warn_skip(index, "empty data URI payload");
        return;
    }

    let Some(header) = meta.strip_prefix("data:") else {
        collector.warn_skip(index, "invalid data URI scheme");
        return;
    };
    if header.is_empty() {
        collector.warn_skip(index, "missing MIME type");
        return;
    }

    // The header is `<mime>[;param]*`; the MIME type always comes first.
    let mut params = header.split(';');
    let mime = params.next().unwrap_or("");
    let Some((top_level, subtype)) = mime.split_once('/') else {
        collector.warn_skip(index, "missing MIME subtype");
        return;
    };

    if !top_level.eq_ignore_ascii_case("image") {
        collector.warn_skip(index, format!("unsupported MIME type {mime}"));
        return;
    }

    let subtype = subtype.trim().to_ascii_lowercase();
    if subtype.is_empty() {
        collector.warn_skip(index, "missing MIME subtype");
        return;
    }

    let mut is_base64 = false;
    let mut inline_name: Option<String> = None;
    for param in params {
        if param.eq_ignore_ascii_case("base64") {
            is_base64 = true;
            continue;
        }
        let named = param
            .strip_prefix("name=")
            .or_else(|| param.strip_prefix("filename="));
        if let Some(value) = named {
            inline_name = non_empty_trimmed(value.trim_matches('"'));
        }
    }

    if !is_base64 {
        collector.warn_skip(index, "missing base64 encoding marker");
        return;
    }

    let Ok(decoded) = STANDARD.decode(payload) else {
        collector.warn_skip(index, "invalid base64 payload");
        return;
    };

    if decoded.is_empty() {
        collector.warn_skip(index, "empty base64 payload");
        return;
    }

    let max_size = collector.max_decoded_size();
    if decoded.len() as u64 > max_size {
        collector.warn_skip(
            index,
            format!(
                "decoded payload ({} bytes) exceeds configured max ({})",
                decoded.len(),
                max_size
            ),
        );
        return;
    }

    let format = match subtype.as_str() {
        "png" => InlineImageFormat::Png,
        "jpeg" | "jpg" => InlineImageFormat::Jpeg,
        "gif" => InlineImageFormat::Gif,
        "bmp" => InlineImageFormat::Bmp,
        "webp" => InlineImageFormat::Webp,
        "svg+xml" => InlineImageFormat::Svg,
        other => InlineImageFormat::Other(other.to_string()),
    };

    let description = non_empty_trimmed(alt).or_else(|| title.and_then(non_empty_trimmed));

    // Explicit element attributes take precedence over the name in the URI.
    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|key| attributes.get(*key).cloned())
        .or(inline_name);

    let dimensions = collector.infer_dimensions(index, &decoded, &format);

    let image = collector.build_image(
        decoded,
        format,
        filename_candidate,
        description,
        dimensions,
        InlineImageSource::ImgDataUri,
        attributes,
    );

    collector.push_image(index, image);
}
1354
/// Serializes an inline `<svg>` element and hands it to the image collector.
///
/// Does nothing unless the collector has SVG capture enabled. Elements that
/// cannot be serialized or whose serialized size exceeds the collector's
/// limit are reported via `warn_skip` and skipped.
///
/// The description comes from the `aria-label` attribute, falling back to
/// `title_opt`. The filename candidate comes from the `data-filename`,
/// `filename` or `data-name` attribute.
#[cfg(feature = "inline-images")]
fn handle_inline_svg(
    collector_ref: &InlineCollectorHandle,
    node_handle: &tl::NodeHandle,
    parser: &tl::Parser,
    title_opt: Option<String>,
    attributes: BTreeMap<String, String>,
) {
    // The shared borrow ends with this condition, before the mutable borrow.
    if !collector_ref.borrow().capture_svg() {
        return;
    }

    let mut collector = collector_ref.borrow_mut();
    let index = collector.next_index();

    let svg_bytes = serialize_element(node_handle, parser).into_bytes();
    if svg_bytes.is_empty() {
        collector.warn_skip(index, "unable to serialize SVG element");
        return;
    }

    let max_size = collector.max_decoded_size();
    if svg_bytes.len() as u64 > max_size {
        collector.warn_skip(
            index,
            format!(
                "serialized SVG payload ({} bytes) exceeds configured max ({})",
                svg_bytes.len(),
                max_size
            ),
        );
        return;
    }

    let description = attributes
        .get("aria-label")
        .and_then(|label| non_empty_trimmed(label))
        .or_else(|| title_opt.as_deref().and_then(non_empty_trimmed));

    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|key| attributes.get(*key).cloned());

    let image = collector.build_image(
        svg_bytes,
        InlineImageFormat::Svg,
        filename_candidate,
        description,
        None,
        InlineImageSource::SvgElement,
        attributes,
    );

    collector.push_image(index, image);
}
1416
1417fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1419 if let Some(node) = node_handle.get(parser) {
1420 match node {
1421 tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
1422 tl::Node::Tag(_) => serialize_element(node_handle, parser),
1423 _ => String::new(),
1424 }
1425 } else {
1426 String::new()
1427 }
1428}
1429
1430pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
1432 convert_html_impl(html, options, None, None)
1433}
1434
/// Converts `html` to Markdown while handing `collector` to the conversion so
/// inline images can be recorded; no metadata collector is attached.
#[cfg(feature = "inline-images")]
pub(crate) fn convert_html_with_inline_collector(
    html: &str,
    options: &ConversionOptions,
    collector: InlineCollectorHandle,
) -> Result<String> {
    let metadata_collector = None;
    convert_html_impl(html, options, Some(collector), metadata_collector)
}
1443
/// Converts `html` to Markdown while handing `metadata_collector` to the
/// conversion; no inline-image collector is attached.
#[cfg(feature = "metadata")]
pub(crate) fn convert_html_with_metadata(
    html: &str,
    options: &ConversionOptions,
    metadata_collector: crate::metadata::MetadataCollectorHandle,
) -> Result<String> {
    let inline_collector = None;
    convert_html_impl(html, options, inline_collector, Some(metadata_collector))
}
1452
/// Shared implementation behind the public conversion entry points.
///
/// Steps, in order:
/// 1. Run `preprocess_html` on the input and parse the result with `tl`.
/// 2. If `has_inline_block_misnest` reports a problem and `repair_with_html5ever`
///    produces repaired markup, preprocess and parse again from that markup.
/// 3. If any root child is an hOCR document, emit optional OCR frontmatter plus
///    the hOCR-derived Markdown and return early.
/// 4. Otherwise optionally emit metadata frontmatter, feed the metadata
///    collector (when the `metadata` feature is on and one was supplied), walk
///    every root child with `walk_node`, and normalize trailing whitespace.
///
/// # Errors
///
/// Returns `ConversionError::ParseError` when `tl::parse_owned` fails.
#[cfg_attr(not(feature = "inline-images"), allow(unused_variables))]
#[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
fn convert_html_impl(
    html: &str,
    options: &ConversionOptions,
    inline_collector: Option<InlineCollectorHandle>,
    #[cfg(feature = "metadata")] metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
    #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
) -> Result<String> {
    let mut preprocessed = preprocess_html(html).into_owned();
    let mut preprocessed_len = preprocessed.len();

    let parser_options = tl::ParserOptions::default();
    // NOTE(review): `tl::parse_owned` is called inside `unsafe`; the invariant that
    // makes this sound is not stated in this file — confirm against the `tl` docs.
    // A clone is passed because `preprocessed` is still needed for the repair step below.
    let mut dom_guard = unsafe {
        tl::parse_owned(preprocessed.clone(), parser_options)
            .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
    };
    let mut dom_ref = dom_guard.get_ref();
    let mut parser = dom_ref.parser();
    let mut dom_ctx = build_dom_context(dom_ref, parser);
    // Output buffer is sized from the preprocessed input length.
    let mut output = String::with_capacity(preprocessed_len);

    // Mis-nested markup: try the html5ever-based repair and, if it yields new
    // markup, rebuild every piece of parse state from it.
    if has_inline_block_misnest(&dom_ctx, parser) {
        if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
            preprocessed = preprocess_html(&repaired_html).into_owned();
            preprocessed_len = preprocessed.len();
            // NOTE(review): same unsafe contract as the first parse above.
            dom_guard = unsafe {
                tl::parse_owned(preprocessed.clone(), parser_options)
                    .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
            };
            dom_ref = dom_guard.get_ref();
            parser = dom_ref.parser();
            dom_ctx = build_dom_context(dom_ref, parser);
            output = String::with_capacity(preprocessed_len);
        }
    }

    // The document is treated as hOCR as soon as one root child qualifies.
    let mut is_hocr = false;
    for child_handle in dom_ref.children().iter() {
        if is_hocr_document(child_handle, parser) {
            is_hocr = true;
            break;
        }
    }

    // Regular documents: emit frontmatter from the first root child that yields
    // non-empty metadata.
    if options.extract_metadata && !options.convert_as_inline && !is_hocr {
        for child_handle in dom_ref.children().iter() {
            let metadata = extract_metadata(child_handle, parser, options);
            if !metadata.is_empty() {
                let metadata_frontmatter = format_metadata_frontmatter(&metadata);
                output.push_str(&metadata_frontmatter);
                break;
            }
        }
    }

    // hOCR documents take a dedicated path and return before the generic walk.
    if is_hocr {
        use crate::hocr::{convert_to_markdown_with_options as convert_hocr_to_markdown, extract_hocr_document};

        let (elements, metadata) = extract_hocr_document(dom_ref, options.debug);

        if options.extract_metadata && !options.convert_as_inline {
            // Only the OCR fields that are actually present are written.
            let mut metadata_map = BTreeMap::new();
            if let Some(system) = metadata.ocr_system {
                metadata_map.insert("ocr-system".to_string(), system);
            }
            if !metadata.ocr_capabilities.is_empty() {
                metadata_map.insert("ocr-capabilities".to_string(), metadata.ocr_capabilities.join(", "));
            }
            if let Some(pages) = metadata.ocr_number_of_pages {
                metadata_map.insert("ocr-number-of-pages".to_string(), pages.to_string());
            }
            if !metadata.ocr_langs.is_empty() {
                metadata_map.insert("ocr-langs".to_string(), metadata.ocr_langs.join(", "));
            }
            if !metadata.ocr_scripts.is_empty() {
                metadata_map.insert("ocr-scripts".to_string(), metadata.ocr_scripts.join(", "));
            }

            if !metadata_map.is_empty() {
                output.push_str(&format_metadata_frontmatter(&metadata_map));
            }
        }

        let mut markdown = convert_hocr_to_markdown(&elements, true, options.hocr_spatial_tables);

        // Blank hOCR body: return whatever frontmatter was produced, if any.
        if markdown.trim().is_empty() {
            return Ok(output);
        }

        // Drop trailing whitespace in place, then terminate with one newline.
        markdown.truncate(markdown.trim_end().len());
        output.push_str(&markdown);
        output.push('\n');

        return Ok(output);
    }

    // Hand the first non-empty head metadata to the collector.
    #[cfg(feature = "metadata")]
    if let Some(ref collector) = metadata_collector {
        // `is_hocr` is always false here because of the early return above.
        if !is_hocr {
            for child_handle in dom_ref.children().iter() {
                let head_meta = extract_metadata(child_handle, parser, options);
                if !head_meta.is_empty() {
                    collector.borrow_mut().set_head_metadata(head_meta);
                    break;
                }
            }
        }
    }

    // Record `lang` / `dir` attributes found on root-level <html> or <body> tags.
    #[cfg(feature = "metadata")]
    if let Some(ref collector) = metadata_collector {
        for child_handle in dom_ref.children().iter() {
            if let Some(tl::Node::Tag(tag)) = child_handle.get(parser) {
                let tag_name = tag.name().as_utf8_str();
                if tag_name == "html" || tag_name == "body" {
                    if let Some(lang) = tag.attributes().get("lang") {
                        if let Some(lang_bytes) = lang {
                            let lang_str = lang_bytes.as_utf8_str();
                            collector.borrow_mut().set_language(lang_str.to_string());
                        }
                    }
                    if let Some(dir) = tag.attributes().get("dir") {
                        if let Some(dir_bytes) = dir {
                            let dir_str = dir_bytes.as_utf8_str();
                            collector.borrow_mut().set_text_direction(dir_str.to_string());
                        }
                    }
                }
            }
        }
    }

    // Initial traversal state: everything off/zero except `convert_as_inline`,
    // which mirrors the option, plus the optional collectors.
    let ctx = Context {
        in_code: false,
        list_counter: 0,
        in_ordered_list: false,
        last_was_dt: false,
        blockquote_depth: 0,
        in_table_cell: false,
        convert_as_inline: options.convert_as_inline,
        inline_depth: 0,
        in_list_item: false,
        list_depth: 0,
        ul_depth: 0,
        in_list: false,
        loose_list: false,
        prev_item_had_blocks: false,
        in_heading: false,
        heading_tag: None,
        in_paragraph: false,
        in_ruby: false,
        in_strong: false,
        #[cfg(feature = "inline-images")]
        inline_collector: inline_collector.clone(),
        #[cfg(feature = "metadata")]
        metadata_collector: metadata_collector.clone(),
    };

    for child_handle in dom_ref.children().iter() {
        walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
    }

    // Clean per-line trailing whitespace, then collapse trailing newlines to
    // exactly one; output with no content at all becomes the empty string.
    trim_line_end_whitespace(&mut output);
    let trimmed = output.trim_end_matches('\n');
    if trimmed.is_empty() {
        Ok(String::new())
    } else {
        Ok(format!("{}\n", trimmed))
    }
}
1624
1625fn preprocess_html(input: &str) -> Cow<'_, str> {
1626 const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
1627 const TAGS: [&[u8]; 2] = [b"script", b"style"];
1628 const SVG: &[u8] = b"svg";
1629 const DOCTYPE: &[u8] = b"doctype";
1630 const EMPTY_COMMENT: &[u8] = b"<!---->";
1631
1632 let bytes = input.as_bytes();
1633 let len = bytes.len();
1634 if len == 0 {
1635 return Cow::Borrowed(input);
1636 }
1637
1638 let mut idx = 0;
1639 let mut last = 0;
1640 let mut output: Option<String> = None;
1641 let mut svg_depth = 0usize;
1642
1643 while idx < len {
1644 if bytes[idx] == b'<' {
1645 if bytes[idx..].starts_with(EMPTY_COMMENT) {
1646 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1647 out.push_str(&input[last..idx]);
1648 out.push_str("<!-- -->");
1649 idx += EMPTY_COMMENT.len();
1650 last = idx;
1651 continue;
1652 }
1653
1654 let mut replaced = false;
1655 for (pattern, replacement) in &SELF_CLOSING {
1656 if bytes[idx..].starts_with(pattern) {
1657 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1658 out.push_str(&input[last..idx]);
1659 out.push_str(replacement);
1660 idx += pattern.len();
1661 last = idx;
1662 replaced = true;
1663 break;
1664 }
1665 }
1666 if replaced {
1667 continue;
1668 }
1669
1670 if matches_tag_start(bytes, idx + 1, SVG) {
1671 if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
1672 svg_depth += 1;
1673 idx = open_end;
1674 continue;
1675 }
1676 } else if matches_end_tag_start(bytes, idx + 1, SVG) {
1677 if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
1678 if svg_depth > 0 {
1679 svg_depth = svg_depth.saturating_sub(1);
1680 }
1681 idx = close_end;
1682 continue;
1683 }
1684 }
1685
1686 if svg_depth == 0 {
1687 let mut handled = false;
1688 for tag in TAGS {
1689 if matches_tag_start(bytes, idx + 1, tag) {
1690 if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
1691 let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
1692 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1693 out.push_str(&input[last..idx]);
1694 out.push_str(&input[idx..open_end]);
1695 out.push_str("</");
1696 out.push_str(str::from_utf8(tag).unwrap());
1697 out.push('>');
1698
1699 last = remove_end;
1700 idx = remove_end;
1701 handled = true;
1702 }
1703 }
1704
1705 if handled {
1706 break;
1707 }
1708 }
1709
1710 if handled {
1711 continue;
1712 }
1713
1714 if idx + 2 < len && bytes[idx + 1] == b'!' {
1715 let mut cursor = idx + 2;
1716 while cursor < len && bytes[cursor].is_ascii_whitespace() {
1717 cursor += 1;
1718 }
1719
1720 if cursor + DOCTYPE.len() <= len
1721 && bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
1722 {
1723 if let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len()) {
1724 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1725 out.push_str(&input[last..idx]);
1726 last = end;
1727 idx = end;
1728 continue;
1729 }
1730 }
1731 }
1732 }
1733
1734 let is_valid_tag = if idx + 1 < len {
1735 match bytes[idx + 1] {
1736 b'!' => {
1737 idx + 2 < len
1738 && (bytes[idx + 2] == b'-'
1739 || bytes[idx + 2].is_ascii_alphabetic()
1740 || bytes[idx + 2].is_ascii_uppercase())
1741 }
1742 b'/' => {
1743 idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
1744 }
1745 b'?' => true,
1746 c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
1747 _ => false,
1748 }
1749 } else {
1750 false
1751 };
1752
1753 if !is_valid_tag {
1754 let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1755 out.push_str(&input[last..idx]);
1756 out.push_str("<");
1757 idx += 1;
1758 last = idx;
1759 continue;
1760 }
1761 }
1762
1763 idx += 1;
1764 }
1765
1766 if let Some(mut out) = output {
1767 if last < len {
1768 out.push_str(&input[last..]);
1769 }
1770 Cow::Owned(out)
1771 } else {
1772 Cow::Borrowed(input)
1773 }
1774}
1775
/// Rewrites the exact byte sequences `<br/>`, `<hr/>` and `<img/>` to their
/// non-self-closing forms (`<br>`, `<hr>`, `<img>`).
///
/// Returns the input borrowed when none of the patterns occurs, otherwise an
/// owned copy with every occurrence replaced.
#[cfg(test)]
fn normalize_self_closing_tags(input: &str) -> Cow<'_, str> {
    const REPLACEMENTS: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];

    let bytes = input.as_bytes();
    // Allocated on the first match only, so untouched input stays borrowed.
    let mut rewritten: Option<String> = None;
    let mut copied_up_to = 0;
    let mut cursor = 0;

    while cursor < bytes.len() {
        let rest = &bytes[cursor..];
        match REPLACEMENTS.iter().find(|(pattern, _)| rest.starts_with(pattern)) {
            Some((pattern, replacement)) => {
                let out = rewritten.get_or_insert_with(|| String::with_capacity(input.len()));
                out.push_str(&input[copied_up_to..cursor]);
                out.push_str(replacement);
                cursor += pattern.len();
                copied_up_to = cursor;
            }
            None => cursor += 1,
        }
    }

    match rewritten {
        Some(mut out) => {
            out.push_str(&input[copied_up_to..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
1816
/// Escapes every `<` that cannot begin markup as `&lt;`.
///
/// A `<` is kept as-is when it is followed by:
/// * `!` and then `-` or an ASCII letter (comment / declaration),
/// * `/` and then an ASCII letter (closing tag),
/// * `?` (processing instruction),
/// * an ASCII letter (opening tag).
///
/// Any other `<`, including one at the very end of the input, is replaced.
/// Returns the input borrowed when no replacement was necessary.
#[cfg(test)]
fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
    let bytes = input.as_bytes();
    let mut output: Option<String> = None;
    // Start of the input that has not been copied into `output` yet.
    let mut last = 0;

    for (idx, &byte) in bytes.iter().enumerate() {
        if byte != b'<' {
            continue;
        }

        let starts_markup = match bytes.get(idx + 1).copied() {
            Some(b'!') => matches!(
                bytes.get(idx + 2).copied(),
                Some(next) if next == b'-' || next.is_ascii_alphabetic()
            ),
            Some(b'/') => matches!(
                bytes.get(idx + 2).copied(),
                Some(next) if next.is_ascii_alphabetic()
            ),
            Some(b'?') => true,
            Some(next) => next.is_ascii_alphabetic(),
            None => false,
        };

        if !starts_markup {
            // "&lt;" is 4 bytes, hence the slightly larger initial capacity.
            let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
            out.push_str(&input[last..idx]);
            out.push_str("&lt;");
            last = idx + 1;
        }
    }

    match output {
        Some(mut out) => {
            out.push_str(&input[last..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
1881
/// Returns `raw` with ASCII letters lowercased.
///
/// The input is passed through untouched (no allocation for a borrowed value)
/// when it contains no ASCII uppercase byte.
fn normalized_tag_name<'a>(raw: Cow<'a, str>) -> Cow<'a, str> {
    let needs_lowercasing = raw.bytes().any(|byte| byte.is_ascii_uppercase());
    if !needs_lowercasing {
        return raw;
    }

    let mut lowered = raw.into_owned();
    lowered.make_ascii_lowercase();
    Cow::Owned(lowered)
}
1891
/// Compares a tag name with `needle`, ignoring ASCII case.
fn tag_name_eq(name: Cow<'_, str>, needle: &str) -> bool {
    let candidate: &str = name.as_ref();
    needle.eq_ignore_ascii_case(candidate)
}
1895
1896fn should_drop_for_preprocessing(
1897 node_handle: &tl::NodeHandle,
1898 tag_name: &str,
1899 tag: &tl::HTMLTag,
1900 parser: &tl::Parser,
1901 dom_ctx: &DomContext,
1902 options: &ConversionOptions,
1903) -> bool {
1904 if !options.preprocessing.enabled {
1905 return false;
1906 }
1907
1908 if options.preprocessing.remove_navigation {
1909 let has_nav_hint = element_has_navigation_hint(tag);
1910
1911 if tag_name == "nav" {
1912 return true;
1913 }
1914
1915 if tag_name == "header" {
1916 let inside_semantic_content = has_semantic_content_ancestor(node_handle, parser, dom_ctx);
1917 if !inside_semantic_content {
1918 return true;
1919 }
1920 if has_nav_hint {
1921 return true;
1922 }
1923 } else if tag_name == "footer" || tag_name == "aside" {
1924 if has_nav_hint {
1925 return true;
1926 }
1927 } else if has_nav_hint && !matches!(tag_name, "main" | "article" | "html" | "body" | "head") {
1928 return true;
1929 }
1930 }
1931
1932 if options.preprocessing.remove_forms {
1933 if tag_name == "form" {
1934 let preserves_form = options.preserve_tags.iter().any(|t| t == "form");
1935 if !preserves_form {
1936 return true;
1937 }
1938 } else if matches!(
1939 tag_name,
1940 "button" | "select" | "textarea" | "label" | "fieldset" | "legend"
1941 ) {
1942 return true;
1943 }
1944 }
1945
1946 false
1947}
1948
1949fn has_semantic_content_ancestor(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
1950 let mut current_id = node_handle.get_inner();
1951 while let Some(parent_id) = dom_ctx.parent_map.get(¤t_id).copied().flatten() {
1952 if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
1953 if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
1954 let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
1955 if matches!(parent_name.as_ref(), "main" | "article" | "section") {
1956 return true;
1957 }
1958 if tag_has_main_semantics(parent_tag) {
1959 return true;
1960 }
1961 }
1962 }
1963 current_id = parent_id;
1964 }
1965 false
1966}
1967
1968fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
1969 if let Some(role_attr) = tag.attributes().get("role") {
1970 if let Some(role) = role_attr {
1971 let lowered = role.as_utf8_str().to_ascii_lowercase();
1972 if matches!(lowered.as_str(), "main" | "article" | "document" | "region") {
1973 return true;
1974 }
1975 }
1976 }
1977
1978 if let Some(class_attr) = tag.attributes().get("class") {
1979 if let Some(class_bytes) = class_attr {
1980 let class_value = class_bytes.as_utf8_str().to_ascii_lowercase();
1981 const MAIN_CLASS_HINTS: &[&str] = &[
1982 "mw-body",
1983 "mw-parser-output",
1984 "content-body",
1985 "content-container",
1986 "article-body",
1987 "article-content",
1988 "main-content",
1989 "page-content",
1990 "entry-content",
1991 "post-content",
1992 "document-body",
1993 ];
1994 if MAIN_CLASS_HINTS.iter().any(|hint| class_value.contains(hint)) {
1995 return true;
1996 }
1997 }
1998 }
1999
2000 false
2001}
2002
2003fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
2004 if attribute_matches_any(tag, "role", &["navigation", "menubar", "tablist", "toolbar"]) {
2005 return true;
2006 }
2007
2008 if attribute_contains_any(
2009 tag,
2010 "aria-label",
2011 &["navigation", "menu", "contents", "table of contents", "toc"],
2012 ) {
2013 return true;
2014 }
2015
2016 const NAV_KEYWORDS: &[&str] = &[
2017 "nav",
2018 "navigation",
2019 "navbar",
2020 "breadcrumbs",
2021 "breadcrumb",
2022 "toc",
2023 "sidebar",
2024 "sidenav",
2025 "menu",
2026 "menubar",
2027 "mainmenu",
2028 "subnav",
2029 "tabs",
2030 "tablist",
2031 "toolbar",
2032 "pager",
2033 "pagination",
2034 "skipnav",
2035 "skip-link",
2036 "skiplinks",
2037 "site-nav",
2038 "site-menu",
2039 "site-header",
2040 "site-footer",
2041 "topbar",
2042 "bottombar",
2043 "masthead",
2044 "vector-nav",
2045 "vector-header",
2046 "vector-footer",
2047 ];
2048
2049 attribute_matches_any(tag, "class", NAV_KEYWORDS) || attribute_matches_any(tag, "id", NAV_KEYWORDS)
2050}
2051
2052fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2053 let Some(attr_value) = tag.attributes().get(attr) else {
2054 return false;
2055 };
2056 let Some(value) = attr_value else {
2057 return false;
2058 };
2059 let raw = value.as_utf8_str();
2060 raw.split_whitespace()
2061 .map(|token| {
2062 token
2063 .chars()
2064 .map(|c| match c {
2065 '_' | ':' | '.' | '/' => '-',
2066 _ => c,
2067 })
2068 .collect::<String>()
2069 .to_ascii_lowercase()
2070 })
2071 .filter(|token| !token.is_empty())
2072 .any(|token| keywords.iter().any(|kw| token == *kw))
2073}
2074
2075fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2076 let Some(attr_value) = tag.attributes().get(attr) else {
2077 return false;
2078 };
2079 let Some(value) = attr_value else {
2080 return false;
2081 };
2082 let lower = value.as_utf8_str().to_ascii_lowercase();
2083 keywords.iter().any(|kw| lower.contains(*kw))
2084}
2085
2086fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
2090 let mut html = String::new();
2091 serialize_node_to_html(handle, parser, &mut html);
2092 html
2093}
2094
2095fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
2097 match handle.get(parser) {
2098 Some(tl::Node::Tag(tag)) => {
2099 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2100
2101 output.push('<');
2102 output.push_str(&tag_name);
2103
2104 for (key, value) in tag.attributes().iter() {
2105 output.push(' ');
2106 output.push_str(&key);
2107 if let Some(val) = value {
2108 output.push_str("=\"");
2109 output.push_str(&val);
2110 output.push('"');
2111 }
2112 }
2113
2114 output.push('>');
2115
2116 let children = tag.children();
2117 for child_handle in children.top().iter() {
2118 serialize_node_to_html(child_handle, parser, output);
2119 }
2120
2121 if !matches!(
2122 tag_name.as_ref(),
2123 "br" | "hr"
2124 | "img"
2125 | "input"
2126 | "meta"
2127 | "link"
2128 | "area"
2129 | "base"
2130 | "col"
2131 | "embed"
2132 | "param"
2133 | "source"
2134 | "track"
2135 | "wbr"
2136 ) {
2137 output.push_str("</");
2138 output.push_str(&tag_name);
2139 output.push('>');
2140 }
2141 }
2142 Some(tl::Node::Raw(bytes)) => {
2143 if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
2144 output.push_str(text);
2145 }
2146 }
2147 _ => {}
2148 }
2149}
2150
/// Removes the contents of `<script>` and `<style>` elements that occur
/// outside of `<svg>`.
///
/// For each such element the opening tag is kept and immediately followed by
/// a synthetic closing tag; everything up to and including the matching
/// closing tag (or the end of the input when there is none) is dropped.
/// Returns the input borrowed when nothing was removed.
#[cfg(test)]
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
    const TAGS: [&[u8]; 2] = [b"script", b"style"];
    const SVG: &[u8] = b"svg";

    let bytes = input.as_bytes();
    let len = bytes.len();
    // `idx` is the scan position; `last` is the start of input not yet copied
    // into `output`, which is only allocated once a section is stripped.
    let mut idx = 0;
    let mut last = 0;
    let mut output: Option<String> = None;
    let mut svg_depth = 0usize;

    while idx < len {
        if bytes[idx] == b'<' {
            // Track <svg> nesting; the svg tags themselves are left in place.
            if matches_tag_start(bytes, idx + 1, SVG) {
                if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
                    svg_depth += 1;
                    idx = open_end;
                    continue;
                }
            } else if matches_end_tag_start(bytes, idx + 1, SVG) {
                if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
                    if svg_depth > 0 {
                        svg_depth = svg_depth.saturating_sub(1);
                    }
                    idx = close_end;
                    continue;
                }
            }

            // Stripping only applies outside of any <svg> element.
            if svg_depth == 0 {
                let mut handled = false;
                for tag in TAGS {
                    if matches_tag_start(bytes, idx + 1, tag) {
                        if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
                            // Without a closing tag, everything to the end is removed.
                            let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
                            let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
                            // Pending text, the opening tag, then a synthetic closing tag.
                            out.push_str(&input[last..idx]);
                            out.push_str(&input[idx..open_end]);
                            out.push_str("</");
                            out.push_str(str::from_utf8(tag).unwrap());
                            out.push('>');

                            last = remove_end;
                            idx = remove_end;
                            handled = true;
                        }
                    }

                    if handled {
                        break;
                    }
                }

                if handled {
                    continue;
                }
            }
        }

        idx += 1;
    }

    if let Some(mut out) = output {
        // Copy whatever follows the last stripped section.
        if last < input.len() {
            out.push_str(&input[last..]);
        }
        Cow::Owned(out)
    } else {
        Cow::Borrowed(input)
    }
}
2223
/// Reports whether `bytes[start..]` begins with the tag name `tag`
/// (ASCII case-insensitive) as a complete name.
///
/// The name counts as complete when it is followed by `>`, `/`, a space, tab,
/// line feed or carriage return, or by the end of the input. `start` is
/// expected to point just past the `<` (or `</`). Returns `false` when
/// `start` is at or beyond the end of `bytes`.
fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }

    let name_end = start + tag.len();
    let Some(candidate) = bytes.get(start..name_end) else {
        // Not enough bytes left to hold the whole tag name.
        return false;
    };
    if !candidate.eq_ignore_ascii_case(tag) {
        return false;
    }

    // The byte after the name must terminate it, so `<scripts` is not `<script`.
    match bytes.get(name_end) {
        None => true,
        Some(&next) => matches!(next, b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r'),
    }
}
2245
/// Finds the end of a tag, scanning from `idx`.
///
/// Returns the index just past the first `>` that is not inside a quoted
/// section, or `None` when no such `>` exists. A quoted section starts at a
/// `"` or `'` and ends at the next occurrence of the same quote character.
fn find_tag_end(bytes: &[u8], idx: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;

    for (pos, &byte) in bytes.iter().enumerate().skip(idx) {
        match (byte, open_quote) {
            // A quote character outside a quoted section opens one.
            (b'"' | b'\'', None) => open_quote = Some(byte),
            // Only the matching quote character closes the section.
            (b'"' | b'\'', Some(quote)) if quote == byte => open_quote = None,
            (b'>', None) => return Some(pos + 1),
            _ => {}
        }
    }

    None
}
2269
2270fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
2271 let len = bytes.len();
2272 let mut depth = 1usize;
2273
2274 while idx < len {
2275 if bytes[idx] == b'<' {
2276 if matches_tag_start(bytes, idx + 1, tag) {
2277 if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
2278 depth += 1;
2279 idx = next;
2280 continue;
2281 }
2282 } else if matches_end_tag_start(bytes, idx + 1, tag) {
2283 if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
2284 depth -= 1;
2285 if depth == 0 {
2286 return Some(close);
2287 }
2288 idx = close;
2289 continue;
2290 }
2291 }
2292 }
2293
2294 idx += 1;
2295 }
2296
2297 None
2298}
2299
2300fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
2301 if start >= bytes.len() || bytes[start] != b'/' {
2302 return false;
2303 }
2304 matches_tag_start(bytes, start + 1, tag)
2305}
2306
/// Reports whether `tag_name` is one of the tags this converter treats as
/// inline. The comparison is exact, so callers pass a lowercase name.
fn is_inline_element(tag_name: &str) -> bool {
    const INLINE_TAGS: &[&str] = &[
        // Text-level semantics.
        "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em", "i", "kbd", "mark", "q",
        "rp", "rt", "ruby", "s", "samp", "small", "span", "strong", "sub", "sup", "time", "u", "var", "wbr",
        // Edits.
        "del", "ins",
        // Embedded content.
        "img", "map", "area", "audio", "video", "picture", "source", "track", "embed", "object", "param",
        // Form controls.
        "input", "label", "button", "select", "textarea", "output", "progress", "meter",
    ];

    INLINE_TAGS.iter().any(|candidate| *candidate == tag_name)
}
2362
2363fn is_block_level_element(tag_name: &str) -> bool {
2365 !is_inline_element(tag_name)
2366 && matches!(
2367 tag_name,
2368 "address"
2369 | "article"
2370 | "aside"
2371 | "blockquote"
2372 | "canvas"
2373 | "dd"
2374 | "div"
2375 | "dl"
2376 | "dt"
2377 | "fieldset"
2378 | "figcaption"
2379 | "figure"
2380 | "footer"
2381 | "form"
2382 | "h1"
2383 | "h2"
2384 | "h3"
2385 | "h4"
2386 | "h5"
2387 | "h6"
2388 | "header"
2389 | "hr"
2390 | "li"
2391 | "main"
2392 | "nav"
2393 | "ol"
2394 | "p"
2395 | "pre"
2396 | "section"
2397 | "table"
2398 | "tfoot"
2399 | "ul"
2400 )
2401}
2402
2403fn get_next_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2404 let id = node_handle.get_inner();
2405 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2406
2407 let siblings = if let Some(parent_id) = parent {
2408 dom_ctx.children_map.get(&parent_id)?
2409 } else {
2410 &dom_ctx.root_children
2411 };
2412
2413 let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2414
2415 for sibling in siblings.iter().skip(position + 1) {
2416 if let Some(node) = sibling.get(parser) {
2417 match node {
2418 tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2419 tl::Node::Raw(raw) => {
2420 if !raw.as_utf8_str().trim().is_empty() {
2421 return None;
2422 }
2423 }
2424 _ => {}
2425 }
2426 }
2427 }
2428
2429 None
2430}
2431
2432fn get_previous_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2433 let id = node_handle.get_inner();
2434 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2435
2436 let siblings = if let Some(parent_id) = parent {
2437 dom_ctx.children_map.get(&parent_id)?
2438 } else {
2439 &dom_ctx.root_children
2440 };
2441
2442 let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2443
2444 for sibling in siblings.iter().take(position).rev() {
2445 if let Some(node) = sibling.get(parser) {
2446 match node {
2447 tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2448 tl::Node::Raw(raw) => {
2449 if !raw.as_utf8_str().trim().is_empty() {
2450 return None;
2451 }
2452 }
2453 _ => {}
2454 }
2455 }
2456 }
2457
2458 None
2459}
2460
2461fn previous_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2462 let id = node_handle.get_inner();
2463 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2464
2465 let siblings = if let Some(parent_id) = parent {
2466 if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2467 children
2468 } else {
2469 return false;
2470 }
2471 } else {
2472 &dom_ctx.root_children
2473 };
2474
2475 let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2476 return false;
2477 };
2478
2479 for sibling in siblings.iter().take(position).rev() {
2480 if let Some(node) = sibling.get(parser) {
2481 match node {
2482 tl::Node::Tag(tag) => {
2483 let name = normalized_tag_name(tag.name().as_utf8_str());
2484 return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2485 }
2486 tl::Node::Raw(raw) => {
2487 if raw.as_utf8_str().trim().is_empty() {
2488 continue;
2489 }
2490 return false;
2491 }
2492 _ => continue,
2493 }
2494 }
2495 }
2496
2497 false
2498}
2499
2500fn next_sibling_is_whitespace_text(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2501 let id = node_handle.get_inner();
2502 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2503
2504 let siblings = if let Some(parent_id) = parent {
2505 if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2506 children
2507 } else {
2508 return false;
2509 }
2510 } else {
2511 &dom_ctx.root_children
2512 };
2513
2514 let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2515 return false;
2516 };
2517
2518 for sibling in siblings.iter().skip(position + 1) {
2519 if let Some(node) = sibling.get(parser) {
2520 match node {
2521 tl::Node::Raw(raw) => return raw.as_utf8_str().trim().is_empty(),
2522 tl::Node::Tag(_) => return false,
2523 _ => continue,
2524 }
2525 }
2526 }
2527
2528 false
2529}
2530
2531fn next_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2532 let id = node_handle.get_inner();
2533 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2534
2535 let siblings = if let Some(parent_id) = parent {
2536 if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2537 children
2538 } else {
2539 return false;
2540 }
2541 } else {
2542 &dom_ctx.root_children
2543 };
2544
2545 let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2546 return false;
2547 };
2548
2549 for sibling in siblings.iter().skip(position + 1) {
2550 if let Some(node) = sibling.get(parser) {
2551 match node {
2552 tl::Node::Tag(tag) => {
2553 let name = normalized_tag_name(tag.name().as_utf8_str());
2554 return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2555 }
2556 tl::Node::Raw(raw) => {
2557 if raw.as_utf8_str().trim().is_empty() {
2558 continue;
2559 }
2560 return false;
2561 }
2562 _ => continue,
2563 }
2564 }
2565 }
2566
2567 false
2568}
2569
2570fn append_inline_suffix(
2571 output: &mut String,
2572 suffix: &str,
2573 has_core_content: bool,
2574 node_handle: &tl::NodeHandle,
2575 parser: &tl::Parser,
2576 dom_ctx: &DomContext,
2577) {
2578 if suffix.is_empty() {
2579 return;
2580 }
2581
2582 if suffix == " " && has_core_content && next_sibling_is_whitespace_text(node_handle, parser, dom_ctx) {
2583 return;
2584 }
2585
2586 output.push_str(suffix);
2587}
2588
2589#[allow(clippy::only_used_in_recursion)]
2591fn walk_node(
2592 node_handle: &tl::NodeHandle,
2593 parser: &tl::Parser,
2594 output: &mut String,
2595 options: &ConversionOptions,
2596 ctx: &Context,
2597 depth: usize,
2598 dom_ctx: &DomContext,
2599) {
2600 let Some(node) = node_handle.get(parser) else { return };
2601
2602 match node {
2603 tl::Node::Raw(bytes) => {
2604 let mut text = text::decode_html_entities(&bytes.as_utf8_str());
2605
2606 if text.is_empty() {
2607 return;
2608 }
2609
2610 let had_newlines = text.contains('\n');
2611
2612 if options.strip_newlines {
2613 text = text.replace(['\r', '\n'], " ");
2614 }
2615
2616 if text.trim().is_empty() {
2617 if ctx.in_code {
2618 output.push_str(&text);
2619 return;
2620 }
2621
2622 if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2623 if ctx.convert_as_inline || ctx.in_table_cell || ctx.in_list_item {
2624 output.push_str(&text);
2625 return;
2626 }
2627 if text.contains("\n\n") || text.contains("\r\n\r\n") {
2628 if !output.ends_with("\n\n") {
2629 output.push('\n');
2630 }
2631 return;
2632 }
2633 output.push_str(&text);
2634 return;
2635 }
2636
2637 if had_newlines {
2638 if output.is_empty() {
2639 return;
2640 }
2641 if !output.ends_with("\n\n") {
2642 if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2643 if is_inline_element(&next_tag) {
2644 return;
2645 }
2646 }
2647 }
2648 return;
2649 }
2650
2651 if previous_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2652 && next_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2653 {
2654 if text.chars().count() > 1 {
2655 if !output.ends_with(' ') {
2656 output.push(' ');
2657 }
2658 } else {
2659 output.push_str(&text);
2660 }
2661 } else {
2662 output.push_str(&text);
2663 }
2664 return;
2665 }
2666
2667 let processed_text = if ctx.in_code || ctx.in_ruby {
2668 text
2669 } else if ctx.in_table_cell {
2670 let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
2671 let normalized_text = text::normalize_whitespace(&text);
2672 text::escape(
2673 &normalized_text,
2674 options.escape_misc,
2675 options.escape_asterisks,
2676 options.escape_underscores,
2677 options.escape_ascii,
2678 )
2679 } else {
2680 text::escape(
2681 &text,
2682 options.escape_misc,
2683 options.escape_asterisks,
2684 options.escape_underscores,
2685 options.escape_ascii,
2686 )
2687 };
2688 if options.escape_misc {
2689 escaped
2690 } else {
2691 escaped.replace('|', r"\|")
2692 }
2693 } else if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2694 text::escape(
2695 &text,
2696 options.escape_misc,
2697 options.escape_asterisks,
2698 options.escape_underscores,
2699 options.escape_ascii,
2700 )
2701 } else {
2702 let has_trailing_single_newline =
2703 text.ends_with('\n') && !text.ends_with("\n\n") && !text.ends_with("\r\n\r\n");
2704
2705 let normalized_text = text::normalize_whitespace(&text);
2706
2707 let (prefix, suffix, core) = text::chomp(&normalized_text);
2708
2709 let skip_prefix = output.ends_with("\n\n")
2710 || output.ends_with("* ")
2711 || output.ends_with("- ")
2712 || output.ends_with(". ")
2713 || output.ends_with("] ")
2714 || (output.ends_with('\n') && prefix == " ")
2715 || (output.ends_with(' ')
2716 && prefix == " "
2717 && !previous_sibling_is_inline_tag(node_handle, parser, dom_ctx));
2718
2719 let mut final_text = String::new();
2720 if !skip_prefix && !prefix.is_empty() {
2721 final_text.push_str(prefix);
2722 }
2723
2724 let escaped_core = text::escape(
2725 core,
2726 options.escape_misc,
2727 options.escape_asterisks,
2728 options.escape_underscores,
2729 options.escape_ascii,
2730 );
2731 final_text.push_str(&escaped_core);
2732
2733 if !suffix.is_empty() {
2734 final_text.push_str(suffix);
2735 } else if has_trailing_single_newline {
2736 let at_paragraph_break = output.ends_with("\n\n");
2737 if options.debug {
2738 eprintln!(
2739 "[DEBUG] Text had trailing single newline that was chomped, at_paragraph_break={}",
2740 at_paragraph_break
2741 );
2742 }
2743 if !at_paragraph_break {
2744 if text.contains("\n\n") || text.contains("\r\n\r\n") {
2745 final_text.push('\n');
2746 } else if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2747 if options.debug {
2748 eprintln!("[DEBUG] Next sibling tag after newline: {}", next_tag);
2749 }
2750 if matches!(next_tag.as_str(), "span") {
2751 } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2752 final_text.push(' ');
2753 } else {
2754 final_text.push('\n');
2755 }
2756 } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2757 final_text.push(' ');
2758 } else {
2759 final_text.push('\n');
2760 }
2761 }
2762 }
2763
2764 final_text
2765 };
2766
2767 if ctx.in_list_item && processed_text.contains("\n\n") {
2768 let parts: Vec<&str> = processed_text.split("\n\n").collect();
2769 for (i, part) in parts.iter().enumerate() {
2770 if i > 0 {
2771 output.push_str("\n\n");
2772 output.push_str(&" ".repeat(4 * ctx.list_depth));
2773 }
2774 output.push_str(part.trim());
2775 }
2776 } else {
2777 output.push_str(&processed_text);
2778 }
2779 }
2780
2781 tl::Node::Tag(tag) => {
2782 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2783
2784 if should_drop_for_preprocessing(node_handle, tag_name.as_ref(), tag, parser, dom_ctx, options) {
2785 trim_trailing_whitespace(output);
2786 if options.debug {
2787 eprintln!("[DEBUG] Dropping <{}> subtree due to preprocessing settings", tag_name);
2788 }
2789 return;
2790 }
2791
2792 if options.strip_tags.iter().any(|t| t.as_str() == tag_name) {
2793 let children = tag.children();
2794 {
2795 for child_handle in children.top().iter() {
2796 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2797 }
2798 }
2799 return;
2800 }
2801
2802 if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
2803 let html = serialize_tag_to_html(node_handle, parser);
2804 output.push_str(&html);
2805 return;
2806 }
2807
2808 #[cfg(feature = "metadata")]
2809 if matches!(tag_name.as_ref(), "html" | "head" | "body") {
2810 if let Some(ref collector) = ctx.metadata_collector {
2811 let mut c = collector.borrow_mut();
2812
2813 if let Some(lang) = tag.attributes().get("lang").flatten() {
2814 c.set_language(lang.as_utf8_str().to_string());
2815 }
2816
2817 if let Some(dir) = tag.attributes().get("dir").flatten() {
2818 c.set_text_direction(dir.as_utf8_str().to_string());
2819 }
2820 }
2821 }
2822
2823 match tag_name.as_ref() {
2824 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
2825 let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
2826
2827 let mut text = String::new();
2828 let heading_ctx = Context {
2829 in_heading: true,
2830 convert_as_inline: true,
2831 heading_tag: Some(tag_name.to_string()),
2832 ..ctx.clone()
2833 };
2834 let children = tag.children();
2835 {
2836 for child_handle in children.top().iter() {
2837 walk_node(
2838 child_handle,
2839 parser,
2840 &mut text,
2841 options,
2842 &heading_ctx,
2843 depth + 1,
2844 dom_ctx,
2845 );
2846 }
2847 }
2848 let trimmed = text.trim();
2849 if !trimmed.is_empty() {
2850 let normalized = normalize_heading_text(trimmed);
2851 push_heading(output, ctx, options, level, normalized.as_ref());
2852
2853 #[cfg(feature = "metadata")]
2854 if let Some(ref collector) = ctx.metadata_collector {
2855 let id = tag
2856 .attributes()
2857 .get("id")
2858 .flatten()
2859 .map(|v| v.as_utf8_str().to_string());
2860 collector
2861 .borrow_mut()
2862 .add_header(level as u8, normalized.to_string(), id, depth, 0);
2863 }
2864 }
2865 }
2866
2867 "p" => {
2868 let content_start_pos = output.len();
2869
2870 let is_table_continuation =
2871 ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
2872
2873 let is_list_continuation = ctx.in_list_item
2874 && !output.is_empty()
2875 && !output.ends_with("* ")
2876 && !output.ends_with("- ")
2877 && !output.ends_with(". ");
2878
2879 let after_code_block = output.ends_with("```\n");
2880 let needs_leading_sep = !ctx.in_table_cell
2881 && !ctx.in_list_item
2882 && !ctx.convert_as_inline
2883 && ctx.blockquote_depth == 0
2884 && !output.is_empty()
2885 && !output.ends_with("\n\n")
2886 && !after_code_block;
2887
2888 if is_table_continuation {
2889 trim_trailing_whitespace(output);
2890 output.push_str("<br>");
2891 } else if is_list_continuation {
2892 add_list_continuation_indent(output, ctx.list_depth, true, options);
2893 } else if needs_leading_sep {
2894 trim_trailing_whitespace(output);
2895 output.push_str("\n\n");
2896 }
2897
2898 let p_ctx = Context {
2899 in_paragraph: true,
2900 ..ctx.clone()
2901 };
2902
2903 let children = tag.children();
2904 {
2905 let child_handles: Vec<_> = children.top().iter().collect();
2906 for (i, child_handle) in child_handles.iter().enumerate() {
2907 if let Some(node) = child_handle.get(parser) {
2908 if let tl::Node::Raw(bytes) = node {
2909 let text = bytes.as_utf8_str();
2910 if text.trim().is_empty() && i > 0 && i < child_handles.len() - 1 {
2911 let prev = &child_handles[i - 1];
2912 let next = &child_handles[i + 1];
2913 if is_empty_inline_element(prev, parser)
2914 && is_empty_inline_element(next, parser)
2915 {
2916 continue;
2917 }
2918 }
2919 }
2920 }
2921 walk_node(child_handle, parser, output, options, &p_ctx, depth + 1, dom_ctx);
2922 }
2923 }
2924
2925 let has_content = output.len() > content_start_pos;
2926
2927 if has_content && !ctx.convert_as_inline && !ctx.in_table_cell {
2928 output.push_str("\n\n");
2929 }
2930 }
2931
2932 "strong" | "b" => {
2933 if ctx.in_code {
2934 let children = tag.children();
2935 {
2936 for child_handle in children.top().iter() {
2937 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2938 }
2939 }
2940 } else {
2941 let mut content = String::with_capacity(64);
2942 let children = tag.children();
2943 {
2944 let strong_ctx = Context {
2945 inline_depth: ctx.inline_depth + 1,
2946 in_strong: true,
2947 ..ctx.clone()
2948 };
2949 for child_handle in children.top().iter() {
2950 walk_node(
2951 child_handle,
2952 parser,
2953 &mut content,
2954 options,
2955 &strong_ctx,
2956 depth + 1,
2957 dom_ctx,
2958 );
2959 }
2960 }
2961 let (prefix, suffix, trimmed) = chomp_inline(&content);
2962 if !content.trim().is_empty() {
2963 output.push_str(prefix);
2964 if ctx.in_strong {
2965 output.push_str(trimmed);
2966 } else {
2967 output.push(options.strong_em_symbol);
2968 output.push(options.strong_em_symbol);
2969 output.push_str(trimmed);
2970 output.push(options.strong_em_symbol);
2971 output.push(options.strong_em_symbol);
2972 }
2973 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
2974 } else if !content.is_empty() {
2975 output.push_str(prefix);
2976 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
2977 }
2978 }
2979 }
2980
2981 "em" | "i" => {
2982 if ctx.in_code {
2983 let children = tag.children();
2984 {
2985 for child_handle in children.top().iter() {
2986 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2987 }
2988 }
2989 } else {
2990 let mut content = String::with_capacity(64);
2991 let children = tag.children();
2992 {
2993 let em_ctx = Context {
2994 inline_depth: ctx.inline_depth + 1,
2995 ..ctx.clone()
2996 };
2997 for child_handle in children.top().iter() {
2998 walk_node(child_handle, parser, &mut content, options, &em_ctx, depth + 1, dom_ctx);
2999 }
3000 }
3001 let (prefix, suffix, trimmed) = chomp_inline(&content);
3002 if !content.trim().is_empty() {
3003 output.push_str(prefix);
3004 output.push(options.strong_em_symbol);
3005 output.push_str(trimmed);
3006 output.push(options.strong_em_symbol);
3007 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3008 } else if !content.is_empty() {
3009 output.push_str(prefix);
3010 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3011 } else if let Some(class_value) = tag
3012 .attributes()
3013 .get("class")
3014 .and_then(|v| v.as_ref().map(|val| val.as_utf8_str().to_string()))
3015 {
3016 if class_value.contains("caret") && !output.ends_with(' ') {
3017 output.push_str(" > ");
3018 }
3019 }
3020 }
3021 }
3022
3023 "a" => {
3024 const MAX_LINK_LABEL_LEN: usize = 512;
3025
3026 let href_attr = tag
3027 .attributes()
3028 .get("href")
3029 .flatten()
3030 .map(|v| text::decode_html_entities(&v.as_utf8_str()));
3031 let title = tag
3032 .attributes()
3033 .get("title")
3034 .flatten()
3035 .map(|v| v.as_utf8_str().to_string());
3036
3037 if let Some(href) = href_attr {
3038 let raw_text = text::normalize_whitespace(&get_text_content(node_handle, parser))
3039 .trim()
3040 .to_string();
3041
3042 let is_autolink = options.autolinks
3043 && !options.default_title
3044 && !href.is_empty()
3045 && (raw_text == href || (href.starts_with("mailto:") && raw_text == href[7..]));
3046
3047 if is_autolink {
3048 output.push('<');
3049 if href.starts_with("mailto:") && raw_text == href[7..] {
3050 output.push_str(&raw_text);
3051 } else {
3052 output.push_str(&href);
3053 }
3054 output.push('>');
3055 return;
3056 }
3057
3058 if let Some((heading_level, heading_handle)) = find_single_heading_child(node_handle, parser) {
3059 if let Some(heading_node) = heading_handle.get(parser) {
3060 if let tl::Node::Tag(heading_tag) = heading_node {
3061 let heading_name =
3062 normalized_tag_name(heading_tag.name().as_utf8_str()).into_owned();
3063 let mut heading_text = String::new();
3064 let heading_ctx = Context {
3065 in_heading: true,
3066 convert_as_inline: true,
3067 heading_tag: Some(heading_name),
3068 ..ctx.clone()
3069 };
3070 walk_node(
3071 &heading_handle,
3072 parser,
3073 &mut heading_text,
3074 options,
3075 &heading_ctx,
3076 depth + 1,
3077 dom_ctx,
3078 );
3079 let trimmed_heading = heading_text.trim();
3080 if !trimmed_heading.is_empty() {
3081 let escaped_label = escape_link_label(trimmed_heading);
3082 let mut link_buffer = String::new();
3083 append_markdown_link(
3084 &mut link_buffer,
3085 &escaped_label,
3086 href.as_str(),
3087 title.as_deref(),
3088 raw_text.as_str(),
3089 options,
3090 );
3091 push_heading(output, ctx, options, heading_level, link_buffer.as_str());
3092 return;
3093 }
3094 }
3095 }
3096 }
3097
3098 let children: Vec<_> = tag.children().top().iter().copied().collect();
3099 let (inline_label, _block_nodes, saw_block) = collect_link_label_text(&children, parser);
3100 let mut label = if saw_block {
3101 let mut content = String::new();
3102 let link_ctx = Context {
3103 inline_depth: ctx.inline_depth + 1,
3104 convert_as_inline: true,
3105 ..ctx.clone()
3106 };
3107 for child_handle in children.iter() {
3108 let mut child_buf = String::new();
3109 walk_node(
3110 child_handle,
3111 parser,
3112 &mut child_buf,
3113 options,
3114 &link_ctx,
3115 depth + 1,
3116 dom_ctx,
3117 );
3118 if !child_buf.trim().is_empty()
3119 && !content.is_empty()
3120 && !content.chars().last().map(|c| c.is_whitespace()).unwrap_or(true)
3121 && !child_buf.chars().next().map(|c| c.is_whitespace()).unwrap_or(true)
3122 {
3123 content.push(' ');
3124 }
3125 content.push_str(&child_buf);
3126 }
3127 if content.trim().is_empty() {
3128 normalize_link_label(&inline_label)
3129 } else {
3130 normalize_link_label(&content)
3131 }
3132 } else {
3133 let mut content = String::new();
3134 let link_ctx = Context {
3135 inline_depth: ctx.inline_depth + 1,
3136 ..ctx.clone()
3137 };
3138 for child_handle in children.iter() {
3139 walk_node(
3140 child_handle,
3141 parser,
3142 &mut content,
3143 options,
3144 &link_ctx,
3145 depth + 1,
3146 dom_ctx,
3147 );
3148 }
3149 normalize_link_label(&content)
3150 };
3151
3152 if label.is_empty() && saw_block {
3153 let fallback = text::normalize_whitespace(&get_text_content(node_handle, parser));
3154 label = normalize_link_label(&fallback);
3155 }
3156
3157 if label.is_empty() && !raw_text.is_empty() {
3158 label = normalize_link_label(&raw_text);
3159 }
3160
3161 if label.is_empty() && !href.is_empty() && !children.is_empty() {
3162 label = href.clone();
3163 }
3164
3165 if label.len() > MAX_LINK_LABEL_LEN {
3166 truncate_at_char_boundary(&mut label, MAX_LINK_LABEL_LEN);
3167 label.push('…');
3168 }
3169
3170 let escaped_label = escape_link_label(&label);
3171 append_markdown_link(
3172 output,
3173 &escaped_label,
3174 href.as_str(),
3175 title.as_deref(),
3176 label.as_str(),
3177 options,
3178 );
3179
3180 #[cfg(feature = "metadata")]
3181 if let Some(ref collector) = ctx.metadata_collector {
3182 let rel_attr = tag
3183 .attributes()
3184 .get("rel")
3185 .flatten()
3186 .map(|v| v.as_utf8_str().to_string());
3187 let mut attributes_map = BTreeMap::new();
3188 for (key, value_opt) in tag.attributes().iter() {
3189 let key_str = key.to_string();
3190 if key_str == "href" {
3191 continue;
3192 }
3193
3194 let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3195 attributes_map.insert(key_str, value);
3196 }
3197 collector.borrow_mut().add_link(
3198 href.clone(),
3199 label.clone(),
3200 title.clone(),
3201 rel_attr,
3202 attributes_map,
3203 );
3204 }
3205 } else {
3206 let children = tag.children();
3207 {
3208 for child_handle in children.top().iter() {
3209 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3210 }
3211 }
3212 }
3213 }
3214
3215 "img" => {
3216 use std::borrow::Cow;
3217
3218 let src = tag
3219 .attributes()
3220 .get("src")
3221 .flatten()
3222 .map(|v| v.as_utf8_str())
3223 .unwrap_or(Cow::Borrowed(""));
3224
3225 let alt = tag
3226 .attributes()
3227 .get("alt")
3228 .flatten()
3229 .map(|v| v.as_utf8_str())
3230 .unwrap_or(Cow::Borrowed(""));
3231
3232 let title = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str());
3233 #[cfg(feature = "metadata")]
3234 let mut attributes_map = BTreeMap::new();
3235 #[cfg(feature = "metadata")]
3236 let mut width: Option<u32> = None;
3237 #[cfg(feature = "metadata")]
3238 let mut height: Option<u32> = None;
3239 #[cfg(feature = "metadata")]
3240 for (key, value_opt) in tag.attributes().iter() {
3241 let key_str = key.to_string();
3242 if key_str == "src" {
3243 continue;
3244 }
3245 let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3246 if key_str == "width" {
3247 if let Ok(parsed) = value.parse::<u32>() {
3248 width = Some(parsed);
3249 }
3250 } else if key_str == "height" {
3251 if let Ok(parsed) = value.parse::<u32>() {
3252 height = Some(parsed);
3253 }
3254 }
3255 attributes_map.insert(key_str, value);
3256 }
3257
3258 #[cfg(feature = "inline-images")]
3259 if let Some(ref collector_ref) = ctx.inline_collector {
3260 let mut attributes_map = BTreeMap::new();
3261 for (key, value_opt) in tag.attributes().iter() {
3262 let key_str = key.to_string();
3263 let keep = key_str == "width"
3264 || key_str == "height"
3265 || key_str == "filename"
3266 || key_str == "aria-label"
3267 || key_str.starts_with("data-");
3268 if keep {
3269 let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
3270 attributes_map.insert(key_str, value);
3271 }
3272 }
3273 handle_inline_data_image(
3274 collector_ref,
3275 src.as_ref(),
3276 alt.as_ref(),
3277 title.as_deref(),
3278 attributes_map,
3279 );
3280 }
3281
3282 let keep_as_markdown = ctx.in_heading
3283 && ctx
3284 .heading_tag
3285 .as_ref()
3286 .is_some_and(|tag| options.keep_inline_images_in.iter().any(|t| t == tag));
3287
3288 let should_use_alt_text = !keep_as_markdown
3289 && (ctx.convert_as_inline
3290 || (ctx.in_heading
3291 && ctx
3292 .heading_tag
3293 .as_ref()
3294 .is_none_or(|tag| !options.keep_inline_images_in.iter().any(|t| t == tag))));
3295
3296 if should_use_alt_text {
3297 output.push_str(&alt);
3298 } else {
3299 output.push_str(";
3302 output.push_str(&src);
3303 if let Some(ref title_text) = title {
3304 output.push_str(" \"");
3305 output.push_str(title_text);
3306 output.push('"');
3307 }
3308 output.push(')');
3309 }
3310
3311 #[cfg(feature = "metadata")]
3312 if let Some(ref collector) = ctx.metadata_collector {
3313 if !src.is_empty() {
3314 let dimensions = match (width, height) {
3315 (Some(w), Some(h)) => Some((w, h)),
3316 _ => None,
3317 };
3318 collector.borrow_mut().add_image(
3319 src.to_string(),
3320 if alt.is_empty() { None } else { Some(alt.to_string()) },
3321 title.as_deref().map(|t| t.to_string()),
3322 dimensions,
3323 attributes_map.clone(),
3324 );
3325 }
3326 }
3327 }
3328
3329 "mark" => {
3330 if ctx.convert_as_inline {
3331 let children = tag.children();
3332 {
3333 for child_handle in children.top().iter() {
3334 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3335 }
3336 }
3337 } else {
3338 use crate::options::HighlightStyle;
3339 match options.highlight_style {
3340 HighlightStyle::DoubleEqual => {
3341 output.push_str("==");
3342 let children = tag.children();
3343 {
3344 for child_handle in children.top().iter() {
3345 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3346 }
3347 }
3348 output.push_str("==");
3349 }
3350 HighlightStyle::Html => {
3351 output.push_str("<mark>");
3352 let children = tag.children();
3353 {
3354 for child_handle in children.top().iter() {
3355 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3356 }
3357 }
3358 output.push_str("</mark>");
3359 }
3360 HighlightStyle::Bold => {
3361 let symbol = options.strong_em_symbol.to_string().repeat(2);
3362 output.push_str(&symbol);
3363 let bold_ctx = Context {
3364 in_strong: true,
3365 ..ctx.clone()
3366 };
3367 let children = tag.children();
3368 {
3369 for child_handle in children.top().iter() {
3370 walk_node(child_handle, parser, output, options, &bold_ctx, depth + 1, dom_ctx);
3371 }
3372 }
3373 output.push_str(&symbol);
3374 }
3375 HighlightStyle::None => {
3376 let children = tag.children();
3377 {
3378 for child_handle in children.top().iter() {
3379 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3380 }
3381 }
3382 }
3383 }
3384 }
3385 }
3386
3387 "del" | "s" => {
3388 if ctx.in_code {
3389 let children = tag.children();
3390 {
3391 for child_handle in children.top().iter() {
3392 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3393 }
3394 }
3395 } else {
3396 let mut content = String::with_capacity(32);
3397 let children = tag.children();
3398 {
3399 for child_handle in children.top().iter() {
3400 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3401 }
3402 }
3403 let (prefix, suffix, trimmed) = chomp_inline(&content);
3404 if !content.trim().is_empty() {
3405 output.push_str(prefix);
3406 output.push_str("~~");
3407 output.push_str(trimmed);
3408 output.push_str("~~");
3409 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3410 } else if !content.is_empty() {
3411 output.push_str(prefix);
3412 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3413 }
3414 }
3415 }
3416
3417 "ins" => {
3418 let mut content = String::with_capacity(32);
3419 let children = tag.children();
3420 {
3421 for child_handle in children.top().iter() {
3422 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3423 }
3424 }
3425 let (prefix, suffix, trimmed) = chomp_inline(&content);
3426 if !trimmed.is_empty() {
3427 output.push_str(prefix);
3428 output.push_str("==");
3429 output.push_str(trimmed);
3430 output.push_str("==");
3431 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3432 }
3433 }
3434
3435 "u" | "small" => {
3436 let children = tag.children();
3437 {
3438 for child_handle in children.top().iter() {
3439 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3440 }
3441 }
3442 }
3443
3444 "sub" => {
3445 if !ctx.in_code && !options.sub_symbol.is_empty() {
3446 output.push_str(&options.sub_symbol);
3447 }
3448 let children = tag.children();
3449 {
3450 for child_handle in children.top().iter() {
3451 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3452 }
3453 }
3454 if !ctx.in_code && !options.sub_symbol.is_empty() {
3455 if options.sub_symbol.starts_with('<') && !options.sub_symbol.starts_with("</") {
3456 output.push_str(&options.sub_symbol.replace('<', "</"));
3457 } else {
3458 output.push_str(&options.sub_symbol);
3459 }
3460 }
3461 }
3462
3463 "sup" => {
3464 if !ctx.in_code && !options.sup_symbol.is_empty() {
3465 output.push_str(&options.sup_symbol);
3466 }
3467 let children = tag.children();
3468 {
3469 for child_handle in children.top().iter() {
3470 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3471 }
3472 }
3473 if !ctx.in_code && !options.sup_symbol.is_empty() {
3474 if options.sup_symbol.starts_with('<') && !options.sup_symbol.starts_with("</") {
3475 output.push_str(&options.sup_symbol.replace('<', "</"));
3476 } else {
3477 output.push_str(&options.sup_symbol);
3478 }
3479 }
3480 }
3481
3482 "kbd" | "samp" => {
3483 let code_ctx = Context {
3484 in_code: true,
3485 ..ctx.clone()
3486 };
3487 let mut content = String::with_capacity(32);
3488 let children = tag.children();
3489 {
3490 for child_handle in children.top().iter() {
3491 walk_node(
3492 child_handle,
3493 parser,
3494 &mut content,
3495 options,
3496 &code_ctx,
3497 depth + 1,
3498 dom_ctx,
3499 );
3500 }
3501 }
3502 let normalized = text::normalize_whitespace(&content);
3503 let (prefix, suffix, trimmed) = chomp_inline(&normalized);
3504 if !content.trim().is_empty() {
3505 output.push_str(prefix);
3506 output.push('`');
3507 output.push_str(trimmed);
3508 output.push('`');
3509 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3510 } else if !content.is_empty() {
3511 output.push_str(prefix);
3512 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3513 }
3514 }
3515
3516 "var" => {
3517 let mut content = String::with_capacity(32);
3518 let children = tag.children();
3519 {
3520 for child_handle in children.top().iter() {
3521 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3522 }
3523 }
3524 let (prefix, suffix, trimmed) = chomp_inline(&content);
3525 if !trimmed.is_empty() {
3526 output.push_str(prefix);
3527 output.push(options.strong_em_symbol);
3528 output.push_str(trimmed);
3529 output.push(options.strong_em_symbol);
3530 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3531 }
3532 }
3533
3534 "dfn" => {
3535 let mut content = String::with_capacity(32);
3536 let children = tag.children();
3537 {
3538 for child_handle in children.top().iter() {
3539 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3540 }
3541 }
3542 let (prefix, suffix, trimmed) = chomp_inline(&content);
3543 if !trimmed.is_empty() {
3544 output.push_str(prefix);
3545 output.push(options.strong_em_symbol);
3546 output.push_str(trimmed);
3547 output.push(options.strong_em_symbol);
3548 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3549 }
3550 }
3551
3552 "abbr" => {
3553 let mut content = String::with_capacity(32);
3554 let children = tag.children();
3555 {
3556 for child_handle in children.top().iter() {
3557 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3558 }
3559 }
3560 let trimmed = content.trim();
3561
3562 if !trimmed.is_empty() {
3563 output.push_str(trimmed);
3564
3565 if let Some(title) = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str()) {
3566 let trimmed_title = title.trim();
3567 if !trimmed_title.is_empty() {
3568 output.push_str(" (");
3569 output.push_str(trimmed_title);
3570 output.push(')');
3571 }
3572 }
3573 }
3574 }
3575
3576 "time" | "data" => {
3577 let children = tag.children();
3578 {
3579 for child_handle in children.top().iter() {
3580 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3581 }
3582 }
3583 }
3584
3585 "wbr" => {}
3586
3587 "code" => {
3588 let code_ctx = Context {
3589 in_code: true,
3590 ..ctx.clone()
3591 };
3592
3593 if !ctx.in_code {
3594 let mut content = String::with_capacity(32);
3595 let children = tag.children();
3596 {
3597 for child_handle in children.top().iter() {
3598 walk_node(
3599 child_handle,
3600 parser,
3601 &mut content,
3602 options,
3603 &code_ctx,
3604 depth + 1,
3605 dom_ctx,
3606 );
3607 }
3608 }
3609
3610 let trimmed = &content;
3611
3612 if !content.trim().is_empty() {
3613 let contains_backtick = trimmed.contains('`');
3614
3615 let needs_delimiter_spaces = {
3616 let first_char = trimmed.chars().next();
3617 let last_char = trimmed.chars().last();
3618 let starts_with_space = first_char == Some(' ');
3619 let ends_with_space = last_char == Some(' ');
3620 let starts_with_backtick = first_char == Some('`');
3621 let ends_with_backtick = last_char == Some('`');
3622 let all_spaces = trimmed.chars().all(|c| c == ' ');
3623
3624 all_spaces
3625 || starts_with_backtick
3626 || ends_with_backtick
3627 || (starts_with_space && ends_with_space && contains_backtick)
3628 };
3629
3630 let (num_backticks, needs_spaces) = if contains_backtick {
3631 let max_consecutive = trimmed
3632 .chars()
3633 .fold((0, 0), |(max, current), c| {
3634 if c == '`' {
3635 let new_current = current + 1;
3636 (max.max(new_current), new_current)
3637 } else {
3638 (max, 0)
3639 }
3640 })
3641 .0;
3642 let num = if max_consecutive == 1 { 2 } else { 1 };
3643 (num, needs_delimiter_spaces)
3644 } else {
3645 (1, needs_delimiter_spaces)
3646 };
3647
3648 for _ in 0..num_backticks {
3649 output.push('`');
3650 }
3651 if needs_spaces {
3652 output.push(' ');
3653 }
3654 output.push_str(trimmed);
3655 if needs_spaces {
3656 output.push(' ');
3657 }
3658 for _ in 0..num_backticks {
3659 output.push('`');
3660 }
3661 }
3662 } else {
3663 let children = tag.children();
3664 {
3665 for child_handle in children.top().iter() {
3666 walk_node(child_handle, parser, output, options, &code_ctx, depth + 1, dom_ctx);
3667 }
3668 }
3669 }
3670 }
3671
3672 "pre" => {
3673 let code_ctx = Context {
3674 in_code: true,
3675 ..ctx.clone()
3676 };
3677
3678 let mut content = String::with_capacity(256);
3679 let children = tag.children();
3680 {
3681 for child_handle in children.top().iter() {
3682 walk_node(
3683 child_handle,
3684 parser,
3685 &mut content,
3686 options,
3687 &code_ctx,
3688 depth + 1,
3689 dom_ctx,
3690 );
3691 }
3692 }
3693
3694 if !content.is_empty() {
3695 let leading_newlines = content.chars().take_while(|&c| c == '\n').count();
3696 let trailing_newlines = content.chars().rev().take_while(|&c| c == '\n').count();
3697 let core = content.trim_matches('\n');
3698 let is_whitespace_only = core.trim().is_empty();
3699
3700 let processed_content = if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
3701 content
3702 } else {
3703 let mut core_text = if leading_newlines > 0 {
3704 dedent_code_block(core)
3705 } else {
3706 core.to_string()
3707 };
3708
3709 if is_whitespace_only {
3710 let mut rebuilt = String::new();
3711 for _ in 0..leading_newlines {
3712 rebuilt.push('\n');
3713 }
3714 rebuilt.push_str(&core_text);
3715 for _ in 0..trailing_newlines {
3716 rebuilt.push('\n');
3717 }
3718 rebuilt
3719 } else {
3720 for _ in 0..trailing_newlines {
3721 core_text.push('\n');
3722 }
3723 core_text
3724 }
3725 };
3726
3727 match options.code_block_style {
3728 crate::options::CodeBlockStyle::Indented => {
3729 if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3730 if output.ends_with('\n') {
3731 output.push('\n');
3732 } else {
3733 output.push_str("\n\n");
3734 }
3735 }
3736
3737 let indented = processed_content
3738 .lines()
3739 .map(|line| {
3740 if line.is_empty() {
3741 String::new()
3742 } else {
3743 format!(" {}", line)
3744 }
3745 })
3746 .collect::<Vec<_>>()
3747 .join("\n");
3748 output.push_str(&indented);
3749
3750 output.push_str("\n\n");
3751 }
3752 crate::options::CodeBlockStyle::Backticks | crate::options::CodeBlockStyle::Tildes => {
3753 if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3754 if output.ends_with('\n') {
3755 output.push('\n');
3756 } else {
3757 output.push_str("\n\n");
3758 }
3759 }
3760
3761 let fence = if options.code_block_style == crate::options::CodeBlockStyle::Backticks {
3762 "```"
3763 } else {
3764 "~~~"
3765 };
3766
3767 output.push_str(fence);
3768 if !options.code_language.is_empty() {
3769 output.push_str(&options.code_language);
3770 }
3771 output.push('\n');
3772 output.push_str(&processed_content);
3773 output.push('\n');
3774 output.push_str(fence);
3775 output.push('\n');
3776 }
3777 }
3778 }
3779 }
3780
3781 "blockquote" => {
3782 if ctx.convert_as_inline {
3783 let children = tag.children();
3784 {
3785 for child_handle in children.top().iter() {
3786 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3787 }
3788 }
3789 return;
3790 }
3791
3792 let cite = tag
3793 .attributes()
3794 .get("cite")
3795 .flatten()
3796 .map(|v| v.as_utf8_str().to_string());
3797
3798 let blockquote_ctx = Context {
3799 blockquote_depth: ctx.blockquote_depth + 1,
3800 ..ctx.clone()
3801 };
3802 let mut content = String::with_capacity(256);
3803 let children = tag.children();
3804 {
3805 for child_handle in children.top().iter() {
3806 walk_node(
3807 child_handle,
3808 parser,
3809 &mut content,
3810 options,
3811 &blockquote_ctx,
3812 depth + 1,
3813 dom_ctx,
3814 );
3815 }
3816 }
3817
3818 let trimmed_content = content.trim();
3819
3820 if !trimmed_content.is_empty() {
3821 if ctx.blockquote_depth > 0 {
3822 output.push_str("\n\n\n");
3823 } else if !output.is_empty() {
3824 if !output.ends_with('\n') {
3825 output.push('\n');
3826 } else if output.ends_with("\n\n") {
3827 output.truncate(output.len() - 1);
3828 }
3829 }
3830
3831 let prefix = "> ";
3832
3833 for line in trimmed_content.lines() {
3834 output.push_str(prefix);
3835 output.push_str(line.trim());
3836 output.push('\n');
3837 }
3838
3839 if let Some(url) = cite {
3840 output.push('\n');
3841 output.push_str("— <");
3842 output.push_str(&url);
3843 output.push_str(">\n\n");
3844 }
3845
3846 while output.ends_with('\n') {
3847 output.truncate(output.len() - 1);
3848 }
3849 }
3850 }
3851
3852 "br" => {
3853 if ctx.in_heading {
3854 trim_trailing_whitespace(output);
3855 output.push_str(" ");
3856 } else {
3857 use crate::options::NewlineStyle;
3858 if output.is_empty() || output.ends_with('\n') {
3859 output.push('\n');
3860 } else {
3861 match options.newline_style {
3862 NewlineStyle::Spaces => output.push_str(" \n"),
3863 NewlineStyle::Backslash => output.push_str("\\\n"),
3864 }
3865 }
3866 }
3867 }
3868
3869 "hr" => {
3870 if !output.is_empty() {
3871 let prev_tag = get_previous_sibling_tag(node_handle, parser, dom_ctx);
3872 let last_line_is_blockquote = output
3873 .rsplit('\n')
3874 .find(|line| !line.trim().is_empty())
3875 .map(|line| line.trim_start().starts_with('>'))
3876 .unwrap_or(false);
3877 let needs_blank_line = !ctx.in_paragraph
3878 && !matches!(prev_tag.as_deref(), Some("blockquote"))
3879 && !last_line_is_blockquote;
3880
3881 if options.debug {
3882 eprintln!(
3883 "[DEBUG] <hr> prev_tag={:?} needs_blank_line={} in_paragraph={}",
3884 prev_tag, needs_blank_line, ctx.in_paragraph
3885 );
3886 }
3887
3888 if ctx.in_paragraph || !needs_blank_line {
3889 if !output.ends_with('\n') {
3890 output.push('\n');
3891 }
3892 } else {
3893 trim_trailing_whitespace(output);
3894 if output.ends_with('\n') {
3895 if !output.ends_with("\n\n") {
3896 output.push('\n');
3897 }
3898 } else {
3899 output.push_str("\n\n");
3900 }
3901 }
3902 }
3903 output.push_str("---\n");
3904 }
3905
3906 "ul" => {
3907 add_list_leading_separator(output, ctx);
3908
3909 let nested_depth = calculate_list_nesting_depth(ctx);
3910 let is_loose = is_loose_list(node_handle, parser);
3911
3912 process_list_children(
3913 node_handle,
3914 parser,
3915 output,
3916 options,
3917 ctx,
3918 depth,
3919 false,
3920 is_loose,
3921 nested_depth,
3922 1,
3923 dom_ctx,
3924 );
3925
3926 add_nested_list_trailing_separator(output, ctx);
3927 }
3928
3929 "ol" => {
3930 add_list_leading_separator(output, ctx);
3931
3932 let nested_depth = calculate_list_nesting_depth(ctx);
3933 let is_loose = is_loose_list(node_handle, parser);
3934
3935 let start = tag
3936 .attributes()
3937 .get("start")
3938 .flatten()
3939 .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
3940 .unwrap_or(1);
3941
3942 process_list_children(
3943 node_handle,
3944 parser,
3945 output,
3946 options,
3947 ctx,
3948 depth,
3949 true,
3950 is_loose,
3951 nested_depth,
3952 start,
3953 dom_ctx,
3954 );
3955
3956 add_nested_list_trailing_separator(output, ctx);
3957 }
3958
3959 "li" => {
3960 if ctx.list_depth > 0 {
3961 let indent = match options.list_indent_type {
3962 ListIndentType::Tabs => "\t".repeat(ctx.list_depth),
3963 ListIndentType::Spaces => " ".repeat(ctx.list_depth * options.list_indent_width),
3964 };
3965 output.push_str(&indent);
3966 }
3967
3968 let mut has_block_children = false;
3969 let children = tag.children();
3970 {
3971 for child_handle in children.top().iter() {
3972 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
3973 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
3974 if matches!(
3975 tag_name.as_ref(),
3976 "p" | "div" | "blockquote" | "pre" | "table" | "hr" | "dl"
3977 ) {
3978 has_block_children = true;
3979 break;
3980 }
3981 }
3982 }
3983 }
3984
3985 fn find_checkbox<'a>(
3986 node_handle: &tl::NodeHandle,
3987 parser: &'a tl::Parser<'a>,
3988 ) -> Option<(bool, tl::NodeHandle)> {
3989 if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
3990 if tag_name_eq(node_tag.name().as_utf8_str(), "input") {
3991 let input_type = node_tag.attributes().get("type").flatten().map(|v| v.as_utf8_str());
3992
3993 if input_type.as_deref() == Some("checkbox") {
3994 let checked = node_tag.attributes().get("checked").is_some();
3995 return Some((checked, *node_handle));
3996 }
3997 }
3998
3999 let children = node_tag.children();
4000 {
4001 for child_handle in children.top().iter() {
4002 if let Some(result) = find_checkbox(child_handle, parser) {
4003 return Some(result);
4004 }
4005 }
4006 }
4007 }
4008 None
4009 }
4010
4011 let (is_task_list, task_checked, checkbox_node) =
4012 if let Some((checked, node)) = find_checkbox(node_handle, parser) {
4013 (true, checked, Some(node))
4014 } else {
4015 (false, false, None)
4016 };
4017
4018 let li_ctx = Context {
4019 in_list_item: true,
4020 list_depth: ctx.list_depth + 1,
4021 ..ctx.clone()
4022 };
4023
4024 if is_task_list {
4025 output.push('-');
4026 output.push(' ');
4027 output.push_str(if task_checked { "[x]" } else { "[ ]" });
4028
4029 fn is_checkbox_node(node_handle: &tl::NodeHandle, checkbox: &Option<tl::NodeHandle>) -> bool {
4030 if let Some(cb) = checkbox {
4031 node_handle == cb
4032 } else {
4033 false
4034 }
4035 }
4036
4037 fn contains_checkbox<'a>(
4038 node_handle: &tl::NodeHandle,
4039 parser: &'a tl::Parser<'a>,
4040 checkbox: &Option<tl::NodeHandle>,
4041 ) -> bool {
4042 if is_checkbox_node(node_handle, checkbox) {
4043 return true;
4044 }
4045 if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4046 let children = node_tag.children();
4047 {
4048 for child_handle in children.top().iter() {
4049 if contains_checkbox(child_handle, parser, checkbox) {
4050 return true;
4051 }
4052 }
4053 }
4054 }
4055 false
4056 }
4057
4058 #[allow(clippy::too_many_arguments)]
4059 fn render_li_content<'a>(
4060 node_handle: &tl::NodeHandle,
4061 parser: &'a tl::Parser<'a>,
4062 output: &mut String,
4063 options: &ConversionOptions,
4064 ctx: &Context,
4065 depth: usize,
4066 checkbox: &Option<tl::NodeHandle>,
4067 dom_ctx: &DomContext,
4068 ) {
4069 if is_checkbox_node(node_handle, checkbox) {
4070 return;
4071 }
4072
4073 if contains_checkbox(node_handle, parser, checkbox) {
4074 if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4075 let children = node_tag.children();
4076 {
4077 for child_handle in children.top().iter() {
4078 render_li_content(
4079 child_handle,
4080 parser,
4081 output,
4082 options,
4083 ctx,
4084 depth,
4085 checkbox,
4086 dom_ctx,
4087 );
4088 }
4089 }
4090 }
4091 } else {
4092 walk_node(node_handle, parser, output, options, ctx, depth, dom_ctx);
4093 }
4094 }
4095
4096 let mut task_text = String::new();
4097 let children = tag.children();
4098 {
4099 for child_handle in children.top().iter() {
4100 render_li_content(
4101 child_handle,
4102 parser,
4103 &mut task_text,
4104 options,
4105 &li_ctx,
4106 depth + 1,
4107 &checkbox_node,
4108 dom_ctx,
4109 );
4110 }
4111 }
4112 output.push(' ');
4113 let trimmed_task = task_text.trim();
4114 if !trimmed_task.is_empty() {
4115 output.push_str(trimmed_task);
4116 }
4117 } else {
4118 if !ctx.in_table_cell {
4119 if ctx.in_ordered_list {
4120 output.push_str(&format!("{}. ", ctx.list_counter));
4121 } else {
4122 let bullets: Vec<char> = options.bullets.chars().collect();
4123 let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
4124 let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
4125 output.push(bullet);
4126 output.push(' ');
4127 }
4128 }
4129
4130 let children = tag.children();
4131 {
4132 for child_handle in children.top().iter() {
4133 walk_node(child_handle, parser, output, options, &li_ctx, depth + 1, dom_ctx);
4134 }
4135 }
4136
4137 trim_trailing_whitespace(output);
4138 }
4139
4140 if !ctx.in_table_cell {
4141 if has_block_children || ctx.loose_list || ctx.prev_item_had_blocks {
4142 if !output.ends_with("\n\n") {
4143 if output.ends_with('\n') {
4144 output.push('\n');
4145 } else {
4146 output.push_str("\n\n");
4147 }
4148 }
4149 } else if !output.ends_with('\n') {
4150 output.push('\n');
4151 }
4152 }
4153 }
4154
4155 "table" => {
4156 let mut table_output = String::new();
4157 convert_table(node_handle, parser, &mut table_output, options, ctx, dom_ctx);
4158
4159 if ctx.in_list_item {
4160 let has_caption = table_output.starts_with('*');
4161
4162 if !has_caption {
4163 trim_trailing_whitespace(output);
4164 if !output.is_empty() && !output.ends_with('\n') {
4165 output.push('\n');
4166 }
4167 }
4168
4169 let indented = indent_table_for_list(&table_output, ctx.list_depth, options);
4170 output.push_str(&indented);
4171 } else {
4172 if !output.ends_with("\n\n") {
4173 if output.is_empty() || !output.ends_with('\n') {
4174 output.push_str("\n\n");
4175 } else {
4176 output.push('\n');
4177 }
4178 }
4179 output.push_str(&table_output);
4180 }
4181
4182 if !output.ends_with('\n') {
4183 output.push('\n');
4184 }
4185 }
4186
4187 "thead" | "tbody" | "tfoot" | "tr" | "th" | "td" => {}
4188
4189 "caption" => {
4190 let mut text = String::new();
4191 let children = tag.children();
4192 {
4193 for child_handle in children.top().iter() {
4194 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4195 }
4196 }
4197 let text = text.trim();
4198 if !text.is_empty() {
4199 let escaped_text = text.replace('-', r"\-");
4200 output.push('*');
4201 output.push_str(&escaped_text);
4202 output.push_str("*\n\n");
4203 }
4204 }
4205
4206 "colgroup" | "col" => {}
4207
4208 "article" | "section" | "nav" | "aside" | "header" | "footer" | "main" => {
4209 if ctx.convert_as_inline {
4210 let children = tag.children();
4211 {
4212 for child_handle in children.top().iter() {
4213 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4214 }
4215 }
4216 return;
4217 }
4218
4219 let mut content = String::with_capacity(256);
4220 let children = tag.children();
4221 {
4222 for child_handle in children.top().iter() {
4223 walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4224 }
4225 }
4226 if content.trim().is_empty() {
4227 return;
4228 }
4229
4230 if !output.is_empty() && !output.ends_with("\n\n") {
4231 output.push_str("\n\n");
4232 }
4233 output.push_str(&content);
4234 if content.ends_with('\n') && !content.ends_with("\n\n") {
4235 output.push('\n');
4236 } else if !content.ends_with('\n') {
4237 output.push_str("\n\n");
4238 }
4239 }
4240
4241 "figure" => {
4242 if ctx.convert_as_inline {
4243 let children = tag.children();
4244 {
4245 for child_handle in children.top().iter() {
4246 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4247 }
4248 }
4249 return;
4250 }
4251
4252 if !output.is_empty() && !output.ends_with("\n\n") {
4253 output.push_str("\n\n");
4254 }
4255
4256 let mut figure_content = String::new();
4257 let children = tag.children();
4258 {
4259 for child_handle in children.top().iter() {
4260 walk_node(child_handle, parser, &mut figure_content, options, ctx, depth, dom_ctx);
4261 }
4262 }
4263
4264 figure_content = figure_content.replace("\n;
4612 output.push_str(&src);
4613 output.push(')');
4614 if !ctx.in_paragraph && !ctx.convert_as_inline {
4615 output.push_str("\n\n");
4616 }
4617 }
4618
4619 let mut fallback = String::new();
4620 let children = tag.children();
4621 {
4622 for child_handle in children.top().iter() {
4623 let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4624 tag_name_eq(child_tag.name().as_utf8_str(), "source")
4625 } else {
4626 false
4627 };
4628
4629 if !is_source {
4630 walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4631 }
4632 }
4633 }
4634 if !fallback.is_empty() {
4635 output.push_str(fallback.trim());
4636 if !ctx.in_paragraph && !ctx.convert_as_inline {
4637 output.push_str("\n\n");
4638 }
4639 }
4640 }
4641
4642 "video" => {
4643 use std::borrow::Cow;
4644
4645 let src = tag
4646 .attributes()
4647 .get("src")
4648 .flatten()
4649 .map(|v| v.as_utf8_str())
4650 .or_else(|| {
4651 let children = tag.children();
4652 {
4653 for child_handle in children.top().iter() {
4654 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4655 if tag_name_eq(child_tag.name().as_utf8_str(), "source") {
4656 return child_tag
4657 .attributes()
4658 .get("src")
4659 .flatten()
4660 .map(|v| v.as_utf8_str());
4661 }
4662 }
4663 }
4664 }
4665 None
4666 })
4667 .unwrap_or(Cow::Borrowed(""));
4668
4669 if !src.is_empty() {
4670 output.push('[');
4671 output.push_str(&src);
4672 output.push_str("](");
4673 output.push_str(&src);
4674 output.push(')');
4675 if !ctx.in_paragraph && !ctx.convert_as_inline {
4676 output.push_str("\n\n");
4677 }
4678 }
4679
4680 let mut fallback = String::new();
4681 let children = tag.children();
4682 {
4683 for child_handle in children.top().iter() {
4684 let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4685 tag_name_eq(child_tag.name().as_utf8_str(), "source")
4686 } else {
4687 false
4688 };
4689
4690 if !is_source {
4691 walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4692 }
4693 }
4694 }
4695 if !fallback.is_empty() {
4696 output.push_str(fallback.trim());
4697 if !ctx.in_paragraph && !ctx.convert_as_inline {
4698 output.push_str("\n\n");
4699 }
4700 }
4701 }
4702
4703 "source" => {}
4704
4705 "picture" => {
4706 let children = tag.children();
4707 {
4708 for child_handle in children.top().iter() {
4709 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4710 if tag_name_eq(child_tag.name().as_utf8_str(), "img") {
4711 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4712 break;
4713 }
4714 }
4715 }
4716 }
4717 }
4718
4719 "iframe" => {
4720 use std::borrow::Cow;
4721
4722 let src = tag
4723 .attributes()
4724 .get("src")
4725 .flatten()
4726 .map(|v| v.as_utf8_str())
4727 .unwrap_or(Cow::Borrowed(""));
4728
4729 if !src.is_empty() {
4730 output.push('[');
4731 output.push_str(&src);
4732 output.push_str("](");
4733 output.push_str(&src);
4734 output.push(')');
4735 if !ctx.in_paragraph && !ctx.convert_as_inline {
4736 output.push_str("\n\n");
4737 }
4738 }
4739 }
4740
4741 "svg" => {
4742 let mut title = String::from("SVG Image");
4743 let children = tag.children();
4744 {
4745 for child_handle in children.top().iter() {
4746 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4747 if tag_name_eq(child_tag.name().as_utf8_str(), "title") {
4748 title = get_text_content(child_handle, parser).trim().to_string();
4749 break;
4750 }
4751 }
4752 }
4753 }
4754
4755 #[cfg(feature = "inline-images")]
4756 if let Some(ref collector_ref) = ctx.inline_collector {
4757 let title_opt = if title == "SVG Image" {
4758 None
4759 } else {
4760 Some(title.clone())
4761 };
4762 let mut attributes_map = BTreeMap::new();
4763 for (key, value_opt) in tag.attributes().iter() {
4764 let key_str = key.to_string();
4765 let keep = key_str == "width"
4766 || key_str == "height"
4767 || key_str == "filename"
4768 || key_str == "aria-label"
4769 || key_str.starts_with("data-");
4770 if keep {
4771 let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
4772 attributes_map.insert(key_str, value);
4773 }
4774 }
4775 handle_inline_svg(collector_ref, node_handle, parser, title_opt, attributes_map);
4776 }
4777
4778 if ctx.convert_as_inline {
4779 output.push_str(&title);
4780 } else {
4781 use base64::{Engine as _, engine::general_purpose::STANDARD};
4782
4783 let svg_html = serialize_element(node_handle, parser);
4784
4785 let base64_svg = STANDARD.encode(svg_html.as_bytes());
4786
4787 output.push_str(";
4790 output.push_str(&base64_svg);
4791 output.push(')');
4792 }
4793 }
4794
4795 "math" => {
4796 let text_content = get_text_content(node_handle, parser).trim().to_string();
4797
4798 if text_content.is_empty() {
4799 return;
4800 }
4801
4802 let math_html = serialize_element(node_handle, parser);
4803
4804 let escaped_text = text::escape(
4805 &text_content,
4806 options.escape_misc,
4807 options.escape_asterisks,
4808 options.escape_underscores,
4809 options.escape_ascii,
4810 );
4811
4812 let is_display_block = tag
4813 .attributes()
4814 .get("display")
4815 .flatten()
4816 .map(|v| v.as_utf8_str() == "block")
4817 .unwrap_or(false);
4818
4819 if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4820 output.push_str("\n\n");
4821 }
4822
4823 output.push_str("<!-- MathML: ");
4824 output.push_str(&math_html);
4825 output.push_str(" --> ");
4826 output.push_str(&escaped_text);
4827
4828 if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4829 output.push_str("\n\n");
4830 }
4831 }
4832
4833 "form" => {
4834 if ctx.convert_as_inline {
4835 let children = tag.children();
4836 {
4837 for child_handle in children.top().iter() {
4838 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4839 }
4840 }
4841 return;
4842 }
4843
4844 let mut content = String::new();
4845 let children = tag.children();
4846 {
4847 for child_handle in children.top().iter() {
4848 walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4849 }
4850 }
4851 let trimmed = content.trim();
4852 if !trimmed.is_empty() {
4853 if !output.is_empty() && !output.ends_with("\n\n") {
4854 output.push_str("\n\n");
4855 }
4856 output.push_str(trimmed);
4857 output.push_str("\n\n");
4858 }
4859 }
4860
4861 "fieldset" => {
4862 if ctx.convert_as_inline {
4863 let children = tag.children();
4864 {
4865 for child_handle in children.top().iter() {
4866 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4867 }
4868 }
4869 return;
4870 }
4871 let mut content = String::new();
4872 let children = tag.children();
4873 {
4874 for child_handle in children.top().iter() {
4875 walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4876 }
4877 }
4878 let trimmed = content.trim();
4879 if !trimmed.is_empty() {
4880 if !output.is_empty() && !output.ends_with("\n\n") {
4881 output.push_str("\n\n");
4882 }
4883 output.push_str(trimmed);
4884 output.push_str("\n\n");
4885 }
4886 }
4887
4888 "legend" => {
4889 let mut content = String::new();
4890 let mut legend_ctx = ctx.clone();
4891 if !ctx.convert_as_inline {
4892 legend_ctx.in_strong = true;
4893 }
4894 let children = tag.children();
4895 {
4896 for child_handle in children.top().iter() {
4897 walk_node(
4898 child_handle,
4899 parser,
4900 &mut content,
4901 options,
4902 &legend_ctx,
4903 depth + 1,
4904 dom_ctx,
4905 );
4906 }
4907 }
4908 let trimmed = content.trim();
4909 if !trimmed.is_empty() {
4910 if ctx.convert_as_inline {
4911 output.push_str(trimmed);
4912 } else {
4913 let symbol = options.strong_em_symbol.to_string().repeat(2);
4914 output.push_str(&symbol);
4915 output.push_str(trimmed);
4916 output.push_str(&symbol);
4917 output.push_str("\n\n");
4918 }
4919 }
4920 }
4921
4922 "label" => {
4923 let mut content = String::new();
4924 let children = tag.children();
4925 {
4926 for child_handle in children.top().iter() {
4927 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4928 }
4929 }
4930 let trimmed = content.trim();
4931 if !trimmed.is_empty() {
4932 output.push_str(trimmed);
4933 if !ctx.convert_as_inline {
4934 output.push_str("\n\n");
4935 }
4936 }
4937 }
4938
4939 "input" => {}
4940
4941 "textarea" => {
4942 let start_len = output.len();
4943 let children = tag.children();
4944 {
4945 for child_handle in children.top().iter() {
4946 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4947 }
4948 }
4949
4950 if !ctx.convert_as_inline && output.len() > start_len {
4951 output.push_str("\n\n");
4952 }
4953 }
4954
4955 "select" => {
4956 let start_len = output.len();
4957 let children = tag.children();
4958 {
4959 for child_handle in children.top().iter() {
4960 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4961 }
4962 }
4963
4964 if !ctx.convert_as_inline && output.len() > start_len {
4965 output.push('\n');
4966 }
4967 }
4968
4969 "option" => {
4970 let selected = tag.attributes().iter().any(|(name, _)| name.as_ref() == "selected");
4971
4972 let mut text = String::new();
4973 let children = tag.children();
4974 {
4975 for child_handle in children.top().iter() {
4976 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4977 }
4978 }
4979 let trimmed = text.trim();
4980 if !trimmed.is_empty() {
4981 if selected && !ctx.convert_as_inline {
4982 output.push_str("* ");
4983 }
4984 output.push_str(trimmed);
4985 if !ctx.convert_as_inline {
4986 output.push('\n');
4987 }
4988 }
4989 }
4990
4991 "optgroup" => {
4992 use std::borrow::Cow;
4993
4994 let label = tag
4995 .attributes()
4996 .get("label")
4997 .flatten()
4998 .map(|v| v.as_utf8_str())
4999 .unwrap_or(Cow::Borrowed(""));
5000
5001 if !label.is_empty() {
5002 let symbol = options.strong_em_symbol.to_string().repeat(2);
5003 output.push_str(&symbol);
5004 output.push_str(&label);
5005 output.push_str(&symbol);
5006 output.push('\n');
5007 }
5008
5009 let children = tag.children();
5010 {
5011 for child_handle in children.top().iter() {
5012 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5013 }
5014 }
5015 }
5016
5017 "button" => {
5018 let start_len = output.len();
5019 let children = tag.children();
5020 {
5021 for child_handle in children.top().iter() {
5022 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5023 }
5024 }
5025
5026 if !ctx.convert_as_inline && output.len() > start_len {
5027 output.push_str("\n\n");
5028 }
5029 }
5030
5031 "progress" => {
5032 let start_len = output.len();
5033 let children = tag.children();
5034 {
5035 for child_handle in children.top().iter() {
5036 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5037 }
5038 }
5039
5040 if !ctx.convert_as_inline && output.len() > start_len {
5041 output.push_str("\n\n");
5042 }
5043 }
5044
5045 "meter" => {
5046 let start_len = output.len();
5047 let children = tag.children();
5048 {
5049 for child_handle in children.top().iter() {
5050 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5051 }
5052 }
5053
5054 if !ctx.convert_as_inline && output.len() > start_len {
5055 output.push_str("\n\n");
5056 }
5057 }
5058
5059 "output" => {
5060 let start_len = output.len();
5061 let children = tag.children();
5062 {
5063 for child_handle in children.top().iter() {
5064 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5065 }
5066 }
5067
5068 if !ctx.convert_as_inline && output.len() > start_len {
5069 output.push_str("\n\n");
5070 }
5071 }
5072
5073 "datalist" => {
5074 let start_len = output.len();
5075 let children = tag.children();
5076 {
5077 for child_handle in children.top().iter() {
5078 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5079 }
5080 }
5081
5082 if !ctx.convert_as_inline && output.len() > start_len {
5083 output.push('\n');
5084 }
5085 }
5086
5087 "ruby" => {
5088 let ruby_ctx = ctx.clone();
5089
5090 let tag_sequence: Vec<String> = tag
5091 .children()
5092 .top()
5093 .iter()
5094 .filter_map(|child_handle| {
5095 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5096 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5097 if matches!(tag_name.as_ref(), "rb" | "rt" | "rtc") {
5098 Some(tag_name.into_owned())
5099 } else {
5100 None
5101 }
5102 } else {
5103 None
5104 }
5105 })
5106 .collect();
5107
5108 let has_rtc = tag_sequence.iter().any(|tag| tag == "rtc");
5109
5110 let is_interleaved = tag_sequence.windows(2).any(|w| w[0] == "rb" && w[1] == "rt");
5111
5112 if is_interleaved && !has_rtc {
5113 let mut current_base = String::new();
5114 let children = tag.children();
5115 {
5116 for child_handle in children.top().iter() {
5117 if let Some(node) = child_handle.get(parser) {
5118 match node {
5119 tl::Node::Tag(child_tag) => {
5120 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5121 if tag_name == "rt" {
5122 let mut annotation = String::new();
5123 walk_node(
5124 child_handle,
5125 parser,
5126 &mut annotation,
5127 options,
5128 &ruby_ctx,
5129 depth,
5130 dom_ctx,
5131 );
5132 if !current_base.is_empty() {
5133 output.push_str(current_base.trim());
5134 current_base.clear();
5135 }
5136 output.push_str(annotation.trim());
5137 } else if tag_name == "rb" {
5138 if !current_base.is_empty() {
5139 output.push_str(current_base.trim());
5140 current_base.clear();
5141 }
5142 walk_node(
5143 child_handle,
5144 parser,
5145 &mut current_base,
5146 options,
5147 &ruby_ctx,
5148 depth,
5149 dom_ctx,
5150 );
5151 } else if tag_name != "rp" {
5152 walk_node(
5153 child_handle,
5154 parser,
5155 &mut current_base,
5156 options,
5157 &ruby_ctx,
5158 depth,
5159 dom_ctx,
5160 );
5161 }
5162 }
5163 tl::Node::Raw(_) => {
5164 walk_node(
5165 child_handle,
5166 parser,
5167 &mut current_base,
5168 options,
5169 &ruby_ctx,
5170 depth,
5171 dom_ctx,
5172 );
5173 }
5174 _ => {}
5175 }
5176 }
5177 }
5178 }
5179 if !current_base.is_empty() {
5180 output.push_str(current_base.trim());
5181 }
5182 } else {
5183 let mut base_text = String::new();
5184 let mut rt_annotations = Vec::new();
5185 let mut rtc_content = String::new();
5186
5187 let children = tag.children();
5188 {
5189 for child_handle in children.top().iter() {
5190 if let Some(node) = child_handle.get(parser) {
5191 match node {
5192 tl::Node::Tag(child_tag) => {
5193 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5194 if tag_name == "rt" {
5195 let mut annotation = String::new();
5196 walk_node(
5197 child_handle,
5198 parser,
5199 &mut annotation,
5200 options,
5201 &ruby_ctx,
5202 depth,
5203 dom_ctx,
5204 );
5205 rt_annotations.push(annotation);
5206 } else if tag_name == "rtc" {
5207 walk_node(
5208 child_handle,
5209 parser,
5210 &mut rtc_content,
5211 options,
5212 &ruby_ctx,
5213 depth,
5214 dom_ctx,
5215 );
5216 } else if tag_name != "rp" {
5217 walk_node(
5218 child_handle,
5219 parser,
5220 &mut base_text,
5221 options,
5222 &ruby_ctx,
5223 depth,
5224 dom_ctx,
5225 );
5226 }
5227 }
5228 tl::Node::Raw(_) => {
5229 walk_node(
5230 child_handle,
5231 parser,
5232 &mut base_text,
5233 options,
5234 &ruby_ctx,
5235 depth,
5236 dom_ctx,
5237 );
5238 }
5239 _ => {}
5240 }
5241 }
5242 }
5243 }
5244
5245 let trimmed_base = base_text.trim();
5246
5247 output.push_str(trimmed_base);
5248
5249 if !rt_annotations.is_empty() {
5250 let rt_text = rt_annotations.iter().map(|s| s.trim()).collect::<Vec<_>>().join("");
5251 if !rt_text.is_empty() {
5252 if has_rtc && !rtc_content.trim().is_empty() && rt_annotations.len() > 1 {
5253 output.push('(');
5254 output.push_str(&rt_text);
5255 output.push(')');
5256 } else {
5257 output.push_str(&rt_text);
5258 }
5259 }
5260 }
5261
5262 if !rtc_content.trim().is_empty() {
5263 output.push_str(rtc_content.trim());
5264 }
5265 }
5266 }
5267
5268 "rb" => {
5269 let mut text = String::new();
5270 let children = tag.children();
5271 {
5272 for child_handle in children.top().iter() {
5273 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5274 }
5275 }
5276 output.push_str(text.trim());
5277 }
5278
5279 "rt" => {
5280 let mut text = String::new();
5281 let children = tag.children();
5282 {
5283 for child_handle in children.top().iter() {
5284 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5285 }
5286 }
5287 let trimmed = text.trim();
5288
5289 if output.ends_with('(') {
5290 output.push_str(trimmed);
5291 } else {
5292 output.push('(');
5293 output.push_str(trimmed);
5294 output.push(')');
5295 }
5296 }
5297
5298 "rp" => {
5299 let mut content = String::new();
5300 let children = tag.children();
5301 {
5302 for child_handle in children.top().iter() {
5303 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
5304 }
5305 }
5306 let trimmed = content.trim();
5307 if !trimmed.is_empty() {
5308 output.push_str(trimmed);
5309 }
5310 }
5311
5312 "rtc" => {
5313 let children = tag.children();
5314 {
5315 for child_handle in children.top().iter() {
5316 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5317 }
5318 }
5319 }
5320
5321 "div" => {
5322 if ctx.convert_as_inline {
5323 let children = tag.children();
5324 {
5325 for child_handle in children.top().iter() {
5326 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5327 }
5328 }
5329 return;
5330 }
5331
5332 let content_start_pos = output.len();
5333
5334 let is_table_continuation =
5335 ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
5336
5337 let is_list_continuation = ctx.in_list_item
5338 && !output.is_empty()
5339 && !output.ends_with("* ")
5340 && !output.ends_with("- ")
5341 && !output.ends_with(". ");
5342
5343 let needs_leading_sep = !ctx.in_table_cell
5344 && !ctx.in_list_item
5345 && !ctx.convert_as_inline
5346 && !output.is_empty()
5347 && !output.ends_with("\n\n");
5348
5349 if is_table_continuation {
5350 trim_trailing_whitespace(output);
5351 output.push_str("<br>");
5352 } else if is_list_continuation {
5353 add_list_continuation_indent(output, ctx.list_depth, false, options);
5354 } else if needs_leading_sep {
5355 trim_trailing_whitespace(output);
5356 output.push_str("\n\n");
5357 }
5358
5359 let children = tag.children();
5360 {
5361 for child_handle in children.top().iter() {
5362 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5363 }
5364 }
5365
5366 let has_content = output.len() > content_start_pos;
5367
5368 if has_content {
5369 if content_start_pos == 0 && output.starts_with('\n') && !output.starts_with("\n\n") {
5370 output.remove(0);
5371 }
5372 trim_trailing_whitespace(output);
5373
5374 if ctx.in_table_cell {
5375 } else if ctx.in_list_item {
5376 if is_list_continuation {
5377 if !output.ends_with('\n') {
5378 output.push('\n');
5379 }
5380 } else if !output.ends_with("\n\n") {
5381 if output.ends_with('\n') {
5382 output.push('\n');
5383 } else {
5384 output.push_str("\n\n");
5385 }
5386 }
5387 } else if !ctx.in_list_item && !ctx.convert_as_inline {
5388 if output.ends_with("\n\n") {
5389 } else if output.ends_with('\n') {
5390 output.push('\n');
5391 } else {
5392 output.push_str("\n\n");
5393 }
5394 }
5395 }
5396 }
5397
5398 "head" => {
5399 let children = tag.children();
5400 let has_body_like = children.top().iter().any(|child_handle| {
5401 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5402 let child_name = normalized_tag_name(child_tag.name().as_utf8_str());
5403 matches!(
5404 child_name.as_ref(),
5405 "body" | "main" | "article" | "section" | "div" | "p"
5406 )
5407 } else {
5408 false
5409 }
5410 });
5411
5412 if has_body_like {
5413 for child_handle in children.top().iter() {
5414 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5415 }
5416 }
5417 }
5418
5419 "script" =>
5420 {
5421 #[cfg(feature = "metadata")]
5422 if let Some(type_attr) = tag.attributes().get("type").flatten() {
5423 if type_attr.as_utf8_str() == "application/ld+json" {
5424 if let Some(ref collector) = ctx.metadata_collector {
5425 let json = get_text_content(node_handle, parser);
5426 collector.borrow_mut().add_json_ld(json);
5427 }
5428 }
5429 }
5430 }
5431 "style" => {}
5432
5433 "span" => {
5434 let is_hocr_word = tag.attributes().iter().any(|(name, value)| {
5435 name.as_ref() == "class" && value.as_ref().is_some_and(|v| v.as_ref().contains("ocrx_word"))
5436 });
5437
5438 if is_hocr_word
5439 && !output.is_empty()
5440 && !output.ends_with(' ')
5441 && !output.ends_with('\t')
5442 && !output.ends_with('\n')
5443 {
5444 output.push(' ');
5445 }
5446
5447 if !ctx.in_code
5448 && options.whitespace_mode == crate::options::WhitespaceMode::Normalized
5449 && output.ends_with('\n')
5450 && !output.ends_with("\n\n")
5451 {
5452 output.pop();
5453 }
5454
5455 let children = tag.children();
5456 {
5457 for child_handle in children.top().iter() {
5458 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5459 }
5460 }
5461 }
5462
5463 _ => {
5464 let len_before = output.len();
5465 let had_trailing_space = output.ends_with(' ');
5466
5467 let children = tag.children();
5468 {
5469 for child_handle in children.top().iter() {
5470 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5471 }
5472 }
5473
5474 let len_after = output.len();
5475 if len_after > len_before {
5476 let start_idx = if output.is_char_boundary(len_before) {
5477 len_before
5478 } else {
5479 let capped = len_before.min(output.len());
5480 output
5481 .char_indices()
5482 .map(|(idx, _)| idx)
5483 .take_while(|idx| *idx <= capped)
5484 .last()
5485 .unwrap_or(capped)
5486 };
5487
5488 let added_content = output[start_idx..].to_string();
5489 if options.debug {
5490 eprintln!(
5491 "[DEBUG] <{}> added {:?}, trim={:?}, had_trailing_space={}",
5492 tag_name,
5493 added_content,
5494 added_content.trim(),
5495 had_trailing_space
5496 );
5497 }
5498
5499 let is_code_block = added_content.starts_with(" ")
5500 || added_content.starts_with("```")
5501 || added_content.starts_with("~~~");
5502
5503 if options.debug && added_content.trim().is_empty() {
5504 eprintln!(
5505 "[DEBUG] Whitespace-only content, is_code_block={}, will_truncate={}",
5506 is_code_block, !is_code_block
5507 );
5508 }
5509
5510 if added_content.trim().is_empty() && !is_code_block {
5511 output.truncate(start_idx);
5512 if !had_trailing_space && added_content.contains(' ') {
5513 output.push(' ');
5514 }
5515 if options.debug {
5516 eprintln!(
5517 "[DEBUG] Truncated, output now ends with space: {}",
5518 output.ends_with(' ')
5519 );
5520 }
5521 }
5522 }
5523 }
5524 }
5525 }
5526
5527 tl::Node::Comment(_) => {}
5528 }
5529}
5530
5531fn get_colspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5533 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5534 if let Some(Some(bytes)) = tag.attributes().get("colspan") {
5535 if let Ok(colspan) = bytes.as_utf8_str().parse::<usize>() {
5536 return colspan;
5537 }
5538 }
5539 }
5540 1
5541}
5542
5543fn get_colspan_rowspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> (usize, usize) {
5545 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5546 let attrs = tag.attributes();
5547 let colspan = attrs
5548 .get("colspan")
5549 .flatten()
5550 .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5551 .unwrap_or(1);
5552 let rowspan = attrs
5553 .get("rowspan")
5554 .flatten()
5555 .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5556 .unwrap_or(1);
5557 (colspan, rowspan)
5558 } else {
5559 (1, 1)
5560 }
5561}
5562
5563fn convert_table_cell(
5565 node_handle: &tl::NodeHandle,
5566 parser: &tl::Parser,
5567 output: &mut String,
5568 options: &ConversionOptions,
5569 ctx: &Context,
5570 _tag_name: &str,
5571 dom_ctx: &DomContext,
5572) {
5573 let mut text = String::with_capacity(128);
5574
5575 let cell_ctx = Context {
5576 in_table_cell: true,
5577 ..ctx.clone()
5578 };
5579
5580 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5581 let children = tag.children();
5582 {
5583 for child_handle in children.top().iter() {
5584 walk_node(child_handle, parser, &mut text, options, &cell_ctx, 0, dom_ctx);
5585 }
5586 }
5587 }
5588
5589 let text = text.trim();
5590 let text = if options.br_in_tables {
5591 text.split('\n')
5592 .filter(|s| !s.is_empty())
5593 .collect::<Vec<_>>()
5594 .join("<br>")
5595 } else {
5596 text.replace('\n', " ")
5597 };
5598
5599 let colspan = get_colspan(node_handle, parser);
5600
5601 output.push(' ');
5602 output.push_str(&text);
5603 output.push_str(&" |".repeat(colspan));
5604}
5605
5606#[allow(clippy::too_many_arguments)]
5608fn convert_table_row(
5609 node_handle: &tl::NodeHandle,
5610 parser: &tl::Parser,
5611 output: &mut String,
5612 options: &ConversionOptions,
5613 ctx: &Context,
5614 row_index: usize,
5615 rowspan_tracker: &mut std::collections::HashMap<usize, (String, usize)>,
5616 dom_ctx: &DomContext,
5617) {
5618 let mut row_text = String::with_capacity(256);
5619 let mut cells = Vec::new();
5620
5621 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5622 let children = tag.children();
5623 {
5624 for child_handle in children.top().iter() {
5625 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5626 let cell_name = normalized_tag_name(child_tag.name().as_utf8_str());
5627 if cell_name == "th" || cell_name == "td" {
5628 cells.push(*child_handle);
5629 }
5630 }
5631 }
5632 }
5633 }
5634
5635 let mut col_index = 0;
5636 let mut cell_iter = cells.iter();
5637
5638 loop {
5639 if let Some((_content, remaining_rows)) = rowspan_tracker.get_mut(&col_index) {
5640 if *remaining_rows > 0 {
5641 row_text.push(' ');
5642 row_text.push_str(" |");
5643 *remaining_rows -= 1;
5644 if *remaining_rows == 0 {
5645 rowspan_tracker.remove(&col_index);
5646 }
5647 col_index += 1;
5648 continue;
5649 }
5650 }
5651
5652 if let Some(cell_handle) = cell_iter.next() {
5653 let cell_start = row_text.len();
5654 convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
5655
5656 let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
5657
5658 if rowspan > 1 {
5659 let cell_text = &row_text[cell_start..];
5660 let cell_content = cell_text
5661 .trim_start_matches(' ')
5662 .trim_end_matches(" |")
5663 .trim()
5664 .to_string();
5665 rowspan_tracker.insert(col_index, (cell_content, rowspan - 1));
5666 }
5667
5668 col_index += colspan;
5669 } else {
5670 break;
5671 }
5672 }
5673
5674 output.push('|');
5675 output.push_str(&row_text);
5676 output.push('\n');
5677
5678 let is_first_row = row_index == 0;
5679 if is_first_row {
5680 let total_cols = cells.iter().map(|h| get_colspan(h, parser)).sum::<usize>().max(1);
5681 output.push_str("| ");
5682 for i in 0..total_cols {
5683 if i > 0 {
5684 output.push_str(" | ");
5685 }
5686 output.push_str("---");
5687 }
5688 output.push_str(" |\n");
5689 }
5690}
5691
5692fn table_has_header(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5693 if let Some(node) = node_handle.get(parser) {
5694 if let tl::Node::Tag(tag) = node {
5695 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5696 if tag_name.as_ref() == "th" {
5697 return true;
5698 }
5699 let children = tag.children();
5700 for child in children.top().iter() {
5701 if table_has_header(child, parser) {
5702 return true;
5703 }
5704 }
5705 }
5706 }
5707 false
5708}
5709
5710fn table_has_caption(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5711 if let Some(node) = node_handle.get(parser) {
5712 if let tl::Node::Tag(tag) = node {
5713 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5714 if tag_name.as_ref() == "caption" {
5715 return true;
5716 }
5717 let children = tag.children();
5718 for child in children.top().iter() {
5719 if table_has_caption(child, parser) {
5720 return true;
5721 }
5722 }
5723 }
5724 }
5725 false
5726}
5727
5728fn table_contains_nested_table(node_handle: &tl::NodeHandle, parser: &tl::Parser, is_root: bool) -> bool {
5729 if let Some(node) = node_handle.get(parser) {
5730 if let tl::Node::Tag(tag) = node {
5731 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5732 if !is_root && tag_name.as_ref() == "table" {
5733 return true;
5734 }
5735
5736 for child in tag.children().top().iter() {
5737 if table_contains_nested_table(child, parser, false) {
5738 return true;
5739 }
5740 }
5741 }
5742 }
5743 false
5744}
5745
5746fn collect_table_row_counts(
5747 node_handle: &tl::NodeHandle,
5748 parser: &tl::Parser,
5749 counts: &mut Vec<usize>,
5750 has_span: &mut bool,
5751) {
5752 if let Some(node) = node_handle.get(parser) {
5753 if let tl::Node::Tag(tag) = node {
5754 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5755 match tag_name.as_ref() {
5756 "tr" => {
5757 let mut cell_count = 0;
5758 for child in tag.children().top().iter() {
5759 if let Some(tl::Node::Tag(cell_tag)) = child.get(parser) {
5760 let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5761 if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5762 cell_count += 1;
5763 let attrs = cell_tag.attributes();
5764 if attrs.get("colspan").is_some() || attrs.get("rowspan").is_some() {
5765 *has_span = true;
5766 }
5767 }
5768 }
5769 }
5770 counts.push(cell_count);
5771 }
5772 _ => {
5773 for child in tag.children().top().iter() {
5774 collect_table_row_counts(child, parser, counts, has_span);
5775 }
5776 }
5777 }
5778 }
5779 }
5780}
5781
5782fn count_links(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5783 let mut total = 0;
5784 if let Some(node) = node_handle.get(parser) {
5785 if let tl::Node::Tag(tag) = node {
5786 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5787 if tag_name.as_ref() == "a" {
5788 total += 1;
5789 }
5790
5791 for child in tag.children().top().iter() {
5792 total += count_links(child, parser);
5793 }
5794 }
5795 }
5796 total
5797}
5798
5799fn append_layout_row(
5800 row_handle: &tl::NodeHandle,
5801 parser: &tl::Parser,
5802 output: &mut String,
5803 options: &ConversionOptions,
5804 ctx: &Context,
5805 dom_ctx: &DomContext,
5806) {
5807 if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5808 let mut row_text = String::new();
5809 let row_children = row_tag.children();
5810 for cell_handle in row_children.top().iter() {
5811 if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
5812 let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5813 if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5814 let mut cell_text = String::new();
5815 let cell_ctx = Context {
5816 convert_as_inline: true,
5817 ..ctx.clone()
5818 };
5819 let cell_children = cell_tag.children();
5820 for cell_child in cell_children.top().iter() {
5821 walk_node(cell_child, parser, &mut cell_text, options, &cell_ctx, 0, dom_ctx);
5822 }
5823 let cell_content = text::normalize_whitespace(&cell_text);
5824 if !cell_content.trim().is_empty() {
5825 if !row_text.is_empty() {
5826 row_text.push(' ');
5827 }
5828 row_text.push_str(cell_content.trim());
5829 }
5830 }
5831 }
5832 }
5833
5834 let trimmed = row_text.trim();
5835 if !trimmed.is_empty() {
5836 if !output.is_empty() && !output.ends_with('\n') {
5837 output.push('\n');
5838 }
5839 let formatted = trimmed.strip_prefix("- ").unwrap_or(trimmed).trim_start();
5840 output.push_str("- ");
5841 output.push_str(formatted);
5842 output.push('\n');
5843 }
5844 }
5845}
5846
5847fn indent_table_for_list(table_content: &str, list_depth: usize, options: &ConversionOptions) -> String {
5849 if list_depth == 0 {
5850 return table_content.to_string();
5851 }
5852
5853 let Some(mut indent) = continuation_indent_string(list_depth, options) else {
5854 return table_content.to_string();
5855 };
5856
5857 if matches!(options.list_indent_type, ListIndentType::Spaces) {
5858 let space_count = indent.chars().filter(|c| *c == ' ').count();
5859 if space_count < 4 {
5860 indent.push_str(&" ".repeat(4 - space_count));
5861 }
5862 }
5863
5864 let mut result = String::with_capacity(table_content.len() + indent.len() * 4);
5865 for segment in table_content.split_inclusive('\n') {
5866 if segment.starts_with('|') {
5867 result.push_str(&indent);
5868 result.push_str(segment);
5869 } else {
5870 result.push_str(segment);
5871 }
5872 }
5873 result
5874}
5875
5876fn convert_table(
5878 node_handle: &tl::NodeHandle,
5879 parser: &tl::Parser,
5880 output: &mut String,
5881 options: &ConversionOptions,
5882 ctx: &Context,
5883 dom_ctx: &DomContext,
5884) {
5885 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5886 let mut row_counts = Vec::new();
5887 let mut has_span = false;
5888 collect_table_row_counts(node_handle, parser, &mut row_counts, &mut has_span);
5889
5890 let row_count = row_counts.len();
5891 let mut distinct_counts: Vec<_> = row_counts.into_iter().filter(|c| *c > 0).collect();
5892 distinct_counts.sort_unstable();
5893 distinct_counts.dedup();
5894
5895 let looks_like_layout =
5896 table_contains_nested_table(node_handle, parser, true) || has_span || distinct_counts.len() > 1;
5897 let link_count = count_links(node_handle, parser);
5898 let table_text = text::normalize_whitespace(&get_text_content(node_handle, parser));
5899 let is_blank_table = table_text.trim().is_empty();
5900
5901 if !table_has_header(node_handle, parser)
5902 && !table_has_caption(node_handle, parser)
5903 && (looks_like_layout || is_blank_table || (row_count <= 2 && link_count >= 3))
5904 {
5905 if is_blank_table {
5906 return;
5907 }
5908
5909 let table_children = tag.children();
5910 for child_handle in table_children.top().iter() {
5911 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5912 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5913 match tag_name.as_ref() {
5914 "thead" | "tbody" | "tfoot" => {
5915 for row_handle in child_tag.children().top().iter() {
5916 if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5917 if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
5918 append_layout_row(row_handle, parser, output, options, ctx, dom_ctx);
5919 }
5920 }
5921 }
5922 }
5923 "tr" => append_layout_row(child_handle, parser, output, options, ctx, dom_ctx),
5924 _ => {}
5925 }
5926 }
5927 }
5928 if !output.ends_with('\n') {
5929 output.push('\n');
5930 }
5931 return;
5932 }
5933
5934 let mut row_index = 0;
5935 let mut rowspan_tracker = std::collections::HashMap::new();
5936
5937 let children = tag.children();
5938 {
5939 for child_handle in children.top().iter() {
5940 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5941 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5942
5943 match tag_name.as_ref() {
5944 "caption" => {
5945 let mut text = String::new();
5946 let grandchildren = child_tag.children();
5947 {
5948 for grandchild_handle in grandchildren.top().iter() {
5949 walk_node(grandchild_handle, parser, &mut text, options, ctx, 0, dom_ctx);
5950 }
5951 }
5952 let text = text.trim();
5953 if !text.is_empty() {
5954 let escaped_text = text.replace('-', r"\-");
5955 output.push('*');
5956 output.push_str(&escaped_text);
5957 output.push_str("*\n\n");
5958 }
5959 }
5960
5961 "thead" | "tbody" | "tfoot" => {
5962 let section_children = child_tag.children();
5963 {
5964 for row_handle in section_children.top().iter() {
5965 if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5966 if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
5967 convert_table_row(
5968 row_handle,
5969 parser,
5970 output,
5971 options,
5972 ctx,
5973 row_index,
5974 &mut rowspan_tracker,
5975 dom_ctx,
5976 );
5977 row_index += 1;
5978 }
5979 }
5980 }
5981 }
5982 }
5983
5984 "tr" => {
5985 convert_table_row(
5986 child_handle,
5987 parser,
5988 output,
5989 options,
5990 ctx,
5991 row_index,
5992 &mut rowspan_tracker,
5993 dom_ctx,
5994 );
5995 row_index += 1;
5996 }
5997
5998 "colgroup" | "col" => {}
5999
6000 _ => {}
6001 }
6002 }
6003 }
6004 }
6005 }
6006}
6007
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::HighlightStyle;

    /// Default options with metadata extraction switched off, so assertions can
    /// compare against the Markdown body only.
    fn options_without_metadata() -> ConversionOptions {
        ConversionOptions {
            extract_metadata: false,
            ..Default::default()
        }
    }

    // ---- whitespace helpers -------------------------------------------------

    #[test]
    fn test_trim_trailing_whitespace() {
        let cases = [
            ("hello ", "hello"),
            ("hello\t\t", "hello"),
            ("hello \t \t", "hello"),
            ("hello", "hello"),
            ("", ""),
            // A trailing newline is not horizontal whitespace and must survive.
            ("hello\n", "hello\n"),
        ];

        for (input, expected) in cases {
            let mut s = String::from(input);
            trim_trailing_whitespace(&mut s);
            assert_eq!(s, expected);
        }
    }

    #[test]
    fn test_chomp_preserves_boundary_spaces() {
        assert_eq!(chomp_inline(" text "), (" ", " ", "text"));
        assert_eq!(chomp_inline("text"), ("", "", "text"));
        assert_eq!(chomp_inline(" text"), (" ", "", "text"));
        assert_eq!(chomp_inline("text "), ("", " ", "text"));
        assert_eq!(chomp_inline(" "), (" ", " ", ""));
        assert_eq!(chomp_inline(""), ("", "", ""));
    }

    // ---- nested emphasis ----------------------------------------------------

    #[test]
    fn nested_strong_markup_is_normalized() {
        let html = "<strong><strong>Bold</strong></strong>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "**Bold**");
    }

    #[test]
    fn nested_strong_with_additional_text_is_normalized() {
        let html = "<strong>Hello <strong>World</strong></strong>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "**Hello World**");
    }

    #[test]
    fn nested_strong_partial_segments_are_normalized() {
        let html = "<b>bo<b>ld</b>er</b>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "**bolder**");
    }

    #[test]
    fn summary_with_inner_strong_is_not_double_wrapped() {
        let html = "<details><summary><strong>Title</strong></summary></details>";
        let mut options = ConversionOptions::default();
        options.preprocessing.remove_forms = false;
        let result = convert_html(html, &options).unwrap();
        assert_eq!(result.trim(), "**Title**");
    }

    #[test]
    fn legend_with_inner_strong_is_not_double_wrapped() {
        let html = "<fieldset><legend><strong>Section</strong></legend></fieldset>";
        let mut options = ConversionOptions::default();
        options.preprocessing.remove_forms = false;
        let result = convert_html(html, &options).unwrap();
        assert_eq!(result.trim(), "**Section**");
    }

    // ---- preprocessing ------------------------------------------------------

    #[test]
    fn preprocessing_keeps_article_header_inside_main() {
        let html = r#"
            <body>
                <header class="global-header">
                    <div>Global Navigation</div>
                </header>
                <main>
                    <header class="article-header">
                        <h1>Primary Title</h1>
                    </header>
                    <p>Body content stays.</p>
                </main>
            </body>
        "#;
        let mut options = ConversionOptions::default();
        options.preprocessing.enabled = true;
        let result = convert_html(html, &options).unwrap();
        assert!(
            result.contains("Primary Title"),
            "article header was removed: {}",
            result
        );
        assert!(
            result.contains("Body content stays"),
            "main body content missing: {}",
            result
        );
        assert!(
            !result.contains("Global Navigation"),
            "site chrome unexpectedly rendered: {}",
            result
        );
    }

    #[test]
    fn preprocessing_drops_nav_but_keeps_body() {
        let html = r##"
            <main>
                <nav aria-label="Primary navigation">
                    <a href="#a">NavOnly</a>
                </nav>
                <article>
                    <p>Important narrative</p>
                </article>
            </main>
        "##;
        let mut options = ConversionOptions::default();
        options.preprocessing.enabled = true;
        let result = convert_html(html, &options).unwrap();
        assert!(
            !result.contains("NavOnly"),
            "navigation text should not appear: {}",
            result
        );
        assert!(
            result.contains("Important narrative"),
            "article text should remain: {}",
            result
        );
    }

    #[test]
    fn preprocessing_retains_section_headers_inside_articles() {
        let html = r#"
            <article>
                <header>
                    <h2>Section Heading</h2>
                </header>
                <section>
                    <p>Section body</p>
                </section>
            </article>
        "#;
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Section Heading"),
            "section heading was stripped: {}",
            result
        );
        assert!(result.contains("Section body"), "section body missing: {}", result);
    }

    // ---- misc conversion behaviour -----------------------------------------

    #[test]
    fn bold_highlight_suppresses_nested_strong() {
        let options = ConversionOptions {
            highlight_style: HighlightStyle::Bold,
            ..Default::default()
        };
        let html = "<p><mark><strong>Hot</strong></mark></p>";
        let result = convert_html(html, &options).unwrap();
        assert_eq!(result.trim(), "**Hot**");
    }

    #[test]
    fn atx_heading_swallows_layout_line_breaks() {
        let html = r#"<h2>
 Heading
 Text
 with
 Line
 Breaks
</h2>"#;
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "## Heading Text with Line Breaks");
    }

    #[test]
    fn doctype_is_removed() {
        let html = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
 <html>
 <head><title>Example</title></head>
 <body><p>Hello World</p></body>
 </html>"#;
        let result = convert_html(html, &options_without_metadata()).unwrap();
        assert_eq!(result.trim(), "Hello World");
    }

    #[test]
    fn test_calculate_list_continuation_indent() {
        let expectations = [(0, 0), (1, 1), (2, 3), (3, 5), (4, 7)];
        for (depth, expected) in expectations {
            assert_eq!(calculate_list_continuation_indent(depth), expected);
        }
    }

    #[test]
    fn strips_script_sections_without_removing_following_content() {
        let input = "<div>before</div><script>1 < 2</script><p>after</p>";
        let stripped = strip_script_and_style_sections(input);
        assert_eq!(stripped, "<div>before</div><script></script><p>after</p>");
    }

    #[test]
    fn strips_multiline_script_sections() {
        let input = "<html>\n<script>1 < 2</script>\nContent\n</html>";
        let stripped = strip_script_and_style_sections(input);
        assert!(stripped.contains("Content"));
        assert!(stripped.contains("<script"));
        assert!(!stripped.contains("1 < 2"));
    }

    #[test]
    fn hr_inside_paragraph_matches_inline_expectation() {
        let markdown = convert_html("<p>Hello<hr>World</p>", &options_without_metadata()).unwrap();
        assert_eq!(markdown, "Hello\n---\nWorld\n");
    }

    #[test]
    fn hr_inside_paragraph_matches_inline_expectation_via_public_api() {
        let markdown = crate::convert("<p>Hello<hr>World</p>", Some(options_without_metadata())).unwrap();
        assert_eq!(markdown, "Hello\n---\nWorld\n");
    }

    // ---- list continuation --------------------------------------------------

    #[test]
    fn test_add_list_continuation_indent_blank_line() {
        let opts = ConversionOptions::default();
        let mut output = String::from("* First para");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First para\n\n ");

        let mut output = String::from("* First para\n");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First para\n\n ");

        let mut output = String::from("* First para\n\n");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First para\n\n ");

        let mut output = String::from("* First para");
        add_list_continuation_indent(&mut output, 2, true, &opts);
        assert_eq!(output, "* First para\n\n ");
    }

    #[test]
    fn test_add_list_continuation_indent_single_line() {
        let opts = ConversionOptions::default();
        let mut output = String::from("* First div");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First div\n ");

        let mut output = String::from("* First div\n");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First div\n ");

        let mut output = String::from("* First div\n");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First div\n ");
    }

    #[test]
    fn test_trim_trailing_whitespace_in_continuation() {
        let opts = ConversionOptions::default();
        let mut output = String::from("* First ");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First\n\n ");

        let mut output = String::from("* First\t\t");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First\n ");
    }

    // ---- malformed angle brackets ------------------------------------------
    //
    // NOTE(review): the expectations below assume `escape_malformed_angle_brackets`
    // rewrites a stray `<` as the `&lt;` entity while leaving real tags, comments and
    // doctypes untouched — confirm against the function's implementation.

    #[test]
    fn test_escape_malformed_angle_brackets_bare() {
        let input = "1<2";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "1&lt;2");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_in_text() {
        let input = "<html>1<2 Content</html>";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<html>1&lt;2 Content</html>");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_multiple() {
        let input = "1 < 2 < 3";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "1 &lt; 2 &lt; 3");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
        let input = "<div>content</div>";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<div>content</div>");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_mixed() {
        let input = "<div>1<2</div><p>3<4</p>";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<div>1&lt;2</div><p>3&lt;4</p>");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_at_end() {
        let input = "test<";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "test&lt;");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_comments() {
        let input = "<!-- comment -->1<2";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<!-- comment -->1&lt;2");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_doctype() {
        let input = "<!DOCTYPE html>1<2";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<!DOCTYPE html>1&lt;2");
    }

    #[test]
    fn test_convert_with_malformed_angle_brackets() {
        let html = "<html>1<2\nContent</html>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Content"),
            "Result should contain 'Content': {:?}",
            result
        );
        // Either spelling of the comparison is acceptable in the Markdown output.
        assert!(
            result.contains("1<2") || result.contains("1&lt;2"),
            "Result should contain escaped or unescaped comparison"
        );
    }

    #[test]
    fn test_convert_with_malformed_angle_brackets_in_div() {
        let html = "<html><div>1<2</div><div>Content</div></html>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Content"),
            "Result should contain 'Content': {:?}",
            result
        );
    }

    #[test]
    fn test_convert_with_multiple_malformed_angle_brackets() {
        let html = "<html>1 < 2 < 3<p>Content</p></html>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Content"),
            "Result should contain 'Content': {:?}",
            result
        );
    }

    // ---- preserve_tags ------------------------------------------------------

    #[test]
    fn test_preserve_tags_simple_table() {
        let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table>"), "Should preserve table tag");
        assert!(result.contains("</table>"), "Should have closing table tag");
        assert!(result.contains("<tr>"), "Should preserve tr tag");
        assert!(result.contains("<td>"), "Should preserve td tag");
        assert!(result.contains("Text"), "Should convert other elements");
    }

    #[test]
    fn test_preserve_tags_with_attributes() {
        let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table"), "Should preserve table tag");
        assert!(result.contains("class="), "Should preserve class attribute");
        assert!(result.contains("id="), "Should preserve id attribute");
        assert!(result.contains("</table>"), "Should have closing tag");
    }

    #[test]
    fn test_preserve_tags_multiple_tags() {
        let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string(), "form".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table>"), "Should preserve table");
        assert!(result.contains("<form>"), "Should preserve form");
        assert!(result.contains("Text"), "Should convert paragraph");
    }

    #[test]
    fn test_preserve_tags_nested_content() {
        let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<thead>"), "Should preserve nested thead");
        assert!(result.contains("<tbody>"), "Should preserve nested tbody");
        assert!(result.contains("<th>"), "Should preserve th tag");
        assert!(result.contains("Header"), "Should preserve text content");
    }

    #[test]
    fn test_preserve_tags_empty_list() {
        let html = r#"<table><tr><td>Cell</td></tr></table>"#;
        let options = ConversionOptions::default();
        let result = convert_html(html, &options).unwrap();

        assert!(
            !result.contains("<table>"),
            "Should not preserve table without preserve_tags"
        );
    }

    #[test]
    fn test_preserve_tags_vs_strip_tags() {
        let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            strip_tags: vec!["span".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table>"), "Should preserve table");
        assert!(!result.contains("<span>"), "Should strip span tag");
        assert!(result.contains("Text"), "Should keep span text content");
    }

    #[test]
    fn example_com_remains_visible() {
        let html = "<!doctype html><html lang=\"en\"><head><title>Example Domain</title><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href=\"https://iana.org/domains/example\">Learn more</a></div></body></html>";

        let result = convert_html(html, &options_without_metadata()).unwrap();

        assert!(
            result.contains("Example Domain"),
            "content unexpectedly missing: {}",
            result
        );
    }
}
/// Input without self-closing tags must come back borrowed and unchanged.
#[test]
fn normalize_self_closing_tags_noop_when_absent() {
    let input = "<div><p>text</p></div>";
    let output = normalize_self_closing_tags(input);
    // A borrowed result shows that no new string was allocated.
    assert!(matches!(output, Cow::Borrowed(_)));
    assert_eq!(output.as_ref(), input);
}
6502
/// Self-closing `<br/>`, `<hr/>` and `<img/>` are rewritten to their plain open-tag form.
#[test]
fn normalize_self_closing_tags_replaces_targets() {
    let output = normalize_self_closing_tags("<br/><hr/><img/>");
    assert_eq!(output.as_ref(), "<br><hr><img>");
}