1#![allow(clippy::collapsible_match)]
8#[cfg(feature = "inline-images")]
45use std::cell::RefCell;
46use std::collections::{BTreeMap, HashMap};
47#[cfg(feature = "inline-images")]
48use std::rc::Rc;
49
50use std::borrow::Cow;
51use std::str;
52
53use crate::error::Result;
54#[cfg(feature = "inline-images")]
55use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
56use crate::options::{ConversionOptions, HeadingStyle, ListIndentType};
57use crate::text;
58
/// Shared, interior-mutable handle to the inline image collector
/// (only available when the `inline-images` feature is enabled).
#[cfg(feature = "inline-images")]
type InlineCollectorHandle = Rc<RefCell<InlineImageCollector>>;
/// Unit stand-in used when the `inline-images` feature is disabled —
/// presumably so code naming `InlineCollectorHandle` compiles either way.
#[cfg(not(feature = "inline-images"))]
type InlineCollectorHandle = ();
63
/// Splits inline text into `(prefix, suffix, core)`.
///
/// * `prefix` is `" "` when the text starts with a space or tab, else `""`.
/// * `suffix` is the trailing line-break marker when the text ends with one
///   (space + newline, or backslash + newline); otherwise `" "` when the text
///   ends with a space or tab, else `""`.
/// * `core` is the text with the line-break marker (if any) removed and the
///   remainder trimmed on both sides.
///
/// An empty input yields three empty strings.
fn chomp_inline(text: &str) -> (&str, &str, &str) {
    const SPACE_BREAK: &str = " \n";
    const BACKSLASH_BREAK: &str = "\\\n";
    const BLANKS: [char; 2] = [' ', '\t'];

    if text.is_empty() {
        return ("", "", "");
    }

    let prefix = if text.starts_with(BLANKS) { " " } else { "" };

    // Peel off a trailing line-break marker first; only when there is none do
    // we fall back to collapsing trailing blanks into a single space.
    let (suffix, body) = if let Some(rest) = text.strip_suffix(SPACE_BREAK) {
        (SPACE_BREAK, rest)
    } else if let Some(rest) = text.strip_suffix(BACKSLASH_BREAK) {
        (BACKSLASH_BREAK, rest)
    } else if text.ends_with(BLANKS) {
        (" ", text)
    } else {
        ("", text)
    };

    (prefix, suffix, body.trim())
}
100
/// Removes every trailing space and tab from `output` in place.
///
/// Newlines are not touched, so `"a\n "` becomes `"a\n"`.
fn trim_trailing_whitespace(output: &mut String) {
    let kept_len = output.trim_end_matches([' ', '\t']).len();
    output.truncate(kept_len);
}
110
/// Strips trailing spaces and tabs from every line of `output`, in place.
///
/// A line that originally ended with a space keeps a single trailing space
/// (it is re-appended after trimming). Lines are split on `'\n'`, and a
/// newline is written after every line, so a non-empty result always ends
/// with `'\n'`. An empty `output` is left untouched.
fn trim_line_end_whitespace(output: &mut String) {
    if output.is_empty() {
        return;
    }

    let mut rebuilt = String::with_capacity(output.len());
    for line in output.split('\n') {
        rebuilt.push_str(line.trim_end_matches([' ', '\t']));
        if line.ends_with(" ") {
            rebuilt.push_str(" ");
        }
        // Joining the lines with '\n' and then appending a final '\n' is the
        // same as terminating every line with '\n'.
        rebuilt.push('\n');
    }

    *output = rebuilt;
}
137
/// Shortens `value` to at most `max_len` bytes without splitting a UTF-8
/// character.
///
/// When `max_len` falls inside a multi-byte character the cut moves back to
/// the start of that character. Strings already within the limit are left
/// unchanged.
fn truncate_at_char_boundary(value: &mut String, max_len: usize) {
    if max_len >= value.len() {
        return;
    }

    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&idx| value.is_char_boundary(idx))
        .unwrap_or(0);
    value.truncate(cut);
}
150
/// Removes the common leading indentation from a block of code.
///
/// The indentation of a line is the byte length of its leading whitespace.
/// The smallest indentation among non-blank lines is removed from every
/// non-blank line; blank (whitespace-only) lines are kept as they are.
/// Lines are split with [`str::lines`] and re-joined with `'\n'`, so a
/// trailing newline in `content` is not preserved.
///
/// The common indentation is measured in bytes, so it can fall in the middle
/// of a multi-byte whitespace character on another line (for example a
/// U+00A0 indent next to an ASCII-space indent). In that case the cut is moved
/// back to the previous character boundary instead of panicking.
fn dedent_code_block(content: &str) -> String {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return String::new();
    }

    let min_indent = lines
        .iter()
        .filter(|line| !line.trim().is_empty())
        .map(|line| line.len() - line.trim_start().len())
        .min()
        .unwrap_or(0);

    lines
        .iter()
        .map(|line| {
            if line.trim().is_empty() {
                *line
            } else {
                // Never slice inside a multi-byte character: back up to the
                // nearest boundary (index 0 is always one).
                let mut cut = min_indent.min(line.len());
                while !line.is_char_boundary(cut) {
                    cut -= 1;
                }
                &line[cut..]
            }
        })
        .collect::<Vec<_>>()
        .join("\n")
}
186
/// Returns the number of indent units used for continuation content inside a
/// list item at the given nesting `depth`: `0` for depth zero, otherwise
/// `2 * depth - 1`.
fn calculate_list_continuation_indent(depth: usize) -> usize {
    match depth {
        0 => 0,
        nested => 2 * nested - 1,
    }
}
210
211fn is_loose_list(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
233 if let Some(node) = node_handle.get(parser) {
234 if let tl::Node::Tag(tag) = node {
235 let children = tag.children();
236 {
237 for child_handle in children.top().iter() {
238 if let Some(child_node) = child_handle.get(parser) {
239 if let tl::Node::Tag(child_tag) = child_node {
240 if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
241 let li_children = child_tag.children();
242 {
243 for li_child_handle in li_children.top().iter() {
244 if let Some(li_child_node) = li_child_handle.get(parser) {
245 if let tl::Node::Tag(li_child_tag) = li_child_node {
246 if tag_name_eq(li_child_tag.name().as_utf8_str(), "p") {
247 return true;
248 }
249 }
250 }
251 }
252 }
253 }
254 }
255 }
256 }
257 }
258 }
259 }
260 false
261}
262
263fn add_list_continuation_indent(output: &mut String, list_depth: usize, blank_line: bool, options: &ConversionOptions) {
287 trim_trailing_whitespace(output);
288
289 if blank_line {
290 if !output.ends_with("\n\n") {
291 if output.ends_with('\n') {
292 output.push('\n');
293 } else {
294 output.push_str("\n\n");
295 }
296 }
297 } else if !output.ends_with('\n') {
298 output.push('\n');
299 }
300
301 let indent_level = calculate_list_continuation_indent(list_depth);
302 let indent_char = match options.list_indent_type {
303 ListIndentType::Tabs => "\t",
304 ListIndentType::Spaces => &" ".repeat(options.list_indent_width),
305 };
306 output.push_str(&indent_char.repeat(indent_level));
307}
308
309fn continuation_indent_string(list_depth: usize, options: &ConversionOptions) -> Option<String> {
311 let indent_level = calculate_list_continuation_indent(list_depth);
312 if indent_level == 0 {
313 return None;
314 }
315
316 let indent = match options.list_indent_type {
317 ListIndentType::Tabs => "\t".repeat(indent_level),
318 ListIndentType::Spaces => " ".repeat(options.list_indent_width * indent_level),
319 };
320 Some(indent)
321}
322
323fn add_list_leading_separator(output: &mut String, ctx: &Context) {
330 if ctx.in_table_cell {
331 let is_table_continuation =
332 !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
333 if is_table_continuation {
334 output.push_str("<br>");
335 }
336 return;
337 }
338
339 if !output.is_empty() && !ctx.in_list {
340 let needs_newline =
341 !output.ends_with("\n\n") && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
342 if needs_newline {
343 output.push_str("\n\n");
344 }
345 return;
346 }
347
348 if ctx.in_list_item && !output.is_empty() {
349 let needs_newline =
350 !output.ends_with('\n') && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
351 if needs_newline {
352 trim_trailing_whitespace(output);
353 output.push('\n');
354 }
355 }
356}
357
358fn add_nested_list_trailing_separator(output: &mut String, ctx: &Context) {
363 if !ctx.in_list_item {
364 return;
365 }
366
367 if ctx.loose_list {
368 if !output.ends_with("\n\n") {
369 if !output.ends_with('\n') {
370 output.push('\n');
371 }
372 output.push('\n');
373 }
374 } else if !output.ends_with('\n') {
375 output.push('\n');
376 }
377}
378
379fn calculate_list_nesting_depth(ctx: &Context) -> usize {
385 if ctx.in_list && !ctx.in_list_item {
386 ctx.list_depth + 1
387 } else {
388 ctx.list_depth
389 }
390}
391
392#[allow(clippy::too_many_arguments)]
397fn process_list_children(
398 node_handle: &tl::NodeHandle,
399 parser: &tl::Parser,
400 output: &mut String,
401 options: &ConversionOptions,
402 ctx: &Context,
403 depth: usize,
404 is_ordered: bool,
405 is_loose: bool,
406 nested_depth: usize,
407 start_counter: usize,
408 dom_ctx: &DomContext,
409) {
410 let mut counter = start_counter;
411
412 if let Some(node) = node_handle.get(parser) {
413 if let tl::Node::Tag(tag) = node {
414 let children = tag.children();
415 {
416 for child_handle in children.top().iter() {
417 if let Some(child_node) = child_handle.get(parser) {
418 if let tl::Node::Raw(bytes) = child_node {
419 if bytes.as_utf8_str().trim().is_empty() {
420 continue;
421 }
422 }
423 }
424
425 let list_ctx = Context {
426 in_ordered_list: is_ordered,
427 list_counter: if is_ordered { counter } else { 0 },
428 in_list: true,
429 list_depth: nested_depth,
430 ul_depth: if is_ordered { ctx.ul_depth } else { ctx.ul_depth + 1 },
431 loose_list: is_loose,
432 prev_item_had_blocks: false,
433 ..ctx.clone()
434 };
435
436 walk_node(child_handle, parser, output, options, &list_ctx, depth, dom_ctx);
437
438 if is_ordered {
439 if let Some(child_node) = child_handle.get(parser) {
440 if let tl::Node::Tag(child_tag) = child_node {
441 if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
442 counter += 1;
443 }
444 }
445 }
446 }
447 }
448 }
449 }
450 }
451}
452
/// Per-node conversion state handed down while walking the DOM.
///
/// A modified copy is created for child nodes (see `process_list_children`,
/// which clones the context and overrides the list-related fields).
#[derive(Debug, Clone)]
struct Context {
    /// Presumably set while inside a code element — not read in this part of the file.
    in_code: bool,
    /// Number of the current item in an ordered list; `process_list_children` sets it to 0 for unordered lists.
    list_counter: usize,
    /// Whether the enclosing list is ordered.
    in_ordered_list: bool,
    /// Presumably tracks whether the previous definition-list child was a `<dt>` — TODO confirm against the walker.
    last_was_dt: bool,
    /// Blockquote nesting depth; when greater than zero, headings end with a single newline.
    blockquote_depth: usize,
    /// Inside a table cell: block separators are emitted as `<br>` instead of newlines.
    in_table_cell: bool,
    /// Emit content as inline text (headings are written as bare text).
    convert_as_inline: bool,
    /// Presumably the nesting depth of inline elements — not read in this part of the file.
    inline_depth: usize,
    /// Inside an `<li>`; affects separators and heading placement.
    in_list_item: bool,
    /// List nesting depth, used to compute continuation indentation.
    list_depth: usize,
    /// Unordered-list nesting depth; incremented by `process_list_children` for unordered lists.
    ul_depth: usize,
    /// Set while processing the children of a list element.
    in_list: bool,
    /// The enclosing list is loose (see `is_loose_list`); nested lists are then followed by a blank line.
    loose_list: bool,
    /// Reset to `false` for every list child; presumably records that the previous item contained block content.
    prev_item_had_blocks: bool,
    /// Presumably set while inside a heading element — not read in this part of the file.
    in_heading: bool,
    /// Presumably the tag name of the enclosing heading, if any — TODO confirm.
    heading_tag: Option<String>,
    /// Presumably set while inside a `<p>` — not read in this part of the file.
    in_paragraph: bool,
    /// Presumably set while inside a `<ruby>` element — not read in this part of the file.
    in_ruby: bool,
    /// Presumably set while inside strong emphasis — not read in this part of the file.
    in_strong: bool,
    /// Optional collector for inline images (`inline-images` feature only).
    #[cfg(feature = "inline-images")]
    inline_collector: Option<InlineCollectorHandle>,
    /// Optional collector for document metadata (`metadata` feature only).
    #[cfg(feature = "metadata")]
    metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
}
501
/// Precomputed parent/child lookup tables for a parsed document, keyed by the
/// numeric id returned by `tl::NodeHandle::get_inner`.
///
/// Filled by `build_dom_context` / `record_node_hierarchy`.
struct DomContext {
    /// Node id → id of its parent (`None` for top-level nodes).
    parent_map: HashMap<u32, Option<u32>>,
    /// Tag node id → handles of its direct children (only tag nodes get an entry).
    children_map: HashMap<u32, Vec<tl::NodeHandle>>,
    /// Handles of the document's top-level nodes.
    root_children: Vec<tl::NodeHandle>,
    /// Node id → handle, for every node visited.
    node_map: HashMap<u32, tl::NodeHandle>,
}
508
/// Escapes closing brackets in link label text that have no matching opener.
///
/// Unescaped `[` … `]` pairs are left alone; an unescaped `]` with no open
/// `[` before it gets a backslash prepended. A bracket preceded by an odd
/// number of backslashes counts as already escaped and is copied verbatim.
/// All other characters, including backslashes, pass through unchanged.
fn escape_link_label(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    // True while the run of backslashes immediately before the current
    // character has odd length, i.e. the character is escaped.
    let mut odd_backslash_run = false;
    let mut open_brackets = 0usize;

    for ch in text.chars() {
        if ch == '\\' {
            escaped.push(ch);
            odd_backslash_run = !odd_backslash_run;
            continue;
        }

        let already_escaped = odd_backslash_run;
        odd_backslash_run = false;

        if !already_escaped {
            if ch == '[' {
                open_brackets = open_brackets.saturating_add(1);
            } else if ch == ']' {
                if open_brackets > 0 {
                    open_brackets -= 1;
                } else {
                    escaped.push('\\');
                }
            }
        }

        escaped.push(ch);
    }

    escaped
}
547
548fn append_markdown_link(
549 output: &mut String,
550 label: &str,
551 href: &str,
552 title: Option<&str>,
553 raw_text: &str,
554 options: &ConversionOptions,
555) {
556 output.push('[');
557 output.push_str(label);
558 output.push_str("](");
559
560 if href.is_empty() {
561 output.push_str("<>");
562 } else if href.contains(' ') || href.contains('\n') {
563 output.push('<');
564 output.push_str(href);
565 output.push('>');
566 } else {
567 let open_count = href.chars().filter(|&c| c == '(').count();
568 let close_count = href.chars().filter(|&c| c == ')').count();
569
570 if open_count == close_count {
571 output.push_str(href);
572 } else {
573 let escaped_href = href.replace("(", "\\(").replace(")", "\\)");
574 output.push_str(&escaped_href);
575 }
576 }
577
578 if let Some(title_text) = title {
579 output.push_str(" \"");
580 if title_text.contains('"') {
581 let escaped_title = title_text.replace('"', "\\\"");
582 output.push_str(&escaped_title);
583 } else {
584 output.push_str(title_text);
585 }
586 output.push('"');
587 } else if options.default_title && raw_text == href {
588 output.push_str(" \"");
589 if href.contains('"') {
590 let escaped_href = href.replace('"', "\\\"");
591 output.push_str(&escaped_href);
592 } else {
593 output.push_str(href);
594 }
595 output.push('"');
596 }
597
598 output.push(')');
599}
600
/// Maps a lowercase heading tag name (`"h1"` … `"h6"`) to its level.
///
/// Any other name, including uppercase variants, yields `None`.
fn heading_level_from_name(name: &str) -> Option<usize> {
    match name.as_bytes() {
        [b'h', digit @ b'1'..=b'6'] => Some(usize::from(digit - b'0')),
        _ => None,
    }
}
612
613fn find_single_heading_child(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<(usize, tl::NodeHandle)> {
614 let node = node_handle.get(parser)?;
615
616 let tl::Node::Tag(tag) = node else {
617 return None;
618 };
619
620 let children = tag.children();
621 let mut heading_data: Option<(usize, tl::NodeHandle)> = None;
622
623 for child_handle in children.top().iter() {
624 let Some(child_node) = child_handle.get(parser) else {
625 continue;
626 };
627
628 match child_node {
629 tl::Node::Raw(bytes) => {
630 if !bytes.as_utf8_str().trim().is_empty() {
631 return None;
632 }
633 }
634 tl::Node::Tag(child_tag) => {
635 let name = normalized_tag_name(child_tag.name().as_utf8_str());
636 if let Some(level) = heading_level_from_name(name.as_ref()) {
637 if heading_data.is_some() {
638 return None;
639 }
640 heading_data = Some((level, *child_handle));
641 } else {
642 return None;
643 }
644 }
645 _ => return None,
646 }
647 }
648
649 heading_data
650}
651
652fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {
653 if text.is_empty() {
654 return;
655 }
656
657 if ctx.convert_as_inline {
658 output.push_str(text);
659 return;
660 }
661
662 if ctx.in_table_cell {
663 let is_table_continuation =
664 !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
665 if is_table_continuation {
666 output.push_str("<br>");
667 }
668 output.push_str(text);
669 return;
670 }
671
672 if ctx.in_list_item {
673 if output.ends_with('\n') {
674 if let Some(indent) = continuation_indent_string(ctx.list_depth, options) {
675 output.push_str(&indent);
676 }
677 } else if !output.ends_with(' ') && !output.is_empty() {
678 output.push(' ');
679 }
680 } else if !output.is_empty() && !output.ends_with("\n\n") {
681 if output.ends_with('\n') {
682 output.push('\n');
683 } else {
684 trim_trailing_whitespace(output);
685 output.push_str("\n\n");
686 }
687 }
688
689 let heading_suffix = if ctx.in_list_item || ctx.blockquote_depth > 0 {
690 "\n"
691 } else {
692 "\n\n"
693 };
694
695 match options.heading_style {
696 HeadingStyle::Underlined => {
697 if level == 1 {
698 output.push_str(text);
699 output.push('\n');
700 output.push_str(&"=".repeat(text.len()));
701 output.push_str(heading_suffix);
702 } else if level == 2 {
703 output.push_str(text);
704 output.push('\n');
705 output.push_str(&"-".repeat(text.len()));
706 output.push_str(heading_suffix);
707 } else {
708 output.push_str(&"#".repeat(level));
709 output.push(' ');
710 output.push_str(text);
711 output.push_str(heading_suffix);
712 }
713 }
714 HeadingStyle::Atx => {
715 output.push_str(&"#".repeat(level));
716 output.push(' ');
717 output.push_str(text);
718 output.push_str(heading_suffix);
719 }
720 HeadingStyle::AtxClosed => {
721 output.push_str(&"#".repeat(level));
722 output.push(' ');
723 output.push_str(text);
724 output.push(' ');
725 output.push_str(&"#".repeat(level));
726 output.push_str(heading_suffix);
727 }
728 }
729}
730
/// Flattens heading text onto a single line.
///
/// Text without `\n` or `\r` is returned borrowed and unchanged. Otherwise
/// each run of line breaks — together with the spaces and tabs that follow
/// it — is replaced by a single space, unless the output already ends with a
/// space. Line breaks at the very start are dropped and line breaks at the
/// end add nothing.
fn normalize_heading_text<'a>(text: &'a str) -> Cow<'a, str> {
    if !text.contains(['\n', '\r']) {
        return Cow::Borrowed(text);
    }

    let mut flattened = String::with_capacity(text.len());
    // Set once a line break has been seen after some output; cleared when the
    // next visible character is written.
    let mut after_break = false;

    for ch in text.chars() {
        if matches!(ch, '\n' | '\r') {
            after_break = after_break || !flattened.is_empty();
            continue;
        }

        if after_break {
            if matches!(ch, ' ' | '\t') {
                continue;
            }
            if !flattened.ends_with(' ') {
                flattened.push(' ');
            }
            after_break = false;
        }

        flattened.push(ch);
    }

    Cow::Owned(flattened)
}
761
762fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser) -> DomContext {
763 let mut ctx = DomContext {
764 parent_map: HashMap::new(),
765 children_map: HashMap::new(),
766 root_children: dom.children().to_vec(),
767 node_map: HashMap::new(),
768 };
769
770 for child_handle in dom.children().iter() {
771 record_node_hierarchy(child_handle, None, parser, &mut ctx);
772 }
773
774 ctx
775}
776
777fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
779 for handle in dom_ctx.node_map.values() {
780 if let Some(tl::Node::Tag(tag)) = handle.get(parser) {
781 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
782 if is_block_level_element(tag_name.as_ref()) {
783 let mut current = dom_ctx.parent_map.get(&handle.get_inner()).and_then(|p| *p);
784 while let Some(parent_id) = current {
785 if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
786 if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
787 let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
788 if is_inline_element(parent_name.as_ref()) {
789 return true;
790 }
791 }
792 }
793 current = dom_ctx.parent_map.get(&parent_id).and_then(|p| *p);
794 }
795 }
796 }
797 }
798
799 false
800}
801
802fn repair_with_html5ever(input: &str) -> Option<String> {
804 use html5ever::serialize::{SerializeOpts, serialize};
805 use html5ever::tendril::TendrilSink;
806 use markup5ever_rcdom::{RcDom, SerializableHandle};
807
808 let dom = html5ever::parse_document(RcDom::default(), Default::default())
809 .from_utf8()
810 .read_from(&mut input.as_bytes())
811 .ok()?;
812
813 let mut buf = Vec::with_capacity(input.len());
814 let handle = SerializableHandle::from(dom.document.clone());
815 serialize(&mut buf, &handle, SerializeOpts::default()).ok()?;
816 String::from_utf8(buf).ok()
817}
818
819fn record_node_hierarchy(node_handle: &tl::NodeHandle, parent: Option<u32>, parser: &tl::Parser, ctx: &mut DomContext) {
820 let id = node_handle.get_inner();
821 ctx.parent_map.insert(id, parent);
822 ctx.node_map.insert(id, *node_handle);
823
824 if let Some(node) = node_handle.get(parser) {
825 if let tl::Node::Tag(tag) = node {
826 let children: Vec<_> = tag.children().top().iter().copied().collect();
827 ctx.children_map.insert(id, children.clone());
828 for child in children {
829 record_node_hierarchy(&child, Some(id), parser, ctx);
830 }
831 }
832 }
833}
834
835fn is_hocr_document(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
844 fn check_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
845 if let Some(node) = node_handle.get(parser) {
846 match node {
847 tl::Node::Tag(tag) => {
848 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
849
850 if tag_name == "meta" {
851 if let Some(name_attr) = tag.attributes().get("name") {
852 if let Some(name_bytes) = name_attr {
853 let name_value = name_bytes.as_utf8_str();
854 if name_value == "ocr-system" || name_value == "ocr-capabilities" {
855 return true;
856 }
857 }
858 }
859 }
860
861 if let Some(class_attr) = tag.attributes().get("class") {
862 if let Some(class_bytes) = class_attr {
863 let class_value = class_bytes.as_utf8_str();
864 if class_value.contains("ocr_page")
865 || class_value.contains("ocrx_word")
866 || class_value.contains("ocr_carea")
867 || class_value.contains("ocr_par")
868 || class_value.contains("ocr_line")
869 {
870 return true;
871 }
872 }
873 }
874
875 let children = tag.children();
876 {
877 for child_handle in children.top().iter() {
878 if check_node(child_handle, parser) {
879 return true;
880 }
881 }
882 }
883 false
884 }
885 _ => false,
886 }
887 } else {
888 false
889 }
890 }
891
892 check_node(node_handle, parser)
893}
894
895fn extract_metadata(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> BTreeMap<String, String> {
906 let mut metadata = BTreeMap::new();
907
908 fn find_head(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<tl::NodeHandle> {
909 if let Some(node) = node_handle.get(parser) {
910 if let tl::Node::Tag(tag) = node {
911 if tag_name_eq(tag.name().as_utf8_str(), "head") {
912 return Some(*node_handle);
913 }
914 let children = tag.children();
915 {
916 for child_handle in children.top().iter() {
917 if let Some(result) = find_head(child_handle, parser) {
918 return Some(result);
919 }
920 }
921 }
922 }
923 }
924 None
925 }
926
927 let head_handle = match find_head(node_handle, parser) {
928 Some(h) => h,
929 None => return metadata,
930 };
931
932 if let Some(head_node) = head_handle.get(parser) {
933 if let tl::Node::Tag(head_tag) = head_node {
934 let children = head_tag.children();
935 {
936 for child_handle in children.top().iter() {
937 if let Some(child_node) = child_handle.get(parser) {
938 if let tl::Node::Tag(child_tag) = child_node {
939 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
940
941 match tag_name.as_ref() {
942 "title" => {
943 let title_children = child_tag.children();
944 {
945 if let Some(first_child) = title_children.top().iter().next() {
946 if let Some(text_node) = first_child.get(parser) {
947 if let tl::Node::Raw(bytes) = text_node {
948 let title = text::normalize_whitespace(&bytes.as_utf8_str())
949 .trim()
950 .to_string();
951 if !title.is_empty() {
952 metadata.insert("title".to_string(), title);
953 }
954 }
955 }
956 }
957 }
958 }
959 "base" => {
960 if let Some(href_attr) = child_tag.attributes().get("href") {
961 if let Some(href_bytes) = href_attr {
962 let href = href_bytes.as_utf8_str().to_string();
963 if !href.is_empty() {
964 metadata.insert("base-href".to_string(), href);
965 }
966 }
967 }
968 }
969 "meta" => {
970 let mut name_attr = None;
971 let mut property_attr = None;
972 let mut http_equiv_attr = None;
973 let mut content_attr = None;
974
975 if let Some(attr) = child_tag.attributes().get("name") {
976 if let Some(bytes) = attr {
977 name_attr = Some(bytes.as_utf8_str().to_string());
978 }
979 }
980 if let Some(attr) = child_tag.attributes().get("property") {
981 if let Some(bytes) = attr {
982 property_attr = Some(bytes.as_utf8_str().to_string());
983 }
984 }
985 if let Some(attr) = child_tag.attributes().get("http-equiv") {
986 if let Some(bytes) = attr {
987 http_equiv_attr = Some(bytes.as_utf8_str().to_string());
988 }
989 }
990 if let Some(attr) = child_tag.attributes().get("content") {
991 if let Some(bytes) = attr {
992 content_attr = Some(bytes.as_utf8_str().to_string());
993 }
994 }
995
996 if let Some(content) = content_attr {
997 if let Some(name) = name_attr {
998 let key = format!("meta-{}", name.to_lowercase());
999 metadata.insert(key, content);
1000 } else if let Some(property) = property_attr {
1001 let key = format!("meta-{}", property.to_lowercase().replace(':', "-"));
1002 metadata.insert(key, content);
1003 } else if let Some(http_equiv) = http_equiv_attr {
1004 let key = format!("meta-{}", http_equiv.to_lowercase());
1005 metadata.insert(key, content);
1006 }
1007 }
1008 }
1009 "link" => {
1010 let mut rel_attr = None;
1011 let mut href_attr = None;
1012
1013 if let Some(attr) = child_tag.attributes().get("rel") {
1014 if let Some(bytes) = attr {
1015 rel_attr = Some(bytes.as_utf8_str().to_string());
1016 }
1017 }
1018 if let Some(attr) = child_tag.attributes().get("href") {
1019 if let Some(bytes) = attr {
1020 href_attr = Some(bytes.as_utf8_str().to_string());
1021 }
1022 }
1023
1024 if let (Some(rel), Some(href)) = (rel_attr, href_attr) {
1025 let rel_lower = rel.to_lowercase();
1026 match rel_lower.as_str() {
1027 "canonical" => {
1028 metadata.insert("canonical".to_string(), href);
1029 }
1030 "author" | "license" | "alternate" => {
1031 metadata.insert(format!("link-{}", rel_lower), href);
1032 }
1033 _ => {}
1034 }
1035 }
1036 }
1037 _ => {}
1038 }
1039 }
1040 }
1041 }
1042 }
1043 }
1044 }
1045
1046 metadata
1047}
1048
/// Renders metadata as a `---`-fenced front matter block followed by a blank
/// line.
///
/// Entries are written as `key: value` in map order. Values containing `:`,
/// `#`, `[` or `]` are wrapped in double quotes with backslashes and double
/// quotes escaped. An empty map produces an empty string.
fn format_metadata_frontmatter(metadata: &BTreeMap<String, String>) -> String {
    const FENCE: &str = "---";

    if metadata.is_empty() {
        return String::new();
    }

    let mut block = String::from(FENCE);
    block.push('\n');

    for (key, value) in metadata {
        block.push_str(key);
        block.push_str(": ");
        if value.contains([':', '#', '[', ']']) {
            block.push('"');
            block.push_str(&value.replace('\\', "\\\\").replace('"', "\\\""));
            block.push('"');
        } else {
            block.push_str(value);
        }
        block.push('\n');
    }

    block.push_str(FENCE);
    block.push_str("\n\n");
    block
}
1070
1071fn is_empty_inline_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
1073 const EMPTY_WHEN_NO_CONTENT_TAGS: &[&str] = &[
1074 "abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u",
1075 ];
1076
1077 if let Some(node) = node_handle.get(parser) {
1078 if let tl::Node::Tag(tag) = node {
1079 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1080 if EMPTY_WHEN_NO_CONTENT_TAGS.contains(&tag_name.as_ref()) {
1081 return get_text_content(node_handle, parser).trim().is_empty();
1082 }
1083 }
1084 }
1085 false
1086}
1087
1088fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1090 let mut text = String::with_capacity(64);
1091 if let Some(node) = node_handle.get(parser) {
1092 match node {
1093 tl::Node::Raw(bytes) => {
1094 text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1095 }
1096 tl::Node::Tag(tag) => {
1097 let children = tag.children();
1098 {
1099 for child_handle in children.top().iter() {
1100 text.push_str(&get_text_content(child_handle, parser));
1101 }
1102 }
1103 }
1104 _ => {}
1105 }
1106 }
1107 text
1108}
1109
1110fn collect_link_label_text(children: &[tl::NodeHandle], parser: &tl::Parser) -> (String, Vec<tl::NodeHandle>, bool) {
1112 let mut text = String::new();
1113 let mut saw_block = false;
1114 let mut block_nodes = Vec::new();
1115 let mut stack: Vec<_> = children.iter().rev().copied().collect();
1116
1117 while let Some(handle) = stack.pop() {
1118 if let Some(node) = handle.get(parser) {
1119 match node {
1120 tl::Node::Raw(bytes) => {
1121 text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1122 }
1123 tl::Node::Tag(tag) => {
1124 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1125 if is_block_level_element(tag_name.as_ref()) {
1126 saw_block = true;
1127 block_nodes.push(handle);
1128 continue;
1129 }
1130
1131 let tag_children = tag.children();
1132 {
1133 let mut child_nodes: Vec<_> = tag_children.top().iter().copied().collect();
1134 child_nodes.reverse();
1135 for child in child_nodes {
1136 stack.push(child);
1137 }
1138 }
1139 }
1140 _ => {}
1141 }
1142 }
1143 }
1144
1145 (text, block_nodes, saw_block)
1146}
1147
1148fn normalize_link_label(label: &str) -> String {
1149 let collapsed = label
1150 .chars()
1151 .map(|ch| if ch == '\n' || ch == '\r' { ' ' } else { ch })
1152 .collect::<String>();
1153 text::normalize_whitespace(&collapsed).trim().to_string()
1154}
1155
1156fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1158 if let Some(node) = node_handle.get(parser) {
1159 if let tl::Node::Tag(tag) = node {
1160 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1161 let mut html = String::with_capacity(256);
1162 html.push('<');
1163 html.push_str(&tag_name);
1164
1165 for (key, value_opt) in tag.attributes().iter() {
1167 html.push(' ');
1168 html.push_str(&key);
1169 if let Some(value) = value_opt {
1170 html.push_str("=\"");
1171 html.push_str(&value);
1172 html.push('"');
1173 }
1174 }
1175
1176 let has_children = !tag.children().top().is_empty();
1177 if !has_children {
1178 html.push_str(" />");
1179 } else {
1180 html.push('>');
1181 let children = tag.children();
1182 {
1183 for child_handle in children.top().iter() {
1184 html.push_str(&serialize_node(child_handle, parser));
1185 }
1186 }
1187 html.push_str("</");
1188 html.push_str(&tag_name);
1189 html.push('>');
1190 }
1191 return html;
1192 }
1193 }
1194 String::new()
1195}
1196
/// Returns the trimmed text as an owned `String`, or `None` when nothing
/// remains after trimming.
#[cfg(feature = "inline-images")]
fn non_empty_trimmed(value: &str) -> Option<String> {
    Some(value.trim())
        .filter(|core| !core.is_empty())
        .map(str::to_owned)
}
1206
/// Decodes an `<img>` whose `src` is a base64 `data:` URI and hands the image
/// to the inline image collector.
///
/// Sources that do not start with `data:` (after trimming) are ignored
/// silently. For everything else an index is reserved from the collector
/// first; any later validation failure is reported through
/// `collector.warn_skip(index, …)` and the image is dropped.
///
/// * `collector_ref` — collector handle; mutably borrowed for the rest of the call.
/// * `src` — the raw `src` attribute value.
/// * `alt`, `title` — used (in that order) as the image description when non-blank.
/// * `attributes` — element attributes; `data-filename`, `filename` and
///   `data-name` are consulted for a file name, and the whole map is passed on
///   to `build_image`.
#[cfg(feature = "inline-images")]
fn handle_inline_data_image(
    collector_ref: &InlineCollectorHandle,
    src: &str,
    alt: &str,
    title: Option<&str>,
    attributes: BTreeMap<String, String>,
) {
    let trimmed_src = src.trim();
    if !trimmed_src.starts_with("data:") {
        return;
    }

    // The index is reserved before validation, so skipped images consume one too.
    let mut collector = collector_ref.borrow_mut();
    let index = collector.next_index();

    // `data:<mime>[;param…],<payload>` — split header from payload at the first comma.
    let Some((meta, payload)) = trimmed_src.split_once(',') else {
        collector.warn_skip(index, "missing data URI separator");
        return;
    };

    if payload.trim().is_empty() {
        collector.warn_skip(index, "empty data URI payload");
        return;
    }

    // NOTE(review): `trimmed_src` starts with "data:" and that prefix contains
    // no comma, so `meta` always starts with "data:" here — this check looks
    // unreachable and is kept as a defensive guard.
    if !meta.starts_with("data:") {
        collector.warn_skip(index, "invalid data URI scheme");
        return;
    }

    let header = &meta["data:".len()..];
    if header.is_empty() {
        collector.warn_skip(index, "missing MIME type");
        return;
    }

    // First `;`-separated segment is the MIME type; the rest are parameters.
    let mut segments = header.split(';');
    let mime = segments.next().unwrap_or("");
    let Some((top_level, subtype_raw)) = mime.split_once('/') else {
        collector.warn_skip(index, "missing MIME subtype");
        return;
    };

    if !top_level.eq_ignore_ascii_case("image") {
        collector.warn_skip(index, format!("unsupported MIME type {mime}"));
        return;
    }

    let subtype_raw = subtype_raw.trim();
    if subtype_raw.is_empty() {
        collector.warn_skip(index, "missing MIME subtype");
        return;
    }

    let subtype_lower = subtype_raw.to_ascii_lowercase();

    // Scan the parameters for the base64 marker and an optional embedded file
    // name; when both `name=` and `filename=` appear, the later one wins.
    let mut is_base64 = false;
    let mut inline_name: Option<String> = None;
    for segment in segments {
        if segment.eq_ignore_ascii_case("base64") {
            is_base64 = true;
        } else if let Some(value) = segment.strip_prefix("name=") {
            inline_name = non_empty_trimmed(value.trim_matches('"'));
        } else if let Some(value) = segment.strip_prefix("filename=") {
            inline_name = non_empty_trimmed(value.trim_matches('"'));
        }
    }

    // Only base64-encoded payloads are supported.
    if !is_base64 {
        collector.warn_skip(index, "missing base64 encoding marker");
        return;
    }

    use base64::{Engine as _, engine::general_purpose::STANDARD};

    let payload_clean = payload.trim();
    let decoded = match STANDARD.decode(payload_clean) {
        Ok(bytes) => bytes,
        Err(_) => {
            collector.warn_skip(index, "invalid base64 payload");
            return;
        }
    };

    if decoded.is_empty() {
        collector.warn_skip(index, "empty base64 payload");
        return;
    }

    // Enforce the collector's size limit on the decoded bytes.
    let max_size = collector.max_decoded_size();
    if decoded.len() as u64 > max_size {
        collector.warn_skip(
            index,
            format!(
                "decoded payload ({} bytes) exceeds configured max ({})",
                decoded.len(),
                max_size
            ),
        );
        return;
    }

    let format = match subtype_lower.as_str() {
        "png" => InlineImageFormat::Png,
        "jpeg" | "jpg" => InlineImageFormat::Jpeg,
        "gif" => InlineImageFormat::Gif,
        "bmp" => InlineImageFormat::Bmp,
        "webp" => InlineImageFormat::Webp,
        "svg+xml" => InlineImageFormat::Svg,
        other => InlineImageFormat::Other(other.to_string()),
    };

    // Prefer the alt text; fall back to the title when alt is blank.
    let description = non_empty_trimmed(alt).or_else(|| title.and_then(non_empty_trimmed));

    // File name precedence: element attributes first, then the name embedded
    // in the data URI parameters.
    let filename_candidate = attributes
        .get("data-filename")
        .cloned()
        .or_else(|| attributes.get("filename").cloned())
        .or_else(|| attributes.get("data-name").cloned())
        .or(inline_name);

    let dimensions = collector.infer_dimensions(index, &decoded, &format);

    let image = collector.build_image(
        decoded,
        format,
        filename_candidate,
        description,
        dimensions,
        InlineImageSource::ImgDataUri,
        attributes,
    );

    collector.push_image(index, image);
}
1343
/// Serializes an inline `<svg>` element and hands it to the inline image
/// collector.
///
/// Nothing happens when the collector has SVG capture disabled. Otherwise an
/// index is reserved, the element is serialized, and the image is skipped
/// with a warning when serialization yields nothing or the result exceeds the
/// collector's size limit. The description comes from a non-blank
/// `aria-label` attribute, falling back to a non-blank `title_opt`; the file
/// name from `data-filename`, `filename` or `data-name`, in that order.
#[cfg(feature = "inline-images")]
fn handle_inline_svg(
    collector_ref: &InlineCollectorHandle,
    node_handle: &tl::NodeHandle,
    parser: &tl::Parser,
    title_opt: Option<String>,
    attributes: BTreeMap<String, String>,
) {
    // The shared borrow ends with this condition, before the mutable borrow below.
    if !collector_ref.borrow().capture_svg() {
        return;
    }

    let mut sink = collector_ref.borrow_mut();
    let index = sink.next_index();

    let svg_bytes = serialize_element(node_handle, parser).into_bytes();
    if svg_bytes.is_empty() {
        sink.warn_skip(index, "unable to serialize SVG element");
        return;
    }

    let size_limit = sink.max_decoded_size();
    if svg_bytes.len() as u64 > size_limit {
        sink.warn_skip(
            index,
            format!(
                "serialized SVG payload ({} bytes) exceeds configured max ({})",
                svg_bytes.len(),
                size_limit
            ),
        );
        return;
    }

    let description = attributes
        .get("aria-label")
        .and_then(|label| non_empty_trimmed(label))
        .or_else(|| title_opt.as_deref().and_then(non_empty_trimmed));

    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|key| attributes.get(*key).cloned());

    let image = sink.build_image(
        svg_bytes,
        InlineImageFormat::Svg,
        filename_candidate,
        description,
        None,
        InlineImageSource::SvgElement,
        attributes,
    );

    sink.push_image(index, image);
}
1405
1406fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1408 if let Some(node) = node_handle.get(parser) {
1409 match node {
1410 tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
1411 tl::Node::Tag(_) => serialize_element(node_handle, parser),
1412 _ => String::new(),
1413 }
1414 } else {
1415 String::new()
1416 }
1417}
1418
/// Converts `html` according to `options` and returns the rendered text.
///
/// Delegates to `convert_html_impl` with both collector arguments set to `None`,
/// so no collector is attached to the run.
///
/// # Errors
///
/// Propagates any error returned by `convert_html_impl` (it reports a
/// `ParseError` when the HTML cannot be parsed).
pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
    convert_html_impl(html, options, None, None)
}
1423
/// Same conversion as [`convert_html`], but passes `collector` through as the
/// inline-image collector; the metadata collector argument stays `None`.
///
/// # Errors
///
/// Propagates any error returned by `convert_html_impl`.
#[cfg(feature = "inline-images")]
pub(crate) fn convert_html_with_inline_collector(
    html: &str,
    options: &ConversionOptions,
    collector: InlineCollectorHandle,
) -> Result<String> {
    convert_html_impl(html, options, Some(collector), None)
}
1432
/// Same conversion as [`convert_html`], but passes `metadata_collector` through as
/// the metadata collector; the inline-image collector argument stays `None`.
///
/// # Errors
///
/// Propagates any error returned by `convert_html_impl`.
#[cfg(feature = "metadata")]
pub(crate) fn convert_html_with_metadata(
    html: &str,
    options: &ConversionOptions,
    metadata_collector: crate::metadata::MetadataCollectorHandle,
) -> Result<String> {
    convert_html_impl(html, options, None, Some(metadata_collector))
}
1441
/// Shared implementation behind the public conversion entry points.
///
/// Steps, in order:
/// 1. Run `preprocess_html` over the input and parse the result with `tl`.
/// 2. If `has_inline_block_misnest` reports a problem and `repair_with_html5ever`
///    yields repaired markup, preprocess and parse again, starting from a fresh output.
/// 3. Detect hOCR documents; for those, emit optional OCR frontmatter plus the hOCR
///    conversion and return early.
/// 4. Otherwise emit optional metadata frontmatter, feed the metadata collector (when
///    that feature is compiled in), walk every root child with `walk_node`, and
///    normalise trailing whitespace so non-empty output ends in exactly one `\n`.
///
/// # Errors
///
/// Returns `ConversionError::ParseError` when `tl::parse_owned` fails on either parse.
#[cfg_attr(not(feature = "inline-images"), allow(unused_variables))]
#[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
fn convert_html_impl(
    html: &str,
    options: &ConversionOptions,
    inline_collector: Option<InlineCollectorHandle>,
    #[cfg(feature = "metadata")] metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
    #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
) -> Result<String> {
    let mut preprocessed = preprocess_html(html).into_owned();
    let mut preprocessed_len = preprocessed.len();

    let parser_options = tl::ParserOptions::default();
    // A clone is handed to the parser because `preprocessed` is still needed below
    // as input to the repair pass.
    // NOTE(review): `tl::parse_owned` is `unsafe`; the invariant relied on is not
    // visible in this file — confirm against the tl documentation.
    let mut dom_guard = unsafe {
        tl::parse_owned(preprocessed.clone(), parser_options)
            .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
    };
    let mut dom_ref = dom_guard.get_ref();
    let mut parser = dom_ref.parser();
    let mut dom_ctx = build_dom_context(dom_ref, parser);
    let mut output = String::with_capacity(preprocessed_len);

    // Repair pass: when misnesting is detected and a repaired document is available,
    // every piece of state derived from the first parse is rebuilt.
    if has_inline_block_misnest(&dom_ctx, parser) {
        if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
            preprocessed = preprocess_html(&repaired_html).into_owned();
            preprocessed_len = preprocessed.len();
            // NOTE(review): same `unsafe` contract as the first parse above.
            dom_guard = unsafe {
                tl::parse_owned(preprocessed.clone(), parser_options)
                    .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
            };
            dom_ref = dom_guard.get_ref();
            parser = dom_ref.parser();
            dom_ctx = build_dom_context(dom_ref, parser);
            output = String::with_capacity(preprocessed_len);
        }
    }

    // The document counts as hOCR as soon as any root child is recognised as such.
    let mut is_hocr = false;
    for child_handle in dom_ref.children().iter() {
        if is_hocr_document(child_handle, parser) {
            is_hocr = true;
            break;
        }
    }

    // Regular documents: emit frontmatter from the first root child that yields
    // non-empty metadata.
    if options.extract_metadata && !options.convert_as_inline && !is_hocr {
        for child_handle in dom_ref.children().iter() {
            let metadata = extract_metadata(child_handle, parser);
            if !metadata.is_empty() {
                let metadata_frontmatter = format_metadata_frontmatter(&metadata);
                output.push_str(&metadata_frontmatter);
                break;
            }
        }
    }

    // hOCR documents take a dedicated path and return before the generic walker runs.
    if is_hocr {
        use crate::hocr::{convert_to_markdown_with_options as convert_hocr_to_markdown, extract_hocr_document};

        let (elements, metadata) = extract_hocr_document(dom_ref, options.debug);

        if options.extract_metadata && !options.convert_as_inline {
            // Only fields that are present / non-empty are written to the frontmatter.
            let mut metadata_map = BTreeMap::new();
            if let Some(system) = metadata.ocr_system {
                metadata_map.insert("ocr-system".to_string(), system);
            }
            if !metadata.ocr_capabilities.is_empty() {
                metadata_map.insert("ocr-capabilities".to_string(), metadata.ocr_capabilities.join(", "));
            }
            if let Some(pages) = metadata.ocr_number_of_pages {
                metadata_map.insert("ocr-number-of-pages".to_string(), pages.to_string());
            }
            if !metadata.ocr_langs.is_empty() {
                metadata_map.insert("ocr-langs".to_string(), metadata.ocr_langs.join(", "));
            }
            if !metadata.ocr_scripts.is_empty() {
                metadata_map.insert("ocr-scripts".to_string(), metadata.ocr_scripts.join(", "));
            }

            if !metadata_map.is_empty() {
                output.push_str(&format_metadata_frontmatter(&metadata_map));
            }
        }

        let mut markdown = convert_hocr_to_markdown(&elements, true, options.hocr_spatial_tables);

        // Whitespace-only body: return whatever frontmatter was collected, unchanged.
        if markdown.trim().is_empty() {
            return Ok(output);
        }

        // Drop trailing whitespace, then terminate with exactly one newline.
        markdown.truncate(markdown.trim_end().len());
        output.push_str(&markdown);
        output.push('\n');

        return Ok(output);
    }

    // Hand the first non-empty head metadata to the collector. `!is_hocr` is always
    // true at this point because the hOCR branch above has already returned.
    #[cfg(feature = "metadata")]
    if let Some(ref collector) = metadata_collector {
        if !is_hocr {
            for child_handle in dom_ref.children().iter() {
                let head_meta = extract_metadata(child_handle, parser);
                if !head_meta.is_empty() {
                    collector.borrow_mut().set_head_metadata(head_meta);
                    break;
                }
            }
        }
    }

    // Record `lang` / `dir` attributes found on root-level `html` or `body` tags.
    // The tag name is compared as written (no case folding happens here).
    #[cfg(feature = "metadata")]
    if let Some(ref collector) = metadata_collector {
        for child_handle in dom_ref.children().iter() {
            if let Some(tl::Node::Tag(tag)) = child_handle.get(parser) {
                let tag_name = tag.name().as_utf8_str();
                if tag_name == "html" || tag_name == "body" {
                    if let Some(lang) = tag.attributes().get("lang") {
                        if let Some(lang_bytes) = lang {
                            let lang_str = lang_bytes.as_utf8_str();
                            collector.borrow_mut().set_language(lang_str.to_string());
                        }
                    }
                    if let Some(dir) = tag.attributes().get("dir") {
                        if let Some(dir_bytes) = dir {
                            let dir_str = dir_bytes.as_utf8_str();
                            collector.borrow_mut().set_text_direction(dir_str.to_string());
                        }
                    }
                }
            }
        }
    }

    // Initial walker state: every flag cleared, every counter zero; only
    // `convert_as_inline` is taken from the options.
    let ctx = Context {
        in_code: false,
        list_counter: 0,
        in_ordered_list: false,
        last_was_dt: false,
        blockquote_depth: 0,
        in_table_cell: false,
        convert_as_inline: options.convert_as_inline,
        inline_depth: 0,
        in_list_item: false,
        list_depth: 0,
        ul_depth: 0,
        in_list: false,
        loose_list: false,
        prev_item_had_blocks: false,
        in_heading: false,
        heading_tag: None,
        in_paragraph: false,
        in_ruby: false,
        in_strong: false,
        #[cfg(feature = "inline-images")]
        inline_collector: inline_collector.clone(),
        #[cfg(feature = "metadata")]
        metadata_collector: metadata_collector.clone(),
    };

    for child_handle in dom_ref.children().iter() {
        walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
    }

    // Clean line endings, then collapse all trailing newlines into a single one
    // (or return an empty string when nothing else remains).
    trim_line_end_whitespace(&mut output);
    let trimmed = output.trim_end_matches('\n');
    if trimmed.is_empty() {
        Ok(String::new())
    } else {
        Ok(format!("{}\n", trimmed))
    }
}
1620
1621fn preprocess_html(input: &str) -> Cow<'_, str> {
1622 const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
1623 const TAGS: [&[u8]; 2] = [b"script", b"style"];
1624 const SVG: &[u8] = b"svg";
1625 const DOCTYPE: &[u8] = b"doctype";
1626 const EMPTY_COMMENT: &[u8] = b"<!---->";
1627
1628 let bytes = input.as_bytes();
1629 let len = bytes.len();
1630 if len == 0 {
1631 return Cow::Borrowed(input);
1632 }
1633
1634 let mut idx = 0;
1635 let mut last = 0;
1636 let mut output: Option<String> = None;
1637 let mut svg_depth = 0usize;
1638
1639 while idx < len {
1640 if bytes[idx] == b'<' {
1641 if bytes[idx..].starts_with(EMPTY_COMMENT) {
1642 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1643 out.push_str(&input[last..idx]);
1644 out.push_str("<!-- -->");
1645 idx += EMPTY_COMMENT.len();
1646 last = idx;
1647 continue;
1648 }
1649
1650 let mut replaced = false;
1651 for (pattern, replacement) in &SELF_CLOSING {
1652 if bytes[idx..].starts_with(pattern) {
1653 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1654 out.push_str(&input[last..idx]);
1655 out.push_str(replacement);
1656 idx += pattern.len();
1657 last = idx;
1658 replaced = true;
1659 break;
1660 }
1661 }
1662 if replaced {
1663 continue;
1664 }
1665
1666 if matches_tag_start(bytes, idx + 1, SVG) {
1667 if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
1668 svg_depth += 1;
1669 idx = open_end;
1670 continue;
1671 }
1672 } else if matches_end_tag_start(bytes, idx + 1, SVG) {
1673 if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
1674 if svg_depth > 0 {
1675 svg_depth = svg_depth.saturating_sub(1);
1676 }
1677 idx = close_end;
1678 continue;
1679 }
1680 }
1681
1682 if svg_depth == 0 {
1683 let mut handled = false;
1684 for tag in TAGS {
1685 if matches_tag_start(bytes, idx + 1, tag) {
1686 if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
1687 let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
1688 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1689 out.push_str(&input[last..idx]);
1690 out.push_str(&input[idx..open_end]);
1691 out.push_str("</");
1692 out.push_str(str::from_utf8(tag).unwrap());
1693 out.push('>');
1694
1695 last = remove_end;
1696 idx = remove_end;
1697 handled = true;
1698 }
1699 }
1700
1701 if handled {
1702 break;
1703 }
1704 }
1705
1706 if handled {
1707 continue;
1708 }
1709
1710 if idx + 2 < len && bytes[idx + 1] == b'!' {
1711 let mut cursor = idx + 2;
1712 while cursor < len && bytes[cursor].is_ascii_whitespace() {
1713 cursor += 1;
1714 }
1715
1716 if cursor + DOCTYPE.len() <= len
1717 && bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
1718 {
1719 if let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len()) {
1720 let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1721 out.push_str(&input[last..idx]);
1722 last = end;
1723 idx = end;
1724 continue;
1725 }
1726 }
1727 }
1728 }
1729
1730 let is_valid_tag = if idx + 1 < len {
1731 match bytes[idx + 1] {
1732 b'!' => {
1733 idx + 2 < len
1734 && (bytes[idx + 2] == b'-'
1735 || bytes[idx + 2].is_ascii_alphabetic()
1736 || bytes[idx + 2].is_ascii_uppercase())
1737 }
1738 b'/' => {
1739 idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
1740 }
1741 b'?' => true,
1742 c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
1743 _ => false,
1744 }
1745 } else {
1746 false
1747 };
1748
1749 if !is_valid_tag {
1750 let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1751 out.push_str(&input[last..idx]);
1752 out.push_str("<");
1753 idx += 1;
1754 last = idx;
1755 continue;
1756 }
1757 }
1758
1759 idx += 1;
1760 }
1761
1762 if let Some(mut out) = output {
1763 if last < len {
1764 out.push_str(&input[last..]);
1765 }
1766 Cow::Owned(out)
1767 } else {
1768 Cow::Borrowed(input)
1769 }
1770}
1771
/// Rewrites the exact sequences `<br/>`, `<hr/>` and `<img/>` to their slash-less
/// forms.
///
/// Matching is case-sensitive and literal (no attributes, no whitespace). When the
/// input contains none of the sequences it is returned borrowed, without copying.
#[cfg(test)]
fn normalize_self_closing_tags(input: &str) -> Cow<'_, str> {
    const REPLACEMENTS: [(&str, &str); 3] = [("<br/>", "<br>"), ("<hr/>", "<hr>"), ("<img/>", "<img>")];

    if !REPLACEMENTS.iter().any(|(needle, _)| input.contains(*needle)) {
        return Cow::Borrowed(input);
    }

    let mut rewritten = String::with_capacity(input.len());
    let mut rest = input;

    // Every pattern starts with `<`, so only those positions need inspecting.
    while let Some(pos) = rest.find('<') {
        let (before, candidate) = rest.split_at(pos);
        rewritten.push_str(before);

        match REPLACEMENTS.iter().find(|(needle, _)| candidate.starts_with(*needle)) {
            Some((needle, replacement)) => {
                rewritten.push_str(replacement);
                rest = &candidate[needle.len()..];
            }
            None => {
                rewritten.push('<');
                rest = &candidate[1..];
            }
        }
    }

    rewritten.push_str(rest);
    Cow::Owned(rewritten)
}
1812
/// Escapes every `<` that cannot start markup as `&lt;`.
///
/// A `<` is treated as the start of markup when it is followed by an ASCII letter,
/// by `/` plus an ASCII letter, by `?`, or by `!` plus either `-` or an ASCII
/// letter. Any other `<` — including one at the very end of the input — is
/// replaced by `&lt;`.
///
/// Returns `Cow::Borrowed(input)` when nothing had to be escaped.
#[cfg(test)]
fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
    let bytes = input.as_bytes();
    // Allocated lazily on the first escape; `last` marks the uncopied tail of `input`.
    let mut output: Option<String> = None;
    let mut last = 0;

    for (idx, _) in bytes.iter().enumerate().filter(|(_, byte)| **byte == b'<') {
        let starts_markup = match bytes.get(idx + 1) {
            Some(b'!') => matches!(bytes.get(idx + 2), Some(b) if *b == b'-' || b.is_ascii_alphabetic()),
            Some(b'/') => matches!(bytes.get(idx + 2), Some(b) if b.is_ascii_alphabetic()),
            Some(b'?') => true,
            Some(b) => b.is_ascii_alphabetic(),
            None => false,
        };

        if !starts_markup {
            // `+ 4` leaves room for one escape without reallocating.
            let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
            out.push_str(&input[last..idx]);
            // Emit the entity: pushing a literal `<` would leave the text unchanged.
            out.push_str("&lt;");
            last = idx + 1;
        }
    }

    match output {
        Some(mut out) => {
            out.push_str(&input[last..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
1889
/// Returns `raw` with ASCII uppercase letters folded to lowercase.
///
/// Names that contain no ASCII uppercase byte are handed back untouched (no
/// allocation); non-ASCII characters are never modified.
fn normalized_tag_name<'a>(raw: Cow<'a, str>) -> Cow<'a, str> {
    if !raw.bytes().any(|byte| byte.is_ascii_uppercase()) {
        return raw;
    }
    Cow::Owned(raw.to_ascii_lowercase())
}
1899
/// Compares a tag name with `needle`, ignoring ASCII case.
fn tag_name_eq(name: Cow<'_, str>, needle: &str) -> bool {
    let candidate: &str = &name;
    needle.eq_ignore_ascii_case(candidate)
}
1903
1904fn should_drop_for_preprocessing(
1905 node_handle: &tl::NodeHandle,
1906 tag_name: &str,
1907 tag: &tl::HTMLTag,
1908 parser: &tl::Parser,
1909 dom_ctx: &DomContext,
1910 options: &ConversionOptions,
1911) -> bool {
1912 if !options.preprocessing.enabled {
1913 return false;
1914 }
1915
1916 if options.preprocessing.remove_navigation {
1917 let has_nav_hint = element_has_navigation_hint(tag);
1918
1919 if tag_name == "nav" {
1920 return true;
1921 }
1922
1923 if tag_name == "header" {
1924 let inside_semantic_content = has_semantic_content_ancestor(node_handle, parser, dom_ctx);
1925 if !inside_semantic_content {
1926 return true;
1927 }
1928 if has_nav_hint {
1929 return true;
1930 }
1931 } else if tag_name == "footer" || tag_name == "aside" {
1932 if has_nav_hint {
1933 return true;
1934 }
1935 } else if has_nav_hint && !matches!(tag_name, "main" | "article" | "html" | "body" | "head") {
1936 return true;
1937 }
1938 }
1939
1940 if options.preprocessing.remove_forms {
1941 if tag_name == "form" {
1942 let preserves_form = options.preserve_tags.iter().any(|t| t == "form");
1943 if !preserves_form {
1944 return true;
1945 }
1946 } else if matches!(
1947 tag_name,
1948 "button" | "select" | "textarea" | "label" | "fieldset" | "legend"
1949 ) {
1950 return true;
1951 }
1952 }
1953
1954 false
1955}
1956
1957fn has_semantic_content_ancestor(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
1958 let mut current_id = node_handle.get_inner();
1959 while let Some(parent_id) = dom_ctx.parent_map.get(¤t_id).copied().flatten() {
1960 if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
1961 if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
1962 let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
1963 if matches!(parent_name.as_ref(), "main" | "article" | "section") {
1964 return true;
1965 }
1966 if tag_has_main_semantics(parent_tag) {
1967 return true;
1968 }
1969 }
1970 }
1971 current_id = parent_id;
1972 }
1973 false
1974}
1975
1976fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
1977 if let Some(role_attr) = tag.attributes().get("role") {
1978 if let Some(role) = role_attr {
1979 let lowered = role.as_utf8_str().to_ascii_lowercase();
1980 if matches!(lowered.as_str(), "main" | "article" | "document" | "region") {
1981 return true;
1982 }
1983 }
1984 }
1985
1986 if let Some(class_attr) = tag.attributes().get("class") {
1987 if let Some(class_bytes) = class_attr {
1988 let class_value = class_bytes.as_utf8_str().to_ascii_lowercase();
1989 const MAIN_CLASS_HINTS: &[&str] = &[
1990 "mw-body",
1991 "mw-parser-output",
1992 "content-body",
1993 "content-container",
1994 "article-body",
1995 "article-content",
1996 "main-content",
1997 "page-content",
1998 "entry-content",
1999 "post-content",
2000 "document-body",
2001 ];
2002 if MAIN_CLASS_HINTS.iter().any(|hint| class_value.contains(hint)) {
2003 return true;
2004 }
2005 }
2006 }
2007
2008 false
2009}
2010
2011fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
2012 if attribute_matches_any(tag, "role", &["navigation", "menubar", "tablist", "toolbar"]) {
2013 return true;
2014 }
2015
2016 if attribute_contains_any(
2017 tag,
2018 "aria-label",
2019 &["navigation", "menu", "contents", "table of contents", "toc"],
2020 ) {
2021 return true;
2022 }
2023
2024 const NAV_KEYWORDS: &[&str] = &[
2025 "nav",
2026 "navigation",
2027 "navbar",
2028 "breadcrumbs",
2029 "breadcrumb",
2030 "toc",
2031 "sidebar",
2032 "sidenav",
2033 "menu",
2034 "menubar",
2035 "mainmenu",
2036 "subnav",
2037 "tabs",
2038 "tablist",
2039 "toolbar",
2040 "pager",
2041 "pagination",
2042 "skipnav",
2043 "skip-link",
2044 "skiplinks",
2045 "site-nav",
2046 "site-menu",
2047 "site-header",
2048 "site-footer",
2049 "topbar",
2050 "bottombar",
2051 "masthead",
2052 "vector-nav",
2053 "vector-header",
2054 "vector-footer",
2055 ];
2056
2057 attribute_matches_any(tag, "class", NAV_KEYWORDS) || attribute_matches_any(tag, "id", NAV_KEYWORDS)
2058}
2059
2060fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2061 let Some(attr_value) = tag.attributes().get(attr) else {
2062 return false;
2063 };
2064 let Some(value) = attr_value else {
2065 return false;
2066 };
2067 let raw = value.as_utf8_str();
2068 raw.split_whitespace()
2069 .map(|token| {
2070 token
2071 .chars()
2072 .map(|c| match c {
2073 '_' | ':' | '.' | '/' => '-',
2074 _ => c,
2075 })
2076 .collect::<String>()
2077 .to_ascii_lowercase()
2078 })
2079 .filter(|token| !token.is_empty())
2080 .any(|token| keywords.iter().any(|kw| token == *kw))
2081}
2082
2083fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2084 let Some(attr_value) = tag.attributes().get(attr) else {
2085 return false;
2086 };
2087 let Some(value) = attr_value else {
2088 return false;
2089 };
2090 let lower = value.as_utf8_str().to_ascii_lowercase();
2091 keywords.iter().any(|kw| lower.contains(*kw))
2092}
2093
2094fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
2098 let mut html = String::new();
2099 serialize_node_to_html(handle, parser, &mut html);
2100 html
2101}
2102
2103fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
2105 match handle.get(parser) {
2106 Some(tl::Node::Tag(tag)) => {
2107 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2108
2109 output.push('<');
2111 output.push_str(&tag_name);
2112
2113 for (key, value) in tag.attributes().iter() {
2115 output.push(' ');
2116 output.push_str(&key);
2117 if let Some(val) = value {
2118 output.push_str("=\"");
2119 output.push_str(&val);
2120 output.push('"');
2121 }
2122 }
2123
2124 output.push('>');
2125
2126 let children = tag.children();
2128 for child_handle in children.top().iter() {
2129 serialize_node_to_html(child_handle, parser, output);
2130 }
2131
2132 if !matches!(
2134 tag_name.as_ref(),
2135 "br" | "hr"
2136 | "img"
2137 | "input"
2138 | "meta"
2139 | "link"
2140 | "area"
2141 | "base"
2142 | "col"
2143 | "embed"
2144 | "param"
2145 | "source"
2146 | "track"
2147 | "wbr"
2148 ) {
2149 output.push_str("</");
2150 output.push_str(&tag_name);
2151 output.push('>');
2152 }
2153 }
2154 Some(tl::Node::Raw(bytes)) => {
2155 if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
2156 output.push_str(text);
2157 }
2158 }
2159 _ => {}
2160 }
2161}
2162
/// Removes the content of `<script>` and `<style>` elements that are not nested
/// inside an `<svg>`.
///
/// The opening tag is kept and immediately followed by a synthesised closing tag;
/// everything up to and including the matching original closing tag is dropped (an
/// unterminated element swallows the rest of the input). `<svg>` open and close
/// tags are skipped while their nesting depth is tracked. Returns the input
/// borrowed when nothing was removed.
#[cfg(test)]
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
    const RAW_TEXT_TAGS: [&str; 2] = ["script", "style"];
    const SVG: &[u8] = b"svg";

    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut idx = 0;
    let mut last = 0;
    let mut output: Option<String> = None;
    let mut svg_depth = 0usize;

    while idx < len {
        if bytes[idx] != b'<' {
            idx += 1;
            continue;
        }

        let name_start = idx + 1;

        if matches_tag_start(bytes, name_start, SVG) {
            if let Some(open_end) = find_tag_end(bytes, name_start + SVG.len()) {
                svg_depth += 1;
                idx = open_end;
                continue;
            }
        } else if matches_end_tag_start(bytes, name_start, SVG) {
            if let Some(close_end) = find_tag_end(bytes, name_start + 1 + SVG.len()) {
                svg_depth = svg_depth.saturating_sub(1);
                idx = close_end;
                continue;
            }
        }

        if svg_depth == 0 {
            // First of script/style that opens here and has a terminated opening tag.
            let raw_text_open = RAW_TEXT_TAGS.iter().find_map(|tag| {
                if !matches_tag_start(bytes, name_start, tag.as_bytes()) {
                    return None;
                }
                find_tag_end(bytes, name_start + tag.len()).map(|open_end| (*tag, open_end))
            });

            if let Some((tag, open_end)) = raw_text_open {
                let remove_end = find_closing_tag(bytes, open_end, tag.as_bytes()).unwrap_or(len);
                let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
                out.push_str(&input[last..idx]);
                out.push_str(&input[idx..open_end]);
                out.push_str("</");
                out.push_str(tag);
                out.push('>');

                last = remove_end;
                idx = remove_end;
                continue;
            }
        }

        idx += 1;
    }

    match output {
        Some(mut out) => {
            out.push_str(&input[last..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
2235
/// Reports whether `bytes[start..]` begins with the tag name `tag` (ASCII
/// case-insensitive) followed by a name terminator.
///
/// Accepted terminators are `>`, `/`, space, tab, `\n`, `\r`, or the end of the
/// input. Returns `false` when `start` is at or past the end of `bytes`.
fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }

    let name_end = start + tag.len();
    let Some(candidate) = bytes.get(start..name_end) else {
        return false;
    };

    candidate.eq_ignore_ascii_case(tag)
        && matches!(
            bytes.get(name_end),
            None | Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
        )
}
2257
/// Finds the end of a tag, scanning forward from `idx`.
///
/// Returns the index just past the first `>` that is not inside a single- or
/// double-quoted section, or `None` when no such `>` exists. A quote is closed only
/// by the same quote character that opened it.
fn find_tag_end(bytes: &[u8], idx: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;

    for (offset, &byte) in bytes.iter().enumerate().skip(idx) {
        match (open_quote, byte) {
            (None, b'>') => return Some(offset + 1),
            (None, b'"' | b'\'') => open_quote = Some(byte),
            (Some(quote), _) if quote == byte => open_quote = None,
            _ => {}
        }
    }

    None
}
2281
2282fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
2283 let len = bytes.len();
2284 let mut depth = 1usize;
2285
2286 while idx < len {
2287 if bytes[idx] == b'<' {
2288 if matches_tag_start(bytes, idx + 1, tag) {
2289 if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
2290 depth += 1;
2291 idx = next;
2292 continue;
2293 }
2294 } else if matches_end_tag_start(bytes, idx + 1, tag) {
2295 if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
2296 depth -= 1;
2297 if depth == 0 {
2298 return Some(close);
2299 }
2300 idx = close;
2301 continue;
2302 }
2303 }
2304 }
2305
2306 idx += 1;
2307 }
2308
2309 None
2310}
2311
2312fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
2313 if start >= bytes.len() || bytes[start] != b'/' {
2314 return false;
2315 }
2316 matches_tag_start(bytes, start + 1, tag)
2317}
2318
/// Reports whether `tag_name` is one of the tags this module treats as inline.
///
/// The comparison is exact, so callers are expected to pass a lowercase name.
fn is_inline_element(tag_name: &str) -> bool {
    const INLINE_TAGS: &[&str] = &[
        "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em", "i", "kbd", "mark", "q", "rp",
        "rt", "ruby", "s", "samp", "small", "span", "strong", "sub", "sup", "time", "u", "var", "wbr", "del",
        "ins", "img", "map", "area", "audio", "video", "picture", "source", "track", "embed", "object", "param",
        "input", "label", "button", "select", "textarea", "output", "progress", "meter",
    ];

    INLINE_TAGS.contains(&tag_name)
}
2374
2375fn is_block_level_element(tag_name: &str) -> bool {
2377 !is_inline_element(tag_name)
2378 && matches!(
2379 tag_name,
2380 "address"
2381 | "article"
2382 | "aside"
2383 | "blockquote"
2384 | "canvas"
2385 | "dd"
2386 | "div"
2387 | "dl"
2388 | "dt"
2389 | "fieldset"
2390 | "figcaption"
2391 | "figure"
2392 | "footer"
2393 | "form"
2394 | "h1"
2395 | "h2"
2396 | "h3"
2397 | "h4"
2398 | "h5"
2399 | "h6"
2400 | "header"
2401 | "hr"
2402 | "li"
2403 | "main"
2404 | "nav"
2405 | "ol"
2406 | "p"
2407 | "pre"
2408 | "section"
2409 | "table"
2410 | "tfoot"
2411 | "ul"
2412 )
2413}
2414
2415fn get_next_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2416 let id = node_handle.get_inner();
2417 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2418
2419 let siblings = if let Some(parent_id) = parent {
2420 dom_ctx.children_map.get(&parent_id)?
2421 } else {
2422 &dom_ctx.root_children
2423 };
2424
2425 let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2426
2427 for sibling in siblings.iter().skip(position + 1) {
2428 if let Some(node) = sibling.get(parser) {
2429 match node {
2430 tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2431 tl::Node::Raw(raw) => {
2432 if !raw.as_utf8_str().trim().is_empty() {
2433 return None;
2434 }
2435 }
2436 _ => {}
2437 }
2438 }
2439 }
2440
2441 None
2442}
2443
2444fn get_previous_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2445 let id = node_handle.get_inner();
2446 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2447
2448 let siblings = if let Some(parent_id) = parent {
2449 dom_ctx.children_map.get(&parent_id)?
2450 } else {
2451 &dom_ctx.root_children
2452 };
2453
2454 let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2455
2456 for sibling in siblings.iter().take(position).rev() {
2457 if let Some(node) = sibling.get(parser) {
2458 match node {
2459 tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2460 tl::Node::Raw(raw) => {
2461 if !raw.as_utf8_str().trim().is_empty() {
2462 return None;
2463 }
2464 }
2465 _ => {}
2466 }
2467 }
2468 }
2469
2470 None
2471}
2472
2473fn previous_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2474 let id = node_handle.get_inner();
2475 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2476
2477 let siblings = if let Some(parent_id) = parent {
2478 if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2479 children
2480 } else {
2481 return false;
2482 }
2483 } else {
2484 &dom_ctx.root_children
2485 };
2486
2487 let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2488 return false;
2489 };
2490
2491 for sibling in siblings.iter().take(position).rev() {
2492 if let Some(node) = sibling.get(parser) {
2493 match node {
2494 tl::Node::Tag(tag) => {
2495 let name = normalized_tag_name(tag.name().as_utf8_str());
2496 return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2497 }
2498 tl::Node::Raw(raw) => {
2499 if raw.as_utf8_str().trim().is_empty() {
2500 continue;
2501 }
2502 return false;
2503 }
2504 _ => continue,
2505 }
2506 }
2507 }
2508
2509 false
2510}
2511
2512fn next_sibling_is_whitespace_text(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2513 let id = node_handle.get_inner();
2514 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2515
2516 let siblings = if let Some(parent_id) = parent {
2517 if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2518 children
2519 } else {
2520 return false;
2521 }
2522 } else {
2523 &dom_ctx.root_children
2524 };
2525
2526 let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2527 return false;
2528 };
2529
2530 for sibling in siblings.iter().skip(position + 1) {
2531 if let Some(node) = sibling.get(parser) {
2532 match node {
2533 tl::Node::Raw(raw) => return raw.as_utf8_str().trim().is_empty(),
2534 tl::Node::Tag(_) => return false,
2535 _ => continue,
2536 }
2537 }
2538 }
2539
2540 false
2541}
2542
2543fn next_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2544 let id = node_handle.get_inner();
2545 let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2546
2547 let siblings = if let Some(parent_id) = parent {
2548 if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2549 children
2550 } else {
2551 return false;
2552 }
2553 } else {
2554 &dom_ctx.root_children
2555 };
2556
2557 let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2558 return false;
2559 };
2560
2561 for sibling in siblings.iter().skip(position + 1) {
2562 if let Some(node) = sibling.get(parser) {
2563 match node {
2564 tl::Node::Tag(tag) => {
2565 let name = normalized_tag_name(tag.name().as_utf8_str());
2566 return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2567 }
2568 tl::Node::Raw(raw) => {
2569 if raw.as_utf8_str().trim().is_empty() {
2570 continue;
2571 }
2572 return false;
2573 }
2574 _ => continue,
2575 }
2576 }
2577 }
2578
2579 false
2580}
2581
2582fn append_inline_suffix(
2583 output: &mut String,
2584 suffix: &str,
2585 has_core_content: bool,
2586 node_handle: &tl::NodeHandle,
2587 parser: &tl::Parser,
2588 dom_ctx: &DomContext,
2589) {
2590 if suffix.is_empty() {
2591 return;
2592 }
2593
2594 if suffix == " " && has_core_content && next_sibling_is_whitespace_text(node_handle, parser, dom_ctx) {
2595 return;
2596 }
2597
2598 output.push_str(suffix);
2599}
2600
2601#[allow(clippy::only_used_in_recursion)]
2603fn walk_node(
2604 node_handle: &tl::NodeHandle,
2605 parser: &tl::Parser,
2606 output: &mut String,
2607 options: &ConversionOptions,
2608 ctx: &Context,
2609 depth: usize,
2610 dom_ctx: &DomContext,
2611) {
2612 let Some(node) = node_handle.get(parser) else { return };
2613
2614 match node {
2615 tl::Node::Raw(bytes) => {
2616 let mut text = text::decode_html_entities(&bytes.as_utf8_str());
2617
2618 if text.is_empty() {
2619 return;
2620 }
2621
2622 if options.strip_newlines {
2623 text = text.replace(['\r', '\n'], " ");
2624 }
2625
2626 if text.trim().is_empty() {
2627 if ctx.in_code {
2628 output.push_str(&text);
2629 return;
2630 }
2631
2632 if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2633 if ctx.convert_as_inline || ctx.in_table_cell || ctx.in_list_item {
2634 output.push_str(&text);
2635 return;
2636 }
2637 if text.contains("\n\n") || text.contains("\r\n\r\n") {
2638 if !output.ends_with("\n\n") {
2639 output.push('\n');
2640 }
2641 return;
2642 }
2643 output.push_str(&text);
2644 return;
2645 }
2646
2647 if text.contains('\n') {
2648 if output.is_empty() {
2649 return;
2650 }
2651 if !output.ends_with("\n\n") {
2652 if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2653 if is_inline_element(&next_tag) {
2654 return;
2655 }
2656 }
2657 }
2658 return;
2659 }
2660
2661 if previous_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2662 && next_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2663 {
2664 if text.chars().count() > 1 {
2665 if !output.ends_with(' ') {
2666 output.push(' ');
2667 }
2668 } else {
2669 output.push_str(&text);
2670 }
2671 } else {
2672 output.push_str(&text);
2673 }
2674 return;
2675 }
2676
2677 let processed_text = if ctx.in_code || ctx.in_ruby {
2678 text
2679 } else if ctx.in_table_cell {
2680 let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
2681 let normalized_text = text::normalize_whitespace(&text);
2682 text::escape(
2683 &normalized_text,
2684 options.escape_misc,
2685 options.escape_asterisks,
2686 options.escape_underscores,
2687 options.escape_ascii,
2688 )
2689 } else {
2690 text::escape(
2691 &text,
2692 options.escape_misc,
2693 options.escape_asterisks,
2694 options.escape_underscores,
2695 options.escape_ascii,
2696 )
2697 };
2698 if options.escape_misc {
2700 escaped
2701 } else {
2702 escaped.replace('|', r"\|")
2703 }
2704 } else if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2705 text::escape(
2706 &text,
2707 options.escape_misc,
2708 options.escape_asterisks,
2709 options.escape_underscores,
2710 options.escape_ascii,
2711 )
2712 } else {
2713 let has_trailing_single_newline =
2714 text.ends_with('\n') && !text.ends_with("\n\n") && !text.ends_with("\r\n\r\n");
2715
2716 let normalized_text = text::normalize_whitespace(&text);
2717
2718 let (prefix, suffix, core) = text::chomp(&normalized_text);
2719
2720 let skip_prefix = output.ends_with("\n\n")
2721 || output.ends_with("* ")
2722 || output.ends_with("- ")
2723 || output.ends_with(". ")
2724 || output.ends_with("] ")
2725 || (output.ends_with('\n') && prefix == " ")
2726 || (output.ends_with(' ')
2727 && prefix == " "
2728 && !previous_sibling_is_inline_tag(node_handle, parser, dom_ctx));
2729
2730 let mut final_text = String::new();
2731 if !skip_prefix && !prefix.is_empty() {
2732 final_text.push_str(prefix);
2733 }
2734
2735 let escaped_core = text::escape(
2736 core,
2737 options.escape_misc,
2738 options.escape_asterisks,
2739 options.escape_underscores,
2740 options.escape_ascii,
2741 );
2742 final_text.push_str(&escaped_core);
2743
2744 if !suffix.is_empty() {
2745 final_text.push_str(suffix);
2746 } else if has_trailing_single_newline {
2747 let at_paragraph_break = output.ends_with("\n\n");
2748 if options.debug {
2749 eprintln!(
2750 "[DEBUG] Text had trailing single newline that was chomped, at_paragraph_break={}",
2751 at_paragraph_break
2752 );
2753 }
2754 if !at_paragraph_break {
2755 if text.contains("\n\n") || text.contains("\r\n\r\n") {
2756 final_text.push('\n');
2757 } else if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2758 if options.debug {
2759 eprintln!("[DEBUG] Next sibling tag after newline: {}", next_tag);
2760 }
2761 if matches!(next_tag.as_str(), "span") {
2762 } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2764 final_text.push(' ');
2765 } else {
2766 final_text.push('\n');
2767 }
2768 } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2769 final_text.push(' ');
2770 } else {
2771 final_text.push('\n');
2772 }
2773 }
2774 }
2775
2776 final_text
2777 };
2778
2779 if ctx.in_list_item && processed_text.contains("\n\n") {
2780 let parts: Vec<&str> = processed_text.split("\n\n").collect();
2781 for (i, part) in parts.iter().enumerate() {
2782 if i > 0 {
2783 output.push_str("\n\n");
2784 output.push_str(&" ".repeat(4 * ctx.list_depth));
2785 }
2786 output.push_str(part.trim());
2787 }
2788 } else {
2789 output.push_str(&processed_text);
2790 }
2791 }
2792
2793 tl::Node::Tag(tag) => {
2794 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2795
2796 if should_drop_for_preprocessing(node_handle, tag_name.as_ref(), tag, parser, dom_ctx, options) {
2797 trim_trailing_whitespace(output);
2798 if options.debug {
2799 eprintln!("[DEBUG] Dropping <{}> subtree due to preprocessing settings", tag_name);
2800 }
2801 return;
2802 }
2803
2804 if options.strip_tags.iter().any(|t| t.as_str() == tag_name) {
2805 let children = tag.children();
2806 {
2807 for child_handle in children.top().iter() {
2808 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2809 }
2810 }
2811 return;
2812 }
2813
2814 if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
2816 let html = serialize_tag_to_html(node_handle, parser);
2817 output.push_str(&html);
2818 return;
2819 }
2820
2821 #[cfg(feature = "metadata")]
2823 if matches!(tag_name.as_ref(), "html" | "head" | "body") {
2824 if let Some(ref collector) = ctx.metadata_collector {
2825 let mut c = collector.borrow_mut();
2826
2827 if let Some(lang) = tag.attributes().get("lang").flatten() {
2828 c.set_language(lang.as_utf8_str().to_string());
2829 }
2830
2831 if let Some(dir) = tag.attributes().get("dir").flatten() {
2832 c.set_text_direction(dir.as_utf8_str().to_string());
2833 }
2834 }
2835 }
2836
2837 match tag_name.as_ref() {
2838 "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
2839 let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
2840
2841 let mut text = String::new();
2842 let heading_ctx = Context {
2843 in_heading: true,
2844 convert_as_inline: true,
2845 heading_tag: Some(tag_name.to_string()),
2846 ..ctx.clone()
2847 };
2848 let children = tag.children();
2849 {
2850 for child_handle in children.top().iter() {
2851 walk_node(
2852 child_handle,
2853 parser,
2854 &mut text,
2855 options,
2856 &heading_ctx,
2857 depth + 1,
2858 dom_ctx,
2859 );
2860 }
2861 }
2862 let trimmed = text.trim();
2863 if !trimmed.is_empty() {
2864 let normalized = normalize_heading_text(trimmed);
2865 push_heading(output, ctx, options, level, normalized.as_ref());
2866
2867 #[cfg(feature = "metadata")]
2869 if let Some(ref collector) = ctx.metadata_collector {
2870 let id = tag
2871 .attributes()
2872 .get("id")
2873 .flatten()
2874 .map(|v| v.as_utf8_str().to_string());
2875 collector
2876 .borrow_mut()
2877 .add_header(level as u8, normalized.to_string(), id, depth, 0);
2878 }
2879 }
2880 }
2881
2882 "p" => {
2883 let content_start_pos = output.len();
2884
2885 let is_table_continuation =
2886 ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
2887
2888 let is_list_continuation = ctx.in_list_item
2889 && !output.is_empty()
2890 && !output.ends_with("* ")
2891 && !output.ends_with("- ")
2892 && !output.ends_with(". ");
2893
2894 let after_code_block = output.ends_with("```\n");
2895 let needs_leading_sep = !ctx.in_table_cell
2896 && !ctx.in_list_item
2897 && !ctx.convert_as_inline
2898 && ctx.blockquote_depth == 0
2899 && !output.is_empty()
2900 && !output.ends_with("\n\n")
2901 && !after_code_block;
2902
2903 if is_table_continuation {
2904 trim_trailing_whitespace(output);
2905 output.push_str("<br>");
2906 } else if is_list_continuation {
2907 add_list_continuation_indent(output, ctx.list_depth, true, options);
2908 } else if needs_leading_sep {
2909 trim_trailing_whitespace(output);
2910 output.push_str("\n\n");
2911 }
2912
2913 let p_ctx = Context {
2914 in_paragraph: true,
2915 ..ctx.clone()
2916 };
2917
2918 let children = tag.children();
2919 {
2920 let child_handles: Vec<_> = children.top().iter().collect();
2921 for (i, child_handle) in child_handles.iter().enumerate() {
2922 if let Some(node) = child_handle.get(parser) {
2924 if let tl::Node::Raw(bytes) = node {
2925 let text = bytes.as_utf8_str();
2926 if text.trim().is_empty() && i > 0 && i < child_handles.len() - 1 {
2927 let prev = &child_handles[i - 1];
2928 let next = &child_handles[i + 1];
2929 if is_empty_inline_element(prev, parser)
2930 && is_empty_inline_element(next, parser)
2931 {
2932 continue;
2933 }
2934 }
2935 }
2936 }
2937 walk_node(child_handle, parser, output, options, &p_ctx, depth + 1, dom_ctx);
2938 }
2939 }
2940
2941 let has_content = output.len() > content_start_pos;
2942
2943 if has_content && !ctx.convert_as_inline && !ctx.in_table_cell {
2944 output.push_str("\n\n");
2945 }
2946 }
2947
2948 "strong" | "b" => {
2949 if ctx.in_code {
2950 let children = tag.children();
2951 {
2952 for child_handle in children.top().iter() {
2953 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2954 }
2955 }
2956 } else {
2957 let mut content = String::with_capacity(64);
2958 let children = tag.children();
2959 {
2960 let strong_ctx = Context {
2961 inline_depth: ctx.inline_depth + 1,
2962 in_strong: true,
2963 ..ctx.clone()
2964 };
2965 for child_handle in children.top().iter() {
2966 walk_node(
2967 child_handle,
2968 parser,
2969 &mut content,
2970 options,
2971 &strong_ctx,
2972 depth + 1,
2973 dom_ctx,
2974 );
2975 }
2976 }
2977 let (prefix, suffix, trimmed) = chomp_inline(&content);
2978 if !content.trim().is_empty() {
2979 output.push_str(prefix);
2980 if ctx.in_strong {
2981 output.push_str(trimmed);
2982 } else {
2983 output.push(options.strong_em_symbol);
2984 output.push(options.strong_em_symbol);
2985 output.push_str(trimmed);
2986 output.push(options.strong_em_symbol);
2987 output.push(options.strong_em_symbol);
2988 }
2989 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
2990 } else if !content.is_empty() {
2991 output.push_str(prefix);
2992 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
2993 }
2994 }
2995 }
2996
2997 "em" | "i" => {
2998 if ctx.in_code {
2999 let children = tag.children();
3000 {
3001 for child_handle in children.top().iter() {
3002 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3003 }
3004 }
3005 } else {
3006 let mut content = String::with_capacity(64);
3007 let children = tag.children();
3008 {
3009 let em_ctx = Context {
3010 inline_depth: ctx.inline_depth + 1,
3011 ..ctx.clone()
3012 };
3013 for child_handle in children.top().iter() {
3014 walk_node(child_handle, parser, &mut content, options, &em_ctx, depth + 1, dom_ctx);
3015 }
3016 }
3017 let (prefix, suffix, trimmed) = chomp_inline(&content);
3018 if !content.trim().is_empty() {
3019 output.push_str(prefix);
3020 output.push(options.strong_em_symbol);
3021 output.push_str(trimmed);
3022 output.push(options.strong_em_symbol);
3023 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3024 } else if !content.is_empty() {
3025 output.push_str(prefix);
3026 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3027 } else if let Some(class_value) = tag
3028 .attributes()
3029 .get("class")
3030 .and_then(|v| v.as_ref().map(|val| val.as_utf8_str().to_string()))
3031 {
3032 if class_value.contains("caret") && !output.ends_with(' ') {
3033 output.push_str(" > ");
3034 }
3035 }
3036 }
3037 }
3038
3039 "a" => {
3040 const MAX_LINK_LABEL_LEN: usize = 512;
3041
3042 let href_attr = tag
3043 .attributes()
3044 .get("href")
3045 .flatten()
3046 .map(|v| text::decode_html_entities(&v.as_utf8_str()));
3047 let title = tag
3048 .attributes()
3049 .get("title")
3050 .flatten()
3051 .map(|v| v.as_utf8_str().to_string());
3052
3053 if let Some(href) = href_attr {
3054 let raw_text = text::normalize_whitespace(&get_text_content(node_handle, parser))
3055 .trim()
3056 .to_string();
3057
3058 let is_autolink = options.autolinks
3059 && !options.default_title
3060 && !href.is_empty()
3061 && (raw_text == href || (href.starts_with("mailto:") && raw_text == href[7..]));
3062
3063 if is_autolink {
3064 output.push('<');
3065 if href.starts_with("mailto:") && raw_text == href[7..] {
3066 output.push_str(&raw_text);
3067 } else {
3068 output.push_str(&href);
3069 }
3070 output.push('>');
3071 return;
3072 }
3073
3074 if let Some((heading_level, heading_handle)) = find_single_heading_child(node_handle, parser) {
3075 if let Some(heading_node) = heading_handle.get(parser) {
3076 if let tl::Node::Tag(heading_tag) = heading_node {
3077 let heading_name =
3078 normalized_tag_name(heading_tag.name().as_utf8_str()).into_owned();
3079 let mut heading_text = String::new();
3080 let heading_ctx = Context {
3081 in_heading: true,
3082 convert_as_inline: true,
3083 heading_tag: Some(heading_name),
3084 ..ctx.clone()
3085 };
3086 walk_node(
3087 &heading_handle,
3088 parser,
3089 &mut heading_text,
3090 options,
3091 &heading_ctx,
3092 depth + 1,
3093 dom_ctx,
3094 );
3095 let trimmed_heading = heading_text.trim();
3096 if !trimmed_heading.is_empty() {
3097 let escaped_label = escape_link_label(trimmed_heading);
3098 let mut link_buffer = String::new();
3099 append_markdown_link(
3100 &mut link_buffer,
3101 &escaped_label,
3102 href.as_str(),
3103 title.as_deref(),
3104 raw_text.as_str(),
3105 options,
3106 );
3107 push_heading(output, ctx, options, heading_level, link_buffer.as_str());
3108 return;
3109 }
3110 }
3111 }
3112 }
3113
3114 let children: Vec<_> = tag.children().top().iter().copied().collect();
3115 let (inline_label, _block_nodes, saw_block) = collect_link_label_text(&children, parser);
3116 let mut label = if saw_block {
3117 let mut content = String::new();
3118 let link_ctx = Context {
3119 inline_depth: ctx.inline_depth + 1,
3120 convert_as_inline: true,
3121 ..ctx.clone()
3122 };
3123 for child_handle in children.iter() {
3124 let mut child_buf = String::new();
3125 walk_node(
3126 child_handle,
3127 parser,
3128 &mut child_buf,
3129 options,
3130 &link_ctx,
3131 depth + 1,
3132 dom_ctx,
3133 );
3134 if !child_buf.trim().is_empty()
3135 && !content.is_empty()
3136 && !content.chars().last().map(|c| c.is_whitespace()).unwrap_or(true)
3137 && !child_buf.chars().next().map(|c| c.is_whitespace()).unwrap_or(true)
3138 {
3139 content.push(' ');
3140 }
3141 content.push_str(&child_buf);
3142 }
3143 if content.trim().is_empty() {
3144 normalize_link_label(&inline_label)
3145 } else {
3146 normalize_link_label(&content)
3147 }
3148 } else {
3149 let mut content = String::new();
3150 let link_ctx = Context {
3151 inline_depth: ctx.inline_depth + 1,
3152 ..ctx.clone()
3153 };
3154 for child_handle in children.iter() {
3155 walk_node(
3156 child_handle,
3157 parser,
3158 &mut content,
3159 options,
3160 &link_ctx,
3161 depth + 1,
3162 dom_ctx,
3163 );
3164 }
3165 normalize_link_label(&content)
3166 };
3167
3168 if label.is_empty() && saw_block {
3169 let fallback = text::normalize_whitespace(&get_text_content(node_handle, parser));
3170 label = normalize_link_label(&fallback);
3171 }
3172
3173 if label.is_empty() && !raw_text.is_empty() {
3174 label = normalize_link_label(&raw_text);
3175 }
3176
3177 if label.is_empty() && !href.is_empty() && !children.is_empty() {
3178 label = href.clone();
3179 }
3180
3181 if label.len() > MAX_LINK_LABEL_LEN {
3182 truncate_at_char_boundary(&mut label, MAX_LINK_LABEL_LEN);
3183 label.push('…');
3184 }
3185
3186 let escaped_label = escape_link_label(&label);
3187 append_markdown_link(
3188 output,
3189 &escaped_label,
3190 href.as_str(),
3191 title.as_deref(),
3192 label.as_str(),
3193 options,
3194 );
3195
3196 #[cfg(feature = "metadata")]
3198 if let Some(ref collector) = ctx.metadata_collector {
3199 let rel_attr = tag
3200 .attributes()
3201 .get("rel")
3202 .flatten()
3203 .map(|v| v.as_utf8_str().to_string());
3204 let mut attributes_map = BTreeMap::new();
3205 for (key, value_opt) in tag.attributes().iter() {
3206 let key_str = key.to_string();
3207 if key_str == "href" {
3208 continue;
3209 }
3210
3211 let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3212 attributes_map.insert(key_str, value);
3213 }
3214 collector.borrow_mut().add_link(
3215 href.clone(),
3216 label.clone(),
3217 title.clone(),
3218 rel_attr,
3219 attributes_map,
3220 );
3221 }
3222 } else {
3223 let children = tag.children();
3224 {
3225 for child_handle in children.top().iter() {
3226 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3227 }
3228 }
3229 }
3230 }
3231
3232 "img" => {
3233 use std::borrow::Cow;
3234
3235 let src = tag
3236 .attributes()
3237 .get("src")
3238 .flatten()
3239 .map(|v| v.as_utf8_str())
3240 .unwrap_or(Cow::Borrowed(""));
3241
3242 let alt = tag
3243 .attributes()
3244 .get("alt")
3245 .flatten()
3246 .map(|v| v.as_utf8_str())
3247 .unwrap_or(Cow::Borrowed(""));
3248
3249 let title = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str());
3250 #[cfg(feature = "metadata")]
3251 let mut attributes_map = BTreeMap::new();
3252 #[cfg(feature = "metadata")]
3253 let mut width: Option<u32> = None;
3254 #[cfg(feature = "metadata")]
3255 let mut height: Option<u32> = None;
3256 #[cfg(feature = "metadata")]
3257 for (key, value_opt) in tag.attributes().iter() {
3258 let key_str = key.to_string();
3259 if key_str == "src" {
3260 continue;
3261 }
3262 let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3263 if key_str == "width" {
3264 if let Ok(parsed) = value.parse::<u32>() {
3265 width = Some(parsed);
3266 }
3267 } else if key_str == "height" {
3268 if let Ok(parsed) = value.parse::<u32>() {
3269 height = Some(parsed);
3270 }
3271 }
3272 attributes_map.insert(key_str, value);
3273 }
3274
3275 #[cfg(feature = "inline-images")]
3276 if let Some(ref collector_ref) = ctx.inline_collector {
3277 let mut attributes_map = BTreeMap::new();
3278 for (key, value_opt) in tag.attributes().iter() {
3279 let key_str = key.to_string();
3280 let keep = key_str == "width"
3281 || key_str == "height"
3282 || key_str == "filename"
3283 || key_str == "aria-label"
3284 || key_str.starts_with("data-");
3285 if keep {
3286 let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
3287 attributes_map.insert(key_str, value);
3288 }
3289 }
3290 handle_inline_data_image(
3291 collector_ref,
3292 src.as_ref(),
3293 alt.as_ref(),
3294 title.as_deref(),
3295 attributes_map,
3296 );
3297 }
3298
3299 let keep_as_markdown = ctx.in_heading
3300 && ctx
3301 .heading_tag
3302 .as_ref()
3303 .is_some_and(|tag| options.keep_inline_images_in.iter().any(|t| t == tag));
3304
3305 let should_use_alt_text = !keep_as_markdown
3306 && (ctx.convert_as_inline
3307 || (ctx.in_heading
3308 && ctx
3309 .heading_tag
3310 .as_ref()
3311 .is_none_or(|tag| !options.keep_inline_images_in.iter().any(|t| t == tag))));
3312
3313 if should_use_alt_text {
3314 output.push_str(&alt);
3315 } else {
3316 output.push_str(";
3319 output.push_str(&src);
3320 if let Some(ref title_text) = title {
3321 output.push_str(" \"");
3322 output.push_str(title_text);
3323 output.push('"');
3324 }
3325 output.push(')');
3326 }
3327
3328 #[cfg(feature = "metadata")]
3330 if let Some(ref collector) = ctx.metadata_collector {
3331 if !src.is_empty() {
3332 let dimensions = match (width, height) {
3333 (Some(w), Some(h)) => Some((w, h)),
3334 _ => None,
3335 };
3336 collector.borrow_mut().add_image(
3337 src.to_string(),
3338 if alt.is_empty() { None } else { Some(alt.to_string()) },
3339 title.as_deref().map(|t| t.to_string()),
3340 dimensions,
3341 attributes_map.clone(),
3342 );
3343 }
3344 }
3345 }
3346
3347 "mark" => {
3348 if ctx.convert_as_inline {
3349 let children = tag.children();
3350 {
3351 for child_handle in children.top().iter() {
3352 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3353 }
3354 }
3355 } else {
3356 use crate::options::HighlightStyle;
3357 match options.highlight_style {
3358 HighlightStyle::DoubleEqual => {
3359 output.push_str("==");
3360 let children = tag.children();
3361 {
3362 for child_handle in children.top().iter() {
3363 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3364 }
3365 }
3366 output.push_str("==");
3367 }
3368 HighlightStyle::Html => {
3369 output.push_str("<mark>");
3370 let children = tag.children();
3371 {
3372 for child_handle in children.top().iter() {
3373 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3374 }
3375 }
3376 output.push_str("</mark>");
3377 }
3378 HighlightStyle::Bold => {
3379 let symbol = options.strong_em_symbol.to_string().repeat(2);
3380 output.push_str(&symbol);
3381 let bold_ctx = Context {
3382 in_strong: true,
3383 ..ctx.clone()
3384 };
3385 let children = tag.children();
3386 {
3387 for child_handle in children.top().iter() {
3388 walk_node(child_handle, parser, output, options, &bold_ctx, depth + 1, dom_ctx);
3389 }
3390 }
3391 output.push_str(&symbol);
3392 }
3393 HighlightStyle::None => {
3394 let children = tag.children();
3395 {
3396 for child_handle in children.top().iter() {
3397 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3398 }
3399 }
3400 }
3401 }
3402 }
3403 }
3404
3405 "del" | "s" => {
3406 if ctx.in_code {
3407 let children = tag.children();
3408 {
3409 for child_handle in children.top().iter() {
3410 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3411 }
3412 }
3413 } else {
3414 let mut content = String::with_capacity(32);
3415 let children = tag.children();
3416 {
3417 for child_handle in children.top().iter() {
3418 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3419 }
3420 }
3421 let (prefix, suffix, trimmed) = chomp_inline(&content);
3422 if !content.trim().is_empty() {
3423 output.push_str(prefix);
3424 output.push_str("~~");
3425 output.push_str(trimmed);
3426 output.push_str("~~");
3427 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3428 } else if !content.is_empty() {
3429 output.push_str(prefix);
3430 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3431 }
3432 }
3433 }
3434
3435 "ins" => {
3436 let mut content = String::with_capacity(32);
3437 let children = tag.children();
3438 {
3439 for child_handle in children.top().iter() {
3440 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3441 }
3442 }
3443 let (prefix, suffix, trimmed) = chomp_inline(&content);
3444 if !trimmed.is_empty() {
3445 output.push_str(prefix);
3446 output.push_str("==");
3447 output.push_str(trimmed);
3448 output.push_str("==");
3449 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3450 }
3451 }
3452
3453 "u" | "small" => {
3454 let children = tag.children();
3455 {
3456 for child_handle in children.top().iter() {
3457 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3458 }
3459 }
3460 }
3461
3462 "sub" => {
3463 if !ctx.in_code && !options.sub_symbol.is_empty() {
3464 output.push_str(&options.sub_symbol);
3465 }
3466 let children = tag.children();
3467 {
3468 for child_handle in children.top().iter() {
3469 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3470 }
3471 }
3472 if !ctx.in_code && !options.sub_symbol.is_empty() {
3473 if options.sub_symbol.starts_with('<') && !options.sub_symbol.starts_with("</") {
3474 output.push_str(&options.sub_symbol.replace('<', "</"));
3475 } else {
3476 output.push_str(&options.sub_symbol);
3477 }
3478 }
3479 }
3480
3481 "sup" => {
3482 if !ctx.in_code && !options.sup_symbol.is_empty() {
3483 output.push_str(&options.sup_symbol);
3484 }
3485 let children = tag.children();
3486 {
3487 for child_handle in children.top().iter() {
3488 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3489 }
3490 }
3491 if !ctx.in_code && !options.sup_symbol.is_empty() {
3492 if options.sup_symbol.starts_with('<') && !options.sup_symbol.starts_with("</") {
3493 output.push_str(&options.sup_symbol.replace('<', "</"));
3494 } else {
3495 output.push_str(&options.sup_symbol);
3496 }
3497 }
3498 }
3499
3500 "kbd" | "samp" => {
3501 let code_ctx = Context {
3502 in_code: true,
3503 ..ctx.clone()
3504 };
3505 let mut content = String::with_capacity(32);
3506 let children = tag.children();
3507 {
3508 for child_handle in children.top().iter() {
3509 walk_node(
3510 child_handle,
3511 parser,
3512 &mut content,
3513 options,
3514 &code_ctx,
3515 depth + 1,
3516 dom_ctx,
3517 );
3518 }
3519 }
3520 let normalized = text::normalize_whitespace(&content);
3521 let (prefix, suffix, trimmed) = chomp_inline(&normalized);
3522 if !content.trim().is_empty() {
3523 output.push_str(prefix);
3524 output.push('`');
3525 output.push_str(trimmed);
3526 output.push('`');
3527 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3528 } else if !content.is_empty() {
3529 output.push_str(prefix);
3530 append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3531 }
3532 }
3533
3534 "var" => {
3535 let mut content = String::with_capacity(32);
3536 let children = tag.children();
3537 {
3538 for child_handle in children.top().iter() {
3539 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3540 }
3541 }
3542 let (prefix, suffix, trimmed) = chomp_inline(&content);
3543 if !trimmed.is_empty() {
3544 output.push_str(prefix);
3545 output.push(options.strong_em_symbol);
3546 output.push_str(trimmed);
3547 output.push(options.strong_em_symbol);
3548 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3549 }
3550 }
3551
3552 "dfn" => {
3553 let mut content = String::with_capacity(32);
3554 let children = tag.children();
3555 {
3556 for child_handle in children.top().iter() {
3557 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3558 }
3559 }
3560 let (prefix, suffix, trimmed) = chomp_inline(&content);
3561 if !trimmed.is_empty() {
3562 output.push_str(prefix);
3563 output.push(options.strong_em_symbol);
3564 output.push_str(trimmed);
3565 output.push(options.strong_em_symbol);
3566 append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3567 }
3568 }
3569
3570 "abbr" => {
3571 let mut content = String::with_capacity(32);
3572 let children = tag.children();
3573 {
3574 for child_handle in children.top().iter() {
3575 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3576 }
3577 }
3578 let trimmed = content.trim();
3579
3580 if !trimmed.is_empty() {
3581 output.push_str(trimmed);
3582
3583 if let Some(title) = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str()) {
3584 let trimmed_title = title.trim();
3585 if !trimmed_title.is_empty() {
3586 output.push_str(" (");
3587 output.push_str(trimmed_title);
3588 output.push(')');
3589 }
3590 }
3591 }
3592 }
3593
3594 "time" | "data" => {
3595 let children = tag.children();
3596 {
3597 for child_handle in children.top().iter() {
3598 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3599 }
3600 }
3601 }
3602
3603 "wbr" => {}
3604
3605 "code" => {
3606 let code_ctx = Context {
3607 in_code: true,
3608 ..ctx.clone()
3609 };
3610
3611 if !ctx.in_code {
3612 let mut content = String::with_capacity(32);
3613 let children = tag.children();
3614 {
3615 for child_handle in children.top().iter() {
3616 walk_node(
3617 child_handle,
3618 parser,
3619 &mut content,
3620 options,
3621 &code_ctx,
3622 depth + 1,
3623 dom_ctx,
3624 );
3625 }
3626 }
3627
3628 let trimmed = &content;
3629
3630 if !content.trim().is_empty() {
3631 let contains_backtick = trimmed.contains('`');
3632
3633 let needs_delimiter_spaces = {
3634 let first_char = trimmed.chars().next();
3635 let last_char = trimmed.chars().last();
3636 let starts_with_space = first_char == Some(' ');
3637 let ends_with_space = last_char == Some(' ');
3638 let starts_with_backtick = first_char == Some('`');
3639 let ends_with_backtick = last_char == Some('`');
3640 let all_spaces = trimmed.chars().all(|c| c == ' ');
3641
3642 all_spaces
3643 || starts_with_backtick
3644 || ends_with_backtick
3645 || (starts_with_space && ends_with_space && contains_backtick)
3646 };
3647
3648 let (num_backticks, needs_spaces) = if contains_backtick {
3649 let max_consecutive = trimmed
3650 .chars()
3651 .fold((0, 0), |(max, current), c| {
3652 if c == '`' {
3653 let new_current = current + 1;
3654 (max.max(new_current), new_current)
3655 } else {
3656 (max, 0)
3657 }
3658 })
3659 .0;
3660 let num = if max_consecutive == 1 { 2 } else { 1 };
3661 (num, needs_delimiter_spaces)
3662 } else {
3663 (1, needs_delimiter_spaces)
3664 };
3665
3666 for _ in 0..num_backticks {
3667 output.push('`');
3668 }
3669 if needs_spaces {
3670 output.push(' ');
3671 }
3672 output.push_str(trimmed);
3673 if needs_spaces {
3674 output.push(' ');
3675 }
3676 for _ in 0..num_backticks {
3677 output.push('`');
3678 }
3679 }
3680 } else {
3681 let children = tag.children();
3682 {
3683 for child_handle in children.top().iter() {
3684 walk_node(child_handle, parser, output, options, &code_ctx, depth + 1, dom_ctx);
3685 }
3686 }
3687 }
3688 }
3689
3690 "pre" => {
3691 let code_ctx = Context {
3692 in_code: true,
3693 ..ctx.clone()
3694 };
3695
3696 let mut content = String::with_capacity(256);
3697 let children = tag.children();
3698 {
3699 for child_handle in children.top().iter() {
3700 walk_node(
3701 child_handle,
3702 parser,
3703 &mut content,
3704 options,
3705 &code_ctx,
3706 depth + 1,
3707 dom_ctx,
3708 );
3709 }
3710 }
3711
3712 if !content.is_empty() {
3713 let leading_newlines = content.chars().take_while(|&c| c == '\n').count();
3714 let trailing_newlines = content.chars().rev().take_while(|&c| c == '\n').count();
3715 let core = content.trim_matches('\n');
3716 let is_whitespace_only = core.trim().is_empty();
3717
3718 let processed_content = if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
3719 content
3720 } else {
3721 let mut core_text = if leading_newlines > 0 {
3722 dedent_code_block(core)
3723 } else {
3724 core.to_string()
3725 };
3726
3727 if is_whitespace_only {
3728 let mut rebuilt = String::new();
3729 for _ in 0..leading_newlines {
3730 rebuilt.push('\n');
3731 }
3732 rebuilt.push_str(&core_text);
3733 for _ in 0..trailing_newlines {
3734 rebuilt.push('\n');
3735 }
3736 rebuilt
3737 } else {
3738 for _ in 0..trailing_newlines {
3739 core_text.push('\n');
3740 }
3741 core_text
3742 }
3743 };
3744
3745 match options.code_block_style {
3746 crate::options::CodeBlockStyle::Indented => {
3747 if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3748 if output.ends_with('\n') {
3749 output.push('\n');
3750 } else {
3751 output.push_str("\n\n");
3752 }
3753 }
3754
3755 let indented = processed_content
3756 .lines()
3757 .map(|line| {
3758 if line.is_empty() {
3759 String::new()
3760 } else {
3761 format!(" {}", line)
3762 }
3763 })
3764 .collect::<Vec<_>>()
3765 .join("\n");
3766 output.push_str(&indented);
3767
3768 output.push_str("\n\n");
3769 }
3770 crate::options::CodeBlockStyle::Backticks | crate::options::CodeBlockStyle::Tildes => {
3771 if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3772 if output.ends_with('\n') {
3773 output.push('\n');
3774 } else {
3775 output.push_str("\n\n");
3776 }
3777 }
3778
3779 let fence = if options.code_block_style == crate::options::CodeBlockStyle::Backticks {
3780 "```"
3781 } else {
3782 "~~~"
3783 };
3784
3785 output.push_str(fence);
3786 if !options.code_language.is_empty() {
3787 output.push_str(&options.code_language);
3788 }
3789 output.push('\n');
3790 output.push_str(&processed_content);
3791 output.push('\n');
3792 output.push_str(fence);
3793 output.push('\n');
3794 }
3795 }
3796 }
3797 }
3798
3799 "blockquote" => {
3800 if ctx.convert_as_inline {
3801 let children = tag.children();
3802 {
3803 for child_handle in children.top().iter() {
3804 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3805 }
3806 }
3807 return;
3808 }
3809
3810 let cite = tag
3811 .attributes()
3812 .get("cite")
3813 .flatten()
3814 .map(|v| v.as_utf8_str().to_string());
3815
3816 let blockquote_ctx = Context {
3817 blockquote_depth: ctx.blockquote_depth + 1,
3818 ..ctx.clone()
3819 };
3820 let mut content = String::with_capacity(256);
3821 let children = tag.children();
3822 {
3823 for child_handle in children.top().iter() {
3824 walk_node(
3825 child_handle,
3826 parser,
3827 &mut content,
3828 options,
3829 &blockquote_ctx,
3830 depth + 1,
3831 dom_ctx,
3832 );
3833 }
3834 }
3835
3836 let trimmed_content = content.trim();
3837
3838 if !trimmed_content.is_empty() {
3839 if ctx.blockquote_depth > 0 {
3841 output.push_str("\n\n\n");
3843 } else if !output.is_empty() {
3844 if !output.ends_with('\n') {
3846 output.push('\n');
3847 } else if output.ends_with("\n\n") {
3848 output.truncate(output.len() - 1);
3850 }
3851 }
3852 let prefix = "> ";
3855
3856 for line in trimmed_content.lines() {
3857 output.push_str(prefix);
3858 output.push_str(line.trim());
3859 output.push('\n');
3860 }
3861
3862 if let Some(url) = cite {
3864 output.push('\n');
3865 output.push_str("— <");
3866 output.push_str(&url);
3867 output.push_str(">\n\n");
3868 }
3869
3870 while output.ends_with('\n') {
3871 output.truncate(output.len() - 1);
3872 }
3873 }
3874 }
3875
3876 "br" => {
3877 if ctx.in_heading {
3878 trim_trailing_whitespace(output);
3879 output.push_str(" ");
3880 } else {
3881 use crate::options::NewlineStyle;
3882 if output.is_empty() || output.ends_with('\n') {
3883 output.push('\n');
3884 } else {
3885 match options.newline_style {
3886 NewlineStyle::Spaces => output.push_str(" \n"),
3887 NewlineStyle::Backslash => output.push_str("\\\n"),
3888 }
3889 }
3890 }
3891 }
3892
3893 "hr" => {
3894 if !output.is_empty() {
3896 let prev_tag = get_previous_sibling_tag(node_handle, parser, dom_ctx);
3897 let last_line_is_blockquote = output
3898 .rsplit('\n')
3899 .find(|line| !line.trim().is_empty())
3900 .map(|line| line.trim_start().starts_with('>'))
3901 .unwrap_or(false);
3902 let needs_blank_line = !ctx.in_paragraph
3903 && !matches!(prev_tag.as_deref(), Some("blockquote"))
3904 && !last_line_is_blockquote;
3905
3906 if options.debug {
3907 eprintln!(
3908 "[DEBUG] <hr> prev_tag={:?} needs_blank_line={} in_paragraph={}",
3909 prev_tag, needs_blank_line, ctx.in_paragraph
3910 );
3911 }
3912
3913 if ctx.in_paragraph || !needs_blank_line {
3914 if !output.ends_with('\n') {
3915 output.push('\n');
3916 }
3917 } else {
3918 trim_trailing_whitespace(output);
3919 if output.ends_with('\n') {
3920 if !output.ends_with("\n\n") {
3921 output.push('\n');
3922 }
3923 } else {
3924 output.push_str("\n\n");
3925 }
3926 }
3927 }
3928 output.push_str("---\n");
3929 }
3930
3931 "ul" => {
3932 add_list_leading_separator(output, ctx);
3933
3934 let nested_depth = calculate_list_nesting_depth(ctx);
3935 let is_loose = is_loose_list(node_handle, parser);
3936
3937 process_list_children(
3938 node_handle,
3939 parser,
3940 output,
3941 options,
3942 ctx,
3943 depth,
3944 false,
3945 is_loose,
3946 nested_depth,
3947 1,
3948 dom_ctx,
3949 );
3950
3951 add_nested_list_trailing_separator(output, ctx);
3952 }
3953
3954 "ol" => {
3955 add_list_leading_separator(output, ctx);
3956
3957 let nested_depth = calculate_list_nesting_depth(ctx);
3958 let is_loose = is_loose_list(node_handle, parser);
3959
3960 let start = tag
3961 .attributes()
3962 .get("start")
3963 .flatten()
3964 .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
3965 .unwrap_or(1);
3966
3967 process_list_children(
3968 node_handle,
3969 parser,
3970 output,
3971 options,
3972 ctx,
3973 depth,
3974 true,
3975 is_loose,
3976 nested_depth,
3977 start,
3978 dom_ctx,
3979 );
3980
3981 add_nested_list_trailing_separator(output, ctx);
3982 }
3983
3984 "li" => {
3985 if ctx.list_depth > 0 {
3986 let indent = match options.list_indent_type {
3987 ListIndentType::Tabs => "\t".repeat(ctx.list_depth),
3988 ListIndentType::Spaces => " ".repeat(ctx.list_depth * options.list_indent_width),
3989 };
3990 output.push_str(&indent);
3991 }
3992
3993 let mut has_block_children = false;
3994 let children = tag.children();
3995 {
3996 for child_handle in children.top().iter() {
3997 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
3998 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
3999 if matches!(
4000 tag_name.as_ref(),
4001 "p" | "div" | "blockquote" | "pre" | "table" | "hr" | "dl"
4002 ) {
4003 has_block_children = true;
4004 break;
4005 }
4006 }
4007 }
4008 }
4009
4010 fn find_checkbox<'a>(
4011 node_handle: &tl::NodeHandle,
4012 parser: &'a tl::Parser<'a>,
4013 ) -> Option<(bool, tl::NodeHandle)> {
4014 if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4015 if tag_name_eq(node_tag.name().as_utf8_str(), "input") {
4016 let input_type = node_tag.attributes().get("type").flatten().map(|v| v.as_utf8_str());
4017
4018 if input_type.as_deref() == Some("checkbox") {
4019 let checked = node_tag.attributes().get("checked").is_some();
4020 return Some((checked, *node_handle));
4021 }
4022 }
4023
4024 let children = node_tag.children();
4025 {
4026 for child_handle in children.top().iter() {
4027 if let Some(result) = find_checkbox(child_handle, parser) {
4028 return Some(result);
4029 }
4030 }
4031 }
4032 }
4033 None
4034 }
4035
4036 let (is_task_list, task_checked, checkbox_node) =
4037 if let Some((checked, node)) = find_checkbox(node_handle, parser) {
4038 (true, checked, Some(node))
4039 } else {
4040 (false, false, None)
4041 };
4042
4043 let li_ctx = Context {
4044 in_list_item: true,
4045 list_depth: ctx.list_depth + 1,
4046 ..ctx.clone()
4047 };
4048
4049 if is_task_list {
4050 output.push('-');
4051 output.push(' ');
4052 output.push_str(if task_checked { "[x]" } else { "[ ]" });
4053
4054 fn is_checkbox_node(node_handle: &tl::NodeHandle, checkbox: &Option<tl::NodeHandle>) -> bool {
4055 if let Some(cb) = checkbox {
4056 node_handle == cb
4057 } else {
4058 false
4059 }
4060 }
4061
4062 fn contains_checkbox<'a>(
4063 node_handle: &tl::NodeHandle,
4064 parser: &'a tl::Parser<'a>,
4065 checkbox: &Option<tl::NodeHandle>,
4066 ) -> bool {
4067 if is_checkbox_node(node_handle, checkbox) {
4068 return true;
4069 }
4070 if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4071 let children = node_tag.children();
4072 {
4073 for child_handle in children.top().iter() {
4074 if contains_checkbox(child_handle, parser, checkbox) {
4075 return true;
4076 }
4077 }
4078 }
4079 }
4080 false
4081 }
4082
4083 #[allow(clippy::too_many_arguments)]
4084 fn render_li_content<'a>(
4085 node_handle: &tl::NodeHandle,
4086 parser: &'a tl::Parser<'a>,
4087 output: &mut String,
4088 options: &ConversionOptions,
4089 ctx: &Context,
4090 depth: usize,
4091 checkbox: &Option<tl::NodeHandle>,
4092 dom_ctx: &DomContext,
4093 ) {
4094 if is_checkbox_node(node_handle, checkbox) {
4095 return;
4096 }
4097
4098 if contains_checkbox(node_handle, parser, checkbox) {
4099 if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4100 let children = node_tag.children();
4101 {
4102 for child_handle in children.top().iter() {
4103 render_li_content(
4104 child_handle,
4105 parser,
4106 output,
4107 options,
4108 ctx,
4109 depth,
4110 checkbox,
4111 dom_ctx,
4112 );
4113 }
4114 }
4115 }
4116 } else {
4117 walk_node(node_handle, parser, output, options, ctx, depth, dom_ctx);
4118 }
4119 }
4120
4121 let mut task_text = String::new();
4122 let children = tag.children();
4123 {
4124 for child_handle in children.top().iter() {
4125 render_li_content(
4126 child_handle,
4127 parser,
4128 &mut task_text,
4129 options,
4130 &li_ctx,
4131 depth + 1,
4132 &checkbox_node,
4133 dom_ctx,
4134 );
4135 }
4136 }
4137 output.push(' ');
4138 let trimmed_task = task_text.trim();
4139 if !trimmed_task.is_empty() {
4140 output.push_str(trimmed_task);
4141 }
4142 } else {
4143 if !ctx.in_table_cell {
4144 if ctx.in_ordered_list {
4145 output.push_str(&format!("{}. ", ctx.list_counter));
4146 } else {
4147 let bullets: Vec<char> = options.bullets.chars().collect();
4148 let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
4149 let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
4150 output.push(bullet);
4151 output.push(' ');
4152 }
4153 }
4154
4155 let children = tag.children();
4156 {
4157 for child_handle in children.top().iter() {
4158 walk_node(child_handle, parser, output, options, &li_ctx, depth + 1, dom_ctx);
4159 }
4160 }
4161
4162 trim_trailing_whitespace(output);
4163 }
4164
4165 if !ctx.in_table_cell {
4166 if has_block_children || ctx.loose_list || ctx.prev_item_had_blocks {
4167 if !output.ends_with("\n\n") {
4168 if output.ends_with('\n') {
4169 output.push('\n');
4170 } else {
4171 output.push_str("\n\n");
4172 }
4173 }
4174 } else if !output.ends_with('\n') {
4175 output.push('\n');
4176 }
4177 }
4178 }
4179
4180 "table" => {
4181 let mut table_output = String::new();
4182 convert_table(node_handle, parser, &mut table_output, options, ctx, dom_ctx);
4183
4184 if ctx.in_list_item {
4185 let has_caption = table_output.starts_with('*');
4186
4187 if !has_caption {
4188 trim_trailing_whitespace(output);
4189 if !output.is_empty() && !output.ends_with('\n') {
4190 output.push('\n');
4191 }
4192 }
4193
4194 let indented = indent_table_for_list(&table_output, ctx.list_depth, options);
4195 output.push_str(&indented);
4196 } else {
4197 if !output.ends_with("\n\n") {
4198 if output.is_empty() || !output.ends_with('\n') {
4199 output.push_str("\n\n");
4200 } else {
4201 output.push('\n');
4202 }
4203 }
4204 output.push_str(&table_output);
4205 }
4206
4207 if !output.ends_with('\n') {
4208 output.push('\n');
4209 }
4210 }
4211
4212 "thead" | "tbody" | "tfoot" | "tr" | "th" | "td" => {}
4213
4214 "caption" => {
4215 let mut text = String::new();
4216 let children = tag.children();
4217 {
4218 for child_handle in children.top().iter() {
4219 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4220 }
4221 }
4222 let text = text.trim();
4223 if !text.is_empty() {
4224 let escaped_text = text.replace('-', r"\-");
4226 output.push('*');
4227 output.push_str(&escaped_text);
4228 output.push_str("*\n\n");
4229 }
4230 }
4231
4232 "colgroup" | "col" => {}
4233
4234 "article" | "section" | "nav" | "aside" | "header" | "footer" | "main" => {
4235 if ctx.convert_as_inline {
4236 let children = tag.children();
4237 {
4238 for child_handle in children.top().iter() {
4239 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4240 }
4241 }
4242 return;
4243 }
4244
4245 let mut content = String::with_capacity(256);
4246 let children = tag.children();
4247 {
4248 for child_handle in children.top().iter() {
4249 walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4250 }
4251 }
4252 if content.trim().is_empty() {
4253 return;
4254 }
4255
4256 if !output.is_empty() && !output.ends_with("\n\n") {
4257 output.push_str("\n\n");
4258 }
4259 output.push_str(&content);
4260 if content.ends_with('\n') && !content.ends_with("\n\n") {
4261 output.push('\n');
4262 } else if !content.ends_with('\n') {
4263 output.push_str("\n\n");
4264 }
4265 }
4266
4267 "figure" => {
4268 if ctx.convert_as_inline {
4269 let children = tag.children();
4270 {
4271 for child_handle in children.top().iter() {
4272 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4273 }
4274 }
4275 return;
4276 }
4277
4278 if !output.is_empty() && !output.ends_with("\n\n") {
4279 output.push_str("\n\n");
4280 }
4281
4282 let mut figure_content = String::new();
4283 let children = tag.children();
4284 {
4285 for child_handle in children.top().iter() {
4286 walk_node(child_handle, parser, &mut figure_content, options, ctx, depth, dom_ctx);
4287 }
4288 }
4289
4290 figure_content = figure_content.replace("\n;
4639 output.push_str(&src);
4640 output.push(')');
4641 if !ctx.in_paragraph && !ctx.convert_as_inline {
4642 output.push_str("\n\n");
4643 }
4644 }
4645
4646 let mut fallback = String::new();
4647 let children = tag.children();
4648 {
4649 for child_handle in children.top().iter() {
4650 let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4651 tag_name_eq(child_tag.name().as_utf8_str(), "source")
4652 } else {
4653 false
4654 };
4655
4656 if !is_source {
4657 walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4658 }
4659 }
4660 }
4661 if !fallback.is_empty() {
4662 output.push_str(fallback.trim());
4663 if !ctx.in_paragraph && !ctx.convert_as_inline {
4664 output.push_str("\n\n");
4665 }
4666 }
4667 }
4668
4669 "video" => {
4670 use std::borrow::Cow;
4671
4672 let src = tag
4673 .attributes()
4674 .get("src")
4675 .flatten()
4676 .map(|v| v.as_utf8_str())
4677 .or_else(|| {
4678 let children = tag.children();
4679 {
4680 for child_handle in children.top().iter() {
4681 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4682 if tag_name_eq(child_tag.name().as_utf8_str(), "source") {
4683 return child_tag
4684 .attributes()
4685 .get("src")
4686 .flatten()
4687 .map(|v| v.as_utf8_str());
4688 }
4689 }
4690 }
4691 }
4692 None
4693 })
4694 .unwrap_or(Cow::Borrowed(""));
4695
4696 if !src.is_empty() {
4697 output.push('[');
4698 output.push_str(&src);
4699 output.push_str("](");
4700 output.push_str(&src);
4701 output.push(')');
4702 if !ctx.in_paragraph && !ctx.convert_as_inline {
4703 output.push_str("\n\n");
4704 }
4705 }
4706
4707 let mut fallback = String::new();
4708 let children = tag.children();
4709 {
4710 for child_handle in children.top().iter() {
4711 let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4712 tag_name_eq(child_tag.name().as_utf8_str(), "source")
4713 } else {
4714 false
4715 };
4716
4717 if !is_source {
4718 walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4719 }
4720 }
4721 }
4722 if !fallback.is_empty() {
4723 output.push_str(fallback.trim());
4724 if !ctx.in_paragraph && !ctx.convert_as_inline {
4725 output.push_str("\n\n");
4726 }
4727 }
4728 }
4729
4730 "source" => {}
4731
4732 "picture" => {
4733 let children = tag.children();
4734 {
4735 for child_handle in children.top().iter() {
4736 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4737 if tag_name_eq(child_tag.name().as_utf8_str(), "img") {
4738 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4739 break;
4740 }
4741 }
4742 }
4743 }
4744 }
4745
4746 "iframe" => {
4747 use std::borrow::Cow;
4748
4749 let src = tag
4750 .attributes()
4751 .get("src")
4752 .flatten()
4753 .map(|v| v.as_utf8_str())
4754 .unwrap_or(Cow::Borrowed(""));
4755
4756 if !src.is_empty() {
4757 output.push('[');
4758 output.push_str(&src);
4759 output.push_str("](");
4760 output.push_str(&src);
4761 output.push(')');
4762 if !ctx.in_paragraph && !ctx.convert_as_inline {
4763 output.push_str("\n\n");
4764 }
4765 }
4766 }
4767
4768 "svg" => {
4769 let mut title = String::from("SVG Image");
4770 let children = tag.children();
4771 {
4772 for child_handle in children.top().iter() {
4773 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4774 if tag_name_eq(child_tag.name().as_utf8_str(), "title") {
4775 title = get_text_content(child_handle, parser).trim().to_string();
4776 break;
4777 }
4778 }
4779 }
4780 }
4781
4782 #[cfg(feature = "inline-images")]
4783 if let Some(ref collector_ref) = ctx.inline_collector {
4784 let title_opt = if title == "SVG Image" {
4785 None
4786 } else {
4787 Some(title.clone())
4788 };
4789 let mut attributes_map = BTreeMap::new();
4790 for (key, value_opt) in tag.attributes().iter() {
4791 let key_str = key.to_string();
4792 let keep = key_str == "width"
4793 || key_str == "height"
4794 || key_str == "filename"
4795 || key_str == "aria-label"
4796 || key_str.starts_with("data-");
4797 if keep {
4798 let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
4799 attributes_map.insert(key_str, value);
4800 }
4801 }
4802 handle_inline_svg(collector_ref, node_handle, parser, title_opt, attributes_map);
4803 }
4804
4805 if ctx.convert_as_inline {
4806 output.push_str(&title);
4807 } else {
4808 use base64::{Engine as _, engine::general_purpose::STANDARD};
4809
4810 let svg_html = serialize_element(node_handle, parser);
4811
4812 let base64_svg = STANDARD.encode(svg_html.as_bytes());
4813
4814 output.push_str(";
4817 output.push_str(&base64_svg);
4818 output.push(')');
4819 }
4820 }
4821
4822 "math" => {
4823 let text_content = get_text_content(node_handle, parser).trim().to_string();
4824
4825 if text_content.is_empty() {
4826 return;
4827 }
4828
4829 let math_html = serialize_element(node_handle, parser);
4830
4831 let escaped_text = text::escape(
4832 &text_content,
4833 options.escape_misc,
4834 options.escape_asterisks,
4835 options.escape_underscores,
4836 options.escape_ascii,
4837 );
4838
4839 let is_display_block = tag
4840 .attributes()
4841 .get("display")
4842 .flatten()
4843 .map(|v| v.as_utf8_str() == "block")
4844 .unwrap_or(false);
4845
4846 if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4847 output.push_str("\n\n");
4848 }
4849
4850 output.push_str("<!-- MathML: ");
4851 output.push_str(&math_html);
4852 output.push_str(" --> ");
4853 output.push_str(&escaped_text);
4854
4855 if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4856 output.push_str("\n\n");
4857 }
4858 }
4859
4860 "form" => {
4861 if ctx.convert_as_inline {
4862 let children = tag.children();
4863 {
4864 for child_handle in children.top().iter() {
4865 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4866 }
4867 }
4868 return;
4869 }
4870
4871 let mut content = String::new();
4872 let children = tag.children();
4873 {
4874 for child_handle in children.top().iter() {
4875 walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4876 }
4877 }
4878 let trimmed = content.trim();
4879 if !trimmed.is_empty() {
4880 if !output.is_empty() && !output.ends_with("\n\n") {
4881 output.push_str("\n\n");
4882 }
4883 output.push_str(trimmed);
4884 output.push_str("\n\n");
4885 }
4886 }
4887
4888 "fieldset" => {
4889 if ctx.convert_as_inline {
4890 let children = tag.children();
4891 {
4892 for child_handle in children.top().iter() {
4893 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4894 }
4895 }
4896 return;
4897 }
4898 let mut content = String::new();
4899 let children = tag.children();
4900 {
4901 for child_handle in children.top().iter() {
4902 walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4903 }
4904 }
4905 let trimmed = content.trim();
4906 if !trimmed.is_empty() {
4907 if !output.is_empty() && !output.ends_with("\n\n") {
4908 output.push_str("\n\n");
4909 }
4910 output.push_str(trimmed);
4911 output.push_str("\n\n");
4912 }
4913 }
4914
4915 "legend" => {
4916 let mut content = String::new();
4917 let mut legend_ctx = ctx.clone();
4918 if !ctx.convert_as_inline {
4919 legend_ctx.in_strong = true;
4920 }
4921 let children = tag.children();
4922 {
4923 for child_handle in children.top().iter() {
4924 walk_node(
4925 child_handle,
4926 parser,
4927 &mut content,
4928 options,
4929 &legend_ctx,
4930 depth + 1,
4931 dom_ctx,
4932 );
4933 }
4934 }
4935 let trimmed = content.trim();
4936 if !trimmed.is_empty() {
4937 if ctx.convert_as_inline {
4938 output.push_str(trimmed);
4939 } else {
4940 let symbol = options.strong_em_symbol.to_string().repeat(2);
4941 output.push_str(&symbol);
4942 output.push_str(trimmed);
4943 output.push_str(&symbol);
4944 output.push_str("\n\n");
4945 }
4946 }
4947 }
4948
4949 "label" => {
4950 let mut content = String::new();
4951 let children = tag.children();
4952 {
4953 for child_handle in children.top().iter() {
4954 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4955 }
4956 }
4957 let trimmed = content.trim();
4958 if !trimmed.is_empty() {
4959 output.push_str(trimmed);
4960 if !ctx.convert_as_inline {
4961 output.push_str("\n\n");
4962 }
4963 }
4964 }
4965
4966 "input" => {}
4967
4968 "textarea" => {
4969 let start_len = output.len();
4970 let children = tag.children();
4971 {
4972 for child_handle in children.top().iter() {
4973 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4974 }
4975 }
4976
4977 if !ctx.convert_as_inline && output.len() > start_len {
4978 output.push_str("\n\n");
4979 }
4980 }
4981
4982 "select" => {
4983 let start_len = output.len();
4984 let children = tag.children();
4985 {
4986 for child_handle in children.top().iter() {
4987 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4988 }
4989 }
4990
4991 if !ctx.convert_as_inline && output.len() > start_len {
4992 output.push('\n');
4993 }
4994 }
4995
4996 "option" => {
4997 let selected = tag.attributes().iter().any(|(name, _)| name.as_ref() == "selected");
4998
4999 let mut text = String::new();
5000 let children = tag.children();
5001 {
5002 for child_handle in children.top().iter() {
5003 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5004 }
5005 }
5006 let trimmed = text.trim();
5007 if !trimmed.is_empty() {
5008 if selected && !ctx.convert_as_inline {
5009 output.push_str("* ");
5010 }
5011 output.push_str(trimmed);
5012 if !ctx.convert_as_inline {
5013 output.push('\n');
5014 }
5015 }
5016 }
5017
5018 "optgroup" => {
5019 use std::borrow::Cow;
5020
5021 let label = tag
5022 .attributes()
5023 .get("label")
5024 .flatten()
5025 .map(|v| v.as_utf8_str())
5026 .unwrap_or(Cow::Borrowed(""));
5027
5028 if !label.is_empty() {
5029 let symbol = options.strong_em_symbol.to_string().repeat(2);
5030 output.push_str(&symbol);
5031 output.push_str(&label);
5032 output.push_str(&symbol);
5033 output.push('\n');
5034 }
5035
5036 let children = tag.children();
5037 {
5038 for child_handle in children.top().iter() {
5039 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5040 }
5041 }
5042 }
5043
5044 "button" => {
5045 let start_len = output.len();
5046 let children = tag.children();
5047 {
5048 for child_handle in children.top().iter() {
5049 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5050 }
5051 }
5052
5053 if !ctx.convert_as_inline && output.len() > start_len {
5054 output.push_str("\n\n");
5055 }
5056 }
5057
5058 "progress" => {
5059 let start_len = output.len();
5060 let children = tag.children();
5061 {
5062 for child_handle in children.top().iter() {
5063 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5064 }
5065 }
5066
5067 if !ctx.convert_as_inline && output.len() > start_len {
5068 output.push_str("\n\n");
5069 }
5070 }
5071
5072 "meter" => {
5073 let start_len = output.len();
5074 let children = tag.children();
5075 {
5076 for child_handle in children.top().iter() {
5077 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5078 }
5079 }
5080
5081 if !ctx.convert_as_inline && output.len() > start_len {
5082 output.push_str("\n\n");
5083 }
5084 }
5085
5086 "output" => {
5087 let start_len = output.len();
5088 let children = tag.children();
5089 {
5090 for child_handle in children.top().iter() {
5091 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5092 }
5093 }
5094
5095 if !ctx.convert_as_inline && output.len() > start_len {
5096 output.push_str("\n\n");
5097 }
5098 }
5099
5100 "datalist" => {
5101 let start_len = output.len();
5102 let children = tag.children();
5103 {
5104 for child_handle in children.top().iter() {
5105 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5106 }
5107 }
5108
5109 if !ctx.convert_as_inline && output.len() > start_len {
5110 output.push('\n');
5111 }
5112 }
5113
5114 "ruby" => {
5115 let ruby_ctx = ctx.clone();
5116
5117 let tag_sequence: Vec<String> = tag
5118 .children()
5119 .top()
5120 .iter()
5121 .filter_map(|child_handle| {
5122 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5123 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5124 if matches!(tag_name.as_ref(), "rb" | "rt" | "rtc") {
5125 Some(tag_name.into_owned())
5126 } else {
5127 None
5128 }
5129 } else {
5130 None
5131 }
5132 })
5133 .collect();
5134
5135 let has_rtc = tag_sequence.iter().any(|tag| tag == "rtc");
5136
5137 let is_interleaved = tag_sequence.windows(2).any(|w| w[0] == "rb" && w[1] == "rt");
5138
5139 if is_interleaved && !has_rtc {
5140 let mut current_base = String::new();
5141 let children = tag.children();
5142 {
5143 for child_handle in children.top().iter() {
5144 if let Some(node) = child_handle.get(parser) {
5145 match node {
5146 tl::Node::Tag(child_tag) => {
5147 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5148 if tag_name == "rt" {
5149 let mut annotation = String::new();
5150 walk_node(
5151 child_handle,
5152 parser,
5153 &mut annotation,
5154 options,
5155 &ruby_ctx,
5156 depth,
5157 dom_ctx,
5158 );
5159 if !current_base.is_empty() {
5160 output.push_str(current_base.trim());
5161 current_base.clear();
5162 }
5163 output.push_str(annotation.trim());
5164 } else if tag_name == "rb" {
5165 if !current_base.is_empty() {
5166 output.push_str(current_base.trim());
5167 current_base.clear();
5168 }
5169 walk_node(
5170 child_handle,
5171 parser,
5172 &mut current_base,
5173 options,
5174 &ruby_ctx,
5175 depth,
5176 dom_ctx,
5177 );
5178 } else if tag_name != "rp" {
5179 walk_node(
5180 child_handle,
5181 parser,
5182 &mut current_base,
5183 options,
5184 &ruby_ctx,
5185 depth,
5186 dom_ctx,
5187 );
5188 }
5189 }
5190 tl::Node::Raw(_) => {
5191 walk_node(
5192 child_handle,
5193 parser,
5194 &mut current_base,
5195 options,
5196 &ruby_ctx,
5197 depth,
5198 dom_ctx,
5199 );
5200 }
5201 _ => {}
5202 }
5203 }
5204 }
5205 }
5206 if !current_base.is_empty() {
5207 output.push_str(current_base.trim());
5208 }
5209 } else {
5210 let mut base_text = String::new();
5211 let mut rt_annotations = Vec::new();
5212 let mut rtc_content = String::new();
5213
5214 let children = tag.children();
5215 {
5216 for child_handle in children.top().iter() {
5217 if let Some(node) = child_handle.get(parser) {
5218 match node {
5219 tl::Node::Tag(child_tag) => {
5220 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5221 if tag_name == "rt" {
5222 let mut annotation = String::new();
5223 walk_node(
5224 child_handle,
5225 parser,
5226 &mut annotation,
5227 options,
5228 &ruby_ctx,
5229 depth,
5230 dom_ctx,
5231 );
5232 rt_annotations.push(annotation);
5233 } else if tag_name == "rtc" {
5234 walk_node(
5235 child_handle,
5236 parser,
5237 &mut rtc_content,
5238 options,
5239 &ruby_ctx,
5240 depth,
5241 dom_ctx,
5242 );
5243 } else if tag_name != "rp" {
5244 walk_node(
5245 child_handle,
5246 parser,
5247 &mut base_text,
5248 options,
5249 &ruby_ctx,
5250 depth,
5251 dom_ctx,
5252 );
5253 }
5254 }
5255 tl::Node::Raw(_) => {
5256 walk_node(
5257 child_handle,
5258 parser,
5259 &mut base_text,
5260 options,
5261 &ruby_ctx,
5262 depth,
5263 dom_ctx,
5264 );
5265 }
5266 _ => {}
5267 }
5268 }
5269 }
5270 }
5271
5272 let trimmed_base = base_text.trim();
5273
5274 output.push_str(trimmed_base);
5275
5276 if !rt_annotations.is_empty() {
5277 let rt_text = rt_annotations.iter().map(|s| s.trim()).collect::<Vec<_>>().join("");
5278 if !rt_text.is_empty() {
5279 if has_rtc && !rtc_content.trim().is_empty() && rt_annotations.len() > 1 {
5280 output.push('(');
5281 output.push_str(&rt_text);
5282 output.push(')');
5283 } else {
5284 output.push_str(&rt_text);
5285 }
5286 }
5287 }
5288
5289 if !rtc_content.trim().is_empty() {
5290 output.push_str(rtc_content.trim());
5291 }
5292 }
5293 }
5294
5295 "rb" => {
5296 let mut text = String::new();
5297 let children = tag.children();
5298 {
5299 for child_handle in children.top().iter() {
5300 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5301 }
5302 }
5303 output.push_str(text.trim());
5304 }
5305
5306 "rt" => {
5307 let mut text = String::new();
5308 let children = tag.children();
5309 {
5310 for child_handle in children.top().iter() {
5311 walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5312 }
5313 }
5314 let trimmed = text.trim();
5315
5316 if output.ends_with('(') {
5317 output.push_str(trimmed);
5318 } else {
5319 output.push('(');
5320 output.push_str(trimmed);
5321 output.push(')');
5322 }
5323 }
5324
5325 "rp" => {
5326 let mut content = String::new();
5327 let children = tag.children();
5328 {
5329 for child_handle in children.top().iter() {
5330 walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
5331 }
5332 }
5333 let trimmed = content.trim();
5334 if !trimmed.is_empty() {
5335 output.push_str(trimmed);
5336 }
5337 }
5338
5339 "rtc" => {
5340 let children = tag.children();
5341 {
5342 for child_handle in children.top().iter() {
5343 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5344 }
5345 }
5346 }
5347
5348 "div" => {
5349 if ctx.convert_as_inline {
5350 let children = tag.children();
5351 {
5352 for child_handle in children.top().iter() {
5353 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5354 }
5355 }
5356 return;
5357 }
5358
5359 let content_start_pos = output.len();
5360
5361 let is_table_continuation =
5362 ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
5363
5364 let is_list_continuation = ctx.in_list_item
5365 && !output.is_empty()
5366 && !output.ends_with("* ")
5367 && !output.ends_with("- ")
5368 && !output.ends_with(". ");
5369
5370 let needs_leading_sep = !ctx.in_table_cell
5371 && !ctx.in_list_item
5372 && !ctx.convert_as_inline
5373 && !output.is_empty()
5374 && !output.ends_with("\n\n");
5375
5376 if is_table_continuation {
5377 trim_trailing_whitespace(output);
5378 output.push_str("<br>");
5379 } else if is_list_continuation {
5380 add_list_continuation_indent(output, ctx.list_depth, false, options);
5381 } else if needs_leading_sep {
5382 trim_trailing_whitespace(output);
5383 output.push_str("\n\n");
5384 }
5385
5386 let children = tag.children();
5387 {
5388 for child_handle in children.top().iter() {
5389 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5390 }
5391 }
5392
5393 let has_content = output.len() > content_start_pos;
5394
5395 if has_content {
5396 if content_start_pos == 0 && output.starts_with('\n') && !output.starts_with("\n\n") {
5397 output.remove(0);
5398 }
5399 trim_trailing_whitespace(output);
5400
5401 if ctx.in_table_cell {
5402 } else if ctx.in_list_item {
5403 if is_list_continuation {
5404 if !output.ends_with('\n') {
5405 output.push('\n');
5406 }
5407 } else if !output.ends_with("\n\n") {
5408 if output.ends_with('\n') {
5409 output.push('\n');
5410 } else {
5411 output.push_str("\n\n");
5412 }
5413 }
5414 } else if !ctx.in_list_item && !ctx.convert_as_inline {
5415 if output.ends_with("\n\n") {
5416 } else if output.ends_with('\n') {
5417 output.push('\n');
5418 } else {
5419 output.push_str("\n\n");
5420 }
5421 }
5422 }
5423 }
5424
5425 "head" => {
5426 let children = tag.children();
5429 let has_body_like = children.top().iter().any(|child_handle| {
5430 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5431 let child_name = normalized_tag_name(child_tag.name().as_utf8_str());
5432 matches!(
5433 child_name.as_ref(),
5434 "body" | "main" | "article" | "section" | "div" | "p"
5435 )
5436 } else {
5437 false
5438 }
5439 });
5440
5441 if has_body_like {
5442 for child_handle in children.top().iter() {
5443 walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5444 }
5445 }
5446 }
5447
5448 "script" => {
5449 #[cfg(feature = "metadata")]
5451 if let Some(type_attr) = tag.attributes().get("type").flatten() {
5452 if type_attr.as_utf8_str() == "application/ld+json" {
5453 if let Some(ref collector) = ctx.metadata_collector {
5454 let json = get_text_content(node_handle, parser);
5455 collector.borrow_mut().add_json_ld(json);
5456 }
5457 }
5458 }
5459 }
5460 "style" => {}
5461
5462 "span" => {
5463 let is_hocr_word = tag.attributes().iter().any(|(name, value)| {
5464 name.as_ref() == "class" && value.as_ref().is_some_and(|v| v.as_ref().contains("ocrx_word"))
5465 });
5466
5467 if is_hocr_word
5468 && !output.is_empty()
5469 && !output.ends_with(' ')
5470 && !output.ends_with('\t')
5471 && !output.ends_with('\n')
5472 {
5473 output.push(' ');
5474 }
5475
5476 if !ctx.in_code
5477 && options.whitespace_mode == crate::options::WhitespaceMode::Normalized
5478 && output.ends_with('\n')
5479 && !output.ends_with("\n\n")
5480 {
5481 output.pop();
5482 }
5483
5484 let children = tag.children();
5485 {
5486 for child_handle in children.top().iter() {
5487 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5488 }
5489 }
5490 }
5491
5492 _ => {
5493 let len_before = output.len();
5494 let had_trailing_space = output.ends_with(' ');
5495
5496 let children = tag.children();
5497 {
5498 for child_handle in children.top().iter() {
5499 walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5500 }
5501 }
5502
5503 let len_after = output.len();
5504 if len_after > len_before {
5505 let start_idx = if output.is_char_boundary(len_before) {
5508 len_before
5509 } else {
5510 let capped = len_before.min(output.len());
5511 output
5512 .char_indices()
5513 .map(|(idx, _)| idx)
5514 .take_while(|idx| *idx <= capped)
5515 .last()
5516 .unwrap_or(capped)
5517 };
5518
5519 let added_content = output[start_idx..].to_string();
5520 if options.debug {
5521 eprintln!(
5522 "[DEBUG] <{}> added {:?}, trim={:?}, had_trailing_space={}",
5523 tag_name,
5524 added_content,
5525 added_content.trim(),
5526 had_trailing_space
5527 );
5528 }
5529
5530 let is_code_block = added_content.starts_with(" ")
5532 || added_content.starts_with("```")
5533 || added_content.starts_with("~~~");
5534
5535 if options.debug && added_content.trim().is_empty() {
5536 eprintln!(
5537 "[DEBUG] Whitespace-only content, is_code_block={}, will_truncate={}",
5538 is_code_block, !is_code_block
5539 );
5540 }
5541
5542 if added_content.trim().is_empty() && !is_code_block {
5543 output.truncate(start_idx);
5544 if !had_trailing_space && added_content.contains(' ') {
5545 output.push(' ');
5546 }
5547 if options.debug {
5548 eprintln!(
5549 "[DEBUG] Truncated, output now ends with space: {}",
5550 output.ends_with(' ')
5551 );
5552 }
5553 }
5554 }
5555 }
5556 }
5557 }
5558
5559 tl::Node::Comment(_) => {
5560 }
5562 }
5563}
5564
5565fn get_colspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5567 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5568 if let Some(Some(bytes)) = tag.attributes().get("colspan") {
5569 if let Ok(colspan) = bytes.as_utf8_str().parse::<usize>() {
5570 return colspan;
5571 }
5572 }
5573 }
5574 1
5575}
5576
5577fn get_colspan_rowspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> (usize, usize) {
5579 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5580 let attrs = tag.attributes();
5581 let colspan = attrs
5582 .get("colspan")
5583 .flatten()
5584 .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5585 .unwrap_or(1);
5586 let rowspan = attrs
5587 .get("rowspan")
5588 .flatten()
5589 .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5590 .unwrap_or(1);
5591 (colspan, rowspan)
5592 } else {
5593 (1, 1)
5594 }
5595}
5596
5597fn convert_table_cell(
5599 node_handle: &tl::NodeHandle,
5600 parser: &tl::Parser,
5601 output: &mut String,
5602 options: &ConversionOptions,
5603 ctx: &Context,
5604 _tag_name: &str,
5605 dom_ctx: &DomContext,
5606) {
5607 let mut text = String::with_capacity(128);
5608
5609 let cell_ctx = Context {
5610 in_table_cell: true,
5611 ..ctx.clone()
5612 };
5613
5614 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5615 let children = tag.children();
5616 {
5617 for child_handle in children.top().iter() {
5618 walk_node(child_handle, parser, &mut text, options, &cell_ctx, 0, dom_ctx);
5619 }
5620 }
5621 }
5622
5623 let text = text.trim();
5624 let text = if options.br_in_tables {
5625 text.split('\n')
5626 .filter(|s| !s.is_empty())
5627 .collect::<Vec<_>>()
5628 .join("<br>")
5629 } else {
5630 text.replace('\n', " ")
5631 };
5632
5633 let colspan = get_colspan(node_handle, parser);
5634
5635 output.push(' ');
5636 output.push_str(&text);
5637 output.push_str(&" |".repeat(colspan));
5638}
5639
5640#[allow(clippy::too_many_arguments)]
5642fn convert_table_row(
5643 node_handle: &tl::NodeHandle,
5644 parser: &tl::Parser,
5645 output: &mut String,
5646 options: &ConversionOptions,
5647 ctx: &Context,
5648 row_index: usize,
5649 rowspan_tracker: &mut std::collections::HashMap<usize, (String, usize)>,
5650 dom_ctx: &DomContext,
5651) {
5652 let mut row_text = String::with_capacity(256);
5653 let mut cells = Vec::new();
5654
5655 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5656 let children = tag.children();
5657 {
5658 for child_handle in children.top().iter() {
5659 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5660 let cell_name = normalized_tag_name(child_tag.name().as_utf8_str());
5661 if cell_name == "th" || cell_name == "td" {
5662 cells.push(*child_handle);
5663 }
5664 }
5665 }
5666 }
5667 }
5668
5669 let mut col_index = 0;
5670 let mut cell_iter = cells.iter();
5671
5672 loop {
5673 if let Some((_content, remaining_rows)) = rowspan_tracker.get_mut(&col_index) {
5674 if *remaining_rows > 0 {
5675 row_text.push(' ');
5676 row_text.push_str(" |");
5677 *remaining_rows -= 1;
5678 if *remaining_rows == 0 {
5679 rowspan_tracker.remove(&col_index);
5680 }
5681 col_index += 1;
5682 continue;
5683 }
5684 }
5685
5686 if let Some(cell_handle) = cell_iter.next() {
5687 let cell_start = row_text.len();
5688 convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
5689
5690 let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
5691
5692 if rowspan > 1 {
5693 let cell_text = &row_text[cell_start..];
5695 let cell_content = cell_text
5697 .trim_start_matches(' ')
5698 .trim_end_matches(" |")
5699 .trim()
5700 .to_string();
5701 rowspan_tracker.insert(col_index, (cell_content, rowspan - 1));
5702 }
5703
5704 col_index += colspan;
5705 } else {
5706 break;
5707 }
5708 }
5709
5710 output.push('|');
5711 output.push_str(&row_text);
5712 output.push('\n');
5713
5714 let is_first_row = row_index == 0;
5715 if is_first_row {
5716 let total_cols = cells.iter().map(|h| get_colspan(h, parser)).sum::<usize>().max(1);
5717 output.push_str("| ");
5718 for i in 0..total_cols {
5719 if i > 0 {
5720 output.push_str(" | ");
5721 }
5722 output.push_str("---");
5723 }
5724 output.push_str(" |\n");
5725 }
5726}
5727
5728fn table_has_header(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5729 if let Some(node) = node_handle.get(parser) {
5730 if let tl::Node::Tag(tag) = node {
5731 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5732 if tag_name.as_ref() == "th" {
5733 return true;
5734 }
5735 let children = tag.children();
5736 for child in children.top().iter() {
5737 if table_has_header(child, parser) {
5738 return true;
5739 }
5740 }
5741 }
5742 }
5743 false
5744}
5745
5746fn table_has_caption(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5747 if let Some(node) = node_handle.get(parser) {
5748 if let tl::Node::Tag(tag) = node {
5749 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5750 if tag_name.as_ref() == "caption" {
5751 return true;
5752 }
5753 let children = tag.children();
5754 for child in children.top().iter() {
5755 if table_has_caption(child, parser) {
5756 return true;
5757 }
5758 }
5759 }
5760 }
5761 false
5762}
5763
5764fn table_contains_nested_table(node_handle: &tl::NodeHandle, parser: &tl::Parser, is_root: bool) -> bool {
5765 if let Some(node) = node_handle.get(parser) {
5766 if let tl::Node::Tag(tag) = node {
5767 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5768 if !is_root && tag_name.as_ref() == "table" {
5769 return true;
5770 }
5771
5772 for child in tag.children().top().iter() {
5773 if table_contains_nested_table(child, parser, false) {
5774 return true;
5775 }
5776 }
5777 }
5778 }
5779 false
5780}
5781
5782fn collect_table_row_counts(
5783 node_handle: &tl::NodeHandle,
5784 parser: &tl::Parser,
5785 counts: &mut Vec<usize>,
5786 has_span: &mut bool,
5787) {
5788 if let Some(node) = node_handle.get(parser) {
5789 if let tl::Node::Tag(tag) = node {
5790 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5791 match tag_name.as_ref() {
5792 "tr" => {
5793 let mut cell_count = 0;
5794 for child in tag.children().top().iter() {
5795 if let Some(tl::Node::Tag(cell_tag)) = child.get(parser) {
5796 let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5797 if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5798 cell_count += 1;
5799 let attrs = cell_tag.attributes();
5800 if attrs.get("colspan").is_some() || attrs.get("rowspan").is_some() {
5801 *has_span = true;
5802 }
5803 }
5804 }
5805 }
5806 counts.push(cell_count);
5807 }
5808 _ => {
5809 for child in tag.children().top().iter() {
5810 collect_table_row_counts(child, parser, counts, has_span);
5811 }
5812 }
5813 }
5814 }
5815 }
5816}
5817
5818fn count_links(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5819 let mut total = 0;
5820 if let Some(node) = node_handle.get(parser) {
5821 if let tl::Node::Tag(tag) = node {
5822 let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5823 if tag_name.as_ref() == "a" {
5824 total += 1;
5825 }
5826
5827 for child in tag.children().top().iter() {
5828 total += count_links(child, parser);
5829 }
5830 }
5831 }
5832 total
5833}
5834
5835fn append_layout_row(
5836 row_handle: &tl::NodeHandle,
5837 parser: &tl::Parser,
5838 output: &mut String,
5839 options: &ConversionOptions,
5840 ctx: &Context,
5841 dom_ctx: &DomContext,
5842) {
5843 if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5844 let mut row_text = String::new();
5845 let row_children = row_tag.children();
5846 for cell_handle in row_children.top().iter() {
5847 if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
5848 let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5849 if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5850 let mut cell_text = String::new();
5851 let cell_ctx = Context {
5852 convert_as_inline: true,
5853 ..ctx.clone()
5854 };
5855 let cell_children = cell_tag.children();
5856 for cell_child in cell_children.top().iter() {
5857 walk_node(cell_child, parser, &mut cell_text, options, &cell_ctx, 0, dom_ctx);
5858 }
5859 let cell_content = text::normalize_whitespace(&cell_text);
5860 if !cell_content.trim().is_empty() {
5861 if !row_text.is_empty() {
5862 row_text.push(' ');
5863 }
5864 row_text.push_str(cell_content.trim());
5865 }
5866 }
5867 }
5868 }
5869
5870 let trimmed = row_text.trim();
5871 if !trimmed.is_empty() {
5872 if !output.is_empty() && !output.ends_with('\n') {
5873 output.push('\n');
5874 }
5875 let formatted = trimmed.strip_prefix("- ").unwrap_or(trimmed).trim_start();
5876 output.push_str("- ");
5877 output.push_str(formatted);
5878 output.push('\n');
5879 }
5880 }
5881}
5882
5883fn indent_table_for_list(table_content: &str, list_depth: usize, options: &ConversionOptions) -> String {
5885 if list_depth == 0 {
5886 return table_content.to_string();
5887 }
5888
5889 let Some(mut indent) = continuation_indent_string(list_depth, options) else {
5890 return table_content.to_string();
5891 };
5892
5893 if matches!(options.list_indent_type, ListIndentType::Spaces) {
5894 let space_count = indent.chars().filter(|c| *c == ' ').count();
5895 if space_count < 4 {
5896 indent.push_str(&" ".repeat(4 - space_count));
5897 }
5898 }
5899
5900 let mut result = String::with_capacity(table_content.len() + indent.len() * 4);
5901 for segment in table_content.split_inclusive('\n') {
5902 if segment.starts_with('|') {
5903 result.push_str(&indent);
5904 result.push_str(segment);
5905 } else {
5906 result.push_str(segment);
5907 }
5908 }
5909 result
5910}
5911
5912fn convert_table(
5914 node_handle: &tl::NodeHandle,
5915 parser: &tl::Parser,
5916 output: &mut String,
5917 options: &ConversionOptions,
5918 ctx: &Context,
5919 dom_ctx: &DomContext,
5920) {
5921 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5922 let mut row_counts = Vec::new();
5923 let mut has_span = false;
5924 collect_table_row_counts(node_handle, parser, &mut row_counts, &mut has_span);
5925
5926 let row_count = row_counts.len();
5927 let mut distinct_counts: Vec<_> = row_counts.into_iter().filter(|c| *c > 0).collect();
5928 distinct_counts.sort_unstable();
5929 distinct_counts.dedup();
5930
5931 let looks_like_layout =
5932 table_contains_nested_table(node_handle, parser, true) || has_span || distinct_counts.len() > 1;
5933 let link_count = count_links(node_handle, parser);
5934 let table_text = text::normalize_whitespace(&get_text_content(node_handle, parser));
5935 let is_blank_table = table_text.trim().is_empty();
5936
5937 if !table_has_header(node_handle, parser)
5938 && !table_has_caption(node_handle, parser)
5939 && (looks_like_layout || is_blank_table || (row_count <= 2 && link_count >= 3))
5940 {
5941 if is_blank_table {
5942 return;
5943 }
5944
5945 let table_children = tag.children();
5946 for child_handle in table_children.top().iter() {
5947 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5948 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5949 match tag_name.as_ref() {
5950 "thead" | "tbody" | "tfoot" => {
5951 for row_handle in child_tag.children().top().iter() {
5952 if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5953 if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
5954 append_layout_row(row_handle, parser, output, options, ctx, dom_ctx);
5955 }
5956 }
5957 }
5958 }
5959 "tr" => append_layout_row(child_handle, parser, output, options, ctx, dom_ctx),
5960 _ => {}
5961 }
5962 }
5963 }
5964 if !output.ends_with('\n') {
5965 output.push('\n');
5966 }
5967 return;
5968 }
5969
5970 let mut row_index = 0;
5971 let mut rowspan_tracker = std::collections::HashMap::new();
5972
5973 let children = tag.children();
5974 {
5975 for child_handle in children.top().iter() {
5976 if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5977 let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5978
5979 match tag_name.as_ref() {
5980 "caption" => {
5981 let mut text = String::new();
5982 let grandchildren = child_tag.children();
5983 {
5984 for grandchild_handle in grandchildren.top().iter() {
5985 walk_node(grandchild_handle, parser, &mut text, options, ctx, 0, dom_ctx);
5986 }
5987 }
5988 let text = text.trim();
5989 if !text.is_empty() {
5990 let escaped_text = text.replace('-', r"\-");
5992 output.push('*');
5993 output.push_str(&escaped_text);
5994 output.push_str("*\n\n");
5995 }
5996 }
5997
5998 "thead" | "tbody" | "tfoot" => {
5999 let section_children = child_tag.children();
6000 {
6001 for row_handle in section_children.top().iter() {
6002 if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
6003 if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
6004 convert_table_row(
6005 row_handle,
6006 parser,
6007 output,
6008 options,
6009 ctx,
6010 row_index,
6011 &mut rowspan_tracker,
6012 dom_ctx,
6013 );
6014 row_index += 1;
6015 }
6016 }
6017 }
6018 }
6019 }
6020
6021 "tr" => {
6022 convert_table_row(
6023 child_handle,
6024 parser,
6025 output,
6026 options,
6027 ctx,
6028 row_index,
6029 &mut rowspan_tracker,
6030 dom_ctx,
6031 );
6032 row_index += 1;
6033 }
6034
6035 "colgroup" | "col" => {}
6036
6037 _ => {}
6038 }
6039 }
6040 }
6041 }
6042 }
6043}
6044
// Unit and end-to-end tests for the converter: whitespace helpers, nested emphasis
// normalization, preprocessing, list continuation indentation, malformed `<` handling,
// and the `preserve_tags` option.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::HighlightStyle;

    // Trailing spaces and tabs are removed; a trailing newline is left alone.
    #[test]
    fn test_trim_trailing_whitespace() {
        let mut s = String::from("hello   ");
        trim_trailing_whitespace(&mut s);
        assert_eq!(s, "hello");

        let mut s = String::from("hello\t\t");
        trim_trailing_whitespace(&mut s);
        assert_eq!(s, "hello");

        let mut s = String::from("hello \t \t");
        trim_trailing_whitespace(&mut s);
        assert_eq!(s, "hello");

        let mut s = String::from("hello");
        trim_trailing_whitespace(&mut s);
        assert_eq!(s, "hello");

        let mut s = String::from("");
        trim_trailing_whitespace(&mut s);
        assert_eq!(s, "");

        let mut s = String::from("hello\n");
        trim_trailing_whitespace(&mut s);
        assert_eq!(s, "hello\n");
    }

    // `chomp_inline` returns (prefix, suffix, trimmed text), keeping a single boundary space.
    #[test]
    fn test_chomp_preserves_boundary_spaces() {
        assert_eq!(chomp_inline("  text  "), (" ", " ", "text"));
        assert_eq!(chomp_inline("text"), ("", "", "text"));
        assert_eq!(chomp_inline(" text"), (" ", "", "text"));
        assert_eq!(chomp_inline("text "), ("", " ", "text"));
        assert_eq!(chomp_inline("   "), (" ", " ", ""));
        assert_eq!(chomp_inline(""), ("", "", ""));
    }

    // Directly nested `<strong>` collapses to one pair of `**` markers.
    #[test]
    fn nested_strong_markup_is_normalized() {
        let html = "<strong><strong>Bold</strong></strong>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "**Bold**");
    }

    // An inner `<strong>` following plain text does not add extra markers.
    #[test]
    fn nested_strong_with_additional_text_is_normalized() {
        let html = "<strong>Hello <strong>World</strong></strong>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "**Hello World**");
    }

    // An inner `<b>` in the middle of a word does not split the bold run.
    #[test]
    fn nested_strong_partial_segments_are_normalized() {
        let html = "<b>bo<b>ld</b>er</b>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "**bolder**");
    }

    // `<summary>` containing `<strong>` yields a single bold wrapper.
    #[test]
    fn summary_with_inner_strong_is_not_double_wrapped() {
        let html = "<details><summary><strong>Title</strong></summary></details>";
        let mut options = ConversionOptions::default();
        options.preprocessing.remove_forms = false;
        let result = convert_html(html, &options).unwrap();
        assert_eq!(result.trim(), "**Title**");
    }

    // `<legend>` containing `<strong>` yields a single bold wrapper.
    #[test]
    fn legend_with_inner_strong_is_not_double_wrapped() {
        let html = "<fieldset><legend><strong>Section</strong></legend></fieldset>";
        let mut options = ConversionOptions::default();
        options.preprocessing.remove_forms = false; let result = convert_html(html, &options).unwrap();
        assert_eq!(result.trim(), "**Section**");
    }

    // With preprocessing on, the global header is dropped but the header inside `<main>` stays.
    #[test]
    fn preprocessing_keeps_article_header_inside_main() {
        let html = r#"
        <body>
            <header class="global-header">
                <div>Global Navigation</div>
            </header>
            <main>
                <header class="article-header">
                    <h1>Primary Title</h1>
                </header>
                <p>Body content stays.</p>
            </main>
        </body>
        "#;
        let mut options = ConversionOptions::default();
        options.preprocessing.enabled = true;
        let result = convert_html(html, &options).unwrap();
        assert!(
            result.contains("Primary Title"),
            "article header was removed: {}",
            result
        );
        assert!(
            result.contains("Body content stays"),
            "main body content missing: {}",
            result
        );
        assert!(
            !result.contains("Global Navigation"),
            "site chrome unexpectedly rendered: {}",
            result
        );
    }

    // With preprocessing on, `<nav>` content is removed while the article text remains.
    #[test]
    fn preprocessing_drops_nav_but_keeps_body() {
        let html = r##"
        <main>
            <nav aria-label="Primary navigation">
                <a href="#a">NavOnly</a>
            </nav>
            <article>
                <p>Important narrative</p>
            </article>
        </main>
        "##;
        let mut options = ConversionOptions::default();
        options.preprocessing.enabled = true;
        let result = convert_html(html, &options).unwrap();
        assert!(
            !result.contains("NavOnly"),
            "navigation text should not appear: {}",
            result
        );
        assert!(
            result.contains("Important narrative"),
            "article text should remain: {}",
            result
        );
    }

    // With default options, a `<header>` inside `<article>` keeps its heading.
    #[test]
    fn preprocessing_retains_section_headers_inside_articles() {
        let html = r#"
        <article>
            <header>
                <h2>Section Heading</h2>
            </header>
            <section>
                <p>Section body</p>
            </section>
        </article>
        "#;
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Section Heading"),
            "section heading was stripped: {}",
            result
        );
        assert!(result.contains("Section body"), "section body missing: {}", result);
    }

    // `<mark>` rendered as bold does not double up with a nested `<strong>`.
    #[test]
    fn bold_highlight_suppresses_nested_strong() {
        let mut options = ConversionOptions::default();
        options.highlight_style = HighlightStyle::Bold;
        let html = "<p><mark><strong>Hot</strong></mark></p>";
        let result = convert_html(html, &options).unwrap();
        assert_eq!(result.trim(), "**Hot**");
    }

    // Source line breaks inside a heading collapse into single spaces.
    #[test]
    fn atx_heading_swallows_layout_line_breaks() {
        let html = r#"<h2>
    Heading
    Text
    with
    Line
    Breaks
</h2>"#;
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert_eq!(result.trim(), "## Heading Text with Line Breaks");
    }

    // The DOCTYPE declaration leaves no trace in the output.
    #[test]
    fn doctype_is_removed() {
        let html = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
            <html>
                <head><title>Example</title></head>
                <body><p>Hello World</p></body>
            </html>"#;
        let mut options = ConversionOptions::default();
        options.extract_metadata = false;
        let result = convert_html(html, &options).unwrap();
        assert_eq!(result.trim(), "Hello World");
    }

    // Pins the continuation indent width for list depths 0 through 4.
    #[test]
    fn test_calculate_list_continuation_indent() {
        assert_eq!(calculate_list_continuation_indent(0), 0);

        assert_eq!(calculate_list_continuation_indent(1), 1);

        assert_eq!(calculate_list_continuation_indent(2), 3);

        assert_eq!(calculate_list_continuation_indent(3), 5);

        assert_eq!(calculate_list_continuation_indent(4), 7);
    }

    // Script bodies are emptied while the tags and the surrounding markup survive.
    #[test]
    fn strips_script_sections_without_removing_following_content() {
        let input = "<div>before</div><script>1 < 2</script><p>after</p>";
        let stripped = strip_script_and_style_sections(input);
        assert_eq!(stripped, "<div>before</div><script></script><p>after</p>");
    }

    // Same as above, with the script embedded in multi-line input.
    #[test]
    fn strips_multiline_script_sections() {
        let input = "<html>\n<script>1 < 2</script>\nContent\n</html>";
        let stripped = strip_script_and_style_sections(input);
        assert!(stripped.contains("Content"));
        assert!(stripped.contains("<script"));
        assert!(!stripped.contains("1 < 2"));
    }

    // `<hr>` inside a paragraph renders as a `---` line between the two text runs.
    #[test]
    fn hr_inside_paragraph_matches_inline_expectation() {
        let mut options = ConversionOptions::default();
        options.extract_metadata = false;
        let markdown = convert_html("<p>Hello<hr>World</p>", &options).unwrap();
        assert_eq!(markdown, "Hello\n---\nWorld\n");
    }

    // Same expectation, exercised through the crate-level `convert` entry point.
    #[test]
    fn hr_inside_paragraph_matches_inline_expectation_via_public_api() {
        let mut options = ConversionOptions::default();
        options.extract_metadata = false;
        let markdown = crate::convert("<p>Hello<hr>World</p>", Some(options)).unwrap();
        assert_eq!(markdown, "Hello\n---\nWorld\n");
    }

    // With the blank-line flag set, the indent is always preceded by exactly one blank line.
    #[test]
    fn test_add_list_continuation_indent_blank_line() {
        let opts = ConversionOptions::default();
        let mut output = String::from("* First para");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First para\n\n ");

        let mut output = String::from("* First para\n");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First para\n\n ");

        let mut output = String::from("* First para\n\n");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First para\n\n ");

        let mut output = String::from("* First para");
        add_list_continuation_indent(&mut output, 2, true, &opts);
        assert_eq!(output, "* First para\n\n ");
    }

    // Without the blank-line flag, the indent follows a single newline.
    #[test]
    fn test_add_list_continuation_indent_single_line() {
        let opts = ConversionOptions::default();
        let mut output = String::from("* First div");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First div\n ");

        let mut output = String::from("* First div\n");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First div\n ");

        let mut output = String::from("* First div\n");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First div\n ");
    }

    // Trailing spaces and tabs are stripped before the continuation is added.
    #[test]
    fn test_trim_trailing_whitespace_in_continuation() {
        let opts = ConversionOptions::default();
        let mut output = String::from("* First ");
        add_list_continuation_indent(&mut output, 1, true, &opts);
        assert_eq!(output, "* First\n\n ");

        let mut output = String::from("* First\t\t");
        add_list_continuation_indent(&mut output, 1, false, &opts);
        assert_eq!(output, "* First\n ");
    }

    // The tests below pin the output of `escape_malformed_angle_brackets` for various inputs.
    #[test]
    fn test_escape_malformed_angle_brackets_bare() {
        let input = "1<2";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "1<2");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_in_text() {
        let input = "<html>1<2 Content</html>";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<html>1<2 Content</html>");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_multiple() {
        let input = "1 < 2 < 3";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "1 < 2 < 3");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
        let input = "<div>content</div>";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<div>content</div>");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_mixed() {
        let input = "<div>1<2</div><p>3<4</p>";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<div>1<2</div><p>3<4</p>");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_at_end() {
        let input = "test<";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "test<");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_comments() {
        let input = "<!-- comment -->1<2";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<!-- comment -->1<2");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_doctype() {
        let input = "<!DOCTYPE html>1<2";
        let escaped = escape_malformed_angle_brackets(input);
        assert_eq!(escaped, "<!DOCTYPE html>1<2");
    }

    // A stray `<` in text must not swallow the content that follows it.
    #[test]
    fn test_convert_with_malformed_angle_brackets() {
        let html = "<html>1<2\nContent</html>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Content"),
            "Result should contain 'Content': {:?}",
            result
        );
        // NOTE(review): both operands of this `||` are the same literal, so the second
        // check adds nothing. The message mentions an escaped form; confirm whether one
        // side was meant to test for it.
        assert!(
            result.contains("1<2") || result.contains("1<2"),
            "Result should contain escaped or unescaped comparison"
        );
    }

    // A stray `<` inside one `<div>` must not hide the following `<div>`.
    #[test]
    fn test_convert_with_malformed_angle_brackets_in_div() {
        let html = "<html><div>1<2</div><div>Content</div></html>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Content"),
            "Result should contain 'Content': {:?}",
            result
        );
    }

    // Several stray `<` characters in a row must not hide the following paragraph.
    #[test]
    fn test_convert_with_multiple_malformed_angle_brackets() {
        let html = "<html>1 < 2 < 3<p>Content</p></html>";
        let result = convert_html(html, &ConversionOptions::default()).unwrap();
        assert!(
            result.contains("Content"),
            "Result should contain 'Content': {:?}",
            result
        );
    }

    // A tag listed in `preserve_tags` is emitted as HTML while its siblings are converted.
    #[test]
    fn test_preserve_tags_simple_table() {
        let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table>"), "Should preserve table tag");
        assert!(result.contains("</table>"), "Should have closing table tag");
        assert!(result.contains("<tr>"), "Should preserve tr tag");
        assert!(result.contains("<td>"), "Should preserve td tag");
        assert!(result.contains("Text"), "Should convert other elements");
    }

    // Attributes on a preserved tag are kept.
    #[test]
    fn test_preserve_tags_with_attributes() {
        let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table"), "Should preserve table tag");
        assert!(result.contains("class="), "Should preserve class attribute");
        assert!(result.contains("id="), "Should preserve id attribute");
        assert!(result.contains("</table>"), "Should have closing tag");
    }

    // More than one tag name can be preserved at once.
    #[test]
    fn test_preserve_tags_multiple_tags() {
        let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string(), "form".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table>"), "Should preserve table");
        assert!(result.contains("<form>"), "Should preserve form");
        assert!(result.contains("Text"), "Should convert paragraph");
    }

    // Descendants of a preserved tag are emitted as HTML too.
    #[test]
    fn test_preserve_tags_nested_content() {
        let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<thead>"), "Should preserve nested thead");
        assert!(result.contains("<tbody>"), "Should preserve nested tbody");
        assert!(result.contains("<th>"), "Should preserve th tag");
        assert!(result.contains("Header"), "Should preserve text content");
    }

    // With default options, no `<table>` tag appears in the output.
    #[test]
    fn test_preserve_tags_empty_list() {
        let html = r#"<table><tr><td>Cell</td></tr></table>"#;
        let options = ConversionOptions::default(); let result = convert_html(html, &options).unwrap();

        assert!(
            !result.contains("<table>"),
            "Should not preserve table without preserve_tags"
        );
    }

    // `preserve_tags` and `strip_tags` can be combined on different tags.
    #[test]
    fn test_preserve_tags_vs_strip_tags() {
        let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
        let options = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            strip_tags: vec!["span".to_string()],
            ..Default::default()
        };
        let result = convert_html(html, &options).unwrap();

        assert!(result.contains("<table>"), "Should preserve table");
        assert!(!result.contains("<span>"), "Should strip span tag");
        assert!(result.contains("Text"), "Should keep span text content");
    }

    // Regression check: this page (unclosed `<head>`, inline `<style>`) must still yield its heading text.
    #[test]
    fn example_com_remains_visible() {
        let html = "<!doctype html><html lang=\"en\"><head><title>Example Domain</title><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href=\"https://iana.org/domains/example\">Learn more</a></div></body></html>";

        let mut options = ConversionOptions::default();
        options.extract_metadata = false; let result = convert_html(html, &options).unwrap();

        assert!(
            result.contains("Example Domain"),
            "content unexpectedly missing: {}",
            result
        );
    }
}
// Input without self-closing tags must come back unchanged and without allocating.
#[test]
fn normalize_self_closing_tags_noop_when_absent() {
    let input = "<div><p>text</p></div>";
    let output = normalize_self_closing_tags(input);
    let is_borrowed = matches!(output, Cow::Borrowed(_));
    assert!(is_borrowed);
    assert_eq!(output.as_ref(), input);
}
6541
// `<br/>`, `<hr/>` and `<img/>` are rewritten to their non-self-closing form.
#[test]
fn normalize_self_closing_tags_replaces_targets() {
    let output = normalize_self_closing_tags("<br/><hr/><img/>");
    assert_eq!(output.as_ref(), "<br><hr><img>");
}
6547}