rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::mkdocs_attr_list::{ATTR_LIST_PATTERN, is_standalone_attr_list};
9use crate::utils::mkdocs_snippets::is_snippet_block_delimiter;
10use crate::utils::regex_cache::{
11    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
12    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
13    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
14    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
15};
16use crate::utils::sentence_utils::{
17    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
18    text_ends_with_abbreviation,
19};
20use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
21use std::collections::HashSet;
22use unicode_width::UnicodeWidthStr;
23
/// Length calculation mode for reflow
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub enum ReflowLengthMode {
    /// Count Unicode scalar values (`char`s). A grapheme cluster made of
    /// several code points is counted once per code point, not once overall.
    Chars,
    /// Count visual display width (CJK = 2 columns, emoji = 2, etc.)
    #[default]
    Visual,
    /// Count raw bytes
    Bytes,
}
35
36/// Calculate the display length of a string based on the length mode
37fn display_len(s: &str, mode: ReflowLengthMode) -> usize {
38    match mode {
39        ReflowLengthMode::Chars => s.chars().count(),
40        ReflowLengthMode::Visual => s.width(),
41        ReflowLengthMode::Bytes => s.len(),
42    }
43}
44
/// Options for reflowing text
///
/// `reflow_line` checks `sentence_per_line` first, then `semantic_line_breaks`,
/// and only then applies plain length-based wrapping.
#[derive(Clone)]
pub struct ReflowOptions {
    /// Target line length.
    /// In plain wrapping mode, `0` means "no wrapping" (unlimited line length).
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line
    pub sentence_per_line: bool,
    /// Whether to use semantic line breaks (cascading split strategy)
    pub semantic_line_breaks: bool,
    /// Custom abbreviations for sentence detection
    /// Periods are optional - both "Dr" and "Dr." work the same
    /// Custom abbreviations are always added to the built-in defaults
    pub abbreviations: Option<Vec<String>>,
    /// How to measure string length for line-length comparisons
    pub length_mode: ReflowLengthMode,
    /// Whether to treat {#id .class key="value"} as atomic (unsplittable) elements.
    /// Enabled for MkDocs and Kramdown flavors.
    pub attr_lists: bool,
}
68
69impl Default for ReflowOptions {
70    fn default() -> Self {
71        Self {
72            line_length: 80,
73            break_on_sentences: true,
74            preserve_breaks: false,
75            sentence_per_line: false,
76            semantic_line_breaks: false,
77            abbreviations: None,
78            length_mode: ReflowLengthMode::default(),
79            attr_lists: false,
80        }
81    }
82}
83
/// Detect if a character position is a sentence boundary
/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
///
/// # Arguments
/// * `text` - the full text being scanned
/// * `pos` - index of the candidate punctuation, counted in `char`s (not bytes)
/// * `abbreviations` - abbreviation set passed to `text_ends_with_abbreviation`
///
/// # Returns
/// `true` when the character at `pos` ends a sentence AND more content follows.
/// The last character of `text` is never reported as a boundary.
///
/// Note: the whole text is collected into a `Vec<char>` on every call, so each
/// call costs time proportional to the length of `text`.
fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
    let chars: Vec<char> = text.chars().collect();

    // Need at least one character after `pos`; this also rejects out-of-range positions
    if pos + 1 >= chars.len() {
        return false;
    }

    let c = chars[pos];
    let next_char = chars[pos + 1];

    // Check for CJK sentence-ending punctuation (。, ！, ？)
    // CJK punctuation doesn't require space or uppercase after it
    if is_cjk_sentence_ending(c) {
        // Skip any trailing emphasis/strikethrough markers
        let mut after_punct_pos = pos + 1;
        while after_punct_pos < chars.len()
            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
        {
            after_punct_pos += 1;
        }

        // Skip whitespace
        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
            after_punct_pos += 1;
        }

        // Check if we have more content (any non-whitespace)
        if after_punct_pos >= chars.len() {
            return false;
        }

        // Skip leading emphasis/strikethrough markers
        while after_punct_pos < chars.len()
            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
        {
            after_punct_pos += 1;
        }

        // Only markers remained after the punctuation: no following sentence
        if after_punct_pos >= chars.len() {
            return false;
        }

        // For CJK, we accept any character as the start of the next sentence
        // (no uppercase requirement, since CJK doesn't have case)
        return true;
    }

    // Check for ASCII sentence-ending punctuation
    if c != '.' && c != '!' && c != '?' {
        return false;
    }

    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
    // Only `after_space_pos` (index just past the separating space) is used below;
    // `_space_pos` (index of the space itself) is computed but unused.
    let (_space_pos, after_space_pos) = if next_char == ' ' {
        // Normal case: punctuation followed by space
        (pos + 1, pos + 2)
    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
        // Sentence ends with quote - check what follows the quote
        if chars[pos + 2] == ' ' {
            // Just quote followed by space: 'sentence." '
            (pos + 2, pos + 3)
        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
            // Quote followed by emphasis: 'sentence."* '
            (pos + 3, pos + 4)
        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
            && pos + 4 < chars.len()
            && chars[pos + 3] == chars[pos + 2]
            && chars[pos + 4] == ' '
        {
            // Quote followed by bold: 'sentence."** '
            (pos + 4, pos + 5)
        } else {
            return false;
        }
    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
        (pos + 2, pos + 3)
    } else if (next_char == '*' || next_char == '_')
        && pos + 3 < chars.len()
        && chars[pos + 2] == next_char
        && chars[pos + 3] == ' '
    {
        // Sentence ends with bold: "sentence.** " or "sentence.__ "
        (pos + 3, pos + 4)
    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
        // Sentence ends with strikethrough: "sentence.~~ "
        (pos + 3, pos + 4)
    } else {
        return false;
    };

    // Skip all whitespace after the space to find the start of the next sentence
    let mut next_char_pos = after_space_pos;
    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
        next_char_pos += 1;
    }

    // Check if we reached the end of the string
    if next_char_pos >= chars.len() {
        return false;
    }

    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
    let mut first_letter_pos = next_char_pos;
    while first_letter_pos < chars.len()
        && (chars[first_letter_pos] == '*'
            || chars[first_letter_pos] == '_'
            || chars[first_letter_pos] == '~'
            || is_opening_quote(chars[first_letter_pos]))
    {
        first_letter_pos += 1;
    }

    // Check if we reached the end after skipping emphasis
    if first_letter_pos >= chars.len() {
        return false;
    }

    // First character of next sentence must be uppercase or CJK
    let first_char = chars[first_letter_pos];
    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
        return false;
    }

    // Look back to check for common abbreviations (only applies to periods)
    if pos > 0 && c == '.' {
        // Convert char index to byte offset for string slicing
        // (the slice below ends just after the period at `pos`)
        let byte_offset: usize = chars[..=pos].iter().map(|ch| ch.len_utf8()).sum();
        if text_ends_with_abbreviation(&text[..byte_offset], abbreviations) {
            return false;
        }

        // Check for decimal numbers (e.g., "3.14")
        // Make sure to check if first_letter_pos is within bounds
        // NOTE(review): by this point the period is followed by a space/quote/marker and
        // the next sentence starts with an uppercase or CJK char, so this rarely fires.
        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
            return false;
        }
    }
    true
}
227
/// Split text into sentences
///
/// Convenience wrapper around `split_into_sentences_custom` that passes no
/// custom abbreviations (`&None`).
pub fn split_into_sentences(text: &str) -> Vec<String> {
    split_into_sentences_custom(text, &None)
}
232
/// Split text into sentences with custom abbreviations
///
/// Builds the abbreviation set once via `get_abbreviations(custom_abbreviations)`
/// and delegates the actual splitting to `split_into_sentences_with_set`.
pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
    // Compute the set a single time; the splitter consults it at every character.
    let abbreviations = get_abbreviations(custom_abbreviations);
    split_into_sentences_with_set(text, &abbreviations)
}
238
239/// Internal function to split text into sentences with a pre-computed abbreviations set
240/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
241fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
242    let mut sentences = Vec::new();
243    let mut current_sentence = String::new();
244    let mut chars = text.chars().peekable();
245    let mut pos = 0;
246
247    while let Some(c) = chars.next() {
248        current_sentence.push(c);
249
250        if is_sentence_boundary(text, pos, abbreviations) {
251            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
252            while let Some(&next) = chars.peek() {
253                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
254                    current_sentence.push(chars.next().unwrap());
255                    pos += 1;
256                } else {
257                    break;
258                }
259            }
260
261            // Consume the space after the sentence
262            if chars.peek() == Some(&' ') {
263                chars.next();
264                pos += 1;
265            }
266
267            sentences.push(current_sentence.trim().to_string());
268            current_sentence.clear();
269        }
270
271        pos += 1;
272    }
273
274    // Add any remaining text as the last sentence
275    if !current_sentence.trim().is_empty() {
276        sentences.push(current_sentence.trim().to_string());
277    }
278    sentences
279}
280
/// Report whether `line` is a horizontal rule such as `---`, `___`, `***` or `- - -`.
///
/// The line must begin with `-`, `_` or `*`, contain nothing but that same
/// character and spaces, and include the character at least three times.
fn is_horizontal_rule(line: &str) -> bool {
    // The first character fixes which marker the whole line must use.
    let marker = match line.chars().next() {
        Some(first @ ('-' | '_' | '*')) => first,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            // Anything other than the marker or a space disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
309
/// Check if a line is a numbered list item (e.g., "1. ", "10. ")
///
/// The line must start with one or more ASCII digits (`0`-`9`), followed by a
/// period and then a space. Only ASCII digits count: other Unicode numerics
/// (`½`, `²`, Roman-numeral code points, ...) do not form an ordered list
/// marker, so a line such as "½. cup of sugar" is ordinary paragraph text.
///
/// "2019." alone (no trailing space) is NOT treated as a list item, to avoid
/// false positives and stay consistent with list marker extraction.
fn is_numbered_list_item(line: &str) -> bool {
    let bytes = line.as_bytes();

    // Length of the leading run of ASCII digits; zero means no marker at all.
    let digit_count = bytes.iter().take_while(|b| b.is_ascii_digit()).count();

    digit_count > 0 && bytes.get(digit_count) == Some(&b'.') && bytes.get(digit_count + 1) == Some(&b' ')
}
333
334/// Check if a trimmed line is an unordered list item (-, *, + followed by space)
335fn is_unordered_list_marker(s: &str) -> bool {
336    matches!(s.as_bytes().first(), Some(b'-' | b'*' | b'+'))
337        && !is_horizontal_rule(s)
338        && (s.len() == 1 || s.as_bytes().get(1) == Some(&b' '))
339}
340
341/// Shared structural checks for block boundary detection.
342/// Checks elements that only depend on the trimmed line content.
343fn is_block_boundary_core(trimmed: &str) -> bool {
344    trimmed.is_empty()
345        || trimmed.starts_with('#')
346        || trimmed.starts_with("```")
347        || trimmed.starts_with("~~~")
348        || trimmed.starts_with('>')
349        || (trimmed.starts_with('[') && trimmed.contains("]:"))
350        || is_horizontal_rule(trimmed)
351        || is_unordered_list_marker(trimmed)
352        || is_numbered_list_item(trimmed)
353        || is_definition_list_item(trimmed)
354        || trimmed.starts_with(":::")
355}
356
357/// Check if a trimmed line starts a new structural block element.
358/// Used for paragraph boundary detection in `reflow_markdown()`.
359fn is_block_boundary(trimmed: &str) -> bool {
360    is_block_boundary_core(trimmed) || trimmed.starts_with('|')
361}
362
363/// Check if a line starts a new structural block for paragraph boundary detection
364/// in `reflow_paragraph_at_line()`. Extends the core checks with indented code blocks
365/// (≥4 spaces) and table row detection via `is_potential_table_row`.
366fn is_paragraph_boundary(trimmed: &str, line: &str) -> bool {
367    is_block_boundary_core(trimmed)
368        || ElementCache::calculate_indentation_width_default(line) >= 4
369        || crate::utils::table_utils::TableUtils::is_potential_table_row(line)
370}
371
/// Report whether a line terminates in a hard line break.
///
/// Two spellings are recognised:
/// 1. Two or more trailing spaces
/// 2. A trailing backslash
///
/// A single trailing `\r` (CRLF input) is ignored before checking.
fn has_hard_break(line: &str) -> bool {
    let body = line.strip_suffix('\r').unwrap_or(line);
    ["  ", "\\"].iter().any(|suffix| body.ends_with(*suffix))
}
381
/// Report whether the final character of `text` is `.`, `!` or `?`.
fn ends_with_sentence_punct(text: &str) -> bool {
    matches!(text.chars().next_back(), Some('.' | '!' | '?'))
}
386
/// Strip trailing whitespace from a line without destroying a hard break.
///
/// - A trailing `\r` (CRLF input) is always removed first.
/// - A line ending in a backslash is returned untouched (backslash hard break).
/// - A line ending in two or more spaces is normalised to exactly two trailing
///   spaces, unless it holds nothing but whitespace, in which case the result
///   is empty.
/// - Any other line has all trailing whitespace removed.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash-style hard break: keep the line exactly as written.
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();
    let is_space_hard_break = line.ends_with("  ") && !content.is_empty();

    if is_space_hard_break {
        // Re-attach exactly two spaces to mark the hard break.
        let mut rebuilt = String::with_capacity(content.len() + 2);
        rebuilt.push_str(content);
        rebuilt.push_str("  ");
        rebuilt
    } else {
        content.to_owned()
    }
}
417
418/// Parse markdown elements using the appropriate parser based on options.
419fn parse_elements(text: &str, options: &ReflowOptions) -> Vec<Element> {
420    if options.attr_lists {
421        parse_markdown_elements_with_attr_lists(text)
422    } else {
423        parse_markdown_elements(text)
424    }
425}
426
427pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
428    // For sentence-per-line mode, always process regardless of length
429    if options.sentence_per_line {
430        let elements = parse_elements(line, options);
431        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
432    }
433
434    // For semantic line breaks mode, use cascading split strategy
435    if options.semantic_line_breaks {
436        let elements = parse_elements(line, options);
437        return reflow_elements_semantic(&elements, options);
438    }
439
440    // Quick check: if line is already short enough or no wrapping requested, return as-is
441    // line_length = 0 means no wrapping (unlimited line length)
442    if options.line_length == 0 || display_len(line, options.length_mode) <= options.line_length {
443        return vec![line.to_string()];
444    }
445
446    // Parse the markdown to identify elements
447    let elements = parse_elements(line, options);
448
449    // Reflow the elements into lines
450    reflow_elements(&elements, options)
451}
452
/// Image source in a linked image structure
///
/// Describes how the inner image of `Element::LinkedImage` is written.
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Inline image URL: ![alt](url)
    Inline(String),
    /// Reference image: ![alt][ref]
    Reference(String),
}
461
/// Link target in a linked image structure
///
/// Describes how the outer link of `Element::LinkedImage` is written.
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Inline link URL: ](url)
    Inline(String),
    /// Reference link: ][ref]
    Reference(String),
}
470
/// Represents a piece of content in the markdown
///
/// Each variant stores the pieces needed to re-emit the element; the `Display`
/// implementation adds the surrounding Markdown syntax back. Variants whose
/// `Display` arm writes the string unchanged (`Text`, `Autolink`, `HtmlTag`,
/// `HtmlEntity`, `HugoShortcode`, `AttrList`) hold the complete raw text.
#[derive(Debug, Clone)]
enum Element {
    /// Plain text that can be wrapped
    Text(String),
    /// A complete markdown inline link [text](url)
    Link { text: String, url: String },
    /// A complete markdown reference link [text][ref]
    ReferenceLink { text: String, reference: String },
    /// A complete markdown empty reference link [text][]
    EmptyReferenceLink { text: String },
    /// A complete markdown shortcut reference link [ref]
    ShortcutReference { reference: String },
    /// A complete markdown inline image ![alt](url)
    InlineImage { alt: String, url: String },
    /// A complete markdown reference image ![alt][ref]
    ReferenceImage { alt: String, reference: String },
    /// A complete markdown empty reference image ![alt][]
    EmptyReferenceImage { alt: String },
    /// A clickable image badge in any of 4 forms:
    /// - [![alt](img-url)](link-url)
    /// - [![alt][img-ref]](link-url)
    /// - [![alt](img-url)][link-ref]
    /// - [![alt][img-ref]][link-ref]
    LinkedImage {
        alt: String,
        img_source: LinkedImageSource,
        link_target: LinkedImageTarget,
    },
    /// Footnote reference [^note]
    FootnoteReference { note: String },
    /// Strikethrough text ~~text~~ (content stored without the `~~` markers)
    Strikethrough(String),
    /// Wiki-style link [[wiki]] or [[wiki|text]] (stored without the brackets)
    WikiLink(String),
    /// Inline math $math$ (stored without the `$` delimiters)
    InlineMath(String),
    /// Display math $$math$$ (stored without the `$$` delimiters)
    DisplayMath(String),
    /// Emoji shortcode :emoji: (stored without the colons)
    EmojiShortcode(String),
    /// Autolink <https://...> or <mailto:...> or <user@domain.com>
    Autolink(String),
    /// HTML tag <tag> or </tag> or <tag/>
    HtmlTag(String),
    /// HTML entity &nbsp; or &#123;
    HtmlEntity(String),
    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}}
    HugoShortcode(String),
    /// MkDocs/kramdown attribute list {#id .class key="value"}
    AttrList(String),
    /// Inline code `code` (stored without the backticks)
    Code(String),
    /// Bold text **text** or __text__
    Bold {
        content: String,
        /// True if underscore markers (__), false for asterisks (**)
        underscore: bool,
    },
    /// Italic text *text* or _text_
    Italic {
        content: String,
        /// True if underscore marker (_), false for asterisk (*)
        underscore: bool,
    },
}
537
538impl std::fmt::Display for Element {
539    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
540        match self {
541            Element::Text(s) => write!(f, "{s}"),
542            Element::Link { text, url } => write!(f, "[{text}]({url})"),
543            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
544            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
545            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
546            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
547            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
548            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
549            Element::LinkedImage {
550                alt,
551                img_source,
552                link_target,
553            } => {
554                // Build the image part: ![alt](url) or ![alt][ref]
555                let img_part = match img_source {
556                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
557                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
558                };
559                // Build the link part: (url) or [ref]
560                match link_target {
561                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
562                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
563                }
564            }
565            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
566            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
567            Element::WikiLink(s) => write!(f, "[[{s}]]"),
568            Element::InlineMath(s) => write!(f, "${s}$"),
569            Element::DisplayMath(s) => write!(f, "$${s}$$"),
570            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
571            Element::Autolink(s) => write!(f, "{s}"),
572            Element::HtmlTag(s) => write!(f, "{s}"),
573            Element::HtmlEntity(s) => write!(f, "{s}"),
574            Element::HugoShortcode(s) => write!(f, "{s}"),
575            Element::AttrList(s) => write!(f, "{s}"),
576            Element::Code(s) => write!(f, "`{s}`"),
577            Element::Bold { content, underscore } => {
578                if *underscore {
579                    write!(f, "__{content}__")
580                } else {
581                    write!(f, "**{content}**")
582                }
583            }
584            Element::Italic { content, underscore } => {
585                if *underscore {
586                    write!(f, "_{content}_")
587                } else {
588                    write!(f, "*{content}*")
589                }
590            }
591        }
592    }
593}
594
595impl Element {
596    /// Calculate the display width of this element using the given length mode.
597    /// This formats the element and computes its width, correctly handling
598    /// visual width for CJK characters and other wide glyphs.
599    fn display_width(&self, mode: ReflowLengthMode) -> usize {
600        let formatted = format!("{self}");
601        display_len(&formatted, mode)
602    }
603}
604
/// An emphasis or formatting span parsed by pulldown-cmark
///
/// Produced by `extract_emphasis_spans`; `start`/`end` index into the text
/// that was passed to that function.
#[derive(Debug, Clone)]
struct EmphasisSpan {
    /// Byte offset where the emphasis starts (including markers)
    start: usize,
    /// Byte offset where the emphasis ends (after closing markers)
    end: usize,
    /// The content inside the emphasis markers
    content: String,
    /// Whether this is strong (bold) emphasis
    is_strong: bool,
    /// Whether this is strikethrough (~~text~~)
    is_strikethrough: bool,
    /// Whether the original used underscore markers (for emphasis only)
    /// (always `false` for strikethrough spans)
    uses_underscore: bool,
}
621
622/// Extract emphasis and strikethrough spans from text using pulldown-cmark
623///
624/// This provides CommonMark-compliant emphasis parsing, correctly handling:
625/// - Nested emphasis like `*text **bold** more*`
626/// - Left/right flanking delimiter rules
627/// - Underscore vs asterisk markers
628/// - GFM strikethrough (~~text~~)
629///
630/// Returns spans sorted by start position.
631fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
632    let mut spans = Vec::new();
633    let mut options = Options::empty();
634    options.insert(Options::ENABLE_STRIKETHROUGH);
635
636    // Stacks to track nested formatting with their start positions
637    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
638    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
639    let mut strikethrough_stack: Vec<usize> = Vec::new();
640
641    let parser = Parser::new_ext(text, options).into_offset_iter();
642
643    for (event, range) in parser {
644        match event {
645            Event::Start(Tag::Emphasis) => {
646                // Check if this uses underscore by looking at the original text
647                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
648                emphasis_stack.push((range.start, uses_underscore));
649            }
650            Event::End(TagEnd::Emphasis) => {
651                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
652                    // Extract content between the markers (1 char marker on each side)
653                    let content_start = start_byte + 1;
654                    let content_end = range.end - 1;
655                    if content_end > content_start
656                        && let Some(content) = text.get(content_start..content_end)
657                    {
658                        spans.push(EmphasisSpan {
659                            start: start_byte,
660                            end: range.end,
661                            content: content.to_string(),
662                            is_strong: false,
663                            is_strikethrough: false,
664                            uses_underscore,
665                        });
666                    }
667                }
668            }
669            Event::Start(Tag::Strong) => {
670                // Check if this uses underscore by looking at the original text
671                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
672                strong_stack.push((range.start, uses_underscore));
673            }
674            Event::End(TagEnd::Strong) => {
675                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
676                    // Extract content between the markers (2 char marker on each side)
677                    let content_start = start_byte + 2;
678                    let content_end = range.end - 2;
679                    if content_end > content_start
680                        && let Some(content) = text.get(content_start..content_end)
681                    {
682                        spans.push(EmphasisSpan {
683                            start: start_byte,
684                            end: range.end,
685                            content: content.to_string(),
686                            is_strong: true,
687                            is_strikethrough: false,
688                            uses_underscore,
689                        });
690                    }
691                }
692            }
693            Event::Start(Tag::Strikethrough) => {
694                strikethrough_stack.push(range.start);
695            }
696            Event::End(TagEnd::Strikethrough) => {
697                if let Some(start_byte) = strikethrough_stack.pop() {
698                    // Extract content between the ~~ markers (2 char marker on each side)
699                    let content_start = start_byte + 2;
700                    let content_end = range.end - 2;
701                    if content_end > content_start
702                        && let Some(content) = text.get(content_start..content_end)
703                    {
704                        spans.push(EmphasisSpan {
705                            start: start_byte,
706                            end: range.end,
707                            content: content.to_string(),
708                            is_strong: false,
709                            is_strikethrough: true,
710                            uses_underscore: false,
711                        });
712                    }
713                }
714            }
715            _ => {}
716        }
717    }
718
719    // Sort by start position
720    spans.sort_by_key(|s| s.start);
721    spans
722}
723
/// Parse markdown elements from text preserving the raw syntax
///
/// Thin wrapper: calls `parse_markdown_elements_inner` with attribute-list
/// handling turned off.
///
/// Detection order is critical:
/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
/// 2. Inline images ![alt](url) - before links to handle ! prefix
/// 3. Reference images ![alt][ref] - before reference links
/// 4. Inline links [text](url) - before reference links
/// 5. Reference links [text][ref] - before shortcut references
/// 6. Shortcut reference links [ref] - detected last to avoid false positives
/// 7. Other elements (code, bold, italic, etc.) - processed normally
fn parse_markdown_elements(text: &str) -> Vec<Element> {
    parse_markdown_elements_inner(text, false)
}
737
/// Parse markdown elements from text with attribute-list handling enabled.
///
/// Thin wrapper: calls `parse_markdown_elements_inner` with `attr_lists = true`.
fn parse_markdown_elements_with_attr_lists(text: &str) -> Vec<Element> {
    parse_markdown_elements_inner(text, true)
}
741
742fn parse_markdown_elements_inner(text: &str, attr_lists: bool) -> Vec<Element> {
743    let mut elements = Vec::new();
744    let mut remaining = text;
745
746    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
747    let emphasis_spans = extract_emphasis_spans(text);
748
749    while !remaining.is_empty() {
750        // Calculate current byte offset in original text
751        let current_offset = text.len() - remaining.len();
752        // Find the earliest occurrence of any markdown pattern
753        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
754
755        // Check for linked images FIRST (all 4 variants)
756        // Quick literal check: only run expensive regexes if we might have a linked image
757        // Pattern starts with "[!" so check for that first
758        if remaining.contains("[!") {
759            // Pattern 1: [![alt](img)](link) - inline image in inline link
760            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
761                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
762            {
763                earliest_match = Some((m.start(), "linked_image_ii", m));
764            }
765
766            // Pattern 2: [![alt][ref]](link) - reference image in inline link
767            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
768                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
769            {
770                earliest_match = Some((m.start(), "linked_image_ri", m));
771            }
772
773            // Pattern 3: [![alt](img)][ref] - inline image in reference link
774            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
775                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
776            {
777                earliest_match = Some((m.start(), "linked_image_ir", m));
778            }
779
780            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
781            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
782                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
783            {
784                earliest_match = Some((m.start(), "linked_image_rr", m));
785            }
786        }
787
788        // Check for images (they start with ! so should be detected before links)
789        // Inline images - ![alt](url)
790        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
791            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
792        {
793            earliest_match = Some((m.start(), "inline_image", m));
794        }
795
796        // Reference images - ![alt][ref]
797        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
798            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
799        {
800            earliest_match = Some((m.start(), "ref_image", m));
801        }
802
803        // Check for footnote references - [^note]
804        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
805            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
806        {
807            earliest_match = Some((m.start(), "footnote_ref", m));
808        }
809
810        // Check for inline links - [text](url)
811        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
812            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
813        {
814            earliest_match = Some((m.start(), "inline_link", m));
815        }
816
817        // Check for reference links - [text][ref]
818        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
819            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
820        {
821            earliest_match = Some((m.start(), "ref_link", m));
822        }
823
824        // Check for shortcut reference links - [ref]
825        // Only check if we haven't found an earlier pattern that would conflict
826        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
827            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
828        {
829            earliest_match = Some((m.start(), "shortcut_ref", m));
830        }
831
832        // Check for wiki-style links - [[wiki]]
833        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
834            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
835        {
836            earliest_match = Some((m.start(), "wiki_link", m));
837        }
838
839        // Check for display math first (before inline) - $$math$$
840        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
841            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
842        {
843            earliest_match = Some((m.start(), "display_math", m));
844        }
845
846        // Check for inline math - $math$
847        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
848            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
849        {
850            earliest_match = Some((m.start(), "inline_math", m));
851        }
852
853        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
854
855        // Check for emoji shortcodes - :emoji:
856        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
857            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
858        {
859            earliest_match = Some((m.start(), "emoji", m));
860        }
861
862        // Check for HTML entities - &nbsp; etc
863        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
864            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
865        {
866            earliest_match = Some((m.start(), "html_entity", m));
867        }
868
869        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
870        // Must be checked before other patterns to avoid false sentence breaks
871        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
872            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
873        {
874            earliest_match = Some((m.start(), "hugo_shortcode", m));
875        }
876
877        // Check for HTML tags - <tag> </tag> <tag/>
878        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
879        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
880            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
881        {
882            // Check if this is an autolink (starts with protocol or mailto:)
883            let matched_text = &remaining[m.start()..m.end()];
884            let is_url_autolink = matched_text.starts_with("<http://")
885                || matched_text.starts_with("<https://")
886                || matched_text.starts_with("<mailto:")
887                || matched_text.starts_with("<ftp://")
888                || matched_text.starts_with("<ftps://");
889
890            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
891            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
892            let is_email_autolink = {
893                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
894                EMAIL_PATTERN.is_match(content)
895            };
896
897            if is_url_autolink || is_email_autolink {
898                earliest_match = Some((m.start(), "autolink", m));
899            } else {
900                earliest_match = Some((m.start(), "html_tag", m));
901            }
902        }
903
904        // Find earliest non-link special characters
905        let mut next_special = remaining.len();
906        let mut special_type = "";
907        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
908        let mut attr_list_len: usize = 0;
909
910        // Check for code spans (not handled by pulldown-cmark in this context)
911        if let Some(pos) = remaining.find('`')
912            && pos < next_special
913        {
914            next_special = pos;
915            special_type = "code";
916        }
917
918        // Check for MkDocs/kramdown attr lists - {#id .class key="value"}
919        if attr_lists
920            && let Some(pos) = remaining.find('{')
921            && pos < next_special
922            && let Some(m) = ATTR_LIST_PATTERN.find(&remaining[pos..])
923            && m.start() == 0
924        {
925            next_special = pos;
926            special_type = "attr_list";
927            attr_list_len = m.end();
928        }
929
930        // Check for emphasis using pulldown-cmark's pre-extracted spans
931        // Find the earliest emphasis span that starts within remaining text
932        for span in &emphasis_spans {
933            if span.start >= current_offset && span.start < current_offset + remaining.len() {
934                let pos_in_remaining = span.start - current_offset;
935                if pos_in_remaining < next_special {
936                    next_special = pos_in_remaining;
937                    special_type = "pulldown_emphasis";
938                    pulldown_emphasis = Some(span);
939                }
940                break; // Spans are sorted by start position, so first match is earliest
941            }
942        }
943
944        // Determine which pattern to process first
945        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
946            pos < next_special
947        } else {
948            false
949        };
950
951        if should_process_markdown_link {
952            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
953
954            // Add any text before the match
955            if pos > 0 {
956                elements.push(Element::Text(remaining[..pos].to_string()));
957            }
958
959            // Process the matched pattern
960            match pattern_type {
961                // Pattern 1: [![alt](img)](link) - inline image in inline link
962                "linked_image_ii" => {
963                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
964                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
965                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
966                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
967                        elements.push(Element::LinkedImage {
968                            alt: alt.to_string(),
969                            img_source: LinkedImageSource::Inline(img_url.to_string()),
970                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
971                        });
972                        remaining = &remaining[match_obj.end()..];
973                    } else {
974                        elements.push(Element::Text("[".to_string()));
975                        remaining = &remaining[1..];
976                    }
977                }
978                // Pattern 2: [![alt][ref]](link) - reference image in inline link
979                "linked_image_ri" => {
980                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
981                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
982                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
983                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
984                        elements.push(Element::LinkedImage {
985                            alt: alt.to_string(),
986                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
987                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
988                        });
989                        remaining = &remaining[match_obj.end()..];
990                    } else {
991                        elements.push(Element::Text("[".to_string()));
992                        remaining = &remaining[1..];
993                    }
994                }
995                // Pattern 3: [![alt](img)][ref] - inline image in reference link
996                "linked_image_ir" => {
997                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
998                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
999                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1000                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1001                        elements.push(Element::LinkedImage {
1002                            alt: alt.to_string(),
1003                            img_source: LinkedImageSource::Inline(img_url.to_string()),
1004                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1005                        });
1006                        remaining = &remaining[match_obj.end()..];
1007                    } else {
1008                        elements.push(Element::Text("[".to_string()));
1009                        remaining = &remaining[1..];
1010                    }
1011                }
1012                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
1013                "linked_image_rr" => {
1014                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
1015                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1016                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1017                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1018                        elements.push(Element::LinkedImage {
1019                            alt: alt.to_string(),
1020                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
1021                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1022                        });
1023                        remaining = &remaining[match_obj.end()..];
1024                    } else {
1025                        elements.push(Element::Text("[".to_string()));
1026                        remaining = &remaining[1..];
1027                    }
1028                }
1029                "inline_image" => {
1030                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
1031                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1032                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1033                        elements.push(Element::InlineImage {
1034                            alt: alt.to_string(),
1035                            url: url.to_string(),
1036                        });
1037                        remaining = &remaining[match_obj.end()..];
1038                    } else {
1039                        elements.push(Element::Text("!".to_string()));
1040                        remaining = &remaining[1..];
1041                    }
1042                }
1043                "ref_image" => {
1044                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
1045                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1046                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1047
1048                        if reference.is_empty() {
1049                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
1050                        } else {
1051                            elements.push(Element::ReferenceImage {
1052                                alt: alt.to_string(),
1053                                reference: reference.to_string(),
1054                            });
1055                        }
1056                        remaining = &remaining[match_obj.end()..];
1057                    } else {
1058                        elements.push(Element::Text("!".to_string()));
1059                        remaining = &remaining[1..];
1060                    }
1061                }
1062                "footnote_ref" => {
1063                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
1064                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1065                        elements.push(Element::FootnoteReference { note: note.to_string() });
1066                        remaining = &remaining[match_obj.end()..];
1067                    } else {
1068                        elements.push(Element::Text("[".to_string()));
1069                        remaining = &remaining[1..];
1070                    }
1071                }
1072                "inline_link" => {
1073                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
1074                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1075                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1076                        elements.push(Element::Link {
1077                            text: text.to_string(),
1078                            url: url.to_string(),
1079                        });
1080                        remaining = &remaining[match_obj.end()..];
1081                    } else {
1082                        // Fallback - shouldn't happen
1083                        elements.push(Element::Text("[".to_string()));
1084                        remaining = &remaining[1..];
1085                    }
1086                }
1087                "ref_link" => {
1088                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1089                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1090                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1091
1092                        if reference.is_empty() {
1093                            // Empty reference link [text][]
1094                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1095                        } else {
1096                            // Regular reference link [text][ref]
1097                            elements.push(Element::ReferenceLink {
1098                                text: text.to_string(),
1099                                reference: reference.to_string(),
1100                            });
1101                        }
1102                        remaining = &remaining[match_obj.end()..];
1103                    } else {
1104                        // Fallback - shouldn't happen
1105                        elements.push(Element::Text("[".to_string()));
1106                        remaining = &remaining[1..];
1107                    }
1108                }
1109                "shortcut_ref" => {
1110                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1111                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1112                        elements.push(Element::ShortcutReference {
1113                            reference: reference.to_string(),
1114                        });
1115                        remaining = &remaining[match_obj.end()..];
1116                    } else {
1117                        // Fallback - shouldn't happen
1118                        elements.push(Element::Text("[".to_string()));
1119                        remaining = &remaining[1..];
1120                    }
1121                }
1122                "wiki_link" => {
1123                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1124                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1125                        elements.push(Element::WikiLink(content.to_string()));
1126                        remaining = &remaining[match_obj.end()..];
1127                    } else {
1128                        elements.push(Element::Text("[[".to_string()));
1129                        remaining = &remaining[2..];
1130                    }
1131                }
1132                "display_math" => {
1133                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1134                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1135                        elements.push(Element::DisplayMath(math.to_string()));
1136                        remaining = &remaining[match_obj.end()..];
1137                    } else {
1138                        elements.push(Element::Text("$$".to_string()));
1139                        remaining = &remaining[2..];
1140                    }
1141                }
1142                "inline_math" => {
1143                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1144                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1145                        elements.push(Element::InlineMath(math.to_string()));
1146                        remaining = &remaining[match_obj.end()..];
1147                    } else {
1148                        elements.push(Element::Text("$".to_string()));
1149                        remaining = &remaining[1..];
1150                    }
1151                }
1152                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1153                "emoji" => {
1154                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1155                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1156                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1157                        remaining = &remaining[match_obj.end()..];
1158                    } else {
1159                        elements.push(Element::Text(":".to_string()));
1160                        remaining = &remaining[1..];
1161                    }
1162                }
1163                "html_entity" => {
1164                    // HTML entities are captured whole - use as_str() to get just the matched content
1165                    elements.push(Element::HtmlEntity(match_obj.as_str().to_string()));
1166                    remaining = &remaining[match_obj.end()..];
1167                }
1168                "hugo_shortcode" => {
1169                    // Hugo shortcodes are atomic elements - preserve them exactly
1170                    elements.push(Element::HugoShortcode(match_obj.as_str().to_string()));
1171                    remaining = &remaining[match_obj.end()..];
1172                }
1173                "autolink" => {
1174                    // Autolinks are atomic elements - preserve them exactly
1175                    elements.push(Element::Autolink(match_obj.as_str().to_string()));
1176                    remaining = &remaining[match_obj.end()..];
1177                }
1178                "html_tag" => {
1179                    // HTML tags are captured whole - use as_str() to get just the matched content
1180                    elements.push(Element::HtmlTag(match_obj.as_str().to_string()));
1181                    remaining = &remaining[match_obj.end()..];
1182                }
1183                _ => {
1184                    // Unknown pattern, treat as text
1185                    elements.push(Element::Text("[".to_string()));
1186                    remaining = &remaining[1..];
1187                }
1188            }
1189        } else {
1190            // Process non-link special characters
1191
1192            // Add any text before the special character
1193            if next_special > 0 && next_special < remaining.len() {
1194                elements.push(Element::Text(remaining[..next_special].to_string()));
1195                remaining = &remaining[next_special..];
1196            }
1197
1198            // Process the special element
1199            match special_type {
1200                "code" => {
1201                    // Find end of code
1202                    if let Some(code_end) = remaining[1..].find('`') {
1203                        let code = &remaining[1..1 + code_end];
1204                        elements.push(Element::Code(code.to_string()));
1205                        remaining = &remaining[1 + code_end + 1..];
1206                    } else {
1207                        // No closing backtick, treat as text
1208                        elements.push(Element::Text(remaining.to_string()));
1209                        break;
1210                    }
1211                }
1212                "attr_list" => {
1213                    elements.push(Element::AttrList(remaining[..attr_list_len].to_string()));
1214                    remaining = &remaining[attr_list_len..];
1215                }
1216                "pulldown_emphasis" => {
1217                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1218                    if let Some(span) = pulldown_emphasis {
1219                        let span_len = span.end - span.start;
1220                        if span.is_strikethrough {
1221                            elements.push(Element::Strikethrough(span.content.clone()));
1222                        } else if span.is_strong {
1223                            elements.push(Element::Bold {
1224                                content: span.content.clone(),
1225                                underscore: span.uses_underscore,
1226                            });
1227                        } else {
1228                            elements.push(Element::Italic {
1229                                content: span.content.clone(),
1230                                underscore: span.uses_underscore,
1231                            });
1232                        }
1233                        remaining = &remaining[span_len..];
1234                    } else {
1235                        // Fallback - shouldn't happen
1236                        elements.push(Element::Text(remaining[..1].to_string()));
1237                        remaining = &remaining[1..];
1238                    }
1239                }
1240                _ => {
1241                    // No special elements found, add all remaining text
1242                    elements.push(Element::Text(remaining.to_string()));
1243                    break;
1244                }
1245            }
1246        }
1247    }
1248
1249    elements
1250}
1251
1252/// Reflow elements for sentence-per-line mode
1253fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1254    let abbreviations = get_abbreviations(custom_abbreviations);
1255    let mut lines = Vec::new();
1256    let mut current_line = String::new();
1257
1258    for (idx, element) in elements.iter().enumerate() {
1259        let element_str = format!("{element}");
1260
1261        // For text elements, split into sentences
1262        if let Element::Text(text) = element {
1263            // Simply append text - it already has correct spacing from tokenization
1264            let combined = format!("{current_line}{text}");
1265            // Use the pre-computed abbreviations set to avoid redundant computation
1266            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1267
1268            if sentences.len() > 1 {
1269                // We found sentence boundaries
1270                for (i, sentence) in sentences.iter().enumerate() {
1271                    if i == 0 {
1272                        // First sentence might continue from previous elements
1273                        // But check if it ends with an abbreviation
1274                        let trimmed = sentence.trim();
1275
1276                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1277                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1278                            current_line = sentence.to_string();
1279                        } else {
1280                            // Normal case - emit the first sentence
1281                            lines.push(sentence.to_string());
1282                            current_line.clear();
1283                        }
1284                    } else if i == sentences.len() - 1 {
1285                        // Last sentence: check if it's complete or incomplete
1286                        let trimmed = sentence.trim();
1287                        let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1288
1289                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1290                            // Complete sentence - emit it immediately
1291                            lines.push(sentence.to_string());
1292                            current_line.clear();
1293                        } else {
1294                            // Incomplete sentence - save for next iteration
1295                            current_line = sentence.to_string();
1296                        }
1297                    } else {
1298                        // Complete sentences in the middle
1299                        lines.push(sentence.to_string());
1300                    }
1301                }
1302            } else {
1303                // Single sentence - check if it's complete
1304                let trimmed = combined.trim();
1305
1306                // If the combined result is only whitespace, don't accumulate it.
1307                // This prevents leading spaces on subsequent elements when lines
1308                // are joined with spaces during reflow iteration.
1309                if trimmed.is_empty() {
1310                    continue;
1311                }
1312
1313                let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1314
1315                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1316                    // Complete single sentence - emit it
1317                    lines.push(trimmed.to_string());
1318                    current_line.clear();
1319                } else {
1320                    // Incomplete sentence - continue accumulating
1321                    current_line = combined;
1322                }
1323            }
1324        } else if let Element::Italic { content, underscore } = element {
1325            // Handle italic elements - may contain multiple sentences that need continuation
1326            let marker = if *underscore { "_" } else { "*" };
1327            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1328        } else if let Element::Bold { content, underscore } = element {
1329            // Handle bold elements - may contain multiple sentences that need continuation
1330            let marker = if *underscore { "__" } else { "**" };
1331            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1332        } else if let Element::Strikethrough(content) = element {
1333            // Handle strikethrough elements - may contain multiple sentences that need continuation
1334            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1335        } else {
1336            // Non-text, non-emphasis elements (Code, Links, etc.)
1337            // Check if this element is adjacent to the preceding text (no space between)
1338            let is_adjacent = if idx > 0 {
1339                match &elements[idx - 1] {
1340                    Element::Text(t) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1341                    _ => true,
1342                }
1343            } else {
1344                false
1345            };
1346
1347            // Add space before element if needed, but not for adjacent elements
1348            if !is_adjacent
1349                && !current_line.is_empty()
1350                && !current_line.ends_with(' ')
1351                && !current_line.ends_with('(')
1352                && !current_line.ends_with('[')
1353            {
1354                current_line.push(' ');
1355            }
1356            current_line.push_str(&element_str);
1357        }
1358    }
1359
1360    // Add any remaining content
1361    if !current_line.is_empty() {
1362        lines.push(current_line.trim().to_string());
1363    }
1364    lines
1365}
1366
1367/// Handle splitting emphasis content at sentence boundaries while preserving markers
1368fn handle_emphasis_sentence_split(
1369    content: &str,
1370    marker: &str,
1371    abbreviations: &HashSet<String>,
1372    current_line: &mut String,
1373    lines: &mut Vec<String>,
1374) {
1375    // Split the emphasis content into sentences
1376    let sentences = split_into_sentences_with_set(content, abbreviations);
1377
1378    if sentences.len() <= 1 {
1379        // Single sentence or no boundaries - treat as atomic
1380        if !current_line.is_empty()
1381            && !current_line.ends_with(' ')
1382            && !current_line.ends_with('(')
1383            && !current_line.ends_with('[')
1384        {
1385            current_line.push(' ');
1386        }
1387        current_line.push_str(marker);
1388        current_line.push_str(content);
1389        current_line.push_str(marker);
1390
1391        // Check if the emphasis content ends with sentence punctuation - if so, emit
1392        let trimmed = content.trim();
1393        let ends_with_punct = ends_with_sentence_punct(trimmed);
1394        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1395            lines.push(current_line.clone());
1396            current_line.clear();
1397        }
1398    } else {
1399        // Multiple sentences - each gets its own emphasis markers
1400        for (i, sentence) in sentences.iter().enumerate() {
1401            let trimmed = sentence.trim();
1402            if trimmed.is_empty() {
1403                continue;
1404            }
1405
1406            if i == 0 {
1407                // First sentence: combine with current_line and emit
1408                if !current_line.is_empty()
1409                    && !current_line.ends_with(' ')
1410                    && !current_line.ends_with('(')
1411                    && !current_line.ends_with('[')
1412                {
1413                    current_line.push(' ');
1414                }
1415                current_line.push_str(marker);
1416                current_line.push_str(trimmed);
1417                current_line.push_str(marker);
1418
1419                // Check if this is a complete sentence
1420                let ends_with_punct = ends_with_sentence_punct(trimmed);
1421                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1422                    lines.push(current_line.clone());
1423                    current_line.clear();
1424                }
1425            } else if i == sentences.len() - 1 {
1426                // Last sentence: check if complete
1427                let ends_with_punct = ends_with_sentence_punct(trimmed);
1428
1429                let mut line = String::new();
1430                line.push_str(marker);
1431                line.push_str(trimmed);
1432                line.push_str(marker);
1433
1434                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1435                    lines.push(line);
1436                } else {
1437                    // Incomplete - keep in current_line for potential continuation
1438                    *current_line = line;
1439                }
1440            } else {
1441                // Middle sentences: emit with markers
1442                let mut line = String::new();
1443                line.push_str(marker);
1444                line.push_str(trimmed);
1445                line.push_str(marker);
1446                lines.push(line);
1447            }
1448        }
1449    }
1450}
1451
/// English break-words used for semantic line break splitting.
/// These are conjunctions, relative pronouns and conjunctive adverbs where
/// a line break reads naturally.
///
/// Entries are lowercase: `split_at_break_word` searches for them in a
/// lowercased copy of the line, and only accepts a match that is followed by
/// a space and either starts the line or is preceded by a space.
const BREAK_WORDS: &[&str] = &[
    "and",
    "or",
    "but",
    "nor",
    "yet",
    "so",
    "for",
    "which",
    "that",
    "because",
    "when",
    "if",
    "while",
    "where",
    "although",
    "though",
    "unless",
    "since",
    "after",
    "before",
    "until",
    "as",
    "once",
    "whether",
    "however",
    "therefore",
    "moreover",
    "furthermore",
    "nevertheless",
    "whereas",
];
1487
/// Report whether `c` is a clause-level punctuation mark at which a semantic
/// line break may be placed: comma, semicolon, colon, or em dash (U+2014).
fn is_clause_punctuation(c: char) -> bool {
    const CLAUSE_MARKS: [char; 4] = [',', ';', ':', '\u{2014}'];
    CLAUSE_MARKS.contains(&c)
}
1492
1493/// Compute element spans for a flat text representation of elements.
1494/// Returns Vec of (start, end) byte offsets for non-Text elements,
1495/// so we can check that a split position doesn't fall inside them.
1496fn compute_element_spans(elements: &[Element]) -> Vec<(usize, usize)> {
1497    let mut spans = Vec::new();
1498    let mut offset = 0;
1499    for element in elements {
1500        let rendered = format!("{element}");
1501        let len = rendered.len();
1502        if !matches!(element, Element::Text(_)) {
1503            spans.push((offset, offset + len));
1504        }
1505        offset += len;
1506    }
1507    spans
1508}
1509
/// Return `true` when byte position `pos` lies strictly inside one of `spans`.
///
/// Both bounds are exclusive: a position equal to a span's start or end sits
/// on the element's edge and is not considered inside it.
fn is_inside_element(pos: usize, spans: &[(usize, usize)]) -> bool {
    for &(start, end) in spans {
        if start < pos && pos < end {
            return true;
        }
    }
    false
}
1514
/// Minimum fraction of line_length that the first part of a split must occupy.
/// Prevents awkwardly short first lines like "A," or "Note:" on their own.
/// `reflow_elements_semantic` also uses this fraction as the threshold below
/// which a line is considered short enough to merge into the previous line.
const MIN_SPLIT_RATIO: f64 = 0.3;
1518
1519/// Split a line at the latest clause punctuation that keeps the first part
1520/// within `line_length`. Returns None if no valid split point exists or if
1521/// the split would create an unreasonably short first line.
1522fn split_at_clause_punctuation(
1523    text: &str,
1524    line_length: usize,
1525    element_spans: &[(usize, usize)],
1526    length_mode: ReflowLengthMode,
1527) -> Option<(String, String)> {
1528    let chars: Vec<char> = text.chars().collect();
1529    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1530
1531    // Find the char index where accumulated display width exceeds line_length
1532    let mut width_acc = 0;
1533    let mut search_end_char = 0;
1534    for (idx, &c) in chars.iter().enumerate() {
1535        let c_width = display_len(&c.to_string(), length_mode);
1536        if width_acc + c_width > line_length {
1537            break;
1538        }
1539        width_acc += c_width;
1540        search_end_char = idx + 1;
1541    }
1542
1543    let mut best_pos = None;
1544    for i in (0..search_end_char).rev() {
1545        if is_clause_punctuation(chars[i]) {
1546            // Convert char position to byte position for element span check
1547            let byte_pos: usize = chars[..=i].iter().map(|c| c.len_utf8()).sum();
1548            if !is_inside_element(byte_pos, element_spans) {
1549                best_pos = Some(i);
1550                break;
1551            }
1552        }
1553    }
1554
1555    let pos = best_pos?;
1556
1557    // Reject splits that create very short first lines
1558    let first: String = chars[..=pos].iter().collect();
1559    let first_display_len = display_len(&first, length_mode);
1560    if first_display_len < min_first_len {
1561        return None;
1562    }
1563
1564    // Split after the punctuation character
1565    let rest: String = chars[pos + 1..].iter().collect();
1566    let rest = rest.trim_start().to_string();
1567
1568    if rest.is_empty() {
1569        return None;
1570    }
1571
1572    Some((first, rest))
1573}
1574
1575/// Split a line before the latest break-word that keeps the first part
1576/// within `line_length`. Returns None if no valid split point exists or if
1577/// the split would create an unreasonably short first line.
1578fn split_at_break_word(
1579    text: &str,
1580    line_length: usize,
1581    element_spans: &[(usize, usize)],
1582    length_mode: ReflowLengthMode,
1583) -> Option<(String, String)> {
1584    let lower = text.to_lowercase();
1585    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1586    let mut best_split: Option<(usize, usize)> = None; // (byte_start, word_len_bytes)
1587
1588    for &word in BREAK_WORDS {
1589        let mut search_start = 0;
1590        while let Some(pos) = lower[search_start..].find(word) {
1591            let abs_pos = search_start + pos;
1592
1593            // Verify it's a word boundary: preceded by space, followed by space
1594            let preceded_by_space = abs_pos == 0 || text.as_bytes().get(abs_pos - 1) == Some(&b' ');
1595            let followed_by_space = text.as_bytes().get(abs_pos + word.len()) == Some(&b' ');
1596
1597            if preceded_by_space && followed_by_space {
1598                // The break goes BEFORE the word, so first part ends at abs_pos - 1
1599                let first_part = text[..abs_pos].trim_end();
1600                let first_part_len = display_len(first_part, length_mode);
1601
1602                if first_part_len >= min_first_len
1603                    && first_part_len <= line_length
1604                    && !is_inside_element(abs_pos, element_spans)
1605                {
1606                    // Prefer the latest valid split point
1607                    if best_split.is_none_or(|(prev_pos, _)| abs_pos > prev_pos) {
1608                        best_split = Some((abs_pos, word.len()));
1609                    }
1610                }
1611            }
1612
1613            search_start = abs_pos + word.len();
1614        }
1615    }
1616
1617    let (byte_start, _word_len) = best_split?;
1618
1619    let first = text[..byte_start].trim_end().to_string();
1620    let rest = text[byte_start..].to_string();
1621
1622    if first.is_empty() || rest.trim().is_empty() {
1623        return None;
1624    }
1625
1626    Some((first, rest))
1627}
1628
1629/// Recursively cascade-split a line that exceeds line_length.
1630/// Tries clause punctuation first, then break-words, then word wrap.
1631fn cascade_split_line(
1632    text: &str,
1633    line_length: usize,
1634    abbreviations: &Option<Vec<String>>,
1635    length_mode: ReflowLengthMode,
1636    attr_lists: bool,
1637) -> Vec<String> {
1638    if line_length == 0 || display_len(text, length_mode) <= line_length {
1639        return vec![text.to_string()];
1640    }
1641
1642    let elements = parse_markdown_elements_inner(text, attr_lists);
1643    let element_spans = compute_element_spans(&elements);
1644
1645    // Try clause punctuation split
1646    if let Some((first, rest)) = split_at_clause_punctuation(text, line_length, &element_spans, length_mode) {
1647        let mut result = vec![first];
1648        result.extend(cascade_split_line(
1649            &rest,
1650            line_length,
1651            abbreviations,
1652            length_mode,
1653            attr_lists,
1654        ));
1655        return result;
1656    }
1657
1658    // Try break-word split
1659    if let Some((first, rest)) = split_at_break_word(text, line_length, &element_spans, length_mode) {
1660        let mut result = vec![first];
1661        result.extend(cascade_split_line(
1662            &rest,
1663            line_length,
1664            abbreviations,
1665            length_mode,
1666            attr_lists,
1667        ));
1668        return result;
1669    }
1670
1671    // Fallback: word wrap using existing reflow_elements
1672    let options = ReflowOptions {
1673        line_length,
1674        break_on_sentences: false,
1675        preserve_breaks: false,
1676        sentence_per_line: false,
1677        semantic_line_breaks: false,
1678        abbreviations: abbreviations.clone(),
1679        length_mode,
1680        attr_lists,
1681    };
1682    reflow_elements(&elements, &options)
1683}
1684
1685/// Reflow elements using semantic line breaks strategy:
1686/// 1. Split at sentence boundaries (always)
1687/// 2. For lines exceeding line_length, cascade through clause punct → break-words → word wrap
1688fn reflow_elements_semantic(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1689    // Step 1: Split into sentences using existing sentence-per-line logic
1690    let sentence_lines = reflow_elements_sentence_per_line(elements, &options.abbreviations);
1691
1692    // Step 2: For each sentence line, apply cascading splits if it exceeds line_length
1693    // When line_length is 0 (unlimited), skip cascading — sentence splits only
1694    if options.line_length == 0 {
1695        return sentence_lines;
1696    }
1697
1698    let length_mode = options.length_mode;
1699    let mut result = Vec::new();
1700    for line in sentence_lines {
1701        if display_len(&line, length_mode) <= options.line_length {
1702            result.push(line);
1703        } else {
1704            result.extend(cascade_split_line(
1705                &line,
1706                options.line_length,
1707                &options.abbreviations,
1708                length_mode,
1709                options.attr_lists,
1710            ));
1711        }
1712    }
1713
1714    // Step 3: Merge very short trailing lines back into the previous line.
1715    // Word wrap can produce lines like "was" or "see" on their own, which reads poorly.
1716    let min_line_len = ((options.line_length as f64) * MIN_SPLIT_RATIO) as usize;
1717    let mut merged: Vec<String> = Vec::with_capacity(result.len());
1718    for line in result {
1719        if !merged.is_empty() && display_len(&line, length_mode) < min_line_len && !line.trim().is_empty() {
1720            // Don't merge across sentence boundaries — sentence splits are intentional
1721            let prev_ends_at_sentence = {
1722                let trimmed = merged.last().unwrap().trim_end();
1723                trimmed
1724                    .chars()
1725                    .rev()
1726                    .find(|c| !matches!(c, '"' | '\'' | '\u{201D}' | '\u{2019}' | ')' | ']'))
1727                    .is_some_and(|c| matches!(c, '.' | '!' | '?'))
1728            };
1729
1730            if !prev_ends_at_sentence {
1731                let prev = merged.last_mut().unwrap();
1732                let combined = format!("{prev} {line}");
1733                // Only merge if the combined line fits within the limit
1734                if display_len(&combined, length_mode) <= options.line_length {
1735                    *prev = combined;
1736                    continue;
1737                }
1738            }
1739        }
1740        merged.push(line);
1741    }
1742    merged
1743}
1744
/// Return the byte index of the last space in `line` that is safe to split at.
///
/// `element_spans` holds `(start, end)` byte ranges of the rendered non-Text
/// elements in `line`. A space is unsafe only when it lies strictly between a
/// span's start and end; the bounds themselves are treated as outside because
/// element delimiters (e.g., `[`, `]`, `(`, `)`, `<`, `>`, `` ` ``) are never
/// spaces, so only interior positions need protection.
fn rfind_safe_space(line: &str, element_spans: &[(usize, usize)]) -> Option<usize> {
    line.rmatch_indices(' ')
        .map(|(space_at, _)| space_at)
        .find(|&space_at| {
            element_spans
                .iter()
                .all(|&(start, end)| space_at <= start || space_at >= end)
        })
}
1758
1759/// Reflow elements into lines that fit within the line length
1760fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1761    let mut lines = Vec::new();
1762    let mut current_line = String::new();
1763    let mut current_length = 0;
1764    // Track byte spans of non-Text elements in current_line for safe splitting
1765    let mut current_line_element_spans: Vec<(usize, usize)> = Vec::new();
1766    let length_mode = options.length_mode;
1767
1768    for (idx, element) in elements.iter().enumerate() {
1769        let element_str = format!("{element}");
1770        let element_len = element.display_width(length_mode);
1771
1772        // Determine adjacency from the original elements, not from current_line.
1773        // Elements are adjacent when there's no whitespace between them in the source:
1774        // - Text("v") → HugoShortcode("{{<...>}}") = adjacent (text has no trailing space)
1775        // - Text(" and ") → InlineLink("[a](url)") = NOT adjacent (text has trailing space)
1776        // - HugoShortcode("{{<...>}}") → Text(",") = adjacent (text has no leading space)
1777        let is_adjacent_to_prev = if idx > 0 {
1778            match (&elements[idx - 1], element) {
1779                (Element::Text(t), _) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1780                (_, Element::Text(t)) => !t.is_empty() && !t.starts_with(char::is_whitespace),
1781                _ => true,
1782            }
1783        } else {
1784            false
1785        };
1786
1787        // For text elements that might need breaking
1788        if let Element::Text(text) = element {
1789            // Check if original text had leading whitespace
1790            let has_leading_space = text.starts_with(char::is_whitespace);
1791            // If this is a text element, always process it word by word
1792            let words: Vec<&str> = text.split_whitespace().collect();
1793
1794            for (i, word) in words.iter().enumerate() {
1795                let word_len = display_len(word, length_mode);
1796                // Check if this "word" is just punctuation that should stay attached
1797                let is_trailing_punct = word
1798                    .chars()
1799                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1800
1801                // First word of text adjacent to preceding non-text element
1802                // must stay attached (e.g., shortcode followed by punctuation or text)
1803                let is_first_adjacent = i == 0 && is_adjacent_to_prev;
1804
1805                if is_first_adjacent {
1806                    // Attach directly without space, preventing line break
1807                    if current_length + word_len > options.line_length && current_length > 0 {
1808                        // Would exceed — break before the adjacent group
1809                        // Use element-aware space search to avoid splitting inside links/code/etc.
1810                        if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1811                            let before = current_line[..last_space].trim_end().to_string();
1812                            let after = current_line[last_space + 1..].to_string();
1813                            lines.push(before);
1814                            current_line = format!("{after}{word}");
1815                            current_length = display_len(&current_line, length_mode);
1816                            current_line_element_spans.clear();
1817                        } else {
1818                            current_line.push_str(word);
1819                            current_length += word_len;
1820                        }
1821                    } else {
1822                        current_line.push_str(word);
1823                        current_length += word_len;
1824                    }
1825                } else if current_length > 0
1826                    && current_length + 1 + word_len > options.line_length
1827                    && !is_trailing_punct
1828                {
1829                    // Start a new line (but never for trailing punctuation)
1830                    lines.push(current_line.trim().to_string());
1831                    current_line = word.to_string();
1832                    current_length = word_len;
1833                    current_line_element_spans.clear();
1834                } else {
1835                    // Add word to current line
1836                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1837                    // AND this isn't trailing punctuation (which attaches directly)
1838                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1839                        current_line.push(' ');
1840                        current_length += 1;
1841                    }
1842                    current_line.push_str(word);
1843                    current_length += word_len;
1844                }
1845            }
1846        } else if matches!(
1847            element,
1848            Element::Italic { .. } | Element::Bold { .. } | Element::Strikethrough(_)
1849        ) && element_len > options.line_length
1850        {
1851            // Italic, bold, and strikethrough with content longer than line_length need word wrapping.
1852            // Split content word-by-word, attach the opening marker to the first word
1853            // and the closing marker to the last word.
1854            let (content, marker): (&str, &str) = match element {
1855                Element::Italic { content, underscore } => (content.as_str(), if *underscore { "_" } else { "*" }),
1856                Element::Bold { content, underscore } => (content.as_str(), if *underscore { "__" } else { "**" }),
1857                Element::Strikethrough(content) => (content.as_str(), "~~"),
1858                _ => unreachable!(),
1859            };
1860
1861            let words: Vec<&str> = content.split_whitespace().collect();
1862            let n = words.len();
1863
1864            if n == 0 {
1865                // Empty span — treat as atomic
1866                let full = format!("{marker}{marker}");
1867                let full_len = display_len(&full, length_mode);
1868                if !is_adjacent_to_prev && current_length > 0 {
1869                    current_line.push(' ');
1870                    current_length += 1;
1871                }
1872                current_line.push_str(&full);
1873                current_length += full_len;
1874            } else {
1875                for (i, word) in words.iter().enumerate() {
1876                    let is_first = i == 0;
1877                    let is_last = i == n - 1;
1878                    let word_str: String = match (is_first, is_last) {
1879                        (true, true) => format!("{marker}{word}{marker}"),
1880                        (true, false) => format!("{marker}{word}"),
1881                        (false, true) => format!("{word}{marker}"),
1882                        (false, false) => word.to_string(),
1883                    };
1884                    let word_len = display_len(&word_str, length_mode);
1885
1886                    let needs_space = if is_first {
1887                        !is_adjacent_to_prev && current_length > 0
1888                    } else {
1889                        current_length > 0
1890                    };
1891
1892                    if needs_space && current_length + 1 + word_len > options.line_length {
1893                        lines.push(current_line.trim_end().to_string());
1894                        current_line = word_str;
1895                        current_length = word_len;
1896                        current_line_element_spans.clear();
1897                    } else {
1898                        if needs_space {
1899                            current_line.push(' ');
1900                            current_length += 1;
1901                        }
1902                        current_line.push_str(&word_str);
1903                        current_length += word_len;
1904                    }
1905                }
1906            }
1907        } else {
1908            // For non-text elements (code, links, references), treat as atomic units
1909            // These should never be broken across lines
1910
1911            if is_adjacent_to_prev {
1912                // Adjacent to preceding text — attach directly without space
1913                if current_length + element_len > options.line_length {
1914                    // Would exceed limit — break before the adjacent word group
1915                    // Use element-aware space search to avoid splitting inside links/code/etc.
1916                    if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1917                        let before = current_line[..last_space].trim_end().to_string();
1918                        let after = current_line[last_space + 1..].to_string();
1919                        lines.push(before);
1920                        current_line = format!("{after}{element_str}");
1921                        current_length = display_len(&current_line, length_mode);
1922                        current_line_element_spans.clear();
1923                        // Record the element span in the new current_line
1924                        let start = after.len();
1925                        current_line_element_spans.push((start, start + element_str.len()));
1926                    } else {
1927                        // No safe space to break at — accept the long line
1928                        let start = current_line.len();
1929                        current_line.push_str(&element_str);
1930                        current_length += element_len;
1931                        current_line_element_spans.push((start, current_line.len()));
1932                    }
1933                } else {
1934                    let start = current_line.len();
1935                    current_line.push_str(&element_str);
1936                    current_length += element_len;
1937                    current_line_element_spans.push((start, current_line.len()));
1938                }
1939            } else if current_length > 0 && current_length + 1 + element_len > options.line_length {
1940                // Not adjacent, would exceed — start new line
1941                lines.push(current_line.trim().to_string());
1942                current_line = element_str.clone();
1943                current_length = element_len;
1944                current_line_element_spans.clear();
1945                current_line_element_spans.push((0, element_str.len()));
1946            } else {
1947                // Not adjacent, fits — add with space
1948                let ends_with_opener =
1949                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1950                if current_length > 0 && !ends_with_opener {
1951                    current_line.push(' ');
1952                    current_length += 1;
1953                }
1954                let start = current_line.len();
1955                current_line.push_str(&element_str);
1956                current_length += element_len;
1957                current_line_element_spans.push((start, current_line.len()));
1958            }
1959        }
1960    }
1961
1962    // Don't forget the last line
1963    if !current_line.is_empty() {
1964        lines.push(current_line.trim_end().to_string());
1965    }
1966
1967    lines
1968}
1969
1970/// Reflow markdown content preserving structure
1971pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1972    let lines: Vec<&str> = content.lines().collect();
1973    let mut result = Vec::new();
1974    let mut i = 0;
1975
1976    while i < lines.len() {
1977        let line = lines[i];
1978        let trimmed = line.trim();
1979
1980        // Preserve empty lines
1981        if trimmed.is_empty() {
1982            result.push(String::new());
1983            i += 1;
1984            continue;
1985        }
1986
1987        // Preserve headings as-is
1988        if trimmed.starts_with('#') {
1989            result.push(line.to_string());
1990            i += 1;
1991            continue;
1992        }
1993
1994        // Preserve Quarto/Pandoc div markers (:::) as-is
1995        if trimmed.starts_with(":::") {
1996            result.push(line.to_string());
1997            i += 1;
1998            continue;
1999        }
2000
2001        // Preserve fenced code blocks
2002        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2003            result.push(line.to_string());
2004            i += 1;
2005            // Copy lines until closing fence
2006            while i < lines.len() {
2007                result.push(lines[i].to_string());
2008                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
2009                    i += 1;
2010                    break;
2011                }
2012                i += 1;
2013            }
2014            continue;
2015        }
2016
2017        // Preserve indented code blocks (4+ columns accounting for tab expansion)
2018        if ElementCache::calculate_indentation_width_default(line) >= 4 {
2019            // Collect all consecutive indented lines
2020            result.push(line.to_string());
2021            i += 1;
2022            while i < lines.len() {
2023                let next_line = lines[i];
2024                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
2025                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
2026                    result.push(next_line.to_string());
2027                    i += 1;
2028                } else {
2029                    break;
2030                }
2031            }
2032            continue;
2033        }
2034
2035        // Preserve block quotes (but reflow their content)
2036        if trimmed.starts_with('>') {
2037            // find() returns byte position which is correct for str slicing
2038            // The unwrap is safe because we already verified trimmed starts with '>'
2039            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
2040            let quote_prefix = line[0..gt_pos + 1].to_string();
2041            let quote_content = &line[quote_prefix.len()..].trim_start();
2042
2043            let reflowed = reflow_line(quote_content, options);
2044            for reflowed_line in reflowed.iter() {
2045                result.push(format!("{quote_prefix} {reflowed_line}"));
2046            }
2047            i += 1;
2048            continue;
2049        }
2050
2051        // Preserve horizontal rules first (before checking for lists)
2052        if is_horizontal_rule(trimmed) {
2053            result.push(line.to_string());
2054            i += 1;
2055            continue;
2056        }
2057
2058        // Preserve lists (but not horizontal rules)
2059        if is_unordered_list_marker(trimmed) || is_numbered_list_item(trimmed) {
2060            // Find the list marker and preserve indentation
2061            let indent = line.len() - line.trim_start().len();
2062            let indent_str = " ".repeat(indent);
2063
2064            // For numbered lists, find the period and the space after it
2065            // For bullet lists, find the marker and the space after it
2066            let mut marker_end = indent;
2067            let mut content_start = indent;
2068
2069            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
2070                // Numbered list: find the period
2071                if let Some(period_pos) = line[indent..].find('.') {
2072                    marker_end = indent + period_pos + 1; // Include the period
2073                    content_start = marker_end;
2074                    // Skip any spaces after the period to find content start
2075                    // Use byte-based check since content_start is a byte index
2076                    // This is safe because space is ASCII (single byte)
2077                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2078                        content_start += 1;
2079                    }
2080                }
2081            } else {
2082                // Bullet list: marker is single character
2083                marker_end = indent + 1; // Just the marker character
2084                content_start = marker_end;
2085                // Skip any spaces after the marker
2086                // Use byte-based check since content_start is a byte index
2087                // This is safe because space is ASCII (single byte)
2088                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2089                    content_start += 1;
2090                }
2091            }
2092
2093            let marker = &line[indent..marker_end];
2094
2095            // Collect all content for this list item (including continuation lines)
2096            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
2097            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
2098            i += 1;
2099
2100            // Collect continuation lines (indented lines that are part of this list item)
2101            while i < lines.len() {
2102                let next_line = lines[i];
2103                let next_trimmed = next_line.trim();
2104
2105                // Stop if we hit an empty line or another list item or special block
2106                if is_block_boundary(next_trimmed) {
2107                    break;
2108                }
2109
2110                // Check if this line is indented (continuation of list item)
2111                let next_indent = next_line.len() - next_line.trim_start().len();
2112                if next_indent >= content_start {
2113                    // This is a continuation line - add its content
2114                    // Preserve hard breaks while trimming excessive whitespace
2115                    let trimmed_start = next_line.trim_start();
2116                    list_content.push(trim_preserving_hard_break(trimmed_start));
2117                    i += 1;
2118                } else {
2119                    // Not indented enough, not part of this list item
2120                    break;
2121                }
2122            }
2123
2124            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
2125            // Hard breaks should prevent joining with the next line
2126            let combined_content = if options.preserve_breaks {
2127                list_content[0].clone()
2128            } else {
2129                // Check if any lines have hard breaks - if so, preserve the structure
2130                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
2131                if has_hard_breaks {
2132                    // Don't join lines with hard breaks - keep them separate with newlines
2133                    list_content.join("\n")
2134                } else {
2135                    // No hard breaks, safe to join with spaces
2136                    list_content.join(" ")
2137                }
2138            };
2139
2140            // Calculate the proper indentation for continuation lines
2141            let trimmed_marker = marker;
2142            let continuation_spaces = content_start;
2143
2144            // Adjust line length to account for list marker and space
2145            let prefix_length = indent + trimmed_marker.len() + 1;
2146
2147            // Create adjusted options with reduced line length
2148            let adjusted_options = ReflowOptions {
2149                line_length: options.line_length.saturating_sub(prefix_length),
2150                ..options.clone()
2151            };
2152
2153            let reflowed = reflow_line(&combined_content, &adjusted_options);
2154            for (j, reflowed_line) in reflowed.iter().enumerate() {
2155                if j == 0 {
2156                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
2157                } else {
2158                    // Continuation lines aligned with text after marker
2159                    let continuation_indent = " ".repeat(continuation_spaces);
2160                    result.push(format!("{continuation_indent}{reflowed_line}"));
2161                }
2162            }
2163            continue;
2164        }
2165
2166        // Preserve tables
2167        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
2168            result.push(line.to_string());
2169            i += 1;
2170            continue;
2171        }
2172
2173        // Preserve reference definitions
2174        if trimmed.starts_with('[') && line.contains("]:") {
2175            result.push(line.to_string());
2176            i += 1;
2177            continue;
2178        }
2179
2180        // Preserve definition list items (extended markdown)
2181        if is_definition_list_item(trimmed) {
2182            result.push(line.to_string());
2183            i += 1;
2184            continue;
2185        }
2186
2187        // Check if this is a single line that doesn't need processing
2188        let mut is_single_line_paragraph = true;
2189        if i + 1 < lines.len() {
2190            let next_trimmed = lines[i + 1].trim();
2191            // Check if next line continues this paragraph
2192            if !is_block_boundary(next_trimmed) {
2193                is_single_line_paragraph = false;
2194            }
2195        }
2196
2197        // If it's a single line that fits, just add it as-is
2198        if is_single_line_paragraph && display_len(line, options.length_mode) <= options.line_length {
2199            result.push(line.to_string());
2200            i += 1;
2201            continue;
2202        }
2203
2204        // For regular paragraphs, collect consecutive lines
2205        let mut paragraph_parts = Vec::new();
2206        let mut current_part = vec![line];
2207        i += 1;
2208
2209        // If preserve_breaks is true, treat each line separately
2210        if options.preserve_breaks {
2211            // Don't collect consecutive lines - just reflow this single line
2212            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
2213                Some("\\")
2214            } else if line.ends_with("  ") {
2215                Some("  ")
2216            } else {
2217                None
2218            };
2219            let reflowed = reflow_line(line, options);
2220
2221            // Preserve hard breaks (two trailing spaces or backslash)
2222            if let Some(break_marker) = hard_break_type {
2223                if !reflowed.is_empty() {
2224                    let mut reflowed_with_break = reflowed;
2225                    let last_idx = reflowed_with_break.len() - 1;
2226                    if !has_hard_break(&reflowed_with_break[last_idx]) {
2227                        reflowed_with_break[last_idx].push_str(break_marker);
2228                    }
2229                    result.extend(reflowed_with_break);
2230                }
2231            } else {
2232                result.extend(reflowed);
2233            }
2234        } else {
2235            // Original behavior: collect consecutive lines into a paragraph
2236            while i < lines.len() {
2237                let prev_line = if !current_part.is_empty() {
2238                    current_part.last().unwrap()
2239                } else {
2240                    ""
2241                };
2242                let next_line = lines[i];
2243                let next_trimmed = next_line.trim();
2244
2245                // Stop at empty lines or special blocks
2246                if is_block_boundary(next_trimmed) {
2247                    break;
2248                }
2249
2250                // Check if previous line ends with hard break (two spaces or backslash)
2251                // or is a complete sentence in sentence_per_line mode
2252                let prev_trimmed = prev_line.trim();
2253                let abbreviations = get_abbreviations(&options.abbreviations);
2254                let ends_with_sentence = (prev_trimmed.ends_with('.')
2255                    || prev_trimmed.ends_with('!')
2256                    || prev_trimmed.ends_with('?')
2257                    || prev_trimmed.ends_with(".*")
2258                    || prev_trimmed.ends_with("!*")
2259                    || prev_trimmed.ends_with("?*")
2260                    || prev_trimmed.ends_with("._")
2261                    || prev_trimmed.ends_with("!_")
2262                    || prev_trimmed.ends_with("?_")
2263                    // Quote-terminated sentences (straight and curly quotes)
2264                    || prev_trimmed.ends_with(".\"")
2265                    || prev_trimmed.ends_with("!\"")
2266                    || prev_trimmed.ends_with("?\"")
2267                    || prev_trimmed.ends_with(".'")
2268                    || prev_trimmed.ends_with("!'")
2269                    || prev_trimmed.ends_with("?'")
2270                    || prev_trimmed.ends_with(".\u{201D}")
2271                    || prev_trimmed.ends_with("!\u{201D}")
2272                    || prev_trimmed.ends_with("?\u{201D}")
2273                    || prev_trimmed.ends_with(".\u{2019}")
2274                    || prev_trimmed.ends_with("!\u{2019}")
2275                    || prev_trimmed.ends_with("?\u{2019}"))
2276                    && !text_ends_with_abbreviation(
2277                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
2278                        &abbreviations,
2279                    );
2280
2281                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
2282                    // Start a new part after hard break or complete sentence
2283                    paragraph_parts.push(current_part.join(" "));
2284                    current_part = vec![next_line];
2285                } else {
2286                    current_part.push(next_line);
2287                }
2288                i += 1;
2289            }
2290
2291            // Add the last part
2292            if !current_part.is_empty() {
2293                if current_part.len() == 1 {
2294                    // Single line, don't add trailing space
2295                    paragraph_parts.push(current_part[0].to_string());
2296                } else {
2297                    paragraph_parts.push(current_part.join(" "));
2298                }
2299            }
2300
2301            // Reflow each part separately, preserving hard breaks
2302            for (j, part) in paragraph_parts.iter().enumerate() {
2303                let reflowed = reflow_line(part, options);
2304                result.extend(reflowed);
2305
2306                // Preserve hard break by ensuring last line of part ends with hard break marker
2307                // Use two spaces as the default hard break format for reflows
2308                // But don't add hard breaks in sentence_per_line mode - lines are already separate
2309                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
2310                    let last_idx = result.len() - 1;
2311                    if !has_hard_break(&result[last_idx]) {
2312                        result[last_idx].push_str("  ");
2313                    }
2314                }
2315            }
2316        }
2317    }
2318
2319    // Preserve trailing newline if the original content had one
2320    let result_text = result.join("\n");
2321    if content.ends_with('\n') && !result_text.ends_with('\n') {
2322        format!("{result_text}\n")
2323    } else {
2324        result_text
2325    }
2326}
2327
/// Outcome of reflowing one paragraph: which bytes to replace and what to put there.
#[derive(Clone, Debug)]
pub struct ParagraphReflow {
    /// Byte offset in the original content at which the paragraph begins
    pub start_byte: usize,
    /// Byte offset in the original content at which the paragraph ends
    pub end_byte: usize,
    /// Replacement text produced by reflowing the paragraph
    pub reflowed_text: String,
}
2338
/// One line gathered from a blockquote paragraph, kept for style-preserving reflow.
///
/// `is_explicit` is `true` exactly when `prefix` is `Some`; both constructors
/// maintain that relationship. Build values with [`BlockquoteLineData::explicit`]
/// or [`BlockquoteLineData::lazy`] instead of writing a struct literal.
#[derive(Clone, Debug)]
pub struct BlockquoteLineData {
    /// Trimmed line text with the `> ` prefix removed.
    pub(crate) content: String,
    /// `true` when the line has its own blockquote marker.
    pub(crate) is_explicit: bool,
    /// The complete blockquote prefix (for example `"> "` or `"> > "`), or `None`
    /// when the line is a lazy continuation.
    pub(crate) prefix: Option<String>,
}

impl BlockquoteLineData {
    /// Shared constructor: `is_explicit` is derived from whether a prefix exists.
    fn from_parts(content: String, prefix: Option<String>) -> Self {
        Self {
            content,
            is_explicit: prefix.is_some(),
            prefix,
        }
    }

    /// Build a line that carries an explicit blockquote marker.
    pub fn explicit(content: String, prefix: String) -> Self {
        Self::from_parts(content, Some(prefix))
    }

    /// Build a lazy continuation line, i.e. one without a blockquote marker.
    pub fn lazy(content: String) -> Self {
        Self::from_parts(content, None)
    }
}
2373
/// How continuation lines of a reflowed blockquote paragraph are written.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BlockquoteContinuationStyle {
    /// Continuation lines repeat the blockquote prefix.
    Explicit,
    /// Continuation lines are written without a blockquote prefix.
    Lazy,
}
2380
2381/// Determine the continuation style for a blockquote paragraph from its collected lines.
2382///
2383/// The first line is always explicit (it carries the marker), so only continuation
2384/// lines (index 1+) are counted. Ties resolve to `Explicit`.
2385///
2386/// When the slice has only one element (no continuation lines to inspect), both
2387/// counts are zero and the tie-breaking rule returns `Explicit`.
2388pub fn blockquote_continuation_style(lines: &[BlockquoteLineData]) -> BlockquoteContinuationStyle {
2389    let mut explicit_count = 0usize;
2390    let mut lazy_count = 0usize;
2391
2392    for line in lines.iter().skip(1) {
2393        if line.is_explicit {
2394            explicit_count += 1;
2395        } else {
2396            lazy_count += 1;
2397        }
2398    }
2399
2400    if explicit_count > 0 && lazy_count == 0 {
2401        BlockquoteContinuationStyle::Explicit
2402    } else if lazy_count > 0 && explicit_count == 0 {
2403        BlockquoteContinuationStyle::Lazy
2404    } else if explicit_count >= lazy_count {
2405        BlockquoteContinuationStyle::Explicit
2406    } else {
2407        BlockquoteContinuationStyle::Lazy
2408    }
2409}
2410
2411/// Determine the dominant blockquote prefix for a paragraph.
2412///
2413/// The most frequently occurring explicit prefix wins. Ties are broken by earliest
2414/// first appearance. Falls back to `fallback` when no explicit lines are present.
2415pub fn dominant_blockquote_prefix(lines: &[BlockquoteLineData], fallback: &str) -> String {
2416    let mut counts: std::collections::HashMap<String, (usize, usize)> = std::collections::HashMap::new();
2417
2418    for (idx, line) in lines.iter().enumerate() {
2419        let Some(prefix) = line.prefix.as_ref() else {
2420            continue;
2421        };
2422        counts
2423            .entry(prefix.clone())
2424            .and_modify(|entry| entry.0 += 1)
2425            .or_insert((1, idx));
2426    }
2427
2428    counts
2429        .into_iter()
2430        .max_by(|(_, (count_a, first_idx_a)), (_, (count_b, first_idx_b))| {
2431            count_a.cmp(count_b).then_with(|| first_idx_b.cmp(first_idx_a))
2432        })
2433        .map(|(prefix, _)| prefix)
2434        .unwrap_or_else(|| fallback.to_string())
2435}
2436
2437/// Whether a reflowed blockquote content line must carry an explicit prefix.
2438///
2439/// Lines that would start a new block structure (headings, fences, lists, etc.)
2440/// cannot safely use lazy continuation syntax.
2441pub(crate) fn should_force_explicit_blockquote_line(content_line: &str) -> bool {
2442    let trimmed = content_line.trim_start();
2443    trimmed.starts_with('>')
2444        || trimmed.starts_with('#')
2445        || trimmed.starts_with("```")
2446        || trimmed.starts_with("~~~")
2447        || is_unordered_list_marker(trimmed)
2448        || is_numbered_list_item(trimmed)
2449        || is_horizontal_rule(trimmed)
2450        || is_definition_list_item(trimmed)
2451        || (trimmed.starts_with('[') && trimmed.contains("]:"))
2452        || trimmed.starts_with(":::")
2453        || (trimmed.starts_with('<')
2454            && !trimmed.starts_with("<http")
2455            && !trimmed.starts_with("<https")
2456            && !trimmed.starts_with("<mailto:"))
2457}
2458
2459/// Reflow blockquote content lines and apply continuation style.
2460///
2461/// Segments separated by hard breaks are reflowed independently. The output lines
2462/// receive blockquote prefixes according to `continuation_style`: the first line and
2463/// any line that would start a new block structure always get an explicit prefix;
2464/// other lines follow the detected style.
2465///
2466/// Returns the styled, reflowed lines (without a trailing newline).
2467pub fn reflow_blockquote_content(
2468    lines: &[BlockquoteLineData],
2469    explicit_prefix: &str,
2470    continuation_style: BlockquoteContinuationStyle,
2471    options: &ReflowOptions,
2472) -> Vec<String> {
2473    let content_strs: Vec<&str> = lines.iter().map(|l| l.content.as_str()).collect();
2474    let segments = split_into_segments_strs(&content_strs);
2475    let mut reflowed_content_lines: Vec<String> = Vec::new();
2476
2477    for segment in segments {
2478        let hard_break_type = segment.last().and_then(|&line| {
2479            let line = line.strip_suffix('\r').unwrap_or(line);
2480            if line.ends_with('\\') {
2481                Some("\\")
2482            } else if line.ends_with("  ") {
2483                Some("  ")
2484            } else {
2485                None
2486            }
2487        });
2488
2489        let pieces: Vec<&str> = segment
2490            .iter()
2491            .map(|&line| {
2492                if let Some(l) = line.strip_suffix('\\') {
2493                    l.trim_end()
2494                } else if let Some(l) = line.strip_suffix("  ") {
2495                    l.trim_end()
2496                } else {
2497                    line.trim_end()
2498                }
2499            })
2500            .collect();
2501
2502        let segment_text = pieces.join(" ");
2503        let segment_text = segment_text.trim();
2504        if segment_text.is_empty() {
2505            continue;
2506        }
2507
2508        let mut reflowed = reflow_line(segment_text, options);
2509        if let Some(break_marker) = hard_break_type
2510            && !reflowed.is_empty()
2511        {
2512            let last_idx = reflowed.len() - 1;
2513            if !has_hard_break(&reflowed[last_idx]) {
2514                reflowed[last_idx].push_str(break_marker);
2515            }
2516        }
2517        reflowed_content_lines.extend(reflowed);
2518    }
2519
2520    let mut styled_lines: Vec<String> = Vec::new();
2521    for (idx, line) in reflowed_content_lines.iter().enumerate() {
2522        let force_explicit = idx == 0
2523            || continuation_style == BlockquoteContinuationStyle::Explicit
2524            || should_force_explicit_blockquote_line(line);
2525        if force_explicit {
2526            styled_lines.push(format!("{explicit_prefix}{line}"));
2527        } else {
2528            styled_lines.push(line.clone());
2529        }
2530    }
2531
2532    styled_lines
2533}
2534
2535fn is_blockquote_content_boundary(content: &str) -> bool {
2536    let trimmed = content.trim();
2537    trimmed.is_empty()
2538        || is_block_boundary(trimmed)
2539        || crate::utils::table_utils::TableUtils::is_potential_table_row(content)
2540        || trimmed.starts_with(":::")
2541        || crate::utils::is_template_directive_only(content)
2542        || is_standalone_attr_list(content)
2543        || is_snippet_block_delimiter(content)
2544}
2545
2546fn split_into_segments_strs<'a>(lines: &[&'a str]) -> Vec<Vec<&'a str>> {
2547    let mut segments = Vec::new();
2548    let mut current = Vec::new();
2549
2550    for &line in lines {
2551        current.push(line);
2552        if has_hard_break(line) {
2553            segments.push(current);
2554            current = Vec::new();
2555        }
2556    }
2557
2558    if !current.is_empty() {
2559        segments.push(current);
2560    }
2561
2562    segments
2563}
2564
2565fn reflow_blockquote_paragraph_at_line(
2566    content: &str,
2567    lines: &[&str],
2568    target_idx: usize,
2569    options: &ReflowOptions,
2570) -> Option<ParagraphReflow> {
2571    let mut anchor_idx = target_idx;
2572    let mut target_level = if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[target_idx]) {
2573        parsed.nesting_level
2574    } else {
2575        let mut found = None;
2576        let mut idx = target_idx;
2577        loop {
2578            if lines[idx].trim().is_empty() {
2579                break;
2580            }
2581            if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[idx]) {
2582                found = Some((idx, parsed.nesting_level));
2583                break;
2584            }
2585            if idx == 0 {
2586                break;
2587            }
2588            idx -= 1;
2589        }
2590        let (idx, level) = found?;
2591        anchor_idx = idx;
2592        level
2593    };
2594
2595    // Expand backward to capture prior quote content at the same nesting level.
2596    let mut para_start = anchor_idx;
2597    while para_start > 0 {
2598        let prev_idx = para_start - 1;
2599        let prev_line = lines[prev_idx];
2600
2601        if prev_line.trim().is_empty() {
2602            break;
2603        }
2604
2605        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(prev_line) {
2606            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2607                break;
2608            }
2609            para_start = prev_idx;
2610            continue;
2611        }
2612
2613        let prev_lazy = prev_line.trim_start();
2614        if is_blockquote_content_boundary(prev_lazy) {
2615            break;
2616        }
2617        para_start = prev_idx;
2618    }
2619
2620    // Lazy continuation cannot precede the first explicit marker.
2621    while para_start < lines.len() {
2622        let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[para_start]) else {
2623            para_start += 1;
2624            continue;
2625        };
2626        target_level = parsed.nesting_level;
2627        break;
2628    }
2629
2630    if para_start >= lines.len() || para_start > target_idx {
2631        return None;
2632    }
2633
2634    // Collect explicit lines at target level and lazy continuation lines.
2635    // Each entry is (original_line_idx, BlockquoteLineData).
2636    let mut collected: Vec<(usize, BlockquoteLineData)> = Vec::new();
2637    let mut idx = para_start;
2638    while idx < lines.len() {
2639        if !collected.is_empty() && has_hard_break(&collected[collected.len() - 1].1.content) {
2640            break;
2641        }
2642
2643        let line = lines[idx];
2644        if line.trim().is_empty() {
2645            break;
2646        }
2647
2648        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(line) {
2649            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2650                break;
2651            }
2652            collected.push((
2653                idx,
2654                BlockquoteLineData::explicit(trim_preserving_hard_break(parsed.content), parsed.prefix.to_string()),
2655            ));
2656            idx += 1;
2657            continue;
2658        }
2659
2660        let lazy_content = line.trim_start();
2661        if is_blockquote_content_boundary(lazy_content) {
2662            break;
2663        }
2664
2665        collected.push((idx, BlockquoteLineData::lazy(trim_preserving_hard_break(lazy_content))));
2666        idx += 1;
2667    }
2668
2669    if collected.is_empty() {
2670        return None;
2671    }
2672
2673    let para_end = collected[collected.len() - 1].0;
2674    if target_idx < para_start || target_idx > para_end {
2675        return None;
2676    }
2677
2678    let line_data: Vec<BlockquoteLineData> = collected.iter().map(|(_, d)| d.clone()).collect();
2679
2680    let fallback_prefix = line_data
2681        .iter()
2682        .find_map(|d| d.prefix.clone())
2683        .unwrap_or_else(|| "> ".to_string());
2684    let explicit_prefix = dominant_blockquote_prefix(&line_data, &fallback_prefix);
2685    let continuation_style = blockquote_continuation_style(&line_data);
2686
2687    let adjusted_line_length = options
2688        .line_length
2689        .saturating_sub(display_len(&explicit_prefix, options.length_mode))
2690        .max(1);
2691
2692    let adjusted_options = ReflowOptions {
2693        line_length: adjusted_line_length,
2694        ..options.clone()
2695    };
2696
2697    let styled_lines = reflow_blockquote_content(&line_data, &explicit_prefix, continuation_style, &adjusted_options);
2698
2699    if styled_lines.is_empty() {
2700        return None;
2701    }
2702
2703    // Calculate byte offsets.
2704    let mut start_byte = 0;
2705    for line in lines.iter().take(para_start) {
2706        start_byte += line.len() + 1;
2707    }
2708
2709    let mut end_byte = start_byte;
2710    for line in lines.iter().take(para_end + 1).skip(para_start) {
2711        end_byte += line.len() + 1;
2712    }
2713
2714    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2715    if !includes_trailing_newline {
2716        end_byte -= 1;
2717    }
2718
2719    let reflowed_joined = styled_lines.join("\n");
2720    let reflowed_text = if includes_trailing_newline {
2721        if reflowed_joined.ends_with('\n') {
2722            reflowed_joined
2723        } else {
2724            format!("{reflowed_joined}\n")
2725        }
2726    } else if reflowed_joined.ends_with('\n') {
2727        reflowed_joined.trim_end_matches('\n').to_string()
2728    } else {
2729        reflowed_joined
2730    };
2731
2732    Some(ParagraphReflow {
2733        start_byte,
2734        end_byte,
2735        reflowed_text,
2736    })
2737}
2738
2739/// Reflow a single paragraph at the specified line number
2740///
2741/// This function finds the paragraph containing the given line number,
2742/// reflows it according to the specified line length, and returns
2743/// information about the paragraph location and its reflowed text.
2744///
2745/// # Arguments
2746///
2747/// * `content` - The full document content
2748/// * `line_number` - The 1-based line number within the paragraph to reflow
2749/// * `line_length` - The target line length for reflowing
2750///
2751/// # Returns
2752///
2753/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
2754/// or `None` if the line number is out of bounds or the content at that
2755/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
2756pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
2757    reflow_paragraph_at_line_with_mode(content, line_number, line_length, ReflowLengthMode::default())
2758}
2759
2760/// Reflow a paragraph at the given line with a specific length mode.
2761pub fn reflow_paragraph_at_line_with_mode(
2762    content: &str,
2763    line_number: usize,
2764    line_length: usize,
2765    length_mode: ReflowLengthMode,
2766) -> Option<ParagraphReflow> {
2767    let options = ReflowOptions {
2768        line_length,
2769        length_mode,
2770        ..Default::default()
2771    };
2772    reflow_paragraph_at_line_with_options(content, line_number, &options)
2773}
2774
2775/// Reflow a paragraph at the given line using the provided options.
2776///
2777/// This is the canonical implementation used by both the rule's fix mode and the
2778/// LSP "Reflow paragraph" action. Passing a fully configured `ReflowOptions` allows
2779/// the LSP action to respect user-configured reflow mode, abbreviations, etc.
2780///
2781/// # Returns
2782///
2783/// Returns `Some(ParagraphReflow)` with byte offsets and reflowed text, or `None`
2784/// if the line is out of bounds or sits inside a non-reflow-able construct.
2785pub fn reflow_paragraph_at_line_with_options(
2786    content: &str,
2787    line_number: usize,
2788    options: &ReflowOptions,
2789) -> Option<ParagraphReflow> {
2790    if line_number == 0 {
2791        return None;
2792    }
2793
2794    let lines: Vec<&str> = content.lines().collect();
2795
2796    // Check if line number is valid (1-based)
2797    if line_number > lines.len() {
2798        return None;
2799    }
2800
2801    let target_idx = line_number - 1; // Convert to 0-based
2802    let target_line = lines[target_idx];
2803    let trimmed = target_line.trim();
2804
2805    // Handle blockquote paragraphs (including lazy continuation lines) with
2806    // style-preserving output.
2807    if let Some(blockquote_reflow) = reflow_blockquote_paragraph_at_line(content, &lines, target_idx, options) {
2808        return Some(blockquote_reflow);
2809    }
2810
2811    // Don't reflow special blocks
2812    if is_paragraph_boundary(trimmed, target_line) {
2813        return None;
2814    }
2815
2816    // Find paragraph start - scan backward until blank line or special block
2817    let mut para_start = target_idx;
2818    while para_start > 0 {
2819        let prev_idx = para_start - 1;
2820        let prev_line = lines[prev_idx];
2821        let prev_trimmed = prev_line.trim();
2822
2823        // Stop at blank line or special blocks
2824        if is_paragraph_boundary(prev_trimmed, prev_line) {
2825            break;
2826        }
2827
2828        para_start = prev_idx;
2829    }
2830
2831    // Find paragraph end - scan forward until blank line or special block
2832    let mut para_end = target_idx;
2833    while para_end + 1 < lines.len() {
2834        let next_idx = para_end + 1;
2835        let next_line = lines[next_idx];
2836        let next_trimmed = next_line.trim();
2837
2838        // Stop at blank line or special blocks
2839        if is_paragraph_boundary(next_trimmed, next_line) {
2840            break;
2841        }
2842
2843        para_end = next_idx;
2844    }
2845
2846    // Extract paragraph lines
2847    let paragraph_lines = &lines[para_start..=para_end];
2848
2849    // Calculate byte offsets
2850    let mut start_byte = 0;
2851    for line in lines.iter().take(para_start) {
2852        start_byte += line.len() + 1; // +1 for newline
2853    }
2854
2855    let mut end_byte = start_byte;
2856    for line in paragraph_lines.iter() {
2857        end_byte += line.len() + 1; // +1 for newline
2858    }
2859
2860    // Track whether the byte range includes a trailing newline
2861    // (it doesn't if this is the last line and the file doesn't end with newline)
2862    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2863
2864    // Adjust end_byte if the last line doesn't have a newline
2865    if !includes_trailing_newline {
2866        end_byte -= 1;
2867    }
2868
2869    // Join paragraph lines and reflow
2870    let paragraph_text = paragraph_lines.join("\n");
2871
2872    // Reflow the paragraph using reflow_markdown to handle it properly
2873    let reflowed = reflow_markdown(&paragraph_text, options);
2874
2875    // Ensure reflowed text matches whether the byte range includes a trailing newline
2876    // This is critical: if the range includes a newline, the replacement must too,
2877    // otherwise the next line will get appended to the reflowed paragraph
2878    let reflowed_text = if includes_trailing_newline {
2879        // Range includes newline - ensure reflowed text has one
2880        if reflowed.ends_with('\n') {
2881            reflowed
2882        } else {
2883            format!("{reflowed}\n")
2884        }
2885    } else {
2886        // Range doesn't include newline - ensure reflowed text doesn't have one
2887        if reflowed.ends_with('\n') {
2888            reflowed.trim_end_matches('\n').to_string()
2889        } else {
2890            reflowed
2891        }
2892    };
2893
2894    Some(ParagraphReflow {
2895        start_byte,
2896        end_byte,
2897        reflowed_text,
2898    })
2899}
2900
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises `text_ends_with_abbreviation` against the abbreviation set
    /// returned by `get_abbreviations(&None)` (i.e. no custom abbreviations).
    ///
    /// NOTE(review): an earlier note said broader public-API coverage lives in
    /// tests/utils/text_reflow_test.rs; confirm that is still the split.
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let known = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) followed by a period.
        let recognised = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognised {
            assert!(
                text_ends_with_abbreviation(text, &known),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Inputs that must be rejected, each paired with the reason.
        let rejected = [
            ("etc.", "not in built-in list (etc doesn't always have period)"),
            ("paradigms.", "ordinary word"),
            ("programs.", "ordinary word"),
            ("items.", "ordinary word"),
            ("systems.", "ordinary word"),
            ("Dr?", "question mark, not period"),
            ("Mr!", "exclamation, not period"),
            ("paradigms?", "question mark"),
            ("word", "no punctuation"),
            ("", "empty string"),
        ];
        for (text, reason) in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &known),
                "expected {text:?} not to end with an abbreviation ({reason})"
            );
        }
    }

    /// Checks which line prefixes `is_unordered_list_marker` accepts and rejects.
    #[test]
    fn test_is_unordered_list_marker() {
        // Each case: (input line, expected result, what the input represents).
        let cases = [
            ("- item", true, "dash marker"),
            ("* item", true, "asterisk marker"),
            ("+ item", true, "plus marker"),
            ("-", true, "lone marker"),
            ("*", true, "lone marker"),
            ("+", true, "lone marker"),
            ("---", false, "horizontal rule"),
            ("***", false, "horizontal rule"),
            ("- - -", false, "horizontal rule"),
            ("* * *", false, "horizontal rule"),
            ("*emphasis*", false, "emphasis, not list"),
            ("-word", false, "no space after marker"),
            ("", false, "empty"),
            ("text", false, "plain text"),
            ("# heading", false, "heading"),
        ];

        for (line, expected, what) in cases {
            assert_eq!(
                is_unordered_list_marker(line),
                expected,
                "is_unordered_list_marker({line:?}) [{what}]"
            );
        }
    }

    /// Checks that `is_block_boundary` flags block-level constructs and lets
    /// ordinary paragraph text through.
    #[test]
    fn test_is_block_boundary() {
        // Lines that start a new block (or are blank) and therefore end a paragraph.
        let boundaries = [
            ("", "empty line"),
            ("# Heading", "ATX heading"),
            ("## Level 2", "ATX heading"),
            ("```rust", "code fence"),
            ("~~~", "tilde code fence"),
            ("> quote", "blockquote"),
            ("| cell |", "table"),
            ("[link]: http://example.com", "reference def"),
            ("---", "horizontal rule"),
            ("***", "horizontal rule"),
            ("- item", "unordered list"),
            ("* item", "unordered list"),
            ("+ item", "unordered list"),
            ("1. item", "ordered list"),
            ("10. item", "ordered list"),
            (": definition", "definition list"),
            (":::", "div marker"),
            ("::::: {.callout-note}", "div marker with attrs"),
        ];
        for (line, what) in boundaries {
            assert!(is_block_boundary(line), "{line:?} should be a block boundary ({what})");
        }

        // Lines that continue the current paragraph.
        let continuations = [
            ("regular text", "plain text"),
            ("*emphasis*", "emphasis, not list"),
            ("[link](url)", "inline link, not reference def"),
            ("some words here", "plain text"),
        ];
        for (line, what) in continuations {
            assert!(
                !is_block_boundary(line),
                "{line:?} should not be a block boundary ({what})"
            );
        }
    }

    /// A definition-list item directly below a one-line paragraph must stay on
    /// its own line after `reflow_markdown` instead of being joined to the term.
    #[test]
    fn test_definition_list_boundary_in_single_line_paragraph() {
        let result = reflow_markdown(
            "Term\n: Definition of the term",
            &ReflowOptions {
                line_length: 80,
                ..Default::default()
            },
        );

        // The definition marker has to survive the reflow...
        assert!(
            result.contains(": Definition"),
            "Definition list item should not be merged into previous line. Got: {result:?}"
        );

        // ...and the output has to keep the original two-line layout.
        let lines = result.lines().collect::<Vec<&str>>();
        assert_eq!(lines.len(), 2, "Should remain two separate lines. Got: {lines:?}");
        assert_eq!(lines[0], "Term");
        assert_eq!(lines[1], ": Definition of the term");
    }

    /// Checks `is_paragraph_boundary`, which receives both the trimmed line and
    /// the raw line (the raw form carries the indentation).
    #[test]
    fn test_is_paragraph_boundary() {
        // Each case: (trimmed line, raw line, expected result, what is being checked).
        let cases = [
            // Core block boundary checks are inherited
            ("# Heading", "# Heading", true, "heading"),
            ("- item", "- item", true, "list item"),
            (":::", ":::", true, "div marker"),
            (": definition", ": definition", true, "definition list"),
            // Indented code blocks (>= 4 spaces or tab)
            ("code", "    code", true, "4-space indented code"),
            ("code", "\tcode", true, "tab indented code"),
            // Table rows via is_potential_table_row
            ("| a | b |", "| a | b |", true, "table row"),
            ("a | b", "a | b", true, "pipe-delimited without leading pipe"),
            // Not paragraph boundaries
            ("regular text", "regular text", false, "plain text"),
            ("text", "  text", false, "2-space indent is not code"),
        ];

        for (trimmed, raw, expected, what) in cases {
            assert_eq!(
                is_paragraph_boundary(trimmed, raw),
                expected,
                "is_paragraph_boundary({trimmed:?}, {raw:?}) [{what}]"
            );
        }
    }

    /// Asking `reflow_paragraph_at_line` to reflow at a div marker (`:::`) line
    /// must yield `None`: the marker is a boundary, not paragraph text.
    #[test]
    fn test_div_marker_boundary_in_reflow_paragraph_at_line() {
        // The third line of this document is the opening div marker.
        let document = "Some paragraph text here.\n\n::: {.callout-note}\nThis is a callout.\n:::\n";

        assert!(
            reflow_paragraph_at_line(document, 3, 80).is_none(),
            "Div marker line should not be reflowed"
        );
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs