rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::regex_cache::{
9    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
10    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
11    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
12    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
13};
14use crate::utils::sentence_utils::{
15    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
16    text_ends_with_abbreviation,
17};
18use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
19use std::collections::HashSet;
20
/// Options controlling how text is reflowed.
///
/// `Debug` is derived so option values can be logged and shown in test
/// failure output.
#[derive(Debug, Clone)]
pub struct ReflowOptions {
    /// Target line length, counted in characters.
    /// `reflow_line` treats `0` as "no wrapping" (unlimited line length).
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line
    pub sentence_per_line: bool,
    /// Whether to use semantic line breaks (cascading split strategy)
    pub semantic_line_breaks: bool,
    /// Custom abbreviations for sentence detection.
    /// Periods are optional - both "Dr" and "Dr." work the same.
    /// Custom abbreviations are always added to the built-in defaults.
    pub abbreviations: Option<Vec<String>>,
}

impl Default for ReflowOptions {
    /// Defaults: 80-character lines, sentence-aware breaking enabled, every
    /// other mode disabled, and no custom abbreviations.
    fn default() -> Self {
        Self {
            line_length: 80,
            break_on_sentences: true,
            preserve_breaks: false,
            sentence_per_line: false,
            semantic_line_breaks: false,
            abbreviations: None,
        }
    }
}
52
53/// Detect if a character position is a sentence boundary
54/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
55/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
56fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
57    let chars: Vec<char> = text.chars().collect();
58
59    if pos + 1 >= chars.len() {
60        return false;
61    }
62
63    let c = chars[pos];
64    let next_char = chars[pos + 1];
65
66    // Check for CJK sentence-ending punctuation (。, ！, ？)
67    // CJK punctuation doesn't require space or uppercase after it
68    if is_cjk_sentence_ending(c) {
69        // Skip any trailing emphasis/strikethrough markers
70        let mut after_punct_pos = pos + 1;
71        while after_punct_pos < chars.len()
72            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
73        {
74            after_punct_pos += 1;
75        }
76
77        // Skip whitespace
78        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
79            after_punct_pos += 1;
80        }
81
82        // Check if we have more content (any non-whitespace)
83        if after_punct_pos >= chars.len() {
84            return false;
85        }
86
87        // Skip leading emphasis/strikethrough markers
88        while after_punct_pos < chars.len()
89            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
90        {
91            after_punct_pos += 1;
92        }
93
94        if after_punct_pos >= chars.len() {
95            return false;
96        }
97
98        // For CJK, we accept any character as the start of the next sentence
99        // (no uppercase requirement, since CJK doesn't have case)
100        return true;
101    }
102
103    // Check for ASCII sentence-ending punctuation
104    if c != '.' && c != '!' && c != '?' {
105        return false;
106    }
107
108    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
109    let (_space_pos, after_space_pos) = if next_char == ' ' {
110        // Normal case: punctuation followed by space
111        (pos + 1, pos + 2)
112    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
113        // Sentence ends with quote - check what follows the quote
114        if chars[pos + 2] == ' ' {
115            // Just quote followed by space: 'sentence." '
116            (pos + 2, pos + 3)
117        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
118            // Quote followed by emphasis: 'sentence."* '
119            (pos + 3, pos + 4)
120        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
121            && pos + 4 < chars.len()
122            && chars[pos + 3] == chars[pos + 2]
123            && chars[pos + 4] == ' '
124        {
125            // Quote followed by bold: 'sentence."** '
126            (pos + 4, pos + 5)
127        } else {
128            return false;
129        }
130    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
131        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
132        (pos + 2, pos + 3)
133    } else if (next_char == '*' || next_char == '_')
134        && pos + 3 < chars.len()
135        && chars[pos + 2] == next_char
136        && chars[pos + 3] == ' '
137    {
138        // Sentence ends with bold: "sentence.** " or "sentence.__ "
139        (pos + 3, pos + 4)
140    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
141        // Sentence ends with strikethrough: "sentence.~~ "
142        (pos + 3, pos + 4)
143    } else {
144        return false;
145    };
146
147    // Skip all whitespace after the space to find the start of the next sentence
148    let mut next_char_pos = after_space_pos;
149    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
150        next_char_pos += 1;
151    }
152
153    // Check if we reached the end of the string
154    if next_char_pos >= chars.len() {
155        return false;
156    }
157
158    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
159    let mut first_letter_pos = next_char_pos;
160    while first_letter_pos < chars.len()
161        && (chars[first_letter_pos] == '*'
162            || chars[first_letter_pos] == '_'
163            || chars[first_letter_pos] == '~'
164            || is_opening_quote(chars[first_letter_pos]))
165    {
166        first_letter_pos += 1;
167    }
168
169    // Check if we reached the end after skipping emphasis
170    if first_letter_pos >= chars.len() {
171        return false;
172    }
173
174    // First character of next sentence must be uppercase or CJK
175    let first_char = chars[first_letter_pos];
176    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
177        return false;
178    }
179
180    // Look back to check for common abbreviations (only applies to periods)
181    if pos > 0 && c == '.' {
182        // Convert char index to byte offset for string slicing
183        let byte_offset: usize = chars[..=pos].iter().map(|ch| ch.len_utf8()).sum();
184        if text_ends_with_abbreviation(&text[..byte_offset], abbreviations) {
185            return false;
186        }
187
188        // Check for decimal numbers (e.g., "3.14")
189        // Make sure to check if first_letter_pos is within bounds
190        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
191            return false;
192        }
193    }
194    true
195}
196
197/// Split text into sentences
198pub fn split_into_sentences(text: &str) -> Vec<String> {
199    split_into_sentences_custom(text, &None)
200}
201
202/// Split text into sentences with custom abbreviations
203pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
204    let abbreviations = get_abbreviations(custom_abbreviations);
205    split_into_sentences_with_set(text, &abbreviations)
206}
207
208/// Internal function to split text into sentences with a pre-computed abbreviations set
209/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
210fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
211    let mut sentences = Vec::new();
212    let mut current_sentence = String::new();
213    let mut chars = text.chars().peekable();
214    let mut pos = 0;
215
216    while let Some(c) = chars.next() {
217        current_sentence.push(c);
218
219        if is_sentence_boundary(text, pos, abbreviations) {
220            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
221            while let Some(&next) = chars.peek() {
222                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
223                    current_sentence.push(chars.next().unwrap());
224                    pos += 1;
225                } else {
226                    break;
227                }
228            }
229
230            // Consume the space after the sentence
231            if chars.peek() == Some(&' ') {
232                chars.next();
233                pos += 1;
234            }
235
236            sentences.push(current_sentence.trim().to_string());
237            current_sentence.clear();
238        }
239
240        pos += 1;
241    }
242
243    // Add any remaining text as the last sentence
244    if !current_sentence.trim().is_empty() {
245        sentences.push(current_sentence.trim().to_string());
246    }
247    sentences
248}
249
/// Report whether `line` is a horizontal rule such as `---`, `___` or `***`.
///
/// The line must start with `-`, `_` or `*`, contain only that character and
/// spaces, and use the character at least three times.
fn is_horizontal_rule(line: &str) -> bool {
    // The first character selects the rule marker; anything else disqualifies the line.
    let marker = match line.chars().next() {
        Some(ch @ ('-' | '_' | '*')) => ch,
        _ => return false,
    };

    // Count markers in one pass, bailing out on any character that is neither
    // the marker nor a space.
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
278
/// Check if a line is a numbered list item (e.g., "1. ", "10. ").
///
/// The line must start with one or more ASCII digits followed by `". "`.
/// Only ASCII digits count: Markdown ordered-list markers are written with
/// the digits 0-9, so other Unicode numerics (`½`, `②`, `٣`, ...) at the start
/// of a line are ordinary paragraph text.
///
/// A period must be followed by a space; "2019." alone is NOT treated as a
/// list item to avoid false positives.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are single bytes, so the count is also a valid byte offset.
    let digit_count = line.bytes().take_while(u8::is_ascii_digit).count();
    digit_count > 0 && line[digit_count..].starts_with(". ")
}
302
303/// Check if a trimmed line is an unordered list item (-, *, + followed by space)
304fn is_unordered_list_marker(s: &str) -> bool {
305    matches!(s.as_bytes().first(), Some(b'-' | b'*' | b'+'))
306        && !is_horizontal_rule(s)
307        && (s.len() == 1 || s.as_bytes().get(1) == Some(&b' '))
308}
309
310/// Shared structural checks for block boundary detection.
311/// Checks elements that only depend on the trimmed line content.
312fn is_block_boundary_core(trimmed: &str) -> bool {
313    trimmed.is_empty()
314        || trimmed.starts_with('#')
315        || trimmed.starts_with("```")
316        || trimmed.starts_with("~~~")
317        || trimmed.starts_with('>')
318        || (trimmed.starts_with('[') && trimmed.contains("]:"))
319        || is_horizontal_rule(trimmed)
320        || is_unordered_list_marker(trimmed)
321        || is_numbered_list_item(trimmed)
322        || is_definition_list_item(trimmed)
323        || trimmed.starts_with(":::")
324}
325
326/// Check if a trimmed line starts a new structural block element.
327/// Used for paragraph boundary detection in `reflow_markdown()`.
328fn is_block_boundary(trimmed: &str) -> bool {
329    is_block_boundary_core(trimmed) || trimmed.starts_with('|')
330}
331
332/// Check if a line starts a new structural block for paragraph boundary detection
333/// in `reflow_paragraph_at_line()`. Extends the core checks with indented code blocks
334/// (≥4 spaces) and table row detection via `is_potential_table_row`.
335fn is_paragraph_boundary(trimmed: &str, line: &str) -> bool {
336    is_block_boundary_core(trimmed)
337        || ElementCache::calculate_indentation_width_default(line) >= 4
338        || crate::utils::table_utils::TableUtils::is_potential_table_row(line)
339}
340
/// Report whether a line ends with a hard line break.
///
/// CommonMark supports two formats for hard line breaks:
/// 1. Two or more trailing spaces
/// 2. A backslash at the end of the line
///
/// A single trailing `\r` (CRLF input) is ignored before checking.
fn has_hard_break(line: &str) -> bool {
    let content = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };

    // Space and backslash are single ASCII bytes, so inspecting the byte tail
    // is equivalent to a string suffix test.
    matches!(content.as_bytes(), [.., b' ', b' '] | [.., b'\\'])
}
350
/// Report whether `text` ends with ASCII sentence punctuation (`.`, `!` or `?`).
fn ends_with_sentence_punct(text: &str) -> bool {
    // All three terminators are single ASCII bytes, so the last byte decides.
    matches!(text.as_bytes().last(), Some(b'.' | b'!' | b'?'))
}
355
/// Strip trailing whitespace from a line while keeping a hard break intact.
///
/// Hard breaks in Markdown can be indicated by:
/// 1. Two trailing spaces before a newline (traditional) - normalised to
///    exactly two spaces
/// 2. A backslash at the end of the line (mdformat style) - kept untouched
///
/// A single trailing `\r` from CRLF line endings is removed first. A line made
/// up solely of whitespace yields an empty string.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: return the line exactly as it stands.
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();

    // Without a two-space hard break, or with nothing but whitespace, the
    // trimmed content is the answer.
    if !line.ends_with("  ") || content.is_empty() {
        return content.to_owned();
    }

    // Re-attach exactly two spaces to keep the hard break.
    let mut kept = String::with_capacity(content.len() + 2);
    kept.push_str(content);
    kept.push_str("  ");
    kept
}
386
387pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
388    // For sentence-per-line mode, always process regardless of length
389    if options.sentence_per_line {
390        let elements = parse_markdown_elements(line);
391        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
392    }
393
394    // For semantic line breaks mode, use cascading split strategy
395    if options.semantic_line_breaks {
396        let elements = parse_markdown_elements(line);
397        return reflow_elements_semantic(&elements, options);
398    }
399
400    // Quick check: if line is already short enough or no wrapping requested, return as-is
401    // line_length = 0 means no wrapping (unlimited line length)
402    if options.line_length == 0 || line.chars().count() <= options.line_length {
403        return vec![line.to_string()];
404    }
405
406    // Parse the markdown to identify elements
407    let elements = parse_markdown_elements(line);
408
409    // Reflow the elements into lines
410    reflow_elements(&elements, options)
411}
412
/// Where the image inside a linked image takes its source from
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Image given by an inline URL: ![alt](url)
    Inline(String),
    /// Image given by a reference label: ![alt][ref]
    Reference(String),
}

/// Where the link wrapped around a linked image points to
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Link given by an inline URL: ](url)
    Inline(String),
    /// Link given by a reference label: ][ref]
    Reference(String),
}

/// One piece of inline Markdown content.
///
/// `Display` renders an element back to its Markdown source form and `len`
/// reports the character count of that rendering.
#[derive(Debug, Clone)]
enum Element {
    /// Plain text that can be wrapped
    Text(String),
    /// Inline link: [text](url)
    Link { text: String, url: String },
    /// Reference link: [text][ref]
    ReferenceLink { text: String, reference: String },
    /// Empty reference link: [text][]
    EmptyReferenceLink { text: String },
    /// Shortcut reference link: [ref]
    ShortcutReference { reference: String },
    /// Inline image: ![alt](url)
    InlineImage { alt: String, url: String },
    /// Reference image: ![alt][ref]
    ReferenceImage { alt: String, reference: String },
    /// Empty reference image: ![alt][]
    EmptyReferenceImage { alt: String },
    /// A clickable image badge in any of 4 forms:
    /// - [![alt](img-url)](link-url)
    /// - [![alt][img-ref]](link-url)
    /// - [![alt](img-url)][link-ref]
    /// - [![alt][img-ref]][link-ref]
    LinkedImage {
        alt: String,
        img_source: LinkedImageSource,
        link_target: LinkedImageTarget,
    },
    /// Footnote reference: [^note]
    FootnoteReference { note: String },
    /// Strikethrough text: ~~text~~
    Strikethrough(String),
    /// Wiki-style link: [[wiki]] or [[wiki|text]]
    WikiLink(String),
    /// Inline math: $math$
    InlineMath(String),
    /// Display math: $$math$$
    DisplayMath(String),
    /// Emoji shortcode: :emoji:
    EmojiShortcode(String),
    /// HTML tag, stored with its angle brackets: <tag>, </tag> or <tag/>
    HtmlTag(String),
    /// HTML entity, stored complete: &nbsp; or &#123;
    HtmlEntity(String),
    /// Hugo/Go template shortcode, stored complete: {{< ... >}} or {{% ... %}}
    HugoShortcode(String),
    /// Inline code: `code`
    Code(String),
    /// Bold text: **text** or __text__
    Bold {
        content: String,
        /// True if underscore markers (__), false for asterisks (**)
        underscore: bool,
    },
    /// Italic text: *text* or _text_
    Italic {
        content: String,
        /// True if underscore marker (_), false for asterisk (*)
        underscore: bool,
    },
}

impl std::fmt::Display for Element {
    /// Render the element back into its Markdown source syntax.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Variants that already hold their complete source text.
            Element::Text(raw)
            | Element::HtmlTag(raw)
            | Element::HtmlEntity(raw)
            | Element::HugoShortcode(raw) => f.write_str(raw),

            // Links
            Element::Link { text, url } => write!(f, "[{text}]({url})"),
            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
            Element::WikiLink(target) => write!(f, "[[{target}]]"),

            // Images
            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
            Element::LinkedImage {
                alt,
                img_source,
                link_target,
            } => {
                // Outer `[`, then the image, then the link tail including the closing `]`.
                f.write_str("[")?;
                match img_source {
                    LinkedImageSource::Inline(url) => write!(f, "![{alt}]({url})")?,
                    LinkedImageSource::Reference(label) => write!(f, "![{alt}][{label}]")?,
                }
                match link_target {
                    LinkedImageTarget::Inline(url) => write!(f, "]({url})"),
                    LinkedImageTarget::Reference(label) => write!(f, "][{label}]"),
                }
            }

            // Symmetrically delimited spans
            Element::Strikethrough(body) => write!(f, "~~{body}~~"),
            Element::InlineMath(body) => write!(f, "${body}$"),
            Element::DisplayMath(body) => write!(f, "$${body}$$"),
            Element::EmojiShortcode(name) => write!(f, ":{name}:"),
            Element::Code(body) => write!(f, "`{body}`"),
            Element::Bold { content, underscore } => {
                let marker = if *underscore { "__" } else { "**" };
                write!(f, "{marker}{content}{marker}")
            }
            Element::Italic { content, underscore } => {
                let marker = if *underscore { "_" } else { "*" };
                write!(f, "{marker}{content}{marker}")
            }
        }
    }
}

impl Element {
    /// Length in characters of the element's Markdown rendering.
    fn len(&self) -> usize {
        /// Character count of a payload string.
        fn width(s: &str) -> usize {
            s.chars().count()
        }

        match self {
            // Stored complete: nothing to add.
            Element::Text(raw)
            | Element::HtmlTag(raw)
            | Element::HtmlEntity(raw)
            | Element::HugoShortcode(raw) => width(raw),

            // `[` `]` `(` `)` or `[` `]` `[` `]`
            Element::Link { text, url } => 4 + width(text) + width(url),
            Element::ReferenceLink { text, reference } => 4 + width(text) + width(reference),
            Element::EmptyReferenceLink { text } => 4 + width(text),
            // `[` `]`
            Element::ShortcutReference { reference } => 2 + width(reference),

            // Same as the link forms plus the leading `!`
            Element::InlineImage { alt, url } => 5 + width(alt) + width(url),
            Element::ReferenceImage { alt, reference } => 5 + width(alt) + width(reference),
            Element::EmptyReferenceImage { alt } => 5 + width(alt),

            Element::LinkedImage {
                alt,
                img_source,
                link_target,
            } => {
                // Each source/target contributes its payload plus one pair of
                // delimiters: `(url)` or `[ref]`.
                let image_part = 2 + match img_source {
                    LinkedImageSource::Inline(url) => width(url),
                    LinkedImageSource::Reference(label) => width(label),
                };
                let link_part = 2 + match link_target {
                    LinkedImageTarget::Inline(url) => width(url),
                    LinkedImageTarget::Reference(label) => width(label),
                };
                // Fixed characters: `[`, `!`, `[`, `]` around alt, and the outer `]`.
                5 + width(alt) + image_part + link_part
            }

            // `[^` `]`
            Element::FootnoteReference { note } => 3 + width(note),
            // Two-character delimiter on each side
            Element::Strikethrough(body) | Element::WikiLink(body) | Element::DisplayMath(body) => 4 + width(body),
            Element::Bold { content, .. } => 4 + width(content),
            // One-character delimiter on each side
            Element::InlineMath(body) | Element::EmojiShortcode(body) | Element::Code(body) => 2 + width(body),
            Element::Italic { content, .. } => 2 + width(content),
        }
    }
}
595
596/// An emphasis or formatting span parsed by pulldown-cmark
597#[derive(Debug, Clone)]
598struct EmphasisSpan {
599    /// Byte offset where the emphasis starts (including markers)
600    start: usize,
601    /// Byte offset where the emphasis ends (after closing markers)
602    end: usize,
603    /// The content inside the emphasis markers
604    content: String,
605    /// Whether this is strong (bold) emphasis
606    is_strong: bool,
607    /// Whether this is strikethrough (~~text~~)
608    is_strikethrough: bool,
609    /// Whether the original used underscore markers (for emphasis only)
610    uses_underscore: bool,
611}
612
613/// Extract emphasis and strikethrough spans from text using pulldown-cmark
614///
615/// This provides CommonMark-compliant emphasis parsing, correctly handling:
616/// - Nested emphasis like `*text **bold** more*`
617/// - Left/right flanking delimiter rules
618/// - Underscore vs asterisk markers
619/// - GFM strikethrough (~~text~~)
620///
621/// Returns spans sorted by start position.
622fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
623    let mut spans = Vec::new();
624    let mut options = Options::empty();
625    options.insert(Options::ENABLE_STRIKETHROUGH);
626
627    // Stacks to track nested formatting with their start positions
628    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
629    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
630    let mut strikethrough_stack: Vec<usize> = Vec::new();
631
632    let parser = Parser::new_ext(text, options).into_offset_iter();
633
634    for (event, range) in parser {
635        match event {
636            Event::Start(Tag::Emphasis) => {
637                // Check if this uses underscore by looking at the original text
638                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
639                emphasis_stack.push((range.start, uses_underscore));
640            }
641            Event::End(TagEnd::Emphasis) => {
642                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
643                    // Extract content between the markers (1 char marker on each side)
644                    let content_start = start_byte + 1;
645                    let content_end = range.end - 1;
646                    if content_end > content_start
647                        && let Some(content) = text.get(content_start..content_end)
648                    {
649                        spans.push(EmphasisSpan {
650                            start: start_byte,
651                            end: range.end,
652                            content: content.to_string(),
653                            is_strong: false,
654                            is_strikethrough: false,
655                            uses_underscore,
656                        });
657                    }
658                }
659            }
660            Event::Start(Tag::Strong) => {
661                // Check if this uses underscore by looking at the original text
662                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
663                strong_stack.push((range.start, uses_underscore));
664            }
665            Event::End(TagEnd::Strong) => {
666                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
667                    // Extract content between the markers (2 char marker on each side)
668                    let content_start = start_byte + 2;
669                    let content_end = range.end - 2;
670                    if content_end > content_start
671                        && let Some(content) = text.get(content_start..content_end)
672                    {
673                        spans.push(EmphasisSpan {
674                            start: start_byte,
675                            end: range.end,
676                            content: content.to_string(),
677                            is_strong: true,
678                            is_strikethrough: false,
679                            uses_underscore,
680                        });
681                    }
682                }
683            }
684            Event::Start(Tag::Strikethrough) => {
685                strikethrough_stack.push(range.start);
686            }
687            Event::End(TagEnd::Strikethrough) => {
688                if let Some(start_byte) = strikethrough_stack.pop() {
689                    // Extract content between the ~~ markers (2 char marker on each side)
690                    let content_start = start_byte + 2;
691                    let content_end = range.end - 2;
692                    if content_end > content_start
693                        && let Some(content) = text.get(content_start..content_end)
694                    {
695                        spans.push(EmphasisSpan {
696                            start: start_byte,
697                            end: range.end,
698                            content: content.to_string(),
699                            is_strong: false,
700                            is_strikethrough: true,
701                            uses_underscore: false,
702                        });
703                    }
704                }
705            }
706            _ => {}
707        }
708    }
709
710    // Sort by start position
711    spans.sort_by_key(|s| s.start);
712    spans
713}
714
715/// Parse markdown elements from text preserving the raw syntax
716///
717/// Detection order is critical:
718/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
719/// 2. Inline images ![alt](url) - before links to handle ! prefix
720/// 3. Reference images ![alt][ref] - before reference links
721/// 4. Inline links [text](url) - before reference links
722/// 5. Reference links [text][ref] - before shortcut references
723/// 6. Shortcut reference links [ref] - detected last to avoid false positives
724/// 7. Other elements (code, bold, italic, etc.) - processed normally
725fn parse_markdown_elements(text: &str) -> Vec<Element> {
726    let mut elements = Vec::new();
727    let mut remaining = text;
728
729    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
730    let emphasis_spans = extract_emphasis_spans(text);
731
732    while !remaining.is_empty() {
733        // Calculate current byte offset in original text
734        let current_offset = text.len() - remaining.len();
735        // Find the earliest occurrence of any markdown pattern
736        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
737
738        // Check for linked images FIRST (all 4 variants)
739        // Quick literal check: only run expensive regexes if we might have a linked image
740        // Pattern starts with "[!" so check for that first
741        if remaining.contains("[!") {
742            // Pattern 1: [![alt](img)](link) - inline image in inline link
743            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
744                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
745            {
746                earliest_match = Some((m.start(), "linked_image_ii", m));
747            }
748
749            // Pattern 2: [![alt][ref]](link) - reference image in inline link
750            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
751                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
752            {
753                earliest_match = Some((m.start(), "linked_image_ri", m));
754            }
755
756            // Pattern 3: [![alt](img)][ref] - inline image in reference link
757            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
758                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
759            {
760                earliest_match = Some((m.start(), "linked_image_ir", m));
761            }
762
763            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
764            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
765                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
766            {
767                earliest_match = Some((m.start(), "linked_image_rr", m));
768            }
769        }
770
771        // Check for images (they start with ! so should be detected before links)
772        // Inline images - ![alt](url)
773        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
774            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
775        {
776            earliest_match = Some((m.start(), "inline_image", m));
777        }
778
779        // Reference images - ![alt][ref]
780        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
781            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
782        {
783            earliest_match = Some((m.start(), "ref_image", m));
784        }
785
786        // Check for footnote references - [^note]
787        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
788            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
789        {
790            earliest_match = Some((m.start(), "footnote_ref", m));
791        }
792
793        // Check for inline links - [text](url)
794        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
795            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
796        {
797            earliest_match = Some((m.start(), "inline_link", m));
798        }
799
800        // Check for reference links - [text][ref]
801        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
802            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
803        {
804            earliest_match = Some((m.start(), "ref_link", m));
805        }
806
807        // Check for shortcut reference links - [ref]
808        // Only check if we haven't found an earlier pattern that would conflict
809        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
810            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
811        {
812            earliest_match = Some((m.start(), "shortcut_ref", m));
813        }
814
815        // Check for wiki-style links - [[wiki]]
816        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
817            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
818        {
819            earliest_match = Some((m.start(), "wiki_link", m));
820        }
821
822        // Check for display math first (before inline) - $$math$$
823        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
824            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
825        {
826            earliest_match = Some((m.start(), "display_math", m));
827        }
828
829        // Check for inline math - $math$
830        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
831            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
832        {
833            earliest_match = Some((m.start(), "inline_math", m));
834        }
835
836        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
837
838        // Check for emoji shortcodes - :emoji:
839        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
840            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
841        {
842            earliest_match = Some((m.start(), "emoji", m));
843        }
844
845        // Check for HTML entities - &nbsp; etc
846        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
847            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
848        {
849            earliest_match = Some((m.start(), "html_entity", m));
850        }
851
852        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
853        // Must be checked before other patterns to avoid false sentence breaks
854        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
855            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
856        {
857            earliest_match = Some((m.start(), "hugo_shortcode", m));
858        }
859
860        // Check for HTML tags - <tag> </tag> <tag/>
861        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
862        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
863            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
864        {
865            // Check if this is an autolink (starts with protocol or mailto:)
866            let matched_text = &remaining[m.start()..m.end()];
867            let is_url_autolink = matched_text.starts_with("<http://")
868                || matched_text.starts_with("<https://")
869                || matched_text.starts_with("<mailto:")
870                || matched_text.starts_with("<ftp://")
871                || matched_text.starts_with("<ftps://");
872
873            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
874            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
875            let is_email_autolink = {
876                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
877                EMAIL_PATTERN.is_match(content)
878            };
879
880            if !is_url_autolink && !is_email_autolink {
881                earliest_match = Some((m.start(), "html_tag", m));
882            }
883        }
884
885        // Find earliest non-link special characters
886        let mut next_special = remaining.len();
887        let mut special_type = "";
888        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
889
890        // Check for code spans (not handled by pulldown-cmark in this context)
891        if let Some(pos) = remaining.find('`')
892            && pos < next_special
893        {
894            next_special = pos;
895            special_type = "code";
896        }
897
898        // Check for emphasis using pulldown-cmark's pre-extracted spans
899        // Find the earliest emphasis span that starts within remaining text
900        for span in &emphasis_spans {
901            if span.start >= current_offset && span.start < current_offset + remaining.len() {
902                let pos_in_remaining = span.start - current_offset;
903                if pos_in_remaining < next_special {
904                    next_special = pos_in_remaining;
905                    special_type = "pulldown_emphasis";
906                    pulldown_emphasis = Some(span);
907                }
908                break; // Spans are sorted by start position, so first match is earliest
909            }
910        }
911
912        // Determine which pattern to process first
913        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
914            pos < next_special
915        } else {
916            false
917        };
918
919        if should_process_markdown_link {
920            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
921
922            // Add any text before the match
923            if pos > 0 {
924                elements.push(Element::Text(remaining[..pos].to_string()));
925            }
926
927            // Process the matched pattern
928            match pattern_type {
929                // Pattern 1: [![alt](img)](link) - inline image in inline link
930                "linked_image_ii" => {
931                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
932                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
933                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
934                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
935                        elements.push(Element::LinkedImage {
936                            alt: alt.to_string(),
937                            img_source: LinkedImageSource::Inline(img_url.to_string()),
938                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
939                        });
940                        remaining = &remaining[match_obj.end()..];
941                    } else {
942                        elements.push(Element::Text("[".to_string()));
943                        remaining = &remaining[1..];
944                    }
945                }
946                // Pattern 2: [![alt][ref]](link) - reference image in inline link
947                "linked_image_ri" => {
948                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
949                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
950                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
951                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
952                        elements.push(Element::LinkedImage {
953                            alt: alt.to_string(),
954                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
955                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
956                        });
957                        remaining = &remaining[match_obj.end()..];
958                    } else {
959                        elements.push(Element::Text("[".to_string()));
960                        remaining = &remaining[1..];
961                    }
962                }
963                // Pattern 3: [![alt](img)][ref] - inline image in reference link
964                "linked_image_ir" => {
965                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
966                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
967                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
968                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
969                        elements.push(Element::LinkedImage {
970                            alt: alt.to_string(),
971                            img_source: LinkedImageSource::Inline(img_url.to_string()),
972                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
973                        });
974                        remaining = &remaining[match_obj.end()..];
975                    } else {
976                        elements.push(Element::Text("[".to_string()));
977                        remaining = &remaining[1..];
978                    }
979                }
980                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
981                "linked_image_rr" => {
982                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
983                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
984                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
985                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
986                        elements.push(Element::LinkedImage {
987                            alt: alt.to_string(),
988                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
989                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
990                        });
991                        remaining = &remaining[match_obj.end()..];
992                    } else {
993                        elements.push(Element::Text("[".to_string()));
994                        remaining = &remaining[1..];
995                    }
996                }
997                "inline_image" => {
998                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
999                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1000                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1001                        elements.push(Element::InlineImage {
1002                            alt: alt.to_string(),
1003                            url: url.to_string(),
1004                        });
1005                        remaining = &remaining[match_obj.end()..];
1006                    } else {
1007                        elements.push(Element::Text("!".to_string()));
1008                        remaining = &remaining[1..];
1009                    }
1010                }
1011                "ref_image" => {
1012                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
1013                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1014                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1015
1016                        if reference.is_empty() {
1017                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
1018                        } else {
1019                            elements.push(Element::ReferenceImage {
1020                                alt: alt.to_string(),
1021                                reference: reference.to_string(),
1022                            });
1023                        }
1024                        remaining = &remaining[match_obj.end()..];
1025                    } else {
1026                        elements.push(Element::Text("!".to_string()));
1027                        remaining = &remaining[1..];
1028                    }
1029                }
1030                "footnote_ref" => {
1031                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
1032                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1033                        elements.push(Element::FootnoteReference { note: note.to_string() });
1034                        remaining = &remaining[match_obj.end()..];
1035                    } else {
1036                        elements.push(Element::Text("[".to_string()));
1037                        remaining = &remaining[1..];
1038                    }
1039                }
1040                "inline_link" => {
1041                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
1042                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1043                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1044                        elements.push(Element::Link {
1045                            text: text.to_string(),
1046                            url: url.to_string(),
1047                        });
1048                        remaining = &remaining[match_obj.end()..];
1049                    } else {
1050                        // Fallback - shouldn't happen
1051                        elements.push(Element::Text("[".to_string()));
1052                        remaining = &remaining[1..];
1053                    }
1054                }
1055                "ref_link" => {
1056                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1057                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1058                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1059
1060                        if reference.is_empty() {
1061                            // Empty reference link [text][]
1062                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1063                        } else {
1064                            // Regular reference link [text][ref]
1065                            elements.push(Element::ReferenceLink {
1066                                text: text.to_string(),
1067                                reference: reference.to_string(),
1068                            });
1069                        }
1070                        remaining = &remaining[match_obj.end()..];
1071                    } else {
1072                        // Fallback - shouldn't happen
1073                        elements.push(Element::Text("[".to_string()));
1074                        remaining = &remaining[1..];
1075                    }
1076                }
1077                "shortcut_ref" => {
1078                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1079                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1080                        elements.push(Element::ShortcutReference {
1081                            reference: reference.to_string(),
1082                        });
1083                        remaining = &remaining[match_obj.end()..];
1084                    } else {
1085                        // Fallback - shouldn't happen
1086                        elements.push(Element::Text("[".to_string()));
1087                        remaining = &remaining[1..];
1088                    }
1089                }
1090                "wiki_link" => {
1091                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1092                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1093                        elements.push(Element::WikiLink(content.to_string()));
1094                        remaining = &remaining[match_obj.end()..];
1095                    } else {
1096                        elements.push(Element::Text("[[".to_string()));
1097                        remaining = &remaining[2..];
1098                    }
1099                }
1100                "display_math" => {
1101                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1102                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1103                        elements.push(Element::DisplayMath(math.to_string()));
1104                        remaining = &remaining[match_obj.end()..];
1105                    } else {
1106                        elements.push(Element::Text("$$".to_string()));
1107                        remaining = &remaining[2..];
1108                    }
1109                }
1110                "inline_math" => {
1111                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1112                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1113                        elements.push(Element::InlineMath(math.to_string()));
1114                        remaining = &remaining[match_obj.end()..];
1115                    } else {
1116                        elements.push(Element::Text("$".to_string()));
1117                        remaining = &remaining[1..];
1118                    }
1119                }
1120                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1121                "emoji" => {
1122                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1123                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1124                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1125                        remaining = &remaining[match_obj.end()..];
1126                    } else {
1127                        elements.push(Element::Text(":".to_string()));
1128                        remaining = &remaining[1..];
1129                    }
1130                }
1131                "html_entity" => {
1132                    // HTML entities are captured whole - use as_str() to get just the matched content
1133                    elements.push(Element::HtmlEntity(match_obj.as_str().to_string()));
1134                    remaining = &remaining[match_obj.end()..];
1135                }
1136                "hugo_shortcode" => {
1137                    // Hugo shortcodes are atomic elements - preserve them exactly
1138                    elements.push(Element::HugoShortcode(match_obj.as_str().to_string()));
1139                    remaining = &remaining[match_obj.end()..];
1140                }
1141                "html_tag" => {
1142                    // HTML tags are captured whole - use as_str() to get just the matched content
1143                    elements.push(Element::HtmlTag(match_obj.as_str().to_string()));
1144                    remaining = &remaining[match_obj.end()..];
1145                }
1146                _ => {
1147                    // Unknown pattern, treat as text
1148                    elements.push(Element::Text("[".to_string()));
1149                    remaining = &remaining[1..];
1150                }
1151            }
1152        } else {
1153            // Process non-link special characters
1154
1155            // Add any text before the special character
1156            if next_special > 0 && next_special < remaining.len() {
1157                elements.push(Element::Text(remaining[..next_special].to_string()));
1158                remaining = &remaining[next_special..];
1159            }
1160
1161            // Process the special element
1162            match special_type {
1163                "code" => {
1164                    // Find end of code
1165                    if let Some(code_end) = remaining[1..].find('`') {
1166                        let code = &remaining[1..1 + code_end];
1167                        elements.push(Element::Code(code.to_string()));
1168                        remaining = &remaining[1 + code_end + 1..];
1169                    } else {
1170                        // No closing backtick, treat as text
1171                        elements.push(Element::Text(remaining.to_string()));
1172                        break;
1173                    }
1174                }
1175                "pulldown_emphasis" => {
1176                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1177                    if let Some(span) = pulldown_emphasis {
1178                        let span_len = span.end - span.start;
1179                        if span.is_strikethrough {
1180                            elements.push(Element::Strikethrough(span.content.clone()));
1181                        } else if span.is_strong {
1182                            elements.push(Element::Bold {
1183                                content: span.content.clone(),
1184                                underscore: span.uses_underscore,
1185                            });
1186                        } else {
1187                            elements.push(Element::Italic {
1188                                content: span.content.clone(),
1189                                underscore: span.uses_underscore,
1190                            });
1191                        }
1192                        remaining = &remaining[span_len..];
1193                    } else {
1194                        // Fallback - shouldn't happen
1195                        elements.push(Element::Text(remaining[..1].to_string()));
1196                        remaining = &remaining[1..];
1197                    }
1198                }
1199                _ => {
1200                    // No special elements found, add all remaining text
1201                    elements.push(Element::Text(remaining.to_string()));
1202                    break;
1203                }
1204            }
1205        }
1206    }
1207
1208    elements
1209}
1210
1211/// Reflow elements for sentence-per-line mode
1212fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1213    let abbreviations = get_abbreviations(custom_abbreviations);
1214    let mut lines = Vec::new();
1215    let mut current_line = String::new();
1216
1217    for (idx, element) in elements.iter().enumerate() {
1218        let element_str = format!("{element}");
1219
1220        // For text elements, split into sentences
1221        if let Element::Text(text) = element {
1222            // Simply append text - it already has correct spacing from tokenization
1223            let combined = format!("{current_line}{text}");
1224            // Use the pre-computed abbreviations set to avoid redundant computation
1225            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1226
1227            if sentences.len() > 1 {
1228                // We found sentence boundaries
1229                for (i, sentence) in sentences.iter().enumerate() {
1230                    if i == 0 {
1231                        // First sentence might continue from previous elements
1232                        // But check if it ends with an abbreviation
1233                        let trimmed = sentence.trim();
1234
1235                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1236                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1237                            current_line = sentence.to_string();
1238                        } else {
1239                            // Normal case - emit the first sentence
1240                            lines.push(sentence.to_string());
1241                            current_line.clear();
1242                        }
1243                    } else if i == sentences.len() - 1 {
1244                        // Last sentence: check if it's complete or incomplete
1245                        let trimmed = sentence.trim();
1246                        let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1247
1248                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1249                            // Complete sentence - emit it immediately
1250                            lines.push(sentence.to_string());
1251                            current_line.clear();
1252                        } else {
1253                            // Incomplete sentence - save for next iteration
1254                            current_line = sentence.to_string();
1255                        }
1256                    } else {
1257                        // Complete sentences in the middle
1258                        lines.push(sentence.to_string());
1259                    }
1260                }
1261            } else {
1262                // Single sentence - check if it's complete
1263                let trimmed = combined.trim();
1264
1265                // If the combined result is only whitespace, don't accumulate it.
1266                // This prevents leading spaces on subsequent elements when lines
1267                // are joined with spaces during reflow iteration.
1268                if trimmed.is_empty() {
1269                    continue;
1270                }
1271
1272                let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1273
1274                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1275                    // Complete single sentence - emit it
1276                    lines.push(trimmed.to_string());
1277                    current_line.clear();
1278                } else {
1279                    // Incomplete sentence - continue accumulating
1280                    current_line = combined;
1281                }
1282            }
1283        } else if let Element::Italic { content, underscore } = element {
1284            // Handle italic elements - may contain multiple sentences that need continuation
1285            let marker = if *underscore { "_" } else { "*" };
1286            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1287        } else if let Element::Bold { content, underscore } = element {
1288            // Handle bold elements - may contain multiple sentences that need continuation
1289            let marker = if *underscore { "__" } else { "**" };
1290            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1291        } else if let Element::Strikethrough(content) = element {
1292            // Handle strikethrough elements - may contain multiple sentences that need continuation
1293            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1294        } else {
1295            // Non-text, non-emphasis elements (Code, Links, etc.)
1296            // Check if this element is adjacent to the preceding text (no space between)
1297            let is_adjacent = if idx > 0 {
1298                match &elements[idx - 1] {
1299                    Element::Text(t) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1300                    _ => true,
1301                }
1302            } else {
1303                false
1304            };
1305
1306            // Add space before element if needed, but not for adjacent elements
1307            if !is_adjacent
1308                && !current_line.is_empty()
1309                && !current_line.ends_with(' ')
1310                && !current_line.ends_with('(')
1311                && !current_line.ends_with('[')
1312            {
1313                current_line.push(' ');
1314            }
1315            current_line.push_str(&element_str);
1316        }
1317    }
1318
1319    // Add any remaining content
1320    if !current_line.is_empty() {
1321        lines.push(current_line.trim().to_string());
1322    }
1323    lines
1324}
1325
1326/// Handle splitting emphasis content at sentence boundaries while preserving markers
1327fn handle_emphasis_sentence_split(
1328    content: &str,
1329    marker: &str,
1330    abbreviations: &HashSet<String>,
1331    current_line: &mut String,
1332    lines: &mut Vec<String>,
1333) {
1334    // Split the emphasis content into sentences
1335    let sentences = split_into_sentences_with_set(content, abbreviations);
1336
1337    if sentences.len() <= 1 {
1338        // Single sentence or no boundaries - treat as atomic
1339        if !current_line.is_empty()
1340            && !current_line.ends_with(' ')
1341            && !current_line.ends_with('(')
1342            && !current_line.ends_with('[')
1343        {
1344            current_line.push(' ');
1345        }
1346        current_line.push_str(marker);
1347        current_line.push_str(content);
1348        current_line.push_str(marker);
1349
1350        // Check if the emphasis content ends with sentence punctuation - if so, emit
1351        let trimmed = content.trim();
1352        let ends_with_punct = ends_with_sentence_punct(trimmed);
1353        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1354            lines.push(current_line.clone());
1355            current_line.clear();
1356        }
1357    } else {
1358        // Multiple sentences - each gets its own emphasis markers
1359        for (i, sentence) in sentences.iter().enumerate() {
1360            let trimmed = sentence.trim();
1361            if trimmed.is_empty() {
1362                continue;
1363            }
1364
1365            if i == 0 {
1366                // First sentence: combine with current_line and emit
1367                if !current_line.is_empty()
1368                    && !current_line.ends_with(' ')
1369                    && !current_line.ends_with('(')
1370                    && !current_line.ends_with('[')
1371                {
1372                    current_line.push(' ');
1373                }
1374                current_line.push_str(marker);
1375                current_line.push_str(trimmed);
1376                current_line.push_str(marker);
1377
1378                // Check if this is a complete sentence
1379                let ends_with_punct = ends_with_sentence_punct(trimmed);
1380                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1381                    lines.push(current_line.clone());
1382                    current_line.clear();
1383                }
1384            } else if i == sentences.len() - 1 {
1385                // Last sentence: check if complete
1386                let ends_with_punct = ends_with_sentence_punct(trimmed);
1387
1388                let mut line = String::new();
1389                line.push_str(marker);
1390                line.push_str(trimmed);
1391                line.push_str(marker);
1392
1393                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1394                    lines.push(line);
1395                } else {
1396                    // Incomplete - keep in current_line for potential continuation
1397                    *current_line = line;
1398                }
1399            } else {
1400                // Middle sentences: emit with markers
1401                let mut line = String::new();
1402                line.push_str(marker);
1403                line.push_str(trimmed);
1404                line.push_str(marker);
1405                lines.push(line);
1406            }
1407        }
1408    }
1409}
1410
/// Lowercase English words in front of which a semantic line break reads naturally.
///
/// The list mixes conjunctions, relative pronouns and linking adverbs; it is consulted
/// when a line has to be split and no clause punctuation is available.
#[rustfmt::skip]
const BREAK_WORDS: &[&str] = &[
    // Coordinating conjunctions
    "and", "or", "but", "nor", "yet", "so", "for",
    // Relative pronouns
    "which", "that",
    // Subordinating conjunctions
    "because", "when", "if", "while", "where", "although", "though", "unless",
    "since", "after", "before", "until", "as", "once", "whether",
    // Linking adverbs and contrast markers
    "however", "therefore", "moreover", "furthermore", "nevertheless", "whereas",
];
1446
/// Report whether `c` separates clauses for the purpose of semantic line breaks.
///
/// Recognized marks: comma, semicolon, colon and the em dash (U+2014).
fn is_clause_punctuation(c: char) -> bool {
    const CLAUSE_MARKS: [char; 4] = [',', ';', ':', '\u{2014}'];
    CLAUSE_MARKS.contains(&c)
}
1451
1452/// Compute element spans for a flat text representation of elements.
1453/// Returns Vec of (start, end) byte offsets for non-Text elements,
1454/// so we can check that a split position doesn't fall inside them.
1455fn compute_element_spans(elements: &[Element]) -> Vec<(usize, usize)> {
1456    let mut spans = Vec::new();
1457    let mut offset = 0;
1458    for element in elements {
1459        let rendered = format!("{element}");
1460        let len = rendered.len();
1461        if !matches!(element, Element::Text(_)) {
1462            spans.push((offset, offset + len));
1463        }
1464        offset += len;
1465    }
1466    spans
1467}
1468
/// Report whether byte position `pos` lies strictly inside one of `spans`.
///
/// Both bounds are exclusive: a position equal to a span's start or end sits on
/// the element's edge and is therefore not considered inside it.
fn is_inside_element(pos: usize, spans: &[(usize, usize)]) -> bool {
    for &(start, end) in spans {
        if start < pos && pos < end {
            return true;
        }
    }
    false
}
1473
/// Smallest share of `line_length` that the leading part of a split may occupy.
///
/// A candidate split whose first part is shorter than `line_length * MIN_SPLIT_RATIO`
/// characters is rejected, so fragments such as "A," or "Note:" never end up alone
/// on a line.
const MIN_SPLIT_RATIO: f64 = 0.3;
1477
1478/// Split a line at the latest clause punctuation that keeps the first part
1479/// within `line_length`. Returns None if no valid split point exists or if
1480/// the split would create an unreasonably short first line.
1481fn split_at_clause_punctuation(
1482    text: &str,
1483    line_length: usize,
1484    element_spans: &[(usize, usize)],
1485) -> Option<(String, String)> {
1486    let chars: Vec<char> = text.chars().collect();
1487    let search_end = chars.len().min(line_length);
1488    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1489
1490    let mut best_pos = None;
1491    for i in (0..search_end).rev() {
1492        if is_clause_punctuation(chars[i]) {
1493            // Convert char position to byte position for element span check
1494            let byte_pos: usize = chars[..=i].iter().map(|c| c.len_utf8()).sum();
1495            if !is_inside_element(byte_pos, element_spans) {
1496                best_pos = Some(i);
1497                break;
1498            }
1499        }
1500    }
1501
1502    let pos = best_pos?;
1503
1504    // Reject splits that create very short first lines
1505    if pos + 1 < min_first_len {
1506        return None;
1507    }
1508
1509    // Split after the punctuation character
1510    let first: String = chars[..=pos].iter().collect();
1511    let rest: String = chars[pos + 1..].iter().collect();
1512    let rest = rest.trim_start().to_string();
1513
1514    if rest.is_empty() {
1515        return None;
1516    }
1517
1518    Some((first, rest))
1519}
1520
1521/// Split a line before the latest break-word that keeps the first part
1522/// within `line_length`. Returns None if no valid split point exists or if
1523/// the split would create an unreasonably short first line.
1524fn split_at_break_word(text: &str, line_length: usize, element_spans: &[(usize, usize)]) -> Option<(String, String)> {
1525    let lower = text.to_lowercase();
1526    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1527    let mut best_split: Option<(usize, usize)> = None; // (byte_start, word_len_bytes)
1528
1529    for &word in BREAK_WORDS {
1530        let mut search_start = 0;
1531        while let Some(pos) = lower[search_start..].find(word) {
1532            let abs_pos = search_start + pos;
1533
1534            // Verify it's a word boundary: preceded by space, followed by space
1535            let preceded_by_space = abs_pos == 0 || text.as_bytes().get(abs_pos - 1) == Some(&b' ');
1536            let followed_by_space = text.as_bytes().get(abs_pos + word.len()) == Some(&b' ');
1537
1538            if preceded_by_space && followed_by_space {
1539                // The break goes BEFORE the word, so first part ends at abs_pos - 1
1540                let first_part_len = text[..abs_pos].trim_end().chars().count();
1541
1542                if first_part_len >= min_first_len
1543                    && first_part_len <= line_length
1544                    && !is_inside_element(abs_pos, element_spans)
1545                {
1546                    // Prefer the latest valid split point
1547                    if best_split.is_none_or(|(prev_pos, _)| abs_pos > prev_pos) {
1548                        best_split = Some((abs_pos, word.len()));
1549                    }
1550                }
1551            }
1552
1553            search_start = abs_pos + word.len();
1554        }
1555    }
1556
1557    let (byte_start, _word_len) = best_split?;
1558
1559    let first = text[..byte_start].trim_end().to_string();
1560    let rest = text[byte_start..].to_string();
1561
1562    if first.is_empty() || rest.trim().is_empty() {
1563        return None;
1564    }
1565
1566    Some((first, rest))
1567}
1568
1569/// Recursively cascade-split a line that exceeds line_length.
1570/// Tries clause punctuation first, then break-words, then word wrap.
1571fn cascade_split_line(text: &str, line_length: usize, abbreviations: &Option<Vec<String>>) -> Vec<String> {
1572    if line_length == 0 || text.chars().count() <= line_length {
1573        return vec![text.to_string()];
1574    }
1575
1576    let elements = parse_markdown_elements(text);
1577    let element_spans = compute_element_spans(&elements);
1578
1579    // Try clause punctuation split
1580    if let Some((first, rest)) = split_at_clause_punctuation(text, line_length, &element_spans) {
1581        let mut result = vec![first];
1582        result.extend(cascade_split_line(&rest, line_length, abbreviations));
1583        return result;
1584    }
1585
1586    // Try break-word split
1587    if let Some((first, rest)) = split_at_break_word(text, line_length, &element_spans) {
1588        let mut result = vec![first];
1589        result.extend(cascade_split_line(&rest, line_length, abbreviations));
1590        return result;
1591    }
1592
1593    // Fallback: word wrap using existing reflow_elements
1594    let options = ReflowOptions {
1595        line_length,
1596        break_on_sentences: false,
1597        preserve_breaks: false,
1598        sentence_per_line: false,
1599        semantic_line_breaks: false,
1600        abbreviations: abbreviations.clone(),
1601    };
1602    reflow_elements(&elements, &options)
1603}
1604
1605/// Reflow elements using semantic line breaks strategy:
1606/// 1. Split at sentence boundaries (always)
1607/// 2. For lines exceeding line_length, cascade through clause punct → break-words → word wrap
1608fn reflow_elements_semantic(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1609    // Step 1: Split into sentences using existing sentence-per-line logic
1610    let sentence_lines = reflow_elements_sentence_per_line(elements, &options.abbreviations);
1611
1612    // Step 2: For each sentence line, apply cascading splits if it exceeds line_length
1613    // When line_length is 0 (unlimited), skip cascading — sentence splits only
1614    if options.line_length == 0 {
1615        return sentence_lines;
1616    }
1617
1618    let mut result = Vec::new();
1619    for line in sentence_lines {
1620        if line.chars().count() <= options.line_length {
1621            result.push(line);
1622        } else {
1623            result.extend(cascade_split_line(&line, options.line_length, &options.abbreviations));
1624        }
1625    }
1626
1627    // Step 3: Merge very short trailing lines back into the previous line.
1628    // Word wrap can produce lines like "was" or "see" on their own, which reads poorly.
1629    let min_line_len = ((options.line_length as f64) * MIN_SPLIT_RATIO) as usize;
1630    let mut merged: Vec<String> = Vec::with_capacity(result.len());
1631    for line in result {
1632        if !merged.is_empty() && line.chars().count() < min_line_len && !line.trim().is_empty() {
1633            // Don't merge across sentence boundaries — sentence splits are intentional
1634            let prev_ends_at_sentence = {
1635                let trimmed = merged.last().unwrap().trim_end();
1636                trimmed
1637                    .chars()
1638                    .rev()
1639                    .find(|c| !matches!(c, '"' | '\'' | '\u{201D}' | '\u{2019}' | ')' | ']'))
1640                    .is_some_and(|c| matches!(c, '.' | '!' | '?'))
1641            };
1642
1643            if !prev_ends_at_sentence {
1644                let prev = merged.last_mut().unwrap();
1645                let combined = format!("{prev} {line}");
1646                // Only merge if the combined line doesn't wildly exceed the limit
1647                // (allow up to 10% overflow to avoid orphan words)
1648                if combined.chars().count() <= options.line_length + options.line_length / 10 {
1649                    *prev = combined;
1650                    continue;
1651                }
1652            }
1653        }
1654        merged.push(line);
1655    }
1656    merged
1657}
1658
1659/// Reflow elements into lines that fit within the line length
1660fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1661    let mut lines = Vec::new();
1662    let mut current_line = String::new();
1663    let mut current_length = 0;
1664
1665    for (idx, element) in elements.iter().enumerate() {
1666        let element_str = format!("{element}");
1667        let element_len = element.len();
1668
1669        // Determine adjacency from the original elements, not from current_line.
1670        // Elements are adjacent when there's no whitespace between them in the source:
1671        // - Text("v") → HugoShortcode("{{<...>}}") = adjacent (text has no trailing space)
1672        // - Text(" and ") → InlineLink("[a](url)") = NOT adjacent (text has trailing space)
1673        // - HugoShortcode("{{<...>}}") → Text(",") = adjacent (text has no leading space)
1674        let is_adjacent_to_prev = if idx > 0 {
1675            match (&elements[idx - 1], element) {
1676                (Element::Text(t), _) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1677                (_, Element::Text(t)) => !t.is_empty() && !t.starts_with(char::is_whitespace),
1678                _ => true,
1679            }
1680        } else {
1681            false
1682        };
1683
1684        // For text elements that might need breaking
1685        if let Element::Text(text) = element {
1686            // Check if original text had leading whitespace
1687            let has_leading_space = text.starts_with(char::is_whitespace);
1688            // If this is a text element, always process it word by word
1689            let words: Vec<&str> = text.split_whitespace().collect();
1690
1691            for (i, word) in words.iter().enumerate() {
1692                let word_len = word.chars().count();
1693                // Check if this "word" is just punctuation that should stay attached
1694                let is_trailing_punct = word
1695                    .chars()
1696                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1697
1698                // First word of text adjacent to preceding non-text element
1699                // must stay attached (e.g., shortcode followed by punctuation or text)
1700                let is_first_adjacent = i == 0 && is_adjacent_to_prev;
1701
1702                if is_first_adjacent {
1703                    // Attach directly without space, preventing line break
1704                    if current_length + word_len > options.line_length && current_length > 0 {
1705                        // Would exceed — break before the adjacent group
1706                        if let Some(last_space) = current_line.rfind(' ') {
1707                            let before = current_line[..last_space].trim_end().to_string();
1708                            let after = current_line[last_space + 1..].to_string();
1709                            lines.push(before);
1710                            current_line = format!("{after}{word}");
1711                            current_length = current_line.chars().count();
1712                        } else {
1713                            current_line.push_str(word);
1714                            current_length += word_len;
1715                        }
1716                    } else {
1717                        current_line.push_str(word);
1718                        current_length += word_len;
1719                    }
1720                } else if current_length > 0
1721                    && current_length + 1 + word_len > options.line_length
1722                    && !is_trailing_punct
1723                {
1724                    // Start a new line (but never for trailing punctuation)
1725                    lines.push(current_line.trim().to_string());
1726                    current_line = word.to_string();
1727                    current_length = word_len;
1728                } else {
1729                    // Add word to current line
1730                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1731                    // AND this isn't trailing punctuation (which attaches directly)
1732                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1733                        current_line.push(' ');
1734                        current_length += 1;
1735                    }
1736                    current_line.push_str(word);
1737                    current_length += word_len;
1738                }
1739            }
1740        } else {
1741            // For non-text elements (code, links, references), treat as atomic units
1742            // These should never be broken across lines
1743
1744            if is_adjacent_to_prev {
1745                // Adjacent to preceding text — attach directly without space
1746                if current_length + element_len > options.line_length {
1747                    // Would exceed limit — break before the adjacent word group
1748                    if let Some(last_space) = current_line.rfind(' ') {
1749                        let before = current_line[..last_space].trim_end().to_string();
1750                        let after = current_line[last_space + 1..].to_string();
1751                        lines.push(before);
1752                        current_line = format!("{after}{element_str}");
1753                        current_length = current_line.chars().count();
1754                    } else {
1755                        // No space to break at — accept the long line
1756                        current_line.push_str(&element_str);
1757                        current_length += element_len;
1758                    }
1759                } else {
1760                    current_line.push_str(&element_str);
1761                    current_length += element_len;
1762                }
1763            } else if current_length > 0 && current_length + 1 + element_len > options.line_length {
1764                // Not adjacent, would exceed — start new line
1765                lines.push(current_line.trim().to_string());
1766                current_line = element_str;
1767                current_length = element_len;
1768            } else {
1769                // Not adjacent, fits — add with space
1770                let ends_with_opener =
1771                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1772                if current_length > 0 && !ends_with_opener {
1773                    current_line.push(' ');
1774                    current_length += 1;
1775                }
1776                current_line.push_str(&element_str);
1777                current_length += element_len;
1778            }
1779        }
1780    }
1781
1782    // Don't forget the last line
1783    if !current_line.is_empty() {
1784        lines.push(current_line.trim_end().to_string());
1785    }
1786
1787    lines
1788}
1789
1790/// Reflow markdown content preserving structure
1791pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1792    let lines: Vec<&str> = content.lines().collect();
1793    let mut result = Vec::new();
1794    let mut i = 0;
1795
1796    while i < lines.len() {
1797        let line = lines[i];
1798        let trimmed = line.trim();
1799
1800        // Preserve empty lines
1801        if trimmed.is_empty() {
1802            result.push(String::new());
1803            i += 1;
1804            continue;
1805        }
1806
1807        // Preserve headings as-is
1808        if trimmed.starts_with('#') {
1809            result.push(line.to_string());
1810            i += 1;
1811            continue;
1812        }
1813
1814        // Preserve Quarto/Pandoc div markers (:::) as-is
1815        if trimmed.starts_with(":::") {
1816            result.push(line.to_string());
1817            i += 1;
1818            continue;
1819        }
1820
1821        // Preserve fenced code blocks
1822        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1823            result.push(line.to_string());
1824            i += 1;
1825            // Copy lines until closing fence
1826            while i < lines.len() {
1827                result.push(lines[i].to_string());
1828                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1829                    i += 1;
1830                    break;
1831                }
1832                i += 1;
1833            }
1834            continue;
1835        }
1836
1837        // Preserve indented code blocks (4+ columns accounting for tab expansion)
1838        if ElementCache::calculate_indentation_width_default(line) >= 4 {
1839            // Collect all consecutive indented lines
1840            result.push(line.to_string());
1841            i += 1;
1842            while i < lines.len() {
1843                let next_line = lines[i];
1844                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1845                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
1846                    result.push(next_line.to_string());
1847                    i += 1;
1848                } else {
1849                    break;
1850                }
1851            }
1852            continue;
1853        }
1854
1855        // Preserve block quotes (but reflow their content)
1856        if trimmed.starts_with('>') {
1857            // find() returns byte position which is correct for str slicing
1858            // The unwrap is safe because we already verified trimmed starts with '>'
1859            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
1860            let quote_prefix = line[0..gt_pos + 1].to_string();
1861            let quote_content = &line[quote_prefix.len()..].trim_start();
1862
1863            let reflowed = reflow_line(quote_content, options);
1864            for reflowed_line in reflowed.iter() {
1865                result.push(format!("{quote_prefix} {reflowed_line}"));
1866            }
1867            i += 1;
1868            continue;
1869        }
1870
1871        // Preserve horizontal rules first (before checking for lists)
1872        if is_horizontal_rule(trimmed) {
1873            result.push(line.to_string());
1874            i += 1;
1875            continue;
1876        }
1877
1878        // Preserve lists (but not horizontal rules)
1879        if is_unordered_list_marker(trimmed) || is_numbered_list_item(trimmed) {
1880            // Find the list marker and preserve indentation
1881            let indent = line.len() - line.trim_start().len();
1882            let indent_str = " ".repeat(indent);
1883
1884            // For numbered lists, find the period and the space after it
1885            // For bullet lists, find the marker and the space after it
1886            let mut marker_end = indent;
1887            let mut content_start = indent;
1888
1889            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1890                // Numbered list: find the period
1891                if let Some(period_pos) = line[indent..].find('.') {
1892                    marker_end = indent + period_pos + 1; // Include the period
1893                    content_start = marker_end;
1894                    // Skip any spaces after the period to find content start
1895                    // Use byte-based check since content_start is a byte index
1896                    // This is safe because space is ASCII (single byte)
1897                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1898                        content_start += 1;
1899                    }
1900                }
1901            } else {
1902                // Bullet list: marker is single character
1903                marker_end = indent + 1; // Just the marker character
1904                content_start = marker_end;
1905                // Skip any spaces after the marker
1906                // Use byte-based check since content_start is a byte index
1907                // This is safe because space is ASCII (single byte)
1908                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1909                    content_start += 1;
1910                }
1911            }
1912
1913            let marker = &line[indent..marker_end];
1914
1915            // Collect all content for this list item (including continuation lines)
1916            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1917            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1918            i += 1;
1919
1920            // Collect continuation lines (indented lines that are part of this list item)
1921            while i < lines.len() {
1922                let next_line = lines[i];
1923                let next_trimmed = next_line.trim();
1924
1925                // Stop if we hit an empty line or another list item or special block
1926                if is_block_boundary(next_trimmed) {
1927                    break;
1928                }
1929
1930                // Check if this line is indented (continuation of list item)
1931                let next_indent = next_line.len() - next_line.trim_start().len();
1932                if next_indent >= content_start {
1933                    // This is a continuation line - add its content
1934                    // Preserve hard breaks while trimming excessive whitespace
1935                    let trimmed_start = next_line.trim_start();
1936                    list_content.push(trim_preserving_hard_break(trimmed_start));
1937                    i += 1;
1938                } else {
1939                    // Not indented enough, not part of this list item
1940                    break;
1941                }
1942            }
1943
1944            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1945            // Hard breaks should prevent joining with the next line
1946            let combined_content = if options.preserve_breaks {
1947                list_content[0].clone()
1948            } else {
1949                // Check if any lines have hard breaks - if so, preserve the structure
1950                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1951                if has_hard_breaks {
1952                    // Don't join lines with hard breaks - keep them separate with newlines
1953                    list_content.join("\n")
1954                } else {
1955                    // No hard breaks, safe to join with spaces
1956                    list_content.join(" ")
1957                }
1958            };
1959
1960            // Calculate the proper indentation for continuation lines
1961            let trimmed_marker = marker;
1962            let continuation_spaces = content_start;
1963
1964            // Adjust line length to account for list marker and space
1965            let prefix_length = indent + trimmed_marker.len() + 1;
1966
1967            // Create adjusted options with reduced line length
1968            let adjusted_options = ReflowOptions {
1969                line_length: options.line_length.saturating_sub(prefix_length),
1970                ..options.clone()
1971            };
1972
1973            let reflowed = reflow_line(&combined_content, &adjusted_options);
1974            for (j, reflowed_line) in reflowed.iter().enumerate() {
1975                if j == 0 {
1976                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1977                } else {
1978                    // Continuation lines aligned with text after marker
1979                    let continuation_indent = " ".repeat(continuation_spaces);
1980                    result.push(format!("{continuation_indent}{reflowed_line}"));
1981                }
1982            }
1983            continue;
1984        }
1985
1986        // Preserve tables
1987        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
1988            result.push(line.to_string());
1989            i += 1;
1990            continue;
1991        }
1992
1993        // Preserve reference definitions
1994        if trimmed.starts_with('[') && line.contains("]:") {
1995            result.push(line.to_string());
1996            i += 1;
1997            continue;
1998        }
1999
2000        // Preserve definition list items (extended markdown)
2001        if is_definition_list_item(trimmed) {
2002            result.push(line.to_string());
2003            i += 1;
2004            continue;
2005        }
2006
2007        // Check if this is a single line that doesn't need processing
2008        let mut is_single_line_paragraph = true;
2009        if i + 1 < lines.len() {
2010            let next_trimmed = lines[i + 1].trim();
2011            // Check if next line continues this paragraph
2012            if !is_block_boundary(next_trimmed) {
2013                is_single_line_paragraph = false;
2014            }
2015        }
2016
2017        // If it's a single line that fits, just add it as-is
2018        if is_single_line_paragraph && line.chars().count() <= options.line_length {
2019            result.push(line.to_string());
2020            i += 1;
2021            continue;
2022        }
2023
2024        // For regular paragraphs, collect consecutive lines
2025        let mut paragraph_parts = Vec::new();
2026        let mut current_part = vec![line];
2027        i += 1;
2028
2029        // If preserve_breaks is true, treat each line separately
2030        if options.preserve_breaks {
2031            // Don't collect consecutive lines - just reflow this single line
2032            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
2033                Some("\\")
2034            } else if line.ends_with("  ") {
2035                Some("  ")
2036            } else {
2037                None
2038            };
2039            let reflowed = reflow_line(line, options);
2040
2041            // Preserve hard breaks (two trailing spaces or backslash)
2042            if let Some(break_marker) = hard_break_type {
2043                if !reflowed.is_empty() {
2044                    let mut reflowed_with_break = reflowed;
2045                    let last_idx = reflowed_with_break.len() - 1;
2046                    if !has_hard_break(&reflowed_with_break[last_idx]) {
2047                        reflowed_with_break[last_idx].push_str(break_marker);
2048                    }
2049                    result.extend(reflowed_with_break);
2050                }
2051            } else {
2052                result.extend(reflowed);
2053            }
2054        } else {
2055            // Original behavior: collect consecutive lines into a paragraph
2056            while i < lines.len() {
2057                let prev_line = if !current_part.is_empty() {
2058                    current_part.last().unwrap()
2059                } else {
2060                    ""
2061                };
2062                let next_line = lines[i];
2063                let next_trimmed = next_line.trim();
2064
2065                // Stop at empty lines or special blocks
2066                if is_block_boundary(next_trimmed) {
2067                    break;
2068                }
2069
2070                // Check if previous line ends with hard break (two spaces or backslash)
2071                // or is a complete sentence in sentence_per_line mode
2072                let prev_trimmed = prev_line.trim();
2073                let abbreviations = get_abbreviations(&options.abbreviations);
2074                let ends_with_sentence = (prev_trimmed.ends_with('.')
2075                    || prev_trimmed.ends_with('!')
2076                    || prev_trimmed.ends_with('?')
2077                    || prev_trimmed.ends_with(".*")
2078                    || prev_trimmed.ends_with("!*")
2079                    || prev_trimmed.ends_with("?*")
2080                    || prev_trimmed.ends_with("._")
2081                    || prev_trimmed.ends_with("!_")
2082                    || prev_trimmed.ends_with("?_")
2083                    // Quote-terminated sentences (straight and curly quotes)
2084                    || prev_trimmed.ends_with(".\"")
2085                    || prev_trimmed.ends_with("!\"")
2086                    || prev_trimmed.ends_with("?\"")
2087                    || prev_trimmed.ends_with(".'")
2088                    || prev_trimmed.ends_with("!'")
2089                    || prev_trimmed.ends_with("?'")
2090                    || prev_trimmed.ends_with(".\u{201D}")
2091                    || prev_trimmed.ends_with("!\u{201D}")
2092                    || prev_trimmed.ends_with("?\u{201D}")
2093                    || prev_trimmed.ends_with(".\u{2019}")
2094                    || prev_trimmed.ends_with("!\u{2019}")
2095                    || prev_trimmed.ends_with("?\u{2019}"))
2096                    && !text_ends_with_abbreviation(
2097                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
2098                        &abbreviations,
2099                    );
2100
2101                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
2102                    // Start a new part after hard break or complete sentence
2103                    paragraph_parts.push(current_part.join(" "));
2104                    current_part = vec![next_line];
2105                } else {
2106                    current_part.push(next_line);
2107                }
2108                i += 1;
2109            }
2110
2111            // Add the last part
2112            if !current_part.is_empty() {
2113                if current_part.len() == 1 {
2114                    // Single line, don't add trailing space
2115                    paragraph_parts.push(current_part[0].to_string());
2116                } else {
2117                    paragraph_parts.push(current_part.join(" "));
2118                }
2119            }
2120
2121            // Reflow each part separately, preserving hard breaks
2122            for (j, part) in paragraph_parts.iter().enumerate() {
2123                let reflowed = reflow_line(part, options);
2124                result.extend(reflowed);
2125
2126                // Preserve hard break by ensuring last line of part ends with hard break marker
2127                // Use two spaces as the default hard break format for reflows
2128                // But don't add hard breaks in sentence_per_line mode - lines are already separate
2129                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
2130                    let last_idx = result.len() - 1;
2131                    if !has_hard_break(&result[last_idx]) {
2132                        result[last_idx].push_str("  ");
2133                    }
2134                }
2135            }
2136        }
2137    }
2138
2139    // Preserve trailing newline if the original content had one
2140    let result_text = result.join("\n");
2141    if content.ends_with('\n') && !result_text.ends_with('\n') {
2142        format!("{result_text}\n")
2143    } else {
2144        result_text
2145    }
2146}
2147
/// Location and replacement text for a single reflowed paragraph.
///
/// Returned by [`reflow_paragraph_at_line`]. The half-open byte range
/// `start_byte..end_byte` identifies the paragraph in the original content and
/// `reflowed_text` is the text that should replace that range.
///
/// `PartialEq`/`Eq` are derived so results can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphReflow {
    /// Byte offset in the original content at which the paragraph starts (inclusive)
    pub start_byte: usize,
    /// Byte offset in the original content at which the paragraph ends (exclusive)
    pub end_byte: usize,
    /// The reflowed text that replaces `start_byte..end_byte`
    pub reflowed_text: String,
}
2158
2159/// Reflow a single paragraph at the specified line number
2160///
2161/// This function finds the paragraph containing the given line number,
2162/// reflows it according to the specified line length, and returns
2163/// information about the paragraph location and its reflowed text.
2164///
2165/// # Arguments
2166///
2167/// * `content` - The full document content
2168/// * `line_number` - The 1-based line number within the paragraph to reflow
2169/// * `line_length` - The target line length for reflowing
2170///
2171/// # Returns
2172///
2173/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
2174/// or `None` if the line number is out of bounds or the content at that
2175/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
2176pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
2177    if line_number == 0 {
2178        return None;
2179    }
2180
2181    let lines: Vec<&str> = content.lines().collect();
2182
2183    // Check if line number is valid (1-based)
2184    if line_number > lines.len() {
2185        return None;
2186    }
2187
2188    let target_idx = line_number - 1; // Convert to 0-based
2189    let target_line = lines[target_idx];
2190    let trimmed = target_line.trim();
2191
2192    // Don't reflow special blocks
2193    if is_paragraph_boundary(trimmed, target_line) {
2194        return None;
2195    }
2196
2197    // Find paragraph start - scan backward until blank line or special block
2198    let mut para_start = target_idx;
2199    while para_start > 0 {
2200        let prev_idx = para_start - 1;
2201        let prev_line = lines[prev_idx];
2202        let prev_trimmed = prev_line.trim();
2203
2204        // Stop at blank line or special blocks
2205        if is_paragraph_boundary(prev_trimmed, prev_line) {
2206            break;
2207        }
2208
2209        para_start = prev_idx;
2210    }
2211
2212    // Find paragraph end - scan forward until blank line or special block
2213    let mut para_end = target_idx;
2214    while para_end + 1 < lines.len() {
2215        let next_idx = para_end + 1;
2216        let next_line = lines[next_idx];
2217        let next_trimmed = next_line.trim();
2218
2219        // Stop at blank line or special blocks
2220        if is_paragraph_boundary(next_trimmed, next_line) {
2221            break;
2222        }
2223
2224        para_end = next_idx;
2225    }
2226
2227    // Extract paragraph lines
2228    let paragraph_lines = &lines[para_start..=para_end];
2229
2230    // Calculate byte offsets
2231    let mut start_byte = 0;
2232    for line in lines.iter().take(para_start) {
2233        start_byte += line.len() + 1; // +1 for newline
2234    }
2235
2236    let mut end_byte = start_byte;
2237    for line in paragraph_lines.iter() {
2238        end_byte += line.len() + 1; // +1 for newline
2239    }
2240
2241    // Track whether the byte range includes a trailing newline
2242    // (it doesn't if this is the last line and the file doesn't end with newline)
2243    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2244
2245    // Adjust end_byte if the last line doesn't have a newline
2246    if !includes_trailing_newline {
2247        end_byte -= 1;
2248    }
2249
2250    // Join paragraph lines and reflow
2251    let paragraph_text = paragraph_lines.join("\n");
2252
2253    // Create reflow options
2254    let options = ReflowOptions {
2255        line_length,
2256        break_on_sentences: true,
2257        preserve_breaks: false,
2258        sentence_per_line: false,
2259        semantic_line_breaks: false,
2260        abbreviations: None,
2261    };
2262
2263    // Reflow the paragraph using reflow_markdown to handle it properly
2264    let reflowed = reflow_markdown(&paragraph_text, &options);
2265
2266    // Ensure reflowed text matches whether the byte range includes a trailing newline
2267    // This is critical: if the range includes a newline, the replacement must too,
2268    // otherwise the next line will get appended to the reflowed paragraph
2269    let reflowed_text = if includes_trailing_newline {
2270        // Range includes newline - ensure reflowed text has one
2271        if reflowed.ends_with('\n') {
2272            reflowed
2273        } else {
2274            format!("{reflowed}\n")
2275        }
2276    } else {
2277        // Range doesn't include newline - ensure reflowed text doesn't have one
2278        if reflowed.ends_with('\n') {
2279            reflowed.trim_end_matches('\n').to_string()
2280        } else {
2281            reflowed
2282        }
2283    };
2284
2285    Some(ParagraphReflow {
2286        start_byte,
2287        end_byte,
2288        reflowed_text,
2289    })
2290}
2291
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `text_ends_with_abbreviation` against the built-in abbreviation
    /// list (no custom abbreviations supplied).
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognised
        let recognised = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognised {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected an abbreviation ending: {text:?}"
            );
        }

        // Everything else must be rejected
        let rejected = [
            "etc.",       // not in the built-in list (etc doesn't always have period)
            "paradigms.", // ordinary word
            "programs.",  // ordinary word
            "items.",     // ordinary word
            "systems.",   // ordinary word
            "Dr?",        // question mark, not period
            "Mr!",        // exclamation, not period
            "paradigms?", // question mark
            "word",       // no punctuation
            "",           // empty string
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected no abbreviation ending: {text:?}"
            );
        }
    }

    /// Unordered list markers versus look-alikes (rules, emphasis, plain text).
    #[test]
    fn test_is_unordered_list_marker() {
        // Markers followed by content, plus lone markers
        let markers = ["- item", "* item", "+ item", "-", "*", "+"];
        for line in markers {
            assert!(is_unordered_list_marker(line), "expected a list marker: {line:?}");
        }

        let non_markers = [
            "---",        // horizontal rule
            "***",        // horizontal rule
            "- - -",      // horizontal rule
            "* * *",      // horizontal rule
            "*emphasis*", // emphasis, not list
            "-word",      // no space after marker
            "",           // empty
            "text",       // plain text
            "# heading",  // heading
        ];
        for line in non_markers {
            assert!(!is_unordered_list_marker(line), "expected no list marker: {line:?}");
        }
    }

    /// Lines that end a paragraph versus lines that continue it.
    #[test]
    fn test_is_block_boundary() {
        let boundaries = [
            "",                           // empty line
            "# Heading",                  // ATX heading
            "## Level 2",                 // ATX heading
            "```rust",                    // code fence
            "~~~",                        // tilde code fence
            "> quote",                    // blockquote
            "| cell |",                   // table
            "[link]: http://example.com", // reference def
            "---",                        // horizontal rule
            "***",                        // horizontal rule
            "- item",                     // unordered list
            "* item",                     // unordered list
            "+ item",                     // unordered list
            "1. item",                    // ordered list
            "10. item",                   // ordered list
            ": definition",               // definition list
            ":::",                        // div marker
            "::::: {.callout-note}",      // div marker with attrs
        ];
        for line in boundaries {
            assert!(is_block_boundary(line), "expected a block boundary: {line:?}");
        }

        // Paragraph continuation lines
        let continuations = [
            "regular text",
            "*emphasis*",  // emphasis, not list
            "[link](url)", // inline link, not reference def
            "some words here",
        ];
        for line in continuations {
            assert!(!is_block_boundary(line), "expected no block boundary: {line:?}");
        }
    }

    /// A definition list item directly after a one-line paragraph must stay on
    /// its own line instead of being merged into that paragraph.
    #[test]
    fn test_definition_list_boundary_in_single_line_paragraph() {
        let options = ReflowOptions {
            line_length: 80,
            ..Default::default()
        };
        let result = reflow_markdown("Term\n: Definition of the term", &options);

        assert!(
            result.contains(": Definition"),
            "Definition list item should not be merged into previous line. Got: {result:?}"
        );

        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(lines.len(), 2, "Should remain two separate lines. Got: {lines:?}");
        assert_eq!(lines, ["Term", ": Definition of the term"]);
    }

    /// `is_paragraph_boundary` takes `(trimmed, original)` line pairs.
    #[test]
    fn test_is_paragraph_boundary() {
        let boundaries = [
            // Core block boundary checks are inherited
            ("# Heading", "# Heading"),
            ("- item", "- item"),
            (":::", ":::"),
            (": definition", ": definition"),
            // Indented code blocks (≥4 spaces or tab)
            ("code", "    code"),
            ("code", "\tcode"),
            // Table rows, with and without a leading pipe
            ("| a | b |", "| a | b |"),
            ("a | b", "a | b"),
        ];
        for (trimmed, original) in boundaries {
            assert!(
                is_paragraph_boundary(trimmed, original),
                "expected a paragraph boundary: {original:?}"
            );
        }

        let non_boundaries = [
            ("regular text", "regular text"),
            ("text", "  text"), // 2-space indent is not code
        ];
        for (trimmed, original) in non_boundaries {
            assert!(
                !is_paragraph_boundary(trimmed, original),
                "expected no paragraph boundary: {original:?}"
            );
        }
    }

    /// Asking `reflow_paragraph_at_line` to reflow a div marker (`:::`) line
    /// must yield nothing.
    #[test]
    fn test_div_marker_boundary_in_reflow_paragraph_at_line() {
        let content = "Some paragraph text here.\n\n::: {.callout-note}\nThis is a callout.\n:::\n";

        // Line 3 is the opening div marker
        let outcome = reflow_paragraph_at_line(content, 3, 80);
        assert!(outcome.is_none(), "Div marker line should not be reflowed");
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs