rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::mkdocs_attr_list::{ATTR_LIST_PATTERN, is_standalone_attr_list};
9use crate::utils::mkdocs_snippets::is_snippet_block_delimiter;
10use crate::utils::regex_cache::{
11    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
12    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
13    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
14    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
15};
16use crate::utils::sentence_utils::{
17    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
18    text_ends_with_abbreviation,
19};
20use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
21use std::collections::HashSet;
22use unicode_width::UnicodeWidthStr;
23
/// Selects how a string's length is measured when it is compared against
/// the configured line length during reflow.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub enum ReflowLengthMode {
    /// Length is the number of `char`s (Unicode scalar values) in the string.
    Chars,
    /// Length is the visual display width in terminal columns, so wide glyphs
    /// (e.g. CJK) count for more than one column. This is the default mode.
    #[default]
    Visual,
    /// Length is the number of raw UTF-8 bytes.
    Bytes,
}
35
36/// Calculate the display length of a string based on the length mode
37fn display_len(s: &str, mode: ReflowLengthMode) -> usize {
38    match mode {
39        ReflowLengthMode::Chars => s.chars().count(),
40        ReflowLengthMode::Visual => s.width(),
41        ReflowLengthMode::Bytes => s.len(),
42    }
43}
44
45/// Options for reflowing text
46#[derive(Clone)]
47pub struct ReflowOptions {
48    /// Target line length
49    pub line_length: usize,
50    /// Whether to break on sentence boundaries when possible
51    pub break_on_sentences: bool,
52    /// Whether to preserve existing line breaks in paragraphs
53    pub preserve_breaks: bool,
54    /// Whether to enforce one sentence per line
55    pub sentence_per_line: bool,
56    /// Whether to use semantic line breaks (cascading split strategy)
57    pub semantic_line_breaks: bool,
58    /// Custom abbreviations for sentence detection
59    /// Periods are optional - both "Dr" and "Dr." work the same
60    /// Custom abbreviations are always added to the built-in defaults
61    pub abbreviations: Option<Vec<String>>,
62    /// How to measure string length for line-length comparisons
63    pub length_mode: ReflowLengthMode,
64    /// Whether to treat {#id .class key="value"} as atomic (unsplittable) elements.
65    /// Enabled for MkDocs and Kramdown flavors.
66    pub attr_lists: bool,
67    /// Whether to require uppercase after periods for sentence detection.
68    /// When true (default), only "word. Capital" is a sentence boundary.
69    /// When false, "word. lowercase" is also treated as a sentence boundary.
70    /// Does not affect ! and ? which are always treated as sentence boundaries.
71    pub require_sentence_capital: bool,
72}
73
74impl Default for ReflowOptions {
75    fn default() -> Self {
76        Self {
77            line_length: 80,
78            break_on_sentences: true,
79            preserve_breaks: false,
80            sentence_per_line: false,
81            semantic_line_breaks: false,
82            abbreviations: None,
83            length_mode: ReflowLengthMode::default(),
84            attr_lists: false,
85            require_sentence_capital: true,
86        }
87    }
88}
89
90/// Detect if a character position is a sentence boundary
91/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
92/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
93fn is_sentence_boundary(
94    text: &str,
95    pos: usize,
96    abbreviations: &HashSet<String>,
97    require_sentence_capital: bool,
98) -> bool {
99    let chars: Vec<char> = text.chars().collect();
100
101    if pos + 1 >= chars.len() {
102        return false;
103    }
104
105    let c = chars[pos];
106    let next_char = chars[pos + 1];
107
108    // Check for CJK sentence-ending punctuation (。, ！, ？)
109    // CJK punctuation doesn't require space or uppercase after it
110    if is_cjk_sentence_ending(c) {
111        // Skip any trailing emphasis/strikethrough markers
112        let mut after_punct_pos = pos + 1;
113        while after_punct_pos < chars.len()
114            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
115        {
116            after_punct_pos += 1;
117        }
118
119        // Skip whitespace
120        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
121            after_punct_pos += 1;
122        }
123
124        // Check if we have more content (any non-whitespace)
125        if after_punct_pos >= chars.len() {
126            return false;
127        }
128
129        // Skip leading emphasis/strikethrough markers
130        while after_punct_pos < chars.len()
131            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
132        {
133            after_punct_pos += 1;
134        }
135
136        if after_punct_pos >= chars.len() {
137            return false;
138        }
139
140        // For CJK, we accept any character as the start of the next sentence
141        // (no uppercase requirement, since CJK doesn't have case)
142        return true;
143    }
144
145    // Check for ASCII sentence-ending punctuation
146    if c != '.' && c != '!' && c != '?' {
147        return false;
148    }
149
150    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
151    let (_space_pos, after_space_pos) = if next_char == ' ' {
152        // Normal case: punctuation followed by space
153        (pos + 1, pos + 2)
154    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
155        // Sentence ends with quote - check what follows the quote
156        if chars[pos + 2] == ' ' {
157            // Just quote followed by space: 'sentence." '
158            (pos + 2, pos + 3)
159        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
160            // Quote followed by emphasis: 'sentence."* '
161            (pos + 3, pos + 4)
162        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
163            && pos + 4 < chars.len()
164            && chars[pos + 3] == chars[pos + 2]
165            && chars[pos + 4] == ' '
166        {
167            // Quote followed by bold: 'sentence."** '
168            (pos + 4, pos + 5)
169        } else {
170            return false;
171        }
172    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
173        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
174        (pos + 2, pos + 3)
175    } else if (next_char == '*' || next_char == '_')
176        && pos + 3 < chars.len()
177        && chars[pos + 2] == next_char
178        && chars[pos + 3] == ' '
179    {
180        // Sentence ends with bold: "sentence.** " or "sentence.__ "
181        (pos + 3, pos + 4)
182    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
183        // Sentence ends with strikethrough: "sentence.~~ "
184        (pos + 3, pos + 4)
185    } else {
186        return false;
187    };
188
189    // Skip all whitespace after the space to find the start of the next sentence
190    let mut next_char_pos = after_space_pos;
191    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
192        next_char_pos += 1;
193    }
194
195    // Check if we reached the end of the string
196    if next_char_pos >= chars.len() {
197        return false;
198    }
199
200    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
201    let mut first_letter_pos = next_char_pos;
202    while first_letter_pos < chars.len()
203        && (chars[first_letter_pos] == '*'
204            || chars[first_letter_pos] == '_'
205            || chars[first_letter_pos] == '~'
206            || is_opening_quote(chars[first_letter_pos]))
207    {
208        first_letter_pos += 1;
209    }
210
211    // Check if we reached the end after skipping emphasis
212    if first_letter_pos >= chars.len() {
213        return false;
214    }
215
216    let first_char = chars[first_letter_pos];
217
218    // For ! and ?, sentence boundaries are unambiguous — no uppercase requirement
219    if c == '!' || c == '?' {
220        return true;
221    }
222
223    // Period-specific checks: periods are ambiguous (abbreviations, decimals, initials)
224    // so we apply additional guards before accepting a sentence boundary.
225
226    if pos > 0 {
227        // Check for common abbreviations
228        let byte_offset: usize = chars[..=pos].iter().map(|ch| ch.len_utf8()).sum();
229        if text_ends_with_abbreviation(&text[..byte_offset], abbreviations) {
230            return false;
231        }
232
233        // Check for decimal numbers (e.g., "3.14 is pi")
234        if chars[pos - 1].is_numeric() && first_char.is_ascii_digit() {
235            return false;
236        }
237
238        // Check for single-letter initials (e.g., "J. K. Rowling")
239        // A single uppercase letter before the period preceded by whitespace or start
240        // is likely an initial, not a sentence ending.
241        if chars[pos - 1].is_ascii_uppercase() && (pos == 1 || (pos >= 2 && chars[pos - 2].is_whitespace())) {
242            return false;
243        }
244    }
245
246    // In strict mode, require uppercase or CJK to start the next sentence after a period.
247    // In relaxed mode, accept any alphanumeric character.
248    if require_sentence_capital && !first_char.is_uppercase() && !is_cjk_char(first_char) {
249        return false;
250    }
251
252    true
253}
254
255/// Split text into sentences
256pub fn split_into_sentences(text: &str) -> Vec<String> {
257    split_into_sentences_custom(text, &None)
258}
259
260/// Split text into sentences with custom abbreviations
261pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
262    let abbreviations = get_abbreviations(custom_abbreviations);
263    split_into_sentences_with_set(text, &abbreviations, true)
264}
265
266/// Internal function to split text into sentences with a pre-computed abbreviations set
267/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
268fn split_into_sentences_with_set(
269    text: &str,
270    abbreviations: &HashSet<String>,
271    require_sentence_capital: bool,
272) -> Vec<String> {
273    let mut sentences = Vec::new();
274    let mut current_sentence = String::new();
275    let mut chars = text.chars().peekable();
276    let mut pos = 0;
277
278    while let Some(c) = chars.next() {
279        current_sentence.push(c);
280
281        if is_sentence_boundary(text, pos, abbreviations, require_sentence_capital) {
282            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
283            while let Some(&next) = chars.peek() {
284                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
285                    current_sentence.push(chars.next().unwrap());
286                    pos += 1;
287                } else {
288                    break;
289                }
290            }
291
292            // Consume the space after the sentence
293            if chars.peek() == Some(&' ') {
294                chars.next();
295                pos += 1;
296            }
297
298            sentences.push(current_sentence.trim().to_string());
299            current_sentence.clear();
300        }
301
302        pos += 1;
303    }
304
305    // Add any remaining text as the last sentence
306    if !current_sentence.trim().is_empty() {
307        sentences.push(current_sentence.trim().to_string());
308    }
309    sentences
310}
311
/// Report whether `line` is a horizontal rule such as `---`, `___` or `* * *`.
///
/// The line must begin with `-`, `_` or `*`, contain nothing but that same
/// character and spaces, and include the character at least three times.
fn is_horizontal_rule(line: &str) -> bool {
    // Fewer than three bytes can never hold three rule characters.
    if line.len() < 3 {
        return false;
    }

    let marker = match line.chars().next() {
        Some(ch @ ('-' | '_' | '*')) => ch,
        _ => return false,
    };

    // Single pass: count markers, tolerate spaces, reject anything else.
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
340
/// Check if a line is a numbered list item (e.g., "1. ", "10. ").
///
/// The line must start with one or more ASCII digits (`0`-`9`), immediately
/// followed by a period and a space. Only ASCII digits are accepted, so
/// other Unicode numerics (superscripts, fractions, non-Latin digits) are
/// rejected rather than being mistaken for list markers.
///
/// A number followed by a period but no space (e.g. "2019." on its own) is
/// NOT treated as a list item, to stay consistent with list marker
/// extraction and avoid false positives.
fn is_numbered_list_item(line: &str) -> bool {
    // Length of the leading run of ASCII digits; ASCII is one byte per char,
    // so this is also a valid byte offset into `line`.
    let digit_run = line.bytes().take_while(u8::is_ascii_digit).count();

    digit_run > 0 && line[digit_run..].starts_with(". ")
}
364
365/// Check if a trimmed line is an unordered list item (-, *, + followed by space)
366fn is_unordered_list_marker(s: &str) -> bool {
367    matches!(s.as_bytes().first(), Some(b'-' | b'*' | b'+'))
368        && !is_horizontal_rule(s)
369        && (s.len() == 1 || s.as_bytes().get(1) == Some(&b' '))
370}
371
372/// Shared structural checks for block boundary detection.
373/// Checks elements that only depend on the trimmed line content.
374fn is_block_boundary_core(trimmed: &str) -> bool {
375    trimmed.is_empty()
376        || trimmed.starts_with('#')
377        || trimmed.starts_with("```")
378        || trimmed.starts_with("~~~")
379        || trimmed.starts_with('>')
380        || (trimmed.starts_with('[') && trimmed.contains("]:"))
381        || is_horizontal_rule(trimmed)
382        || is_unordered_list_marker(trimmed)
383        || is_numbered_list_item(trimmed)
384        || is_definition_list_item(trimmed)
385        || trimmed.starts_with(":::")
386}
387
388/// Check if a trimmed line starts a new structural block element.
389/// Used for paragraph boundary detection in `reflow_markdown()`.
390fn is_block_boundary(trimmed: &str) -> bool {
391    is_block_boundary_core(trimmed) || trimmed.starts_with('|')
392}
393
394/// Check if a line starts a new structural block for paragraph boundary detection
395/// in `reflow_paragraph_at_line()`. Extends the core checks with indented code blocks
396/// (≥4 spaces) and table row detection via `is_potential_table_row`.
397fn is_paragraph_boundary(trimmed: &str, line: &str) -> bool {
398    is_block_boundary_core(trimmed)
399        || ElementCache::calculate_indentation_width_default(line) >= 4
400        || crate::utils::table_utils::TableUtils::is_potential_table_row(line)
401}
402
/// Report whether `line` ends with a Markdown hard line break.
///
/// Two forms are recognised once a single trailing `\r` (CRLF input) has
/// been ignored:
/// 1. two or more trailing spaces
/// 2. a trailing backslash
fn has_hard_break(line: &str) -> bool {
    let content = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };

    ["  ", "\\"].iter().any(|suffix| content.ends_with(*suffix))
}
412
/// Report whether the last character of `text` is `.`, `!` or `?`.
fn ends_with_sentence_punct(text: &str) -> bool {
    matches!(text.chars().next_back(), Some('.' | '!' | '?'))
}
417
/// Strip trailing whitespace from `s` without destroying a hard line break.
///
/// A single trailing `\r` (CRLF input) is removed first. After that:
/// * a line ending in a backslash is returned unchanged (backslash hard break);
/// * a line ending in two or more spaces is trimmed and given back exactly
///   two trailing spaces (traditional hard break);
/// * any other line simply has its trailing whitespace removed.
///
/// A line that is entirely whitespace becomes the empty string.
fn trim_preserving_hard_break(s: &str) -> String {
    let s = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as it is.
    if s.ends_with('\\') {
        return s.to_string();
    }

    let body = s.trim_end();
    // Only re-append the break when there is content for it to follow.
    let keeps_space_break = s.ends_with("  ") && !body.is_empty();

    let mut trimmed = String::with_capacity(body.len() + 2);
    trimmed.push_str(body);
    if keeps_space_break {
        trimmed.push_str("  ");
    }
    trimmed
}
448
449/// Parse markdown elements using the appropriate parser based on options.
450fn parse_elements(text: &str, options: &ReflowOptions) -> Vec<Element> {
451    if options.attr_lists {
452        parse_markdown_elements_with_attr_lists(text)
453    } else {
454        parse_markdown_elements(text)
455    }
456}
457
458pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
459    // For sentence-per-line mode, always process regardless of length
460    if options.sentence_per_line {
461        let elements = parse_elements(line, options);
462        return reflow_elements_sentence_per_line(&elements, &options.abbreviations, options.require_sentence_capital);
463    }
464
465    // For semantic line breaks mode, use cascading split strategy
466    if options.semantic_line_breaks {
467        let elements = parse_elements(line, options);
468        return reflow_elements_semantic(&elements, options);
469    }
470
471    // Quick check: if line is already short enough or no wrapping requested, return as-is
472    // line_length = 0 means no wrapping (unlimited line length)
473    if options.line_length == 0 || display_len(line, options.length_mode) <= options.line_length {
474        return vec![line.to_string()];
475    }
476
477    // Parse the markdown to identify elements
478    let elements = parse_elements(line, options);
479
480    // Reflow the elements into lines
481    reflow_elements(&elements, options)
482}
483
/// Where the image inside a linked image gets its source from.
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Image given by an inline URL: `![alt](url)`; holds the URL.
    Inline(String),
    /// Image given by reference: `![alt][ref]`; holds the reference label.
    Reference(String),
}
492
/// Where the link wrapped around a linked image points to.
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Link given by an inline URL: `](url)`; holds the URL.
    Inline(String),
    /// Link given by reference: `][ref]`; holds the reference label.
    Reference(String),
}
501
502/// Represents a piece of content in the markdown
503#[derive(Debug, Clone)]
504enum Element {
505    /// Plain text that can be wrapped
506    Text(String),
507    /// A complete markdown inline link [text](url)
508    Link { text: String, url: String },
509    /// A complete markdown reference link [text][ref]
510    ReferenceLink { text: String, reference: String },
511    /// A complete markdown empty reference link [text][]
512    EmptyReferenceLink { text: String },
513    /// A complete markdown shortcut reference link [ref]
514    ShortcutReference { reference: String },
515    /// A complete markdown inline image ![alt](url)
516    InlineImage { alt: String, url: String },
517    /// A complete markdown reference image ![alt][ref]
518    ReferenceImage { alt: String, reference: String },
519    /// A complete markdown empty reference image ![alt][]
520    EmptyReferenceImage { alt: String },
521    /// A clickable image badge in any of 4 forms:
522    /// - [![alt](img-url)](link-url)
523    /// - [![alt][img-ref]](link-url)
524    /// - [![alt](img-url)][link-ref]
525    /// - [![alt][img-ref]][link-ref]
526    LinkedImage {
527        alt: String,
528        img_source: LinkedImageSource,
529        link_target: LinkedImageTarget,
530    },
531    /// Footnote reference [^note]
532    FootnoteReference { note: String },
533    /// Strikethrough text ~~text~~
534    Strikethrough(String),
535    /// Wiki-style link [[wiki]] or [[wiki|text]]
536    WikiLink(String),
537    /// Inline math $math$
538    InlineMath(String),
539    /// Display math $$math$$
540    DisplayMath(String),
541    /// Emoji shortcode :emoji:
542    EmojiShortcode(String),
543    /// Autolink <https://...> or <mailto:...> or <user@domain.com>
544    Autolink(String),
545    /// HTML tag <tag> or </tag> or <tag/>
546    HtmlTag(String),
547    /// HTML entity &nbsp; or &#123;
548    HtmlEntity(String),
549    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}}
550    HugoShortcode(String),
551    /// MkDocs/kramdown attribute list {#id .class key="value"}
552    AttrList(String),
553    /// Inline code `code`
554    Code(String),
555    /// Bold text **text** or __text__
556    Bold {
557        content: String,
558        /// True if underscore markers (__), false for asterisks (**)
559        underscore: bool,
560    },
561    /// Italic text *text* or _text_
562    Italic {
563        content: String,
564        /// True if underscore marker (_), false for asterisk (*)
565        underscore: bool,
566    },
567}
568
569impl std::fmt::Display for Element {
570    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
571        match self {
572            Element::Text(s) => write!(f, "{s}"),
573            Element::Link { text, url } => write!(f, "[{text}]({url})"),
574            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
575            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
576            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
577            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
578            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
579            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
580            Element::LinkedImage {
581                alt,
582                img_source,
583                link_target,
584            } => {
585                // Build the image part: ![alt](url) or ![alt][ref]
586                let img_part = match img_source {
587                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
588                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
589                };
590                // Build the link part: (url) or [ref]
591                match link_target {
592                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
593                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
594                }
595            }
596            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
597            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
598            Element::WikiLink(s) => write!(f, "[[{s}]]"),
599            Element::InlineMath(s) => write!(f, "${s}$"),
600            Element::DisplayMath(s) => write!(f, "$${s}$$"),
601            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
602            Element::Autolink(s) => write!(f, "{s}"),
603            Element::HtmlTag(s) => write!(f, "{s}"),
604            Element::HtmlEntity(s) => write!(f, "{s}"),
605            Element::HugoShortcode(s) => write!(f, "{s}"),
606            Element::AttrList(s) => write!(f, "{s}"),
607            Element::Code(s) => write!(f, "`{s}`"),
608            Element::Bold { content, underscore } => {
609                if *underscore {
610                    write!(f, "__{content}__")
611                } else {
612                    write!(f, "**{content}**")
613                }
614            }
615            Element::Italic { content, underscore } => {
616                if *underscore {
617                    write!(f, "_{content}_")
618                } else {
619                    write!(f, "*{content}*")
620                }
621            }
622        }
623    }
624}
625
626impl Element {
627    /// Calculate the display width of this element using the given length mode.
628    /// This formats the element and computes its width, correctly handling
629    /// visual width for CJK characters and other wide glyphs.
630    fn display_width(&self, mode: ReflowLengthMode) -> usize {
631        let formatted = format!("{self}");
632        display_len(&formatted, mode)
633    }
634}
635
/// A formatting span (emphasis, strong or strikethrough) located by
/// pulldown-cmark.
#[derive(Debug, Clone)]
struct EmphasisSpan {
    /// Byte offset of the first opening marker.
    start: usize,
    /// Byte offset just past the last closing marker.
    end: usize,
    /// Text between the opening and closing markers.
    content: String,
    /// `true` for strong (bold) emphasis.
    is_strong: bool,
    /// `true` for strikethrough (`~~text~~`).
    is_strikethrough: bool,
    /// `true` when the source used underscore markers (emphasis/strong only).
    uses_underscore: bool,
}
652
653/// Extract emphasis and strikethrough spans from text using pulldown-cmark
654///
655/// This provides CommonMark-compliant emphasis parsing, correctly handling:
656/// - Nested emphasis like `*text **bold** more*`
657/// - Left/right flanking delimiter rules
658/// - Underscore vs asterisk markers
659/// - GFM strikethrough (~~text~~)
660///
661/// Returns spans sorted by start position.
662fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
663    let mut spans = Vec::new();
664    let mut options = Options::empty();
665    options.insert(Options::ENABLE_STRIKETHROUGH);
666
667    // Stacks to track nested formatting with their start positions
668    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
669    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
670    let mut strikethrough_stack: Vec<usize> = Vec::new();
671
672    let parser = Parser::new_ext(text, options).into_offset_iter();
673
674    for (event, range) in parser {
675        match event {
676            Event::Start(Tag::Emphasis) => {
677                // Check if this uses underscore by looking at the original text
678                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
679                emphasis_stack.push((range.start, uses_underscore));
680            }
681            Event::End(TagEnd::Emphasis) => {
682                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
683                    // Extract content between the markers (1 char marker on each side)
684                    let content_start = start_byte + 1;
685                    let content_end = range.end - 1;
686                    if content_end > content_start
687                        && let Some(content) = text.get(content_start..content_end)
688                    {
689                        spans.push(EmphasisSpan {
690                            start: start_byte,
691                            end: range.end,
692                            content: content.to_string(),
693                            is_strong: false,
694                            is_strikethrough: false,
695                            uses_underscore,
696                        });
697                    }
698                }
699            }
700            Event::Start(Tag::Strong) => {
701                // Check if this uses underscore by looking at the original text
702                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
703                strong_stack.push((range.start, uses_underscore));
704            }
705            Event::End(TagEnd::Strong) => {
706                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
707                    // Extract content between the markers (2 char marker on each side)
708                    let content_start = start_byte + 2;
709                    let content_end = range.end - 2;
710                    if content_end > content_start
711                        && let Some(content) = text.get(content_start..content_end)
712                    {
713                        spans.push(EmphasisSpan {
714                            start: start_byte,
715                            end: range.end,
716                            content: content.to_string(),
717                            is_strong: true,
718                            is_strikethrough: false,
719                            uses_underscore,
720                        });
721                    }
722                }
723            }
724            Event::Start(Tag::Strikethrough) => {
725                strikethrough_stack.push(range.start);
726            }
727            Event::End(TagEnd::Strikethrough) => {
728                if let Some(start_byte) = strikethrough_stack.pop() {
729                    // Extract content between the ~~ markers (2 char marker on each side)
730                    let content_start = start_byte + 2;
731                    let content_end = range.end - 2;
732                    if content_end > content_start
733                        && let Some(content) = text.get(content_start..content_end)
734                    {
735                        spans.push(EmphasisSpan {
736                            start: start_byte,
737                            end: range.end,
738                            content: content.to_string(),
739                            is_strong: false,
740                            is_strikethrough: true,
741                            uses_underscore: false,
742                        });
743                    }
744                }
745            }
746            _ => {}
747        }
748    }
749
750    // Sort by start position
751    spans.sort_by_key(|s| s.start);
752    spans
753}
754
755/// Parse markdown elements from text preserving the raw syntax
756///
757/// Detection order is critical:
758/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
759/// 2. Inline images ![alt](url) - before links to handle ! prefix
760/// 3. Reference images ![alt][ref] - before reference links
761/// 4. Inline links [text](url) - before reference links
762/// 5. Reference links [text][ref] - before shortcut references
763/// 6. Shortcut reference links [ref] - detected last to avoid false positives
764/// 7. Other elements (code, bold, italic, etc.) - processed normally
765fn parse_markdown_elements(text: &str) -> Vec<Element> {
766    parse_markdown_elements_inner(text, false)
767}
768
769fn parse_markdown_elements_with_attr_lists(text: &str) -> Vec<Element> {
770    parse_markdown_elements_inner(text, true)
771}
772
773fn parse_markdown_elements_inner(text: &str, attr_lists: bool) -> Vec<Element> {
774    let mut elements = Vec::new();
775    let mut remaining = text;
776
777    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
778    let emphasis_spans = extract_emphasis_spans(text);
779
780    while !remaining.is_empty() {
781        // Calculate current byte offset in original text
782        let current_offset = text.len() - remaining.len();
783        // Find the earliest occurrence of any markdown pattern
784        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
785
786        // Check for linked images FIRST (all 4 variants)
787        // Quick literal check: only run expensive regexes if we might have a linked image
788        // Pattern starts with "[!" so check for that first
789        if remaining.contains("[!") {
790            // Pattern 1: [![alt](img)](link) - inline image in inline link
791            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
792                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
793            {
794                earliest_match = Some((m.start(), "linked_image_ii", m));
795            }
796
797            // Pattern 2: [![alt][ref]](link) - reference image in inline link
798            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
799                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
800            {
801                earliest_match = Some((m.start(), "linked_image_ri", m));
802            }
803
804            // Pattern 3: [![alt](img)][ref] - inline image in reference link
805            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
806                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
807            {
808                earliest_match = Some((m.start(), "linked_image_ir", m));
809            }
810
811            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
812            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
813                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
814            {
815                earliest_match = Some((m.start(), "linked_image_rr", m));
816            }
817        }
818
819        // Check for images (they start with ! so should be detected before links)
820        // Inline images - ![alt](url)
821        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
822            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
823        {
824            earliest_match = Some((m.start(), "inline_image", m));
825        }
826
827        // Reference images - ![alt][ref]
828        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
829            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
830        {
831            earliest_match = Some((m.start(), "ref_image", m));
832        }
833
834        // Check for footnote references - [^note]
835        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
836            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
837        {
838            earliest_match = Some((m.start(), "footnote_ref", m));
839        }
840
841        // Check for inline links - [text](url)
842        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
843            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
844        {
845            earliest_match = Some((m.start(), "inline_link", m));
846        }
847
848        // Check for reference links - [text][ref]
849        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
850            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
851        {
852            earliest_match = Some((m.start(), "ref_link", m));
853        }
854
855        // Check for shortcut reference links - [ref]
856        // Only check if we haven't found an earlier pattern that would conflict
857        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
858            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
859        {
860            earliest_match = Some((m.start(), "shortcut_ref", m));
861        }
862
863        // Check for wiki-style links - [[wiki]]
864        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
865            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
866        {
867            earliest_match = Some((m.start(), "wiki_link", m));
868        }
869
870        // Check for display math first (before inline) - $$math$$
871        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
872            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
873        {
874            earliest_match = Some((m.start(), "display_math", m));
875        }
876
877        // Check for inline math - $math$
878        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
879            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
880        {
881            earliest_match = Some((m.start(), "inline_math", m));
882        }
883
884        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
885
886        // Check for emoji shortcodes - :emoji:
887        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
888            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
889        {
890            earliest_match = Some((m.start(), "emoji", m));
891        }
892
893        // Check for HTML entities - &nbsp; etc
894        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
895            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
896        {
897            earliest_match = Some((m.start(), "html_entity", m));
898        }
899
900        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
901        // Must be checked before other patterns to avoid false sentence breaks
902        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
903            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
904        {
905            earliest_match = Some((m.start(), "hugo_shortcode", m));
906        }
907
908        // Check for HTML tags - <tag> </tag> <tag/>
909        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
910        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
911            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
912        {
913            // Check if this is an autolink (starts with protocol or mailto:)
914            let matched_text = &remaining[m.start()..m.end()];
915            let is_url_autolink = matched_text.starts_with("<http://")
916                || matched_text.starts_with("<https://")
917                || matched_text.starts_with("<mailto:")
918                || matched_text.starts_with("<ftp://")
919                || matched_text.starts_with("<ftps://");
920
921            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
922            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
923            let is_email_autolink = {
924                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
925                EMAIL_PATTERN.is_match(content)
926            };
927
928            if is_url_autolink || is_email_autolink {
929                earliest_match = Some((m.start(), "autolink", m));
930            } else {
931                earliest_match = Some((m.start(), "html_tag", m));
932            }
933        }
934
935        // Find earliest non-link special characters
936        let mut next_special = remaining.len();
937        let mut special_type = "";
938        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
939        let mut attr_list_len: usize = 0;
940
941        // Check for code spans (not handled by pulldown-cmark in this context)
942        if let Some(pos) = remaining.find('`')
943            && pos < next_special
944        {
945            next_special = pos;
946            special_type = "code";
947        }
948
949        // Check for MkDocs/kramdown attr lists - {#id .class key="value"}
950        if attr_lists
951            && let Some(pos) = remaining.find('{')
952            && pos < next_special
953            && let Some(m) = ATTR_LIST_PATTERN.find(&remaining[pos..])
954            && m.start() == 0
955        {
956            next_special = pos;
957            special_type = "attr_list";
958            attr_list_len = m.end();
959        }
960
961        // Check for emphasis using pulldown-cmark's pre-extracted spans
962        // Find the earliest emphasis span that starts within remaining text
963        for span in &emphasis_spans {
964            if span.start >= current_offset && span.start < current_offset + remaining.len() {
965                let pos_in_remaining = span.start - current_offset;
966                if pos_in_remaining < next_special {
967                    next_special = pos_in_remaining;
968                    special_type = "pulldown_emphasis";
969                    pulldown_emphasis = Some(span);
970                }
971                break; // Spans are sorted by start position, so first match is earliest
972            }
973        }
974
975        // Determine which pattern to process first
976        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
977            pos < next_special
978        } else {
979            false
980        };
981
982        if should_process_markdown_link {
983            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
984
985            // Add any text before the match
986            if pos > 0 {
987                elements.push(Element::Text(remaining[..pos].to_string()));
988            }
989
990            // Process the matched pattern
991            match pattern_type {
992                // Pattern 1: [![alt](img)](link) - inline image in inline link
993                "linked_image_ii" => {
994                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
995                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
996                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
997                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
998                        elements.push(Element::LinkedImage {
999                            alt: alt.to_string(),
1000                            img_source: LinkedImageSource::Inline(img_url.to_string()),
1001                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
1002                        });
1003                        remaining = &remaining[match_obj.end()..];
1004                    } else {
1005                        elements.push(Element::Text("[".to_string()));
1006                        remaining = &remaining[1..];
1007                    }
1008                }
1009                // Pattern 2: [![alt][ref]](link) - reference image in inline link
1010                "linked_image_ri" => {
1011                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
1012                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1013                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1014                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1015                        elements.push(Element::LinkedImage {
1016                            alt: alt.to_string(),
1017                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
1018                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
1019                        });
1020                        remaining = &remaining[match_obj.end()..];
1021                    } else {
1022                        elements.push(Element::Text("[".to_string()));
1023                        remaining = &remaining[1..];
1024                    }
1025                }
1026                // Pattern 3: [![alt](img)][ref] - inline image in reference link
1027                "linked_image_ir" => {
1028                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
1029                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1030                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1031                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1032                        elements.push(Element::LinkedImage {
1033                            alt: alt.to_string(),
1034                            img_source: LinkedImageSource::Inline(img_url.to_string()),
1035                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1036                        });
1037                        remaining = &remaining[match_obj.end()..];
1038                    } else {
1039                        elements.push(Element::Text("[".to_string()));
1040                        remaining = &remaining[1..];
1041                    }
1042                }
1043                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
1044                "linked_image_rr" => {
1045                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
1046                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1047                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1048                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1049                        elements.push(Element::LinkedImage {
1050                            alt: alt.to_string(),
1051                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
1052                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1053                        });
1054                        remaining = &remaining[match_obj.end()..];
1055                    } else {
1056                        elements.push(Element::Text("[".to_string()));
1057                        remaining = &remaining[1..];
1058                    }
1059                }
1060                "inline_image" => {
1061                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
1062                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1063                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1064                        elements.push(Element::InlineImage {
1065                            alt: alt.to_string(),
1066                            url: url.to_string(),
1067                        });
1068                        remaining = &remaining[match_obj.end()..];
1069                    } else {
1070                        elements.push(Element::Text("!".to_string()));
1071                        remaining = &remaining[1..];
1072                    }
1073                }
1074                "ref_image" => {
1075                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
1076                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1077                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1078
1079                        if reference.is_empty() {
1080                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
1081                        } else {
1082                            elements.push(Element::ReferenceImage {
1083                                alt: alt.to_string(),
1084                                reference: reference.to_string(),
1085                            });
1086                        }
1087                        remaining = &remaining[match_obj.end()..];
1088                    } else {
1089                        elements.push(Element::Text("!".to_string()));
1090                        remaining = &remaining[1..];
1091                    }
1092                }
1093                "footnote_ref" => {
1094                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
1095                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1096                        elements.push(Element::FootnoteReference { note: note.to_string() });
1097                        remaining = &remaining[match_obj.end()..];
1098                    } else {
1099                        elements.push(Element::Text("[".to_string()));
1100                        remaining = &remaining[1..];
1101                    }
1102                }
1103                "inline_link" => {
1104                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
1105                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1106                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1107                        elements.push(Element::Link {
1108                            text: text.to_string(),
1109                            url: url.to_string(),
1110                        });
1111                        remaining = &remaining[match_obj.end()..];
1112                    } else {
1113                        // Fallback - shouldn't happen
1114                        elements.push(Element::Text("[".to_string()));
1115                        remaining = &remaining[1..];
1116                    }
1117                }
1118                "ref_link" => {
1119                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1120                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1121                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1122
1123                        if reference.is_empty() {
1124                            // Empty reference link [text][]
1125                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1126                        } else {
1127                            // Regular reference link [text][ref]
1128                            elements.push(Element::ReferenceLink {
1129                                text: text.to_string(),
1130                                reference: reference.to_string(),
1131                            });
1132                        }
1133                        remaining = &remaining[match_obj.end()..];
1134                    } else {
1135                        // Fallback - shouldn't happen
1136                        elements.push(Element::Text("[".to_string()));
1137                        remaining = &remaining[1..];
1138                    }
1139                }
1140                "shortcut_ref" => {
1141                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1142                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1143                        elements.push(Element::ShortcutReference {
1144                            reference: reference.to_string(),
1145                        });
1146                        remaining = &remaining[match_obj.end()..];
1147                    } else {
1148                        // Fallback - shouldn't happen
1149                        elements.push(Element::Text("[".to_string()));
1150                        remaining = &remaining[1..];
1151                    }
1152                }
1153                "wiki_link" => {
1154                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1155                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1156                        elements.push(Element::WikiLink(content.to_string()));
1157                        remaining = &remaining[match_obj.end()..];
1158                    } else {
1159                        elements.push(Element::Text("[[".to_string()));
1160                        remaining = &remaining[2..];
1161                    }
1162                }
1163                "display_math" => {
1164                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1165                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1166                        elements.push(Element::DisplayMath(math.to_string()));
1167                        remaining = &remaining[match_obj.end()..];
1168                    } else {
1169                        elements.push(Element::Text("$$".to_string()));
1170                        remaining = &remaining[2..];
1171                    }
1172                }
1173                "inline_math" => {
1174                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1175                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1176                        elements.push(Element::InlineMath(math.to_string()));
1177                        remaining = &remaining[match_obj.end()..];
1178                    } else {
1179                        elements.push(Element::Text("$".to_string()));
1180                        remaining = &remaining[1..];
1181                    }
1182                }
1183                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1184                "emoji" => {
1185                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1186                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1187                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1188                        remaining = &remaining[match_obj.end()..];
1189                    } else {
1190                        elements.push(Element::Text(":".to_string()));
1191                        remaining = &remaining[1..];
1192                    }
1193                }
1194                "html_entity" => {
1195                    // HTML entities are captured whole - use as_str() to get just the matched content
1196                    elements.push(Element::HtmlEntity(match_obj.as_str().to_string()));
1197                    remaining = &remaining[match_obj.end()..];
1198                }
1199                "hugo_shortcode" => {
1200                    // Hugo shortcodes are atomic elements - preserve them exactly
1201                    elements.push(Element::HugoShortcode(match_obj.as_str().to_string()));
1202                    remaining = &remaining[match_obj.end()..];
1203                }
1204                "autolink" => {
1205                    // Autolinks are atomic elements - preserve them exactly
1206                    elements.push(Element::Autolink(match_obj.as_str().to_string()));
1207                    remaining = &remaining[match_obj.end()..];
1208                }
1209                "html_tag" => {
1210                    // HTML tags are captured whole - use as_str() to get just the matched content
1211                    elements.push(Element::HtmlTag(match_obj.as_str().to_string()));
1212                    remaining = &remaining[match_obj.end()..];
1213                }
1214                _ => {
1215                    // Unknown pattern, treat as text
1216                    elements.push(Element::Text("[".to_string()));
1217                    remaining = &remaining[1..];
1218                }
1219            }
1220        } else {
1221            // Process non-link special characters
1222
1223            // Add any text before the special character
1224            if next_special > 0 && next_special < remaining.len() {
1225                elements.push(Element::Text(remaining[..next_special].to_string()));
1226                remaining = &remaining[next_special..];
1227            }
1228
1229            // Process the special element
1230            match special_type {
1231                "code" => {
1232                    // Find end of code
1233                    if let Some(code_end) = remaining[1..].find('`') {
1234                        let code = &remaining[1..1 + code_end];
1235                        elements.push(Element::Code(code.to_string()));
1236                        remaining = &remaining[1 + code_end + 1..];
1237                    } else {
1238                        // No closing backtick, treat as text
1239                        elements.push(Element::Text(remaining.to_string()));
1240                        break;
1241                    }
1242                }
1243                "attr_list" => {
1244                    elements.push(Element::AttrList(remaining[..attr_list_len].to_string()));
1245                    remaining = &remaining[attr_list_len..];
1246                }
1247                "pulldown_emphasis" => {
1248                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1249                    if let Some(span) = pulldown_emphasis {
1250                        let span_len = span.end - span.start;
1251                        if span.is_strikethrough {
1252                            elements.push(Element::Strikethrough(span.content.clone()));
1253                        } else if span.is_strong {
1254                            elements.push(Element::Bold {
1255                                content: span.content.clone(),
1256                                underscore: span.uses_underscore,
1257                            });
1258                        } else {
1259                            elements.push(Element::Italic {
1260                                content: span.content.clone(),
1261                                underscore: span.uses_underscore,
1262                            });
1263                        }
1264                        remaining = &remaining[span_len..];
1265                    } else {
1266                        // Fallback - shouldn't happen
1267                        elements.push(Element::Text(remaining[..1].to_string()));
1268                        remaining = &remaining[1..];
1269                    }
1270                }
1271                _ => {
1272                    // No special elements found, add all remaining text
1273                    elements.push(Element::Text(remaining.to_string()));
1274                    break;
1275                }
1276            }
1277        }
1278    }
1279
1280    elements
1281}
1282
1283/// Reflow elements for sentence-per-line mode
1284fn reflow_elements_sentence_per_line(
1285    elements: &[Element],
1286    custom_abbreviations: &Option<Vec<String>>,
1287    require_sentence_capital: bool,
1288) -> Vec<String> {
1289    let abbreviations = get_abbreviations(custom_abbreviations);
1290    let mut lines = Vec::new();
1291    let mut current_line = String::new();
1292
1293    for (idx, element) in elements.iter().enumerate() {
1294        let element_str = format!("{element}");
1295
1296        // For text elements, split into sentences
1297        if let Element::Text(text) = element {
1298            // Simply append text - it already has correct spacing from tokenization
1299            let combined = format!("{current_line}{text}");
1300            // Use the pre-computed abbreviations set to avoid redundant computation
1301            let sentences = split_into_sentences_with_set(&combined, &abbreviations, require_sentence_capital);
1302
1303            if sentences.len() > 1 {
1304                // We found sentence boundaries
1305                for (i, sentence) in sentences.iter().enumerate() {
1306                    if i == 0 {
1307                        // First sentence might continue from previous elements
1308                        // But check if it ends with an abbreviation
1309                        let trimmed = sentence.trim();
1310
1311                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1312                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1313                            current_line = sentence.to_string();
1314                        } else {
1315                            // Normal case - emit the first sentence
1316                            lines.push(sentence.to_string());
1317                            current_line.clear();
1318                        }
1319                    } else if i == sentences.len() - 1 {
1320                        // Last sentence: check if it's complete or incomplete
1321                        let trimmed = sentence.trim();
1322                        let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1323
1324                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1325                            // Complete sentence - emit it immediately
1326                            lines.push(sentence.to_string());
1327                            current_line.clear();
1328                        } else {
1329                            // Incomplete sentence - save for next iteration
1330                            current_line = sentence.to_string();
1331                        }
1332                    } else {
1333                        // Complete sentences in the middle
1334                        lines.push(sentence.to_string());
1335                    }
1336                }
1337            } else {
1338                // Single sentence - check if it's complete
1339                let trimmed = combined.trim();
1340
1341                // If the combined result is only whitespace, don't accumulate it.
1342                // This prevents leading spaces on subsequent elements when lines
1343                // are joined with spaces during reflow iteration.
1344                if trimmed.is_empty() {
1345                    continue;
1346                }
1347
1348                let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1349
1350                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1351                    // Complete single sentence - emit it
1352                    lines.push(trimmed.to_string());
1353                    current_line.clear();
1354                } else {
1355                    // Incomplete sentence - continue accumulating
1356                    current_line = combined;
1357                }
1358            }
1359        } else if let Element::Italic { content, underscore } = element {
1360            // Handle italic elements - may contain multiple sentences that need continuation
1361            let marker = if *underscore { "_" } else { "*" };
1362            handle_emphasis_sentence_split(
1363                content,
1364                marker,
1365                &abbreviations,
1366                require_sentence_capital,
1367                &mut current_line,
1368                &mut lines,
1369            );
1370        } else if let Element::Bold { content, underscore } = element {
1371            // Handle bold elements - may contain multiple sentences that need continuation
1372            let marker = if *underscore { "__" } else { "**" };
1373            handle_emphasis_sentence_split(
1374                content,
1375                marker,
1376                &abbreviations,
1377                require_sentence_capital,
1378                &mut current_line,
1379                &mut lines,
1380            );
1381        } else if let Element::Strikethrough(content) = element {
1382            // Handle strikethrough elements - may contain multiple sentences that need continuation
1383            handle_emphasis_sentence_split(
1384                content,
1385                "~~",
1386                &abbreviations,
1387                require_sentence_capital,
1388                &mut current_line,
1389                &mut lines,
1390            );
1391        } else {
1392            // Non-text, non-emphasis elements (Code, Links, etc.)
1393            // Check if this element is adjacent to the preceding text (no space between)
1394            let is_adjacent = if idx > 0 {
1395                match &elements[idx - 1] {
1396                    Element::Text(t) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1397                    _ => true,
1398                }
1399            } else {
1400                false
1401            };
1402
1403            // Add space before element if needed, but not for adjacent elements
1404            if !is_adjacent
1405                && !current_line.is_empty()
1406                && !current_line.ends_with(' ')
1407                && !current_line.ends_with('(')
1408                && !current_line.ends_with('[')
1409            {
1410                current_line.push(' ');
1411            }
1412            current_line.push_str(&element_str);
1413        }
1414    }
1415
1416    // Add any remaining content
1417    if !current_line.is_empty() {
1418        lines.push(current_line.trim().to_string());
1419    }
1420    lines
1421}
1422
1423/// Handle splitting emphasis content at sentence boundaries while preserving markers
1424fn handle_emphasis_sentence_split(
1425    content: &str,
1426    marker: &str,
1427    abbreviations: &HashSet<String>,
1428    require_sentence_capital: bool,
1429    current_line: &mut String,
1430    lines: &mut Vec<String>,
1431) {
1432    // Split the emphasis content into sentences
1433    let sentences = split_into_sentences_with_set(content, abbreviations, require_sentence_capital);
1434
1435    if sentences.len() <= 1 {
1436        // Single sentence or no boundaries - treat as atomic
1437        if !current_line.is_empty()
1438            && !current_line.ends_with(' ')
1439            && !current_line.ends_with('(')
1440            && !current_line.ends_with('[')
1441        {
1442            current_line.push(' ');
1443        }
1444        current_line.push_str(marker);
1445        current_line.push_str(content);
1446        current_line.push_str(marker);
1447
1448        // Check if the emphasis content ends with sentence punctuation - if so, emit
1449        let trimmed = content.trim();
1450        let ends_with_punct = ends_with_sentence_punct(trimmed);
1451        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1452            lines.push(current_line.clone());
1453            current_line.clear();
1454        }
1455    } else {
1456        // Multiple sentences - each gets its own emphasis markers
1457        for (i, sentence) in sentences.iter().enumerate() {
1458            let trimmed = sentence.trim();
1459            if trimmed.is_empty() {
1460                continue;
1461            }
1462
1463            if i == 0 {
1464                // First sentence: combine with current_line and emit
1465                if !current_line.is_empty()
1466                    && !current_line.ends_with(' ')
1467                    && !current_line.ends_with('(')
1468                    && !current_line.ends_with('[')
1469                {
1470                    current_line.push(' ');
1471                }
1472                current_line.push_str(marker);
1473                current_line.push_str(trimmed);
1474                current_line.push_str(marker);
1475
1476                // Check if this is a complete sentence
1477                let ends_with_punct = ends_with_sentence_punct(trimmed);
1478                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1479                    lines.push(current_line.clone());
1480                    current_line.clear();
1481                }
1482            } else if i == sentences.len() - 1 {
1483                // Last sentence: check if complete
1484                let ends_with_punct = ends_with_sentence_punct(trimmed);
1485
1486                let mut line = String::new();
1487                line.push_str(marker);
1488                line.push_str(trimmed);
1489                line.push_str(marker);
1490
1491                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1492                    lines.push(line);
1493                } else {
1494                    // Incomplete - keep in current_line for potential continuation
1495                    *current_line = line;
1496                }
1497            } else {
1498                // Middle sentences: emit with markers
1499                let mut line = String::new();
1500                line.push_str(marker);
1501                line.push_str(trimmed);
1502                line.push_str(marker);
1503                lines.push(line);
1504            }
1505        }
1506    }
1507}
1508
/// English break-words used for semantic line break splitting.
/// These are conjunctions, relative pronouns and conjunctive adverbs where
/// a line break reads naturally; the break is placed *before* the word.
///
/// Entries must be lowercase: `split_at_break_word` searches for them in a
/// lowercased copy of the line, which makes the match case-insensitive.
const BREAK_WORDS: &[&str] = &[
    "and",
    "or",
    "but",
    "nor",
    "yet",
    "so",
    "for",
    "which",
    "that",
    "because",
    "when",
    "if",
    "while",
    "where",
    "although",
    "though",
    "unless",
    "since",
    "after",
    "before",
    "until",
    "as",
    "once",
    "whether",
    "however",
    "therefore",
    "moreover",
    "furthermore",
    "nevertheless",
    "whereas",
];
1544
/// Report whether `c` is one of the clause-level punctuation marks that the
/// semantic line break logic may split after: comma, semicolon, colon or
/// em dash (U+2014).
fn is_clause_punctuation(c: char) -> bool {
    const CLAUSE_MARKS: [char; 4] = [',', ';', ':', '\u{2014}'];
    CLAUSE_MARKS.contains(&c)
}
1549
1550/// Compute element spans for a flat text representation of elements.
1551/// Returns Vec of (start, end) byte offsets for non-Text elements,
1552/// so we can check that a split position doesn't fall inside them.
1553fn compute_element_spans(elements: &[Element]) -> Vec<(usize, usize)> {
1554    let mut spans = Vec::new();
1555    let mut offset = 0;
1556    for element in elements {
1557        let rendered = format!("{element}");
1558        let len = rendered.len();
1559        if !matches!(element, Element::Text(_)) {
1560            spans.push((offset, offset + len));
1561        }
1562        offset += len;
1563    }
1564    spans
1565}
1566
/// Report whether byte position `pos` lies strictly inside any of the given
/// element spans. Both span boundaries are excluded, so a position equal to
/// a span's start or end counts as outside.
fn is_inside_element(pos: usize, spans: &[(usize, usize)]) -> bool {
    for &(start, end) in spans {
        if start < pos && pos < end {
            return true;
        }
    }
    false
}
1571
/// Minimum fraction of line_length that the first part of a split must occupy.
/// Prevents awkwardly short first lines like "A," or "Note:" on their own.
/// The same fraction is reused by `reflow_elements_semantic` as the threshold
/// below which a short line is merged back into the previous one.
const MIN_SPLIT_RATIO: f64 = 0.3;
1575
1576/// Split a line at the latest clause punctuation that keeps the first part
1577/// within `line_length`. Returns None if no valid split point exists or if
1578/// the split would create an unreasonably short first line.
1579fn split_at_clause_punctuation(
1580    text: &str,
1581    line_length: usize,
1582    element_spans: &[(usize, usize)],
1583    length_mode: ReflowLengthMode,
1584) -> Option<(String, String)> {
1585    let chars: Vec<char> = text.chars().collect();
1586    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1587
1588    // Find the char index where accumulated display width exceeds line_length
1589    let mut width_acc = 0;
1590    let mut search_end_char = 0;
1591    for (idx, &c) in chars.iter().enumerate() {
1592        let c_width = display_len(&c.to_string(), length_mode);
1593        if width_acc + c_width > line_length {
1594            break;
1595        }
1596        width_acc += c_width;
1597        search_end_char = idx + 1;
1598    }
1599
1600    let mut best_pos = None;
1601    for i in (0..search_end_char).rev() {
1602        if is_clause_punctuation(chars[i]) {
1603            // Convert char position to byte position for element span check
1604            let byte_pos: usize = chars[..=i].iter().map(|c| c.len_utf8()).sum();
1605            if !is_inside_element(byte_pos, element_spans) {
1606                best_pos = Some(i);
1607                break;
1608            }
1609        }
1610    }
1611
1612    let pos = best_pos?;
1613
1614    // Reject splits that create very short first lines
1615    let first: String = chars[..=pos].iter().collect();
1616    let first_display_len = display_len(&first, length_mode);
1617    if first_display_len < min_first_len {
1618        return None;
1619    }
1620
1621    // Split after the punctuation character
1622    let rest: String = chars[pos + 1..].iter().collect();
1623    let rest = rest.trim_start().to_string();
1624
1625    if rest.is_empty() {
1626        return None;
1627    }
1628
1629    Some((first, rest))
1630}
1631
1632/// Split a line before the latest break-word that keeps the first part
1633/// within `line_length`. Returns None if no valid split point exists or if
1634/// the split would create an unreasonably short first line.
1635fn split_at_break_word(
1636    text: &str,
1637    line_length: usize,
1638    element_spans: &[(usize, usize)],
1639    length_mode: ReflowLengthMode,
1640) -> Option<(String, String)> {
1641    let lower = text.to_lowercase();
1642    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1643    let mut best_split: Option<(usize, usize)> = None; // (byte_start, word_len_bytes)
1644
1645    for &word in BREAK_WORDS {
1646        let mut search_start = 0;
1647        while let Some(pos) = lower[search_start..].find(word) {
1648            let abs_pos = search_start + pos;
1649
1650            // Verify it's a word boundary: preceded by space, followed by space
1651            let preceded_by_space = abs_pos == 0 || text.as_bytes().get(abs_pos - 1) == Some(&b' ');
1652            let followed_by_space = text.as_bytes().get(abs_pos + word.len()) == Some(&b' ');
1653
1654            if preceded_by_space && followed_by_space {
1655                // The break goes BEFORE the word, so first part ends at abs_pos - 1
1656                let first_part = text[..abs_pos].trim_end();
1657                let first_part_len = display_len(first_part, length_mode);
1658
1659                if first_part_len >= min_first_len
1660                    && first_part_len <= line_length
1661                    && !is_inside_element(abs_pos, element_spans)
1662                {
1663                    // Prefer the latest valid split point
1664                    if best_split.is_none_or(|(prev_pos, _)| abs_pos > prev_pos) {
1665                        best_split = Some((abs_pos, word.len()));
1666                    }
1667                }
1668            }
1669
1670            search_start = abs_pos + word.len();
1671        }
1672    }
1673
1674    let (byte_start, _word_len) = best_split?;
1675
1676    let first = text[..byte_start].trim_end().to_string();
1677    let rest = text[byte_start..].to_string();
1678
1679    if first.is_empty() || rest.trim().is_empty() {
1680        return None;
1681    }
1682
1683    Some((first, rest))
1684}
1685
1686/// Recursively cascade-split a line that exceeds line_length.
1687/// Tries clause punctuation first, then break-words, then word wrap.
1688fn cascade_split_line(
1689    text: &str,
1690    line_length: usize,
1691    abbreviations: &Option<Vec<String>>,
1692    length_mode: ReflowLengthMode,
1693    attr_lists: bool,
1694) -> Vec<String> {
1695    if line_length == 0 || display_len(text, length_mode) <= line_length {
1696        return vec![text.to_string()];
1697    }
1698
1699    let elements = parse_markdown_elements_inner(text, attr_lists);
1700    let element_spans = compute_element_spans(&elements);
1701
1702    // Try clause punctuation split
1703    if let Some((first, rest)) = split_at_clause_punctuation(text, line_length, &element_spans, length_mode) {
1704        let mut result = vec![first];
1705        result.extend(cascade_split_line(
1706            &rest,
1707            line_length,
1708            abbreviations,
1709            length_mode,
1710            attr_lists,
1711        ));
1712        return result;
1713    }
1714
1715    // Try break-word split
1716    if let Some((first, rest)) = split_at_break_word(text, line_length, &element_spans, length_mode) {
1717        let mut result = vec![first];
1718        result.extend(cascade_split_line(
1719            &rest,
1720            line_length,
1721            abbreviations,
1722            length_mode,
1723            attr_lists,
1724        ));
1725        return result;
1726    }
1727
1728    // Fallback: word wrap using existing reflow_elements
1729    let options = ReflowOptions {
1730        line_length,
1731        break_on_sentences: false,
1732        preserve_breaks: false,
1733        sentence_per_line: false,
1734        semantic_line_breaks: false,
1735        abbreviations: abbreviations.clone(),
1736        length_mode,
1737        attr_lists,
1738        require_sentence_capital: true,
1739    };
1740    reflow_elements(&elements, &options)
1741}
1742
1743/// Reflow elements using semantic line breaks strategy:
1744/// 1. Split at sentence boundaries (always)
1745/// 2. For lines exceeding line_length, cascade through clause punct → break-words → word wrap
1746fn reflow_elements_semantic(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1747    // Step 1: Split into sentences using existing sentence-per-line logic
1748    let sentence_lines =
1749        reflow_elements_sentence_per_line(elements, &options.abbreviations, options.require_sentence_capital);
1750
1751    // Step 2: For each sentence line, apply cascading splits if it exceeds line_length
1752    // When line_length is 0 (unlimited), skip cascading — sentence splits only
1753    if options.line_length == 0 {
1754        return sentence_lines;
1755    }
1756
1757    let length_mode = options.length_mode;
1758    let mut result = Vec::new();
1759    for line in sentence_lines {
1760        if display_len(&line, length_mode) <= options.line_length {
1761            result.push(line);
1762        } else {
1763            result.extend(cascade_split_line(
1764                &line,
1765                options.line_length,
1766                &options.abbreviations,
1767                length_mode,
1768                options.attr_lists,
1769            ));
1770        }
1771    }
1772
1773    // Step 3: Merge very short trailing lines back into the previous line.
1774    // Word wrap can produce lines like "was" or "see" on their own, which reads poorly.
1775    let min_line_len = ((options.line_length as f64) * MIN_SPLIT_RATIO) as usize;
1776    let mut merged: Vec<String> = Vec::with_capacity(result.len());
1777    for line in result {
1778        if !merged.is_empty() && display_len(&line, length_mode) < min_line_len && !line.trim().is_empty() {
1779            // Don't merge across sentence boundaries — sentence splits are intentional
1780            let prev_ends_at_sentence = {
1781                let trimmed = merged.last().unwrap().trim_end();
1782                trimmed
1783                    .chars()
1784                    .rev()
1785                    .find(|c| !matches!(c, '"' | '\'' | '\u{201D}' | '\u{2019}' | ')' | ']'))
1786                    .is_some_and(|c| matches!(c, '.' | '!' | '?'))
1787            };
1788
1789            if !prev_ends_at_sentence {
1790                let prev = merged.last_mut().unwrap();
1791                let combined = format!("{prev} {line}");
1792                // Only merge if the combined line fits within the limit
1793                if display_len(&combined, length_mode) <= options.line_length {
1794                    *prev = combined;
1795                    continue;
1796                }
1797            }
1798        }
1799        merged.push(line);
1800    }
1801    merged
1802}
1803
/// Locate the byte offset of the last ASCII space in `line` that is safe to
/// split at, i.e. one that does not fall inside a rendered non-Text element.
///
/// `element_spans` holds the `(start, end)` byte ranges of those elements.
/// Only positions strictly between `start` and `end` are protected: element
/// delimiters (e.g., `[`, `]`, `(`, `)`, `<`, `>`, `` ` ``) are never
/// spaces, so the boundaries themselves need no protection.
fn rfind_safe_space(line: &str, element_spans: &[(usize, usize)]) -> Option<usize> {
    let is_protected = |pos: usize| element_spans.iter().any(|&(start, end)| start < pos && pos < end);

    line.rmatch_indices(' ')
        .map(|(pos, _)| pos)
        .find(|&pos| !is_protected(pos))
}
1817
1818/// Reflow elements into lines that fit within the line length
1819fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1820    let mut lines = Vec::new();
1821    let mut current_line = String::new();
1822    let mut current_length = 0;
1823    // Track byte spans of non-Text elements in current_line for safe splitting
1824    let mut current_line_element_spans: Vec<(usize, usize)> = Vec::new();
1825    let length_mode = options.length_mode;
1826
1827    for (idx, element) in elements.iter().enumerate() {
1828        let element_str = format!("{element}");
1829        let element_len = element.display_width(length_mode);
1830
1831        // Determine adjacency from the original elements, not from current_line.
1832        // Elements are adjacent when there's no whitespace between them in the source:
1833        // - Text("v") → HugoShortcode("{{<...>}}") = adjacent (text has no trailing space)
1834        // - Text(" and ") → InlineLink("[a](url)") = NOT adjacent (text has trailing space)
1835        // - HugoShortcode("{{<...>}}") → Text(",") = adjacent (text has no leading space)
1836        let is_adjacent_to_prev = if idx > 0 {
1837            match (&elements[idx - 1], element) {
1838                (Element::Text(t), _) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1839                (_, Element::Text(t)) => !t.is_empty() && !t.starts_with(char::is_whitespace),
1840                _ => true,
1841            }
1842        } else {
1843            false
1844        };
1845
1846        // For text elements that might need breaking
1847        if let Element::Text(text) = element {
1848            // Check if original text had leading whitespace
1849            let has_leading_space = text.starts_with(char::is_whitespace);
1850            // If this is a text element, always process it word by word
1851            let words: Vec<&str> = text.split_whitespace().collect();
1852
1853            for (i, word) in words.iter().enumerate() {
1854                let word_len = display_len(word, length_mode);
1855                // Check if this "word" is just punctuation that should stay attached
1856                let is_trailing_punct = word
1857                    .chars()
1858                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1859
1860                // First word of text adjacent to preceding non-text element
1861                // must stay attached (e.g., shortcode followed by punctuation or text)
1862                let is_first_adjacent = i == 0 && is_adjacent_to_prev;
1863
1864                if is_first_adjacent {
1865                    // Attach directly without space, preventing line break
1866                    if current_length + word_len > options.line_length && current_length > 0 {
1867                        // Would exceed — break before the adjacent group
1868                        // Use element-aware space search to avoid splitting inside links/code/etc.
1869                        if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1870                            let before = current_line[..last_space].trim_end().to_string();
1871                            let after = current_line[last_space + 1..].to_string();
1872                            lines.push(before);
1873                            current_line = format!("{after}{word}");
1874                            current_length = display_len(&current_line, length_mode);
1875                            current_line_element_spans.clear();
1876                        } else {
1877                            current_line.push_str(word);
1878                            current_length += word_len;
1879                        }
1880                    } else {
1881                        current_line.push_str(word);
1882                        current_length += word_len;
1883                    }
1884                } else if current_length > 0
1885                    && current_length + 1 + word_len > options.line_length
1886                    && !is_trailing_punct
1887                {
1888                    // Start a new line (but never for trailing punctuation)
1889                    lines.push(current_line.trim().to_string());
1890                    current_line = word.to_string();
1891                    current_length = word_len;
1892                    current_line_element_spans.clear();
1893                } else {
1894                    // Add word to current line
1895                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1896                    // AND this isn't trailing punctuation (which attaches directly)
1897                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1898                        current_line.push(' ');
1899                        current_length += 1;
1900                    }
1901                    current_line.push_str(word);
1902                    current_length += word_len;
1903                }
1904            }
1905        } else if matches!(
1906            element,
1907            Element::Italic { .. } | Element::Bold { .. } | Element::Strikethrough(_)
1908        ) && element_len > options.line_length
1909        {
1910            // Italic, bold, and strikethrough with content longer than line_length need word wrapping.
1911            // Split content word-by-word, attach the opening marker to the first word
1912            // and the closing marker to the last word.
1913            let (content, marker): (&str, &str) = match element {
1914                Element::Italic { content, underscore } => (content.as_str(), if *underscore { "_" } else { "*" }),
1915                Element::Bold { content, underscore } => (content.as_str(), if *underscore { "__" } else { "**" }),
1916                Element::Strikethrough(content) => (content.as_str(), "~~"),
1917                _ => unreachable!(),
1918            };
1919
1920            let words: Vec<&str> = content.split_whitespace().collect();
1921            let n = words.len();
1922
1923            if n == 0 {
1924                // Empty span — treat as atomic
1925                let full = format!("{marker}{marker}");
1926                let full_len = display_len(&full, length_mode);
1927                if !is_adjacent_to_prev && current_length > 0 {
1928                    current_line.push(' ');
1929                    current_length += 1;
1930                }
1931                current_line.push_str(&full);
1932                current_length += full_len;
1933            } else {
1934                for (i, word) in words.iter().enumerate() {
1935                    let is_first = i == 0;
1936                    let is_last = i == n - 1;
1937                    let word_str: String = match (is_first, is_last) {
1938                        (true, true) => format!("{marker}{word}{marker}"),
1939                        (true, false) => format!("{marker}{word}"),
1940                        (false, true) => format!("{word}{marker}"),
1941                        (false, false) => word.to_string(),
1942                    };
1943                    let word_len = display_len(&word_str, length_mode);
1944
1945                    let needs_space = if is_first {
1946                        !is_adjacent_to_prev && current_length > 0
1947                    } else {
1948                        current_length > 0
1949                    };
1950
1951                    if needs_space && current_length + 1 + word_len > options.line_length {
1952                        lines.push(current_line.trim_end().to_string());
1953                        current_line = word_str;
1954                        current_length = word_len;
1955                        current_line_element_spans.clear();
1956                    } else {
1957                        if needs_space {
1958                            current_line.push(' ');
1959                            current_length += 1;
1960                        }
1961                        current_line.push_str(&word_str);
1962                        current_length += word_len;
1963                    }
1964                }
1965            }
1966        } else {
1967            // For non-text elements (code, links, references), treat as atomic units
1968            // These should never be broken across lines
1969
1970            if is_adjacent_to_prev {
1971                // Adjacent to preceding text — attach directly without space
1972                if current_length + element_len > options.line_length {
1973                    // Would exceed limit — break before the adjacent word group
1974                    // Use element-aware space search to avoid splitting inside links/code/etc.
1975                    if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1976                        let before = current_line[..last_space].trim_end().to_string();
1977                        let after = current_line[last_space + 1..].to_string();
1978                        lines.push(before);
1979                        current_line = format!("{after}{element_str}");
1980                        current_length = display_len(&current_line, length_mode);
1981                        current_line_element_spans.clear();
1982                        // Record the element span in the new current_line
1983                        let start = after.len();
1984                        current_line_element_spans.push((start, start + element_str.len()));
1985                    } else {
1986                        // No safe space to break at — accept the long line
1987                        let start = current_line.len();
1988                        current_line.push_str(&element_str);
1989                        current_length += element_len;
1990                        current_line_element_spans.push((start, current_line.len()));
1991                    }
1992                } else {
1993                    let start = current_line.len();
1994                    current_line.push_str(&element_str);
1995                    current_length += element_len;
1996                    current_line_element_spans.push((start, current_line.len()));
1997                }
1998            } else if current_length > 0 && current_length + 1 + element_len > options.line_length {
1999                // Not adjacent, would exceed — start new line
2000                lines.push(current_line.trim().to_string());
2001                current_line = element_str.clone();
2002                current_length = element_len;
2003                current_line_element_spans.clear();
2004                current_line_element_spans.push((0, element_str.len()));
2005            } else {
2006                // Not adjacent, fits — add with space
2007                let ends_with_opener =
2008                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
2009                if current_length > 0 && !ends_with_opener {
2010                    current_line.push(' ');
2011                    current_length += 1;
2012                }
2013                let start = current_line.len();
2014                current_line.push_str(&element_str);
2015                current_length += element_len;
2016                current_line_element_spans.push((start, current_line.len()));
2017            }
2018        }
2019    }
2020
2021    // Don't forget the last line
2022    if !current_line.is_empty() {
2023        lines.push(current_line.trim_end().to_string());
2024    }
2025
2026    lines
2027}
2028
2029/// Reflow markdown content preserving structure
2030pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
2031    let lines: Vec<&str> = content.lines().collect();
2032    let mut result = Vec::new();
2033    let mut i = 0;
2034
2035    while i < lines.len() {
2036        let line = lines[i];
2037        let trimmed = line.trim();
2038
2039        // Preserve empty lines
2040        if trimmed.is_empty() {
2041            result.push(String::new());
2042            i += 1;
2043            continue;
2044        }
2045
2046        // Preserve headings as-is
2047        if trimmed.starts_with('#') {
2048            result.push(line.to_string());
2049            i += 1;
2050            continue;
2051        }
2052
2053        // Preserve Quarto/Pandoc div markers (:::) as-is
2054        if trimmed.starts_with(":::") {
2055            result.push(line.to_string());
2056            i += 1;
2057            continue;
2058        }
2059
2060        // Preserve fenced code blocks
2061        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2062            result.push(line.to_string());
2063            i += 1;
2064            // Copy lines until closing fence
2065            while i < lines.len() {
2066                result.push(lines[i].to_string());
2067                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
2068                    i += 1;
2069                    break;
2070                }
2071                i += 1;
2072            }
2073            continue;
2074        }
2075
2076        // Preserve indented code blocks (4+ columns accounting for tab expansion)
2077        if ElementCache::calculate_indentation_width_default(line) >= 4 {
2078            // Collect all consecutive indented lines
2079            result.push(line.to_string());
2080            i += 1;
2081            while i < lines.len() {
2082                let next_line = lines[i];
2083                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
2084                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
2085                    result.push(next_line.to_string());
2086                    i += 1;
2087                } else {
2088                    break;
2089                }
2090            }
2091            continue;
2092        }
2093
2094        // Preserve block quotes (but reflow their content)
2095        if trimmed.starts_with('>') {
2096            // find() returns byte position which is correct for str slicing
2097            // The unwrap is safe because we already verified trimmed starts with '>'
2098            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
2099            let quote_prefix = line[0..gt_pos + 1].to_string();
2100            let quote_content = &line[quote_prefix.len()..].trim_start();
2101
2102            let reflowed = reflow_line(quote_content, options);
2103            for reflowed_line in reflowed.iter() {
2104                result.push(format!("{quote_prefix} {reflowed_line}"));
2105            }
2106            i += 1;
2107            continue;
2108        }
2109
2110        // Preserve horizontal rules first (before checking for lists)
2111        if is_horizontal_rule(trimmed) {
2112            result.push(line.to_string());
2113            i += 1;
2114            continue;
2115        }
2116
2117        // Preserve lists (but not horizontal rules)
2118        if is_unordered_list_marker(trimmed) || is_numbered_list_item(trimmed) {
2119            // Find the list marker and preserve indentation
2120            let indent = line.len() - line.trim_start().len();
2121            let indent_str = " ".repeat(indent);
2122
2123            // For numbered lists, find the period and the space after it
2124            // For bullet lists, find the marker and the space after it
2125            let mut marker_end = indent;
2126            let mut content_start = indent;
2127
2128            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
2129                // Numbered list: find the period
2130                if let Some(period_pos) = line[indent..].find('.') {
2131                    marker_end = indent + period_pos + 1; // Include the period
2132                    content_start = marker_end;
2133                    // Skip any spaces after the period to find content start
2134                    // Use byte-based check since content_start is a byte index
2135                    // This is safe because space is ASCII (single byte)
2136                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2137                        content_start += 1;
2138                    }
2139                }
2140            } else {
2141                // Bullet list: marker is single character
2142                marker_end = indent + 1; // Just the marker character
2143                content_start = marker_end;
2144                // Skip any spaces after the marker
2145                // Use byte-based check since content_start is a byte index
2146                // This is safe because space is ASCII (single byte)
2147                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2148                    content_start += 1;
2149                }
2150            }
2151
2152            let marker = &line[indent..marker_end];
2153
2154            // Collect all content for this list item (including continuation lines)
2155            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
2156            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
2157            i += 1;
2158
2159            // Collect continuation lines (indented lines that are part of this list item)
2160            while i < lines.len() {
2161                let next_line = lines[i];
2162                let next_trimmed = next_line.trim();
2163
2164                // Stop if we hit an empty line or another list item or special block
2165                if is_block_boundary(next_trimmed) {
2166                    break;
2167                }
2168
2169                // Check if this line is indented (continuation of list item)
2170                let next_indent = next_line.len() - next_line.trim_start().len();
2171                if next_indent >= content_start {
2172                    // This is a continuation line - add its content
2173                    // Preserve hard breaks while trimming excessive whitespace
2174                    let trimmed_start = next_line.trim_start();
2175                    list_content.push(trim_preserving_hard_break(trimmed_start));
2176                    i += 1;
2177                } else {
2178                    // Not indented enough, not part of this list item
2179                    break;
2180                }
2181            }
2182
2183            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
2184            // Hard breaks should prevent joining with the next line
2185            let combined_content = if options.preserve_breaks {
2186                list_content[0].clone()
2187            } else {
2188                // Check if any lines have hard breaks - if so, preserve the structure
2189                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
2190                if has_hard_breaks {
2191                    // Don't join lines with hard breaks - keep them separate with newlines
2192                    list_content.join("\n")
2193                } else {
2194                    // No hard breaks, safe to join with spaces
2195                    list_content.join(" ")
2196                }
2197            };
2198
2199            // Calculate the proper indentation for continuation lines
2200            let trimmed_marker = marker;
2201            let continuation_spaces = content_start;
2202
2203            // Adjust line length to account for list marker and space
2204            let prefix_length = indent + trimmed_marker.len() + 1;
2205
2206            // Create adjusted options with reduced line length
2207            let adjusted_options = ReflowOptions {
2208                line_length: options.line_length.saturating_sub(prefix_length),
2209                ..options.clone()
2210            };
2211
2212            let reflowed = reflow_line(&combined_content, &adjusted_options);
2213            for (j, reflowed_line) in reflowed.iter().enumerate() {
2214                if j == 0 {
2215                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
2216                } else {
2217                    // Continuation lines aligned with text after marker
2218                    let continuation_indent = " ".repeat(continuation_spaces);
2219                    result.push(format!("{continuation_indent}{reflowed_line}"));
2220                }
2221            }
2222            continue;
2223        }
2224
2225        // Preserve tables
2226        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
2227            result.push(line.to_string());
2228            i += 1;
2229            continue;
2230        }
2231
2232        // Preserve reference definitions
2233        if trimmed.starts_with('[') && line.contains("]:") {
2234            result.push(line.to_string());
2235            i += 1;
2236            continue;
2237        }
2238
2239        // Preserve definition list items (extended markdown)
2240        if is_definition_list_item(trimmed) {
2241            result.push(line.to_string());
2242            i += 1;
2243            continue;
2244        }
2245
2246        // Check if this is a single line that doesn't need processing
2247        let mut is_single_line_paragraph = true;
2248        if i + 1 < lines.len() {
2249            let next_trimmed = lines[i + 1].trim();
2250            // Check if next line continues this paragraph
2251            if !is_block_boundary(next_trimmed) {
2252                is_single_line_paragraph = false;
2253            }
2254        }
2255
2256        // If it's a single line that fits, just add it as-is
2257        if is_single_line_paragraph && display_len(line, options.length_mode) <= options.line_length {
2258            result.push(line.to_string());
2259            i += 1;
2260            continue;
2261        }
2262
2263        // For regular paragraphs, collect consecutive lines
2264        let mut paragraph_parts = Vec::new();
2265        let mut current_part = vec![line];
2266        i += 1;
2267
2268        // If preserve_breaks is true, treat each line separately
2269        if options.preserve_breaks {
2270            // Don't collect consecutive lines - just reflow this single line
2271            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
2272                Some("\\")
2273            } else if line.ends_with("  ") {
2274                Some("  ")
2275            } else {
2276                None
2277            };
2278            let reflowed = reflow_line(line, options);
2279
2280            // Preserve hard breaks (two trailing spaces or backslash)
2281            if let Some(break_marker) = hard_break_type {
2282                if !reflowed.is_empty() {
2283                    let mut reflowed_with_break = reflowed;
2284                    let last_idx = reflowed_with_break.len() - 1;
2285                    if !has_hard_break(&reflowed_with_break[last_idx]) {
2286                        reflowed_with_break[last_idx].push_str(break_marker);
2287                    }
2288                    result.extend(reflowed_with_break);
2289                }
2290            } else {
2291                result.extend(reflowed);
2292            }
2293        } else {
2294            // Original behavior: collect consecutive lines into a paragraph
2295            while i < lines.len() {
2296                let prev_line = if !current_part.is_empty() {
2297                    current_part.last().unwrap()
2298                } else {
2299                    ""
2300                };
2301                let next_line = lines[i];
2302                let next_trimmed = next_line.trim();
2303
2304                // Stop at empty lines or special blocks
2305                if is_block_boundary(next_trimmed) {
2306                    break;
2307                }
2308
2309                // Check if previous line ends with hard break (two spaces or backslash)
2310                // or is a complete sentence in sentence_per_line mode
2311                let prev_trimmed = prev_line.trim();
2312                let abbreviations = get_abbreviations(&options.abbreviations);
2313                let ends_with_sentence = (prev_trimmed.ends_with('.')
2314                    || prev_trimmed.ends_with('!')
2315                    || prev_trimmed.ends_with('?')
2316                    || prev_trimmed.ends_with(".*")
2317                    || prev_trimmed.ends_with("!*")
2318                    || prev_trimmed.ends_with("?*")
2319                    || prev_trimmed.ends_with("._")
2320                    || prev_trimmed.ends_with("!_")
2321                    || prev_trimmed.ends_with("?_")
2322                    // Quote-terminated sentences (straight and curly quotes)
2323                    || prev_trimmed.ends_with(".\"")
2324                    || prev_trimmed.ends_with("!\"")
2325                    || prev_trimmed.ends_with("?\"")
2326                    || prev_trimmed.ends_with(".'")
2327                    || prev_trimmed.ends_with("!'")
2328                    || prev_trimmed.ends_with("?'")
2329                    || prev_trimmed.ends_with(".\u{201D}")
2330                    || prev_trimmed.ends_with("!\u{201D}")
2331                    || prev_trimmed.ends_with("?\u{201D}")
2332                    || prev_trimmed.ends_with(".\u{2019}")
2333                    || prev_trimmed.ends_with("!\u{2019}")
2334                    || prev_trimmed.ends_with("?\u{2019}"))
2335                    && !text_ends_with_abbreviation(
2336                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
2337                        &abbreviations,
2338                    );
2339
2340                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
2341                    // Start a new part after hard break or complete sentence
2342                    paragraph_parts.push(current_part.join(" "));
2343                    current_part = vec![next_line];
2344                } else {
2345                    current_part.push(next_line);
2346                }
2347                i += 1;
2348            }
2349
2350            // Add the last part
2351            if !current_part.is_empty() {
2352                if current_part.len() == 1 {
2353                    // Single line, don't add trailing space
2354                    paragraph_parts.push(current_part[0].to_string());
2355                } else {
2356                    paragraph_parts.push(current_part.join(" "));
2357                }
2358            }
2359
2360            // Reflow each part separately, preserving hard breaks
2361            for (j, part) in paragraph_parts.iter().enumerate() {
2362                let reflowed = reflow_line(part, options);
2363                result.extend(reflowed);
2364
2365                // Preserve hard break by ensuring last line of part ends with hard break marker
2366                // Use two spaces as the default hard break format for reflows
2367                // But don't add hard breaks in sentence_per_line mode - lines are already separate
2368                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
2369                    let last_idx = result.len() - 1;
2370                    if !has_hard_break(&result[last_idx]) {
2371                        result[last_idx].push_str("  ");
2372                    }
2373                }
2374            }
2375        }
2376    }
2377
2378    // Preserve trailing newline if the original content had one
2379    let result_text = result.join("\n");
2380    if content.ends_with('\n') && !result_text.ends_with('\n') {
2381        format!("{result_text}\n")
2382    } else {
2383        result_text
2384    }
2385}
2386
/// Information about a reflowed paragraph
///
/// Pairs the byte range a paragraph occupied in the original document with the
/// text that should replace that range. Offsets are byte positions into the
/// `content` string that was passed to the reflow function producing this value.
#[derive(Debug, Clone)]
pub struct ParagraphReflow {
    /// Starting byte offset of the paragraph in the original content
    /// (computed as the summed length of all lines preceding the paragraph).
    pub start_byte: usize,
    /// Ending byte offset of the paragraph in the original content.
    ///
    /// The producers in this module compute it as an exclusive bound: it also
    /// covers the newline that terminates the paragraph's last line, unless the
    /// paragraph is the final line of content that has no trailing newline.
    pub end_byte: usize,
    /// The reflowed text for this paragraph.
    ///
    /// The producers in this module make it end with `'\n'` exactly when the
    /// byte range above includes a trailing newline, so the replacement can be
    /// spliced in without merging with the following line.
    pub reflowed_text: String,
}
2397
/// One line of a blockquote paragraph, captured for style-preserving reflow.
///
/// A line is either *explicit* (it carried its own blockquote marker, recorded in
/// `prefix`) or *lazy* (a continuation line without a marker, so `prefix` is
/// `None`). The constructors keep `is_explicit` and `prefix.is_some()` in
/// lock-step; build values through [`BlockquoteLineData::explicit`] or
/// [`BlockquoteLineData::lazy`] rather than writing the struct literal by hand.
#[derive(Debug, Clone)]
pub struct BlockquoteLineData {
    /// Trimmed line text with the `> ` prefix already removed.
    pub(crate) content: String,
    /// `true` when the source line had an explicit blockquote marker.
    pub(crate) is_explicit: bool,
    /// The complete prefix of an explicit line (for example `"> "` or `"> > "`);
    /// `None` for lazy continuation lines.
    pub(crate) prefix: Option<String>,
}

impl BlockquoteLineData {
    /// Build a line that carried an explicit blockquote marker.
    pub fn explicit(content: String, prefix: String) -> Self {
        Self::from_parts(content, Some(prefix))
    }

    /// Build a lazy continuation line, i.e. one without a blockquote marker.
    pub fn lazy(content: String) -> Self {
        Self::from_parts(content, None)
    }

    /// Shared constructor: derives `is_explicit` from the presence of a prefix
    /// so the two fields cannot disagree.
    fn from_parts(content: String, prefix: Option<String>) -> Self {
        let is_explicit = prefix.is_some();
        Self {
            content,
            is_explicit,
            prefix,
        }
    }
}
2432
/// Style for blockquote continuation lines after reflow.
///
/// Selected per paragraph by `blockquote_continuation_style` and consumed by
/// `reflow_blockquote_content` when prefixes are re-applied to reflowed lines.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockquoteContinuationStyle {
    /// Every reflowed line is written with the explicit blockquote prefix.
    Explicit,
    /// Only the first reflowed line, and any line flagged by
    /// `should_force_explicit_blockquote_line`, gets the prefix; the remaining
    /// lines are emitted without one (lazy continuation).
    Lazy,
}
2439
2440/// Determine the continuation style for a blockquote paragraph from its collected lines.
2441///
2442/// The first line is always explicit (it carries the marker), so only continuation
2443/// lines (index 1+) are counted. Ties resolve to `Explicit`.
2444///
2445/// When the slice has only one element (no continuation lines to inspect), both
2446/// counts are zero and the tie-breaking rule returns `Explicit`.
2447pub fn blockquote_continuation_style(lines: &[BlockquoteLineData]) -> BlockquoteContinuationStyle {
2448    let mut explicit_count = 0usize;
2449    let mut lazy_count = 0usize;
2450
2451    for line in lines.iter().skip(1) {
2452        if line.is_explicit {
2453            explicit_count += 1;
2454        } else {
2455            lazy_count += 1;
2456        }
2457    }
2458
2459    if explicit_count > 0 && lazy_count == 0 {
2460        BlockquoteContinuationStyle::Explicit
2461    } else if lazy_count > 0 && explicit_count == 0 {
2462        BlockquoteContinuationStyle::Lazy
2463    } else if explicit_count >= lazy_count {
2464        BlockquoteContinuationStyle::Explicit
2465    } else {
2466        BlockquoteContinuationStyle::Lazy
2467    }
2468}
2469
2470/// Determine the dominant blockquote prefix for a paragraph.
2471///
2472/// The most frequently occurring explicit prefix wins. Ties are broken by earliest
2473/// first appearance. Falls back to `fallback` when no explicit lines are present.
2474pub fn dominant_blockquote_prefix(lines: &[BlockquoteLineData], fallback: &str) -> String {
2475    let mut counts: std::collections::HashMap<String, (usize, usize)> = std::collections::HashMap::new();
2476
2477    for (idx, line) in lines.iter().enumerate() {
2478        let Some(prefix) = line.prefix.as_ref() else {
2479            continue;
2480        };
2481        counts
2482            .entry(prefix.clone())
2483            .and_modify(|entry| entry.0 += 1)
2484            .or_insert((1, idx));
2485    }
2486
2487    counts
2488        .into_iter()
2489        .max_by(|(_, (count_a, first_idx_a)), (_, (count_b, first_idx_b))| {
2490            count_a.cmp(count_b).then_with(|| first_idx_b.cmp(first_idx_a))
2491        })
2492        .map(|(prefix, _)| prefix)
2493        .unwrap_or_else(|| fallback.to_string())
2494}
2495
2496/// Whether a reflowed blockquote content line must carry an explicit prefix.
2497///
2498/// Lines that would start a new block structure (headings, fences, lists, etc.)
2499/// cannot safely use lazy continuation syntax.
2500pub(crate) fn should_force_explicit_blockquote_line(content_line: &str) -> bool {
2501    let trimmed = content_line.trim_start();
2502    trimmed.starts_with('>')
2503        || trimmed.starts_with('#')
2504        || trimmed.starts_with("```")
2505        || trimmed.starts_with("~~~")
2506        || is_unordered_list_marker(trimmed)
2507        || is_numbered_list_item(trimmed)
2508        || is_horizontal_rule(trimmed)
2509        || is_definition_list_item(trimmed)
2510        || (trimmed.starts_with('[') && trimmed.contains("]:"))
2511        || trimmed.starts_with(":::")
2512        || (trimmed.starts_with('<')
2513            && !trimmed.starts_with("<http")
2514            && !trimmed.starts_with("<https")
2515            && !trimmed.starts_with("<mailto:"))
2516}
2517
2518/// Reflow blockquote content lines and apply continuation style.
2519///
2520/// Segments separated by hard breaks are reflowed independently. The output lines
2521/// receive blockquote prefixes according to `continuation_style`: the first line and
2522/// any line that would start a new block structure always get an explicit prefix;
2523/// other lines follow the detected style.
2524///
2525/// Returns the styled, reflowed lines (without a trailing newline).
2526pub fn reflow_blockquote_content(
2527    lines: &[BlockquoteLineData],
2528    explicit_prefix: &str,
2529    continuation_style: BlockquoteContinuationStyle,
2530    options: &ReflowOptions,
2531) -> Vec<String> {
2532    let content_strs: Vec<&str> = lines.iter().map(|l| l.content.as_str()).collect();
2533    let segments = split_into_segments_strs(&content_strs);
2534    let mut reflowed_content_lines: Vec<String> = Vec::new();
2535
2536    for segment in segments {
2537        let hard_break_type = segment.last().and_then(|&line| {
2538            let line = line.strip_suffix('\r').unwrap_or(line);
2539            if line.ends_with('\\') {
2540                Some("\\")
2541            } else if line.ends_with("  ") {
2542                Some("  ")
2543            } else {
2544                None
2545            }
2546        });
2547
2548        let pieces: Vec<&str> = segment
2549            .iter()
2550            .map(|&line| {
2551                if let Some(l) = line.strip_suffix('\\') {
2552                    l.trim_end()
2553                } else if let Some(l) = line.strip_suffix("  ") {
2554                    l.trim_end()
2555                } else {
2556                    line.trim_end()
2557                }
2558            })
2559            .collect();
2560
2561        let segment_text = pieces.join(" ");
2562        let segment_text = segment_text.trim();
2563        if segment_text.is_empty() {
2564            continue;
2565        }
2566
2567        let mut reflowed = reflow_line(segment_text, options);
2568        if let Some(break_marker) = hard_break_type
2569            && !reflowed.is_empty()
2570        {
2571            let last_idx = reflowed.len() - 1;
2572            if !has_hard_break(&reflowed[last_idx]) {
2573                reflowed[last_idx].push_str(break_marker);
2574            }
2575        }
2576        reflowed_content_lines.extend(reflowed);
2577    }
2578
2579    let mut styled_lines: Vec<String> = Vec::new();
2580    for (idx, line) in reflowed_content_lines.iter().enumerate() {
2581        let force_explicit = idx == 0
2582            || continuation_style == BlockquoteContinuationStyle::Explicit
2583            || should_force_explicit_blockquote_line(line);
2584        if force_explicit {
2585            styled_lines.push(format!("{explicit_prefix}{line}"));
2586        } else {
2587            styled_lines.push(line.clone());
2588        }
2589    }
2590
2591    styled_lines
2592}
2593
2594fn is_blockquote_content_boundary(content: &str) -> bool {
2595    let trimmed = content.trim();
2596    trimmed.is_empty()
2597        || is_block_boundary(trimmed)
2598        || crate::utils::table_utils::TableUtils::is_potential_table_row(content)
2599        || trimmed.starts_with(":::")
2600        || crate::utils::is_template_directive_only(content)
2601        || is_standalone_attr_list(content)
2602        || is_snippet_block_delimiter(content)
2603}
2604
2605fn split_into_segments_strs<'a>(lines: &[&'a str]) -> Vec<Vec<&'a str>> {
2606    let mut segments = Vec::new();
2607    let mut current = Vec::new();
2608
2609    for &line in lines {
2610        current.push(line);
2611        if has_hard_break(line) {
2612            segments.push(current);
2613            current = Vec::new();
2614        }
2615    }
2616
2617    if !current.is_empty() {
2618        segments.push(current);
2619    }
2620
2621    segments
2622}
2623
2624fn reflow_blockquote_paragraph_at_line(
2625    content: &str,
2626    lines: &[&str],
2627    target_idx: usize,
2628    options: &ReflowOptions,
2629) -> Option<ParagraphReflow> {
2630    let mut anchor_idx = target_idx;
2631    let mut target_level = if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[target_idx]) {
2632        parsed.nesting_level
2633    } else {
2634        let mut found = None;
2635        let mut idx = target_idx;
2636        loop {
2637            if lines[idx].trim().is_empty() {
2638                break;
2639            }
2640            if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[idx]) {
2641                found = Some((idx, parsed.nesting_level));
2642                break;
2643            }
2644            if idx == 0 {
2645                break;
2646            }
2647            idx -= 1;
2648        }
2649        let (idx, level) = found?;
2650        anchor_idx = idx;
2651        level
2652    };
2653
2654    // Expand backward to capture prior quote content at the same nesting level.
2655    let mut para_start = anchor_idx;
2656    while para_start > 0 {
2657        let prev_idx = para_start - 1;
2658        let prev_line = lines[prev_idx];
2659
2660        if prev_line.trim().is_empty() {
2661            break;
2662        }
2663
2664        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(prev_line) {
2665            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2666                break;
2667            }
2668            para_start = prev_idx;
2669            continue;
2670        }
2671
2672        let prev_lazy = prev_line.trim_start();
2673        if is_blockquote_content_boundary(prev_lazy) {
2674            break;
2675        }
2676        para_start = prev_idx;
2677    }
2678
2679    // Lazy continuation cannot precede the first explicit marker.
2680    while para_start < lines.len() {
2681        let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[para_start]) else {
2682            para_start += 1;
2683            continue;
2684        };
2685        target_level = parsed.nesting_level;
2686        break;
2687    }
2688
2689    if para_start >= lines.len() || para_start > target_idx {
2690        return None;
2691    }
2692
2693    // Collect explicit lines at target level and lazy continuation lines.
2694    // Each entry is (original_line_idx, BlockquoteLineData).
2695    let mut collected: Vec<(usize, BlockquoteLineData)> = Vec::new();
2696    let mut idx = para_start;
2697    while idx < lines.len() {
2698        if !collected.is_empty() && has_hard_break(&collected[collected.len() - 1].1.content) {
2699            break;
2700        }
2701
2702        let line = lines[idx];
2703        if line.trim().is_empty() {
2704            break;
2705        }
2706
2707        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(line) {
2708            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2709                break;
2710            }
2711            collected.push((
2712                idx,
2713                BlockquoteLineData::explicit(trim_preserving_hard_break(parsed.content), parsed.prefix.to_string()),
2714            ));
2715            idx += 1;
2716            continue;
2717        }
2718
2719        let lazy_content = line.trim_start();
2720        if is_blockquote_content_boundary(lazy_content) {
2721            break;
2722        }
2723
2724        collected.push((idx, BlockquoteLineData::lazy(trim_preserving_hard_break(lazy_content))));
2725        idx += 1;
2726    }
2727
2728    if collected.is_empty() {
2729        return None;
2730    }
2731
2732    let para_end = collected[collected.len() - 1].0;
2733    if target_idx < para_start || target_idx > para_end {
2734        return None;
2735    }
2736
2737    let line_data: Vec<BlockquoteLineData> = collected.iter().map(|(_, d)| d.clone()).collect();
2738
2739    let fallback_prefix = line_data
2740        .iter()
2741        .find_map(|d| d.prefix.clone())
2742        .unwrap_or_else(|| "> ".to_string());
2743    let explicit_prefix = dominant_blockquote_prefix(&line_data, &fallback_prefix);
2744    let continuation_style = blockquote_continuation_style(&line_data);
2745
2746    let adjusted_line_length = options
2747        .line_length
2748        .saturating_sub(display_len(&explicit_prefix, options.length_mode))
2749        .max(1);
2750
2751    let adjusted_options = ReflowOptions {
2752        line_length: adjusted_line_length,
2753        ..options.clone()
2754    };
2755
2756    let styled_lines = reflow_blockquote_content(&line_data, &explicit_prefix, continuation_style, &adjusted_options);
2757
2758    if styled_lines.is_empty() {
2759        return None;
2760    }
2761
2762    // Calculate byte offsets.
2763    let mut start_byte = 0;
2764    for line in lines.iter().take(para_start) {
2765        start_byte += line.len() + 1;
2766    }
2767
2768    let mut end_byte = start_byte;
2769    for line in lines.iter().take(para_end + 1).skip(para_start) {
2770        end_byte += line.len() + 1;
2771    }
2772
2773    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2774    if !includes_trailing_newline {
2775        end_byte -= 1;
2776    }
2777
2778    let reflowed_joined = styled_lines.join("\n");
2779    let reflowed_text = if includes_trailing_newline {
2780        if reflowed_joined.ends_with('\n') {
2781            reflowed_joined
2782        } else {
2783            format!("{reflowed_joined}\n")
2784        }
2785    } else if reflowed_joined.ends_with('\n') {
2786        reflowed_joined.trim_end_matches('\n').to_string()
2787    } else {
2788        reflowed_joined
2789    };
2790
2791    Some(ParagraphReflow {
2792        start_byte,
2793        end_byte,
2794        reflowed_text,
2795    })
2796}
2797
/// Reflow a single paragraph at the specified line number
///
/// This function finds the paragraph containing the given line number,
/// reflows it according to the specified line length, and returns
/// information about the paragraph location and its reflowed text.
///
/// It is a convenience wrapper around [`reflow_paragraph_at_line_with_mode`]
/// that measures line length with `ReflowLengthMode::default()`.
///
/// # Arguments
///
/// * `content` - The full document content
/// * `line_number` - The 1-based line number within the paragraph to reflow
/// * `line_length` - The target line length for reflowing
///
/// # Returns
///
/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
/// or `None` if the line number is out of bounds or the content at that
/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
    reflow_paragraph_at_line_with_mode(content, line_number, line_length, ReflowLengthMode::default())
}
2818
/// Reflow a paragraph at the given line with a specific length mode.
///
/// Builds a `ReflowOptions` from `line_length` and `length_mode`, leaving every
/// other option at its `Default` value, and delegates to
/// [`reflow_paragraph_at_line_with_options`].
///
/// # Arguments
///
/// * `content` - The full document content
/// * `line_number` - The 1-based line number within the paragraph to reflow
/// * `line_length` - The target line length for reflowing
/// * `length_mode` - How line length is measured (characters, visual width or bytes)
///
/// # Returns
///
/// Whatever [`reflow_paragraph_at_line_with_options`] returns for these options.
pub fn reflow_paragraph_at_line_with_mode(
    content: &str,
    line_number: usize,
    line_length: usize,
    length_mode: ReflowLengthMode,
) -> Option<ParagraphReflow> {
    let options = ReflowOptions {
        line_length,
        length_mode,
        // All remaining settings keep their defaults.
        ..Default::default()
    };
    reflow_paragraph_at_line_with_options(content, line_number, &options)
}
2833
2834/// Reflow a paragraph at the given line using the provided options.
2835///
2836/// This is the canonical implementation used by both the rule's fix mode and the
2837/// LSP "Reflow paragraph" action. Passing a fully configured `ReflowOptions` allows
2838/// the LSP action to respect user-configured reflow mode, abbreviations, etc.
2839///
2840/// # Returns
2841///
2842/// Returns `Some(ParagraphReflow)` with byte offsets and reflowed text, or `None`
2843/// if the line is out of bounds or sits inside a non-reflow-able construct.
2844pub fn reflow_paragraph_at_line_with_options(
2845    content: &str,
2846    line_number: usize,
2847    options: &ReflowOptions,
2848) -> Option<ParagraphReflow> {
2849    if line_number == 0 {
2850        return None;
2851    }
2852
2853    let lines: Vec<&str> = content.lines().collect();
2854
2855    // Check if line number is valid (1-based)
2856    if line_number > lines.len() {
2857        return None;
2858    }
2859
2860    let target_idx = line_number - 1; // Convert to 0-based
2861    let target_line = lines[target_idx];
2862    let trimmed = target_line.trim();
2863
2864    // Handle blockquote paragraphs (including lazy continuation lines) with
2865    // style-preserving output.
2866    if let Some(blockquote_reflow) = reflow_blockquote_paragraph_at_line(content, &lines, target_idx, options) {
2867        return Some(blockquote_reflow);
2868    }
2869
2870    // Don't reflow special blocks
2871    if is_paragraph_boundary(trimmed, target_line) {
2872        return None;
2873    }
2874
2875    // Find paragraph start - scan backward until blank line or special block
2876    let mut para_start = target_idx;
2877    while para_start > 0 {
2878        let prev_idx = para_start - 1;
2879        let prev_line = lines[prev_idx];
2880        let prev_trimmed = prev_line.trim();
2881
2882        // Stop at blank line or special blocks
2883        if is_paragraph_boundary(prev_trimmed, prev_line) {
2884            break;
2885        }
2886
2887        para_start = prev_idx;
2888    }
2889
2890    // Find paragraph end - scan forward until blank line or special block
2891    let mut para_end = target_idx;
2892    while para_end + 1 < lines.len() {
2893        let next_idx = para_end + 1;
2894        let next_line = lines[next_idx];
2895        let next_trimmed = next_line.trim();
2896
2897        // Stop at blank line or special blocks
2898        if is_paragraph_boundary(next_trimmed, next_line) {
2899            break;
2900        }
2901
2902        para_end = next_idx;
2903    }
2904
2905    // Extract paragraph lines
2906    let paragraph_lines = &lines[para_start..=para_end];
2907
2908    // Calculate byte offsets
2909    let mut start_byte = 0;
2910    for line in lines.iter().take(para_start) {
2911        start_byte += line.len() + 1; // +1 for newline
2912    }
2913
2914    let mut end_byte = start_byte;
2915    for line in paragraph_lines.iter() {
2916        end_byte += line.len() + 1; // +1 for newline
2917    }
2918
2919    // Track whether the byte range includes a trailing newline
2920    // (it doesn't if this is the last line and the file doesn't end with newline)
2921    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2922
2923    // Adjust end_byte if the last line doesn't have a newline
2924    if !includes_trailing_newline {
2925        end_byte -= 1;
2926    }
2927
2928    // Join paragraph lines and reflow
2929    let paragraph_text = paragraph_lines.join("\n");
2930
2931    // Reflow the paragraph using reflow_markdown to handle it properly
2932    let reflowed = reflow_markdown(&paragraph_text, options);
2933
2934    // Ensure reflowed text matches whether the byte range includes a trailing newline
2935    // This is critical: if the range includes a newline, the replacement must too,
2936    // otherwise the next line will get appended to the reflowed paragraph
2937    let reflowed_text = if includes_trailing_newline {
2938        // Range includes newline - ensure reflowed text has one
2939        if reflowed.ends_with('\n') {
2940            reflowed
2941        } else {
2942            format!("{reflowed}\n")
2943        }
2944    } else {
2945        // Range doesn't include newline - ensure reflowed text doesn't have one
2946        if reflowed.ends_with('\n') {
2947            reflowed.trim_end_matches('\n').to_string()
2948        } else {
2949            reflowed
2950        }
2951    };
2952
2953    Some(ParagraphReflow {
2954        start_byte,
2955        end_byte,
2956        reflowed_text,
2957    })
2958}
2959
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the `text_ends_with_abbreviation()` helper against the default
    /// abbreviation set.
    ///
    /// Kept inline because it targets a private helper; public API and
    /// integration tests live in tests/utils/text_reflow_test.rs.
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) followed by a period.
        let recognized = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognized {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Everything else: words outside the built-in list (etc doesn't always
        // have a period), wrong punctuation, or nothing to match at all.
        let rejected = [
            ("etc.", "not in built-in list"),
            ("paradigms.", "ordinary word"),
            ("programs.", "ordinary word"),
            ("items.", "ordinary word"),
            ("systems.", "ordinary word"),
            ("Dr?", "question mark, not period"),
            ("Mr!", "exclamation, not period"),
            ("paradigms?", "question mark"),
            ("word", "no punctuation"),
            ("", "empty string"),
        ];
        for (text, why) in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation ({why})"
            );
        }
    }

    /// Covers which line prefixes count as unordered list markers.
    #[test]
    fn test_is_unordered_list_marker() {
        // Marker followed by content, plus lone markers.
        let markers = ["- item", "* item", "+ item", "-", "*", "+"];
        for text in markers {
            assert!(is_unordered_list_marker(text), "expected list marker: {text:?}");
        }

        let non_markers = [
            ("---", "horizontal rule"),
            ("***", "horizontal rule"),
            ("- - -", "horizontal rule"),
            ("* * *", "horizontal rule"),
            ("*emphasis*", "emphasis, not list"),
            ("-word", "no space after marker"),
            ("", "empty"),
            ("text", "plain text"),
            ("# heading", "heading"),
        ];
        for (text, why) in non_markers {
            assert!(!is_unordered_list_marker(text), "expected no list marker: {text:?} ({why})");
        }
    }

    /// Covers which (trimmed) lines terminate a paragraph as block boundaries.
    #[test]
    fn test_is_block_boundary() {
        let boundaries = [
            ("", "empty line"),
            ("# Heading", "ATX heading"),
            ("## Level 2", "ATX heading"),
            ("```rust", "code fence"),
            ("~~~", "tilde code fence"),
            ("> quote", "blockquote"),
            ("| cell |", "table"),
            ("[link]: http://example.com", "reference def"),
            ("---", "horizontal rule"),
            ("***", "horizontal rule"),
            ("- item", "unordered list"),
            ("* item", "unordered list"),
            ("+ item", "unordered list"),
            ("1. item", "ordered list"),
            ("10. item", "ordered list"),
            (": definition", "definition list"),
            (":::", "div marker"),
            ("::::: {.callout-note}", "div marker with attrs"),
        ];
        for (text, why) in boundaries {
            assert!(is_block_boundary(text), "expected block boundary: {text:?} ({why})");
        }

        // Paragraph continuations must not be mistaken for boundaries.
        let continuations = [
            ("regular text", "plain text"),
            ("*emphasis*", "emphasis, not list"),
            ("[link](url)", "inline link, not reference def"),
            ("some words here", "plain text"),
        ];
        for (text, why) in continuations {
            assert!(!is_block_boundary(text), "expected no block boundary: {text:?} ({why})");
        }
    }

    /// A definition list item directly below a one-line paragraph must act as a
    /// block boundary instead of being folded into that paragraph.
    #[test]
    fn test_definition_list_boundary_in_single_line_paragraph() {
        let input = "Term\n: Definition of the term";
        let options = ReflowOptions {
            line_length: 80,
            ..Default::default()
        };

        let result = reflow_markdown(input, &options);

        // The definition list marker should remain on its own line
        assert!(
            result.contains(": Definition"),
            "Definition list item should not be merged into previous line. Got: {result:?}"
        );

        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(lines.len(), 2, "Should remain two separate lines. Got: {lines:?}");
        assert_eq!(lines[0], "Term");
        assert_eq!(lines[1], ": Definition of the term");
    }

    /// Covers `is_paragraph_boundary`, which takes the trimmed line together
    /// with the original (untrimmed) line.
    #[test]
    fn test_is_paragraph_boundary() {
        // (trimmed, original line, description)
        let boundaries = [
            ("# Heading", "# Heading", "core block boundary: heading"),
            ("- item", "- item", "core block boundary: list item"),
            (":::", ":::", "core block boundary: div marker"),
            (": definition", ": definition", "core block boundary: definition list"),
            ("code", "    code", "indented code block (4 spaces)"),
            ("code", "\tcode", "indented code block (tab)"),
            ("| a | b |", "| a | b |", "table row"),
            ("a | b", "a | b", "pipe-delimited without leading pipe"),
        ];
        for (trimmed, line, why) in boundaries {
            assert!(
                is_paragraph_boundary(trimmed, line),
                "expected paragraph boundary: {line:?} ({why})"
            );
        }

        let continuations = [
            ("regular text", "regular text", "plain text"),
            ("text", "  text", "2-space indent is not code"),
        ];
        for (trimmed, line, why) in continuations {
            assert!(
                !is_paragraph_boundary(trimmed, line),
                "expected no paragraph boundary: {line:?} ({why})"
            );
        }
    }

    /// Div markers (:::) are paragraph boundaries for `reflow_paragraph_at_line`,
    /// so a request targeting one must not produce a reflow.
    #[test]
    fn test_div_marker_boundary_in_reflow_paragraph_at_line() {
        let content = "Some paragraph text here.\n\n::: {.callout-note}\nThis is a callout.\n:::\n";

        // Line 3 is the opening div marker.
        let div_marker_line = 3;
        let result = reflow_paragraph_at_line(content, div_marker_line, 80);

        assert!(result.is_none(), "Div marker line should not be reflowed");
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs