rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::mkdocs_attr_list::{ATTR_LIST_PATTERN, is_standalone_attr_list};
9use crate::utils::mkdocs_snippets::is_snippet_block_delimiter;
10use crate::utils::regex_cache::{
11    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
12    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
13    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
14    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
15};
16use crate::utils::sentence_utils::{
17    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
18    text_ends_with_abbreviation,
19};
20use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
21use std::collections::HashSet;
22use unicode_width::UnicodeWidthStr;
23
/// Unit used to measure a string when comparing it against the line-length limit.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub enum ReflowLengthMode {
    /// Measure in `char`s (Unicode scalar values).
    Chars,
    /// Measure in rendered terminal columns (wide glyphs such as CJK and emoji
    /// count as 2 columns). This is the default.
    #[default]
    Visual,
    /// Measure in raw UTF-8 bytes.
    Bytes,
}
35
36/// Calculate the display length of a string based on the length mode
37fn display_len(s: &str, mode: ReflowLengthMode) -> usize {
38    match mode {
39        ReflowLengthMode::Chars => s.chars().count(),
40        ReflowLengthMode::Visual => s.width(),
41        ReflowLengthMode::Bytes => s.len(),
42    }
43}
44
45/// Options for reflowing text
46#[derive(Clone)]
47pub struct ReflowOptions {
48    /// Target line length
49    pub line_length: usize,
50    /// Whether to break on sentence boundaries when possible
51    pub break_on_sentences: bool,
52    /// Whether to preserve existing line breaks in paragraphs
53    pub preserve_breaks: bool,
54    /// Whether to enforce one sentence per line
55    pub sentence_per_line: bool,
56    /// Whether to use semantic line breaks (cascading split strategy)
57    pub semantic_line_breaks: bool,
58    /// Custom abbreviations for sentence detection
59    /// Periods are optional - both "Dr" and "Dr." work the same
60    /// Custom abbreviations are always added to the built-in defaults
61    pub abbreviations: Option<Vec<String>>,
62    /// How to measure string length for line-length comparisons
63    pub length_mode: ReflowLengthMode,
64    /// Whether to treat {#id .class key="value"} as atomic (unsplittable) elements.
65    /// Enabled for MkDocs and Kramdown flavors.
66    pub attr_lists: bool,
67}
68
69impl Default for ReflowOptions {
70    fn default() -> Self {
71        Self {
72            line_length: 80,
73            break_on_sentences: true,
74            preserve_breaks: false,
75            sentence_per_line: false,
76            semantic_line_breaks: false,
77            abbreviations: None,
78            length_mode: ReflowLengthMode::default(),
79            attr_lists: false,
80        }
81    }
82}
83
84/// Detect if a character position is a sentence boundary
85/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
86/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
87fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
88    let chars: Vec<char> = text.chars().collect();
89
90    if pos + 1 >= chars.len() {
91        return false;
92    }
93
94    let c = chars[pos];
95    let next_char = chars[pos + 1];
96
97    // Check for CJK sentence-ending punctuation (。, ！, ？)
98    // CJK punctuation doesn't require space or uppercase after it
99    if is_cjk_sentence_ending(c) {
100        // Skip any trailing emphasis/strikethrough markers
101        let mut after_punct_pos = pos + 1;
102        while after_punct_pos < chars.len()
103            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
104        {
105            after_punct_pos += 1;
106        }
107
108        // Skip whitespace
109        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
110            after_punct_pos += 1;
111        }
112
113        // Check if we have more content (any non-whitespace)
114        if after_punct_pos >= chars.len() {
115            return false;
116        }
117
118        // Skip leading emphasis/strikethrough markers
119        while after_punct_pos < chars.len()
120            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
121        {
122            after_punct_pos += 1;
123        }
124
125        if after_punct_pos >= chars.len() {
126            return false;
127        }
128
129        // For CJK, we accept any character as the start of the next sentence
130        // (no uppercase requirement, since CJK doesn't have case)
131        return true;
132    }
133
134    // Check for ASCII sentence-ending punctuation
135    if c != '.' && c != '!' && c != '?' {
136        return false;
137    }
138
139    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
140    let (_space_pos, after_space_pos) = if next_char == ' ' {
141        // Normal case: punctuation followed by space
142        (pos + 1, pos + 2)
143    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
144        // Sentence ends with quote - check what follows the quote
145        if chars[pos + 2] == ' ' {
146            // Just quote followed by space: 'sentence." '
147            (pos + 2, pos + 3)
148        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
149            // Quote followed by emphasis: 'sentence."* '
150            (pos + 3, pos + 4)
151        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
152            && pos + 4 < chars.len()
153            && chars[pos + 3] == chars[pos + 2]
154            && chars[pos + 4] == ' '
155        {
156            // Quote followed by bold: 'sentence."** '
157            (pos + 4, pos + 5)
158        } else {
159            return false;
160        }
161    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
162        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
163        (pos + 2, pos + 3)
164    } else if (next_char == '*' || next_char == '_')
165        && pos + 3 < chars.len()
166        && chars[pos + 2] == next_char
167        && chars[pos + 3] == ' '
168    {
169        // Sentence ends with bold: "sentence.** " or "sentence.__ "
170        (pos + 3, pos + 4)
171    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
172        // Sentence ends with strikethrough: "sentence.~~ "
173        (pos + 3, pos + 4)
174    } else {
175        return false;
176    };
177
178    // Skip all whitespace after the space to find the start of the next sentence
179    let mut next_char_pos = after_space_pos;
180    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
181        next_char_pos += 1;
182    }
183
184    // Check if we reached the end of the string
185    if next_char_pos >= chars.len() {
186        return false;
187    }
188
189    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
190    let mut first_letter_pos = next_char_pos;
191    while first_letter_pos < chars.len()
192        && (chars[first_letter_pos] == '*'
193            || chars[first_letter_pos] == '_'
194            || chars[first_letter_pos] == '~'
195            || is_opening_quote(chars[first_letter_pos]))
196    {
197        first_letter_pos += 1;
198    }
199
200    // Check if we reached the end after skipping emphasis
201    if first_letter_pos >= chars.len() {
202        return false;
203    }
204
205    // First character of next sentence must be uppercase or CJK
206    let first_char = chars[first_letter_pos];
207    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
208        return false;
209    }
210
211    // Look back to check for common abbreviations (only applies to periods)
212    if pos > 0 && c == '.' {
213        // Convert char index to byte offset for string slicing
214        let byte_offset: usize = chars[..=pos].iter().map(|ch| ch.len_utf8()).sum();
215        if text_ends_with_abbreviation(&text[..byte_offset], abbreviations) {
216            return false;
217        }
218
219        // Check for decimal numbers (e.g., "3.14")
220        // Make sure to check if first_letter_pos is within bounds
221        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
222            return false;
223        }
224    }
225    true
226}
227
228/// Split text into sentences
229pub fn split_into_sentences(text: &str) -> Vec<String> {
230    split_into_sentences_custom(text, &None)
231}
232
233/// Split text into sentences with custom abbreviations
234pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
235    let abbreviations = get_abbreviations(custom_abbreviations);
236    split_into_sentences_with_set(text, &abbreviations)
237}
238
239/// Internal function to split text into sentences with a pre-computed abbreviations set
240/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
241fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
242    let mut sentences = Vec::new();
243    let mut current_sentence = String::new();
244    let mut chars = text.chars().peekable();
245    let mut pos = 0;
246
247    while let Some(c) = chars.next() {
248        current_sentence.push(c);
249
250        if is_sentence_boundary(text, pos, abbreviations) {
251            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
252            while let Some(&next) = chars.peek() {
253                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
254                    current_sentence.push(chars.next().unwrap());
255                    pos += 1;
256                } else {
257                    break;
258                }
259            }
260
261            // Consume the space after the sentence
262            if chars.peek() == Some(&' ') {
263                chars.next();
264                pos += 1;
265            }
266
267            sentences.push(current_sentence.trim().to_string());
268            current_sentence.clear();
269        }
270
271        pos += 1;
272    }
273
274    // Add any remaining text as the last sentence
275    if !current_sentence.trim().is_empty() {
276        sentences.push(current_sentence.trim().to_string());
277    }
278    sentences
279}
280
/// Check if a (trimmed) line is a horizontal rule / thematic break.
///
/// A rule consists of three or more occurrences of one marker character
/// (`-`, `_` or `*`), optionally separated by spaces or tabs, and nothing else:
/// `---`, `___`, `* * *`. The line must start with the marker itself, so callers
/// are expected to pass a line with leading whitespace already removed.
fn is_horizontal_rule(line: &str) -> bool {
    let mut chars = line.chars();

    // The first character decides which marker the whole line must use.
    let marker = match chars.next() {
        Some(m @ ('-' | '_' | '*')) => m,
        _ => return false,
    };

    let mut marker_count = 1usize;
    for c in chars {
        if c == marker {
            marker_count += 1;
        } else if c != ' ' && c != '\t' {
            // Anything other than the marker or separating whitespace disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
309
/// Check if a line is a numbered list item (e.g., "1. ", "10. ").
///
/// The line must start with one or more ASCII digits, followed by a period and a
/// space. Only ASCII digits count: other Unicode numerals (`½`, `٣`, ...) are not
/// list numbers in Markdown and are treated as ordinary text.
///
/// A number followed by a period but no space ("2019.") is NOT treated as a list
/// item, to avoid false positives and stay consistent with list marker extraction.
fn is_numbered_list_item(line: &str) -> bool {
    let digit_count = line.bytes().take_while(u8::is_ascii_digit).count();

    // `digit_count` only spans ASCII bytes, so it is a valid index into the bytes.
    digit_count > 0 && line.as_bytes()[digit_count..].starts_with(b". ")
}
333
334/// Check if a trimmed line is an unordered list item (-, *, + followed by space)
335fn is_unordered_list_marker(s: &str) -> bool {
336    matches!(s.as_bytes().first(), Some(b'-' | b'*' | b'+'))
337        && !is_horizontal_rule(s)
338        && (s.len() == 1 || s.as_bytes().get(1) == Some(&b' '))
339}
340
341/// Shared structural checks for block boundary detection.
342/// Checks elements that only depend on the trimmed line content.
343fn is_block_boundary_core(trimmed: &str) -> bool {
344    trimmed.is_empty()
345        || trimmed.starts_with('#')
346        || trimmed.starts_with("```")
347        || trimmed.starts_with("~~~")
348        || trimmed.starts_with('>')
349        || (trimmed.starts_with('[') && trimmed.contains("]:"))
350        || is_horizontal_rule(trimmed)
351        || is_unordered_list_marker(trimmed)
352        || is_numbered_list_item(trimmed)
353        || is_definition_list_item(trimmed)
354        || trimmed.starts_with(":::")
355}
356
357/// Check if a trimmed line starts a new structural block element.
358/// Used for paragraph boundary detection in `reflow_markdown()`.
359fn is_block_boundary(trimmed: &str) -> bool {
360    is_block_boundary_core(trimmed) || trimmed.starts_with('|')
361}
362
363/// Check if a line starts a new structural block for paragraph boundary detection
364/// in `reflow_paragraph_at_line()`. Extends the core checks with indented code blocks
365/// (≥4 spaces) and table row detection via `is_potential_table_row`.
366fn is_paragraph_boundary(trimmed: &str, line: &str) -> bool {
367    is_block_boundary_core(trimmed)
368        || ElementCache::calculate_indentation_width_default(line) >= 4
369        || crate::utils::table_utils::TableUtils::is_potential_table_row(line)
370}
371
/// Check if a line ends with a hard break (either two spaces or backslash)
///
/// CommonMark supports two formats for hard line breaks:
/// 1. Two or more trailing spaces
/// 2. A backslash at the end of the line
///
/// A trailing `\r` (from CRLF line endings) is ignored. For the backslash form,
/// the backslash must itself be unescaped: a line ending in an even number of
/// backslashes (e.g. `\\`) ends with an escaped, literal backslash and is not a
/// hard break.
fn has_hard_break(line: &str) -> bool {
    let line = line.strip_suffix('\r').unwrap_or(line);

    if line.ends_with("  ") {
        return true;
    }

    // Backslashes pair up left to right as escapes; only an odd run leaves a
    // final, unescaped backslash in front of the line ending.
    let trailing_backslashes = line.bytes().rev().take_while(|&b| b == b'\\').count();
    trailing_backslashes % 2 == 1
}
381
/// Check if the last character of `text` is sentence-terminating punctuation
/// (`.`, `!` or `?`).
fn ends_with_sentence_punct(text: &str) -> bool {
    matches!(text.chars().next_back(), Some('.' | '!' | '?'))
}
386
/// Strip trailing whitespace from a line without destroying a hard break.
///
/// Markdown marks a hard break either with two or more trailing spaces
/// (traditional) or with a backslash at the end of the line (mdformat style).
///
/// - A trailing `\r` (CRLF line ending) is removed first.
/// - A line ending in a backslash is returned unchanged.
/// - A line ending in two or more spaces keeps exactly two of them after its
///   content; a line consisting only of whitespace becomes empty.
/// - Any other line simply has its trailing whitespace removed.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // A line ending in a backslash has no trailing whitespace, so `content` is the
    // whole line in that case and it falls through to the plain branch untouched.
    let content = line.trim_end();

    let keeps_space_break = line.ends_with("  ") && !content.is_empty();
    if keeps_space_break {
        let mut out = String::with_capacity(content.len() + 2);
        out.push_str(content);
        out.push_str("  ");
        out
    } else {
        content.to_string()
    }
}
417
418/// Parse markdown elements using the appropriate parser based on options.
419fn parse_elements(text: &str, options: &ReflowOptions) -> Vec<Element> {
420    if options.attr_lists {
421        parse_markdown_elements_with_attr_lists(text)
422    } else {
423        parse_markdown_elements(text)
424    }
425}
426
427pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
428    // For sentence-per-line mode, always process regardless of length
429    if options.sentence_per_line {
430        let elements = parse_elements(line, options);
431        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
432    }
433
434    // For semantic line breaks mode, use cascading split strategy
435    if options.semantic_line_breaks {
436        let elements = parse_elements(line, options);
437        return reflow_elements_semantic(&elements, options);
438    }
439
440    // Quick check: if line is already short enough or no wrapping requested, return as-is
441    // line_length = 0 means no wrapping (unlimited line length)
442    if options.line_length == 0 || display_len(line, options.length_mode) <= options.line_length {
443        return vec![line.to_string()];
444    }
445
446    // Parse the markdown to identify elements
447    let elements = parse_elements(line, options);
448
449    // Reflow the elements into lines
450    reflow_elements(&elements, options)
451}
452
/// Where the image inside a linked image (clickable badge) comes from.
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Image given by an inline URL: `![alt](url)`
    Inline(String),
    /// Image given by a reference label: `![alt][ref]`
    Reference(String),
}
461
/// Where the link wrapped around a linked image (clickable badge) points.
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Link given by an inline URL: `](url)`
    Inline(String),
    /// Link given by a reference label: `][ref]`
    Reference(String),
}
470
471/// Represents a piece of content in the markdown
472#[derive(Debug, Clone)]
473enum Element {
474    /// Plain text that can be wrapped
475    Text(String),
476    /// A complete markdown inline link [text](url)
477    Link { text: String, url: String },
478    /// A complete markdown reference link [text][ref]
479    ReferenceLink { text: String, reference: String },
480    /// A complete markdown empty reference link [text][]
481    EmptyReferenceLink { text: String },
482    /// A complete markdown shortcut reference link [ref]
483    ShortcutReference { reference: String },
484    /// A complete markdown inline image ![alt](url)
485    InlineImage { alt: String, url: String },
486    /// A complete markdown reference image ![alt][ref]
487    ReferenceImage { alt: String, reference: String },
488    /// A complete markdown empty reference image ![alt][]
489    EmptyReferenceImage { alt: String },
490    /// A clickable image badge in any of 4 forms:
491    /// - [![alt](img-url)](link-url)
492    /// - [![alt][img-ref]](link-url)
493    /// - [![alt](img-url)][link-ref]
494    /// - [![alt][img-ref]][link-ref]
495    LinkedImage {
496        alt: String,
497        img_source: LinkedImageSource,
498        link_target: LinkedImageTarget,
499    },
500    /// Footnote reference [^note]
501    FootnoteReference { note: String },
502    /// Strikethrough text ~~text~~
503    Strikethrough(String),
504    /// Wiki-style link [[wiki]] or [[wiki|text]]
505    WikiLink(String),
506    /// Inline math $math$
507    InlineMath(String),
508    /// Display math $$math$$
509    DisplayMath(String),
510    /// Emoji shortcode :emoji:
511    EmojiShortcode(String),
512    /// Autolink <https://...> or <mailto:...> or <user@domain.com>
513    Autolink(String),
514    /// HTML tag <tag> or </tag> or <tag/>
515    HtmlTag(String),
516    /// HTML entity &nbsp; or &#123;
517    HtmlEntity(String),
518    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}}
519    HugoShortcode(String),
520    /// MkDocs/kramdown attribute list {#id .class key="value"}
521    AttrList(String),
522    /// Inline code `code`
523    Code(String),
524    /// Bold text **text** or __text__
525    Bold {
526        content: String,
527        /// True if underscore markers (__), false for asterisks (**)
528        underscore: bool,
529    },
530    /// Italic text *text* or _text_
531    Italic {
532        content: String,
533        /// True if underscore marker (_), false for asterisk (*)
534        underscore: bool,
535    },
536}
537
538impl std::fmt::Display for Element {
539    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
540        match self {
541            Element::Text(s) => write!(f, "{s}"),
542            Element::Link { text, url } => write!(f, "[{text}]({url})"),
543            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
544            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
545            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
546            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
547            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
548            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
549            Element::LinkedImage {
550                alt,
551                img_source,
552                link_target,
553            } => {
554                // Build the image part: ![alt](url) or ![alt][ref]
555                let img_part = match img_source {
556                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
557                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
558                };
559                // Build the link part: (url) or [ref]
560                match link_target {
561                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
562                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
563                }
564            }
565            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
566            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
567            Element::WikiLink(s) => write!(f, "[[{s}]]"),
568            Element::InlineMath(s) => write!(f, "${s}$"),
569            Element::DisplayMath(s) => write!(f, "$${s}$$"),
570            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
571            Element::Autolink(s) => write!(f, "{s}"),
572            Element::HtmlTag(s) => write!(f, "{s}"),
573            Element::HtmlEntity(s) => write!(f, "{s}"),
574            Element::HugoShortcode(s) => write!(f, "{s}"),
575            Element::AttrList(s) => write!(f, "{s}"),
576            Element::Code(s) => write!(f, "`{s}`"),
577            Element::Bold { content, underscore } => {
578                if *underscore {
579                    write!(f, "__{content}__")
580                } else {
581                    write!(f, "**{content}**")
582                }
583            }
584            Element::Italic { content, underscore } => {
585                if *underscore {
586                    write!(f, "_{content}_")
587                } else {
588                    write!(f, "*{content}*")
589                }
590            }
591        }
592    }
593}
594
595impl Element {
596    /// Calculate the display width of this element using the given length mode.
597    /// This formats the element and computes its width, correctly handling
598    /// visual width for CJK characters and other wide glyphs.
599    fn display_width(&self, mode: ReflowLengthMode) -> usize {
600        let formatted = format!("{self}");
601        display_len(&formatted, mode)
602    }
603}
604
/// A formatting span (emphasis, strong emphasis or strikethrough) located by
/// pulldown-cmark.
#[derive(Debug, Clone)]
struct EmphasisSpan {
    /// Byte offset of the opening marker (the span includes its markers)
    start: usize,
    /// Byte offset just past the closing marker
    end: usize,
    /// The text between the opening and closing markers
    content: String,
    /// True for strong (bold) emphasis
    is_strong: bool,
    /// True for strikethrough (`~~text~~`)
    is_strikethrough: bool,
    /// True when the source used underscore markers (emphasis/strong only)
    uses_underscore: bool,
}
621
622/// Extract emphasis and strikethrough spans from text using pulldown-cmark
623///
624/// This provides CommonMark-compliant emphasis parsing, correctly handling:
625/// - Nested emphasis like `*text **bold** more*`
626/// - Left/right flanking delimiter rules
627/// - Underscore vs asterisk markers
628/// - GFM strikethrough (~~text~~)
629///
630/// Returns spans sorted by start position.
631fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
632    let mut spans = Vec::new();
633    let mut options = Options::empty();
634    options.insert(Options::ENABLE_STRIKETHROUGH);
635
636    // Stacks to track nested formatting with their start positions
637    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
638    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
639    let mut strikethrough_stack: Vec<usize> = Vec::new();
640
641    let parser = Parser::new_ext(text, options).into_offset_iter();
642
643    for (event, range) in parser {
644        match event {
645            Event::Start(Tag::Emphasis) => {
646                // Check if this uses underscore by looking at the original text
647                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
648                emphasis_stack.push((range.start, uses_underscore));
649            }
650            Event::End(TagEnd::Emphasis) => {
651                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
652                    // Extract content between the markers (1 char marker on each side)
653                    let content_start = start_byte + 1;
654                    let content_end = range.end - 1;
655                    if content_end > content_start
656                        && let Some(content) = text.get(content_start..content_end)
657                    {
658                        spans.push(EmphasisSpan {
659                            start: start_byte,
660                            end: range.end,
661                            content: content.to_string(),
662                            is_strong: false,
663                            is_strikethrough: false,
664                            uses_underscore,
665                        });
666                    }
667                }
668            }
669            Event::Start(Tag::Strong) => {
670                // Check if this uses underscore by looking at the original text
671                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
672                strong_stack.push((range.start, uses_underscore));
673            }
674            Event::End(TagEnd::Strong) => {
675                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
676                    // Extract content between the markers (2 char marker on each side)
677                    let content_start = start_byte + 2;
678                    let content_end = range.end - 2;
679                    if content_end > content_start
680                        && let Some(content) = text.get(content_start..content_end)
681                    {
682                        spans.push(EmphasisSpan {
683                            start: start_byte,
684                            end: range.end,
685                            content: content.to_string(),
686                            is_strong: true,
687                            is_strikethrough: false,
688                            uses_underscore,
689                        });
690                    }
691                }
692            }
693            Event::Start(Tag::Strikethrough) => {
694                strikethrough_stack.push(range.start);
695            }
696            Event::End(TagEnd::Strikethrough) => {
697                if let Some(start_byte) = strikethrough_stack.pop() {
698                    // Extract content between the ~~ markers (2 char marker on each side)
699                    let content_start = start_byte + 2;
700                    let content_end = range.end - 2;
701                    if content_end > content_start
702                        && let Some(content) = text.get(content_start..content_end)
703                    {
704                        spans.push(EmphasisSpan {
705                            start: start_byte,
706                            end: range.end,
707                            content: content.to_string(),
708                            is_strong: false,
709                            is_strikethrough: true,
710                            uses_underscore: false,
711                        });
712                    }
713                }
714            }
715            _ => {}
716        }
717    }
718
719    // Sort by start position
720    spans.sort_by_key(|s| s.start);
721    spans
722}
723
724/// Parse markdown elements from text preserving the raw syntax
725///
726/// Detection order is critical:
727/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
728/// 2. Inline images ![alt](url) - before links to handle ! prefix
729/// 3. Reference images ![alt][ref] - before reference links
730/// 4. Inline links [text](url) - before reference links
731/// 5. Reference links [text][ref] - before shortcut references
732/// 6. Shortcut reference links [ref] - detected last to avoid false positives
733/// 7. Other elements (code, bold, italic, etc.) - processed normally
734fn parse_markdown_elements(text: &str) -> Vec<Element> {
735    parse_markdown_elements_inner(text, false)
736}
737
738fn parse_markdown_elements_with_attr_lists(text: &str) -> Vec<Element> {
739    parse_markdown_elements_inner(text, true)
740}
741
742fn parse_markdown_elements_inner(text: &str, attr_lists: bool) -> Vec<Element> {
743    let mut elements = Vec::new();
744    let mut remaining = text;
745
746    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
747    let emphasis_spans = extract_emphasis_spans(text);
748
749    while !remaining.is_empty() {
750        // Calculate current byte offset in original text
751        let current_offset = text.len() - remaining.len();
752        // Find the earliest occurrence of any markdown pattern
753        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
754
755        // Check for linked images FIRST (all 4 variants)
756        // Quick literal check: only run expensive regexes if we might have a linked image
757        // Pattern starts with "[!" so check for that first
758        if remaining.contains("[!") {
759            // Pattern 1: [![alt](img)](link) - inline image in inline link
760            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
761                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
762            {
763                earliest_match = Some((m.start(), "linked_image_ii", m));
764            }
765
766            // Pattern 2: [![alt][ref]](link) - reference image in inline link
767            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
768                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
769            {
770                earliest_match = Some((m.start(), "linked_image_ri", m));
771            }
772
773            // Pattern 3: [![alt](img)][ref] - inline image in reference link
774            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
775                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
776            {
777                earliest_match = Some((m.start(), "linked_image_ir", m));
778            }
779
780            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
781            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
782                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
783            {
784                earliest_match = Some((m.start(), "linked_image_rr", m));
785            }
786        }
787
788        // Check for images (they start with ! so should be detected before links)
789        // Inline images - ![alt](url)
790        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
791            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
792        {
793            earliest_match = Some((m.start(), "inline_image", m));
794        }
795
796        // Reference images - ![alt][ref]
797        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
798            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
799        {
800            earliest_match = Some((m.start(), "ref_image", m));
801        }
802
803        // Check for footnote references - [^note]
804        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
805            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
806        {
807            earliest_match = Some((m.start(), "footnote_ref", m));
808        }
809
810        // Check for inline links - [text](url)
811        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
812            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
813        {
814            earliest_match = Some((m.start(), "inline_link", m));
815        }
816
817        // Check for reference links - [text][ref]
818        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
819            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
820        {
821            earliest_match = Some((m.start(), "ref_link", m));
822        }
823
824        // Check for shortcut reference links - [ref]
825        // Only check if we haven't found an earlier pattern that would conflict
826        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
827            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
828        {
829            earliest_match = Some((m.start(), "shortcut_ref", m));
830        }
831
832        // Check for wiki-style links - [[wiki]]
833        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
834            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
835        {
836            earliest_match = Some((m.start(), "wiki_link", m));
837        }
838
839        // Check for display math first (before inline) - $$math$$
840        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
841            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
842        {
843            earliest_match = Some((m.start(), "display_math", m));
844        }
845
846        // Check for inline math - $math$
847        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
848            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
849        {
850            earliest_match = Some((m.start(), "inline_math", m));
851        }
852
853        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
854
855        // Check for emoji shortcodes - :emoji:
856        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
857            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
858        {
859            earliest_match = Some((m.start(), "emoji", m));
860        }
861
862        // Check for HTML entities - &nbsp; etc
863        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
864            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
865        {
866            earliest_match = Some((m.start(), "html_entity", m));
867        }
868
869        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
870        // Must be checked before other patterns to avoid false sentence breaks
871        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
872            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
873        {
874            earliest_match = Some((m.start(), "hugo_shortcode", m));
875        }
876
877        // Check for HTML tags - <tag> </tag> <tag/>
878        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
879        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
880            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
881        {
882            // Check if this is an autolink (starts with protocol or mailto:)
883            let matched_text = &remaining[m.start()..m.end()];
884            let is_url_autolink = matched_text.starts_with("<http://")
885                || matched_text.starts_with("<https://")
886                || matched_text.starts_with("<mailto:")
887                || matched_text.starts_with("<ftp://")
888                || matched_text.starts_with("<ftps://");
889
890            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
891            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
892            let is_email_autolink = {
893                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
894                EMAIL_PATTERN.is_match(content)
895            };
896
897            if is_url_autolink || is_email_autolink {
898                earliest_match = Some((m.start(), "autolink", m));
899            } else {
900                earliest_match = Some((m.start(), "html_tag", m));
901            }
902        }
903
904        // Find earliest non-link special characters
905        let mut next_special = remaining.len();
906        let mut special_type = "";
907        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
908        let mut attr_list_len: usize = 0;
909
910        // Check for code spans (not handled by pulldown-cmark in this context)
911        if let Some(pos) = remaining.find('`')
912            && pos < next_special
913        {
914            next_special = pos;
915            special_type = "code";
916        }
917
918        // Check for MkDocs/kramdown attr lists - {#id .class key="value"}
919        if attr_lists
920            && let Some(pos) = remaining.find('{')
921            && pos < next_special
922        {
923            if let Some(m) = ATTR_LIST_PATTERN.find(&remaining[pos..]) {
924                if m.start() == 0 {
925                    next_special = pos;
926                    special_type = "attr_list";
927                    attr_list_len = m.end();
928                }
929            }
930        }
931
932        // Check for emphasis using pulldown-cmark's pre-extracted spans
933        // Find the earliest emphasis span that starts within remaining text
934        for span in &emphasis_spans {
935            if span.start >= current_offset && span.start < current_offset + remaining.len() {
936                let pos_in_remaining = span.start - current_offset;
937                if pos_in_remaining < next_special {
938                    next_special = pos_in_remaining;
939                    special_type = "pulldown_emphasis";
940                    pulldown_emphasis = Some(span);
941                }
942                break; // Spans are sorted by start position, so first match is earliest
943            }
944        }
945
946        // Determine which pattern to process first
947        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
948            pos < next_special
949        } else {
950            false
951        };
952
953        if should_process_markdown_link {
954            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
955
956            // Add any text before the match
957            if pos > 0 {
958                elements.push(Element::Text(remaining[..pos].to_string()));
959            }
960
961            // Process the matched pattern
962            match pattern_type {
963                // Pattern 1: [![alt](img)](link) - inline image in inline link
964                "linked_image_ii" => {
965                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
966                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
967                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
968                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
969                        elements.push(Element::LinkedImage {
970                            alt: alt.to_string(),
971                            img_source: LinkedImageSource::Inline(img_url.to_string()),
972                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
973                        });
974                        remaining = &remaining[match_obj.end()..];
975                    } else {
976                        elements.push(Element::Text("[".to_string()));
977                        remaining = &remaining[1..];
978                    }
979                }
980                // Pattern 2: [![alt][ref]](link) - reference image in inline link
981                "linked_image_ri" => {
982                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
983                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
984                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
985                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
986                        elements.push(Element::LinkedImage {
987                            alt: alt.to_string(),
988                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
989                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
990                        });
991                        remaining = &remaining[match_obj.end()..];
992                    } else {
993                        elements.push(Element::Text("[".to_string()));
994                        remaining = &remaining[1..];
995                    }
996                }
997                // Pattern 3: [![alt](img)][ref] - inline image in reference link
998                "linked_image_ir" => {
999                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
1000                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1001                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1002                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1003                        elements.push(Element::LinkedImage {
1004                            alt: alt.to_string(),
1005                            img_source: LinkedImageSource::Inline(img_url.to_string()),
1006                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1007                        });
1008                        remaining = &remaining[match_obj.end()..];
1009                    } else {
1010                        elements.push(Element::Text("[".to_string()));
1011                        remaining = &remaining[1..];
1012                    }
1013                }
1014                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
1015                "linked_image_rr" => {
1016                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
1017                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1018                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1019                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1020                        elements.push(Element::LinkedImage {
1021                            alt: alt.to_string(),
1022                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
1023                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1024                        });
1025                        remaining = &remaining[match_obj.end()..];
1026                    } else {
1027                        elements.push(Element::Text("[".to_string()));
1028                        remaining = &remaining[1..];
1029                    }
1030                }
1031                "inline_image" => {
1032                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
1033                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1034                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1035                        elements.push(Element::InlineImage {
1036                            alt: alt.to_string(),
1037                            url: url.to_string(),
1038                        });
1039                        remaining = &remaining[match_obj.end()..];
1040                    } else {
1041                        elements.push(Element::Text("!".to_string()));
1042                        remaining = &remaining[1..];
1043                    }
1044                }
1045                "ref_image" => {
1046                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
1047                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1048                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1049
1050                        if reference.is_empty() {
1051                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
1052                        } else {
1053                            elements.push(Element::ReferenceImage {
1054                                alt: alt.to_string(),
1055                                reference: reference.to_string(),
1056                            });
1057                        }
1058                        remaining = &remaining[match_obj.end()..];
1059                    } else {
1060                        elements.push(Element::Text("!".to_string()));
1061                        remaining = &remaining[1..];
1062                    }
1063                }
1064                "footnote_ref" => {
1065                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
1066                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1067                        elements.push(Element::FootnoteReference { note: note.to_string() });
1068                        remaining = &remaining[match_obj.end()..];
1069                    } else {
1070                        elements.push(Element::Text("[".to_string()));
1071                        remaining = &remaining[1..];
1072                    }
1073                }
1074                "inline_link" => {
1075                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
1076                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1077                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1078                        elements.push(Element::Link {
1079                            text: text.to_string(),
1080                            url: url.to_string(),
1081                        });
1082                        remaining = &remaining[match_obj.end()..];
1083                    } else {
1084                        // Fallback - shouldn't happen
1085                        elements.push(Element::Text("[".to_string()));
1086                        remaining = &remaining[1..];
1087                    }
1088                }
1089                "ref_link" => {
1090                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1091                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1092                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1093
1094                        if reference.is_empty() {
1095                            // Empty reference link [text][]
1096                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1097                        } else {
1098                            // Regular reference link [text][ref]
1099                            elements.push(Element::ReferenceLink {
1100                                text: text.to_string(),
1101                                reference: reference.to_string(),
1102                            });
1103                        }
1104                        remaining = &remaining[match_obj.end()..];
1105                    } else {
1106                        // Fallback - shouldn't happen
1107                        elements.push(Element::Text("[".to_string()));
1108                        remaining = &remaining[1..];
1109                    }
1110                }
1111                "shortcut_ref" => {
1112                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1113                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1114                        elements.push(Element::ShortcutReference {
1115                            reference: reference.to_string(),
1116                        });
1117                        remaining = &remaining[match_obj.end()..];
1118                    } else {
1119                        // Fallback - shouldn't happen
1120                        elements.push(Element::Text("[".to_string()));
1121                        remaining = &remaining[1..];
1122                    }
1123                }
1124                "wiki_link" => {
1125                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1126                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1127                        elements.push(Element::WikiLink(content.to_string()));
1128                        remaining = &remaining[match_obj.end()..];
1129                    } else {
1130                        elements.push(Element::Text("[[".to_string()));
1131                        remaining = &remaining[2..];
1132                    }
1133                }
1134                "display_math" => {
1135                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1136                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1137                        elements.push(Element::DisplayMath(math.to_string()));
1138                        remaining = &remaining[match_obj.end()..];
1139                    } else {
1140                        elements.push(Element::Text("$$".to_string()));
1141                        remaining = &remaining[2..];
1142                    }
1143                }
1144                "inline_math" => {
1145                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1146                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1147                        elements.push(Element::InlineMath(math.to_string()));
1148                        remaining = &remaining[match_obj.end()..];
1149                    } else {
1150                        elements.push(Element::Text("$".to_string()));
1151                        remaining = &remaining[1..];
1152                    }
1153                }
1154                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1155                "emoji" => {
1156                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1157                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1158                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1159                        remaining = &remaining[match_obj.end()..];
1160                    } else {
1161                        elements.push(Element::Text(":".to_string()));
1162                        remaining = &remaining[1..];
1163                    }
1164                }
1165                "html_entity" => {
1166                    // HTML entities are captured whole - use as_str() to get just the matched content
1167                    elements.push(Element::HtmlEntity(match_obj.as_str().to_string()));
1168                    remaining = &remaining[match_obj.end()..];
1169                }
1170                "hugo_shortcode" => {
1171                    // Hugo shortcodes are atomic elements - preserve them exactly
1172                    elements.push(Element::HugoShortcode(match_obj.as_str().to_string()));
1173                    remaining = &remaining[match_obj.end()..];
1174                }
1175                "autolink" => {
1176                    // Autolinks are atomic elements - preserve them exactly
1177                    elements.push(Element::Autolink(match_obj.as_str().to_string()));
1178                    remaining = &remaining[match_obj.end()..];
1179                }
1180                "html_tag" => {
1181                    // HTML tags are captured whole - use as_str() to get just the matched content
1182                    elements.push(Element::HtmlTag(match_obj.as_str().to_string()));
1183                    remaining = &remaining[match_obj.end()..];
1184                }
1185                _ => {
1186                    // Unknown pattern, treat as text
1187                    elements.push(Element::Text("[".to_string()));
1188                    remaining = &remaining[1..];
1189                }
1190            }
1191        } else {
1192            // Process non-link special characters
1193
1194            // Add any text before the special character
1195            if next_special > 0 && next_special < remaining.len() {
1196                elements.push(Element::Text(remaining[..next_special].to_string()));
1197                remaining = &remaining[next_special..];
1198            }
1199
1200            // Process the special element
1201            match special_type {
1202                "code" => {
1203                    // Find end of code
1204                    if let Some(code_end) = remaining[1..].find('`') {
1205                        let code = &remaining[1..1 + code_end];
1206                        elements.push(Element::Code(code.to_string()));
1207                        remaining = &remaining[1 + code_end + 1..];
1208                    } else {
1209                        // No closing backtick, treat as text
1210                        elements.push(Element::Text(remaining.to_string()));
1211                        break;
1212                    }
1213                }
1214                "attr_list" => {
1215                    elements.push(Element::AttrList(remaining[..attr_list_len].to_string()));
1216                    remaining = &remaining[attr_list_len..];
1217                }
1218                "pulldown_emphasis" => {
1219                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1220                    if let Some(span) = pulldown_emphasis {
1221                        let span_len = span.end - span.start;
1222                        if span.is_strikethrough {
1223                            elements.push(Element::Strikethrough(span.content.clone()));
1224                        } else if span.is_strong {
1225                            elements.push(Element::Bold {
1226                                content: span.content.clone(),
1227                                underscore: span.uses_underscore,
1228                            });
1229                        } else {
1230                            elements.push(Element::Italic {
1231                                content: span.content.clone(),
1232                                underscore: span.uses_underscore,
1233                            });
1234                        }
1235                        remaining = &remaining[span_len..];
1236                    } else {
1237                        // Fallback - shouldn't happen
1238                        elements.push(Element::Text(remaining[..1].to_string()));
1239                        remaining = &remaining[1..];
1240                    }
1241                }
1242                _ => {
1243                    // No special elements found, add all remaining text
1244                    elements.push(Element::Text(remaining.to_string()));
1245                    break;
1246                }
1247            }
1248        }
1249    }
1250
1251    elements
1252}
1253
1254/// Reflow elements for sentence-per-line mode
1255fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1256    let abbreviations = get_abbreviations(custom_abbreviations);
1257    let mut lines = Vec::new();
1258    let mut current_line = String::new();
1259
1260    for (idx, element) in elements.iter().enumerate() {
1261        let element_str = format!("{element}");
1262
1263        // For text elements, split into sentences
1264        if let Element::Text(text) = element {
1265            // Simply append text - it already has correct spacing from tokenization
1266            let combined = format!("{current_line}{text}");
1267            // Use the pre-computed abbreviations set to avoid redundant computation
1268            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1269
1270            if sentences.len() > 1 {
1271                // We found sentence boundaries
1272                for (i, sentence) in sentences.iter().enumerate() {
1273                    if i == 0 {
1274                        // First sentence might continue from previous elements
1275                        // But check if it ends with an abbreviation
1276                        let trimmed = sentence.trim();
1277
1278                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1279                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1280                            current_line = sentence.to_string();
1281                        } else {
1282                            // Normal case - emit the first sentence
1283                            lines.push(sentence.to_string());
1284                            current_line.clear();
1285                        }
1286                    } else if i == sentences.len() - 1 {
1287                        // Last sentence: check if it's complete or incomplete
1288                        let trimmed = sentence.trim();
1289                        let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1290
1291                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1292                            // Complete sentence - emit it immediately
1293                            lines.push(sentence.to_string());
1294                            current_line.clear();
1295                        } else {
1296                            // Incomplete sentence - save for next iteration
1297                            current_line = sentence.to_string();
1298                        }
1299                    } else {
1300                        // Complete sentences in the middle
1301                        lines.push(sentence.to_string());
1302                    }
1303                }
1304            } else {
1305                // Single sentence - check if it's complete
1306                let trimmed = combined.trim();
1307
1308                // If the combined result is only whitespace, don't accumulate it.
1309                // This prevents leading spaces on subsequent elements when lines
1310                // are joined with spaces during reflow iteration.
1311                if trimmed.is_empty() {
1312                    continue;
1313                }
1314
1315                let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1316
1317                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1318                    // Complete single sentence - emit it
1319                    lines.push(trimmed.to_string());
1320                    current_line.clear();
1321                } else {
1322                    // Incomplete sentence - continue accumulating
1323                    current_line = combined;
1324                }
1325            }
1326        } else if let Element::Italic { content, underscore } = element {
1327            // Handle italic elements - may contain multiple sentences that need continuation
1328            let marker = if *underscore { "_" } else { "*" };
1329            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1330        } else if let Element::Bold { content, underscore } = element {
1331            // Handle bold elements - may contain multiple sentences that need continuation
1332            let marker = if *underscore { "__" } else { "**" };
1333            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1334        } else if let Element::Strikethrough(content) = element {
1335            // Handle strikethrough elements - may contain multiple sentences that need continuation
1336            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1337        } else {
1338            // Non-text, non-emphasis elements (Code, Links, etc.)
1339            // Check if this element is adjacent to the preceding text (no space between)
1340            let is_adjacent = if idx > 0 {
1341                match &elements[idx - 1] {
1342                    Element::Text(t) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1343                    _ => true,
1344                }
1345            } else {
1346                false
1347            };
1348
1349            // Add space before element if needed, but not for adjacent elements
1350            if !is_adjacent
1351                && !current_line.is_empty()
1352                && !current_line.ends_with(' ')
1353                && !current_line.ends_with('(')
1354                && !current_line.ends_with('[')
1355            {
1356                current_line.push(' ');
1357            }
1358            current_line.push_str(&element_str);
1359        }
1360    }
1361
1362    // Add any remaining content
1363    if !current_line.is_empty() {
1364        lines.push(current_line.trim().to_string());
1365    }
1366    lines
1367}
1368
1369/// Handle splitting emphasis content at sentence boundaries while preserving markers
1370fn handle_emphasis_sentence_split(
1371    content: &str,
1372    marker: &str,
1373    abbreviations: &HashSet<String>,
1374    current_line: &mut String,
1375    lines: &mut Vec<String>,
1376) {
1377    // Split the emphasis content into sentences
1378    let sentences = split_into_sentences_with_set(content, abbreviations);
1379
1380    if sentences.len() <= 1 {
1381        // Single sentence or no boundaries - treat as atomic
1382        if !current_line.is_empty()
1383            && !current_line.ends_with(' ')
1384            && !current_line.ends_with('(')
1385            && !current_line.ends_with('[')
1386        {
1387            current_line.push(' ');
1388        }
1389        current_line.push_str(marker);
1390        current_line.push_str(content);
1391        current_line.push_str(marker);
1392
1393        // Check if the emphasis content ends with sentence punctuation - if so, emit
1394        let trimmed = content.trim();
1395        let ends_with_punct = ends_with_sentence_punct(trimmed);
1396        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1397            lines.push(current_line.clone());
1398            current_line.clear();
1399        }
1400    } else {
1401        // Multiple sentences - each gets its own emphasis markers
1402        for (i, sentence) in sentences.iter().enumerate() {
1403            let trimmed = sentence.trim();
1404            if trimmed.is_empty() {
1405                continue;
1406            }
1407
1408            if i == 0 {
1409                // First sentence: combine with current_line and emit
1410                if !current_line.is_empty()
1411                    && !current_line.ends_with(' ')
1412                    && !current_line.ends_with('(')
1413                    && !current_line.ends_with('[')
1414                {
1415                    current_line.push(' ');
1416                }
1417                current_line.push_str(marker);
1418                current_line.push_str(trimmed);
1419                current_line.push_str(marker);
1420
1421                // Check if this is a complete sentence
1422                let ends_with_punct = ends_with_sentence_punct(trimmed);
1423                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1424                    lines.push(current_line.clone());
1425                    current_line.clear();
1426                }
1427            } else if i == sentences.len() - 1 {
1428                // Last sentence: check if complete
1429                let ends_with_punct = ends_with_sentence_punct(trimmed);
1430
1431                let mut line = String::new();
1432                line.push_str(marker);
1433                line.push_str(trimmed);
1434                line.push_str(marker);
1435
1436                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1437                    lines.push(line);
1438                } else {
1439                    // Incomplete - keep in current_line for potential continuation
1440                    *current_line = line;
1441                }
1442            } else {
1443                // Middle sentences: emit with markers
1444                let mut line = String::new();
1445                line.push_str(marker);
1446                line.push_str(trimmed);
1447                line.push_str(marker);
1448                lines.push(line);
1449            }
1450        }
1451    }
1452}
1453
/// English break-words used for semantic line break splitting.
/// These are conjunctions, relative pronouns, and conjunctive adverbs where
/// a line break reads naturally.
///
/// All entries are lowercase; `split_at_break_word` matches them against a
/// lowercased copy of the text and places the break *before* the word.
const BREAK_WORDS: &[&str] = &[
    // Coordinating conjunctions
    "and",
    "or",
    "but",
    "nor",
    "yet",
    "so",
    "for",
    // Relative pronouns
    "which",
    "that",
    // Subordinating conjunctions
    "because",
    "when",
    "if",
    "while",
    "where",
    "although",
    "though",
    "unless",
    "since",
    "after",
    "before",
    "until",
    "as",
    "once",
    "whether",
    // Conjunctive adverbs
    "however",
    "therefore",
    "moreover",
    "furthermore",
    "nevertheless",
    // Subordinating conjunction expressing contrast
    "whereas",
];
1489
/// Report whether `c` is one of the clause-level punctuation marks at which a
/// semantic line break may be placed: comma, semicolon, colon, or em dash.
fn is_clause_punctuation(c: char) -> bool {
    const CLAUSE_MARKS: [char; 4] = [',', ';', ':', '\u{2014}'];
    CLAUSE_MARKS.contains(&c)
}
1494
1495/// Compute element spans for a flat text representation of elements.
1496/// Returns Vec of (start, end) byte offsets for non-Text elements,
1497/// so we can check that a split position doesn't fall inside them.
1498fn compute_element_spans(elements: &[Element]) -> Vec<(usize, usize)> {
1499    let mut spans = Vec::new();
1500    let mut offset = 0;
1501    for element in elements {
1502        let rendered = format!("{element}");
1503        let len = rendered.len();
1504        if !matches!(element, Element::Text(_)) {
1505            spans.push((offset, offset + len));
1506        }
1507        offset += len;
1508    }
1509    spans
1510}
1511
/// Report whether byte position `pos` lies strictly inside one of `spans`.
///
/// Both bounds are exclusive: a position equal to a span's start or end sits
/// on the element's edge and is not considered inside it.
fn is_inside_element(pos: usize, spans: &[(usize, usize)]) -> bool {
    for &(start, end) in spans {
        if start < pos && pos < end {
            return true;
        }
    }
    false
}
1516
/// Minimum fraction of line_length that the first part of a split must occupy.
/// Prevents awkwardly short first lines like "A," or "Note:" on their own.
///
/// `reflow_elements_semantic` also uses this ratio as the threshold below
/// which a line counts as "very short" and is considered for merging back
/// into the previous line.
const MIN_SPLIT_RATIO: f64 = 0.3;
1520
1521/// Split a line at the latest clause punctuation that keeps the first part
1522/// within `line_length`. Returns None if no valid split point exists or if
1523/// the split would create an unreasonably short first line.
1524fn split_at_clause_punctuation(
1525    text: &str,
1526    line_length: usize,
1527    element_spans: &[(usize, usize)],
1528    length_mode: ReflowLengthMode,
1529) -> Option<(String, String)> {
1530    let chars: Vec<char> = text.chars().collect();
1531    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1532
1533    // Find the char index where accumulated display width exceeds line_length
1534    let mut width_acc = 0;
1535    let mut search_end_char = 0;
1536    for (idx, &c) in chars.iter().enumerate() {
1537        let c_width = display_len(&c.to_string(), length_mode);
1538        if width_acc + c_width > line_length {
1539            break;
1540        }
1541        width_acc += c_width;
1542        search_end_char = idx + 1;
1543    }
1544
1545    let mut best_pos = None;
1546    for i in (0..search_end_char).rev() {
1547        if is_clause_punctuation(chars[i]) {
1548            // Convert char position to byte position for element span check
1549            let byte_pos: usize = chars[..=i].iter().map(|c| c.len_utf8()).sum();
1550            if !is_inside_element(byte_pos, element_spans) {
1551                best_pos = Some(i);
1552                break;
1553            }
1554        }
1555    }
1556
1557    let pos = best_pos?;
1558
1559    // Reject splits that create very short first lines
1560    let first: String = chars[..=pos].iter().collect();
1561    let first_display_len = display_len(&first, length_mode);
1562    if first_display_len < min_first_len {
1563        return None;
1564    }
1565
1566    // Split after the punctuation character
1567    let rest: String = chars[pos + 1..].iter().collect();
1568    let rest = rest.trim_start().to_string();
1569
1570    if rest.is_empty() {
1571        return None;
1572    }
1573
1574    Some((first, rest))
1575}
1576
1577/// Split a line before the latest break-word that keeps the first part
1578/// within `line_length`. Returns None if no valid split point exists or if
1579/// the split would create an unreasonably short first line.
1580fn split_at_break_word(
1581    text: &str,
1582    line_length: usize,
1583    element_spans: &[(usize, usize)],
1584    length_mode: ReflowLengthMode,
1585) -> Option<(String, String)> {
1586    let lower = text.to_lowercase();
1587    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1588    let mut best_split: Option<(usize, usize)> = None; // (byte_start, word_len_bytes)
1589
1590    for &word in BREAK_WORDS {
1591        let mut search_start = 0;
1592        while let Some(pos) = lower[search_start..].find(word) {
1593            let abs_pos = search_start + pos;
1594
1595            // Verify it's a word boundary: preceded by space, followed by space
1596            let preceded_by_space = abs_pos == 0 || text.as_bytes().get(abs_pos - 1) == Some(&b' ');
1597            let followed_by_space = text.as_bytes().get(abs_pos + word.len()) == Some(&b' ');
1598
1599            if preceded_by_space && followed_by_space {
1600                // The break goes BEFORE the word, so first part ends at abs_pos - 1
1601                let first_part = text[..abs_pos].trim_end();
1602                let first_part_len = display_len(first_part, length_mode);
1603
1604                if first_part_len >= min_first_len
1605                    && first_part_len <= line_length
1606                    && !is_inside_element(abs_pos, element_spans)
1607                {
1608                    // Prefer the latest valid split point
1609                    if best_split.is_none_or(|(prev_pos, _)| abs_pos > prev_pos) {
1610                        best_split = Some((abs_pos, word.len()));
1611                    }
1612                }
1613            }
1614
1615            search_start = abs_pos + word.len();
1616        }
1617    }
1618
1619    let (byte_start, _word_len) = best_split?;
1620
1621    let first = text[..byte_start].trim_end().to_string();
1622    let rest = text[byte_start..].to_string();
1623
1624    if first.is_empty() || rest.trim().is_empty() {
1625        return None;
1626    }
1627
1628    Some((first, rest))
1629}
1630
1631/// Recursively cascade-split a line that exceeds line_length.
1632/// Tries clause punctuation first, then break-words, then word wrap.
1633fn cascade_split_line(
1634    text: &str,
1635    line_length: usize,
1636    abbreviations: &Option<Vec<String>>,
1637    length_mode: ReflowLengthMode,
1638    attr_lists: bool,
1639) -> Vec<String> {
1640    if line_length == 0 || display_len(text, length_mode) <= line_length {
1641        return vec![text.to_string()];
1642    }
1643
1644    let elements = parse_markdown_elements_inner(text, attr_lists);
1645    let element_spans = compute_element_spans(&elements);
1646
1647    // Try clause punctuation split
1648    if let Some((first, rest)) = split_at_clause_punctuation(text, line_length, &element_spans, length_mode) {
1649        let mut result = vec![first];
1650        result.extend(cascade_split_line(
1651            &rest,
1652            line_length,
1653            abbreviations,
1654            length_mode,
1655            attr_lists,
1656        ));
1657        return result;
1658    }
1659
1660    // Try break-word split
1661    if let Some((first, rest)) = split_at_break_word(text, line_length, &element_spans, length_mode) {
1662        let mut result = vec![first];
1663        result.extend(cascade_split_line(
1664            &rest,
1665            line_length,
1666            abbreviations,
1667            length_mode,
1668            attr_lists,
1669        ));
1670        return result;
1671    }
1672
1673    // Fallback: word wrap using existing reflow_elements
1674    let options = ReflowOptions {
1675        line_length,
1676        break_on_sentences: false,
1677        preserve_breaks: false,
1678        sentence_per_line: false,
1679        semantic_line_breaks: false,
1680        abbreviations: abbreviations.clone(),
1681        length_mode,
1682        attr_lists,
1683    };
1684    reflow_elements(&elements, &options)
1685}
1686
1687/// Reflow elements using semantic line breaks strategy:
1688/// 1. Split at sentence boundaries (always)
1689/// 2. For lines exceeding line_length, cascade through clause punct → break-words → word wrap
1690fn reflow_elements_semantic(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1691    // Step 1: Split into sentences using existing sentence-per-line logic
1692    let sentence_lines = reflow_elements_sentence_per_line(elements, &options.abbreviations);
1693
1694    // Step 2: For each sentence line, apply cascading splits if it exceeds line_length
1695    // When line_length is 0 (unlimited), skip cascading — sentence splits only
1696    if options.line_length == 0 {
1697        return sentence_lines;
1698    }
1699
1700    let length_mode = options.length_mode;
1701    let mut result = Vec::new();
1702    for line in sentence_lines {
1703        if display_len(&line, length_mode) <= options.line_length {
1704            result.push(line);
1705        } else {
1706            result.extend(cascade_split_line(
1707                &line,
1708                options.line_length,
1709                &options.abbreviations,
1710                length_mode,
1711                options.attr_lists,
1712            ));
1713        }
1714    }
1715
1716    // Step 3: Merge very short trailing lines back into the previous line.
1717    // Word wrap can produce lines like "was" or "see" on their own, which reads poorly.
1718    let min_line_len = ((options.line_length as f64) * MIN_SPLIT_RATIO) as usize;
1719    let mut merged: Vec<String> = Vec::with_capacity(result.len());
1720    for line in result {
1721        if !merged.is_empty() && display_len(&line, length_mode) < min_line_len && !line.trim().is_empty() {
1722            // Don't merge across sentence boundaries — sentence splits are intentional
1723            let prev_ends_at_sentence = {
1724                let trimmed = merged.last().unwrap().trim_end();
1725                trimmed
1726                    .chars()
1727                    .rev()
1728                    .find(|c| !matches!(c, '"' | '\'' | '\u{201D}' | '\u{2019}' | ')' | ']'))
1729                    .is_some_and(|c| matches!(c, '.' | '!' | '?'))
1730            };
1731
1732            if !prev_ends_at_sentence {
1733                let prev = merged.last_mut().unwrap();
1734                let combined = format!("{prev} {line}");
1735                // Only merge if the combined line fits within the limit
1736                if display_len(&combined, length_mode) <= options.line_length {
1737                    *prev = combined;
1738                    continue;
1739                }
1740            }
1741        }
1742        merged.push(line);
1743    }
1744    merged
1745}
1746
/// Locate the rightmost space in `line` at which the line may be split.
///
/// `element_spans` holds the `(start, end)` byte ranges of rendered non-Text
/// elements within `line`. A space is unsafe when it lies strictly inside one
/// of those ranges. The bounds are exclusive because element delimiters
/// (e.g., `[`, `]`, `(`, `)`, `<`, `>`, `` ` ``) are never spaces, so only
/// interior positions need protection.
///
/// Returns the byte offset of the space, or `None` when no safe space exists.
fn rfind_safe_space(line: &str, element_spans: &[(usize, usize)]) -> Option<usize> {
    line.rmatch_indices(' ')
        .map(|(offset, _)| offset)
        .find(|&offset| {
            element_spans
                .iter()
                .all(|&(start, end)| offset <= start || offset >= end)
        })
}
1760
1761/// Reflow elements into lines that fit within the line length
1762fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1763    let mut lines = Vec::new();
1764    let mut current_line = String::new();
1765    let mut current_length = 0;
1766    // Track byte spans of non-Text elements in current_line for safe splitting
1767    let mut current_line_element_spans: Vec<(usize, usize)> = Vec::new();
1768    let length_mode = options.length_mode;
1769
1770    for (idx, element) in elements.iter().enumerate() {
1771        let element_str = format!("{element}");
1772        let element_len = element.display_width(length_mode);
1773
1774        // Determine adjacency from the original elements, not from current_line.
1775        // Elements are adjacent when there's no whitespace between them in the source:
1776        // - Text("v") → HugoShortcode("{{<...>}}") = adjacent (text has no trailing space)
1777        // - Text(" and ") → InlineLink("[a](url)") = NOT adjacent (text has trailing space)
1778        // - HugoShortcode("{{<...>}}") → Text(",") = adjacent (text has no leading space)
1779        let is_adjacent_to_prev = if idx > 0 {
1780            match (&elements[idx - 1], element) {
1781                (Element::Text(t), _) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1782                (_, Element::Text(t)) => !t.is_empty() && !t.starts_with(char::is_whitespace),
1783                _ => true,
1784            }
1785        } else {
1786            false
1787        };
1788
1789        // For text elements that might need breaking
1790        if let Element::Text(text) = element {
1791            // Check if original text had leading whitespace
1792            let has_leading_space = text.starts_with(char::is_whitespace);
1793            // If this is a text element, always process it word by word
1794            let words: Vec<&str> = text.split_whitespace().collect();
1795
1796            for (i, word) in words.iter().enumerate() {
1797                let word_len = display_len(word, length_mode);
1798                // Check if this "word" is just punctuation that should stay attached
1799                let is_trailing_punct = word
1800                    .chars()
1801                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1802
1803                // First word of text adjacent to preceding non-text element
1804                // must stay attached (e.g., shortcode followed by punctuation or text)
1805                let is_first_adjacent = i == 0 && is_adjacent_to_prev;
1806
1807                if is_first_adjacent {
1808                    // Attach directly without space, preventing line break
1809                    if current_length + word_len > options.line_length && current_length > 0 {
1810                        // Would exceed — break before the adjacent group
1811                        // Use element-aware space search to avoid splitting inside links/code/etc.
1812                        if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1813                            let before = current_line[..last_space].trim_end().to_string();
1814                            let after = current_line[last_space + 1..].to_string();
1815                            lines.push(before);
1816                            current_line = format!("{after}{word}");
1817                            current_length = display_len(&current_line, length_mode);
1818                            current_line_element_spans.clear();
1819                        } else {
1820                            current_line.push_str(word);
1821                            current_length += word_len;
1822                        }
1823                    } else {
1824                        current_line.push_str(word);
1825                        current_length += word_len;
1826                    }
1827                } else if current_length > 0
1828                    && current_length + 1 + word_len > options.line_length
1829                    && !is_trailing_punct
1830                {
1831                    // Start a new line (but never for trailing punctuation)
1832                    lines.push(current_line.trim().to_string());
1833                    current_line = word.to_string();
1834                    current_length = word_len;
1835                    current_line_element_spans.clear();
1836                } else {
1837                    // Add word to current line
1838                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1839                    // AND this isn't trailing punctuation (which attaches directly)
1840                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1841                        current_line.push(' ');
1842                        current_length += 1;
1843                    }
1844                    current_line.push_str(word);
1845                    current_length += word_len;
1846                }
1847            }
1848        } else if matches!(
1849            element,
1850            Element::Italic { .. } | Element::Bold { .. } | Element::Strikethrough(_)
1851        ) && element_len > options.line_length
1852        {
1853            // Italic, bold, and strikethrough with content longer than line_length need word wrapping.
1854            // Split content word-by-word, attach the opening marker to the first word
1855            // and the closing marker to the last word.
1856            let (content, marker): (&str, &str) = match element {
1857                Element::Italic { content, underscore } => (content.as_str(), if *underscore { "_" } else { "*" }),
1858                Element::Bold { content, underscore } => (content.as_str(), if *underscore { "__" } else { "**" }),
1859                Element::Strikethrough(content) => (content.as_str(), "~~"),
1860                _ => unreachable!(),
1861            };
1862
1863            let words: Vec<&str> = content.split_whitespace().collect();
1864            let n = words.len();
1865
1866            if n == 0 {
1867                // Empty span — treat as atomic
1868                let full = format!("{marker}{marker}");
1869                let full_len = display_len(&full, length_mode);
1870                if !is_adjacent_to_prev && current_length > 0 {
1871                    current_line.push(' ');
1872                    current_length += 1;
1873                }
1874                current_line.push_str(&full);
1875                current_length += full_len;
1876            } else {
1877                for (i, word) in words.iter().enumerate() {
1878                    let is_first = i == 0;
1879                    let is_last = i == n - 1;
1880                    let word_str: String = match (is_first, is_last) {
1881                        (true, true) => format!("{marker}{word}{marker}"),
1882                        (true, false) => format!("{marker}{word}"),
1883                        (false, true) => format!("{word}{marker}"),
1884                        (false, false) => word.to_string(),
1885                    };
1886                    let word_len = display_len(&word_str, length_mode);
1887
1888                    let needs_space = if is_first {
1889                        !is_adjacent_to_prev && current_length > 0
1890                    } else {
1891                        current_length > 0
1892                    };
1893
1894                    if needs_space && current_length + 1 + word_len > options.line_length {
1895                        lines.push(current_line.trim_end().to_string());
1896                        current_line = word_str;
1897                        current_length = word_len;
1898                        current_line_element_spans.clear();
1899                    } else {
1900                        if needs_space {
1901                            current_line.push(' ');
1902                            current_length += 1;
1903                        }
1904                        current_line.push_str(&word_str);
1905                        current_length += word_len;
1906                    }
1907                }
1908            }
1909        } else {
1910            // For non-text elements (code, links, references), treat as atomic units
1911            // These should never be broken across lines
1912
1913            if is_adjacent_to_prev {
1914                // Adjacent to preceding text — attach directly without space
1915                if current_length + element_len > options.line_length {
1916                    // Would exceed limit — break before the adjacent word group
1917                    // Use element-aware space search to avoid splitting inside links/code/etc.
1918                    if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1919                        let before = current_line[..last_space].trim_end().to_string();
1920                        let after = current_line[last_space + 1..].to_string();
1921                        lines.push(before);
1922                        current_line = format!("{after}{element_str}");
1923                        current_length = display_len(&current_line, length_mode);
1924                        current_line_element_spans.clear();
1925                        // Record the element span in the new current_line
1926                        let start = after.len();
1927                        current_line_element_spans.push((start, start + element_str.len()));
1928                    } else {
1929                        // No safe space to break at — accept the long line
1930                        let start = current_line.len();
1931                        current_line.push_str(&element_str);
1932                        current_length += element_len;
1933                        current_line_element_spans.push((start, current_line.len()));
1934                    }
1935                } else {
1936                    let start = current_line.len();
1937                    current_line.push_str(&element_str);
1938                    current_length += element_len;
1939                    current_line_element_spans.push((start, current_line.len()));
1940                }
1941            } else if current_length > 0 && current_length + 1 + element_len > options.line_length {
1942                // Not adjacent, would exceed — start new line
1943                lines.push(current_line.trim().to_string());
1944                current_line = element_str.clone();
1945                current_length = element_len;
1946                current_line_element_spans.clear();
1947                current_line_element_spans.push((0, element_str.len()));
1948            } else {
1949                // Not adjacent, fits — add with space
1950                let ends_with_opener =
1951                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1952                if current_length > 0 && !ends_with_opener {
1953                    current_line.push(' ');
1954                    current_length += 1;
1955                }
1956                let start = current_line.len();
1957                current_line.push_str(&element_str);
1958                current_length += element_len;
1959                current_line_element_spans.push((start, current_line.len()));
1960            }
1961        }
1962    }
1963
1964    // Don't forget the last line
1965    if !current_line.is_empty() {
1966        lines.push(current_line.trim_end().to_string());
1967    }
1968
1969    lines
1970}
1971
1972/// Reflow markdown content preserving structure
1973pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1974    let lines: Vec<&str> = content.lines().collect();
1975    let mut result = Vec::new();
1976    let mut i = 0;
1977
1978    while i < lines.len() {
1979        let line = lines[i];
1980        let trimmed = line.trim();
1981
1982        // Preserve empty lines
1983        if trimmed.is_empty() {
1984            result.push(String::new());
1985            i += 1;
1986            continue;
1987        }
1988
1989        // Preserve headings as-is
1990        if trimmed.starts_with('#') {
1991            result.push(line.to_string());
1992            i += 1;
1993            continue;
1994        }
1995
1996        // Preserve Quarto/Pandoc div markers (:::) as-is
1997        if trimmed.starts_with(":::") {
1998            result.push(line.to_string());
1999            i += 1;
2000            continue;
2001        }
2002
2003        // Preserve fenced code blocks
2004        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2005            result.push(line.to_string());
2006            i += 1;
2007            // Copy lines until closing fence
2008            while i < lines.len() {
2009                result.push(lines[i].to_string());
2010                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
2011                    i += 1;
2012                    break;
2013                }
2014                i += 1;
2015            }
2016            continue;
2017        }
2018
2019        // Preserve indented code blocks (4+ columns accounting for tab expansion)
2020        if ElementCache::calculate_indentation_width_default(line) >= 4 {
2021            // Collect all consecutive indented lines
2022            result.push(line.to_string());
2023            i += 1;
2024            while i < lines.len() {
2025                let next_line = lines[i];
2026                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
2027                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
2028                    result.push(next_line.to_string());
2029                    i += 1;
2030                } else {
2031                    break;
2032                }
2033            }
2034            continue;
2035        }
2036
2037        // Preserve block quotes (but reflow their content)
2038        if trimmed.starts_with('>') {
2039            // find() returns byte position which is correct for str slicing
2040            // The unwrap is safe because we already verified trimmed starts with '>'
2041            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
2042            let quote_prefix = line[0..gt_pos + 1].to_string();
2043            let quote_content = &line[quote_prefix.len()..].trim_start();
2044
2045            let reflowed = reflow_line(quote_content, options);
2046            for reflowed_line in reflowed.iter() {
2047                result.push(format!("{quote_prefix} {reflowed_line}"));
2048            }
2049            i += 1;
2050            continue;
2051        }
2052
2053        // Preserve horizontal rules first (before checking for lists)
2054        if is_horizontal_rule(trimmed) {
2055            result.push(line.to_string());
2056            i += 1;
2057            continue;
2058        }
2059
2060        // Preserve lists (but not horizontal rules)
2061        if is_unordered_list_marker(trimmed) || is_numbered_list_item(trimmed) {
2062            // Find the list marker and preserve indentation
2063            let indent = line.len() - line.trim_start().len();
2064            let indent_str = " ".repeat(indent);
2065
2066            // For numbered lists, find the period and the space after it
2067            // For bullet lists, find the marker and the space after it
2068            let mut marker_end = indent;
2069            let mut content_start = indent;
2070
2071            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
2072                // Numbered list: find the period
2073                if let Some(period_pos) = line[indent..].find('.') {
2074                    marker_end = indent + period_pos + 1; // Include the period
2075                    content_start = marker_end;
2076                    // Skip any spaces after the period to find content start
2077                    // Use byte-based check since content_start is a byte index
2078                    // This is safe because space is ASCII (single byte)
2079                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2080                        content_start += 1;
2081                    }
2082                }
2083            } else {
2084                // Bullet list: marker is single character
2085                marker_end = indent + 1; // Just the marker character
2086                content_start = marker_end;
2087                // Skip any spaces after the marker
2088                // Use byte-based check since content_start is a byte index
2089                // This is safe because space is ASCII (single byte)
2090                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2091                    content_start += 1;
2092                }
2093            }
2094
2095            let marker = &line[indent..marker_end];
2096
2097            // Collect all content for this list item (including continuation lines)
2098            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
2099            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
2100            i += 1;
2101
2102            // Collect continuation lines (indented lines that are part of this list item)
2103            while i < lines.len() {
2104                let next_line = lines[i];
2105                let next_trimmed = next_line.trim();
2106
2107                // Stop if we hit an empty line or another list item or special block
2108                if is_block_boundary(next_trimmed) {
2109                    break;
2110                }
2111
2112                // Check if this line is indented (continuation of list item)
2113                let next_indent = next_line.len() - next_line.trim_start().len();
2114                if next_indent >= content_start {
2115                    // This is a continuation line - add its content
2116                    // Preserve hard breaks while trimming excessive whitespace
2117                    let trimmed_start = next_line.trim_start();
2118                    list_content.push(trim_preserving_hard_break(trimmed_start));
2119                    i += 1;
2120                } else {
2121                    // Not indented enough, not part of this list item
2122                    break;
2123                }
2124            }
2125
2126            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
2127            // Hard breaks should prevent joining with the next line
2128            let combined_content = if options.preserve_breaks {
2129                list_content[0].clone()
2130            } else {
2131                // Check if any lines have hard breaks - if so, preserve the structure
2132                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
2133                if has_hard_breaks {
2134                    // Don't join lines with hard breaks - keep them separate with newlines
2135                    list_content.join("\n")
2136                } else {
2137                    // No hard breaks, safe to join with spaces
2138                    list_content.join(" ")
2139                }
2140            };
2141
2142            // Calculate the proper indentation for continuation lines
2143            let trimmed_marker = marker;
2144            let continuation_spaces = content_start;
2145
2146            // Adjust line length to account for list marker and space
2147            let prefix_length = indent + trimmed_marker.len() + 1;
2148
2149            // Create adjusted options with reduced line length
2150            let adjusted_options = ReflowOptions {
2151                line_length: options.line_length.saturating_sub(prefix_length),
2152                ..options.clone()
2153            };
2154
2155            let reflowed = reflow_line(&combined_content, &adjusted_options);
2156            for (j, reflowed_line) in reflowed.iter().enumerate() {
2157                if j == 0 {
2158                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
2159                } else {
2160                    // Continuation lines aligned with text after marker
2161                    let continuation_indent = " ".repeat(continuation_spaces);
2162                    result.push(format!("{continuation_indent}{reflowed_line}"));
2163                }
2164            }
2165            continue;
2166        }
2167
2168        // Preserve tables
2169        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
2170            result.push(line.to_string());
2171            i += 1;
2172            continue;
2173        }
2174
2175        // Preserve reference definitions
2176        if trimmed.starts_with('[') && line.contains("]:") {
2177            result.push(line.to_string());
2178            i += 1;
2179            continue;
2180        }
2181
2182        // Preserve definition list items (extended markdown)
2183        if is_definition_list_item(trimmed) {
2184            result.push(line.to_string());
2185            i += 1;
2186            continue;
2187        }
2188
2189        // Check if this is a single line that doesn't need processing
2190        let mut is_single_line_paragraph = true;
2191        if i + 1 < lines.len() {
2192            let next_trimmed = lines[i + 1].trim();
2193            // Check if next line continues this paragraph
2194            if !is_block_boundary(next_trimmed) {
2195                is_single_line_paragraph = false;
2196            }
2197        }
2198
2199        // If it's a single line that fits, just add it as-is
2200        if is_single_line_paragraph && display_len(line, options.length_mode) <= options.line_length {
2201            result.push(line.to_string());
2202            i += 1;
2203            continue;
2204        }
2205
2206        // For regular paragraphs, collect consecutive lines
2207        let mut paragraph_parts = Vec::new();
2208        let mut current_part = vec![line];
2209        i += 1;
2210
2211        // If preserve_breaks is true, treat each line separately
2212        if options.preserve_breaks {
2213            // Don't collect consecutive lines - just reflow this single line
2214            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
2215                Some("\\")
2216            } else if line.ends_with("  ") {
2217                Some("  ")
2218            } else {
2219                None
2220            };
2221            let reflowed = reflow_line(line, options);
2222
2223            // Preserve hard breaks (two trailing spaces or backslash)
2224            if let Some(break_marker) = hard_break_type {
2225                if !reflowed.is_empty() {
2226                    let mut reflowed_with_break = reflowed;
2227                    let last_idx = reflowed_with_break.len() - 1;
2228                    if !has_hard_break(&reflowed_with_break[last_idx]) {
2229                        reflowed_with_break[last_idx].push_str(break_marker);
2230                    }
2231                    result.extend(reflowed_with_break);
2232                }
2233            } else {
2234                result.extend(reflowed);
2235            }
2236        } else {
2237            // Original behavior: collect consecutive lines into a paragraph
2238            while i < lines.len() {
2239                let prev_line = if !current_part.is_empty() {
2240                    current_part.last().unwrap()
2241                } else {
2242                    ""
2243                };
2244                let next_line = lines[i];
2245                let next_trimmed = next_line.trim();
2246
2247                // Stop at empty lines or special blocks
2248                if is_block_boundary(next_trimmed) {
2249                    break;
2250                }
2251
2252                // Check if previous line ends with hard break (two spaces or backslash)
2253                // or is a complete sentence in sentence_per_line mode
2254                let prev_trimmed = prev_line.trim();
2255                let abbreviations = get_abbreviations(&options.abbreviations);
2256                let ends_with_sentence = (prev_trimmed.ends_with('.')
2257                    || prev_trimmed.ends_with('!')
2258                    || prev_trimmed.ends_with('?')
2259                    || prev_trimmed.ends_with(".*")
2260                    || prev_trimmed.ends_with("!*")
2261                    || prev_trimmed.ends_with("?*")
2262                    || prev_trimmed.ends_with("._")
2263                    || prev_trimmed.ends_with("!_")
2264                    || prev_trimmed.ends_with("?_")
2265                    // Quote-terminated sentences (straight and curly quotes)
2266                    || prev_trimmed.ends_with(".\"")
2267                    || prev_trimmed.ends_with("!\"")
2268                    || prev_trimmed.ends_with("?\"")
2269                    || prev_trimmed.ends_with(".'")
2270                    || prev_trimmed.ends_with("!'")
2271                    || prev_trimmed.ends_with("?'")
2272                    || prev_trimmed.ends_with(".\u{201D}")
2273                    || prev_trimmed.ends_with("!\u{201D}")
2274                    || prev_trimmed.ends_with("?\u{201D}")
2275                    || prev_trimmed.ends_with(".\u{2019}")
2276                    || prev_trimmed.ends_with("!\u{2019}")
2277                    || prev_trimmed.ends_with("?\u{2019}"))
2278                    && !text_ends_with_abbreviation(
2279                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
2280                        &abbreviations,
2281                    );
2282
2283                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
2284                    // Start a new part after hard break or complete sentence
2285                    paragraph_parts.push(current_part.join(" "));
2286                    current_part = vec![next_line];
2287                } else {
2288                    current_part.push(next_line);
2289                }
2290                i += 1;
2291            }
2292
2293            // Add the last part
2294            if !current_part.is_empty() {
2295                if current_part.len() == 1 {
2296                    // Single line, don't add trailing space
2297                    paragraph_parts.push(current_part[0].to_string());
2298                } else {
2299                    paragraph_parts.push(current_part.join(" "));
2300                }
2301            }
2302
2303            // Reflow each part separately, preserving hard breaks
2304            for (j, part) in paragraph_parts.iter().enumerate() {
2305                let reflowed = reflow_line(part, options);
2306                result.extend(reflowed);
2307
2308                // Preserve hard break by ensuring last line of part ends with hard break marker
2309                // Use two spaces as the default hard break format for reflows
2310                // But don't add hard breaks in sentence_per_line mode - lines are already separate
2311                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
2312                    let last_idx = result.len() - 1;
2313                    if !has_hard_break(&result[last_idx]) {
2314                        result[last_idx].push_str("  ");
2315                    }
2316                }
2317            }
2318        }
2319    }
2320
2321    // Preserve trailing newline if the original content had one
2322    let result_text = result.join("\n");
2323    if content.ends_with('\n') && !result_text.ends_with('\n') {
2324        format!("{result_text}\n")
2325    } else {
2326        result_text
2327    }
2328}
2329
/// Outcome of reflowing a single paragraph.
///
/// Pairs the byte range the paragraph occupied in the original document with
/// the text that should replace that range.
#[derive(Clone, Debug)]
pub struct ParagraphReflow {
    /// Byte offset in the original content at which the paragraph begins.
    pub start_byte: usize,
    /// Byte offset in the original content at which the paragraph ends.
    pub end_byte: usize,
    /// Replacement text produced by reflowing the paragraph.
    pub reflowed_text: String,
}
2340
/// One line gathered from a blockquote paragraph, kept for style-preserving reflow.
///
/// Invariant: `is_explicit` is `true` exactly when `prefix` is `Some`. Both
/// constructors below uphold it, so build values through
/// [`BlockquoteLineData::explicit`] or [`BlockquoteLineData::lazy`] instead of
/// writing the struct literal by hand.
#[derive(Clone, Debug)]
pub struct BlockquoteLineData {
    /// Line text with the blockquote prefix removed.
    pub(crate) content: String,
    /// `true` when the source line carried its own blockquote marker.
    pub(crate) is_explicit: bool,
    /// The complete prefix of the source line (such as `"> "` or `"> > "`);
    /// `None` for a lazy continuation line.
    pub(crate) prefix: Option<String>,
}

impl BlockquoteLineData {
    /// Build the record for a line that carries a blockquote marker.
    pub fn explicit(content: String, prefix: String) -> Self {
        let prefix = Some(prefix);
        Self {
            is_explicit: true,
            content,
            prefix,
        }
    }

    /// Build the record for a lazy continuation line, i.e. one without a marker.
    pub fn lazy(content: String) -> Self {
        Self {
            is_explicit: false,
            prefix: None,
            content,
        }
    }
}
2375
/// How the continuation lines of a reflowed blockquote paragraph are written.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BlockquoteContinuationStyle {
    /// Continuation lines repeat the blockquote prefix.
    Explicit,
    /// Continuation lines are emitted without a prefix (lazy continuation).
    Lazy,
}
2382
2383/// Determine the continuation style for a blockquote paragraph from its collected lines.
2384///
2385/// The first line is always explicit (it carries the marker), so only continuation
2386/// lines (index 1+) are counted. Ties resolve to `Explicit`.
2387///
2388/// When the slice has only one element (no continuation lines to inspect), both
2389/// counts are zero and the tie-breaking rule returns `Explicit`.
2390pub fn blockquote_continuation_style(lines: &[BlockquoteLineData]) -> BlockquoteContinuationStyle {
2391    let mut explicit_count = 0usize;
2392    let mut lazy_count = 0usize;
2393
2394    for line in lines.iter().skip(1) {
2395        if line.is_explicit {
2396            explicit_count += 1;
2397        } else {
2398            lazy_count += 1;
2399        }
2400    }
2401
2402    if explicit_count > 0 && lazy_count == 0 {
2403        BlockquoteContinuationStyle::Explicit
2404    } else if lazy_count > 0 && explicit_count == 0 {
2405        BlockquoteContinuationStyle::Lazy
2406    } else if explicit_count >= lazy_count {
2407        BlockquoteContinuationStyle::Explicit
2408    } else {
2409        BlockquoteContinuationStyle::Lazy
2410    }
2411}
2412
2413/// Determine the dominant blockquote prefix for a paragraph.
2414///
2415/// The most frequently occurring explicit prefix wins. Ties are broken by earliest
2416/// first appearance. Falls back to `fallback` when no explicit lines are present.
2417pub fn dominant_blockquote_prefix(lines: &[BlockquoteLineData], fallback: &str) -> String {
2418    let mut counts: std::collections::HashMap<String, (usize, usize)> = std::collections::HashMap::new();
2419
2420    for (idx, line) in lines.iter().enumerate() {
2421        let Some(prefix) = line.prefix.as_ref() else {
2422            continue;
2423        };
2424        counts
2425            .entry(prefix.clone())
2426            .and_modify(|entry| entry.0 += 1)
2427            .or_insert((1, idx));
2428    }
2429
2430    counts
2431        .into_iter()
2432        .max_by(|(_, (count_a, first_idx_a)), (_, (count_b, first_idx_b))| {
2433            count_a.cmp(count_b).then_with(|| first_idx_b.cmp(first_idx_a))
2434        })
2435        .map(|(prefix, _)| prefix)
2436        .unwrap_or_else(|| fallback.to_string())
2437}
2438
2439/// Whether a reflowed blockquote content line must carry an explicit prefix.
2440///
2441/// Lines that would start a new block structure (headings, fences, lists, etc.)
2442/// cannot safely use lazy continuation syntax.
2443pub(crate) fn should_force_explicit_blockquote_line(content_line: &str) -> bool {
2444    let trimmed = content_line.trim_start();
2445    trimmed.starts_with('>')
2446        || trimmed.starts_with('#')
2447        || trimmed.starts_with("```")
2448        || trimmed.starts_with("~~~")
2449        || is_unordered_list_marker(trimmed)
2450        || is_numbered_list_item(trimmed)
2451        || is_horizontal_rule(trimmed)
2452        || is_definition_list_item(trimmed)
2453        || (trimmed.starts_with('[') && trimmed.contains("]:"))
2454        || trimmed.starts_with(":::")
2455        || (trimmed.starts_with('<')
2456            && !trimmed.starts_with("<http")
2457            && !trimmed.starts_with("<https")
2458            && !trimmed.starts_with("<mailto:"))
2459}
2460
2461/// Reflow blockquote content lines and apply continuation style.
2462///
2463/// Segments separated by hard breaks are reflowed independently. The output lines
2464/// receive blockquote prefixes according to `continuation_style`: the first line and
2465/// any line that would start a new block structure always get an explicit prefix;
2466/// other lines follow the detected style.
2467///
2468/// Returns the styled, reflowed lines (without a trailing newline).
2469pub fn reflow_blockquote_content(
2470    lines: &[BlockquoteLineData],
2471    explicit_prefix: &str,
2472    continuation_style: BlockquoteContinuationStyle,
2473    options: &ReflowOptions,
2474) -> Vec<String> {
2475    let content_strs: Vec<&str> = lines.iter().map(|l| l.content.as_str()).collect();
2476    let segments = split_into_segments_strs(&content_strs);
2477    let mut reflowed_content_lines: Vec<String> = Vec::new();
2478
2479    for segment in segments {
2480        let hard_break_type = segment.last().and_then(|&line| {
2481            let line = line.strip_suffix('\r').unwrap_or(line);
2482            if line.ends_with('\\') {
2483                Some("\\")
2484            } else if line.ends_with("  ") {
2485                Some("  ")
2486            } else {
2487                None
2488            }
2489        });
2490
2491        let pieces: Vec<&str> = segment
2492            .iter()
2493            .map(|&line| {
2494                if let Some(l) = line.strip_suffix('\\') {
2495                    l.trim_end()
2496                } else if let Some(l) = line.strip_suffix("  ") {
2497                    l.trim_end()
2498                } else {
2499                    line.trim_end()
2500                }
2501            })
2502            .collect();
2503
2504        let segment_text = pieces.join(" ");
2505        let segment_text = segment_text.trim();
2506        if segment_text.is_empty() {
2507            continue;
2508        }
2509
2510        let mut reflowed = reflow_line(segment_text, options);
2511        if let Some(break_marker) = hard_break_type
2512            && !reflowed.is_empty()
2513        {
2514            let last_idx = reflowed.len() - 1;
2515            if !has_hard_break(&reflowed[last_idx]) {
2516                reflowed[last_idx].push_str(break_marker);
2517            }
2518        }
2519        reflowed_content_lines.extend(reflowed);
2520    }
2521
2522    let mut styled_lines: Vec<String> = Vec::new();
2523    for (idx, line) in reflowed_content_lines.iter().enumerate() {
2524        let force_explicit = idx == 0
2525            || continuation_style == BlockquoteContinuationStyle::Explicit
2526            || should_force_explicit_blockquote_line(line);
2527        if force_explicit {
2528            styled_lines.push(format!("{explicit_prefix}{line}"));
2529        } else {
2530            styled_lines.push(line.clone());
2531        }
2532    }
2533
2534    styled_lines
2535}
2536
2537fn is_blockquote_content_boundary(content: &str) -> bool {
2538    let trimmed = content.trim();
2539    trimmed.is_empty()
2540        || is_block_boundary(trimmed)
2541        || crate::utils::table_utils::TableUtils::is_potential_table_row(content)
2542        || trimmed.starts_with(":::")
2543        || crate::utils::is_template_directive_only(content)
2544        || is_standalone_attr_list(content)
2545        || is_snippet_block_delimiter(content)
2546}
2547
2548fn split_into_segments_strs<'a>(lines: &[&'a str]) -> Vec<Vec<&'a str>> {
2549    let mut segments = Vec::new();
2550    let mut current = Vec::new();
2551
2552    for &line in lines {
2553        current.push(line);
2554        if has_hard_break(line) {
2555            segments.push(current);
2556            current = Vec::new();
2557        }
2558    }
2559
2560    if !current.is_empty() {
2561        segments.push(current);
2562    }
2563
2564    segments
2565}
2566
2567fn reflow_blockquote_paragraph_at_line(
2568    content: &str,
2569    lines: &[&str],
2570    target_idx: usize,
2571    options: &ReflowOptions,
2572) -> Option<ParagraphReflow> {
2573    let mut anchor_idx = target_idx;
2574    let mut target_level = if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[target_idx]) {
2575        parsed.nesting_level
2576    } else {
2577        let mut found = None;
2578        let mut idx = target_idx;
2579        loop {
2580            if lines[idx].trim().is_empty() {
2581                break;
2582            }
2583            if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[idx]) {
2584                found = Some((idx, parsed.nesting_level));
2585                break;
2586            }
2587            if idx == 0 {
2588                break;
2589            }
2590            idx -= 1;
2591        }
2592        let (idx, level) = found?;
2593        anchor_idx = idx;
2594        level
2595    };
2596
2597    // Expand backward to capture prior quote content at the same nesting level.
2598    let mut para_start = anchor_idx;
2599    while para_start > 0 {
2600        let prev_idx = para_start - 1;
2601        let prev_line = lines[prev_idx];
2602
2603        if prev_line.trim().is_empty() {
2604            break;
2605        }
2606
2607        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(prev_line) {
2608            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2609                break;
2610            }
2611            para_start = prev_idx;
2612            continue;
2613        }
2614
2615        let prev_lazy = prev_line.trim_start();
2616        if is_blockquote_content_boundary(prev_lazy) {
2617            break;
2618        }
2619        para_start = prev_idx;
2620    }
2621
2622    // Lazy continuation cannot precede the first explicit marker.
2623    while para_start < lines.len() {
2624        let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[para_start]) else {
2625            para_start += 1;
2626            continue;
2627        };
2628        target_level = parsed.nesting_level;
2629        break;
2630    }
2631
2632    if para_start >= lines.len() || para_start > target_idx {
2633        return None;
2634    }
2635
2636    // Collect explicit lines at target level and lazy continuation lines.
2637    // Each entry is (original_line_idx, BlockquoteLineData).
2638    let mut collected: Vec<(usize, BlockquoteLineData)> = Vec::new();
2639    let mut idx = para_start;
2640    while idx < lines.len() {
2641        if !collected.is_empty() && has_hard_break(&collected[collected.len() - 1].1.content) {
2642            break;
2643        }
2644
2645        let line = lines[idx];
2646        if line.trim().is_empty() {
2647            break;
2648        }
2649
2650        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(line) {
2651            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2652                break;
2653            }
2654            collected.push((
2655                idx,
2656                BlockquoteLineData::explicit(trim_preserving_hard_break(parsed.content), parsed.prefix.to_string()),
2657            ));
2658            idx += 1;
2659            continue;
2660        }
2661
2662        let lazy_content = line.trim_start();
2663        if is_blockquote_content_boundary(lazy_content) {
2664            break;
2665        }
2666
2667        collected.push((idx, BlockquoteLineData::lazy(trim_preserving_hard_break(lazy_content))));
2668        idx += 1;
2669    }
2670
2671    if collected.is_empty() {
2672        return None;
2673    }
2674
2675    let para_end = collected[collected.len() - 1].0;
2676    if target_idx < para_start || target_idx > para_end {
2677        return None;
2678    }
2679
2680    let line_data: Vec<BlockquoteLineData> = collected.iter().map(|(_, d)| d.clone()).collect();
2681
2682    let fallback_prefix = line_data
2683        .iter()
2684        .find_map(|d| d.prefix.clone())
2685        .unwrap_or_else(|| "> ".to_string());
2686    let explicit_prefix = dominant_blockquote_prefix(&line_data, &fallback_prefix);
2687    let continuation_style = blockquote_continuation_style(&line_data);
2688
2689    let adjusted_line_length = options
2690        .line_length
2691        .saturating_sub(display_len(&explicit_prefix, options.length_mode))
2692        .max(1);
2693
2694    let adjusted_options = ReflowOptions {
2695        line_length: adjusted_line_length,
2696        ..options.clone()
2697    };
2698
2699    let styled_lines = reflow_blockquote_content(&line_data, &explicit_prefix, continuation_style, &adjusted_options);
2700
2701    if styled_lines.is_empty() {
2702        return None;
2703    }
2704
2705    // Calculate byte offsets.
2706    let mut start_byte = 0;
2707    for line in lines.iter().take(para_start) {
2708        start_byte += line.len() + 1;
2709    }
2710
2711    let mut end_byte = start_byte;
2712    for line in lines.iter().take(para_end + 1).skip(para_start) {
2713        end_byte += line.len() + 1;
2714    }
2715
2716    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2717    if !includes_trailing_newline {
2718        end_byte -= 1;
2719    }
2720
2721    let reflowed_joined = styled_lines.join("\n");
2722    let reflowed_text = if includes_trailing_newline {
2723        if reflowed_joined.ends_with('\n') {
2724            reflowed_joined
2725        } else {
2726            format!("{reflowed_joined}\n")
2727        }
2728    } else if reflowed_joined.ends_with('\n') {
2729        reflowed_joined.trim_end_matches('\n').to_string()
2730    } else {
2731        reflowed_joined
2732    };
2733
2734    Some(ParagraphReflow {
2735        start_byte,
2736        end_byte,
2737        reflowed_text,
2738    })
2739}
2740
2741/// Reflow a single paragraph at the specified line number
2742///
2743/// This function finds the paragraph containing the given line number,
2744/// reflows it according to the specified line length, and returns
2745/// information about the paragraph location and its reflowed text.
2746///
2747/// # Arguments
2748///
2749/// * `content` - The full document content
2750/// * `line_number` - The 1-based line number within the paragraph to reflow
2751/// * `line_length` - The target line length for reflowing
2752///
2753/// # Returns
2754///
2755/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
2756/// or `None` if the line number is out of bounds or the content at that
2757/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
2758pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
2759    reflow_paragraph_at_line_with_mode(content, line_number, line_length, ReflowLengthMode::default())
2760}
2761
2762/// Reflow a paragraph at the given line with a specific length mode.
2763pub fn reflow_paragraph_at_line_with_mode(
2764    content: &str,
2765    line_number: usize,
2766    line_length: usize,
2767    length_mode: ReflowLengthMode,
2768) -> Option<ParagraphReflow> {
2769    let options = ReflowOptions {
2770        line_length,
2771        length_mode,
2772        ..Default::default()
2773    };
2774    reflow_paragraph_at_line_with_options(content, line_number, &options)
2775}
2776
2777/// Reflow a paragraph at the given line using the provided options.
2778///
2779/// This is the canonical implementation used by both the rule's fix mode and the
2780/// LSP "Reflow paragraph" action. Passing a fully configured `ReflowOptions` allows
2781/// the LSP action to respect user-configured reflow mode, abbreviations, etc.
2782///
2783/// # Returns
2784///
2785/// Returns `Some(ParagraphReflow)` with byte offsets and reflowed text, or `None`
2786/// if the line is out of bounds or sits inside a non-reflow-able construct.
2787pub fn reflow_paragraph_at_line_with_options(
2788    content: &str,
2789    line_number: usize,
2790    options: &ReflowOptions,
2791) -> Option<ParagraphReflow> {
2792    if line_number == 0 {
2793        return None;
2794    }
2795
2796    let lines: Vec<&str> = content.lines().collect();
2797
2798    // Check if line number is valid (1-based)
2799    if line_number > lines.len() {
2800        return None;
2801    }
2802
2803    let target_idx = line_number - 1; // Convert to 0-based
2804    let target_line = lines[target_idx];
2805    let trimmed = target_line.trim();
2806
2807    // Handle blockquote paragraphs (including lazy continuation lines) with
2808    // style-preserving output.
2809    if let Some(blockquote_reflow) = reflow_blockquote_paragraph_at_line(content, &lines, target_idx, options) {
2810        return Some(blockquote_reflow);
2811    }
2812
2813    // Don't reflow special blocks
2814    if is_paragraph_boundary(trimmed, target_line) {
2815        return None;
2816    }
2817
2818    // Find paragraph start - scan backward until blank line or special block
2819    let mut para_start = target_idx;
2820    while para_start > 0 {
2821        let prev_idx = para_start - 1;
2822        let prev_line = lines[prev_idx];
2823        let prev_trimmed = prev_line.trim();
2824
2825        // Stop at blank line or special blocks
2826        if is_paragraph_boundary(prev_trimmed, prev_line) {
2827            break;
2828        }
2829
2830        para_start = prev_idx;
2831    }
2832
2833    // Find paragraph end - scan forward until blank line or special block
2834    let mut para_end = target_idx;
2835    while para_end + 1 < lines.len() {
2836        let next_idx = para_end + 1;
2837        let next_line = lines[next_idx];
2838        let next_trimmed = next_line.trim();
2839
2840        // Stop at blank line or special blocks
2841        if is_paragraph_boundary(next_trimmed, next_line) {
2842            break;
2843        }
2844
2845        para_end = next_idx;
2846    }
2847
2848    // Extract paragraph lines
2849    let paragraph_lines = &lines[para_start..=para_end];
2850
2851    // Calculate byte offsets
2852    let mut start_byte = 0;
2853    for line in lines.iter().take(para_start) {
2854        start_byte += line.len() + 1; // +1 for newline
2855    }
2856
2857    let mut end_byte = start_byte;
2858    for line in paragraph_lines.iter() {
2859        end_byte += line.len() + 1; // +1 for newline
2860    }
2861
2862    // Track whether the byte range includes a trailing newline
2863    // (it doesn't if this is the last line and the file doesn't end with newline)
2864    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2865
2866    // Adjust end_byte if the last line doesn't have a newline
2867    if !includes_trailing_newline {
2868        end_byte -= 1;
2869    }
2870
2871    // Join paragraph lines and reflow
2872    let paragraph_text = paragraph_lines.join("\n");
2873
2874    // Reflow the paragraph using reflow_markdown to handle it properly
2875    let reflowed = reflow_markdown(&paragraph_text, options);
2876
2877    // Ensure reflowed text matches whether the byte range includes a trailing newline
2878    // This is critical: if the range includes a newline, the replacement must too,
2879    // otherwise the next line will get appended to the reflowed paragraph
2880    let reflowed_text = if includes_trailing_newline {
2881        // Range includes newline - ensure reflowed text has one
2882        if reflowed.ends_with('\n') {
2883            reflowed
2884        } else {
2885            format!("{reflowed}\n")
2886        }
2887    } else {
2888        // Range doesn't include newline - ensure reflowed text doesn't have one
2889        if reflowed.ends_with('\n') {
2890            reflowed.trim_end_matches('\n').to_string()
2891        } else {
2892            reflowed
2893        }
2894    };
2895
2896    Some(ParagraphReflow {
2897        start_byte,
2898        end_byte,
2899        reflowed_text,
2900    })
2901}
2902
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `text_ends_with_abbreviation()` directly against the built-in abbreviation set.
    ///
    /// Kept inline because it targets a helper function rather than the reflow entry points.
    /// All other tests (public API, integration tests) are in tests/utils/text_reflow_test.rs
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        // `None` selects the built-in abbreviation list only (no custom additions).
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognised.
        let recognised = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognised {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Everything below must be rejected.
        let rejected = [
            "etc.",       // not in the built-in list (etc doesn't always have period)
            "paradigms.", // ordinary word followed by a period
            "programs.",
            "items.",
            "systems.",
            "Dr?",        // question mark, not period
            "Mr!",        // exclamation, not period
            "paradigms?", // question mark
            "word",       // no punctuation
            "",           // empty string
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} NOT to end with an abbreviation"
            );
        }
    }

    /// `is_unordered_list_marker()` accepts `-`, `*` and `+` markers (with or without
    /// content) and rejects horizontal rules, emphasis and ordinary text.
    #[test]
    fn test_is_unordered_list_marker() {
        let markers = [
            "- item", "* item", "+ item", // marker followed by content
            "-", "*", "+", // lone marker
        ];
        for line in markers {
            assert!(
                is_unordered_list_marker(line),
                "expected {line:?} to be an unordered list marker"
            );
        }

        let non_markers = [
            "---",        // horizontal rule
            "***",        // horizontal rule
            "- - -",      // horizontal rule
            "* * *",      // horizontal rule
            "*emphasis*", // emphasis, not list
            "-word",      // no space after marker
            "",           // empty
            "text",       // plain text
            "# heading",  // heading
        ];
        for line in non_markers {
            assert!(
                !is_unordered_list_marker(line),
                "expected {line:?} NOT to be an unordered list marker"
            );
        }
    }

    /// `is_block_boundary()` flags every line that starts a new block and lets
    /// plain paragraph continuation lines through.
    #[test]
    fn test_is_block_boundary() {
        let boundaries = [
            "",                           // empty line
            "# Heading",                  // ATX heading
            "## Level 2",                 // ATX heading
            "```rust",                    // code fence
            "~~~",                        // tilde code fence
            "> quote",                    // blockquote
            "| cell |",                   // table
            "[link]: http://example.com", // reference def
            "---",                        // horizontal rule
            "***",                        // horizontal rule
            "- item",                     // unordered list
            "* item",                     // unordered list
            "+ item",                     // unordered list
            "1. item",                    // ordered list
            "10. item",                   // ordered list
            ": definition",               // definition list
            ":::",                        // div marker
            "::::: {.callout-note}",      // div marker with attrs
        ];
        for line in boundaries {
            assert!(is_block_boundary(line), "expected {line:?} to be a block boundary");
        }

        // Paragraph continuation lines are not boundaries.
        let continuations = [
            "regular text",
            "*emphasis*",  // emphasis, not list
            "[link](url)", // inline link, not reference def
            "some words here",
        ];
        for line in continuations {
            assert!(
                !is_block_boundary(line),
                "expected {line:?} NOT to be a block boundary"
            );
        }
    }

    /// A definition list item directly after a single-line paragraph must be
    /// kept as a block boundary instead of being merged into the paragraph.
    #[test]
    fn test_definition_list_boundary_in_single_line_paragraph() {
        let input = "Term\n: Definition of the term";
        let options = ReflowOptions {
            line_length: 80,
            ..Default::default()
        };

        let result = reflow_markdown(input, &options);

        // The definition marker has to survive the reflow...
        assert!(
            result.contains(": Definition"),
            "Definition list item should not be merged into previous line. Got: {result:?}"
        );

        // ...and the output has to be exactly the two original lines.
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(lines.len(), 2, "Should remain two separate lines. Got: {lines:?}");
        assert_eq!(lines, ["Term", ": Definition of the term"]);
    }

    /// `is_paragraph_boundary()` takes the trimmed line and the raw line; it covers
    /// the core block boundaries plus indented code blocks and table rows.
    #[test]
    fn test_is_paragraph_boundary() {
        // (trimmed, raw line) pairs that must end a paragraph.
        let boundaries = [
            // Core block boundary checks are inherited
            ("# Heading", "# Heading"),
            ("- item", "- item"),
            (":::", ":::"),
            (": definition", ": definition"),
            // Indented code blocks (>= 4 spaces or tab)
            ("code", "    code"),
            ("code", "\tcode"),
            // Table rows via is_potential_table_row
            ("| a | b |", "| a | b |"),
            ("a | b", "a | b"), // pipe-delimited without leading pipe
        ];
        for (trimmed, line) in boundaries {
            assert!(
                is_paragraph_boundary(trimmed, line),
                "expected {line:?} to be a paragraph boundary"
            );
        }

        // (trimmed, raw line) pairs that must NOT end a paragraph.
        let continuations = [
            ("regular text", "regular text"),
            ("text", "  text"), // 2-space indent is not code
        ];
        for (trimmed, line) in continuations {
            assert!(
                !is_paragraph_boundary(trimmed, line),
                "expected {line:?} NOT to be a paragraph boundary"
            );
        }
    }

    /// Div markers (`:::`) act as paragraph boundaries in `reflow_paragraph_at_line`,
    /// so asking to reflow the marker line itself yields nothing.
    #[test]
    fn test_div_marker_boundary_in_reflow_paragraph_at_line() {
        let content = "Some paragraph text here.\n\n::: {.callout-note}\nThis is a callout.\n:::\n";

        // Line 3 is the opening div marker and should not be reflowed.
        let div_marker_line = 3;
        let result = reflow_paragraph_at_line(content, div_marker_line, 80);

        assert!(result.is_none(), "Div marker line should not be reflowed");
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs