rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::mkdocs_attr_list::is_standalone_attr_list;
9use crate::utils::mkdocs_snippets::is_snippet_block_delimiter;
10use crate::utils::regex_cache::{
11    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
12    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
13    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
14    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
15};
16use crate::utils::sentence_utils::{
17    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
18    text_ends_with_abbreviation,
19};
20use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
21use std::collections::HashSet;
22use unicode_width::UnicodeWidthStr;
23
/// Length calculation mode for reflow.
///
/// Selects how `display_len` measures a string when it is compared against
/// the configured line length.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ReflowLengthMode {
    /// Count Unicode scalar values (`char`s).
    ///
    /// Note: this is *not* a grapheme-cluster count; a character built from
    /// several code points (e.g. a base letter plus a combining mark) counts
    /// once per code point.
    Chars,
    /// Count visual display width (CJK = 2 columns, emoji = 2, etc.)
    #[default]
    Visual,
    /// Count raw bytes of the UTF-8 encoding
    Bytes,
}
35
36/// Calculate the display length of a string based on the length mode
37fn display_len(s: &str, mode: ReflowLengthMode) -> usize {
38    match mode {
39        ReflowLengthMode::Chars => s.chars().count(),
40        ReflowLengthMode::Visual => s.width(),
41        ReflowLengthMode::Bytes => s.len(),
42    }
43}
44
45/// Options for reflowing text
46#[derive(Clone)]
47pub struct ReflowOptions {
48    /// Target line length
49    pub line_length: usize,
50    /// Whether to break on sentence boundaries when possible
51    pub break_on_sentences: bool,
52    /// Whether to preserve existing line breaks in paragraphs
53    pub preserve_breaks: bool,
54    /// Whether to enforce one sentence per line
55    pub sentence_per_line: bool,
56    /// Whether to use semantic line breaks (cascading split strategy)
57    pub semantic_line_breaks: bool,
58    /// Custom abbreviations for sentence detection
59    /// Periods are optional - both "Dr" and "Dr." work the same
60    /// Custom abbreviations are always added to the built-in defaults
61    pub abbreviations: Option<Vec<String>>,
62    /// How to measure string length for line-length comparisons
63    pub length_mode: ReflowLengthMode,
64}
65
66impl Default for ReflowOptions {
67    fn default() -> Self {
68        Self {
69            line_length: 80,
70            break_on_sentences: true,
71            preserve_breaks: false,
72            sentence_per_line: false,
73            semantic_line_breaks: false,
74            abbreviations: None,
75            length_mode: ReflowLengthMode::default(),
76        }
77    }
78}
79
80/// Detect if a character position is a sentence boundary
81/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
82/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
83fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
84    let chars: Vec<char> = text.chars().collect();
85
86    if pos + 1 >= chars.len() {
87        return false;
88    }
89
90    let c = chars[pos];
91    let next_char = chars[pos + 1];
92
93    // Check for CJK sentence-ending punctuation (。, ！, ？)
94    // CJK punctuation doesn't require space or uppercase after it
95    if is_cjk_sentence_ending(c) {
96        // Skip any trailing emphasis/strikethrough markers
97        let mut after_punct_pos = pos + 1;
98        while after_punct_pos < chars.len()
99            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
100        {
101            after_punct_pos += 1;
102        }
103
104        // Skip whitespace
105        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
106            after_punct_pos += 1;
107        }
108
109        // Check if we have more content (any non-whitespace)
110        if after_punct_pos >= chars.len() {
111            return false;
112        }
113
114        // Skip leading emphasis/strikethrough markers
115        while after_punct_pos < chars.len()
116            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
117        {
118            after_punct_pos += 1;
119        }
120
121        if after_punct_pos >= chars.len() {
122            return false;
123        }
124
125        // For CJK, we accept any character as the start of the next sentence
126        // (no uppercase requirement, since CJK doesn't have case)
127        return true;
128    }
129
130    // Check for ASCII sentence-ending punctuation
131    if c != '.' && c != '!' && c != '?' {
132        return false;
133    }
134
135    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
136    let (_space_pos, after_space_pos) = if next_char == ' ' {
137        // Normal case: punctuation followed by space
138        (pos + 1, pos + 2)
139    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
140        // Sentence ends with quote - check what follows the quote
141        if chars[pos + 2] == ' ' {
142            // Just quote followed by space: 'sentence." '
143            (pos + 2, pos + 3)
144        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
145            // Quote followed by emphasis: 'sentence."* '
146            (pos + 3, pos + 4)
147        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
148            && pos + 4 < chars.len()
149            && chars[pos + 3] == chars[pos + 2]
150            && chars[pos + 4] == ' '
151        {
152            // Quote followed by bold: 'sentence."** '
153            (pos + 4, pos + 5)
154        } else {
155            return false;
156        }
157    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
158        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
159        (pos + 2, pos + 3)
160    } else if (next_char == '*' || next_char == '_')
161        && pos + 3 < chars.len()
162        && chars[pos + 2] == next_char
163        && chars[pos + 3] == ' '
164    {
165        // Sentence ends with bold: "sentence.** " or "sentence.__ "
166        (pos + 3, pos + 4)
167    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
168        // Sentence ends with strikethrough: "sentence.~~ "
169        (pos + 3, pos + 4)
170    } else {
171        return false;
172    };
173
174    // Skip all whitespace after the space to find the start of the next sentence
175    let mut next_char_pos = after_space_pos;
176    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
177        next_char_pos += 1;
178    }
179
180    // Check if we reached the end of the string
181    if next_char_pos >= chars.len() {
182        return false;
183    }
184
185    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
186    let mut first_letter_pos = next_char_pos;
187    while first_letter_pos < chars.len()
188        && (chars[first_letter_pos] == '*'
189            || chars[first_letter_pos] == '_'
190            || chars[first_letter_pos] == '~'
191            || is_opening_quote(chars[first_letter_pos]))
192    {
193        first_letter_pos += 1;
194    }
195
196    // Check if we reached the end after skipping emphasis
197    if first_letter_pos >= chars.len() {
198        return false;
199    }
200
201    // First character of next sentence must be uppercase or CJK
202    let first_char = chars[first_letter_pos];
203    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
204        return false;
205    }
206
207    // Look back to check for common abbreviations (only applies to periods)
208    if pos > 0 && c == '.' {
209        // Convert char index to byte offset for string slicing
210        let byte_offset: usize = chars[..=pos].iter().map(|ch| ch.len_utf8()).sum();
211        if text_ends_with_abbreviation(&text[..byte_offset], abbreviations) {
212            return false;
213        }
214
215        // Check for decimal numbers (e.g., "3.14")
216        // Make sure to check if first_letter_pos is within bounds
217        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
218            return false;
219        }
220    }
221    true
222}
223
224/// Split text into sentences
225pub fn split_into_sentences(text: &str) -> Vec<String> {
226    split_into_sentences_custom(text, &None)
227}
228
229/// Split text into sentences with custom abbreviations
230pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
231    let abbreviations = get_abbreviations(custom_abbreviations);
232    split_into_sentences_with_set(text, &abbreviations)
233}
234
235/// Internal function to split text into sentences with a pre-computed abbreviations set
236/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
237fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
238    let mut sentences = Vec::new();
239    let mut current_sentence = String::new();
240    let mut chars = text.chars().peekable();
241    let mut pos = 0;
242
243    while let Some(c) = chars.next() {
244        current_sentence.push(c);
245
246        if is_sentence_boundary(text, pos, abbreviations) {
247            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
248            while let Some(&next) = chars.peek() {
249                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
250                    current_sentence.push(chars.next().unwrap());
251                    pos += 1;
252                } else {
253                    break;
254                }
255            }
256
257            // Consume the space after the sentence
258            if chars.peek() == Some(&' ') {
259                chars.next();
260                pos += 1;
261            }
262
263            sentences.push(current_sentence.trim().to_string());
264            current_sentence.clear();
265        }
266
267        pos += 1;
268    }
269
270    // Add any remaining text as the last sentence
271    if !current_sentence.trim().is_empty() {
272        sentences.push(current_sentence.trim().to_string());
273    }
274    sentences
275}
276
/// Check if a line is a horizontal rule (---, ___, ***)
///
/// The line must start with `-`, `_` or `*`, consist solely of that one
/// marker character and spaces, and contain the marker at least three times.
fn is_horizontal_rule(line: &str) -> bool {
    // The very first character decides which marker the rule is made of.
    let Some(marker) = line.chars().next().filter(|&c| matches!(c, '-' | '_' | '*')) else {
        return false;
    };

    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            // Anything other than the marker or a space disqualifies the line.
            return false;
        }
    }

    marker_count >= 3
}
305
/// Check if a line is a numbered list item (e.g., "1. ", "10. ")
///
/// The line must start with one or more ASCII digits (`0`-`9`), immediately
/// followed by a period and a space. Only ASCII digits qualify: other
/// Unicode numerics (such as `½`, `²` or non-Latin digits) do not form a
/// Markdown ordered-list marker and are rejected.
///
/// A number followed by a period but no space ("2019.") is NOT treated as a
/// list item, to avoid false positives and to stay consistent with list
/// marker extraction.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are one byte each, so the count is also a valid byte offset.
    let digits_len = line.bytes().take_while(u8::is_ascii_digit).count();
    digits_len > 0 && line[digits_len..].starts_with(". ")
}
329
330/// Check if a trimmed line is an unordered list item (-, *, + followed by space)
331fn is_unordered_list_marker(s: &str) -> bool {
332    matches!(s.as_bytes().first(), Some(b'-' | b'*' | b'+'))
333        && !is_horizontal_rule(s)
334        && (s.len() == 1 || s.as_bytes().get(1) == Some(&b' '))
335}
336
337/// Shared structural checks for block boundary detection.
338/// Checks elements that only depend on the trimmed line content.
339fn is_block_boundary_core(trimmed: &str) -> bool {
340    trimmed.is_empty()
341        || trimmed.starts_with('#')
342        || trimmed.starts_with("```")
343        || trimmed.starts_with("~~~")
344        || trimmed.starts_with('>')
345        || (trimmed.starts_with('[') && trimmed.contains("]:"))
346        || is_horizontal_rule(trimmed)
347        || is_unordered_list_marker(trimmed)
348        || is_numbered_list_item(trimmed)
349        || is_definition_list_item(trimmed)
350        || trimmed.starts_with(":::")
351}
352
353/// Check if a trimmed line starts a new structural block element.
354/// Used for paragraph boundary detection in `reflow_markdown()`.
355fn is_block_boundary(trimmed: &str) -> bool {
356    is_block_boundary_core(trimmed) || trimmed.starts_with('|')
357}
358
359/// Check if a line starts a new structural block for paragraph boundary detection
360/// in `reflow_paragraph_at_line()`. Extends the core checks with indented code blocks
361/// (≥4 spaces) and table row detection via `is_potential_table_row`.
362fn is_paragraph_boundary(trimmed: &str, line: &str) -> bool {
363    is_block_boundary_core(trimmed)
364        || ElementCache::calculate_indentation_width_default(line) >= 4
365        || crate::utils::table_utils::TableUtils::is_potential_table_row(line)
366}
367
/// Check if a line ends with a hard break.
///
/// CommonMark supports two formats for hard line breaks:
/// 1. Two or more trailing spaces
/// 2. A backslash at the end of the line
///
/// A single trailing `\r` (from a CRLF line ending) is ignored first.
fn has_hard_break(line: &str) -> bool {
    let body = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };

    // Both markers are ASCII, so inspecting the trailing bytes is sufficient.
    matches!(body.as_bytes(), [.., b' ', b' '] | [.., b'\\'])
}
377
/// Check if text ends with sentence-terminating punctuation (. ! ?)
///
/// Only the very last character is inspected; trailing whitespace is not skipped.
fn ends_with_sentence_punct(text: &str) -> bool {
    matches!(text.chars().next_back(), Some('.' | '!' | '?'))
}
382
/// Trim trailing whitespace while preserving hard breaks.
///
/// Hard breaks in Markdown can be indicated by:
/// 1. Two trailing spaces before a newline (traditional)
/// 2. A backslash at the end of the line (mdformat style)
///
/// A single trailing `\r` (CRLF line ending) is removed first. A line ending
/// in a backslash is returned as-is; a line ending in two or more spaces is
/// normalised to exactly two trailing spaces; a whitespace-only line becomes
/// the empty string; anything else simply loses its trailing whitespace.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as written.
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();
    let mut trimmed = String::with_capacity(content.len() + 2);
    trimmed.push_str(content);

    // Re-attach exactly two spaces when the original carried a space-style
    // hard break after some real content.
    if !content.is_empty() && line.ends_with("  ") {
        trimmed.push_str("  ");
    }
    trimmed
}
413
414pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
415    // For sentence-per-line mode, always process regardless of length
416    if options.sentence_per_line {
417        let elements = parse_markdown_elements(line);
418        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
419    }
420
421    // For semantic line breaks mode, use cascading split strategy
422    if options.semantic_line_breaks {
423        let elements = parse_markdown_elements(line);
424        return reflow_elements_semantic(&elements, options);
425    }
426
427    // Quick check: if line is already short enough or no wrapping requested, return as-is
428    // line_length = 0 means no wrapping (unlimited line length)
429    if options.line_length == 0 || display_len(line, options.length_mode) <= options.line_length {
430        return vec![line.to_string()];
431    }
432
433    // Parse the markdown to identify elements
434    let elements = parse_markdown_elements(line);
435
436    // Reflow the elements into lines
437    reflow_elements(&elements, options)
438}
439
/// Image source in a linked image structure
///
/// This is the inner `![alt]...` part of `Element::LinkedImage`; the
/// `Display` implementation of `Element` re-adds the surrounding
/// parentheses or brackets around the stored string.
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Inline image URL: ![alt](url) - holds the text between the parentheses
    Inline(String),
    /// Reference image: ![alt][ref] - holds the text between the second pair of brackets
    Reference(String),
}
448
/// Link target in a linked image structure
///
/// This is the outer link part of `Element::LinkedImage`, i.e. what follows
/// the bracketed image; the `Display` implementation of `Element` re-adds
/// the surrounding parentheses or brackets around the stored string.
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Inline link URL: ](url) - holds the text between the parentheses
    Inline(String),
    /// Reference link: ][ref] - holds the text between the brackets
    Reference(String),
}
457
/// Represents a piece of content in the markdown
///
/// Each variant stores only the pieces needed to re-emit the original
/// syntax. The `Display` implementation for this type writes the delimiters
/// shown in each variant's comment around the stored fields, except for the
/// variants noted as "emitted verbatim".
#[derive(Debug, Clone)]
enum Element {
    /// Plain text that can be wrapped (emitted verbatim)
    Text(String),
    /// A complete markdown inline link [text](url)
    Link { text: String, url: String },
    /// A complete markdown reference link [text][ref]
    ReferenceLink { text: String, reference: String },
    /// A complete markdown empty reference link [text][]
    EmptyReferenceLink { text: String },
    /// A complete markdown shortcut reference link [ref]
    ShortcutReference { reference: String },
    /// A complete markdown inline image ![alt](url)
    InlineImage { alt: String, url: String },
    /// A complete markdown reference image ![alt][ref]
    ReferenceImage { alt: String, reference: String },
    /// A complete markdown empty reference image ![alt][]
    EmptyReferenceImage { alt: String },
    /// A clickable image badge in any of 4 forms:
    /// - [![alt](img-url)](link-url)
    /// - [![alt][img-ref]](link-url)
    /// - [![alt](img-url)][link-ref]
    /// - [![alt][img-ref]][link-ref]
    LinkedImage {
        /// Alt text of the inner image
        alt: String,
        /// Where the inner image comes from (inline URL or reference)
        img_source: LinkedImageSource,
        /// Where the outer link points (inline URL or reference)
        link_target: LinkedImageTarget,
    },
    /// Footnote reference [^note] - `note` excludes the `[^` and `]`
    FootnoteReference { note: String },
    /// Strikethrough text ~~text~~ - holds the text without the `~~` markers
    Strikethrough(String),
    /// Wiki-style link [[wiki]] or [[wiki|text]] - holds everything between `[[` and `]]`
    WikiLink(String),
    /// Inline math $math$ - holds the text without the `$` delimiters
    InlineMath(String),
    /// Display math $$math$$ - holds the text without the `$$` delimiters
    DisplayMath(String),
    /// Emoji shortcode :emoji: - holds the name without the colons
    EmojiShortcode(String),
    /// Autolink <https://...> or <mailto:...> or <user@domain.com>
    /// (emitted verbatim, so presumably stored including the angle brackets - confirm against the parser)
    Autolink(String),
    /// HTML tag <tag> or </tag> or <tag/> (emitted verbatim)
    HtmlTag(String),
    /// HTML entity &nbsp; or &#123; (emitted verbatim)
    HtmlEntity(String),
    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}} (emitted verbatim)
    HugoShortcode(String),
    /// Inline code `code` - holds the text without the surrounding backticks
    Code(String),
    /// Bold text **text** or __text__
    Bold {
        /// Text between the markers
        content: String,
        /// True if underscore markers (__), false for asterisks (**)
        underscore: bool,
    },
    /// Italic text *text* or _text_
    Italic {
        /// Text between the markers
        content: String,
        /// True if underscore marker (_), false for asterisk (*)
        underscore: bool,
    },
}
522
523impl std::fmt::Display for Element {
524    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
525        match self {
526            Element::Text(s) => write!(f, "{s}"),
527            Element::Link { text, url } => write!(f, "[{text}]({url})"),
528            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
529            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
530            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
531            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
532            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
533            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
534            Element::LinkedImage {
535                alt,
536                img_source,
537                link_target,
538            } => {
539                // Build the image part: ![alt](url) or ![alt][ref]
540                let img_part = match img_source {
541                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
542                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
543                };
544                // Build the link part: (url) or [ref]
545                match link_target {
546                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
547                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
548                }
549            }
550            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
551            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
552            Element::WikiLink(s) => write!(f, "[[{s}]]"),
553            Element::InlineMath(s) => write!(f, "${s}$"),
554            Element::DisplayMath(s) => write!(f, "$${s}$$"),
555            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
556            Element::Autolink(s) => write!(f, "{s}"),
557            Element::HtmlTag(s) => write!(f, "{s}"),
558            Element::HtmlEntity(s) => write!(f, "{s}"),
559            Element::HugoShortcode(s) => write!(f, "{s}"),
560            Element::Code(s) => write!(f, "`{s}`"),
561            Element::Bold { content, underscore } => {
562                if *underscore {
563                    write!(f, "__{content}__")
564                } else {
565                    write!(f, "**{content}**")
566                }
567            }
568            Element::Italic { content, underscore } => {
569                if *underscore {
570                    write!(f, "_{content}_")
571                } else {
572                    write!(f, "*{content}*")
573                }
574            }
575        }
576    }
577}
578
579impl Element {
580    /// Calculate the display width of this element using the given length mode.
581    /// This formats the element and computes its width, correctly handling
582    /// visual width for CJK characters and other wide glyphs.
583    fn display_width(&self, mode: ReflowLengthMode) -> usize {
584        let formatted = format!("{self}");
585        display_len(&formatted, mode)
586    }
587}
588
/// An emphasis or formatting span parsed by pulldown-cmark
///
/// Produced by `extract_emphasis_spans`. `start`/`end` are byte offsets into
/// the text that was parsed and cover the markers; `content` is the slice
/// strictly between the opening and closing markers.
#[derive(Debug, Clone)]
struct EmphasisSpan {
    /// Byte offset where the emphasis starts (including markers)
    start: usize,
    /// Byte offset where the emphasis ends (after closing markers)
    end: usize,
    /// The content inside the emphasis markers
    content: String,
    /// Whether this is strong (bold) emphasis
    is_strong: bool,
    /// Whether this is strikethrough (~~text~~)
    is_strikethrough: bool,
    /// Whether the original used underscore markers (`_` / `__`).
    /// Recorded for emphasis and strong spans; always `false` for strikethrough.
    uses_underscore: bool,
}
605
606/// Extract emphasis and strikethrough spans from text using pulldown-cmark
607///
608/// This provides CommonMark-compliant emphasis parsing, correctly handling:
609/// - Nested emphasis like `*text **bold** more*`
610/// - Left/right flanking delimiter rules
611/// - Underscore vs asterisk markers
612/// - GFM strikethrough (~~text~~)
613///
614/// Returns spans sorted by start position.
615fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
616    let mut spans = Vec::new();
617    let mut options = Options::empty();
618    options.insert(Options::ENABLE_STRIKETHROUGH);
619
620    // Stacks to track nested formatting with their start positions
621    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
622    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
623    let mut strikethrough_stack: Vec<usize> = Vec::new();
624
625    let parser = Parser::new_ext(text, options).into_offset_iter();
626
627    for (event, range) in parser {
628        match event {
629            Event::Start(Tag::Emphasis) => {
630                // Check if this uses underscore by looking at the original text
631                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
632                emphasis_stack.push((range.start, uses_underscore));
633            }
634            Event::End(TagEnd::Emphasis) => {
635                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
636                    // Extract content between the markers (1 char marker on each side)
637                    let content_start = start_byte + 1;
638                    let content_end = range.end - 1;
639                    if content_end > content_start
640                        && let Some(content) = text.get(content_start..content_end)
641                    {
642                        spans.push(EmphasisSpan {
643                            start: start_byte,
644                            end: range.end,
645                            content: content.to_string(),
646                            is_strong: false,
647                            is_strikethrough: false,
648                            uses_underscore,
649                        });
650                    }
651                }
652            }
653            Event::Start(Tag::Strong) => {
654                // Check if this uses underscore by looking at the original text
655                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
656                strong_stack.push((range.start, uses_underscore));
657            }
658            Event::End(TagEnd::Strong) => {
659                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
660                    // Extract content between the markers (2 char marker on each side)
661                    let content_start = start_byte + 2;
662                    let content_end = range.end - 2;
663                    if content_end > content_start
664                        && let Some(content) = text.get(content_start..content_end)
665                    {
666                        spans.push(EmphasisSpan {
667                            start: start_byte,
668                            end: range.end,
669                            content: content.to_string(),
670                            is_strong: true,
671                            is_strikethrough: false,
672                            uses_underscore,
673                        });
674                    }
675                }
676            }
677            Event::Start(Tag::Strikethrough) => {
678                strikethrough_stack.push(range.start);
679            }
680            Event::End(TagEnd::Strikethrough) => {
681                if let Some(start_byte) = strikethrough_stack.pop() {
682                    // Extract content between the ~~ markers (2 char marker on each side)
683                    let content_start = start_byte + 2;
684                    let content_end = range.end - 2;
685                    if content_end > content_start
686                        && let Some(content) = text.get(content_start..content_end)
687                    {
688                        spans.push(EmphasisSpan {
689                            start: start_byte,
690                            end: range.end,
691                            content: content.to_string(),
692                            is_strong: false,
693                            is_strikethrough: true,
694                            uses_underscore: false,
695                        });
696                    }
697                }
698            }
699            _ => {}
700        }
701    }
702
703    // Sort by start position
704    spans.sort_by_key(|s| s.start);
705    spans
706}
707
708/// Parse markdown elements from text preserving the raw syntax
709///
710/// Detection order is critical:
711/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
712/// 2. Inline images ![alt](url) - before links to handle ! prefix
713/// 3. Reference images ![alt][ref] - before reference links
714/// 4. Inline links [text](url) - before reference links
715/// 5. Reference links [text][ref] - before shortcut references
716/// 6. Shortcut reference links [ref] - detected last to avoid false positives
717/// 7. Other elements (code, bold, italic, etc.) - processed normally
718fn parse_markdown_elements(text: &str) -> Vec<Element> {
719    let mut elements = Vec::new();
720    let mut remaining = text;
721
722    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
723    let emphasis_spans = extract_emphasis_spans(text);
724
725    while !remaining.is_empty() {
726        // Calculate current byte offset in original text
727        let current_offset = text.len() - remaining.len();
728        // Find the earliest occurrence of any markdown pattern
729        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
730
731        // Check for linked images FIRST (all 4 variants)
732        // Quick literal check: only run expensive regexes if we might have a linked image
733        // Pattern starts with "[!" so check for that first
734        if remaining.contains("[!") {
735            // Pattern 1: [![alt](img)](link) - inline image in inline link
736            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
737                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
738            {
739                earliest_match = Some((m.start(), "linked_image_ii", m));
740            }
741
742            // Pattern 2: [![alt][ref]](link) - reference image in inline link
743            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
744                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
745            {
746                earliest_match = Some((m.start(), "linked_image_ri", m));
747            }
748
749            // Pattern 3: [![alt](img)][ref] - inline image in reference link
750            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
751                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
752            {
753                earliest_match = Some((m.start(), "linked_image_ir", m));
754            }
755
756            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
757            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
758                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
759            {
760                earliest_match = Some((m.start(), "linked_image_rr", m));
761            }
762        }
763
764        // Check for images (they start with ! so should be detected before links)
765        // Inline images - ![alt](url)
766        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
767            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
768        {
769            earliest_match = Some((m.start(), "inline_image", m));
770        }
771
772        // Reference images - ![alt][ref]
773        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
774            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
775        {
776            earliest_match = Some((m.start(), "ref_image", m));
777        }
778
779        // Check for footnote references - [^note]
780        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
781            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
782        {
783            earliest_match = Some((m.start(), "footnote_ref", m));
784        }
785
786        // Check for inline links - [text](url)
787        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
788            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
789        {
790            earliest_match = Some((m.start(), "inline_link", m));
791        }
792
793        // Check for reference links - [text][ref]
794        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
795            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
796        {
797            earliest_match = Some((m.start(), "ref_link", m));
798        }
799
800        // Check for shortcut reference links - [ref]
801        // Only check if we haven't found an earlier pattern that would conflict
802        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
803            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
804        {
805            earliest_match = Some((m.start(), "shortcut_ref", m));
806        }
807
808        // Check for wiki-style links - [[wiki]]
809        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
810            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
811        {
812            earliest_match = Some((m.start(), "wiki_link", m));
813        }
814
815        // Check for display math first (before inline) - $$math$$
816        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
817            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
818        {
819            earliest_match = Some((m.start(), "display_math", m));
820        }
821
822        // Check for inline math - $math$
823        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
824            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
825        {
826            earliest_match = Some((m.start(), "inline_math", m));
827        }
828
829        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
830
831        // Check for emoji shortcodes - :emoji:
832        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
833            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
834        {
835            earliest_match = Some((m.start(), "emoji", m));
836        }
837
838        // Check for HTML entities - &nbsp; etc
839        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
840            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
841        {
842            earliest_match = Some((m.start(), "html_entity", m));
843        }
844
845        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
846        // Must be checked before other patterns to avoid false sentence breaks
847        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
848            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
849        {
850            earliest_match = Some((m.start(), "hugo_shortcode", m));
851        }
852
853        // Check for HTML tags - <tag> </tag> <tag/>
854        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
855        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
856            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
857        {
858            // Check if this is an autolink (starts with protocol or mailto:)
859            let matched_text = &remaining[m.start()..m.end()];
860            let is_url_autolink = matched_text.starts_with("<http://")
861                || matched_text.starts_with("<https://")
862                || matched_text.starts_with("<mailto:")
863                || matched_text.starts_with("<ftp://")
864                || matched_text.starts_with("<ftps://");
865
866            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
867            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
868            let is_email_autolink = {
869                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
870                EMAIL_PATTERN.is_match(content)
871            };
872
873            if is_url_autolink || is_email_autolink {
874                earliest_match = Some((m.start(), "autolink", m));
875            } else {
876                earliest_match = Some((m.start(), "html_tag", m));
877            }
878        }
879
880        // Find earliest non-link special characters
881        let mut next_special = remaining.len();
882        let mut special_type = "";
883        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
884
885        // Check for code spans (not handled by pulldown-cmark in this context)
886        if let Some(pos) = remaining.find('`')
887            && pos < next_special
888        {
889            next_special = pos;
890            special_type = "code";
891        }
892
893        // Check for emphasis using pulldown-cmark's pre-extracted spans
894        // Find the earliest emphasis span that starts within remaining text
895        for span in &emphasis_spans {
896            if span.start >= current_offset && span.start < current_offset + remaining.len() {
897                let pos_in_remaining = span.start - current_offset;
898                if pos_in_remaining < next_special {
899                    next_special = pos_in_remaining;
900                    special_type = "pulldown_emphasis";
901                    pulldown_emphasis = Some(span);
902                }
903                break; // Spans are sorted by start position, so first match is earliest
904            }
905        }
906
907        // Determine which pattern to process first
908        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
909            pos < next_special
910        } else {
911            false
912        };
913
914        if should_process_markdown_link {
915            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
916
917            // Add any text before the match
918            if pos > 0 {
919                elements.push(Element::Text(remaining[..pos].to_string()));
920            }
921
922            // Process the matched pattern
923            match pattern_type {
924                // Pattern 1: [![alt](img)](link) - inline image in inline link
925                "linked_image_ii" => {
926                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
927                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
928                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
929                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
930                        elements.push(Element::LinkedImage {
931                            alt: alt.to_string(),
932                            img_source: LinkedImageSource::Inline(img_url.to_string()),
933                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
934                        });
935                        remaining = &remaining[match_obj.end()..];
936                    } else {
937                        elements.push(Element::Text("[".to_string()));
938                        remaining = &remaining[1..];
939                    }
940                }
941                // Pattern 2: [![alt][ref]](link) - reference image in inline link
942                "linked_image_ri" => {
943                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
944                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
945                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
946                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
947                        elements.push(Element::LinkedImage {
948                            alt: alt.to_string(),
949                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
950                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
951                        });
952                        remaining = &remaining[match_obj.end()..];
953                    } else {
954                        elements.push(Element::Text("[".to_string()));
955                        remaining = &remaining[1..];
956                    }
957                }
958                // Pattern 3: [![alt](img)][ref] - inline image in reference link
959                "linked_image_ir" => {
960                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
961                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
962                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
963                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
964                        elements.push(Element::LinkedImage {
965                            alt: alt.to_string(),
966                            img_source: LinkedImageSource::Inline(img_url.to_string()),
967                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
968                        });
969                        remaining = &remaining[match_obj.end()..];
970                    } else {
971                        elements.push(Element::Text("[".to_string()));
972                        remaining = &remaining[1..];
973                    }
974                }
975                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
976                "linked_image_rr" => {
977                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
978                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
979                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
980                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
981                        elements.push(Element::LinkedImage {
982                            alt: alt.to_string(),
983                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
984                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
985                        });
986                        remaining = &remaining[match_obj.end()..];
987                    } else {
988                        elements.push(Element::Text("[".to_string()));
989                        remaining = &remaining[1..];
990                    }
991                }
992                "inline_image" => {
993                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
994                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
995                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
996                        elements.push(Element::InlineImage {
997                            alt: alt.to_string(),
998                            url: url.to_string(),
999                        });
1000                        remaining = &remaining[match_obj.end()..];
1001                    } else {
1002                        elements.push(Element::Text("!".to_string()));
1003                        remaining = &remaining[1..];
1004                    }
1005                }
1006                "ref_image" => {
1007                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
1008                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1009                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1010
1011                        if reference.is_empty() {
1012                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
1013                        } else {
1014                            elements.push(Element::ReferenceImage {
1015                                alt: alt.to_string(),
1016                                reference: reference.to_string(),
1017                            });
1018                        }
1019                        remaining = &remaining[match_obj.end()..];
1020                    } else {
1021                        elements.push(Element::Text("!".to_string()));
1022                        remaining = &remaining[1..];
1023                    }
1024                }
1025                "footnote_ref" => {
1026                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
1027                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1028                        elements.push(Element::FootnoteReference { note: note.to_string() });
1029                        remaining = &remaining[match_obj.end()..];
1030                    } else {
1031                        elements.push(Element::Text("[".to_string()));
1032                        remaining = &remaining[1..];
1033                    }
1034                }
1035                "inline_link" => {
1036                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
1037                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1038                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1039                        elements.push(Element::Link {
1040                            text: text.to_string(),
1041                            url: url.to_string(),
1042                        });
1043                        remaining = &remaining[match_obj.end()..];
1044                    } else {
1045                        // Fallback - shouldn't happen
1046                        elements.push(Element::Text("[".to_string()));
1047                        remaining = &remaining[1..];
1048                    }
1049                }
1050                "ref_link" => {
1051                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1052                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1053                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1054
1055                        if reference.is_empty() {
1056                            // Empty reference link [text][]
1057                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1058                        } else {
1059                            // Regular reference link [text][ref]
1060                            elements.push(Element::ReferenceLink {
1061                                text: text.to_string(),
1062                                reference: reference.to_string(),
1063                            });
1064                        }
1065                        remaining = &remaining[match_obj.end()..];
1066                    } else {
1067                        // Fallback - shouldn't happen
1068                        elements.push(Element::Text("[".to_string()));
1069                        remaining = &remaining[1..];
1070                    }
1071                }
1072                "shortcut_ref" => {
1073                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1074                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1075                        elements.push(Element::ShortcutReference {
1076                            reference: reference.to_string(),
1077                        });
1078                        remaining = &remaining[match_obj.end()..];
1079                    } else {
1080                        // Fallback - shouldn't happen
1081                        elements.push(Element::Text("[".to_string()));
1082                        remaining = &remaining[1..];
1083                    }
1084                }
1085                "wiki_link" => {
1086                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1087                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1088                        elements.push(Element::WikiLink(content.to_string()));
1089                        remaining = &remaining[match_obj.end()..];
1090                    } else {
1091                        elements.push(Element::Text("[[".to_string()));
1092                        remaining = &remaining[2..];
1093                    }
1094                }
1095                "display_math" => {
1096                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1097                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1098                        elements.push(Element::DisplayMath(math.to_string()));
1099                        remaining = &remaining[match_obj.end()..];
1100                    } else {
1101                        elements.push(Element::Text("$$".to_string()));
1102                        remaining = &remaining[2..];
1103                    }
1104                }
1105                "inline_math" => {
1106                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1107                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1108                        elements.push(Element::InlineMath(math.to_string()));
1109                        remaining = &remaining[match_obj.end()..];
1110                    } else {
1111                        elements.push(Element::Text("$".to_string()));
1112                        remaining = &remaining[1..];
1113                    }
1114                }
1115                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1116                "emoji" => {
1117                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1118                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1119                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1120                        remaining = &remaining[match_obj.end()..];
1121                    } else {
1122                        elements.push(Element::Text(":".to_string()));
1123                        remaining = &remaining[1..];
1124                    }
1125                }
1126                "html_entity" => {
1127                    // HTML entities are captured whole - use as_str() to get just the matched content
1128                    elements.push(Element::HtmlEntity(match_obj.as_str().to_string()));
1129                    remaining = &remaining[match_obj.end()..];
1130                }
1131                "hugo_shortcode" => {
1132                    // Hugo shortcodes are atomic elements - preserve them exactly
1133                    elements.push(Element::HugoShortcode(match_obj.as_str().to_string()));
1134                    remaining = &remaining[match_obj.end()..];
1135                }
1136                "autolink" => {
1137                    // Autolinks are atomic elements - preserve them exactly
1138                    elements.push(Element::Autolink(match_obj.as_str().to_string()));
1139                    remaining = &remaining[match_obj.end()..];
1140                }
1141                "html_tag" => {
1142                    // HTML tags are captured whole - use as_str() to get just the matched content
1143                    elements.push(Element::HtmlTag(match_obj.as_str().to_string()));
1144                    remaining = &remaining[match_obj.end()..];
1145                }
1146                _ => {
1147                    // Unknown pattern, treat as text
1148                    elements.push(Element::Text("[".to_string()));
1149                    remaining = &remaining[1..];
1150                }
1151            }
1152        } else {
1153            // Process non-link special characters
1154
1155            // Add any text before the special character
1156            if next_special > 0 && next_special < remaining.len() {
1157                elements.push(Element::Text(remaining[..next_special].to_string()));
1158                remaining = &remaining[next_special..];
1159            }
1160
1161            // Process the special element
1162            match special_type {
1163                "code" => {
1164                    // Find end of code
1165                    if let Some(code_end) = remaining[1..].find('`') {
1166                        let code = &remaining[1..1 + code_end];
1167                        elements.push(Element::Code(code.to_string()));
1168                        remaining = &remaining[1 + code_end + 1..];
1169                    } else {
1170                        // No closing backtick, treat as text
1171                        elements.push(Element::Text(remaining.to_string()));
1172                        break;
1173                    }
1174                }
1175                "pulldown_emphasis" => {
1176                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1177                    if let Some(span) = pulldown_emphasis {
1178                        let span_len = span.end - span.start;
1179                        if span.is_strikethrough {
1180                            elements.push(Element::Strikethrough(span.content.clone()));
1181                        } else if span.is_strong {
1182                            elements.push(Element::Bold {
1183                                content: span.content.clone(),
1184                                underscore: span.uses_underscore,
1185                            });
1186                        } else {
1187                            elements.push(Element::Italic {
1188                                content: span.content.clone(),
1189                                underscore: span.uses_underscore,
1190                            });
1191                        }
1192                        remaining = &remaining[span_len..];
1193                    } else {
1194                        // Fallback - shouldn't happen
1195                        elements.push(Element::Text(remaining[..1].to_string()));
1196                        remaining = &remaining[1..];
1197                    }
1198                }
1199                _ => {
1200                    // No special elements found, add all remaining text
1201                    elements.push(Element::Text(remaining.to_string()));
1202                    break;
1203                }
1204            }
1205        }
1206    }
1207
1208    elements
1209}
1210
1211/// Reflow elements for sentence-per-line mode
1212fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1213    let abbreviations = get_abbreviations(custom_abbreviations);
1214    let mut lines = Vec::new();
1215    let mut current_line = String::new();
1216
1217    for (idx, element) in elements.iter().enumerate() {
1218        let element_str = format!("{element}");
1219
1220        // For text elements, split into sentences
1221        if let Element::Text(text) = element {
1222            // Simply append text - it already has correct spacing from tokenization
1223            let combined = format!("{current_line}{text}");
1224            // Use the pre-computed abbreviations set to avoid redundant computation
1225            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1226
1227            if sentences.len() > 1 {
1228                // We found sentence boundaries
1229                for (i, sentence) in sentences.iter().enumerate() {
1230                    if i == 0 {
1231                        // First sentence might continue from previous elements
1232                        // But check if it ends with an abbreviation
1233                        let trimmed = sentence.trim();
1234
1235                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1236                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1237                            current_line = sentence.to_string();
1238                        } else {
1239                            // Normal case - emit the first sentence
1240                            lines.push(sentence.to_string());
1241                            current_line.clear();
1242                        }
1243                    } else if i == sentences.len() - 1 {
1244                        // Last sentence: check if it's complete or incomplete
1245                        let trimmed = sentence.trim();
1246                        let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1247
1248                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1249                            // Complete sentence - emit it immediately
1250                            lines.push(sentence.to_string());
1251                            current_line.clear();
1252                        } else {
1253                            // Incomplete sentence - save for next iteration
1254                            current_line = sentence.to_string();
1255                        }
1256                    } else {
1257                        // Complete sentences in the middle
1258                        lines.push(sentence.to_string());
1259                    }
1260                }
1261            } else {
1262                // Single sentence - check if it's complete
1263                let trimmed = combined.trim();
1264
1265                // If the combined result is only whitespace, don't accumulate it.
1266                // This prevents leading spaces on subsequent elements when lines
1267                // are joined with spaces during reflow iteration.
1268                if trimmed.is_empty() {
1269                    continue;
1270                }
1271
1272                let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1273
1274                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1275                    // Complete single sentence - emit it
1276                    lines.push(trimmed.to_string());
1277                    current_line.clear();
1278                } else {
1279                    // Incomplete sentence - continue accumulating
1280                    current_line = combined;
1281                }
1282            }
1283        } else if let Element::Italic { content, underscore } = element {
1284            // Handle italic elements - may contain multiple sentences that need continuation
1285            let marker = if *underscore { "_" } else { "*" };
1286            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1287        } else if let Element::Bold { content, underscore } = element {
1288            // Handle bold elements - may contain multiple sentences that need continuation
1289            let marker = if *underscore { "__" } else { "**" };
1290            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1291        } else if let Element::Strikethrough(content) = element {
1292            // Handle strikethrough elements - may contain multiple sentences that need continuation
1293            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1294        } else {
1295            // Non-text, non-emphasis elements (Code, Links, etc.)
1296            // Check if this element is adjacent to the preceding text (no space between)
1297            let is_adjacent = if idx > 0 {
1298                match &elements[idx - 1] {
1299                    Element::Text(t) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1300                    _ => true,
1301                }
1302            } else {
1303                false
1304            };
1305
1306            // Add space before element if needed, but not for adjacent elements
1307            if !is_adjacent
1308                && !current_line.is_empty()
1309                && !current_line.ends_with(' ')
1310                && !current_line.ends_with('(')
1311                && !current_line.ends_with('[')
1312            {
1313                current_line.push(' ');
1314            }
1315            current_line.push_str(&element_str);
1316        }
1317    }
1318
1319    // Add any remaining content
1320    if !current_line.is_empty() {
1321        lines.push(current_line.trim().to_string());
1322    }
1323    lines
1324}
1325
1326/// Handle splitting emphasis content at sentence boundaries while preserving markers
1327fn handle_emphasis_sentence_split(
1328    content: &str,
1329    marker: &str,
1330    abbreviations: &HashSet<String>,
1331    current_line: &mut String,
1332    lines: &mut Vec<String>,
1333) {
1334    // Split the emphasis content into sentences
1335    let sentences = split_into_sentences_with_set(content, abbreviations);
1336
1337    if sentences.len() <= 1 {
1338        // Single sentence or no boundaries - treat as atomic
1339        if !current_line.is_empty()
1340            && !current_line.ends_with(' ')
1341            && !current_line.ends_with('(')
1342            && !current_line.ends_with('[')
1343        {
1344            current_line.push(' ');
1345        }
1346        current_line.push_str(marker);
1347        current_line.push_str(content);
1348        current_line.push_str(marker);
1349
1350        // Check if the emphasis content ends with sentence punctuation - if so, emit
1351        let trimmed = content.trim();
1352        let ends_with_punct = ends_with_sentence_punct(trimmed);
1353        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1354            lines.push(current_line.clone());
1355            current_line.clear();
1356        }
1357    } else {
1358        // Multiple sentences - each gets its own emphasis markers
1359        for (i, sentence) in sentences.iter().enumerate() {
1360            let trimmed = sentence.trim();
1361            if trimmed.is_empty() {
1362                continue;
1363            }
1364
1365            if i == 0 {
1366                // First sentence: combine with current_line and emit
1367                if !current_line.is_empty()
1368                    && !current_line.ends_with(' ')
1369                    && !current_line.ends_with('(')
1370                    && !current_line.ends_with('[')
1371                {
1372                    current_line.push(' ');
1373                }
1374                current_line.push_str(marker);
1375                current_line.push_str(trimmed);
1376                current_line.push_str(marker);
1377
1378                // Check if this is a complete sentence
1379                let ends_with_punct = ends_with_sentence_punct(trimmed);
1380                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1381                    lines.push(current_line.clone());
1382                    current_line.clear();
1383                }
1384            } else if i == sentences.len() - 1 {
1385                // Last sentence: check if complete
1386                let ends_with_punct = ends_with_sentence_punct(trimmed);
1387
1388                let mut line = String::new();
1389                line.push_str(marker);
1390                line.push_str(trimmed);
1391                line.push_str(marker);
1392
1393                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1394                    lines.push(line);
1395                } else {
1396                    // Incomplete - keep in current_line for potential continuation
1397                    *current_line = line;
1398                }
1399            } else {
1400                // Middle sentences: emit with markers
1401                let mut line = String::new();
1402                line.push_str(marker);
1403                line.push_str(trimmed);
1404                line.push_str(marker);
1405                lines.push(line);
1406            }
1407        }
1408    }
1409}
1410
/// English break-words used for semantic line break splitting.
/// These are conjunctions and relative pronouns where a line break
/// reads naturally.
///
/// Every entry is lowercase ASCII. `split_at_break_word` searches for them in
/// a lowercased copy of the line and only accepts a hit that is delimited by
/// spaces, so e.g. "for" does not match inside "before".
const BREAK_WORDS: &[&str] = &[
    "and",
    "or",
    "but",
    "nor",
    "yet",
    "so",
    "for",
    "which",
    "that",
    "because",
    "when",
    "if",
    "while",
    "where",
    "although",
    "though",
    "unless",
    "since",
    "after",
    "before",
    "until",
    "as",
    "once",
    "whether",
    "however",
    "therefore",
    "moreover",
    "furthermore",
    "nevertheless",
    "whereas",
];
1446
/// Report whether `c` is one of the clause-level punctuation marks that
/// semantic line breaking may split after: comma, semicolon, colon, or
/// em dash (U+2014).
fn is_clause_punctuation(c: char) -> bool {
    const CLAUSE_MARKS: [char; 4] = [',', ';', ':', '\u{2014}'];
    CLAUSE_MARKS.contains(&c)
}
1451
1452/// Compute element spans for a flat text representation of elements.
1453/// Returns Vec of (start, end) byte offsets for non-Text elements,
1454/// so we can check that a split position doesn't fall inside them.
1455fn compute_element_spans(elements: &[Element]) -> Vec<(usize, usize)> {
1456    let mut spans = Vec::new();
1457    let mut offset = 0;
1458    for element in elements {
1459        let rendered = format!("{element}");
1460        let len = rendered.len();
1461        if !matches!(element, Element::Text(_)) {
1462            spans.push((offset, offset + len));
1463        }
1464        offset += len;
1465    }
1466    spans
1467}
1468
/// Report whether byte position `pos` lies strictly inside one of `spans`.
///
/// Both ends are exclusive: a position equal to a span's start or end sits on
/// the element's edge and is not considered inside it.
fn is_inside_element(pos: usize, spans: &[(usize, usize)]) -> bool {
    for &(start, end) in spans {
        if start < pos && pos < end {
            return true;
        }
    }
    false
}
1473
/// Minimum fraction of line_length that the first part of a split must occupy.
/// Prevents awkwardly short first lines like "A," or "Note:" on their own.
///
/// `reflow_elements_semantic` reuses the same ratio as the threshold below
/// which a line is considered short enough to merge back into the previous one.
const MIN_SPLIT_RATIO: f64 = 0.3;
1477
1478/// Split a line at the latest clause punctuation that keeps the first part
1479/// within `line_length`. Returns None if no valid split point exists or if
1480/// the split would create an unreasonably short first line.
1481fn split_at_clause_punctuation(
1482    text: &str,
1483    line_length: usize,
1484    element_spans: &[(usize, usize)],
1485    length_mode: ReflowLengthMode,
1486) -> Option<(String, String)> {
1487    let chars: Vec<char> = text.chars().collect();
1488    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1489
1490    // Find the char index where accumulated display width exceeds line_length
1491    let mut width_acc = 0;
1492    let mut search_end_char = 0;
1493    for (idx, &c) in chars.iter().enumerate() {
1494        let c_width = display_len(&c.to_string(), length_mode);
1495        if width_acc + c_width > line_length {
1496            break;
1497        }
1498        width_acc += c_width;
1499        search_end_char = idx + 1;
1500    }
1501
1502    let mut best_pos = None;
1503    for i in (0..search_end_char).rev() {
1504        if is_clause_punctuation(chars[i]) {
1505            // Convert char position to byte position for element span check
1506            let byte_pos: usize = chars[..=i].iter().map(|c| c.len_utf8()).sum();
1507            if !is_inside_element(byte_pos, element_spans) {
1508                best_pos = Some(i);
1509                break;
1510            }
1511        }
1512    }
1513
1514    let pos = best_pos?;
1515
1516    // Reject splits that create very short first lines
1517    let first: String = chars[..=pos].iter().collect();
1518    let first_display_len = display_len(&first, length_mode);
1519    if first_display_len < min_first_len {
1520        return None;
1521    }
1522
1523    // Split after the punctuation character
1524    let rest: String = chars[pos + 1..].iter().collect();
1525    let rest = rest.trim_start().to_string();
1526
1527    if rest.is_empty() {
1528        return None;
1529    }
1530
1531    Some((first, rest))
1532}
1533
1534/// Split a line before the latest break-word that keeps the first part
1535/// within `line_length`. Returns None if no valid split point exists or if
1536/// the split would create an unreasonably short first line.
1537fn split_at_break_word(
1538    text: &str,
1539    line_length: usize,
1540    element_spans: &[(usize, usize)],
1541    length_mode: ReflowLengthMode,
1542) -> Option<(String, String)> {
1543    let lower = text.to_lowercase();
1544    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1545    let mut best_split: Option<(usize, usize)> = None; // (byte_start, word_len_bytes)
1546
1547    for &word in BREAK_WORDS {
1548        let mut search_start = 0;
1549        while let Some(pos) = lower[search_start..].find(word) {
1550            let abs_pos = search_start + pos;
1551
1552            // Verify it's a word boundary: preceded by space, followed by space
1553            let preceded_by_space = abs_pos == 0 || text.as_bytes().get(abs_pos - 1) == Some(&b' ');
1554            let followed_by_space = text.as_bytes().get(abs_pos + word.len()) == Some(&b' ');
1555
1556            if preceded_by_space && followed_by_space {
1557                // The break goes BEFORE the word, so first part ends at abs_pos - 1
1558                let first_part = text[..abs_pos].trim_end();
1559                let first_part_len = display_len(first_part, length_mode);
1560
1561                if first_part_len >= min_first_len
1562                    && first_part_len <= line_length
1563                    && !is_inside_element(abs_pos, element_spans)
1564                {
1565                    // Prefer the latest valid split point
1566                    if best_split.is_none_or(|(prev_pos, _)| abs_pos > prev_pos) {
1567                        best_split = Some((abs_pos, word.len()));
1568                    }
1569                }
1570            }
1571
1572            search_start = abs_pos + word.len();
1573        }
1574    }
1575
1576    let (byte_start, _word_len) = best_split?;
1577
1578    let first = text[..byte_start].trim_end().to_string();
1579    let rest = text[byte_start..].to_string();
1580
1581    if first.is_empty() || rest.trim().is_empty() {
1582        return None;
1583    }
1584
1585    Some((first, rest))
1586}
1587
1588/// Recursively cascade-split a line that exceeds line_length.
1589/// Tries clause punctuation first, then break-words, then word wrap.
1590fn cascade_split_line(
1591    text: &str,
1592    line_length: usize,
1593    abbreviations: &Option<Vec<String>>,
1594    length_mode: ReflowLengthMode,
1595) -> Vec<String> {
1596    if line_length == 0 || display_len(text, length_mode) <= line_length {
1597        return vec![text.to_string()];
1598    }
1599
1600    let elements = parse_markdown_elements(text);
1601    let element_spans = compute_element_spans(&elements);
1602
1603    // Try clause punctuation split
1604    if let Some((first, rest)) = split_at_clause_punctuation(text, line_length, &element_spans, length_mode) {
1605        let mut result = vec![first];
1606        result.extend(cascade_split_line(&rest, line_length, abbreviations, length_mode));
1607        return result;
1608    }
1609
1610    // Try break-word split
1611    if let Some((first, rest)) = split_at_break_word(text, line_length, &element_spans, length_mode) {
1612        let mut result = vec![first];
1613        result.extend(cascade_split_line(&rest, line_length, abbreviations, length_mode));
1614        return result;
1615    }
1616
1617    // Fallback: word wrap using existing reflow_elements
1618    let options = ReflowOptions {
1619        line_length,
1620        break_on_sentences: false,
1621        preserve_breaks: false,
1622        sentence_per_line: false,
1623        semantic_line_breaks: false,
1624        abbreviations: abbreviations.clone(),
1625        length_mode,
1626    };
1627    reflow_elements(&elements, &options)
1628}
1629
1630/// Reflow elements using semantic line breaks strategy:
1631/// 1. Split at sentence boundaries (always)
1632/// 2. For lines exceeding line_length, cascade through clause punct → break-words → word wrap
1633fn reflow_elements_semantic(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1634    // Step 1: Split into sentences using existing sentence-per-line logic
1635    let sentence_lines = reflow_elements_sentence_per_line(elements, &options.abbreviations);
1636
1637    // Step 2: For each sentence line, apply cascading splits if it exceeds line_length
1638    // When line_length is 0 (unlimited), skip cascading — sentence splits only
1639    if options.line_length == 0 {
1640        return sentence_lines;
1641    }
1642
1643    let length_mode = options.length_mode;
1644    let mut result = Vec::new();
1645    for line in sentence_lines {
1646        if display_len(&line, length_mode) <= options.line_length {
1647            result.push(line);
1648        } else {
1649            result.extend(cascade_split_line(
1650                &line,
1651                options.line_length,
1652                &options.abbreviations,
1653                length_mode,
1654            ));
1655        }
1656    }
1657
1658    // Step 3: Merge very short trailing lines back into the previous line.
1659    // Word wrap can produce lines like "was" or "see" on their own, which reads poorly.
1660    let min_line_len = ((options.line_length as f64) * MIN_SPLIT_RATIO) as usize;
1661    let mut merged: Vec<String> = Vec::with_capacity(result.len());
1662    for line in result {
1663        if !merged.is_empty() && display_len(&line, length_mode) < min_line_len && !line.trim().is_empty() {
1664            // Don't merge across sentence boundaries — sentence splits are intentional
1665            let prev_ends_at_sentence = {
1666                let trimmed = merged.last().unwrap().trim_end();
1667                trimmed
1668                    .chars()
1669                    .rev()
1670                    .find(|c| !matches!(c, '"' | '\'' | '\u{201D}' | '\u{2019}' | ')' | ']'))
1671                    .is_some_and(|c| matches!(c, '.' | '!' | '?'))
1672            };
1673
1674            if !prev_ends_at_sentence {
1675                let prev = merged.last_mut().unwrap();
1676                let combined = format!("{prev} {line}");
1677                // Only merge if the combined line fits within the limit
1678                if display_len(&combined, length_mode) <= options.line_length {
1679                    *prev = combined;
1680                    continue;
1681                }
1682            }
1683        }
1684        merged.push(line);
1685    }
1686    merged
1687}
1688
/// Return the byte index of the last ASCII space in `line` that is safe to
/// split at, i.e. one that is not strictly inside any of `element_spans`.
///
/// `element_spans` holds `(start, end)` byte ranges of rendered non-Text
/// elements within `line`. Both ends are treated as exclusive
/// (`pos > start && pos < end`): element delimiters such as `[`, `]`, `(`,
/// `)`, `<`, `>` and `` ` `` are never spaces, so only interior positions
/// need protection.
fn rfind_safe_space(line: &str, element_spans: &[(usize, usize)]) -> Option<usize> {
    line.rmatch_indices(' ')
        .map(|(space_pos, _)| space_pos)
        .find(|&space_pos| {
            element_spans
                .iter()
                .all(|&(start, end)| space_pos <= start || space_pos >= end)
        })
}
1702
1703/// Reflow elements into lines that fit within the line length
1704fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1705    let mut lines = Vec::new();
1706    let mut current_line = String::new();
1707    let mut current_length = 0;
1708    // Track byte spans of non-Text elements in current_line for safe splitting
1709    let mut current_line_element_spans: Vec<(usize, usize)> = Vec::new();
1710    let length_mode = options.length_mode;
1711
1712    for (idx, element) in elements.iter().enumerate() {
1713        let element_str = format!("{element}");
1714        let element_len = element.display_width(length_mode);
1715
1716        // Determine adjacency from the original elements, not from current_line.
1717        // Elements are adjacent when there's no whitespace between them in the source:
1718        // - Text("v") → HugoShortcode("{{<...>}}") = adjacent (text has no trailing space)
1719        // - Text(" and ") → InlineLink("[a](url)") = NOT adjacent (text has trailing space)
1720        // - HugoShortcode("{{<...>}}") → Text(",") = adjacent (text has no leading space)
1721        let is_adjacent_to_prev = if idx > 0 {
1722            match (&elements[idx - 1], element) {
1723                (Element::Text(t), _) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1724                (_, Element::Text(t)) => !t.is_empty() && !t.starts_with(char::is_whitespace),
1725                _ => true,
1726            }
1727        } else {
1728            false
1729        };
1730
1731        // For text elements that might need breaking
1732        if let Element::Text(text) = element {
1733            // Check if original text had leading whitespace
1734            let has_leading_space = text.starts_with(char::is_whitespace);
1735            // If this is a text element, always process it word by word
1736            let words: Vec<&str> = text.split_whitespace().collect();
1737
1738            for (i, word) in words.iter().enumerate() {
1739                let word_len = display_len(word, length_mode);
1740                // Check if this "word" is just punctuation that should stay attached
1741                let is_trailing_punct = word
1742                    .chars()
1743                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1744
1745                // First word of text adjacent to preceding non-text element
1746                // must stay attached (e.g., shortcode followed by punctuation or text)
1747                let is_first_adjacent = i == 0 && is_adjacent_to_prev;
1748
1749                if is_first_adjacent {
1750                    // Attach directly without space, preventing line break
1751                    if current_length + word_len > options.line_length && current_length > 0 {
1752                        // Would exceed — break before the adjacent group
1753                        // Use element-aware space search to avoid splitting inside links/code/etc.
1754                        if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1755                            let before = current_line[..last_space].trim_end().to_string();
1756                            let after = current_line[last_space + 1..].to_string();
1757                            lines.push(before);
1758                            current_line = format!("{after}{word}");
1759                            current_length = display_len(&current_line, length_mode);
1760                            current_line_element_spans.clear();
1761                        } else {
1762                            current_line.push_str(word);
1763                            current_length += word_len;
1764                        }
1765                    } else {
1766                        current_line.push_str(word);
1767                        current_length += word_len;
1768                    }
1769                } else if current_length > 0
1770                    && current_length + 1 + word_len > options.line_length
1771                    && !is_trailing_punct
1772                {
1773                    // Start a new line (but never for trailing punctuation)
1774                    lines.push(current_line.trim().to_string());
1775                    current_line = word.to_string();
1776                    current_length = word_len;
1777                    current_line_element_spans.clear();
1778                } else {
1779                    // Add word to current line
1780                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1781                    // AND this isn't trailing punctuation (which attaches directly)
1782                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1783                        current_line.push(' ');
1784                        current_length += 1;
1785                    }
1786                    current_line.push_str(word);
1787                    current_length += word_len;
1788                }
1789            }
1790        } else if matches!(
1791            element,
1792            Element::Italic { .. } | Element::Bold { .. } | Element::Strikethrough(_)
1793        ) && element_len > options.line_length
1794        {
1795            // Italic, bold, and strikethrough with content longer than line_length need word wrapping.
1796            // Split content word-by-word, attach the opening marker to the first word
1797            // and the closing marker to the last word.
1798            let (content, marker): (&str, &str) = match element {
1799                Element::Italic { content, underscore } => (content.as_str(), if *underscore { "_" } else { "*" }),
1800                Element::Bold { content, underscore } => (content.as_str(), if *underscore { "__" } else { "**" }),
1801                Element::Strikethrough(content) => (content.as_str(), "~~"),
1802                _ => unreachable!(),
1803            };
1804
1805            let words: Vec<&str> = content.split_whitespace().collect();
1806            let n = words.len();
1807
1808            if n == 0 {
1809                // Empty span — treat as atomic
1810                let full = format!("{marker}{marker}");
1811                let full_len = display_len(&full, length_mode);
1812                if !is_adjacent_to_prev && current_length > 0 {
1813                    current_line.push(' ');
1814                    current_length += 1;
1815                }
1816                current_line.push_str(&full);
1817                current_length += full_len;
1818            } else {
1819                for (i, word) in words.iter().enumerate() {
1820                    let is_first = i == 0;
1821                    let is_last = i == n - 1;
1822                    let word_str: String = match (is_first, is_last) {
1823                        (true, true) => format!("{marker}{word}{marker}"),
1824                        (true, false) => format!("{marker}{word}"),
1825                        (false, true) => format!("{word}{marker}"),
1826                        (false, false) => word.to_string(),
1827                    };
1828                    let word_len = display_len(&word_str, length_mode);
1829
1830                    let needs_space = if is_first {
1831                        !is_adjacent_to_prev && current_length > 0
1832                    } else {
1833                        current_length > 0
1834                    };
1835
1836                    if needs_space && current_length + 1 + word_len > options.line_length {
1837                        lines.push(current_line.trim_end().to_string());
1838                        current_line = word_str;
1839                        current_length = word_len;
1840                        current_line_element_spans.clear();
1841                    } else {
1842                        if needs_space {
1843                            current_line.push(' ');
1844                            current_length += 1;
1845                        }
1846                        current_line.push_str(&word_str);
1847                        current_length += word_len;
1848                    }
1849                }
1850            }
1851        } else {
1852            // For non-text elements (code, links, references), treat as atomic units
1853            // These should never be broken across lines
1854
1855            if is_adjacent_to_prev {
1856                // Adjacent to preceding text — attach directly without space
1857                if current_length + element_len > options.line_length {
1858                    // Would exceed limit — break before the adjacent word group
1859                    // Use element-aware space search to avoid splitting inside links/code/etc.
1860                    if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1861                        let before = current_line[..last_space].trim_end().to_string();
1862                        let after = current_line[last_space + 1..].to_string();
1863                        lines.push(before);
1864                        current_line = format!("{after}{element_str}");
1865                        current_length = display_len(&current_line, length_mode);
1866                        current_line_element_spans.clear();
1867                        // Record the element span in the new current_line
1868                        let start = after.len();
1869                        current_line_element_spans.push((start, start + element_str.len()));
1870                    } else {
1871                        // No safe space to break at — accept the long line
1872                        let start = current_line.len();
1873                        current_line.push_str(&element_str);
1874                        current_length += element_len;
1875                        current_line_element_spans.push((start, current_line.len()));
1876                    }
1877                } else {
1878                    let start = current_line.len();
1879                    current_line.push_str(&element_str);
1880                    current_length += element_len;
1881                    current_line_element_spans.push((start, current_line.len()));
1882                }
1883            } else if current_length > 0 && current_length + 1 + element_len > options.line_length {
1884                // Not adjacent, would exceed — start new line
1885                lines.push(current_line.trim().to_string());
1886                current_line = element_str.clone();
1887                current_length = element_len;
1888                current_line_element_spans.clear();
1889                current_line_element_spans.push((0, element_str.len()));
1890            } else {
1891                // Not adjacent, fits — add with space
1892                let ends_with_opener =
1893                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1894                if current_length > 0 && !ends_with_opener {
1895                    current_line.push(' ');
1896                    current_length += 1;
1897                }
1898                let start = current_line.len();
1899                current_line.push_str(&element_str);
1900                current_length += element_len;
1901                current_line_element_spans.push((start, current_line.len()));
1902            }
1903        }
1904    }
1905
1906    // Don't forget the last line
1907    if !current_line.is_empty() {
1908        lines.push(current_line.trim_end().to_string());
1909    }
1910
1911    lines
1912}
1913
1914/// Reflow markdown content preserving structure
1915pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1916    let lines: Vec<&str> = content.lines().collect();
1917    let mut result = Vec::new();
1918    let mut i = 0;
1919
1920    while i < lines.len() {
1921        let line = lines[i];
1922        let trimmed = line.trim();
1923
1924        // Preserve empty lines
1925        if trimmed.is_empty() {
1926            result.push(String::new());
1927            i += 1;
1928            continue;
1929        }
1930
1931        // Preserve headings as-is
1932        if trimmed.starts_with('#') {
1933            result.push(line.to_string());
1934            i += 1;
1935            continue;
1936        }
1937
1938        // Preserve Quarto/Pandoc div markers (:::) as-is
1939        if trimmed.starts_with(":::") {
1940            result.push(line.to_string());
1941            i += 1;
1942            continue;
1943        }
1944
1945        // Preserve fenced code blocks
1946        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1947            result.push(line.to_string());
1948            i += 1;
1949            // Copy lines until closing fence
1950            while i < lines.len() {
1951                result.push(lines[i].to_string());
1952                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1953                    i += 1;
1954                    break;
1955                }
1956                i += 1;
1957            }
1958            continue;
1959        }
1960
1961        // Preserve indented code blocks (4+ columns accounting for tab expansion)
1962        if ElementCache::calculate_indentation_width_default(line) >= 4 {
1963            // Collect all consecutive indented lines
1964            result.push(line.to_string());
1965            i += 1;
1966            while i < lines.len() {
1967                let next_line = lines[i];
1968                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1969                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
1970                    result.push(next_line.to_string());
1971                    i += 1;
1972                } else {
1973                    break;
1974                }
1975            }
1976            continue;
1977        }
1978
1979        // Preserve block quotes (but reflow their content)
1980        if trimmed.starts_with('>') {
1981            // find() returns byte position which is correct for str slicing
1982            // The unwrap is safe because we already verified trimmed starts with '>'
1983            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
1984            let quote_prefix = line[0..gt_pos + 1].to_string();
1985            let quote_content = &line[quote_prefix.len()..].trim_start();
1986
1987            let reflowed = reflow_line(quote_content, options);
1988            for reflowed_line in reflowed.iter() {
1989                result.push(format!("{quote_prefix} {reflowed_line}"));
1990            }
1991            i += 1;
1992            continue;
1993        }
1994
1995        // Preserve horizontal rules first (before checking for lists)
1996        if is_horizontal_rule(trimmed) {
1997            result.push(line.to_string());
1998            i += 1;
1999            continue;
2000        }
2001
2002        // Preserve lists (but not horizontal rules)
2003        if is_unordered_list_marker(trimmed) || is_numbered_list_item(trimmed) {
2004            // Find the list marker and preserve indentation
2005            let indent = line.len() - line.trim_start().len();
2006            let indent_str = " ".repeat(indent);
2007
2008            // For numbered lists, find the period and the space after it
2009            // For bullet lists, find the marker and the space after it
2010            let mut marker_end = indent;
2011            let mut content_start = indent;
2012
2013            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
2014                // Numbered list: find the period
2015                if let Some(period_pos) = line[indent..].find('.') {
2016                    marker_end = indent + period_pos + 1; // Include the period
2017                    content_start = marker_end;
2018                    // Skip any spaces after the period to find content start
2019                    // Use byte-based check since content_start is a byte index
2020                    // This is safe because space is ASCII (single byte)
2021                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2022                        content_start += 1;
2023                    }
2024                }
2025            } else {
2026                // Bullet list: marker is single character
2027                marker_end = indent + 1; // Just the marker character
2028                content_start = marker_end;
2029                // Skip any spaces after the marker
2030                // Use byte-based check since content_start is a byte index
2031                // This is safe because space is ASCII (single byte)
2032                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2033                    content_start += 1;
2034                }
2035            }
2036
2037            let marker = &line[indent..marker_end];
2038
2039            // Collect all content for this list item (including continuation lines)
2040            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
2041            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
2042            i += 1;
2043
2044            // Collect continuation lines (indented lines that are part of this list item)
2045            while i < lines.len() {
2046                let next_line = lines[i];
2047                let next_trimmed = next_line.trim();
2048
2049                // Stop if we hit an empty line or another list item or special block
2050                if is_block_boundary(next_trimmed) {
2051                    break;
2052                }
2053
2054                // Check if this line is indented (continuation of list item)
2055                let next_indent = next_line.len() - next_line.trim_start().len();
2056                if next_indent >= content_start {
2057                    // This is a continuation line - add its content
2058                    // Preserve hard breaks while trimming excessive whitespace
2059                    let trimmed_start = next_line.trim_start();
2060                    list_content.push(trim_preserving_hard_break(trimmed_start));
2061                    i += 1;
2062                } else {
2063                    // Not indented enough, not part of this list item
2064                    break;
2065                }
2066            }
2067
2068            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
2069            // Hard breaks should prevent joining with the next line
2070            let combined_content = if options.preserve_breaks {
2071                list_content[0].clone()
2072            } else {
2073                // Check if any lines have hard breaks - if so, preserve the structure
2074                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
2075                if has_hard_breaks {
2076                    // Don't join lines with hard breaks - keep them separate with newlines
2077                    list_content.join("\n")
2078                } else {
2079                    // No hard breaks, safe to join with spaces
2080                    list_content.join(" ")
2081                }
2082            };
2083
2084            // Calculate the proper indentation for continuation lines
2085            let trimmed_marker = marker;
2086            let continuation_spaces = content_start;
2087
2088            // Adjust line length to account for list marker and space
2089            let prefix_length = indent + trimmed_marker.len() + 1;
2090
2091            // Create adjusted options with reduced line length
2092            let adjusted_options = ReflowOptions {
2093                line_length: options.line_length.saturating_sub(prefix_length),
2094                ..options.clone()
2095            };
2096
2097            let reflowed = reflow_line(&combined_content, &adjusted_options);
2098            for (j, reflowed_line) in reflowed.iter().enumerate() {
2099                if j == 0 {
2100                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
2101                } else {
2102                    // Continuation lines aligned with text after marker
2103                    let continuation_indent = " ".repeat(continuation_spaces);
2104                    result.push(format!("{continuation_indent}{reflowed_line}"));
2105                }
2106            }
2107            continue;
2108        }
2109
2110        // Preserve tables
2111        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
2112            result.push(line.to_string());
2113            i += 1;
2114            continue;
2115        }
2116
2117        // Preserve reference definitions
2118        if trimmed.starts_with('[') && line.contains("]:") {
2119            result.push(line.to_string());
2120            i += 1;
2121            continue;
2122        }
2123
2124        // Preserve definition list items (extended markdown)
2125        if is_definition_list_item(trimmed) {
2126            result.push(line.to_string());
2127            i += 1;
2128            continue;
2129        }
2130
2131        // Check if this is a single line that doesn't need processing
2132        let mut is_single_line_paragraph = true;
2133        if i + 1 < lines.len() {
2134            let next_trimmed = lines[i + 1].trim();
2135            // Check if next line continues this paragraph
2136            if !is_block_boundary(next_trimmed) {
2137                is_single_line_paragraph = false;
2138            }
2139        }
2140
2141        // If it's a single line that fits, just add it as-is
2142        if is_single_line_paragraph && display_len(line, options.length_mode) <= options.line_length {
2143            result.push(line.to_string());
2144            i += 1;
2145            continue;
2146        }
2147
2148        // For regular paragraphs, collect consecutive lines
2149        let mut paragraph_parts = Vec::new();
2150        let mut current_part = vec![line];
2151        i += 1;
2152
2153        // If preserve_breaks is true, treat each line separately
2154        if options.preserve_breaks {
2155            // Don't collect consecutive lines - just reflow this single line
2156            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
2157                Some("\\")
2158            } else if line.ends_with("  ") {
2159                Some("  ")
2160            } else {
2161                None
2162            };
2163            let reflowed = reflow_line(line, options);
2164
2165            // Preserve hard breaks (two trailing spaces or backslash)
2166            if let Some(break_marker) = hard_break_type {
2167                if !reflowed.is_empty() {
2168                    let mut reflowed_with_break = reflowed;
2169                    let last_idx = reflowed_with_break.len() - 1;
2170                    if !has_hard_break(&reflowed_with_break[last_idx]) {
2171                        reflowed_with_break[last_idx].push_str(break_marker);
2172                    }
2173                    result.extend(reflowed_with_break);
2174                }
2175            } else {
2176                result.extend(reflowed);
2177            }
2178        } else {
2179            // Original behavior: collect consecutive lines into a paragraph
2180            while i < lines.len() {
2181                let prev_line = if !current_part.is_empty() {
2182                    current_part.last().unwrap()
2183                } else {
2184                    ""
2185                };
2186                let next_line = lines[i];
2187                let next_trimmed = next_line.trim();
2188
2189                // Stop at empty lines or special blocks
2190                if is_block_boundary(next_trimmed) {
2191                    break;
2192                }
2193
2194                // Check if previous line ends with hard break (two spaces or backslash)
2195                // or is a complete sentence in sentence_per_line mode
2196                let prev_trimmed = prev_line.trim();
2197                let abbreviations = get_abbreviations(&options.abbreviations);
2198                let ends_with_sentence = (prev_trimmed.ends_with('.')
2199                    || prev_trimmed.ends_with('!')
2200                    || prev_trimmed.ends_with('?')
2201                    || prev_trimmed.ends_with(".*")
2202                    || prev_trimmed.ends_with("!*")
2203                    || prev_trimmed.ends_with("?*")
2204                    || prev_trimmed.ends_with("._")
2205                    || prev_trimmed.ends_with("!_")
2206                    || prev_trimmed.ends_with("?_")
2207                    // Quote-terminated sentences (straight and curly quotes)
2208                    || prev_trimmed.ends_with(".\"")
2209                    || prev_trimmed.ends_with("!\"")
2210                    || prev_trimmed.ends_with("?\"")
2211                    || prev_trimmed.ends_with(".'")
2212                    || prev_trimmed.ends_with("!'")
2213                    || prev_trimmed.ends_with("?'")
2214                    || prev_trimmed.ends_with(".\u{201D}")
2215                    || prev_trimmed.ends_with("!\u{201D}")
2216                    || prev_trimmed.ends_with("?\u{201D}")
2217                    || prev_trimmed.ends_with(".\u{2019}")
2218                    || prev_trimmed.ends_with("!\u{2019}")
2219                    || prev_trimmed.ends_with("?\u{2019}"))
2220                    && !text_ends_with_abbreviation(
2221                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
2222                        &abbreviations,
2223                    );
2224
2225                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
2226                    // Start a new part after hard break or complete sentence
2227                    paragraph_parts.push(current_part.join(" "));
2228                    current_part = vec![next_line];
2229                } else {
2230                    current_part.push(next_line);
2231                }
2232                i += 1;
2233            }
2234
2235            // Add the last part
2236            if !current_part.is_empty() {
2237                if current_part.len() == 1 {
2238                    // Single line, don't add trailing space
2239                    paragraph_parts.push(current_part[0].to_string());
2240                } else {
2241                    paragraph_parts.push(current_part.join(" "));
2242                }
2243            }
2244
2245            // Reflow each part separately, preserving hard breaks
2246            for (j, part) in paragraph_parts.iter().enumerate() {
2247                let reflowed = reflow_line(part, options);
2248                result.extend(reflowed);
2249
2250                // Preserve hard break by ensuring last line of part ends with hard break marker
2251                // Use two spaces as the default hard break format for reflows
2252                // But don't add hard breaks in sentence_per_line mode - lines are already separate
2253                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
2254                    let last_idx = result.len() - 1;
2255                    if !has_hard_break(&result[last_idx]) {
2256                        result[last_idx].push_str("  ");
2257                    }
2258                }
2259            }
2260        }
2261    }
2262
2263    // Preserve trailing newline if the original content had one
2264    let result_text = result.join("\n");
2265    if content.ends_with('\n') && !result_text.ends_with('\n') {
2266        format!("{result_text}\n")
2267    } else {
2268        result_text
2269    }
2270}
2271
/// Information about a reflowed paragraph: the byte range to replace in the
/// original content and the text to put there.
///
/// `start_byte..end_byte` is a half-open range. The producers in this module
/// make `reflowed_text` end with `\n` exactly when the range covers the
/// paragraph's trailing line terminator, so splicing the replacement in never
/// glues the following line onto the paragraph.
///
/// Derives `PartialEq`/`Eq` (like [`BlockquoteContinuationStyle`]) so results
/// can be compared directly in tests and by callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphReflow {
    /// Starting byte offset of the paragraph in the original content
    pub start_byte: usize,
    /// Ending byte offset (exclusive) of the paragraph in the original content
    pub end_byte: usize,
    /// The reflowed text for this paragraph
    pub reflowed_text: String,
}
2282
/// A collected blockquote line used for style-preserving reflow.
///
/// The invariant `is_explicit == true` iff `prefix.is_some()` is enforced by the
/// constructors. Use [`BlockquoteLineData::explicit`] or [`BlockquoteLineData::lazy`]
/// rather than constructing the struct directly.
///
/// Derives `PartialEq`/`Eq` so that values can be compared as a whole; the
/// fields are only crate-visible, so code outside the crate has no other way
/// to check two lines for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockquoteLineData {
    /// Trimmed content without the `> ` prefix.
    pub(crate) content: String,
    /// Whether this line carries an explicit blockquote marker.
    pub(crate) is_explicit: bool,
    /// Full blockquote prefix (e.g. `"> "`, `"> > "`). `None` for lazy continuation lines.
    pub(crate) prefix: Option<String>,
}
2297
2298impl BlockquoteLineData {
2299    /// Create an explicit (marker-bearing) blockquote line.
2300    pub fn explicit(content: String, prefix: String) -> Self {
2301        Self {
2302            content,
2303            is_explicit: true,
2304            prefix: Some(prefix),
2305        }
2306    }
2307
2308    /// Create a lazy continuation line (no blockquote marker).
2309    pub fn lazy(content: String) -> Self {
2310        Self {
2311            content,
2312            is_explicit: false,
2313            prefix: None,
2314        }
2315    }
2316}
2317
/// Style for blockquote continuation lines after reflow.
///
/// Selected per paragraph by [`blockquote_continuation_style`] and applied by
/// [`reflow_blockquote_content`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockquoteContinuationStyle {
    /// Every reflowed line is emitted with the explicit blockquote prefix.
    Explicit,
    /// Continuation lines are emitted without a prefix, except for the first
    /// line and any line that would otherwise start a new block structure.
    Lazy,
}
2324
2325/// Determine the continuation style for a blockquote paragraph from its collected lines.
2326///
2327/// The first line is always explicit (it carries the marker), so only continuation
2328/// lines (index 1+) are counted. Ties resolve to `Explicit`.
2329///
2330/// When the slice has only one element (no continuation lines to inspect), both
2331/// counts are zero and the tie-breaking rule returns `Explicit`.
2332pub fn blockquote_continuation_style(lines: &[BlockquoteLineData]) -> BlockquoteContinuationStyle {
2333    let mut explicit_count = 0usize;
2334    let mut lazy_count = 0usize;
2335
2336    for line in lines.iter().skip(1) {
2337        if line.is_explicit {
2338            explicit_count += 1;
2339        } else {
2340            lazy_count += 1;
2341        }
2342    }
2343
2344    if explicit_count > 0 && lazy_count == 0 {
2345        BlockquoteContinuationStyle::Explicit
2346    } else if lazy_count > 0 && explicit_count == 0 {
2347        BlockquoteContinuationStyle::Lazy
2348    } else if explicit_count >= lazy_count {
2349        BlockquoteContinuationStyle::Explicit
2350    } else {
2351        BlockquoteContinuationStyle::Lazy
2352    }
2353}
2354
2355/// Determine the dominant blockquote prefix for a paragraph.
2356///
2357/// The most frequently occurring explicit prefix wins. Ties are broken by earliest
2358/// first appearance. Falls back to `fallback` when no explicit lines are present.
2359pub fn dominant_blockquote_prefix(lines: &[BlockquoteLineData], fallback: &str) -> String {
2360    let mut counts: std::collections::HashMap<String, (usize, usize)> = std::collections::HashMap::new();
2361
2362    for (idx, line) in lines.iter().enumerate() {
2363        let Some(prefix) = line.prefix.as_ref() else {
2364            continue;
2365        };
2366        counts
2367            .entry(prefix.clone())
2368            .and_modify(|entry| entry.0 += 1)
2369            .or_insert((1, idx));
2370    }
2371
2372    counts
2373        .into_iter()
2374        .max_by(|(_, (count_a, first_idx_a)), (_, (count_b, first_idx_b))| {
2375            count_a.cmp(count_b).then_with(|| first_idx_b.cmp(first_idx_a))
2376        })
2377        .map(|(prefix, _)| prefix)
2378        .unwrap_or_else(|| fallback.to_string())
2379}
2380
2381/// Whether a reflowed blockquote content line must carry an explicit prefix.
2382///
2383/// Lines that would start a new block structure (headings, fences, lists, etc.)
2384/// cannot safely use lazy continuation syntax.
2385pub(crate) fn should_force_explicit_blockquote_line(content_line: &str) -> bool {
2386    let trimmed = content_line.trim_start();
2387    trimmed.starts_with('>')
2388        || trimmed.starts_with('#')
2389        || trimmed.starts_with("```")
2390        || trimmed.starts_with("~~~")
2391        || is_unordered_list_marker(trimmed)
2392        || is_numbered_list_item(trimmed)
2393        || is_horizontal_rule(trimmed)
2394        || is_definition_list_item(trimmed)
2395        || (trimmed.starts_with('[') && trimmed.contains("]:"))
2396        || trimmed.starts_with(":::")
2397        || (trimmed.starts_with('<')
2398            && !trimmed.starts_with("<http")
2399            && !trimmed.starts_with("<https")
2400            && !trimmed.starts_with("<mailto:"))
2401}
2402
2403/// Reflow blockquote content lines and apply continuation style.
2404///
2405/// Segments separated by hard breaks are reflowed independently. The output lines
2406/// receive blockquote prefixes according to `continuation_style`: the first line and
2407/// any line that would start a new block structure always get an explicit prefix;
2408/// other lines follow the detected style.
2409///
2410/// Returns the styled, reflowed lines (without a trailing newline).
2411pub fn reflow_blockquote_content(
2412    lines: &[BlockquoteLineData],
2413    explicit_prefix: &str,
2414    continuation_style: BlockquoteContinuationStyle,
2415    options: &ReflowOptions,
2416) -> Vec<String> {
2417    let content_strs: Vec<&str> = lines.iter().map(|l| l.content.as_str()).collect();
2418    let segments = split_into_segments_strs(&content_strs);
2419    let mut reflowed_content_lines: Vec<String> = Vec::new();
2420
2421    for segment in segments {
2422        let hard_break_type = segment.last().and_then(|&line| {
2423            let line = line.strip_suffix('\r').unwrap_or(line);
2424            if line.ends_with('\\') {
2425                Some("\\")
2426            } else if line.ends_with("  ") {
2427                Some("  ")
2428            } else {
2429                None
2430            }
2431        });
2432
2433        let pieces: Vec<&str> = segment
2434            .iter()
2435            .map(|&line| {
2436                if let Some(l) = line.strip_suffix('\\') {
2437                    l.trim_end()
2438                } else if let Some(l) = line.strip_suffix("  ") {
2439                    l.trim_end()
2440                } else {
2441                    line.trim_end()
2442                }
2443            })
2444            .collect();
2445
2446        let segment_text = pieces.join(" ");
2447        let segment_text = segment_text.trim();
2448        if segment_text.is_empty() {
2449            continue;
2450        }
2451
2452        let mut reflowed = reflow_line(segment_text, options);
2453        if let Some(break_marker) = hard_break_type
2454            && !reflowed.is_empty()
2455        {
2456            let last_idx = reflowed.len() - 1;
2457            if !has_hard_break(&reflowed[last_idx]) {
2458                reflowed[last_idx].push_str(break_marker);
2459            }
2460        }
2461        reflowed_content_lines.extend(reflowed);
2462    }
2463
2464    let mut styled_lines: Vec<String> = Vec::new();
2465    for (idx, line) in reflowed_content_lines.iter().enumerate() {
2466        let force_explicit = idx == 0
2467            || continuation_style == BlockquoteContinuationStyle::Explicit
2468            || should_force_explicit_blockquote_line(line);
2469        if force_explicit {
2470            styled_lines.push(format!("{explicit_prefix}{line}"));
2471        } else {
2472            styled_lines.push(line.clone());
2473        }
2474    }
2475
2476    styled_lines
2477}
2478
2479fn is_blockquote_content_boundary(content: &str) -> bool {
2480    let trimmed = content.trim();
2481    trimmed.is_empty()
2482        || is_block_boundary(trimmed)
2483        || crate::utils::table_utils::TableUtils::is_potential_table_row(content)
2484        || trimmed.starts_with(":::")
2485        || crate::utils::is_template_directive_only(content)
2486        || is_standalone_attr_list(content)
2487        || is_snippet_block_delimiter(content)
2488}
2489
2490fn split_into_segments_strs<'a>(lines: &[&'a str]) -> Vec<Vec<&'a str>> {
2491    let mut segments = Vec::new();
2492    let mut current = Vec::new();
2493
2494    for &line in lines {
2495        current.push(line);
2496        if has_hard_break(line) {
2497            segments.push(current);
2498            current = Vec::new();
2499        }
2500    }
2501
2502    if !current.is_empty() {
2503        segments.push(current);
2504    }
2505
2506    segments
2507}
2508
2509fn reflow_blockquote_paragraph_at_line(
2510    content: &str,
2511    lines: &[&str],
2512    target_idx: usize,
2513    options: &ReflowOptions,
2514) -> Option<ParagraphReflow> {
2515    let mut anchor_idx = target_idx;
2516    let mut target_level = if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[target_idx]) {
2517        parsed.nesting_level
2518    } else {
2519        let mut found = None;
2520        let mut idx = target_idx;
2521        loop {
2522            if lines[idx].trim().is_empty() {
2523                break;
2524            }
2525            if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[idx]) {
2526                found = Some((idx, parsed.nesting_level));
2527                break;
2528            }
2529            if idx == 0 {
2530                break;
2531            }
2532            idx -= 1;
2533        }
2534        let (idx, level) = found?;
2535        anchor_idx = idx;
2536        level
2537    };
2538
2539    // Expand backward to capture prior quote content at the same nesting level.
2540    let mut para_start = anchor_idx;
2541    while para_start > 0 {
2542        let prev_idx = para_start - 1;
2543        let prev_line = lines[prev_idx];
2544
2545        if prev_line.trim().is_empty() {
2546            break;
2547        }
2548
2549        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(prev_line) {
2550            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2551                break;
2552            }
2553            para_start = prev_idx;
2554            continue;
2555        }
2556
2557        let prev_lazy = prev_line.trim_start();
2558        if is_blockquote_content_boundary(prev_lazy) {
2559            break;
2560        }
2561        para_start = prev_idx;
2562    }
2563
2564    // Lazy continuation cannot precede the first explicit marker.
2565    while para_start < lines.len() {
2566        let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[para_start]) else {
2567            para_start += 1;
2568            continue;
2569        };
2570        target_level = parsed.nesting_level;
2571        break;
2572    }
2573
2574    if para_start >= lines.len() || para_start > target_idx {
2575        return None;
2576    }
2577
2578    // Collect explicit lines at target level and lazy continuation lines.
2579    // Each entry is (original_line_idx, BlockquoteLineData).
2580    let mut collected: Vec<(usize, BlockquoteLineData)> = Vec::new();
2581    let mut idx = para_start;
2582    while idx < lines.len() {
2583        if !collected.is_empty() && has_hard_break(&collected[collected.len() - 1].1.content) {
2584            break;
2585        }
2586
2587        let line = lines[idx];
2588        if line.trim().is_empty() {
2589            break;
2590        }
2591
2592        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(line) {
2593            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2594                break;
2595            }
2596            collected.push((
2597                idx,
2598                BlockquoteLineData::explicit(trim_preserving_hard_break(parsed.content), parsed.prefix.to_string()),
2599            ));
2600            idx += 1;
2601            continue;
2602        }
2603
2604        let lazy_content = line.trim_start();
2605        if is_blockquote_content_boundary(lazy_content) {
2606            break;
2607        }
2608
2609        collected.push((idx, BlockquoteLineData::lazy(trim_preserving_hard_break(lazy_content))));
2610        idx += 1;
2611    }
2612
2613    if collected.is_empty() {
2614        return None;
2615    }
2616
2617    let para_end = collected[collected.len() - 1].0;
2618    if target_idx < para_start || target_idx > para_end {
2619        return None;
2620    }
2621
2622    let line_data: Vec<BlockquoteLineData> = collected.iter().map(|(_, d)| d.clone()).collect();
2623
2624    let fallback_prefix = line_data
2625        .iter()
2626        .find_map(|d| d.prefix.clone())
2627        .unwrap_or_else(|| "> ".to_string());
2628    let explicit_prefix = dominant_blockquote_prefix(&line_data, &fallback_prefix);
2629    let continuation_style = blockquote_continuation_style(&line_data);
2630
2631    let adjusted_line_length = options
2632        .line_length
2633        .saturating_sub(display_len(&explicit_prefix, options.length_mode))
2634        .max(1);
2635
2636    let adjusted_options = ReflowOptions {
2637        line_length: adjusted_line_length,
2638        ..options.clone()
2639    };
2640
2641    let styled_lines = reflow_blockquote_content(&line_data, &explicit_prefix, continuation_style, &adjusted_options);
2642
2643    if styled_lines.is_empty() {
2644        return None;
2645    }
2646
2647    // Calculate byte offsets.
2648    let mut start_byte = 0;
2649    for line in lines.iter().take(para_start) {
2650        start_byte += line.len() + 1;
2651    }
2652
2653    let mut end_byte = start_byte;
2654    for line in lines.iter().take(para_end + 1).skip(para_start) {
2655        end_byte += line.len() + 1;
2656    }
2657
2658    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2659    if !includes_trailing_newline {
2660        end_byte -= 1;
2661    }
2662
2663    let reflowed_joined = styled_lines.join("\n");
2664    let reflowed_text = if includes_trailing_newline {
2665        if reflowed_joined.ends_with('\n') {
2666            reflowed_joined
2667        } else {
2668            format!("{reflowed_joined}\n")
2669        }
2670    } else if reflowed_joined.ends_with('\n') {
2671        reflowed_joined.trim_end_matches('\n').to_string()
2672    } else {
2673        reflowed_joined
2674    };
2675
2676    Some(ParagraphReflow {
2677        start_byte,
2678        end_byte,
2679        reflowed_text,
2680    })
2681}
2682
2683/// Reflow a single paragraph at the specified line number
2684///
2685/// This function finds the paragraph containing the given line number,
2686/// reflows it according to the specified line length, and returns
2687/// information about the paragraph location and its reflowed text.
2688///
2689/// # Arguments
2690///
2691/// * `content` - The full document content
2692/// * `line_number` - The 1-based line number within the paragraph to reflow
2693/// * `line_length` - The target line length for reflowing
2694///
2695/// # Returns
2696///
2697/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
2698/// or `None` if the line number is out of bounds or the content at that
2699/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
2700pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
2701    reflow_paragraph_at_line_with_mode(content, line_number, line_length, ReflowLengthMode::default())
2702}
2703
2704/// Reflow a paragraph at the given line with a specific length mode.
2705pub fn reflow_paragraph_at_line_with_mode(
2706    content: &str,
2707    line_number: usize,
2708    line_length: usize,
2709    length_mode: ReflowLengthMode,
2710) -> Option<ParagraphReflow> {
2711    let options = ReflowOptions {
2712        line_length,
2713        length_mode,
2714        ..Default::default()
2715    };
2716    reflow_paragraph_at_line_with_options(content, line_number, &options)
2717}
2718
2719/// Reflow a paragraph at the given line using the provided options.
2720///
2721/// This is the canonical implementation used by both the rule's fix mode and the
2722/// LSP "Reflow paragraph" action. Passing a fully configured `ReflowOptions` allows
2723/// the LSP action to respect user-configured reflow mode, abbreviations, etc.
2724///
2725/// # Returns
2726///
2727/// Returns `Some(ParagraphReflow)` with byte offsets and reflowed text, or `None`
2728/// if the line is out of bounds or sits inside a non-reflow-able construct.
2729pub fn reflow_paragraph_at_line_with_options(
2730    content: &str,
2731    line_number: usize,
2732    options: &ReflowOptions,
2733) -> Option<ParagraphReflow> {
2734    if line_number == 0 {
2735        return None;
2736    }
2737
2738    let lines: Vec<&str> = content.lines().collect();
2739
2740    // Check if line number is valid (1-based)
2741    if line_number > lines.len() {
2742        return None;
2743    }
2744
2745    let target_idx = line_number - 1; // Convert to 0-based
2746    let target_line = lines[target_idx];
2747    let trimmed = target_line.trim();
2748
2749    // Handle blockquote paragraphs (including lazy continuation lines) with
2750    // style-preserving output.
2751    if let Some(blockquote_reflow) = reflow_blockquote_paragraph_at_line(content, &lines, target_idx, options) {
2752        return Some(blockquote_reflow);
2753    }
2754
2755    // Don't reflow special blocks
2756    if is_paragraph_boundary(trimmed, target_line) {
2757        return None;
2758    }
2759
2760    // Find paragraph start - scan backward until blank line or special block
2761    let mut para_start = target_idx;
2762    while para_start > 0 {
2763        let prev_idx = para_start - 1;
2764        let prev_line = lines[prev_idx];
2765        let prev_trimmed = prev_line.trim();
2766
2767        // Stop at blank line or special blocks
2768        if is_paragraph_boundary(prev_trimmed, prev_line) {
2769            break;
2770        }
2771
2772        para_start = prev_idx;
2773    }
2774
2775    // Find paragraph end - scan forward until blank line or special block
2776    let mut para_end = target_idx;
2777    while para_end + 1 < lines.len() {
2778        let next_idx = para_end + 1;
2779        let next_line = lines[next_idx];
2780        let next_trimmed = next_line.trim();
2781
2782        // Stop at blank line or special blocks
2783        if is_paragraph_boundary(next_trimmed, next_line) {
2784            break;
2785        }
2786
2787        para_end = next_idx;
2788    }
2789
2790    // Extract paragraph lines
2791    let paragraph_lines = &lines[para_start..=para_end];
2792
2793    // Calculate byte offsets
2794    let mut start_byte = 0;
2795    for line in lines.iter().take(para_start) {
2796        start_byte += line.len() + 1; // +1 for newline
2797    }
2798
2799    let mut end_byte = start_byte;
2800    for line in paragraph_lines.iter() {
2801        end_byte += line.len() + 1; // +1 for newline
2802    }
2803
2804    // Track whether the byte range includes a trailing newline
2805    // (it doesn't if this is the last line and the file doesn't end with newline)
2806    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2807
2808    // Adjust end_byte if the last line doesn't have a newline
2809    if !includes_trailing_newline {
2810        end_byte -= 1;
2811    }
2812
2813    // Join paragraph lines and reflow
2814    let paragraph_text = paragraph_lines.join("\n");
2815
2816    // Reflow the paragraph using reflow_markdown to handle it properly
2817    let reflowed = reflow_markdown(&paragraph_text, options);
2818
2819    // Ensure reflowed text matches whether the byte range includes a trailing newline
2820    // This is critical: if the range includes a newline, the replacement must too,
2821    // otherwise the next line will get appended to the reflowed paragraph
2822    let reflowed_text = if includes_trailing_newline {
2823        // Range includes newline - ensure reflowed text has one
2824        if reflowed.ends_with('\n') {
2825            reflowed
2826        } else {
2827            format!("{reflowed}\n")
2828        }
2829    } else {
2830        // Range doesn't include newline - ensure reflowed text doesn't have one
2831        if reflowed.ends_with('\n') {
2832            reflowed.trim_end_matches('\n').to_string()
2833        } else {
2834            reflowed
2835        }
2836    };
2837
2838    Some(ParagraphReflow {
2839        start_byte,
2840        end_byte,
2841        reflowed_text,
2842    })
2843}
2844
// Inline unit tests for the line-classification helpers and the reflow entry
// points that are reachable from this module through `super::*`.
//
// Broader public-API and integration coverage is kept in
// tests/utils/text_reflow_test.rs.
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `text_ends_with_abbreviation()` against the built-in abbreviation set.
    ///
    /// The helper and `get_abbreviations()` are imported from
    /// `crate::utils::sentence_utils`; passing `&None` requests the set without
    /// any extra entries supplied by this test.
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let abbreviations = get_abbreviations(&None);
        let ends_with_abbreviation = |text: &str| text_ends_with_abbreviation(text, &abbreviations);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognized.
        let recognized = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognized {
            assert!(
                ends_with_abbreviation(text),
                "expected {text:?} to end with a built-in abbreviation"
            );
        }

        // Everything below must be rejected.
        let rejected = [
            "etc.", // NOT in the built-in list (etc doesn't always have a period)
            "paradigms.",
            "programs.",
            "items.",
            "systems.",
            "Dr?",        // question mark, not period
            "Mr!",        // exclamation, not period
            "paradigms?", // question mark
            "word",       // no punctuation
            "",           // empty string
        ];
        for text in rejected {
            assert!(
                !ends_with_abbreviation(text),
                "expected {text:?} NOT to end with a built-in abbreviation"
            );
        }
    }

    /// Checks which lines `is_unordered_list_marker()` accepts as bullet items.
    #[test]
    fn test_is_unordered_list_marker() {
        let markers = [
            "- item", "* item", "+ item", //
            "-", // lone marker
            "*", "+",
        ];
        for line in markers {
            assert!(
                is_unordered_list_marker(line),
                "expected {line:?} to be an unordered list marker"
            );
        }

        let non_markers = [
            "---",        // horizontal rule
            "***",        // horizontal rule
            "- - -",      // horizontal rule
            "* * *",      // horizontal rule
            "*emphasis*", // emphasis, not list
            "-word",      // no space after marker
            "",           // empty
            "text",       // plain text
            "# heading",  // heading
        ];
        for line in non_markers {
            assert!(
                !is_unordered_list_marker(line),
                "expected {line:?} NOT to be an unordered list marker"
            );
        }
    }

    /// Checks which lines `is_block_boundary()` treats as the start of a new block
    /// and which it leaves as paragraph continuation.
    #[test]
    fn test_is_block_boundary() {
        let boundaries = [
            "",                           // empty line
            "# Heading",                  // ATX heading
            "## Level 2",                 // ATX heading
            "```rust",                    // code fence
            "~~~",                        // tilde code fence
            "> quote",                    // blockquote
            "| cell |",                   // table
            "[link]: http://example.com", // reference def
            "---",                        // horizontal rule
            "***",                        // horizontal rule
            "- item",                     // unordered list
            "* item",                     // unordered list
            "+ item",                     // unordered list
            "1. item",                    // ordered list
            "10. item",                   // ordered list
            ": definition",               // definition list
            ":::",                        // div marker
            "::::: {.callout-note}",      // div marker with attrs
        ];
        for line in boundaries {
            assert!(is_block_boundary(line), "expected {line:?} to be a block boundary");
        }

        // Paragraph continuation: none of these may end the current block.
        let continuations = [
            "regular text",
            "*emphasis*",  // emphasis, not list
            "[link](url)", // inline link, not reference def
            "some words here",
        ];
        for line in continuations {
            assert!(
                !is_block_boundary(line),
                "expected {line:?} NOT to be a block boundary"
            );
        }
    }

    /// A definition-list item that directly follows a one-line paragraph must stay
    /// on its own line instead of being merged into that paragraph.
    #[test]
    fn test_definition_list_boundary_in_single_line_paragraph() {
        let result = reflow_markdown(
            "Term\n: Definition of the term",
            &ReflowOptions {
                line_length: 80,
                ..Default::default()
            },
        );

        // The definition marker has to survive somewhere in the output...
        assert!(
            result.contains(": Definition"),
            "Definition list item should not be merged into previous line. Got: {result:?}"
        );

        // ...and the output has to keep exactly the two original lines.
        let lines = result.lines().collect::<Vec<&str>>();
        assert_eq!(lines.len(), 2, "Should remain two separate lines. Got: {lines:?}");
        let (term, definition) = (lines[0], lines[1]);
        assert_eq!(term, "Term");
        assert_eq!(definition, ": Definition of the term");
    }

    /// Checks `is_paragraph_boundary()`.
    ///
    /// Each case is an argument pair; in the cases used here the first argument is
    /// the second one with its leading whitespace removed.
    #[test]
    fn test_is_paragraph_boundary() {
        let boundaries = [
            // Core block boundary checks are inherited
            ("# Heading", "# Heading"),
            ("- item", "- item"),
            (":::", ":::"),
            (": definition", ": definition"),
            // Indented code blocks (≥4 spaces or tab)
            ("code", "    code"),
            ("code", "\tcode"),
            // Table rows via is_potential_table_row
            ("| a | b |", "| a | b |"),
            ("a | b", "a | b"), // pipe-delimited without leading pipe
        ];
        for (trimmed, line) in boundaries {
            assert!(
                is_paragraph_boundary(trimmed, line),
                "expected ({trimmed:?}, {line:?}) to be a paragraph boundary"
            );
        }

        let continuations = [
            ("regular text", "regular text"),
            ("text", "  text"), // 2-space indent is not code
        ];
        for (trimmed, line) in continuations {
            assert!(
                !is_paragraph_boundary(trimmed, line),
                "expected ({trimmed:?}, {line:?}) NOT to be a paragraph boundary"
            );
        }
    }

    /// Asking `reflow_paragraph_at_line` to reflow at a div marker (`:::`) line
    /// must yield nothing, so reflow never crosses a div boundary.
    #[test]
    fn test_div_marker_boundary_in_reflow_paragraph_at_line() {
        let content = "Some paragraph text here.\n\n::: {.callout-note}\nThis is a callout.\n:::\n";
        // Line 3 is the div marker — should not be reflowed
        let div_marker_line = 3;
        let line_length = 80;

        let result = reflow_paragraph_at_line(content, div_marker_line, line_length);
        assert!(result.is_none(), "Div marker line should not be reflowed");
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs