rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::calculate_indentation_width_default;
7use crate::utils::is_definition_list_item;
8use crate::utils::mkdocs_attr_list::{ATTR_LIST_PATTERN, is_standalone_attr_list};
9use crate::utils::mkdocs_snippets::is_snippet_block_delimiter;
10use crate::utils::regex_cache::{
11    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
12    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX, LINKED_IMAGE_INLINE_INLINE,
13    LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF, REF_IMAGE_REGEX, REF_LINK_REGEX,
14    SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
15};
16use crate::utils::sentence_utils::{
17    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
18    text_ends_with_abbreviation,
19};
20use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
21use std::collections::HashSet;
22use unicode_width::UnicodeWidthStr;
23
/// Length calculation mode for reflow.
///
/// Selects how the length of a string is measured when it is compared
/// against the configured line length.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ReflowLengthMode {
    /// Count Unicode scalar values (`char`s).
    ///
    /// Note that this is not a grapheme-cluster count: a user-perceived
    /// character built from several code points is counted once per code point.
    Chars,
    /// Count visual display width (CJK = 2 columns, emoji = 2, etc.)
    #[default]
    Visual,
    /// Count raw bytes
    Bytes,
}
35
36/// Calculate the display length of a string based on the length mode
37fn display_len(s: &str, mode: ReflowLengthMode) -> usize {
38    match mode {
39        ReflowLengthMode::Chars => s.chars().count(),
40        ReflowLengthMode::Visual => s.width(),
41        ReflowLengthMode::Bytes => s.len(),
42    }
43}
44
45/// Options for reflowing text
46#[derive(Clone)]
47pub struct ReflowOptions {
48    /// Target line length
49    pub line_length: usize,
50    /// Whether to break on sentence boundaries when possible
51    pub break_on_sentences: bool,
52    /// Whether to preserve existing line breaks in paragraphs
53    pub preserve_breaks: bool,
54    /// Whether to enforce one sentence per line
55    pub sentence_per_line: bool,
56    /// Whether to use semantic line breaks (cascading split strategy)
57    pub semantic_line_breaks: bool,
58    /// Custom abbreviations for sentence detection
59    /// Periods are optional - both "Dr" and "Dr." work the same
60    /// Custom abbreviations are always added to the built-in defaults
61    pub abbreviations: Option<Vec<String>>,
62    /// How to measure string length for line-length comparisons
63    pub length_mode: ReflowLengthMode,
64    /// Whether to treat {#id .class key="value"} as atomic (unsplittable) elements.
65    /// Enabled for MkDocs and Kramdown flavors.
66    pub attr_lists: bool,
67    /// Whether to require uppercase after periods for sentence detection.
68    /// When true (default), only "word. Capital" is a sentence boundary.
69    /// When false, "word. lowercase" is also treated as a sentence boundary.
70    /// Does not affect ! and ? which are always treated as sentence boundaries.
71    pub require_sentence_capital: bool,
72    /// Cap list continuation indent to this value when set.
73    /// Used by mkdocs flavor where continuation is always 4 spaces
74    /// regardless of checkbox markers.
75    pub max_list_continuation_indent: Option<usize>,
76}
77
78impl Default for ReflowOptions {
79    fn default() -> Self {
80        Self {
81            line_length: 80,
82            break_on_sentences: true,
83            preserve_breaks: false,
84            sentence_per_line: false,
85            semantic_line_breaks: false,
86            abbreviations: None,
87            length_mode: ReflowLengthMode::default(),
88            attr_lists: false,
89            require_sentence_capital: true,
90            max_list_continuation_indent: None,
91        }
92    }
93}
94
95/// Detect if a character position is a sentence boundary
96/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
97/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
98fn is_sentence_boundary(
99    text: &str,
100    pos: usize,
101    abbreviations: &HashSet<String>,
102    require_sentence_capital: bool,
103) -> bool {
104    let chars: Vec<char> = text.chars().collect();
105
106    if pos + 1 >= chars.len() {
107        return false;
108    }
109
110    let c = chars[pos];
111    let next_char = chars[pos + 1];
112
113    // Check for CJK sentence-ending punctuation (。, ！, ？)
114    // CJK punctuation doesn't require space or uppercase after it
115    if is_cjk_sentence_ending(c) {
116        // Skip any trailing emphasis/strikethrough markers
117        let mut after_punct_pos = pos + 1;
118        while after_punct_pos < chars.len()
119            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
120        {
121            after_punct_pos += 1;
122        }
123
124        // Skip whitespace
125        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
126            after_punct_pos += 1;
127        }
128
129        // Check if we have more content (any non-whitespace)
130        if after_punct_pos >= chars.len() {
131            return false;
132        }
133
134        // Skip leading emphasis/strikethrough markers
135        while after_punct_pos < chars.len()
136            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
137        {
138            after_punct_pos += 1;
139        }
140
141        if after_punct_pos >= chars.len() {
142            return false;
143        }
144
145        // For CJK, we accept any character as the start of the next sentence
146        // (no uppercase requirement, since CJK doesn't have case)
147        return true;
148    }
149
150    // Check for ASCII sentence-ending punctuation
151    if c != '.' && c != '!' && c != '?' {
152        return false;
153    }
154
155    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
156    let (_space_pos, after_space_pos) = if next_char == ' ' {
157        // Normal case: punctuation followed by space
158        (pos + 1, pos + 2)
159    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
160        // Sentence ends with quote - check what follows the quote
161        if chars[pos + 2] == ' ' {
162            // Just quote followed by space: 'sentence." '
163            (pos + 2, pos + 3)
164        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
165            // Quote followed by emphasis: 'sentence."* '
166            (pos + 3, pos + 4)
167        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
168            && pos + 4 < chars.len()
169            && chars[pos + 3] == chars[pos + 2]
170            && chars[pos + 4] == ' '
171        {
172            // Quote followed by bold: 'sentence."** '
173            (pos + 4, pos + 5)
174        } else {
175            return false;
176        }
177    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
178        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
179        (pos + 2, pos + 3)
180    } else if (next_char == '*' || next_char == '_')
181        && pos + 3 < chars.len()
182        && chars[pos + 2] == next_char
183        && chars[pos + 3] == ' '
184    {
185        // Sentence ends with bold: "sentence.** " or "sentence.__ "
186        (pos + 3, pos + 4)
187    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
188        // Sentence ends with strikethrough: "sentence.~~ "
189        (pos + 3, pos + 4)
190    } else {
191        return false;
192    };
193
194    // Skip all whitespace after the space to find the start of the next sentence
195    let mut next_char_pos = after_space_pos;
196    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
197        next_char_pos += 1;
198    }
199
200    // Check if we reached the end of the string
201    if next_char_pos >= chars.len() {
202        return false;
203    }
204
205    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
206    let mut first_letter_pos = next_char_pos;
207    while first_letter_pos < chars.len()
208        && (chars[first_letter_pos] == '*'
209            || chars[first_letter_pos] == '_'
210            || chars[first_letter_pos] == '~'
211            || is_opening_quote(chars[first_letter_pos]))
212    {
213        first_letter_pos += 1;
214    }
215
216    // Check if we reached the end after skipping emphasis
217    if first_letter_pos >= chars.len() {
218        return false;
219    }
220
221    let first_char = chars[first_letter_pos];
222
223    // For ! and ?, sentence boundaries are unambiguous — no uppercase requirement
224    if c == '!' || c == '?' {
225        return true;
226    }
227
228    // Period-specific checks: periods are ambiguous (abbreviations, decimals, initials)
229    // so we apply additional guards before accepting a sentence boundary.
230
231    if pos > 0 {
232        // Check for common abbreviations
233        let byte_offset: usize = chars[..=pos].iter().map(|ch| ch.len_utf8()).sum();
234        if text_ends_with_abbreviation(&text[..byte_offset], abbreviations) {
235            return false;
236        }
237
238        // Check for decimal numbers (e.g., "3.14 is pi")
239        if chars[pos - 1].is_numeric() && first_char.is_ascii_digit() {
240            return false;
241        }
242
243        // Check for single-letter initials (e.g., "J. K. Rowling")
244        // A single uppercase letter before the period preceded by whitespace or start
245        // is likely an initial, not a sentence ending.
246        if chars[pos - 1].is_ascii_uppercase() && (pos == 1 || (pos >= 2 && chars[pos - 2].is_whitespace())) {
247            return false;
248        }
249    }
250
251    // In strict mode, require uppercase or CJK to start the next sentence after a period.
252    // In relaxed mode, accept any alphanumeric character.
253    if require_sentence_capital && !first_char.is_uppercase() && !is_cjk_char(first_char) {
254        return false;
255    }
256
257    true
258}
259
260/// Split text into sentences
261pub fn split_into_sentences(text: &str) -> Vec<String> {
262    split_into_sentences_custom(text, &None)
263}
264
265/// Split text into sentences with custom abbreviations
266pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
267    let abbreviations = get_abbreviations(custom_abbreviations);
268    split_into_sentences_with_set(text, &abbreviations, true)
269}
270
271/// Internal function to split text into sentences with a pre-computed abbreviations set
272/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
273fn split_into_sentences_with_set(
274    text: &str,
275    abbreviations: &HashSet<String>,
276    require_sentence_capital: bool,
277) -> Vec<String> {
278    let mut sentences = Vec::new();
279    let mut current_sentence = String::new();
280    let mut chars = text.chars().peekable();
281    let mut pos = 0;
282
283    while let Some(c) = chars.next() {
284        current_sentence.push(c);
285
286        if is_sentence_boundary(text, pos, abbreviations, require_sentence_capital) {
287            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
288            while let Some(&next) = chars.peek() {
289                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
290                    current_sentence.push(chars.next().unwrap());
291                    pos += 1;
292                } else {
293                    break;
294                }
295            }
296
297            // Consume the space after the sentence
298            if chars.peek() == Some(&' ') {
299                chars.next();
300                pos += 1;
301            }
302
303            sentences.push(current_sentence.trim().to_string());
304            current_sentence.clear();
305        }
306
307        pos += 1;
308    }
309
310    // Add any remaining text as the last sentence
311    if !current_sentence.trim().is_empty() {
312        sentences.push(current_sentence.trim().to_string());
313    }
314    sentences
315}
316
/// Check if a line is a horizontal rule (`---`, `___`, `***`, `- - -`, ...).
///
/// The line must start with `-`, `_` or `*`, consist only of that same
/// character and ASCII spaces, and contain the marker at least three times.
/// Leading whitespace is not accepted; the callers visible in this file pass
/// an already-trimmed line.
fn is_horizontal_rule(line: &str) -> bool {
    // The first character fixes which marker the whole line must use.
    let marker = match line.chars().next() {
        Some(c @ ('-' | '_' | '*')) => c,
        _ => return false,
    };

    // Single pass: count markers and bail out on anything that is neither the
    // marker nor a space.
    let mut marker_count = 0usize;
    for c in line.chars() {
        if c == marker {
            marker_count += 1;
        } else if c != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
345
/// Check if a line is a numbered list item (e.g., "1. ", "10. ").
///
/// The line must start with one or more ASCII digits followed by ". ".
/// Only ASCII digits count: other Unicode numerics such as `²`, `½` or `٣`
/// do not form Markdown ordered-list markers.
///
/// A period without a following space ("2019." on its own) is NOT treated as
/// a list item, to avoid false positives.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are single bytes, so the count is also a valid byte offset
    // that lies on a char boundary.
    let digits_len = line.bytes().take_while(u8::is_ascii_digit).count();
    digits_len > 0 && line[digits_len..].starts_with(". ")
}
369
370/// Check if a trimmed line is an unordered list item (-, *, + followed by space)
371fn is_unordered_list_marker(s: &str) -> bool {
372    matches!(s.as_bytes().first(), Some(b'-' | b'*' | b'+'))
373        && !is_horizontal_rule(s)
374        && (s.len() == 1 || s.as_bytes().get(1) == Some(&b' '))
375}
376
377/// Shared structural checks for block boundary detection.
378/// Checks elements that only depend on the trimmed line content.
379fn is_block_boundary_core(trimmed: &str) -> bool {
380    trimmed.is_empty()
381        || trimmed.starts_with('#')
382        || trimmed.starts_with("```")
383        || trimmed.starts_with("~~~")
384        || trimmed.starts_with('>')
385        || (trimmed.starts_with('[') && trimmed.contains("]:"))
386        || is_horizontal_rule(trimmed)
387        || is_unordered_list_marker(trimmed)
388        || is_numbered_list_item(trimmed)
389        || is_definition_list_item(trimmed)
390        || trimmed.starts_with(":::")
391}
392
393/// Check if a trimmed line starts a new structural block element.
394/// Used for paragraph boundary detection in `reflow_markdown()`.
395fn is_block_boundary(trimmed: &str) -> bool {
396    is_block_boundary_core(trimmed) || trimmed.starts_with('|')
397}
398
399/// Check if a line starts a new structural block for paragraph boundary detection
400/// in `reflow_paragraph_at_line()`. Extends the core checks with indented code blocks
401/// (≥4 spaces) and table row detection via `is_potential_table_row`.
402fn is_paragraph_boundary(trimmed: &str, line: &str) -> bool {
403    is_block_boundary_core(trimmed)
404        || calculate_indentation_width_default(line) >= 4
405        || crate::utils::table_utils::TableUtils::is_potential_table_row(line)
406}
407
/// Report whether `line` ends in a hard line break.
///
/// Two endings are recognised after a single trailing `\r` (if any) is ignored:
/// 1. Two or more trailing spaces
/// 2. A trailing backslash
fn has_hard_break(line: &str) -> bool {
    let without_cr = match line.strip_suffix('\r') {
        Some(rest) => rest,
        None => line,
    };
    ["  ", "\\"].iter().any(|&marker| without_cr.ends_with(marker))
}
417
/// Check whether the final character of `text` is `.`, `!` or `?`.
fn ends_with_sentence_punct(text: &str) -> bool {
    matches!(text.chars().next_back(), Some('.' | '!' | '?'))
}
422
/// Trim trailing whitespace from `s` while keeping a hard line break intact.
///
/// A single trailing `\r` (CRLF input) is removed first. After that:
/// * a line ending in a backslash is returned unchanged (backslash hard break);
/// * a line ending in two or more spaces is trimmed and given back exactly two
///   trailing spaces (traditional hard break), unless it is all whitespace, in
///   which case the result is empty;
/// * any other line simply loses its trailing whitespace.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the text exactly as it is.
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();

    // Without a two-space hard break, or with nothing but whitespace, the
    // trimmed content is the whole answer.
    if content.is_empty() || !line.ends_with("  ") {
        return content.to_owned();
    }

    // Normalise the hard break to exactly two trailing spaces.
    let mut out = String::with_capacity(content.len() + 2);
    out.push_str(content);
    out.push_str("  ");
    out
}
453
454/// Parse markdown elements using the appropriate parser based on options.
455fn parse_elements(text: &str, options: &ReflowOptions) -> Vec<Element> {
456    if options.attr_lists {
457        parse_markdown_elements_with_attr_lists(text)
458    } else {
459        parse_markdown_elements(text)
460    }
461}
462
463pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
464    // For sentence-per-line mode, always process regardless of length
465    if options.sentence_per_line {
466        let elements = parse_elements(line, options);
467        return reflow_elements_sentence_per_line(&elements, &options.abbreviations, options.require_sentence_capital);
468    }
469
470    // For semantic line breaks mode, use cascading split strategy
471    if options.semantic_line_breaks {
472        let elements = parse_elements(line, options);
473        return reflow_elements_semantic(&elements, options);
474    }
475
476    // Quick check: if line is already short enough or no wrapping requested, return as-is
477    // line_length = 0 means no wrapping (unlimited line length)
478    if options.line_length == 0 || display_len(line, options.length_mode) <= options.line_length {
479        return vec![line.to_string()];
480    }
481
482    // Parse the markdown to identify elements
483    let elements = parse_elements(line, options);
484
485    // Reflow the elements into lines
486    reflow_elements(&elements, options)
487}
488
/// Where the image inside a linked image comes from
#[derive(Clone, Debug)]
enum LinkedImageSource {
    /// URL given inline, as in `![alt](url)`
    Inline(String),
    /// Reference label, as in `![alt][ref]`
    Reference(String),
}
497
/// Where the link wrapped around a linked image points to
#[derive(Clone, Debug)]
enum LinkedImageTarget {
    /// URL given inline, i.e. the trailing `](url)` part
    Inline(String),
    /// Reference label, i.e. the trailing `][ref]` part
    Reference(String),
}
506
507/// Represents a piece of content in the markdown
508#[derive(Debug, Clone)]
509enum Element {
510    /// Plain text that can be wrapped
511    Text(String),
512    /// A complete markdown inline link [text](url)
513    Link { text: String, url: String },
514    /// A complete markdown reference link [text][ref]
515    ReferenceLink { text: String, reference: String },
516    /// A complete markdown empty reference link [text][]
517    EmptyReferenceLink { text: String },
518    /// A complete markdown shortcut reference link [ref]
519    ShortcutReference { reference: String },
520    /// A complete markdown inline image ![alt](url)
521    InlineImage { alt: String, url: String },
522    /// A complete markdown reference image ![alt][ref]
523    ReferenceImage { alt: String, reference: String },
524    /// A complete markdown empty reference image ![alt][]
525    EmptyReferenceImage { alt: String },
526    /// A clickable image badge in any of 4 forms:
527    /// - [![alt](img-url)](link-url)
528    /// - [![alt][img-ref]](link-url)
529    /// - [![alt](img-url)][link-ref]
530    /// - [![alt][img-ref]][link-ref]
531    LinkedImage {
532        alt: String,
533        img_source: LinkedImageSource,
534        link_target: LinkedImageTarget,
535    },
536    /// Footnote reference [^note]
537    FootnoteReference { note: String },
538    /// Strikethrough text ~~text~~
539    Strikethrough(String),
540    /// Wiki-style link [[wiki]] or [[wiki|text]]
541    WikiLink(String),
542    /// Inline math $math$
543    InlineMath(String),
544    /// Display math $$math$$
545    DisplayMath(String),
546    /// Emoji shortcode :emoji:
547    EmojiShortcode(String),
548    /// Autolink <https://...> or <mailto:...> or <user@domain.com>
549    Autolink(String),
550    /// HTML tag <tag> or </tag> or <tag/>
551    HtmlTag(String),
552    /// HTML entity &nbsp; or &#123;
553    HtmlEntity(String),
554    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}}
555    HugoShortcode(String),
556    /// MkDocs/kramdown attribute list {#id .class key="value"}
557    AttrList(String),
558    /// Inline code `code`
559    Code(String),
560    /// Bold text **text** or __text__
561    Bold {
562        content: String,
563        /// True if underscore markers (__), false for asterisks (**)
564        underscore: bool,
565    },
566    /// Italic text *text* or _text_
567    Italic {
568        content: String,
569        /// True if underscore marker (_), false for asterisk (*)
570        underscore: bool,
571    },
572}
573
574impl std::fmt::Display for Element {
575    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
576        match self {
577            Element::Text(s) => write!(f, "{s}"),
578            Element::Link { text, url } => write!(f, "[{text}]({url})"),
579            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
580            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
581            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
582            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
583            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
584            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
585            Element::LinkedImage {
586                alt,
587                img_source,
588                link_target,
589            } => {
590                // Build the image part: ![alt](url) or ![alt][ref]
591                let img_part = match img_source {
592                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
593                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
594                };
595                // Build the link part: (url) or [ref]
596                match link_target {
597                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
598                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
599                }
600            }
601            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
602            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
603            Element::WikiLink(s) => write!(f, "[[{s}]]"),
604            Element::InlineMath(s) => write!(f, "${s}$"),
605            Element::DisplayMath(s) => write!(f, "$${s}$$"),
606            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
607            Element::Autolink(s) => write!(f, "{s}"),
608            Element::HtmlTag(s) => write!(f, "{s}"),
609            Element::HtmlEntity(s) => write!(f, "{s}"),
610            Element::HugoShortcode(s) => write!(f, "{s}"),
611            Element::AttrList(s) => write!(f, "{s}"),
612            Element::Code(s) => write!(f, "`{s}`"),
613            Element::Bold { content, underscore } => {
614                if *underscore {
615                    write!(f, "__{content}__")
616                } else {
617                    write!(f, "**{content}**")
618                }
619            }
620            Element::Italic { content, underscore } => {
621                if *underscore {
622                    write!(f, "_{content}_")
623                } else {
624                    write!(f, "*{content}*")
625                }
626            }
627        }
628    }
629}
630
631impl Element {
632    /// Calculate the display width of this element using the given length mode.
633    /// This formats the element and computes its width, correctly handling
634    /// visual width for CJK characters and other wide glyphs.
635    fn display_width(&self, mode: ReflowLengthMode) -> usize {
636        let formatted = format!("{self}");
637        display_len(&formatted, mode)
638    }
639}
640
/// A formatting span (emphasis, strong or strikethrough) found by pulldown-cmark
#[derive(Clone, Debug)]
struct EmphasisSpan {
    /// Byte offset of the opening marker
    start: usize,
    /// Byte offset just past the closing marker
    end: usize,
    /// Text between the opening and closing markers
    content: String,
    /// Set for strong (bold) emphasis
    is_strong: bool,
    /// Set for strikethrough (`~~text~~`)
    is_strikethrough: bool,
    /// Set when the source used underscore markers (emphasis/strong only)
    uses_underscore: bool,
}
657
658/// Extract emphasis and strikethrough spans from text using pulldown-cmark
659///
660/// This provides CommonMark-compliant emphasis parsing, correctly handling:
661/// - Nested emphasis like `*text **bold** more*`
662/// - Left/right flanking delimiter rules
663/// - Underscore vs asterisk markers
664/// - GFM strikethrough (~~text~~)
665///
666/// Returns spans sorted by start position.
667fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
668    let mut spans = Vec::new();
669    let mut options = Options::empty();
670    options.insert(Options::ENABLE_STRIKETHROUGH);
671
672    // Stacks to track nested formatting with their start positions
673    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
674    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
675    let mut strikethrough_stack: Vec<usize> = Vec::new();
676
677    let parser = Parser::new_ext(text, options).into_offset_iter();
678
679    for (event, range) in parser {
680        match event {
681            Event::Start(Tag::Emphasis) => {
682                // Check if this uses underscore by looking at the original text
683                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
684                emphasis_stack.push((range.start, uses_underscore));
685            }
686            Event::End(TagEnd::Emphasis) => {
687                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
688                    // Extract content between the markers (1 char marker on each side)
689                    let content_start = start_byte + 1;
690                    let content_end = range.end - 1;
691                    if content_end > content_start
692                        && let Some(content) = text.get(content_start..content_end)
693                    {
694                        spans.push(EmphasisSpan {
695                            start: start_byte,
696                            end: range.end,
697                            content: content.to_string(),
698                            is_strong: false,
699                            is_strikethrough: false,
700                            uses_underscore,
701                        });
702                    }
703                }
704            }
705            Event::Start(Tag::Strong) => {
706                // Check if this uses underscore by looking at the original text
707                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
708                strong_stack.push((range.start, uses_underscore));
709            }
710            Event::End(TagEnd::Strong) => {
711                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
712                    // Extract content between the markers (2 char marker on each side)
713                    let content_start = start_byte + 2;
714                    let content_end = range.end - 2;
715                    if content_end > content_start
716                        && let Some(content) = text.get(content_start..content_end)
717                    {
718                        spans.push(EmphasisSpan {
719                            start: start_byte,
720                            end: range.end,
721                            content: content.to_string(),
722                            is_strong: true,
723                            is_strikethrough: false,
724                            uses_underscore,
725                        });
726                    }
727                }
728            }
729            Event::Start(Tag::Strikethrough) => {
730                strikethrough_stack.push(range.start);
731            }
732            Event::End(TagEnd::Strikethrough) => {
733                if let Some(start_byte) = strikethrough_stack.pop() {
734                    // Extract content between the ~~ markers (2 char marker on each side)
735                    let content_start = start_byte + 2;
736                    let content_end = range.end - 2;
737                    if content_end > content_start
738                        && let Some(content) = text.get(content_start..content_end)
739                    {
740                        spans.push(EmphasisSpan {
741                            start: start_byte,
742                            end: range.end,
743                            content: content.to_string(),
744                            is_strong: false,
745                            is_strikethrough: true,
746                            uses_underscore: false,
747                        });
748                    }
749                }
750            }
751            _ => {}
752        }
753    }
754
755    // Sort by start position
756    spans.sort_by_key(|s| s.start);
757    spans
758}
759
760/// Parse markdown elements from text preserving the raw syntax
761///
762/// Detection order is critical:
763/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
764/// 2. Inline images ![alt](url) - before links to handle ! prefix
765/// 3. Reference images ![alt][ref] - before reference links
766/// 4. Inline links [text](url) - before reference links
767/// 5. Reference links [text][ref] - before shortcut references
768/// 6. Shortcut reference links [ref] - detected last to avoid false positives
769/// 7. Other elements (code, bold, italic, etc.) - processed normally
770fn parse_markdown_elements(text: &str) -> Vec<Element> {
771    parse_markdown_elements_inner(text, false)
772}
773
774fn parse_markdown_elements_with_attr_lists(text: &str) -> Vec<Element> {
775    parse_markdown_elements_inner(text, true)
776}
777
778fn parse_markdown_elements_inner(text: &str, attr_lists: bool) -> Vec<Element> {
779    let mut elements = Vec::new();
780    let mut remaining = text;
781
782    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
783    let emphasis_spans = extract_emphasis_spans(text);
784
785    while !remaining.is_empty() {
786        // Calculate current byte offset in original text
787        let current_offset = text.len() - remaining.len();
788        // Find the earliest occurrence of any markdown pattern
789        // Store (start, end, pattern_name) to unify standard Regex and FancyRegex match results
790        let mut earliest_match: Option<(usize, usize, &str)> = None;
791
792        // Check for linked images FIRST (all 4 variants)
793        // Quick literal check: only run expensive regexes if we might have a linked image
794        // Pattern starts with "[!" so check for that first
795        if remaining.contains("[!") {
796            // Pattern 1: [![alt](img)](link) - inline image in inline link
797            if let Some(m) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
798                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
799            {
800                earliest_match = Some((m.start(), m.end(), "linked_image_ii"));
801            }
802
803            // Pattern 2: [![alt][ref]](link) - reference image in inline link
804            if let Some(m) = LINKED_IMAGE_REF_INLINE.find(remaining)
805                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
806            {
807                earliest_match = Some((m.start(), m.end(), "linked_image_ri"));
808            }
809
810            // Pattern 3: [![alt](img)][ref] - inline image in reference link
811            if let Some(m) = LINKED_IMAGE_INLINE_REF.find(remaining)
812                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
813            {
814                earliest_match = Some((m.start(), m.end(), "linked_image_ir"));
815            }
816
817            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
818            if let Some(m) = LINKED_IMAGE_REF_REF.find(remaining)
819                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
820            {
821                earliest_match = Some((m.start(), m.end(), "linked_image_rr"));
822            }
823        }
824
825        // Check for images (they start with ! so should be detected before links)
826        // Inline images - ![alt](url)
827        if let Some(m) = INLINE_IMAGE_REGEX.find(remaining)
828            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
829        {
830            earliest_match = Some((m.start(), m.end(), "inline_image"));
831        }
832
833        // Reference images - ![alt][ref]
834        if let Some(m) = REF_IMAGE_REGEX.find(remaining)
835            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
836        {
837            earliest_match = Some((m.start(), m.end(), "ref_image"));
838        }
839
840        // Check for footnote references - [^note]
841        if let Some(m) = FOOTNOTE_REF_REGEX.find(remaining)
842            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
843        {
844            earliest_match = Some((m.start(), m.end(), "footnote_ref"));
845        }
846
847        // Check for inline links - [text](url)
848        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
849            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
850        {
851            earliest_match = Some((m.start(), m.end(), "inline_link"));
852        }
853
854        // Check for reference links - [text][ref]
855        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
856            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
857        {
858            earliest_match = Some((m.start(), m.end(), "ref_link"));
859        }
860
861        // Check for shortcut reference links - [ref]
862        // Only check if we haven't found an earlier pattern that would conflict
863        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
864            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
865        {
866            earliest_match = Some((m.start(), m.end(), "shortcut_ref"));
867        }
868
869        // Check for wiki-style links - [[wiki]]
870        if let Some(m) = WIKI_LINK_REGEX.find(remaining)
871            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
872        {
873            earliest_match = Some((m.start(), m.end(), "wiki_link"));
874        }
875
876        // Check for display math first (before inline) - $$math$$
877        if let Some(m) = DISPLAY_MATH_REGEX.find(remaining)
878            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
879        {
880            earliest_match = Some((m.start(), m.end(), "display_math"));
881        }
882
883        // Check for inline math - $math$
884        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
885            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
886        {
887            earliest_match = Some((m.start(), m.end(), "inline_math"));
888        }
889
890        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
891
892        // Check for emoji shortcodes - :emoji:
893        if let Some(m) = EMOJI_SHORTCODE_REGEX.find(remaining)
894            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
895        {
896            earliest_match = Some((m.start(), m.end(), "emoji"));
897        }
898
899        // Check for HTML entities - &nbsp; etc
900        if let Some(m) = HTML_ENTITY_REGEX.find(remaining)
901            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
902        {
903            earliest_match = Some((m.start(), m.end(), "html_entity"));
904        }
905
906        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
907        // Must be checked before other patterns to avoid false sentence breaks
908        if let Some(m) = HUGO_SHORTCODE_REGEX.find(remaining)
909            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
910        {
911            earliest_match = Some((m.start(), m.end(), "hugo_shortcode"));
912        }
913
914        // Check for HTML tags - <tag> </tag> <tag/>
915        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
916        if let Some(m) = HTML_TAG_PATTERN.find(remaining)
917            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
918        {
919            // Check if this is an autolink (starts with protocol or mailto:)
920            let matched_text = &remaining[m.start()..m.end()];
921            let is_url_autolink = matched_text.starts_with("<http://")
922                || matched_text.starts_with("<https://")
923                || matched_text.starts_with("<mailto:")
924                || matched_text.starts_with("<ftp://")
925                || matched_text.starts_with("<ftps://");
926
927            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
928            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
929            let is_email_autolink = {
930                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
931                EMAIL_PATTERN.is_match(content)
932            };
933
934            if is_url_autolink || is_email_autolink {
935                earliest_match = Some((m.start(), m.end(), "autolink"));
936            } else {
937                earliest_match = Some((m.start(), m.end(), "html_tag"));
938            }
939        }
940
941        // Find earliest non-link special characters
942        let mut next_special = remaining.len();
943        let mut special_type = "";
944        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
945        let mut attr_list_len: usize = 0;
946
947        // Check for code spans (not handled by pulldown-cmark in this context)
948        if let Some(pos) = remaining.find('`')
949            && pos < next_special
950        {
951            next_special = pos;
952            special_type = "code";
953        }
954
955        // Check for MkDocs/kramdown attr lists - {#id .class key="value"}
956        if attr_lists
957            && let Some(pos) = remaining.find('{')
958            && pos < next_special
959            && let Some(m) = ATTR_LIST_PATTERN.find(&remaining[pos..])
960            && m.start() == 0
961        {
962            next_special = pos;
963            special_type = "attr_list";
964            attr_list_len = m.end();
965        }
966
967        // Check for emphasis using pulldown-cmark's pre-extracted spans
968        // Find the earliest emphasis span that starts within remaining text
969        for span in &emphasis_spans {
970            if span.start >= current_offset && span.start < current_offset + remaining.len() {
971                let pos_in_remaining = span.start - current_offset;
972                if pos_in_remaining < next_special {
973                    next_special = pos_in_remaining;
974                    special_type = "pulldown_emphasis";
975                    pulldown_emphasis = Some(span);
976                }
977                break; // Spans are sorted by start position, so first match is earliest
978            }
979        }
980
981        // Determine which pattern to process first
982        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
983            pos < next_special
984        } else {
985            false
986        };
987
988        if should_process_markdown_link {
989            let (pos, match_end, pattern_type) = earliest_match.unwrap();
990
991            // Add any text before the match
992            if pos > 0 {
993                elements.push(Element::Text(remaining[..pos].to_string()));
994            }
995
996            // Process the matched pattern
997            match pattern_type {
998                // Pattern 1: [![alt](img)](link) - inline image in inline link
999                "linked_image_ii" => {
1000                    if let Some(caps) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
1001                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1002                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1003                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1004                        elements.push(Element::LinkedImage {
1005                            alt: alt.to_string(),
1006                            img_source: LinkedImageSource::Inline(img_url.to_string()),
1007                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
1008                        });
1009                        remaining = &remaining[match_end..];
1010                    } else {
1011                        elements.push(Element::Text("[".to_string()));
1012                        remaining = &remaining[1..];
1013                    }
1014                }
1015                // Pattern 2: [![alt][ref]](link) - reference image in inline link
1016                "linked_image_ri" => {
1017                    if let Some(caps) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
1018                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1019                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1020                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1021                        elements.push(Element::LinkedImage {
1022                            alt: alt.to_string(),
1023                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
1024                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
1025                        });
1026                        remaining = &remaining[match_end..];
1027                    } else {
1028                        elements.push(Element::Text("[".to_string()));
1029                        remaining = &remaining[1..];
1030                    }
1031                }
1032                // Pattern 3: [![alt](img)][ref] - inline image in reference link
1033                "linked_image_ir" => {
1034                    if let Some(caps) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
1035                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1036                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1037                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1038                        elements.push(Element::LinkedImage {
1039                            alt: alt.to_string(),
1040                            img_source: LinkedImageSource::Inline(img_url.to_string()),
1041                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1042                        });
1043                        remaining = &remaining[match_end..];
1044                    } else {
1045                        elements.push(Element::Text("[".to_string()));
1046                        remaining = &remaining[1..];
1047                    }
1048                }
1049                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
1050                "linked_image_rr" => {
1051                    if let Some(caps) = LINKED_IMAGE_REF_REF.captures(remaining) {
1052                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1053                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1054                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
1055                        elements.push(Element::LinkedImage {
1056                            alt: alt.to_string(),
1057                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
1058                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
1059                        });
1060                        remaining = &remaining[match_end..];
1061                    } else {
1062                        elements.push(Element::Text("[".to_string()));
1063                        remaining = &remaining[1..];
1064                    }
1065                }
1066                "inline_image" => {
1067                    if let Some(caps) = INLINE_IMAGE_REGEX.captures(remaining) {
1068                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1069                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1070                        elements.push(Element::InlineImage {
1071                            alt: alt.to_string(),
1072                            url: url.to_string(),
1073                        });
1074                        remaining = &remaining[match_end..];
1075                    } else {
1076                        elements.push(Element::Text("!".to_string()));
1077                        remaining = &remaining[1..];
1078                    }
1079                }
1080                "ref_image" => {
1081                    if let Some(caps) = REF_IMAGE_REGEX.captures(remaining) {
1082                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1083                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1084
1085                        if reference.is_empty() {
1086                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
1087                        } else {
1088                            elements.push(Element::ReferenceImage {
1089                                alt: alt.to_string(),
1090                                reference: reference.to_string(),
1091                            });
1092                        }
1093                        remaining = &remaining[match_end..];
1094                    } else {
1095                        elements.push(Element::Text("!".to_string()));
1096                        remaining = &remaining[1..];
1097                    }
1098                }
1099                "footnote_ref" => {
1100                    if let Some(caps) = FOOTNOTE_REF_REGEX.captures(remaining) {
1101                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1102                        elements.push(Element::FootnoteReference { note: note.to_string() });
1103                        remaining = &remaining[match_end..];
1104                    } else {
1105                        elements.push(Element::Text("[".to_string()));
1106                        remaining = &remaining[1..];
1107                    }
1108                }
1109                "inline_link" => {
1110                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
1111                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1112                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1113                        elements.push(Element::Link {
1114                            text: text.to_string(),
1115                            url: url.to_string(),
1116                        });
1117                        remaining = &remaining[match_end..];
1118                    } else {
1119                        // Fallback - shouldn't happen
1120                        elements.push(Element::Text("[".to_string()));
1121                        remaining = &remaining[1..];
1122                    }
1123                }
1124                "ref_link" => {
1125                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1126                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1127                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1128
1129                        if reference.is_empty() {
1130                            // Empty reference link [text][]
1131                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1132                        } else {
1133                            // Regular reference link [text][ref]
1134                            elements.push(Element::ReferenceLink {
1135                                text: text.to_string(),
1136                                reference: reference.to_string(),
1137                            });
1138                        }
1139                        remaining = &remaining[match_end..];
1140                    } else {
1141                        // Fallback - shouldn't happen
1142                        elements.push(Element::Text("[".to_string()));
1143                        remaining = &remaining[1..];
1144                    }
1145                }
1146                "shortcut_ref" => {
1147                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1148                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1149                        elements.push(Element::ShortcutReference {
1150                            reference: reference.to_string(),
1151                        });
1152                        remaining = &remaining[match_end..];
1153                    } else {
1154                        // Fallback - shouldn't happen
1155                        elements.push(Element::Text("[".to_string()));
1156                        remaining = &remaining[1..];
1157                    }
1158                }
1159                "wiki_link" => {
1160                    if let Some(caps) = WIKI_LINK_REGEX.captures(remaining) {
1161                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1162                        elements.push(Element::WikiLink(content.to_string()));
1163                        remaining = &remaining[match_end..];
1164                    } else {
1165                        elements.push(Element::Text("[[".to_string()));
1166                        remaining = &remaining[2..];
1167                    }
1168                }
1169                "display_math" => {
1170                    if let Some(caps) = DISPLAY_MATH_REGEX.captures(remaining) {
1171                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1172                        elements.push(Element::DisplayMath(math.to_string()));
1173                        remaining = &remaining[match_end..];
1174                    } else {
1175                        elements.push(Element::Text("$$".to_string()));
1176                        remaining = &remaining[2..];
1177                    }
1178                }
1179                "inline_math" => {
1180                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1181                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1182                        elements.push(Element::InlineMath(math.to_string()));
1183                        remaining = &remaining[match_end..];
1184                    } else {
1185                        elements.push(Element::Text("$".to_string()));
1186                        remaining = &remaining[1..];
1187                    }
1188                }
1189                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1190                "emoji" => {
1191                    if let Some(caps) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1192                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1193                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1194                        remaining = &remaining[match_end..];
1195                    } else {
1196                        elements.push(Element::Text(":".to_string()));
1197                        remaining = &remaining[1..];
1198                    }
1199                }
1200                "html_entity" => {
1201                    // HTML entities are captured whole
1202                    elements.push(Element::HtmlEntity(remaining[pos..match_end].to_string()));
1203                    remaining = &remaining[match_end..];
1204                }
1205                "hugo_shortcode" => {
1206                    // Hugo shortcodes are atomic elements - preserve them exactly
1207                    elements.push(Element::HugoShortcode(remaining[pos..match_end].to_string()));
1208                    remaining = &remaining[match_end..];
1209                }
1210                "autolink" => {
1211                    // Autolinks are atomic elements - preserve them exactly
1212                    elements.push(Element::Autolink(remaining[pos..match_end].to_string()));
1213                    remaining = &remaining[match_end..];
1214                }
1215                "html_tag" => {
1216                    // HTML tags are captured whole
1217                    elements.push(Element::HtmlTag(remaining[pos..match_end].to_string()));
1218                    remaining = &remaining[match_end..];
1219                }
1220                _ => {
1221                    // Unknown pattern, treat as text
1222                    elements.push(Element::Text("[".to_string()));
1223                    remaining = &remaining[1..];
1224                }
1225            }
1226        } else {
1227            // Process non-link special characters
1228
1229            // Add any text before the special character
1230            if next_special > 0 && next_special < remaining.len() {
1231                elements.push(Element::Text(remaining[..next_special].to_string()));
1232                remaining = &remaining[next_special..];
1233            }
1234
1235            // Process the special element
1236            match special_type {
1237                "code" => {
1238                    // Find end of code
1239                    if let Some(code_end) = remaining[1..].find('`') {
1240                        let code = &remaining[1..1 + code_end];
1241                        elements.push(Element::Code(code.to_string()));
1242                        remaining = &remaining[1 + code_end + 1..];
1243                    } else {
1244                        // No closing backtick, treat as text
1245                        elements.push(Element::Text(remaining.to_string()));
1246                        break;
1247                    }
1248                }
1249                "attr_list" => {
1250                    elements.push(Element::AttrList(remaining[..attr_list_len].to_string()));
1251                    remaining = &remaining[attr_list_len..];
1252                }
1253                "pulldown_emphasis" => {
1254                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1255                    if let Some(span) = pulldown_emphasis {
1256                        let span_len = span.end - span.start;
1257                        if span.is_strikethrough {
1258                            elements.push(Element::Strikethrough(span.content.clone()));
1259                        } else if span.is_strong {
1260                            elements.push(Element::Bold {
1261                                content: span.content.clone(),
1262                                underscore: span.uses_underscore,
1263                            });
1264                        } else {
1265                            elements.push(Element::Italic {
1266                                content: span.content.clone(),
1267                                underscore: span.uses_underscore,
1268                            });
1269                        }
1270                        remaining = &remaining[span_len..];
1271                    } else {
1272                        // Fallback - shouldn't happen
1273                        elements.push(Element::Text(remaining[..1].to_string()));
1274                        remaining = &remaining[1..];
1275                    }
1276                }
1277                _ => {
1278                    // No special elements found, add all remaining text
1279                    elements.push(Element::Text(remaining.to_string()));
1280                    break;
1281                }
1282            }
1283        }
1284    }
1285
1286    elements
1287}
1288
1289/// Reflow elements for sentence-per-line mode
1290fn reflow_elements_sentence_per_line(
1291    elements: &[Element],
1292    custom_abbreviations: &Option<Vec<String>>,
1293    require_sentence_capital: bool,
1294) -> Vec<String> {
1295    let abbreviations = get_abbreviations(custom_abbreviations);
1296    let mut lines = Vec::new();
1297    let mut current_line = String::new();
1298
1299    for (idx, element) in elements.iter().enumerate() {
1300        let element_str = format!("{element}");
1301
1302        // For text elements, split into sentences
1303        if let Element::Text(text) = element {
1304            // Simply append text - it already has correct spacing from tokenization
1305            let combined = format!("{current_line}{text}");
1306            // Use the pre-computed abbreviations set to avoid redundant computation
1307            let sentences = split_into_sentences_with_set(&combined, &abbreviations, require_sentence_capital);
1308
1309            if sentences.len() > 1 {
1310                // We found sentence boundaries
1311                for (i, sentence) in sentences.iter().enumerate() {
1312                    if i == 0 {
1313                        // First sentence might continue from previous elements
1314                        // But check if it ends with an abbreviation
1315                        let trimmed = sentence.trim();
1316
1317                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1318                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1319                            current_line = sentence.to_string();
1320                        } else {
1321                            // Normal case - emit the first sentence
1322                            lines.push(sentence.to_string());
1323                            current_line.clear();
1324                        }
1325                    } else if i == sentences.len() - 1 {
1326                        // Last sentence: check if it's complete or incomplete
1327                        let trimmed = sentence.trim();
1328                        let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1329
1330                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1331                            // Complete sentence - emit it immediately
1332                            lines.push(sentence.to_string());
1333                            current_line.clear();
1334                        } else {
1335                            // Incomplete sentence - save for next iteration
1336                            current_line = sentence.to_string();
1337                        }
1338                    } else {
1339                        // Complete sentences in the middle
1340                        lines.push(sentence.to_string());
1341                    }
1342                }
1343            } else {
1344                // Single sentence - check if it's complete
1345                let trimmed = combined.trim();
1346
1347                // If the combined result is only whitespace, don't accumulate it.
1348                // This prevents leading spaces on subsequent elements when lines
1349                // are joined with spaces during reflow iteration.
1350                if trimmed.is_empty() {
1351                    continue;
1352                }
1353
1354                let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1355
1356                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1357                    // Complete single sentence - emit it
1358                    lines.push(trimmed.to_string());
1359                    current_line.clear();
1360                } else {
1361                    // Incomplete sentence - continue accumulating
1362                    current_line = combined;
1363                }
1364            }
1365        } else if let Element::Italic { content, underscore } = element {
1366            // Handle italic elements - may contain multiple sentences that need continuation
1367            let marker = if *underscore { "_" } else { "*" };
1368            handle_emphasis_sentence_split(
1369                content,
1370                marker,
1371                &abbreviations,
1372                require_sentence_capital,
1373                &mut current_line,
1374                &mut lines,
1375            );
1376        } else if let Element::Bold { content, underscore } = element {
1377            // Handle bold elements - may contain multiple sentences that need continuation
1378            let marker = if *underscore { "__" } else { "**" };
1379            handle_emphasis_sentence_split(
1380                content,
1381                marker,
1382                &abbreviations,
1383                require_sentence_capital,
1384                &mut current_line,
1385                &mut lines,
1386            );
1387        } else if let Element::Strikethrough(content) = element {
1388            // Handle strikethrough elements - may contain multiple sentences that need continuation
1389            handle_emphasis_sentence_split(
1390                content,
1391                "~~",
1392                &abbreviations,
1393                require_sentence_capital,
1394                &mut current_line,
1395                &mut lines,
1396            );
1397        } else {
1398            // Non-text, non-emphasis elements (Code, Links, etc.)
1399            // Check if this element is adjacent to the preceding text (no space between)
1400            let is_adjacent = if idx > 0 {
1401                match &elements[idx - 1] {
1402                    Element::Text(t) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1403                    _ => true,
1404                }
1405            } else {
1406                false
1407            };
1408
1409            // Add space before element if needed, but not for adjacent elements
1410            if !is_adjacent
1411                && !current_line.is_empty()
1412                && !current_line.ends_with(' ')
1413                && !current_line.ends_with('(')
1414                && !current_line.ends_with('[')
1415            {
1416                current_line.push(' ');
1417            }
1418            current_line.push_str(&element_str);
1419        }
1420    }
1421
1422    // Add any remaining content
1423    if !current_line.is_empty() {
1424        lines.push(current_line.trim().to_string());
1425    }
1426    lines
1427}
1428
1429/// Handle splitting emphasis content at sentence boundaries while preserving markers
1430fn handle_emphasis_sentence_split(
1431    content: &str,
1432    marker: &str,
1433    abbreviations: &HashSet<String>,
1434    require_sentence_capital: bool,
1435    current_line: &mut String,
1436    lines: &mut Vec<String>,
1437) {
1438    // Split the emphasis content into sentences
1439    let sentences = split_into_sentences_with_set(content, abbreviations, require_sentence_capital);
1440
1441    if sentences.len() <= 1 {
1442        // Single sentence or no boundaries - treat as atomic
1443        if !current_line.is_empty()
1444            && !current_line.ends_with(' ')
1445            && !current_line.ends_with('(')
1446            && !current_line.ends_with('[')
1447        {
1448            current_line.push(' ');
1449        }
1450        current_line.push_str(marker);
1451        current_line.push_str(content);
1452        current_line.push_str(marker);
1453
1454        // Check if the emphasis content ends with sentence punctuation - if so, emit
1455        let trimmed = content.trim();
1456        let ends_with_punct = ends_with_sentence_punct(trimmed);
1457        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1458            lines.push(current_line.clone());
1459            current_line.clear();
1460        }
1461    } else {
1462        // Multiple sentences - each gets its own emphasis markers
1463        for (i, sentence) in sentences.iter().enumerate() {
1464            let trimmed = sentence.trim();
1465            if trimmed.is_empty() {
1466                continue;
1467            }
1468
1469            if i == 0 {
1470                // First sentence: combine with current_line and emit
1471                if !current_line.is_empty()
1472                    && !current_line.ends_with(' ')
1473                    && !current_line.ends_with('(')
1474                    && !current_line.ends_with('[')
1475                {
1476                    current_line.push(' ');
1477                }
1478                current_line.push_str(marker);
1479                current_line.push_str(trimmed);
1480                current_line.push_str(marker);
1481
1482                // Check if this is a complete sentence
1483                let ends_with_punct = ends_with_sentence_punct(trimmed);
1484                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1485                    lines.push(current_line.clone());
1486                    current_line.clear();
1487                }
1488            } else if i == sentences.len() - 1 {
1489                // Last sentence: check if complete
1490                let ends_with_punct = ends_with_sentence_punct(trimmed);
1491
1492                let mut line = String::new();
1493                line.push_str(marker);
1494                line.push_str(trimmed);
1495                line.push_str(marker);
1496
1497                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1498                    lines.push(line);
1499                } else {
1500                    // Incomplete - keep in current_line for potential continuation
1501                    *current_line = line;
1502                }
1503            } else {
1504                // Middle sentences: emit with markers
1505                let mut line = String::new();
1506                line.push_str(marker);
1507                line.push_str(trimmed);
1508                line.push_str(marker);
1509                lines.push(line);
1510            }
1511        }
1512    }
1513}
1514
/// English words before which a semantic line break reads naturally:
/// conjunctions, relative pronouns, and conjunctive adverbs.
/// All entries are lowercase ASCII.
#[rustfmt::skip]
const BREAK_WORDS: &[&str] = &[
    // Coordinating conjunctions
    "and", "or", "but", "nor", "yet", "so", "for",
    // Relative pronouns
    "which", "that",
    // Subordinating conjunctions
    "because", "when", "if", "while", "where", "although", "though", "unless",
    "since", "after", "before", "until", "as", "once", "whether",
    // Conjunctive adverbs
    "however", "therefore", "moreover", "furthermore", "nevertheless",
    // Contrastive subordinator
    "whereas",
];
1550
/// Returns `true` when `c` is punctuation that ends a clause and can serve as
/// a semantic line break point: comma, semicolon, colon, or em dash (U+2014).
fn is_clause_punctuation(c: char) -> bool {
    const CLAUSE_MARKS: [char; 4] = [',', ';', ':', '\u{2014}'];
    CLAUSE_MARKS.contains(&c)
}
1555
1556/// Compute element spans for a flat text representation of elements.
1557/// Returns Vec of (start, end) byte offsets for non-Text elements,
1558/// so we can check that a split position doesn't fall inside them.
1559fn compute_element_spans(elements: &[Element]) -> Vec<(usize, usize)> {
1560    let mut spans = Vec::new();
1561    let mut offset = 0;
1562    for element in elements {
1563        let rendered = format!("{element}");
1564        let len = rendered.len();
1565        if !matches!(element, Element::Text(_)) {
1566            spans.push((offset, offset + len));
1567        }
1568        offset += len;
1569    }
1570    spans
1571}
1572
/// Returns `true` when byte position `pos` lies strictly inside one of the
/// `(start, end)` spans. Both boundaries themselves count as outside.
fn is_inside_element(pos: usize, spans: &[(usize, usize)]) -> bool {
    for &(start, end) in spans {
        if start < pos && pos < end {
            return true;
        }
    }
    false
}
1577
/// Minimum fraction of `line_length` that the first part of a split must occupy.
/// Prevents awkwardly short first lines like "A," or "Note:" on their own.
/// `reflow_elements_semantic` applies the same ratio as the threshold below
/// which a line is considered short enough to merge back into the previous one.
const MIN_SPLIT_RATIO: f64 = 0.3;
1581
1582/// Split a line at the latest clause punctuation that keeps the first part
1583/// within `line_length`. Returns None if no valid split point exists or if
1584/// the split would create an unreasonably short first line.
1585fn split_at_clause_punctuation(
1586    text: &str,
1587    line_length: usize,
1588    element_spans: &[(usize, usize)],
1589    length_mode: ReflowLengthMode,
1590) -> Option<(String, String)> {
1591    let chars: Vec<char> = text.chars().collect();
1592    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1593
1594    // Find the char index where accumulated display width exceeds line_length
1595    let mut width_acc = 0;
1596    let mut search_end_char = 0;
1597    for (idx, &c) in chars.iter().enumerate() {
1598        let c_width = display_len(&c.to_string(), length_mode);
1599        if width_acc + c_width > line_length {
1600            break;
1601        }
1602        width_acc += c_width;
1603        search_end_char = idx + 1;
1604    }
1605
1606    let mut best_pos = None;
1607    for i in (0..search_end_char).rev() {
1608        if is_clause_punctuation(chars[i]) {
1609            // Convert char position to byte position for element span check
1610            let byte_pos: usize = chars[..=i].iter().map(|c| c.len_utf8()).sum();
1611            if !is_inside_element(byte_pos, element_spans) {
1612                best_pos = Some(i);
1613                break;
1614            }
1615        }
1616    }
1617
1618    let pos = best_pos?;
1619
1620    // Reject splits that create very short first lines
1621    let first: String = chars[..=pos].iter().collect();
1622    let first_display_len = display_len(&first, length_mode);
1623    if first_display_len < min_first_len {
1624        return None;
1625    }
1626
1627    // Split after the punctuation character
1628    let rest: String = chars[pos + 1..].iter().collect();
1629    let rest = rest.trim_start().to_string();
1630
1631    if rest.is_empty() {
1632        return None;
1633    }
1634
1635    Some((first, rest))
1636}
1637
1638/// Split a line before the latest break-word that keeps the first part
1639/// within `line_length`. Returns None if no valid split point exists or if
1640/// the split would create an unreasonably short first line.
1641fn split_at_break_word(
1642    text: &str,
1643    line_length: usize,
1644    element_spans: &[(usize, usize)],
1645    length_mode: ReflowLengthMode,
1646) -> Option<(String, String)> {
1647    let lower = text.to_lowercase();
1648    let min_first_len = ((line_length as f64) * MIN_SPLIT_RATIO) as usize;
1649    let mut best_split: Option<(usize, usize)> = None; // (byte_start, word_len_bytes)
1650
1651    for &word in BREAK_WORDS {
1652        let mut search_start = 0;
1653        while let Some(pos) = lower[search_start..].find(word) {
1654            let abs_pos = search_start + pos;
1655
1656            // Verify it's a word boundary: preceded by space, followed by space
1657            let preceded_by_space = abs_pos == 0 || text.as_bytes().get(abs_pos - 1) == Some(&b' ');
1658            let followed_by_space = text.as_bytes().get(abs_pos + word.len()) == Some(&b' ');
1659
1660            if preceded_by_space && followed_by_space {
1661                // The break goes BEFORE the word, so first part ends at abs_pos - 1
1662                let first_part = text[..abs_pos].trim_end();
1663                let first_part_len = display_len(first_part, length_mode);
1664
1665                if first_part_len >= min_first_len
1666                    && first_part_len <= line_length
1667                    && !is_inside_element(abs_pos, element_spans)
1668                {
1669                    // Prefer the latest valid split point
1670                    if best_split.is_none_or(|(prev_pos, _)| abs_pos > prev_pos) {
1671                        best_split = Some((abs_pos, word.len()));
1672                    }
1673                }
1674            }
1675
1676            search_start = abs_pos + word.len();
1677        }
1678    }
1679
1680    let (byte_start, _word_len) = best_split?;
1681
1682    let first = text[..byte_start].trim_end().to_string();
1683    let rest = text[byte_start..].to_string();
1684
1685    if first.is_empty() || rest.trim().is_empty() {
1686        return None;
1687    }
1688
1689    Some((first, rest))
1690}
1691
1692/// Recursively cascade-split a line that exceeds line_length.
1693/// Tries clause punctuation first, then break-words, then word wrap.
1694fn cascade_split_line(
1695    text: &str,
1696    line_length: usize,
1697    abbreviations: &Option<Vec<String>>,
1698    length_mode: ReflowLengthMode,
1699    attr_lists: bool,
1700) -> Vec<String> {
1701    if line_length == 0 || display_len(text, length_mode) <= line_length {
1702        return vec![text.to_string()];
1703    }
1704
1705    let elements = parse_markdown_elements_inner(text, attr_lists);
1706    let element_spans = compute_element_spans(&elements);
1707
1708    // Try clause punctuation split
1709    if let Some((first, rest)) = split_at_clause_punctuation(text, line_length, &element_spans, length_mode) {
1710        let mut result = vec![first];
1711        result.extend(cascade_split_line(
1712            &rest,
1713            line_length,
1714            abbreviations,
1715            length_mode,
1716            attr_lists,
1717        ));
1718        return result;
1719    }
1720
1721    // Try break-word split
1722    if let Some((first, rest)) = split_at_break_word(text, line_length, &element_spans, length_mode) {
1723        let mut result = vec![first];
1724        result.extend(cascade_split_line(
1725            &rest,
1726            line_length,
1727            abbreviations,
1728            length_mode,
1729            attr_lists,
1730        ));
1731        return result;
1732    }
1733
1734    // Fallback: word wrap using existing reflow_elements
1735    let options = ReflowOptions {
1736        line_length,
1737        break_on_sentences: false,
1738        preserve_breaks: false,
1739        sentence_per_line: false,
1740        semantic_line_breaks: false,
1741        abbreviations: abbreviations.clone(),
1742        length_mode,
1743        attr_lists,
1744        require_sentence_capital: true,
1745        max_list_continuation_indent: None,
1746    };
1747    reflow_elements(&elements, &options)
1748}
1749
1750/// Reflow elements using semantic line breaks strategy:
1751/// 1. Split at sentence boundaries (always)
1752/// 2. For lines exceeding line_length, cascade through clause punct → break-words → word wrap
1753fn reflow_elements_semantic(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1754    // Step 1: Split into sentences using existing sentence-per-line logic
1755    let sentence_lines =
1756        reflow_elements_sentence_per_line(elements, &options.abbreviations, options.require_sentence_capital);
1757
1758    // Step 2: For each sentence line, apply cascading splits if it exceeds line_length
1759    // When line_length is 0 (unlimited), skip cascading — sentence splits only
1760    if options.line_length == 0 {
1761        return sentence_lines;
1762    }
1763
1764    let length_mode = options.length_mode;
1765    let mut result = Vec::new();
1766    for line in sentence_lines {
1767        if display_len(&line, length_mode) <= options.line_length {
1768            result.push(line);
1769        } else {
1770            result.extend(cascade_split_line(
1771                &line,
1772                options.line_length,
1773                &options.abbreviations,
1774                length_mode,
1775                options.attr_lists,
1776            ));
1777        }
1778    }
1779
1780    // Step 3: Merge very short trailing lines back into the previous line.
1781    // Word wrap can produce lines like "was" or "see" on their own, which reads poorly.
1782    let min_line_len = ((options.line_length as f64) * MIN_SPLIT_RATIO) as usize;
1783    let mut merged: Vec<String> = Vec::with_capacity(result.len());
1784    for line in result {
1785        if !merged.is_empty() && display_len(&line, length_mode) < min_line_len && !line.trim().is_empty() {
1786            // Don't merge across sentence boundaries — sentence splits are intentional
1787            let prev_ends_at_sentence = {
1788                let trimmed = merged.last().unwrap().trim_end();
1789                trimmed
1790                    .chars()
1791                    .rev()
1792                    .find(|c| !matches!(c, '"' | '\'' | '\u{201D}' | '\u{2019}' | ')' | ']'))
1793                    .is_some_and(|c| matches!(c, '.' | '!' | '?'))
1794            };
1795
1796            if !prev_ends_at_sentence {
1797                let prev = merged.last_mut().unwrap();
1798                let combined = format!("{prev} {line}");
1799                // Only merge if the combined line fits within the limit
1800                if display_len(&combined, length_mode) <= options.line_length {
1801                    *prev = combined;
1802                    continue;
1803                }
1804            }
1805        }
1806        merged.push(line);
1807    }
1808    merged
1809}
1810
/// Find the byte index of the last space in `line` that is safe to split at,
/// i.e. one that does not lie inside any rendered non-Text element.
///
/// `element_spans` holds the `(start, end)` byte ranges of those elements.
/// Both bounds are treated as exclusive: element delimiters (e.g., `[`, `]`,
/// `(`, `)`, `<`, `>`, `` ` ``) are never spaces, so only interior positions
/// need protection.
fn rfind_safe_space(line: &str, element_spans: &[(usize, usize)]) -> Option<usize> {
    let is_protected = |pos: usize| element_spans.iter().any(|&(start, end)| start < pos && pos < end);

    line.rmatch_indices(' ')
        .map(|(pos, _)| pos)
        .find(|&pos| !is_protected(pos))
}
1824
1825/// Reflow elements into lines that fit within the line length
1826fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1827    let mut lines = Vec::new();
1828    let mut current_line = String::new();
1829    let mut current_length = 0;
1830    // Track byte spans of non-Text elements in current_line for safe splitting
1831    let mut current_line_element_spans: Vec<(usize, usize)> = Vec::new();
1832    let length_mode = options.length_mode;
1833
1834    for (idx, element) in elements.iter().enumerate() {
1835        let element_str = format!("{element}");
1836        let element_len = element.display_width(length_mode);
1837
1838        // Determine adjacency from the original elements, not from current_line.
1839        // Elements are adjacent when there's no whitespace between them in the source:
1840        // - Text("v") → HugoShortcode("{{<...>}}") = adjacent (text has no trailing space)
1841        // - Text(" and ") → InlineLink("[a](url)") = NOT adjacent (text has trailing space)
1842        // - HugoShortcode("{{<...>}}") → Text(",") = adjacent (text has no leading space)
1843        let is_adjacent_to_prev = if idx > 0 {
1844            match (&elements[idx - 1], element) {
1845                (Element::Text(t), _) => !t.is_empty() && !t.ends_with(char::is_whitespace),
1846                (_, Element::Text(t)) => !t.is_empty() && !t.starts_with(char::is_whitespace),
1847                _ => true,
1848            }
1849        } else {
1850            false
1851        };
1852
1853        // For text elements that might need breaking
1854        if let Element::Text(text) = element {
1855            // Check if original text had leading whitespace
1856            let has_leading_space = text.starts_with(char::is_whitespace);
1857            // If this is a text element, always process it word by word
1858            let words: Vec<&str> = text.split_whitespace().collect();
1859
1860            for (i, word) in words.iter().enumerate() {
1861                let word_len = display_len(word, length_mode);
1862                // Check if this "word" is just punctuation that should stay attached
1863                let is_trailing_punct = word
1864                    .chars()
1865                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1866
1867                // First word of text adjacent to preceding non-text element
1868                // must stay attached (e.g., shortcode followed by punctuation or text)
1869                let is_first_adjacent = i == 0 && is_adjacent_to_prev;
1870
1871                if is_first_adjacent {
1872                    // Attach directly without space, preventing line break
1873                    if current_length + word_len > options.line_length && current_length > 0 {
1874                        // Would exceed — break before the adjacent group
1875                        // Use element-aware space search to avoid splitting inside links/code/etc.
1876                        if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1877                            let before = current_line[..last_space].trim_end().to_string();
1878                            let after = current_line[last_space + 1..].to_string();
1879                            lines.push(before);
1880                            current_line = format!("{after}{word}");
1881                            current_length = display_len(&current_line, length_mode);
1882                            current_line_element_spans.clear();
1883                        } else {
1884                            current_line.push_str(word);
1885                            current_length += word_len;
1886                        }
1887                    } else {
1888                        current_line.push_str(word);
1889                        current_length += word_len;
1890                    }
1891                } else if current_length > 0
1892                    && current_length + 1 + word_len > options.line_length
1893                    && !is_trailing_punct
1894                {
1895                    // Start a new line (but never for trailing punctuation)
1896                    lines.push(current_line.trim().to_string());
1897                    current_line = word.to_string();
1898                    current_length = word_len;
1899                    current_line_element_spans.clear();
1900                } else {
1901                    // Add word to current line
1902                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1903                    // AND this isn't trailing punctuation (which attaches directly)
1904                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1905                        current_line.push(' ');
1906                        current_length += 1;
1907                    }
1908                    current_line.push_str(word);
1909                    current_length += word_len;
1910                }
1911            }
1912        } else if matches!(
1913            element,
1914            Element::Italic { .. } | Element::Bold { .. } | Element::Strikethrough(_)
1915        ) && element_len > options.line_length
1916        {
1917            // Italic, bold, and strikethrough with content longer than line_length need word wrapping.
1918            // Split content word-by-word, attach the opening marker to the first word
1919            // and the closing marker to the last word.
1920            let (content, marker): (&str, &str) = match element {
1921                Element::Italic { content, underscore } => (content.as_str(), if *underscore { "_" } else { "*" }),
1922                Element::Bold { content, underscore } => (content.as_str(), if *underscore { "__" } else { "**" }),
1923                Element::Strikethrough(content) => (content.as_str(), "~~"),
1924                _ => unreachable!(),
1925            };
1926
1927            let words: Vec<&str> = content.split_whitespace().collect();
1928            let n = words.len();
1929
1930            if n == 0 {
1931                // Empty span — treat as atomic
1932                let full = format!("{marker}{marker}");
1933                let full_len = display_len(&full, length_mode);
1934                if !is_adjacent_to_prev && current_length > 0 {
1935                    current_line.push(' ');
1936                    current_length += 1;
1937                }
1938                current_line.push_str(&full);
1939                current_length += full_len;
1940            } else {
1941                for (i, word) in words.iter().enumerate() {
1942                    let is_first = i == 0;
1943                    let is_last = i == n - 1;
1944                    let word_str: String = match (is_first, is_last) {
1945                        (true, true) => format!("{marker}{word}{marker}"),
1946                        (true, false) => format!("{marker}{word}"),
1947                        (false, true) => format!("{word}{marker}"),
1948                        (false, false) => word.to_string(),
1949                    };
1950                    let word_len = display_len(&word_str, length_mode);
1951
1952                    let needs_space = if is_first {
1953                        !is_adjacent_to_prev && current_length > 0
1954                    } else {
1955                        current_length > 0
1956                    };
1957
1958                    if needs_space && current_length + 1 + word_len > options.line_length {
1959                        lines.push(current_line.trim_end().to_string());
1960                        current_line = word_str;
1961                        current_length = word_len;
1962                        current_line_element_spans.clear();
1963                    } else {
1964                        if needs_space {
1965                            current_line.push(' ');
1966                            current_length += 1;
1967                        }
1968                        current_line.push_str(&word_str);
1969                        current_length += word_len;
1970                    }
1971                }
1972            }
1973        } else {
1974            // For non-text elements (code, links, references), treat as atomic units
1975            // These should never be broken across lines
1976
1977            if is_adjacent_to_prev {
1978                // Adjacent to preceding text — attach directly without space
1979                if current_length + element_len > options.line_length {
1980                    // Would exceed limit — break before the adjacent word group
1981                    // Use element-aware space search to avoid splitting inside links/code/etc.
1982                    if let Some(last_space) = rfind_safe_space(&current_line, &current_line_element_spans) {
1983                        let before = current_line[..last_space].trim_end().to_string();
1984                        let after = current_line[last_space + 1..].to_string();
1985                        lines.push(before);
1986                        current_line = format!("{after}{element_str}");
1987                        current_length = display_len(&current_line, length_mode);
1988                        current_line_element_spans.clear();
1989                        // Record the element span in the new current_line
1990                        let start = after.len();
1991                        current_line_element_spans.push((start, start + element_str.len()));
1992                    } else {
1993                        // No safe space to break at — accept the long line
1994                        let start = current_line.len();
1995                        current_line.push_str(&element_str);
1996                        current_length += element_len;
1997                        current_line_element_spans.push((start, current_line.len()));
1998                    }
1999                } else {
2000                    let start = current_line.len();
2001                    current_line.push_str(&element_str);
2002                    current_length += element_len;
2003                    current_line_element_spans.push((start, current_line.len()));
2004                }
2005            } else if current_length > 0 && current_length + 1 + element_len > options.line_length {
2006                // Not adjacent, would exceed — start new line
2007                lines.push(current_line.trim().to_string());
2008                current_line = element_str.clone();
2009                current_length = element_len;
2010                current_line_element_spans.clear();
2011                current_line_element_spans.push((0, element_str.len()));
2012            } else {
2013                // Not adjacent, fits — add with space
2014                let ends_with_opener =
2015                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
2016                if current_length > 0 && !ends_with_opener {
2017                    current_line.push(' ');
2018                    current_length += 1;
2019                }
2020                let start = current_line.len();
2021                current_line.push_str(&element_str);
2022                current_length += element_len;
2023                current_line_element_spans.push((start, current_line.len()));
2024            }
2025        }
2026    }
2027
2028    // Don't forget the last line
2029    if !current_line.is_empty() {
2030        lines.push(current_line.trim_end().to_string());
2031    }
2032
2033    lines
2034}
2035
2036/// Reflow markdown content preserving structure
2037pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
2038    let lines: Vec<&str> = content.lines().collect();
2039    let mut result = Vec::new();
2040    let mut i = 0;
2041
2042    while i < lines.len() {
2043        let line = lines[i];
2044        let trimmed = line.trim();
2045
2046        // Preserve empty lines
2047        if trimmed.is_empty() {
2048            result.push(String::new());
2049            i += 1;
2050            continue;
2051        }
2052
2053        // Preserve headings as-is
2054        if trimmed.starts_with('#') {
2055            result.push(line.to_string());
2056            i += 1;
2057            continue;
2058        }
2059
2060        // Preserve Quarto/Pandoc div markers (:::) as-is
2061        if trimmed.starts_with(":::") {
2062            result.push(line.to_string());
2063            i += 1;
2064            continue;
2065        }
2066
2067        // Preserve fenced code blocks
2068        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
2069            result.push(line.to_string());
2070            i += 1;
2071            // Copy lines until closing fence
2072            while i < lines.len() {
2073                result.push(lines[i].to_string());
2074                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
2075                    i += 1;
2076                    break;
2077                }
2078                i += 1;
2079            }
2080            continue;
2081        }
2082
2083        // Preserve indented code blocks (4+ columns accounting for tab expansion)
2084        if calculate_indentation_width_default(line) >= 4 {
2085            // Collect all consecutive indented lines
2086            result.push(line.to_string());
2087            i += 1;
2088            while i < lines.len() {
2089                let next_line = lines[i];
2090                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
2091                if calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
2092                    result.push(next_line.to_string());
2093                    i += 1;
2094                } else {
2095                    break;
2096                }
2097            }
2098            continue;
2099        }
2100
2101        // Preserve block quotes (but reflow their content)
2102        if trimmed.starts_with('>') {
2103            // find() returns byte position which is correct for str slicing
2104            // The unwrap is safe because we already verified trimmed starts with '>'
2105            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
2106            let quote_prefix = line[0..gt_pos + 1].to_string();
2107            let quote_content = &line[quote_prefix.len()..].trim_start();
2108
2109            let reflowed = reflow_line(quote_content, options);
2110            for reflowed_line in reflowed.iter() {
2111                result.push(format!("{quote_prefix} {reflowed_line}"));
2112            }
2113            i += 1;
2114            continue;
2115        }
2116
2117        // Preserve horizontal rules first (before checking for lists)
2118        if is_horizontal_rule(trimmed) {
2119            result.push(line.to_string());
2120            i += 1;
2121            continue;
2122        }
2123
2124        // Preserve lists (but not horizontal rules)
2125        if is_unordered_list_marker(trimmed) || is_numbered_list_item(trimmed) {
2126            // Find the list marker and preserve indentation
2127            let indent = line.len() - line.trim_start().len();
2128            let indent_str = " ".repeat(indent);
2129
2130            // For numbered lists, find the period and the space after it
2131            // For bullet lists, find the marker and the space after it
2132            let mut marker_end = indent;
2133            let mut content_start = indent;
2134
2135            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
2136                // Numbered list: find the period
2137                if let Some(period_pos) = line[indent..].find('.') {
2138                    marker_end = indent + period_pos + 1; // Include the period
2139                    content_start = marker_end;
2140                    // Skip any spaces after the period to find content start
2141                    // Use byte-based check since content_start is a byte index
2142                    // This is safe because space is ASCII (single byte)
2143                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2144                        content_start += 1;
2145                    }
2146                }
2147            } else {
2148                // Bullet list: marker is single character
2149                marker_end = indent + 1; // Just the marker character
2150                content_start = marker_end;
2151                // Skip any spaces after the marker
2152                // Use byte-based check since content_start is a byte index
2153                // This is safe because space is ASCII (single byte)
2154                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
2155                    content_start += 1;
2156                }
2157            }
2158
2159            // Minimum indent for continuation lines (based on list marker, before checkbox)
2160            let min_continuation_indent = content_start;
2161
2162            // Detect checkbox/task list markers: [ ], [x], [X]
2163            // GFM task lists work with both unordered and ordered lists
2164            let rest = &line[content_start..];
2165            if rest.starts_with("[ ] ") || rest.starts_with("[x] ") || rest.starts_with("[X] ") {
2166                marker_end = content_start + 3; // Include the checkbox `[ ]`
2167                content_start += 4; // Skip past `[ ] `
2168            }
2169
2170            let marker = &line[indent..marker_end];
2171
2172            // Collect all content for this list item (including continuation lines)
2173            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
2174            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
2175            i += 1;
2176
2177            // Collect continuation lines (indented lines that are part of this list item)
2178            // Use the base marker indent (not checkbox-extended) for collection,
2179            // since users may indent continuations to the bullet level, not the checkbox level
2180            while i < lines.len() {
2181                let next_line = lines[i];
2182                let next_trimmed = next_line.trim();
2183
2184                // Stop if we hit an empty line or another list item or special block
2185                if is_block_boundary(next_trimmed) {
2186                    break;
2187                }
2188
2189                // Check if this line is indented (continuation of list item)
2190                let next_indent = next_line.len() - next_line.trim_start().len();
2191                if next_indent >= min_continuation_indent {
2192                    // This is a continuation line - add its content
2193                    // Preserve hard breaks while trimming excessive whitespace
2194                    let trimmed_start = next_line.trim_start();
2195                    list_content.push(trim_preserving_hard_break(trimmed_start));
2196                    i += 1;
2197                } else {
2198                    // Not indented enough, not part of this list item
2199                    break;
2200                }
2201            }
2202
2203            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
2204            // Hard breaks should prevent joining with the next line
2205            let combined_content = if options.preserve_breaks {
2206                list_content[0].clone()
2207            } else {
2208                // Check if any lines have hard breaks - if so, preserve the structure
2209                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
2210                if has_hard_breaks {
2211                    // Don't join lines with hard breaks - keep them separate with newlines
2212                    list_content.join("\n")
2213                } else {
2214                    // No hard breaks, safe to join with spaces
2215                    list_content.join(" ")
2216                }
2217            };
2218
2219            // Calculate the proper indentation for continuation lines
2220            let trimmed_marker = marker;
2221            let continuation_spaces = if let Some(max_indent) = options.max_list_continuation_indent {
2222                // Cap the relative indent (past the nesting level) to max_indent,
2223                // then add back the nesting indent so nested items stay correct
2224                indent + (content_start - indent).min(max_indent)
2225            } else {
2226                content_start
2227            };
2228
2229            // Adjust line length to account for list marker and space
2230            let prefix_length = indent + trimmed_marker.len() + 1;
2231
2232            // Create adjusted options with reduced line length
2233            let adjusted_options = ReflowOptions {
2234                line_length: options.line_length.saturating_sub(prefix_length),
2235                ..options.clone()
2236            };
2237
2238            let reflowed = reflow_line(&combined_content, &adjusted_options);
2239            for (j, reflowed_line) in reflowed.iter().enumerate() {
2240                if j == 0 {
2241                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
2242                } else {
2243                    // Continuation lines aligned with text after marker
2244                    let continuation_indent = " ".repeat(continuation_spaces);
2245                    result.push(format!("{continuation_indent}{reflowed_line}"));
2246                }
2247            }
2248            continue;
2249        }
2250
2251        // Preserve tables
2252        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
2253            result.push(line.to_string());
2254            i += 1;
2255            continue;
2256        }
2257
2258        // Preserve reference definitions
2259        if trimmed.starts_with('[') && line.contains("]:") {
2260            result.push(line.to_string());
2261            i += 1;
2262            continue;
2263        }
2264
2265        // Preserve definition list items (extended markdown)
2266        if is_definition_list_item(trimmed) {
2267            result.push(line.to_string());
2268            i += 1;
2269            continue;
2270        }
2271
2272        // Check if this is a single line that doesn't need processing
2273        let mut is_single_line_paragraph = true;
2274        if i + 1 < lines.len() {
2275            let next_trimmed = lines[i + 1].trim();
2276            // Check if next line continues this paragraph
2277            if !is_block_boundary(next_trimmed) {
2278                is_single_line_paragraph = false;
2279            }
2280        }
2281
2282        // If it's a single line that fits, just add it as-is
2283        if is_single_line_paragraph && display_len(line, options.length_mode) <= options.line_length {
2284            result.push(line.to_string());
2285            i += 1;
2286            continue;
2287        }
2288
2289        // For regular paragraphs, collect consecutive lines
2290        let mut paragraph_parts = Vec::new();
2291        let mut current_part = vec![line];
2292        i += 1;
2293
2294        // If preserve_breaks is true, treat each line separately
2295        if options.preserve_breaks {
2296            // Don't collect consecutive lines - just reflow this single line
2297            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
2298                Some("\\")
2299            } else if line.ends_with("  ") {
2300                Some("  ")
2301            } else {
2302                None
2303            };
2304            let reflowed = reflow_line(line, options);
2305
2306            // Preserve hard breaks (two trailing spaces or backslash)
2307            if let Some(break_marker) = hard_break_type {
2308                if !reflowed.is_empty() {
2309                    let mut reflowed_with_break = reflowed;
2310                    let last_idx = reflowed_with_break.len() - 1;
2311                    if !has_hard_break(&reflowed_with_break[last_idx]) {
2312                        reflowed_with_break[last_idx].push_str(break_marker);
2313                    }
2314                    result.extend(reflowed_with_break);
2315                }
2316            } else {
2317                result.extend(reflowed);
2318            }
2319        } else {
2320            // Original behavior: collect consecutive lines into a paragraph
2321            while i < lines.len() {
2322                let prev_line = if !current_part.is_empty() {
2323                    current_part.last().unwrap()
2324                } else {
2325                    ""
2326                };
2327                let next_line = lines[i];
2328                let next_trimmed = next_line.trim();
2329
2330                // Stop at empty lines or special blocks
2331                if is_block_boundary(next_trimmed) {
2332                    break;
2333                }
2334
2335                // Check if previous line ends with hard break (two spaces or backslash)
2336                // or is a complete sentence in sentence_per_line mode
2337                let prev_trimmed = prev_line.trim();
2338                let abbreviations = get_abbreviations(&options.abbreviations);
2339                let ends_with_sentence = (prev_trimmed.ends_with('.')
2340                    || prev_trimmed.ends_with('!')
2341                    || prev_trimmed.ends_with('?')
2342                    || prev_trimmed.ends_with(".*")
2343                    || prev_trimmed.ends_with("!*")
2344                    || prev_trimmed.ends_with("?*")
2345                    || prev_trimmed.ends_with("._")
2346                    || prev_trimmed.ends_with("!_")
2347                    || prev_trimmed.ends_with("?_")
2348                    // Quote-terminated sentences (straight and curly quotes)
2349                    || prev_trimmed.ends_with(".\"")
2350                    || prev_trimmed.ends_with("!\"")
2351                    || prev_trimmed.ends_with("?\"")
2352                    || prev_trimmed.ends_with(".'")
2353                    || prev_trimmed.ends_with("!'")
2354                    || prev_trimmed.ends_with("?'")
2355                    || prev_trimmed.ends_with(".\u{201D}")
2356                    || prev_trimmed.ends_with("!\u{201D}")
2357                    || prev_trimmed.ends_with("?\u{201D}")
2358                    || prev_trimmed.ends_with(".\u{2019}")
2359                    || prev_trimmed.ends_with("!\u{2019}")
2360                    || prev_trimmed.ends_with("?\u{2019}"))
2361                    && !text_ends_with_abbreviation(
2362                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
2363                        &abbreviations,
2364                    );
2365
2366                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
2367                    // Start a new part after hard break or complete sentence
2368                    paragraph_parts.push(current_part.join(" "));
2369                    current_part = vec![next_line];
2370                } else {
2371                    current_part.push(next_line);
2372                }
2373                i += 1;
2374            }
2375
2376            // Add the last part
2377            if !current_part.is_empty() {
2378                if current_part.len() == 1 {
2379                    // Single line, don't add trailing space
2380                    paragraph_parts.push(current_part[0].to_string());
2381                } else {
2382                    paragraph_parts.push(current_part.join(" "));
2383                }
2384            }
2385
2386            // Reflow each part separately, preserving hard breaks
2387            for (j, part) in paragraph_parts.iter().enumerate() {
2388                let reflowed = reflow_line(part, options);
2389                result.extend(reflowed);
2390
2391                // Preserve hard break by ensuring last line of part ends with hard break marker
2392                // Use two spaces as the default hard break format for reflows
2393                // But don't add hard breaks in sentence_per_line mode - lines are already separate
2394                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
2395                    let last_idx = result.len() - 1;
2396                    if !has_hard_break(&result[last_idx]) {
2397                        result[last_idx].push_str("  ");
2398                    }
2399                }
2400            }
2401        }
2402    }
2403
2404    // Preserve trailing newline if the original content had one
2405    let result_text = result.join("\n");
2406    if content.ends_with('\n') && !result_text.ends_with('\n') {
2407        format!("{result_text}\n")
2408    } else {
2409        result_text
2410    }
2411}
2412
/// Result of reflowing a single paragraph.
///
/// Describes which byte range of the original document the paragraph occupied
/// and the text that should replace that range.
///
/// `PartialEq`/`Eq` are derived so that results can be compared directly
/// (for example with `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphReflow {
    /// Starting byte offset of the paragraph in the original content
    pub start_byte: usize,
    /// Ending byte offset of the paragraph in the original content
    pub end_byte: usize,
    /// The reflowed text for this paragraph
    pub reflowed_text: String,
}
2423
/// A collected blockquote line used for style-preserving reflow.
///
/// The invariant `is_explicit == true` iff `prefix.is_some()` is enforced by the
/// constructors. Use [`BlockquoteLineData::explicit`] or [`BlockquoteLineData::lazy`]
/// rather than constructing the struct directly.
///
/// `PartialEq`/`Eq` are derived so collected lines can be compared directly
/// (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockquoteLineData {
    /// Trimmed content without the `> ` prefix.
    pub(crate) content: String,
    /// Whether this line carries an explicit blockquote marker.
    pub(crate) is_explicit: bool,
    /// Full blockquote prefix (e.g. `"> "`, `"> > "`). `None` for lazy continuation lines.
    pub(crate) prefix: Option<String>,
}
2438
2439impl BlockquoteLineData {
2440    /// Create an explicit (marker-bearing) blockquote line.
2441    pub fn explicit(content: String, prefix: String) -> Self {
2442        Self {
2443            content,
2444            is_explicit: true,
2445            prefix: Some(prefix),
2446        }
2447    }
2448
2449    /// Create a lazy continuation line (no blockquote marker).
2450    pub fn lazy(content: String) -> Self {
2451        Self {
2452            content,
2453            is_explicit: false,
2454            prefix: None,
2455        }
2456    }
2457}
2458
/// How continuation lines of a reflowed blockquote paragraph are written.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BlockquoteContinuationStyle {
    /// Continuation lines repeat the blockquote prefix.
    Explicit,
    /// Continuation lines are emitted without a blockquote prefix.
    Lazy,
}
2465
2466/// Determine the continuation style for a blockquote paragraph from its collected lines.
2467///
2468/// The first line is always explicit (it carries the marker), so only continuation
2469/// lines (index 1+) are counted. Ties resolve to `Explicit`.
2470///
2471/// When the slice has only one element (no continuation lines to inspect), both
2472/// counts are zero and the tie-breaking rule returns `Explicit`.
2473pub fn blockquote_continuation_style(lines: &[BlockquoteLineData]) -> BlockquoteContinuationStyle {
2474    let mut explicit_count = 0usize;
2475    let mut lazy_count = 0usize;
2476
2477    for line in lines.iter().skip(1) {
2478        if line.is_explicit {
2479            explicit_count += 1;
2480        } else {
2481            lazy_count += 1;
2482        }
2483    }
2484
2485    if explicit_count > 0 && lazy_count == 0 {
2486        BlockquoteContinuationStyle::Explicit
2487    } else if lazy_count > 0 && explicit_count == 0 {
2488        BlockquoteContinuationStyle::Lazy
2489    } else if explicit_count >= lazy_count {
2490        BlockquoteContinuationStyle::Explicit
2491    } else {
2492        BlockquoteContinuationStyle::Lazy
2493    }
2494}
2495
2496/// Determine the dominant blockquote prefix for a paragraph.
2497///
2498/// The most frequently occurring explicit prefix wins. Ties are broken by earliest
2499/// first appearance. Falls back to `fallback` when no explicit lines are present.
2500pub fn dominant_blockquote_prefix(lines: &[BlockquoteLineData], fallback: &str) -> String {
2501    let mut counts: std::collections::HashMap<String, (usize, usize)> = std::collections::HashMap::new();
2502
2503    for (idx, line) in lines.iter().enumerate() {
2504        let Some(prefix) = line.prefix.as_ref() else {
2505            continue;
2506        };
2507        counts
2508            .entry(prefix.clone())
2509            .and_modify(|entry| entry.0 += 1)
2510            .or_insert((1, idx));
2511    }
2512
2513    counts
2514        .into_iter()
2515        .max_by(|(_, (count_a, first_idx_a)), (_, (count_b, first_idx_b))| {
2516            count_a.cmp(count_b).then_with(|| first_idx_b.cmp(first_idx_a))
2517        })
2518        .map(|(prefix, _)| prefix)
2519        .unwrap_or_else(|| fallback.to_string())
2520}
2521
2522/// Whether a reflowed blockquote content line must carry an explicit prefix.
2523///
2524/// Lines that would start a new block structure (headings, fences, lists, etc.)
2525/// cannot safely use lazy continuation syntax.
2526pub(crate) fn should_force_explicit_blockquote_line(content_line: &str) -> bool {
2527    let trimmed = content_line.trim_start();
2528    trimmed.starts_with('>')
2529        || trimmed.starts_with('#')
2530        || trimmed.starts_with("```")
2531        || trimmed.starts_with("~~~")
2532        || is_unordered_list_marker(trimmed)
2533        || is_numbered_list_item(trimmed)
2534        || is_horizontal_rule(trimmed)
2535        || is_definition_list_item(trimmed)
2536        || (trimmed.starts_with('[') && trimmed.contains("]:"))
2537        || trimmed.starts_with(":::")
2538        || (trimmed.starts_with('<')
2539            && !trimmed.starts_with("<http")
2540            && !trimmed.starts_with("<https")
2541            && !trimmed.starts_with("<mailto:"))
2542}
2543
2544/// Reflow blockquote content lines and apply continuation style.
2545///
2546/// Segments separated by hard breaks are reflowed independently. The output lines
2547/// receive blockquote prefixes according to `continuation_style`: the first line and
2548/// any line that would start a new block structure always get an explicit prefix;
2549/// other lines follow the detected style.
2550///
2551/// Returns the styled, reflowed lines (without a trailing newline).
2552pub fn reflow_blockquote_content(
2553    lines: &[BlockquoteLineData],
2554    explicit_prefix: &str,
2555    continuation_style: BlockquoteContinuationStyle,
2556    options: &ReflowOptions,
2557) -> Vec<String> {
2558    let content_strs: Vec<&str> = lines.iter().map(|l| l.content.as_str()).collect();
2559    let segments = split_into_segments_strs(&content_strs);
2560    let mut reflowed_content_lines: Vec<String> = Vec::new();
2561
2562    for segment in segments {
2563        let hard_break_type = segment.last().and_then(|&line| {
2564            let line = line.strip_suffix('\r').unwrap_or(line);
2565            if line.ends_with('\\') {
2566                Some("\\")
2567            } else if line.ends_with("  ") {
2568                Some("  ")
2569            } else {
2570                None
2571            }
2572        });
2573
2574        let pieces: Vec<&str> = segment
2575            .iter()
2576            .map(|&line| {
2577                if let Some(l) = line.strip_suffix('\\') {
2578                    l.trim_end()
2579                } else if let Some(l) = line.strip_suffix("  ") {
2580                    l.trim_end()
2581                } else {
2582                    line.trim_end()
2583                }
2584            })
2585            .collect();
2586
2587        let segment_text = pieces.join(" ");
2588        let segment_text = segment_text.trim();
2589        if segment_text.is_empty() {
2590            continue;
2591        }
2592
2593        let mut reflowed = reflow_line(segment_text, options);
2594        if let Some(break_marker) = hard_break_type
2595            && !reflowed.is_empty()
2596        {
2597            let last_idx = reflowed.len() - 1;
2598            if !has_hard_break(&reflowed[last_idx]) {
2599                reflowed[last_idx].push_str(break_marker);
2600            }
2601        }
2602        reflowed_content_lines.extend(reflowed);
2603    }
2604
2605    let mut styled_lines: Vec<String> = Vec::new();
2606    for (idx, line) in reflowed_content_lines.iter().enumerate() {
2607        let force_explicit = idx == 0
2608            || continuation_style == BlockquoteContinuationStyle::Explicit
2609            || should_force_explicit_blockquote_line(line);
2610        if force_explicit {
2611            styled_lines.push(format!("{explicit_prefix}{line}"));
2612        } else {
2613            styled_lines.push(line.clone());
2614        }
2615    }
2616
2617    styled_lines
2618}
2619
2620fn is_blockquote_content_boundary(content: &str) -> bool {
2621    let trimmed = content.trim();
2622    trimmed.is_empty()
2623        || is_block_boundary(trimmed)
2624        || crate::utils::table_utils::TableUtils::is_potential_table_row(content)
2625        || trimmed.starts_with(":::")
2626        || crate::utils::is_template_directive_only(content)
2627        || is_standalone_attr_list(content)
2628        || is_snippet_block_delimiter(content)
2629}
2630
2631fn split_into_segments_strs<'a>(lines: &[&'a str]) -> Vec<Vec<&'a str>> {
2632    let mut segments = Vec::new();
2633    let mut current = Vec::new();
2634
2635    for &line in lines {
2636        current.push(line);
2637        if has_hard_break(line) {
2638            segments.push(current);
2639            current = Vec::new();
2640        }
2641    }
2642
2643    if !current.is_empty() {
2644        segments.push(current);
2645    }
2646
2647    segments
2648}
2649
2650fn reflow_blockquote_paragraph_at_line(
2651    content: &str,
2652    lines: &[&str],
2653    target_idx: usize,
2654    options: &ReflowOptions,
2655) -> Option<ParagraphReflow> {
2656    let mut anchor_idx = target_idx;
2657    let mut target_level = if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[target_idx]) {
2658        parsed.nesting_level
2659    } else {
2660        let mut found = None;
2661        let mut idx = target_idx;
2662        loop {
2663            if lines[idx].trim().is_empty() {
2664                break;
2665            }
2666            if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[idx]) {
2667                found = Some((idx, parsed.nesting_level));
2668                break;
2669            }
2670            if idx == 0 {
2671                break;
2672            }
2673            idx -= 1;
2674        }
2675        let (idx, level) = found?;
2676        anchor_idx = idx;
2677        level
2678    };
2679
2680    // Expand backward to capture prior quote content at the same nesting level.
2681    let mut para_start = anchor_idx;
2682    while para_start > 0 {
2683        let prev_idx = para_start - 1;
2684        let prev_line = lines[prev_idx];
2685
2686        if prev_line.trim().is_empty() {
2687            break;
2688        }
2689
2690        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(prev_line) {
2691            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2692                break;
2693            }
2694            para_start = prev_idx;
2695            continue;
2696        }
2697
2698        let prev_lazy = prev_line.trim_start();
2699        if is_blockquote_content_boundary(prev_lazy) {
2700            break;
2701        }
2702        para_start = prev_idx;
2703    }
2704
2705    // Lazy continuation cannot precede the first explicit marker.
2706    while para_start < lines.len() {
2707        let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(lines[para_start]) else {
2708            para_start += 1;
2709            continue;
2710        };
2711        target_level = parsed.nesting_level;
2712        break;
2713    }
2714
2715    if para_start >= lines.len() || para_start > target_idx {
2716        return None;
2717    }
2718
2719    // Collect explicit lines at target level and lazy continuation lines.
2720    // Each entry is (original_line_idx, BlockquoteLineData).
2721    let mut collected: Vec<(usize, BlockquoteLineData)> = Vec::new();
2722    let mut idx = para_start;
2723    while idx < lines.len() {
2724        if !collected.is_empty() && has_hard_break(&collected[collected.len() - 1].1.content) {
2725            break;
2726        }
2727
2728        let line = lines[idx];
2729        if line.trim().is_empty() {
2730            break;
2731        }
2732
2733        if let Some(parsed) = crate::utils::blockquote::parse_blockquote_prefix(line) {
2734            if parsed.nesting_level != target_level || is_blockquote_content_boundary(parsed.content) {
2735                break;
2736            }
2737            collected.push((
2738                idx,
2739                BlockquoteLineData::explicit(trim_preserving_hard_break(parsed.content), parsed.prefix.to_string()),
2740            ));
2741            idx += 1;
2742            continue;
2743        }
2744
2745        let lazy_content = line.trim_start();
2746        if is_blockquote_content_boundary(lazy_content) {
2747            break;
2748        }
2749
2750        collected.push((idx, BlockquoteLineData::lazy(trim_preserving_hard_break(lazy_content))));
2751        idx += 1;
2752    }
2753
2754    if collected.is_empty() {
2755        return None;
2756    }
2757
2758    let para_end = collected[collected.len() - 1].0;
2759    if target_idx < para_start || target_idx > para_end {
2760        return None;
2761    }
2762
2763    let line_data: Vec<BlockquoteLineData> = collected.iter().map(|(_, d)| d.clone()).collect();
2764
2765    let fallback_prefix = line_data
2766        .iter()
2767        .find_map(|d| d.prefix.clone())
2768        .unwrap_or_else(|| "> ".to_string());
2769    let explicit_prefix = dominant_blockquote_prefix(&line_data, &fallback_prefix);
2770    let continuation_style = blockquote_continuation_style(&line_data);
2771
2772    let adjusted_line_length = options
2773        .line_length
2774        .saturating_sub(display_len(&explicit_prefix, options.length_mode))
2775        .max(1);
2776
2777    let adjusted_options = ReflowOptions {
2778        line_length: adjusted_line_length,
2779        ..options.clone()
2780    };
2781
2782    let styled_lines = reflow_blockquote_content(&line_data, &explicit_prefix, continuation_style, &adjusted_options);
2783
2784    if styled_lines.is_empty() {
2785        return None;
2786    }
2787
2788    // Calculate byte offsets.
2789    let mut start_byte = 0;
2790    for line in lines.iter().take(para_start) {
2791        start_byte += line.len() + 1;
2792    }
2793
2794    let mut end_byte = start_byte;
2795    for line in lines.iter().take(para_end + 1).skip(para_start) {
2796        end_byte += line.len() + 1;
2797    }
2798
2799    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2800    if !includes_trailing_newline {
2801        end_byte -= 1;
2802    }
2803
2804    let reflowed_joined = styled_lines.join("\n");
2805    let reflowed_text = if includes_trailing_newline {
2806        if reflowed_joined.ends_with('\n') {
2807            reflowed_joined
2808        } else {
2809            format!("{reflowed_joined}\n")
2810        }
2811    } else if reflowed_joined.ends_with('\n') {
2812        reflowed_joined.trim_end_matches('\n').to_string()
2813    } else {
2814        reflowed_joined
2815    };
2816
2817    Some(ParagraphReflow {
2818        start_byte,
2819        end_byte,
2820        reflowed_text,
2821    })
2822}
2823
2824/// Reflow a single paragraph at the specified line number
2825///
2826/// This function finds the paragraph containing the given line number,
2827/// reflows it according to the specified line length, and returns
2828/// information about the paragraph location and its reflowed text.
2829///
2830/// # Arguments
2831///
2832/// * `content` - The full document content
2833/// * `line_number` - The 1-based line number within the paragraph to reflow
2834/// * `line_length` - The target line length for reflowing
2835///
2836/// # Returns
2837///
2838/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
2839/// or `None` if the line number is out of bounds or the content at that
2840/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
2841pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
2842    reflow_paragraph_at_line_with_mode(content, line_number, line_length, ReflowLengthMode::default())
2843}
2844
2845/// Reflow a paragraph at the given line with a specific length mode.
2846pub fn reflow_paragraph_at_line_with_mode(
2847    content: &str,
2848    line_number: usize,
2849    line_length: usize,
2850    length_mode: ReflowLengthMode,
2851) -> Option<ParagraphReflow> {
2852    let options = ReflowOptions {
2853        line_length,
2854        length_mode,
2855        ..Default::default()
2856    };
2857    reflow_paragraph_at_line_with_options(content, line_number, &options)
2858}
2859
2860/// Reflow a paragraph at the given line using the provided options.
2861///
2862/// This is the canonical implementation used by both the rule's fix mode and the
2863/// LSP "Reflow paragraph" action. Passing a fully configured `ReflowOptions` allows
2864/// the LSP action to respect user-configured reflow mode, abbreviations, etc.
2865///
2866/// # Returns
2867///
2868/// Returns `Some(ParagraphReflow)` with byte offsets and reflowed text, or `None`
2869/// if the line is out of bounds or sits inside a non-reflow-able construct.
2870pub fn reflow_paragraph_at_line_with_options(
2871    content: &str,
2872    line_number: usize,
2873    options: &ReflowOptions,
2874) -> Option<ParagraphReflow> {
2875    if line_number == 0 {
2876        return None;
2877    }
2878
2879    let lines: Vec<&str> = content.lines().collect();
2880
2881    // Check if line number is valid (1-based)
2882    if line_number > lines.len() {
2883        return None;
2884    }
2885
2886    let target_idx = line_number - 1; // Convert to 0-based
2887    let target_line = lines[target_idx];
2888    let trimmed = target_line.trim();
2889
2890    // Handle blockquote paragraphs (including lazy continuation lines) with
2891    // style-preserving output.
2892    if let Some(blockquote_reflow) = reflow_blockquote_paragraph_at_line(content, &lines, target_idx, options) {
2893        return Some(blockquote_reflow);
2894    }
2895
2896    // Don't reflow special blocks
2897    if is_paragraph_boundary(trimmed, target_line) {
2898        return None;
2899    }
2900
2901    // Find paragraph start - scan backward until blank line or special block
2902    let mut para_start = target_idx;
2903    while para_start > 0 {
2904        let prev_idx = para_start - 1;
2905        let prev_line = lines[prev_idx];
2906        let prev_trimmed = prev_line.trim();
2907
2908        // Stop at blank line or special blocks
2909        if is_paragraph_boundary(prev_trimmed, prev_line) {
2910            break;
2911        }
2912
2913        para_start = prev_idx;
2914    }
2915
2916    // Find paragraph end - scan forward until blank line or special block
2917    let mut para_end = target_idx;
2918    while para_end + 1 < lines.len() {
2919        let next_idx = para_end + 1;
2920        let next_line = lines[next_idx];
2921        let next_trimmed = next_line.trim();
2922
2923        // Stop at blank line or special blocks
2924        if is_paragraph_boundary(next_trimmed, next_line) {
2925            break;
2926        }
2927
2928        para_end = next_idx;
2929    }
2930
2931    // Extract paragraph lines
2932    let paragraph_lines = &lines[para_start..=para_end];
2933
2934    // Calculate byte offsets
2935    let mut start_byte = 0;
2936    for line in lines.iter().take(para_start) {
2937        start_byte += line.len() + 1; // +1 for newline
2938    }
2939
2940    let mut end_byte = start_byte;
2941    for line in paragraph_lines.iter() {
2942        end_byte += line.len() + 1; // +1 for newline
2943    }
2944
2945    // Track whether the byte range includes a trailing newline
2946    // (it doesn't if this is the last line and the file doesn't end with newline)
2947    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
2948
2949    // Adjust end_byte if the last line doesn't have a newline
2950    if !includes_trailing_newline {
2951        end_byte -= 1;
2952    }
2953
2954    // Join paragraph lines and reflow
2955    let paragraph_text = paragraph_lines.join("\n");
2956
2957    // Reflow the paragraph using reflow_markdown to handle it properly
2958    let reflowed = reflow_markdown(&paragraph_text, options);
2959
2960    // Ensure reflowed text matches whether the byte range includes a trailing newline
2961    // This is critical: if the range includes a newline, the replacement must too,
2962    // otherwise the next line will get appended to the reflowed paragraph
2963    let reflowed_text = if includes_trailing_newline {
2964        // Range includes newline - ensure reflowed text has one
2965        if reflowed.ends_with('\n') {
2966            reflowed
2967        } else {
2968            format!("{reflowed}\n")
2969        }
2970    } else {
2971        // Range doesn't include newline - ensure reflowed text doesn't have one
2972        if reflowed.ends_with('\n') {
2973            reflowed.trim_end_matches('\n').to_string()
2974        } else {
2975            reflowed
2976        }
2977    };
2978
2979    Some(ParagraphReflow {
2980        start_byte,
2981        end_byte,
2982        reflowed_text,
2983    })
2984}
2985
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises `text_ends_with_abbreviation()` against the abbreviation set
    /// returned by `get_abbreviations(&None)`.
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let abbreviations = get_abbreviations(&None);

        // Recognized: titles plus "i.e." / "e.g."
        let recognized = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognized {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Not recognized
        let rejected = [
            "etc.",       // not in the list (etc doesn't always have period)
            "paradigms.", // ordinary word
            "programs.",  // ordinary word
            "items.",     // ordinary word
            "systems.",   // ordinary word
            "Dr?",        // question mark, not period
            "Mr!",        // exclamation, not period
            "paradigms?", // question mark
            "word",       // no punctuation
            "",           // empty string
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation"
            );
        }
    }

    /// Checks which line prefixes `is_unordered_list_marker()` accepts.
    #[test]
    fn test_is_unordered_list_marker() {
        // Marker followed by content, and lone markers
        let markers = ["- item", "* item", "+ item", "-", "*", "+"];
        for line in markers {
            assert!(is_unordered_list_marker(line), "expected {line:?} to be a list marker");
        }

        let non_markers = [
            "---",        // horizontal rule
            "***",        // horizontal rule
            "- - -",      // horizontal rule
            "* * *",      // horizontal rule
            "*emphasis*", // emphasis, not list
            "-word",      // no space after marker
            "",           // empty
            "text",       // plain text
            "# heading",  // heading
        ];
        for line in non_markers {
            assert!(
                !is_unordered_list_marker(line),
                "expected {line:?} not to be a list marker"
            );
        }
    }

    /// Checks which lines `is_block_boundary()` treats as starting a new block.
    #[test]
    fn test_is_block_boundary() {
        let boundaries = [
            "",                           // empty line
            "# Heading",                  // ATX heading
            "## Level 2",                 // ATX heading
            "```rust",                    // code fence
            "~~~",                        // tilde code fence
            "> quote",                    // blockquote
            "| cell |",                   // table
            "[link]: http://example.com", // reference def
            "---",                        // horizontal rule
            "***",                        // horizontal rule
            "- item",                     // unordered list
            "* item",                     // unordered list
            "+ item",                     // unordered list
            "1. item",                    // ordered list
            "10. item",                   // ordered list
            ": definition",               // definition list
            ":::",                        // div marker
            "::::: {.callout-note}",      // div marker with attrs
        ];
        for line in boundaries {
            assert!(is_block_boundary(line), "expected {line:?} to be a block boundary");
        }

        // Paragraph continuation lines
        let continuations = [
            "regular text",
            "*emphasis*",  // emphasis, not list
            "[link](url)", // inline link, not reference def
            "some words here",
        ];
        for line in continuations {
            assert!(
                !is_block_boundary(line),
                "expected {line:?} not to be a block boundary"
            );
        }
    }

    /// A definition list item following a single-line paragraph must stay on
    /// its own line instead of being merged into that paragraph.
    #[test]
    fn test_definition_list_boundary_in_single_line_paragraph() {
        let input = "Term\n: Definition of the term";
        let options = ReflowOptions {
            line_length: 80,
            ..Default::default()
        };

        let result = reflow_markdown(input, &options);

        // The definition list marker should remain on its own line
        assert!(
            result.contains(": Definition"),
            "Definition list item should not be merged into previous line. Got: {result:?}"
        );

        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(lines.len(), 2, "Should remain two separate lines. Got: {lines:?}");
        assert_eq!(lines[0], "Term");
        assert_eq!(lines[1], ": Definition of the term");
    }

    /// Checks `is_paragraph_boundary()` with `(trimmed, raw line)` pairs.
    #[test]
    fn test_is_paragraph_boundary() {
        let boundaries = [
            // Core block boundary checks are inherited
            ("# Heading", "# Heading"),
            ("- item", "- item"),
            (":::", ":::"),
            (": definition", ": definition"),
            // Indented code blocks (≥4 spaces or tab)
            ("code", "    code"),
            ("code", "\tcode"),
            // Table rows via is_potential_table_row
            ("| a | b |", "| a | b |"),
            ("a | b", "a | b"), // pipe-delimited without leading pipe
        ];
        for (trimmed, line) in boundaries {
            assert!(
                is_paragraph_boundary(trimmed, line),
                "expected {line:?} to be a paragraph boundary"
            );
        }

        let non_boundaries = [
            ("regular text", "regular text"),
            ("text", "  text"), // 2-space indent is not code
        ];
        for (trimmed, line) in non_boundaries {
            assert!(
                !is_paragraph_boundary(trimmed, line),
                "expected {line:?} not to be a paragraph boundary"
            );
        }
    }

    /// Div markers (:::) are paragraph boundaries for `reflow_paragraph_at_line`,
    /// so asking to reflow the marker line itself yields nothing.
    #[test]
    fn test_div_marker_boundary_in_reflow_paragraph_at_line() {
        let content = "Some paragraph text here.\n\n::: {.callout-note}\nThis is a callout.\n:::\n";
        // Line 3 is the div marker
        let div_marker_line = 3;
        let result = reflow_paragraph_at_line(content, div_marker_line, 80);
        assert!(result.is_none(), "Div marker line should not be reflowed");
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs