rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::regex_cache::{
9    DISPLAY_MATH_REGEX, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
10    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
11    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
12    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
13};
14use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
15use std::collections::HashSet;
16
/// Configuration for reflowing a paragraph of Markdown text.
///
/// `Default` yields an 80-column limit with sentence-aware breaking enabled,
/// existing breaks not preserved, sentence-per-line disabled and no custom
/// abbreviations.
#[derive(Debug, Clone)]
pub struct ReflowOptions {
    /// Target line length (`0` is treated by `reflow_line` as "no wrapping")
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line
    pub sentence_per_line: bool,
    /// Custom abbreviations for sentence detection.
    ///
    /// Periods are optional - both "Dr" and "Dr." work the same.
    /// Custom abbreviations are always added to the built-in defaults.
    pub abbreviations: Option<Vec<String>>,
}

impl Default for ReflowOptions {
    /// Returns the default reflow configuration (80 columns, sentence-aware breaks).
    fn default() -> Self {
        Self {
            line_length: 80,
            break_on_sentences: true,
            preserve_breaks: false,
            sentence_per_line: false,
            abbreviations: None,
        }
    }
}
45
/// Build the set of abbreviations used for sentence detection.
///
/// The built-in list is always included; entries from `custom` are merged on
/// top of it. Every entry is lowercased so lookups can be case-insensitive,
/// and custom entries have trailing periods removed ("Dr." and "Dr" are the
/// same). Custom entries that are empty after stripping are ignored.
fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
    // The built-in list is deliberately conservative:
    // - titles that always carry a period and are always followed by a name
    // - Latin forms that always carry periods and introduce examples/references
    // Words that normally take no period (vs, etc) and abbreviations that can
    // legitimately end a sentence (Inc., Ph.D., U.S.) are intentionally absent.
    const BUILT_IN: [&str; 9] = ["Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr", "i.e", "e.g"];

    let defaults = BUILT_IN.iter().map(|abbr| abbr.to_lowercase());

    let extras = custom
        .iter()
        .flatten()
        .map(|abbr| abbr.trim_end_matches('.').to_lowercase())
        .filter(|normalized| !normalized.is_empty());

    defaults.chain(extras).collect()
}
80
/// Check if text ends with a known abbreviation followed by a period.
///
/// Abbreviations only count when followed by a period, not `!` or `?`.
/// The last whitespace-delimited word is compared as a whole, so words that
/// merely end in abbreviation-like letters (e.g. "paradigms" ending in "ms")
/// are not mistaken for abbreviations.
///
/// Leading non-alphanumeric characters on that last word (opening brackets,
/// quotes, emphasis markers) are ignored, so "(e.g.", "**Dr." and "\"Mr."
/// are recognized as well.
///
/// `abbreviations` is expected to hold lowercase entries without the trailing
/// period; the comparison lowercases the candidate word.
///
/// Examples:
///   - "Dr." -> true (abbreviation)
///   - "(e.g." -> true (opening bracket ignored)
///   - "Dr?" -> false (question, not abbreviation)
///   - "paradigms." -> false (not in abbreviation list)
///   - "paradigms?" -> false (question mark, not abbreviation)
///
/// See: Issue #150
fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Abbreviations require a trailing period
    if !text.ends_with('.') {
        return false;
    }

    // Drop the trailing period(s) and isolate the last word
    let without_period = text.trim_end_matches('.');
    let Some(last_word) = without_period.split_whitespace().next_back() else {
        return false;
    };

    // Ignore punctuation/markup glued to the front of the word: "(e.g", "**Dr"
    let candidate = last_word.trim_start_matches(|ch: char| !ch.is_alphanumeric());
    if candidate.is_empty() {
        return false;
    }

    // O(1) HashSet lookup (abbreviations are already lowercase)
    abbreviations.contains(&candidate.to_lowercase())
}
114
/// Returns true when `c` is one of the CJK sentence terminators:
/// 。 (ideographic full stop), ！ (fullwidth exclamation mark) or
/// ？ (fullwidth question mark).
fn is_cjk_sentence_ending(c: char) -> bool {
    ['。', '！', '？'].contains(&c)
}
120
/// Returns true when `c` lies in one of the CJK blocks this module recognizes
/// (Chinese ideographs, Japanese kana, Korean Hangul syllables).
fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code point bounds of each recognized block
    const CJK_BLOCKS: [(char, char); 5] = [
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ];

    CJK_BLOCKS.iter().any(|&(first, last)| first <= c && c <= last)
}
132
133/// Detect if a character position is a sentence boundary
134/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
135/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
136fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
137    let chars: Vec<char> = text.chars().collect();
138
139    if pos + 1 >= chars.len() {
140        return false;
141    }
142
143    let c = chars[pos];
144    let next_char = chars[pos + 1];
145
146    // Check for CJK sentence-ending punctuation (。, ！, ？)
147    // CJK punctuation doesn't require space or uppercase after it
148    if is_cjk_sentence_ending(c) {
149        // Skip any trailing emphasis/strikethrough markers
150        let mut after_punct_pos = pos + 1;
151        while after_punct_pos < chars.len()
152            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
153        {
154            after_punct_pos += 1;
155        }
156
157        // Skip whitespace
158        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
159            after_punct_pos += 1;
160        }
161
162        // Check if we have more content (any non-whitespace)
163        if after_punct_pos >= chars.len() {
164            return false;
165        }
166
167        // Skip leading emphasis/strikethrough markers
168        while after_punct_pos < chars.len()
169            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
170        {
171            after_punct_pos += 1;
172        }
173
174        if after_punct_pos >= chars.len() {
175            return false;
176        }
177
178        // For CJK, we accept any character as the start of the next sentence
179        // (no uppercase requirement, since CJK doesn't have case)
180        return true;
181    }
182
183    // Check for ASCII sentence-ending punctuation
184    if c != '.' && c != '!' && c != '?' {
185        return false;
186    }
187
188    // Must be followed by space, or by emphasis/strikethrough marker followed by space
189    let (_space_pos, after_space_pos) = if next_char == ' ' {
190        // Normal case: punctuation followed by space
191        (pos + 1, pos + 2)
192    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
193        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
194        (pos + 2, pos + 3)
195    } else if (next_char == '*' || next_char == '_')
196        && pos + 3 < chars.len()
197        && chars[pos + 2] == next_char
198        && chars[pos + 3] == ' '
199    {
200        // Sentence ends with bold: "sentence.** " or "sentence.__ "
201        (pos + 3, pos + 4)
202    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
203        // Sentence ends with strikethrough: "sentence.~~ "
204        (pos + 3, pos + 4)
205    } else {
206        return false;
207    };
208
209    // Skip all whitespace after the space to find the start of the next sentence
210    let mut next_char_pos = after_space_pos;
211    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
212        next_char_pos += 1;
213    }
214
215    // Check if we reached the end of the string
216    if next_char_pos >= chars.len() {
217        return false;
218    }
219
220    // Skip leading emphasis/strikethrough markers to find the actual first letter
221    let mut first_letter_pos = next_char_pos;
222    while first_letter_pos < chars.len()
223        && (chars[first_letter_pos] == '*' || chars[first_letter_pos] == '_' || chars[first_letter_pos] == '~')
224    {
225        first_letter_pos += 1;
226    }
227
228    // Check if we reached the end after skipping emphasis
229    if first_letter_pos >= chars.len() {
230        return false;
231    }
232
233    // First character of next sentence must be uppercase or CJK
234    let first_char = chars[first_letter_pos];
235    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
236        return false;
237    }
238
239    // Look back to check for common abbreviations (only applies to periods)
240    if pos > 0 && c == '.' {
241        // Check if the text up to and including this period ends with an abbreviation
242        // Note: text[..=pos] includes the character at pos (the period)
243        if text_ends_with_abbreviation(&text[..=pos], abbreviations) {
244            return false;
245        }
246
247        // Check for decimal numbers (e.g., "3.14")
248        // Make sure to check if first_letter_pos is within bounds
249        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
250            return false;
251        }
252    }
253    true
254}
255
256/// Split text into sentences
257pub fn split_into_sentences(text: &str) -> Vec<String> {
258    split_into_sentences_custom(text, &None)
259}
260
261/// Split text into sentences with custom abbreviations
262pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
263    let abbreviations = get_abbreviations(custom_abbreviations);
264    split_into_sentences_with_set(text, &abbreviations)
265}
266
267/// Internal function to split text into sentences with a pre-computed abbreviations set
268/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
269fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
270    let mut sentences = Vec::new();
271    let mut current_sentence = String::new();
272    let mut chars = text.chars().peekable();
273    let mut pos = 0;
274
275    while let Some(c) = chars.next() {
276        current_sentence.push(c);
277
278        if is_sentence_boundary(text, pos, abbreviations) {
279            // Consume any trailing emphasis/strikethrough markers (they belong to the current sentence)
280            while chars.peek() == Some(&'*') || chars.peek() == Some(&'_') || chars.peek() == Some(&'~') {
281                current_sentence.push(chars.next().unwrap());
282                pos += 1;
283            }
284
285            // Consume the space after the sentence
286            if chars.peek() == Some(&' ') {
287                chars.next();
288                pos += 1;
289            }
290
291            sentences.push(current_sentence.trim().to_string());
292            current_sentence.clear();
293        }
294
295        pos += 1;
296    }
297
298    // Add any remaining text as the last sentence
299    if !current_sentence.trim().is_empty() {
300        sentences.push(current_sentence.trim().to_string());
301    }
302    sentences
303}
304
/// Returns true when `line` is a horizontal rule: at least three of the same
/// marker (`-`, `_` or `*`), optionally separated by spaces, and nothing else.
///
/// The line must start with the marker itself; leading whitespace is not
/// tolerated here.
fn is_horizontal_rule(line: &str) -> bool {
    if line.len() < 3 {
        return false;
    }

    // The first character decides which marker the whole line must use
    let marker = match line.chars().next() {
        Some(first @ ('-' | '_' | '*')) => first,
        _ => return false,
    };

    // Single pass: count markers, tolerate spaces, reject anything else
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
333
/// Check if a line is a numbered list item (e.g., "1. ", "10. ").
///
/// The line must start with one or more ASCII digits, followed by a period,
/// followed by either a space or the end of the line. Only ASCII digits are
/// accepted: other Unicode numerics (such as "½" or "٣") are not valid
/// ordered-list markers in Markdown.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are single bytes, so the count is also a valid byte offset
    let digit_count = line.bytes().take_while(u8::is_ascii_digit).count();
    if digit_count == 0 {
        return false;
    }

    match line[digit_count..].strip_prefix('.') {
        // After the period: end of line or a space
        Some(rest) => rest.is_empty() || rest.starts_with(' '),
        None => false,
    }
}
356
/// Returns true when `line` ends with a Markdown hard line break.
///
/// Two forms are recognized:
/// 1. Two or more trailing spaces
/// 2. A trailing backslash
///
/// A single trailing `\r` (from CRLF line endings) is ignored first.
fn has_hard_break(line: &str) -> bool {
    let body = match line.strip_suffix('\r') {
        Some(stripped) => stripped,
        None => line,
    };
    // Both markers are ASCII, so inspecting the trailing bytes is sufficient
    matches!(body.as_bytes(), [.., b'\\'] | [.., b' ', b' '])
}
366
/// Remove trailing whitespace from `s` without destroying a hard line break.
///
/// - A trailing backslash (mdformat-style hard break) is returned untouched.
/// - Two or more trailing spaces (traditional hard break) are normalized to
///   exactly two spaces after the content.
/// - Anything else simply has its trailing whitespace removed.
///
/// A single trailing `\r` from CRLF line endings is dropped before any of the
/// checks. Input that is entirely whitespace yields an empty string.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as-is
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();
    if line.ends_with("  ") && !content.is_empty() {
        // Traditional hard break: content followed by exactly two spaces
        let mut out = String::with_capacity(content.len() + 2);
        out.push_str(content);
        out.push_str("  ");
        out
    } else {
        // No hard break (or nothing but whitespace): plain trim
        content.to_owned()
    }
}
397
398pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
399    // For sentence-per-line mode, always process regardless of length
400    if options.sentence_per_line {
401        let elements = parse_markdown_elements(line);
402        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
403    }
404
405    // Quick check: if line is already short enough or no wrapping requested, return as-is
406    // line_length = 0 means no wrapping (unlimited line length)
407    if options.line_length == 0 || line.chars().count() <= options.line_length {
408        return vec![line.to_string()];
409    }
410
411    // Parse the markdown to identify elements
412    let elements = parse_markdown_elements(line);
413
414    // Reflow the elements into lines
415    reflow_elements(&elements, options)
416}
417
/// Image source in a linked image structure, i.e. how the inner `![alt]...`
/// part of a clickable image names its picture.
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Inline image URL: ![alt](url) - holds the URL
    Inline(String),
    /// Reference image: ![alt][ref] - holds the reference label
    Reference(String),
}
426
/// Link target in a linked image structure, i.e. where the outer link that
/// wraps the image points.
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Inline link URL: ](url) - holds the URL
    Inline(String),
    /// Reference link: ][ref] - holds the reference label
    Reference(String),
}
435
/// Represents a piece of content in the markdown.
///
/// Each variant stores the element's inner parts without the surrounding
/// syntax where that syntax is fixed (the `Display` impl adds it back);
/// `HtmlTag`, `HtmlEntity` and `HugoShortcode` store their complete source
/// text and are rendered verbatim.
#[derive(Debug, Clone)]
enum Element {
    /// Plain text that can be wrapped
    Text(String),
    /// A complete markdown inline link [text](url)
    Link { text: String, url: String },
    /// A complete markdown reference link [text][ref]
    ReferenceLink { text: String, reference: String },
    /// A complete markdown empty reference link [text][]
    EmptyReferenceLink { text: String },
    /// A complete markdown shortcut reference link [ref]
    ShortcutReference { reference: String },
    /// A complete markdown inline image ![alt](url)
    InlineImage { alt: String, url: String },
    /// A complete markdown reference image ![alt][ref]
    ReferenceImage { alt: String, reference: String },
    /// A complete markdown empty reference image ![alt][]
    EmptyReferenceImage { alt: String },
    /// A clickable image badge in any of 4 forms:
    /// - [![alt](img-url)](link-url)
    /// - [![alt][img-ref]](link-url)
    /// - [![alt](img-url)][link-ref]
    /// - [![alt][img-ref]][link-ref]
    LinkedImage {
        /// Alt text of the inner image
        alt: String,
        /// How the inner image is specified (inline URL or reference)
        img_source: LinkedImageSource,
        /// Where the outer link points (inline URL or reference)
        link_target: LinkedImageTarget,
    },
    /// Footnote reference [^note]
    FootnoteReference { note: String },
    /// Strikethrough text ~~text~~
    Strikethrough(String),
    /// Wiki-style link [[wiki]] or [[wiki|text]]
    WikiLink(String),
    /// Inline math $math$
    InlineMath(String),
    /// Display math $$math$$
    DisplayMath(String),
    /// Emoji shortcode :emoji:
    EmojiShortcode(String),
    /// HTML tag <tag> or </tag> or <tag/> (stored including the angle brackets)
    HtmlTag(String),
    /// HTML entity &nbsp; or &#123; (stored complete)
    HtmlEntity(String),
    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}} (stored complete)
    HugoShortcode(String),
    /// Inline code `code`
    Code(String),
    /// Bold text **text** or __text__
    Bold {
        content: String,
        /// True if underscore markers (__), false for asterisks (**)
        underscore: bool,
    },
    /// Italic text *text* or _text_
    Italic {
        content: String,
        /// True if underscore marker (_), false for asterisk (*)
        underscore: bool,
    },
}
498
499impl std::fmt::Display for Element {
500    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
501        match self {
502            Element::Text(s) => write!(f, "{s}"),
503            Element::Link { text, url } => write!(f, "[{text}]({url})"),
504            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
505            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
506            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
507            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
508            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
509            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
510            Element::LinkedImage {
511                alt,
512                img_source,
513                link_target,
514            } => {
515                // Build the image part: ![alt](url) or ![alt][ref]
516                let img_part = match img_source {
517                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
518                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
519                };
520                // Build the link part: (url) or [ref]
521                match link_target {
522                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
523                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
524                }
525            }
526            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
527            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
528            Element::WikiLink(s) => write!(f, "[[{s}]]"),
529            Element::InlineMath(s) => write!(f, "${s}$"),
530            Element::DisplayMath(s) => write!(f, "$${s}$$"),
531            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
532            Element::HtmlTag(s) => write!(f, "{s}"),
533            Element::HtmlEntity(s) => write!(f, "{s}"),
534            Element::HugoShortcode(s) => write!(f, "{s}"),
535            Element::Code(s) => write!(f, "`{s}`"),
536            Element::Bold { content, underscore } => {
537                if *underscore {
538                    write!(f, "__{content}__")
539                } else {
540                    write!(f, "**{content}**")
541                }
542            }
543            Element::Italic { content, underscore } => {
544                if *underscore {
545                    write!(f, "_{content}_")
546                } else {
547                    write!(f, "*{content}*")
548                }
549            }
550        }
551    }
552}
553
554impl Element {
555    fn len(&self) -> usize {
556        match self {
557            Element::Text(s) => s.chars().count(),
558            Element::Link { text, url } => text.chars().count() + url.chars().count() + 4, // [text](url)
559            Element::ReferenceLink { text, reference } => text.chars().count() + reference.chars().count() + 4, // [text][ref]
560            Element::EmptyReferenceLink { text } => text.chars().count() + 4, // [text][]
561            Element::ShortcutReference { reference } => reference.chars().count() + 2, // [ref]
562            Element::InlineImage { alt, url } => alt.chars().count() + url.chars().count() + 5, // ![alt](url)
563            Element::ReferenceImage { alt, reference } => alt.chars().count() + reference.chars().count() + 5, // ![alt][ref]
564            Element::EmptyReferenceImage { alt } => alt.chars().count() + 5, // ![alt][]
565            Element::LinkedImage {
566                alt,
567                img_source,
568                link_target,
569            } => {
570                // Calculate length based on variant
571                // Base: [ + ![alt] + ] = 4 chars for outer brackets and !
572                let alt_len = alt.chars().count();
573                let img_len = match img_source {
574                    LinkedImageSource::Inline(url) => url.chars().count() + 2, // (url)
575                    LinkedImageSource::Reference(r) => r.chars().count() + 2,  // [ref]
576                };
577                let link_len = match link_target {
578                    LinkedImageTarget::Inline(url) => url.chars().count() + 2, // (url)
579                    LinkedImageTarget::Reference(r) => r.chars().count() + 2,  // [ref]
580                };
581                // [![alt](img)](link) = [ + ! + [ + alt + ] + (img) + ] + (link)
582                //                     = 1 + 1 + 1 + alt + 1 + img_len + 1 + link_len = 5 + alt + img + link
583                5 + alt_len + img_len + link_len
584            }
585            Element::FootnoteReference { note } => note.chars().count() + 3, // [^note]
586            Element::Strikethrough(s) => s.chars().count() + 4,              // ~~text~~
587            Element::WikiLink(s) => s.chars().count() + 4,                   // [[wiki]]
588            Element::InlineMath(s) => s.chars().count() + 2,                 // $math$
589            Element::DisplayMath(s) => s.chars().count() + 4,                // $$math$$
590            Element::EmojiShortcode(s) => s.chars().count() + 2,             // :emoji:
591            Element::HtmlTag(s) => s.chars().count(),                        // <tag> - already includes brackets
592            Element::HtmlEntity(s) => s.chars().count(),                     // &nbsp; - already complete
593            Element::HugoShortcode(s) => s.chars().count(),                  // {{< ... >}} - already complete
594            Element::Code(s) => s.chars().count() + 2,                       // `code`
595            Element::Bold { content, .. } => content.chars().count() + 4,    // **text** or __text__
596            Element::Italic { content, .. } => content.chars().count() + 2,  // *text* or _text_
597        }
598    }
599}
600
/// An emphasis or formatting span parsed by pulldown-cmark.
///
/// `start` and `end` are byte offsets into the text that was parsed and
/// cover the whole span including its delimiters; `content` is the text
/// between the delimiters.
#[derive(Debug, Clone)]
struct EmphasisSpan {
    /// Byte offset where the emphasis starts (including markers)
    start: usize,
    /// Byte offset where the emphasis ends (after closing markers)
    end: usize,
    /// The content inside the emphasis markers
    content: String,
    /// Whether this is strong (bold) emphasis
    is_strong: bool,
    /// Whether this is strikethrough (~~text~~)
    is_strikethrough: bool,
    /// Whether the original used underscore markers (for emphasis only;
    /// always false for strikethrough spans)
    uses_underscore: bool,
}
617
618/// Extract emphasis and strikethrough spans from text using pulldown-cmark
619///
620/// This provides CommonMark-compliant emphasis parsing, correctly handling:
621/// - Nested emphasis like `*text **bold** more*`
622/// - Left/right flanking delimiter rules
623/// - Underscore vs asterisk markers
624/// - GFM strikethrough (~~text~~)
625///
626/// Returns spans sorted by start position.
627fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
628    let mut spans = Vec::new();
629    let mut options = Options::empty();
630    options.insert(Options::ENABLE_STRIKETHROUGH);
631
632    // Stacks to track nested formatting with their start positions
633    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
634    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
635    let mut strikethrough_stack: Vec<usize> = Vec::new();
636
637    let parser = Parser::new_ext(text, options).into_offset_iter();
638
639    for (event, range) in parser {
640        match event {
641            Event::Start(Tag::Emphasis) => {
642                // Check if this uses underscore by looking at the original text
643                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
644                emphasis_stack.push((range.start, uses_underscore));
645            }
646            Event::End(TagEnd::Emphasis) => {
647                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
648                    // Extract content between the markers (1 char marker on each side)
649                    let content_start = start_byte + 1;
650                    let content_end = range.end - 1;
651                    if content_end > content_start
652                        && let Some(content) = text.get(content_start..content_end)
653                    {
654                        spans.push(EmphasisSpan {
655                            start: start_byte,
656                            end: range.end,
657                            content: content.to_string(),
658                            is_strong: false,
659                            is_strikethrough: false,
660                            uses_underscore,
661                        });
662                    }
663                }
664            }
665            Event::Start(Tag::Strong) => {
666                // Check if this uses underscore by looking at the original text
667                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
668                strong_stack.push((range.start, uses_underscore));
669            }
670            Event::End(TagEnd::Strong) => {
671                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
672                    // Extract content between the markers (2 char marker on each side)
673                    let content_start = start_byte + 2;
674                    let content_end = range.end - 2;
675                    if content_end > content_start
676                        && let Some(content) = text.get(content_start..content_end)
677                    {
678                        spans.push(EmphasisSpan {
679                            start: start_byte,
680                            end: range.end,
681                            content: content.to_string(),
682                            is_strong: true,
683                            is_strikethrough: false,
684                            uses_underscore,
685                        });
686                    }
687                }
688            }
689            Event::Start(Tag::Strikethrough) => {
690                strikethrough_stack.push(range.start);
691            }
692            Event::End(TagEnd::Strikethrough) => {
693                if let Some(start_byte) = strikethrough_stack.pop() {
694                    // Extract content between the ~~ markers (2 char marker on each side)
695                    let content_start = start_byte + 2;
696                    let content_end = range.end - 2;
697                    if content_end > content_start
698                        && let Some(content) = text.get(content_start..content_end)
699                    {
700                        spans.push(EmphasisSpan {
701                            start: start_byte,
702                            end: range.end,
703                            content: content.to_string(),
704                            is_strong: false,
705                            is_strikethrough: true,
706                            uses_underscore: false,
707                        });
708                    }
709                }
710            }
711            _ => {}
712        }
713    }
714
715    // Sort by start position
716    spans.sort_by_key(|s| s.start);
717    spans
718}
719
720/// Parse markdown elements from text preserving the raw syntax
721///
722/// Detection order is critical:
723/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
724/// 2. Inline images ![alt](url) - before links to handle ! prefix
725/// 3. Reference images ![alt][ref] - before reference links
726/// 4. Inline links [text](url) - before reference links
727/// 5. Reference links [text][ref] - before shortcut references
728/// 6. Shortcut reference links [ref] - detected last to avoid false positives
729/// 7. Other elements (code, bold, italic, etc.) - processed normally
730fn parse_markdown_elements(text: &str) -> Vec<Element> {
731    let mut elements = Vec::new();
732    let mut remaining = text;
733
734    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
735    let emphasis_spans = extract_emphasis_spans(text);
736
737    while !remaining.is_empty() {
738        // Calculate current byte offset in original text
739        let current_offset = text.len() - remaining.len();
740        // Find the earliest occurrence of any markdown pattern
741        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
742
743        // Check for linked images FIRST (all 4 variants)
744        // Quick literal check: only run expensive regexes if we might have a linked image
745        // Pattern starts with "[!" so check for that first
746        if remaining.contains("[!") {
747            // Pattern 1: [![alt](img)](link) - inline image in inline link
748            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
749                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
750            {
751                earliest_match = Some((m.start(), "linked_image_ii", m));
752            }
753
754            // Pattern 2: [![alt][ref]](link) - reference image in inline link
755            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
756                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
757            {
758                earliest_match = Some((m.start(), "linked_image_ri", m));
759            }
760
761            // Pattern 3: [![alt](img)][ref] - inline image in reference link
762            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
763                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
764            {
765                earliest_match = Some((m.start(), "linked_image_ir", m));
766            }
767
768            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
769            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
770                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
771            {
772                earliest_match = Some((m.start(), "linked_image_rr", m));
773            }
774        }
775
776        // Check for images (they start with ! so should be detected before links)
777        // Inline images - ![alt](url)
778        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
779            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
780        {
781            earliest_match = Some((m.start(), "inline_image", m));
782        }
783
784        // Reference images - ![alt][ref]
785        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
786            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
787        {
788            earliest_match = Some((m.start(), "ref_image", m));
789        }
790
791        // Check for footnote references - [^note]
792        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
793            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
794        {
795            earliest_match = Some((m.start(), "footnote_ref", m));
796        }
797
798        // Check for inline links - [text](url)
799        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
800            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
801        {
802            earliest_match = Some((m.start(), "inline_link", m));
803        }
804
805        // Check for reference links - [text][ref]
806        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
807            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
808        {
809            earliest_match = Some((m.start(), "ref_link", m));
810        }
811
812        // Check for shortcut reference links - [ref]
813        // Only check if we haven't found an earlier pattern that would conflict
814        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
815            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
816        {
817            earliest_match = Some((m.start(), "shortcut_ref", m));
818        }
819
820        // Check for wiki-style links - [[wiki]]
821        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
822            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
823        {
824            earliest_match = Some((m.start(), "wiki_link", m));
825        }
826
827        // Check for display math first (before inline) - $$math$$
828        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
829            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
830        {
831            earliest_match = Some((m.start(), "display_math", m));
832        }
833
834        // Check for inline math - $math$
835        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
836            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
837        {
838            earliest_match = Some((m.start(), "inline_math", m));
839        }
840
841        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
842
843        // Check for emoji shortcodes - :emoji:
844        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
845            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
846        {
847            earliest_match = Some((m.start(), "emoji", m));
848        }
849
850        // Check for HTML entities - &nbsp; etc
851        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
852            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
853        {
854            earliest_match = Some((m.start(), "html_entity", m));
855        }
856
857        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
858        // Must be checked before other patterns to avoid false sentence breaks
859        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
860            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
861        {
862            earliest_match = Some((m.start(), "hugo_shortcode", m));
863        }
864
865        // Check for HTML tags - <tag> </tag> <tag/>
866        // But exclude autolinks like <https://...> or <mailto:...>
867        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
868            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
869        {
870            // Check if this is an autolink (starts with protocol or mailto:)
871            let matched_text = &remaining[m.start()..m.end()];
872            let is_autolink = matched_text.starts_with("<http://")
873                || matched_text.starts_with("<https://")
874                || matched_text.starts_with("<mailto:")
875                || matched_text.starts_with("<ftp://")
876                || matched_text.starts_with("<ftps://");
877
878            if !is_autolink {
879                earliest_match = Some((m.start(), "html_tag", m));
880            }
881        }
882
883        // Find earliest non-link special characters
884        let mut next_special = remaining.len();
885        let mut special_type = "";
886        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
887
888        // Check for code spans (not handled by pulldown-cmark in this context)
889        if let Some(pos) = remaining.find('`')
890            && pos < next_special
891        {
892            next_special = pos;
893            special_type = "code";
894        }
895
896        // Check for emphasis using pulldown-cmark's pre-extracted spans
897        // Find the earliest emphasis span that starts within remaining text
898        for span in &emphasis_spans {
899            if span.start >= current_offset && span.start < current_offset + remaining.len() {
900                let pos_in_remaining = span.start - current_offset;
901                if pos_in_remaining < next_special {
902                    next_special = pos_in_remaining;
903                    special_type = "pulldown_emphasis";
904                    pulldown_emphasis = Some(span);
905                }
906                break; // Spans are sorted by start position, so first match is earliest
907            }
908        }
909
910        // Determine which pattern to process first
911        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
912            pos < next_special
913        } else {
914            false
915        };
916
917        if should_process_markdown_link {
918            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
919
920            // Add any text before the match
921            if pos > 0 {
922                elements.push(Element::Text(remaining[..pos].to_string()));
923            }
924
925            // Process the matched pattern
926            match pattern_type {
927                // Pattern 1: [![alt](img)](link) - inline image in inline link
928                "linked_image_ii" => {
929                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
930                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
931                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
932                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
933                        elements.push(Element::LinkedImage {
934                            alt: alt.to_string(),
935                            img_source: LinkedImageSource::Inline(img_url.to_string()),
936                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
937                        });
938                        remaining = &remaining[match_obj.end()..];
939                    } else {
940                        elements.push(Element::Text("[".to_string()));
941                        remaining = &remaining[1..];
942                    }
943                }
944                // Pattern 2: [![alt][ref]](link) - reference image in inline link
945                "linked_image_ri" => {
946                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
947                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
948                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
949                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
950                        elements.push(Element::LinkedImage {
951                            alt: alt.to_string(),
952                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
953                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
954                        });
955                        remaining = &remaining[match_obj.end()..];
956                    } else {
957                        elements.push(Element::Text("[".to_string()));
958                        remaining = &remaining[1..];
959                    }
960                }
961                // Pattern 3: [![alt](img)][ref] - inline image in reference link
962                "linked_image_ir" => {
963                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
964                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
965                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
966                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
967                        elements.push(Element::LinkedImage {
968                            alt: alt.to_string(),
969                            img_source: LinkedImageSource::Inline(img_url.to_string()),
970                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
971                        });
972                        remaining = &remaining[match_obj.end()..];
973                    } else {
974                        elements.push(Element::Text("[".to_string()));
975                        remaining = &remaining[1..];
976                    }
977                }
978                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
979                "linked_image_rr" => {
980                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
981                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
982                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
983                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
984                        elements.push(Element::LinkedImage {
985                            alt: alt.to_string(),
986                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
987                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
988                        });
989                        remaining = &remaining[match_obj.end()..];
990                    } else {
991                        elements.push(Element::Text("[".to_string()));
992                        remaining = &remaining[1..];
993                    }
994                }
995                "inline_image" => {
996                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
997                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
998                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
999                        elements.push(Element::InlineImage {
1000                            alt: alt.to_string(),
1001                            url: url.to_string(),
1002                        });
1003                        remaining = &remaining[match_obj.end()..];
1004                    } else {
1005                        elements.push(Element::Text("!".to_string()));
1006                        remaining = &remaining[1..];
1007                    }
1008                }
1009                "ref_image" => {
1010                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
1011                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1012                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1013
1014                        if reference.is_empty() {
1015                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
1016                        } else {
1017                            elements.push(Element::ReferenceImage {
1018                                alt: alt.to_string(),
1019                                reference: reference.to_string(),
1020                            });
1021                        }
1022                        remaining = &remaining[match_obj.end()..];
1023                    } else {
1024                        elements.push(Element::Text("!".to_string()));
1025                        remaining = &remaining[1..];
1026                    }
1027                }
1028                "footnote_ref" => {
1029                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
1030                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1031                        elements.push(Element::FootnoteReference { note: note.to_string() });
1032                        remaining = &remaining[match_obj.end()..];
1033                    } else {
1034                        elements.push(Element::Text("[".to_string()));
1035                        remaining = &remaining[1..];
1036                    }
1037                }
1038                "inline_link" => {
1039                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
1040                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1041                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1042                        elements.push(Element::Link {
1043                            text: text.to_string(),
1044                            url: url.to_string(),
1045                        });
1046                        remaining = &remaining[match_obj.end()..];
1047                    } else {
1048                        // Fallback - shouldn't happen
1049                        elements.push(Element::Text("[".to_string()));
1050                        remaining = &remaining[1..];
1051                    }
1052                }
1053                "ref_link" => {
1054                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1055                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1056                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1057
1058                        if reference.is_empty() {
1059                            // Empty reference link [text][]
1060                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1061                        } else {
1062                            // Regular reference link [text][ref]
1063                            elements.push(Element::ReferenceLink {
1064                                text: text.to_string(),
1065                                reference: reference.to_string(),
1066                            });
1067                        }
1068                        remaining = &remaining[match_obj.end()..];
1069                    } else {
1070                        // Fallback - shouldn't happen
1071                        elements.push(Element::Text("[".to_string()));
1072                        remaining = &remaining[1..];
1073                    }
1074                }
1075                "shortcut_ref" => {
1076                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1077                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1078                        elements.push(Element::ShortcutReference {
1079                            reference: reference.to_string(),
1080                        });
1081                        remaining = &remaining[match_obj.end()..];
1082                    } else {
1083                        // Fallback - shouldn't happen
1084                        elements.push(Element::Text("[".to_string()));
1085                        remaining = &remaining[1..];
1086                    }
1087                }
1088                "wiki_link" => {
1089                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1090                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1091                        elements.push(Element::WikiLink(content.to_string()));
1092                        remaining = &remaining[match_obj.end()..];
1093                    } else {
1094                        elements.push(Element::Text("[[".to_string()));
1095                        remaining = &remaining[2..];
1096                    }
1097                }
1098                "display_math" => {
1099                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1100                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1101                        elements.push(Element::DisplayMath(math.to_string()));
1102                        remaining = &remaining[match_obj.end()..];
1103                    } else {
1104                        elements.push(Element::Text("$$".to_string()));
1105                        remaining = &remaining[2..];
1106                    }
1107                }
1108                "inline_math" => {
1109                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1110                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1111                        elements.push(Element::InlineMath(math.to_string()));
1112                        remaining = &remaining[match_obj.end()..];
1113                    } else {
1114                        elements.push(Element::Text("$".to_string()));
1115                        remaining = &remaining[1..];
1116                    }
1117                }
1118                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1119                "emoji" => {
1120                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1121                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1122                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1123                        remaining = &remaining[match_obj.end()..];
1124                    } else {
1125                        elements.push(Element::Text(":".to_string()));
1126                        remaining = &remaining[1..];
1127                    }
1128                }
1129                "html_entity" => {
1130                    // HTML entities are captured whole
1131                    elements.push(Element::HtmlEntity(remaining[..match_obj.end()].to_string()));
1132                    remaining = &remaining[match_obj.end()..];
1133                }
1134                "hugo_shortcode" => {
1135                    // Hugo shortcodes are atomic elements - preserve them exactly
1136                    elements.push(Element::HugoShortcode(remaining[..match_obj.end()].to_string()));
1137                    remaining = &remaining[match_obj.end()..];
1138                }
1139                "html_tag" => {
1140                    // HTML tags are captured whole
1141                    elements.push(Element::HtmlTag(remaining[..match_obj.end()].to_string()));
1142                    remaining = &remaining[match_obj.end()..];
1143                }
1144                _ => {
1145                    // Unknown pattern, treat as text
1146                    elements.push(Element::Text("[".to_string()));
1147                    remaining = &remaining[1..];
1148                }
1149            }
1150        } else {
1151            // Process non-link special characters
1152
1153            // Add any text before the special character
1154            if next_special > 0 && next_special < remaining.len() {
1155                elements.push(Element::Text(remaining[..next_special].to_string()));
1156                remaining = &remaining[next_special..];
1157            }
1158
1159            // Process the special element
1160            match special_type {
1161                "code" => {
1162                    // Find end of code
1163                    if let Some(code_end) = remaining[1..].find('`') {
1164                        let code = &remaining[1..1 + code_end];
1165                        elements.push(Element::Code(code.to_string()));
1166                        remaining = &remaining[1 + code_end + 1..];
1167                    } else {
1168                        // No closing backtick, treat as text
1169                        elements.push(Element::Text(remaining.to_string()));
1170                        break;
1171                    }
1172                }
1173                "pulldown_emphasis" => {
1174                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1175                    if let Some(span) = pulldown_emphasis {
1176                        let span_len = span.end - span.start;
1177                        if span.is_strikethrough {
1178                            elements.push(Element::Strikethrough(span.content.clone()));
1179                        } else if span.is_strong {
1180                            elements.push(Element::Bold {
1181                                content: span.content.clone(),
1182                                underscore: span.uses_underscore,
1183                            });
1184                        } else {
1185                            elements.push(Element::Italic {
1186                                content: span.content.clone(),
1187                                underscore: span.uses_underscore,
1188                            });
1189                        }
1190                        remaining = &remaining[span_len..];
1191                    } else {
1192                        // Fallback - shouldn't happen
1193                        elements.push(Element::Text(remaining[..1].to_string()));
1194                        remaining = &remaining[1..];
1195                    }
1196                }
1197                _ => {
1198                    // No special elements found, add all remaining text
1199                    elements.push(Element::Text(remaining.to_string()));
1200                    break;
1201                }
1202            }
1203        }
1204    }
1205
1206    elements
1207}
1208
1209/// Reflow elements for sentence-per-line mode
1210fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1211    let abbreviations = get_abbreviations(custom_abbreviations);
1212    let mut lines = Vec::new();
1213    let mut current_line = String::new();
1214
1215    for element in elements.iter() {
1216        let element_str = format!("{element}");
1217
1218        // For text elements, split into sentences
1219        if let Element::Text(text) = element {
1220            // Simply append text - it already has correct spacing from tokenization
1221            let combined = format!("{current_line}{text}");
1222            // Use the pre-computed abbreviations set to avoid redundant computation
1223            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1224
1225            if sentences.len() > 1 {
1226                // We found sentence boundaries
1227                for (i, sentence) in sentences.iter().enumerate() {
1228                    if i == 0 {
1229                        // First sentence might continue from previous elements
1230                        // But check if it ends with an abbreviation
1231                        let trimmed = sentence.trim();
1232
1233                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1234                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1235                            current_line = sentence.to_string();
1236                        } else {
1237                            // Normal case - emit the first sentence
1238                            lines.push(sentence.to_string());
1239                            current_line.clear();
1240                        }
1241                    } else if i == sentences.len() - 1 {
1242                        // Last sentence: check if it's complete or incomplete
1243                        let trimmed = sentence.trim();
1244                        let ends_with_sentence_punct =
1245                            trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1246
1247                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1248                            // Complete sentence - emit it immediately
1249                            lines.push(sentence.to_string());
1250                            current_line.clear();
1251                        } else {
1252                            // Incomplete sentence - save for next iteration
1253                            current_line = sentence.to_string();
1254                        }
1255                    } else {
1256                        // Complete sentences in the middle
1257                        lines.push(sentence.to_string());
1258                    }
1259                }
1260            } else {
1261                // Single sentence - check if it's complete
1262                let trimmed = combined.trim();
1263                let ends_with_sentence_punct =
1264                    trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1265
1266                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1267                    // Complete single sentence - emit it
1268                    lines.push(trimmed.to_string());
1269                    current_line.clear();
1270                } else {
1271                    // Incomplete sentence - continue accumulating
1272                    current_line = combined;
1273                }
1274            }
1275        } else if let Element::Italic { content, underscore } = element {
1276            // Handle italic elements - may contain multiple sentences that need continuation
1277            let marker = if *underscore { "_" } else { "*" };
1278            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1279        } else if let Element::Bold { content, underscore } = element {
1280            // Handle bold elements - may contain multiple sentences that need continuation
1281            let marker = if *underscore { "__" } else { "**" };
1282            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1283        } else if let Element::Strikethrough(content) = element {
1284            // Handle strikethrough elements - may contain multiple sentences that need continuation
1285            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1286        } else {
1287            // Non-text, non-emphasis elements (Code, Links, etc.)
1288            // Add space before element if needed (unless it's after an opening paren/bracket)
1289            if !current_line.is_empty()
1290                && !current_line.ends_with(' ')
1291                && !current_line.ends_with('(')
1292                && !current_line.ends_with('[')
1293            {
1294                current_line.push(' ');
1295            }
1296            current_line.push_str(&element_str);
1297        }
1298    }
1299
1300    // Add any remaining content
1301    if !current_line.is_empty() {
1302        lines.push(current_line.trim().to_string());
1303    }
1304    lines
1305}
1306
1307/// Handle splitting emphasis content at sentence boundaries while preserving markers
1308fn handle_emphasis_sentence_split(
1309    content: &str,
1310    marker: &str,
1311    abbreviations: &HashSet<String>,
1312    current_line: &mut String,
1313    lines: &mut Vec<String>,
1314) {
1315    // Split the emphasis content into sentences
1316    let sentences = split_into_sentences_with_set(content, abbreviations);
1317
1318    if sentences.len() <= 1 {
1319        // Single sentence or no boundaries - treat as atomic
1320        if !current_line.is_empty()
1321            && !current_line.ends_with(' ')
1322            && !current_line.ends_with('(')
1323            && !current_line.ends_with('[')
1324        {
1325            current_line.push(' ');
1326        }
1327        current_line.push_str(marker);
1328        current_line.push_str(content);
1329        current_line.push_str(marker);
1330
1331        // Check if the emphasis content ends with sentence punctuation - if so, emit
1332        let trimmed = content.trim();
1333        let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1334        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1335            lines.push(current_line.clone());
1336            current_line.clear();
1337        }
1338    } else {
1339        // Multiple sentences - each gets its own emphasis markers
1340        for (i, sentence) in sentences.iter().enumerate() {
1341            let trimmed = sentence.trim();
1342            if trimmed.is_empty() {
1343                continue;
1344            }
1345
1346            if i == 0 {
1347                // First sentence: combine with current_line and emit
1348                if !current_line.is_empty()
1349                    && !current_line.ends_with(' ')
1350                    && !current_line.ends_with('(')
1351                    && !current_line.ends_with('[')
1352                {
1353                    current_line.push(' ');
1354                }
1355                current_line.push_str(marker);
1356                current_line.push_str(trimmed);
1357                current_line.push_str(marker);
1358
1359                // Check if this is a complete sentence
1360                let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1361                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1362                    lines.push(current_line.clone());
1363                    current_line.clear();
1364                }
1365            } else if i == sentences.len() - 1 {
1366                // Last sentence: check if complete
1367                let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1368
1369                let mut line = String::new();
1370                line.push_str(marker);
1371                line.push_str(trimmed);
1372                line.push_str(marker);
1373
1374                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1375                    lines.push(line);
1376                } else {
1377                    // Incomplete - keep in current_line for potential continuation
1378                    *current_line = line;
1379                }
1380            } else {
1381                // Middle sentences: emit with markers
1382                let mut line = String::new();
1383                line.push_str(marker);
1384                line.push_str(trimmed);
1385                line.push_str(marker);
1386                lines.push(line);
1387            }
1388        }
1389    }
1390}
1391
1392/// Reflow elements into lines that fit within the line length
1393fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1394    let mut lines = Vec::new();
1395    let mut current_line = String::new();
1396    let mut current_length = 0;
1397
1398    for element in elements {
1399        let element_str = format!("{element}");
1400        let element_len = element.len();
1401
1402        // For text elements that might need breaking
1403        if let Element::Text(text) = element {
1404            // Check if original text had leading whitespace
1405            let has_leading_space = text.starts_with(char::is_whitespace);
1406            // If this is a text element, always process it word by word
1407            let words: Vec<&str> = text.split_whitespace().collect();
1408
1409            for (i, word) in words.iter().enumerate() {
1410                let word_len = word.chars().count();
1411                // Check if this "word" is just punctuation that should stay attached
1412                let is_trailing_punct = word
1413                    .chars()
1414                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1415
1416                if current_length > 0 && current_length + 1 + word_len > options.line_length && !is_trailing_punct {
1417                    // Start a new line (but never for trailing punctuation)
1418                    lines.push(current_line.trim().to_string());
1419                    current_line = word.to_string();
1420                    current_length = word_len;
1421                } else {
1422                    // Add word to current line
1423                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1424                    // AND this isn't trailing punctuation (which attaches directly)
1425                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1426                        current_line.push(' ');
1427                        current_length += 1;
1428                    }
1429                    current_line.push_str(word);
1430                    current_length += word_len;
1431                }
1432            }
1433        } else {
1434            // For non-text elements (code, links, references), treat as atomic units
1435            // These should never be broken across lines
1436            if current_length > 0 && current_length + 1 + element_len > options.line_length {
1437                // Start a new line
1438                lines.push(current_line.trim().to_string());
1439                current_line = element_str;
1440                current_length = element_len;
1441            } else {
1442                // Add element to current line
1443                // Don't add space if the current line ends with an opening bracket/paren
1444                let ends_with_opener =
1445                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1446                if current_length > 0 && !ends_with_opener {
1447                    current_line.push(' ');
1448                    current_length += 1;
1449                }
1450                current_line.push_str(&element_str);
1451                current_length += element_len;
1452            }
1453        }
1454    }
1455
1456    // Don't forget the last line
1457    if !current_line.is_empty() {
1458        lines.push(current_line.trim_end().to_string());
1459    }
1460
1461    lines
1462}
1463
1464/// Reflow markdown content preserving structure
1465pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1466    let lines: Vec<&str> = content.lines().collect();
1467    let mut result = Vec::new();
1468    let mut i = 0;
1469
1470    while i < lines.len() {
1471        let line = lines[i];
1472        let trimmed = line.trim();
1473
1474        // Preserve empty lines
1475        if trimmed.is_empty() {
1476            result.push(String::new());
1477            i += 1;
1478            continue;
1479        }
1480
1481        // Preserve headings as-is
1482        if trimmed.starts_with('#') {
1483            result.push(line.to_string());
1484            i += 1;
1485            continue;
1486        }
1487
1488        // Preserve fenced code blocks
1489        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1490            result.push(line.to_string());
1491            i += 1;
1492            // Copy lines until closing fence
1493            while i < lines.len() {
1494                result.push(lines[i].to_string());
1495                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1496                    i += 1;
1497                    break;
1498                }
1499                i += 1;
1500            }
1501            continue;
1502        }
1503
1504        // Preserve indented code blocks (4+ columns accounting for tab expansion)
1505        if ElementCache::calculate_indentation_width_default(line) >= 4 {
1506            // Collect all consecutive indented lines
1507            result.push(line.to_string());
1508            i += 1;
1509            while i < lines.len() {
1510                let next_line = lines[i];
1511                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1512                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
1513                    result.push(next_line.to_string());
1514                    i += 1;
1515                } else {
1516                    break;
1517                }
1518            }
1519            continue;
1520        }
1521
1522        // Preserve block quotes (but reflow their content)
1523        if trimmed.starts_with('>') {
1524            let quote_prefix = line[0..line.find('>').unwrap() + 1].to_string();
1525            let quote_content = &line[quote_prefix.len()..].trim_start();
1526
1527            let reflowed = reflow_line(quote_content, options);
1528            for reflowed_line in reflowed.iter() {
1529                result.push(format!("{quote_prefix} {reflowed_line}"));
1530            }
1531            i += 1;
1532            continue;
1533        }
1534
1535        // Preserve horizontal rules first (before checking for lists)
1536        if is_horizontal_rule(trimmed) {
1537            result.push(line.to_string());
1538            i += 1;
1539            continue;
1540        }
1541
1542        // Preserve lists (but not horizontal rules)
1543        // A valid unordered list marker must be followed by a space (or be alone on line)
1544        // This prevents emphasis markers like "*text*" from being parsed as list items
1545        let is_unordered_list = |s: &str, marker: char| -> bool {
1546            s.starts_with(marker) && !is_horizontal_rule(s) && (s.len() == 1 || s.chars().nth(1) == Some(' '))
1547        };
1548        if is_unordered_list(trimmed, '-')
1549            || is_unordered_list(trimmed, '*')
1550            || is_unordered_list(trimmed, '+')
1551            || is_numbered_list_item(trimmed)
1552        {
1553            // Find the list marker and preserve indentation
1554            let indent = line.len() - line.trim_start().len();
1555            let indent_str = " ".repeat(indent);
1556
1557            // For numbered lists, find the period and the space after it
1558            // For bullet lists, find the marker and the space after it
1559            let mut marker_end = indent;
1560            let mut content_start = indent;
1561
1562            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1563                // Numbered list: find the period
1564                if let Some(period_pos) = line[indent..].find('.') {
1565                    marker_end = indent + period_pos + 1; // Include the period
1566                    content_start = marker_end;
1567                    // Skip any spaces after the period to find content start
1568                    while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1569                        content_start += 1;
1570                    }
1571                }
1572            } else {
1573                // Bullet list: marker is single character
1574                marker_end = indent + 1; // Just the marker character
1575                content_start = marker_end;
1576                // Skip any spaces after the marker
1577                while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1578                    content_start += 1;
1579                }
1580            }
1581
1582            let marker = &line[indent..marker_end];
1583
1584            // Collect all content for this list item (including continuation lines)
1585            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1586            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1587            i += 1;
1588
1589            // Collect continuation lines (indented lines that are part of this list item)
1590            while i < lines.len() {
1591                let next_line = lines[i];
1592                let next_trimmed = next_line.trim();
1593
1594                // Stop if we hit an empty line or another list item or special block
1595                if next_trimmed.is_empty()
1596                    || next_trimmed.starts_with('#')
1597                    || next_trimmed.starts_with("```")
1598                    || next_trimmed.starts_with("~~~")
1599                    || next_trimmed.starts_with('>')
1600                    || next_trimmed.starts_with('|')
1601                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1602                    || is_horizontal_rule(next_trimmed)
1603                    || (next_trimmed.starts_with('-')
1604                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1605                    || (next_trimmed.starts_with('*')
1606                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1607                    || (next_trimmed.starts_with('+')
1608                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1609                    || is_numbered_list_item(next_trimmed)
1610                    || is_definition_list_item(next_trimmed)
1611                {
1612                    break;
1613                }
1614
1615                // Check if this line is indented (continuation of list item)
1616                let next_indent = next_line.len() - next_line.trim_start().len();
1617                if next_indent >= content_start {
1618                    // This is a continuation line - add its content
1619                    // Preserve hard breaks while trimming excessive whitespace
1620                    let trimmed_start = next_line.trim_start();
1621                    list_content.push(trim_preserving_hard_break(trimmed_start));
1622                    i += 1;
1623                } else {
1624                    // Not indented enough, not part of this list item
1625                    break;
1626                }
1627            }
1628
1629            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1630            // Hard breaks should prevent joining with the next line
1631            let combined_content = if options.preserve_breaks {
1632                list_content[0].clone()
1633            } else {
1634                // Check if any lines have hard breaks - if so, preserve the structure
1635                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1636                if has_hard_breaks {
1637                    // Don't join lines with hard breaks - keep them separate with newlines
1638                    list_content.join("\n")
1639                } else {
1640                    // No hard breaks, safe to join with spaces
1641                    list_content.join(" ")
1642                }
1643            };
1644
1645            // Calculate the proper indentation for continuation lines
1646            let trimmed_marker = marker;
1647            let continuation_spaces = content_start;
1648
1649            // Adjust line length to account for list marker and space
1650            let prefix_length = indent + trimmed_marker.len() + 1;
1651
1652            // Create adjusted options with reduced line length
1653            let adjusted_options = ReflowOptions {
1654                line_length: options.line_length.saturating_sub(prefix_length),
1655                ..options.clone()
1656            };
1657
1658            let reflowed = reflow_line(&combined_content, &adjusted_options);
1659            for (j, reflowed_line) in reflowed.iter().enumerate() {
1660                if j == 0 {
1661                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1662                } else {
1663                    // Continuation lines aligned with text after marker
1664                    let continuation_indent = " ".repeat(continuation_spaces);
1665                    result.push(format!("{continuation_indent}{reflowed_line}"));
1666                }
1667            }
1668            continue;
1669        }
1670
1671        // Preserve tables
1672        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
1673            result.push(line.to_string());
1674            i += 1;
1675            continue;
1676        }
1677
1678        // Preserve reference definitions
1679        if trimmed.starts_with('[') && line.contains("]:") {
1680            result.push(line.to_string());
1681            i += 1;
1682            continue;
1683        }
1684
1685        // Preserve definition list items (extended markdown)
1686        if is_definition_list_item(trimmed) {
1687            result.push(line.to_string());
1688            i += 1;
1689            continue;
1690        }
1691
1692        // Check if this is a single line that doesn't need processing
1693        let mut is_single_line_paragraph = true;
1694        if i + 1 < lines.len() {
1695            let next_line = lines[i + 1];
1696            let next_trimmed = next_line.trim();
1697            // Check if next line starts a new block
1698            if !next_trimmed.is_empty()
1699                && !next_trimmed.starts_with('#')
1700                && !next_trimmed.starts_with("```")
1701                && !next_trimmed.starts_with("~~~")
1702                && !next_trimmed.starts_with('>')
1703                && !next_trimmed.starts_with('|')
1704                && !(next_trimmed.starts_with('[') && next_line.contains("]:"))
1705                && !is_horizontal_rule(next_trimmed)
1706                && !(next_trimmed.starts_with('-')
1707                    && !is_horizontal_rule(next_trimmed)
1708                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1709                && !(next_trimmed.starts_with('*')
1710                    && !is_horizontal_rule(next_trimmed)
1711                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1712                && !(next_trimmed.starts_with('+')
1713                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1714                && !is_numbered_list_item(next_trimmed)
1715            {
1716                is_single_line_paragraph = false;
1717            }
1718        }
1719
1720        // If it's a single line that fits, just add it as-is
1721        if is_single_line_paragraph && line.chars().count() <= options.line_length {
1722            result.push(line.to_string());
1723            i += 1;
1724            continue;
1725        }
1726
1727        // For regular paragraphs, collect consecutive lines
1728        let mut paragraph_parts = Vec::new();
1729        let mut current_part = vec![line];
1730        i += 1;
1731
1732        // If preserve_breaks is true, treat each line separately
1733        if options.preserve_breaks {
1734            // Don't collect consecutive lines - just reflow this single line
1735            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
1736                Some("\\")
1737            } else if line.ends_with("  ") {
1738                Some("  ")
1739            } else {
1740                None
1741            };
1742            let reflowed = reflow_line(line, options);
1743
1744            // Preserve hard breaks (two trailing spaces or backslash)
1745            if let Some(break_marker) = hard_break_type {
1746                if !reflowed.is_empty() {
1747                    let mut reflowed_with_break = reflowed;
1748                    let last_idx = reflowed_with_break.len() - 1;
1749                    if !has_hard_break(&reflowed_with_break[last_idx]) {
1750                        reflowed_with_break[last_idx].push_str(break_marker);
1751                    }
1752                    result.extend(reflowed_with_break);
1753                }
1754            } else {
1755                result.extend(reflowed);
1756            }
1757        } else {
1758            // Original behavior: collect consecutive lines into a paragraph
1759            while i < lines.len() {
1760                let prev_line = if !current_part.is_empty() {
1761                    current_part.last().unwrap()
1762                } else {
1763                    ""
1764                };
1765                let next_line = lines[i];
1766                let next_trimmed = next_line.trim();
1767
1768                // Stop at empty lines or special blocks
1769                if next_trimmed.is_empty()
1770                    || next_trimmed.starts_with('#')
1771                    || next_trimmed.starts_with("```")
1772                    || next_trimmed.starts_with("~~~")
1773                    || next_trimmed.starts_with('>')
1774                    || next_trimmed.starts_with('|')
1775                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1776                    || is_horizontal_rule(next_trimmed)
1777                    || (next_trimmed.starts_with('-')
1778                        && !is_horizontal_rule(next_trimmed)
1779                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1780                    || (next_trimmed.starts_with('*')
1781                        && !is_horizontal_rule(next_trimmed)
1782                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1783                    || (next_trimmed.starts_with('+')
1784                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1785                    || is_numbered_list_item(next_trimmed)
1786                    || is_definition_list_item(next_trimmed)
1787                {
1788                    break;
1789                }
1790
1791                // Check if previous line ends with hard break (two spaces or backslash)
1792                // or is a complete sentence in sentence_per_line mode
1793                let prev_trimmed = prev_line.trim();
1794                let abbreviations = get_abbreviations(&options.abbreviations);
1795                let ends_with_sentence = (prev_trimmed.ends_with('.')
1796                    || prev_trimmed.ends_with('!')
1797                    || prev_trimmed.ends_with('?')
1798                    || prev_trimmed.ends_with(".*")
1799                    || prev_trimmed.ends_with("!*")
1800                    || prev_trimmed.ends_with("?*")
1801                    || prev_trimmed.ends_with("._")
1802                    || prev_trimmed.ends_with("!_")
1803                    || prev_trimmed.ends_with("?_"))
1804                    && !text_ends_with_abbreviation(prev_trimmed.trim_end_matches(['*', '_']), &abbreviations);
1805
1806                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
1807                    // Start a new part after hard break or complete sentence
1808                    paragraph_parts.push(current_part.join(" "));
1809                    current_part = vec![next_line];
1810                } else {
1811                    current_part.push(next_line);
1812                }
1813                i += 1;
1814            }
1815
1816            // Add the last part
1817            if !current_part.is_empty() {
1818                if current_part.len() == 1 {
1819                    // Single line, don't add trailing space
1820                    paragraph_parts.push(current_part[0].to_string());
1821                } else {
1822                    paragraph_parts.push(current_part.join(" "));
1823                }
1824            }
1825
1826            // Reflow each part separately, preserving hard breaks
1827            for (j, part) in paragraph_parts.iter().enumerate() {
1828                let reflowed = reflow_line(part, options);
1829                result.extend(reflowed);
1830
1831                // Preserve hard break by ensuring last line of part ends with hard break marker
1832                // Use two spaces as the default hard break format for reflows
1833                // But don't add hard breaks in sentence_per_line mode - lines are already separate
1834                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
1835                    let last_idx = result.len() - 1;
1836                    if !has_hard_break(&result[last_idx]) {
1837                        result[last_idx].push_str("  ");
1838                    }
1839                }
1840            }
1841        }
1842    }
1843
1844    // Preserve trailing newline if the original content had one
1845    let result_text = result.join("\n");
1846    if content.ends_with('\n') && !result_text.ends_with('\n') {
1847        format!("{result_text}\n")
1848    } else {
1849        result_text
1850    }
1851}
1852
/// Information about a reflowed paragraph
///
/// Describes a replacement: the byte range `start_byte..end_byte` of the
/// original content is meant to be replaced by `reflowed_text`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphReflow {
    /// Starting byte offset of the paragraph in the original content
    pub start_byte: usize,
    /// Ending byte offset (exclusive) of the paragraph in the original content.
    /// The range covers the paragraph's trailing newline when it has one.
    pub end_byte: usize,
    /// The reflowed text for this paragraph
    pub reflowed_text: String,
}
1863
1864/// Reflow a single paragraph at the specified line number
1865///
1866/// This function finds the paragraph containing the given line number,
1867/// reflows it according to the specified line length, and returns
1868/// information about the paragraph location and its reflowed text.
1869///
1870/// # Arguments
1871///
1872/// * `content` - The full document content
1873/// * `line_number` - The 1-based line number within the paragraph to reflow
1874/// * `line_length` - The target line length for reflowing
1875///
1876/// # Returns
1877///
1878/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
1879/// or `None` if the line number is out of bounds or the content at that
1880/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
1881pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
1882    if line_number == 0 {
1883        return None;
1884    }
1885
1886    let lines: Vec<&str> = content.lines().collect();
1887
1888    // Check if line number is valid (1-based)
1889    if line_number > lines.len() {
1890        return None;
1891    }
1892
1893    let target_idx = line_number - 1; // Convert to 0-based
1894    let target_line = lines[target_idx];
1895    let trimmed = target_line.trim();
1896
1897    // Don't reflow special blocks
1898    if trimmed.is_empty()
1899        || trimmed.starts_with('#')
1900        || trimmed.starts_with("```")
1901        || trimmed.starts_with("~~~")
1902        || ElementCache::calculate_indentation_width_default(target_line) >= 4
1903        || trimmed.starts_with('>')
1904        || crate::utils::table_utils::TableUtils::is_potential_table_row(target_line) // Tables
1905        || (trimmed.starts_with('[') && target_line.contains("]:")) // Reference definitions
1906        || is_horizontal_rule(trimmed)
1907        || ((trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+'))
1908            && !is_horizontal_rule(trimmed)
1909            && (trimmed.len() == 1 || trimmed.chars().nth(1) == Some(' ')))
1910        || is_numbered_list_item(trimmed)
1911        || is_definition_list_item(trimmed)
1912    {
1913        return None;
1914    }
1915
1916    // Find paragraph start - scan backward until blank line or special block
1917    let mut para_start = target_idx;
1918    while para_start > 0 {
1919        let prev_idx = para_start - 1;
1920        let prev_line = lines[prev_idx];
1921        let prev_trimmed = prev_line.trim();
1922
1923        // Stop at blank line or special blocks
1924        if prev_trimmed.is_empty()
1925            || prev_trimmed.starts_with('#')
1926            || prev_trimmed.starts_with("```")
1927            || prev_trimmed.starts_with("~~~")
1928            || ElementCache::calculate_indentation_width_default(prev_line) >= 4
1929            || prev_trimmed.starts_with('>')
1930            || crate::utils::table_utils::TableUtils::is_potential_table_row(prev_line)
1931            || (prev_trimmed.starts_with('[') && prev_line.contains("]:"))
1932            || is_horizontal_rule(prev_trimmed)
1933            || ((prev_trimmed.starts_with('-') || prev_trimmed.starts_with('*') || prev_trimmed.starts_with('+'))
1934                && !is_horizontal_rule(prev_trimmed)
1935                && (prev_trimmed.len() == 1 || prev_trimmed.chars().nth(1) == Some(' ')))
1936            || is_numbered_list_item(prev_trimmed)
1937            || is_definition_list_item(prev_trimmed)
1938        {
1939            break;
1940        }
1941
1942        para_start = prev_idx;
1943    }
1944
1945    // Find paragraph end - scan forward until blank line or special block
1946    let mut para_end = target_idx;
1947    while para_end + 1 < lines.len() {
1948        let next_idx = para_end + 1;
1949        let next_line = lines[next_idx];
1950        let next_trimmed = next_line.trim();
1951
1952        // Stop at blank line or special blocks
1953        if next_trimmed.is_empty()
1954            || next_trimmed.starts_with('#')
1955            || next_trimmed.starts_with("```")
1956            || next_trimmed.starts_with("~~~")
1957            || ElementCache::calculate_indentation_width_default(next_line) >= 4
1958            || next_trimmed.starts_with('>')
1959            || crate::utils::table_utils::TableUtils::is_potential_table_row(next_line)
1960            || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1961            || is_horizontal_rule(next_trimmed)
1962            || ((next_trimmed.starts_with('-') || next_trimmed.starts_with('*') || next_trimmed.starts_with('+'))
1963                && !is_horizontal_rule(next_trimmed)
1964                && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1965            || is_numbered_list_item(next_trimmed)
1966            || is_definition_list_item(next_trimmed)
1967        {
1968            break;
1969        }
1970
1971        para_end = next_idx;
1972    }
1973
1974    // Extract paragraph lines
1975    let paragraph_lines = &lines[para_start..=para_end];
1976
1977    // Calculate byte offsets
1978    let mut start_byte = 0;
1979    for line in lines.iter().take(para_start) {
1980        start_byte += line.len() + 1; // +1 for newline
1981    }
1982
1983    let mut end_byte = start_byte;
1984    for line in paragraph_lines.iter() {
1985        end_byte += line.len() + 1; // +1 for newline
1986    }
1987
1988    // Track whether the byte range includes a trailing newline
1989    // (it doesn't if this is the last line and the file doesn't end with newline)
1990    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
1991
1992    // Adjust end_byte if the last line doesn't have a newline
1993    if !includes_trailing_newline {
1994        end_byte -= 1;
1995    }
1996
1997    // Join paragraph lines and reflow
1998    let paragraph_text = paragraph_lines.join("\n");
1999
2000    // Create reflow options
2001    let options = ReflowOptions {
2002        line_length,
2003        break_on_sentences: true,
2004        preserve_breaks: false,
2005        sentence_per_line: false,
2006        abbreviations: None,
2007    };
2008
2009    // Reflow the paragraph using reflow_markdown to handle it properly
2010    let reflowed = reflow_markdown(&paragraph_text, &options);
2011
2012    // Ensure reflowed text matches whether the byte range includes a trailing newline
2013    // This is critical: if the range includes a newline, the replacement must too,
2014    // otherwise the next line will get appended to the reflowed paragraph
2015    let reflowed_text = if includes_trailing_newline {
2016        // Range includes newline - ensure reflowed text has one
2017        if reflowed.ends_with('\n') {
2018            reflowed
2019        } else {
2020            format!("{reflowed}\n")
2021        }
2022    } else {
2023        // Range doesn't include newline - ensure reflowed text doesn't have one
2024        if reflowed.ends_with('\n') {
2025            reflowed.trim_end_matches('\n').to_string()
2026        } else {
2027            reflowed
2028        }
2029    };
2030
2031    Some(ParagraphReflow {
2032        start_byte,
2033        end_byte,
2034        reflowed_text,
2035    })
2036}
2037
#[cfg(test)]
mod tests {
    use super::*;

    /// Unit test for private helper function text_ends_with_abbreviation()
    ///
    /// Kept inline because the function under test is private.
    /// All other tests (public API, integration tests) are in tests/utils/text_reflow_test.rs
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        // Exercise the helper against the built-in abbreviation set only
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognized
        let recognized = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognized {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Must be rejected: not in the built-in list (etc doesn't always have period),
        // ordinary words, wrong punctuation, no punctuation, empty input
        let rejected = [
            "etc.",
            "paradigms.",
            "programs.",
            "items.",
            "systems.",
            "Dr?",        // question mark, not period
            "Mr!",        // exclamation, not period
            "paradigms?", // question mark
            "word",       // no punctuation
            "",           // empty string
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation"
            );
        }
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs