rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::regex_cache::{
9    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
10    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
11    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
12    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
13};
14use crate::utils::sentence_utils::{
15    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
16    text_ends_with_abbreviation,
17};
18use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
19use std::collections::HashSet;
20
/// Options controlling how paragraph text is reflowed.
///
/// Implements `Debug` in addition to `Clone` so a configuration can be
/// logged or shown in assertion failures.
#[derive(Debug, Clone)]
pub struct ReflowOptions {
    /// Target line length, counted in characters.
    /// `reflow_line` treats `0` as "no wrapping" (unlimited line length).
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line.
    /// When set, `reflow_line` processes every line regardless of its length.
    pub sentence_per_line: bool,
    /// Custom abbreviations for sentence detection.
    /// Periods are optional - both "Dr" and "Dr." work the same.
    /// Custom abbreviations are always added to the built-in defaults.
    pub abbreviations: Option<Vec<String>>,
}
37
38impl Default for ReflowOptions {
39    fn default() -> Self {
40        Self {
41            line_length: 80,
42            break_on_sentences: true,
43            preserve_breaks: false,
44            sentence_per_line: false,
45            abbreviations: None,
46        }
47    }
48}
49
50/// Detect if a character position is a sentence boundary
51/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
52/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
53fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
54    let chars: Vec<char> = text.chars().collect();
55
56    if pos + 1 >= chars.len() {
57        return false;
58    }
59
60    let c = chars[pos];
61    let next_char = chars[pos + 1];
62
63    // Check for CJK sentence-ending punctuation (。, ！, ？)
64    // CJK punctuation doesn't require space or uppercase after it
65    if is_cjk_sentence_ending(c) {
66        // Skip any trailing emphasis/strikethrough markers
67        let mut after_punct_pos = pos + 1;
68        while after_punct_pos < chars.len()
69            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
70        {
71            after_punct_pos += 1;
72        }
73
74        // Skip whitespace
75        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
76            after_punct_pos += 1;
77        }
78
79        // Check if we have more content (any non-whitespace)
80        if after_punct_pos >= chars.len() {
81            return false;
82        }
83
84        // Skip leading emphasis/strikethrough markers
85        while after_punct_pos < chars.len()
86            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
87        {
88            after_punct_pos += 1;
89        }
90
91        if after_punct_pos >= chars.len() {
92            return false;
93        }
94
95        // For CJK, we accept any character as the start of the next sentence
96        // (no uppercase requirement, since CJK doesn't have case)
97        return true;
98    }
99
100    // Check for ASCII sentence-ending punctuation
101    if c != '.' && c != '!' && c != '?' {
102        return false;
103    }
104
105    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
106    let (_space_pos, after_space_pos) = if next_char == ' ' {
107        // Normal case: punctuation followed by space
108        (pos + 1, pos + 2)
109    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
110        // Sentence ends with quote - check what follows the quote
111        if chars[pos + 2] == ' ' {
112            // Just quote followed by space: 'sentence." '
113            (pos + 2, pos + 3)
114        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
115            // Quote followed by emphasis: 'sentence."* '
116            (pos + 3, pos + 4)
117        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
118            && pos + 4 < chars.len()
119            && chars[pos + 3] == chars[pos + 2]
120            && chars[pos + 4] == ' '
121        {
122            // Quote followed by bold: 'sentence."** '
123            (pos + 4, pos + 5)
124        } else {
125            return false;
126        }
127    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
128        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
129        (pos + 2, pos + 3)
130    } else if (next_char == '*' || next_char == '_')
131        && pos + 3 < chars.len()
132        && chars[pos + 2] == next_char
133        && chars[pos + 3] == ' '
134    {
135        // Sentence ends with bold: "sentence.** " or "sentence.__ "
136        (pos + 3, pos + 4)
137    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
138        // Sentence ends with strikethrough: "sentence.~~ "
139        (pos + 3, pos + 4)
140    } else {
141        return false;
142    };
143
144    // Skip all whitespace after the space to find the start of the next sentence
145    let mut next_char_pos = after_space_pos;
146    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
147        next_char_pos += 1;
148    }
149
150    // Check if we reached the end of the string
151    if next_char_pos >= chars.len() {
152        return false;
153    }
154
155    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
156    let mut first_letter_pos = next_char_pos;
157    while first_letter_pos < chars.len()
158        && (chars[first_letter_pos] == '*'
159            || chars[first_letter_pos] == '_'
160            || chars[first_letter_pos] == '~'
161            || is_opening_quote(chars[first_letter_pos]))
162    {
163        first_letter_pos += 1;
164    }
165
166    // Check if we reached the end after skipping emphasis
167    if first_letter_pos >= chars.len() {
168        return false;
169    }
170
171    // First character of next sentence must be uppercase or CJK
172    let first_char = chars[first_letter_pos];
173    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
174        return false;
175    }
176
177    // Look back to check for common abbreviations (only applies to periods)
178    if pos > 0 && c == '.' {
179        // Check if the text up to and including this period ends with an abbreviation
180        // Note: text[..=pos] includes the character at pos (the period)
181        if text_ends_with_abbreviation(&text[..=pos], abbreviations) {
182            return false;
183        }
184
185        // Check for decimal numbers (e.g., "3.14")
186        // Make sure to check if first_letter_pos is within bounds
187        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
188            return false;
189        }
190    }
191    true
192}
193
194/// Split text into sentences
195pub fn split_into_sentences(text: &str) -> Vec<String> {
196    split_into_sentences_custom(text, &None)
197}
198
199/// Split text into sentences with custom abbreviations
200pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
201    let abbreviations = get_abbreviations(custom_abbreviations);
202    split_into_sentences_with_set(text, &abbreviations)
203}
204
205/// Internal function to split text into sentences with a pre-computed abbreviations set
206/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
207fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
208    let mut sentences = Vec::new();
209    let mut current_sentence = String::new();
210    let mut chars = text.chars().peekable();
211    let mut pos = 0;
212
213    while let Some(c) = chars.next() {
214        current_sentence.push(c);
215
216        if is_sentence_boundary(text, pos, abbreviations) {
217            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
218            while let Some(&next) = chars.peek() {
219                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
220                    current_sentence.push(chars.next().unwrap());
221                    pos += 1;
222                } else {
223                    break;
224                }
225            }
226
227            // Consume the space after the sentence
228            if chars.peek() == Some(&' ') {
229                chars.next();
230                pos += 1;
231            }
232
233            sentences.push(current_sentence.trim().to_string());
234            current_sentence.clear();
235        }
236
237        pos += 1;
238    }
239
240    // Add any remaining text as the last sentence
241    if !current_sentence.trim().is_empty() {
242        sentences.push(current_sentence.trim().to_string());
243    }
244    sentences
245}
246
/// Report whether `line` is a horizontal rule such as `---`, `___`, `***` or `- - -`.
///
/// The line must begin with one of `-`, `_`, `*`, consist solely of that same
/// marker plus optional spaces, and contain at least three markers. A line that
/// starts with a space is therefore not recognised.
fn is_horizontal_rule(line: &str) -> bool {
    // Fewer than three bytes can never hold three markers.
    if line.len() < 3 {
        return false;
    }

    let marker = match line.chars().next() {
        Some(ch @ ('-' | '_' | '*')) => ch,
        _ => return false,
    };

    // Count markers while rejecting anything that is neither the marker nor a space.
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
275
/// Check if a line is a numbered list item (e.g., "1. ", "10. ").
///
/// The line must start with one or more ASCII digits (`0`-`9`), followed by a
/// period and then a space. Other Unicode numerics such as `²` or `٣` are not
/// ordered-list markers in Markdown and are rejected.
///
/// "2019." alone (no space after the period) is NOT treated as a list item to
/// avoid false positives.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are single bytes, so this count is also a valid byte offset.
    let digits_len = line.bytes().take_while(u8::is_ascii_digit).count();

    digits_len > 0 && line[digits_len..].starts_with(". ")
}
299
/// Report whether `line` ends with a Markdown hard line break.
///
/// Two forms are recognised:
/// 1. Two or more trailing spaces
/// 2. A trailing backslash
///
/// A single trailing `\r` (from a CRLF line ending) is ignored before checking.
fn has_hard_break(line: &str) -> bool {
    let body = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };

    ["  ", "\\"].iter().any(|suffix| body.ends_with(*suffix))
}
309
/// Report whether the last character of `text` is `.`, `!` or `?`.
fn ends_with_sentence_punct(text: &str) -> bool {
    matches!(text.chars().next_back(), Some('.') | Some('!') | Some('?'))
}
314
/// Strip trailing whitespace from `s` without destroying a hard line break.
///
/// A single trailing `\r` (CRLF line ending) is removed first. After that:
/// - a line ending in a backslash is returned unchanged;
/// - a line ending in two or more spaces is trimmed and given back exactly two
///   trailing spaces, unless it is all whitespace, in which case the result is empty;
/// - any other line simply has its trailing whitespace removed.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as written.
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();
    let keep_space_break = line.ends_with("  ") && !content.is_empty();

    if keep_space_break {
        // Normalise any run of trailing whitespace to the two-space break.
        let mut result = String::with_capacity(content.len() + 2);
        result.push_str(content);
        result.push_str("  ");
        result
    } else {
        content.to_owned()
    }
}
345
346pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
347    // For sentence-per-line mode, always process regardless of length
348    if options.sentence_per_line {
349        let elements = parse_markdown_elements(line);
350        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
351    }
352
353    // Quick check: if line is already short enough or no wrapping requested, return as-is
354    // line_length = 0 means no wrapping (unlimited line length)
355    if options.line_length == 0 || line.chars().count() <= options.line_length {
356        return vec![line.to_string()];
357    }
358
359    // Parse the markdown to identify elements
360    let elements = parse_markdown_elements(line);
361
362    // Reflow the elements into lines
363    reflow_elements(&elements, options)
364}
365
/// Where the image inside a linked image (clickable badge) comes from.
#[derive(Clone, Debug)]
enum LinkedImageSource {
    /// Image given inline by URL, as in `![alt](url)`
    Inline(String),
    /// Image given by reference label, as in `![alt][ref]`
    Reference(String),
}
374
/// Where the outer link of a linked image (clickable badge) points.
#[derive(Clone, Debug)]
enum LinkedImageTarget {
    /// Link given inline by URL, i.e. the trailing `](url)`
    Inline(String),
    /// Link given by reference label, i.e. the trailing `][ref]`
    Reference(String),
}
383
384/// Represents a piece of content in the markdown
385#[derive(Debug, Clone)]
386enum Element {
387    /// Plain text that can be wrapped
388    Text(String),
389    /// A complete markdown inline link [text](url)
390    Link { text: String, url: String },
391    /// A complete markdown reference link [text][ref]
392    ReferenceLink { text: String, reference: String },
393    /// A complete markdown empty reference link [text][]
394    EmptyReferenceLink { text: String },
395    /// A complete markdown shortcut reference link [ref]
396    ShortcutReference { reference: String },
397    /// A complete markdown inline image ![alt](url)
398    InlineImage { alt: String, url: String },
399    /// A complete markdown reference image ![alt][ref]
400    ReferenceImage { alt: String, reference: String },
401    /// A complete markdown empty reference image ![alt][]
402    EmptyReferenceImage { alt: String },
403    /// A clickable image badge in any of 4 forms:
404    /// - [![alt](img-url)](link-url)
405    /// - [![alt][img-ref]](link-url)
406    /// - [![alt](img-url)][link-ref]
407    /// - [![alt][img-ref]][link-ref]
408    LinkedImage {
409        alt: String,
410        img_source: LinkedImageSource,
411        link_target: LinkedImageTarget,
412    },
413    /// Footnote reference [^note]
414    FootnoteReference { note: String },
415    /// Strikethrough text ~~text~~
416    Strikethrough(String),
417    /// Wiki-style link [[wiki]] or [[wiki|text]]
418    WikiLink(String),
419    /// Inline math $math$
420    InlineMath(String),
421    /// Display math $$math$$
422    DisplayMath(String),
423    /// Emoji shortcode :emoji:
424    EmojiShortcode(String),
425    /// HTML tag <tag> or </tag> or <tag/>
426    HtmlTag(String),
427    /// HTML entity &nbsp; or &#123;
428    HtmlEntity(String),
429    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}}
430    HugoShortcode(String),
431    /// Inline code `code`
432    Code(String),
433    /// Bold text **text** or __text__
434    Bold {
435        content: String,
436        /// True if underscore markers (__), false for asterisks (**)
437        underscore: bool,
438    },
439    /// Italic text *text* or _text_
440    Italic {
441        content: String,
442        /// True if underscore marker (_), false for asterisk (*)
443        underscore: bool,
444    },
445}
446
447impl std::fmt::Display for Element {
448    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
449        match self {
450            Element::Text(s) => write!(f, "{s}"),
451            Element::Link { text, url } => write!(f, "[{text}]({url})"),
452            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
453            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
454            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
455            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
456            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
457            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
458            Element::LinkedImage {
459                alt,
460                img_source,
461                link_target,
462            } => {
463                // Build the image part: ![alt](url) or ![alt][ref]
464                let img_part = match img_source {
465                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
466                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
467                };
468                // Build the link part: (url) or [ref]
469                match link_target {
470                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
471                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
472                }
473            }
474            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
475            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
476            Element::WikiLink(s) => write!(f, "[[{s}]]"),
477            Element::InlineMath(s) => write!(f, "${s}$"),
478            Element::DisplayMath(s) => write!(f, "$${s}$$"),
479            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
480            Element::HtmlTag(s) => write!(f, "{s}"),
481            Element::HtmlEntity(s) => write!(f, "{s}"),
482            Element::HugoShortcode(s) => write!(f, "{s}"),
483            Element::Code(s) => write!(f, "`{s}`"),
484            Element::Bold { content, underscore } => {
485                if *underscore {
486                    write!(f, "__{content}__")
487                } else {
488                    write!(f, "**{content}**")
489                }
490            }
491            Element::Italic { content, underscore } => {
492                if *underscore {
493                    write!(f, "_{content}_")
494                } else {
495                    write!(f, "*{content}*")
496                }
497            }
498        }
499    }
500}
501
502impl Element {
503    fn len(&self) -> usize {
504        match self {
505            Element::Text(s) => s.chars().count(),
506            Element::Link { text, url } => text.chars().count() + url.chars().count() + 4, // [text](url)
507            Element::ReferenceLink { text, reference } => text.chars().count() + reference.chars().count() + 4, // [text][ref]
508            Element::EmptyReferenceLink { text } => text.chars().count() + 4, // [text][]
509            Element::ShortcutReference { reference } => reference.chars().count() + 2, // [ref]
510            Element::InlineImage { alt, url } => alt.chars().count() + url.chars().count() + 5, // ![alt](url)
511            Element::ReferenceImage { alt, reference } => alt.chars().count() + reference.chars().count() + 5, // ![alt][ref]
512            Element::EmptyReferenceImage { alt } => alt.chars().count() + 5, // ![alt][]
513            Element::LinkedImage {
514                alt,
515                img_source,
516                link_target,
517            } => {
518                // Calculate length based on variant
519                // Base: [ + ![alt] + ] = 4 chars for outer brackets and !
520                let alt_len = alt.chars().count();
521                let img_len = match img_source {
522                    LinkedImageSource::Inline(url) => url.chars().count() + 2, // (url)
523                    LinkedImageSource::Reference(r) => r.chars().count() + 2,  // [ref]
524                };
525                let link_len = match link_target {
526                    LinkedImageTarget::Inline(url) => url.chars().count() + 2, // (url)
527                    LinkedImageTarget::Reference(r) => r.chars().count() + 2,  // [ref]
528                };
529                // [![alt](img)](link) = [ + ! + [ + alt + ] + (img) + ] + (link)
530                //                     = 1 + 1 + 1 + alt + 1 + img_len + 1 + link_len = 5 + alt + img + link
531                5 + alt_len + img_len + link_len
532            }
533            Element::FootnoteReference { note } => note.chars().count() + 3, // [^note]
534            Element::Strikethrough(s) => s.chars().count() + 4,              // ~~text~~
535            Element::WikiLink(s) => s.chars().count() + 4,                   // [[wiki]]
536            Element::InlineMath(s) => s.chars().count() + 2,                 // $math$
537            Element::DisplayMath(s) => s.chars().count() + 4,                // $$math$$
538            Element::EmojiShortcode(s) => s.chars().count() + 2,             // :emoji:
539            Element::HtmlTag(s) => s.chars().count(),                        // <tag> - already includes brackets
540            Element::HtmlEntity(s) => s.chars().count(),                     // &nbsp; - already complete
541            Element::HugoShortcode(s) => s.chars().count(),                  // {{< ... >}} - already complete
542            Element::Code(s) => s.chars().count() + 2,                       // `code`
543            Element::Bold { content, .. } => content.chars().count() + 4,    // **text** or __text__
544            Element::Italic { content, .. } => content.chars().count() + 2,  // *text* or _text_
545        }
546    }
547}
548
/// A single emphasis, strong or strikethrough span located in the source text.
///
/// Offsets are byte offsets into the text handed to `extract_emphasis_spans`.
#[derive(Clone, Debug)]
struct EmphasisSpan {
    /// Byte offset of the opening marker (the span includes its markers)
    start: usize,
    /// Byte offset just past the closing marker
    end: usize,
    /// Text between the opening and closing markers
    content: String,
    /// True for strong (bold) emphasis
    is_strong: bool,
    /// True for strikethrough (`~~text~~`)
    is_strikethrough: bool,
    /// True when the span was written with underscore markers;
    /// always false for strikethrough
    uses_underscore: bool,
}
565
566/// Extract emphasis and strikethrough spans from text using pulldown-cmark
567///
568/// This provides CommonMark-compliant emphasis parsing, correctly handling:
569/// - Nested emphasis like `*text **bold** more*`
570/// - Left/right flanking delimiter rules
571/// - Underscore vs asterisk markers
572/// - GFM strikethrough (~~text~~)
573///
574/// Returns spans sorted by start position.
575fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
576    let mut spans = Vec::new();
577    let mut options = Options::empty();
578    options.insert(Options::ENABLE_STRIKETHROUGH);
579
580    // Stacks to track nested formatting with their start positions
581    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
582    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
583    let mut strikethrough_stack: Vec<usize> = Vec::new();
584
585    let parser = Parser::new_ext(text, options).into_offset_iter();
586
587    for (event, range) in parser {
588        match event {
589            Event::Start(Tag::Emphasis) => {
590                // Check if this uses underscore by looking at the original text
591                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
592                emphasis_stack.push((range.start, uses_underscore));
593            }
594            Event::End(TagEnd::Emphasis) => {
595                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
596                    // Extract content between the markers (1 char marker on each side)
597                    let content_start = start_byte + 1;
598                    let content_end = range.end - 1;
599                    if content_end > content_start
600                        && let Some(content) = text.get(content_start..content_end)
601                    {
602                        spans.push(EmphasisSpan {
603                            start: start_byte,
604                            end: range.end,
605                            content: content.to_string(),
606                            is_strong: false,
607                            is_strikethrough: false,
608                            uses_underscore,
609                        });
610                    }
611                }
612            }
613            Event::Start(Tag::Strong) => {
614                // Check if this uses underscore by looking at the original text
615                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
616                strong_stack.push((range.start, uses_underscore));
617            }
618            Event::End(TagEnd::Strong) => {
619                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
620                    // Extract content between the markers (2 char marker on each side)
621                    let content_start = start_byte + 2;
622                    let content_end = range.end - 2;
623                    if content_end > content_start
624                        && let Some(content) = text.get(content_start..content_end)
625                    {
626                        spans.push(EmphasisSpan {
627                            start: start_byte,
628                            end: range.end,
629                            content: content.to_string(),
630                            is_strong: true,
631                            is_strikethrough: false,
632                            uses_underscore,
633                        });
634                    }
635                }
636            }
637            Event::Start(Tag::Strikethrough) => {
638                strikethrough_stack.push(range.start);
639            }
640            Event::End(TagEnd::Strikethrough) => {
641                if let Some(start_byte) = strikethrough_stack.pop() {
642                    // Extract content between the ~~ markers (2 char marker on each side)
643                    let content_start = start_byte + 2;
644                    let content_end = range.end - 2;
645                    if content_end > content_start
646                        && let Some(content) = text.get(content_start..content_end)
647                    {
648                        spans.push(EmphasisSpan {
649                            start: start_byte,
650                            end: range.end,
651                            content: content.to_string(),
652                            is_strong: false,
653                            is_strikethrough: true,
654                            uses_underscore: false,
655                        });
656                    }
657                }
658            }
659            _ => {}
660        }
661    }
662
663    // Sort by start position
664    spans.sort_by_key(|s| s.start);
665    spans
666}
667
668/// Parse markdown elements from text preserving the raw syntax
669///
670/// Detection order is critical:
671/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
672/// 2. Inline images ![alt](url) - before links to handle ! prefix
673/// 3. Reference images ![alt][ref] - before reference links
674/// 4. Inline links [text](url) - before reference links
675/// 5. Reference links [text][ref] - before shortcut references
676/// 6. Shortcut reference links [ref] - detected last to avoid false positives
677/// 7. Other elements (code, bold, italic, etc.) - processed normally
678fn parse_markdown_elements(text: &str) -> Vec<Element> {
679    let mut elements = Vec::new();
680    let mut remaining = text;
681
682    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
683    let emphasis_spans = extract_emphasis_spans(text);
684
685    while !remaining.is_empty() {
686        // Calculate current byte offset in original text
687        let current_offset = text.len() - remaining.len();
688        // Find the earliest occurrence of any markdown pattern
689        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
690
691        // Check for linked images FIRST (all 4 variants)
692        // Quick literal check: only run expensive regexes if we might have a linked image
693        // Pattern starts with "[!" so check for that first
694        if remaining.contains("[!") {
695            // Pattern 1: [![alt](img)](link) - inline image in inline link
696            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
697                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
698            {
699                earliest_match = Some((m.start(), "linked_image_ii", m));
700            }
701
702            // Pattern 2: [![alt][ref]](link) - reference image in inline link
703            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
704                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
705            {
706                earliest_match = Some((m.start(), "linked_image_ri", m));
707            }
708
709            // Pattern 3: [![alt](img)][ref] - inline image in reference link
710            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
711                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
712            {
713                earliest_match = Some((m.start(), "linked_image_ir", m));
714            }
715
716            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
717            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
718                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
719            {
720                earliest_match = Some((m.start(), "linked_image_rr", m));
721            }
722        }
723
724        // Check for images (they start with ! so should be detected before links)
725        // Inline images - ![alt](url)
726        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
727            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
728        {
729            earliest_match = Some((m.start(), "inline_image", m));
730        }
731
732        // Reference images - ![alt][ref]
733        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
734            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
735        {
736            earliest_match = Some((m.start(), "ref_image", m));
737        }
738
739        // Check for footnote references - [^note]
740        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
741            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
742        {
743            earliest_match = Some((m.start(), "footnote_ref", m));
744        }
745
746        // Check for inline links - [text](url)
747        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
748            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
749        {
750            earliest_match = Some((m.start(), "inline_link", m));
751        }
752
753        // Check for reference links - [text][ref]
754        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
755            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
756        {
757            earliest_match = Some((m.start(), "ref_link", m));
758        }
759
760        // Check for shortcut reference links - [ref]
761        // Only check if we haven't found an earlier pattern that would conflict
762        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
763            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
764        {
765            earliest_match = Some((m.start(), "shortcut_ref", m));
766        }
767
768        // Check for wiki-style links - [[wiki]]
769        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
770            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
771        {
772            earliest_match = Some((m.start(), "wiki_link", m));
773        }
774
775        // Check for display math first (before inline) - $$math$$
776        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
777            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
778        {
779            earliest_match = Some((m.start(), "display_math", m));
780        }
781
782        // Check for inline math - $math$
783        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
784            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
785        {
786            earliest_match = Some((m.start(), "inline_math", m));
787        }
788
789        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
790
791        // Check for emoji shortcodes - :emoji:
792        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
793            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
794        {
795            earliest_match = Some((m.start(), "emoji", m));
796        }
797
798        // Check for HTML entities - &nbsp; etc
799        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
800            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
801        {
802            earliest_match = Some((m.start(), "html_entity", m));
803        }
804
805        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
806        // Must be checked before other patterns to avoid false sentence breaks
807        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
808            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
809        {
810            earliest_match = Some((m.start(), "hugo_shortcode", m));
811        }
812
813        // Check for HTML tags - <tag> </tag> <tag/>
814        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
815        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
816            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
817        {
818            // Check if this is an autolink (starts with protocol or mailto:)
819            let matched_text = &remaining[m.start()..m.end()];
820            let is_url_autolink = matched_text.starts_with("<http://")
821                || matched_text.starts_with("<https://")
822                || matched_text.starts_with("<mailto:")
823                || matched_text.starts_with("<ftp://")
824                || matched_text.starts_with("<ftps://");
825
826            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
827            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
828            let is_email_autolink = {
829                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
830                EMAIL_PATTERN.is_match(content)
831            };
832
833            if !is_url_autolink && !is_email_autolink {
834                earliest_match = Some((m.start(), "html_tag", m));
835            }
836        }
837
838        // Find earliest non-link special characters
839        let mut next_special = remaining.len();
840        let mut special_type = "";
841        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
842
843        // Check for code spans (not handled by pulldown-cmark in this context)
844        if let Some(pos) = remaining.find('`')
845            && pos < next_special
846        {
847            next_special = pos;
848            special_type = "code";
849        }
850
851        // Check for emphasis using pulldown-cmark's pre-extracted spans
852        // Find the earliest emphasis span that starts within remaining text
853        for span in &emphasis_spans {
854            if span.start >= current_offset && span.start < current_offset + remaining.len() {
855                let pos_in_remaining = span.start - current_offset;
856                if pos_in_remaining < next_special {
857                    next_special = pos_in_remaining;
858                    special_type = "pulldown_emphasis";
859                    pulldown_emphasis = Some(span);
860                }
861                break; // Spans are sorted by start position, so first match is earliest
862            }
863        }
864
865        // Determine which pattern to process first
866        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
867            pos < next_special
868        } else {
869            false
870        };
871
872        if should_process_markdown_link {
873            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
874
875            // Add any text before the match
876            if pos > 0 {
877                elements.push(Element::Text(remaining[..pos].to_string()));
878            }
879
880            // Process the matched pattern
881            match pattern_type {
882                // Pattern 1: [![alt](img)](link) - inline image in inline link
883                "linked_image_ii" => {
884                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
885                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
886                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
887                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
888                        elements.push(Element::LinkedImage {
889                            alt: alt.to_string(),
890                            img_source: LinkedImageSource::Inline(img_url.to_string()),
891                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
892                        });
893                        remaining = &remaining[match_obj.end()..];
894                    } else {
895                        elements.push(Element::Text("[".to_string()));
896                        remaining = &remaining[1..];
897                    }
898                }
899                // Pattern 2: [![alt][ref]](link) - reference image in inline link
900                "linked_image_ri" => {
901                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
902                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
903                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
904                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
905                        elements.push(Element::LinkedImage {
906                            alt: alt.to_string(),
907                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
908                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
909                        });
910                        remaining = &remaining[match_obj.end()..];
911                    } else {
912                        elements.push(Element::Text("[".to_string()));
913                        remaining = &remaining[1..];
914                    }
915                }
916                // Pattern 3: [![alt](img)][ref] - inline image in reference link
917                "linked_image_ir" => {
918                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
919                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
920                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
921                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
922                        elements.push(Element::LinkedImage {
923                            alt: alt.to_string(),
924                            img_source: LinkedImageSource::Inline(img_url.to_string()),
925                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
926                        });
927                        remaining = &remaining[match_obj.end()..];
928                    } else {
929                        elements.push(Element::Text("[".to_string()));
930                        remaining = &remaining[1..];
931                    }
932                }
933                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
934                "linked_image_rr" => {
935                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
936                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
937                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
938                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
939                        elements.push(Element::LinkedImage {
940                            alt: alt.to_string(),
941                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
942                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
943                        });
944                        remaining = &remaining[match_obj.end()..];
945                    } else {
946                        elements.push(Element::Text("[".to_string()));
947                        remaining = &remaining[1..];
948                    }
949                }
950                "inline_image" => {
951                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
952                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
953                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
954                        elements.push(Element::InlineImage {
955                            alt: alt.to_string(),
956                            url: url.to_string(),
957                        });
958                        remaining = &remaining[match_obj.end()..];
959                    } else {
960                        elements.push(Element::Text("!".to_string()));
961                        remaining = &remaining[1..];
962                    }
963                }
964                "ref_image" => {
965                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
966                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
967                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
968
969                        if reference.is_empty() {
970                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
971                        } else {
972                            elements.push(Element::ReferenceImage {
973                                alt: alt.to_string(),
974                                reference: reference.to_string(),
975                            });
976                        }
977                        remaining = &remaining[match_obj.end()..];
978                    } else {
979                        elements.push(Element::Text("!".to_string()));
980                        remaining = &remaining[1..];
981                    }
982                }
983                "footnote_ref" => {
984                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
985                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
986                        elements.push(Element::FootnoteReference { note: note.to_string() });
987                        remaining = &remaining[match_obj.end()..];
988                    } else {
989                        elements.push(Element::Text("[".to_string()));
990                        remaining = &remaining[1..];
991                    }
992                }
993                "inline_link" => {
994                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
995                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
996                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
997                        elements.push(Element::Link {
998                            text: text.to_string(),
999                            url: url.to_string(),
1000                        });
1001                        remaining = &remaining[match_obj.end()..];
1002                    } else {
1003                        // Fallback - shouldn't happen
1004                        elements.push(Element::Text("[".to_string()));
1005                        remaining = &remaining[1..];
1006                    }
1007                }
1008                "ref_link" => {
1009                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1010                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1011                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1012
1013                        if reference.is_empty() {
1014                            // Empty reference link [text][]
1015                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1016                        } else {
1017                            // Regular reference link [text][ref]
1018                            elements.push(Element::ReferenceLink {
1019                                text: text.to_string(),
1020                                reference: reference.to_string(),
1021                            });
1022                        }
1023                        remaining = &remaining[match_obj.end()..];
1024                    } else {
1025                        // Fallback - shouldn't happen
1026                        elements.push(Element::Text("[".to_string()));
1027                        remaining = &remaining[1..];
1028                    }
1029                }
1030                "shortcut_ref" => {
1031                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1032                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1033                        elements.push(Element::ShortcutReference {
1034                            reference: reference.to_string(),
1035                        });
1036                        remaining = &remaining[match_obj.end()..];
1037                    } else {
1038                        // Fallback - shouldn't happen
1039                        elements.push(Element::Text("[".to_string()));
1040                        remaining = &remaining[1..];
1041                    }
1042                }
1043                "wiki_link" => {
1044                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1045                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1046                        elements.push(Element::WikiLink(content.to_string()));
1047                        remaining = &remaining[match_obj.end()..];
1048                    } else {
1049                        elements.push(Element::Text("[[".to_string()));
1050                        remaining = &remaining[2..];
1051                    }
1052                }
1053                "display_math" => {
1054                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1055                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1056                        elements.push(Element::DisplayMath(math.to_string()));
1057                        remaining = &remaining[match_obj.end()..];
1058                    } else {
1059                        elements.push(Element::Text("$$".to_string()));
1060                        remaining = &remaining[2..];
1061                    }
1062                }
1063                "inline_math" => {
1064                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1065                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1066                        elements.push(Element::InlineMath(math.to_string()));
1067                        remaining = &remaining[match_obj.end()..];
1068                    } else {
1069                        elements.push(Element::Text("$".to_string()));
1070                        remaining = &remaining[1..];
1071                    }
1072                }
1073                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1074                "emoji" => {
1075                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1076                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1077                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1078                        remaining = &remaining[match_obj.end()..];
1079                    } else {
1080                        elements.push(Element::Text(":".to_string()));
1081                        remaining = &remaining[1..];
1082                    }
1083                }
1084                "html_entity" => {
1085                    // HTML entities are captured whole - use as_str() to get just the matched content
1086                    elements.push(Element::HtmlEntity(match_obj.as_str().to_string()));
1087                    remaining = &remaining[match_obj.end()..];
1088                }
1089                "hugo_shortcode" => {
1090                    // Hugo shortcodes are atomic elements - preserve them exactly
1091                    elements.push(Element::HugoShortcode(match_obj.as_str().to_string()));
1092                    remaining = &remaining[match_obj.end()..];
1093                }
1094                "html_tag" => {
1095                    // HTML tags are captured whole - use as_str() to get just the matched content
1096                    elements.push(Element::HtmlTag(match_obj.as_str().to_string()));
1097                    remaining = &remaining[match_obj.end()..];
1098                }
1099                _ => {
1100                    // Unknown pattern, treat as text
1101                    elements.push(Element::Text("[".to_string()));
1102                    remaining = &remaining[1..];
1103                }
1104            }
1105        } else {
1106            // Process non-link special characters
1107
1108            // Add any text before the special character
1109            if next_special > 0 && next_special < remaining.len() {
1110                elements.push(Element::Text(remaining[..next_special].to_string()));
1111                remaining = &remaining[next_special..];
1112            }
1113
1114            // Process the special element
1115            match special_type {
1116                "code" => {
1117                    // Find end of code
1118                    if let Some(code_end) = remaining[1..].find('`') {
1119                        let code = &remaining[1..1 + code_end];
1120                        elements.push(Element::Code(code.to_string()));
1121                        remaining = &remaining[1 + code_end + 1..];
1122                    } else {
1123                        // No closing backtick, treat as text
1124                        elements.push(Element::Text(remaining.to_string()));
1125                        break;
1126                    }
1127                }
1128                "pulldown_emphasis" => {
1129                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1130                    if let Some(span) = pulldown_emphasis {
1131                        let span_len = span.end - span.start;
1132                        if span.is_strikethrough {
1133                            elements.push(Element::Strikethrough(span.content.clone()));
1134                        } else if span.is_strong {
1135                            elements.push(Element::Bold {
1136                                content: span.content.clone(),
1137                                underscore: span.uses_underscore,
1138                            });
1139                        } else {
1140                            elements.push(Element::Italic {
1141                                content: span.content.clone(),
1142                                underscore: span.uses_underscore,
1143                            });
1144                        }
1145                        remaining = &remaining[span_len..];
1146                    } else {
1147                        // Fallback - shouldn't happen
1148                        elements.push(Element::Text(remaining[..1].to_string()));
1149                        remaining = &remaining[1..];
1150                    }
1151                }
1152                _ => {
1153                    // No special elements found, add all remaining text
1154                    elements.push(Element::Text(remaining.to_string()));
1155                    break;
1156                }
1157            }
1158        }
1159    }
1160
1161    elements
1162}
1163
1164/// Reflow elements for sentence-per-line mode
1165fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1166    let abbreviations = get_abbreviations(custom_abbreviations);
1167    let mut lines = Vec::new();
1168    let mut current_line = String::new();
1169
1170    for element in elements.iter() {
1171        let element_str = format!("{element}");
1172
1173        // For text elements, split into sentences
1174        if let Element::Text(text) = element {
1175            // Simply append text - it already has correct spacing from tokenization
1176            let combined = format!("{current_line}{text}");
1177            // Use the pre-computed abbreviations set to avoid redundant computation
1178            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1179
1180            if sentences.len() > 1 {
1181                // We found sentence boundaries
1182                for (i, sentence) in sentences.iter().enumerate() {
1183                    if i == 0 {
1184                        // First sentence might continue from previous elements
1185                        // But check if it ends with an abbreviation
1186                        let trimmed = sentence.trim();
1187
1188                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1189                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1190                            current_line = sentence.to_string();
1191                        } else {
1192                            // Normal case - emit the first sentence
1193                            lines.push(sentence.to_string());
1194                            current_line.clear();
1195                        }
1196                    } else if i == sentences.len() - 1 {
1197                        // Last sentence: check if it's complete or incomplete
1198                        let trimmed = sentence.trim();
1199                        let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1200
1201                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1202                            // Complete sentence - emit it immediately
1203                            lines.push(sentence.to_string());
1204                            current_line.clear();
1205                        } else {
1206                            // Incomplete sentence - save for next iteration
1207                            current_line = sentence.to_string();
1208                        }
1209                    } else {
1210                        // Complete sentences in the middle
1211                        lines.push(sentence.to_string());
1212                    }
1213                }
1214            } else {
1215                // Single sentence - check if it's complete
1216                let trimmed = combined.trim();
1217
1218                // If the combined result is only whitespace, don't accumulate it.
1219                // This prevents leading spaces on subsequent elements when lines
1220                // are joined with spaces during reflow iteration.
1221                if trimmed.is_empty() {
1222                    continue;
1223                }
1224
1225                let ends_with_sentence_punct = ends_with_sentence_punct(trimmed);
1226
1227                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1228                    // Complete single sentence - emit it
1229                    lines.push(trimmed.to_string());
1230                    current_line.clear();
1231                } else {
1232                    // Incomplete sentence - continue accumulating
1233                    current_line = combined;
1234                }
1235            }
1236        } else if let Element::Italic { content, underscore } = element {
1237            // Handle italic elements - may contain multiple sentences that need continuation
1238            let marker = if *underscore { "_" } else { "*" };
1239            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1240        } else if let Element::Bold { content, underscore } = element {
1241            // Handle bold elements - may contain multiple sentences that need continuation
1242            let marker = if *underscore { "__" } else { "**" };
1243            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1244        } else if let Element::Strikethrough(content) = element {
1245            // Handle strikethrough elements - may contain multiple sentences that need continuation
1246            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1247        } else {
1248            // Non-text, non-emphasis elements (Code, Links, etc.)
1249            // Add space before element if needed (unless it's after an opening paren/bracket)
1250            if !current_line.is_empty()
1251                && !current_line.ends_with(' ')
1252                && !current_line.ends_with('(')
1253                && !current_line.ends_with('[')
1254            {
1255                current_line.push(' ');
1256            }
1257            current_line.push_str(&element_str);
1258        }
1259    }
1260
1261    // Add any remaining content
1262    if !current_line.is_empty() {
1263        lines.push(current_line.trim().to_string());
1264    }
1265    lines
1266}
1267
1268/// Handle splitting emphasis content at sentence boundaries while preserving markers
1269fn handle_emphasis_sentence_split(
1270    content: &str,
1271    marker: &str,
1272    abbreviations: &HashSet<String>,
1273    current_line: &mut String,
1274    lines: &mut Vec<String>,
1275) {
1276    // Split the emphasis content into sentences
1277    let sentences = split_into_sentences_with_set(content, abbreviations);
1278
1279    if sentences.len() <= 1 {
1280        // Single sentence or no boundaries - treat as atomic
1281        if !current_line.is_empty()
1282            && !current_line.ends_with(' ')
1283            && !current_line.ends_with('(')
1284            && !current_line.ends_with('[')
1285        {
1286            current_line.push(' ');
1287        }
1288        current_line.push_str(marker);
1289        current_line.push_str(content);
1290        current_line.push_str(marker);
1291
1292        // Check if the emphasis content ends with sentence punctuation - if so, emit
1293        let trimmed = content.trim();
1294        let ends_with_punct = ends_with_sentence_punct(trimmed);
1295        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1296            lines.push(current_line.clone());
1297            current_line.clear();
1298        }
1299    } else {
1300        // Multiple sentences - each gets its own emphasis markers
1301        for (i, sentence) in sentences.iter().enumerate() {
1302            let trimmed = sentence.trim();
1303            if trimmed.is_empty() {
1304                continue;
1305            }
1306
1307            if i == 0 {
1308                // First sentence: combine with current_line and emit
1309                if !current_line.is_empty()
1310                    && !current_line.ends_with(' ')
1311                    && !current_line.ends_with('(')
1312                    && !current_line.ends_with('[')
1313                {
1314                    current_line.push(' ');
1315                }
1316                current_line.push_str(marker);
1317                current_line.push_str(trimmed);
1318                current_line.push_str(marker);
1319
1320                // Check if this is a complete sentence
1321                let ends_with_punct = ends_with_sentence_punct(trimmed);
1322                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1323                    lines.push(current_line.clone());
1324                    current_line.clear();
1325                }
1326            } else if i == sentences.len() - 1 {
1327                // Last sentence: check if complete
1328                let ends_with_punct = ends_with_sentence_punct(trimmed);
1329
1330                let mut line = String::new();
1331                line.push_str(marker);
1332                line.push_str(trimmed);
1333                line.push_str(marker);
1334
1335                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1336                    lines.push(line);
1337                } else {
1338                    // Incomplete - keep in current_line for potential continuation
1339                    *current_line = line;
1340                }
1341            } else {
1342                // Middle sentences: emit with markers
1343                let mut line = String::new();
1344                line.push_str(marker);
1345                line.push_str(trimmed);
1346                line.push_str(marker);
1347                lines.push(line);
1348            }
1349        }
1350    }
1351}
1352
1353/// Reflow elements into lines that fit within the line length
1354fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1355    let mut lines = Vec::new();
1356    let mut current_line = String::new();
1357    let mut current_length = 0;
1358
1359    for element in elements {
1360        let element_str = format!("{element}");
1361        let element_len = element.len();
1362
1363        // For text elements that might need breaking
1364        if let Element::Text(text) = element {
1365            // Check if original text had leading whitespace
1366            let has_leading_space = text.starts_with(char::is_whitespace);
1367            // If this is a text element, always process it word by word
1368            let words: Vec<&str> = text.split_whitespace().collect();
1369
1370            for (i, word) in words.iter().enumerate() {
1371                let word_len = word.chars().count();
1372                // Check if this "word" is just punctuation that should stay attached
1373                let is_trailing_punct = word
1374                    .chars()
1375                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1376
1377                if current_length > 0 && current_length + 1 + word_len > options.line_length && !is_trailing_punct {
1378                    // Start a new line (but never for trailing punctuation)
1379                    lines.push(current_line.trim().to_string());
1380                    current_line = word.to_string();
1381                    current_length = word_len;
1382                } else {
1383                    // Add word to current line
1384                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1385                    // AND this isn't trailing punctuation (which attaches directly)
1386                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1387                        current_line.push(' ');
1388                        current_length += 1;
1389                    }
1390                    current_line.push_str(word);
1391                    current_length += word_len;
1392                }
1393            }
1394        } else {
1395            // For non-text elements (code, links, references), treat as atomic units
1396            // These should never be broken across lines
1397            if current_length > 0 && current_length + 1 + element_len > options.line_length {
1398                // Start a new line
1399                lines.push(current_line.trim().to_string());
1400                current_line = element_str;
1401                current_length = element_len;
1402            } else {
1403                // Add element to current line
1404                // Don't add space if the current line ends with an opening bracket/paren
1405                let ends_with_opener =
1406                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1407                if current_length > 0 && !ends_with_opener {
1408                    current_line.push(' ');
1409                    current_length += 1;
1410                }
1411                current_line.push_str(&element_str);
1412                current_length += element_len;
1413            }
1414        }
1415    }
1416
1417    // Don't forget the last line
1418    if !current_line.is_empty() {
1419        lines.push(current_line.trim_end().to_string());
1420    }
1421
1422    lines
1423}
1424
1425/// Reflow markdown content preserving structure
1426pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1427    let lines: Vec<&str> = content.lines().collect();
1428    let mut result = Vec::new();
1429    let mut i = 0;
1430
1431    while i < lines.len() {
1432        let line = lines[i];
1433        let trimmed = line.trim();
1434
1435        // Preserve empty lines
1436        if trimmed.is_empty() {
1437            result.push(String::new());
1438            i += 1;
1439            continue;
1440        }
1441
1442        // Preserve headings as-is
1443        if trimmed.starts_with('#') {
1444            result.push(line.to_string());
1445            i += 1;
1446            continue;
1447        }
1448
1449        // Preserve fenced code blocks
1450        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1451            result.push(line.to_string());
1452            i += 1;
1453            // Copy lines until closing fence
1454            while i < lines.len() {
1455                result.push(lines[i].to_string());
1456                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1457                    i += 1;
1458                    break;
1459                }
1460                i += 1;
1461            }
1462            continue;
1463        }
1464
1465        // Preserve indented code blocks (4+ columns accounting for tab expansion)
1466        if ElementCache::calculate_indentation_width_default(line) >= 4 {
1467            // Collect all consecutive indented lines
1468            result.push(line.to_string());
1469            i += 1;
1470            while i < lines.len() {
1471                let next_line = lines[i];
1472                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1473                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
1474                    result.push(next_line.to_string());
1475                    i += 1;
1476                } else {
1477                    break;
1478                }
1479            }
1480            continue;
1481        }
1482
1483        // Preserve block quotes (but reflow their content)
1484        if trimmed.starts_with('>') {
1485            // find() returns byte position which is correct for str slicing
1486            // The unwrap is safe because we already verified trimmed starts with '>'
1487            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
1488            let quote_prefix = line[0..gt_pos + 1].to_string();
1489            let quote_content = &line[quote_prefix.len()..].trim_start();
1490
1491            let reflowed = reflow_line(quote_content, options);
1492            for reflowed_line in reflowed.iter() {
1493                result.push(format!("{quote_prefix} {reflowed_line}"));
1494            }
1495            i += 1;
1496            continue;
1497        }
1498
1499        // Preserve horizontal rules first (before checking for lists)
1500        if is_horizontal_rule(trimmed) {
1501            result.push(line.to_string());
1502            i += 1;
1503            continue;
1504        }
1505
1506        // Preserve lists (but not horizontal rules)
1507        // A valid unordered list marker must be followed by a space (or be alone on line)
1508        // This prevents emphasis markers like "*text*" from being parsed as list items
1509        let is_unordered_list = |s: &str, marker: char| -> bool {
1510            s.starts_with(marker) && !is_horizontal_rule(s) && (s.len() == 1 || s.chars().nth(1) == Some(' '))
1511        };
1512        if is_unordered_list(trimmed, '-')
1513            || is_unordered_list(trimmed, '*')
1514            || is_unordered_list(trimmed, '+')
1515            || is_numbered_list_item(trimmed)
1516        {
1517            // Find the list marker and preserve indentation
1518            let indent = line.len() - line.trim_start().len();
1519            let indent_str = " ".repeat(indent);
1520
1521            // For numbered lists, find the period and the space after it
1522            // For bullet lists, find the marker and the space after it
1523            let mut marker_end = indent;
1524            let mut content_start = indent;
1525
1526            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1527                // Numbered list: find the period
1528                if let Some(period_pos) = line[indent..].find('.') {
1529                    marker_end = indent + period_pos + 1; // Include the period
1530                    content_start = marker_end;
1531                    // Skip any spaces after the period to find content start
1532                    // Use byte-based check since content_start is a byte index
1533                    // This is safe because space is ASCII (single byte)
1534                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1535                        content_start += 1;
1536                    }
1537                }
1538            } else {
1539                // Bullet list: marker is single character
1540                marker_end = indent + 1; // Just the marker character
1541                content_start = marker_end;
1542                // Skip any spaces after the marker
1543                // Use byte-based check since content_start is a byte index
1544                // This is safe because space is ASCII (single byte)
1545                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1546                    content_start += 1;
1547                }
1548            }
1549
1550            let marker = &line[indent..marker_end];
1551
1552            // Collect all content for this list item (including continuation lines)
1553            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1554            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1555            i += 1;
1556
1557            // Collect continuation lines (indented lines that are part of this list item)
1558            while i < lines.len() {
1559                let next_line = lines[i];
1560                let next_trimmed = next_line.trim();
1561
1562                // Stop if we hit an empty line or another list item or special block
1563                if next_trimmed.is_empty()
1564                    || next_trimmed.starts_with('#')
1565                    || next_trimmed.starts_with("```")
1566                    || next_trimmed.starts_with("~~~")
1567                    || next_trimmed.starts_with('>')
1568                    || next_trimmed.starts_with('|')
1569                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1570                    || is_horizontal_rule(next_trimmed)
1571                    || (next_trimmed.starts_with('-')
1572                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1573                    || (next_trimmed.starts_with('*')
1574                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1575                    || (next_trimmed.starts_with('+')
1576                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1577                    || is_numbered_list_item(next_trimmed)
1578                    || is_definition_list_item(next_trimmed)
1579                {
1580                    break;
1581                }
1582
1583                // Check if this line is indented (continuation of list item)
1584                let next_indent = next_line.len() - next_line.trim_start().len();
1585                if next_indent >= content_start {
1586                    // This is a continuation line - add its content
1587                    // Preserve hard breaks while trimming excessive whitespace
1588                    let trimmed_start = next_line.trim_start();
1589                    list_content.push(trim_preserving_hard_break(trimmed_start));
1590                    i += 1;
1591                } else {
1592                    // Not indented enough, not part of this list item
1593                    break;
1594                }
1595            }
1596
1597            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1598            // Hard breaks should prevent joining with the next line
1599            let combined_content = if options.preserve_breaks {
1600                list_content[0].clone()
1601            } else {
1602                // Check if any lines have hard breaks - if so, preserve the structure
1603                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1604                if has_hard_breaks {
1605                    // Don't join lines with hard breaks - keep them separate with newlines
1606                    list_content.join("\n")
1607                } else {
1608                    // No hard breaks, safe to join with spaces
1609                    list_content.join(" ")
1610                }
1611            };
1612
1613            // Calculate the proper indentation for continuation lines
1614            let trimmed_marker = marker;
1615            let continuation_spaces = content_start;
1616
1617            // Adjust line length to account for list marker and space
1618            let prefix_length = indent + trimmed_marker.len() + 1;
1619
1620            // Create adjusted options with reduced line length
1621            let adjusted_options = ReflowOptions {
1622                line_length: options.line_length.saturating_sub(prefix_length),
1623                ..options.clone()
1624            };
1625
1626            let reflowed = reflow_line(&combined_content, &adjusted_options);
1627            for (j, reflowed_line) in reflowed.iter().enumerate() {
1628                if j == 0 {
1629                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1630                } else {
1631                    // Continuation lines aligned with text after marker
1632                    let continuation_indent = " ".repeat(continuation_spaces);
1633                    result.push(format!("{continuation_indent}{reflowed_line}"));
1634                }
1635            }
1636            continue;
1637        }
1638
1639        // Preserve tables
1640        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
1641            result.push(line.to_string());
1642            i += 1;
1643            continue;
1644        }
1645
1646        // Preserve reference definitions
1647        if trimmed.starts_with('[') && line.contains("]:") {
1648            result.push(line.to_string());
1649            i += 1;
1650            continue;
1651        }
1652
1653        // Preserve definition list items (extended markdown)
1654        if is_definition_list_item(trimmed) {
1655            result.push(line.to_string());
1656            i += 1;
1657            continue;
1658        }
1659
1660        // Check if this is a single line that doesn't need processing
1661        let mut is_single_line_paragraph = true;
1662        if i + 1 < lines.len() {
1663            let next_line = lines[i + 1];
1664            let next_trimmed = next_line.trim();
1665            // Check if next line starts a new block
1666            if !next_trimmed.is_empty()
1667                && !next_trimmed.starts_with('#')
1668                && !next_trimmed.starts_with("```")
1669                && !next_trimmed.starts_with("~~~")
1670                && !next_trimmed.starts_with('>')
1671                && !next_trimmed.starts_with('|')
1672                && !(next_trimmed.starts_with('[') && next_line.contains("]:"))
1673                && !is_horizontal_rule(next_trimmed)
1674                && !(next_trimmed.starts_with('-')
1675                    && !is_horizontal_rule(next_trimmed)
1676                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1677                && !(next_trimmed.starts_with('*')
1678                    && !is_horizontal_rule(next_trimmed)
1679                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1680                && !(next_trimmed.starts_with('+')
1681                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1682                && !is_numbered_list_item(next_trimmed)
1683            {
1684                is_single_line_paragraph = false;
1685            }
1686        }
1687
1688        // If it's a single line that fits, just add it as-is
1689        if is_single_line_paragraph && line.chars().count() <= options.line_length {
1690            result.push(line.to_string());
1691            i += 1;
1692            continue;
1693        }
1694
1695        // For regular paragraphs, collect consecutive lines
1696        let mut paragraph_parts = Vec::new();
1697        let mut current_part = vec![line];
1698        i += 1;
1699
1700        // If preserve_breaks is true, treat each line separately
1701        if options.preserve_breaks {
1702            // Don't collect consecutive lines - just reflow this single line
1703            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
1704                Some("\\")
1705            } else if line.ends_with("  ") {
1706                Some("  ")
1707            } else {
1708                None
1709            };
1710            let reflowed = reflow_line(line, options);
1711
1712            // Preserve hard breaks (two trailing spaces or backslash)
1713            if let Some(break_marker) = hard_break_type {
1714                if !reflowed.is_empty() {
1715                    let mut reflowed_with_break = reflowed;
1716                    let last_idx = reflowed_with_break.len() - 1;
1717                    if !has_hard_break(&reflowed_with_break[last_idx]) {
1718                        reflowed_with_break[last_idx].push_str(break_marker);
1719                    }
1720                    result.extend(reflowed_with_break);
1721                }
1722            } else {
1723                result.extend(reflowed);
1724            }
1725        } else {
1726            // Original behavior: collect consecutive lines into a paragraph
1727            while i < lines.len() {
1728                let prev_line = if !current_part.is_empty() {
1729                    current_part.last().unwrap()
1730                } else {
1731                    ""
1732                };
1733                let next_line = lines[i];
1734                let next_trimmed = next_line.trim();
1735
1736                // Stop at empty lines or special blocks
1737                if next_trimmed.is_empty()
1738                    || next_trimmed.starts_with('#')
1739                    || next_trimmed.starts_with("```")
1740                    || next_trimmed.starts_with("~~~")
1741                    || next_trimmed.starts_with('>')
1742                    || next_trimmed.starts_with('|')
1743                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1744                    || is_horizontal_rule(next_trimmed)
1745                    || (next_trimmed.starts_with('-')
1746                        && !is_horizontal_rule(next_trimmed)
1747                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1748                    || (next_trimmed.starts_with('*')
1749                        && !is_horizontal_rule(next_trimmed)
1750                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1751                    || (next_trimmed.starts_with('+')
1752                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1753                    || is_numbered_list_item(next_trimmed)
1754                    || is_definition_list_item(next_trimmed)
1755                {
1756                    break;
1757                }
1758
1759                // Check if previous line ends with hard break (two spaces or backslash)
1760                // or is a complete sentence in sentence_per_line mode
1761                let prev_trimmed = prev_line.trim();
1762                let abbreviations = get_abbreviations(&options.abbreviations);
1763                let ends_with_sentence = (prev_trimmed.ends_with('.')
1764                    || prev_trimmed.ends_with('!')
1765                    || prev_trimmed.ends_with('?')
1766                    || prev_trimmed.ends_with(".*")
1767                    || prev_trimmed.ends_with("!*")
1768                    || prev_trimmed.ends_with("?*")
1769                    || prev_trimmed.ends_with("._")
1770                    || prev_trimmed.ends_with("!_")
1771                    || prev_trimmed.ends_with("?_")
1772                    // Quote-terminated sentences (straight and curly quotes)
1773                    || prev_trimmed.ends_with(".\"")
1774                    || prev_trimmed.ends_with("!\"")
1775                    || prev_trimmed.ends_with("?\"")
1776                    || prev_trimmed.ends_with(".'")
1777                    || prev_trimmed.ends_with("!'")
1778                    || prev_trimmed.ends_with("?'")
1779                    || prev_trimmed.ends_with(".\u{201D}")
1780                    || prev_trimmed.ends_with("!\u{201D}")
1781                    || prev_trimmed.ends_with("?\u{201D}")
1782                    || prev_trimmed.ends_with(".\u{2019}")
1783                    || prev_trimmed.ends_with("!\u{2019}")
1784                    || prev_trimmed.ends_with("?\u{2019}"))
1785                    && !text_ends_with_abbreviation(
1786                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
1787                        &abbreviations,
1788                    );
1789
1790                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
1791                    // Start a new part after hard break or complete sentence
1792                    paragraph_parts.push(current_part.join(" "));
1793                    current_part = vec![next_line];
1794                } else {
1795                    current_part.push(next_line);
1796                }
1797                i += 1;
1798            }
1799
1800            // Add the last part
1801            if !current_part.is_empty() {
1802                if current_part.len() == 1 {
1803                    // Single line, don't add trailing space
1804                    paragraph_parts.push(current_part[0].to_string());
1805                } else {
1806                    paragraph_parts.push(current_part.join(" "));
1807                }
1808            }
1809
1810            // Reflow each part separately, preserving hard breaks
1811            for (j, part) in paragraph_parts.iter().enumerate() {
1812                let reflowed = reflow_line(part, options);
1813                result.extend(reflowed);
1814
1815                // Preserve hard break by ensuring last line of part ends with hard break marker
1816                // Use two spaces as the default hard break format for reflows
1817                // But don't add hard breaks in sentence_per_line mode - lines are already separate
1818                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
1819                    let last_idx = result.len() - 1;
1820                    if !has_hard_break(&result[last_idx]) {
1821                        result[last_idx].push_str("  ");
1822                    }
1823                }
1824            }
1825        }
1826    }
1827
1828    // Preserve trailing newline if the original content had one
1829    let result_text = result.join("\n");
1830    if content.ends_with('\n') && !result_text.ends_with('\n') {
1831        format!("{result_text}\n")
1832    } else {
1833        result_text
1834    }
1835}
1836
/// Information about a reflowed paragraph
///
/// `start_byte..end_byte` is the byte range of the paragraph in the original
/// content, and `reflowed_text` is the text that replaces that range.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphReflow {
    /// Starting byte offset of the paragraph in the original content
    pub start_byte: usize,
    /// Ending byte offset of the paragraph in the original content
    pub end_byte: usize,
    /// The reflowed text for this paragraph
    pub reflowed_text: String,
}
1847
1848/// Reflow a single paragraph at the specified line number
1849///
1850/// This function finds the paragraph containing the given line number,
1851/// reflows it according to the specified line length, and returns
1852/// information about the paragraph location and its reflowed text.
1853///
1854/// # Arguments
1855///
1856/// * `content` - The full document content
1857/// * `line_number` - The 1-based line number within the paragraph to reflow
1858/// * `line_length` - The target line length for reflowing
1859///
1860/// # Returns
1861///
1862/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
1863/// or `None` if the line number is out of bounds or the content at that
1864/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
1865pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
1866    if line_number == 0 {
1867        return None;
1868    }
1869
1870    let lines: Vec<&str> = content.lines().collect();
1871
1872    // Check if line number is valid (1-based)
1873    if line_number > lines.len() {
1874        return None;
1875    }
1876
1877    let target_idx = line_number - 1; // Convert to 0-based
1878    let target_line = lines[target_idx];
1879    let trimmed = target_line.trim();
1880
1881    // Don't reflow special blocks
1882    if trimmed.is_empty()
1883        || trimmed.starts_with('#')
1884        || trimmed.starts_with("```")
1885        || trimmed.starts_with("~~~")
1886        || ElementCache::calculate_indentation_width_default(target_line) >= 4
1887        || trimmed.starts_with('>')
1888        || crate::utils::table_utils::TableUtils::is_potential_table_row(target_line) // Tables
1889        || (trimmed.starts_with('[') && target_line.contains("]:")) // Reference definitions
1890        || is_horizontal_rule(trimmed)
1891        || ((trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+'))
1892            && !is_horizontal_rule(trimmed)
1893            && (trimmed.len() == 1 || trimmed.chars().nth(1) == Some(' ')))
1894        || is_numbered_list_item(trimmed)
1895        || is_definition_list_item(trimmed)
1896    {
1897        return None;
1898    }
1899
1900    // Find paragraph start - scan backward until blank line or special block
1901    let mut para_start = target_idx;
1902    while para_start > 0 {
1903        let prev_idx = para_start - 1;
1904        let prev_line = lines[prev_idx];
1905        let prev_trimmed = prev_line.trim();
1906
1907        // Stop at blank line or special blocks
1908        if prev_trimmed.is_empty()
1909            || prev_trimmed.starts_with('#')
1910            || prev_trimmed.starts_with("```")
1911            || prev_trimmed.starts_with("~~~")
1912            || ElementCache::calculate_indentation_width_default(prev_line) >= 4
1913            || prev_trimmed.starts_with('>')
1914            || crate::utils::table_utils::TableUtils::is_potential_table_row(prev_line)
1915            || (prev_trimmed.starts_with('[') && prev_line.contains("]:"))
1916            || is_horizontal_rule(prev_trimmed)
1917            || ((prev_trimmed.starts_with('-') || prev_trimmed.starts_with('*') || prev_trimmed.starts_with('+'))
1918                && !is_horizontal_rule(prev_trimmed)
1919                && (prev_trimmed.len() == 1 || prev_trimmed.chars().nth(1) == Some(' ')))
1920            || is_numbered_list_item(prev_trimmed)
1921            || is_definition_list_item(prev_trimmed)
1922        {
1923            break;
1924        }
1925
1926        para_start = prev_idx;
1927    }
1928
1929    // Find paragraph end - scan forward until blank line or special block
1930    let mut para_end = target_idx;
1931    while para_end + 1 < lines.len() {
1932        let next_idx = para_end + 1;
1933        let next_line = lines[next_idx];
1934        let next_trimmed = next_line.trim();
1935
1936        // Stop at blank line or special blocks
1937        if next_trimmed.is_empty()
1938            || next_trimmed.starts_with('#')
1939            || next_trimmed.starts_with("```")
1940            || next_trimmed.starts_with("~~~")
1941            || ElementCache::calculate_indentation_width_default(next_line) >= 4
1942            || next_trimmed.starts_with('>')
1943            || crate::utils::table_utils::TableUtils::is_potential_table_row(next_line)
1944            || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1945            || is_horizontal_rule(next_trimmed)
1946            || ((next_trimmed.starts_with('-') || next_trimmed.starts_with('*') || next_trimmed.starts_with('+'))
1947                && !is_horizontal_rule(next_trimmed)
1948                && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1949            || is_numbered_list_item(next_trimmed)
1950            || is_definition_list_item(next_trimmed)
1951        {
1952            break;
1953        }
1954
1955        para_end = next_idx;
1956    }
1957
1958    // Extract paragraph lines
1959    let paragraph_lines = &lines[para_start..=para_end];
1960
1961    // Calculate byte offsets
1962    let mut start_byte = 0;
1963    for line in lines.iter().take(para_start) {
1964        start_byte += line.len() + 1; // +1 for newline
1965    }
1966
1967    let mut end_byte = start_byte;
1968    for line in paragraph_lines.iter() {
1969        end_byte += line.len() + 1; // +1 for newline
1970    }
1971
1972    // Track whether the byte range includes a trailing newline
1973    // (it doesn't if this is the last line and the file doesn't end with newline)
1974    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
1975
1976    // Adjust end_byte if the last line doesn't have a newline
1977    if !includes_trailing_newline {
1978        end_byte -= 1;
1979    }
1980
1981    // Join paragraph lines and reflow
1982    let paragraph_text = paragraph_lines.join("\n");
1983
1984    // Create reflow options
1985    let options = ReflowOptions {
1986        line_length,
1987        break_on_sentences: true,
1988        preserve_breaks: false,
1989        sentence_per_line: false,
1990        abbreviations: None,
1991    };
1992
1993    // Reflow the paragraph using reflow_markdown to handle it properly
1994    let reflowed = reflow_markdown(&paragraph_text, &options);
1995
1996    // Ensure reflowed text matches whether the byte range includes a trailing newline
1997    // This is critical: if the range includes a newline, the replacement must too,
1998    // otherwise the next line will get appended to the reflowed paragraph
1999    let reflowed_text = if includes_trailing_newline {
2000        // Range includes newline - ensure reflowed text has one
2001        if reflowed.ends_with('\n') {
2002            reflowed
2003        } else {
2004            format!("{reflowed}\n")
2005        }
2006    } else {
2007        // Range doesn't include newline - ensure reflowed text doesn't have one
2008        if reflowed.ends_with('\n') {
2009            reflowed.trim_end_matches('\n').to_string()
2010        } else {
2011            reflowed
2012        }
2013    };
2014
2015    Some(ParagraphReflow {
2016        start_byte,
2017        end_byte,
2018        reflowed_text,
2019    })
2020}
2021
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `text_ends_with_abbreviation()` against the built-in
    /// abbreviation set returned by `get_abbreviations(&None)`.
    ///
    /// This helper-level test is kept inline; the public API and integration
    /// tests are in tests/utils/text_reflow_test.rs
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognised
        let recognised = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognised {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Everything else must be rejected
        let rejected = [
            // Not in the built-in list (etc doesn't always have period)
            "etc.",
            // Ordinary words that merely end a sentence
            "paradigms.",
            "programs.",
            "items.",
            "systems.",
            // Abbreviation followed by something other than a period
            "Dr?",
            "Mr!",
            // Ordinary word with a question mark
            "paradigms?",
            // No punctuation at all
            "word",
            // Empty string
            "",
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation"
            );
        }
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs