rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::regex_cache::{
9    DISPLAY_MATH_REGEX, EMAIL_PATTERN, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
10    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
11    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
12    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
13};
14use crate::utils::sentence_utils::{
15    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
16    text_ends_with_abbreviation,
17};
18use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
19use std::collections::HashSet;
20
/// Options controlling how a line or paragraph of Markdown text is reflowed.
///
/// `Debug` is derived so the options can be logged and used in assertion
/// messages, as is conventional for public types.
#[derive(Debug, Clone)]
pub struct ReflowOptions {
    /// Target line length, measured in characters.
    /// `reflow_line` treats `0` as "no wrapping" and returns the line unchanged.
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line.
    /// When set, `reflow_line` always processes the line, regardless of its length.
    pub sentence_per_line: bool,
    /// Custom abbreviations for sentence detection.
    /// Periods are optional - both "Dr" and "Dr." work the same.
    /// Custom abbreviations are always added to the built-in defaults.
    pub abbreviations: Option<Vec<String>>,
}
37
38impl Default for ReflowOptions {
39    fn default() -> Self {
40        Self {
41            line_length: 80,
42            break_on_sentences: true,
43            preserve_breaks: false,
44            sentence_per_line: false,
45            abbreviations: None,
46        }
47    }
48}
49
50/// Detect if a character position is a sentence boundary
51/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
52/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
53fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
54    let chars: Vec<char> = text.chars().collect();
55
56    if pos + 1 >= chars.len() {
57        return false;
58    }
59
60    let c = chars[pos];
61    let next_char = chars[pos + 1];
62
63    // Check for CJK sentence-ending punctuation (。, ！, ？)
64    // CJK punctuation doesn't require space or uppercase after it
65    if is_cjk_sentence_ending(c) {
66        // Skip any trailing emphasis/strikethrough markers
67        let mut after_punct_pos = pos + 1;
68        while after_punct_pos < chars.len()
69            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
70        {
71            after_punct_pos += 1;
72        }
73
74        // Skip whitespace
75        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
76            after_punct_pos += 1;
77        }
78
79        // Check if we have more content (any non-whitespace)
80        if after_punct_pos >= chars.len() {
81            return false;
82        }
83
84        // Skip leading emphasis/strikethrough markers
85        while after_punct_pos < chars.len()
86            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
87        {
88            after_punct_pos += 1;
89        }
90
91        if after_punct_pos >= chars.len() {
92            return false;
93        }
94
95        // For CJK, we accept any character as the start of the next sentence
96        // (no uppercase requirement, since CJK doesn't have case)
97        return true;
98    }
99
100    // Check for ASCII sentence-ending punctuation
101    if c != '.' && c != '!' && c != '?' {
102        return false;
103    }
104
105    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
106    let (_space_pos, after_space_pos) = if next_char == ' ' {
107        // Normal case: punctuation followed by space
108        (pos + 1, pos + 2)
109    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
110        // Sentence ends with quote - check what follows the quote
111        if chars[pos + 2] == ' ' {
112            // Just quote followed by space: 'sentence." '
113            (pos + 2, pos + 3)
114        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
115            // Quote followed by emphasis: 'sentence."* '
116            (pos + 3, pos + 4)
117        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
118            && pos + 4 < chars.len()
119            && chars[pos + 3] == chars[pos + 2]
120            && chars[pos + 4] == ' '
121        {
122            // Quote followed by bold: 'sentence."** '
123            (pos + 4, pos + 5)
124        } else {
125            return false;
126        }
127    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
128        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
129        (pos + 2, pos + 3)
130    } else if (next_char == '*' || next_char == '_')
131        && pos + 3 < chars.len()
132        && chars[pos + 2] == next_char
133        && chars[pos + 3] == ' '
134    {
135        // Sentence ends with bold: "sentence.** " or "sentence.__ "
136        (pos + 3, pos + 4)
137    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
138        // Sentence ends with strikethrough: "sentence.~~ "
139        (pos + 3, pos + 4)
140    } else {
141        return false;
142    };
143
144    // Skip all whitespace after the space to find the start of the next sentence
145    let mut next_char_pos = after_space_pos;
146    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
147        next_char_pos += 1;
148    }
149
150    // Check if we reached the end of the string
151    if next_char_pos >= chars.len() {
152        return false;
153    }
154
155    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
156    let mut first_letter_pos = next_char_pos;
157    while first_letter_pos < chars.len()
158        && (chars[first_letter_pos] == '*'
159            || chars[first_letter_pos] == '_'
160            || chars[first_letter_pos] == '~'
161            || is_opening_quote(chars[first_letter_pos]))
162    {
163        first_letter_pos += 1;
164    }
165
166    // Check if we reached the end after skipping emphasis
167    if first_letter_pos >= chars.len() {
168        return false;
169    }
170
171    // First character of next sentence must be uppercase or CJK
172    let first_char = chars[first_letter_pos];
173    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
174        return false;
175    }
176
177    // Look back to check for common abbreviations (only applies to periods)
178    if pos > 0 && c == '.' {
179        // Check if the text up to and including this period ends with an abbreviation
180        // Note: text[..=pos] includes the character at pos (the period)
181        if text_ends_with_abbreviation(&text[..=pos], abbreviations) {
182            return false;
183        }
184
185        // Check for decimal numbers (e.g., "3.14")
186        // Make sure to check if first_letter_pos is within bounds
187        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
188            return false;
189        }
190    }
191    true
192}
193
194/// Split text into sentences
195pub fn split_into_sentences(text: &str) -> Vec<String> {
196    split_into_sentences_custom(text, &None)
197}
198
199/// Split text into sentences with custom abbreviations
200pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
201    let abbreviations = get_abbreviations(custom_abbreviations);
202    split_into_sentences_with_set(text, &abbreviations)
203}
204
205/// Internal function to split text into sentences with a pre-computed abbreviations set
206/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
207fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
208    let mut sentences = Vec::new();
209    let mut current_sentence = String::new();
210    let mut chars = text.chars().peekable();
211    let mut pos = 0;
212
213    while let Some(c) = chars.next() {
214        current_sentence.push(c);
215
216        if is_sentence_boundary(text, pos, abbreviations) {
217            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
218            while let Some(&next) = chars.peek() {
219                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
220                    current_sentence.push(chars.next().unwrap());
221                    pos += 1;
222                } else {
223                    break;
224                }
225            }
226
227            // Consume the space after the sentence
228            if chars.peek() == Some(&' ') {
229                chars.next();
230                pos += 1;
231            }
232
233            sentences.push(current_sentence.trim().to_string());
234            current_sentence.clear();
235        }
236
237        pos += 1;
238    }
239
240    // Add any remaining text as the last sentence
241    if !current_sentence.trim().is_empty() {
242        sentences.push(current_sentence.trim().to_string());
243    }
244    sentences
245}
246
/// Check if a line is a horizontal rule (---, ___, ***)
///
/// The line must start with `-`, `_` or `*`, consist only of that one marker
/// character and spaces, and contain at least three markers.
fn is_horizontal_rule(line: &str) -> bool {
    // The first character picks the marker; anything else is not a rule.
    let marker = match line.chars().next() {
        Some(ch @ ('-' | '_' | '*')) => ch,
        _ => return false,
    };

    // Single pass: count markers, tolerate spaces, reject everything else.
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
275
/// Check if a line is a numbered list item (e.g., "1. ", "10. ")
///
/// The line must start with one or more ASCII digits, followed by a period
/// and a space. Only ASCII digits `0-9` count: CommonMark ordered list markers
/// are made of arabic digits, so other Unicode numerics (such as "½" or
/// Arabic-Indic digits) must not be mistaken for list markers.
///
/// "2019." alone is NOT treated as a list item (no space after the period),
/// consistent with list marker extraction and to avoid false positives.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are single bytes, so the count is also a valid byte offset
    // on a character boundary.
    let digit_count = line.bytes().take_while(u8::is_ascii_digit).count();
    digit_count > 0 && line[digit_count..].starts_with(". ")
}
299
/// Check if a line ends with a hard break (either two spaces or backslash)
///
/// CommonMark supports two formats for hard line breaks:
/// 1. Two or more trailing spaces
/// 2. A backslash at the end of the line
///
/// A single trailing `\r` (from a CRLF line ending) is ignored before checking.
fn has_hard_break(line: &str) -> bool {
    let body = line.strip_suffix('\r').unwrap_or(line);
    // Space and backslash are ASCII, so matching on the trailing bytes is
    // equivalent to matching on the trailing characters.
    matches!(body.as_bytes(), [.., b' ', b' '] | [.., b'\\'])
}
309
/// Trim trailing whitespace while preserving hard breaks (two trailing spaces or backslash)
///
/// Hard breaks in Markdown can be indicated by:
/// 1. Two trailing spaces before a newline (traditional)
/// 2. A backslash at the end of the line (mdformat style)
///
/// A single trailing `\r` is removed first so CRLF input behaves like LF input.
/// A backslash-terminated line is returned untouched; a line ending in two or
/// more spaces is normalized to exactly two; an all-whitespace line becomes empty.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as-is.
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();
    let mut result = String::with_capacity(content.len() + 2);
    result.push_str(content);

    // Traditional hard break: re-attach exactly two spaces, but only when
    // there is actual content in front of them.
    if line.ends_with("  ") && !content.is_empty() {
        result.push_str("  ");
    }
    result
}
340
341pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
342    // For sentence-per-line mode, always process regardless of length
343    if options.sentence_per_line {
344        let elements = parse_markdown_elements(line);
345        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
346    }
347
348    // Quick check: if line is already short enough or no wrapping requested, return as-is
349    // line_length = 0 means no wrapping (unlimited line length)
350    if options.line_length == 0 || line.chars().count() <= options.line_length {
351        return vec![line.to_string()];
352    }
353
354    // Parse the markdown to identify elements
355    let elements = parse_markdown_elements(line);
356
357    // Reflow the elements into lines
358    reflow_elements(&elements, options)
359}
360
/// Image source in a linked image structure
///
/// Stored in the `img_source` field of `Element::LinkedImage`; it records
/// which Markdown syntax the inner image used so it can be written back in
/// the same form.
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Inline image URL: ![alt](url) - holds the URL text between the parentheses
    Inline(String),
    /// Reference image: ![alt][ref] - holds the reference label between the brackets
    Reference(String),
}
369
/// Link target in a linked image structure
///
/// Stored in the `link_target` field of `Element::LinkedImage`; it records
/// which Markdown syntax the outer link used so it can be written back in
/// the same form.
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Inline link URL: ](url) - holds the URL text between the parentheses
    Inline(String),
    /// Reference link: ][ref] - holds the reference label between the brackets
    Reference(String),
}
378
/// Represents a piece of content in the markdown
///
/// Most variants store only the inner content, without the surrounding
/// delimiters; the `Display` impl adds the delimiters back and `len` accounts
/// for them. `HtmlTag`, `HtmlEntity` and `HugoShortcode` instead store the
/// complete original text and are written out verbatim.
#[derive(Debug, Clone)]
enum Element {
    /// Plain text that can be wrapped
    Text(String),
    /// A complete markdown inline link [text](url)
    Link { text: String, url: String },
    /// A complete markdown reference link [text][ref]
    ReferenceLink { text: String, reference: String },
    /// A complete markdown empty reference link [text][]
    EmptyReferenceLink { text: String },
    /// A complete markdown shortcut reference link [ref]
    ShortcutReference { reference: String },
    /// A complete markdown inline image ![alt](url)
    InlineImage { alt: String, url: String },
    /// A complete markdown reference image ![alt][ref]
    ReferenceImage { alt: String, reference: String },
    /// A complete markdown empty reference image ![alt][]
    EmptyReferenceImage { alt: String },
    /// A clickable image badge in any of 4 forms:
    /// - [![alt](img-url)](link-url)
    /// - [![alt][img-ref]](link-url)
    /// - [![alt](img-url)][link-ref]
    /// - [![alt][img-ref]][link-ref]
    LinkedImage {
        /// Alt text of the inner image
        alt: String,
        /// How the inner image is specified (inline URL or reference)
        img_source: LinkedImageSource,
        /// How the outer link is specified (inline URL or reference)
        link_target: LinkedImageTarget,
    },
    /// Footnote reference [^note] - `note` excludes the `[^` and `]`
    FootnoteReference { note: String },
    /// Strikethrough text ~~text~~
    Strikethrough(String),
    /// Wiki-style link [[wiki]] or [[wiki|text]] - holds everything between `[[` and `]]`
    WikiLink(String),
    /// Inline math $math$
    InlineMath(String),
    /// Display math $$math$$
    DisplayMath(String),
    /// Emoji shortcode :emoji: - holds the name without the colons
    EmojiShortcode(String),
    /// HTML tag <tag> or </tag> or <tag/> - stored with its angle brackets
    HtmlTag(String),
    /// HTML entity &nbsp; or &#123; - stored complete
    HtmlEntity(String),
    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}} - stored complete
    HugoShortcode(String),
    /// Inline code `code` - holds the content; written back with single backticks
    Code(String),
    /// Bold text **text** or __text__
    Bold {
        content: String,
        /// True if underscore markers (__), false for asterisks (**)
        underscore: bool,
    },
    /// Italic text *text* or _text_
    Italic {
        content: String,
        /// True if underscore marker (_), false for asterisk (*)
        underscore: bool,
    },
}
441
442impl std::fmt::Display for Element {
443    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
444        match self {
445            Element::Text(s) => write!(f, "{s}"),
446            Element::Link { text, url } => write!(f, "[{text}]({url})"),
447            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
448            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
449            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
450            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
451            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
452            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
453            Element::LinkedImage {
454                alt,
455                img_source,
456                link_target,
457            } => {
458                // Build the image part: ![alt](url) or ![alt][ref]
459                let img_part = match img_source {
460                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
461                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
462                };
463                // Build the link part: (url) or [ref]
464                match link_target {
465                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
466                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
467                }
468            }
469            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
470            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
471            Element::WikiLink(s) => write!(f, "[[{s}]]"),
472            Element::InlineMath(s) => write!(f, "${s}$"),
473            Element::DisplayMath(s) => write!(f, "$${s}$$"),
474            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
475            Element::HtmlTag(s) => write!(f, "{s}"),
476            Element::HtmlEntity(s) => write!(f, "{s}"),
477            Element::HugoShortcode(s) => write!(f, "{s}"),
478            Element::Code(s) => write!(f, "`{s}`"),
479            Element::Bold { content, underscore } => {
480                if *underscore {
481                    write!(f, "__{content}__")
482                } else {
483                    write!(f, "**{content}**")
484                }
485            }
486            Element::Italic { content, underscore } => {
487                if *underscore {
488                    write!(f, "_{content}_")
489                } else {
490                    write!(f, "*{content}*")
491                }
492            }
493        }
494    }
495}
496
497impl Element {
498    fn len(&self) -> usize {
499        match self {
500            Element::Text(s) => s.chars().count(),
501            Element::Link { text, url } => text.chars().count() + url.chars().count() + 4, // [text](url)
502            Element::ReferenceLink { text, reference } => text.chars().count() + reference.chars().count() + 4, // [text][ref]
503            Element::EmptyReferenceLink { text } => text.chars().count() + 4, // [text][]
504            Element::ShortcutReference { reference } => reference.chars().count() + 2, // [ref]
505            Element::InlineImage { alt, url } => alt.chars().count() + url.chars().count() + 5, // ![alt](url)
506            Element::ReferenceImage { alt, reference } => alt.chars().count() + reference.chars().count() + 5, // ![alt][ref]
507            Element::EmptyReferenceImage { alt } => alt.chars().count() + 5, // ![alt][]
508            Element::LinkedImage {
509                alt,
510                img_source,
511                link_target,
512            } => {
513                // Calculate length based on variant
514                // Base: [ + ![alt] + ] = 4 chars for outer brackets and !
515                let alt_len = alt.chars().count();
516                let img_len = match img_source {
517                    LinkedImageSource::Inline(url) => url.chars().count() + 2, // (url)
518                    LinkedImageSource::Reference(r) => r.chars().count() + 2,  // [ref]
519                };
520                let link_len = match link_target {
521                    LinkedImageTarget::Inline(url) => url.chars().count() + 2, // (url)
522                    LinkedImageTarget::Reference(r) => r.chars().count() + 2,  // [ref]
523                };
524                // [![alt](img)](link) = [ + ! + [ + alt + ] + (img) + ] + (link)
525                //                     = 1 + 1 + 1 + alt + 1 + img_len + 1 + link_len = 5 + alt + img + link
526                5 + alt_len + img_len + link_len
527            }
528            Element::FootnoteReference { note } => note.chars().count() + 3, // [^note]
529            Element::Strikethrough(s) => s.chars().count() + 4,              // ~~text~~
530            Element::WikiLink(s) => s.chars().count() + 4,                   // [[wiki]]
531            Element::InlineMath(s) => s.chars().count() + 2,                 // $math$
532            Element::DisplayMath(s) => s.chars().count() + 4,                // $$math$$
533            Element::EmojiShortcode(s) => s.chars().count() + 2,             // :emoji:
534            Element::HtmlTag(s) => s.chars().count(),                        // <tag> - already includes brackets
535            Element::HtmlEntity(s) => s.chars().count(),                     // &nbsp; - already complete
536            Element::HugoShortcode(s) => s.chars().count(),                  // {{< ... >}} - already complete
537            Element::Code(s) => s.chars().count() + 2,                       // `code`
538            Element::Bold { content, .. } => content.chars().count() + 4,    // **text** or __text__
539            Element::Italic { content, .. } => content.chars().count() + 2,  // *text* or _text_
540        }
541    }
542}
543
/// An emphasis or formatting span parsed by pulldown-cmark
///
/// Produced by `extract_emphasis_spans`. `start` and `end` are byte offsets
/// into the text that was parsed, and cover the markers as well as the content.
#[derive(Debug, Clone)]
struct EmphasisSpan {
    /// Byte offset where the emphasis starts (including markers)
    start: usize,
    /// Byte offset where the emphasis ends (after closing markers)
    end: usize,
    /// The content inside the emphasis markers (markers themselves excluded)
    content: String,
    /// Whether this is strong (bold) emphasis
    is_strong: bool,
    /// Whether this is strikethrough (~~text~~)
    is_strikethrough: bool,
    /// Whether the original used underscore markers (for emphasis only);
    /// always `false` for strikethrough spans
    uses_underscore: bool,
}
560
561/// Extract emphasis and strikethrough spans from text using pulldown-cmark
562///
563/// This provides CommonMark-compliant emphasis parsing, correctly handling:
564/// - Nested emphasis like `*text **bold** more*`
565/// - Left/right flanking delimiter rules
566/// - Underscore vs asterisk markers
567/// - GFM strikethrough (~~text~~)
568///
569/// Returns spans sorted by start position.
570fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
571    let mut spans = Vec::new();
572    let mut options = Options::empty();
573    options.insert(Options::ENABLE_STRIKETHROUGH);
574
575    // Stacks to track nested formatting with their start positions
576    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
577    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
578    let mut strikethrough_stack: Vec<usize> = Vec::new();
579
580    let parser = Parser::new_ext(text, options).into_offset_iter();
581
582    for (event, range) in parser {
583        match event {
584            Event::Start(Tag::Emphasis) => {
585                // Check if this uses underscore by looking at the original text
586                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
587                emphasis_stack.push((range.start, uses_underscore));
588            }
589            Event::End(TagEnd::Emphasis) => {
590                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
591                    // Extract content between the markers (1 char marker on each side)
592                    let content_start = start_byte + 1;
593                    let content_end = range.end - 1;
594                    if content_end > content_start
595                        && let Some(content) = text.get(content_start..content_end)
596                    {
597                        spans.push(EmphasisSpan {
598                            start: start_byte,
599                            end: range.end,
600                            content: content.to_string(),
601                            is_strong: false,
602                            is_strikethrough: false,
603                            uses_underscore,
604                        });
605                    }
606                }
607            }
608            Event::Start(Tag::Strong) => {
609                // Check if this uses underscore by looking at the original text
610                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
611                strong_stack.push((range.start, uses_underscore));
612            }
613            Event::End(TagEnd::Strong) => {
614                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
615                    // Extract content between the markers (2 char marker on each side)
616                    let content_start = start_byte + 2;
617                    let content_end = range.end - 2;
618                    if content_end > content_start
619                        && let Some(content) = text.get(content_start..content_end)
620                    {
621                        spans.push(EmphasisSpan {
622                            start: start_byte,
623                            end: range.end,
624                            content: content.to_string(),
625                            is_strong: true,
626                            is_strikethrough: false,
627                            uses_underscore,
628                        });
629                    }
630                }
631            }
632            Event::Start(Tag::Strikethrough) => {
633                strikethrough_stack.push(range.start);
634            }
635            Event::End(TagEnd::Strikethrough) => {
636                if let Some(start_byte) = strikethrough_stack.pop() {
637                    // Extract content between the ~~ markers (2 char marker on each side)
638                    let content_start = start_byte + 2;
639                    let content_end = range.end - 2;
640                    if content_end > content_start
641                        && let Some(content) = text.get(content_start..content_end)
642                    {
643                        spans.push(EmphasisSpan {
644                            start: start_byte,
645                            end: range.end,
646                            content: content.to_string(),
647                            is_strong: false,
648                            is_strikethrough: true,
649                            uses_underscore: false,
650                        });
651                    }
652                }
653            }
654            _ => {}
655        }
656    }
657
658    // Sort by start position
659    spans.sort_by_key(|s| s.start);
660    spans
661}
662
663/// Parse markdown elements from text preserving the raw syntax
664///
665/// Detection order is critical:
666/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
667/// 2. Inline images ![alt](url) - before links to handle ! prefix
668/// 3. Reference images ![alt][ref] - before reference links
669/// 4. Inline links [text](url) - before reference links
670/// 5. Reference links [text][ref] - before shortcut references
671/// 6. Shortcut reference links [ref] - detected last to avoid false positives
672/// 7. Other elements (code, bold, italic, etc.) - processed normally
673fn parse_markdown_elements(text: &str) -> Vec<Element> {
674    let mut elements = Vec::new();
675    let mut remaining = text;
676
677    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
678    let emphasis_spans = extract_emphasis_spans(text);
679
680    while !remaining.is_empty() {
681        // Calculate current byte offset in original text
682        let current_offset = text.len() - remaining.len();
683        // Find the earliest occurrence of any markdown pattern
684        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
685
686        // Check for linked images FIRST (all 4 variants)
687        // Quick literal check: only run expensive regexes if we might have a linked image
688        // Pattern starts with "[!" so check for that first
689        if remaining.contains("[!") {
690            // Pattern 1: [![alt](img)](link) - inline image in inline link
691            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
692                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
693            {
694                earliest_match = Some((m.start(), "linked_image_ii", m));
695            }
696
697            // Pattern 2: [![alt][ref]](link) - reference image in inline link
698            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
699                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
700            {
701                earliest_match = Some((m.start(), "linked_image_ri", m));
702            }
703
704            // Pattern 3: [![alt](img)][ref] - inline image in reference link
705            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
706                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
707            {
708                earliest_match = Some((m.start(), "linked_image_ir", m));
709            }
710
711            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
712            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
713                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
714            {
715                earliest_match = Some((m.start(), "linked_image_rr", m));
716            }
717        }
718
719        // Check for images (they start with ! so should be detected before links)
720        // Inline images - ![alt](url)
721        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
722            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
723        {
724            earliest_match = Some((m.start(), "inline_image", m));
725        }
726
727        // Reference images - ![alt][ref]
728        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
729            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
730        {
731            earliest_match = Some((m.start(), "ref_image", m));
732        }
733
734        // Check for footnote references - [^note]
735        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
736            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
737        {
738            earliest_match = Some((m.start(), "footnote_ref", m));
739        }
740
741        // Check for inline links - [text](url)
742        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
743            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
744        {
745            earliest_match = Some((m.start(), "inline_link", m));
746        }
747
748        // Check for reference links - [text][ref]
749        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
750            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
751        {
752            earliest_match = Some((m.start(), "ref_link", m));
753        }
754
755        // Check for shortcut reference links - [ref]
756        // Only check if we haven't found an earlier pattern that would conflict
757        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
758            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
759        {
760            earliest_match = Some((m.start(), "shortcut_ref", m));
761        }
762
763        // Check for wiki-style links - [[wiki]]
764        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
765            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
766        {
767            earliest_match = Some((m.start(), "wiki_link", m));
768        }
769
770        // Check for display math first (before inline) - $$math$$
771        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
772            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
773        {
774            earliest_match = Some((m.start(), "display_math", m));
775        }
776
777        // Check for inline math - $math$
778        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
779            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
780        {
781            earliest_match = Some((m.start(), "inline_math", m));
782        }
783
784        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
785
786        // Check for emoji shortcodes - :emoji:
787        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
788            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
789        {
790            earliest_match = Some((m.start(), "emoji", m));
791        }
792
793        // Check for HTML entities - &nbsp; etc
794        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
795            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
796        {
797            earliest_match = Some((m.start(), "html_entity", m));
798        }
799
800        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
801        // Must be checked before other patterns to avoid false sentence breaks
802        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
803            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
804        {
805            earliest_match = Some((m.start(), "hugo_shortcode", m));
806        }
807
808        // Check for HTML tags - <tag> </tag> <tag/>
809        // But exclude autolinks like <https://...> or <mailto:...> or email autolinks <user@domain.com>
810        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
811            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
812        {
813            // Check if this is an autolink (starts with protocol or mailto:)
814            let matched_text = &remaining[m.start()..m.end()];
815            let is_url_autolink = matched_text.starts_with("<http://")
816                || matched_text.starts_with("<https://")
817                || matched_text.starts_with("<mailto:")
818                || matched_text.starts_with("<ftp://")
819                || matched_text.starts_with("<ftps://");
820
821            // Check if this is an email autolink (per CommonMark spec: <local@domain.tld>)
822            // Use centralized EMAIL_PATTERN for consistency with MD034 and other rules
823            let is_email_autolink = {
824                let content = matched_text.trim_start_matches('<').trim_end_matches('>');
825                EMAIL_PATTERN.is_match(content)
826            };
827
828            if !is_url_autolink && !is_email_autolink {
829                earliest_match = Some((m.start(), "html_tag", m));
830            }
831        }
832
833        // Find earliest non-link special characters
834        let mut next_special = remaining.len();
835        let mut special_type = "";
836        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
837
838        // Check for code spans (not handled by pulldown-cmark in this context)
839        if let Some(pos) = remaining.find('`')
840            && pos < next_special
841        {
842            next_special = pos;
843            special_type = "code";
844        }
845
846        // Check for emphasis using pulldown-cmark's pre-extracted spans
847        // Find the earliest emphasis span that starts within remaining text
848        for span in &emphasis_spans {
849            if span.start >= current_offset && span.start < current_offset + remaining.len() {
850                let pos_in_remaining = span.start - current_offset;
851                if pos_in_remaining < next_special {
852                    next_special = pos_in_remaining;
853                    special_type = "pulldown_emphasis";
854                    pulldown_emphasis = Some(span);
855                }
856                break; // Spans are sorted by start position, so first match is earliest
857            }
858        }
859
860        // Determine which pattern to process first
861        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
862            pos < next_special
863        } else {
864            false
865        };
866
867        if should_process_markdown_link {
868            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
869
870            // Add any text before the match
871            if pos > 0 {
872                elements.push(Element::Text(remaining[..pos].to_string()));
873            }
874
875            // Process the matched pattern
876            match pattern_type {
877                // Pattern 1: [![alt](img)](link) - inline image in inline link
878                "linked_image_ii" => {
879                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
880                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
881                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
882                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
883                        elements.push(Element::LinkedImage {
884                            alt: alt.to_string(),
885                            img_source: LinkedImageSource::Inline(img_url.to_string()),
886                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
887                        });
888                        remaining = &remaining[match_obj.end()..];
889                    } else {
890                        elements.push(Element::Text("[".to_string()));
891                        remaining = &remaining[1..];
892                    }
893                }
894                // Pattern 2: [![alt][ref]](link) - reference image in inline link
895                "linked_image_ri" => {
896                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
897                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
898                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
899                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
900                        elements.push(Element::LinkedImage {
901                            alt: alt.to_string(),
902                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
903                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
904                        });
905                        remaining = &remaining[match_obj.end()..];
906                    } else {
907                        elements.push(Element::Text("[".to_string()));
908                        remaining = &remaining[1..];
909                    }
910                }
911                // Pattern 3: [![alt](img)][ref] - inline image in reference link
912                "linked_image_ir" => {
913                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
914                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
915                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
916                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
917                        elements.push(Element::LinkedImage {
918                            alt: alt.to_string(),
919                            img_source: LinkedImageSource::Inline(img_url.to_string()),
920                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
921                        });
922                        remaining = &remaining[match_obj.end()..];
923                    } else {
924                        elements.push(Element::Text("[".to_string()));
925                        remaining = &remaining[1..];
926                    }
927                }
928                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
929                "linked_image_rr" => {
930                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
931                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
932                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
933                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
934                        elements.push(Element::LinkedImage {
935                            alt: alt.to_string(),
936                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
937                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
938                        });
939                        remaining = &remaining[match_obj.end()..];
940                    } else {
941                        elements.push(Element::Text("[".to_string()));
942                        remaining = &remaining[1..];
943                    }
944                }
945                "inline_image" => {
946                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
947                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
948                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
949                        elements.push(Element::InlineImage {
950                            alt: alt.to_string(),
951                            url: url.to_string(),
952                        });
953                        remaining = &remaining[match_obj.end()..];
954                    } else {
955                        elements.push(Element::Text("!".to_string()));
956                        remaining = &remaining[1..];
957                    }
958                }
959                "ref_image" => {
960                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
961                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
962                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
963
964                        if reference.is_empty() {
965                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
966                        } else {
967                            elements.push(Element::ReferenceImage {
968                                alt: alt.to_string(),
969                                reference: reference.to_string(),
970                            });
971                        }
972                        remaining = &remaining[match_obj.end()..];
973                    } else {
974                        elements.push(Element::Text("!".to_string()));
975                        remaining = &remaining[1..];
976                    }
977                }
978                "footnote_ref" => {
979                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
980                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
981                        elements.push(Element::FootnoteReference { note: note.to_string() });
982                        remaining = &remaining[match_obj.end()..];
983                    } else {
984                        elements.push(Element::Text("[".to_string()));
985                        remaining = &remaining[1..];
986                    }
987                }
988                "inline_link" => {
989                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
990                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
991                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
992                        elements.push(Element::Link {
993                            text: text.to_string(),
994                            url: url.to_string(),
995                        });
996                        remaining = &remaining[match_obj.end()..];
997                    } else {
998                        // Fallback - shouldn't happen
999                        elements.push(Element::Text("[".to_string()));
1000                        remaining = &remaining[1..];
1001                    }
1002                }
1003                "ref_link" => {
1004                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
1005                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1006                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
1007
1008                        if reference.is_empty() {
1009                            // Empty reference link [text][]
1010                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1011                        } else {
1012                            // Regular reference link [text][ref]
1013                            elements.push(Element::ReferenceLink {
1014                                text: text.to_string(),
1015                                reference: reference.to_string(),
1016                            });
1017                        }
1018                        remaining = &remaining[match_obj.end()..];
1019                    } else {
1020                        // Fallback - shouldn't happen
1021                        elements.push(Element::Text("[".to_string()));
1022                        remaining = &remaining[1..];
1023                    }
1024                }
1025                "shortcut_ref" => {
1026                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1027                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1028                        elements.push(Element::ShortcutReference {
1029                            reference: reference.to_string(),
1030                        });
1031                        remaining = &remaining[match_obj.end()..];
1032                    } else {
1033                        // Fallback - shouldn't happen
1034                        elements.push(Element::Text("[".to_string()));
1035                        remaining = &remaining[1..];
1036                    }
1037                }
1038                "wiki_link" => {
1039                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1040                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1041                        elements.push(Element::WikiLink(content.to_string()));
1042                        remaining = &remaining[match_obj.end()..];
1043                    } else {
1044                        elements.push(Element::Text("[[".to_string()));
1045                        remaining = &remaining[2..];
1046                    }
1047                }
1048                "display_math" => {
1049                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1050                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1051                        elements.push(Element::DisplayMath(math.to_string()));
1052                        remaining = &remaining[match_obj.end()..];
1053                    } else {
1054                        elements.push(Element::Text("$$".to_string()));
1055                        remaining = &remaining[2..];
1056                    }
1057                }
1058                "inline_math" => {
1059                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1060                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1061                        elements.push(Element::InlineMath(math.to_string()));
1062                        remaining = &remaining[match_obj.end()..];
1063                    } else {
1064                        elements.push(Element::Text("$".to_string()));
1065                        remaining = &remaining[1..];
1066                    }
1067                }
1068                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1069                "emoji" => {
1070                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1071                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1072                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1073                        remaining = &remaining[match_obj.end()..];
1074                    } else {
1075                        elements.push(Element::Text(":".to_string()));
1076                        remaining = &remaining[1..];
1077                    }
1078                }
1079                "html_entity" => {
1080                    // HTML entities are captured whole - use as_str() to get just the matched content
1081                    elements.push(Element::HtmlEntity(match_obj.as_str().to_string()));
1082                    remaining = &remaining[match_obj.end()..];
1083                }
1084                "hugo_shortcode" => {
1085                    // Hugo shortcodes are atomic elements - preserve them exactly
1086                    elements.push(Element::HugoShortcode(match_obj.as_str().to_string()));
1087                    remaining = &remaining[match_obj.end()..];
1088                }
1089                "html_tag" => {
1090                    // HTML tags are captured whole - use as_str() to get just the matched content
1091                    elements.push(Element::HtmlTag(match_obj.as_str().to_string()));
1092                    remaining = &remaining[match_obj.end()..];
1093                }
1094                _ => {
1095                    // Unknown pattern, treat as text
1096                    elements.push(Element::Text("[".to_string()));
1097                    remaining = &remaining[1..];
1098                }
1099            }
1100        } else {
1101            // Process non-link special characters
1102
1103            // Add any text before the special character
1104            if next_special > 0 && next_special < remaining.len() {
1105                elements.push(Element::Text(remaining[..next_special].to_string()));
1106                remaining = &remaining[next_special..];
1107            }
1108
1109            // Process the special element
1110            match special_type {
1111                "code" => {
1112                    // Find end of code
1113                    if let Some(code_end) = remaining[1..].find('`') {
1114                        let code = &remaining[1..1 + code_end];
1115                        elements.push(Element::Code(code.to_string()));
1116                        remaining = &remaining[1 + code_end + 1..];
1117                    } else {
1118                        // No closing backtick, treat as text
1119                        elements.push(Element::Text(remaining.to_string()));
1120                        break;
1121                    }
1122                }
1123                "pulldown_emphasis" => {
1124                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1125                    if let Some(span) = pulldown_emphasis {
1126                        let span_len = span.end - span.start;
1127                        if span.is_strikethrough {
1128                            elements.push(Element::Strikethrough(span.content.clone()));
1129                        } else if span.is_strong {
1130                            elements.push(Element::Bold {
1131                                content: span.content.clone(),
1132                                underscore: span.uses_underscore,
1133                            });
1134                        } else {
1135                            elements.push(Element::Italic {
1136                                content: span.content.clone(),
1137                                underscore: span.uses_underscore,
1138                            });
1139                        }
1140                        remaining = &remaining[span_len..];
1141                    } else {
1142                        // Fallback - shouldn't happen
1143                        elements.push(Element::Text(remaining[..1].to_string()));
1144                        remaining = &remaining[1..];
1145                    }
1146                }
1147                _ => {
1148                    // No special elements found, add all remaining text
1149                    elements.push(Element::Text(remaining.to_string()));
1150                    break;
1151                }
1152            }
1153        }
1154    }
1155
1156    elements
1157}
1158
1159/// Reflow elements for sentence-per-line mode
1160fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1161    let abbreviations = get_abbreviations(custom_abbreviations);
1162    let mut lines = Vec::new();
1163    let mut current_line = String::new();
1164
1165    for element in elements.iter() {
1166        let element_str = format!("{element}");
1167
1168        // For text elements, split into sentences
1169        if let Element::Text(text) = element {
1170            // Simply append text - it already has correct spacing from tokenization
1171            let combined = format!("{current_line}{text}");
1172            // Use the pre-computed abbreviations set to avoid redundant computation
1173            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1174
1175            if sentences.len() > 1 {
1176                // We found sentence boundaries
1177                for (i, sentence) in sentences.iter().enumerate() {
1178                    if i == 0 {
1179                        // First sentence might continue from previous elements
1180                        // But check if it ends with an abbreviation
1181                        let trimmed = sentence.trim();
1182
1183                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1184                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1185                            current_line = sentence.to_string();
1186                        } else {
1187                            // Normal case - emit the first sentence
1188                            lines.push(sentence.to_string());
1189                            current_line.clear();
1190                        }
1191                    } else if i == sentences.len() - 1 {
1192                        // Last sentence: check if it's complete or incomplete
1193                        let trimmed = sentence.trim();
1194                        let ends_with_sentence_punct =
1195                            trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1196
1197                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1198                            // Complete sentence - emit it immediately
1199                            lines.push(sentence.to_string());
1200                            current_line.clear();
1201                        } else {
1202                            // Incomplete sentence - save for next iteration
1203                            current_line = sentence.to_string();
1204                        }
1205                    } else {
1206                        // Complete sentences in the middle
1207                        lines.push(sentence.to_string());
1208                    }
1209                }
1210            } else {
1211                // Single sentence - check if it's complete
1212                let trimmed = combined.trim();
1213                let ends_with_sentence_punct =
1214                    trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1215
1216                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1217                    // Complete single sentence - emit it
1218                    lines.push(trimmed.to_string());
1219                    current_line.clear();
1220                } else {
1221                    // Incomplete sentence - continue accumulating
1222                    current_line = combined;
1223                }
1224            }
1225        } else if let Element::Italic { content, underscore } = element {
1226            // Handle italic elements - may contain multiple sentences that need continuation
1227            let marker = if *underscore { "_" } else { "*" };
1228            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1229        } else if let Element::Bold { content, underscore } = element {
1230            // Handle bold elements - may contain multiple sentences that need continuation
1231            let marker = if *underscore { "__" } else { "**" };
1232            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1233        } else if let Element::Strikethrough(content) = element {
1234            // Handle strikethrough elements - may contain multiple sentences that need continuation
1235            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1236        } else {
1237            // Non-text, non-emphasis elements (Code, Links, etc.)
1238            // Add space before element if needed (unless it's after an opening paren/bracket)
1239            if !current_line.is_empty()
1240                && !current_line.ends_with(' ')
1241                && !current_line.ends_with('(')
1242                && !current_line.ends_with('[')
1243            {
1244                current_line.push(' ');
1245            }
1246            current_line.push_str(&element_str);
1247        }
1248    }
1249
1250    // Add any remaining content
1251    if !current_line.is_empty() {
1252        lines.push(current_line.trim().to_string());
1253    }
1254    lines
1255}
1256
1257/// Handle splitting emphasis content at sentence boundaries while preserving markers
1258fn handle_emphasis_sentence_split(
1259    content: &str,
1260    marker: &str,
1261    abbreviations: &HashSet<String>,
1262    current_line: &mut String,
1263    lines: &mut Vec<String>,
1264) {
1265    // Split the emphasis content into sentences
1266    let sentences = split_into_sentences_with_set(content, abbreviations);
1267
1268    if sentences.len() <= 1 {
1269        // Single sentence or no boundaries - treat as atomic
1270        if !current_line.is_empty()
1271            && !current_line.ends_with(' ')
1272            && !current_line.ends_with('(')
1273            && !current_line.ends_with('[')
1274        {
1275            current_line.push(' ');
1276        }
1277        current_line.push_str(marker);
1278        current_line.push_str(content);
1279        current_line.push_str(marker);
1280
1281        // Check if the emphasis content ends with sentence punctuation - if so, emit
1282        let trimmed = content.trim();
1283        let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1284        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1285            lines.push(current_line.clone());
1286            current_line.clear();
1287        }
1288    } else {
1289        // Multiple sentences - each gets its own emphasis markers
1290        for (i, sentence) in sentences.iter().enumerate() {
1291            let trimmed = sentence.trim();
1292            if trimmed.is_empty() {
1293                continue;
1294            }
1295
1296            if i == 0 {
1297                // First sentence: combine with current_line and emit
1298                if !current_line.is_empty()
1299                    && !current_line.ends_with(' ')
1300                    && !current_line.ends_with('(')
1301                    && !current_line.ends_with('[')
1302                {
1303                    current_line.push(' ');
1304                }
1305                current_line.push_str(marker);
1306                current_line.push_str(trimmed);
1307                current_line.push_str(marker);
1308
1309                // Check if this is a complete sentence
1310                let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1311                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1312                    lines.push(current_line.clone());
1313                    current_line.clear();
1314                }
1315            } else if i == sentences.len() - 1 {
1316                // Last sentence: check if complete
1317                let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1318
1319                let mut line = String::new();
1320                line.push_str(marker);
1321                line.push_str(trimmed);
1322                line.push_str(marker);
1323
1324                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1325                    lines.push(line);
1326                } else {
1327                    // Incomplete - keep in current_line for potential continuation
1328                    *current_line = line;
1329                }
1330            } else {
1331                // Middle sentences: emit with markers
1332                let mut line = String::new();
1333                line.push_str(marker);
1334                line.push_str(trimmed);
1335                line.push_str(marker);
1336                lines.push(line);
1337            }
1338        }
1339    }
1340}
1341
1342/// Reflow elements into lines that fit within the line length
1343fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1344    let mut lines = Vec::new();
1345    let mut current_line = String::new();
1346    let mut current_length = 0;
1347
1348    for element in elements {
1349        let element_str = format!("{element}");
1350        let element_len = element.len();
1351
1352        // For text elements that might need breaking
1353        if let Element::Text(text) = element {
1354            // Check if original text had leading whitespace
1355            let has_leading_space = text.starts_with(char::is_whitespace);
1356            // If this is a text element, always process it word by word
1357            let words: Vec<&str> = text.split_whitespace().collect();
1358
1359            for (i, word) in words.iter().enumerate() {
1360                let word_len = word.chars().count();
1361                // Check if this "word" is just punctuation that should stay attached
1362                let is_trailing_punct = word
1363                    .chars()
1364                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1365
1366                if current_length > 0 && current_length + 1 + word_len > options.line_length && !is_trailing_punct {
1367                    // Start a new line (but never for trailing punctuation)
1368                    lines.push(current_line.trim().to_string());
1369                    current_line = word.to_string();
1370                    current_length = word_len;
1371                } else {
1372                    // Add word to current line
1373                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1374                    // AND this isn't trailing punctuation (which attaches directly)
1375                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1376                        current_line.push(' ');
1377                        current_length += 1;
1378                    }
1379                    current_line.push_str(word);
1380                    current_length += word_len;
1381                }
1382            }
1383        } else {
1384            // For non-text elements (code, links, references), treat as atomic units
1385            // These should never be broken across lines
1386            if current_length > 0 && current_length + 1 + element_len > options.line_length {
1387                // Start a new line
1388                lines.push(current_line.trim().to_string());
1389                current_line = element_str;
1390                current_length = element_len;
1391            } else {
1392                // Add element to current line
1393                // Don't add space if the current line ends with an opening bracket/paren
1394                let ends_with_opener =
1395                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1396                if current_length > 0 && !ends_with_opener {
1397                    current_line.push(' ');
1398                    current_length += 1;
1399                }
1400                current_line.push_str(&element_str);
1401                current_length += element_len;
1402            }
1403        }
1404    }
1405
1406    // Don't forget the last line
1407    if !current_line.is_empty() {
1408        lines.push(current_line.trim_end().to_string());
1409    }
1410
1411    lines
1412}
1413
1414/// Reflow markdown content preserving structure
1415pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1416    let lines: Vec<&str> = content.lines().collect();
1417    let mut result = Vec::new();
1418    let mut i = 0;
1419
1420    while i < lines.len() {
1421        let line = lines[i];
1422        let trimmed = line.trim();
1423
1424        // Preserve empty lines
1425        if trimmed.is_empty() {
1426            result.push(String::new());
1427            i += 1;
1428            continue;
1429        }
1430
1431        // Preserve headings as-is
1432        if trimmed.starts_with('#') {
1433            result.push(line.to_string());
1434            i += 1;
1435            continue;
1436        }
1437
1438        // Preserve fenced code blocks
1439        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1440            result.push(line.to_string());
1441            i += 1;
1442            // Copy lines until closing fence
1443            while i < lines.len() {
1444                result.push(lines[i].to_string());
1445                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1446                    i += 1;
1447                    break;
1448                }
1449                i += 1;
1450            }
1451            continue;
1452        }
1453
1454        // Preserve indented code blocks (4+ columns accounting for tab expansion)
1455        if ElementCache::calculate_indentation_width_default(line) >= 4 {
1456            // Collect all consecutive indented lines
1457            result.push(line.to_string());
1458            i += 1;
1459            while i < lines.len() {
1460                let next_line = lines[i];
1461                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1462                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
1463                    result.push(next_line.to_string());
1464                    i += 1;
1465                } else {
1466                    break;
1467                }
1468            }
1469            continue;
1470        }
1471
1472        // Preserve block quotes (but reflow their content)
1473        if trimmed.starts_with('>') {
1474            // find() returns byte position which is correct for str slicing
1475            // The unwrap is safe because we already verified trimmed starts with '>'
1476            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
1477            let quote_prefix = line[0..gt_pos + 1].to_string();
1478            let quote_content = &line[quote_prefix.len()..].trim_start();
1479
1480            let reflowed = reflow_line(quote_content, options);
1481            for reflowed_line in reflowed.iter() {
1482                result.push(format!("{quote_prefix} {reflowed_line}"));
1483            }
1484            i += 1;
1485            continue;
1486        }
1487
1488        // Preserve horizontal rules first (before checking for lists)
1489        if is_horizontal_rule(trimmed) {
1490            result.push(line.to_string());
1491            i += 1;
1492            continue;
1493        }
1494
1495        // Preserve lists (but not horizontal rules)
1496        // A valid unordered list marker must be followed by a space (or be alone on line)
1497        // This prevents emphasis markers like "*text*" from being parsed as list items
1498        let is_unordered_list = |s: &str, marker: char| -> bool {
1499            s.starts_with(marker) && !is_horizontal_rule(s) && (s.len() == 1 || s.chars().nth(1) == Some(' '))
1500        };
1501        if is_unordered_list(trimmed, '-')
1502            || is_unordered_list(trimmed, '*')
1503            || is_unordered_list(trimmed, '+')
1504            || is_numbered_list_item(trimmed)
1505        {
1506            // Find the list marker and preserve indentation
1507            let indent = line.len() - line.trim_start().len();
1508            let indent_str = " ".repeat(indent);
1509
1510            // For numbered lists, find the period and the space after it
1511            // For bullet lists, find the marker and the space after it
1512            let mut marker_end = indent;
1513            let mut content_start = indent;
1514
1515            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1516                // Numbered list: find the period
1517                if let Some(period_pos) = line[indent..].find('.') {
1518                    marker_end = indent + period_pos + 1; // Include the period
1519                    content_start = marker_end;
1520                    // Skip any spaces after the period to find content start
1521                    // Use byte-based check since content_start is a byte index
1522                    // This is safe because space is ASCII (single byte)
1523                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1524                        content_start += 1;
1525                    }
1526                }
1527            } else {
1528                // Bullet list: marker is single character
1529                marker_end = indent + 1; // Just the marker character
1530                content_start = marker_end;
1531                // Skip any spaces after the marker
1532                // Use byte-based check since content_start is a byte index
1533                // This is safe because space is ASCII (single byte)
1534                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1535                    content_start += 1;
1536                }
1537            }
1538
1539            let marker = &line[indent..marker_end];
1540
1541            // Collect all content for this list item (including continuation lines)
1542            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1543            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1544            i += 1;
1545
1546            // Collect continuation lines (indented lines that are part of this list item)
1547            while i < lines.len() {
1548                let next_line = lines[i];
1549                let next_trimmed = next_line.trim();
1550
1551                // Stop if we hit an empty line or another list item or special block
1552                if next_trimmed.is_empty()
1553                    || next_trimmed.starts_with('#')
1554                    || next_trimmed.starts_with("```")
1555                    || next_trimmed.starts_with("~~~")
1556                    || next_trimmed.starts_with('>')
1557                    || next_trimmed.starts_with('|')
1558                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1559                    || is_horizontal_rule(next_trimmed)
1560                    || (next_trimmed.starts_with('-')
1561                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1562                    || (next_trimmed.starts_with('*')
1563                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1564                    || (next_trimmed.starts_with('+')
1565                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1566                    || is_numbered_list_item(next_trimmed)
1567                    || is_definition_list_item(next_trimmed)
1568                {
1569                    break;
1570                }
1571
1572                // Check if this line is indented (continuation of list item)
1573                let next_indent = next_line.len() - next_line.trim_start().len();
1574                if next_indent >= content_start {
1575                    // This is a continuation line - add its content
1576                    // Preserve hard breaks while trimming excessive whitespace
1577                    let trimmed_start = next_line.trim_start();
1578                    list_content.push(trim_preserving_hard_break(trimmed_start));
1579                    i += 1;
1580                } else {
1581                    // Not indented enough, not part of this list item
1582                    break;
1583                }
1584            }
1585
1586            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1587            // Hard breaks should prevent joining with the next line
1588            let combined_content = if options.preserve_breaks {
1589                list_content[0].clone()
1590            } else {
1591                // Check if any lines have hard breaks - if so, preserve the structure
1592                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1593                if has_hard_breaks {
1594                    // Don't join lines with hard breaks - keep them separate with newlines
1595                    list_content.join("\n")
1596                } else {
1597                    // No hard breaks, safe to join with spaces
1598                    list_content.join(" ")
1599                }
1600            };
1601
1602            // Calculate the proper indentation for continuation lines
1603            let trimmed_marker = marker;
1604            let continuation_spaces = content_start;
1605
1606            // Adjust line length to account for list marker and space
1607            let prefix_length = indent + trimmed_marker.len() + 1;
1608
1609            // Create adjusted options with reduced line length
1610            let adjusted_options = ReflowOptions {
1611                line_length: options.line_length.saturating_sub(prefix_length),
1612                ..options.clone()
1613            };
1614
1615            let reflowed = reflow_line(&combined_content, &adjusted_options);
1616            for (j, reflowed_line) in reflowed.iter().enumerate() {
1617                if j == 0 {
1618                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1619                } else {
1620                    // Continuation lines aligned with text after marker
1621                    let continuation_indent = " ".repeat(continuation_spaces);
1622                    result.push(format!("{continuation_indent}{reflowed_line}"));
1623                }
1624            }
1625            continue;
1626        }
1627
1628        // Preserve tables
1629        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
1630            result.push(line.to_string());
1631            i += 1;
1632            continue;
1633        }
1634
1635        // Preserve reference definitions
1636        if trimmed.starts_with('[') && line.contains("]:") {
1637            result.push(line.to_string());
1638            i += 1;
1639            continue;
1640        }
1641
1642        // Preserve definition list items (extended markdown)
1643        if is_definition_list_item(trimmed) {
1644            result.push(line.to_string());
1645            i += 1;
1646            continue;
1647        }
1648
1649        // Check if this is a single line that doesn't need processing
1650        let mut is_single_line_paragraph = true;
1651        if i + 1 < lines.len() {
1652            let next_line = lines[i + 1];
1653            let next_trimmed = next_line.trim();
1654            // Check if next line starts a new block
1655            if !next_trimmed.is_empty()
1656                && !next_trimmed.starts_with('#')
1657                && !next_trimmed.starts_with("```")
1658                && !next_trimmed.starts_with("~~~")
1659                && !next_trimmed.starts_with('>')
1660                && !next_trimmed.starts_with('|')
1661                && !(next_trimmed.starts_with('[') && next_line.contains("]:"))
1662                && !is_horizontal_rule(next_trimmed)
1663                && !(next_trimmed.starts_with('-')
1664                    && !is_horizontal_rule(next_trimmed)
1665                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1666                && !(next_trimmed.starts_with('*')
1667                    && !is_horizontal_rule(next_trimmed)
1668                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1669                && !(next_trimmed.starts_with('+')
1670                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1671                && !is_numbered_list_item(next_trimmed)
1672            {
1673                is_single_line_paragraph = false;
1674            }
1675        }
1676
1677        // If it's a single line that fits, just add it as-is
1678        if is_single_line_paragraph && line.chars().count() <= options.line_length {
1679            result.push(line.to_string());
1680            i += 1;
1681            continue;
1682        }
1683
1684        // For regular paragraphs, collect consecutive lines
1685        let mut paragraph_parts = Vec::new();
1686        let mut current_part = vec![line];
1687        i += 1;
1688
1689        // If preserve_breaks is true, treat each line separately
1690        if options.preserve_breaks {
1691            // Don't collect consecutive lines - just reflow this single line
1692            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
1693                Some("\\")
1694            } else if line.ends_with("  ") {
1695                Some("  ")
1696            } else {
1697                None
1698            };
1699            let reflowed = reflow_line(line, options);
1700
1701            // Preserve hard breaks (two trailing spaces or backslash)
1702            if let Some(break_marker) = hard_break_type {
1703                if !reflowed.is_empty() {
1704                    let mut reflowed_with_break = reflowed;
1705                    let last_idx = reflowed_with_break.len() - 1;
1706                    if !has_hard_break(&reflowed_with_break[last_idx]) {
1707                        reflowed_with_break[last_idx].push_str(break_marker);
1708                    }
1709                    result.extend(reflowed_with_break);
1710                }
1711            } else {
1712                result.extend(reflowed);
1713            }
1714        } else {
1715            // Original behavior: collect consecutive lines into a paragraph
1716            while i < lines.len() {
1717                let prev_line = if !current_part.is_empty() {
1718                    current_part.last().unwrap()
1719                } else {
1720                    ""
1721                };
1722                let next_line = lines[i];
1723                let next_trimmed = next_line.trim();
1724
1725                // Stop at empty lines or special blocks
1726                if next_trimmed.is_empty()
1727                    || next_trimmed.starts_with('#')
1728                    || next_trimmed.starts_with("```")
1729                    || next_trimmed.starts_with("~~~")
1730                    || next_trimmed.starts_with('>')
1731                    || next_trimmed.starts_with('|')
1732                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1733                    || is_horizontal_rule(next_trimmed)
1734                    || (next_trimmed.starts_with('-')
1735                        && !is_horizontal_rule(next_trimmed)
1736                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1737                    || (next_trimmed.starts_with('*')
1738                        && !is_horizontal_rule(next_trimmed)
1739                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1740                    || (next_trimmed.starts_with('+')
1741                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1742                    || is_numbered_list_item(next_trimmed)
1743                    || is_definition_list_item(next_trimmed)
1744                {
1745                    break;
1746                }
1747
1748                // Check if previous line ends with hard break (two spaces or backslash)
1749                // or is a complete sentence in sentence_per_line mode
1750                let prev_trimmed = prev_line.trim();
1751                let abbreviations = get_abbreviations(&options.abbreviations);
1752                let ends_with_sentence = (prev_trimmed.ends_with('.')
1753                    || prev_trimmed.ends_with('!')
1754                    || prev_trimmed.ends_with('?')
1755                    || prev_trimmed.ends_with(".*")
1756                    || prev_trimmed.ends_with("!*")
1757                    || prev_trimmed.ends_with("?*")
1758                    || prev_trimmed.ends_with("._")
1759                    || prev_trimmed.ends_with("!_")
1760                    || prev_trimmed.ends_with("?_")
1761                    // Quote-terminated sentences (straight and curly quotes)
1762                    || prev_trimmed.ends_with(".\"")
1763                    || prev_trimmed.ends_with("!\"")
1764                    || prev_trimmed.ends_with("?\"")
1765                    || prev_trimmed.ends_with(".'")
1766                    || prev_trimmed.ends_with("!'")
1767                    || prev_trimmed.ends_with("?'")
1768                    || prev_trimmed.ends_with(".\u{201D}")
1769                    || prev_trimmed.ends_with("!\u{201D}")
1770                    || prev_trimmed.ends_with("?\u{201D}")
1771                    || prev_trimmed.ends_with(".\u{2019}")
1772                    || prev_trimmed.ends_with("!\u{2019}")
1773                    || prev_trimmed.ends_with("?\u{2019}"))
1774                    && !text_ends_with_abbreviation(
1775                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
1776                        &abbreviations,
1777                    );
1778
1779                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
1780                    // Start a new part after hard break or complete sentence
1781                    paragraph_parts.push(current_part.join(" "));
1782                    current_part = vec![next_line];
1783                } else {
1784                    current_part.push(next_line);
1785                }
1786                i += 1;
1787            }
1788
1789            // Add the last part
1790            if !current_part.is_empty() {
1791                if current_part.len() == 1 {
1792                    // Single line, don't add trailing space
1793                    paragraph_parts.push(current_part[0].to_string());
1794                } else {
1795                    paragraph_parts.push(current_part.join(" "));
1796                }
1797            }
1798
1799            // Reflow each part separately, preserving hard breaks
1800            for (j, part) in paragraph_parts.iter().enumerate() {
1801                let reflowed = reflow_line(part, options);
1802                result.extend(reflowed);
1803
1804                // Preserve hard break by ensuring last line of part ends with hard break marker
1805                // Use two spaces as the default hard break format for reflows
1806                // But don't add hard breaks in sentence_per_line mode - lines are already separate
1807                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
1808                    let last_idx = result.len() - 1;
1809                    if !has_hard_break(&result[last_idx]) {
1810                        result[last_idx].push_str("  ");
1811                    }
1812                }
1813            }
1814        }
1815    }
1816
1817    // Preserve trailing newline if the original content had one
1818    let result_text = result.join("\n");
1819    if content.ends_with('\n') && !result_text.ends_with('\n') {
1820        format!("{result_text}\n")
1821    } else {
1822        result_text
1823    }
1824}
1825
/// Information about a reflowed paragraph
///
/// Describes a byte range of the original content together with the text that
/// should replace it. Equality compares all three fields, which makes values
/// directly usable with `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphReflow {
    /// Starting byte offset of the paragraph in the original content
    pub start_byte: usize,
    /// Byte offset one past the end of the paragraph in the original content
    /// (the range covers the paragraph's trailing newline when it has one)
    pub end_byte: usize,
    /// The reflowed text that replaces `start_byte..end_byte`
    pub reflowed_text: String,
}
1836
1837/// Reflow a single paragraph at the specified line number
1838///
1839/// This function finds the paragraph containing the given line number,
1840/// reflows it according to the specified line length, and returns
1841/// information about the paragraph location and its reflowed text.
1842///
1843/// # Arguments
1844///
1845/// * `content` - The full document content
1846/// * `line_number` - The 1-based line number within the paragraph to reflow
1847/// * `line_length` - The target line length for reflowing
1848///
1849/// # Returns
1850///
1851/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
1852/// or `None` if the line number is out of bounds or the content at that
1853/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
1854pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
1855    if line_number == 0 {
1856        return None;
1857    }
1858
1859    let lines: Vec<&str> = content.lines().collect();
1860
1861    // Check if line number is valid (1-based)
1862    if line_number > lines.len() {
1863        return None;
1864    }
1865
1866    let target_idx = line_number - 1; // Convert to 0-based
1867    let target_line = lines[target_idx];
1868    let trimmed = target_line.trim();
1869
1870    // Don't reflow special blocks
1871    if trimmed.is_empty()
1872        || trimmed.starts_with('#')
1873        || trimmed.starts_with("```")
1874        || trimmed.starts_with("~~~")
1875        || ElementCache::calculate_indentation_width_default(target_line) >= 4
1876        || trimmed.starts_with('>')
1877        || crate::utils::table_utils::TableUtils::is_potential_table_row(target_line) // Tables
1878        || (trimmed.starts_with('[') && target_line.contains("]:")) // Reference definitions
1879        || is_horizontal_rule(trimmed)
1880        || ((trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+'))
1881            && !is_horizontal_rule(trimmed)
1882            && (trimmed.len() == 1 || trimmed.chars().nth(1) == Some(' ')))
1883        || is_numbered_list_item(trimmed)
1884        || is_definition_list_item(trimmed)
1885    {
1886        return None;
1887    }
1888
1889    // Find paragraph start - scan backward until blank line or special block
1890    let mut para_start = target_idx;
1891    while para_start > 0 {
1892        let prev_idx = para_start - 1;
1893        let prev_line = lines[prev_idx];
1894        let prev_trimmed = prev_line.trim();
1895
1896        // Stop at blank line or special blocks
1897        if prev_trimmed.is_empty()
1898            || prev_trimmed.starts_with('#')
1899            || prev_trimmed.starts_with("```")
1900            || prev_trimmed.starts_with("~~~")
1901            || ElementCache::calculate_indentation_width_default(prev_line) >= 4
1902            || prev_trimmed.starts_with('>')
1903            || crate::utils::table_utils::TableUtils::is_potential_table_row(prev_line)
1904            || (prev_trimmed.starts_with('[') && prev_line.contains("]:"))
1905            || is_horizontal_rule(prev_trimmed)
1906            || ((prev_trimmed.starts_with('-') || prev_trimmed.starts_with('*') || prev_trimmed.starts_with('+'))
1907                && !is_horizontal_rule(prev_trimmed)
1908                && (prev_trimmed.len() == 1 || prev_trimmed.chars().nth(1) == Some(' ')))
1909            || is_numbered_list_item(prev_trimmed)
1910            || is_definition_list_item(prev_trimmed)
1911        {
1912            break;
1913        }
1914
1915        para_start = prev_idx;
1916    }
1917
1918    // Find paragraph end - scan forward until blank line or special block
1919    let mut para_end = target_idx;
1920    while para_end + 1 < lines.len() {
1921        let next_idx = para_end + 1;
1922        let next_line = lines[next_idx];
1923        let next_trimmed = next_line.trim();
1924
1925        // Stop at blank line or special blocks
1926        if next_trimmed.is_empty()
1927            || next_trimmed.starts_with('#')
1928            || next_trimmed.starts_with("```")
1929            || next_trimmed.starts_with("~~~")
1930            || ElementCache::calculate_indentation_width_default(next_line) >= 4
1931            || next_trimmed.starts_with('>')
1932            || crate::utils::table_utils::TableUtils::is_potential_table_row(next_line)
1933            || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1934            || is_horizontal_rule(next_trimmed)
1935            || ((next_trimmed.starts_with('-') || next_trimmed.starts_with('*') || next_trimmed.starts_with('+'))
1936                && !is_horizontal_rule(next_trimmed)
1937                && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1938            || is_numbered_list_item(next_trimmed)
1939            || is_definition_list_item(next_trimmed)
1940        {
1941            break;
1942        }
1943
1944        para_end = next_idx;
1945    }
1946
1947    // Extract paragraph lines
1948    let paragraph_lines = &lines[para_start..=para_end];
1949
1950    // Calculate byte offsets
1951    let mut start_byte = 0;
1952    for line in lines.iter().take(para_start) {
1953        start_byte += line.len() + 1; // +1 for newline
1954    }
1955
1956    let mut end_byte = start_byte;
1957    for line in paragraph_lines.iter() {
1958        end_byte += line.len() + 1; // +1 for newline
1959    }
1960
1961    // Track whether the byte range includes a trailing newline
1962    // (it doesn't if this is the last line and the file doesn't end with newline)
1963    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
1964
1965    // Adjust end_byte if the last line doesn't have a newline
1966    if !includes_trailing_newline {
1967        end_byte -= 1;
1968    }
1969
1970    // Join paragraph lines and reflow
1971    let paragraph_text = paragraph_lines.join("\n");
1972
1973    // Create reflow options
1974    let options = ReflowOptions {
1975        line_length,
1976        break_on_sentences: true,
1977        preserve_breaks: false,
1978        sentence_per_line: false,
1979        abbreviations: None,
1980    };
1981
1982    // Reflow the paragraph using reflow_markdown to handle it properly
1983    let reflowed = reflow_markdown(&paragraph_text, &options);
1984
1985    // Ensure reflowed text matches whether the byte range includes a trailing newline
1986    // This is critical: if the range includes a newline, the replacement must too,
1987    // otherwise the next line will get appended to the reflowed paragraph
1988    let reflowed_text = if includes_trailing_newline {
1989        // Range includes newline - ensure reflowed text has one
1990        if reflowed.ends_with('\n') {
1991            reflowed
1992        } else {
1993            format!("{reflowed}\n")
1994        }
1995    } else {
1996        // Range doesn't include newline - ensure reflowed text doesn't have one
1997        if reflowed.ends_with('\n') {
1998            reflowed.trim_end_matches('\n').to_string()
1999        } else {
2000            reflowed
2001        }
2002    };
2003
2004    Some(ParagraphReflow {
2005        start_byte,
2006        end_byte,
2007        reflowed_text,
2008    })
2009}
2010
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `text_ends_with_abbreviation()` against the built-in abbreviation set.
    ///
    /// Kept inline next to the reflow code; the public API and integration
    /// tests are in tests/utils/text_reflow_test.rs
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        // `None` selects the built-in abbreviations only.
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognised.
        let recognised = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognised {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Everything else must be rejected.
        let rejected = [
            "etc.",       // not in the built-in list (etc doesn't always have period)
            "paradigms.", // ordinary words ending a sentence
            "programs.",
            "items.",
            "systems.",
            "Dr?",        // question mark, not period
            "Mr!",        // exclamation, not period
            "paradigms?", // question mark
            "word",       // no punctuation
            "",           // empty string
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation"
            );
        }
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs