rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::regex_cache::{
9    DISPLAY_MATH_REGEX, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
10    HUGO_SHORTCODE_REGEX, INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX,
11    LINKED_IMAGE_INLINE_INLINE, LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF,
12    REF_IMAGE_REGEX, REF_LINK_REGEX, SHORTCUT_REF_REGEX, WIKI_LINK_REGEX,
13};
14use crate::utils::sentence_utils::{
15    get_abbreviations, is_cjk_char, is_cjk_sentence_ending, is_closing_quote, is_opening_quote,
16    text_ends_with_abbreviation,
17};
18use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
19use std::collections::HashSet;
20
/// Options controlling how text is reflowed.
///
/// `Debug` is derived so the options can be printed in logs and test failure output.
#[derive(Debug, Clone)]
pub struct ReflowOptions {
    /// Target line length, counted in characters.
    /// A value of `0` disables wrapping (unlimited line length).
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line
    pub sentence_per_line: bool,
    /// Custom abbreviations for sentence detection.
    /// Periods are optional - both "Dr" and "Dr." work the same.
    /// Custom abbreviations are always added to the built-in defaults.
    pub abbreviations: Option<Vec<String>>,
}
37
38impl Default for ReflowOptions {
39    fn default() -> Self {
40        Self {
41            line_length: 80,
42            break_on_sentences: true,
43            preserve_breaks: false,
44            sentence_per_line: false,
45            abbreviations: None,
46        }
47    }
48}
49
50/// Detect if a character position is a sentence boundary
51/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
52/// Supports both ASCII punctuation (. ! ?) and CJK punctuation (。 ！ ？)
53fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
54    let chars: Vec<char> = text.chars().collect();
55
56    if pos + 1 >= chars.len() {
57        return false;
58    }
59
60    let c = chars[pos];
61    let next_char = chars[pos + 1];
62
63    // Check for CJK sentence-ending punctuation (。, ！, ？)
64    // CJK punctuation doesn't require space or uppercase after it
65    if is_cjk_sentence_ending(c) {
66        // Skip any trailing emphasis/strikethrough markers
67        let mut after_punct_pos = pos + 1;
68        while after_punct_pos < chars.len()
69            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
70        {
71            after_punct_pos += 1;
72        }
73
74        // Skip whitespace
75        while after_punct_pos < chars.len() && chars[after_punct_pos].is_whitespace() {
76            after_punct_pos += 1;
77        }
78
79        // Check if we have more content (any non-whitespace)
80        if after_punct_pos >= chars.len() {
81            return false;
82        }
83
84        // Skip leading emphasis/strikethrough markers
85        while after_punct_pos < chars.len()
86            && (chars[after_punct_pos] == '*' || chars[after_punct_pos] == '_' || chars[after_punct_pos] == '~')
87        {
88            after_punct_pos += 1;
89        }
90
91        if after_punct_pos >= chars.len() {
92            return false;
93        }
94
95        // For CJK, we accept any character as the start of the next sentence
96        // (no uppercase requirement, since CJK doesn't have case)
97        return true;
98    }
99
100    // Check for ASCII sentence-ending punctuation
101    if c != '.' && c != '!' && c != '?' {
102        return false;
103    }
104
105    // Must be followed by space, closing quote, or emphasis/strikethrough marker followed by space
106    let (_space_pos, after_space_pos) = if next_char == ' ' {
107        // Normal case: punctuation followed by space
108        (pos + 1, pos + 2)
109    } else if is_closing_quote(next_char) && pos + 2 < chars.len() {
110        // Sentence ends with quote - check what follows the quote
111        if chars[pos + 2] == ' ' {
112            // Just quote followed by space: 'sentence." '
113            (pos + 2, pos + 3)
114        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_') && pos + 3 < chars.len() && chars[pos + 3] == ' ' {
115            // Quote followed by emphasis: 'sentence."* '
116            (pos + 3, pos + 4)
117        } else if (chars[pos + 2] == '*' || chars[pos + 2] == '_')
118            && pos + 4 < chars.len()
119            && chars[pos + 3] == chars[pos + 2]
120            && chars[pos + 4] == ' '
121        {
122            // Quote followed by bold: 'sentence."** '
123            (pos + 4, pos + 5)
124        } else {
125            return false;
126        }
127    } else if (next_char == '*' || next_char == '_') && pos + 2 < chars.len() && chars[pos + 2] == ' ' {
128        // Sentence ends with emphasis: "sentence.* " or "sentence._ "
129        (pos + 2, pos + 3)
130    } else if (next_char == '*' || next_char == '_')
131        && pos + 3 < chars.len()
132        && chars[pos + 2] == next_char
133        && chars[pos + 3] == ' '
134    {
135        // Sentence ends with bold: "sentence.** " or "sentence.__ "
136        (pos + 3, pos + 4)
137    } else if next_char == '~' && pos + 3 < chars.len() && chars[pos + 2] == '~' && chars[pos + 3] == ' ' {
138        // Sentence ends with strikethrough: "sentence.~~ "
139        (pos + 3, pos + 4)
140    } else {
141        return false;
142    };
143
144    // Skip all whitespace after the space to find the start of the next sentence
145    let mut next_char_pos = after_space_pos;
146    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
147        next_char_pos += 1;
148    }
149
150    // Check if we reached the end of the string
151    if next_char_pos >= chars.len() {
152        return false;
153    }
154
155    // Skip leading emphasis/strikethrough markers and opening quotes to find the actual first letter
156    let mut first_letter_pos = next_char_pos;
157    while first_letter_pos < chars.len()
158        && (chars[first_letter_pos] == '*'
159            || chars[first_letter_pos] == '_'
160            || chars[first_letter_pos] == '~'
161            || is_opening_quote(chars[first_letter_pos]))
162    {
163        first_letter_pos += 1;
164    }
165
166    // Check if we reached the end after skipping emphasis
167    if first_letter_pos >= chars.len() {
168        return false;
169    }
170
171    // First character of next sentence must be uppercase or CJK
172    let first_char = chars[first_letter_pos];
173    if !first_char.is_uppercase() && !is_cjk_char(first_char) {
174        return false;
175    }
176
177    // Look back to check for common abbreviations (only applies to periods)
178    if pos > 0 && c == '.' {
179        // Check if the text up to and including this period ends with an abbreviation
180        // Note: text[..=pos] includes the character at pos (the period)
181        if text_ends_with_abbreviation(&text[..=pos], abbreviations) {
182            return false;
183        }
184
185        // Check for decimal numbers (e.g., "3.14")
186        // Make sure to check if first_letter_pos is within bounds
187        if chars[pos - 1].is_numeric() && first_letter_pos < chars.len() && chars[first_letter_pos].is_numeric() {
188            return false;
189        }
190    }
191    true
192}
193
194/// Split text into sentences
195pub fn split_into_sentences(text: &str) -> Vec<String> {
196    split_into_sentences_custom(text, &None)
197}
198
199/// Split text into sentences with custom abbreviations
200pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
201    let abbreviations = get_abbreviations(custom_abbreviations);
202    split_into_sentences_with_set(text, &abbreviations)
203}
204
205/// Internal function to split text into sentences with a pre-computed abbreviations set
206/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
207fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
208    let mut sentences = Vec::new();
209    let mut current_sentence = String::new();
210    let mut chars = text.chars().peekable();
211    let mut pos = 0;
212
213    while let Some(c) = chars.next() {
214        current_sentence.push(c);
215
216        if is_sentence_boundary(text, pos, abbreviations) {
217            // Consume any trailing emphasis/strikethrough markers and quotes (they belong to the current sentence)
218            while let Some(&next) = chars.peek() {
219                if next == '*' || next == '_' || next == '~' || is_closing_quote(next) {
220                    current_sentence.push(chars.next().unwrap());
221                    pos += 1;
222                } else {
223                    break;
224                }
225            }
226
227            // Consume the space after the sentence
228            if chars.peek() == Some(&' ') {
229                chars.next();
230                pos += 1;
231            }
232
233            sentences.push(current_sentence.trim().to_string());
234            current_sentence.clear();
235        }
236
237        pos += 1;
238    }
239
240    // Add any remaining text as the last sentence
241    if !current_sentence.trim().is_empty() {
242        sentences.push(current_sentence.trim().to_string());
243    }
244    sentences
245}
246
/// Report whether `line` is a horizontal rule such as `---`, `___`, `***` or `- - -`.
///
/// The line must start with `-`, `_` or `*`, consist solely of that same
/// character and spaces, and contain the character at least three times.
fn is_horizontal_rule(line: &str) -> bool {
    let Some(marker) = line.chars().next() else {
        return false;
    };
    if !matches!(marker, '-' | '_' | '*') {
        return false;
    }

    // Count the marker characters, bailing out on anything that is neither
    // the marker nor a plain space.
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }
    marker_count >= 3
}
275
/// Check if a line is a numbered list item (e.g., "1. ", "10. ")
///
/// The line must begin with one or more ASCII digits (`0`-`9`) followed by a
/// period, and the period must be followed by a space or be the last character
/// of the line.
///
/// Only ASCII digits are accepted: Markdown ordered-list markers are written
/// with the digits 0-9, so other Unicode numerics (e.g. `½`, `٣`, `①`) do not
/// start a list item.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are one byte each, so this count is also a valid byte offset.
    let digit_count = line.bytes().take_while(|b| b.is_ascii_digit()).count();
    if digit_count == 0 {
        return false;
    }

    match line[digit_count..].strip_prefix('.') {
        // After the period there must be a space or nothing at all
        Some(rest) => rest.is_empty() || rest.starts_with(' '),
        None => false,
    }
}
298
/// Report whether `line` ends with a Markdown hard line break.
///
/// CommonMark knows two spellings of a hard break:
/// 1. two or more trailing spaces
/// 2. a trailing backslash
///
/// A single trailing `\r` (from a CRLF line ending) is ignored before checking.
fn has_hard_break(line: &str) -> bool {
    let content = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };
    // Space and backslash are ASCII, so inspecting the trailing bytes is
    // equivalent to inspecting the trailing characters.
    matches!(content.as_bytes(), [.., b' ', b' '] | [.., b'\\'])
}
308
/// Strip trailing whitespace from `s` without destroying a hard line break.
///
/// Hard breaks in Markdown are written either as
/// 1. two trailing spaces before the newline (traditional), or
/// 2. a backslash at the end of the line (mdformat style).
///
/// A trailing `\r` (CRLF input) is removed first. A line ending in a backslash
/// is returned unchanged; a line ending in two or more spaces is normalized to
/// exactly two trailing spaces; a whitespace-only line becomes empty; anything
/// else simply has its trailing whitespace removed.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as it is.
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let trimmed = line.trim_end();
    let had_space_break = line.ends_with("  ");

    if had_space_break && !trimmed.is_empty() {
        // Re-attach exactly two spaces so the hard break survives.
        let mut kept = String::with_capacity(trimmed.len() + 2);
        kept.push_str(trimmed);
        kept.push_str("  ");
        kept
    } else {
        // No hard break (or nothing but whitespace): plain trim.
        trimmed.to_owned()
    }
}
339
340pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
341    // For sentence-per-line mode, always process regardless of length
342    if options.sentence_per_line {
343        let elements = parse_markdown_elements(line);
344        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
345    }
346
347    // Quick check: if line is already short enough or no wrapping requested, return as-is
348    // line_length = 0 means no wrapping (unlimited line length)
349    if options.line_length == 0 || line.chars().count() <= options.line_length {
350        return vec![line.to_string()];
351    }
352
353    // Parse the markdown to identify elements
354    let elements = parse_markdown_elements(line);
355
356    // Reflow the elements into lines
357    reflow_elements(&elements, options)
358}
359
/// Image source in a linked image structure.
///
/// Describes how the image half of a clickable image (see `Element::LinkedImage`)
/// points at its picture. The wrapped string is the text written between the
/// delimiters when the element is rendered back to Markdown.
#[derive(Debug, Clone)]
enum LinkedImageSource {
    /// Inline image URL: ![alt](url) - holds the `url` part
    Inline(String),
    /// Reference image: ![alt][ref] - holds the `ref` part
    Reference(String),
}
368
/// Link target in a linked image structure.
///
/// Describes how the outer link of a clickable image (see `Element::LinkedImage`)
/// points at its destination. The wrapped string is the text written between
/// the delimiters when the element is rendered back to Markdown.
#[derive(Debug, Clone)]
enum LinkedImageTarget {
    /// Inline link URL: ](url) - holds the `url` part
    Inline(String),
    /// Reference link: ][ref] - holds the `ref` part
    Reference(String),
}
377
/// Represents a piece of content in the markdown.
///
/// Most variants store only the inner content; the surrounding delimiters
/// (brackets, backticks, `$`, `~~`, ...) are re-added by the `Display`
/// implementation. `Text`, `HtmlTag`, `HtmlEntity` and `HugoShortcode` store
/// their complete text and are written out unchanged.
#[derive(Debug, Clone)]
enum Element {
    /// Plain text that can be wrapped
    Text(String),
    /// A complete markdown inline link [text](url)
    Link { text: String, url: String },
    /// A complete markdown reference link [text][ref]
    ReferenceLink { text: String, reference: String },
    /// A complete markdown empty reference link [text][]
    EmptyReferenceLink { text: String },
    /// A complete markdown shortcut reference link [ref]
    ShortcutReference { reference: String },
    /// A complete markdown inline image ![alt](url)
    InlineImage { alt: String, url: String },
    /// A complete markdown reference image ![alt][ref]
    ReferenceImage { alt: String, reference: String },
    /// A complete markdown empty reference image ![alt][]
    EmptyReferenceImage { alt: String },
    /// A clickable image badge in any of 4 forms:
    /// - [![alt](img-url)](link-url)
    /// - [![alt][img-ref]](link-url)
    /// - [![alt](img-url)][link-ref]
    /// - [![alt][img-ref]][link-ref]
    LinkedImage {
        /// Alt text of the inner image
        alt: String,
        /// How the inner image is specified (inline URL or reference)
        img_source: LinkedImageSource,
        /// How the outer link is specified (inline URL or reference)
        link_target: LinkedImageTarget,
    },
    /// Footnote reference [^note] - `note` is the label without `[^` and `]`
    FootnoteReference { note: String },
    /// Strikethrough text ~~text~~ - holds the text without the `~~` markers
    Strikethrough(String),
    /// Wiki-style link [[wiki]] or [[wiki|text]] - holds everything between `[[` and `]]`
    WikiLink(String),
    /// Inline math $math$ - holds the content without the `$` delimiters
    InlineMath(String),
    /// Display math $$math$$ - holds the content without the `$$` delimiters
    DisplayMath(String),
    /// Emoji shortcode :emoji: - holds the name without the colons
    EmojiShortcode(String),
    /// HTML tag <tag> or </tag> or <tag/> - holds the full tag including brackets
    HtmlTag(String),
    /// HTML entity &nbsp; or &#123; - holds the full entity text
    HtmlEntity(String),
    /// Hugo/Go template shortcode {{< ... >}} or {{% ... %}} - holds the full shortcode text
    HugoShortcode(String),
    /// Inline code `code` - holds the content without the backticks
    Code(String),
    /// Bold text **text** or __text__
    Bold {
        /// Text between the markers
        content: String,
        /// True if underscore markers (__), false for asterisks (**)
        underscore: bool,
    },
    /// Italic text *text* or _text_
    Italic {
        /// Text between the markers
        content: String,
        /// True if underscore marker (_), false for asterisk (*)
        underscore: bool,
    },
}
440
441impl std::fmt::Display for Element {
442    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
443        match self {
444            Element::Text(s) => write!(f, "{s}"),
445            Element::Link { text, url } => write!(f, "[{text}]({url})"),
446            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
447            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
448            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
449            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
450            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
451            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
452            Element::LinkedImage {
453                alt,
454                img_source,
455                link_target,
456            } => {
457                // Build the image part: ![alt](url) or ![alt][ref]
458                let img_part = match img_source {
459                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
460                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
461                };
462                // Build the link part: (url) or [ref]
463                match link_target {
464                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
465                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
466                }
467            }
468            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
469            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
470            Element::WikiLink(s) => write!(f, "[[{s}]]"),
471            Element::InlineMath(s) => write!(f, "${s}$"),
472            Element::DisplayMath(s) => write!(f, "$${s}$$"),
473            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
474            Element::HtmlTag(s) => write!(f, "{s}"),
475            Element::HtmlEntity(s) => write!(f, "{s}"),
476            Element::HugoShortcode(s) => write!(f, "{s}"),
477            Element::Code(s) => write!(f, "`{s}`"),
478            Element::Bold { content, underscore } => {
479                if *underscore {
480                    write!(f, "__{content}__")
481                } else {
482                    write!(f, "**{content}**")
483                }
484            }
485            Element::Italic { content, underscore } => {
486                if *underscore {
487                    write!(f, "_{content}_")
488                } else {
489                    write!(f, "*{content}*")
490                }
491            }
492        }
493    }
494}
495
496impl Element {
497    fn len(&self) -> usize {
498        match self {
499            Element::Text(s) => s.chars().count(),
500            Element::Link { text, url } => text.chars().count() + url.chars().count() + 4, // [text](url)
501            Element::ReferenceLink { text, reference } => text.chars().count() + reference.chars().count() + 4, // [text][ref]
502            Element::EmptyReferenceLink { text } => text.chars().count() + 4, // [text][]
503            Element::ShortcutReference { reference } => reference.chars().count() + 2, // [ref]
504            Element::InlineImage { alt, url } => alt.chars().count() + url.chars().count() + 5, // ![alt](url)
505            Element::ReferenceImage { alt, reference } => alt.chars().count() + reference.chars().count() + 5, // ![alt][ref]
506            Element::EmptyReferenceImage { alt } => alt.chars().count() + 5, // ![alt][]
507            Element::LinkedImage {
508                alt,
509                img_source,
510                link_target,
511            } => {
512                // Calculate length based on variant
513                // Base: [ + ![alt] + ] = 4 chars for outer brackets and !
514                let alt_len = alt.chars().count();
515                let img_len = match img_source {
516                    LinkedImageSource::Inline(url) => url.chars().count() + 2, // (url)
517                    LinkedImageSource::Reference(r) => r.chars().count() + 2,  // [ref]
518                };
519                let link_len = match link_target {
520                    LinkedImageTarget::Inline(url) => url.chars().count() + 2, // (url)
521                    LinkedImageTarget::Reference(r) => r.chars().count() + 2,  // [ref]
522                };
523                // [![alt](img)](link) = [ + ! + [ + alt + ] + (img) + ] + (link)
524                //                     = 1 + 1 + 1 + alt + 1 + img_len + 1 + link_len = 5 + alt + img + link
525                5 + alt_len + img_len + link_len
526            }
527            Element::FootnoteReference { note } => note.chars().count() + 3, // [^note]
528            Element::Strikethrough(s) => s.chars().count() + 4,              // ~~text~~
529            Element::WikiLink(s) => s.chars().count() + 4,                   // [[wiki]]
530            Element::InlineMath(s) => s.chars().count() + 2,                 // $math$
531            Element::DisplayMath(s) => s.chars().count() + 4,                // $$math$$
532            Element::EmojiShortcode(s) => s.chars().count() + 2,             // :emoji:
533            Element::HtmlTag(s) => s.chars().count(),                        // <tag> - already includes brackets
534            Element::HtmlEntity(s) => s.chars().count(),                     // &nbsp; - already complete
535            Element::HugoShortcode(s) => s.chars().count(),                  // {{< ... >}} - already complete
536            Element::Code(s) => s.chars().count() + 2,                       // `code`
537            Element::Bold { content, .. } => content.chars().count() + 4,    // **text** or __text__
538            Element::Italic { content, .. } => content.chars().count() + 2,  // *text* or _text_
539        }
540    }
541}
542
/// An emphasis or formatting span parsed by pulldown-cmark.
///
/// Offsets are byte offsets into the text that was handed to the parser.
/// A span is exactly one of: emphasis (`is_strong` and `is_strikethrough`
/// both false), strong (`is_strong` true) or strikethrough
/// (`is_strikethrough` true).
#[derive(Debug, Clone)]
struct EmphasisSpan {
    /// Byte offset where the emphasis starts (including markers)
    start: usize,
    /// Byte offset where the emphasis ends (after closing markers)
    end: usize,
    /// The content inside the emphasis markers (markers themselves excluded)
    content: String,
    /// Whether this is strong (bold) emphasis
    is_strong: bool,
    /// Whether this is strikethrough (~~text~~)
    is_strikethrough: bool,
    /// Whether the span was opened with underscore markers (`_` or `__`);
    /// always false for strikethrough spans
    uses_underscore: bool,
}
559
560/// Extract emphasis and strikethrough spans from text using pulldown-cmark
561///
562/// This provides CommonMark-compliant emphasis parsing, correctly handling:
563/// - Nested emphasis like `*text **bold** more*`
564/// - Left/right flanking delimiter rules
565/// - Underscore vs asterisk markers
566/// - GFM strikethrough (~~text~~)
567///
568/// Returns spans sorted by start position.
569fn extract_emphasis_spans(text: &str) -> Vec<EmphasisSpan> {
570    let mut spans = Vec::new();
571    let mut options = Options::empty();
572    options.insert(Options::ENABLE_STRIKETHROUGH);
573
574    // Stacks to track nested formatting with their start positions
575    let mut emphasis_stack: Vec<(usize, bool)> = Vec::new(); // (start_byte, uses_underscore)
576    let mut strong_stack: Vec<(usize, bool)> = Vec::new();
577    let mut strikethrough_stack: Vec<usize> = Vec::new();
578
579    let parser = Parser::new_ext(text, options).into_offset_iter();
580
581    for (event, range) in parser {
582        match event {
583            Event::Start(Tag::Emphasis) => {
584                // Check if this uses underscore by looking at the original text
585                let uses_underscore = text.get(range.start..range.start + 1) == Some("_");
586                emphasis_stack.push((range.start, uses_underscore));
587            }
588            Event::End(TagEnd::Emphasis) => {
589                if let Some((start_byte, uses_underscore)) = emphasis_stack.pop() {
590                    // Extract content between the markers (1 char marker on each side)
591                    let content_start = start_byte + 1;
592                    let content_end = range.end - 1;
593                    if content_end > content_start
594                        && let Some(content) = text.get(content_start..content_end)
595                    {
596                        spans.push(EmphasisSpan {
597                            start: start_byte,
598                            end: range.end,
599                            content: content.to_string(),
600                            is_strong: false,
601                            is_strikethrough: false,
602                            uses_underscore,
603                        });
604                    }
605                }
606            }
607            Event::Start(Tag::Strong) => {
608                // Check if this uses underscore by looking at the original text
609                let uses_underscore = text.get(range.start..range.start + 2) == Some("__");
610                strong_stack.push((range.start, uses_underscore));
611            }
612            Event::End(TagEnd::Strong) => {
613                if let Some((start_byte, uses_underscore)) = strong_stack.pop() {
614                    // Extract content between the markers (2 char marker on each side)
615                    let content_start = start_byte + 2;
616                    let content_end = range.end - 2;
617                    if content_end > content_start
618                        && let Some(content) = text.get(content_start..content_end)
619                    {
620                        spans.push(EmphasisSpan {
621                            start: start_byte,
622                            end: range.end,
623                            content: content.to_string(),
624                            is_strong: true,
625                            is_strikethrough: false,
626                            uses_underscore,
627                        });
628                    }
629                }
630            }
631            Event::Start(Tag::Strikethrough) => {
632                strikethrough_stack.push(range.start);
633            }
634            Event::End(TagEnd::Strikethrough) => {
635                if let Some(start_byte) = strikethrough_stack.pop() {
636                    // Extract content between the ~~ markers (2 char marker on each side)
637                    let content_start = start_byte + 2;
638                    let content_end = range.end - 2;
639                    if content_end > content_start
640                        && let Some(content) = text.get(content_start..content_end)
641                    {
642                        spans.push(EmphasisSpan {
643                            start: start_byte,
644                            end: range.end,
645                            content: content.to_string(),
646                            is_strong: false,
647                            is_strikethrough: true,
648                            uses_underscore: false,
649                        });
650                    }
651                }
652            }
653            _ => {}
654        }
655    }
656
657    // Sort by start position
658    spans.sort_by_key(|s| s.start);
659    spans
660}
661
662/// Parse markdown elements from text preserving the raw syntax
663///
664/// Detection order is critical:
665/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
666/// 2. Inline images ![alt](url) - before links to handle ! prefix
667/// 3. Reference images ![alt][ref] - before reference links
668/// 4. Inline links [text](url) - before reference links
669/// 5. Reference links [text][ref] - before shortcut references
670/// 6. Shortcut reference links [ref] - detected last to avoid false positives
671/// 7. Other elements (code, bold, italic, etc.) - processed normally
672fn parse_markdown_elements(text: &str) -> Vec<Element> {
673    let mut elements = Vec::new();
674    let mut remaining = text;
675
676    // Pre-extract emphasis spans using pulldown-cmark for CommonMark-compliant parsing
677    let emphasis_spans = extract_emphasis_spans(text);
678
679    while !remaining.is_empty() {
680        // Calculate current byte offset in original text
681        let current_offset = text.len() - remaining.len();
682        // Find the earliest occurrence of any markdown pattern
683        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
684
685        // Check for linked images FIRST (all 4 variants)
686        // Quick literal check: only run expensive regexes if we might have a linked image
687        // Pattern starts with "[!" so check for that first
688        if remaining.contains("[!") {
689            // Pattern 1: [![alt](img)](link) - inline image in inline link
690            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
691                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
692            {
693                earliest_match = Some((m.start(), "linked_image_ii", m));
694            }
695
696            // Pattern 2: [![alt][ref]](link) - reference image in inline link
697            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
698                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
699            {
700                earliest_match = Some((m.start(), "linked_image_ri", m));
701            }
702
703            // Pattern 3: [![alt](img)][ref] - inline image in reference link
704            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
705                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
706            {
707                earliest_match = Some((m.start(), "linked_image_ir", m));
708            }
709
710            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
711            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
712                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
713            {
714                earliest_match = Some((m.start(), "linked_image_rr", m));
715            }
716        }
717
718        // Check for images (they start with ! so should be detected before links)
719        // Inline images - ![alt](url)
720        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
721            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
722        {
723            earliest_match = Some((m.start(), "inline_image", m));
724        }
725
726        // Reference images - ![alt][ref]
727        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
728            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
729        {
730            earliest_match = Some((m.start(), "ref_image", m));
731        }
732
733        // Check for footnote references - [^note]
734        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
735            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
736        {
737            earliest_match = Some((m.start(), "footnote_ref", m));
738        }
739
740        // Check for inline links - [text](url)
741        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
742            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
743        {
744            earliest_match = Some((m.start(), "inline_link", m));
745        }
746
747        // Check for reference links - [text][ref]
748        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
749            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
750        {
751            earliest_match = Some((m.start(), "ref_link", m));
752        }
753
754        // Check for shortcut reference links - [ref]
755        // Only check if we haven't found an earlier pattern that would conflict
756        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
757            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
758        {
759            earliest_match = Some((m.start(), "shortcut_ref", m));
760        }
761
762        // Check for wiki-style links - [[wiki]]
763        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
764            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
765        {
766            earliest_match = Some((m.start(), "wiki_link", m));
767        }
768
769        // Check for display math first (before inline) - $$math$$
770        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
771            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
772        {
773            earliest_match = Some((m.start(), "display_math", m));
774        }
775
776        // Check for inline math - $math$
777        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
778            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
779        {
780            earliest_match = Some((m.start(), "inline_math", m));
781        }
782
783        // Note: Strikethrough is now handled by pulldown-cmark in extract_emphasis_spans
784
785        // Check for emoji shortcodes - :emoji:
786        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
787            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
788        {
789            earliest_match = Some((m.start(), "emoji", m));
790        }
791
792        // Check for HTML entities - &nbsp; etc
793        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
794            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
795        {
796            earliest_match = Some((m.start(), "html_entity", m));
797        }
798
799        // Check for Hugo shortcodes - {{< ... >}} or {{% ... %}}
800        // Must be checked before other patterns to avoid false sentence breaks
801        if let Ok(Some(m)) = HUGO_SHORTCODE_REGEX.find(remaining)
802            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
803        {
804            earliest_match = Some((m.start(), "hugo_shortcode", m));
805        }
806
807        // Check for HTML tags - <tag> </tag> <tag/>
808        // But exclude autolinks like <https://...> or <mailto:...>
809        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
810            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
811        {
812            // Check if this is an autolink (starts with protocol or mailto:)
813            let matched_text = &remaining[m.start()..m.end()];
814            let is_autolink = matched_text.starts_with("<http://")
815                || matched_text.starts_with("<https://")
816                || matched_text.starts_with("<mailto:")
817                || matched_text.starts_with("<ftp://")
818                || matched_text.starts_with("<ftps://");
819
820            if !is_autolink {
821                earliest_match = Some((m.start(), "html_tag", m));
822            }
823        }
824
825        // Find earliest non-link special characters
826        let mut next_special = remaining.len();
827        let mut special_type = "";
828        let mut pulldown_emphasis: Option<&EmphasisSpan> = None;
829
830        // Check for code spans (not handled by pulldown-cmark in this context)
831        if let Some(pos) = remaining.find('`')
832            && pos < next_special
833        {
834            next_special = pos;
835            special_type = "code";
836        }
837
838        // Check for emphasis using pulldown-cmark's pre-extracted spans
839        // Find the earliest emphasis span that starts within remaining text
840        for span in &emphasis_spans {
841            if span.start >= current_offset && span.start < current_offset + remaining.len() {
842                let pos_in_remaining = span.start - current_offset;
843                if pos_in_remaining < next_special {
844                    next_special = pos_in_remaining;
845                    special_type = "pulldown_emphasis";
846                    pulldown_emphasis = Some(span);
847                }
848                break; // Spans are sorted by start position, so first match is earliest
849            }
850        }
851
852        // Determine which pattern to process first
853        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
854            pos < next_special
855        } else {
856            false
857        };
858
859        if should_process_markdown_link {
860            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
861
862            // Add any text before the match
863            if pos > 0 {
864                elements.push(Element::Text(remaining[..pos].to_string()));
865            }
866
867            // Process the matched pattern
868            match pattern_type {
869                // Pattern 1: [![alt](img)](link) - inline image in inline link
870                "linked_image_ii" => {
871                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
872                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
873                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
874                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
875                        elements.push(Element::LinkedImage {
876                            alt: alt.to_string(),
877                            img_source: LinkedImageSource::Inline(img_url.to_string()),
878                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
879                        });
880                        remaining = &remaining[match_obj.end()..];
881                    } else {
882                        elements.push(Element::Text("[".to_string()));
883                        remaining = &remaining[1..];
884                    }
885                }
886                // Pattern 2: [![alt][ref]](link) - reference image in inline link
887                "linked_image_ri" => {
888                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
889                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
890                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
891                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
892                        elements.push(Element::LinkedImage {
893                            alt: alt.to_string(),
894                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
895                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
896                        });
897                        remaining = &remaining[match_obj.end()..];
898                    } else {
899                        elements.push(Element::Text("[".to_string()));
900                        remaining = &remaining[1..];
901                    }
902                }
903                // Pattern 3: [![alt](img)][ref] - inline image in reference link
904                "linked_image_ir" => {
905                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
906                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
907                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
908                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
909                        elements.push(Element::LinkedImage {
910                            alt: alt.to_string(),
911                            img_source: LinkedImageSource::Inline(img_url.to_string()),
912                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
913                        });
914                        remaining = &remaining[match_obj.end()..];
915                    } else {
916                        elements.push(Element::Text("[".to_string()));
917                        remaining = &remaining[1..];
918                    }
919                }
920                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
921                "linked_image_rr" => {
922                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
923                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
924                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
925                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
926                        elements.push(Element::LinkedImage {
927                            alt: alt.to_string(),
928                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
929                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
930                        });
931                        remaining = &remaining[match_obj.end()..];
932                    } else {
933                        elements.push(Element::Text("[".to_string()));
934                        remaining = &remaining[1..];
935                    }
936                }
937                "inline_image" => {
938                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
939                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
940                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
941                        elements.push(Element::InlineImage {
942                            alt: alt.to_string(),
943                            url: url.to_string(),
944                        });
945                        remaining = &remaining[match_obj.end()..];
946                    } else {
947                        elements.push(Element::Text("!".to_string()));
948                        remaining = &remaining[1..];
949                    }
950                }
951                "ref_image" => {
952                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
953                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
954                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
955
956                        if reference.is_empty() {
957                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
958                        } else {
959                            elements.push(Element::ReferenceImage {
960                                alt: alt.to_string(),
961                                reference: reference.to_string(),
962                            });
963                        }
964                        remaining = &remaining[match_obj.end()..];
965                    } else {
966                        elements.push(Element::Text("!".to_string()));
967                        remaining = &remaining[1..];
968                    }
969                }
970                "footnote_ref" => {
971                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
972                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
973                        elements.push(Element::FootnoteReference { note: note.to_string() });
974                        remaining = &remaining[match_obj.end()..];
975                    } else {
976                        elements.push(Element::Text("[".to_string()));
977                        remaining = &remaining[1..];
978                    }
979                }
980                "inline_link" => {
981                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
982                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
983                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
984                        elements.push(Element::Link {
985                            text: text.to_string(),
986                            url: url.to_string(),
987                        });
988                        remaining = &remaining[match_obj.end()..];
989                    } else {
990                        // Fallback - shouldn't happen
991                        elements.push(Element::Text("[".to_string()));
992                        remaining = &remaining[1..];
993                    }
994                }
995                "ref_link" => {
996                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
997                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
998                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
999
1000                        if reference.is_empty() {
1001                            // Empty reference link [text][]
1002                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
1003                        } else {
1004                            // Regular reference link [text][ref]
1005                            elements.push(Element::ReferenceLink {
1006                                text: text.to_string(),
1007                                reference: reference.to_string(),
1008                            });
1009                        }
1010                        remaining = &remaining[match_obj.end()..];
1011                    } else {
1012                        // Fallback - shouldn't happen
1013                        elements.push(Element::Text("[".to_string()));
1014                        remaining = &remaining[1..];
1015                    }
1016                }
1017                "shortcut_ref" => {
1018                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
1019                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1020                        elements.push(Element::ShortcutReference {
1021                            reference: reference.to_string(),
1022                        });
1023                        remaining = &remaining[match_obj.end()..];
1024                    } else {
1025                        // Fallback - shouldn't happen
1026                        elements.push(Element::Text("[".to_string()));
1027                        remaining = &remaining[1..];
1028                    }
1029                }
1030                "wiki_link" => {
1031                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
1032                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1033                        elements.push(Element::WikiLink(content.to_string()));
1034                        remaining = &remaining[match_obj.end()..];
1035                    } else {
1036                        elements.push(Element::Text("[[".to_string()));
1037                        remaining = &remaining[2..];
1038                    }
1039                }
1040                "display_math" => {
1041                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
1042                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1043                        elements.push(Element::DisplayMath(math.to_string()));
1044                        remaining = &remaining[match_obj.end()..];
1045                    } else {
1046                        elements.push(Element::Text("$$".to_string()));
1047                        remaining = &remaining[2..];
1048                    }
1049                }
1050                "inline_math" => {
1051                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
1052                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1053                        elements.push(Element::InlineMath(math.to_string()));
1054                        remaining = &remaining[match_obj.end()..];
1055                    } else {
1056                        elements.push(Element::Text("$".to_string()));
1057                        remaining = &remaining[1..];
1058                    }
1059                }
1060                // Note: "strikethrough" case removed - now handled by pulldown-cmark
1061                "emoji" => {
1062                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
1063                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
1064                        elements.push(Element::EmojiShortcode(emoji.to_string()));
1065                        remaining = &remaining[match_obj.end()..];
1066                    } else {
1067                        elements.push(Element::Text(":".to_string()));
1068                        remaining = &remaining[1..];
1069                    }
1070                }
1071                "html_entity" => {
1072                    // HTML entities are captured whole
1073                    elements.push(Element::HtmlEntity(remaining[..match_obj.end()].to_string()));
1074                    remaining = &remaining[match_obj.end()..];
1075                }
1076                "hugo_shortcode" => {
1077                    // Hugo shortcodes are atomic elements - preserve them exactly
1078                    elements.push(Element::HugoShortcode(remaining[..match_obj.end()].to_string()));
1079                    remaining = &remaining[match_obj.end()..];
1080                }
1081                "html_tag" => {
1082                    // HTML tags are captured whole
1083                    elements.push(Element::HtmlTag(remaining[..match_obj.end()].to_string()));
1084                    remaining = &remaining[match_obj.end()..];
1085                }
1086                _ => {
1087                    // Unknown pattern, treat as text
1088                    elements.push(Element::Text("[".to_string()));
1089                    remaining = &remaining[1..];
1090                }
1091            }
1092        } else {
1093            // Process non-link special characters
1094
1095            // Add any text before the special character
1096            if next_special > 0 && next_special < remaining.len() {
1097                elements.push(Element::Text(remaining[..next_special].to_string()));
1098                remaining = &remaining[next_special..];
1099            }
1100
1101            // Process the special element
1102            match special_type {
1103                "code" => {
1104                    // Find end of code
1105                    if let Some(code_end) = remaining[1..].find('`') {
1106                        let code = &remaining[1..1 + code_end];
1107                        elements.push(Element::Code(code.to_string()));
1108                        remaining = &remaining[1 + code_end + 1..];
1109                    } else {
1110                        // No closing backtick, treat as text
1111                        elements.push(Element::Text(remaining.to_string()));
1112                        break;
1113                    }
1114                }
1115                "pulldown_emphasis" => {
1116                    // Use pre-extracted emphasis/strikethrough span from pulldown-cmark
1117                    if let Some(span) = pulldown_emphasis {
1118                        let span_len = span.end - span.start;
1119                        if span.is_strikethrough {
1120                            elements.push(Element::Strikethrough(span.content.clone()));
1121                        } else if span.is_strong {
1122                            elements.push(Element::Bold {
1123                                content: span.content.clone(),
1124                                underscore: span.uses_underscore,
1125                            });
1126                        } else {
1127                            elements.push(Element::Italic {
1128                                content: span.content.clone(),
1129                                underscore: span.uses_underscore,
1130                            });
1131                        }
1132                        remaining = &remaining[span_len..];
1133                    } else {
1134                        // Fallback - shouldn't happen
1135                        elements.push(Element::Text(remaining[..1].to_string()));
1136                        remaining = &remaining[1..];
1137                    }
1138                }
1139                _ => {
1140                    // No special elements found, add all remaining text
1141                    elements.push(Element::Text(remaining.to_string()));
1142                    break;
1143                }
1144            }
1145        }
1146    }
1147
1148    elements
1149}
1150
1151/// Reflow elements for sentence-per-line mode
1152fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
1153    let abbreviations = get_abbreviations(custom_abbreviations);
1154    let mut lines = Vec::new();
1155    let mut current_line = String::new();
1156
1157    for element in elements.iter() {
1158        let element_str = format!("{element}");
1159
1160        // For text elements, split into sentences
1161        if let Element::Text(text) = element {
1162            // Simply append text - it already has correct spacing from tokenization
1163            let combined = format!("{current_line}{text}");
1164            // Use the pre-computed abbreviations set to avoid redundant computation
1165            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
1166
1167            if sentences.len() > 1 {
1168                // We found sentence boundaries
1169                for (i, sentence) in sentences.iter().enumerate() {
1170                    if i == 0 {
1171                        // First sentence might continue from previous elements
1172                        // But check if it ends with an abbreviation
1173                        let trimmed = sentence.trim();
1174
1175                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
1176                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
1177                            current_line = sentence.to_string();
1178                        } else {
1179                            // Normal case - emit the first sentence
1180                            lines.push(sentence.to_string());
1181                            current_line.clear();
1182                        }
1183                    } else if i == sentences.len() - 1 {
1184                        // Last sentence: check if it's complete or incomplete
1185                        let trimmed = sentence.trim();
1186                        let ends_with_sentence_punct =
1187                            trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1188
1189                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1190                            // Complete sentence - emit it immediately
1191                            lines.push(sentence.to_string());
1192                            current_line.clear();
1193                        } else {
1194                            // Incomplete sentence - save for next iteration
1195                            current_line = sentence.to_string();
1196                        }
1197                    } else {
1198                        // Complete sentences in the middle
1199                        lines.push(sentence.to_string());
1200                    }
1201                }
1202            } else {
1203                // Single sentence - check if it's complete
1204                let trimmed = combined.trim();
1205                let ends_with_sentence_punct =
1206                    trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1207
1208                if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1209                    // Complete single sentence - emit it
1210                    lines.push(trimmed.to_string());
1211                    current_line.clear();
1212                } else {
1213                    // Incomplete sentence - continue accumulating
1214                    current_line = combined;
1215                }
1216            }
1217        } else if let Element::Italic { content, underscore } = element {
1218            // Handle italic elements - may contain multiple sentences that need continuation
1219            let marker = if *underscore { "_" } else { "*" };
1220            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1221        } else if let Element::Bold { content, underscore } = element {
1222            // Handle bold elements - may contain multiple sentences that need continuation
1223            let marker = if *underscore { "__" } else { "**" };
1224            handle_emphasis_sentence_split(content, marker, &abbreviations, &mut current_line, &mut lines);
1225        } else if let Element::Strikethrough(content) = element {
1226            // Handle strikethrough elements - may contain multiple sentences that need continuation
1227            handle_emphasis_sentence_split(content, "~~", &abbreviations, &mut current_line, &mut lines);
1228        } else {
1229            // Non-text, non-emphasis elements (Code, Links, etc.)
1230            // Add space before element if needed (unless it's after an opening paren/bracket)
1231            if !current_line.is_empty()
1232                && !current_line.ends_with(' ')
1233                && !current_line.ends_with('(')
1234                && !current_line.ends_with('[')
1235            {
1236                current_line.push(' ');
1237            }
1238            current_line.push_str(&element_str);
1239        }
1240    }
1241
1242    // Add any remaining content
1243    if !current_line.is_empty() {
1244        lines.push(current_line.trim().to_string());
1245    }
1246    lines
1247}
1248
1249/// Handle splitting emphasis content at sentence boundaries while preserving markers
1250fn handle_emphasis_sentence_split(
1251    content: &str,
1252    marker: &str,
1253    abbreviations: &HashSet<String>,
1254    current_line: &mut String,
1255    lines: &mut Vec<String>,
1256) {
1257    // Split the emphasis content into sentences
1258    let sentences = split_into_sentences_with_set(content, abbreviations);
1259
1260    if sentences.len() <= 1 {
1261        // Single sentence or no boundaries - treat as atomic
1262        if !current_line.is_empty()
1263            && !current_line.ends_with(' ')
1264            && !current_line.ends_with('(')
1265            && !current_line.ends_with('[')
1266        {
1267            current_line.push(' ');
1268        }
1269        current_line.push_str(marker);
1270        current_line.push_str(content);
1271        current_line.push_str(marker);
1272
1273        // Check if the emphasis content ends with sentence punctuation - if so, emit
1274        let trimmed = content.trim();
1275        let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1276        if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1277            lines.push(current_line.clone());
1278            current_line.clear();
1279        }
1280    } else {
1281        // Multiple sentences - each gets its own emphasis markers
1282        for (i, sentence) in sentences.iter().enumerate() {
1283            let trimmed = sentence.trim();
1284            if trimmed.is_empty() {
1285                continue;
1286            }
1287
1288            if i == 0 {
1289                // First sentence: combine with current_line and emit
1290                if !current_line.is_empty()
1291                    && !current_line.ends_with(' ')
1292                    && !current_line.ends_with('(')
1293                    && !current_line.ends_with('[')
1294                {
1295                    current_line.push(' ');
1296                }
1297                current_line.push_str(marker);
1298                current_line.push_str(trimmed);
1299                current_line.push_str(marker);
1300
1301                // Check if this is a complete sentence
1302                let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1303                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1304                    lines.push(current_line.clone());
1305                    current_line.clear();
1306                }
1307            } else if i == sentences.len() - 1 {
1308                // Last sentence: check if complete
1309                let ends_with_punct = trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
1310
1311                let mut line = String::new();
1312                line.push_str(marker);
1313                line.push_str(trimmed);
1314                line.push_str(marker);
1315
1316                if ends_with_punct && !text_ends_with_abbreviation(trimmed, abbreviations) {
1317                    lines.push(line);
1318                } else {
1319                    // Incomplete - keep in current_line for potential continuation
1320                    *current_line = line;
1321                }
1322            } else {
1323                // Middle sentences: emit with markers
1324                let mut line = String::new();
1325                line.push_str(marker);
1326                line.push_str(trimmed);
1327                line.push_str(marker);
1328                lines.push(line);
1329            }
1330        }
1331    }
1332}
1333
1334/// Reflow elements into lines that fit within the line length
1335fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1336    let mut lines = Vec::new();
1337    let mut current_line = String::new();
1338    let mut current_length = 0;
1339
1340    for element in elements {
1341        let element_str = format!("{element}");
1342        let element_len = element.len();
1343
1344        // For text elements that might need breaking
1345        if let Element::Text(text) = element {
1346            // Check if original text had leading whitespace
1347            let has_leading_space = text.starts_with(char::is_whitespace);
1348            // If this is a text element, always process it word by word
1349            let words: Vec<&str> = text.split_whitespace().collect();
1350
1351            for (i, word) in words.iter().enumerate() {
1352                let word_len = word.chars().count();
1353                // Check if this "word" is just punctuation that should stay attached
1354                let is_trailing_punct = word
1355                    .chars()
1356                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1357
1358                if current_length > 0 && current_length + 1 + word_len > options.line_length && !is_trailing_punct {
1359                    // Start a new line (but never for trailing punctuation)
1360                    lines.push(current_line.trim().to_string());
1361                    current_line = word.to_string();
1362                    current_length = word_len;
1363                } else {
1364                    // Add word to current line
1365                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1366                    // AND this isn't trailing punctuation (which attaches directly)
1367                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1368                        current_line.push(' ');
1369                        current_length += 1;
1370                    }
1371                    current_line.push_str(word);
1372                    current_length += word_len;
1373                }
1374            }
1375        } else {
1376            // For non-text elements (code, links, references), treat as atomic units
1377            // These should never be broken across lines
1378            if current_length > 0 && current_length + 1 + element_len > options.line_length {
1379                // Start a new line
1380                lines.push(current_line.trim().to_string());
1381                current_line = element_str;
1382                current_length = element_len;
1383            } else {
1384                // Add element to current line
1385                // Don't add space if the current line ends with an opening bracket/paren
1386                let ends_with_opener =
1387                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1388                if current_length > 0 && !ends_with_opener {
1389                    current_line.push(' ');
1390                    current_length += 1;
1391                }
1392                current_line.push_str(&element_str);
1393                current_length += element_len;
1394            }
1395        }
1396    }
1397
1398    // Don't forget the last line
1399    if !current_line.is_empty() {
1400        lines.push(current_line.trim_end().to_string());
1401    }
1402
1403    lines
1404}
1405
1406/// Reflow markdown content preserving structure
1407pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1408    let lines: Vec<&str> = content.lines().collect();
1409    let mut result = Vec::new();
1410    let mut i = 0;
1411
1412    while i < lines.len() {
1413        let line = lines[i];
1414        let trimmed = line.trim();
1415
1416        // Preserve empty lines
1417        if trimmed.is_empty() {
1418            result.push(String::new());
1419            i += 1;
1420            continue;
1421        }
1422
1423        // Preserve headings as-is
1424        if trimmed.starts_with('#') {
1425            result.push(line.to_string());
1426            i += 1;
1427            continue;
1428        }
1429
1430        // Preserve fenced code blocks
1431        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1432            result.push(line.to_string());
1433            i += 1;
1434            // Copy lines until closing fence
1435            while i < lines.len() {
1436                result.push(lines[i].to_string());
1437                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1438                    i += 1;
1439                    break;
1440                }
1441                i += 1;
1442            }
1443            continue;
1444        }
1445
1446        // Preserve indented code blocks (4+ columns accounting for tab expansion)
1447        if ElementCache::calculate_indentation_width_default(line) >= 4 {
1448            // Collect all consecutive indented lines
1449            result.push(line.to_string());
1450            i += 1;
1451            while i < lines.len() {
1452                let next_line = lines[i];
1453                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1454                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
1455                    result.push(next_line.to_string());
1456                    i += 1;
1457                } else {
1458                    break;
1459                }
1460            }
1461            continue;
1462        }
1463
1464        // Preserve block quotes (but reflow their content)
1465        if trimmed.starts_with('>') {
1466            // find() returns byte position which is correct for str slicing
1467            // The unwrap is safe because we already verified trimmed starts with '>'
1468            let gt_pos = line.find('>').expect("'>' must exist since trimmed.starts_with('>')");
1469            let quote_prefix = line[0..gt_pos + 1].to_string();
1470            let quote_content = &line[quote_prefix.len()..].trim_start();
1471
1472            let reflowed = reflow_line(quote_content, options);
1473            for reflowed_line in reflowed.iter() {
1474                result.push(format!("{quote_prefix} {reflowed_line}"));
1475            }
1476            i += 1;
1477            continue;
1478        }
1479
1480        // Preserve horizontal rules first (before checking for lists)
1481        if is_horizontal_rule(trimmed) {
1482            result.push(line.to_string());
1483            i += 1;
1484            continue;
1485        }
1486
1487        // Preserve lists (but not horizontal rules)
1488        // A valid unordered list marker must be followed by a space (or be alone on line)
1489        // This prevents emphasis markers like "*text*" from being parsed as list items
1490        let is_unordered_list = |s: &str, marker: char| -> bool {
1491            s.starts_with(marker) && !is_horizontal_rule(s) && (s.len() == 1 || s.chars().nth(1) == Some(' '))
1492        };
1493        if is_unordered_list(trimmed, '-')
1494            || is_unordered_list(trimmed, '*')
1495            || is_unordered_list(trimmed, '+')
1496            || is_numbered_list_item(trimmed)
1497        {
1498            // Find the list marker and preserve indentation
1499            let indent = line.len() - line.trim_start().len();
1500            let indent_str = " ".repeat(indent);
1501
1502            // For numbered lists, find the period and the space after it
1503            // For bullet lists, find the marker and the space after it
1504            let mut marker_end = indent;
1505            let mut content_start = indent;
1506
1507            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1508                // Numbered list: find the period
1509                if let Some(period_pos) = line[indent..].find('.') {
1510                    marker_end = indent + period_pos + 1; // Include the period
1511                    content_start = marker_end;
1512                    // Skip any spaces after the period to find content start
1513                    // Use byte-based check since content_start is a byte index
1514                    // This is safe because space is ASCII (single byte)
1515                    while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1516                        content_start += 1;
1517                    }
1518                }
1519            } else {
1520                // Bullet list: marker is single character
1521                marker_end = indent + 1; // Just the marker character
1522                content_start = marker_end;
1523                // Skip any spaces after the marker
1524                // Use byte-based check since content_start is a byte index
1525                // This is safe because space is ASCII (single byte)
1526                while content_start < line.len() && line.as_bytes().get(content_start) == Some(&b' ') {
1527                    content_start += 1;
1528                }
1529            }
1530
1531            let marker = &line[indent..marker_end];
1532
1533            // Collect all content for this list item (including continuation lines)
1534            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1535            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1536            i += 1;
1537
1538            // Collect continuation lines (indented lines that are part of this list item)
1539            while i < lines.len() {
1540                let next_line = lines[i];
1541                let next_trimmed = next_line.trim();
1542
1543                // Stop if we hit an empty line or another list item or special block
1544                if next_trimmed.is_empty()
1545                    || next_trimmed.starts_with('#')
1546                    || next_trimmed.starts_with("```")
1547                    || next_trimmed.starts_with("~~~")
1548                    || next_trimmed.starts_with('>')
1549                    || next_trimmed.starts_with('|')
1550                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1551                    || is_horizontal_rule(next_trimmed)
1552                    || (next_trimmed.starts_with('-')
1553                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1554                    || (next_trimmed.starts_with('*')
1555                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1556                    || (next_trimmed.starts_with('+')
1557                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1558                    || is_numbered_list_item(next_trimmed)
1559                    || is_definition_list_item(next_trimmed)
1560                {
1561                    break;
1562                }
1563
1564                // Check if this line is indented (continuation of list item)
1565                let next_indent = next_line.len() - next_line.trim_start().len();
1566                if next_indent >= content_start {
1567                    // This is a continuation line - add its content
1568                    // Preserve hard breaks while trimming excessive whitespace
1569                    let trimmed_start = next_line.trim_start();
1570                    list_content.push(trim_preserving_hard_break(trimmed_start));
1571                    i += 1;
1572                } else {
1573                    // Not indented enough, not part of this list item
1574                    break;
1575                }
1576            }
1577
1578            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1579            // Hard breaks should prevent joining with the next line
1580            let combined_content = if options.preserve_breaks {
1581                list_content[0].clone()
1582            } else {
1583                // Check if any lines have hard breaks - if so, preserve the structure
1584                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1585                if has_hard_breaks {
1586                    // Don't join lines with hard breaks - keep them separate with newlines
1587                    list_content.join("\n")
1588                } else {
1589                    // No hard breaks, safe to join with spaces
1590                    list_content.join(" ")
1591                }
1592            };
1593
1594            // Calculate the proper indentation for continuation lines
1595            let trimmed_marker = marker;
1596            let continuation_spaces = content_start;
1597
1598            // Adjust line length to account for list marker and space
1599            let prefix_length = indent + trimmed_marker.len() + 1;
1600
1601            // Create adjusted options with reduced line length
1602            let adjusted_options = ReflowOptions {
1603                line_length: options.line_length.saturating_sub(prefix_length),
1604                ..options.clone()
1605            };
1606
1607            let reflowed = reflow_line(&combined_content, &adjusted_options);
1608            for (j, reflowed_line) in reflowed.iter().enumerate() {
1609                if j == 0 {
1610                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1611                } else {
1612                    // Continuation lines aligned with text after marker
1613                    let continuation_indent = " ".repeat(continuation_spaces);
1614                    result.push(format!("{continuation_indent}{reflowed_line}"));
1615                }
1616            }
1617            continue;
1618        }
1619
1620        // Preserve tables
1621        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
1622            result.push(line.to_string());
1623            i += 1;
1624            continue;
1625        }
1626
1627        // Preserve reference definitions
1628        if trimmed.starts_with('[') && line.contains("]:") {
1629            result.push(line.to_string());
1630            i += 1;
1631            continue;
1632        }
1633
1634        // Preserve definition list items (extended markdown)
1635        if is_definition_list_item(trimmed) {
1636            result.push(line.to_string());
1637            i += 1;
1638            continue;
1639        }
1640
1641        // Check if this is a single line that doesn't need processing
1642        let mut is_single_line_paragraph = true;
1643        if i + 1 < lines.len() {
1644            let next_line = lines[i + 1];
1645            let next_trimmed = next_line.trim();
1646            // Check if next line starts a new block
1647            if !next_trimmed.is_empty()
1648                && !next_trimmed.starts_with('#')
1649                && !next_trimmed.starts_with("```")
1650                && !next_trimmed.starts_with("~~~")
1651                && !next_trimmed.starts_with('>')
1652                && !next_trimmed.starts_with('|')
1653                && !(next_trimmed.starts_with('[') && next_line.contains("]:"))
1654                && !is_horizontal_rule(next_trimmed)
1655                && !(next_trimmed.starts_with('-')
1656                    && !is_horizontal_rule(next_trimmed)
1657                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1658                && !(next_trimmed.starts_with('*')
1659                    && !is_horizontal_rule(next_trimmed)
1660                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1661                && !(next_trimmed.starts_with('+')
1662                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1663                && !is_numbered_list_item(next_trimmed)
1664            {
1665                is_single_line_paragraph = false;
1666            }
1667        }
1668
1669        // If it's a single line that fits, just add it as-is
1670        if is_single_line_paragraph && line.chars().count() <= options.line_length {
1671            result.push(line.to_string());
1672            i += 1;
1673            continue;
1674        }
1675
1676        // For regular paragraphs, collect consecutive lines
1677        let mut paragraph_parts = Vec::new();
1678        let mut current_part = vec![line];
1679        i += 1;
1680
1681        // If preserve_breaks is true, treat each line separately
1682        if options.preserve_breaks {
1683            // Don't collect consecutive lines - just reflow this single line
1684            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
1685                Some("\\")
1686            } else if line.ends_with("  ") {
1687                Some("  ")
1688            } else {
1689                None
1690            };
1691            let reflowed = reflow_line(line, options);
1692
1693            // Preserve hard breaks (two trailing spaces or backslash)
1694            if let Some(break_marker) = hard_break_type {
1695                if !reflowed.is_empty() {
1696                    let mut reflowed_with_break = reflowed;
1697                    let last_idx = reflowed_with_break.len() - 1;
1698                    if !has_hard_break(&reflowed_with_break[last_idx]) {
1699                        reflowed_with_break[last_idx].push_str(break_marker);
1700                    }
1701                    result.extend(reflowed_with_break);
1702                }
1703            } else {
1704                result.extend(reflowed);
1705            }
1706        } else {
1707            // Original behavior: collect consecutive lines into a paragraph
1708            while i < lines.len() {
1709                let prev_line = if !current_part.is_empty() {
1710                    current_part.last().unwrap()
1711                } else {
1712                    ""
1713                };
1714                let next_line = lines[i];
1715                let next_trimmed = next_line.trim();
1716
1717                // Stop at empty lines or special blocks
1718                if next_trimmed.is_empty()
1719                    || next_trimmed.starts_with('#')
1720                    || next_trimmed.starts_with("```")
1721                    || next_trimmed.starts_with("~~~")
1722                    || next_trimmed.starts_with('>')
1723                    || next_trimmed.starts_with('|')
1724                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1725                    || is_horizontal_rule(next_trimmed)
1726                    || (next_trimmed.starts_with('-')
1727                        && !is_horizontal_rule(next_trimmed)
1728                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1729                    || (next_trimmed.starts_with('*')
1730                        && !is_horizontal_rule(next_trimmed)
1731                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1732                    || (next_trimmed.starts_with('+')
1733                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1734                    || is_numbered_list_item(next_trimmed)
1735                    || is_definition_list_item(next_trimmed)
1736                {
1737                    break;
1738                }
1739
1740                // Check if previous line ends with hard break (two spaces or backslash)
1741                // or is a complete sentence in sentence_per_line mode
1742                let prev_trimmed = prev_line.trim();
1743                let abbreviations = get_abbreviations(&options.abbreviations);
1744                let ends_with_sentence = (prev_trimmed.ends_with('.')
1745                    || prev_trimmed.ends_with('!')
1746                    || prev_trimmed.ends_with('?')
1747                    || prev_trimmed.ends_with(".*")
1748                    || prev_trimmed.ends_with("!*")
1749                    || prev_trimmed.ends_with("?*")
1750                    || prev_trimmed.ends_with("._")
1751                    || prev_trimmed.ends_with("!_")
1752                    || prev_trimmed.ends_with("?_")
1753                    // Quote-terminated sentences (straight and curly quotes)
1754                    || prev_trimmed.ends_with(".\"")
1755                    || prev_trimmed.ends_with("!\"")
1756                    || prev_trimmed.ends_with("?\"")
1757                    || prev_trimmed.ends_with(".'")
1758                    || prev_trimmed.ends_with("!'")
1759                    || prev_trimmed.ends_with("?'")
1760                    || prev_trimmed.ends_with(".\u{201D}")
1761                    || prev_trimmed.ends_with("!\u{201D}")
1762                    || prev_trimmed.ends_with("?\u{201D}")
1763                    || prev_trimmed.ends_with(".\u{2019}")
1764                    || prev_trimmed.ends_with("!\u{2019}")
1765                    || prev_trimmed.ends_with("?\u{2019}"))
1766                    && !text_ends_with_abbreviation(
1767                        prev_trimmed.trim_end_matches(['*', '_', '"', '\'', '\u{201D}', '\u{2019}']),
1768                        &abbreviations,
1769                    );
1770
1771                if has_hard_break(prev_line) || (options.sentence_per_line && ends_with_sentence) {
1772                    // Start a new part after hard break or complete sentence
1773                    paragraph_parts.push(current_part.join(" "));
1774                    current_part = vec![next_line];
1775                } else {
1776                    current_part.push(next_line);
1777                }
1778                i += 1;
1779            }
1780
1781            // Add the last part
1782            if !current_part.is_empty() {
1783                if current_part.len() == 1 {
1784                    // Single line, don't add trailing space
1785                    paragraph_parts.push(current_part[0].to_string());
1786                } else {
1787                    paragraph_parts.push(current_part.join(" "));
1788                }
1789            }
1790
1791            // Reflow each part separately, preserving hard breaks
1792            for (j, part) in paragraph_parts.iter().enumerate() {
1793                let reflowed = reflow_line(part, options);
1794                result.extend(reflowed);
1795
1796                // Preserve hard break by ensuring last line of part ends with hard break marker
1797                // Use two spaces as the default hard break format for reflows
1798                // But don't add hard breaks in sentence_per_line mode - lines are already separate
1799                if j < paragraph_parts.len() - 1 && !result.is_empty() && !options.sentence_per_line {
1800                    let last_idx = result.len() - 1;
1801                    if !has_hard_break(&result[last_idx]) {
1802                        result[last_idx].push_str("  ");
1803                    }
1804                }
1805            }
1806        }
1807    }
1808
1809    // Preserve trailing newline if the original content had one
1810    let result_text = result.join("\n");
1811    if content.ends_with('\n') && !result_text.ends_with('\n') {
1812        format!("{result_text}\n")
1813    } else {
1814        result_text
1815    }
1816}
1817
/// Result of reflowing one paragraph: the byte range it occupied in the
/// original document and the text that should replace that range.
#[derive(Clone, Debug)]
pub struct ParagraphReflow {
    /// Byte offset in the original content where the paragraph's first line begins
    pub start_byte: usize,
    /// Byte offset in the original content just past the paragraph's last line,
    /// including that line's terminator when it has one
    pub end_byte: usize,
    /// Replacement text for the bytes in `start_byte..end_byte`
    pub reflowed_text: String,
}
1828
1829/// Reflow a single paragraph at the specified line number
1830///
1831/// This function finds the paragraph containing the given line number,
1832/// reflows it according to the specified line length, and returns
1833/// information about the paragraph location and its reflowed text.
1834///
1835/// # Arguments
1836///
1837/// * `content` - The full document content
1838/// * `line_number` - The 1-based line number within the paragraph to reflow
1839/// * `line_length` - The target line length for reflowing
1840///
1841/// # Returns
1842///
1843/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
1844/// or `None` if the line number is out of bounds or the content at that
1845/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
1846pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
1847    if line_number == 0 {
1848        return None;
1849    }
1850
1851    let lines: Vec<&str> = content.lines().collect();
1852
1853    // Check if line number is valid (1-based)
1854    if line_number > lines.len() {
1855        return None;
1856    }
1857
1858    let target_idx = line_number - 1; // Convert to 0-based
1859    let target_line = lines[target_idx];
1860    let trimmed = target_line.trim();
1861
1862    // Don't reflow special blocks
1863    if trimmed.is_empty()
1864        || trimmed.starts_with('#')
1865        || trimmed.starts_with("```")
1866        || trimmed.starts_with("~~~")
1867        || ElementCache::calculate_indentation_width_default(target_line) >= 4
1868        || trimmed.starts_with('>')
1869        || crate::utils::table_utils::TableUtils::is_potential_table_row(target_line) // Tables
1870        || (trimmed.starts_with('[') && target_line.contains("]:")) // Reference definitions
1871        || is_horizontal_rule(trimmed)
1872        || ((trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+'))
1873            && !is_horizontal_rule(trimmed)
1874            && (trimmed.len() == 1 || trimmed.chars().nth(1) == Some(' ')))
1875        || is_numbered_list_item(trimmed)
1876        || is_definition_list_item(trimmed)
1877    {
1878        return None;
1879    }
1880
1881    // Find paragraph start - scan backward until blank line or special block
1882    let mut para_start = target_idx;
1883    while para_start > 0 {
1884        let prev_idx = para_start - 1;
1885        let prev_line = lines[prev_idx];
1886        let prev_trimmed = prev_line.trim();
1887
1888        // Stop at blank line or special blocks
1889        if prev_trimmed.is_empty()
1890            || prev_trimmed.starts_with('#')
1891            || prev_trimmed.starts_with("```")
1892            || prev_trimmed.starts_with("~~~")
1893            || ElementCache::calculate_indentation_width_default(prev_line) >= 4
1894            || prev_trimmed.starts_with('>')
1895            || crate::utils::table_utils::TableUtils::is_potential_table_row(prev_line)
1896            || (prev_trimmed.starts_with('[') && prev_line.contains("]:"))
1897            || is_horizontal_rule(prev_trimmed)
1898            || ((prev_trimmed.starts_with('-') || prev_trimmed.starts_with('*') || prev_trimmed.starts_with('+'))
1899                && !is_horizontal_rule(prev_trimmed)
1900                && (prev_trimmed.len() == 1 || prev_trimmed.chars().nth(1) == Some(' ')))
1901            || is_numbered_list_item(prev_trimmed)
1902            || is_definition_list_item(prev_trimmed)
1903        {
1904            break;
1905        }
1906
1907        para_start = prev_idx;
1908    }
1909
1910    // Find paragraph end - scan forward until blank line or special block
1911    let mut para_end = target_idx;
1912    while para_end + 1 < lines.len() {
1913        let next_idx = para_end + 1;
1914        let next_line = lines[next_idx];
1915        let next_trimmed = next_line.trim();
1916
1917        // Stop at blank line or special blocks
1918        if next_trimmed.is_empty()
1919            || next_trimmed.starts_with('#')
1920            || next_trimmed.starts_with("```")
1921            || next_trimmed.starts_with("~~~")
1922            || ElementCache::calculate_indentation_width_default(next_line) >= 4
1923            || next_trimmed.starts_with('>')
1924            || crate::utils::table_utils::TableUtils::is_potential_table_row(next_line)
1925            || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1926            || is_horizontal_rule(next_trimmed)
1927            || ((next_trimmed.starts_with('-') || next_trimmed.starts_with('*') || next_trimmed.starts_with('+'))
1928                && !is_horizontal_rule(next_trimmed)
1929                && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1930            || is_numbered_list_item(next_trimmed)
1931            || is_definition_list_item(next_trimmed)
1932        {
1933            break;
1934        }
1935
1936        para_end = next_idx;
1937    }
1938
1939    // Extract paragraph lines
1940    let paragraph_lines = &lines[para_start..=para_end];
1941
1942    // Calculate byte offsets
1943    let mut start_byte = 0;
1944    for line in lines.iter().take(para_start) {
1945        start_byte += line.len() + 1; // +1 for newline
1946    }
1947
1948    let mut end_byte = start_byte;
1949    for line in paragraph_lines.iter() {
1950        end_byte += line.len() + 1; // +1 for newline
1951    }
1952
1953    // Track whether the byte range includes a trailing newline
1954    // (it doesn't if this is the last line and the file doesn't end with newline)
1955    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
1956
1957    // Adjust end_byte if the last line doesn't have a newline
1958    if !includes_trailing_newline {
1959        end_byte -= 1;
1960    }
1961
1962    // Join paragraph lines and reflow
1963    let paragraph_text = paragraph_lines.join("\n");
1964
1965    // Create reflow options
1966    let options = ReflowOptions {
1967        line_length,
1968        break_on_sentences: true,
1969        preserve_breaks: false,
1970        sentence_per_line: false,
1971        abbreviations: None,
1972    };
1973
1974    // Reflow the paragraph using reflow_markdown to handle it properly
1975    let reflowed = reflow_markdown(&paragraph_text, &options);
1976
1977    // Ensure reflowed text matches whether the byte range includes a trailing newline
1978    // This is critical: if the range includes a newline, the replacement must too,
1979    // otherwise the next line will get appended to the reflowed paragraph
1980    let reflowed_text = if includes_trailing_newline {
1981        // Range includes newline - ensure reflowed text has one
1982        if reflowed.ends_with('\n') {
1983            reflowed
1984        } else {
1985            format!("{reflowed}\n")
1986        }
1987    } else {
1988        // Range doesn't include newline - ensure reflowed text doesn't have one
1989        if reflowed.ends_with('\n') {
1990            reflowed.trim_end_matches('\n').to_string()
1991        } else {
1992            reflowed
1993        }
1994    };
1995
1996    Some(ParagraphReflow {
1997        start_byte,
1998        end_byte,
1999        reflowed_text,
2000    })
2001}
2002
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks text_ends_with_abbreviation() against the default abbreviation set.
    ///
    /// This is the only test kept inline; all other tests (public API,
    /// integration tests) are in tests/utils/text_reflow_test.rs
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognized
        let recognized = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognized {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Must not be recognized
        let rejected = [
            "etc.",       // not in the built-in list (etc doesn't always have period)
            "paradigms.", // ordinary words ending a sentence
            "programs.",
            "items.",
            "systems.",
            "Dr?",        // question mark, not period
            "Mr!",        // exclamation, not period
            "paradigms?", // question mark
            "word",       // no punctuation
            "",           // empty string
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation"
            );
        }
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs