rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::is_definition_list_item;
7use crate::utils::regex_cache::{
8    DISPLAY_MATH_REGEX, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
9    INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX, LINKED_IMAGE_INLINE_INLINE,
10    LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF, REF_IMAGE_REGEX, REF_LINK_REGEX,
11    SHORTCUT_REF_REGEX, STRIKETHROUGH_FANCY_REGEX, WIKI_LINK_REGEX,
12};
13use std::collections::HashSet;
14
/// Options for reflowing text.
///
/// Derives `Debug` so a configuration can be logged or shown in test
/// failure output, and `Clone` so callers can tweak a copy.
#[derive(Debug, Clone)]
pub struct ReflowOptions {
    /// Target line length
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line
    pub sentence_per_line: bool,
    /// Custom abbreviations for sentence detection.
    ///
    /// Periods are optional - both "Dr" and "Dr." work the same.
    /// Custom abbreviations are always added to the built-in defaults;
    /// `None` means "built-in defaults only".
    pub abbreviations: Option<Vec<String>>,
}
31
32impl Default for ReflowOptions {
33    fn default() -> Self {
34        Self {
35            line_length: 80,
36            break_on_sentences: true,
37            preserve_breaks: false,
38            sentence_per_line: false,
39            abbreviations: None,
40        }
41    }
42}
43
/// Build the abbreviation set used for sentence detection.
///
/// The built-in defaults are always present and any `custom` entries are
/// merged on top of them. Every entry is stored lowercase so lookups can be
/// case-insensitive. Custom entries have trailing periods stripped ("Dr" and
/// "Dr." are equivalent); entries that are empty after stripping are ignored.
fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
    // The built-in list is deliberately small. An entry belongs here only if it
    //   1. conventionally ALWAYS carries a period in standard writing, and
    //   2. is followed by something (a name, an example) rather than ending
    //      a sentence.
    //
    // Deliberately excluded:
    //   - words that do not typically take periods (vs, etc)
    //   - abbreviations that can end sentences (Inc., Ph.D., U.S.)
    const BUILT_IN: [&str; 9] = [
        // Titles - always have a period, always followed by a name
        "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr",
        // Latin - always written with periods, introduce examples/references
        "i.e", "e.g",
    ];

    // Normalize user-supplied entries the same way lookups expect them.
    let extras = custom
        .iter()
        .flatten()
        .map(|entry| entry.trim_end_matches('.').to_lowercase())
        .filter(|entry| !entry.is_empty());

    BUILT_IN
        .iter()
        .map(|entry| entry.to_lowercase())
        .chain(extras)
        .collect()
}
78
/// Report whether `text` finishes with a known abbreviation plus a period.
///
/// Only a trailing period qualifies; `!` and `?` never mark an abbreviation.
/// The comparison is made on the whole last whitespace-separated word, so a
/// word that merely ends in abbreviation-like letters (e.g. "paradigms"
/// ending in "ms") is not mistaken for one.
///
/// `abbreviations` must already be lowercase; the last word is lowercased
/// before the lookup.
///
/// Examples:
///   - "Dr." -> true (abbreviation)
///   - "Dr?" -> false (question, not abbreviation)
///   - "paradigms." -> false (not in abbreviation list)
///   - "paradigms?" -> false (question mark, not abbreviation)
///
/// See: Issue #150
fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Abbreviations require a period, so anything else is rejected up front.
    if !text.ends_with('.') {
        return false;
    }

    // Drop the trailing period(s), then look only at the final word.
    let stem = text.trim_end_matches('.');
    match stem.split_whitespace().next_back() {
        Some(word) => abbreviations.contains(word.to_lowercase().as_str()),
        None => false,
    }
}
112
113/// Detect if a character position is a sentence boundary
114/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
115fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
116    let chars: Vec<char> = text.chars().collect();
117
118    if pos + 1 >= chars.len() {
119        return false;
120    }
121
122    // Check for sentence-ending punctuation
123    let c = chars[pos];
124    if c != '.' && c != '!' && c != '?' {
125        return false;
126    }
127
128    // Must be followed by at least one space
129    if chars[pos + 1] != ' ' {
130        return false;
131    }
132
133    // Skip all whitespace after the punctuation to find the start of the next sentence
134    let mut next_char_pos = pos + 2;
135    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
136        next_char_pos += 1;
137    }
138
139    // Check if we reached the end of the string
140    if next_char_pos >= chars.len() {
141        return false;
142    }
143
144    // Next character after space(s) must be uppercase (new sentence indicator)
145    if !chars[next_char_pos].is_uppercase() {
146        return false;
147    }
148
149    // Look back to check for common abbreviations (only applies to periods)
150    if pos > 0 && c == '.' {
151        // Check if the text up to and including this period ends with an abbreviation
152        // Note: text[..=pos] includes the character at pos (the period)
153        if text_ends_with_abbreviation(&text[..=pos], abbreviations) {
154            return false;
155        }
156
157        // Check for decimal numbers (e.g., "3.14")
158        // Make sure to check if next_char_pos is within bounds
159        if chars[pos - 1].is_numeric() && next_char_pos < chars.len() && chars[next_char_pos].is_numeric() {
160            return false;
161        }
162    }
163    true
164}
165
166/// Split text into sentences
167pub fn split_into_sentences(text: &str) -> Vec<String> {
168    split_into_sentences_custom(text, &None)
169}
170
171/// Split text into sentences with custom abbreviations
172pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
173    let abbreviations = get_abbreviations(custom_abbreviations);
174    split_into_sentences_with_set(text, &abbreviations)
175}
176
177/// Internal function to split text into sentences with a pre-computed abbreviations set
178/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
179fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
180    let mut sentences = Vec::new();
181    let mut current_sentence = String::new();
182    let mut chars = text.chars().peekable();
183    let mut pos = 0;
184
185    while let Some(c) = chars.next() {
186        current_sentence.push(c);
187
188        if is_sentence_boundary(text, pos, abbreviations) {
189            // Include the space after sentence if it exists
190            if chars.peek() == Some(&' ') {
191                chars.next();
192                pos += 1;
193            }
194            sentences.push(current_sentence.trim().to_string());
195            current_sentence.clear();
196        }
197
198        pos += 1;
199    }
200
201    // Add any remaining text as the last sentence
202    if !current_sentence.trim().is_empty() {
203        sentences.push(current_sentence.trim().to_string());
204    }
205    sentences
206}
207
/// Check if a line is a horizontal rule (---, ___, ***).
///
/// The line must start with `-`, `_` or `*`, consist solely of that same
/// marker character and spaces, and contain at least three markers. Leading
/// whitespace is not stripped here, so an indented rule is rejected.
fn is_horizontal_rule(line: &str) -> bool {
    // Fewer than three bytes cannot hold three markers.
    if line.len() < 3 {
        return false;
    }

    // The first character decides which marker the whole line must use.
    let marker = match line.chars().next() {
        Some(ch @ ('-' | '_' | '*')) => ch,
        _ => return false,
    };

    // Single pass: count markers, tolerate spaces, reject anything else.
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
236
/// Check if a line is a numbered list item (e.g., "1. ", "10. ").
///
/// The line must begin with one or more ASCII digits, followed by a period,
/// followed by either a space or the end of the line. Only ASCII `0`-`9`
/// count as digits: CommonMark ordered-list markers are arabic digits, so
/// other Unicode numerics such as `½` or `٣` do not start a list item.
fn is_numbered_list_item(line: &str) -> bool {
    let bytes = line.as_bytes();

    // Length of the leading run of ASCII digits; there must be at least one.
    let digits = line.bytes().take_while(u8::is_ascii_digit).count();
    if digits == 0 {
        return false;
    }

    // The digits must be followed by a period...
    if bytes.get(digits) != Some(&b'.') {
        return false;
    }

    // ...and after the period there must be a space or nothing at all.
    matches!(bytes.get(digits + 1), None | Some(b' '))
}
259
/// Check if a line ends with a hard break (either two spaces or backslash).
///
/// CommonMark supports two formats for hard line breaks:
/// 1. Two or more trailing spaces
/// 2. A backslash at the end of the line
///
/// A single trailing `\r` (from a CRLF line ending) is ignored before the
/// check.
fn has_hard_break(line: &str) -> bool {
    let body = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };
    body.ends_with('\\') || body.ends_with("  ")
}
269
/// Trim trailing whitespace while preserving hard breaks.
///
/// Hard breaks in Markdown can be indicated by:
/// 1. Two trailing spaces before a newline (traditional)
/// 2. A backslash at the end of the line (mdformat style)
///
/// Behaviour, after dropping one trailing `\r` (CRLF input):
/// - a line ending in a backslash is returned untouched;
/// - a line ending in two or more spaces is trimmed and given back exactly
///   two trailing spaces, unless it is entirely whitespace, in which case the
///   result is empty;
/// - any other line simply has its trailing whitespace removed.
fn trim_preserving_hard_break(s: &str) -> String {
    // Strip the CR of a CRLF ending first so Windows files behave the same.
    let s = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as-is.
    if s.ends_with('\\') {
        return s.to_owned();
    }

    let content = s.trim_end();
    if s.ends_with("  ") && !content.is_empty() {
        // Traditional hard break: normalise to exactly two trailing spaces.
        let mut out = String::with_capacity(content.len() + 2);
        out.push_str(content);
        out.push_str("  ");
        out
    } else {
        // No hard break (or an all-whitespace line): plain trailing trim.
        content.to_owned()
    }
}
300
301pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
302    // For sentence-per-line mode, always process regardless of length
303    if options.sentence_per_line {
304        let elements = parse_markdown_elements(line);
305        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
306    }
307
308    // Quick check: if line is already short enough, return as-is
309    if line.chars().count() <= options.line_length {
310        return vec![line.to_string()];
311    }
312
313    // Parse the markdown to identify elements
314    let elements = parse_markdown_elements(line);
315
316    // Reflow the elements into lines
317    reflow_elements(&elements, options)
318}
319
/// Where the image inside a linked-image construct gets its source from.
#[derive(Clone, Debug)]
enum LinkedImageSource {
    /// URL written inline, as in `![alt](url)`
    Inline(String),
    /// Reference label, as in `![alt][ref]`
    Reference(String),
}
328
/// Where the link wrapped around an image in a linked-image construct points.
#[derive(Clone, Debug)]
enum LinkedImageTarget {
    /// URL written inline, as in `](url)`
    Inline(String),
    /// Reference label, as in `][ref]`
    Reference(String),
}
337
338/// Represents a piece of content in the markdown
339#[derive(Debug, Clone)]
340enum Element {
341    /// Plain text that can be wrapped
342    Text(String),
343    /// A complete markdown inline link [text](url)
344    Link { text: String, url: String },
345    /// A complete markdown reference link [text][ref]
346    ReferenceLink { text: String, reference: String },
347    /// A complete markdown empty reference link [text][]
348    EmptyReferenceLink { text: String },
349    /// A complete markdown shortcut reference link [ref]
350    ShortcutReference { reference: String },
351    /// A complete markdown inline image ![alt](url)
352    InlineImage { alt: String, url: String },
353    /// A complete markdown reference image ![alt][ref]
354    ReferenceImage { alt: String, reference: String },
355    /// A complete markdown empty reference image ![alt][]
356    EmptyReferenceImage { alt: String },
357    /// A clickable image badge in any of 4 forms:
358    /// - [![alt](img-url)](link-url)
359    /// - [![alt][img-ref]](link-url)
360    /// - [![alt](img-url)][link-ref]
361    /// - [![alt][img-ref]][link-ref]
362    LinkedImage {
363        alt: String,
364        img_source: LinkedImageSource,
365        link_target: LinkedImageTarget,
366    },
367    /// Footnote reference [^note]
368    FootnoteReference { note: String },
369    /// Strikethrough text ~~text~~
370    Strikethrough(String),
371    /// Wiki-style link [[wiki]] or [[wiki|text]]
372    WikiLink(String),
373    /// Inline math $math$
374    InlineMath(String),
375    /// Display math $$math$$
376    DisplayMath(String),
377    /// Emoji shortcode :emoji:
378    EmojiShortcode(String),
379    /// HTML tag <tag> or </tag> or <tag/>
380    HtmlTag(String),
381    /// HTML entity &nbsp; or &#123;
382    HtmlEntity(String),
383    /// Inline code `code`
384    Code(String),
385    /// Bold text **text**
386    Bold(String),
387    /// Italic text *text*
388    Italic(String),
389}
390
391impl std::fmt::Display for Element {
392    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
393        match self {
394            Element::Text(s) => write!(f, "{s}"),
395            Element::Link { text, url } => write!(f, "[{text}]({url})"),
396            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
397            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
398            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
399            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
400            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
401            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
402            Element::LinkedImage {
403                alt,
404                img_source,
405                link_target,
406            } => {
407                // Build the image part: ![alt](url) or ![alt][ref]
408                let img_part = match img_source {
409                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
410                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
411                };
412                // Build the link part: (url) or [ref]
413                match link_target {
414                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
415                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
416                }
417            }
418            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
419            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
420            Element::WikiLink(s) => write!(f, "[[{s}]]"),
421            Element::InlineMath(s) => write!(f, "${s}$"),
422            Element::DisplayMath(s) => write!(f, "$${s}$$"),
423            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
424            Element::HtmlTag(s) => write!(f, "{s}"),
425            Element::HtmlEntity(s) => write!(f, "{s}"),
426            Element::Code(s) => write!(f, "`{s}`"),
427            Element::Bold(s) => write!(f, "**{s}**"),
428            Element::Italic(s) => write!(f, "*{s}*"),
429        }
430    }
431}
432
433impl Element {
434    fn len(&self) -> usize {
435        match self {
436            Element::Text(s) => s.chars().count(),
437            Element::Link { text, url } => text.chars().count() + url.chars().count() + 4, // [text](url)
438            Element::ReferenceLink { text, reference } => text.chars().count() + reference.chars().count() + 4, // [text][ref]
439            Element::EmptyReferenceLink { text } => text.chars().count() + 4, // [text][]
440            Element::ShortcutReference { reference } => reference.chars().count() + 2, // [ref]
441            Element::InlineImage { alt, url } => alt.chars().count() + url.chars().count() + 5, // ![alt](url)
442            Element::ReferenceImage { alt, reference } => alt.chars().count() + reference.chars().count() + 5, // ![alt][ref]
443            Element::EmptyReferenceImage { alt } => alt.chars().count() + 5, // ![alt][]
444            Element::LinkedImage {
445                alt,
446                img_source,
447                link_target,
448            } => {
449                // Calculate length based on variant
450                // Base: [ + ![alt] + ] = 4 chars for outer brackets and !
451                let alt_len = alt.chars().count();
452                let img_len = match img_source {
453                    LinkedImageSource::Inline(url) => url.chars().count() + 2, // (url)
454                    LinkedImageSource::Reference(r) => r.chars().count() + 2,  // [ref]
455                };
456                let link_len = match link_target {
457                    LinkedImageTarget::Inline(url) => url.chars().count() + 2, // (url)
458                    LinkedImageTarget::Reference(r) => r.chars().count() + 2,  // [ref]
459                };
460                // [![alt](img)](link) = [ + ! + [ + alt + ] + (img) + ] + (link)
461                //                     = 1 + 1 + 1 + alt + 1 + img_len + 1 + link_len = 5 + alt + img + link
462                5 + alt_len + img_len + link_len
463            }
464            Element::FootnoteReference { note } => note.chars().count() + 3, // [^note]
465            Element::Strikethrough(s) => s.chars().count() + 4,              // ~~text~~
466            Element::WikiLink(s) => s.chars().count() + 4,                   // [[wiki]]
467            Element::InlineMath(s) => s.chars().count() + 2,                 // $math$
468            Element::DisplayMath(s) => s.chars().count() + 4,                // $$math$$
469            Element::EmojiShortcode(s) => s.chars().count() + 2,             // :emoji:
470            Element::HtmlTag(s) => s.chars().count(),                        // <tag> - already includes brackets
471            Element::HtmlEntity(s) => s.chars().count(),                     // &nbsp; - already complete
472            Element::Code(s) => s.chars().count() + 2,                       // `code`
473            Element::Bold(s) => s.chars().count() + 4,                       // **text**
474            Element::Italic(s) => s.chars().count() + 2,                     // *text*
475        }
476    }
477}
478
479/// Parse markdown elements from text preserving the raw syntax
480///
481/// Detection order is critical:
482/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
483/// 2. Inline images ![alt](url) - before links to handle ! prefix
484/// 3. Reference images ![alt][ref] - before reference links
485/// 4. Inline links [text](url) - before reference links
486/// 5. Reference links [text][ref] - before shortcut references
487/// 6. Shortcut reference links [ref] - detected last to avoid false positives
488/// 7. Other elements (code, bold, italic, etc.) - processed normally
489fn parse_markdown_elements(text: &str) -> Vec<Element> {
490    let mut elements = Vec::new();
491    let mut remaining = text;
492
493    while !remaining.is_empty() {
494        // Find the earliest occurrence of any markdown pattern
495        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
496
497        // Check for linked images FIRST (all 4 variants)
498        // Quick literal check: only run expensive regexes if we might have a linked image
499        // Pattern starts with "[!" so check for that first
500        if remaining.contains("[!") {
501            // Pattern 1: [![alt](img)](link) - inline image in inline link
502            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
503                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
504            {
505                earliest_match = Some((m.start(), "linked_image_ii", m));
506            }
507
508            // Pattern 2: [![alt][ref]](link) - reference image in inline link
509            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
510                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
511            {
512                earliest_match = Some((m.start(), "linked_image_ri", m));
513            }
514
515            // Pattern 3: [![alt](img)][ref] - inline image in reference link
516            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
517                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
518            {
519                earliest_match = Some((m.start(), "linked_image_ir", m));
520            }
521
522            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
523            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
524                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
525            {
526                earliest_match = Some((m.start(), "linked_image_rr", m));
527            }
528        }
529
530        // Check for images (they start with ! so should be detected before links)
531        // Inline images - ![alt](url)
532        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
533            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
534        {
535            earliest_match = Some((m.start(), "inline_image", m));
536        }
537
538        // Reference images - ![alt][ref]
539        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
540            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
541        {
542            earliest_match = Some((m.start(), "ref_image", m));
543        }
544
545        // Check for footnote references - [^note]
546        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
547            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
548        {
549            earliest_match = Some((m.start(), "footnote_ref", m));
550        }
551
552        // Check for inline links - [text](url)
553        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
554            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
555        {
556            earliest_match = Some((m.start(), "inline_link", m));
557        }
558
559        // Check for reference links - [text][ref]
560        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
561            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
562        {
563            earliest_match = Some((m.start(), "ref_link", m));
564        }
565
566        // Check for shortcut reference links - [ref]
567        // Only check if we haven't found an earlier pattern that would conflict
568        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
569            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
570        {
571            earliest_match = Some((m.start(), "shortcut_ref", m));
572        }
573
574        // Check for wiki-style links - [[wiki]]
575        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
576            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
577        {
578            earliest_match = Some((m.start(), "wiki_link", m));
579        }
580
581        // Check for display math first (before inline) - $$math$$
582        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
583            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
584        {
585            earliest_match = Some((m.start(), "display_math", m));
586        }
587
588        // Check for inline math - $math$
589        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
590            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
591        {
592            earliest_match = Some((m.start(), "inline_math", m));
593        }
594
595        // Check for strikethrough - ~~text~~
596        if let Ok(Some(m)) = STRIKETHROUGH_FANCY_REGEX.find(remaining)
597            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
598        {
599            earliest_match = Some((m.start(), "strikethrough", m));
600        }
601
602        // Check for emoji shortcodes - :emoji:
603        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
604            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
605        {
606            earliest_match = Some((m.start(), "emoji", m));
607        }
608
609        // Check for HTML entities - &nbsp; etc
610        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
611            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
612        {
613            earliest_match = Some((m.start(), "html_entity", m));
614        }
615
616        // Check for HTML tags - <tag> </tag> <tag/>
617        // But exclude autolinks like <https://...> or <mailto:...>
618        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
619            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
620        {
621            // Check if this is an autolink (starts with protocol or mailto:)
622            let matched_text = &remaining[m.start()..m.end()];
623            let is_autolink = matched_text.starts_with("<http://")
624                || matched_text.starts_with("<https://")
625                || matched_text.starts_with("<mailto:")
626                || matched_text.starts_with("<ftp://")
627                || matched_text.starts_with("<ftps://");
628
629            if !is_autolink {
630                earliest_match = Some((m.start(), "html_tag", m));
631            }
632        }
633
634        // Find earliest non-link special characters
635        let mut next_special = remaining.len();
636        let mut special_type = "";
637
638        if let Some(pos) = remaining.find('`')
639            && pos < next_special
640        {
641            next_special = pos;
642            special_type = "code";
643        }
644        if let Some(pos) = remaining.find("**")
645            && pos < next_special
646        {
647            next_special = pos;
648            special_type = "bold";
649        }
650        if let Some(pos) = remaining.find('*')
651            && pos < next_special
652            && !remaining[pos..].starts_with("**")
653        {
654            next_special = pos;
655            special_type = "italic";
656        }
657
658        // Determine which pattern to process first
659        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
660            pos < next_special
661        } else {
662            false
663        };
664
665        if should_process_markdown_link {
666            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
667
668            // Add any text before the match
669            if pos > 0 {
670                elements.push(Element::Text(remaining[..pos].to_string()));
671            }
672
673            // Process the matched pattern
674            match pattern_type {
675                // Pattern 1: [![alt](img)](link) - inline image in inline link
676                "linked_image_ii" => {
677                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
678                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
679                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
680                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
681                        elements.push(Element::LinkedImage {
682                            alt: alt.to_string(),
683                            img_source: LinkedImageSource::Inline(img_url.to_string()),
684                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
685                        });
686                        remaining = &remaining[match_obj.end()..];
687                    } else {
688                        elements.push(Element::Text("[".to_string()));
689                        remaining = &remaining[1..];
690                    }
691                }
692                // Pattern 2: [![alt][ref]](link) - reference image in inline link
693                "linked_image_ri" => {
694                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
695                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
696                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
697                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
698                        elements.push(Element::LinkedImage {
699                            alt: alt.to_string(),
700                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
701                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
702                        });
703                        remaining = &remaining[match_obj.end()..];
704                    } else {
705                        elements.push(Element::Text("[".to_string()));
706                        remaining = &remaining[1..];
707                    }
708                }
709                // Pattern 3: [![alt](img)][ref] - inline image in reference link
710                "linked_image_ir" => {
711                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
712                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
713                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
714                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
715                        elements.push(Element::LinkedImage {
716                            alt: alt.to_string(),
717                            img_source: LinkedImageSource::Inline(img_url.to_string()),
718                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
719                        });
720                        remaining = &remaining[match_obj.end()..];
721                    } else {
722                        elements.push(Element::Text("[".to_string()));
723                        remaining = &remaining[1..];
724                    }
725                }
726                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
727                "linked_image_rr" => {
728                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
729                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
730                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
731                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
732                        elements.push(Element::LinkedImage {
733                            alt: alt.to_string(),
734                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
735                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
736                        });
737                        remaining = &remaining[match_obj.end()..];
738                    } else {
739                        elements.push(Element::Text("[".to_string()));
740                        remaining = &remaining[1..];
741                    }
742                }
743                "inline_image" => {
744                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
745                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
746                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
747                        elements.push(Element::InlineImage {
748                            alt: alt.to_string(),
749                            url: url.to_string(),
750                        });
751                        remaining = &remaining[match_obj.end()..];
752                    } else {
753                        elements.push(Element::Text("!".to_string()));
754                        remaining = &remaining[1..];
755                    }
756                }
757                "ref_image" => {
758                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
759                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
760                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
761
762                        if reference.is_empty() {
763                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
764                        } else {
765                            elements.push(Element::ReferenceImage {
766                                alt: alt.to_string(),
767                                reference: reference.to_string(),
768                            });
769                        }
770                        remaining = &remaining[match_obj.end()..];
771                    } else {
772                        elements.push(Element::Text("!".to_string()));
773                        remaining = &remaining[1..];
774                    }
775                }
776                "footnote_ref" => {
777                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
778                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
779                        elements.push(Element::FootnoteReference { note: note.to_string() });
780                        remaining = &remaining[match_obj.end()..];
781                    } else {
782                        elements.push(Element::Text("[".to_string()));
783                        remaining = &remaining[1..];
784                    }
785                }
786                "inline_link" => {
787                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
788                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
789                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
790                        elements.push(Element::Link {
791                            text: text.to_string(),
792                            url: url.to_string(),
793                        });
794                        remaining = &remaining[match_obj.end()..];
795                    } else {
796                        // Fallback - shouldn't happen
797                        elements.push(Element::Text("[".to_string()));
798                        remaining = &remaining[1..];
799                    }
800                }
801                "ref_link" => {
802                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
803                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
804                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
805
806                        if reference.is_empty() {
807                            // Empty reference link [text][]
808                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
809                        } else {
810                            // Regular reference link [text][ref]
811                            elements.push(Element::ReferenceLink {
812                                text: text.to_string(),
813                                reference: reference.to_string(),
814                            });
815                        }
816                        remaining = &remaining[match_obj.end()..];
817                    } else {
818                        // Fallback - shouldn't happen
819                        elements.push(Element::Text("[".to_string()));
820                        remaining = &remaining[1..];
821                    }
822                }
823                "shortcut_ref" => {
824                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
825                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
826                        elements.push(Element::ShortcutReference {
827                            reference: reference.to_string(),
828                        });
829                        remaining = &remaining[match_obj.end()..];
830                    } else {
831                        // Fallback - shouldn't happen
832                        elements.push(Element::Text("[".to_string()));
833                        remaining = &remaining[1..];
834                    }
835                }
836                "wiki_link" => {
837                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
838                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
839                        elements.push(Element::WikiLink(content.to_string()));
840                        remaining = &remaining[match_obj.end()..];
841                    } else {
842                        elements.push(Element::Text("[[".to_string()));
843                        remaining = &remaining[2..];
844                    }
845                }
846                "display_math" => {
847                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
848                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
849                        elements.push(Element::DisplayMath(math.to_string()));
850                        remaining = &remaining[match_obj.end()..];
851                    } else {
852                        elements.push(Element::Text("$$".to_string()));
853                        remaining = &remaining[2..];
854                    }
855                }
856                "inline_math" => {
857                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
858                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
859                        elements.push(Element::InlineMath(math.to_string()));
860                        remaining = &remaining[match_obj.end()..];
861                    } else {
862                        elements.push(Element::Text("$".to_string()));
863                        remaining = &remaining[1..];
864                    }
865                }
866                "strikethrough" => {
867                    if let Ok(Some(caps)) = STRIKETHROUGH_FANCY_REGEX.captures(remaining) {
868                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
869                        elements.push(Element::Strikethrough(text.to_string()));
870                        remaining = &remaining[match_obj.end()..];
871                    } else {
872                        elements.push(Element::Text("~~".to_string()));
873                        remaining = &remaining[2..];
874                    }
875                }
876                "emoji" => {
877                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
878                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
879                        elements.push(Element::EmojiShortcode(emoji.to_string()));
880                        remaining = &remaining[match_obj.end()..];
881                    } else {
882                        elements.push(Element::Text(":".to_string()));
883                        remaining = &remaining[1..];
884                    }
885                }
886                "html_entity" => {
887                    // HTML entities are captured whole
888                    elements.push(Element::HtmlEntity(remaining[..match_obj.end()].to_string()));
889                    remaining = &remaining[match_obj.end()..];
890                }
891                "html_tag" => {
892                    // HTML tags are captured whole
893                    elements.push(Element::HtmlTag(remaining[..match_obj.end()].to_string()));
894                    remaining = &remaining[match_obj.end()..];
895                }
896                _ => {
897                    // Unknown pattern, treat as text
898                    elements.push(Element::Text("[".to_string()));
899                    remaining = &remaining[1..];
900                }
901            }
902        } else {
903            // Process non-link special characters
904
905            // Add any text before the special character
906            if next_special > 0 && next_special < remaining.len() {
907                elements.push(Element::Text(remaining[..next_special].to_string()));
908                remaining = &remaining[next_special..];
909            }
910
911            // Process the special element
912            match special_type {
913                "code" => {
914                    // Find end of code
915                    if let Some(code_end) = remaining[1..].find('`') {
916                        let code = &remaining[1..1 + code_end];
917                        elements.push(Element::Code(code.to_string()));
918                        remaining = &remaining[1 + code_end + 1..];
919                    } else {
920                        // No closing backtick, treat as text
921                        elements.push(Element::Text(remaining.to_string()));
922                        break;
923                    }
924                }
925                "bold" => {
926                    // Check for bold text
927                    if let Some(bold_end) = remaining[2..].find("**") {
928                        let bold_text = &remaining[2..2 + bold_end];
929                        elements.push(Element::Bold(bold_text.to_string()));
930                        remaining = &remaining[2 + bold_end + 2..];
931                    } else {
932                        // No closing **, treat as text
933                        elements.push(Element::Text("**".to_string()));
934                        remaining = &remaining[2..];
935                    }
936                }
937                "italic" => {
938                    // Check for italic text
939                    if let Some(italic_end) = remaining[1..].find('*') {
940                        let italic_text = &remaining[1..1 + italic_end];
941                        elements.push(Element::Italic(italic_text.to_string()));
942                        remaining = &remaining[1 + italic_end + 1..];
943                    } else {
944                        // No closing *, treat as text
945                        elements.push(Element::Text("*".to_string()));
946                        remaining = &remaining[1..];
947                    }
948                }
949                _ => {
950                    // No special elements found, add all remaining text
951                    elements.push(Element::Text(remaining.to_string()));
952                    break;
953                }
954            }
955        }
956    }
957
958    elements
959}
960
961/// Reflow elements for sentence-per-line mode
962fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
963    let abbreviations = get_abbreviations(custom_abbreviations);
964    let mut lines = Vec::new();
965    let mut current_line = String::new();
966
967    for element in elements.iter() {
968        let element_str = format!("{element}");
969
970        // For text elements, split into sentences
971        if let Element::Text(text) = element {
972            // Simply append text - it already has correct spacing from tokenization
973            let combined = format!("{current_line}{text}");
974            // Use the pre-computed abbreviations set to avoid redundant computation
975            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
976
977            if sentences.len() > 1 {
978                // We found sentence boundaries
979                for (i, sentence) in sentences.iter().enumerate() {
980                    if i == 0 {
981                        // First sentence might continue from previous elements
982                        // But check if it ends with an abbreviation
983                        let trimmed = sentence.trim();
984
985                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
986                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
987                            current_line = sentence.to_string();
988                        } else {
989                            // Normal case - emit the first sentence
990                            lines.push(sentence.to_string());
991                            current_line.clear();
992                        }
993                    } else if i == sentences.len() - 1 {
994                        // Last sentence: check if it's complete or incomplete
995                        let trimmed = sentence.trim();
996                        let ends_with_sentence_punct =
997                            trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
998
999                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1000                            // Complete sentence - emit it immediately
1001                            lines.push(sentence.to_string());
1002                            current_line.clear();
1003                        } else {
1004                            // Incomplete sentence - save for next iteration
1005                            current_line = sentence.to_string();
1006                        }
1007                    } else {
1008                        // Complete sentences in the middle
1009                        lines.push(sentence.to_string());
1010                    }
1011                }
1012            } else {
1013                // No sentence boundary found, continue accumulating
1014                current_line = combined;
1015            }
1016        } else {
1017            // Non-text elements (Code, Bold, Italic, etc.)
1018            // Add space before element if needed (unless it's after an opening paren/bracket)
1019            if !current_line.is_empty()
1020                && !current_line.ends_with(' ')
1021                && !current_line.ends_with('(')
1022                && !current_line.ends_with('[')
1023            {
1024                current_line.push(' ');
1025            }
1026            current_line.push_str(&element_str);
1027        }
1028    }
1029
1030    // Add any remaining content
1031    if !current_line.is_empty() {
1032        lines.push(current_line.trim().to_string());
1033    }
1034    lines
1035}
1036
1037/// Reflow elements into lines that fit within the line length
1038fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1039    let mut lines = Vec::new();
1040    let mut current_line = String::new();
1041    let mut current_length = 0;
1042
1043    for element in elements {
1044        let element_str = format!("{element}");
1045        let element_len = element.len();
1046
1047        // For text elements that might need breaking
1048        if let Element::Text(text) = element {
1049            // Check if original text had leading whitespace
1050            let has_leading_space = text.starts_with(char::is_whitespace);
1051            // If this is a text element, always process it word by word
1052            let words: Vec<&str> = text.split_whitespace().collect();
1053
1054            for (i, word) in words.iter().enumerate() {
1055                let word_len = word.chars().count();
1056                // Check if this "word" is just punctuation that should stay attached
1057                let is_trailing_punct = word
1058                    .chars()
1059                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1060
1061                if current_length > 0 && current_length + 1 + word_len > options.line_length && !is_trailing_punct {
1062                    // Start a new line (but never for trailing punctuation)
1063                    lines.push(current_line.trim().to_string());
1064                    current_line = word.to_string();
1065                    current_length = word_len;
1066                } else {
1067                    // Add word to current line
1068                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1069                    // AND this isn't trailing punctuation (which attaches directly)
1070                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1071                        current_line.push(' ');
1072                        current_length += 1;
1073                    }
1074                    current_line.push_str(word);
1075                    current_length += word_len;
1076                }
1077            }
1078        } else {
1079            // For non-text elements (code, links, references), treat as atomic units
1080            // These should never be broken across lines
1081            if current_length > 0 && current_length + 1 + element_len > options.line_length {
1082                // Start a new line
1083                lines.push(current_line.trim().to_string());
1084                current_line = element_str;
1085                current_length = element_len;
1086            } else {
1087                // Add element to current line
1088                // Don't add space if the current line ends with an opening bracket/paren
1089                let ends_with_opener =
1090                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1091                if current_length > 0 && !ends_with_opener {
1092                    current_line.push(' ');
1093                    current_length += 1;
1094                }
1095                current_line.push_str(&element_str);
1096                current_length += element_len;
1097            }
1098        }
1099    }
1100
1101    // Don't forget the last line
1102    if !current_line.is_empty() {
1103        lines.push(current_line.trim_end().to_string());
1104    }
1105
1106    lines
1107}
1108
1109/// Reflow markdown content preserving structure
1110pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1111    let lines: Vec<&str> = content.lines().collect();
1112    let mut result = Vec::new();
1113    let mut i = 0;
1114
1115    while i < lines.len() {
1116        let line = lines[i];
1117        let trimmed = line.trim();
1118
1119        // Preserve empty lines
1120        if trimmed.is_empty() {
1121            result.push(String::new());
1122            i += 1;
1123            continue;
1124        }
1125
1126        // Preserve headings as-is
1127        if trimmed.starts_with('#') {
1128            result.push(line.to_string());
1129            i += 1;
1130            continue;
1131        }
1132
1133        // Preserve fenced code blocks
1134        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1135            result.push(line.to_string());
1136            i += 1;
1137            // Copy lines until closing fence
1138            while i < lines.len() {
1139                result.push(lines[i].to_string());
1140                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1141                    i += 1;
1142                    break;
1143                }
1144                i += 1;
1145            }
1146            continue;
1147        }
1148
1149        // Preserve indented code blocks (4+ spaces or 1+ tab)
1150        if line.starts_with("    ") || line.starts_with("\t") {
1151            // Collect all consecutive indented lines
1152            result.push(line.to_string());
1153            i += 1;
1154            while i < lines.len() {
1155                let next_line = lines[i];
1156                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1157                if next_line.starts_with("    ") || next_line.starts_with("\t") || next_line.trim().is_empty() {
1158                    result.push(next_line.to_string());
1159                    i += 1;
1160                } else {
1161                    break;
1162                }
1163            }
1164            continue;
1165        }
1166
1167        // Preserve block quotes (but reflow their content)
1168        if trimmed.starts_with('>') {
1169            let quote_prefix = line[0..line.find('>').unwrap() + 1].to_string();
1170            let quote_content = &line[quote_prefix.len()..].trim_start();
1171
1172            let reflowed = reflow_line(quote_content, options);
1173            for reflowed_line in reflowed.iter() {
1174                result.push(format!("{quote_prefix} {reflowed_line}"));
1175            }
1176            i += 1;
1177            continue;
1178        }
1179
1180        // Preserve horizontal rules first (before checking for lists)
1181        if is_horizontal_rule(trimmed) {
1182            result.push(line.to_string());
1183            i += 1;
1184            continue;
1185        }
1186
1187        // Preserve lists (but not horizontal rules)
1188        if (trimmed.starts_with('-') && !is_horizontal_rule(trimmed))
1189            || (trimmed.starts_with('*') && !is_horizontal_rule(trimmed))
1190            || trimmed.starts_with('+')
1191            || is_numbered_list_item(trimmed)
1192        {
1193            // Find the list marker and preserve indentation
1194            let indent = line.len() - line.trim_start().len();
1195            let indent_str = " ".repeat(indent);
1196
1197            // For numbered lists, find the period and the space after it
1198            // For bullet lists, find the marker and the space after it
1199            let mut marker_end = indent;
1200            let mut content_start = indent;
1201
1202            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1203                // Numbered list: find the period
1204                if let Some(period_pos) = line[indent..].find('.') {
1205                    marker_end = indent + period_pos + 1; // Include the period
1206                    content_start = marker_end;
1207                    // Skip any spaces after the period to find content start
1208                    while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1209                        content_start += 1;
1210                    }
1211                }
1212            } else {
1213                // Bullet list: marker is single character
1214                marker_end = indent + 1; // Just the marker character
1215                content_start = marker_end;
1216                // Skip any spaces after the marker
1217                while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1218                    content_start += 1;
1219                }
1220            }
1221
1222            let marker = &line[indent..marker_end];
1223
1224            // Collect all content for this list item (including continuation lines)
1225            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1226            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1227            i += 1;
1228
1229            // Collect continuation lines (indented lines that are part of this list item)
1230            while i < lines.len() {
1231                let next_line = lines[i];
1232                let next_trimmed = next_line.trim();
1233
1234                // Stop if we hit an empty line or another list item or special block
1235                if next_trimmed.is_empty()
1236                    || next_trimmed.starts_with('#')
1237                    || next_trimmed.starts_with("```")
1238                    || next_trimmed.starts_with("~~~")
1239                    || next_trimmed.starts_with('>')
1240                    || next_trimmed.starts_with('|')
1241                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1242                    || is_horizontal_rule(next_trimmed)
1243                    || (next_trimmed.starts_with('-')
1244                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1245                    || (next_trimmed.starts_with('*')
1246                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1247                    || (next_trimmed.starts_with('+')
1248                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1249                    || is_numbered_list_item(next_trimmed)
1250                    || is_definition_list_item(next_trimmed)
1251                {
1252                    break;
1253                }
1254
1255                // Check if this line is indented (continuation of list item)
1256                let next_indent = next_line.len() - next_line.trim_start().len();
1257                if next_indent >= content_start {
1258                    // This is a continuation line - add its content
1259                    // Preserve hard breaks while trimming excessive whitespace
1260                    let trimmed_start = next_line.trim_start();
1261                    list_content.push(trim_preserving_hard_break(trimmed_start));
1262                    i += 1;
1263                } else {
1264                    // Not indented enough, not part of this list item
1265                    break;
1266                }
1267            }
1268
1269            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1270            // Hard breaks should prevent joining with the next line
1271            let combined_content = if options.preserve_breaks {
1272                list_content[0].clone()
1273            } else {
1274                // Check if any lines have hard breaks - if so, preserve the structure
1275                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1276                if has_hard_breaks {
1277                    // Don't join lines with hard breaks - keep them separate with newlines
1278                    list_content.join("\n")
1279                } else {
1280                    // No hard breaks, safe to join with spaces
1281                    list_content.join(" ")
1282                }
1283            };
1284
1285            // Calculate the proper indentation for continuation lines
1286            let trimmed_marker = marker;
1287            let continuation_spaces = content_start;
1288
1289            // Adjust line length to account for list marker and space
1290            let prefix_length = indent + trimmed_marker.len() + 1;
1291
1292            // Create adjusted options with reduced line length
1293            let adjusted_options = ReflowOptions {
1294                line_length: options.line_length.saturating_sub(prefix_length),
1295                ..options.clone()
1296            };
1297
1298            let reflowed = reflow_line(&combined_content, &adjusted_options);
1299            for (j, reflowed_line) in reflowed.iter().enumerate() {
1300                if j == 0 {
1301                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1302                } else {
1303                    // Continuation lines aligned with text after marker
1304                    let continuation_indent = " ".repeat(continuation_spaces);
1305                    result.push(format!("{continuation_indent}{reflowed_line}"));
1306                }
1307            }
1308            continue;
1309        }
1310
1311        // Preserve tables
1312        if trimmed.contains('|') {
1313            result.push(line.to_string());
1314            i += 1;
1315            continue;
1316        }
1317
1318        // Preserve reference definitions
1319        if trimmed.starts_with('[') && line.contains("]:") {
1320            result.push(line.to_string());
1321            i += 1;
1322            continue;
1323        }
1324
1325        // Preserve definition list items (extended markdown)
1326        if is_definition_list_item(trimmed) {
1327            result.push(line.to_string());
1328            i += 1;
1329            continue;
1330        }
1331
1332        // Check if this is a single line that doesn't need processing
1333        let mut is_single_line_paragraph = true;
1334        if i + 1 < lines.len() {
1335            let next_line = lines[i + 1];
1336            let next_trimmed = next_line.trim();
1337            // Check if next line starts a new block
1338            if !next_trimmed.is_empty()
1339                && !next_trimmed.starts_with('#')
1340                && !next_trimmed.starts_with("```")
1341                && !next_trimmed.starts_with("~~~")
1342                && !next_trimmed.starts_with('>')
1343                && !next_trimmed.starts_with('|')
1344                && !(next_trimmed.starts_with('[') && next_line.contains("]:"))
1345                && !is_horizontal_rule(next_trimmed)
1346                && !(next_trimmed.starts_with('-')
1347                    && !is_horizontal_rule(next_trimmed)
1348                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1349                && !(next_trimmed.starts_with('*')
1350                    && !is_horizontal_rule(next_trimmed)
1351                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1352                && !(next_trimmed.starts_with('+')
1353                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1354                && !is_numbered_list_item(next_trimmed)
1355            {
1356                is_single_line_paragraph = false;
1357            }
1358        }
1359
1360        // If it's a single line that fits, just add it as-is
1361        if is_single_line_paragraph && line.chars().count() <= options.line_length {
1362            result.push(line.to_string());
1363            i += 1;
1364            continue;
1365        }
1366
1367        // For regular paragraphs, collect consecutive lines
1368        let mut paragraph_parts = Vec::new();
1369        let mut current_part = vec![line];
1370        i += 1;
1371
1372        // If preserve_breaks is true, treat each line separately
1373        if options.preserve_breaks {
1374            // Don't collect consecutive lines - just reflow this single line
1375            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
1376                Some("\\")
1377            } else if line.ends_with("  ") {
1378                Some("  ")
1379            } else {
1380                None
1381            };
1382            let reflowed = reflow_line(line, options);
1383
1384            // Preserve hard breaks (two trailing spaces or backslash)
1385            if let Some(break_marker) = hard_break_type {
1386                if !reflowed.is_empty() {
1387                    let mut reflowed_with_break = reflowed;
1388                    let last_idx = reflowed_with_break.len() - 1;
1389                    if !has_hard_break(&reflowed_with_break[last_idx]) {
1390                        reflowed_with_break[last_idx].push_str(break_marker);
1391                    }
1392                    result.extend(reflowed_with_break);
1393                }
1394            } else {
1395                result.extend(reflowed);
1396            }
1397        } else {
1398            // Original behavior: collect consecutive lines into a paragraph
1399            while i < lines.len() {
1400                let prev_line = if !current_part.is_empty() {
1401                    current_part.last().unwrap()
1402                } else {
1403                    ""
1404                };
1405                let next_line = lines[i];
1406                let next_trimmed = next_line.trim();
1407
1408                // Stop at empty lines or special blocks
1409                if next_trimmed.is_empty()
1410                    || next_trimmed.starts_with('#')
1411                    || next_trimmed.starts_with("```")
1412                    || next_trimmed.starts_with("~~~")
1413                    || next_trimmed.starts_with('>')
1414                    || next_trimmed.starts_with('|')
1415                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1416                    || is_horizontal_rule(next_trimmed)
1417                    || (next_trimmed.starts_with('-')
1418                        && !is_horizontal_rule(next_trimmed)
1419                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1420                    || (next_trimmed.starts_with('*')
1421                        && !is_horizontal_rule(next_trimmed)
1422                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1423                    || (next_trimmed.starts_with('+')
1424                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1425                    || is_numbered_list_item(next_trimmed)
1426                    || is_definition_list_item(next_trimmed)
1427                {
1428                    break;
1429                }
1430
1431                // Check if previous line ends with hard break (two spaces or backslash)
1432                if has_hard_break(prev_line) {
1433                    // Start a new part after hard break
1434                    paragraph_parts.push(current_part.join(" "));
1435                    current_part = vec![next_line];
1436                } else {
1437                    current_part.push(next_line);
1438                }
1439                i += 1;
1440            }
1441
1442            // Add the last part
1443            if !current_part.is_empty() {
1444                if current_part.len() == 1 {
1445                    // Single line, don't add trailing space
1446                    paragraph_parts.push(current_part[0].to_string());
1447                } else {
1448                    paragraph_parts.push(current_part.join(" "));
1449                }
1450            }
1451
1452            // Reflow each part separately, preserving hard breaks
1453            for (j, part) in paragraph_parts.iter().enumerate() {
1454                let reflowed = reflow_line(part, options);
1455                result.extend(reflowed);
1456
1457                // Preserve hard break by ensuring last line of part ends with hard break marker
1458                // Use two spaces as the default hard break format for reflows
1459                if j < paragraph_parts.len() - 1 && !result.is_empty() {
1460                    let last_idx = result.len() - 1;
1461                    if !has_hard_break(&result[last_idx]) {
1462                        result[last_idx].push_str("  ");
1463                    }
1464                }
1465            }
1466        }
1467    }
1468
1469    // Preserve trailing newline if the original content had one
1470    let result_text = result.join("\n");
1471    if content.ends_with('\n') && !result_text.ends_with('\n') {
1472        format!("{result_text}\n")
1473    } else {
1474        result_text
1475    }
1476}
1477
/// Result of reflowing a single paragraph: where it lives in the original
/// document and the text that should replace it.
///
/// `start_byte..end_byte` is a half-open byte range into the original content,
/// suitable for slicing (`&content[start_byte..end_byte]`) or for a
/// replace-range edit.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParagraphReflow {
    /// Byte offset of the first byte of the paragraph's first line
    pub start_byte: usize,
    /// Byte offset one past the end of the paragraph; this covers the
    /// newline that terminates the paragraph's last line when there is one
    pub end_byte: usize,
    /// The reflowed text that replaces the `start_byte..end_byte` range
    pub reflowed_text: String,
}
1488
1489/// Reflow a single paragraph at the specified line number
1490///
1491/// This function finds the paragraph containing the given line number,
1492/// reflows it according to the specified line length, and returns
1493/// information about the paragraph location and its reflowed text.
1494///
1495/// # Arguments
1496///
1497/// * `content` - The full document content
1498/// * `line_number` - The 1-based line number within the paragraph to reflow
1499/// * `line_length` - The target line length for reflowing
1500///
1501/// # Returns
1502///
1503/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
1504/// or `None` if the line number is out of bounds or the content at that
1505/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
1506pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
1507    if line_number == 0 {
1508        return None;
1509    }
1510
1511    let lines: Vec<&str> = content.lines().collect();
1512
1513    // Check if line number is valid (1-based)
1514    if line_number > lines.len() {
1515        return None;
1516    }
1517
1518    let target_idx = line_number - 1; // Convert to 0-based
1519    let target_line = lines[target_idx];
1520    let trimmed = target_line.trim();
1521
1522    // Don't reflow special blocks
1523    if trimmed.is_empty()
1524        || trimmed.starts_with('#')
1525        || trimmed.starts_with("```")
1526        || trimmed.starts_with("~~~")
1527        || target_line.starts_with("    ")
1528        || target_line.starts_with('\t')
1529        || trimmed.starts_with('>')
1530        || trimmed.contains('|') // Tables
1531        || (trimmed.starts_with('[') && target_line.contains("]:")) // Reference definitions
1532        || is_horizontal_rule(trimmed)
1533        || ((trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+'))
1534            && !is_horizontal_rule(trimmed)
1535            && (trimmed.len() == 1 || trimmed.chars().nth(1) == Some(' ')))
1536        || is_numbered_list_item(trimmed)
1537        || is_definition_list_item(trimmed)
1538    {
1539        return None;
1540    }
1541
1542    // Find paragraph start - scan backward until blank line or special block
1543    let mut para_start = target_idx;
1544    while para_start > 0 {
1545        let prev_idx = para_start - 1;
1546        let prev_line = lines[prev_idx];
1547        let prev_trimmed = prev_line.trim();
1548
1549        // Stop at blank line or special blocks
1550        if prev_trimmed.is_empty()
1551            || prev_trimmed.starts_with('#')
1552            || prev_trimmed.starts_with("```")
1553            || prev_trimmed.starts_with("~~~")
1554            || prev_line.starts_with("    ")
1555            || prev_line.starts_with('\t')
1556            || prev_trimmed.starts_with('>')
1557            || prev_trimmed.contains('|')
1558            || (prev_trimmed.starts_with('[') && prev_line.contains("]:"))
1559            || is_horizontal_rule(prev_trimmed)
1560            || ((prev_trimmed.starts_with('-') || prev_trimmed.starts_with('*') || prev_trimmed.starts_with('+'))
1561                && !is_horizontal_rule(prev_trimmed)
1562                && (prev_trimmed.len() == 1 || prev_trimmed.chars().nth(1) == Some(' ')))
1563            || is_numbered_list_item(prev_trimmed)
1564            || is_definition_list_item(prev_trimmed)
1565        {
1566            break;
1567        }
1568
1569        para_start = prev_idx;
1570    }
1571
1572    // Find paragraph end - scan forward until blank line or special block
1573    let mut para_end = target_idx;
1574    while para_end + 1 < lines.len() {
1575        let next_idx = para_end + 1;
1576        let next_line = lines[next_idx];
1577        let next_trimmed = next_line.trim();
1578
1579        // Stop at blank line or special blocks
1580        if next_trimmed.is_empty()
1581            || next_trimmed.starts_with('#')
1582            || next_trimmed.starts_with("```")
1583            || next_trimmed.starts_with("~~~")
1584            || next_line.starts_with("    ")
1585            || next_line.starts_with('\t')
1586            || next_trimmed.starts_with('>')
1587            || next_trimmed.contains('|')
1588            || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1589            || is_horizontal_rule(next_trimmed)
1590            || ((next_trimmed.starts_with('-') || next_trimmed.starts_with('*') || next_trimmed.starts_with('+'))
1591                && !is_horizontal_rule(next_trimmed)
1592                && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1593            || is_numbered_list_item(next_trimmed)
1594            || is_definition_list_item(next_trimmed)
1595        {
1596            break;
1597        }
1598
1599        para_end = next_idx;
1600    }
1601
1602    // Extract paragraph lines
1603    let paragraph_lines = &lines[para_start..=para_end];
1604
1605    // Calculate byte offsets
1606    let mut start_byte = 0;
1607    for line in lines.iter().take(para_start) {
1608        start_byte += line.len() + 1; // +1 for newline
1609    }
1610
1611    let mut end_byte = start_byte;
1612    for line in paragraph_lines.iter() {
1613        end_byte += line.len() + 1; // +1 for newline
1614    }
1615
1616    // Track whether the byte range includes a trailing newline
1617    // (it doesn't if this is the last line and the file doesn't end with newline)
1618    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
1619
1620    // Adjust end_byte if the last line doesn't have a newline
1621    if !includes_trailing_newline {
1622        end_byte -= 1;
1623    }
1624
1625    // Join paragraph lines and reflow
1626    let paragraph_text = paragraph_lines.join("\n");
1627
1628    // Create reflow options
1629    let options = ReflowOptions {
1630        line_length,
1631        break_on_sentences: true,
1632        preserve_breaks: false,
1633        sentence_per_line: false,
1634        abbreviations: None,
1635    };
1636
1637    // Reflow the paragraph using reflow_markdown to handle it properly
1638    let reflowed = reflow_markdown(&paragraph_text, &options);
1639
1640    // Ensure reflowed text matches whether the byte range includes a trailing newline
1641    // This is critical: if the range includes a newline, the replacement must too,
1642    // otherwise the next line will get appended to the reflowed paragraph
1643    let reflowed_text = if includes_trailing_newline {
1644        // Range includes newline - ensure reflowed text has one
1645        if reflowed.ends_with('\n') {
1646            reflowed
1647        } else {
1648            format!("{reflowed}\n")
1649        }
1650    } else {
1651        // Range doesn't include newline - ensure reflowed text doesn't have one
1652        if reflowed.ends_with('\n') {
1653            reflowed.trim_end_matches('\n').to_string()
1654        } else {
1655            reflowed
1656        }
1657    };
1658
1659    Some(ParagraphReflow {
1660        start_byte,
1661        end_byte,
1662        reflowed_text,
1663    })
1664}
1665
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises the private helper `text_ends_with_abbreviation()` against the
    /// built-in abbreviation set.
    ///
    /// Kept inline because the helper is private; tests for the public API and
    /// integration behavior live in tests/utils/text_reflow_test.rs.
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        // Built-in defaults only (no custom abbreviations)
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) followed by a period
        let recognized = [
            "Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof.",
        ];
        for text in recognized {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Everything else: words outside the built-in list (etc doesn't always
        // have a period), ordinary plurals, abbreviations followed by `?` or
        // `!` instead of a period, text without punctuation, and the empty
        // string.
        let rejected = [
            "etc.",
            "paradigms.",
            "programs.",
            "items.",
            "systems.",
            "Dr?",
            "Mr!",
            "paradigms?",
            "word",
            "",
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation"
            );
        }
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs