rumdl_lib/utils/
text_reflow.rs

//! Text reflow utilities for MD013
//!
//! This module implements text wrapping/reflow functionality that preserves
//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::element_cache::ElementCache;
7use crate::utils::is_definition_list_item;
8use crate::utils::regex_cache::{
9    DISPLAY_MATH_REGEX, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
10    INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX, LINKED_IMAGE_INLINE_INLINE,
11    LINKED_IMAGE_INLINE_REF, LINKED_IMAGE_REF_INLINE, LINKED_IMAGE_REF_REF, REF_IMAGE_REGEX, REF_LINK_REGEX,
12    SHORTCUT_REF_REGEX, STRIKETHROUGH_FANCY_REGEX, WIKI_LINK_REGEX,
13};
14use std::collections::HashSet;
15
/// Options for reflowing text
///
/// Derives `Debug` in addition to `Clone` so option values can be logged and
/// shown in assertion failures.
#[derive(Clone, Debug)]
pub struct ReflowOptions {
    /// Target line length
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line
    pub sentence_per_line: bool,
    /// Custom abbreviations for sentence detection
    /// Periods are optional - both "Dr" and "Dr." work the same
    /// Custom abbreviations are always added to the built-in defaults
    pub abbreviations: Option<Vec<String>>,
}
32
33impl Default for ReflowOptions {
34    fn default() -> Self {
35        Self {
36            line_length: 80,
37            break_on_sentences: true,
38            preserve_breaks: false,
39            sentence_per_line: false,
40            abbreviations: None,
41        }
42    }
43}
44
/// Build the effective abbreviation set used for sentence detection.
///
/// The result always contains the built-in defaults; any `custom` entries are
/// merged on top. Every entry is lowercased so lookups can be
/// case-insensitive, and custom entries have trailing periods stripped
/// (entries that become empty are dropped).
fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
    // Built-ins are limited to abbreviations that:
    // 1. Conventionally ALWAYS have a period in standard writing
    // 2. Are followed by something (name, example), not sentence-final
    //
    // Deliberately NOT included:
    // - Words that don't typically take periods (vs, etc)
    // - Abbreviations that can end sentences (Inc., Ph.D., U.S.)
    const BUILT_IN: [&str; 9] = [
        // Titles - always have period, always followed by a name
        "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr",
        // Latin - always written with periods, introduce examples/references
        "i.e", "e.g",
    ];

    let defaults = BUILT_IN.iter().map(|abbr| abbr.to_lowercase());

    // `Option::iter` yields zero or one list; `flatten` walks its entries.
    let extras = custom
        .iter()
        .flatten()
        .map(|abbr| abbr.trim_end_matches('.').to_lowercase())
        .filter(|abbr| !abbr.is_empty());

    defaults.chain(extras).collect()
}
79
/// Report whether `text` finishes with a known abbreviation and its period.
///
/// Only a trailing period qualifies; `!` and `?` never mark an abbreviation.
/// The last whitespace-separated word (with trailing periods removed) is
/// lowercased and looked up as a whole, so a word that merely ends in
/// abbreviation-like letters (e.g., "paradigms" ending in "ms") does not match.
///
/// Examples:
///   - "Dr." -> true (abbreviation)
///   - "Dr?" -> false (question, not abbreviation)
///   - "paradigms." -> false (not in abbreviation list)
///   - "paradigms?" -> false (question mark, not abbreviation)
///
/// See: Issue #150
fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Abbreviations require a period, so anything else is rejected up front.
    if !text.ends_with('.') {
        return false;
    }

    let stem = text.trim_end_matches('.');

    // `split_whitespace` never yields empty pieces, so `None` means there is
    // no word at all in front of the period(s).
    match stem.split_whitespace().next_back() {
        // The set is already lowercase, so a single lookup suffices.
        Some(word) => abbreviations.contains(&word.to_lowercase()),
        None => false,
    }
}
113
114/// Detect if a character position is a sentence boundary
115/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
116fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
117    let chars: Vec<char> = text.chars().collect();
118
119    if pos + 1 >= chars.len() {
120        return false;
121    }
122
123    // Check for sentence-ending punctuation
124    let c = chars[pos];
125    if c != '.' && c != '!' && c != '?' {
126        return false;
127    }
128
129    // Must be followed by at least one space
130    if chars[pos + 1] != ' ' {
131        return false;
132    }
133
134    // Skip all whitespace after the punctuation to find the start of the next sentence
135    let mut next_char_pos = pos + 2;
136    while next_char_pos < chars.len() && chars[next_char_pos].is_whitespace() {
137        next_char_pos += 1;
138    }
139
140    // Check if we reached the end of the string
141    if next_char_pos >= chars.len() {
142        return false;
143    }
144
145    // Next character after space(s) must be uppercase (new sentence indicator)
146    if !chars[next_char_pos].is_uppercase() {
147        return false;
148    }
149
150    // Look back to check for common abbreviations (only applies to periods)
151    if pos > 0 && c == '.' {
152        // Check if the text up to and including this period ends with an abbreviation
153        // Note: text[..=pos] includes the character at pos (the period)
154        if text_ends_with_abbreviation(&text[..=pos], abbreviations) {
155            return false;
156        }
157
158        // Check for decimal numbers (e.g., "3.14")
159        // Make sure to check if next_char_pos is within bounds
160        if chars[pos - 1].is_numeric() && next_char_pos < chars.len() && chars[next_char_pos].is_numeric() {
161            return false;
162        }
163    }
164    true
165}
166
167/// Split text into sentences
168pub fn split_into_sentences(text: &str) -> Vec<String> {
169    split_into_sentences_custom(text, &None)
170}
171
172/// Split text into sentences with custom abbreviations
173pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
174    let abbreviations = get_abbreviations(custom_abbreviations);
175    split_into_sentences_with_set(text, &abbreviations)
176}
177
178/// Internal function to split text into sentences with a pre-computed abbreviations set
179/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
180fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
181    let mut sentences = Vec::new();
182    let mut current_sentence = String::new();
183    let mut chars = text.chars().peekable();
184    let mut pos = 0;
185
186    while let Some(c) = chars.next() {
187        current_sentence.push(c);
188
189        if is_sentence_boundary(text, pos, abbreviations) {
190            // Include the space after sentence if it exists
191            if chars.peek() == Some(&' ') {
192                chars.next();
193                pos += 1;
194            }
195            sentences.push(current_sentence.trim().to_string());
196            current_sentence.clear();
197        }
198
199        pos += 1;
200    }
201
202    // Add any remaining text as the last sentence
203    if !current_sentence.trim().is_empty() {
204        sentences.push(current_sentence.trim().to_string());
205    }
206    sentences
207}
208
/// Check if a line is a horizontal rule (---, ___, ***)
///
/// The line must start with `-`, `_` or `*`, contain nothing but that same
/// character and spaces, and include the character at least three times.
fn is_horizontal_rule(line: &str) -> bool {
    if line.len() < 3 {
        return false;
    }

    let Some(marker) = line.chars().next() else {
        return false;
    };
    if !matches!(marker, '-' | '_' | '*') {
        return false;
    }

    // Single pass: reject any foreign character and count the markers.
    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            return false;
        }
    }

    marker_count >= 3
}
237
/// Check if a line is a numbered list item (e.g., "1. ", "10. ")
///
/// The line must start with one or more ASCII digits (`0`-`9`), followed by a
/// period, followed by either a space or the end of the line. Non-ASCII
/// numeric characters (such as `½` or `٣`) do not form a list marker and are
/// rejected.
fn is_numbered_list_item(line: &str) -> bool {
    // ASCII digits are one byte each, so this count is also a valid byte
    // offset for slicing.
    let digits_end = line.bytes().take_while(u8::is_ascii_digit).count();
    if digits_end == 0 {
        return false;
    }

    match line[digits_end..].strip_prefix('.') {
        // After period, must have a space or be end of line
        Some(after) => after.is_empty() || after.starts_with(' '),
        None => false,
    }
}
260
/// Check if a line ends with a hard break (either two spaces or backslash)
///
/// A single trailing `\r` (from a CRLF line ending) is ignored before the
/// check. CommonMark supports two formats for hard line breaks:
/// 1. Two or more trailing spaces
/// 2. A backslash at the end of the line
fn has_hard_break(line: &str) -> bool {
    let content = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };
    content.ends_with('\\') || content.ends_with("  ")
}
270
/// Trim trailing whitespace while preserving hard breaks (two trailing spaces or backslash)
///
/// A single trailing `\r` is removed first so CRLF input is handled. Then:
/// - a line ending in a backslash is returned unchanged;
/// - a line ending in two or more spaces is normalised to its content plus
///   exactly two spaces (or to an empty string if it holds only whitespace);
/// - any other line simply has its trailing whitespace removed.
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break (mdformat style): keep the line exactly as-is.
    if line.ends_with('\\') {
        return line.to_string();
    }

    let content = line.trim_end();

    // Traditional hard break: re-attach exactly two spaces, but only when
    // there is real content in front of them.
    if line.ends_with("  ") && !content.is_empty() {
        let mut kept = String::with_capacity(content.len() + 2);
        kept.push_str(content);
        kept.push_str("  ");
        kept
    } else {
        content.to_string()
    }
}
301
302pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
303    // For sentence-per-line mode, always process regardless of length
304    if options.sentence_per_line {
305        let elements = parse_markdown_elements(line);
306        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
307    }
308
309    // Quick check: if line is already short enough, return as-is
310    if line.chars().count() <= options.line_length {
311        return vec![line.to_string()];
312    }
313
314    // Parse the markdown to identify elements
315    let elements = parse_markdown_elements(line);
316
317    // Reflow the elements into lines
318    reflow_elements(&elements, options)
319}
320
/// Where the image inside a linked image (clickable badge) comes from.
#[derive(Clone, Debug)]
enum LinkedImageSource {
    /// Image given by an inline URL: `![alt](url)`
    Inline(String),
    /// Image given by a reference label: `![alt][ref]`
    Reference(String),
}
329
/// Where the link wrapped around a linked image (clickable badge) points.
#[derive(Clone, Debug)]
enum LinkedImageTarget {
    /// Link given by an inline URL: `](url)`
    Inline(String),
    /// Link given by a reference label: `][ref]`
    Reference(String),
}
338
339/// Represents a piece of content in the markdown
340#[derive(Debug, Clone)]
341enum Element {
342    /// Plain text that can be wrapped
343    Text(String),
344    /// A complete markdown inline link [text](url)
345    Link { text: String, url: String },
346    /// A complete markdown reference link [text][ref]
347    ReferenceLink { text: String, reference: String },
348    /// A complete markdown empty reference link [text][]
349    EmptyReferenceLink { text: String },
350    /// A complete markdown shortcut reference link [ref]
351    ShortcutReference { reference: String },
352    /// A complete markdown inline image ![alt](url)
353    InlineImage { alt: String, url: String },
354    /// A complete markdown reference image ![alt][ref]
355    ReferenceImage { alt: String, reference: String },
356    /// A complete markdown empty reference image ![alt][]
357    EmptyReferenceImage { alt: String },
358    /// A clickable image badge in any of 4 forms:
359    /// - [![alt](img-url)](link-url)
360    /// - [![alt][img-ref]](link-url)
361    /// - [![alt](img-url)][link-ref]
362    /// - [![alt][img-ref]][link-ref]
363    LinkedImage {
364        alt: String,
365        img_source: LinkedImageSource,
366        link_target: LinkedImageTarget,
367    },
368    /// Footnote reference [^note]
369    FootnoteReference { note: String },
370    /// Strikethrough text ~~text~~
371    Strikethrough(String),
372    /// Wiki-style link [[wiki]] or [[wiki|text]]
373    WikiLink(String),
374    /// Inline math $math$
375    InlineMath(String),
376    /// Display math $$math$$
377    DisplayMath(String),
378    /// Emoji shortcode :emoji:
379    EmojiShortcode(String),
380    /// HTML tag <tag> or </tag> or <tag/>
381    HtmlTag(String),
382    /// HTML entity &nbsp; or &#123;
383    HtmlEntity(String),
384    /// Inline code `code`
385    Code(String),
386    /// Bold text **text**
387    Bold(String),
388    /// Italic text *text*
389    Italic(String),
390}
391
392impl std::fmt::Display for Element {
393    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
394        match self {
395            Element::Text(s) => write!(f, "{s}"),
396            Element::Link { text, url } => write!(f, "[{text}]({url})"),
397            Element::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
398            Element::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
399            Element::ShortcutReference { reference } => write!(f, "[{reference}]"),
400            Element::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
401            Element::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
402            Element::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),
403            Element::LinkedImage {
404                alt,
405                img_source,
406                link_target,
407            } => {
408                // Build the image part: ![alt](url) or ![alt][ref]
409                let img_part = match img_source {
410                    LinkedImageSource::Inline(url) => format!("![{alt}]({url})"),
411                    LinkedImageSource::Reference(r) => format!("![{alt}][{r}]"),
412                };
413                // Build the link part: (url) or [ref]
414                match link_target {
415                    LinkedImageTarget::Inline(url) => write!(f, "[{img_part}]({url})"),
416                    LinkedImageTarget::Reference(r) => write!(f, "[{img_part}][{r}]"),
417                }
418            }
419            Element::FootnoteReference { note } => write!(f, "[^{note}]"),
420            Element::Strikethrough(s) => write!(f, "~~{s}~~"),
421            Element::WikiLink(s) => write!(f, "[[{s}]]"),
422            Element::InlineMath(s) => write!(f, "${s}$"),
423            Element::DisplayMath(s) => write!(f, "$${s}$$"),
424            Element::EmojiShortcode(s) => write!(f, ":{s}:"),
425            Element::HtmlTag(s) => write!(f, "{s}"),
426            Element::HtmlEntity(s) => write!(f, "{s}"),
427            Element::Code(s) => write!(f, "`{s}`"),
428            Element::Bold(s) => write!(f, "**{s}**"),
429            Element::Italic(s) => write!(f, "*{s}*"),
430        }
431    }
432}
433
434impl Element {
435    fn len(&self) -> usize {
436        match self {
437            Element::Text(s) => s.chars().count(),
438            Element::Link { text, url } => text.chars().count() + url.chars().count() + 4, // [text](url)
439            Element::ReferenceLink { text, reference } => text.chars().count() + reference.chars().count() + 4, // [text][ref]
440            Element::EmptyReferenceLink { text } => text.chars().count() + 4, // [text][]
441            Element::ShortcutReference { reference } => reference.chars().count() + 2, // [ref]
442            Element::InlineImage { alt, url } => alt.chars().count() + url.chars().count() + 5, // ![alt](url)
443            Element::ReferenceImage { alt, reference } => alt.chars().count() + reference.chars().count() + 5, // ![alt][ref]
444            Element::EmptyReferenceImage { alt } => alt.chars().count() + 5, // ![alt][]
445            Element::LinkedImage {
446                alt,
447                img_source,
448                link_target,
449            } => {
450                // Calculate length based on variant
451                // Base: [ + ![alt] + ] = 4 chars for outer brackets and !
452                let alt_len = alt.chars().count();
453                let img_len = match img_source {
454                    LinkedImageSource::Inline(url) => url.chars().count() + 2, // (url)
455                    LinkedImageSource::Reference(r) => r.chars().count() + 2,  // [ref]
456                };
457                let link_len = match link_target {
458                    LinkedImageTarget::Inline(url) => url.chars().count() + 2, // (url)
459                    LinkedImageTarget::Reference(r) => r.chars().count() + 2,  // [ref]
460                };
461                // [![alt](img)](link) = [ + ! + [ + alt + ] + (img) + ] + (link)
462                //                     = 1 + 1 + 1 + alt + 1 + img_len + 1 + link_len = 5 + alt + img + link
463                5 + alt_len + img_len + link_len
464            }
465            Element::FootnoteReference { note } => note.chars().count() + 3, // [^note]
466            Element::Strikethrough(s) => s.chars().count() + 4,              // ~~text~~
467            Element::WikiLink(s) => s.chars().count() + 4,                   // [[wiki]]
468            Element::InlineMath(s) => s.chars().count() + 2,                 // $math$
469            Element::DisplayMath(s) => s.chars().count() + 4,                // $$math$$
470            Element::EmojiShortcode(s) => s.chars().count() + 2,             // :emoji:
471            Element::HtmlTag(s) => s.chars().count(),                        // <tag> - already includes brackets
472            Element::HtmlEntity(s) => s.chars().count(),                     // &nbsp; - already complete
473            Element::Code(s) => s.chars().count() + 2,                       // `code`
474            Element::Bold(s) => s.chars().count() + 4,                       // **text**
475            Element::Italic(s) => s.chars().count() + 2,                     // *text*
476        }
477    }
478}
479
480/// Parse markdown elements from text preserving the raw syntax
481///
482/// Detection order is critical:
483/// 1. Linked images [![alt](img)](link) - must be detected first as atomic units
484/// 2. Inline images ![alt](url) - before links to handle ! prefix
485/// 3. Reference images ![alt][ref] - before reference links
486/// 4. Inline links [text](url) - before reference links
487/// 5. Reference links [text][ref] - before shortcut references
488/// 6. Shortcut reference links [ref] - detected last to avoid false positives
489/// 7. Other elements (code, bold, italic, etc.) - processed normally
490fn parse_markdown_elements(text: &str) -> Vec<Element> {
491    let mut elements = Vec::new();
492    let mut remaining = text;
493
494    while !remaining.is_empty() {
495        // Find the earliest occurrence of any markdown pattern
496        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
497
498        // Check for linked images FIRST (all 4 variants)
499        // Quick literal check: only run expensive regexes if we might have a linked image
500        // Pattern starts with "[!" so check for that first
501        if remaining.contains("[!") {
502            // Pattern 1: [![alt](img)](link) - inline image in inline link
503            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_INLINE.find(remaining)
504                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
505            {
506                earliest_match = Some((m.start(), "linked_image_ii", m));
507            }
508
509            // Pattern 2: [![alt][ref]](link) - reference image in inline link
510            if let Ok(Some(m)) = LINKED_IMAGE_REF_INLINE.find(remaining)
511                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
512            {
513                earliest_match = Some((m.start(), "linked_image_ri", m));
514            }
515
516            // Pattern 3: [![alt](img)][ref] - inline image in reference link
517            if let Ok(Some(m)) = LINKED_IMAGE_INLINE_REF.find(remaining)
518                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
519            {
520                earliest_match = Some((m.start(), "linked_image_ir", m));
521            }
522
523            // Pattern 4: [![alt][ref]][ref] - reference image in reference link
524            if let Ok(Some(m)) = LINKED_IMAGE_REF_REF.find(remaining)
525                && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
526            {
527                earliest_match = Some((m.start(), "linked_image_rr", m));
528            }
529        }
530
531        // Check for images (they start with ! so should be detected before links)
532        // Inline images - ![alt](url)
533        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
534            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
535        {
536            earliest_match = Some((m.start(), "inline_image", m));
537        }
538
539        // Reference images - ![alt][ref]
540        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
541            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
542        {
543            earliest_match = Some((m.start(), "ref_image", m));
544        }
545
546        // Check for footnote references - [^note]
547        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
548            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
549        {
550            earliest_match = Some((m.start(), "footnote_ref", m));
551        }
552
553        // Check for inline links - [text](url)
554        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
555            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
556        {
557            earliest_match = Some((m.start(), "inline_link", m));
558        }
559
560        // Check for reference links - [text][ref]
561        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
562            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
563        {
564            earliest_match = Some((m.start(), "ref_link", m));
565        }
566
567        // Check for shortcut reference links - [ref]
568        // Only check if we haven't found an earlier pattern that would conflict
569        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
570            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
571        {
572            earliest_match = Some((m.start(), "shortcut_ref", m));
573        }
574
575        // Check for wiki-style links - [[wiki]]
576        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
577            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
578        {
579            earliest_match = Some((m.start(), "wiki_link", m));
580        }
581
582        // Check for display math first (before inline) - $$math$$
583        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
584            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
585        {
586            earliest_match = Some((m.start(), "display_math", m));
587        }
588
589        // Check for inline math - $math$
590        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
591            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
592        {
593            earliest_match = Some((m.start(), "inline_math", m));
594        }
595
596        // Check for strikethrough - ~~text~~
597        if let Ok(Some(m)) = STRIKETHROUGH_FANCY_REGEX.find(remaining)
598            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
599        {
600            earliest_match = Some((m.start(), "strikethrough", m));
601        }
602
603        // Check for emoji shortcodes - :emoji:
604        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
605            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
606        {
607            earliest_match = Some((m.start(), "emoji", m));
608        }
609
610        // Check for HTML entities - &nbsp; etc
611        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
612            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
613        {
614            earliest_match = Some((m.start(), "html_entity", m));
615        }
616
617        // Check for HTML tags - <tag> </tag> <tag/>
618        // But exclude autolinks like <https://...> or <mailto:...>
619        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
620            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
621        {
622            // Check if this is an autolink (starts with protocol or mailto:)
623            let matched_text = &remaining[m.start()..m.end()];
624            let is_autolink = matched_text.starts_with("<http://")
625                || matched_text.starts_with("<https://")
626                || matched_text.starts_with("<mailto:")
627                || matched_text.starts_with("<ftp://")
628                || matched_text.starts_with("<ftps://");
629
630            if !is_autolink {
631                earliest_match = Some((m.start(), "html_tag", m));
632            }
633        }
634
635        // Find earliest non-link special characters
636        let mut next_special = remaining.len();
637        let mut special_type = "";
638
639        if let Some(pos) = remaining.find('`')
640            && pos < next_special
641        {
642            next_special = pos;
643            special_type = "code";
644        }
645        if let Some(pos) = remaining.find("**")
646            && pos < next_special
647        {
648            next_special = pos;
649            special_type = "bold";
650        }
651        if let Some(pos) = remaining.find('*')
652            && pos < next_special
653            && !remaining[pos..].starts_with("**")
654        {
655            next_special = pos;
656            special_type = "italic";
657        }
658
659        // Determine which pattern to process first
660        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
661            pos < next_special
662        } else {
663            false
664        };
665
666        if should_process_markdown_link {
667            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
668
669            // Add any text before the match
670            if pos > 0 {
671                elements.push(Element::Text(remaining[..pos].to_string()));
672            }
673
674            // Process the matched pattern
675            match pattern_type {
676                // Pattern 1: [![alt](img)](link) - inline image in inline link
677                "linked_image_ii" => {
678                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_INLINE.captures(remaining) {
679                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
680                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
681                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
682                        elements.push(Element::LinkedImage {
683                            alt: alt.to_string(),
684                            img_source: LinkedImageSource::Inline(img_url.to_string()),
685                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
686                        });
687                        remaining = &remaining[match_obj.end()..];
688                    } else {
689                        elements.push(Element::Text("[".to_string()));
690                        remaining = &remaining[1..];
691                    }
692                }
693                // Pattern 2: [![alt][ref]](link) - reference image in inline link
694                "linked_image_ri" => {
695                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_INLINE.captures(remaining) {
696                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
697                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
698                        let link_url = caps.get(3).map(|m| m.as_str()).unwrap_or("");
699                        elements.push(Element::LinkedImage {
700                            alt: alt.to_string(),
701                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
702                            link_target: LinkedImageTarget::Inline(link_url.to_string()),
703                        });
704                        remaining = &remaining[match_obj.end()..];
705                    } else {
706                        elements.push(Element::Text("[".to_string()));
707                        remaining = &remaining[1..];
708                    }
709                }
710                // Pattern 3: [![alt](img)][ref] - inline image in reference link
711                "linked_image_ir" => {
712                    if let Ok(Some(caps)) = LINKED_IMAGE_INLINE_REF.captures(remaining) {
713                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
714                        let img_url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
715                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
716                        elements.push(Element::LinkedImage {
717                            alt: alt.to_string(),
718                            img_source: LinkedImageSource::Inline(img_url.to_string()),
719                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
720                        });
721                        remaining = &remaining[match_obj.end()..];
722                    } else {
723                        elements.push(Element::Text("[".to_string()));
724                        remaining = &remaining[1..];
725                    }
726                }
727                // Pattern 4: [![alt][ref]][ref] - reference image in reference link
728                "linked_image_rr" => {
729                    if let Ok(Some(caps)) = LINKED_IMAGE_REF_REF.captures(remaining) {
730                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
731                        let img_ref = caps.get(2).map(|m| m.as_str()).unwrap_or("");
732                        let link_ref = caps.get(3).map(|m| m.as_str()).unwrap_or("");
733                        elements.push(Element::LinkedImage {
734                            alt: alt.to_string(),
735                            img_source: LinkedImageSource::Reference(img_ref.to_string()),
736                            link_target: LinkedImageTarget::Reference(link_ref.to_string()),
737                        });
738                        remaining = &remaining[match_obj.end()..];
739                    } else {
740                        elements.push(Element::Text("[".to_string()));
741                        remaining = &remaining[1..];
742                    }
743                }
744                "inline_image" => {
745                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
746                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
747                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
748                        elements.push(Element::InlineImage {
749                            alt: alt.to_string(),
750                            url: url.to_string(),
751                        });
752                        remaining = &remaining[match_obj.end()..];
753                    } else {
754                        elements.push(Element::Text("!".to_string()));
755                        remaining = &remaining[1..];
756                    }
757                }
758                "ref_image" => {
759                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
760                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
761                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
762
763                        if reference.is_empty() {
764                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
765                        } else {
766                            elements.push(Element::ReferenceImage {
767                                alt: alt.to_string(),
768                                reference: reference.to_string(),
769                            });
770                        }
771                        remaining = &remaining[match_obj.end()..];
772                    } else {
773                        elements.push(Element::Text("!".to_string()));
774                        remaining = &remaining[1..];
775                    }
776                }
777                "footnote_ref" => {
778                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
779                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
780                        elements.push(Element::FootnoteReference { note: note.to_string() });
781                        remaining = &remaining[match_obj.end()..];
782                    } else {
783                        elements.push(Element::Text("[".to_string()));
784                        remaining = &remaining[1..];
785                    }
786                }
787                "inline_link" => {
788                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
789                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
790                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
791                        elements.push(Element::Link {
792                            text: text.to_string(),
793                            url: url.to_string(),
794                        });
795                        remaining = &remaining[match_obj.end()..];
796                    } else {
797                        // Fallback - shouldn't happen
798                        elements.push(Element::Text("[".to_string()));
799                        remaining = &remaining[1..];
800                    }
801                }
802                "ref_link" => {
803                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
804                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
805                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
806
807                        if reference.is_empty() {
808                            // Empty reference link [text][]
809                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
810                        } else {
811                            // Regular reference link [text][ref]
812                            elements.push(Element::ReferenceLink {
813                                text: text.to_string(),
814                                reference: reference.to_string(),
815                            });
816                        }
817                        remaining = &remaining[match_obj.end()..];
818                    } else {
819                        // Fallback - shouldn't happen
820                        elements.push(Element::Text("[".to_string()));
821                        remaining = &remaining[1..];
822                    }
823                }
824                "shortcut_ref" => {
825                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
826                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
827                        elements.push(Element::ShortcutReference {
828                            reference: reference.to_string(),
829                        });
830                        remaining = &remaining[match_obj.end()..];
831                    } else {
832                        // Fallback - shouldn't happen
833                        elements.push(Element::Text("[".to_string()));
834                        remaining = &remaining[1..];
835                    }
836                }
837                "wiki_link" => {
838                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
839                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
840                        elements.push(Element::WikiLink(content.to_string()));
841                        remaining = &remaining[match_obj.end()..];
842                    } else {
843                        elements.push(Element::Text("[[".to_string()));
844                        remaining = &remaining[2..];
845                    }
846                }
847                "display_math" => {
848                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
849                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
850                        elements.push(Element::DisplayMath(math.to_string()));
851                        remaining = &remaining[match_obj.end()..];
852                    } else {
853                        elements.push(Element::Text("$$".to_string()));
854                        remaining = &remaining[2..];
855                    }
856                }
857                "inline_math" => {
858                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
859                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
860                        elements.push(Element::InlineMath(math.to_string()));
861                        remaining = &remaining[match_obj.end()..];
862                    } else {
863                        elements.push(Element::Text("$".to_string()));
864                        remaining = &remaining[1..];
865                    }
866                }
867                "strikethrough" => {
868                    if let Ok(Some(caps)) = STRIKETHROUGH_FANCY_REGEX.captures(remaining) {
869                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
870                        elements.push(Element::Strikethrough(text.to_string()));
871                        remaining = &remaining[match_obj.end()..];
872                    } else {
873                        elements.push(Element::Text("~~".to_string()));
874                        remaining = &remaining[2..];
875                    }
876                }
877                "emoji" => {
878                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
879                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
880                        elements.push(Element::EmojiShortcode(emoji.to_string()));
881                        remaining = &remaining[match_obj.end()..];
882                    } else {
883                        elements.push(Element::Text(":".to_string()));
884                        remaining = &remaining[1..];
885                    }
886                }
887                "html_entity" => {
888                    // HTML entities are captured whole
889                    elements.push(Element::HtmlEntity(remaining[..match_obj.end()].to_string()));
890                    remaining = &remaining[match_obj.end()..];
891                }
892                "html_tag" => {
893                    // HTML tags are captured whole
894                    elements.push(Element::HtmlTag(remaining[..match_obj.end()].to_string()));
895                    remaining = &remaining[match_obj.end()..];
896                }
897                _ => {
898                    // Unknown pattern, treat as text
899                    elements.push(Element::Text("[".to_string()));
900                    remaining = &remaining[1..];
901                }
902            }
903        } else {
904            // Process non-link special characters
905
906            // Add any text before the special character
907            if next_special > 0 && next_special < remaining.len() {
908                elements.push(Element::Text(remaining[..next_special].to_string()));
909                remaining = &remaining[next_special..];
910            }
911
912            // Process the special element
913            match special_type {
914                "code" => {
915                    // Find end of code
916                    if let Some(code_end) = remaining[1..].find('`') {
917                        let code = &remaining[1..1 + code_end];
918                        elements.push(Element::Code(code.to_string()));
919                        remaining = &remaining[1 + code_end + 1..];
920                    } else {
921                        // No closing backtick, treat as text
922                        elements.push(Element::Text(remaining.to_string()));
923                        break;
924                    }
925                }
926                "bold" => {
927                    // Check for bold text
928                    if let Some(bold_end) = remaining[2..].find("**") {
929                        let bold_text = &remaining[2..2 + bold_end];
930                        elements.push(Element::Bold(bold_text.to_string()));
931                        remaining = &remaining[2 + bold_end + 2..];
932                    } else {
933                        // No closing **, treat as text
934                        elements.push(Element::Text("**".to_string()));
935                        remaining = &remaining[2..];
936                    }
937                }
938                "italic" => {
939                    // Check for italic text
940                    if let Some(italic_end) = remaining[1..].find('*') {
941                        let italic_text = &remaining[1..1 + italic_end];
942                        elements.push(Element::Italic(italic_text.to_string()));
943                        remaining = &remaining[1 + italic_end + 1..];
944                    } else {
945                        // No closing *, treat as text
946                        elements.push(Element::Text("*".to_string()));
947                        remaining = &remaining[1..];
948                    }
949                }
950                _ => {
951                    // No special elements found, add all remaining text
952                    elements.push(Element::Text(remaining.to_string()));
953                    break;
954                }
955            }
956        }
957    }
958
959    elements
960}
961
962/// Reflow elements for sentence-per-line mode
963fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
964    let abbreviations = get_abbreviations(custom_abbreviations);
965    let mut lines = Vec::new();
966    let mut current_line = String::new();
967
968    for element in elements.iter() {
969        let element_str = format!("{element}");
970
971        // For text elements, split into sentences
972        if let Element::Text(text) = element {
973            // Simply append text - it already has correct spacing from tokenization
974            let combined = format!("{current_line}{text}");
975            // Use the pre-computed abbreviations set to avoid redundant computation
976            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
977
978            if sentences.len() > 1 {
979                // We found sentence boundaries
980                for (i, sentence) in sentences.iter().enumerate() {
981                    if i == 0 {
982                        // First sentence might continue from previous elements
983                        // But check if it ends with an abbreviation
984                        let trimmed = sentence.trim();
985
986                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
987                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
988                            current_line = sentence.to_string();
989                        } else {
990                            // Normal case - emit the first sentence
991                            lines.push(sentence.to_string());
992                            current_line.clear();
993                        }
994                    } else if i == sentences.len() - 1 {
995                        // Last sentence: check if it's complete or incomplete
996                        let trimmed = sentence.trim();
997                        let ends_with_sentence_punct =
998                            trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
999
1000                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
1001                            // Complete sentence - emit it immediately
1002                            lines.push(sentence.to_string());
1003                            current_line.clear();
1004                        } else {
1005                            // Incomplete sentence - save for next iteration
1006                            current_line = sentence.to_string();
1007                        }
1008                    } else {
1009                        // Complete sentences in the middle
1010                        lines.push(sentence.to_string());
1011                    }
1012                }
1013            } else {
1014                // No sentence boundary found, continue accumulating
1015                current_line = combined;
1016            }
1017        } else {
1018            // Non-text elements (Code, Bold, Italic, etc.)
1019            // Add space before element if needed (unless it's after an opening paren/bracket)
1020            if !current_line.is_empty()
1021                && !current_line.ends_with(' ')
1022                && !current_line.ends_with('(')
1023                && !current_line.ends_with('[')
1024            {
1025                current_line.push(' ');
1026            }
1027            current_line.push_str(&element_str);
1028        }
1029    }
1030
1031    // Add any remaining content
1032    if !current_line.is_empty() {
1033        lines.push(current_line.trim().to_string());
1034    }
1035    lines
1036}
1037
1038/// Reflow elements into lines that fit within the line length
1039fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
1040    let mut lines = Vec::new();
1041    let mut current_line = String::new();
1042    let mut current_length = 0;
1043
1044    for element in elements {
1045        let element_str = format!("{element}");
1046        let element_len = element.len();
1047
1048        // For text elements that might need breaking
1049        if let Element::Text(text) = element {
1050            // Check if original text had leading whitespace
1051            let has_leading_space = text.starts_with(char::is_whitespace);
1052            // If this is a text element, always process it word by word
1053            let words: Vec<&str> = text.split_whitespace().collect();
1054
1055            for (i, word) in words.iter().enumerate() {
1056                let word_len = word.chars().count();
1057                // Check if this "word" is just punctuation that should stay attached
1058                let is_trailing_punct = word
1059                    .chars()
1060                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
1061
1062                if current_length > 0 && current_length + 1 + word_len > options.line_length && !is_trailing_punct {
1063                    // Start a new line (but never for trailing punctuation)
1064                    lines.push(current_line.trim().to_string());
1065                    current_line = word.to_string();
1066                    current_length = word_len;
1067                } else {
1068                    // Add word to current line
1069                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
1070                    // AND this isn't trailing punctuation (which attaches directly)
1071                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
1072                        current_line.push(' ');
1073                        current_length += 1;
1074                    }
1075                    current_line.push_str(word);
1076                    current_length += word_len;
1077                }
1078            }
1079        } else {
1080            // For non-text elements (code, links, references), treat as atomic units
1081            // These should never be broken across lines
1082            if current_length > 0 && current_length + 1 + element_len > options.line_length {
1083                // Start a new line
1084                lines.push(current_line.trim().to_string());
1085                current_line = element_str;
1086                current_length = element_len;
1087            } else {
1088                // Add element to current line
1089                // Don't add space if the current line ends with an opening bracket/paren
1090                let ends_with_opener =
1091                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
1092                if current_length > 0 && !ends_with_opener {
1093                    current_line.push(' ');
1094                    current_length += 1;
1095                }
1096                current_line.push_str(&element_str);
1097                current_length += element_len;
1098            }
1099        }
1100    }
1101
1102    // Don't forget the last line
1103    if !current_line.is_empty() {
1104        lines.push(current_line.trim_end().to_string());
1105    }
1106
1107    lines
1108}
1109
1110/// Reflow markdown content preserving structure
1111pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
1112    let lines: Vec<&str> = content.lines().collect();
1113    let mut result = Vec::new();
1114    let mut i = 0;
1115
1116    while i < lines.len() {
1117        let line = lines[i];
1118        let trimmed = line.trim();
1119
1120        // Preserve empty lines
1121        if trimmed.is_empty() {
1122            result.push(String::new());
1123            i += 1;
1124            continue;
1125        }
1126
1127        // Preserve headings as-is
1128        if trimmed.starts_with('#') {
1129            result.push(line.to_string());
1130            i += 1;
1131            continue;
1132        }
1133
1134        // Preserve fenced code blocks
1135        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
1136            result.push(line.to_string());
1137            i += 1;
1138            // Copy lines until closing fence
1139            while i < lines.len() {
1140                result.push(lines[i].to_string());
1141                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
1142                    i += 1;
1143                    break;
1144                }
1145                i += 1;
1146            }
1147            continue;
1148        }
1149
1150        // Preserve indented code blocks (4+ columns accounting for tab expansion)
1151        if ElementCache::calculate_indentation_width_default(line) >= 4 {
1152            // Collect all consecutive indented lines
1153            result.push(line.to_string());
1154            i += 1;
1155            while i < lines.len() {
1156                let next_line = lines[i];
1157                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
1158                if ElementCache::calculate_indentation_width_default(next_line) >= 4 || next_line.trim().is_empty() {
1159                    result.push(next_line.to_string());
1160                    i += 1;
1161                } else {
1162                    break;
1163                }
1164            }
1165            continue;
1166        }
1167
1168        // Preserve block quotes (but reflow their content)
1169        if trimmed.starts_with('>') {
1170            let quote_prefix = line[0..line.find('>').unwrap() + 1].to_string();
1171            let quote_content = &line[quote_prefix.len()..].trim_start();
1172
1173            let reflowed = reflow_line(quote_content, options);
1174            for reflowed_line in reflowed.iter() {
1175                result.push(format!("{quote_prefix} {reflowed_line}"));
1176            }
1177            i += 1;
1178            continue;
1179        }
1180
1181        // Preserve horizontal rules first (before checking for lists)
1182        if is_horizontal_rule(trimmed) {
1183            result.push(line.to_string());
1184            i += 1;
1185            continue;
1186        }
1187
1188        // Preserve lists (but not horizontal rules)
1189        if (trimmed.starts_with('-') && !is_horizontal_rule(trimmed))
1190            || (trimmed.starts_with('*') && !is_horizontal_rule(trimmed))
1191            || trimmed.starts_with('+')
1192            || is_numbered_list_item(trimmed)
1193        {
1194            // Find the list marker and preserve indentation
1195            let indent = line.len() - line.trim_start().len();
1196            let indent_str = " ".repeat(indent);
1197
1198            // For numbered lists, find the period and the space after it
1199            // For bullet lists, find the marker and the space after it
1200            let mut marker_end = indent;
1201            let mut content_start = indent;
1202
1203            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1204                // Numbered list: find the period
1205                if let Some(period_pos) = line[indent..].find('.') {
1206                    marker_end = indent + period_pos + 1; // Include the period
1207                    content_start = marker_end;
1208                    // Skip any spaces after the period to find content start
1209                    while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1210                        content_start += 1;
1211                    }
1212                }
1213            } else {
1214                // Bullet list: marker is single character
1215                marker_end = indent + 1; // Just the marker character
1216                content_start = marker_end;
1217                // Skip any spaces after the marker
1218                while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1219                    content_start += 1;
1220                }
1221            }
1222
1223            let marker = &line[indent..marker_end];
1224
1225            // Collect all content for this list item (including continuation lines)
1226            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1227            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1228            i += 1;
1229
1230            // Collect continuation lines (indented lines that are part of this list item)
1231            while i < lines.len() {
1232                let next_line = lines[i];
1233                let next_trimmed = next_line.trim();
1234
1235                // Stop if we hit an empty line or another list item or special block
1236                if next_trimmed.is_empty()
1237                    || next_trimmed.starts_with('#')
1238                    || next_trimmed.starts_with("```")
1239                    || next_trimmed.starts_with("~~~")
1240                    || next_trimmed.starts_with('>')
1241                    || next_trimmed.starts_with('|')
1242                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1243                    || is_horizontal_rule(next_trimmed)
1244                    || (next_trimmed.starts_with('-')
1245                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1246                    || (next_trimmed.starts_with('*')
1247                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1248                    || (next_trimmed.starts_with('+')
1249                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1250                    || is_numbered_list_item(next_trimmed)
1251                    || is_definition_list_item(next_trimmed)
1252                {
1253                    break;
1254                }
1255
1256                // Check if this line is indented (continuation of list item)
1257                let next_indent = next_line.len() - next_line.trim_start().len();
1258                if next_indent >= content_start {
1259                    // This is a continuation line - add its content
1260                    // Preserve hard breaks while trimming excessive whitespace
1261                    let trimmed_start = next_line.trim_start();
1262                    list_content.push(trim_preserving_hard_break(trimmed_start));
1263                    i += 1;
1264                } else {
1265                    // Not indented enough, not part of this list item
1266                    break;
1267                }
1268            }
1269
1270            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1271            // Hard breaks should prevent joining with the next line
1272            let combined_content = if options.preserve_breaks {
1273                list_content[0].clone()
1274            } else {
1275                // Check if any lines have hard breaks - if so, preserve the structure
1276                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1277                if has_hard_breaks {
1278                    // Don't join lines with hard breaks - keep them separate with newlines
1279                    list_content.join("\n")
1280                } else {
1281                    // No hard breaks, safe to join with spaces
1282                    list_content.join(" ")
1283                }
1284            };
1285
1286            // Calculate the proper indentation for continuation lines
1287            let trimmed_marker = marker;
1288            let continuation_spaces = content_start;
1289
1290            // Adjust line length to account for list marker and space
1291            let prefix_length = indent + trimmed_marker.len() + 1;
1292
1293            // Create adjusted options with reduced line length
1294            let adjusted_options = ReflowOptions {
1295                line_length: options.line_length.saturating_sub(prefix_length),
1296                ..options.clone()
1297            };
1298
1299            let reflowed = reflow_line(&combined_content, &adjusted_options);
1300            for (j, reflowed_line) in reflowed.iter().enumerate() {
1301                if j == 0 {
1302                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1303                } else {
1304                    // Continuation lines aligned with text after marker
1305                    let continuation_indent = " ".repeat(continuation_spaces);
1306                    result.push(format!("{continuation_indent}{reflowed_line}"));
1307                }
1308            }
1309            continue;
1310        }
1311
1312        // Preserve tables
1313        if crate::utils::table_utils::TableUtils::is_potential_table_row(line) {
1314            result.push(line.to_string());
1315            i += 1;
1316            continue;
1317        }
1318
1319        // Preserve reference definitions
1320        if trimmed.starts_with('[') && line.contains("]:") {
1321            result.push(line.to_string());
1322            i += 1;
1323            continue;
1324        }
1325
1326        // Preserve definition list items (extended markdown)
1327        if is_definition_list_item(trimmed) {
1328            result.push(line.to_string());
1329            i += 1;
1330            continue;
1331        }
1332
1333        // Check if this is a single line that doesn't need processing
1334        let mut is_single_line_paragraph = true;
1335        if i + 1 < lines.len() {
1336            let next_line = lines[i + 1];
1337            let next_trimmed = next_line.trim();
1338            // Check if next line starts a new block
1339            if !next_trimmed.is_empty()
1340                && !next_trimmed.starts_with('#')
1341                && !next_trimmed.starts_with("```")
1342                && !next_trimmed.starts_with("~~~")
1343                && !next_trimmed.starts_with('>')
1344                && !next_trimmed.starts_with('|')
1345                && !(next_trimmed.starts_with('[') && next_line.contains("]:"))
1346                && !is_horizontal_rule(next_trimmed)
1347                && !(next_trimmed.starts_with('-')
1348                    && !is_horizontal_rule(next_trimmed)
1349                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1350                && !(next_trimmed.starts_with('*')
1351                    && !is_horizontal_rule(next_trimmed)
1352                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1353                && !(next_trimmed.starts_with('+')
1354                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1355                && !is_numbered_list_item(next_trimmed)
1356            {
1357                is_single_line_paragraph = false;
1358            }
1359        }
1360
1361        // If it's a single line that fits, just add it as-is
1362        if is_single_line_paragraph && line.chars().count() <= options.line_length {
1363            result.push(line.to_string());
1364            i += 1;
1365            continue;
1366        }
1367
1368        // For regular paragraphs, collect consecutive lines
1369        let mut paragraph_parts = Vec::new();
1370        let mut current_part = vec![line];
1371        i += 1;
1372
1373        // If preserve_breaks is true, treat each line separately
1374        if options.preserve_breaks {
1375            // Don't collect consecutive lines - just reflow this single line
1376            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
1377                Some("\\")
1378            } else if line.ends_with("  ") {
1379                Some("  ")
1380            } else {
1381                None
1382            };
1383            let reflowed = reflow_line(line, options);
1384
1385            // Preserve hard breaks (two trailing spaces or backslash)
1386            if let Some(break_marker) = hard_break_type {
1387                if !reflowed.is_empty() {
1388                    let mut reflowed_with_break = reflowed;
1389                    let last_idx = reflowed_with_break.len() - 1;
1390                    if !has_hard_break(&reflowed_with_break[last_idx]) {
1391                        reflowed_with_break[last_idx].push_str(break_marker);
1392                    }
1393                    result.extend(reflowed_with_break);
1394                }
1395            } else {
1396                result.extend(reflowed);
1397            }
1398        } else {
1399            // Original behavior: collect consecutive lines into a paragraph
1400            while i < lines.len() {
1401                let prev_line = if !current_part.is_empty() {
1402                    current_part.last().unwrap()
1403                } else {
1404                    ""
1405                };
1406                let next_line = lines[i];
1407                let next_trimmed = next_line.trim();
1408
1409                // Stop at empty lines or special blocks
1410                if next_trimmed.is_empty()
1411                    || next_trimmed.starts_with('#')
1412                    || next_trimmed.starts_with("```")
1413                    || next_trimmed.starts_with("~~~")
1414                    || next_trimmed.starts_with('>')
1415                    || next_trimmed.starts_with('|')
1416                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1417                    || is_horizontal_rule(next_trimmed)
1418                    || (next_trimmed.starts_with('-')
1419                        && !is_horizontal_rule(next_trimmed)
1420                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1421                    || (next_trimmed.starts_with('*')
1422                        && !is_horizontal_rule(next_trimmed)
1423                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1424                    || (next_trimmed.starts_with('+')
1425                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1426                    || is_numbered_list_item(next_trimmed)
1427                    || is_definition_list_item(next_trimmed)
1428                {
1429                    break;
1430                }
1431
1432                // Check if previous line ends with hard break (two spaces or backslash)
1433                if has_hard_break(prev_line) {
1434                    // Start a new part after hard break
1435                    paragraph_parts.push(current_part.join(" "));
1436                    current_part = vec![next_line];
1437                } else {
1438                    current_part.push(next_line);
1439                }
1440                i += 1;
1441            }
1442
1443            // Add the last part
1444            if !current_part.is_empty() {
1445                if current_part.len() == 1 {
1446                    // Single line, don't add trailing space
1447                    paragraph_parts.push(current_part[0].to_string());
1448                } else {
1449                    paragraph_parts.push(current_part.join(" "));
1450                }
1451            }
1452
1453            // Reflow each part separately, preserving hard breaks
1454            for (j, part) in paragraph_parts.iter().enumerate() {
1455                let reflowed = reflow_line(part, options);
1456                result.extend(reflowed);
1457
1458                // Preserve hard break by ensuring last line of part ends with hard break marker
1459                // Use two spaces as the default hard break format for reflows
1460                if j < paragraph_parts.len() - 1 && !result.is_empty() {
1461                    let last_idx = result.len() - 1;
1462                    if !has_hard_break(&result[last_idx]) {
1463                        result[last_idx].push_str("  ");
1464                    }
1465                }
1466            }
1467        }
1468    }
1469
1470    // Preserve trailing newline if the original content had one
1471    let result_text = result.join("\n");
1472    if content.ends_with('\n') && !result_text.ends_with('\n') {
1473        format!("{result_text}\n")
1474    } else {
1475        result_text
1476    }
1477}
1478
/// Result of reflowing one paragraph: where it sits in the source and what to put there
///
/// `start_byte..end_byte` is a half-open byte range into the original content.
/// Replacing that range with `reflowed_text` applies the reflow.
#[derive(Clone, Debug)]
pub struct ParagraphReflow {
    /// Byte offset in the original content at which the paragraph's first line begins
    pub start_byte: usize,
    /// Byte offset in the original content just past the paragraph (exclusive);
    /// the newline after the paragraph's last line is inside the range when one exists
    pub end_byte: usize,
    /// Replacement text for the range; it ends with a newline exactly when
    /// the range itself covers a trailing newline
    pub reflowed_text: String,
}
1489
1490/// Reflow a single paragraph at the specified line number
1491///
1492/// This function finds the paragraph containing the given line number,
1493/// reflows it according to the specified line length, and returns
1494/// information about the paragraph location and its reflowed text.
1495///
1496/// # Arguments
1497///
1498/// * `content` - The full document content
1499/// * `line_number` - The 1-based line number within the paragraph to reflow
1500/// * `line_length` - The target line length for reflowing
1501///
1502/// # Returns
1503///
1504/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
1505/// or `None` if the line number is out of bounds or the content at that
1506/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
1507pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
1508    if line_number == 0 {
1509        return None;
1510    }
1511
1512    let lines: Vec<&str> = content.lines().collect();
1513
1514    // Check if line number is valid (1-based)
1515    if line_number > lines.len() {
1516        return None;
1517    }
1518
1519    let target_idx = line_number - 1; // Convert to 0-based
1520    let target_line = lines[target_idx];
1521    let trimmed = target_line.trim();
1522
1523    // Don't reflow special blocks
1524    if trimmed.is_empty()
1525        || trimmed.starts_with('#')
1526        || trimmed.starts_with("```")
1527        || trimmed.starts_with("~~~")
1528        || ElementCache::calculate_indentation_width_default(target_line) >= 4
1529        || trimmed.starts_with('>')
1530        || crate::utils::table_utils::TableUtils::is_potential_table_row(target_line) // Tables
1531        || (trimmed.starts_with('[') && target_line.contains("]:")) // Reference definitions
1532        || is_horizontal_rule(trimmed)
1533        || ((trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+'))
1534            && !is_horizontal_rule(trimmed)
1535            && (trimmed.len() == 1 || trimmed.chars().nth(1) == Some(' ')))
1536        || is_numbered_list_item(trimmed)
1537        || is_definition_list_item(trimmed)
1538    {
1539        return None;
1540    }
1541
1542    // Find paragraph start - scan backward until blank line or special block
1543    let mut para_start = target_idx;
1544    while para_start > 0 {
1545        let prev_idx = para_start - 1;
1546        let prev_line = lines[prev_idx];
1547        let prev_trimmed = prev_line.trim();
1548
1549        // Stop at blank line or special blocks
1550        if prev_trimmed.is_empty()
1551            || prev_trimmed.starts_with('#')
1552            || prev_trimmed.starts_with("```")
1553            || prev_trimmed.starts_with("~~~")
1554            || ElementCache::calculate_indentation_width_default(prev_line) >= 4
1555            || prev_trimmed.starts_with('>')
1556            || crate::utils::table_utils::TableUtils::is_potential_table_row(prev_line)
1557            || (prev_trimmed.starts_with('[') && prev_line.contains("]:"))
1558            || is_horizontal_rule(prev_trimmed)
1559            || ((prev_trimmed.starts_with('-') || prev_trimmed.starts_with('*') || prev_trimmed.starts_with('+'))
1560                && !is_horizontal_rule(prev_trimmed)
1561                && (prev_trimmed.len() == 1 || prev_trimmed.chars().nth(1) == Some(' ')))
1562            || is_numbered_list_item(prev_trimmed)
1563            || is_definition_list_item(prev_trimmed)
1564        {
1565            break;
1566        }
1567
1568        para_start = prev_idx;
1569    }
1570
1571    // Find paragraph end - scan forward until blank line or special block
1572    let mut para_end = target_idx;
1573    while para_end + 1 < lines.len() {
1574        let next_idx = para_end + 1;
1575        let next_line = lines[next_idx];
1576        let next_trimmed = next_line.trim();
1577
1578        // Stop at blank line or special blocks
1579        if next_trimmed.is_empty()
1580            || next_trimmed.starts_with('#')
1581            || next_trimmed.starts_with("```")
1582            || next_trimmed.starts_with("~~~")
1583            || ElementCache::calculate_indentation_width_default(next_line) >= 4
1584            || next_trimmed.starts_with('>')
1585            || crate::utils::table_utils::TableUtils::is_potential_table_row(next_line)
1586            || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1587            || is_horizontal_rule(next_trimmed)
1588            || ((next_trimmed.starts_with('-') || next_trimmed.starts_with('*') || next_trimmed.starts_with('+'))
1589                && !is_horizontal_rule(next_trimmed)
1590                && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1591            || is_numbered_list_item(next_trimmed)
1592            || is_definition_list_item(next_trimmed)
1593        {
1594            break;
1595        }
1596
1597        para_end = next_idx;
1598    }
1599
1600    // Extract paragraph lines
1601    let paragraph_lines = &lines[para_start..=para_end];
1602
1603    // Calculate byte offsets
1604    let mut start_byte = 0;
1605    for line in lines.iter().take(para_start) {
1606        start_byte += line.len() + 1; // +1 for newline
1607    }
1608
1609    let mut end_byte = start_byte;
1610    for line in paragraph_lines.iter() {
1611        end_byte += line.len() + 1; // +1 for newline
1612    }
1613
1614    // Track whether the byte range includes a trailing newline
1615    // (it doesn't if this is the last line and the file doesn't end with newline)
1616    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
1617
1618    // Adjust end_byte if the last line doesn't have a newline
1619    if !includes_trailing_newline {
1620        end_byte -= 1;
1621    }
1622
1623    // Join paragraph lines and reflow
1624    let paragraph_text = paragraph_lines.join("\n");
1625
1626    // Create reflow options
1627    let options = ReflowOptions {
1628        line_length,
1629        break_on_sentences: true,
1630        preserve_breaks: false,
1631        sentence_per_line: false,
1632        abbreviations: None,
1633    };
1634
1635    // Reflow the paragraph using reflow_markdown to handle it properly
1636    let reflowed = reflow_markdown(&paragraph_text, &options);
1637
1638    // Ensure reflowed text matches whether the byte range includes a trailing newline
1639    // This is critical: if the range includes a newline, the replacement must too,
1640    // otherwise the next line will get appended to the reflowed paragraph
1641    let reflowed_text = if includes_trailing_newline {
1642        // Range includes newline - ensure reflowed text has one
1643        if reflowed.ends_with('\n') {
1644            reflowed
1645        } else {
1646            format!("{reflowed}\n")
1647        }
1648    } else {
1649        // Range doesn't include newline - ensure reflowed text doesn't have one
1650        if reflowed.ends_with('\n') {
1651            reflowed.trim_end_matches('\n').to_string()
1652        } else {
1653            reflowed
1654        }
1655    };
1656
1657    Some(ParagraphReflow {
1658        start_byte,
1659        end_byte,
1660        reflowed_text,
1661    })
1662}
1663
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the private helper text_ends_with_abbreviation() directly.
    ///
    /// It lives inline because the helper is private to this module.
    /// Tests of the public API and integration tests are kept in
    /// tests/utils/text_reflow_test.rs
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        // No custom abbreviations: only the built-in set is active
        let abbreviations = get_abbreviations(&None);

        // Built-in abbreviations (titles and i.e./e.g.) must be recognized
        let recognized = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognized {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Everything below must be rejected
        let rejected = [
            // Not in the built-in list (etc doesn't always have a period)
            "etc.",
            // Ordinary words that merely end a sentence
            "paradigms.",
            "programs.",
            "items.",
            "systems.",
            // Terminated by something other than a period
            "Dr?",
            "Mr!",
            "paradigms?",
            // No punctuation at all
            "word",
            // Empty string
            "",
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} not to end with an abbreviation"
            );
        }
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs