rumdl_lib/utils/
text_reflow.rs

1//! Text reflow utilities for MD013
2//!
3//! This module implements text wrapping/reflow functionality that preserves
4//! Markdown elements like links, emphasis, code spans, etc.
5
6use crate::utils::is_definition_list_item;
7use crate::utils::regex_cache::{
8    DISPLAY_MATH_REGEX, EMOJI_SHORTCODE_REGEX, FOOTNOTE_REF_REGEX, HTML_ENTITY_REGEX, HTML_TAG_PATTERN,
9    INLINE_IMAGE_FANCY_REGEX, INLINE_LINK_FANCY_REGEX, INLINE_MATH_REGEX, REF_IMAGE_REGEX, REF_LINK_REGEX,
10    SHORTCUT_REF_REGEX, STRIKETHROUGH_FANCY_REGEX, WIKI_LINK_REGEX,
11};
12use std::collections::HashSet;
13
/// Options controlling how text is reflowed.
///
/// `Debug` is derived in addition to `Clone` so option values can be logged
/// and embedded in other `Debug`-deriving types.
#[derive(Debug, Clone)]
pub struct ReflowOptions {
    /// Target line length, measured in characters
    pub line_length: usize,
    /// Whether to break on sentence boundaries when possible
    pub break_on_sentences: bool,
    /// Whether to preserve existing line breaks in paragraphs
    pub preserve_breaks: bool,
    /// Whether to enforce one sentence per line
    pub sentence_per_line: bool,
    /// Custom abbreviations for sentence detection.
    ///
    /// Trailing periods are optional - both "Dr" and "Dr." work the same.
    /// Custom abbreviations are always added to the built-in defaults; they
    /// never replace them.
    pub abbreviations: Option<Vec<String>>,
}

impl Default for ReflowOptions {
    /// Defaults: 80-character lines, sentence-aware breaking enabled, existing
    /// breaks not preserved, sentence-per-line disabled, no custom abbreviations.
    fn default() -> Self {
        Self {
            line_length: 80,
            break_on_sentences: true,
            preserve_breaks: false,
            sentence_per_line: false,
            abbreviations: None,
        }
    }
}
42
/// Build the abbreviation set used for sentence-boundary detection.
///
/// The returned set always contains the built-in defaults; any entries in
/// `custom` are added on top of them. Every entry is stored lowercase, and
/// custom entries have their trailing periods removed, so callers should
/// lowercase a candidate word and drop its trailing period before looking it up.
/// Custom entries that are empty after stripping periods are ignored.
fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
    // The built-in list is deliberately small. It only holds abbreviations that
    // conventionally ALWAYS carry a period and are followed by something
    // (a name, an example) rather than ending a sentence.
    //
    // Intentionally excluded:
    // - words that don't typically take periods (vs, etc)
    // - abbreviations that can end sentences (Inc., Ph.D., U.S.)
    const BUILT_IN: [&str; 9] = [
        // Titles - always have a period, always followed by a name
        "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr",
        // Latin - always written with periods, introduce examples/references
        "i.e", "e.g",
    ];

    let mut known: HashSet<String> = BUILT_IN.iter().map(|abbr| abbr.to_lowercase()).collect();

    // Merge user-supplied entries, normalized the same way as the built-ins
    known.extend(
        custom
            .iter()
            .flatten()
            .map(|abbr| abbr.trim_end_matches('.').to_lowercase())
            .filter(|abbr| !abbr.is_empty()),
    );

    known
}
77
/// Check if text ends with a known abbreviation followed by a period
///
/// Abbreviations only count when followed by a period, not ! or ?.
/// The whole last whitespace-separated word must match an entry, which
/// prevents false positives where a word merely ends in abbreviation-like
/// letters (e.g., "paradigms" ending in "ms").
///
/// `abbreviations` is expected to hold lowercase entries without trailing
/// periods; the last word is lowercased before the lookup.
///
/// Examples:
///   - "Dr." -> true (abbreviation)
///   - "Dr?" -> false (question, not abbreviation)
///   - "paradigms." -> false (not in abbreviation list)
///   - "paradigms?" -> false (question mark, not abbreviation)
///
/// See: Issue #150
fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Abbreviations require a period
    if !text.ends_with('.') {
        return false;
    }

    // Drop the trailing period(s), then look up the last whitespace-separated word.
    // `split_whitespace` never yields empty items, so no explicit emptiness check is needed.
    text.trim_end_matches('.')
        .split_whitespace()
        .next_back()
        .map_or(false, |word| abbreviations.contains(&word.to_lowercase()))
}

/// Detect if a character position is a sentence boundary
/// Based on the approach from github.com/JoshuaKGoldberg/sentences-per-line
///
/// `pos` is a **character** index into `text` (not a byte offset). The position
/// is a boundary when all of the following hold:
/// - the character at `pos` is `.`, `!` or `?`
/// - it is immediately followed by a space
/// - the first non-whitespace character after it exists and is uppercase
/// - for a period that is not at position 0: the text up to and including it
///   does not end with a known abbreviation
///
/// Returns `false` when `pos` is out of range or is the last character.
fn is_sentence_boundary(text: &str, pos: usize, abbreviations: &HashSet<String>) -> bool {
    // Walk char_indices so the character index `pos` can be mapped to a byte
    // offset. Slicing `text` directly with a character index is wrong for
    // non-ASCII input: it may cut the wrong prefix or panic inside a
    // multi-byte character.
    let mut rest = text.char_indices().skip(pos);

    let Some((byte_pos, c)) = rest.next() else {
        return false;
    };

    // Check for sentence-ending punctuation
    if !matches!(c, '.' | '!' | '?') {
        return false;
    }

    // Must be followed by at least one space (also rejects end of input)
    if !matches!(rest.next(), Some((_, ' '))) {
        return false;
    }

    // Skip any further whitespace to find the start of the next sentence;
    // running off the end means there is no next sentence.
    let Some(next_char) = rest.map(|(_, ch)| ch).find(|ch| !ch.is_whitespace()) else {
        return false;
    };

    // Next sentence must start with an uppercase character
    if !next_char.is_uppercase() {
        return false;
    }

    // Look-behind checks only apply to periods that have text before them
    if pos > 0 && c == '.' {
        // Prefix up to and including the period, sliced on a valid byte boundary
        let through_period = &text[..byte_pos + c.len_utf8()];
        if text_ends_with_abbreviation(through_period, abbreviations) {
            return false;
        }

        // Numeric guard: a numeric character right before the period and a
        // numeric character starting the next word is not treated as a boundary
        let prev_is_numeric = text[..byte_pos].chars().next_back().map_or(false, char::is_numeric);
        if prev_is_numeric && next_char.is_numeric() {
            return false;
        }
    }

    true
}
164
165/// Split text into sentences
166pub fn split_into_sentences(text: &str) -> Vec<String> {
167    split_into_sentences_custom(text, &None)
168}
169
170/// Split text into sentences with custom abbreviations
171pub fn split_into_sentences_custom(text: &str, custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
172    let abbreviations = get_abbreviations(custom_abbreviations);
173    split_into_sentences_with_set(text, &abbreviations)
174}
175
176/// Internal function to split text into sentences with a pre-computed abbreviations set
177/// Use this when calling multiple times in a loop to avoid repeatedly computing the set
178fn split_into_sentences_with_set(text: &str, abbreviations: &HashSet<String>) -> Vec<String> {
179    let mut sentences = Vec::new();
180    let mut current_sentence = String::new();
181    let mut chars = text.chars().peekable();
182    let mut pos = 0;
183
184    while let Some(c) = chars.next() {
185        current_sentence.push(c);
186
187        if is_sentence_boundary(text, pos, abbreviations) {
188            // Include the space after sentence if it exists
189            if chars.peek() == Some(&' ') {
190                chars.next();
191                pos += 1;
192            }
193            sentences.push(current_sentence.trim().to_string());
194            current_sentence.clear();
195        }
196
197        pos += 1;
198    }
199
200    // Add any remaining text as the last sentence
201    if !current_sentence.trim().is_empty() {
202        sentences.push(current_sentence.trim().to_string());
203    }
204    sentences
205}
206
/// Check if a line is a horizontal rule (---, ___, ***)
///
/// The line must start with one of `-`, `_` or `*`, consist solely of that
/// same character and spaces, and contain the character at least three times
/// (so "- - -" qualifies, while "--" and "-*-" do not). A line that starts
/// with a space is not accepted.
fn is_horizontal_rule(line: &str) -> bool {
    // The first character decides which marker the rest of the line must use
    let marker = match line.chars().next() {
        Some(c) if matches!(c, '-' | '_' | '*') => c,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for ch in line.chars() {
        if ch == marker {
            marker_count += 1;
        } else if ch != ' ' {
            // Anything other than the marker or a space disqualifies the line
            return false;
        }
    }

    marker_count >= 3
}
235
/// Check if a line is a numbered list item (e.g., "1. ", "10. ")
///
/// The line must start with one or more ASCII digits, followed by a period,
/// followed by either a space or the end of the line.
///
/// Only ASCII digits (`0`-`9`) are accepted. Other Unicode numeric characters
/// such as `½`, `²` or `Ⅳ` do not form Markdown ordered-list markers, so lines
/// starting with them are not treated as list items.
fn is_numbered_list_item(line: &str) -> bool {
    // Length of the leading digit run. ASCII digits are one byte each, so the
    // count is also a valid byte offset into `line`.
    let digits_len = line.bytes().take_while(u8::is_ascii_digit).count();
    if digits_len == 0 {
        return false;
    }

    // After the digits there must be a period, then a space or end of line
    match line[digits_len..].strip_prefix('.') {
        Some(after_period) => after_period.is_empty() || after_period.starts_with(' '),
        None => false,
    }
}
258
/// Check if a line ends with a hard break marker
///
/// Two markers are recognized (the two CommonMark hard line break forms):
/// 1. Two or more trailing spaces
/// 2. A backslash at the end of the line
///
/// A single trailing `\r` (from a CRLF line ending) is ignored before checking.
fn has_hard_break(line: &str) -> bool {
    let content = match line.strip_suffix('\r') {
        Some(without_cr) => without_cr,
        None => line,
    };

    ["  ", "\\"].iter().any(|marker| content.ends_with(*marker))
}
268
/// Trim trailing whitespace while keeping a hard break marker intact
///
/// Behavior, after dropping a single trailing `\r` (CRLF line endings):
/// - a line ending in a backslash (mdformat-style hard break) is returned as-is
/// - a line ending in two or more spaces (traditional hard break) is trimmed
///   and then given back exactly two trailing spaces, unless nothing but
///   whitespace remains, in which case the result is empty
/// - any other line simply has its trailing whitespace removed
fn trim_preserving_hard_break(s: &str) -> String {
    let line = s.strip_suffix('\r').unwrap_or(s);

    // Backslash hard break: keep the line exactly as written
    if line.ends_with('\\') {
        return line.to_owned();
    }

    let content = line.trim_end();
    let keeps_space_break = line.ends_with("  ") && !content.is_empty();

    let mut trimmed = String::with_capacity(content.len() + 2);
    trimmed.push_str(content);
    if keeps_space_break {
        // Normalize any run of trailing whitespace to exactly two spaces
        trimmed.push_str("  ");
    }
    trimmed
}
299
300pub fn reflow_line(line: &str, options: &ReflowOptions) -> Vec<String> {
301    // For sentence-per-line mode, always process regardless of length
302    if options.sentence_per_line {
303        let elements = parse_markdown_elements(line);
304        return reflow_elements_sentence_per_line(&elements, &options.abbreviations);
305    }
306
307    // Quick check: if line is already short enough, return as-is
308    if line.chars().count() <= options.line_length {
309        return vec![line.to_string()];
310    }
311
312    // Parse the markdown to identify elements
313    let elements = parse_markdown_elements(line);
314
315    // Reflow the elements into lines
316    reflow_elements(&elements, options)
317}
318
/// One piece of parsed inline Markdown content.
///
/// Variants store only the inner content; the surrounding Markdown syntax is
/// re-added by the `Display` impl and accounted for by `Element::len`.
#[derive(Debug, Clone)]
enum Element {
    /// Plain text that can be wrapped
    Text(String),
    /// Inline link: `[text](url)`
    Link { text: String, url: String },
    /// Reference link: `[text][ref]`
    ReferenceLink { text: String, reference: String },
    /// Empty reference link: `[text][]`
    EmptyReferenceLink { text: String },
    /// Shortcut reference link: `[ref]`
    ShortcutReference { reference: String },
    /// Inline image: `![alt](url)`
    InlineImage { alt: String, url: String },
    /// Reference image: `![alt][ref]`
    ReferenceImage { alt: String, reference: String },
    /// Empty reference image: `![alt][]`
    EmptyReferenceImage { alt: String },
    /// Footnote reference: `[^note]`
    FootnoteReference { note: String },
    /// Strikethrough text: `~~text~~`
    Strikethrough(String),
    /// Wiki-style link: `[[wiki]]` or `[[wiki|text]]`
    WikiLink(String),
    /// Inline math: `$math$`
    InlineMath(String),
    /// Display math: `$$math$$`
    DisplayMath(String),
    /// Emoji shortcode: `:emoji:`
    EmojiShortcode(String),
    /// HTML tag, stored with its angle brackets: `<tag>`, `</tag>` or `<tag/>`
    HtmlTag(String),
    /// HTML entity, stored complete: `&nbsp;` or `&#123;`
    HtmlEntity(String),
    /// Inline code: `` `code` ``
    Code(String),
    /// Bold text: `**text**`
    Bold(String),
    /// Italic text: `*text*`
    Italic(String),
}

impl std::fmt::Display for Element {
    /// Render the element back to Markdown source, re-adding its delimiters.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Stored verbatim - nothing to wrap
            Self::Text(s) | Self::HtmlTag(s) | Self::HtmlEntity(s) => f.write_str(s),

            // Links and references
            Self::Link { text, url } => write!(f, "[{text}]({url})"),
            Self::ReferenceLink { text, reference } => write!(f, "[{text}][{reference}]"),
            Self::EmptyReferenceLink { text } => write!(f, "[{text}][]"),
            Self::ShortcutReference { reference } => write!(f, "[{reference}]"),
            Self::FootnoteReference { note } => write!(f, "[^{note}]"),
            Self::WikiLink(s) => write!(f, "[[{s}]]"),

            // Images
            Self::InlineImage { alt, url } => write!(f, "![{alt}]({url})"),
            Self::ReferenceImage { alt, reference } => write!(f, "![{alt}][{reference}]"),
            Self::EmptyReferenceImage { alt } => write!(f, "![{alt}][]"),

            // Symmetric delimiters
            Self::Strikethrough(s) => write!(f, "~~{s}~~"),
            Self::InlineMath(s) => write!(f, "${s}$"),
            Self::DisplayMath(s) => write!(f, "$${s}$$"),
            Self::EmojiShortcode(s) => write!(f, ":{s}:"),
            Self::Code(s) => write!(f, "`{s}`"),
            Self::Bold(s) => write!(f, "**{s}**"),
            Self::Italic(s) => write!(f, "*{s}*"),
        }
    }
}

impl Element {
    /// Length of the rendered element in characters (not bytes), i.e. the
    /// character count of the stored content plus the delimiters that
    /// `Display` adds around it.
    fn len(&self) -> usize {
        fn width(s: &str) -> usize {
            s.chars().count()
        }

        match self {
            // No delimiters added
            Self::Text(s) | Self::HtmlTag(s) | Self::HtmlEntity(s) => width(s),

            // Two delimiter characters: [ref] $math$ :emoji: `code` *text*
            Self::ShortcutReference { reference: s }
            | Self::InlineMath(s)
            | Self::EmojiShortcode(s)
            | Self::Code(s)
            | Self::Italic(s) => width(s) + 2,

            // Three delimiter characters: [^note]
            Self::FootnoteReference { note } => width(note) + 3,

            // Four delimiter characters: [text][] ~~text~~ [[wiki]] $$math$$ **text**
            Self::EmptyReferenceLink { text: s }
            | Self::Strikethrough(s)
            | Self::WikiLink(s)
            | Self::DisplayMath(s)
            | Self::Bold(s) => width(s) + 4,

            // Five delimiter characters: ![alt][]
            Self::EmptyReferenceImage { alt } => width(alt) + 5,

            // Two parts plus four delimiter characters: [text](url) [text][ref]
            Self::Link { text: first, url: second }
            | Self::ReferenceLink {
                text: first,
                reference: second,
            } => width(first) + width(second) + 4,

            // Two parts plus five delimiter characters: ![alt](url) ![alt][ref]
            Self::InlineImage { alt: first, url: second }
            | Self::ReferenceImage {
                alt: first,
                reference: second,
            } => width(first) + width(second) + 5,
        }
    }
}
413
414/// Parse markdown elements from text preserving the raw syntax
415///
416/// Detection order is critical:
417/// 1. Inline links [text](url) - must be detected first to avoid conflicts
418/// 2. Reference links [text][ref] - detected before shortcut references
419/// 3. Empty reference links [text][] - a special case of reference links
420/// 4. Shortcut reference links [ref] - detected last to avoid false positives
421/// 5. Other elements (code, bold, italic) - processed normally
422fn parse_markdown_elements(text: &str) -> Vec<Element> {
423    let mut elements = Vec::new();
424    let mut remaining = text;
425
426    while !remaining.is_empty() {
427        // Find the earliest occurrence of any markdown pattern
428        let mut earliest_match: Option<(usize, &str, fancy_regex::Match)> = None;
429
430        // Check for images first (they start with ! so should be detected before links)
431        // Inline images - ![alt](url)
432        if let Ok(Some(m)) = INLINE_IMAGE_FANCY_REGEX.find(remaining)
433            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
434        {
435            earliest_match = Some((m.start(), "inline_image", m));
436        }
437
438        // Reference images - ![alt][ref]
439        if let Ok(Some(m)) = REF_IMAGE_REGEX.find(remaining)
440            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
441        {
442            earliest_match = Some((m.start(), "ref_image", m));
443        }
444
445        // Check for footnote references - [^note]
446        if let Ok(Some(m)) = FOOTNOTE_REF_REGEX.find(remaining)
447            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
448        {
449            earliest_match = Some((m.start(), "footnote_ref", m));
450        }
451
452        // Check for inline links - [text](url)
453        if let Ok(Some(m)) = INLINE_LINK_FANCY_REGEX.find(remaining)
454            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
455        {
456            earliest_match = Some((m.start(), "inline_link", m));
457        }
458
459        // Check for reference links - [text][ref]
460        if let Ok(Some(m)) = REF_LINK_REGEX.find(remaining)
461            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
462        {
463            earliest_match = Some((m.start(), "ref_link", m));
464        }
465
466        // Check for shortcut reference links - [ref]
467        // Only check if we haven't found an earlier pattern that would conflict
468        if let Ok(Some(m)) = SHORTCUT_REF_REGEX.find(remaining)
469            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
470        {
471            earliest_match = Some((m.start(), "shortcut_ref", m));
472        }
473
474        // Check for wiki-style links - [[wiki]]
475        if let Ok(Some(m)) = WIKI_LINK_REGEX.find(remaining)
476            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
477        {
478            earliest_match = Some((m.start(), "wiki_link", m));
479        }
480
481        // Check for display math first (before inline) - $$math$$
482        if let Ok(Some(m)) = DISPLAY_MATH_REGEX.find(remaining)
483            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
484        {
485            earliest_match = Some((m.start(), "display_math", m));
486        }
487
488        // Check for inline math - $math$
489        if let Ok(Some(m)) = INLINE_MATH_REGEX.find(remaining)
490            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
491        {
492            earliest_match = Some((m.start(), "inline_math", m));
493        }
494
495        // Check for strikethrough - ~~text~~
496        if let Ok(Some(m)) = STRIKETHROUGH_FANCY_REGEX.find(remaining)
497            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
498        {
499            earliest_match = Some((m.start(), "strikethrough", m));
500        }
501
502        // Check for emoji shortcodes - :emoji:
503        if let Ok(Some(m)) = EMOJI_SHORTCODE_REGEX.find(remaining)
504            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
505        {
506            earliest_match = Some((m.start(), "emoji", m));
507        }
508
509        // Check for HTML entities - &nbsp; etc
510        if let Ok(Some(m)) = HTML_ENTITY_REGEX.find(remaining)
511            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
512        {
513            earliest_match = Some((m.start(), "html_entity", m));
514        }
515
516        // Check for HTML tags - <tag> </tag> <tag/>
517        // But exclude autolinks like <https://...> or <mailto:...>
518        if let Ok(Some(m)) = HTML_TAG_PATTERN.find(remaining)
519            && earliest_match.as_ref().is_none_or(|(start, _, _)| m.start() < *start)
520        {
521            // Check if this is an autolink (starts with protocol or mailto:)
522            let matched_text = &remaining[m.start()..m.end()];
523            let is_autolink = matched_text.starts_with("<http://")
524                || matched_text.starts_with("<https://")
525                || matched_text.starts_with("<mailto:")
526                || matched_text.starts_with("<ftp://")
527                || matched_text.starts_with("<ftps://");
528
529            if !is_autolink {
530                earliest_match = Some((m.start(), "html_tag", m));
531            }
532        }
533
534        // Find earliest non-link special characters
535        let mut next_special = remaining.len();
536        let mut special_type = "";
537
538        if let Some(pos) = remaining.find('`')
539            && pos < next_special
540        {
541            next_special = pos;
542            special_type = "code";
543        }
544        if let Some(pos) = remaining.find("**")
545            && pos < next_special
546        {
547            next_special = pos;
548            special_type = "bold";
549        }
550        if let Some(pos) = remaining.find('*')
551            && pos < next_special
552            && !remaining[pos..].starts_with("**")
553        {
554            next_special = pos;
555            special_type = "italic";
556        }
557
558        // Determine which pattern to process first
559        let should_process_markdown_link = if let Some((pos, _, _)) = earliest_match {
560            pos < next_special
561        } else {
562            false
563        };
564
565        if should_process_markdown_link {
566            let (pos, pattern_type, match_obj) = earliest_match.unwrap();
567
568            // Add any text before the match
569            if pos > 0 {
570                elements.push(Element::Text(remaining[..pos].to_string()));
571            }
572
573            // Process the matched pattern
574            match pattern_type {
575                "inline_image" => {
576                    if let Ok(Some(caps)) = INLINE_IMAGE_FANCY_REGEX.captures(remaining) {
577                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
578                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
579                        elements.push(Element::InlineImage {
580                            alt: alt.to_string(),
581                            url: url.to_string(),
582                        });
583                        remaining = &remaining[match_obj.end()..];
584                    } else {
585                        elements.push(Element::Text("!".to_string()));
586                        remaining = &remaining[1..];
587                    }
588                }
589                "ref_image" => {
590                    if let Ok(Some(caps)) = REF_IMAGE_REGEX.captures(remaining) {
591                        let alt = caps.get(1).map(|m| m.as_str()).unwrap_or("");
592                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
593
594                        if reference.is_empty() {
595                            elements.push(Element::EmptyReferenceImage { alt: alt.to_string() });
596                        } else {
597                            elements.push(Element::ReferenceImage {
598                                alt: alt.to_string(),
599                                reference: reference.to_string(),
600                            });
601                        }
602                        remaining = &remaining[match_obj.end()..];
603                    } else {
604                        elements.push(Element::Text("!".to_string()));
605                        remaining = &remaining[1..];
606                    }
607                }
608                "footnote_ref" => {
609                    if let Ok(Some(caps)) = FOOTNOTE_REF_REGEX.captures(remaining) {
610                        let note = caps.get(1).map(|m| m.as_str()).unwrap_or("");
611                        elements.push(Element::FootnoteReference { note: note.to_string() });
612                        remaining = &remaining[match_obj.end()..];
613                    } else {
614                        elements.push(Element::Text("[".to_string()));
615                        remaining = &remaining[1..];
616                    }
617                }
618                "inline_link" => {
619                    if let Ok(Some(caps)) = INLINE_LINK_FANCY_REGEX.captures(remaining) {
620                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
621                        let url = caps.get(2).map(|m| m.as_str()).unwrap_or("");
622                        elements.push(Element::Link {
623                            text: text.to_string(),
624                            url: url.to_string(),
625                        });
626                        remaining = &remaining[match_obj.end()..];
627                    } else {
628                        // Fallback - shouldn't happen
629                        elements.push(Element::Text("[".to_string()));
630                        remaining = &remaining[1..];
631                    }
632                }
633                "ref_link" => {
634                    if let Ok(Some(caps)) = REF_LINK_REGEX.captures(remaining) {
635                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
636                        let reference = caps.get(2).map(|m| m.as_str()).unwrap_or("");
637
638                        if reference.is_empty() {
639                            // Empty reference link [text][]
640                            elements.push(Element::EmptyReferenceLink { text: text.to_string() });
641                        } else {
642                            // Regular reference link [text][ref]
643                            elements.push(Element::ReferenceLink {
644                                text: text.to_string(),
645                                reference: reference.to_string(),
646                            });
647                        }
648                        remaining = &remaining[match_obj.end()..];
649                    } else {
650                        // Fallback - shouldn't happen
651                        elements.push(Element::Text("[".to_string()));
652                        remaining = &remaining[1..];
653                    }
654                }
655                "shortcut_ref" => {
656                    if let Ok(Some(caps)) = SHORTCUT_REF_REGEX.captures(remaining) {
657                        let reference = caps.get(1).map(|m| m.as_str()).unwrap_or("");
658                        elements.push(Element::ShortcutReference {
659                            reference: reference.to_string(),
660                        });
661                        remaining = &remaining[match_obj.end()..];
662                    } else {
663                        // Fallback - shouldn't happen
664                        elements.push(Element::Text("[".to_string()));
665                        remaining = &remaining[1..];
666                    }
667                }
668                "wiki_link" => {
669                    if let Ok(Some(caps)) = WIKI_LINK_REGEX.captures(remaining) {
670                        let content = caps.get(1).map(|m| m.as_str()).unwrap_or("");
671                        elements.push(Element::WikiLink(content.to_string()));
672                        remaining = &remaining[match_obj.end()..];
673                    } else {
674                        elements.push(Element::Text("[[".to_string()));
675                        remaining = &remaining[2..];
676                    }
677                }
678                "display_math" => {
679                    if let Ok(Some(caps)) = DISPLAY_MATH_REGEX.captures(remaining) {
680                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
681                        elements.push(Element::DisplayMath(math.to_string()));
682                        remaining = &remaining[match_obj.end()..];
683                    } else {
684                        elements.push(Element::Text("$$".to_string()));
685                        remaining = &remaining[2..];
686                    }
687                }
688                "inline_math" => {
689                    if let Ok(Some(caps)) = INLINE_MATH_REGEX.captures(remaining) {
690                        let math = caps.get(1).map(|m| m.as_str()).unwrap_or("");
691                        elements.push(Element::InlineMath(math.to_string()));
692                        remaining = &remaining[match_obj.end()..];
693                    } else {
694                        elements.push(Element::Text("$".to_string()));
695                        remaining = &remaining[1..];
696                    }
697                }
698                "strikethrough" => {
699                    if let Ok(Some(caps)) = STRIKETHROUGH_FANCY_REGEX.captures(remaining) {
700                        let text = caps.get(1).map(|m| m.as_str()).unwrap_or("");
701                        elements.push(Element::Strikethrough(text.to_string()));
702                        remaining = &remaining[match_obj.end()..];
703                    } else {
704                        elements.push(Element::Text("~~".to_string()));
705                        remaining = &remaining[2..];
706                    }
707                }
708                "emoji" => {
709                    if let Ok(Some(caps)) = EMOJI_SHORTCODE_REGEX.captures(remaining) {
710                        let emoji = caps.get(1).map(|m| m.as_str()).unwrap_or("");
711                        elements.push(Element::EmojiShortcode(emoji.to_string()));
712                        remaining = &remaining[match_obj.end()..];
713                    } else {
714                        elements.push(Element::Text(":".to_string()));
715                        remaining = &remaining[1..];
716                    }
717                }
718                "html_entity" => {
719                    // HTML entities are captured whole
720                    elements.push(Element::HtmlEntity(remaining[..match_obj.end()].to_string()));
721                    remaining = &remaining[match_obj.end()..];
722                }
723                "html_tag" => {
724                    // HTML tags are captured whole
725                    elements.push(Element::HtmlTag(remaining[..match_obj.end()].to_string()));
726                    remaining = &remaining[match_obj.end()..];
727                }
728                _ => {
729                    // Unknown pattern, treat as text
730                    elements.push(Element::Text("[".to_string()));
731                    remaining = &remaining[1..];
732                }
733            }
734        } else {
735            // Process non-link special characters
736
737            // Add any text before the special character
738            if next_special > 0 && next_special < remaining.len() {
739                elements.push(Element::Text(remaining[..next_special].to_string()));
740                remaining = &remaining[next_special..];
741            }
742
743            // Process the special element
744            match special_type {
745                "code" => {
746                    // Find end of code
747                    if let Some(code_end) = remaining[1..].find('`') {
748                        let code = &remaining[1..1 + code_end];
749                        elements.push(Element::Code(code.to_string()));
750                        remaining = &remaining[1 + code_end + 1..];
751                    } else {
752                        // No closing backtick, treat as text
753                        elements.push(Element::Text(remaining.to_string()));
754                        break;
755                    }
756                }
757                "bold" => {
758                    // Check for bold text
759                    if let Some(bold_end) = remaining[2..].find("**") {
760                        let bold_text = &remaining[2..2 + bold_end];
761                        elements.push(Element::Bold(bold_text.to_string()));
762                        remaining = &remaining[2 + bold_end + 2..];
763                    } else {
764                        // No closing **, treat as text
765                        elements.push(Element::Text("**".to_string()));
766                        remaining = &remaining[2..];
767                    }
768                }
769                "italic" => {
770                    // Check for italic text
771                    if let Some(italic_end) = remaining[1..].find('*') {
772                        let italic_text = &remaining[1..1 + italic_end];
773                        elements.push(Element::Italic(italic_text.to_string()));
774                        remaining = &remaining[1 + italic_end + 1..];
775                    } else {
776                        // No closing *, treat as text
777                        elements.push(Element::Text("*".to_string()));
778                        remaining = &remaining[1..];
779                    }
780                }
781                _ => {
782                    // No special elements found, add all remaining text
783                    elements.push(Element::Text(remaining.to_string()));
784                    break;
785                }
786            }
787        }
788    }
789
790    elements
791}
792
793/// Reflow elements for sentence-per-line mode
794fn reflow_elements_sentence_per_line(elements: &[Element], custom_abbreviations: &Option<Vec<String>>) -> Vec<String> {
795    let abbreviations = get_abbreviations(custom_abbreviations);
796    let mut lines = Vec::new();
797    let mut current_line = String::new();
798
799    for element in elements.iter() {
800        let element_str = format!("{element}");
801
802        // For text elements, split into sentences
803        if let Element::Text(text) = element {
804            // Simply append text - it already has correct spacing from tokenization
805            let combined = format!("{current_line}{text}");
806            // Use the pre-computed abbreviations set to avoid redundant computation
807            let sentences = split_into_sentences_with_set(&combined, &abbreviations);
808
809            if sentences.len() > 1 {
810                // We found sentence boundaries
811                for (i, sentence) in sentences.iter().enumerate() {
812                    if i == 0 {
813                        // First sentence might continue from previous elements
814                        // But check if it ends with an abbreviation
815                        let trimmed = sentence.trim();
816
817                        if text_ends_with_abbreviation(trimmed, &abbreviations) {
818                            // Don't emit yet - this sentence ends with abbreviation, continue accumulating
819                            current_line = sentence.to_string();
820                        } else {
821                            // Normal case - emit the first sentence
822                            lines.push(sentence.to_string());
823                            current_line.clear();
824                        }
825                    } else if i == sentences.len() - 1 {
826                        // Last sentence: check if it's complete or incomplete
827                        let trimmed = sentence.trim();
828                        let ends_with_sentence_punct =
829                            trimmed.ends_with('.') || trimmed.ends_with('!') || trimmed.ends_with('?');
830
831                        if ends_with_sentence_punct && !text_ends_with_abbreviation(trimmed, &abbreviations) {
832                            // Complete sentence - emit it immediately
833                            lines.push(sentence.to_string());
834                            current_line.clear();
835                        } else {
836                            // Incomplete sentence - save for next iteration
837                            current_line = sentence.to_string();
838                        }
839                    } else {
840                        // Complete sentences in the middle
841                        lines.push(sentence.to_string());
842                    }
843                }
844            } else {
845                // No sentence boundary found, continue accumulating
846                current_line = combined;
847            }
848        } else {
849            // Non-text elements (Code, Bold, Italic, etc.)
850            // Add space before element if needed (unless it's after an opening paren/bracket)
851            if !current_line.is_empty()
852                && !current_line.ends_with(' ')
853                && !current_line.ends_with('(')
854                && !current_line.ends_with('[')
855            {
856                current_line.push(' ');
857            }
858            current_line.push_str(&element_str);
859        }
860    }
861
862    // Add any remaining content
863    if !current_line.is_empty() {
864        lines.push(current_line.trim().to_string());
865    }
866    lines
867}
868
869/// Reflow elements into lines that fit within the line length
870fn reflow_elements(elements: &[Element], options: &ReflowOptions) -> Vec<String> {
871    let mut lines = Vec::new();
872    let mut current_line = String::new();
873    let mut current_length = 0;
874
875    for element in elements {
876        let element_str = format!("{element}");
877        let element_len = element.len();
878
879        // For text elements that might need breaking
880        if let Element::Text(text) = element {
881            // Check if original text had leading whitespace
882            let has_leading_space = text.starts_with(char::is_whitespace);
883            // If this is a text element, always process it word by word
884            let words: Vec<&str> = text.split_whitespace().collect();
885
886            for (i, word) in words.iter().enumerate() {
887                let word_len = word.chars().count();
888                // Check if this "word" is just punctuation that should stay attached
889                let is_trailing_punct = word
890                    .chars()
891                    .all(|c| matches!(c, ',' | '.' | ':' | ';' | '!' | '?' | ')' | ']' | '}'));
892
893                if current_length > 0 && current_length + 1 + word_len > options.line_length && !is_trailing_punct {
894                    // Start a new line (but never for trailing punctuation)
895                    lines.push(current_line.trim().to_string());
896                    current_line = word.to_string();
897                    current_length = word_len;
898                } else {
899                    // Add word to current line
900                    // Only add space if: we have content AND (this isn't the first word OR original had leading space)
901                    // AND this isn't trailing punctuation (which attaches directly)
902                    if current_length > 0 && (i > 0 || has_leading_space) && !is_trailing_punct {
903                        current_line.push(' ');
904                        current_length += 1;
905                    }
906                    current_line.push_str(word);
907                    current_length += word_len;
908                }
909            }
910        } else {
911            // For non-text elements (code, links, references), treat as atomic units
912            // These should never be broken across lines
913            if current_length > 0 && current_length + 1 + element_len > options.line_length {
914                // Start a new line
915                lines.push(current_line.trim().to_string());
916                current_line = element_str;
917                current_length = element_len;
918            } else {
919                // Add element to current line
920                // Don't add space if the current line ends with an opening bracket/paren
921                let ends_with_opener =
922                    current_line.ends_with('(') || current_line.ends_with('[') || current_line.ends_with('{');
923                if current_length > 0 && !ends_with_opener {
924                    current_line.push(' ');
925                    current_length += 1;
926                }
927                current_line.push_str(&element_str);
928                current_length += element_len;
929            }
930        }
931    }
932
933    // Don't forget the last line
934    if !current_line.is_empty() {
935        lines.push(current_line.trim_end().to_string());
936    }
937
938    lines
939}
940
941/// Reflow markdown content preserving structure
942pub fn reflow_markdown(content: &str, options: &ReflowOptions) -> String {
943    let lines: Vec<&str> = content.lines().collect();
944    let mut result = Vec::new();
945    let mut i = 0;
946
947    while i < lines.len() {
948        let line = lines[i];
949        let trimmed = line.trim();
950
951        // Preserve empty lines
952        if trimmed.is_empty() {
953            result.push(String::new());
954            i += 1;
955            continue;
956        }
957
958        // Preserve headings as-is
959        if trimmed.starts_with('#') {
960            result.push(line.to_string());
961            i += 1;
962            continue;
963        }
964
965        // Preserve fenced code blocks
966        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
967            result.push(line.to_string());
968            i += 1;
969            // Copy lines until closing fence
970            while i < lines.len() {
971                result.push(lines[i].to_string());
972                if lines[i].trim().starts_with("```") || lines[i].trim().starts_with("~~~") {
973                    i += 1;
974                    break;
975                }
976                i += 1;
977            }
978            continue;
979        }
980
981        // Preserve indented code blocks (4+ spaces or 1+ tab)
982        if line.starts_with("    ") || line.starts_with("\t") {
983            // Collect all consecutive indented lines
984            result.push(line.to_string());
985            i += 1;
986            while i < lines.len() {
987                let next_line = lines[i];
988                // Continue if next line is also indented or empty (empty lines in code blocks are ok)
989                if next_line.starts_with("    ") || next_line.starts_with("\t") || next_line.trim().is_empty() {
990                    result.push(next_line.to_string());
991                    i += 1;
992                } else {
993                    break;
994                }
995            }
996            continue;
997        }
998
999        // Preserve block quotes (but reflow their content)
1000        if trimmed.starts_with('>') {
1001            let quote_prefix = line[0..line.find('>').unwrap() + 1].to_string();
1002            let quote_content = &line[quote_prefix.len()..].trim_start();
1003
1004            let reflowed = reflow_line(quote_content, options);
1005            for reflowed_line in reflowed.iter() {
1006                result.push(format!("{quote_prefix} {reflowed_line}"));
1007            }
1008            i += 1;
1009            continue;
1010        }
1011
1012        // Preserve horizontal rules first (before checking for lists)
1013        if is_horizontal_rule(trimmed) {
1014            result.push(line.to_string());
1015            i += 1;
1016            continue;
1017        }
1018
1019        // Preserve lists (but not horizontal rules)
1020        if (trimmed.starts_with('-') && !is_horizontal_rule(trimmed))
1021            || (trimmed.starts_with('*') && !is_horizontal_rule(trimmed))
1022            || trimmed.starts_with('+')
1023            || is_numbered_list_item(trimmed)
1024        {
1025            // Find the list marker and preserve indentation
1026            let indent = line.len() - line.trim_start().len();
1027            let indent_str = " ".repeat(indent);
1028
1029            // For numbered lists, find the period and the space after it
1030            // For bullet lists, find the marker and the space after it
1031            let mut marker_end = indent;
1032            let mut content_start = indent;
1033
1034            if trimmed.chars().next().is_some_and(|c| c.is_numeric()) {
1035                // Numbered list: find the period
1036                if let Some(period_pos) = line[indent..].find('.') {
1037                    marker_end = indent + period_pos + 1; // Include the period
1038                    content_start = marker_end;
1039                    // Skip any spaces after the period to find content start
1040                    while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1041                        content_start += 1;
1042                    }
1043                }
1044            } else {
1045                // Bullet list: marker is single character
1046                marker_end = indent + 1; // Just the marker character
1047                content_start = marker_end;
1048                // Skip any spaces after the marker
1049                while content_start < line.len() && line.chars().nth(content_start) == Some(' ') {
1050                    content_start += 1;
1051                }
1052            }
1053
1054            let marker = &line[indent..marker_end];
1055
1056            // Collect all content for this list item (including continuation lines)
1057            // Preserve hard breaks (2 trailing spaces) while trimming excessive whitespace
1058            let mut list_content = vec![trim_preserving_hard_break(&line[content_start..])];
1059            i += 1;
1060
1061            // Collect continuation lines (indented lines that are part of this list item)
1062            while i < lines.len() {
1063                let next_line = lines[i];
1064                let next_trimmed = next_line.trim();
1065
1066                // Stop if we hit an empty line or another list item or special block
1067                if next_trimmed.is_empty()
1068                    || next_trimmed.starts_with('#')
1069                    || next_trimmed.starts_with("```")
1070                    || next_trimmed.starts_with("~~~")
1071                    || next_trimmed.starts_with('>')
1072                    || next_trimmed.starts_with('|')
1073                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1074                    || is_horizontal_rule(next_trimmed)
1075                    || (next_trimmed.starts_with('-')
1076                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1077                    || (next_trimmed.starts_with('*')
1078                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1079                    || (next_trimmed.starts_with('+')
1080                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1081                    || is_numbered_list_item(next_trimmed)
1082                    || is_definition_list_item(next_trimmed)
1083                {
1084                    break;
1085                }
1086
1087                // Check if this line is indented (continuation of list item)
1088                let next_indent = next_line.len() - next_line.trim_start().len();
1089                if next_indent >= content_start {
1090                    // This is a continuation line - add its content
1091                    // Preserve hard breaks while trimming excessive whitespace
1092                    let trimmed_start = next_line.trim_start();
1093                    list_content.push(trim_preserving_hard_break(trimmed_start));
1094                    i += 1;
1095                } else {
1096                    // Not indented enough, not part of this list item
1097                    break;
1098                }
1099            }
1100
1101            // Join content, but respect hard breaks (lines ending with 2 spaces or backslash)
1102            // Hard breaks should prevent joining with the next line
1103            let combined_content = if options.preserve_breaks {
1104                list_content[0].clone()
1105            } else {
1106                // Check if any lines have hard breaks - if so, preserve the structure
1107                let has_hard_breaks = list_content.iter().any(|line| has_hard_break(line));
1108                if has_hard_breaks {
1109                    // Don't join lines with hard breaks - keep them separate with newlines
1110                    list_content.join("\n")
1111                } else {
1112                    // No hard breaks, safe to join with spaces
1113                    list_content.join(" ")
1114                }
1115            };
1116
1117            // Calculate the proper indentation for continuation lines
1118            let trimmed_marker = marker;
1119            let continuation_spaces = content_start;
1120
1121            // Adjust line length to account for list marker and space
1122            let prefix_length = indent + trimmed_marker.len() + 1;
1123
1124            // Create adjusted options with reduced line length
1125            let adjusted_options = ReflowOptions {
1126                line_length: options.line_length.saturating_sub(prefix_length),
1127                ..options.clone()
1128            };
1129
1130            let reflowed = reflow_line(&combined_content, &adjusted_options);
1131            for (j, reflowed_line) in reflowed.iter().enumerate() {
1132                if j == 0 {
1133                    result.push(format!("{indent_str}{trimmed_marker} {reflowed_line}"));
1134                } else {
1135                    // Continuation lines aligned with text after marker
1136                    let continuation_indent = " ".repeat(continuation_spaces);
1137                    result.push(format!("{continuation_indent}{reflowed_line}"));
1138                }
1139            }
1140            continue;
1141        }
1142
1143        // Preserve tables
1144        if trimmed.contains('|') {
1145            result.push(line.to_string());
1146            i += 1;
1147            continue;
1148        }
1149
1150        // Preserve reference definitions
1151        if trimmed.starts_with('[') && line.contains("]:") {
1152            result.push(line.to_string());
1153            i += 1;
1154            continue;
1155        }
1156
1157        // Preserve definition list items (extended markdown)
1158        if is_definition_list_item(trimmed) {
1159            result.push(line.to_string());
1160            i += 1;
1161            continue;
1162        }
1163
1164        // Check if this is a single line that doesn't need processing
1165        let mut is_single_line_paragraph = true;
1166        if i + 1 < lines.len() {
1167            let next_line = lines[i + 1];
1168            let next_trimmed = next_line.trim();
1169            // Check if next line starts a new block
1170            if !next_trimmed.is_empty()
1171                && !next_trimmed.starts_with('#')
1172                && !next_trimmed.starts_with("```")
1173                && !next_trimmed.starts_with("~~~")
1174                && !next_trimmed.starts_with('>')
1175                && !next_trimmed.starts_with('|')
1176                && !(next_trimmed.starts_with('[') && next_line.contains("]:"))
1177                && !is_horizontal_rule(next_trimmed)
1178                && !(next_trimmed.starts_with('-')
1179                    && !is_horizontal_rule(next_trimmed)
1180                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1181                && !(next_trimmed.starts_with('*')
1182                    && !is_horizontal_rule(next_trimmed)
1183                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1184                && !(next_trimmed.starts_with('+')
1185                    && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1186                && !is_numbered_list_item(next_trimmed)
1187            {
1188                is_single_line_paragraph = false;
1189            }
1190        }
1191
1192        // If it's a single line that fits, just add it as-is
1193        if is_single_line_paragraph && line.chars().count() <= options.line_length {
1194            result.push(line.to_string());
1195            i += 1;
1196            continue;
1197        }
1198
1199        // For regular paragraphs, collect consecutive lines
1200        let mut paragraph_parts = Vec::new();
1201        let mut current_part = vec![line];
1202        i += 1;
1203
1204        // If preserve_breaks is true, treat each line separately
1205        if options.preserve_breaks {
1206            // Don't collect consecutive lines - just reflow this single line
1207            let hard_break_type = if line.strip_suffix('\r').unwrap_or(line).ends_with('\\') {
1208                Some("\\")
1209            } else if line.ends_with("  ") {
1210                Some("  ")
1211            } else {
1212                None
1213            };
1214            let reflowed = reflow_line(line, options);
1215
1216            // Preserve hard breaks (two trailing spaces or backslash)
1217            if let Some(break_marker) = hard_break_type {
1218                if !reflowed.is_empty() {
1219                    let mut reflowed_with_break = reflowed;
1220                    let last_idx = reflowed_with_break.len() - 1;
1221                    if !has_hard_break(&reflowed_with_break[last_idx]) {
1222                        reflowed_with_break[last_idx].push_str(break_marker);
1223                    }
1224                    result.extend(reflowed_with_break);
1225                }
1226            } else {
1227                result.extend(reflowed);
1228            }
1229        } else {
1230            // Original behavior: collect consecutive lines into a paragraph
1231            while i < lines.len() {
1232                let prev_line = if !current_part.is_empty() {
1233                    current_part.last().unwrap()
1234                } else {
1235                    ""
1236                };
1237                let next_line = lines[i];
1238                let next_trimmed = next_line.trim();
1239
1240                // Stop at empty lines or special blocks
1241                if next_trimmed.is_empty()
1242                    || next_trimmed.starts_with('#')
1243                    || next_trimmed.starts_with("```")
1244                    || next_trimmed.starts_with("~~~")
1245                    || next_trimmed.starts_with('>')
1246                    || next_trimmed.starts_with('|')
1247                    || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1248                    || is_horizontal_rule(next_trimmed)
1249                    || (next_trimmed.starts_with('-')
1250                        && !is_horizontal_rule(next_trimmed)
1251                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1252                    || (next_trimmed.starts_with('*')
1253                        && !is_horizontal_rule(next_trimmed)
1254                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1255                    || (next_trimmed.starts_with('+')
1256                        && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1257                    || is_numbered_list_item(next_trimmed)
1258                    || is_definition_list_item(next_trimmed)
1259                {
1260                    break;
1261                }
1262
1263                // Check if previous line ends with hard break (two spaces or backslash)
1264                if has_hard_break(prev_line) {
1265                    // Start a new part after hard break
1266                    paragraph_parts.push(current_part.join(" "));
1267                    current_part = vec![next_line];
1268                } else {
1269                    current_part.push(next_line);
1270                }
1271                i += 1;
1272            }
1273
1274            // Add the last part
1275            if !current_part.is_empty() {
1276                if current_part.len() == 1 {
1277                    // Single line, don't add trailing space
1278                    paragraph_parts.push(current_part[0].to_string());
1279                } else {
1280                    paragraph_parts.push(current_part.join(" "));
1281                }
1282            }
1283
1284            // Reflow each part separately, preserving hard breaks
1285            for (j, part) in paragraph_parts.iter().enumerate() {
1286                let reflowed = reflow_line(part, options);
1287                result.extend(reflowed);
1288
1289                // Preserve hard break by ensuring last line of part ends with hard break marker
1290                // Use two spaces as the default hard break format for reflows
1291                if j < paragraph_parts.len() - 1 && !result.is_empty() {
1292                    let last_idx = result.len() - 1;
1293                    if !has_hard_break(&result[last_idx]) {
1294                        result[last_idx].push_str("  ");
1295                    }
1296                }
1297            }
1298        }
1299    }
1300
1301    // Preserve trailing newline if the original content had one
1302    let result_text = result.join("\n");
1303    if content.ends_with('\n') && !result_text.ends_with('\n') {
1304        format!("{result_text}\n")
1305    } else {
1306        result_text
1307    }
1308}
1309
/// Location of one paragraph in the source document together with its
/// reflowed replacement text.
#[derive(Debug, Clone)]
pub struct ParagraphReflow {
    /// Byte offset in the original content at which the paragraph begins
    pub start_byte: usize,
    /// Byte offset in the original content at which the paragraph ends
    pub end_byte: usize,
    /// Replacement text produced by reflowing the paragraph
    pub reflowed_text: String,
}
1320
1321/// Reflow a single paragraph at the specified line number
1322///
1323/// This function finds the paragraph containing the given line number,
1324/// reflows it according to the specified line length, and returns
1325/// information about the paragraph location and its reflowed text.
1326///
1327/// # Arguments
1328///
1329/// * `content` - The full document content
1330/// * `line_number` - The 1-based line number within the paragraph to reflow
1331/// * `line_length` - The target line length for reflowing
1332///
1333/// # Returns
1334///
1335/// Returns `Some(ParagraphReflow)` if a paragraph was found and reflowed,
1336/// or `None` if the line number is out of bounds or the content at that
1337/// line shouldn't be reflowed (e.g., code blocks, headings, etc.)
1338pub fn reflow_paragraph_at_line(content: &str, line_number: usize, line_length: usize) -> Option<ParagraphReflow> {
1339    if line_number == 0 {
1340        return None;
1341    }
1342
1343    let lines: Vec<&str> = content.lines().collect();
1344
1345    // Check if line number is valid (1-based)
1346    if line_number > lines.len() {
1347        return None;
1348    }
1349
1350    let target_idx = line_number - 1; // Convert to 0-based
1351    let target_line = lines[target_idx];
1352    let trimmed = target_line.trim();
1353
1354    // Don't reflow special blocks
1355    if trimmed.is_empty()
1356        || trimmed.starts_with('#')
1357        || trimmed.starts_with("```")
1358        || trimmed.starts_with("~~~")
1359        || target_line.starts_with("    ")
1360        || target_line.starts_with('\t')
1361        || trimmed.starts_with('>')
1362        || trimmed.contains('|') // Tables
1363        || (trimmed.starts_with('[') && target_line.contains("]:")) // Reference definitions
1364        || is_horizontal_rule(trimmed)
1365        || ((trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+'))
1366            && !is_horizontal_rule(trimmed)
1367            && (trimmed.len() == 1 || trimmed.chars().nth(1) == Some(' ')))
1368        || is_numbered_list_item(trimmed)
1369        || is_definition_list_item(trimmed)
1370    {
1371        return None;
1372    }
1373
1374    // Find paragraph start - scan backward until blank line or special block
1375    let mut para_start = target_idx;
1376    while para_start > 0 {
1377        let prev_idx = para_start - 1;
1378        let prev_line = lines[prev_idx];
1379        let prev_trimmed = prev_line.trim();
1380
1381        // Stop at blank line or special blocks
1382        if prev_trimmed.is_empty()
1383            || prev_trimmed.starts_with('#')
1384            || prev_trimmed.starts_with("```")
1385            || prev_trimmed.starts_with("~~~")
1386            || prev_line.starts_with("    ")
1387            || prev_line.starts_with('\t')
1388            || prev_trimmed.starts_with('>')
1389            || prev_trimmed.contains('|')
1390            || (prev_trimmed.starts_with('[') && prev_line.contains("]:"))
1391            || is_horizontal_rule(prev_trimmed)
1392            || ((prev_trimmed.starts_with('-') || prev_trimmed.starts_with('*') || prev_trimmed.starts_with('+'))
1393                && !is_horizontal_rule(prev_trimmed)
1394                && (prev_trimmed.len() == 1 || prev_trimmed.chars().nth(1) == Some(' ')))
1395            || is_numbered_list_item(prev_trimmed)
1396            || is_definition_list_item(prev_trimmed)
1397        {
1398            break;
1399        }
1400
1401        para_start = prev_idx;
1402    }
1403
1404    // Find paragraph end - scan forward until blank line or special block
1405    let mut para_end = target_idx;
1406    while para_end + 1 < lines.len() {
1407        let next_idx = para_end + 1;
1408        let next_line = lines[next_idx];
1409        let next_trimmed = next_line.trim();
1410
1411        // Stop at blank line or special blocks
1412        if next_trimmed.is_empty()
1413            || next_trimmed.starts_with('#')
1414            || next_trimmed.starts_with("```")
1415            || next_trimmed.starts_with("~~~")
1416            || next_line.starts_with("    ")
1417            || next_line.starts_with('\t')
1418            || next_trimmed.starts_with('>')
1419            || next_trimmed.contains('|')
1420            || (next_trimmed.starts_with('[') && next_line.contains("]:"))
1421            || is_horizontal_rule(next_trimmed)
1422            || ((next_trimmed.starts_with('-') || next_trimmed.starts_with('*') || next_trimmed.starts_with('+'))
1423                && !is_horizontal_rule(next_trimmed)
1424                && (next_trimmed.len() == 1 || next_trimmed.chars().nth(1) == Some(' ')))
1425            || is_numbered_list_item(next_trimmed)
1426            || is_definition_list_item(next_trimmed)
1427        {
1428            break;
1429        }
1430
1431        para_end = next_idx;
1432    }
1433
1434    // Extract paragraph lines
1435    let paragraph_lines = &lines[para_start..=para_end];
1436
1437    // Calculate byte offsets
1438    let mut start_byte = 0;
1439    for line in lines.iter().take(para_start) {
1440        start_byte += line.len() + 1; // +1 for newline
1441    }
1442
1443    let mut end_byte = start_byte;
1444    for line in paragraph_lines.iter() {
1445        end_byte += line.len() + 1; // +1 for newline
1446    }
1447
1448    // Track whether the byte range includes a trailing newline
1449    // (it doesn't if this is the last line and the file doesn't end with newline)
1450    let includes_trailing_newline = para_end != lines.len() - 1 || content.ends_with('\n');
1451
1452    // Adjust end_byte if the last line doesn't have a newline
1453    if !includes_trailing_newline {
1454        end_byte -= 1;
1455    }
1456
1457    // Join paragraph lines and reflow
1458    let paragraph_text = paragraph_lines.join("\n");
1459
1460    // Create reflow options
1461    let options = ReflowOptions {
1462        line_length,
1463        break_on_sentences: true,
1464        preserve_breaks: false,
1465        sentence_per_line: false,
1466        abbreviations: None,
1467    };
1468
1469    // Reflow the paragraph using reflow_markdown to handle it properly
1470    let reflowed = reflow_markdown(&paragraph_text, &options);
1471
1472    // Ensure reflowed text matches whether the byte range includes a trailing newline
1473    // This is critical: if the range includes a newline, the replacement must too,
1474    // otherwise the next line will get appended to the reflowed paragraph
1475    let reflowed_text = if includes_trailing_newline {
1476        // Range includes newline - ensure reflowed text has one
1477        if reflowed.ends_with('\n') {
1478            reflowed
1479        } else {
1480            format!("{reflowed}\n")
1481        }
1482    } else {
1483        // Range doesn't include newline - ensure reflowed text doesn't have one
1484        if reflowed.ends_with('\n') {
1485            reflowed.trim_end_matches('\n').to_string()
1486        } else {
1487            reflowed
1488        }
1489    };
1490
1491    Some(ParagraphReflow {
1492        start_byte,
1493        end_byte,
1494        reflowed_text,
1495    })
1496}
1497
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the private helper `text_ends_with_abbreviation()` against the
    /// abbreviation set produced by `get_abbreviations(&None)` (no custom entries).
    ///
    /// The test lives inline because the helper is private to this module.
    /// Public-API and integration tests are in tests/utils/text_reflow_test.rs
    #[test]
    fn test_helper_function_text_ends_with_abbreviation() {
        let abbreviations = get_abbreviations(&None);

        // Inputs that must be recognized: built-in titles plus "e.g." / "i.e.",
        // either standing alone or following another word.
        let recognized = ["Dr.", "word Dr.", "e.g.", "i.e.", "Mr.", "Mrs.", "Ms.", "Prof."];
        for text in recognized {
            assert!(
                text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} to end with an abbreviation"
            );
        }

        // Inputs that must be rejected.
        let rejected = [
            // Not in the built-in list ("etc" does not always carry a period)
            "etc.",
            // Ordinary words that merely end a sentence with a period
            "paradigms.",
            "programs.",
            "items.",
            "systems.",
            // Known abbreviation stems followed by punctuation other than a period
            "Dr?",
            "Mr!",
            // Ordinary word followed by a question mark
            "paradigms?",
            // No trailing punctuation at all
            "word",
            // Empty input
            "",
        ];
        for text in rejected {
            assert!(
                !text_ends_with_abbreviation(text, &abbreviations),
                "expected {text:?} NOT to end with an abbreviation"
            );
        }
    }
}
rumdl_lib/utils/text_reflow.rs

rumdl_lib/utils/
text_reflow.rs