Skip to main content

rumdl_lib/utils/
sentence_utils.rs

//! Sentence detection utilities
//!
//! This module provides shared functionality for detecting sentence boundaries
//! in markdown text. Used by both text reflow (MD013) and the multiple spaces
//! rule (MD064).
//!
//! Features:
//! - Common abbreviation detection (Mr., Dr., Prof., etc.)
//! - CJK punctuation support (`。` U+3002, `!` U+FF01, `?` U+FF1F)
//! - Closing quote detection (straight and curly)
//! - Both forward-looking (reflow) and backward-looking (MD064) sentence detection
12
13use std::collections::HashSet;
14
/// Built-in abbreviations whose trailing period does NOT end a sentence.
///
/// Entries are stored lowercase and without the final period (so `e.g.` is
/// kept as `"e.g"`).
///
/// An abbreviation belongs here only when it:
/// 1. Is conventionally ALWAYS written with a period
/// 2. Is almost always followed by more text rather than ending the sentence
///
/// Deliberately left out:
/// - Abbreviations that commonly end sentences (etc., Inc., Ph.D., U.S.)
const DEFAULT_ABBREVIATIONS: &[&str] = &[
    // Titles - always have a period, always followed by a name
    "mr",
    "mrs",
    "ms",
    "dr",
    "prof",
    "sr",
    "jr",
    "st",
    // Latin - always written with periods, introduce examples/references
    "i.e",
    "e.g",
    // Reference abbreviations - followed by what they refer to
    "vs",
    "fig",
    "no",
    "vol",
    "ch",
    "sec",
    "al",
];
30
31/// Get the effective abbreviations set based on custom additions
32/// All abbreviations are normalized to lowercase for case-insensitive matching
33/// Custom abbreviations are always merged with built-in defaults
34pub fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
35    let mut abbreviations: HashSet<String> = DEFAULT_ABBREVIATIONS.iter().map(|s| s.to_lowercase()).collect();
36
37    // Always extend defaults with custom abbreviations
38    // Strip any trailing periods and normalize to lowercase for consistent matching
39    if let Some(custom_list) = custom {
40        for abbr in custom_list {
41            let normalized = abbr.trim_end_matches('.').to_lowercase();
42            if !normalized.is_empty() {
43                abbreviations.insert(normalized);
44            }
45        }
46    }
47
48    abbreviations
49}
50
/// Check whether `text` ends with a known abbreviation followed by a period.
///
/// Only a trailing `.` qualifies; `!` and `?` never do. The last
/// whitespace-separated word is compared as a whole against `abbreviations`,
/// so words that merely end in abbreviation-like letters (e.g. "paradigms"
/// ending in "ms") are not matched.
///
/// `abbreviations` is expected to hold lowercase entries without their final
/// period (e.g. `"dr"`, `"e.g"`); the candidate word is lowercased before the
/// lookup.
///
/// Examples:
///   - "Dr." -> true (abbreviation)
///   - "Dr?" -> false (question, not abbreviation)
///   - "paradigms." -> false (not in abbreviation list)
///   - "paradigms?" -> false (question mark, not abbreviation)
///   - "Wrangell-St." -> true (last hyphen component "st" is an abbreviation)
pub fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Abbreviations require a period.
    if !text.ends_with('.') {
        return false;
    }

    // Drops every trailing period, so "e.g." yields "e.g".
    let without_periods = text.trim_end_matches('.');

    // `split_whitespace` never yields empty items, so `None` is the only
    // "no word" case (text made of periods and/or whitespace).
    let Some(last_word) = without_periods.split_whitespace().next_back() else {
        return false;
    };

    // Strip leading punctuation (parentheses, brackets, quotes, emphasis markers)
    // that may precede the abbreviation, e.g. "(e.g." or "[i.e.". Periods are
    // kept because they can be part of the abbreviation itself.
    let candidate = last_word.trim_start_matches(|c: char| !c.is_alphanumeric() && c != '.');

    // Whole word first (covers simple cases like "Dr.", "Prof.").
    if abbreviations.contains(&candidate.to_lowercase()) {
        return true;
    }

    // Otherwise try the component after the last hyphen, so hyphenated place
    // names like "Wrangell-St." are recognized via the "st" entry.
    candidate
        .rsplit_once('-')
        .is_some_and(|(_, tail)| !tail.is_empty() && abbreviations.contains(&tail.to_lowercase()))
}
99
/// Check if a character is CJK sentence-ending punctuation.
///
/// Matches exactly three characters:
/// - `。` U+3002 ideographic full stop
/// - `!` U+FF01 fullwidth exclamation mark
/// - `?` U+FF1F fullwidth question mark
///
/// The ASCII characters `.`, `!` and `?` are NOT matched.
pub fn is_cjk_sentence_ending(c: char) -> bool {
    // Escapes keep the fullwidth forms from being silently normalized to
    // their ASCII look-alikes by editors or copy/paste.
    matches!(c, '\u{3002}' | '\u{FF01}' | '\u{FF1F}')
}
105
/// Check if a character can close a quotation.
///
/// Covers straight quotes, right curly/smart quotes and right-pointing
/// guillemets.
pub fn is_closing_quote(c: char) -> bool {
    const CLOSING_QUOTES: [char; 6] = [
        '"',        // straight double
        '\'',       // straight single
        '\u{201D}', // right double quotation mark
        '\u{2019}', // right single quotation mark
        '\u{00BB}', // right-pointing double guillemet
        '\u{203A}', // right-pointing single guillemet
    ];

    CLOSING_QUOTES.contains(&c)
}
113
/// Check if a character can open a quotation.
///
/// Covers straight quotes, left curly/smart quotes and left-pointing
/// guillemets.
pub fn is_opening_quote(c: char) -> bool {
    const OPENING_QUOTES: [char; 6] = [
        '"',        // straight double
        '\'',       // straight single
        '\u{201C}', // left double quotation mark
        '\u{2018}', // left single quotation mark
        '\u{00AB}', // left-pointing double guillemet
        '\u{2039}', // left-pointing single guillemet
    ];

    OPENING_QUOTES.contains(&c)
}
121
/// Check if a character is a CJK character (Chinese, Japanese, Korean).
///
/// A character qualifies when it falls inside one of the Unicode ranges
/// listed in the table below; CJK punctuation such as `。` is not included.
pub fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code point pairs.
    const CJK_RANGES: [(char, char); 5] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
133
134/// Check if a character is closing punctuation that can follow sentence-ending punctuation
135/// This includes closing quotes, parentheses, and brackets
136fn is_trailing_close_punctuation(c: char) -> bool {
137    is_closing_quote(c) || matches!(c, ')' | ']' | '}')
138}
139
140/// Check if multiple spaces occur immediately after sentence-ending punctuation.
141/// This is a backward-looking check used by MD064.
142///
143/// Returns true if the character(s) immediately before `match_start` constitute
144/// a sentence ending, supporting the traditional two-space-after-sentence convention.
145///
146/// Recognized sentence-ending patterns:
147/// - Direct punctuation: `.`, `!`, `?`, `。`, `!`, `?`
148/// - With closing quotes: `."`, `!"`, `?"`, `.'`, `!'`, `?'`, `."`, `?"`, `!"`
149/// - With closing parenthesis: `.)`, `!)`, `?)`
150/// - With closing bracket: `.]`, `!]`, `?]`
151/// - Ellipsis: `...`
152/// - Combinations: `.")`  (quote then paren), `?')`
153///
154/// Does NOT treat as sentence ending:
155/// - Abbreviations: `Dr.`, `Mr.`, `Prof.`, etc. (when detectable)
156/// - Single letters followed by period: `A.` (likely initials or list markers)
157pub fn is_after_sentence_ending(text: &str, match_start: usize) -> bool {
158    is_after_sentence_ending_with_abbreviations(text, match_start, &get_abbreviations(&None))
159}
160
161/// Check if multiple spaces occur immediately after sentence-ending punctuation,
162/// with a custom abbreviations set.
163///
164/// Note: `match_start` is a byte position (from regex). This function handles
165/// multi-byte UTF-8 characters correctly by working with character iterators.
166fn is_after_sentence_ending_with_abbreviations(
167    text: &str,
168    match_start: usize,
169    abbreviations: &HashSet<String>,
170) -> bool {
171    if match_start == 0 || match_start > text.len() {
172        return false;
173    }
174
175    // Safely get the portion of the text before the spaces
176    // match_start is a byte position, so we need to ensure it's a valid char boundary
177    let Some(before) = text.get(..match_start) else {
178        return false; // Invalid byte position
179    };
180
181    // Collect chars for iteration (we need random access for some checks)
182    let chars: Vec<char> = before.chars().collect();
183    if chars.is_empty() {
184        return false;
185    }
186
187    let mut idx = chars.len() - 1;
188
189    // Skip through any trailing closing punctuation (quotes, parens, brackets)
190    // These can appear after the sentence-ending punctuation
191    // e.g., `sentence."  Next` or `sentence.)  Next` or `sentence.")`
192    while idx > 0 && is_trailing_close_punctuation(chars[idx]) {
193        idx -= 1;
194    }
195
196    // Now check if we're at sentence-ending punctuation
197    let current = chars[idx];
198
199    // Check for CJK sentence-ending punctuation
200    if is_cjk_sentence_ending(current) {
201        return true;
202    }
203
204    // Direct sentence-ending punctuation (! and ?)
205    if current == '!' || current == '?' {
206        return true;
207    }
208
209    // Period - need more careful handling
210    if current == '.' {
211        // Check for ellipsis (...) - always a valid sentence ending
212        if idx >= 2 && chars[idx - 1] == '.' && chars[idx - 2] == '.' {
213            return true;
214        }
215
216        // Build the text before the period by collecting chars up to idx
217        // (not including the period itself)
218        let text_before_period: String = chars[..idx].iter().collect();
219
220        // Check if this is an abbreviation
221        if text_ends_with_abbreviation(&format!("{text_before_period}."), abbreviations) {
222            return false;
223        }
224
225        // Check what comes before the period
226        if idx > 0 {
227            let prev = chars[idx - 1];
228
229            // Single letter before period - likely initial or list marker, not sentence
230            // e.g., "A." "B." but allow "a." at end of sentence
231            if prev.is_ascii_uppercase() {
232                // Check if it's preceded by whitespace or start of text (isolated initial)
233                if idx >= 2 {
234                    if chars[idx - 2].is_whitespace() {
235                        // "word A." - isolated initial, not sentence ending
236                        return false;
237                    }
238                } else {
239                    // "A." at start - not a sentence ending
240                    return false;
241                }
242            }
243
244            // If previous char is alphanumeric, closing quote/paren, or markdown inline delimiters, treat as sentence end
245            // Markdown inline elements that can end before punctuation:
246            // - `)` `]` - links, images, footnote refs
247            // - `` ` `` - inline code
248            // - `*` `_` - emphasis/bold
249            // - `~` - strikethrough
250            // - `=` - highlight (extended markdown)
251            // - `^` - superscript (extended markdown)
252            if prev.is_alphanumeric()
253                || is_closing_quote(prev)
254                || matches!(prev, ')' | ']' | '`' | '*' | '_' | '~' | '=' | '^')
255                || is_cjk_char(prev)
256            {
257                return true;
258            }
259        }
260
261        // Period at start or after non-word char - not a sentence ending
262        return false;
263    }
264
265    false
266}
267
#[cfg(test)]
mod tests {
    use super::*;

    // === Abbreviation tests ===

    #[test]
    fn test_get_abbreviations_default() {
        let abbrevs = get_abbreviations(&None);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("mr"));
        assert!(abbrevs.contains("prof"));
        assert!(abbrevs.contains("i.e"));
        assert!(abbrevs.contains("e.g"));
        assert!(abbrevs.contains("st"));
    }

    #[test]
    fn test_st_abbreviation_not_sentence_boundary() {
        let abbrevs = get_abbreviations(&None);

        // Plain "St." is recognized as an abbreviation
        assert!(text_ends_with_abbreviation("St.", &abbrevs));

        // Hyphenated prefix form: "Wrangell-St." matches via the "st" component
        assert!(text_ends_with_abbreviation("Wrangell-St.", &abbrevs));

        // Non-abbreviation words are not affected
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("starts.", &abbrevs));

        // Hyphenated word where suffix is NOT an abbreviation must NOT match
        assert!(!text_ends_with_abbreviation("word-foo.", &abbrevs));
        assert!(!text_ends_with_abbreviation("end-street.", &abbrevs));

        // Other known abbreviations still work
        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Mr.", &abbrevs));
    }

    #[test]
    fn test_get_abbreviations_custom() {
        let custom = Some(vec!["Corp".to_string(), "Ltd.".to_string()]);
        let abbrevs = get_abbreviations(&custom);
        // Should include defaults
        assert!(abbrevs.contains("dr"));
        // Should include custom (normalized)
        assert!(abbrevs.contains("corp"));
        assert!(abbrevs.contains("ltd"));
    }

    #[test]
    fn test_text_ends_with_abbreviation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Hello Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Prof.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Doctor.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Dr?", &abbrevs)); // Not a period
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
    }

    #[test]
    fn test_text_ends_with_abbreviation_after_punctuation() {
        let abbrevs = get_abbreviations(&None);
        // Abbreviations preceded by opening parenthesis
        assert!(text_ends_with_abbreviation("(e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (i.e.", &abbrevs));
        // Abbreviations preceded by opening bracket
        assert!(text_ends_with_abbreviation("[e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("[Dr.", &abbrevs));
        // Abbreviations preceded by quotes
        assert!(text_ends_with_abbreviation("\"Dr.", &abbrevs));
        // Abbreviations preceded by emphasis markers
        assert!(text_ends_with_abbreviation("*e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("**e.g.", &abbrevs));
        // Nested punctuation (quote + paren)
        assert!(text_ends_with_abbreviation("(\"e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("([Dr.", &abbrevs));
        // Non-abbreviations with leading punctuation should still not match
        assert!(!text_ends_with_abbreviation("(paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("[Doctor.", &abbrevs));
    }

    // === Punctuation helper tests ===

    #[test]
    fn test_is_closing_quote() {
        assert!(is_closing_quote('"'));
        assert!(is_closing_quote('\''));
        assert!(is_closing_quote('\u{201D}')); // right double quotation mark
        assert!(is_closing_quote('\u{2019}')); // right single quotation mark
        assert!(is_closing_quote('»'));
        assert!(is_closing_quote('›'));
        assert!(!is_closing_quote('a'));
        assert!(!is_closing_quote('.'));
    }

    #[test]
    fn test_is_cjk_sentence_ending() {
        // Fullwidth forms are written as escapes so they cannot be confused
        // with (or normalized into) their ASCII look-alikes.
        assert!(is_cjk_sentence_ending('。'));
        assert!(is_cjk_sentence_ending('\u{FF01}')); // fullwidth exclamation mark
        assert!(is_cjk_sentence_ending('\u{FF1F}')); // fullwidth question mark
        assert!(!is_cjk_sentence_ending('.'));
        assert!(!is_cjk_sentence_ending('!'));
    }

    #[test]
    fn test_is_cjk_char() {
        assert!(is_cjk_char('中'));
        assert!(is_cjk_char('あ')); // Hiragana
        assert!(is_cjk_char('ア')); // Katakana
        assert!(is_cjk_char('한')); // Hangul
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('A'));
    }

    // === is_after_sentence_ending tests ===

    #[test]
    fn test_after_period() {
        assert!(is_after_sentence_ending("Hello.  ", 6));
        assert!(is_after_sentence_ending("End of sentence.  Next", 16));
    }

    #[test]
    fn test_after_exclamation() {
        assert!(is_after_sentence_ending("Wow!  ", 4));
        assert!(is_after_sentence_ending("Great!  Next", 6));
    }

    #[test]
    fn test_after_question() {
        assert!(is_after_sentence_ending("Really?  ", 7));
        assert!(is_after_sentence_ending("What?  Next", 5));
    }

    #[test]
    fn test_after_closing_quote() {
        assert!(is_after_sentence_ending("He said \"Hello.\"  Next", 16));
        assert!(is_after_sentence_ending("She said 'Hi.'  Next", 14));
    }

    #[test]
    fn test_after_curly_quotes() {
        let content = format!("He said {}Hello.{}  Next", '\u{201C}', '\u{201D}');
        // Find the position after the closing quote
        let pos = content.find("  ").unwrap();
        assert!(is_after_sentence_ending(&content, pos));
    }

    #[test]
    fn test_after_closing_paren() {
        assert!(is_after_sentence_ending("(See note.)  Next", 11));
        assert!(is_after_sentence_ending("(Really!)  Next", 9));
    }

    #[test]
    fn test_after_closing_bracket() {
        assert!(is_after_sentence_ending("[Citation.]  Next", 11));
    }

    #[test]
    fn test_after_ellipsis() {
        assert!(is_after_sentence_ending("And so...  Next", 9));
        assert!(is_after_sentence_ending("Hmm...  Let me think", 6));
    }

    #[test]
    fn test_not_after_abbreviation() {
        // Dr. should NOT be treated as sentence ending
        assert!(!is_after_sentence_ending("Dr.  Smith", 3));
        assert!(!is_after_sentence_ending("Mr.  Jones", 3));
        assert!(!is_after_sentence_ending("Prof.  Williams", 5));
    }

    #[test]
    fn test_not_after_single_initial() {
        // Single capital letter + period is likely an initial, not sentence end
        assert!(!is_after_sentence_ending("John A.  Smith", 7));
        // But lowercase should work (end of sentence)
        assert!(is_after_sentence_ending("letter a.  Next", 9));
    }

    #[test]
    fn test_mid_sentence_not_detected() {
        // Spaces not after sentence punctuation
        assert!(!is_after_sentence_ending("word  word", 4));
        assert!(!is_after_sentence_ending("multiple  spaces", 8));
    }

    #[test]
    fn test_cjk_sentence_ending() {
        // CJK chars (including the fullwidth punctuation) are 3 bytes each in UTF-8
        // 日(3)+本(3)+語(3)+。(3) = 12 bytes before the spaces
        assert!(is_after_sentence_ending("日本語。  Next", 12)); // After U+3002
        // 中(3)+文(3)+U+FF01(3) = 9 bytes before the spaces
        assert!(is_after_sentence_ending("中文\u{FF01}  Next", 9)); // After fullwidth !
        // 한(3)+국(3)+어(3)+U+FF1F(3) = 12 bytes before the spaces
        assert!(is_after_sentence_ending("한국어\u{FF1F}  Next", 12)); // After fullwidth ?
    }

    #[test]
    fn test_complex_endings() {
        // Multiple closing punctuation
        assert!(is_after_sentence_ending("(He said \"Yes.\")  Next", 16));
        // Quote then paren
        assert!(is_after_sentence_ending("\"End.\")  Next", 7));
    }

    #[test]
    fn test_guillemets() {
        // "Il dit "(7) + «(2) + "Oui."(4) + »(2) = 15 bytes before the spaces,
        // so the closing guillemet is actually part of the inspected text.
        assert!(is_after_sentence_ending("Il dit «Oui.»  Next", 15));
    }

    #[test]
    fn test_empty_and_edge_cases() {
        assert!(!is_after_sentence_ending("", 0));
        assert!(!is_after_sentence_ending(".", 0));
        assert!(!is_after_sentence_ending("a", 0));
    }

    #[test]
    fn test_latin_abbreviations() {
        // i.e. and e.g. should not be sentence endings
        assert!(!is_after_sentence_ending("i.e.  example", 4));
        assert!(!is_after_sentence_ending("e.g.  example", 4));
    }

    #[test]
    fn test_abbreviations_after_opening_punctuation() {
        // Abbreviations preceded by parentheses, brackets, quotes
        assert!(!is_after_sentence_ending("(e.g.  Wasm)", 5));
        assert!(!is_after_sentence_ending("(i.e.  PyO3)", 5));
        assert!(!is_after_sentence_ending("[e.g.  Chapter]", 5));
        assert!(!is_after_sentence_ending("(Dr.  Smith)", 4));
        // Nested punctuation: quote + paren
        assert!(!is_after_sentence_ending("(\"e.g.  something\")", 6));
    }

    #[test]
    fn test_after_inline_code() {
        // Issue #345: Sentence ending with inline code should be recognized
        // "Hello from `backticks`.  How's it going?"
        // Position 23 is after the period following the closing backtick
        assert!(is_after_sentence_ending("Hello from `backticks`.  Next", 23));

        // Simple case: just code and period
        assert!(is_after_sentence_ending("`code`.  Next", 7));

        // Multiple inline code spans
        assert!(is_after_sentence_ending("Use `foo` and `bar`.  Next", 20));

        // With exclamation mark
        assert!(is_after_sentence_ending("`important`!  Next", 12));

        // With question mark
        assert!(is_after_sentence_ending("Is it `true`?  Next", 13));

        // Inline code in the middle shouldn't affect sentence detection
        assert!(is_after_sentence_ending("The `code` works.  Next", 17));
    }

    #[test]
    fn test_after_inline_code_with_quotes() {
        // Inline code before closing quote before period
        assert!(is_after_sentence_ending("He said \"use `code`\".  Next", 21));

        // Inline code in parentheses
        assert!(is_after_sentence_ending("(see `example`).  Next", 16));
    }

    #[test]
    fn test_after_emphasis() {
        // Asterisk emphasis
        assert!(is_after_sentence_ending("The word is *important*.  Next", 24));

        // Underscore emphasis
        assert!(is_after_sentence_ending("The word is _important_.  Next", 24));

        // With exclamation
        assert!(is_after_sentence_ending("This is *urgent*!  Next", 17));

        // With question
        assert!(is_after_sentence_ending("Is it _true_?  Next", 13));
    }

    #[test]
    fn test_after_bold() {
        // Asterisk bold
        assert!(is_after_sentence_ending("The word is **critical**.  Next", 25));

        // Underscore bold
        assert!(is_after_sentence_ending("The word is __critical__.  Next", 25));
    }

    #[test]
    fn test_after_strikethrough() {
        // GFM strikethrough
        assert!(is_after_sentence_ending("This is ~~wrong~~.  Next", 18));

        // With exclamation
        assert!(is_after_sentence_ending("That was ~~bad~~!  Next", 17));
    }

    #[test]
    fn test_after_extended_markdown() {
        // Highlight syntax (some flavors)
        assert!(is_after_sentence_ending("This is ==highlighted==.  Next", 24));

        // Superscript syntax (some flavors)
        assert!(is_after_sentence_ending("E equals mc^2^.  Next", 15));
    }
}