rumdl_lib/utils/
sentence_utils.rs

1//! Sentence detection utilities
2//!
3//! This module provides shared functionality for detecting sentence boundaries
4//! in markdown text. Used by both text reflow (MD013) and the multiple spaces
5//! rule (MD064).
6//!
7//! Features:
8//! - Common abbreviation detection (Mr., Dr., Prof., etc.)
9//! - CJK punctuation support (。, ！, ？)
10//! - Closing quote detection (straight and curly)
11//! - Both forward-looking (reflow) and backward-looking (MD064) sentence detection
12
13use std::collections::HashSet;
14
/// Built-in abbreviations whose trailing period must NOT be read as the end
/// of a sentence.
///
/// Entries are stored lowercase and without their final period (`"e.g"`, not
/// `"e.g."`), which is the form the lookup helpers in this module compare
/// against.
///
/// An abbreviation belongs here only if both of the following hold:
/// 1. It is conventionally ALWAYS written with a period.
/// 2. It is followed by something (a name, an example) rather than closing
///    a sentence.
///
/// Deliberately left out:
/// - Words that do not typically take a period (vs, etc)
/// - Abbreviations that can legitimately end a sentence (Inc., Ph.D., U.S.)
pub const DEFAULT_ABBREVIATIONS: &[&str] = &[
    // Titles: always carry a period and are always followed by a name.
    "mr",
    "mrs",
    "ms",
    "dr",
    "prof",
    "sr",
    "jr",
    // Latin: always written with periods; they introduce examples/references.
    "i.e",
    "e.g",
];
30
31/// Get the effective abbreviations set based on custom additions
32/// All abbreviations are normalized to lowercase for case-insensitive matching
33/// Custom abbreviations are always merged with built-in defaults
34pub fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
35    let mut abbreviations: HashSet<String> = DEFAULT_ABBREVIATIONS.iter().map(|s| s.to_lowercase()).collect();
36
37    // Always extend defaults with custom abbreviations
38    // Strip any trailing periods and normalize to lowercase for consistent matching
39    if let Some(custom_list) = custom {
40        for abbr in custom_list {
41            let normalized = abbr.trim_end_matches('.').to_lowercase();
42            if !normalized.is_empty() {
43                abbreviations.insert(normalized);
44            }
45        }
46    }
47
48    abbreviations
49}
50
/// Report whether `text` ends with a known abbreviation followed by a period.
///
/// Only a trailing `.` qualifies; text ending in `!` or `?` is never an
/// abbreviation. The last whitespace-separated word is compared as a whole
/// against the set, so a word that merely ends in abbreviation-like letters
/// (for example "paradigms" ending in "ms") does not match.
///
/// Before the lookup:
/// - all trailing periods are removed from `text`,
/// - leading characters of the last word that are neither alphanumeric nor
///   `.` (parentheses, brackets, quotes, emphasis markers) are skipped, so
///   `"(e.g."` and `"[i.e."` are recognized,
/// - the remaining word is lowercased.
///
/// `abbreviations` is expected to hold lowercase entries without a trailing
/// period.
///
/// Examples:
///   - "Dr." -> true (abbreviation)
///   - "Dr?" -> false (question, not abbreviation)
///   - "paradigms." -> false (not in abbreviation list)
///   - "paradigms?" -> false (question mark, not abbreviation)
pub fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Abbreviations require a period; anything else can be rejected early.
    if !text.ends_with('.') {
        return false;
    }

    // Take the final word directly from the back of the period-less text.
    match text.trim_end_matches('.').split_whitespace().next_back() {
        Some(word) => {
            // Drop opening punctuation glued to the front of the word while
            // keeping interior dots such as the one in "e.g".
            let candidate = word.trim_start_matches(|c: char| !c.is_alphanumeric() && c != '.');
            abbreviations.contains(&candidate.to_lowercase())
        }
        // Nothing but periods and/or whitespace.
        None => false,
    }
}
86
/// Report whether `c` is one of the CJK sentence-ending punctuation marks:
/// the ideographic full stop, the fullwidth exclamation mark, or the
/// fullwidth question mark.
pub fn is_cjk_sentence_ending(c: char) -> bool {
    const CJK_TERMINATORS: [char; 3] = [
        '\u{3002}', // 。 ideographic full stop
        '\u{FF01}', // ！ fullwidth exclamation mark
        '\u{FF1F}', // ？ fullwidth question mark
    ];
    CJK_TERMINATORS.contains(&c)
}
92
/// Report whether `c` can act as a closing quotation mark.
///
/// Covers the straight quotes (which are direction-less), the right-hand
/// curly quotes, and the right-pointing guillemets.
pub fn is_closing_quote(c: char) -> bool {
    const CLOSING_QUOTES: [char; 6] = [
        '"',        // straight double quote
        '\'',       // straight single quote
        '\u{201D}', // right double quotation mark
        '\u{2019}', // right single quotation mark
        '\u{00BB}', // » right-pointing double guillemet
        '\u{203A}', // › right-pointing single guillemet
    ];
    CLOSING_QUOTES.contains(&c)
}
100
/// Report whether `c` can act as an opening quotation mark.
///
/// Covers the straight quotes (which are direction-less), the left-hand
/// curly quotes, and the left-pointing guillemets.
pub fn is_opening_quote(c: char) -> bool {
    const OPENING_QUOTES: [char; 6] = [
        '"',        // straight double quote
        '\'',       // straight single quote
        '\u{201C}', // left double quotation mark
        '\u{2018}', // left single quotation mark
        '\u{00AB}', // « left-pointing double guillemet
        '\u{2039}', // ‹ left-pointing single guillemet
    ];
    OPENING_QUOTES.contains(&c)
}
108
/// Report whether `c` falls in one of the CJK (Chinese, Japanese, Korean)
/// code point ranges this module recognizes.
pub fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code point pairs.
    const CJK_RANGES: [(char, char); 5] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ];
    CJK_RANGES.iter().any(|&(first, last)| first <= c && c <= last)
}
120
121/// Check if a character is sentence-ending punctuation (ASCII or CJK)
122pub fn is_sentence_ending_punctuation(c: char) -> bool {
123    matches!(c, '.' | '!' | '?') || is_cjk_sentence_ending(c)
124}
125
126/// Check if a character is closing punctuation that can follow sentence-ending punctuation
127/// This includes closing quotes, parentheses, and brackets
128pub fn is_trailing_close_punctuation(c: char) -> bool {
129    is_closing_quote(c) || matches!(c, ')' | ']' | '}')
130}
131
132/// Check if multiple spaces occur immediately after sentence-ending punctuation.
133/// This is a backward-looking check used by MD064.
134///
135/// Returns true if the character(s) immediately before `match_start` constitute
136/// a sentence ending, supporting the traditional two-space-after-sentence convention.
137///
138/// Recognized sentence-ending patterns:
139/// - Direct punctuation: `.`, `!`, `?`, `。`, `！`, `？`
140/// - With closing quotes: `."`, `!"`, `?"`, `.'`, `!'`, `?'`, `."`, `?"`, `!"`
141/// - With closing parenthesis: `.)`, `!)`, `?)`
142/// - With closing bracket: `.]`, `!]`, `?]`
143/// - Ellipsis: `...`
144/// - Combinations: `.")`  (quote then paren), `?')`
145///
146/// Does NOT treat as sentence ending:
147/// - Abbreviations: `Dr.`, `Mr.`, `Prof.`, etc. (when detectable)
148/// - Single letters followed by period: `A.` (likely initials or list markers)
149pub fn is_after_sentence_ending(text: &str, match_start: usize) -> bool {
150    is_after_sentence_ending_with_abbreviations(text, match_start, &get_abbreviations(&None))
151}
152
153/// Check if multiple spaces occur immediately after sentence-ending punctuation,
154/// with a custom abbreviations set.
155///
156/// Note: `match_start` is a byte position (from regex). This function handles
157/// multi-byte UTF-8 characters correctly by working with character iterators.
158pub fn is_after_sentence_ending_with_abbreviations(
159    text: &str,
160    match_start: usize,
161    abbreviations: &HashSet<String>,
162) -> bool {
163    if match_start == 0 || match_start > text.len() {
164        return false;
165    }
166
167    // Safely get the portion of the text before the spaces
168    // match_start is a byte position, so we need to ensure it's a valid char boundary
169    let before = match text.get(..match_start) {
170        Some(s) => s,
171        None => return false, // Invalid byte position
172    };
173
174    // Collect chars for iteration (we need random access for some checks)
175    let chars: Vec<char> = before.chars().collect();
176    if chars.is_empty() {
177        return false;
178    }
179
180    let mut idx = chars.len() - 1;
181
182    // Skip through any trailing closing punctuation (quotes, parens, brackets)
183    // These can appear after the sentence-ending punctuation
184    // e.g., `sentence."  Next` or `sentence.)  Next` or `sentence.")`
185    while idx > 0 && is_trailing_close_punctuation(chars[idx]) {
186        idx -= 1;
187    }
188
189    // Now check if we're at sentence-ending punctuation
190    let current = chars[idx];
191
192    // Check for CJK sentence-ending punctuation
193    if is_cjk_sentence_ending(current) {
194        return true;
195    }
196
197    // Direct sentence-ending punctuation (! and ?)
198    if current == '!' || current == '?' {
199        return true;
200    }
201
202    // Period - need more careful handling
203    if current == '.' {
204        // Check for ellipsis (...) - always a valid sentence ending
205        if idx >= 2 && chars[idx - 1] == '.' && chars[idx - 2] == '.' {
206            return true;
207        }
208
209        // Build the text before the period by collecting chars up to idx
210        // (not including the period itself)
211        let text_before_period: String = chars[..idx].iter().collect();
212
213        // Check if this is an abbreviation
214        if text_ends_with_abbreviation(&format!("{text_before_period}."), abbreviations) {
215            return false;
216        }
217
218        // Check what comes before the period
219        if idx > 0 {
220            let prev = chars[idx - 1];
221
222            // Single letter before period - likely initial or list marker, not sentence
223            // e.g., "A." "B." but allow "a." at end of sentence
224            if prev.is_ascii_uppercase() {
225                // Check if it's preceded by whitespace or start of text (isolated initial)
226                if idx >= 2 {
227                    if chars[idx - 2].is_whitespace() {
228                        // "word A." - isolated initial, not sentence ending
229                        return false;
230                    }
231                } else {
232                    // "A." at start - not a sentence ending
233                    return false;
234                }
235            }
236
237            // If previous char is alphanumeric, closing quote/paren, or markdown inline delimiters, treat as sentence end
238            // Markdown inline elements that can end before punctuation:
239            // - `)` `]` - links, images, footnote refs
240            // - `` ` `` - inline code
241            // - `*` `_` - emphasis/bold
242            // - `~` - strikethrough
243            // - `=` - highlight (extended markdown)
244            // - `^` - superscript (extended markdown)
245            if prev.is_alphanumeric()
246                || is_closing_quote(prev)
247                || matches!(prev, ')' | ']' | '`' | '*' | '_' | '~' | '=' | '^')
248                || is_cjk_char(prev)
249            {
250                return true;
251            }
252        }
253
254        // Period at start or after non-word char - not a sentence ending
255        return false;
256    }
257
258    false
259}
260
/// Unit tests for the sentence-detection helpers.
///
/// Offsets passed to `is_after_sentence_ending` are BYTE offsets of the first
/// space in the run, so non-ASCII fixtures must count UTF-8 bytes, not chars.
#[cfg(test)]
mod tests {
    use super::*;

    // === Abbreviation tests ===

    #[test]
    fn test_get_abbreviations_default() {
        let abbrevs = get_abbreviations(&None);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("mr"));
        assert!(abbrevs.contains("prof"));
        assert!(abbrevs.contains("i.e"));
        assert!(abbrevs.contains("e.g"));
    }

    #[test]
    fn test_get_abbreviations_custom() {
        let custom = Some(vec!["Corp".to_string(), "Ltd.".to_string()]);
        let abbrevs = get_abbreviations(&custom);
        // Should include defaults
        assert!(abbrevs.contains("dr"));
        // Should include custom (normalized)
        assert!(abbrevs.contains("corp"));
        assert!(abbrevs.contains("ltd"));
    }

    #[test]
    fn test_get_abbreviations_ignores_empty_custom_entries() {
        // Entries that are empty after stripping periods must not be inserted
        let custom = Some(vec![String::new(), ".".to_string(), "...".to_string()]);
        assert_eq!(get_abbreviations(&custom), get_abbreviations(&None));
    }

    #[test]
    fn test_text_ends_with_abbreviation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Hello Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Prof.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Doctor.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Dr?", &abbrevs)); // Not a period
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
    }

    #[test]
    fn test_text_ends_with_abbreviation_after_punctuation() {
        let abbrevs = get_abbreviations(&None);
        // Abbreviations preceded by opening parenthesis
        assert!(text_ends_with_abbreviation("(e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (i.e.", &abbrevs));
        // Abbreviations preceded by opening bracket
        assert!(text_ends_with_abbreviation("[e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("[Dr.", &abbrevs));
        // Abbreviations preceded by quotes
        assert!(text_ends_with_abbreviation("\"Dr.", &abbrevs));
        // Abbreviations preceded by emphasis markers
        assert!(text_ends_with_abbreviation("*e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("**e.g.", &abbrevs));
        // Nested punctuation (quote + paren)
        assert!(text_ends_with_abbreviation("(\"e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("([Dr.", &abbrevs));
        // Non-abbreviations with leading punctuation should still not match
        assert!(!text_ends_with_abbreviation("(paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("[Doctor.", &abbrevs));
    }

    // === Punctuation helper tests ===

    #[test]
    fn test_is_closing_quote() {
        assert!(is_closing_quote('"'));
        assert!(is_closing_quote('\''));
        assert!(is_closing_quote('\u{201D}')); // right double quotation mark
        assert!(is_closing_quote('\u{2019}')); // right single quotation mark
        assert!(is_closing_quote('»'));
        assert!(is_closing_quote('›'));
        assert!(!is_closing_quote('a'));
        assert!(!is_closing_quote('.'));
    }

    #[test]
    fn test_is_opening_quote() {
        assert!(is_opening_quote('"'));
        assert!(is_opening_quote('\''));
        assert!(is_opening_quote('\u{201C}')); // left double quotation mark
        assert!(is_opening_quote('\u{2018}')); // left single quotation mark
        assert!(is_opening_quote('«'));
        assert!(is_opening_quote('‹'));
        // Directional closing quotes are not opening quotes
        assert!(!is_opening_quote('\u{201D}'));
        assert!(!is_opening_quote('»'));
        assert!(!is_opening_quote('a'));
    }

    #[test]
    fn test_is_cjk_sentence_ending() {
        assert!(is_cjk_sentence_ending('。'));
        assert!(is_cjk_sentence_ending('！'));
        assert!(is_cjk_sentence_ending('？'));
        assert!(!is_cjk_sentence_ending('.'));
        assert!(!is_cjk_sentence_ending('!'));
    }

    #[test]
    fn test_is_cjk_char() {
        assert!(is_cjk_char('中'));
        assert!(is_cjk_char('あ')); // Hiragana
        assert!(is_cjk_char('ア')); // Katakana
        assert!(is_cjk_char('한')); // Hangul
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('A'));
    }

    #[test]
    fn test_is_sentence_ending_punctuation() {
        assert!(is_sentence_ending_punctuation('.'));
        assert!(is_sentence_ending_punctuation('!'));
        assert!(is_sentence_ending_punctuation('?'));
        assert!(is_sentence_ending_punctuation('。'));
        assert!(is_sentence_ending_punctuation('！'));
        assert!(is_sentence_ending_punctuation('？'));
        assert!(!is_sentence_ending_punctuation(','));
        assert!(!is_sentence_ending_punctuation('a'));
    }

    #[test]
    fn test_is_trailing_close_punctuation() {
        assert!(is_trailing_close_punctuation(')'));
        assert!(is_trailing_close_punctuation(']'));
        assert!(is_trailing_close_punctuation('}'));
        assert!(is_trailing_close_punctuation('"'));
        assert!(is_trailing_close_punctuation('»'));
        assert!(!is_trailing_close_punctuation('('));
        assert!(!is_trailing_close_punctuation('.'));
    }

    // === is_after_sentence_ending tests ===

    #[test]
    fn test_after_period() {
        assert!(is_after_sentence_ending("Hello.  ", 6));
        assert!(is_after_sentence_ending("End of sentence.  Next", 16));
    }

    #[test]
    fn test_after_exclamation() {
        assert!(is_after_sentence_ending("Wow!  ", 4));
        assert!(is_after_sentence_ending("Great!  Next", 6));
    }

    #[test]
    fn test_after_question() {
        assert!(is_after_sentence_ending("Really?  ", 7));
        assert!(is_after_sentence_ending("What?  Next", 5));
    }

    #[test]
    fn test_after_closing_quote() {
        assert!(is_after_sentence_ending("He said \"Hello.\"  Next", 16));
        assert!(is_after_sentence_ending("She said 'Hi.'  Next", 14));
    }

    #[test]
    fn test_after_curly_quotes() {
        let content = format!("He said {}Hello.{}  Next", '\u{201C}', '\u{201D}');
        // Find the position after the closing quote
        let pos = content.find("  ").unwrap();
        assert!(is_after_sentence_ending(&content, pos));
    }

    #[test]
    fn test_after_closing_paren() {
        assert!(is_after_sentence_ending("(See note.)  Next", 11));
        assert!(is_after_sentence_ending("(Really!)  Next", 9));
    }

    #[test]
    fn test_after_closing_bracket() {
        assert!(is_after_sentence_ending("[Citation.]  Next", 11));
    }

    #[test]
    fn test_after_ellipsis() {
        assert!(is_after_sentence_ending("And so...  Next", 9));
        assert!(is_after_sentence_ending("Hmm...  Let me think", 6));
    }

    #[test]
    fn test_not_after_abbreviation() {
        // Dr. should NOT be treated as sentence ending
        assert!(!is_after_sentence_ending("Dr.  Smith", 3));
        assert!(!is_after_sentence_ending("Mr.  Jones", 3));
        assert!(!is_after_sentence_ending("Prof.  Williams", 5));
    }

    #[test]
    fn test_not_after_single_initial() {
        // Single capital letter + period is likely an initial, not sentence end
        assert!(!is_after_sentence_ending("John A.  Smith", 7));
        // But lowercase should work (end of sentence)
        assert!(is_after_sentence_ending("letter a.  Next", 9));
    }

    #[test]
    fn test_mid_sentence_not_detected() {
        // Spaces not after sentence punctuation
        assert!(!is_after_sentence_ending("word  word", 4));
        assert!(!is_after_sentence_ending("multiple  spaces", 8));
    }

    #[test]
    fn test_cjk_sentence_ending() {
        // CJK chars are 3 bytes each in UTF-8
        // 日(3)+本(3)+語(3)+。(3) = 12 bytes before the spaces
        assert!(is_after_sentence_ending("日本語。  Next", 12)); // After 。
        // 中(3)+文(3)+！(3) = 9 bytes before the spaces
        assert!(is_after_sentence_ending("中文！  Next", 9)); // After ！
        // 한(3)+국(3)+어(3)+？(3) = 12 bytes before the spaces
        assert!(is_after_sentence_ending("한국어？  Next", 12)); // After ？
    }

    #[test]
    fn test_complex_endings() {
        // Multiple closing punctuation
        assert!(is_after_sentence_ending("(He said \"Yes.\")  Next", 16));
        // Quote then paren
        assert!(is_after_sentence_ending("\"End.\")  Next", 7));
    }

    #[test]
    fn test_guillemets() {
        // « and » are 2 bytes each in UTF-8:
        // "Il dit "(7) + «(2) + "Oui."(4) + »(2) = 15 bytes before the spaces
        let content = "Il dit «Oui.»  Next";
        assert_eq!(content.find("  "), Some(15));
        assert!(is_after_sentence_ending(content, 15));
    }

    #[test]
    fn test_empty_and_edge_cases() {
        assert!(!is_after_sentence_ending("", 0));
        assert!(!is_after_sentence_ending(".", 0));
        assert!(!is_after_sentence_ending("a", 0));
    }

    #[test]
    fn test_invalid_byte_offsets() {
        // Offset past the end of the text
        assert!(!is_after_sentence_ending("Hi.", 4));
        // Offset inside a multi-byte character (not a char boundary)
        assert!(!is_after_sentence_ending("日本語。  Next", 1));
        assert!(!is_after_sentence_ending("日本語。  Next", 11));
    }

    #[test]
    fn test_latin_abbreviations() {
        // i.e. and e.g. should not be sentence endings
        assert!(!is_after_sentence_ending("i.e.  example", 4));
        assert!(!is_after_sentence_ending("e.g.  example", 4));
    }

    #[test]
    fn test_abbreviations_after_opening_punctuation() {
        // Abbreviations preceded by parentheses, brackets, quotes
        assert!(!is_after_sentence_ending("(e.g.  Wasm)", 5));
        assert!(!is_after_sentence_ending("(i.e.  PyO3)", 5));
        assert!(!is_after_sentence_ending("[e.g.  Chapter]", 5));
        assert!(!is_after_sentence_ending("(Dr.  Smith)", 4));
        // Nested punctuation: quote + paren
        assert!(!is_after_sentence_ending("(\"e.g.  something\")", 6));
    }

    #[test]
    fn test_after_inline_code() {
        // Issue #345: Sentence ending with inline code should be recognized
        // "Hello from `backticks`.  How's it going?"
        // Position 23 is after the period following the closing backtick
        assert!(is_after_sentence_ending("Hello from `backticks`.  Next", 23));

        // Simple case: just code and period
        assert!(is_after_sentence_ending("`code`.  Next", 7));

        // Multiple inline code spans
        assert!(is_after_sentence_ending("Use `foo` and `bar`.  Next", 20));

        // With exclamation mark
        assert!(is_after_sentence_ending("`important`!  Next", 12));

        // With question mark
        assert!(is_after_sentence_ending("Is it `true`?  Next", 13));

        // Inline code in the middle shouldn't affect sentence detection
        assert!(is_after_sentence_ending("The `code` works.  Next", 17));
    }

    #[test]
    fn test_after_inline_code_with_quotes() {
        // Inline code before closing quote before period
        assert!(is_after_sentence_ending("He said \"use `code`\".  Next", 21));

        // Inline code in parentheses
        assert!(is_after_sentence_ending("(see `example`).  Next", 16));
    }

    #[test]
    fn test_after_emphasis() {
        // Asterisk emphasis
        assert!(is_after_sentence_ending("The word is *important*.  Next", 24));

        // Underscore emphasis
        assert!(is_after_sentence_ending("The word is _important_.  Next", 24));

        // With exclamation
        assert!(is_after_sentence_ending("This is *urgent*!  Next", 17));

        // With question
        assert!(is_after_sentence_ending("Is it _true_?  Next", 13));
    }

    #[test]
    fn test_after_bold() {
        // Asterisk bold
        assert!(is_after_sentence_ending("The word is **critical**.  Next", 25));

        // Underscore bold
        assert!(is_after_sentence_ending("The word is __critical__.  Next", 25));
    }

    #[test]
    fn test_after_strikethrough() {
        // GFM strikethrough
        assert!(is_after_sentence_ending("This is ~~wrong~~.  Next", 18));

        // With exclamation
        assert!(is_after_sentence_ending("That was ~~bad~~!  Next", 17));
    }

    #[test]
    fn test_after_extended_markdown() {
        // Highlight syntax (some flavors)
        assert!(is_after_sentence_ending("This is ==highlighted==.  Next", 24));

        // Superscript syntax (some flavors)
        assert!(is_after_sentence_ending("E equals mc^2^.  Next", 15));
    }
}
rumdl_lib/utils/sentence_utils.rs

rumdl_lib/utils/
sentence_utils.rs