rumdl 0.1.51

A fast Markdown linter written in Rust (Ru(st) MarkDown Linter)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
//! Sentence detection utilities
//!
//! This module provides shared functionality for detecting sentence boundaries
//! in markdown text. Used by both text reflow (MD013) and the multiple spaces
//! rule (MD064).
//!
//! Features:
//! - Common abbreviation detection (Mr., Dr., Prof., etc.)
//! - CJK punctuation support (。 U+3002, ! U+FF01, ? U+FF1F)
//! - Closing quote detection (straight and curly)
//! - Both forward-looking (reflow) and backward-looking (MD064) sentence detection

use std::collections::HashSet;

/// Built-in abbreviations whose trailing period does NOT end a sentence.
///
/// Entries are stored lowercase and without the final period (`"e.g"`, not
/// `"e.g."`). An abbreviation belongs in this list only if it:
/// 1. is conventionally ALWAYS written with a period, and
/// 2. is almost always followed by more text rather than closing a sentence.
///
/// Abbreviations that commonly end sentences (etc., Inc., Ph.D., U.S.) are
/// deliberately left out.
pub const DEFAULT_ABBREVIATIONS: &[&str] = &[
    // Titles: always take a period and are followed by a name.
    "mr",
    "mrs",
    "ms",
    "dr",
    "prof",
    "sr",
    "jr",
    // Latin: always written with periods, introduce examples/clarifications.
    "i.e",
    "e.g",
    // Reference abbreviations: followed by the thing they refer to.
    "vs",
    "fig",
    "no",
    "vol",
    "ch",
    "sec",
    "al",
];

/// Build the effective abbreviation set: the built-in defaults plus any
/// user-supplied additions.
///
/// Custom entries never replace the defaults; they are merged into them.
/// Each custom entry has its trailing periods stripped and is lowercased so
/// that lookups can be case-insensitive. Entries that are empty after
/// stripping (for example `"."`) are dropped.
pub fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
    let defaults = DEFAULT_ABBREVIATIONS.iter().map(|abbr| abbr.to_lowercase());

    // `Option::iter` yields zero or one `&Vec<String>`; flattening walks its
    // elements, so a `None` simply contributes nothing.
    let extras = custom
        .iter()
        .flatten()
        .map(|abbr| abbr.trim_end_matches('.').to_lowercase())
        .filter(|abbr| !abbr.is_empty());

    defaults.chain(extras).collect()
}

/// Report whether `text` finishes with a known abbreviation and its period.
///
/// Only a trailing `.` qualifies; text ending in `!` or `?` is never an
/// abbreviation. The last whitespace-separated word (with every trailing
/// period removed) is compared as a whole against `abbreviations`, so a word
/// that merely ends in abbreviation-like letters ("paradigms" / "ms") does
/// not match.
///
/// `abbreviations` is expected to hold lowercase entries without the final
/// period; the candidate word is lowercased before the lookup.
///
/// Examples:
///   - "Dr." -> true (abbreviation)
///   - "Dr?" -> false (question, not abbreviation)
///   - "(e.g." -> true (leading punctuation is ignored)
///   - "paradigms." -> false (not in abbreviation list)
pub fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Abbreviations require a period.
    if !text.ends_with('.') {
        return false;
    }

    // Drop the trailing period(s) and look at the final word only.
    match text.trim_end_matches('.').split_whitespace().next_back() {
        Some(word) => {
            // Ignore opening punctuation glued to the word: parentheses,
            // brackets, quotes, emphasis markers ("(e.g", "[i.e", "**e.g").
            // Interior/leading periods are kept because they are part of
            // abbreviations such as "e.g".
            let candidate =
                word.trim_start_matches(|c: char| !(c.is_alphanumeric() || c == '.'));
            abbreviations.contains(candidate.to_lowercase().as_str())
        }
        // Nothing but whitespace/periods: no word to test.
        None => false,
    }
}

/// Check if a character is CJK sentence-ending punctuation.
///
/// Recognized characters:
/// - U+3002 IDEOGRAPHIC FULL STOP
/// - U+FF01 FULLWIDTH EXCLAMATION MARK
/// - U+FF1F FULLWIDTH QUESTION MARK
///
/// The ASCII `.`, `!` and `?` are NOT matched here.
pub fn is_cjk_sentence_ending(c: char) -> bool {
    // Written as escapes so the literals survive tools that drop or
    // width-fold non-ASCII characters.
    matches!(c, '\u{3002}' | '\u{FF01}' | '\u{FF1F}')
}

/// Check if a character is a closing quote mark.
///
/// Recognized characters:
/// - `"` and `'` (straight quotes, which also serve as opening quotes)
/// - U+201D RIGHT DOUBLE QUOTATION MARK
/// - U+2019 RIGHT SINGLE QUOTATION MARK
/// - U+00BB RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (right guillemet)
/// - U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
pub fn is_closing_quote(c: char) -> bool {
    // Non-ASCII marks are written as escapes so they cannot be lost or
    // confused with their look-alike opening counterparts.
    matches!(
        c,
        '"' | '\'' | '\u{201D}' | '\u{2019}' | '\u{00BB}' | '\u{203A}'
    )
}

/// Check if a character is an opening quote mark.
///
/// Recognized characters:
/// - `"` and `'` (straight quotes, which also serve as closing quotes)
/// - U+201C LEFT DOUBLE QUOTATION MARK
/// - U+2018 LEFT SINGLE QUOTATION MARK
/// - U+00AB LEFT-POINTING DOUBLE ANGLE QUOTATION MARK (left guillemet)
/// - U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
pub fn is_opening_quote(c: char) -> bool {
    // Non-ASCII marks are written as escapes so they cannot be lost or
    // confused with their look-alike closing counterparts.
    matches!(
        c,
        '"' | '\'' | '\u{201C}' | '\u{2018}' | '\u{00AB}' | '\u{2039}'
    )
}

/// Report whether `c` falls in one of the CJK (Chinese, Japanese, Korean)
/// script blocks this module cares about: the unified ideographs and their
/// Extension A, Hiragana, Katakana, and Hangul syllables.
pub fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code points of each recognized block.
    const CJK_BLOCKS: [(char, char); 5] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ];

    CJK_BLOCKS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}

/// Report whether `c` can terminate a sentence: ASCII `.`, `!`, `?`, or any
/// character accepted by [`is_cjk_sentence_ending`].
pub fn is_sentence_ending_punctuation(c: char) -> bool {
    match c {
        '.' | '!' | '?' => true,
        other => is_cjk_sentence_ending(other),
    }
}

/// Report whether `c` is closing punctuation that may trail the
/// sentence-ending mark: a closing parenthesis, bracket or brace, or any
/// character accepted by [`is_closing_quote`].
pub fn is_trailing_close_punctuation(c: char) -> bool {
    match c {
        ')' | ']' | '}' => true,
        other => is_closing_quote(other),
    }
}

/// Backward-looking sentence check used by MD064: does the text right before
/// `match_start` (the byte offset where a run of spaces begins) end a
/// sentence?
///
/// A `true` result supports the traditional two-spaces-after-a-sentence
/// convention. This variant uses only the built-in abbreviation list; see
/// [`is_after_sentence_ending_with_abbreviations`] to supply a custom set.
///
/// Treated as a sentence ending:
/// - ASCII `.`, `!`, `?` and the CJK marks U+3002, U+FF01, U+FF1F
/// - any of the above followed by closing quotes (straight or curly),
///   `)`, `]` or `}`, in any combination, e.g. `."`, `?)`, `.")`
/// - an ellipsis `...`
///
/// NOT treated as a sentence ending:
/// - known abbreviations such as `Dr.`, `Mr.`, `Prof.`
/// - an isolated capital letter followed by a period, e.g. `A.` (likely an
///   initial or list marker)
pub fn is_after_sentence_ending(text: &str, match_start: usize) -> bool {
    let default_abbreviations = get_abbreviations(&None);
    is_after_sentence_ending_with_abbreviations(text, match_start, &default_abbreviations)
}

/// Backward-looking sentence check with a caller-supplied abbreviation set:
/// does the text right before `match_start` end a sentence?
///
/// `match_start` is a BYTE offset into `text` (typically a regex match
/// start). Offsets that are zero, past the end of `text`, or not on a UTF-8
/// character boundary yield `false`. All further inspection is done on
/// `char`s, so multi-byte characters are handled correctly.
///
/// `abbreviations` should contain lowercase entries without the trailing
/// period; a period that closes one of them is not a sentence ending.
pub fn is_after_sentence_ending_with_abbreviations(
    text: &str,
    match_start: usize,
    abbreviations: &HashSet<String>,
) -> bool {
    // `str::get` rejects out-of-range offsets and offsets inside a multi-byte
    // character; the guard rejects an empty prefix.
    let chars: Vec<char> = match text.get(..match_start) {
        Some(prefix) if match_start > 0 => prefix.chars().collect(),
        _ => return false,
    };

    // Step back over trailing closers (quotes, parens, brackets) that may
    // follow the real terminator, e.g. `sentence."` or `sentence.")`.
    // Index 0 is never skipped, so a prefix made only of closers is judged
    // by its first character.
    let end = chars
        .iter()
        .rposition(|&c| !is_trailing_close_punctuation(c))
        .unwrap_or(0);

    match chars.get(end).copied() {
        // `!` and `?` always end a sentence.
        Some('!') | Some('?') => true,
        // So does CJK sentence-ending punctuation.
        Some(c) if is_cjk_sentence_ending(c) => true,
        // A period needs a closer look (abbreviations, initials, ...).
        Some('.') => period_closes_sentence(&chars, end, abbreviations),
        _ => false,
    }
}

/// Decide whether the period at `chars[dot]` ends a sentence.
///
/// `chars` is the text preceding the spaces being examined and `dot` is the
/// index of a `'.'` within it.
fn period_closes_sentence(chars: &[char], dot: usize, abbreviations: &HashSet<String>) -> bool {
    // An ellipsis (`...`) is always accepted as a sentence ending.
    if dot >= 2 && chars[dot - 1] == '.' && chars[dot - 2] == '.' {
        return true;
    }

    // Text up to and including the period, for the abbreviation lookup.
    let through_period: String = chars[..=dot].iter().collect();
    if text_ends_with_abbreviation(&through_period, abbreviations) {
        return false;
    }

    // A period with nothing before it is not a sentence ending.
    let prev = match dot.checked_sub(1) {
        Some(i) => chars[i],
        None => return false,
    };

    // An isolated ASCII capital before the period ("John A." or a leading
    // "A.") is most likely an initial or list marker. Lowercase letters are
    // not affected ("letter a." still ends a sentence).
    if prev.is_ascii_uppercase() {
        let isolated = match dot.checked_sub(2) {
            Some(i) => chars[i].is_whitespace(),
            // The capital is the very first character.
            None => true,
        };
        if isolated {
            return false;
        }
    }

    // The period ends a sentence when it follows a word character, a closing
    // quote, or the closing delimiter of a markdown inline element:
    // - `)` `]`  links, images, footnote refs
    // - `` ` ``  inline code
    // - `*` `_`  emphasis/bold
    // - `~`      strikethrough
    // - `=`      highlight (extended markdown)
    // - `^`      superscript (extended markdown)
    // Anything else (e.g. whitespace or other punctuation) does not.
    prev.is_alphanumeric()
        || is_closing_quote(prev)
        || matches!(prev, ')' | ']' | '`' | '*' | '_' | '~' | '=' | '^')
        || is_cjk_char(prev)
}

// Unit tests for the sentence-detection helpers.
//
// Positions passed to `is_after_sentence_ending` are BYTE offsets of the
// first space of the run being examined. Non-ASCII characters that matter to
// a test are written as `\u{...}` escapes so that byte counts stay
// unambiguous.
#[cfg(test)]
mod tests {
    use super::*;

    // === Abbreviation tests ===

    #[test]
    fn test_get_abbreviations_default() {
        let abbrevs = get_abbreviations(&None);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("mr"));
        assert!(abbrevs.contains("prof"));
        assert!(abbrevs.contains("i.e"));
        assert!(abbrevs.contains("e.g"));
    }

    #[test]
    fn test_get_abbreviations_custom() {
        let custom = Some(vec!["Corp".to_string(), "Ltd.".to_string()]);
        let abbrevs = get_abbreviations(&custom);
        // Should include defaults
        assert!(abbrevs.contains("dr"));
        // Should include custom (normalized)
        assert!(abbrevs.contains("corp"));
        assert!(abbrevs.contains("ltd"));
    }

    #[test]
    fn test_text_ends_with_abbreviation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Hello Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Prof.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Doctor.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Dr?", &abbrevs)); // Not a period
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
    }

    #[test]
    fn test_text_ends_with_abbreviation_after_punctuation() {
        let abbrevs = get_abbreviations(&None);
        // Abbreviations preceded by opening parenthesis
        assert!(text_ends_with_abbreviation("(e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (i.e.", &abbrevs));
        // Abbreviations preceded by opening bracket
        assert!(text_ends_with_abbreviation("[e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("[Dr.", &abbrevs));
        // Abbreviations preceded by quotes
        assert!(text_ends_with_abbreviation("\"Dr.", &abbrevs));
        // Abbreviations preceded by emphasis markers
        assert!(text_ends_with_abbreviation("*e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("**e.g.", &abbrevs));
        // Nested punctuation (quote + paren)
        assert!(text_ends_with_abbreviation("(\"e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("([Dr.", &abbrevs));
        // Non-abbreviations with leading punctuation should still not match
        assert!(!text_ends_with_abbreviation("(paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("[Doctor.", &abbrevs));
    }

    // === Punctuation helper tests ===

    #[test]
    fn test_is_closing_quote() {
        assert!(is_closing_quote('"'));
        assert!(is_closing_quote('\''));
        assert!(is_closing_quote('\u{201D}')); // right double quotation mark
        assert!(is_closing_quote('\u{2019}')); // right single quotation mark
        assert!(is_closing_quote('»'));
        assert!(is_closing_quote('\u{203A}')); // single right-pointing angle quotation mark
        assert!(!is_closing_quote('a'));
        assert!(!is_closing_quote('.'));
    }

    #[test]
    fn test_is_cjk_sentence_ending() {
        assert!(is_cjk_sentence_ending('\u{3002}')); // ideographic full stop
        assert!(is_cjk_sentence_ending('\u{FF01}')); // fullwidth exclamation mark
        assert!(is_cjk_sentence_ending('\u{FF1F}')); // fullwidth question mark
        assert!(!is_cjk_sentence_ending('.'));
        assert!(!is_cjk_sentence_ending('!'));
    }

    #[test]
    fn test_is_cjk_char() {
        assert!(is_cjk_char('\u{4E2D}')); // CJK unified ideograph
        assert!(is_cjk_char('\u{3042}')); // Hiragana
        assert!(is_cjk_char('\u{30A2}')); // Katakana
        assert!(is_cjk_char('\u{D55C}')); // Hangul
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('A'));
    }

    // === is_after_sentence_ending tests ===

    #[test]
    fn test_after_period() {
        assert!(is_after_sentence_ending("Hello.  ", 6));
        assert!(is_after_sentence_ending("End of sentence.  Next", 16));
    }

    #[test]
    fn test_after_exclamation() {
        assert!(is_after_sentence_ending("Wow!  ", 4));
        assert!(is_after_sentence_ending("Great!  Next", 6));
    }

    #[test]
    fn test_after_question() {
        assert!(is_after_sentence_ending("Really?  ", 7));
        assert!(is_after_sentence_ending("What?  Next", 5));
    }

    #[test]
    fn test_after_closing_quote() {
        assert!(is_after_sentence_ending("He said \"Hello.\"  Next", 16));
        assert!(is_after_sentence_ending("She said 'Hi.'  Next", 14));
    }

    #[test]
    fn test_after_curly_quotes() {
        let content = format!("He said {}Hello.{}  Next", '\u{201C}', '\u{201D}');
        // Find the position after the closing quote
        let pos = content.find("  ").unwrap();
        assert!(is_after_sentence_ending(&content, pos));
    }

    #[test]
    fn test_after_closing_paren() {
        assert!(is_after_sentence_ending("(See note.)  Next", 11));
        assert!(is_after_sentence_ending("(Really!)  Next", 9));
    }

    #[test]
    fn test_after_closing_bracket() {
        assert!(is_after_sentence_ending("[Citation.]  Next", 11));
    }

    #[test]
    fn test_after_ellipsis() {
        assert!(is_after_sentence_ending("And so...  Next", 9));
        assert!(is_after_sentence_ending("Hmm...  Let me think", 6));
    }

    #[test]
    fn test_not_after_abbreviation() {
        // Dr. should NOT be treated as sentence ending
        assert!(!is_after_sentence_ending("Dr.  Smith", 3));
        assert!(!is_after_sentence_ending("Mr.  Jones", 3));
        assert!(!is_after_sentence_ending("Prof.  Williams", 5));
    }

    #[test]
    fn test_not_after_single_initial() {
        // Single capital letter + period is likely an initial, not sentence end
        assert!(!is_after_sentence_ending("John A.  Smith", 7));
        // But lowercase should work (end of sentence)
        assert!(is_after_sentence_ending("letter a.  Next", 9));
    }

    #[test]
    fn test_mid_sentence_not_detected() {
        // Spaces not after sentence punctuation
        assert!(!is_after_sentence_ending("word  word", 4));
        assert!(!is_after_sentence_ending("multiple  spaces", 8));
    }

    #[test]
    fn test_cjk_sentence_ending() {
        // Every character before the spaces is 3 bytes in UTF-8.
        // 3 ideographs + U+3002 (ideographic full stop) = 12 bytes
        assert!(is_after_sentence_ending("日本語\u{3002}  Next", 12));
        // 2 ideographs + U+FF01 (fullwidth exclamation mark) = 9 bytes
        assert!(is_after_sentence_ending("中文\u{FF01}  Next", 9));
        // 3 Hangul syllables + U+FF1F (fullwidth question mark) = 12 bytes
        assert!(is_after_sentence_ending("한국어\u{FF1F}  Next", 12));
    }

    #[test]
    fn test_complex_endings() {
        // Multiple closing punctuation
        assert!(is_after_sentence_ending("(He said \"Yes.\")  Next", 16));
        // Quote then paren
        assert!(is_after_sentence_ending("\"End.\")  Next", 7));
    }

    #[test]
    fn test_guillemets() {
        // Guillemets are 2 bytes each in UTF-8:
        // "Il dit " (7) + « (2) + "Oui." (4) + » (2) = 15 bytes before the spaces
        assert!(is_after_sentence_ending("Il dit «Oui.»  Next", 15));
    }

    #[test]
    fn test_empty_and_edge_cases() {
        assert!(!is_after_sentence_ending("", 0));
        assert!(!is_after_sentence_ending(".", 0));
        assert!(!is_after_sentence_ending("a", 0));
    }

    #[test]
    fn test_latin_abbreviations() {
        // i.e. and e.g. should not be sentence endings
        assert!(!is_after_sentence_ending("i.e.  example", 4));
        assert!(!is_after_sentence_ending("e.g.  example", 4));
    }

    #[test]
    fn test_abbreviations_after_opening_punctuation() {
        // Abbreviations preceded by parentheses, brackets, quotes
        assert!(!is_after_sentence_ending("(e.g.  Wasm)", 5));
        assert!(!is_after_sentence_ending("(i.e.  PyO3)", 5));
        assert!(!is_after_sentence_ending("[e.g.  Chapter]", 5));
        assert!(!is_after_sentence_ending("(Dr.  Smith)", 4));
        // Nested punctuation: quote + paren
        assert!(!is_after_sentence_ending("(\"e.g.  something\")", 6));
    }

    #[test]
    fn test_after_inline_code() {
        // Issue #345: Sentence ending with inline code should be recognized
        // "Hello from `backticks`.  How's it going?"
        // Position 23 is after the period following the closing backtick
        assert!(is_after_sentence_ending("Hello from `backticks`.  Next", 23));

        // Simple case: just code and period
        assert!(is_after_sentence_ending("`code`.  Next", 7));

        // Multiple inline code spans
        assert!(is_after_sentence_ending("Use `foo` and `bar`.  Next", 20));

        // With exclamation mark
        assert!(is_after_sentence_ending("`important`!  Next", 12));

        // With question mark
        assert!(is_after_sentence_ending("Is it `true`?  Next", 13));

        // Inline code in the middle shouldn't affect sentence detection
        assert!(is_after_sentence_ending("The `code` works.  Next", 17));
    }

    #[test]
    fn test_after_inline_code_with_quotes() {
        // Inline code before closing quote before period
        assert!(is_after_sentence_ending("He said \"use `code`\".  Next", 21));

        // Inline code in parentheses
        assert!(is_after_sentence_ending("(see `example`).  Next", 16));
    }

    #[test]
    fn test_after_emphasis() {
        // Asterisk emphasis
        assert!(is_after_sentence_ending("The word is *important*.  Next", 24));

        // Underscore emphasis
        assert!(is_after_sentence_ending("The word is _important_.  Next", 24));

        // With exclamation
        assert!(is_after_sentence_ending("This is *urgent*!  Next", 17));

        // With question
        assert!(is_after_sentence_ending("Is it _true_?  Next", 13));
    }

    #[test]
    fn test_after_bold() {
        // Asterisk bold
        assert!(is_after_sentence_ending("The word is **critical**.  Next", 25));

        // Underscore bold
        assert!(is_after_sentence_ending("The word is __critical__.  Next", 25));
    }

    #[test]
    fn test_after_strikethrough() {
        // GFM strikethrough
        assert!(is_after_sentence_ending("This is ~~wrong~~.  Next", 18));

        // With exclamation
        assert!(is_after_sentence_ending("That was ~~bad~~!  Next", 17));
    }

    #[test]
    fn test_after_extended_markdown() {
        // Highlight syntax (some flavors)
        assert!(is_after_sentence_ending("This is ==highlighted==.  Next", 24));

        // Superscript syntax (some flavors)
        assert!(is_after_sentence_ending("E equals mc^2^.  Next", 15));
    }
}