1use std::collections::HashSet;
14
/// Built-in abbreviations that are always recognized.
///
/// Entries are lowercase and carry no trailing period; multi-part forms such
/// as `i.e` keep only their interior period.
const DEFAULT_ABBREVIATIONS: &[&str] = &[
    // Titles and name-related forms.
    "mr",
    "mrs",
    "ms",
    "dr",
    "prof",
    "sr",
    "jr",
    "st",
    // Latin and reference-style forms.
    "i.e",
    "e.g",
    "vs",
    "fig",
    "no",
    "vol",
    "ch",
    "sec",
    "al",
];
30
31pub fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
35 let mut abbreviations: HashSet<String> = DEFAULT_ABBREVIATIONS.iter().map(|s| s.to_lowercase()).collect();
36
37 if let Some(custom_list) = custom {
40 for abbr in custom_list {
41 let normalized = abbr.trim_end_matches('.').to_lowercase();
42 if !normalized.is_empty() {
43 abbreviations.insert(normalized);
44 }
45 }
46 }
47
48 abbreviations
49}
50
/// Reports whether `text` ends with a known abbreviation followed by a period.
///
/// The last whitespace-separated word of `text` (with all trailing periods
/// removed) is stripped of leading characters that are neither alphanumeric
/// nor `.`, lowercased, and looked up in `abbreviations`. If that fails and
/// the word contains a hyphen, the part after the last hyphen is looked up
/// instead (so `Wrangell-St.` matches `st`).
///
/// Returns `false` when `text` does not end with `.` or has no word before it.
pub fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    if !text.ends_with('.') {
        return false;
    }

    // Only the final word matters, so take it from the back of the iterator.
    let Some(word) = text.trim_end_matches('.').split_whitespace().next_back() else {
        return false;
    };

    // Drop leading punctuation such as `(`, `[`, `"` or `*`, but keep periods.
    let candidate = word.trim_start_matches(|c: char| !(c.is_alphanumeric() || c == '.'));

    let is_known = |piece: &str| abbreviations.contains(&piece.to_lowercase());

    if is_known(candidate) {
        return true;
    }

    // Hyphenated compound: retry with the segment after the last hyphen.
    match candidate.rsplit_once('-') {
        Some((_, suffix)) if !suffix.is_empty() => is_known(suffix),
        _ => false,
    }
}
99
/// Returns `true` if `c` is a CJK sentence-ending punctuation mark.
///
/// Recognized characters are the ideographic full stop (U+3002) and the
/// full-width exclamation and question marks (U+FF01, U+FF1F). Their ASCII
/// look-alikes `!` and `?` are intentionally not matched here.
pub fn is_cjk_sentence_ending(c: char) -> bool {
    // Escapes are used so the full-width forms cannot be silently folded to
    // their ASCII equivalents by text normalization.
    matches!(c, '\u{3002}' | '\u{FF01}' | '\u{FF1F}')
}
105
/// Returns `true` if `c` can close a quotation.
///
/// Covers the straight double and single quotes, the right curly double and
/// single quotes (U+201D, U+2019), and the right-pointing guillemets.
pub fn is_closing_quote(c: char) -> bool {
    const CLOSERS: [char; 6] = ['"', '\'', '\u{201D}', '\u{2019}', '»', '›'];
    CLOSERS.contains(&c)
}
113
/// Returns `true` if `c` can open a quotation.
///
/// Covers the straight double and single quotes, the left curly double and
/// single quotes (U+201C, U+2018), and the left-pointing guillemets.
pub fn is_opening_quote(c: char) -> bool {
    match c {
        // Straight quotes are direction-neutral, so they count as openers too.
        '"' | '\'' => true,
        '\u{201C}' | '\u{2018}' => true,
        '«' | '‹' => true,
        _ => false,
    }
}
121
/// Returns `true` if `c` falls in one of the CJK code-point ranges checked here.
///
/// The ranges are U+4E00–U+9FFF, U+3400–U+4DBF, U+3040–U+309F,
/// U+30A0–U+30FF and U+AC00–U+D7AF.
pub fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) bounds of each accepted range.
    const RANGES: [(char, char); 5] = [
        ('\u{4E00}', '\u{9FFF}'),
        ('\u{3400}', '\u{4DBF}'),
        ('\u{3040}', '\u{309F}'),
        ('\u{30A0}', '\u{30FF}'),
        ('\u{AC00}', '\u{D7AF}'),
    ];

    RANGES.iter().any(|&(first, last)| first <= c && c <= last)
}
133
134fn is_trailing_close_punctuation(c: char) -> bool {
137 is_closing_quote(c) || matches!(c, ')' | ']' | '}')
138}
139
140pub fn is_after_sentence_ending(text: &str, match_start: usize) -> bool {
158 is_after_sentence_ending_with_abbreviations(text, match_start, &get_abbreviations(&None))
159}
160
161fn is_after_sentence_ending_with_abbreviations(
167 text: &str,
168 match_start: usize,
169 abbreviations: &HashSet<String>,
170) -> bool {
171 if match_start == 0 || match_start > text.len() {
172 return false;
173 }
174
175 let Some(before) = text.get(..match_start) else {
178 return false; };
180
181 let chars: Vec<char> = before.chars().collect();
183 if chars.is_empty() {
184 return false;
185 }
186
187 let mut idx = chars.len() - 1;
188
189 while idx > 0 && is_trailing_close_punctuation(chars[idx]) {
193 idx -= 1;
194 }
195
196 let current = chars[idx];
198
199 if is_cjk_sentence_ending(current) {
201 return true;
202 }
203
204 if current == '!' || current == '?' {
206 return true;
207 }
208
209 if current == '.' {
211 if idx >= 2 && chars[idx - 1] == '.' && chars[idx - 2] == '.' {
213 return true;
214 }
215
216 let text_before_period: String = chars[..idx].iter().collect();
219
220 if text_ends_with_abbreviation(&format!("{text_before_period}."), abbreviations) {
222 return false;
223 }
224
225 if idx > 0 {
227 let prev = chars[idx - 1];
228
229 if prev.is_ascii_uppercase() {
232 if idx >= 2 {
234 if chars[idx - 2].is_whitespace() {
235 return false;
237 }
238 } else {
239 return false;
241 }
242 }
243
244 if prev.is_alphanumeric()
253 || is_closing_quote(prev)
254 || matches!(prev, ')' | ']' | '`' | '*' | '_' | '~' | '=' | '^')
255 || is_cjk_char(prev)
256 {
257 return true;
258 }
259 }
260
261 return false;
263 }
264
265 false
266}
267
/// Unit tests for the abbreviation helpers, character classifiers and
/// sentence-boundary detection. Offsets passed to `is_after_sentence_ending`
/// are byte offsets into the sample text.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_abbreviations_default() {
        let abbrevs = get_abbreviations(&None);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("mr"));
        assert!(abbrevs.contains("prof"));
        assert!(abbrevs.contains("i.e"));
        assert!(abbrevs.contains("e.g"));
        assert!(abbrevs.contains("st"));
    }

    #[test]
    fn test_st_abbreviation_not_sentence_boundary() {
        let abbrevs = get_abbreviations(&None);

        assert!(text_ends_with_abbreviation("St.", &abbrevs));

        // Hyphenated compound ending in an abbreviation.
        assert!(text_ends_with_abbreviation("Wrangell-St.", &abbrevs));

        // Words that merely contain or start with "st" must not match.
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("starts.", &abbrevs));

        // Hyphenated words whose last segment is not an abbreviation.
        assert!(!text_ends_with_abbreviation("word-foo.", &abbrevs));
        assert!(!text_ends_with_abbreviation("end-street.", &abbrevs));

        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Mr.", &abbrevs));
    }

    #[test]
    fn test_get_abbreviations_custom() {
        let custom = Some(vec!["Corp".to_string(), "Ltd.".to_string()]);
        let abbrevs = get_abbreviations(&custom);
        // Defaults are still present.
        assert!(abbrevs.contains("dr"));
        // Custom entries are lowercased.
        assert!(abbrevs.contains("corp"));
        // Trailing period is stripped.
        assert!(abbrevs.contains("ltd"));
    }

    #[test]
    fn test_text_ends_with_abbreviation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Hello Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Prof.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Doctor.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Dr?", &abbrevs));
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
    }

    #[test]
    fn test_text_ends_with_abbreviation_after_punctuation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("(e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("[e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("[Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("\"Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("*e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("**e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(\"e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("([Dr.", &abbrevs));
        assert!(!text_ends_with_abbreviation("(paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("[Doctor.", &abbrevs));
    }

    #[test]
    fn test_is_closing_quote() {
        assert!(is_closing_quote('"'));
        assert!(is_closing_quote('\''));
        assert!(is_closing_quote('\u{201D}'));
        assert!(is_closing_quote('\u{2019}'));
        assert!(is_closing_quote('»'));
        assert!(is_closing_quote('›'));
        assert!(!is_closing_quote('a'));
        assert!(!is_closing_quote('.'));
    }

    #[test]
    fn test_is_cjk_sentence_ending() {
        assert!(is_cjk_sentence_ending('。'));
        // Full-width exclamation and question marks, written as escapes so
        // they cannot be confused with (or folded into) the ASCII forms.
        assert!(is_cjk_sentence_ending('\u{FF01}'));
        assert!(is_cjk_sentence_ending('\u{FF1F}'));
        assert!(!is_cjk_sentence_ending('.'));
        assert!(!is_cjk_sentence_ending('!'));
    }

    #[test]
    fn test_is_cjk_char() {
        assert!(is_cjk_char('中'));
        assert!(is_cjk_char('あ'));
        assert!(is_cjk_char('ア'));
        assert!(is_cjk_char('한'));
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('A'));
    }

    #[test]
    fn test_after_period() {
        assert!(is_after_sentence_ending("Hello. ", 6));
        assert!(is_after_sentence_ending("End of sentence. Next", 16));
    }

    #[test]
    fn test_after_exclamation() {
        assert!(is_after_sentence_ending("Wow! ", 4));
        assert!(is_after_sentence_ending("Great! Next", 6));
    }

    #[test]
    fn test_after_question() {
        assert!(is_after_sentence_ending("Really? ", 7));
        assert!(is_after_sentence_ending("What? Next", 5));
    }

    #[test]
    fn test_after_closing_quote() {
        assert!(is_after_sentence_ending("He said \"Hello.\" Next", 16));
        assert!(is_after_sentence_ending("She said 'Hi.' Next", 14));
    }

    #[test]
    fn test_after_curly_quotes() {
        let content = format!("He said {}Hello.{} Next", '\u{201C}', '\u{201D}');
        // Locate the space that follows the closing quote; a bare `find(" ")`
        // would return the first space in the sentence instead.
        let pos = content.find(" Next").unwrap();
        assert!(is_after_sentence_ending(&content, pos));
    }

    #[test]
    fn test_after_closing_paren() {
        assert!(is_after_sentence_ending("(See note.) Next", 11));
        assert!(is_after_sentence_ending("(Really!) Next", 9));
    }

    #[test]
    fn test_after_closing_bracket() {
        assert!(is_after_sentence_ending("[Citation.] Next", 11));
    }

    #[test]
    fn test_after_ellipsis() {
        assert!(is_after_sentence_ending("And so... Next", 9));
        assert!(is_after_sentence_ending("Hmm... Let me think", 6));
    }

    #[test]
    fn test_not_after_abbreviation() {
        assert!(!is_after_sentence_ending("Dr. Smith", 3));
        assert!(!is_after_sentence_ending("Mr. Jones", 3));
        assert!(!is_after_sentence_ending("Prof. Williams", 5));
    }

    #[test]
    fn test_not_after_single_initial() {
        // A lone capital letter after whitespace is an initial.
        assert!(!is_after_sentence_ending("John A. Smith", 7));
        // A lowercase single letter is not treated as an initial.
        assert!(is_after_sentence_ending("letter a. Next", 9));
    }

    #[test]
    fn test_mid_sentence_not_detected() {
        assert!(!is_after_sentence_ending("word word", 4));
        assert!(!is_after_sentence_ending("multiple spaces", 8));
    }

    #[test]
    fn test_cjk_sentence_ending() {
        // Each CJK character and full-width mark below is 3 bytes in UTF-8.
        assert!(is_after_sentence_ending("日本語。 Next", 12));
        assert!(is_after_sentence_ending("中文\u{FF01} Next", 9));
        assert!(is_after_sentence_ending("한국어\u{FF1F} Next", 12));
    }

    #[test]
    fn test_complex_endings() {
        // Period, closing quote, then closing paren.
        assert!(is_after_sentence_ending("(He said \"Yes.\") Next", 16));
        assert!(is_after_sentence_ending("\"End.\") Next", 7));
    }

    #[test]
    fn test_guillemets() {
        assert!(is_after_sentence_ending("Il dit «Oui.» Next", 13));
    }

    #[test]
    fn test_empty_and_edge_cases() {
        assert!(!is_after_sentence_ending("", 0));
        assert!(!is_after_sentence_ending(".", 0));
        assert!(!is_after_sentence_ending("a", 0));
    }

    #[test]
    fn test_latin_abbreviations() {
        assert!(!is_after_sentence_ending("i.e. example", 4));
        assert!(!is_after_sentence_ending("e.g. example", 4));
    }

    #[test]
    fn test_abbreviations_after_opening_punctuation() {
        assert!(!is_after_sentence_ending("(e.g. Wasm)", 5));
        assert!(!is_after_sentence_ending("(i.e. PyO3)", 5));
        assert!(!is_after_sentence_ending("[e.g. Chapter]", 5));
        assert!(!is_after_sentence_ending("(Dr. Smith)", 4));
        assert!(!is_after_sentence_ending("(\"e.g. something\")", 6));
    }

    #[test]
    fn test_after_inline_code() {
        assert!(is_after_sentence_ending("Hello from `backticks`. Next", 23));

        assert!(is_after_sentence_ending("`code`. Next", 7));

        assert!(is_after_sentence_ending("Use `foo` and `bar`. Next", 20));

        assert!(is_after_sentence_ending("`important`! Next", 12));

        assert!(is_after_sentence_ending("Is it `true`? Next", 13));

        assert!(is_after_sentence_ending("The `code` works. Next", 17));
    }

    #[test]
    fn test_after_inline_code_with_quotes() {
        assert!(is_after_sentence_ending("He said \"use `code`\". Next", 21));

        assert!(is_after_sentence_ending("(see `example`). Next", 16));
    }

    #[test]
    fn test_after_emphasis() {
        assert!(is_after_sentence_ending("The word is *important*. Next", 24));

        assert!(is_after_sentence_ending("The word is _important_. Next", 24));

        assert!(is_after_sentence_ending("This is *urgent*! Next", 17));

        assert!(is_after_sentence_ending("Is it _true_? Next", 13));
    }

    #[test]
    fn test_after_bold() {
        assert!(is_after_sentence_ending("The word is **critical**. Next", 25));

        assert!(is_after_sentence_ending("The word is __critical__. Next", 25));
    }

    #[test]
    fn test_after_strikethrough() {
        assert!(is_after_sentence_ending("This is ~~wrong~~. Next", 18));

        assert!(is_after_sentence_ending("That was ~~bad~~! Next", 17));
    }

    #[test]
    fn test_after_extended_markdown() {
        assert!(is_after_sentence_ending("This is ==highlighted==. Next", 24));

        assert!(is_after_sentence_ending("E equals mc^2^. Next", 15));
    }
}