1use std::collections::HashSet;
14
/// Built-in abbreviations whose trailing period should not be read as the end
/// of a sentence.
///
/// Every entry is lowercase and written WITHOUT its final period; periods inside
/// an abbreviation are kept (`"i.e"`, `"e.g"`). `get_abbreviations` seeds its
/// lookup set from this list.
pub const DEFAULT_ABBREVIATIONS: &[&str] = &[
    // Titles and name suffixes.
    "mr", "mrs", "ms", "dr", "prof", "sr", "jr",
    // Latin abbreviations (inner period kept, trailing period omitted).
    "i.e", "e.g",
];
30
31pub fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
35 let mut abbreviations: HashSet<String> = DEFAULT_ABBREVIATIONS.iter().map(|s| s.to_lowercase()).collect();
36
37 if let Some(custom_list) = custom {
40 for abbr in custom_list {
41 let normalized = abbr.trim_end_matches('.').to_lowercase();
42 if !normalized.is_empty() {
43 abbreviations.insert(normalized);
44 }
45 }
46 }
47
48 abbreviations
49}
50
/// Reports whether `text` ends with a known abbreviation followed by a period.
///
/// * `text` - the text to inspect; it must end in `.` for a match to be possible.
/// * `abbreviations` - lowercase abbreviations without their trailing period.
///
/// All trailing periods are removed, the last whitespace-separated word is
/// taken, any leading characters that are neither alphanumeric nor `.` (such
/// as `(`, `[`, `"`, `*`) are stripped from it, and the lowercased remainder is
/// looked up in `abbreviations`.
pub fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    // Without a final period there is nothing that could be an abbreviation dot.
    if !text.ends_with('.') {
        return false;
    }

    let final_token = text.trim_end_matches('.').split_whitespace().next_back();

    match final_token {
        Some(token) => {
            // Drop opening punctuation / markup glued to the front of the word,
            // but keep periods so that forms like "e.g" survive intact.
            let candidate =
                token.trim_start_matches(|ch: char| ch != '.' && !ch.is_alphanumeric());
            abbreviations.contains(&candidate.to_lowercase())
        }
        // Nothing but periods and/or whitespace.
        None => false,
    }
}
86
/// Returns `true` when `c` is a CJK (full-width) sentence terminator.
///
/// Recognized characters:
/// * U+3002 IDEOGRAPHIC FULL STOP
/// * U+FF01 FULLWIDTH EXCLAMATION MARK
/// * U+FF1F FULLWIDTH QUESTION MARK
///
/// The ASCII characters `.`, `!` and `?` are deliberately NOT matched here.
/// The code points are written as escapes so the full-width forms cannot be
/// silently folded to their ASCII look-alikes by editors or text normalization.
pub fn is_cjk_sentence_ending(c: char) -> bool {
    matches!(c, '\u{3002}' | '\u{FF01}' | '\u{FF1F}')
}
92
/// Returns `true` when `c` can close a quotation.
///
/// Accepted characters: the ASCII double and single quotes, the right curly
/// double/single quotes (U+201D, U+2019) and the right-pointing guillemets
/// (U+00BB, U+203A).
pub fn is_closing_quote(c: char) -> bool {
    const CLOSING_QUOTES: [char; 6] = [
        '"',
        '\'',
        '\u{201D}',
        '\u{2019}',
        '\u{00BB}',
        '\u{203A}',
    ];
    CLOSING_QUOTES.contains(&c)
}
100
/// Returns `true` when `c` can open a quotation.
///
/// Accepted characters: the ASCII double and single quotes, the left curly
/// double/single quotes (U+201C, U+2018) and the left-pointing guillemets
/// (U+00AB, U+2039).
pub fn is_opening_quote(c: char) -> bool {
    const OPENING_QUOTES: [char; 6] = [
        '"',
        '\'',
        '\u{201C}',
        '\u{2018}',
        '\u{00AB}',
        '\u{2039}',
    ];
    OPENING_QUOTES.contains(&c)
}
108
/// Returns `true` when `c` lies in one of the CJK code-point ranges this
/// module recognizes.
///
/// Covered ranges (inclusive): U+4E00..U+9FFF, U+3400..U+4DBF, U+3040..U+309F,
/// U+30A0..U+30FF and U+AC00..U+D7AF.
pub fn is_cjk_char(c: char) -> bool {
    // (first, last) code point of each accepted range, both ends inclusive.
    const CJK_RANGES: [(char, char); 5] = [
        ('\u{4E00}', '\u{9FFF}'),
        ('\u{3400}', '\u{4DBF}'),
        ('\u{3040}', '\u{309F}'),
        ('\u{30A0}', '\u{30FF}'),
        ('\u{AC00}', '\u{D7AF}'),
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| first <= c && c <= last)
}
120
121pub fn is_sentence_ending_punctuation(c: char) -> bool {
123 matches!(c, '.' | '!' | '?') || is_cjk_sentence_ending(c)
124}
125
126pub fn is_trailing_close_punctuation(c: char) -> bool {
129 is_closing_quote(c) || matches!(c, ')' | ']' | '}')
130}
131
132pub fn is_after_sentence_ending(text: &str, match_start: usize) -> bool {
150 is_after_sentence_ending_with_abbreviations(text, match_start, &get_abbreviations(&None))
151}
152
153pub fn is_after_sentence_ending_with_abbreviations(
159 text: &str,
160 match_start: usize,
161 abbreviations: &HashSet<String>,
162) -> bool {
163 if match_start == 0 || match_start > text.len() {
164 return false;
165 }
166
167 let before = match text.get(..match_start) {
170 Some(s) => s,
171 None => return false, };
173
174 let chars: Vec<char> = before.chars().collect();
176 if chars.is_empty() {
177 return false;
178 }
179
180 let mut idx = chars.len() - 1;
181
182 while idx > 0 && is_trailing_close_punctuation(chars[idx]) {
186 idx -= 1;
187 }
188
189 let current = chars[idx];
191
192 if is_cjk_sentence_ending(current) {
194 return true;
195 }
196
197 if current == '!' || current == '?' {
199 return true;
200 }
201
202 if current == '.' {
204 if idx >= 2 && chars[idx - 1] == '.' && chars[idx - 2] == '.' {
206 return true;
207 }
208
209 let text_before_period: String = chars[..idx].iter().collect();
212
213 if text_ends_with_abbreviation(&format!("{text_before_period}."), abbreviations) {
215 return false;
216 }
217
218 if idx > 0 {
220 let prev = chars[idx - 1];
221
222 if prev.is_ascii_uppercase() {
225 if idx >= 2 {
227 if chars[idx - 2].is_whitespace() {
228 return false;
230 }
231 } else {
232 return false;
234 }
235 }
236
237 if prev.is_alphanumeric()
246 || is_closing_quote(prev)
247 || matches!(prev, ')' | ']' | '`' | '*' | '_' | '~' | '=' | '^')
248 || is_cjk_char(prev)
249 {
250 return true;
251 }
252 }
253
254 return false;
256 }
257
258 false
259}
260
// Unit tests for the sentence-boundary helpers.
//
// All positions passed to `is_after_sentence_ending` are BYTE offsets into the
// text, so multi-byte characters (curly quotes, guillemets, CJK) count as 2-3
// bytes each. Full-width punctuation is written with `\u{...}` escapes so it
// cannot be confused with (or normalized into) its ASCII look-alike.
#[cfg(test)]
mod tests {
    use super::*;

    /// The default set contains the built-in titles and Latin abbreviations.
    #[test]
    fn test_get_abbreviations_default() {
        let abbrevs = get_abbreviations(&None);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("mr"));
        assert!(abbrevs.contains("prof"));
        assert!(abbrevs.contains("i.e"));
        assert!(abbrevs.contains("e.g"));
    }

    /// Custom entries are lowercased, lose their trailing period, and are
    /// added on top of the defaults.
    #[test]
    fn test_get_abbreviations_custom() {
        let custom = Some(vec!["Corp".to_string(), "Ltd.".to_string()]);
        let abbrevs = get_abbreviations(&custom);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("corp"));
        assert!(abbrevs.contains("ltd"));
    }

    #[test]
    fn test_text_ends_with_abbreviation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Hello Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Prof.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Doctor.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Dr?", &abbrevs));
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
    }

    /// Opening punctuation or markup glued to the abbreviation is ignored.
    #[test]
    fn test_text_ends_with_abbreviation_after_punctuation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("(e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("[e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("[Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("\"Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("*e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("**e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(\"e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("([Dr.", &abbrevs));
        assert!(!text_ends_with_abbreviation("(paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("[Doctor.", &abbrevs));
    }

    #[test]
    fn test_is_closing_quote() {
        assert!(is_closing_quote('"'));
        assert!(is_closing_quote('\''));
        assert!(is_closing_quote('\u{201D}'));
        assert!(is_closing_quote('\u{2019}'));
        assert!(is_closing_quote('»'));
        assert!(is_closing_quote('›'));
        assert!(!is_closing_quote('a'));
        assert!(!is_closing_quote('.'));
    }

    /// Only the full-width terminators count; their ASCII counterparts do not.
    #[test]
    fn test_is_cjk_sentence_ending() {
        assert!(is_cjk_sentence_ending('\u{3002}')); // ideographic full stop
        assert!(is_cjk_sentence_ending('\u{FF01}')); // fullwidth exclamation mark
        assert!(is_cjk_sentence_ending('\u{FF1F}')); // fullwidth question mark
        assert!(!is_cjk_sentence_ending('.'));
        assert!(!is_cjk_sentence_ending('!'));
    }

    #[test]
    fn test_is_cjk_char() {
        assert!(is_cjk_char('中'));
        assert!(is_cjk_char('あ'));
        assert!(is_cjk_char('ア'));
        assert!(is_cjk_char('한'));
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('A'));
    }

    #[test]
    fn test_after_period() {
        assert!(is_after_sentence_ending("Hello. ", 6));
        assert!(is_after_sentence_ending("End of sentence. Next", 16));
    }

    #[test]
    fn test_after_exclamation() {
        assert!(is_after_sentence_ending("Wow! ", 4));
        assert!(is_after_sentence_ending("Great! Next", 6));
    }

    #[test]
    fn test_after_question() {
        assert!(is_after_sentence_ending("Really? ", 7));
        assert!(is_after_sentence_ending("What? Next", 5));
    }

    #[test]
    fn test_after_closing_quote() {
        assert!(is_after_sentence_ending("He said \"Hello.\" Next", 16));
        assert!(is_after_sentence_ending("She said 'Hi.' Next", 14));
    }

    #[test]
    fn test_after_curly_quotes() {
        let content = format!("He said {}Hello.{} Next", '\u{201C}', '\u{201D}');
        // Search for the gap that follows the closing quote; searching for a
        // bare space would find the one after "He" instead.
        let pos = content.find(" Next").unwrap();
        assert!(is_after_sentence_ending(&content, pos));
    }

    #[test]
    fn test_after_closing_paren() {
        assert!(is_after_sentence_ending("(See note.) Next", 11));
        assert!(is_after_sentence_ending("(Really!) Next", 9));
    }

    #[test]
    fn test_after_closing_bracket() {
        assert!(is_after_sentence_ending("[Citation.] Next", 11));
    }

    #[test]
    fn test_after_ellipsis() {
        assert!(is_after_sentence_ending("And so... Next", 9));
        assert!(is_after_sentence_ending("Hmm... Let me think", 6));
    }

    #[test]
    fn test_not_after_abbreviation() {
        assert!(!is_after_sentence_ending("Dr. Smith", 3));
        assert!(!is_after_sentence_ending("Mr. Jones", 3));
        assert!(!is_after_sentence_ending("Prof. Williams", 5));
    }

    /// An uppercase single-letter initial is not a sentence end; a lowercase
    /// single letter is.
    #[test]
    fn test_not_after_single_initial() {
        assert!(!is_after_sentence_ending("John A. Smith", 7));
        assert!(is_after_sentence_ending("letter a. Next", 9));
    }

    #[test]
    fn test_mid_sentence_not_detected() {
        assert!(!is_after_sentence_ending("word word", 4));
        assert!(!is_after_sentence_ending("multiple spaces", 8));
    }

    /// Each CJK character and each full-width terminator is 3 bytes in UTF-8.
    #[test]
    fn test_cjk_sentence_ending() {
        assert!(is_after_sentence_ending("日本語\u{3002} Next", 12));
        assert!(is_after_sentence_ending("中文\u{FF01} Next", 9));
        assert!(is_after_sentence_ending("한국어\u{FF1F} Next", 12));
    }

    #[test]
    fn test_complex_endings() {
        assert!(is_after_sentence_ending("(He said \"Yes.\") Next", 16));
        assert!(is_after_sentence_ending("\"End.\") Next", 7));
    }

    #[test]
    fn test_guillemets() {
        let content = "Il dit «Oui.» Next";
        // Byte 13 is right after the period (each guillemet is 2 bytes) ...
        assert!(is_after_sentence_ending(content, 13));
        // ... and this offset is after the closing guillemet itself, which is
        // the case that exercises closing-quote skipping.
        let pos = content.find(" Next").unwrap();
        assert!(is_after_sentence_ending(content, pos));
    }

    #[test]
    fn test_empty_and_edge_cases() {
        assert!(!is_after_sentence_ending("", 0));
        assert!(!is_after_sentence_ending(".", 0));
        assert!(!is_after_sentence_ending("a", 0));
    }

    #[test]
    fn test_latin_abbreviations() {
        assert!(!is_after_sentence_ending("i.e. example", 4));
        assert!(!is_after_sentence_ending("e.g. example", 4));
    }

    #[test]
    fn test_abbreviations_after_opening_punctuation() {
        assert!(!is_after_sentence_ending("(e.g. Wasm)", 5));
        assert!(!is_after_sentence_ending("(i.e. PyO3)", 5));
        assert!(!is_after_sentence_ending("[e.g. Chapter]", 5));
        assert!(!is_after_sentence_ending("(Dr. Smith)", 4));
        assert!(!is_after_sentence_ending("(\"e.g. something\")", 6));
    }

    #[test]
    fn test_after_inline_code() {
        assert!(is_after_sentence_ending("Hello from `backticks`. Next", 23));

        assert!(is_after_sentence_ending("`code`. Next", 7));

        assert!(is_after_sentence_ending("Use `foo` and `bar`. Next", 20));

        assert!(is_after_sentence_ending("`important`! Next", 12));

        assert!(is_after_sentence_ending("Is it `true`? Next", 13));

        assert!(is_after_sentence_ending("The `code` works. Next", 17));
    }

    #[test]
    fn test_after_inline_code_with_quotes() {
        assert!(is_after_sentence_ending("He said \"use `code`\". Next", 21));

        assert!(is_after_sentence_ending("(see `example`). Next", 16));
    }

    #[test]
    fn test_after_emphasis() {
        assert!(is_after_sentence_ending("The word is *important*. Next", 24));

        assert!(is_after_sentence_ending("The word is _important_. Next", 24));

        assert!(is_after_sentence_ending("This is *urgent*! Next", 17));

        assert!(is_after_sentence_ending("Is it _true_? Next", 13));
    }

    #[test]
    fn test_after_bold() {
        assert!(is_after_sentence_ending("The word is **critical**. Next", 25));

        assert!(is_after_sentence_ending("The word is __critical__. Next", 25));
    }

    #[test]
    fn test_after_strikethrough() {
        assert!(is_after_sentence_ending("This is ~~wrong~~. Next", 18));

        assert!(is_after_sentence_ending("That was ~~bad~~! Next", 17));
    }

    #[test]
    fn test_after_extended_markdown() {
        assert!(is_after_sentence_ending("This is ==highlighted==. Next", 24));

        assert!(is_after_sentence_ending("E equals mc^2^. Next", 15));
    }
}