use std::collections::HashSet;
/// Built-in abbreviations, stored lowercase and without their trailing period.
///
/// Entries may contain interior periods (`"i.e"`, `"e.g"`); only the final
/// period is omitted.
pub const DEFAULT_ABBREVIATIONS: &[&str] = &[
    // Titles and name suffixes.
    "mr",
    "mrs",
    "ms",
    "dr",
    "prof",
    "sr",
    "jr",
    // Latin shorthands.
    "i.e",
    "e.g",
    "vs",
    // Reference and citation labels.
    "fig",
    "no",
    "vol",
    "ch",
    "sec",
    "al",
];
/// Builds the abbreviation set used to suppress false sentence breaks.
///
/// The result always contains every entry of [`DEFAULT_ABBREVIATIONS`]
/// (lowercased). When `custom` is `Some`, each of its entries is normalized
/// before insertion: surrounding whitespace is trimmed, trailing periods are
/// removed, and the remainder is lowercased. Entries that normalize to an
/// empty string are skipped.
pub fn get_abbreviations(custom: &Option<Vec<String>>) -> HashSet<String> {
    let mut abbreviations: HashSet<String> = DEFAULT_ABBREVIATIONS
        .iter()
        .map(|abbr| abbr.to_lowercase())
        .collect();

    if let Some(custom_list) = custom {
        abbreviations.extend(
            custom_list
                .iter()
                // Trim whitespace before stripping periods: lookups compare
                // against a single whitespace-delimited word, so an entry such
                // as "Ltd. " would otherwise be stored as "ltd. " and never match.
                .map(|abbr| abbr.trim().trim_end_matches('.').to_lowercase())
                .filter(|normalized| !normalized.is_empty()),
        );
    }

    abbreviations
}
/// Reports whether `text` ends with a known abbreviation followed by a period.
///
/// `text` must end with `.`; otherwise the answer is `false`. All trailing
/// periods are removed, the last whitespace-delimited word is taken, leading
/// characters that are neither alphanumeric nor `.` (for example `(`, `[`,
/// `"` or `*`) are dropped, and the lowercased remainder is looked up in
/// `abbreviations`.
pub fn text_ends_with_abbreviation(text: &str, abbreviations: &HashSet<String>) -> bool {
    if !text.ends_with('.') {
        return false;
    }

    // Only the final word matters, so take it from the back of the iterator.
    let final_word = text.trim_end_matches('.').split_whitespace().next_back();

    match final_word {
        Some(word) => {
            // Drop opening punctuation or markup glued to the word, keeping
            // periods so entries such as "e.g" can still match.
            let candidate = word.trim_start_matches(|c: char| !(c.is_alphanumeric() || c == '.'));
            abbreviations.contains(&candidate.to_lowercase())
        }
        // Nothing but periods and/or whitespace.
        None => false,
    }
}
/// Returns `true` for full-width (CJK) sentence terminators: the ideographic
/// full stop (U+3002), the full-width exclamation mark (U+FF01) and the
/// full-width question mark (U+FF1F).
///
/// The ASCII characters `.`, `!` and `?` are intentionally not matched here.
pub fn is_cjk_sentence_ending(c: char) -> bool {
    // Code-point escapes keep the full-width marks from being confused with,
    // or silently normalized to, their ASCII look-alikes.
    matches!(c, '\u{3002}' | '\u{FF01}' | '\u{FF1F}')
}
/// Returns `true` when `c` can close a quotation: a straight double or single
/// quote, a right curly double or single quote, or a right-pointing guillemet
/// (double or single).
pub fn is_closing_quote(c: char) -> bool {
    const CLOSING_QUOTES: [char; 6] = ['"', '\'', '\u{201D}', '\u{2019}', '»', '›'];
    CLOSING_QUOTES.contains(&c)
}
/// Returns `true` when `c` can open a quotation: a straight double or single
/// quote, a left curly double or single quote, or a left-pointing guillemet
/// (double or single).
pub fn is_opening_quote(c: char) -> bool {
    const OPENING_QUOTES: [char; 6] = ['"', '\'', '\u{201C}', '\u{2018}', '«', '‹'];
    OPENING_QUOTES.contains(&c)
}
/// Returns `true` when `c` lies in one of the CJK code-point ranges listed in
/// the body (ideographs, kana and Hangul syllables).
pub fn is_cjk_char(c: char) -> bool {
    // Inclusive (first, last) code-point ranges.
    const CJK_RANGES: [(char, char); 5] = [
        ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
        ('\u{3040}', '\u{309F}'), // Hiragana
        ('\u{30A0}', '\u{30FF}'), // Katakana
        ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
/// Returns `true` when `c` terminates a sentence: ASCII `.`, `!` or `?`, or
/// any character accepted by [`is_cjk_sentence_ending`].
pub fn is_sentence_ending_punctuation(c: char) -> bool {
    match c {
        '.' | '!' | '?' => true,
        other => is_cjk_sentence_ending(other),
    }
}
/// Returns `true` when `c` is a closing bracket (`)`, `]`, `}`) or any
/// character accepted by [`is_closing_quote`].
pub fn is_trailing_close_punctuation(c: char) -> bool {
    match c {
        ')' | ']' | '}' => true,
        other => is_closing_quote(other),
    }
}
/// Convenience wrapper around [`is_after_sentence_ending_with_abbreviations`]
/// that uses only the default abbreviation set.
///
/// `match_start` is a byte offset into `text`. The default set is rebuilt on
/// every call.
pub fn is_after_sentence_ending(text: &str, match_start: usize) -> bool {
    let default_abbreviations = get_abbreviations(&None);
    is_after_sentence_ending_with_abbreviations(text, match_start, &default_abbreviations)
}
/// Decides whether the text immediately before byte offset `match_start`
/// ends a sentence.
///
/// Trailing closing quotes and brackets are skipped first. The character in
/// front of them then decides:
///
/// * a CJK terminator, `!` or `?` ends a sentence;
/// * a `.` preceded by two more periods (an ellipsis) ends a sentence;
/// * a `.` that completes an entry of `abbreviations` does not;
/// * a `.` after a lone ASCII uppercase letter (at the very start, or
///   preceded by whitespace) is treated as an initial and does not;
/// * any other `.` ends a sentence only when the character before it is
///   alphanumeric, a closing quote, `)`, `]`, one of the markers
///   `` ` `` `*` `_` `~` `=` `^`, or a CJK character.
///
/// Returns `false` when `match_start` is `0`, lies past the end of `text`,
/// or does not fall on a UTF-8 character boundary.
pub fn is_after_sentence_ending_with_abbreviations(
    text: &str,
    match_start: usize,
    abbreviations: &HashSet<String>,
) -> bool {
    // `get` yields `None` for out-of-range offsets and for offsets inside a
    // multi-byte character; an empty prefix means there is nothing to inspect.
    let preceding: Vec<char> = match text.get(..match_start) {
        Some(prefix) if !prefix.is_empty() => prefix.chars().collect(),
        _ => return false,
    };

    // Index of the last character that is not closing punctuation. When every
    // character is closing punctuation, settle on index 0.
    let pos = preceding
        .iter()
        .rposition(|&ch| !is_trailing_close_punctuation(ch))
        .unwrap_or(0);
    let terminator = preceding[pos];

    if is_cjk_sentence_ending(terminator) || matches!(terminator, '!' | '?') {
        return true;
    }
    if terminator != '.' {
        return false;
    }

    // From here on the terminator is a period.

    // Two more periods directly before it: an ellipsis.
    if preceding[..pos].ends_with(&['.', '.']) {
        return true;
    }

    // Everything up to and including the period, for the abbreviation lookup.
    let through_period: String = preceding[..=pos].iter().collect();
    if text_ends_with_abbreviation(&through_period, abbreviations) {
        return false;
    }

    // A period with nothing in front of it.
    if pos == 0 {
        return false;
    }
    let prev = preceding[pos - 1];

    // A lone uppercase letter ("A." at the start, or " A.") reads as an initial.
    if prev.is_ascii_uppercase() && (pos < 2 || preceding[pos - 2].is_whitespace()) {
        return false;
    }

    prev.is_alphanumeric()
        || is_closing_quote(prev)
        || matches!(prev, ')' | ']' | '`' | '*' | '_' | '~' | '=' | '^')
        || is_cjk_char(prev)
}
/// Unit tests for the sentence-boundary helpers.
///
/// Every `match_start` argument is a byte offset into the sample text.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_abbreviations_default() {
        let abbrevs = get_abbreviations(&None);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("mr"));
        assert!(abbrevs.contains("prof"));
        assert!(abbrevs.contains("i.e"));
        assert!(abbrevs.contains("e.g"));
    }

    #[test]
    fn test_get_abbreviations_custom() {
        let custom = Some(vec!["Corp".to_string(), "Ltd.".to_string()]);
        let abbrevs = get_abbreviations(&custom);
        assert!(abbrevs.contains("dr"));
        assert!(abbrevs.contains("corp"));
        assert!(abbrevs.contains("ltd"));
    }

    #[test]
    fn test_text_ends_with_abbreviation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Hello Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("Prof.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Doctor.", &abbrevs));
        assert!(!text_ends_with_abbreviation("Dr?", &abbrevs));
        assert!(!text_ends_with_abbreviation("paradigms.", &abbrevs));
    }

    #[test]
    fn test_text_ends_with_abbreviation_after_punctuation() {
        let abbrevs = get_abbreviations(&None);
        assert!(text_ends_with_abbreviation("(e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("word (i.e.", &abbrevs));
        assert!(text_ends_with_abbreviation("[e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("[Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("\"Dr.", &abbrevs));
        assert!(text_ends_with_abbreviation("*e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("**e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("(\"e.g.", &abbrevs));
        assert!(text_ends_with_abbreviation("([Dr.", &abbrevs));
        assert!(!text_ends_with_abbreviation("(paradigms.", &abbrevs));
        assert!(!text_ends_with_abbreviation("[Doctor.", &abbrevs));
    }

    #[test]
    fn test_is_closing_quote() {
        assert!(is_closing_quote('"'));
        assert!(is_closing_quote('\''));
        assert!(is_closing_quote('\u{201D}'));
        assert!(is_closing_quote('\u{2019}'));
        assert!(is_closing_quote('»'));
        assert!(is_closing_quote('›'));
        assert!(!is_closing_quote('a'));
        assert!(!is_closing_quote('.'));
    }

    #[test]
    fn test_is_cjk_sentence_ending() {
        // Full-width marks are written as code-point escapes so they cannot be
        // mistaken for the ASCII characters checked below.
        assert!(is_cjk_sentence_ending('\u{3002}')); // ideographic full stop
        assert!(is_cjk_sentence_ending('\u{FF01}')); // full-width exclamation mark
        assert!(is_cjk_sentence_ending('\u{FF1F}')); // full-width question mark
        assert!(!is_cjk_sentence_ending('.'));
        assert!(!is_cjk_sentence_ending('!'));
        assert!(!is_cjk_sentence_ending('?'));
    }

    #[test]
    fn test_is_cjk_char() {
        assert!(is_cjk_char('中'));
        assert!(is_cjk_char('あ'));
        assert!(is_cjk_char('ア'));
        assert!(is_cjk_char('한'));
        assert!(!is_cjk_char('a'));
        assert!(!is_cjk_char('A'));
    }

    #[test]
    fn test_after_period() {
        assert!(is_after_sentence_ending("Hello. ", 6));
        assert!(is_after_sentence_ending("End of sentence. Next", 16));
    }

    #[test]
    fn test_after_exclamation() {
        assert!(is_after_sentence_ending("Wow! ", 4));
        assert!(is_after_sentence_ending("Great! Next", 6));
    }

    #[test]
    fn test_after_question() {
        assert!(is_after_sentence_ending("Really? ", 7));
        assert!(is_after_sentence_ending("What? Next", 5));
    }

    #[test]
    fn test_after_closing_quote() {
        assert!(is_after_sentence_ending("He said \"Hello.\" Next", 16));
        assert!(is_after_sentence_ending("She said 'Hi.' Next", 14));
    }

    #[test]
    fn test_after_curly_quotes() {
        let content = format!("He said {}Hello.{} Next", '\u{201C}', '\u{201D}');
        // Locate the space that follows the closing quote; the first space in
        // the sentence comes right after "He".
        let pos = content.find(" Next").unwrap();
        assert!(is_after_sentence_ending(&content, pos));
    }

    #[test]
    fn test_after_closing_paren() {
        assert!(is_after_sentence_ending("(See note.) Next", 11));
        assert!(is_after_sentence_ending("(Really!) Next", 9));
    }

    #[test]
    fn test_after_closing_bracket() {
        assert!(is_after_sentence_ending("[Citation.] Next", 11));
    }

    #[test]
    fn test_after_ellipsis() {
        assert!(is_after_sentence_ending("And so... Next", 9));
        assert!(is_after_sentence_ending("Hmm... Let me think", 6));
    }

    #[test]
    fn test_not_after_abbreviation() {
        assert!(!is_after_sentence_ending("Dr. Smith", 3));
        assert!(!is_after_sentence_ending("Mr. Jones", 3));
        assert!(!is_after_sentence_ending("Prof. Williams", 5));
    }

    #[test]
    fn test_not_after_single_initial() {
        assert!(!is_after_sentence_ending("John A. Smith", 7));
        assert!(is_after_sentence_ending("letter a. Next", 9));
    }

    #[test]
    fn test_mid_sentence_not_detected() {
        assert!(!is_after_sentence_ending("word word", 4));
        assert!(!is_after_sentence_ending("multiple spaces", 8));
    }

    #[test]
    fn test_cjk_sentence_ending() {
        // Each CJK character and each full-width mark occupies 3 bytes in
        // UTF-8, which is what the offsets below assume.
        assert!(is_after_sentence_ending("日本語\u{3002} Next", 12));
        assert!(is_after_sentence_ending("中文\u{FF01} Next", 9));
        assert!(is_after_sentence_ending("한국어\u{FF1F} Next", 12));
    }

    #[test]
    fn test_complex_endings() {
        assert!(is_after_sentence_ending("(He said \"Yes.\") Next", 16));
        assert!(is_after_sentence_ending("\"End.\") Next", 7));
    }

    #[test]
    fn test_guillemets() {
        assert!(is_after_sentence_ending("Il dit «Oui.» Next", 13));
    }

    #[test]
    fn test_empty_and_edge_cases() {
        assert!(!is_after_sentence_ending("", 0));
        assert!(!is_after_sentence_ending(".", 0));
        assert!(!is_after_sentence_ending("a", 0));
    }

    #[test]
    fn test_latin_abbreviations() {
        assert!(!is_after_sentence_ending("i.e. example", 4));
        assert!(!is_after_sentence_ending("e.g. example", 4));
    }

    #[test]
    fn test_abbreviations_after_opening_punctuation() {
        assert!(!is_after_sentence_ending("(e.g. Wasm)", 5));
        assert!(!is_after_sentence_ending("(i.e. PyO3)", 5));
        assert!(!is_after_sentence_ending("[e.g. Chapter]", 5));
        assert!(!is_after_sentence_ending("(Dr. Smith)", 4));
        assert!(!is_after_sentence_ending("(\"e.g. something\")", 6));
    }

    #[test]
    fn test_after_inline_code() {
        assert!(is_after_sentence_ending("Hello from `backticks`. Next", 23));
        assert!(is_after_sentence_ending("`code`. Next", 7));
        assert!(is_after_sentence_ending("Use `foo` and `bar`. Next", 20));
        assert!(is_after_sentence_ending("`important`! Next", 12));
        assert!(is_after_sentence_ending("Is it `true`? Next", 13));
        assert!(is_after_sentence_ending("The `code` works. Next", 17));
    }

    #[test]
    fn test_after_inline_code_with_quotes() {
        assert!(is_after_sentence_ending("He said \"use `code`\". Next", 21));
        assert!(is_after_sentence_ending("(see `example`). Next", 16));
    }

    #[test]
    fn test_after_emphasis() {
        assert!(is_after_sentence_ending("The word is *important*. Next", 24));
        assert!(is_after_sentence_ending("The word is _important_. Next", 24));
        assert!(is_after_sentence_ending("This is *urgent*! Next", 17));
        assert!(is_after_sentence_ending("Is it _true_? Next", 13));
    }

    #[test]
    fn test_after_bold() {
        assert!(is_after_sentence_ending("The word is **critical**. Next", 25));
        assert!(is_after_sentence_ending("The word is __critical__. Next", 25));
    }

    #[test]
    fn test_after_strikethrough() {
        assert!(is_after_sentence_ending("This is ~~wrong~~. Next", 18));
        assert!(is_after_sentence_ending("That was ~~bad~~! Next", 17));
    }

    #[test]
    fn test_after_extended_markdown() {
        assert!(is_after_sentence_ending("This is ==highlighted==. Next", 24));
        assert!(is_after_sentence_ending("E equals mc^2^. Next", 15));
    }
}