use std::borrow::Cow;
use std::collections::{BTreeSet, HashSet};
use std::iter::Iterator;
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, FindIter, MatchKind};
use onig::{Captures, Error, Regex};
use unic_ucd_case::is_cased;
use crate::rule::Rule;
use crate::util::{re, re_i};
pub struct AbbreviationReplacer {
possessive_abbreviation_rule: Rule,
kommanditgesellschaft_rule: Rule,
single_letter_abbreviation_rules: [Rule; 2],
am_pm_rules: [Rule; 4],
python_splitlines_keepends: PythonSplitLines,
abbreviations: Vec<(&'static str, Regex, Regex)>,
prepositive_abbreviations: HashSet<&'static str>,
number_abbreviations: HashSet<&'static str>,
multi_period_abbreviation_regex: Regex,
replace_abbreviation_as_sentence_boundary: Rule,
}
#[rustfmt::skip]
const ABBREVIATIONS: &[&str] = &[
"adj", "adm", "adv", "al", "ala", "alta", "apr", "arc", "ariz", "ark", "art", "assn", "asst",
"attys", "aug", "ave", "bart", "bld", "bldg", "blvd", "brig", "bros", "btw", "cal", "calif",
"capt", "cl", "cmdr", "co", "col", "colo", "comdr", "con", "conn", "corp", "cpl", "cres", "ct",
"d.phil", "dak", "dec", "del", "dept", "det", "dist", "dr", "dr.phil", "dr.philos", "drs",
"e.g", "ens", "esp", "esq", "etc", "exp", "expy", "ext", "feb", "fed", "fla", "ft", "fwy",
"fy", "ga", "gen", "gov", "hon", "hosp", "hr", "hway", "hwy", "i.e", "ia", "id", "ida", "ill",
"inc", "ind", "ing", "insp", "is", "jan", "jr", "jul", "jun", "kan", "kans", "ken", "ky", "la",
"lt", "ltd", "maj", "man", "mar", "mass", "may", "md", "me", "med", "messrs", "mex", "mfg",
"mich", "min", "minn", "miss", "mlle", "mm", "mme", "mo", "mont", "mr", "mrs", "ms", "msgr",
"mssrs", "mt", "mtn", "neb", "nebr", "nev", "no", "nos", "nov", "nr", "oct", "ok", "okla",
"ont", "op", "ord", "ore", "p", "pa", "pd", "pde", "penn", "penna", "pfc", "ph", "ph.d", "pl",
"plz", "pp", "prof", "pvt", "que", "rd", "rs", "ref", "rep", "reps", "res", "rev", "rt",
"sask", "sec", "sen", "sens", "sep", "sept", "sfc", "sgt", "sr", "st", "supt", "surg", "tce",
"tenn", "tex", "univ", "usafa", "u.s", "ut", "va", "v", "ver", "viz", "vs", "vt", "wash",
"wis", "wisc", "wy", "wyo", "yuk", "fig",
];
const PREPOSITIVE_ABBREVIATIONS: &[&str] = &[
"adm", "attys", "brig", "capt", "cmdr", "col", "cpl", "det", "dr", "gen", "gov", "ing", "lt",
"maj", "mr", "mrs", "ms", "mt", "messrs", "mssrs", "prof", "ph", "rep", "reps", "rev", "sen",
"sens", "sgt", "st", "supt", "v", "vs", "fig",
];
const NUMBER_ABBREVIATIONS: &[&str] = &["art", "ext", "no", "nos", "p", "pp"];
impl AbbreviationReplacer {
pub fn new() -> Result<Self, Error> {
Ok(AbbreviationReplacer {
possessive_abbreviation_rule: Rule::new(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", "∯")?,
kommanditgesellschaft_rule: Rule::new(r"(?<=Co)\.(?=\sKG)", "∯")?,
single_letter_abbreviation_rules: [
Rule::new(r"(?<=^[A-Z])\.(?=\s)", "∯")?,
Rule::new(r"(?<=\s[A-Z])\.(?=,?\s)", "∯")?,
],
am_pm_rules: [
Rule::new(r"(?<= P∯M)∯(?=\s[A-Z])", ".")?,
Rule::new(r"(?<=A∯M)∯(?=\s[A-Z])", ".")?,
Rule::new(r"(?<=p∯m)∯(?=\s[A-Z])", ".")?,
Rule::new(r"(?<=a∯m)∯(?=\s[A-Z])", ".")?,
],
python_splitlines_keepends: PythonSplitLines::new(),
abbreviations: ABBREVIATIONS
.iter()
.map(|&abbr| -> Result<_, Error> {
let abbr_match = re_i(&format!(r"(?:^|\s|\r|\n){}", abbr))?;
let escaped = abbr.replace(r".", r"\.");
let next_word_start = re(&format!(r"(?<={{{}}} ).{{1}}", escaped))?;
Ok((abbr, abbr_match, next_word_start))
})
.collect::<Result<_, _>>()?,
prepositive_abbreviations: PREPOSITIVE_ABBREVIATIONS.iter().copied().collect(),
number_abbreviations: NUMBER_ABBREVIATIONS.iter().copied().collect(),
multi_period_abbreviation_regex: re_i(r"\b[a-z](?:\.[a-z])+[.]")?,
replace_abbreviation_as_sentence_boundary: Rule::new(
r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯((?=\sA\s)|(?=\sBeing\s)|(?=\sDid\s)|(?=\sFor\s)|(?=\sHe\s)|(?=\sHow\s)|(?=\sHowever\s)|(?=\sI\s)|(?=\sIn\s)|(?=\sIt\s)|(?=\sMillions\s)|(?=\sMore\s)|(?=\sShe\s)|(?=\sThat\s)|(?=\sThe\s)|(?=\sThere\s)|(?=\sThey\s)|(?=\sWe\s)|(?=\sWhat\s)|(?=\sWhen\s)|(?=\sWhere\s)|(?=\sWho\s)|(?=\sWhy\s))",
r"\1.",
)?,
})
}
pub fn replace(&self, text: &str) -> String {
let text = self.possessive_abbreviation_rule.replace_all(&text);
let mut text = self.kommanditgesellschaft_rule.replace_all(&text);
for rule in &self.single_letter_abbreviation_rules {
text = rule.replace_all(&text);
}
let text = {
let mut abbr_handled_text = String::new();
for line in self.python_splitlines_keepends.splitlines_keepends(&text) {
abbr_handled_text += &self.search_for_abbreviations_in_string(line);
}
abbr_handled_text
};
let mut text = self
.multi_period_abbreviation_regex
.replace_all(&text, |c: &Captures| {
let mat = c.at(0).unwrap(); mat.replace('.', "∯")
});
for rule in &self.am_pm_rules {
text = rule.replace_all(&text);
}
self.replace_abbreviation_as_sentence_boundary
.replace_all(&text)
}
fn search_for_abbreviations_in_string<'a>(&self, text: &'a str) -> Cow<'a, str> {
let lowered = text.to_lowercase();
let mut text = Cow::Borrowed(text);
for (abbr, abbr_match_regex, next_word_start_regex) in &self.abbreviations {
if !lowered.contains(abbr) {
continue;
}
let abbrev_match: Vec<_> = abbr_match_regex.find_iter(&text).collect();
if abbrev_match.is_empty() {
continue;
}
let char_array: Vec<_> = next_word_start_regex.find_iter(&text).collect();
let mut replace_locations = BTreeSet::new();
for (ind, range) in abbrev_match.into_iter().enumerate() {
let abbr = &text[range.0..range.1].trim();
let ch = char_array.get(ind).map(|r| &text[r.0..r.1]).unwrap_or("");
let upper = python_isupper(ch);
let abbr_lower = abbr.to_lowercase();
let abbr_lower = abbr_lower.as_str();
let is_prepositive = self.prepositive_abbreviations.contains(abbr_lower);
if !upper || is_prepositive {
let regex = if is_prepositive {
format!(r"(?<=\s{abbr})\.(?=(\s|:\d+))", abbr = abbr)
} else if self.number_abbreviations.contains(abbr_lower) {
format!(r"(?<=\s{abbr})\.(?=(\s\d|\s+\())", abbr = abbr)
} else {
format!(
r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))",
abbr = abbr
)
};
let prepended_text = format!(" {}", text);
replace_locations.extend(re(®ex).unwrap().find_iter(&prepended_text).map(
|r| r.0 - 1,
));
}
}
if !replace_locations.is_empty() {
let mut owned = text.into_owned();
for loc in replace_locations.into_iter().rev() {
owned.replace_range(loc..(loc + 1), "∯");
}
text = Cow::Owned(owned);
}
}
text
}
}
struct PythonSplitLines(AhoCorasick);
impl PythonSplitLines {
fn new() -> Self {
let newlines = &[
"\r\n", "\n", "\r", "\x0b", "\x0c", "\x1c", "\x1d", "\x1e", "\u{85}", "\u{2028}", "\u{2029}", ];
Self(
AhoCorasickBuilder::new()
.match_kind(MatchKind::LeftmostFirst)
.dfa(true)
.build(newlines),
)
}
fn splitlines_keepends<'a>(&self, input: &'a str) -> PythonSplitLinesKeepEnds<'_, 'a> {
PythonSplitLinesKeepEnds {
input,
last_index: 0,
searcher: self.0.find_iter(input),
}
}
}
struct PythonSplitLinesKeepEnds<'ac, 'input> {
input: &'input str,
last_index: usize,
searcher: FindIter<'ac, 'input, usize>,
}
impl<'ac, 'input> Iterator for PythonSplitLinesKeepEnds<'ac, 'input> {
type Item = &'input str;
fn next(&mut self) -> Option<Self::Item> {
match self.searcher.next() {
Some(mat) => {
let begin = self.last_index;
let end = mat.end();
self.last_index = end;
Some(&self.input[begin..end])
}
None => {
let last = self.last_index;
let len = self.input.len();
if last < len {
self.last_index = len;
Some(&self.input[last..len])
} else {
None
}
}
}
}
}
#[test]
fn test_python_splitlines_keepends() {
let splitter = PythonSplitLines::new();
let input = "x\nx\rx\r\nx\x0bx\x0cx\x1cx\x1dx\x1ex\u{85}x\u{2028}x\u{2029}";
let output = [
"x\n",
"x\r",
"x\r\n",
"x\x0b",
"x\x0c",
"x\x1c",
"x\x1d",
"x\x1e",
"x\u{85}",
"x\u{2028}",
"x\u{2029}",
];
assert_eq!(
splitter.splitlines_keepends(input).collect::<Vec<_>>(),
output
);
let input = "\n\na";
let output = ["\n", "\n", "a"];
assert_eq!(
splitter.splitlines_keepends(input).collect::<Vec<_>>(),
output
);
}
fn python_isupper(text: &str) -> bool {
let mut cased = false;
for c in text.chars() {
if is_cased(c) && c.is_uppercase() {
cased = true
} else if is_cased(c) && c.is_lowercase() {
return false;
}
}
cased
}
#[test]
fn test_python_isupper() {
assert!(!python_isupper("abc"));
assert!(!python_isupper("123"));
assert!(python_isupper("A_B"));
assert!(!python_isupper("a_b"));
assert!(python_isupper("A1"));
assert!(python_isupper("1A"));
assert!(!python_isupper("a1"));
assert!(!python_isupper("1a"));
assert!(!python_isupper("가나다a"));
assert!(python_isupper("가나다A"));
}
#[cfg(test)]
mod tests {
use super::*;
type TestResult = Result<(), Error>;
#[test]
fn regex_should_be_compiled() {
assert!(AbbreviationReplacer::new().is_ok())
}
#[test]
fn test_abbr_replace() -> TestResult {
let rep = AbbreviationReplacer::new()?;
assert_eq!(
rep.replace("Humana Inc. is including"),
"Humana Inc∯ is including"
);
Ok(())
}
#[test]
fn test_search_for_abbreviations_in_string() -> TestResult {
let rep = AbbreviationReplacer::new()?;
assert_eq!(
rep.search_for_abbreviations_in_string("Humana Inc. is including"),
"Humana Inc∯ is including"
);
Ok(())
}
}