mod abbreviation_replacer;
mod list_item_replacer;
mod rule;
mod util;
use std::borrow::Cow;
use std::error::Error;
use std::iter::Iterator;
use onig::{Captures, Regex};
use abbreviation_replacer::AbbreviationReplacer;
use list_item_replacer::ListItemReplacer;
use rule::Rule;
use util::re;
const PUNCTUATIONS: [char; 7] = ['。', '.', '.', '!', '!', '?', '?'];
pub struct Segmenter {
list_item_replacer: ListItemReplacer,
abbreviation_replacer: AbbreviationReplacer,
number_rules: [Rule; 5],
continuous_punctuation_regex: Regex,
numbered_reference: Rule,
abbreviation_with_multiple_periods_and_email_regex: regex::Regex,
misc_rules: [Rule; 2],
parens_between_double_quotes_regex: Regex,
parens_between_double_quotes_0: Rule,
parens_between_double_quotes_1: Rule,
ellipsis_rules: [Rule; 5],
exclamation_regex: Regex,
sub_escaped_regex_reserved_characters: [Rule; 5],
word_with_leading_apostrophe: Regex,
trailing_apostrophe: Regex,
between_single_quotes_regex: Regex,
between_single_quote_slanted_regex: Regex,
between_double_quotes_regex_2: Regex,
between_square_brackets_regex_2: Regex,
between_parens_regex_2: Regex,
between_quote_arrow_regex_2: Regex,
between_em_dashes_regex_2: Regex,
between_quote_slanted_regex_2: Regex,
double_punctuation: Regex,
question_mark_in_quotation_and_exclamation_point_rules: [Rule; 4],
replace_parens: Rule,
sentence_boundary_regex: Regex,
post_process_regex: Regex,
quotation_at_end_of_sentence_regex: Regex,
split_space_quotation_at_end_of_sentence_regex: Regex,
}
impl Segmenter {
pub fn new() -> Result<Self, Box<dyn Error>> {
Ok(Segmenter {
list_item_replacer: ListItemReplacer::new()?,
abbreviation_replacer: AbbreviationReplacer::new()?,
number_rules: [
Rule::new(r"\.(?=\d)", "∯")?,
Rule::new(r"(?<=\d)\.(?=\S)", "∯")?,
Rule::new(r"(?<=\r\d)\.(?=(\s\S)|\))", "∯")?,
Rule::new(r"(?<=^\d)\.(?=(\s\S)|\))", "∯")?,
Rule::new(r"(?<=^\d\d)\.(?=(\s\S)|\))", "∯")?,
],
continuous_punctuation_regex: re(r"(?<=\S)(!|\?){3,}(?=(\s|\Z|$))")?,
numbered_reference: Rule::new(
r"(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])",
r"∯\2\r\7",
)?,
abbreviation_with_multiple_periods_and_email_regex: regex::Regex::new(
r"([a-zA-Z0-9_])(?:\.)([a-zA-Z0-9_])",
)?,
misc_rules: [
Rule::new(r"(?<=[a-zA-z]°)\.(?=\s*\d+)", "∯")?,
Rule::new(
r"(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)",
"∯",
)?,
],
parens_between_double_quotes_regex: re(r#"["\”]\s\(.*\)\s["\“]"#)?,
parens_between_double_quotes_0: Rule::new(r"\s(?=\()", "\r")?,
parens_between_double_quotes_1: Rule::new(r"(?<=\))\s", "\r")?,
ellipsis_rules: [
Rule::new(r"(\s\.){3}\s", "♟♟♟♟♟♟♟")?,
Rule::new(r"(?<=[a-z])(\.\s){3}\.($|\\n)", "♝♝♝♝♝♝♝")?,
Rule::new(r"(?<=\S)\.{3}(?=\.\s[A-Z])", "ƪƪƪ")?,
Rule::new(r"\.\.\.(?=\s+[A-Z])", "☏☏.")?,
Rule::new(r"\.\.\.", "ƪƪƪ")?,
],
exclamation_regex: re(
r"!Xũ|!Kung|ǃʼOǃKung|!Xuun|!Kung\-Ekoka|ǃHu|ǃKhung|ǃKu|ǃung|ǃXo|ǃXû|ǃXung|ǃXũ|!Xun|Yahoo!|Y!J|Yum!",
)?,
sub_escaped_regex_reserved_characters: [
Rule::new(r"\\\(", "(")?,
Rule::new(r"\\\)", ")")?,
Rule::new(r"\\\[", "[")?,
Rule::new(r"\\\]", "]")?,
Rule::new(r"\\\-", "-")?,
],
word_with_leading_apostrophe: re(r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S")?,
trailing_apostrophe: re(r"'\s")?,
between_single_quotes_regex: re(r"(?<=\s)'(?:[^']|'[a-zA-Z])*'")?,
between_single_quote_slanted_regex: re(r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’")?,
between_double_quotes_regex_2: re(r#""(?=(?<tmp>[^\"\\]+|\\{2}|\\.)*)\k<tmp>""#)?,
between_square_brackets_regex_2: re(r#"\[(?=(?<tmp>[^\]\\]+|\\{2}|\\.)*)\k<tmp>\]"#)?,
between_parens_regex_2: re(r"\((?=(?<tmp>[^\(\)\\]+|\\{2}|\\.)*)\k<tmp>\)")?,
between_quote_arrow_regex_2: re(r"\«(?=(?<tmp>[^»\\]+|\\{2}|\\.)*)\k<tmp>\»")?,
between_em_dashes_regex_2: re(r"--(?=(?<tmp>[^--]*))\k<tmp>--")?,
between_quote_slanted_regex_2: re(r"\“(?=(?<tmp>[^”\\]+|\\{2}|\\.)*)\k<tmp>\”")?,
double_punctuation: re(r"^(?:\?!|!\?|\?\?|!!)")?,
question_mark_in_quotation_and_exclamation_point_rules: [
Rule::new(r#"\?(?=(\'|\"))"#, "&ᓷ&")?,
Rule::new(r#"\!(?=(\'|\"))"#, "&ᓴ&")?,
Rule::new(r"\!(?=\,\s[a-z])", "&ᓴ&")?,
Rule::new(r"\!(?=\s[a-z])", "&ᓴ&")?,
],
replace_parens: Rule::new(
r"\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])",
r"&✂&\1&⌬&",
)?,
sentence_boundary_regex: re(
r#"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!??].*|\S.*?[。..!!??ȸȹ☉☈☇☄]"#,
)?,
post_process_regex: re(r"\A[a-zA-Z]*\Z")?,
quotation_at_end_of_sentence_regex: re(r#"[!?\.-][\"\'“”]\s{1}[A-Z]"#)?,
split_space_quotation_at_end_of_sentence_regex: re(
r#"(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])"#,
)?,
})
}
pub fn segment<'a>(&'a self, original_input: &'a str) -> impl Iterator<Item = &'a str> {
let text = original_input.replace('\n', "\r");
let text = self.list_item_replacer.add_line_break(&text);
let mut text = self.abbreviation_replacer.replace(&text);
for rule in &self.number_rules {
text = rule.replace_all(&text);
}
let text = self
.continuous_punctuation_regex
.replace_all(&text, |c: &Captures| {
let mat = c.at(0).unwrap();
mat.replace('!', "&ᓴ&").replace('?', "&ᓷ&")
});
let text = self.numbered_reference.replace_all(&text);
let mut text = self
.abbreviation_with_multiple_periods_and_email_regex
.replace_all(&text, "$1∮$2");
for rule in &self.misc_rules {
text = Cow::Owned(rule.replace_all(&text));
}
let text = self
.parens_between_double_quotes_regex
.replace_all(&text, |c: &Captures| {
let mat = c.at(0).unwrap();
let mat = self.parens_between_double_quotes_0.replace_all(mat);
self.parens_between_double_quotes_1.replace_all(&mat)
});
let mut prior_start_char_idx = 0;
text.split('\r')
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.collect::<Vec<_>>()
.into_iter()
.flat_map(move |sent| {
let mut sent = sent.replace(r"\n", "ȹ");
for rule in &self.ellipsis_rules {
sent = rule.replace_all(&sent);
}
if PUNCTUATIONS.iter().any(|&p| sent.contains(p)) {
if !sent.ends_with(&PUNCTUATIONS[..]) {
sent += "ȸ";
}
sent = self
.exclamation_regex
.replace_all(&sent, self.replace_punctuation(false));
if self.word_with_leading_apostrophe.find(&sent).is_none()
|| self.trailing_apostrophe.find(&sent).is_some()
{
sent = self
.between_single_quotes_regex
.replace_all(&sent, self.replace_punctuation(true));
}
sent = self
.between_single_quote_slanted_regex
.replace_all(&sent, self.replace_punctuation(false));
sent = self
.between_double_quotes_regex_2
.replace_all(&sent, self.replace_punctuation(false));
sent = self
.between_square_brackets_regex_2
.replace_all(&sent, self.replace_punctuation(false));
sent = self
.between_parens_regex_2
.replace_all(&sent, self.replace_punctuation(false));
sent = self
.between_quote_arrow_regex_2
.replace_all(&sent, self.replace_punctuation(false));
sent = self
.between_em_dashes_regex_2
.replace_all(&sent, self.replace_punctuation(false));
sent = self
.between_quote_slanted_regex_2
.replace_all(&sent, self.replace_punctuation(false));
if self.double_punctuation.find(&sent).is_none() {
sent = sent
.replace(r"?!", "☉")
.replace(r"!?", "☈")
.replace(r"??", "☇")
.replace(r"!!", "☄");
}
for rule in &self.question_mark_in_quotation_and_exclamation_point_rules {
sent = rule.replace_all(&sent);
}
sent = self.replace_parens.replace_all(&sent);
sent = sent.replace(r"&ᓴ&$", "!");
self.sentence_boundary_regex
.find_iter(&sent)
.map(|r| sent[r.0..r.1].to_string())
.collect::<Vec<_>>()
} else {
vec![sent]
}
})
.flat_map(move |mut sent| {
sent = sent
.replace(r"∯", ".")
.replace(r"♬", "،")
.replace(r"♭", ":")
.replace(r"&ᓰ&", "。")
.replace(r"&ᓱ&", ".")
.replace(r"&ᓳ&", "!")
.replace(r"&ᓴ&", "!")
.replace(r"&ᓷ&", "?")
.replace(r"&ᓸ&", "?")
.replace(r"☉", "?!")
.replace(r"☇", "??")
.replace(r"☈", "!?")
.replace(r"☄", "!!")
.replace(r"&✂&", "(")
.replace(r"&⌬&", ")")
.replace(r"ȸ", "")
.replace(r"ȹ", "\n");
if sent.len() > 2 && self.post_process_regex.find(&sent).is_some() {
return vec![sent];
}
sent = sent
.replace(r"ƪƪƪ", "...")
.replace(r"♟♟♟♟♟♟♟", " . . . ")
.replace(r"♝♝♝♝♝♝♝", ". . . .")
.replace(r"☏☏", "..")
.replace(r"∮", ".");
if self
.quotation_at_end_of_sentence_regex
.find(&sent)
.is_some()
{
self.split_space_quotation_at_end_of_sentence_regex
.split(&sent)
.map(|s| s.to_string())
.collect()
} else {
vec![sent.replace("\n", "").trim().to_string()]
}
})
.map(|sent| sent.replace(r"&⎋&", "'"))
.flat_map(move |sent| -> Vec<_> {
let re = regex::Regex::new(&format!(r"{}\s*", regex::escape(&sent))).unwrap();
re.find_iter(original_input)
.filter_map(|mat| {
let match_str = mat.as_str();
let match_start_idx = mat.start();
if match_start_idx >= prior_start_char_idx {
prior_start_char_idx = match_start_idx;
Some(match_str)
} else {
None
}
})
.collect()
})
}
fn replace_punctuation(&self, is_match_type_single: bool) -> impl Fn(&Captures) -> String + '_ {
move |c: &Captures| {
let mat = c.at(0).unwrap();
let mut mat = mat.replace('.', "∯");
mat = mat.replace('。', "&ᓰ&");
mat = mat.replace('.', "&ᓱ&");
mat = mat.replace('!', "&ᓳ&");
mat = mat.replace('!', "&ᓴ&");
mat = mat.replace('?', "&ᓷ&");
mat = mat.replace('?', "&ᓸ&");
if !is_match_type_single {
mat = mat.replace("'", "&⎋&");
}
for rule in &self.sub_escaped_regex_reserved_characters {
mat = rule.replace_all(&mat);
}
mat
}
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    /// Lets a test propagate construction failures with `?`.
    type TestResult = Result<(), Box<dyn Error>>;

    /// Constructing a `Segmenter` must succeed; a failure is reported as the test error.
    #[test]
    fn regex_should_be_compiled() -> TestResult {
        Segmenter::new().map(|_| ())
    }

    /// Segmenting the empty string yields no items.
    #[test]
    fn empty_string() -> TestResult {
        let segmenter = Segmenter::new()?;
        let sentences = segmenter.segment("").collect::<Vec<_>>();
        let none: [String; 0] = [];
        assert_eq!(sentences, none);
        Ok(())
    }
}