use regex::{Regex, RegexBuilder};
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::io::{BufRead, BufReader, Read};
use std::sync::{LazyLock, Mutex};
/// Languages the tokenizer distinguishes when choosing its punctuation and clitic rules.
///
/// A value is converted into a `LanguageConfig` through the `From<Language>` implementation,
/// and can be obtained from a language-code string through `From<S: AsRef<str>>`.
pub(super) enum Language {
/// No language-specific rules: default punctuation sets, no clitic patterns.
Unknown,
/// English (code prefix `en`): default punctuation plus trailing clitic patterns.
English,
/// Romanian (code prefix `ro`): uses its own leading/trailing punctuation sets.
Romanian,
/// Italian (code prefix `it`): default punctuation plus leading clitic patterns.
Italian,
/// French (code prefix `fr`): default punctuation plus leading and trailing clitic patterns.
French,
/// Portuguese (code prefix `pt`): default punctuation plus trailing clitic patterns.
Portuguese,
/// Galician (code prefix `gl`): default punctuation plus trailing clitic patterns.
Galician,
/// Catalan (code prefix `ca`): default punctuation plus leading and trailing clitic patterns.
Catalan,
}
impl From<Language> for LanguageConfig {
    /// Builds the tokenizer settings for `value`.
    ///
    /// Every language starts from the same default sets of leading (`p_char`) and trailing
    /// (`f_char`) punctuation; Romanian replaces both sets, and the remaining languages only
    /// contribute leading (`p_clitic`) and/or trailing (`f_clitic`) clitic patterns.
    /// The abbreviation set always starts out empty.
    fn from(value: Language) -> Self {
        // Regex character-class bodies shared by every language except Romanian.
        const DEFAULT_P_CHAR: &str = r#"\[¿¡{'`"‚„†‡‹‘’“”•–—›»«"#;
        const DEFAULT_F_CHAR: &str = r#"\]}'`",;:!?؟%‚„…†‡‰‹‘’“”•–—›»«"#;

        // (p_char, f_char, p_clitic, f_clitic)
        let (leading_chars, trailing_chars, leading_clitic, trailing_clitic) = match value {
            Language::Unknown => (DEFAULT_P_CHAR, DEFAULT_F_CHAR, "", ""),
            Language::English => (
                DEFAULT_P_CHAR,
                DEFAULT_F_CHAR,
                "",
                "['’´](s|re|ve|d|m|em|ll)|n['’´]t",
            ),
            Language::Romanian => (
                r#"\[¿¡{`"‚„†‡‹‘’“”•–—›»«"#,
                r#"\]}`",;:!?\%‚„…†‡‰‹‘’“”•–—›»«"#,
                "",
                "",
            ),
            Language::Italian => (
                DEFAULT_P_CHAR,
                DEFAULT_F_CHAR,
                "(?:d[ae]ll|nell|all|[ld]|sull|quest|un|senz|tutt|c|s)['´’]",
                "",
            ),
            Language::French => (
                DEFAULT_P_CHAR,
                DEFAULT_F_CHAR,
                "(?:[dcjlmnst]|qu|jusqu|lorsqu|quoiqu|puisqu)['’´]",
                "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mêmes?|-m['’´]|-moi|-nous|-on|-toi|-tu|-t['’´]|-vous|-en|-y|-ci|-là",
            ),
            Language::Portuguese => (
                DEFAULT_P_CHAR,
                DEFAULT_F_CHAR,
                "",
                "-a|-as|-la|-las|-lha|-lhas|-lhe|-lhes|-lho|-lhos|-lo|-los|-ma|-mas|-me|-mo|-mos|-na|-nas|-no|-no-la|-no-las|-no-lo|-no-los|-nos|-o|-os|-s|-se|-se-á|-se-ão|-se-é|-se-ia|-se-lha|-se-lhas|-se-lhe|-se-lhes|-se-lho|-se-lhos|-se-nos|-se-vos|-ta|-tas|-te|-to|-tos|-vo-la|-vo-las|-vo-lo|-vo-los|-vos",
            ),
            Language::Galician => (
                DEFAULT_P_CHAR,
                DEFAULT_F_CHAR,
                "",
                "-la|-las|-lo|-los|-nos",
            ),
            Language::Catalan => (
                DEFAULT_P_CHAR,
                DEFAULT_F_CHAR,
                "[dlmnst]['’´]",
                "['’´](n|s|ls|l|hi|ns|t|m|ho)|-(se|lo|la|li|los|les|hi|ho|ne|nos|me|s|te|m)",
            ),
        };

        LanguageConfig {
            p_char: leading_chars.to_owned(),
            f_char: trailing_chars.to_owned(),
            p_clitic: leading_clitic.to_owned(),
            f_clitic: trailing_clitic.to_owned(),
            abbreviations: HashSet::new(),
        }
    }
}
impl<S> From<S> for Language
where
    S: AsRef<str>,
{
    /// Picks the language from the first two bytes of a language code such as `"en"`,
    /// `"pt-BR"` or `"fr_FR"`.
    ///
    /// Matching is case-sensitive. Anything that does not start with one of the known
    /// two-letter prefixes — including empty, one-byte and non-ASCII-leading input — maps to
    /// [`Language::Unknown`].
    fn from(value: S) -> Self {
        // `str::get` yields `None` when the string is shorter than two bytes *or* when byte 2
        // falls inside a multi-byte character, so this never panics the way direct slicing
        // (`[0..2]`) does on input like "aé".
        match value.as_ref().get(..2) {
            Some("ca") => Language::Catalan,
            Some("en") => Language::English,
            Some("fr") => Language::French,
            Some("gl") => Language::Galician,
            Some("it") => Language::Italian,
            Some("pt") => Language::Portuguese,
            Some("ro") => Language::Romanian,
            _ => Language::Unknown,
        }
    }
}
/// Language-dependent settings consumed by `TreeTaggerTokenizer::tokenize`.
///
/// The string fields are regex fragments that `tokenize` interpolates into larger patterns.
#[derive(Clone)]
struct LanguageConfig {
/// Body of a regex character class listing characters split off the start of a token.
p_char: String,
/// Body of a regex character class listing characters split off the end of a token.
f_char: String,
/// Regex alternation for clitics split off the start of a token (matched
/// case-insensitively); an empty string disables this step.
p_clitic: String,
/// Regex alternation for clitics split off the end of a token (matched
/// case-insensitively); an empty string disables this step.
f_clitic: String,
/// Tokens that `tokenize` emits as-is, without clitic splitting. The `From<Language>`
/// conversion leaves this set empty.
abbreviations: HashSet<String>,
}
/// Marker character (U+0179) that `tokenize` substitutes for segment boundaries before
/// splitting the normalised input on it.
const SPLIT_MARKER: char = '\u{0179}';
/// The same marker as [`SPLIT_MARKER`], as a string slice for APIs that take `&str`.
const SPLIT_MARKER_STR: &str = "\u{0179}";
/// Process-wide, mutex-guarded cache of compiled regexes, looked up by string key.
static COMPILED_REGEX_CACHE: LazyLock<Mutex<HashMap<String, Regex>>> =
LazyLock::new(Mutex::default);
/// Returns the compiled regex for pattern `p`, compiling it on first use and storing it in
/// `COMPILED_REGEX_CACHE` under the pattern text.
///
/// # Errors
///
/// Fails if the cache mutex cannot be locked or if `p` is not a valid regex.
fn cached_regex(p: &str) -> crate::error::Result<Regex> {
    let mut cache = COMPILED_REGEX_CACHE.lock()?;

    // Fast path: hand out a clone of the already compiled regex.
    if let Some(hit) = cache.get(p).cloned() {
        return Ok(hit);
    }

    let fresh = Regex::new(p)?;
    cache.insert(p.to_string(), fresh.clone());
    Ok(fresh)
}
fn cached_regex_case_insensitive(p: &str) -> crate::error::Result<Regex> {
let mut cache = COMPILED_REGEX_CACHE.lock()?;
if let Some(existing) = cache.get(p) {
Ok(existing.clone())
} else {
let compiled = RegexBuilder::new(p).case_insensitive(true).build()?;
cache.insert(p.to_string(), compiled.clone());
Ok(compiled)
}
}
/// Tokenizer whose splitting rules are driven by a per-language `LanguageConfig`.
#[derive(Clone)]
pub(super) struct TreeTaggerTokenizer {
/// Punctuation sets, clitic patterns and abbreviations used while tokenizing.
config: LanguageConfig,
}
/// A single token produced by the tokenizer.
#[derive(Clone)]
pub(super) struct Token {
/// The token text.
pub value: String,
/// Whitespace that followed the token, if recorded; `Token::new_val` leaves this as `None`.
pub whitespace_after: Option<String>,
}
impl Token {
fn new_val<S: ToString>(value: S) -> Self {
Self {
value: value.to_string(),
whitespace_after: None,
}
}
}
impl TreeTaggerTokenizer {
pub(super) fn new(language: Language) -> anyhow::Result<Self> {
let config: LanguageConfig = language.into();
Ok(Self { config })
}
pub(super) fn tokenize<R: Read>(&self, reader: R) -> anyhow::Result<Vec<Token>> {
let mut result = Vec::new();
let mut buffered_reader = BufReader::new(reader);
let mut line = String::new();
let mut is_first_line = true;
while buffered_reader.read_line(&mut line)? > 0 {
if is_first_line {
line = cached_regex("^\u{FEFF}")?.replace(&line, "").to_string();
is_first_line = false;
}
}
line = cached_regex("[\n\t]")?.replace_all(&line, " ").to_string();
while let Cow::Owned(new_line) =
cached_regex("(<[^<> ]*) ([^<>]*>)")?.replace_all(&line, "${1}\u{0179}${2}")
{
line = new_line;
}
line = line.replace(' ', "\u{178}");
line = line
.replace('\u{0179}', " ")
.replace('\u{178}', SPLIT_MARKER_STR);
line = cached_regex("(<[^<>]*>)")?
.replace_all(&line, &format!("{SPLIT_MARKER}$1{SPLIT_MARKER}"))
.to_string();
line = line.trim_matches(SPLIT_MARKER).to_string();
line = cached_regex(&format!("{SPLIT_MARKER}{SPLIT_MARKER}{SPLIT_MARKER}*"))?
.replace_all(&line, SPLIT_MARKER_STR)
.to_string();
for segment in line.split(SPLIT_MARKER) {
let mut segment = segment.to_string();
if cached_regex("^<.*>$")?.is_match(&segment) {
result.push(Token::new_val(segment));
} else {
segment = cached_regex("(\\.\\.\\.)")?
.replace_all(&segment, " ... ")
.to_string();
segment = cached_regex("([;!?])([^ ])")?
.replace_all(&segment, "$1 $2")
.to_string();
for mut current_token in segment.split(' ').map(str::to_string) {
let mut suffix = Vec::new();
let mut finished = false;
while !finished {
if let Some(m) =
substitute("^(\\()([^\\)]*)(.)$", "$2$3", &mut current_token)?
{
result.push(Token::new_val(m.get(1).map_or("", |m| m.as_str())));
} else if let Some(m) =
substitute("^([^(]+)(\\))$", "$1", &mut current_token)?
{
suffix.insert(0, Token::new_val(m.get(2).map_or("", |m| m.as_str())));
} else if let Some(m) = substitute(
&format!("^([{}])(.)", &self.config.p_char),
"$2",
&mut current_token,
)? {
result.push(Token::new_val(m.get(1).map_or("", |m| m.as_str())));
} else if let Some(m) = substitute(
&format!("(.)([{}])$", &self.config.f_char),
"$1",
&mut current_token,
)? {
suffix.insert(0, Token::new_val(m.get(2).map_or("", |m| m.as_str())));
} else if let Some(m) = substitute(
&format!("([{}]|\\))\\.$", &self.config.f_char),
"",
&mut current_token,
)? {
suffix.insert(0, Token::new_val("."));
let punction_before_period =
m.get(1).map_or("", |m| m.as_str()).to_string();
if current_token.is_empty() {
current_token = punction_before_period;
} else {
suffix.insert(0, Token::new_val(punction_before_period));
}
} else {
finished = true;
}
}
if self.config.abbreviations.contains(¤t_token) {
result.push(Token::new_val(¤t_token));
result.extend(suffix.iter().cloned());
continue;
}
if cached_regex("^([A-Za-z-]\\.)+$")?.is_match(¤t_token) {
result.push(Token::new_val(¤t_token));
result.extend(suffix.iter().cloned());
continue;
}
if let Some(m) = cached_regex("^(..*)\\.$")?.captures(¤t_token.clone())
&& current_token != "..."
{
current_token = m.get(1).map_or("", |m| m.as_str()).to_string();
suffix.insert(0, Token::new_val("."));
if self.config.abbreviations.contains(¤t_token) {
result.push(Token::new_val(¤t_token));
result.extend(suffix.iter().cloned());
continue;
}
}
while let Some(m) = substitute("^(--)(.)", "$2", &mut current_token)? {
result.push(Token::new_val(m.get(1).map_or("", |m| m.as_str())));
}
if !self.config.p_clitic.is_empty() {
while let Some(m) = substitute_i(
&format!("^({})(.)", self.config.p_clitic),
"$2",
&mut current_token,
)? {
result.push(Token::new_val(m.get(1).map_or("", |m| m.as_str())));
}
}
while let Some(m) = substitute("(.)(--)$", "$1", &mut current_token)? {
suffix.insert(0, Token::new_val(m.get(2).map_or("", |m| m.as_str())));
}
if !self.config.f_clitic.is_empty() {
while let Some(m) = substitute_i(
&format!("(.)({})$", self.config.f_clitic),
"$1",
&mut current_token,
)? {
suffix.insert(0, Token::new_val(m.get(2).map_or("", |m| m.as_str())));
}
}
result.push(Token::new_val(current_token));
result.extend(suffix.into_iter());
}
}
}
let result = result
.into_iter()
.filter(|t| !t.value.is_empty() || t.whitespace_after.is_some())
.collect();
Ok(result)
}
}
fn substitute(
pattern: &str,
replacement: &str,
buffer: &mut String,
) -> anyhow::Result<Option<Vec<String>>> {
let pattern = cached_regex(pattern)?;
if let Some(caps) = pattern.captures(buffer)
&& let Some(whole_match) = caps.get(0)
{
let captured_values = caps
.iter()
.map(|c| {
if let Some(m) = c {
m.as_str().to_string()
} else {
String::new()
}
})
.collect();
let mut expanded_replacment = String::new();
caps.expand(replacement, &mut expanded_replacment);
expanded_replacment.insert_str(0, &buffer[0..whole_match.start()]);
expanded_replacment.push_str(&buffer[whole_match.end()..]);
*buffer = expanded_replacment;
return Ok(Some(captured_values));
}
Ok(None)
}
fn substitute_i(
pattern: &str,
replacement: &str,
buffer: &mut String,
) -> anyhow::Result<Option<Vec<String>>> {
let pattern = cached_regex_case_insensitive(pattern)?;
if let Some(caps) = pattern.captures(buffer)
&& let Some(whole_match) = caps.get(0)
{
let captured_values = caps
.iter()
.map(|c| {
if let Some(m) = c {
m.as_str().to_string()
} else {
String::new()
}
})
.collect();
let mut expanded_replacment = String::new();
caps.expand(replacement, &mut expanded_replacment);
expanded_replacment.insert_str(0, &buffer[0..whole_match.start()]);
expanded_replacment.push_str(&buffer[whole_match.end()..]);
*buffer = expanded_replacment;
return Ok(Some(captured_values));
}
Ok(None)
}
#[cfg(test)]
mod tests;