use std::collections::VecDeque;
use std::iter::Enumerate;
use crate::digit_string::DigitString;
use crate::error::Error;
use crate::lang::{BasicAnnotate, LangInterpreter};
use crate::tokenizer::{BasicToken, tokenize};
/// Incremental parser that turns a sequence of number words into digits.
///
/// Words are fed one at a time; the integer part and the decimal part are
/// accumulated in two separate `DigitString`s, and all word recognition is
/// delegated to the language interpreter.
struct WordToDigitParser<'a, T: LangInterpreter> {
/// Digits accumulated while no decimal separator has been recorded.
int_part: DigitString,
/// Digits accumulated once a decimal separator has been recorded.
dec_part: DigitString,
/// Separator character returned by the interpreter's `check_decimal_separator`, if any.
dec_separator: Option<char>,
/// Language interpreter used to apply words and to format the result.
lang: &'a T,
}
impl<'a, T: LangInterpreter> WordToDigitParser<'a, T> {
pub fn new(lang: &'a T) -> Self {
Self {
int_part: DigitString::new(),
dec_part: DigitString::new(),
dec_separator: None,
lang,
}
}
pub fn reset(&mut self) {
self.int_part.reset();
self.dec_part.reset();
self.dec_separator = None;
}
pub fn push(&mut self, word: &str) -> Result<(), Error> {
let status = if self.dec_separator.is_some() {
self.lang.apply_decimal(word, &mut self.dec_part)
} else {
self.lang.apply(word, &mut self.int_part)
};
if status.is_err() && self.dec_separator.is_none() && !self.int_part.is_empty() {
self.dec_separator = self.lang.check_decimal_separator(word);
if self.dec_separator.is_some() {
Err(Error::Incomplete)
} else {
status
}
} else {
status
}
}
pub fn string_and_value(&mut self) -> (String, f64) {
let res = if !self.dec_part.is_empty() {
let sep = self.dec_separator.unwrap();
self.lang
.format_decimal_and_value(&self.int_part, &self.dec_part, sep)
} else {
self.lang.format_and_value(&self.int_part)
};
self.reset();
res
}
pub fn has_number(&self) -> bool {
!self.int_part.is_empty()
}
pub fn is_ordinal(&self) -> bool {
self.int_part.is_ordinal()
}
}
/// Converts `text` into its digit representation using `lang`.
///
/// The text is lowercased and split on whitespace, the resulting words are
/// handed to `lang.exec_group`, and the digit string it produces is formatted
/// with `lang.format_and_value`.
///
/// # Errors
///
/// Propagates the error returned by `lang.exec_group`.
pub fn text2digits<T: LangInterpreter>(text: &str, lang: &T) -> Result<String, Error> {
    let lowered = text.to_lowercase();
    let digits = lang.exec_group(lowered.split_whitespace())?;
    let (repr, _value) = lang.format_and_value(&digits);
    Ok(repr)
}
/// A unit of input that the number scanner can inspect.
pub trait Token {
/// Returns the text of the token.
fn text(&self) -> &str;
/// Returns the lowercase text of the token; this is the form handed to the
/// number parser.
fn text_lowercase(&self) -> &str;
/// Defaults to `false`.
///
/// When this returns `true` while a number is being parsed,
/// `FindNumbers::push` feeds `","` to the parser instead of this token's
/// lowercase text.
/// NOTE(review): the name suggests "non-textually separated from
/// `_previous`" (e.g. a pause) — confirm with implementors.
fn nt_separated(&self, _previous: &Self) -> bool {
false
}
/// Defaults to `false`.
///
/// When this returns `true`, `FindNumbers::push` ends any number in
/// progress and never hands this token to the parser.
fn not_a_number_part(&self) -> bool {
false
}
}
/// Tokens that can be built as the replacement of a run of other tokens.
pub trait Replace {
/// Builds the token that stands in for the tokens yielded by `replaced`,
/// carrying the replacement text `data`.
fn replace<I: Iterator<Item = Self>>(replaced: I, data: String) -> Self;
}
/// Lets borrowed `BasicToken`s be scanned without taking ownership.
impl Token for &BasicToken {
    /// Text stored in the token.
    fn text(&self) -> &str {
        &self.text
    }

    /// Lowercase text stored in the token.
    fn text_lowercase(&self) -> &str {
        &self.lowercase
    }

    /// Always `false` for basic tokens.
    fn nt_separated(&self, _previous: &Self) -> bool {
        false
    }

    /// Mirrors the token's `nan` flag.
    fn not_a_number_part(&self) -> bool {
        self.nan
    }
}
impl Replace for BasicToken {
    /// Builds a fresh token holding `data`; the replaced tokens are ignored.
    ///
    /// The lowercase form is derived from `data` and the `nan` flag is cleared.
    fn replace<I: Iterator<Item = Self>>(_replaced: I, data: String) -> Self {
        let lowercase = data.to_lowercase();
        Self {
            text: data,
            lowercase,
            nan: false,
        }
    }
}
impl BasicAnnotate for BasicToken {
    /// Lowercase text stored in the token.
    fn text_lowercase(&self) -> &str {
        &self.lowercase
    }

    /// Stores `val` in the token's `nan` flag.
    fn set_nan(&mut self, val: bool) {
        self.nan = val;
    }
}
/// A number found in a token stream.
///
/// `Clone` and `PartialEq` are derived so that callers can keep copies of
/// occurrences and compare them as a whole instead of field by field.
#[derive(Debug, Clone, PartialEq)]
pub struct Occurence {
    /// Position of the first token of the number.
    pub start: usize,
    /// Position just past the last token of the number (`start..end` is the
    /// token range that gets replaced).
    pub end: usize,
    /// Digit representation of the number.
    pub text: String,
    /// Numeric value of the number.
    pub value: f64,
    /// Whether the number is an ordinal.
    pub is_ordinal: bool,
}
/// Kind of the most recent number in a contiguous run, if there is one.
#[derive(Debug, PartialEq)]
enum MatchKind {
    /// The last number was a cardinal.
    Cardinal,
    /// The last number was an ordinal.
    Ordinal,
    /// There is no preceding number in the current run.
    None,
}

impl MatchKind {
    /// Returns `true` only for `MatchKind::None`.
    fn is_none(&self) -> bool {
        matches!(self, MatchKind::None)
    }
}
/// Bookkeeping for the numbers found so far in a token stream.
#[derive(Debug)]
struct NumTracker {
/// Confirmed occurrences, in the order they were found.
matches: VecDeque<Occurence>,
/// Occurrence that is only kept if the next number continues the same run.
on_hold: Option<Occurence>,
/// Kind of the previous number in the current run, or `MatchKind::None`
/// after a sequence breaker.
last_contiguous_match: MatchKind,
/// Position of the first token of the number being tracked.
match_start: usize,
/// Position just past the last token of the number being tracked; equal to
/// `match_start` when no number is in progress.
match_end: usize,
}
impl NumTracker {
    /// Creates a tracker with no match, nothing on hold and an empty span.
    fn new() -> Self {
        Self {
            matches: VecDeque::with_capacity(2),
            on_hold: None,
            last_contiguous_match: MatchKind::None,
            match_start: 0,
            match_end: 0,
        }
    }

    /// Extends the span of the number in progress to include token `pos`.
    ///
    /// An empty span (`match_start == match_end`) is first moved to `pos`.
    fn number_advanced(&mut self, pos: usize) {
        let span_is_empty = self.match_start == self.match_end;
        if span_is_empty {
            self.match_start = pos;
        }
        self.match_end = pos + 1;
    }

    /// Closes the number in progress and decides what to do with it.
    ///
    /// * If the previous number of the run has the same kind (ordinal or
    ///   cardinal), any occurrence on hold is confirmed, followed by this one.
    /// * Otherwise, if `forget_if_isolate` is set, this occurrence is put on
    ///   hold, replacing whatever was held before.
    /// * Otherwise it is confirmed and any held occurrence is dropped.
    fn number_end(
        &mut self,
        is_ordinal: bool,
        digits: String,
        value: f64,
        forget_if_isolate: bool,
    ) {
        let kind = match is_ordinal {
            true => MatchKind::Ordinal,
            false => MatchKind::Cardinal,
        };
        let occurence = Occurence {
            start: self.match_start,
            end: self.match_end,
            text: digits,
            value,
            is_ordinal,
        };

        // `kind` is never `MatchKind::None`, so equality means the run continues.
        let continues_run = self.last_contiguous_match == kind;
        if continues_run {
            if let Some(held) = self.on_hold.take() {
                self.matches.push_back(held);
            }
            self.matches.push_back(occurence);
        } else if forget_if_isolate {
            self.on_hold = Some(occurence);
        } else {
            self.on_hold = None;
            self.matches.push_back(occurence);
        }

        self.last_contiguous_match = kind;
        // Collapse the span so the next number starts afresh.
        self.match_start = self.match_end;
    }

    /// Marks the current run of numbers as interrupted.
    fn sequence_breaker(&mut self) {
        self.last_contiguous_match = MatchKind::None;
    }

    /// Removes and returns the oldest confirmed occurrence.
    fn pop(&mut self) -> Option<Occurence> {
        self.matches.pop_front()
    }

    /// Tells whether at least one confirmed occurrence is waiting.
    fn has_matches(&self) -> bool {
        self.matches.front().is_some()
    }

    /// Replaces, in `tokens`, the token range of every confirmed occurrence
    /// with a single token built from its digit text.
    ///
    /// Occurrences are processed from last to first so that earlier ranges
    /// stay valid while later ones shrink.
    fn replace<T: Replace>(self, tokens: &mut Vec<T>) {
        let mut pending = self.matches;
        while let Some(Occurence {
            start, end, text, ..
        }) = pending.pop_back()
        {
            let merged = T::replace(tokens.drain(start..end), text);
            tokens.insert(start, merged);
        }
    }

    /// Consumes the tracker and returns the confirmed occurrences in order.
    fn into_vec(self) -> Vec<Occurence> {
        Vec::from(self.matches)
    }
}
/// Scanner that walks a stream of `(position, token)` pairs and collects the
/// numbers spelled out in it.
pub struct FindNumbers<'a, L, T, I>
where
L: LangInterpreter,
T: Token,
I: Iterator<Item = (usize, T)>,
{
/// Language interpreter, also shared with `parser`.
lang: &'a L,
/// Remaining `(position, token)` pairs to scan.
input: I,
/// Parser holding the number currently being built.
parser: WordToDigitParser<'a, L>,
/// Occurrences found so far and the state of the current run.
tracker: NumTracker,
/// Last token that was processed (hyphens and whitespace are not recorded).
previous: Option<T>,
/// Short or ordinal numbers whose value is below this threshold are only
/// kept when they are part of a run of numbers.
threshold: f64,
}
impl<'a, L, T, I> FindNumbers<'a, L, T, I>
where
L: LangInterpreter,
T: Token,
I: Iterator<Item = (usize, T)>,
{
fn new(input: I, lang: &'a L, threshold: f64) -> Self {
Self {
lang,
input,
parser: WordToDigitParser::new(lang),
tracker: NumTracker::new(),
previous: None,
threshold,
}
}
fn push(&mut self, pos: usize, token: T) {
if token.text() == "-" || is_whitespace(token.text()) {
return;
}
if token.not_a_number_part() {
if self.parser.has_number() {
self.number_end()
}
self.outside_number(&token);
self.previous.replace(token);
return;
}
let lo_token = token.text_lowercase();
let test = if let Some(ref prev) = self.previous {
if self.parser.has_number() && token.nt_separated(prev) {
"," } else {
lo_token
}
} else {
lo_token
};
match self.parser.push(test) {
Ok(()) => self.tracker.number_advanced(pos),
Err(Error::Incomplete) => (),
Err(_) if self.parser.has_number() => {
self.number_end();
if self.parser.push(lo_token).is_ok() {
self.tracker.number_advanced(pos);
} else {
self.outside_number(&token)
}
}
Err(_) => self.outside_number(&token),
}
self.previous.replace(token);
}
fn finalize(&mut self) {
if self.parser.has_number() {
self.number_end()
}
}
fn number_end(&mut self) {
let is_ordinal = self.parser.is_ordinal();
let (digits, value) = self.parser.string_and_value();
let forget_if_isolate = (digits.len() == 1 || is_ordinal) && value < self.threshold;
self.tracker
.number_end(is_ordinal, digits, value, forget_if_isolate);
}
fn outside_number(&mut self, token: &T) {
let text = token.text();
if !(text.chars().all(|c| !c.is_alphabetic()) && text.trim() != "."
|| self.lang.is_linking(text))
{
self.tracker.sequence_breaker()
};
}
fn track_numbers(mut self) -> NumTracker {
while let Some((pos, token)) = self.input.next() {
self.push(pos, token);
}
self.finalize();
self.tracker
}
}
/// Yields the occurrences one by one, scanning the input only as far as
/// needed to confirm the next one.
impl<L, T, I> Iterator for FindNumbers<'_, L, T, I>
where
    L: LangInterpreter,
    T: Token,
    I: Iterator<Item = (usize, T)>,
{
    type Item = Occurence;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // Serve already confirmed occurrences before reading more input.
            if let Some(found) = self.tracker.pop() {
                return Some(found);
            }
            match self.input.next() {
                Some((position, token)) => self.push(position, token),
                None => {
                    // Input exhausted: close any number still in progress.
                    self.finalize();
                    return self.tracker.pop();
                }
            }
        }
    }
}
/// Scans all of `input`, numbering the tokens from zero, and returns the
/// tracker holding the numbers found.
fn track_numbers<L: LangInterpreter, T: Token, I: Iterator<Item = T>>(
    input: I,
    lang: &L,
    threshold: f64,
) -> NumTracker {
    let indexed = input.enumerate();
    FindNumbers::new(indexed, lang, threshold).track_numbers()
}
/// Finds every number spelled out in `input` and returns the occurrences in
/// the order they appear.
///
/// Positions in the returned occurrences are token indices counted from zero.
/// See `FindNumbers` for the role of `threshold`.
pub fn find_numbers<L: LangInterpreter, T: Token, I: Iterator<Item = T>>(
    input: I,
    lang: &L,
    threshold: f64,
) -> Vec<Occurence> {
    let tracker = track_numbers(input, lang, threshold);
    tracker.into_vec()
}
/// Returns an iterator that yields the numbers spelled out in `input` as they
/// are confirmed, instead of scanning everything up front.
///
/// Positions in the yielded occurrences are token indices counted from zero.
pub fn find_numbers_iter<L, T, I>(
    input: I,
    lang: &L,
    threshold: f64,
) -> FindNumbers<'_, L, T, Enumerate<I>>
where
    L: LangInterpreter,
    T: Token,
    I: Iterator<Item = T>,
{
    let indexed = input.enumerate();
    FindNumbers::new(indexed, lang, threshold)
}
/// Replaces, in `input`, every run of tokens spelling a number with a single
/// token carrying its digit form, and returns the modified vector.
///
/// The tokens are scanned by reference first; the replacements are applied
/// once the scan is complete.
pub fn replace_numbers_in_stream<'a, L, T>(mut input: Vec<T>, lang: &L, threshold: f64) -> Vec<T>
where
    L: LangInterpreter,
    T: Replace + 'a,
    for<'b> &'b T: Token,
{
    let found = track_numbers(input.iter(), lang, threshold);
    found.replace(&mut input);
    input
}
/// Returns `text` with the numbers spelled out in it replaced by digits.
///
/// The text is tokenized, annotated by `lang.basic_annotate`, run through
/// `replace_numbers_in_stream`, and the resulting tokens are concatenated.
pub fn replace_numbers_in_text<L: LangInterpreter>(text: &str, lang: &L, threshold: f64) -> String {
    let mut tokens = tokenize(text).collect();
    lang.basic_annotate(&mut tokens);
    replace_numbers_in_stream(tokens, lang, threshold).join("")
}
/// Tells whether `token` contains nothing but whitespace characters.
///
/// The empty string counts as whitespace.
fn is_whitespace(token: &str) -> bool {
    token.trim().is_empty()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lang::Language;
    use crate::tokenizer::tokenize;

    // `find_numbers` is fed owned tokens straight from `tokenize` below, so
    // the tests need `Token` on `BasicToken` itself, not only on `&BasicToken`.
    impl Token for BasicToken {
        fn text(&self) -> &str {
            self.text.as_str()
        }
        fn text_lowercase(&self) -> &str {
            self.lowercase.as_str()
        }
        fn nt_separated(&self, _previous: &Self) -> bool {
            false
        }
        fn not_a_number_part(&self) -> bool {
            self.nan
        }
    }

    /// A lone "zéro" is a number whose value is exactly zero.
    #[test]
    fn test_word_to_digits_parser_zero() {
        let fr = Language::french();
        let mut parser = WordToDigitParser::new(&fr);
        parser.push("zéro").unwrap();
        assert!(parser.has_number());
        let (repr, val) = parser.string_and_value();
        assert_eq!(repr, "0");
        assert_eq!(val, 0.0);
    }

    /// Leading zeros are kept and separate groups stay separate.
    #[test]
    fn test_grouping() {
        let fr = Language::french();
        let wyget = replace_numbers_in_text("zéro zéro trente quatre-vingt-dix-sept", &fr, 10.0);
        assert_eq!(wyget, "0030 97");
    }

    /// Isolated small numbers below the threshold are not reported.
    #[test]
    fn test_find_isolated_single() {
        let fr = Language::french();
        let ocs = find_numbers(tokenize("c'est un logement neuf"), &fr, 10.0);
        dbg!(&ocs);
        assert!(ocs.is_empty());
    }

    /// With a threshold of zero even an isolated "zéro" is reported.
    #[test]
    fn test_find_all_isolated_single() {
        let fr = Language::french();
        let ocs = find_numbers(tokenize("c'est zéro"), &fr, 0.0);
        dbg!(&ocs);
        assert_eq!(ocs.len(), 1);
        assert_eq!(ocs[0].text, "0");
        assert_eq!(ocs[0].value, 0.0);
    }

    /// An isolated number above the threshold is reported.
    #[test]
    fn test_find_isolated_long() {
        let fr = Language::french();
        let ocs = find_numbers(tokenize("trente-sept rue du docteur leroy"), &fr, 10.0);
        dbg!(&ocs);
        assert_eq!(ocs.len(), 1);
        assert_eq!(ocs[0].text, "37");
        assert_eq!(ocs[0].value, 37.0);
    }

    /// A two-digit group with a leading zero is reported even when isolated.
    #[test]
    fn test_find_isolated_with_leading_zero() {
        let fr = Language::french();
        let ocs = find_numbers(tokenize("quatre-vingt-douze slash zéro deux"), &fr, 10.0);
        dbg!(&ocs);
        assert_eq!(ocs.len(), 2);
        assert_eq!(ocs[1].text, "02");
    }

    /// End-to-end replacement over a multi-line text.
    #[test]
    fn bench() {
        let fr = Language::french();
        // Line breaks and the blank following each of them are written as
        // explicit escapes: text outside numbers is passed through unchanged,
        // so the input must carry exactly the whitespace the expected output
        // has, and a raw multi-line literal would depend on source indentation.
        let input = concat!(
            "Vingt-cinq vaches, douze poulets et cent vingt-cinq kg de pommes de terre.\n ",
            "Mille deux cent soixante-six clous. zéro neuf soixante zéro six douze vingt et un.\n ",
            "les uns et les autres ; une suite de chiffres : un, deux, trois !\n ",
            "cinquante trois mille millions deux cent quarante trois mille sept cent vingt quatre.\n "
        );
        let wyget = replace_numbers_in_text(input, &fr, 10.0);
        assert_eq!(
            wyget,
            "25 vaches, 12 poulets et 125 kg de pommes de terre.\n 1266 clous. 09 60 06 12 21.\n les uns et les autres ; une suite de chiffres : 1, 2, 3 !\n 53000243724.\n "
        );
    }
}