use crate::semiring::Semiring;
use crate::wfst::{MutableWfst, VectorWfst, WeightedTransition, Wfst};
use std::collections::HashMap;
/// Category label for a span of text; also the lookup key for per-class
/// verbalizers.
///
/// NOTE(review): this file does not spell out what each variant covers. The
/// names presumably follow the usual semiotic classes of text-normalization
/// grammars — confirm against the grammar definitions.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SemioticClass {
    Cardinal,
    Ordinal,
    Decimal,
    Fraction,
    Money,
    Time,
    Date,
    Measure,
    Address,
    Telephone,
    Electronic,
    Verbatim,
    Whitelist,
    Plain,
}
/// A piece of text labelled with a [`SemioticClass`] and a `start`/`end`
/// position pair.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// this type, so the unit of `start`/`end` (bytes vs. chars) and whether `end`
/// is exclusive are not established here — confirm with the producer.
#[derive(Debug, Clone)]
pub struct TaggedToken {
    /// The token's text.
    pub text: String,
    /// The class assigned to the token.
    pub class: SemioticClass,
    /// Start position of the token (unit unspecified, see the type-level note).
    pub start: usize,
    /// End position of the token (unit unspecified, see the type-level note).
    pub end: usize,
}
/// Text normalizer holding a classifier transducer and a table of per-class
/// verbalizers.
///
/// Its `normalize` method rewrites runs of ASCII digits as English words.
#[derive(Debug)]
pub struct TextNormalizer<W: Semiring> {
    /// Classifier transducer over characters.
    classifier: VectorWfst<char, W>,
    /// Verbalizers keyed by the semiotic class they are registered for.
    verbalizers: HashMap<SemioticClass, Verbalizer<W>>,
}
/// A character-level transducer paired with the [`SemioticClass`] it is
/// registered for.
#[derive(Debug)]
pub struct Verbalizer<W: Semiring> {
    /// The transducer itself.
    wfst: VectorWfst<char, W>,
    /// The class this verbalizer belongs to.
    class: SemioticClass,
}
impl<W: Semiring> Verbalizer<W> {
    /// Borrows the transducer held by this verbalizer.
    pub fn wfst(&self) -> &VectorWfst<char, W> {
        let Self { wfst, .. } = self;
        wfst
    }

    /// Returns the semiotic class stored alongside the transducer.
    pub fn class(&self) -> SemioticClass {
        let Self { class, .. } = self;
        *class
    }
}
impl<W: Semiring + Clone + From<f64>> TextNormalizer<W> {
    /// Creates a normalizer with the built-in classifier and verbalizer table.
    pub fn new() -> Self {
        Self {
            classifier: Self::build_default_classifier(),
            verbalizers: Self::build_default_verbalizers(),
        }
    }

    /// Builds the default classifier: a single state that is both the start
    /// state and final with weight `W::one()`, and no transitions.
    fn build_default_classifier() -> VectorWfst<char, W> {
        let mut fst: VectorWfst<char, W> = VectorWfst::new();
        let start = fst.add_state();
        fst.set_start(start);
        fst.set_final(start, W::one());
        fst
    }

    /// Builds the default verbalizer table, which registers only a
    /// [`SemioticClass::Cardinal`] verbalizer.
    fn build_default_verbalizers() -> HashMap<SemioticClass, Verbalizer<W>> {
        let mut verbalizers = HashMap::new();
        verbalizers.insert(
            SemioticClass::Cardinal,
            Verbalizer {
                wfst: Self::build_cardinal_verbalizer(),
                class: SemioticClass::Cardinal,
            },
        );
        verbalizers
    }

    /// Builds the cardinal transducer.
    ///
    /// It has a start state and a second state, both final with weight
    /// `W::one()`. For every digit `'0'..='9'` there is one transition from
    /// the start state to the second state that reads and emits that digit
    /// with weight `W::one()`. The second state has no outgoing transitions.
    fn build_cardinal_verbalizer() -> VectorWfst<char, W> {
        let mut fst: VectorWfst<char, W> = VectorWfst::new();
        let start = fst.add_state();
        fst.set_start(start);
        fst.set_final(start, W::one());
        let digit_state = fst.add_state();
        for digit in '0'..='9' {
            fst.add_transition(WeightedTransition {
                from: start,
                input: Some(digit),
                output: Some(digit),
                to: digit_state,
                weight: W::one(),
            });
        }
        fst.set_final(digit_state, W::one());
        fst
    }

    /// Normalizes `input` and returns the candidate outputs with their weights.
    ///
    /// The result always contains exactly one candidate: `input` with every
    /// run of ASCII digits spelled out, paired with `W::one()`. The stored
    /// transducers are only checked by debug assertions here; the rewriting
    /// itself is done by `normalize_numbers`.
    pub fn normalize(&self, input: &str) -> Vec<(String, W)> {
        debug_assert!(
            self.classifier.num_states() > 0,
            "TextNormalizer classifier should have been built"
        );
        debug_assert!(
            self.verbalizers.contains_key(&SemioticClass::Cardinal),
            "TextNormalizer should ship a cardinal verbalizer"
        );
        vec![(self.normalize_numbers(input), W::one())]
    }

    /// Borrows the classifier transducer.
    pub fn classifier(&self) -> &VectorWfst<char, W> {
        &self.classifier
    }

    /// Looks up the verbalizer registered for `class`, if any.
    pub fn verbalizer(&self, class: SemioticClass) -> Option<&Verbalizer<W>> {
        self.verbalizers.get(&class)
    }

    /// Copies `input`, replacing each maximal run of ASCII digits with the
    /// output of `number_to_words` for that run. All other characters are
    /// copied through unchanged.
    fn normalize_numbers(&self, input: &str) -> String {
        // The output is usually at least as long as the input, so this is a
        // cheap lower bound that avoids the first few reallocations.
        let mut result = String::with_capacity(input.len());
        let mut chars = input.chars().peekable();
        while let Some(ch) = chars.next() {
            if !ch.is_ascii_digit() {
                result.push(ch);
                continue;
            }
            let mut num = String::from(ch);
            // `next_if` only advances when the upcoming char is a digit, so the
            // first non-digit stays in the iterator for the outer loop.
            while let Some(digit) = chars.next_if(|c| c.is_ascii_digit()) {
                num.push(digit);
            }
            result.push_str(&number_to_words(&num));
        }
        result
    }
}
impl<W: Semiring + Clone + From<f64>> Default for TextNormalizer<W> {
fn default() -> Self {
Self::new()
}
}
/// Inverse text normalizer holding a classifier transducer and a table of
/// per-class verbalizers.
///
/// Its `denormalize` method rewrites English number words as digits.
#[derive(Debug)]
pub struct InverseTextNormalizer<W: Semiring> {
    /// Classifier transducer over characters.
    classifier: VectorWfst<char, W>,
    /// Verbalizers keyed by the semiotic class they are registered for.
    verbalizers: HashMap<SemioticClass, Verbalizer<W>>,
}
impl<W: Semiring + Clone + From<f64>> InverseTextNormalizer<W> {
    /// Creates an inverse normalizer with the built-in classifier and an
    /// empty verbalizer table.
    pub fn new() -> Self {
        let classifier = Self::build_default_classifier();
        let verbalizers = Self::build_default_verbalizers();
        Self {
            classifier,
            verbalizers,
        }
    }

    /// Builds the default classifier: a single state that is both the start
    /// state and final with weight `W::one()`, and no transitions.
    fn build_default_classifier() -> VectorWfst<char, W> {
        let mut classifier = VectorWfst::<char, W>::new();
        let root = classifier.add_state();
        classifier.set_start(root);
        classifier.set_final(root, W::one());
        classifier
    }

    /// Builds the default verbalizer table, which is empty.
    fn build_default_verbalizers() -> HashMap<SemioticClass, Verbalizer<W>> {
        HashMap::new()
    }

    /// Denormalizes `input` and returns the candidate outputs with their
    /// weights.
    ///
    /// The result always contains exactly one candidate: the output of
    /// `denormalize_numbers`, paired with `W::one()`. The classifier is only
    /// checked by a debug assertion here.
    pub fn denormalize(&self, input: &str) -> Vec<(String, W)> {
        debug_assert!(
            self.classifier.num_states() > 0,
            "InverseTextNormalizer classifier should have been built"
        );
        vec![(self.denormalize_numbers(input), W::one())]
    }

    /// Borrows the classifier transducer.
    pub fn classifier(&self) -> &VectorWfst<char, W> {
        &self.classifier
    }

    /// Looks up the verbalizer registered for `class`, if any.
    pub fn verbalizer(&self, class: SemioticClass) -> Option<&Verbalizer<W>> {
        self.verbalizers.get(&class)
    }

    /// Splits `input` on whitespace and replaces every word sequence that
    /// `words_to_number` recognises with its decimal value.
    ///
    /// Unrecognised words are kept as they are. The pieces are joined with
    /// single spaces, so the original whitespace is not preserved.
    fn denormalize_numbers(&self, input: &str) -> String {
        let tokens: Vec<&str> = input.split_whitespace().collect();
        let mut pieces: Vec<String> = Vec::new();
        let mut rest = tokens.as_slice();
        while let Some(&head) = rest.first() {
            // Try to read a number starting at the current word; otherwise
            // pass the word through and advance by one.
            let step = match words_to_number(rest) {
                Some((value, used)) => {
                    pieces.push(value.to_string());
                    used
                }
                None => {
                    pieces.push(head.to_string());
                    1
                }
            };
            rest = &rest[step..];
        }
        pieces.join(" ")
    }
}
impl<W: Semiring + Clone + From<f64>> Default for InverseTextNormalizer<W> {
fn default() -> Self {
Self::new()
}
}
/// Spells out a non-negative decimal integer in English words.
///
/// `num_str` is parsed as a `u64`. The result uses lowercase words separated
/// by single spaces, without "and" or hyphens (for example `"123"` becomes
/// `"one hundred twenty three"`). Every `u64` value is supported, up to the
/// quintillions.
///
/// Fallbacks when parsing fails:
/// - a non-empty string of ASCII digits that does not fit in a `u64` is read
///   digit by digit (`"zero"` through `"nine"`), so no digit is dropped;
/// - any other unparsable input yields `"zero"`.
fn number_to_words(num_str: &str) -> String {
    const ONES: [&str; 20] = [
        "",
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "eleven",
        "twelve",
        "thirteen",
        "fourteen",
        "fifteen",
        "sixteen",
        "seventeen",
        "eighteen",
        "nineteen",
    ];
    const TENS: [&str; 10] = [
        "", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    ];
    // Largest scale first; 10^18 is the biggest power of one thousand that
    // fits in a u64.
    const SCALES: [(u64, &str); 6] = [
        (1_000_000_000_000_000_000, "quintillion"),
        (1_000_000_000_000_000, "quadrillion"),
        (1_000_000_000_000, "trillion"),
        (1_000_000_000, "billion"),
        (1_000_000, "million"),
        (1_000, "thousand"),
    ];

    // Appends the words for `group` (which must be below 1000) to `out`;
    // appends nothing when `group` is zero.
    fn push_group(group: usize, out: &mut Vec<&'static str>) {
        let hundreds = group / 100;
        let rest = group % 100;
        if hundreds > 0 {
            out.push(ONES[hundreds]);
            out.push("hundred");
        }
        if rest >= 20 {
            out.push(TENS[rest / 10]);
            if rest % 10 > 0 {
                out.push(ONES[rest % 10]);
            }
        } else if rest > 0 {
            out.push(ONES[rest]);
        }
    }

    let num = match num_str.parse::<u64>() {
        Ok(value) => value,
        // A pure digit string can only fail to parse by overflowing u64.
        // Reading it digit by digit keeps every digit instead of collapsing
        // the whole number to "zero".
        Err(_) if !num_str.is_empty() && num_str.bytes().all(|b| b.is_ascii_digit()) => {
            return num_str
                .bytes()
                .map(|b| match b {
                    b'0' => "zero",
                    _ => ONES[usize::from(b - b'0')],
                })
                .collect::<Vec<_>>()
                .join(" ");
        }
        Err(_) => 0,
    };
    if num == 0 {
        return "zero".to_string();
    }

    let mut words: Vec<&'static str> = Vec::new();
    let mut remaining = num;
    for &(scale, name) in SCALES.iter() {
        let group = remaining / scale;
        remaining %= scale;
        if group > 0 {
            // `group` is below 1000 here (at most 18 for the first scale), so
            // the cast cannot truncate.
            push_group(group as usize, &mut words);
            words.push(name);
        }
    }
    // What is left is the final group, below 1000.
    push_group(remaining as usize, &mut words);
    words.join(" ")
}
/// Reads an English cardinal number from the front of `words`.
///
/// Returns `Some((value, consumed))`, where `consumed` is the number of
/// leading words that form the number, or `None` when `words` is empty or
/// does not start with a number. Matching is case-insensitive. Reading stops
/// at the first word that cannot extend the number read so far, so the caller
/// can resume at `words[consumed..]`.
///
/// Accepted shape:
/// - the part below one hundred is a single word up to "nineteen", or a tens
///   word ("twenty".."ninety") optionally followed by "one".."nine". Adjacent
///   words outside that shape are NOT summed: "one two" reads as `1` and
///   leaves "two" for the caller;
/// - "zero" is only accepted as a number on its own;
/// - "hundred" must follow a value from 1 to 99;
/// - "thousand", "million" and "billion" must follow a non-zero group, which
///   is multiplied and added to the running total;
/// - scale words are not required to appear in descending order;
/// - a word that would overflow `u64` ends the number before that word.
fn words_to_number(words: &[&str]) -> Option<(u64, usize)> {
    // Value of a plain number word, if `word` is one.
    fn word_value(word: &str) -> Option<u64> {
        let value = match word {
            "zero" => 0,
            "one" => 1,
            "two" => 2,
            "three" => 3,
            "four" => 4,
            "five" => 5,
            "six" => 6,
            "seven" => 7,
            "eight" => 8,
            "nine" => 9,
            "ten" => 10,
            "eleven" => 11,
            "twelve" => 12,
            "thirteen" => 13,
            "fourteen" => 14,
            "fifteen" => 15,
            "sixteen" => 16,
            "seventeen" => 17,
            "eighteen" => 18,
            "nineteen" => 19,
            "twenty" => 20,
            "thirty" => 30,
            "forty" => 40,
            "fifty" => 50,
            "sixty" => 60,
            "seventy" => 70,
            "eighty" => 80,
            "ninety" => 90,
            _ => return None,
        };
        Some(value)
    }

    // Multiplier of a scale word, if `word` is one.
    fn scale_value(word: &str) -> Option<u64> {
        match word {
            "hundred" => Some(100),
            "thousand" => Some(1000),
            "million" => Some(1_000_000),
            "billion" => Some(1_000_000_000),
            _ => None,
        }
    }

    // What the part below one hundred of the current group holds so far.
    #[derive(Clone, Copy)]
    enum Slot {
        // Nothing yet; any value word may start it.
        Empty,
        // A tens word ("twenty".."ninety"); a unit 1..=9 may still follow.
        Tens,
        // Complete; no further value word fits.
        Filled,
    }

    let mut total = 0u64;
    let mut current = 0u64;
    let mut consumed = 0usize;
    let mut slot = Slot::Empty;

    for word in words {
        let lower = word.to_lowercase();
        let (next_total, next_current, next_slot) = if let Some(value) = word_value(&lower) {
            let next_slot = match (slot, value) {
                // "zero" only stands on its own, as the very first word.
                (Slot::Empty, 0) if consumed == 0 => Slot::Filled,
                (Slot::Empty, 1..=19) => Slot::Filled,
                (Slot::Empty, 20..=90) => Slot::Tens,
                (Slot::Tens, 1..=9) => Slot::Filled,
                // e.g. "one two" or "twenty twenty": a new number starts here.
                _ => break,
            };
            (total, current + value, next_slot)
        } else if let Some(scale) = scale_value(&lower) {
            if scale == 100 {
                // "hundred" needs a 1..=99 prefix; this also rejects a second
                // "hundred" in the same group.
                if !(1..100).contains(&current) {
                    break;
                }
                (total, current * 100, Slot::Empty)
            } else {
                // A scale word with nothing to multiply is not part of the number.
                if current == 0 {
                    break;
                }
                match current
                    .checked_mul(scale)
                    .and_then(|group| total.checked_add(group))
                {
                    Some(sum) => (sum, 0, Slot::Empty),
                    None => break,
                }
            }
        } else {
            break
        };

        // Keep `total + current` representable so the final sum cannot overflow.
        if next_total.checked_add(next_current).is_none() {
            break;
        }
        total = next_total;
        current = next_current;
        slot = next_slot;
        consumed += 1;
    }

    if consumed > 0 {
        Some((total + current, consumed))
    } else {
        None
    }
}
/// Names and symbol describing one currency.
///
/// NOTE(review): nothing in the visible part of this file consumes this
/// configuration; the field notes below are inferred from the field names and
/// the `Default` values — confirm with the code that uses them.
#[derive(Debug, Clone)]
pub struct MoneyConfig {
    /// Currency symbol (defaults to `"$"`).
    pub symbol: String,
    /// Singular name of the main unit (defaults to `"dollar"`).
    pub name_singular: String,
    /// Plural name of the main unit (defaults to `"dollars"`).
    pub name_plural: String,
    /// Singular name of the subunit (defaults to `"cent"`).
    pub subunit_singular: String,
    /// Plural name of the subunit (defaults to `"cents"`).
    pub subunit_plural: String,
    /// Defaults to `100`; presumably the number of subunits in one main
    /// unit — TODO confirm.
    pub subunit_divisor: u32,
}
impl Default for MoneyConfig {
fn default() -> Self {
Self {
symbol: "$".to_string(),
name_singular: "dollar".to_string(),
name_plural: "dollars".to_string(),
subunit_singular: "cent".to_string(),
subunit_plural: "cents".to_string(),
subunit_divisor: 100,
}
}
}
/// Selector for a date layout.
///
/// NOTE(review): the variant names suggest the order of the month, day and
/// year components; no code in the visible part of this file reads this
/// type — confirm with its consumer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DateFormat {
    MonthDayYear,
    DayMonthYear,
    YearMonthDay,
}
/// Selector for a clock convention.
///
/// NOTE(review): the variant names suggest 12-hour versus 24-hour time; no
/// code in the visible part of this file reads this type — confirm with its
/// consumer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TimeFormat {
    TwelveHour,
    TwentyFourHour,
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::semiring::TropicalWeight;

    /// Digit strings are spelled out as space-separated English words.
    #[test]
    fn test_number_to_words() {
        let cases = [
            ("0", "zero"),
            ("1", "one"),
            ("12", "twelve"),
            ("21", "twenty one"),
            ("100", "one hundred"),
            ("123", "one hundred twenty three"),
            ("1000", "one thousand"),
            ("1234", "one thousand two hundred thirty four"),
        ];
        for (digits, expected) in cases {
            assert_eq!(number_to_words(digits), expected, "input {:?}", digits);
        }
    }

    /// Word sequences are read back as `(value, words consumed)`.
    #[test]
    fn test_words_to_number() {
        let cases: [(&[&str], Option<(u64, usize)>); 6] = [
            (&["one"], Some((1, 1))),
            (&["twelve"], Some((12, 1))),
            (&["twenty", "one"], Some((21, 2))),
            (&["one", "hundred"], Some((100, 2))),
            (&["one", "hundred", "twenty", "three"], Some((123, 4))),
            (&["one", "thousand"], Some((1000, 2))),
        ];
        for (words, expected) in cases {
            assert_eq!(words_to_number(words), expected, "input {:?}", words);
        }
    }

    /// Digits embedded in a sentence are verbalized by the normalizer.
    #[test]
    fn test_text_normalizer() {
        let normalizer = TextNormalizer::<TropicalWeight>::new();
        let results = normalizer.normalize("I have 123 apples");
        let best = results.first();
        assert!(best.is_some());
        assert!(best.map_or(false, |(text, _)| text.contains("one hundred twenty three")));
    }

    /// Number words embedded in a sentence are turned back into digits.
    #[test]
    fn test_inverse_text_normalizer() {
        let itn = InverseTextNormalizer::<TropicalWeight>::new();
        let results = itn.denormalize("I have one hundred twenty three apples");
        let best = results.first();
        assert!(best.is_some());
        assert!(best.map_or(false, |(text, _)| text.contains("123")));
    }

    /// `SemioticClass` supports equality comparison between variants.
    #[test]
    fn test_semiotic_class() {
        let cardinal = SemioticClass::Cardinal;
        assert_eq!(cardinal, SemioticClass::Cardinal);
        assert_ne!(cardinal, SemioticClass::Ordinal);
    }
}