use regex::{Captures, Regex};
use crate::Float;
/// Base frequency of a digit run, indexed by its leading digit (0-9).
/// `benford_freq` divides the selected entry by 10 for every additional digit.
/// NOTE(review): the values resemble a smoothed Benford's-law distribution;
/// confirm their provenance before changing them.
const DIGIT_FREQS: [Float; 10] = [
0.009, 0.300, 0.175, 0.124, 0.096, 0.078, 0.066, 0.057, 0.050, 0.045,
];
/// Base-10 logarithm of the year frequency on the plateau, i.e. for years in
/// `(REFERENCE_YEAR, REFERENCE_YEAR + PLATEAU_WIDTH]`; it is also the starting
/// point from which the log-frequency falls off on either side.
const YEAR_LOG_PEAK: Float = -1.9185;
/// Weight applied to the `benford_freq` term that `year_freq` adds on top of
/// the year-shaped frequency.
const NOT_YEAR_PROB: Float = 0.1;
/// Last year of the slowly rising ramp; years at or below it lose
/// `FLOAT_0_0083` of log-frequency per year of distance.
const REFERENCE_YEAR: Float = 2019.;
/// Number of years after `REFERENCE_YEAR` during which the log-frequency
/// stays at `YEAR_LOG_PEAK`.
const PLATEAU_WIDTH: Float = 20.;
/// The constant 10: base of the logarithms above and the per-digit divisor
/// in `benford_freq`.
const FLOAT_10: Float = 10.;
/// Log-frequency lost per year beyond the end of the plateau.
const FLOAT_0_2: Float = 0.2;
/// Log-frequency lost per year before `REFERENCE_YEAR`.
const FLOAT_0_0083: Float = 0.0083;
/// Bundles the pre-compiled regular expressions used to normalise digit
/// sequences in text and to estimate how frequent such sequences are.
///
/// Build it once with [`NumberHandler::new`] and reuse it; cloning is
/// supported so the handler can be shared between owners.
#[derive(Clone, Debug)]
pub struct NumberHandler {
    /// Matches a single decimal digit (`\d`).
    digit_re: Regex,
    /// Matches a digit followed by one or more digits, `.` or `,`
    /// (`\d[\d.,]+`), i.e. a numeric run of at least two characters.
    multi_digit_re: Regex,
    /// Matches a maximal run of decimal digits (`\d+`).
    pure_digit_re: Regex,
}
impl NumberHandler {
    /// Creates a handler with its three digit-matching patterns compiled.
    ///
    /// # Panics
    ///
    /// Only if one of the hard-coded patterns failed to compile, which would
    /// be a bug in this file.
    pub fn new() -> Self {
        Self {
            digit_re: Regex::new(r"\d").expect("hard-coded pattern is valid"),
            multi_digit_re: Regex::new(r"\d[\d.,]+").expect("hard-coded pattern is valid"),
            pure_digit_re: Regex::new(r"\d+").expect("hard-coded pattern is valid"),
        }
    }

    /// Replaces every digit with `0` inside each multi-character numeric run
    /// (a digit followed by at least one more digit, `.` or `,`).
    ///
    /// Text outside such runs, including isolated single digits, is returned
    /// unchanged.
    pub fn smash_numbers(&self, text: &str) -> String {
        self.multi_digit_re
            .replace_all(text, |captures: &Captures| self.sub_zeroes(captures))
            // `into_owned` reuses the buffer when a replacement happened,
            // whereas `to_string` would copy it a second time.
            .into_owned()
    }

    /// Returns the whole match of `captures` with each digit turned into `0`.
    fn sub_zeroes(&self, captures: &Captures) -> String {
        // Group 0 (the whole match) always exists, so indexing cannot panic.
        self.digit_re.replace_all(&captures[0], "0").into_owned()
    }

    /// Estimates the combined frequency of the numeric runs found in `text`.
    ///
    /// Every multi-character numeric run is split into its pure digit
    /// sequences; a sequence of exactly four digits is scored by
    /// `year_freq`, any other by `benford_freq`, and all scores are
    /// multiplied together. Returns `1.0` when `text` has no such run.
    pub fn digit_freq(&self, text: &str) -> Float {
        let mut freq = 1.;
        for run in self.multi_digit_re.find_iter(text) {
            for digits in self.pure_digit_re.find_iter(run.as_str()) {
                let digits = digits.as_str();
                // Count characters, not bytes: `\d` also matches non-ASCII
                // decimal digits, which take several bytes each in UTF-8.
                freq *= if digits.chars().count() == 4 {
                    self.year_freq(digits)
                } else {
                    self.benford_freq(digits)
                };
            }
        }
        freq
    }

    /// Returns the numeric value (0-9) of a decimal digit, or `None` when
    /// `c` is not matched by `\d`.
    ///
    /// `char::to_digit` only understands ASCII, while the patterns used here
    /// match every Unicode decimal digit (for example full-width `１`).
    /// For those, the value is derived from the Unicode guarantee that
    /// decimal digits are encoded in contiguous blocks of ten, ordered from
    /// zero to nine: the number of digits directly preceding `c`, modulo 10,
    /// is its value.
    fn digit_value(&self, c: char) -> Option<usize> {
        if let Some(value) = c.to_digit(10) {
            // Lossless: the value is at most 9.
            return Some(value as usize);
        }
        let mut buf = [0u8; 4];
        if !self.digit_re.is_match(c.encode_utf8(&mut buf)) {
            return None;
        }
        let code = u32::from(c);
        let mut preceding = 0u32;
        while preceding < code {
            let prev_is_digit = char::from_u32(code - preceding - 1)
                .map_or(false, |prev| self.digit_re.is_match(prev.encode_utf8(&mut buf)));
            if !prev_is_digit {
                break;
            }
            preceding += 1;
        }
        // Lossless: the remainder is at most 9.
        Some((preceding % 10) as usize)
    }

    /// Frequency of a digit sequence that is not treated as a year: the
    /// table entry for its leading digit, divided by 10 for every further
    /// digit.
    ///
    /// # Panics
    ///
    /// If `text` is empty or does not start with a decimal digit.
    fn benford_freq(&self, text: &str) -> Float {
        debug_assert_ne!(text.len(), 0);
        let first_digit = text
            .chars()
            .next()
            .and_then(|c| self.digit_value(c))
            .expect("benford_freq requires a string starting with a decimal digit");
        DIGIT_FREQS[first_digit] / FLOAT_10.powi(text.chars().count() as i32 - 1)
    }

    /// Frequency of a four-digit sequence, modelled as a year.
    ///
    /// The base-10 log-frequency rises slowly up to `REFERENCE_YEAR`, stays
    /// at `YEAR_LOG_PEAK` for the following `PLATEAU_WIDTH` years and drops
    /// quickly afterwards. A down-weighted `benford_freq` term is added for
    /// the case where the digits are not a year at all.
    ///
    /// # Panics
    ///
    /// If `text` contains a character that is not a decimal digit.
    fn year_freq(&self, text: &str) -> Float {
        debug_assert_eq!(text.chars().count(), 4);
        // Accumulate the value digit by digit so that non-ASCII decimal
        // digits are accepted; for ASCII input this equals `str::parse`.
        let year = text.chars().fold(0., |acc: Float, c| {
            let digit = self
                .digit_value(c)
                .expect("year_freq requires decimal digits only");
            acc.mul_add(FLOAT_10, digit as Float)
        });
        let year_log_freq = if year <= REFERENCE_YEAR {
            FLOAT_0_0083.mul_add(-REFERENCE_YEAR + year, YEAR_LOG_PEAK)
        } else if year <= REFERENCE_YEAR + PLATEAU_WIDTH {
            YEAR_LOG_PEAK
        } else {
            FLOAT_0_2.mul_add(-year + (REFERENCE_YEAR + PLATEAU_WIDTH), YEAR_LOG_PEAK)
        };
        let year_prob = FLOAT_10.powf(year_log_freq);
        let not_year_prob = NOT_YEAR_PROB * self.benford_freq(text);
        year_prob + not_year_prob
    }
}

impl Default for NumberHandler {
    /// Equivalent to [`NumberHandler::new`].
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    /// Multi-character numeric runs are zeroed; a lone digit and kanji
    /// numerals are left as they are.
    #[test]
    fn test_smash_numbers() {
        let numbers = NumberHandler::new();
        let cases = [("33.4", "00.0"), ("33-4", "00-4"), ("三三.四", "三三.四")];
        for &(input, expected) in cases.iter() {
            assert_eq!(numbers.smash_numbers(input), expected);
        }
    }

    /// The separator between digit runs does not matter, and text without
    /// digit runs scores exactly 1.
    #[test]
    fn test_digit_freq() {
        let numbers = NumberHandler::new();
        let cases = [
            ("1991.08.07", 5.7467896897867986e-09),
            ("1991年08月07日", 5.7467896897867986e-09),
            ("平成三年八月七日", 1.0),
        ];
        for &(input, expected) in cases.iter() {
            assert_relative_eq!(numbers.digit_freq(input), expected);
        }
    }

    /// Each extra digit divides the leading-digit frequency by 10.
    #[test]
    fn test_benford_freq() {
        let numbers = NumberHandler::new();
        let cases = [
            ("7", 0.057),
            ("07", 0.0009),
            ("007", 8.999999999999999e-05),
        ];
        for &(input, expected) in cases.iter() {
            assert_relative_eq!(numbers.benford_freq(input), expected);
        }
    }

    /// Covers the ramp, the plateau and both extremes of the year model.
    #[test]
    fn test_year_freq() {
        let numbers = NumberHandler::new();
        let cases = [
            ("1992", 0.007231119202497894),
            ("2023", 0.012081740881970011),
            ("0000", 9.000000000002107e-07),
            ("9999", 4.5e-06),
        ];
        for &(input, expected) in cases.iter() {
            assert_relative_eq!(numbers.year_freq(input), expected);
        }
    }
}