// libmotiva 0.1.0
//
// Sanctioned entities matching utilities.
// Documentation
use bumpalo::{
  Bump,
  collections::{CollectIn, Vec},
};
use itertools::Itertools;
use libmotiva_macros::scoring_feature;
use rphonetic::{Encoder, Soundex};

use crate::{
  matching::{Feature, extractors},
  model::{Entity, HasProperties, SearchEntity},
};

// Scores how many of the query's name parts have a Soundex code that also occurs among the
// candidate's name parts.
//
// Each element of `lhs.name_parts` is Soundex-encoded and looked up among the codes of the
// parts produced by `extractors::name_parts_flat` over `rhs.names_and_aliases()`. The
// returned value is the number of hits divided by the number of query parts, with the
// denominator floored at 1 (so a query without name parts yields 0.0).
#[scoring_feature(SoundexNameParts, name = "soundex_name_parts")]
fn score_feature(&self, bump: &Bump, lhs: &SearchEntity, rhs: &Entity) -> f64 {
  let encoder = Soundex::default();

  // Candidate-side codes are computed once up front; parts are de-duplicated before encoding
  // and the resulting codes live in the caller-provided arena.
  let candidate_codes = extractors::name_parts_flat(rhs.names_and_aliases().iter())
    .unique()
    .map(|part| encoder.encode(&part.to_string()))
    .collect_in::<Vec<_>>(bump);

  let mut hits = 0usize;

  for part in &lhs.name_parts {
    if candidate_codes.contains(&encoder.encode(part)) {
      hits += 1;
    }
  }

  // Flooring the denominator at 1 keeps the division well defined when there are no query parts.
  hits as f64 / lhs.name_parts.len().max(1) as f64
}

#[cfg(test)]
mod tests {
  use bumpalo::Bump;
  use float_cmp::approx_eq;

  use crate::{
    matching::Feature,
    model::{Entity, SearchEntity},
    tests::python::nomenklatura_comparer,
  };

  // Compares this feature's score with the value `nomenklatura_comparer` returns for
  // "logic_v1.phonetic" / "name_soundex_match" on the same pair of entities; the two must
  // agree within 0.01.
  #[test]
  #[serial_test::serial]
  fn against_nomenklatura() {
    // The Python interpreter has to be initialised before the comparer is invoked.
    pyo3::prepare_freethreaded_python();

    // The query deliberately carries the same name twice.
    let query = SearchEntity::builder("Person").properties(&[("name", &["Vladimir Putin", "Vladimir Putin"])]).build();
    let candidate = Entity::builder("Person").properties(&[("name", &["Vladymire Poutine"])]).build();

    let expected = nomenklatura_comparer("logic_v1.phonetic", "name_soundex_match", &query, &candidate).unwrap();

    let arena = Bump::new();
    let actual = super::SoundexNameParts.score_feature(&arena, &query, &candidate);

    assert!(approx_eq!(f64, expected, actual, epsilon = 0.01));
  }
}