use std::collections::HashSet;
use crate::core::db::{Database, DbError};
use super::FactIssue;
pub fn edit_distance(a: &str, b: &str) -> usize {
let (a, b) = if a.len() < b.len() { (b, a) } else { (a, b) };
let a_bytes = a.as_bytes();
let b_bytes = b.as_bytes();
if b.is_empty() {
return a.len();
}
let mut prev: Vec<usize> = (0..=b_bytes.len()).collect();
let mut curr = vec![0usize; b_bytes.len() + 1];
for (i, ac) in a_bytes.iter().enumerate() {
curr[0] = i + 1;
for (j, bc) in b_bytes.iter().enumerate() {
let cost = if ac == bc { 0 } else { 1 };
curr[j + 1] = (prev[j + 1] + 1).min(curr[j] + 1).min(prev[j] + cost);
}
std::mem::swap(&mut prev, &mut curr);
}
prev[b_bytes.len()]
}
pub fn candidates_from_text(text: &str) -> Vec<String> {
crate::aaak::codec::extract_entities(text)
.into_iter()
.filter(|s| s.len() >= 3)
.collect()
}
pub fn query_known_entities(
db: &Database,
scope: Option<(&str, Option<&str>)>,
) -> Result<Vec<String>, DbError> {
let mut seen: HashSet<String> = HashSet::new();
let triples = db.query_triples(None, None, None, false)?;
for t in triples {
if is_name_like(&t.subject) {
seen.insert(t.subject);
}
if is_name_like(&t.object) {
seen.insert(t.object);
}
}
let drawers = db.top_drawers(50)?;
for drawer in drawers {
if let Some((wing, room)) = scope {
if drawer.wing != wing {
continue;
}
if let Some(r) = room {
if drawer.room.as_deref() != Some(r) {
continue;
}
}
}
for entity in crate::aaak::codec::extract_entities(&drawer.content) {
if entity.len() >= 3 {
seen.insert(entity);
}
}
}
Ok(seen.into_iter().collect())
}
fn is_name_like(value: &str) -> bool {
value.len() >= 3
&& value.chars().next().is_some_and(|c| c.is_ascii_uppercase())
&& value.chars().all(|c| c.is_ascii_alphabetic())
}
pub fn detect_similar_name_conflicts(text_names: &[String], known: &[String]) -> Vec<FactIssue> {
let mut issues = Vec::new();
for mentioned in text_names {
let mut best: Option<(&String, usize)> = None;
for k in known {
if k == mentioned {
best = None;
break;
}
let d = edit_distance(mentioned, k);
if d <= 2 && best.map(|(_, bd)| d < bd).unwrap_or(true) {
best = Some((k, d));
}
}
if let Some((known_entity, edit)) = best {
issues.push(FactIssue::SimilarNameConflict {
mentioned: mentioned.clone(),
known_entity: known_entity.clone(),
edit_distance: edit,
});
}
}
issues
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_edit_distance_equal_zero() {
assert_eq!(edit_distance("Alice", "Alice"), 0);
assert_eq!(edit_distance("", ""), 0);
}
#[test]
fn test_edit_distance_one_substitution() {
assert_eq!(edit_distance("Bob", "Bop"), 1);
}
#[test]
fn test_edit_distance_insertion_plus_deletion() {
assert_eq!(edit_distance("Bob", "Bobby"), 2);
}
#[test]
fn test_edit_distance_empty_vs_nonempty() {
assert_eq!(edit_distance("", "Alice"), 5);
assert_eq!(edit_distance("Alice", ""), 5);
}
#[test]
fn test_similar_name_detects_bob_vs_bobby() {
let text = vec!["Bobby".to_string()];
let known = vec!["Bob".to_string(), "Alice".to_string()];
let issues = detect_similar_name_conflicts(&text, &known);
assert_eq!(issues.len(), 1);
match &issues[0] {
FactIssue::SimilarNameConflict {
mentioned,
known_entity,
edit_distance,
} => {
assert_eq!(mentioned, "Bobby");
assert_eq!(known_entity, "Bob");
assert_eq!(*edit_distance, 2);
}
other => panic!("unexpected issue: {other:?}"),
}
}
#[test]
fn test_similar_name_ignores_identical() {
let text = vec!["Alice".to_string()];
let known = vec!["Alice".to_string(), "Alicia".to_string()];
let issues = detect_similar_name_conflicts(&text, &known);
assert!(issues.is_empty());
}
#[test]
fn test_is_name_like_requires_capital_alpha() {
assert!(is_name_like("Alice"));
assert!(!is_name_like("alice"));
assert!(!is_name_like("Al")); assert!(!is_name_like("Acme Inc")); assert!(!is_name_like("Co3")); }
}