use std::collections::HashMap;
use rust_stemmers::{Algorithm, Stemmer};
use unicode_segmentation::UnicodeSegmentation;
use uuid::Uuid;
use crate::store::hierarchy::Hierarchy;
use crate::store::node::NodeKind;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexCategory {
Place,
Character,
Note,
Artefact,
}
#[derive(Debug, Clone, Copy)]
pub struct LexHit {
pub col_start: usize,
pub col_end: usize,
pub category: LexCategory,
}
#[derive(Debug)]
struct CompiledName {
stems_per_word: Vec<Vec<String>>,
category: LexCategory,
}
#[derive(Default, Debug)]
pub struct Lexicon {
names: Vec<CompiledName>,
algos: Vec<Algorithm>,
}
impl Lexicon {
pub fn build(
hierarchy: &Hierarchy,
books: &[(Uuid, LexCategory)],
algorithms: Vec<Algorithm>,
) -> Self {
let mut names: Vec<CompiledName> = Vec::new();
let mut seen: HashMap<String, ()> = HashMap::new();
for (book_id, category) in books {
collect_names_into(
hierarchy,
*book_id,
*category,
&algorithms,
&mut names,
&mut seen,
);
}
names.sort_by(|a, b| b.stems_per_word.len().cmp(&a.stems_per_word.len()));
Self {
names,
algos: algorithms,
}
}
pub fn is_empty(&self) -> bool {
self.names.is_empty()
}
pub fn row_hits(&self, line: &str) -> Vec<LexHit> {
if self.is_empty() {
return Vec::new();
}
let tokens = tokenize_with_offsets(line);
if tokens.is_empty() {
return Vec::new();
}
let token_stems: Vec<Vec<String>> = tokens
.iter()
.map(|t| stems_for(&t.text, &self.algos))
.collect();
let mut out: Vec<LexHit> = Vec::new();
let mut i = 0usize;
while i < tokens.len() {
let mut matched: Option<(usize, LexCategory)> = None; for name in &self.names {
let n = name.stems_per_word.len();
if i + n > tokens.len() {
continue;
}
let mut ok = true;
for k in 0..n {
let accept = &name.stems_per_word[k];
let cand = &token_stems[i + k];
if !cand.iter().any(|s| accept.iter().any(|a| a == s)) {
ok = false;
break;
}
}
if ok {
if matched.map_or(true, |(prev_n, _)| n > prev_n) {
matched = Some((n, name.category));
}
}
}
if let Some((n, category)) = matched {
let start_char = tokens[i].char_start;
let end_char = tokens[i + n - 1].char_end;
out.push(LexHit {
col_start: start_char,
col_end: end_char,
category,
});
i += n; } else {
i += 1;
}
}
out
}
}
fn collect_names_into(
hierarchy: &Hierarchy,
root: Uuid,
category: LexCategory,
algos: &[Algorithm],
out: &mut Vec<CompiledName>,
seen: &mut HashMap<String, ()>,
) {
for id in hierarchy.collect_subtree(root) {
if id == root {
continue;
}
let Some(node) = hierarchy.get(id) else {
continue;
};
if node.kind != NodeKind::Paragraph {
continue;
}
let trimmed = node.title.trim();
if trimmed.is_empty() {
continue;
}
let key = trimmed.to_lowercase();
if seen.insert(key, ()).is_some() {
continue;
}
let words = tokenize_words(trimmed);
if words.is_empty() {
continue;
}
let stems_per_word: Vec<Vec<String>> = words
.iter()
.map(|w| stems_for(w, algos))
.collect();
out.push(CompiledName {
stems_per_word,
category,
});
}
}
fn tokenize_words(s: &str) -> Vec<String> {
s.unicode_words()
.map(|w| w.to_lowercase())
.filter(|w| !w.is_empty())
.collect()
}
#[derive(Debug, Clone)]
struct Token {
text: String,
char_start: usize,
char_end: usize,
}
fn tokenize_with_offsets(s: &str) -> Vec<Token> {
let mut out = Vec::new();
let mut byte_to_char: Vec<usize> = Vec::with_capacity(s.len() + 1);
{
let mut c = 0usize;
for (b, _) in s.char_indices() {
while byte_to_char.len() < b {
byte_to_char.push(c);
}
byte_to_char.push(c);
c += 1;
}
while byte_to_char.len() <= s.len() {
byte_to_char.push(c);
}
}
for (b, w) in s.unicode_word_indices() {
let start = byte_to_char[b];
let end = byte_to_char[b + w.len()];
let text = w.to_lowercase();
if !text.is_empty() {
out.push(Token {
text,
char_start: start,
char_end: end,
});
}
}
out
}
fn stems_for(word: &str, algos: &[Algorithm]) -> Vec<String> {
let lc = word.to_lowercase();
let mut out: Vec<String> = Vec::with_capacity(1 + algos.len());
out.push(lc.clone());
for a in algos {
let s = Stemmer::create(*a).stem(&lc).into_owned();
if !out.contains(&s) {
out.push(s);
}
}
out
}
#[cfg(test)]
mod tests {
use super::*;
use rust_stemmers::Algorithm;
fn build_test_lex(
places: &[&str],
characters: &[&str],
algos: Vec<Algorithm>,
) -> Lexicon {
build_test_lex_cats(&[
(LexCategory::Place, places),
(LexCategory::Character, characters),
], algos)
}
fn build_test_lex_cats(
groups: &[(LexCategory, &[&str])],
algos: Vec<Algorithm>,
) -> Lexicon {
let mut names: Vec<CompiledName> = Vec::new();
for (cat, list) in groups {
for n in *list {
let words = tokenize_words(n);
if words.is_empty() {
continue;
}
names.push(CompiledName {
stems_per_word: words.iter().map(|w| stems_for(w, &algos)).collect(),
category: *cat,
});
}
}
names.sort_by(|a, b| b.stems_per_word.len().cmp(&a.stems_per_word.len()));
Lexicon { names, algos }
}
#[test]
fn notes_and_artefacts_hit_with_distinct_categories() {
let lex = build_test_lex_cats(
&[
(LexCategory::Note, &["dragonglass"]),
(LexCategory::Artefact, &["valyrian steel"]),
],
vec![Algorithm::English],
);
let hits = lex.row_hits("She wielded the Valyrian Steel against the Dragonglass.");
let cats: Vec<LexCategory> = hits.iter().map(|h| h.category).collect();
assert!(cats.contains(&LexCategory::Note), "got: {hits:?}");
assert!(cats.contains(&LexCategory::Artefact), "got: {hits:?}");
}
#[test]
fn english_inflections_match() {
let lex = build_test_lex(&["cities"], &[], vec![Algorithm::English]);
let hits = lex.row_hits("She walked through the city at dawn.");
assert_eq!(hits.len(), 1, "expected one hit, got {hits:?}");
}
#[test]
fn russian_inflections_of_moscow() {
let lex = build_test_lex(&[], &["Москва"], vec![Algorithm::Russian]);
let line = "Из Москвы в Москве и снова в Москвою.";
let hits = lex.row_hits(line);
assert!(
hits.len() >= 2,
"expected at least two Москва forms, got {hits:?}"
);
for h in &hits {
assert_eq!(h.category, LexCategory::Character);
}
}
#[test]
fn whole_word_via_segmentation() {
let lex = build_test_lex(&[], &["Tom"], vec![Algorithm::English]);
let hits = lex.row_hits("Tomorrow is another day.");
assert!(hits.is_empty(), "got unexpected hits: {hits:?}");
let hits2 = lex.row_hits("Then Tom arrived.");
assert_eq!(hits2.len(), 1);
}
#[test]
fn multi_word_phrase() {
let lex = build_test_lex(&["King's Landing"], &[], vec![Algorithm::English]);
let line = "She returned to King's Landing alone.";
let hits = lex.row_hits(line);
assert_eq!(hits.len(), 1);
let matched: String = line
.chars()
.skip(hits[0].col_start)
.take(hits[0].col_end - hits[0].col_start)
.collect();
assert!(matched.contains("King") && matched.contains("Landing"));
}
#[test]
fn longest_match_wins() {
let lex = build_test_lex(
&[],
&["Robb", "Robb Stark"],
vec![Algorithm::English],
);
let hits = lex.row_hits("Robb Stark rode north.");
assert_eq!(hits.len(), 1);
assert!(hits[0].col_end - hits[0].col_start >= "Robb Stark".len());
}
#[test]
fn empty_lexicon_no_hits() {
let lex = Lexicon::default();
assert!(lex.is_empty());
assert!(lex.row_hits("anything").is_empty());
}
#[test]
fn empty_algos_falls_back_to_exact() {
let lex = build_test_lex(&["Rohan"], &[], vec![]);
let hits = lex.row_hits("Rohan rides.");
assert_eq!(hits.len(), 1);
let none = lex.row_hits("Rohans ride.");
assert!(none.is_empty());
}
}