use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
/// The category of a tracked story entity.
///
/// Serialized by serde as the lowercase variant name (`"character"`,
/// `"place"`, `"artefact"`). The kind also selects which pronoun table is
/// used when attributing pronoun-only paragraphs (see `pronouns`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum EntityKind {
/// Referred back to with personal pronouns (e.g. "he", "she", "they").
Character,
/// Referred back to with "it" or locative words (e.g. "there", "here").
Place,
/// Referred back to with neuter/demonstrative pronouns (e.g. "it", "its").
Artefact,
}
impl EntityKind {
pub fn label(self) -> &'static str {
match self {
Self::Character => "character",
Self::Place => "place",
Self::Artefact => "artefact",
}
}
}
/// One passage describing an entity, as produced by `assemble_descriptions`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DescriptionSnippet {
/// Title of the chapter the passage comes from (copied from
/// `Candidate::chapter_title`).
pub chapter: String,
/// Identifier of the source paragraph.
pub paragraph: Uuid,
/// Full text of the paragraph.
pub text: String,
}
/// All description snippets gathered for a single entity.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EntityDescriptions {
/// Name of the entity the snippets describe.
pub entity: String,
/// Category of the entity.
pub kind: EntityKind,
/// The collected passages for this entity.
pub snippets: Vec<DescriptionSnippet>,
}
/// A paragraph offered to `assemble_descriptions` as a possible description
/// of an entity.
#[derive(Debug, Clone)]
pub struct Candidate {
/// Identifier of the paragraph; used to drop duplicate candidates.
pub paragraph: Uuid,
/// Sort key used to present the kept snippets in chapter order.
pub chapter_order: usize,
/// Chapter title carried over into the resulting snippet.
pub chapter_title: String,
/// Paragraph text, searched for the entity name.
pub text: String,
}
/// Selects the candidate paragraphs that describe `entity` and returns them
/// as snippets in chapter order.
///
/// A candidate is kept when its text contains the trimmed entity name as a
/// case-insensitive *substring*, or when its paragraph id is listed in
/// `coref`. Duplicate paragraph ids are collapsed to their first occurrence.
/// At most `max_snippets` candidates are kept, taken in input order, and only
/// then sorted by `chapter_order` (stable, so ties keep their input order).
///
/// Returns an empty vector when the trimmed name is empty or `max_snippets`
/// is zero.
///
/// NOTE(review): the name test is a plain substring match, so a short name
/// also matches inside longer words; `mentions` in this file is the
/// word-boundary variant — confirm the looser match is intended here.
pub fn assemble_descriptions(
    entity: &str,
    candidates: &[Candidate],
    max_snippets: usize,
    coref: &HashSet<Uuid>,
) -> Vec<DescriptionSnippet> {
    let needle = entity.trim().to_lowercase();
    if max_snippets == 0 || needle.is_empty() {
        return Vec::new();
    }

    // Paragraph ids already accepted; only relevant candidates are recorded.
    let mut taken: HashSet<Uuid> = HashSet::new();
    let mut selected: Vec<&Candidate> = candidates
        .iter()
        .filter(|cand| {
            coref.contains(&cand.paragraph) || cand.text.to_lowercase().contains(&needle)
        })
        .filter(|cand| taken.insert(cand.paragraph))
        .take(max_snippets)
        .collect();

    // The cap above follows input order; presentation follows chapter order.
    selected.sort_by_key(|cand| cand.chapter_order);

    let mut snippets = Vec::with_capacity(selected.len());
    for cand in selected {
        snippets.push(DescriptionSnippet {
            chapter: cand.chapter_title.clone(),
            paragraph: cand.paragraph,
            text: cand.text.clone(),
        });
    }
    snippets
}
fn pronouns(language: &str, kind: EntityKind) -> &'static [&'static str] {
use EntityKind::*;
match language.trim().to_lowercase().as_str() {
"russian" | "русский" => match kind {
Character => &[
"он", "его", "ему", "им", "нём", "нем", "она", "её", "ее", "ей", "ней", "они",
"их", "ими", "них",
],
Place => &["оно", "там", "тут", "здесь", "туда", "сюда"],
Artefact => &["оно", "его", "ему", "им"],
},
"french" | "français" | "francais" => match kind {
Character => &["il", "elle", "ils", "elles", "lui", "eux", "leur", "leurs"],
Place => &["y", "là", "ici", "ça", "cela"],
Artefact => &["ça", "cela", "celui", "celle"],
},
"german" | "deutsch" => match kind {
Character => &[
"er", "ihn", "ihm", "sein", "seine", "sie", "ihr", "ihre", "ihnen",
],
Place => &["es", "da", "dort", "dorthin", "hier", "dahin"],
Artefact => &["es", "dies", "dieses"],
},
"spanish" | "español" | "espanol" => match kind {
Character => &[
"él", "ella", "ellos", "ellas", "le", "les", "su", "sus", "suyo", "suya",
],
Place => &["ahí", "allí", "allá", "aquí", "acá"],
Artefact => &["ello", "eso", "esto"],
},
_ => match kind {
Character => &[
"he", "him", "his", "she", "her", "hers", "they", "them", "their", "theirs",
],
Place => &["it", "its", "there", "here"],
Artefact => &["it", "its"],
},
}
}
/// Reports whether `name_lc` occurs in `haystack_lc` as a whole word or
/// phrase.
///
/// An occurrence counts only when the characters immediately before and
/// after it are absent or non-alphanumeric (Unicode-aware), so `"sam"` does
/// not match inside `"samuel"`. Both arguments are compared as given; callers
/// are expected to pass already-lowercased text. An empty `name_lc` never
/// matches.
pub fn mentions(haystack_lc: &str, name_lc: &str) -> bool {
    if name_lc.is_empty() {
        return false;
    }
    // After a rejected hit the search resumes one character past its start
    // (not past its end), so overlapping occurrences are still examined and
    // the cursor always stays on a UTF-8 boundary.
    let step = name_lc.chars().next().map_or(1, char::len_utf8);
    let mut cursor = 0;
    loop {
        let Some(offset) = haystack_lc[cursor..].find(name_lc) else {
            return false;
        };
        let hit_start = cursor + offset;
        let hit_end = hit_start + name_lc.len();

        let glued_left = haystack_lc[..hit_start]
            .chars()
            .next_back()
            .is_some_and(char::is_alphanumeric);
        let glued_right = haystack_lc[hit_end..]
            .chars()
            .next()
            .is_some_and(char::is_alphanumeric);

        if !glued_left && !glued_right {
            return true;
        }
        cursor = hit_start + step;
    }
}
/// Splits `text_lc` on every non-alphanumeric character and returns the set
/// of distinct non-empty pieces, borrowed from the input.
fn word_set(text_lc: &str) -> HashSet<&str> {
    let mut words = HashSet::new();
    for token in text_lc.split(|ch: char| !ch.is_alphanumeric()) {
        if token.is_empty() {
            continue;
        }
        words.insert(token);
    }
    words
}
/// Attributes pronoun-only paragraphs to the entity most recently named in
/// the same chapter.
///
/// Each chapter is walked in order while keeping, per [`EntityKind`], an
/// "anchor": the single lexicon entity of that kind named by the latest
/// paragraph that named any entity of that kind.
///
/// * A paragraph that names at least one lexicon entity (whole-word,
///   case-insensitive) only updates anchors. For every kind it names, the
///   anchor becomes that entity if exactly one was named, and is cleared if
///   several were (ambiguous).
/// * A paragraph that names no lexicon entity is attributed to every current
///   anchor whose kind has one of its `language` pronouns among the
///   paragraph's words.
///
/// Anchors start empty for every chapter, so nothing carries across chapters.
///
/// # Returns
///
/// A map from paragraph id to the attributed entity names, spelled as in
/// `lexicon`. Paragraphs with no attribution are absent. The names added for
/// a paragraph are sorted and de-duplicated, so the result does not depend on
/// hash-map iteration order.
pub fn attribute_continuations(
    chapters: &[Vec<(Uuid, String)>],
    lexicon: &[(String, EntityKind)],
    language: &str,
) -> HashMap<Uuid, Vec<String>> {
    // Lowercase every lexicon name once; keep a borrow of the original
    // spelling for the output.
    let lex_lc: Vec<(&str, String, EntityKind)> = lexicon
        .iter()
        .map(|(name, kind)| (name.as_str(), name.to_lowercase(), *kind))
        .collect();

    let mut out: HashMap<Uuid, Vec<String>> = HashMap::new();
    for chapter in chapters {
        // Most recent unambiguous entity per kind, reset for each chapter.
        let mut anchor: HashMap<EntityKind, &str> = HashMap::new();
        for (pid, text) in chapter {
            let lc = text.to_lowercase();

            let mut named_by_kind: HashMap<EntityKind, Vec<&str>> = HashMap::new();
            for (name, name_lc, kind) in &lex_lc {
                if mentions(&lc, name_lc) {
                    named_by_kind.entry(*kind).or_default().push(*name);
                }
            }

            if named_by_kind.is_empty() {
                let words = word_set(&lc);
                let mut attributed: Vec<&str> = anchor
                    .iter()
                    .filter(|(kind, _)| {
                        pronouns(language, **kind)
                            .iter()
                            .any(|p| words.contains(*p))
                    })
                    .map(|(_, name)| *name)
                    .collect();
                if !attributed.is_empty() {
                    // `anchor` iterates in arbitrary order; sort so callers
                    // see a stable list, and drop a name that anchors more
                    // than one kind.
                    attributed.sort_unstable();
                    attributed.dedup();
                    out.entry(*pid)
                        .or_default()
                        .extend(attributed.into_iter().map(String::from));
                }
            } else {
                // Updates for different kinds are independent, so the
                // iteration order of `named_by_kind` does not matter here.
                for (kind, names) in &named_by_kind {
                    match names.as_slice() {
                        [only] => {
                            anchor.insert(*kind, *only);
                        }
                        _ => {
                            anchor.remove(kind);
                        }
                    }
                }
            }
        }
    }
    out
}
/// A pair of passages about the same entity that were flagged as
/// contradicting each other.
///
/// As built by `resolve_conflicts`, the `a` side is the snippet with the
/// lower index and the `b` side the one with the higher index.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DriftConflict {
/// Name of the entity the conflict is about.
pub entity: String,
/// Category of the entity.
pub kind: EntityKind,
/// Shortened quote of the first passage.
pub a: String,
/// Shortened quote of the second passage.
pub b: String,
/// Chapter of the first passage.
pub chapter_a: String,
/// Chapter of the second passage.
pub chapter_b: String,
/// Paragraph id of the second passage, when known. Omitted from the
/// serialized form when `None` and read back as `None` when absent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub paragraph_b: Option<Uuid>,
/// Explanation of how the two passages disagree.
pub detail: String,
}
/// The drift analysis result stored in the project's sidecar file (see
/// `DriftReport::sidecar_path`).
///
/// Fields marked `#[serde(default)]` may be missing from the stored JSON;
/// `conflicts` has no default and must be present for a file to load.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DriftReport {
/// Version string recorded with the report (empty when absent).
/// NOTE(review): what the version identifies is not visible here — confirm.
#[serde(default)]
pub version: String,
/// Hash of the descriptions the report was computed from; presumably the
/// value returned by `DriftReport::compute_hash` — verify against callers.
#[serde(default)]
pub content_hash: u64,
/// The conflicts found.
pub conflicts: Vec<DriftConflict>,
/// The per-entity descriptions that were analysed (empty when absent).
#[serde(default)]
pub descriptions: Vec<EntityDescriptions>,
}
impl DriftReport {
pub fn sidecar_path(project_root: &Path) -> PathBuf {
project_root.join(".inkhaven").join("drift.json")
}
pub fn load(project_root: &Path) -> std::io::Result<Self> {
let path = Self::sidecar_path(project_root);
match std::fs::read_to_string(&path) {
Ok(s) => serde_json::from_str(&s)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Self::default()),
Err(e) => Err(e),
}
}
pub fn save(&self, project_root: &Path) -> std::io::Result<()> {
let path = Self::sidecar_path(project_root);
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent)?;
}
let body = serde_json::to_vec_pretty(self)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
crate::io_atomic::write(&path, &body)
}
pub fn compute_hash(descs: &[EntityDescriptions]) -> u64 {
use std::hash::{Hash, Hasher};
let mut rows: Vec<String> = Vec::new();
for d in descs {
for s in &d.snippets {
rows.push(format!("{}\u{1}{}\u{1}{}", d.entity, s.chapter, s.text));
}
}
rows.sort();
let mut h = std::collections::hash_map::DefaultHasher::new();
for r in rows {
r.hash(&mut h);
}
h.finish()
}
}
/// Extracts `i | j | why` rows from free-form text.
///
/// Each line is trimmed and stripped of leading list markers (`-`, `*`, `•`,
/// spaces), then split on the first two `|`; anything after the second `|`
/// (including further pipes) is the explanation. Index fields may be wrapped
/// in non-digit characters such as `[1]`.
///
/// A row is kept only when both indices parse, both lie in `1..=n`, they
/// differ, and the explanation is non-empty. All other lines are skipped
/// silently.
///
/// # Returns
///
/// The kept rows in input order, with indices converted to 0-based.
pub fn parse_drift_pairs(raw: &str, n: usize) -> Vec<(usize, usize, String)> {
    /// Parses an index after peeling non-digit characters off both ends.
    fn index_of(field: &str) -> Option<usize> {
        field
            .trim_matches(|ch: char| !ch.is_ascii_digit())
            .parse()
            .ok()
    }

    let in_range = |k: usize| (1..=n).contains(&k);

    raw.lines()
        .filter_map(|row| {
            let row = row.trim().trim_start_matches(['-', '*', '•', ' ']).trim();
            let mut fields = row.splitn(3, '|').map(str::trim);
            let first = index_of(fields.next()?)?;
            let second = index_of(fields.next()?)?;
            let reason = fields.next().filter(|text| !text.is_empty())?;
            (in_range(first) && in_range(second) && first != second)
                .then(|| (first - 1, second - 1, reason.to_string()))
        })
        .collect()
}
/// Turns index pairs into [`DriftConflict`] records for one entity.
///
/// For each `(i, j, why)` the snippet with the lower index becomes side `a`
/// and the one with the higher index side `b`; `paragraph_b` points at the
/// latter. Quotes are shortened with `quote`, and `why` is trimmed into
/// `detail`. Pairs with an index outside `snippets` are skipped.
///
/// The output keeps the order of `pairs`.
pub fn resolve_conflicts(
    entity: &str,
    kind: EntityKind,
    snippets: &[DescriptionSnippet],
    pairs: &[(usize, usize, String)],
) -> Vec<DriftConflict> {
    pairs
        .iter()
        .filter_map(|(i, j, why)| {
            // Normalise so the lower index is always the "earlier" side.
            let (low, high) = if i <= j { (*i, *j) } else { (*j, *i) };
            let earlier = snippets.get(low)?;
            let later = snippets.get(high)?;
            Some(DriftConflict {
                entity: entity.to_string(),
                kind,
                a: quote(&earlier.text),
                b: quote(&later.text),
                chapter_a: earlier.chapter.clone(),
                chapter_b: later.chapter.clone(),
                paragraph_b: Some(later.paragraph),
                detail: why.trim().to_string(),
            })
        })
        .collect()
}
/// Collapses `text` onto one line and shortens it for display.
///
/// Runs of whitespace become single spaces. If the result is longer than 120
/// characters, it is cut to its first 120 characters, trailing whitespace is
/// removed from the cut, and `…` is appended.
fn quote(text: &str) -> String {
    const LIMIT: usize = 120;
    let flattened = text.split_whitespace().collect::<Vec<_>>().join(" ");
    // A character at position LIMIT exists only when the text is too long;
    // its byte offset is where the cut goes.
    match flattened.char_indices().nth(LIMIT) {
        Some((cut, _)) => format!("{}…", flattened[..cut].trim_end()),
        None => flattened,
    }
}
// Unit tests for description assembly, pronoun attribution, drift-pair
// parsing, conflict resolution and report persistence.
#[cfg(test)]
mod tests {
use super::*;
// Builds a candidate with a fresh paragraph id.
fn cand(order: usize, chapter: &str, text: &str) -> Candidate {
Candidate {
paragraph: Uuid::now_v7(),
chapter_order: order,
chapter_title: chapter.into(),
text: text.into(),
}
}
// Candidates that neither name the entity nor appear in the coref set are dropped.
#[test]
fn keeps_only_paragraphs_that_mention_the_entity() {
let cands = vec![
cand(2, "ch-2", "The Drunken Goose was cramped and smoky."),
cand(5, "ch-5", "The inn down the road smelled of woodsmoke."), cand(8, "ch-8", "By winter the Drunken Goose felt airy and bright."),
];
let out = assemble_descriptions("The Drunken Goose", &cands, 8, &HashSet::new());
assert_eq!(out.len(), 2, "the un-named inn paragraph is filtered out");
assert!(out[0].text.contains("cramped"));
assert!(out[1].text.contains("airy"));
}
// A paragraph listed in the coref set is kept even though it never names the entity.
#[test]
fn coref_attributed_paragraph_is_kept_despite_no_name() {
let pronoun_para = Uuid::now_v7();
let cands = vec![
cand(2, "ch-2", "The Drunken Goose was cramped and smoky."),
Candidate {
paragraph: pronoun_para,
chapter_order: 6,
chapter_title: "ch-6".into(),
text: "Inside, it felt airy and bright.".into(), },
];
let coref: HashSet<Uuid> = [pronoun_para].into_iter().collect();
let out = assemble_descriptions("The Drunken Goose", &cands, 8, &coref);
assert_eq!(out.len(), 2, "the coref-attributed pronoun paragraph is kept");
assert!(out[1].text.contains("airy"));
}
// Input order decides which candidates survive the cap: the first two
// distinct paragraphs (ch-9, ch-1) are kept, ch-4 is cut, and the survivors
// are then presented in chapter order.
#[test]
fn dedups_and_orders_by_chapter_then_caps_by_relevance() {
let p = Uuid::now_v7();
let dup_a = Candidate { paragraph: p, chapter_order: 9, chapter_title: "ch-9".into(), text: "Mara spoke softly.".into() };
let dup_b = Candidate { paragraph: p, chapter_order: 9, chapter_title: "ch-9".into(), text: "Mara spoke softly.".into() };
let cands = vec![
dup_a,
dup_b,
cand(1, "ch-1", "Mara, soft-spoken as ever."),
cand(4, "ch-4", "Mara's voice boomed across the hall."),
];
let out = assemble_descriptions("Mara", &cands, 2, &HashSet::new());
assert_eq!(out.len(), 2, "dup collapses, cap=2 honoured");
assert_eq!(out[0].chapter, "ch-1", "presented in chapter order");
assert_eq!(out[1].chapter, "ch-9");
}
// Degenerate inputs: an empty entity name or a zero cap yields no snippets.
#[test]
fn empty_entity_or_zero_cap_returns_nothing() {
let cands = vec![cand(1, "ch-1", "anything")];
assert!(assemble_descriptions("", &cands, 8, &HashSet::new()).is_empty());
assert!(assemble_descriptions("x", &cands, 0, &HashSet::new()).is_empty());
}
// A pronoun paragraph goes to the most recently named character; naming two
// characters at once clears the anchor, and a place anchor is not triggered
// by personal pronouns.
#[test]
fn attribute_continuations_recency_and_ambiguity() {
let p_named = Uuid::now_v7();
let p_pron = Uuid::now_v7();
let p_other = Uuid::now_v7();
let lexicon = vec![
("Mara".to_string(), EntityKind::Character),
("Joss".to_string(), EntityKind::Character),
("The Goose".to_string(), EntityKind::Place),
];
let p_ambig = Uuid::now_v7();
let p_after_ambig = Uuid::now_v7();
let chapters = vec![vec![
(p_named, "Mara crossed the yard.".to_string()),
(p_pron, "She was taller than he remembered, her hair gone grey.".to_string()),
(p_other, "The Goose stood at the corner.".to_string()),
(p_ambig, "Mara and Joss argued by the door.".to_string()),
(p_after_ambig, "She would not look at him.".to_string()),
]];
let map = attribute_continuations(&chapters, &lexicon, "english");
assert_eq!(map.get(&p_pron).map(|v| v.as_slice()), Some(&["Mara".to_string()][..]));
assert!(!map.contains_key(&p_after_ambig), "ambiguous anchor → no attribution");
assert!(map.values().all(|v| !v.contains(&"The Goose".to_string())));
}
// Anchors do not carry over from one chapter to the next.
#[test]
fn attribute_continuations_does_not_cross_chapters() {
let p_named = Uuid::now_v7();
let p_next_chapter = Uuid::now_v7();
let lexicon = vec![("Mara".to_string(), EntityKind::Character)];
let chapters = vec![
vec![(p_named, "Mara waited.".to_string())],
vec![(p_next_chapter, "She sighed.".to_string())], ];
let map = attribute_continuations(&chapters, &lexicon, "english");
assert!(!map.contains_key(&p_next_chapter), "anchor resets per chapter");
}
// The pronoun table follows the requested language: Russian prose is only
// attributed when the Russian table is selected.
#[test]
fn coref_is_multilingual_russian() {
let p_named = Uuid::now_v7();
let p_pron = Uuid::now_v7();
let lexicon = vec![("Мара".to_string(), EntityKind::Character)];
let chapters = vec![vec![
(p_named, "Мара пересекла двор.".to_string()),
(p_pron, "Она была выше, чем он помнил.".to_string()),
]];
assert!(
!attribute_continuations(&chapters, &lexicon, "english").contains_key(&p_pron),
"english pronoun set must not fire on Russian prose"
);
let ru = attribute_continuations(&chapters, &lexicon, "russian");
assert_eq!(ru.get(&p_pron).map(|v| v.as_slice()), Some(&["Мара".to_string()][..]));
}
// Whole-word matching for ASCII and Cyrillic names, including multi-word names.
#[test]
fn mentions_respects_word_boundaries_including_unicode() {
assert!(mentions("mara crossed the yard", "mara"));
assert!(!mentions("samuel spoke", "sam"), "no substring false-match");
assert!(mentions("the drunken goose was loud", "drunken goose"));
assert!(mentions("мара пересекла двор", "мара"));
assert!(!mentions("марашка сидела тихо", "мара"), "no Cyrillic substring false-match");
}
// Header rows, self-pairs, out-of-range indices and pipe-less lines are
// skipped; bracketed and bulleted rows are accepted. Indices come back 0-based.
#[test]
fn parse_drift_pairs_reads_indices_and_skips_noise() {
let raw = "\
i | j | why\n\
- [1] | [2] | cramped vs airy\n\
2 | 4 | soft vs booming\n\
3 | 3 | self-reference (dropped)\n\
9 | 1 | out of range (dropped)\n\
none\n\
gibberish without a pipe\n";
let pairs = parse_drift_pairs(raw, 4);
assert_eq!(pairs, vec![(0, 1, "cramped vs airy".into()), (1, 3, "soft vs booming".into())]);
}
// A pair given as (later, earlier) is normalised so side `a` is the
// lower-index snippet and `paragraph_b` points at the higher-index one.
#[test]
fn resolve_conflicts_orders_earlier_first_and_sets_jump() {
let s1 = DescriptionSnippet { chapter: "ch-2".into(), paragraph: Uuid::now_v7(), text: "cramped and smoky".into() };
let s2 = DescriptionSnippet { chapter: "ch-20".into(), paragraph: Uuid::now_v7(), text: "airy and bright".into() };
let snippets = vec![s1.clone(), s2.clone()];
let pairs = vec![(1, 0, "atmosphere flipped".to_string())];
let out = resolve_conflicts("The Drunken Goose", EntityKind::Place, &snippets, &pairs);
assert_eq!(out.len(), 1);
let c = &out[0];
assert_eq!(c.chapter_a, "ch-2", "earlier chapter is a");
assert_eq!(c.chapter_b, "ch-20");
assert_eq!(c.paragraph_b, Some(s2.paragraph), "jump targets the later, divergent passage");
assert_eq!(c.kind, EntityKind::Place);
assert!(c.a.contains("cramped") && c.b.contains("airy"));
}
// The hash must not depend on entity order, and a saved report must load
// back with the same conflicts and hash.
#[test]
fn report_hash_is_order_independent_and_round_trips() {
let mk = |ch: &str, t: &str| DescriptionSnippet { chapter: ch.into(), paragraph: Uuid::now_v7(), text: t.into() };
let a = EntityDescriptions { entity: "Mara".into(), kind: EntityKind::Character, snippets: vec![mk("ch-1", "soft"), mk("ch-4", "loud")] };
let b = EntityDescriptions { entity: "Goose".into(), kind: EntityKind::Place, snippets: vec![mk("ch-2", "smoky")] };
let h1 = DriftReport::compute_hash(&[a.clone(), b.clone()]);
let h2 = DriftReport::compute_hash(&[b, a]);
assert_eq!(h1, h2, "hash ignores entity/snippet order");
let dir = tempfile::tempdir().unwrap();
let report = DriftReport {
version: "x".into(),
content_hash: h1,
conflicts: vec![DriftConflict {
entity: "Mara".into(),
kind: EntityKind::Character,
a: "soft".into(),
b: "loud".into(),
chapter_a: "ch-1".into(),
chapter_b: "ch-4".into(),
paragraph_b: Some(Uuid::now_v7()),
detail: "voice flipped".into(),
}],
descriptions: Vec::new(),
};
report.save(dir.path()).unwrap();
let loaded = DriftReport::load(dir.path()).unwrap();
assert_eq!(loaded.conflicts, report.conflicts);
assert_eq!(loaded.content_hash, h1);
}
}