use std::collections::BTreeSet;
use std::path::{Path, PathBuf};
use rust_stemmers::Stemmer;
use serde::{Deserialize, Serialize};
/// One row of fact-scan output: a `claim | fact | detail` triple, tagged with
/// the chapter it was reported for.
///
/// Values are produced by [`parse_findings`] and persisted inside
/// [`FactScanReport`]. Field order is significant for the serialized form.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct FactFinding {
/// Chapter label passed to [`parse_findings`] when this row was parsed.
pub chapter: String,
/// Caller-supplied index of the chapter. Falls back to `0` when the key is
/// absent from the serialized data.
#[serde(default)]
pub chapter_index: usize,
/// First cell of the parsed row — presumably the manuscript passage under
/// suspicion (confirm against the prompt that generates the rows).
pub claim: String,
/// Second cell of the parsed row — presumably the recorded fact the claim
/// is being compared with.
pub fact: String,
/// Third cell of the parsed row; may itself contain `|` characters because
/// the parser splits into at most three cells.
pub detail: String,
}
/// Serializable collection of [`FactFinding`]s, stored as a JSON sidecar under
/// the project root (see `FactScanReport::sidecar_path`).
///
/// `Default` yields empty strings and an empty findings list.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FactScanReport {
/// Free-form version string; empty when the key is missing on load.
/// NOTE(review): presumably the version of the tool that wrote the report.
#[serde(default)]
pub version: String,
/// Free-form language name; empty when the key is missing on load.
/// NOTE(review): presumably the stemmer/manuscript language — confirm.
#[serde(default)]
pub language: String,
/// All findings in the report. Unlike the fields above this one carries no
/// `#[serde(default)]`, so the key must be present when deserializing.
pub findings: Vec<FactFinding>,
}
impl FactScanReport {
pub fn sidecar_path(project_root: &Path) -> PathBuf {
project_root.join(".inkhaven").join("facts_scan.json")
}
pub fn load(project_root: &Path) -> std::io::Result<Self> {
let path = Self::sidecar_path(project_root);
match std::fs::read_to_string(&path) {
Ok(s) => serde_json::from_str(&s)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Self::default()),
Err(e) => Err(e),
}
}
pub fn save(&self, project_root: &Path) -> std::io::Result<()> {
let path = Self::sidecar_path(project_root);
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent)?;
}
let body = serde_json::to_vec_pretty(self)
.map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
crate::io_atomic::write(&path, &body)
}
}
/// Extracts `claim | fact | detail` rows from free-form text.
///
/// Each line of `raw` is trimmed, stripped of leading `-`, `*` and `•`
/// markers, and split on `|` into at most three cells (so the detail cell
/// may itself contain pipes). A line is dropped when it
///
/// * has fewer than three cells (this covers blank lines and prose),
/// * has any empty cell,
/// * is the header row (`claim` / `fact` in the first two cells,
///   ASCII case-insensitive), or
/// * starts with the sentinel cell `none` or `no contradictions`
///   (ASCII case-insensitive).
///
/// Every surviving row becomes a [`FactFinding`] carrying the supplied
/// `chapter` and `chapter_index`, in input order.
pub fn parse_findings(raw: &str, chapter: &str, chapter_index: usize) -> Vec<FactFinding> {
    raw.lines()
        .filter_map(|row| {
            let row = row.trim().trim_start_matches(['-', '*', '•']).trim();

            // Pull the cells straight off the splitter; anything short of
            // three cells (including an empty line) is not a finding.
            let mut cells = row.splitn(3, '|').map(str::trim);
            let (claim, fact, detail) = match (cells.next(), cells.next(), cells.next()) {
                (Some(claim), Some(fact), Some(detail)) => (claim, fact, detail),
                _ => return None,
            };

            let has_blank_cell = claim.is_empty() || fact.is_empty() || detail.is_empty();
            let is_header_row =
                claim.eq_ignore_ascii_case("claim") && fact.eq_ignore_ascii_case("fact");
            let is_sentinel = claim.eq_ignore_ascii_case("none")
                || claim.eq_ignore_ascii_case("no contradictions");
            if has_blank_cell || is_header_row || is_sentinel {
                return None;
            }

            Some(FactFinding {
                chapter: chapter.to_string(),
                chapter_index,
                claim: claim.to_string(),
                fact: fact.to_string(),
                detail: detail.to_string(),
            })
        })
        .collect()
}
/// One `category | statement` row, tagged with the chapter it was parsed for.
///
/// Produced by [`parse_candidates`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FactCandidate {
/// First cell of the parsed row (e.g. `climate`, `geography` in the tests).
pub category: String,
/// Second cell of the parsed row; may contain `|` characters because the
/// parser splits into at most two cells.
pub statement: String,
/// Chapter label passed to [`parse_candidates`] when this row was parsed.
pub chapter: String,
}
/// Extracts `category | statement` rows from free-form text.
///
/// Each line of `raw` is trimmed, stripped of leading `-`, `*` and `•`
/// markers, and split at its first `|` (later pipes stay in the statement).
/// A line is dropped when it
///
/// * contains no `|` at all (this covers blank lines and prose),
/// * has an empty cell on either side,
/// * is the header row `category | statement` (ASCII case-insensitive), or
/// * has the sentinel category `none` (ASCII case-insensitive).
///
/// Every surviving row becomes a [`FactCandidate`] carrying the supplied
/// `chapter`, in input order.
pub fn parse_candidates(raw: &str, chapter: &str) -> Vec<FactCandidate> {
    raw.lines()
        .filter_map(|row| {
            let row = row.trim().trim_start_matches(['-', '*', '•']).trim();

            // Splitting once at the first pipe gives exactly the two cells we
            // need; rows without a pipe are rejected here.
            let (category, statement) = row.split_once('|')?;
            let (category, statement) = (category.trim(), statement.trim());

            let has_blank_cell = category.is_empty() || statement.is_empty();
            let is_header_row = category.eq_ignore_ascii_case("category")
                && statement.eq_ignore_ascii_case("statement");
            let is_sentinel = category.eq_ignore_ascii_case("none");
            if has_blank_cell || is_header_row || is_sentinel {
                return None;
            }

            Some(FactCandidate {
                category: category.to_string(),
                statement: statement.to_string(),
                chapter: chapter.to_string(),
            })
        })
        .collect()
}
/// Reduces `text` to a set of normalised word tokens.
///
/// The text is split on whitespace; each word has non-alphanumeric
/// characters trimmed from both ends and is then passed through
/// `crate::text::normalize_stem` together with `stemmer`. Tokens that come
/// back empty are discarded, and duplicates collapse because the result is a
/// set.
///
/// NOTE(review): `normalize_stem` is presumably where case folding and
/// stemming happen — confirm against that helper.
pub fn normalise_tokens(text: &str, stemmer: &Option<Stemmer>) -> BTreeSet<String> {
    let mut tokens = BTreeSet::new();
    for word in text.split_whitespace() {
        // Only edge punctuation is removed; interior characters are kept.
        let core = word.trim_matches(|ch: char| !ch.is_alphanumeric());
        let token = crate::text::normalize_stem(core, stemmer);
        if !token.is_empty() {
            tokens.insert(token);
        }
    }
    tokens
}
/// Reports whether two token sets overlap enough to count as duplicates.
///
/// The overlap measure is the Jaccard index, `|a ∩ b| / |a ∪ b|`; the
/// function returns `true` when it is at least `threshold`. If either set is
/// empty the answer is always `false`.
pub fn near_duplicate(a: &BTreeSet<String>, b: &BTreeSet<String>, threshold: f64) -> bool {
    if a.is_empty() || b.is_empty() {
        return false;
    }
    let shared = a.intersection(b).count();
    // |a ∪ b| = |a| + |b| − |a ∩ b|, so the union needs no second traversal.
    let combined = a.len() + b.len() - shared;
    let jaccard = shared as f64 / combined as f64;
    combined > 0 && jaccard >= threshold
}
// Unit tests for the line parsers, the sidecar round trip and the
// near-duplicate check.
#[cfg(test)]
mod tests {
use super::*;
// Two well-formed `claim | fact | detail` lines yield two findings, each
// carrying the chapter label and index that were passed in.
#[test]
fn parses_pipe_delimited_findings() {
let raw = "the capital's first snow | Climate: equatorial, no winter | the basin has no winter\n\
two days after leaving the coast | Geography: capital is 3 days' ride inland | distance understated";
let f = parse_findings(raw, "The Wharf", 2);
assert_eq!(f.len(), 2);
assert_eq!(f[0].claim, "the capital's first snow");
assert_eq!(f[0].fact, "Climate: equatorial, no winter");
assert_eq!(f[0].chapter, "The Wharf");
assert_eq!(f[0].chapter_index, 2);
assert_eq!(f[1].claim, "two days after leaving the coast");
}
// Preamble prose, a blank line, a pipe-less line, the header row and a
// two-cell row are all dropped; leading `-` / `*` markers are stripped from
// the rows that survive.
#[test]
fn skips_malformed_preamble_header_and_markers() {
let raw = "Here is what I found:\n\
\n\
- snow in the capital | Climate: tropical | no winter here\n\
this line has no pipes\n\
claim | fact | detail\n\
half a claim | only one bar\n\
* overnight ride | Geography: 3 days | far too fast";
let f = parse_findings(raw, "Ch1", 0);
assert_eq!(f.len(), 2);
assert_eq!(f[0].claim, "snow in the capital");
assert_eq!(f[1].claim, "overnight ride");
}
// A `none` row, empty input, and a pipe-less "No contradictions found."
// sentence each produce no findings.
#[test]
fn none_sentinel_yields_nothing() {
assert!(parse_findings("none | — | —", "Ch1", 0).is_empty());
assert!(parse_findings("", "Ch1", 0).is_empty());
assert!(parse_findings("No contradictions found.", "Ch1", 0).is_empty());
}
// Saving a report into a temporary directory and loading it back returns
// the same findings and language.
#[test]
fn sidecar_round_trips() {
let tmp = tempfile::tempdir().unwrap();
let report = FactScanReport {
version: "1.2.21".into(),
language: "english".into(),
findings: vec![FactFinding {
chapter: "The Wharf".into(),
chapter_index: 2,
claim: "first snow".into(),
fact: "Climate: equatorial".into(),
detail: "no winter".into(),
}],
};
report.save(tmp.path()).unwrap();
let loaded = FactScanReport::load(tmp.path()).unwrap();
assert_eq!(loaded.findings, report.findings);
assert_eq!(loaded.language, "english");
}
// Loading from a directory that has no sidecar succeeds with an empty
// findings list rather than an error.
#[test]
fn load_missing_sidecar_is_empty() {
let tmp = tempfile::tempdir().unwrap();
let r = FactScanReport::load(tmp.path()).unwrap();
assert!(r.findings.is_empty());
}
// Only the two real `category | statement` rows survive; the preamble, the
// header row, the bare `none` line and the pipe-less line are dropped.
#[test]
fn parses_category_statement_candidates() {
let raw = "Here are the world facts:\n\
- climate | The Sael basin is equatorial; no winter.\n\
geography | The capital is three days' ride inland.\n\
category | statement\n\
none\n\
half a line";
let c = parse_candidates(raw, "Arrivals");
assert_eq!(c.len(), 2);
assert_eq!(c[0].category, "climate");
assert_eq!(c[0].statement, "The Sael basin is equatorial; no winter.");
assert_eq!(c[0].chapter, "Arrivals");
assert_eq!(c[1].category, "geography");
}
// A reworded sentence clears the 0.6 threshold, an unrelated sentence does
// not, and an empty set never matches.
// NOTE(review): the first assertion relies on the English stemmer mapping
// the inflected forms to shared stems — behaviour of helpers outside this
// file.
#[test]
fn near_duplicate_detects_inflected_restatement() {
let stemmer: Option<Stemmer> = crate::config::parse_stemmer_language("english")
.map(Stemmer::create);
let a = normalise_tokens("The capital is three days' ride inland", &stemmer);
let b = normalise_tokens("the capitals are three days riding inland", &stemmer);
assert!(near_duplicate(&a, &b, 0.6));
let c = normalise_tokens("Winter lasts six months in the north", &stemmer);
assert!(!near_duplicate(&a, &c, 0.6));
assert!(!near_duplicate(&a, &BTreeSet::new(), 0.6));
}
}