use std::collections::BTreeMap;
use std::fs;
use std::io;
use std::path::Path;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use super::types::PrefixTier;
use crate::index::AnnotationId;
/// Schema version that `CacheMeta::default()` stamps into `CacheMeta::schema_version`.
pub const SCHEMA_VERSION: u32 = 1;
/// In-memory form of the canon-matches cache document: one metadata block plus one
/// [`CacheEntry`] per annotation id.
///
/// Loaded with [`CanonMatchesFile::read`] and persisted with
/// [`CanonMatchesFile::write_atomic`], both of which use TOML.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
pub struct CanonMatchesFile {
/// File-level metadata, stored under the `__meta__` key. When that key is absent on
/// input, `CacheMeta::default()` is used instead.
#[serde(rename = "__meta__", default)]
pub meta: CacheMeta,
/// Per-annotation entries. The map is flattened, so each annotation id is a top-level
/// key of the document, sitting next to `__meta__`.
#[serde(flatten)]
pub entries: BTreeMap<AnnotationId, CacheEntry>,
}
/// File-level metadata for a [`CanonMatchesFile`].
///
/// Unknown keys are rejected on deserialization (`deny_unknown_fields`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct CacheMeta {
/// Version of the on-disk schema. This field has no serde default, so a metadata table
/// that is present but omits it fails to parse.
pub schema_version: u32,
/// Optional version string; left out of the serialized form when `None`.
/// NOTE(review): presumably the canon release the cache was built against — confirm.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub canon_version: Option<String>,
/// Optional timestamp kept as a plain string (not parsed or validated here); left out of
/// the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_fetched: Option<String>,
}
impl Default for CacheMeta {
fn default() -> Self {
Self {
schema_version: SCHEMA_VERSION,
canon_version: None,
last_fetched: None,
}
}
}
/// Cached match state for a single annotation.
///
/// Unknown keys are rejected on deserialization. Each of the three match lists defaults to
/// empty on input and is left out of the serialized form while empty.
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct CacheEntry {
/// Hash recorded for this entry; `is_fresh_for` compares it against the caller's current
/// text hash.
pub last_match_text_hash: String,
/// Timestamp kept as a plain string (not parsed or validated here).
pub canon_fetched_at: String,
/// Candidate matches carrying a [`Disposition`].
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub pending_matches: Vec<PendingMatch>,
/// Matches that have been accepted.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub accepted_matches: Vec<AcceptedMatch>,
/// Matches that have been rejected; looked up by `is_rejected`.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub rejected_matches: Vec<RejectedMatch>,
}
/// A candidate canon match recorded for an annotation, together with its current
/// [`Disposition`].
///
/// Unknown keys are rejected on deserialization. Only `PartialEq` is derived because
/// `confidence` is an `f64`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct PendingMatch {
/// Identifier of the matched canon entry; passed with `version` to the activity check in
/// `CacheEntry::is_fresh_for`.
pub canon_id: String,
/// Version of the matched canon entry.
pub version: String,
/// Text of the canon entry as stored with the match.
pub canonical_text: String,
/// NOTE(review): presumably the canon release this match came from, as opposed to the
/// per-entry `version` — confirm.
pub canon_version: String,
/// Match confidence. NOTE(review): the 0.0..=1.0 range is an assumption; nothing here
/// validates it.
pub confidence: f64,
/// Prefix tier of the matched entry.
pub prefix_tier: PrefixTier,
/// Optional free-form string; left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub backed_by: Option<String>,
/// Optional free-form string; left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub linked: Option<String>,
/// Current review state of this candidate.
pub disposition: Disposition,
/// Timestamp kept as a plain string (not parsed or validated here).
pub found_at: String,
/// NOTE(review): presumably names the tool or command that produced the match — confirm.
pub found_by: String,
}
// `Eq` cannot be derived because `confidence` is an `f64`, yet `CacheEntry` derives `Eq`
// and holds a `Vec<PendingMatch>`, so the marker is supplied by hand.
// NOTE(review): this is only sound while `confidence` is never NaN; nothing visible here
// enforces that.
impl Eq for PendingMatch {}
/// Review state of a [`PendingMatch`].
///
/// Serialized in kebab-case, i.e. `"open"`, `"skipped"` and `"accepted"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "kebab-case")]
pub enum Disposition {
/// Serialized as `"open"`.
Open,
/// Serialized as `"skipped"`.
Skipped,
/// Serialized as `"accepted"`.
Accepted,
}
/// A canon match that has been accepted for an annotation.
///
/// Shares its leading fields with [`PendingMatch`], but carries `accepted_at` / `bound_at`
/// in place of `disposition` / `found_at` / `found_by`. Unknown keys are rejected on
/// deserialization. Only `PartialEq` is derived because `confidence` is an `f64`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct AcceptedMatch {
/// Identifier of the matched canon entry; passed with `version` to the activity check in
/// `CacheEntry::is_fresh_for`.
pub canon_id: String,
/// Version of the matched canon entry.
pub version: String,
/// Text of the canon entry as stored with the match.
pub canonical_text: String,
/// NOTE(review): presumably the canon release this match came from, as opposed to the
/// per-entry `version` — confirm.
pub canon_version: String,
/// Match confidence. NOTE(review): the 0.0..=1.0 range is an assumption; nothing here
/// validates it.
pub confidence: f64,
/// Prefix tier of the matched entry.
pub prefix_tier: PrefixTier,
/// Optional free-form string; left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub backed_by: Option<String>,
/// Optional free-form string; left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub linked: Option<String>,
/// Timestamp kept as a plain string (not parsed or validated here).
pub accepted_at: String,
/// Timestamp kept as a plain string (not parsed or validated here).
pub bound_at: String,
}
// `Eq` cannot be derived because `confidence` is an `f64`, yet `CacheEntry` derives `Eq`
// and holds a `Vec<AcceptedMatch>`, so the marker is supplied by hand.
// NOTE(review): this is only sound while `confidence` is never NaN; nothing visible here
// enforces that.
impl Eq for AcceptedMatch {}
/// A canon match that has been rejected for an annotation.
///
/// Unknown keys are rejected on deserialization.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct RejectedMatch {
/// Identifier of the rejected canon entry; one half of the key used by
/// `CacheEntry::is_rejected`.
pub canon_id: String,
/// Version of the canon entry at rejection time. `CacheEntry::is_rejected` does not
/// consult it.
pub version: String,
/// Text hash the rejection applies to; the other half of the key used by
/// `CacheEntry::is_rejected`.
pub text_hash: String,
/// Timestamp kept as a plain string (not parsed or validated here).
pub rejected_at: String,
/// Optional free-form explanation; left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub reason: Option<String>,
}
impl CanonMatchesFile {
    /// Loads the cache file at `path`, parsing it as TOML.
    ///
    /// A missing file is not an error: it yields `Self::default()` so that callers can
    /// treat "no cache yet" the same as "empty cache".
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error for any read failure other than `NotFound`, and an
    /// `io::Error` whose message starts with `parse <path>:` when the contents do not
    /// deserialize.
    pub fn read(path: &Path) -> io::Result<Self> {
        match fs::read_to_string(path) {
            Ok(raw) => toml::from_str(&raw)
                .map_err(|e| io::Error::other(format!("parse {}: {e}", path.display()))),
            Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(Self::default()),
            Err(e) => Err(e),
        }
    }

    /// Serializes `self` as pretty TOML and installs it at `path` via a temporary file and
    /// a rename, so that `path` is never left holding a partially written document.
    ///
    /// Missing parent directories are created first. The staging file sits next to `path`
    /// with its extension replaced by `toml.tmp`; the name is fixed, so concurrent writers
    /// targeting the same `path` are not isolated from one another.
    ///
    /// # Errors
    ///
    /// Returns an `io::Error` if the parent directory cannot be created, serialization
    /// fails, or the staging write or rename fails. If the staging write or rename fails,
    /// the staging file is removed on a best-effort basis so it does not linger on disk.
    pub fn write_atomic(&self, path: &Path) -> io::Result<()> {
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent)?;
        }
        let toml_text = toml::to_string_pretty(self)
            .map_err(|e| io::Error::other(format!("serialize canon-matches: {e}")))?;
        let tmp = path.with_extension("toml.tmp");

        let installed =
            fs::write(&tmp, toml_text.as_bytes()).and_then(|()| fs::rename(&tmp, path));
        if installed.is_err() {
            // Best-effort cleanup: the original failure is what the caller needs to see,
            // so a secondary error from the removal is deliberately dropped.
            let _ = fs::remove_file(&tmp);
        }
        installed
    }
}
impl CacheEntry {
    /// Reports whether this cached entry can be reused as-is.
    ///
    /// The entry is fresh when `current_text_hash` equals `last_match_text_hash` and
    /// `is_version_active(canon_id, version)` returns `true` for every pending and every
    /// accepted match. Rejected matches are never consulted.
    ///
    /// `is_version_active` is not called at all when the hashes differ. Otherwise it is
    /// called for pending matches first, then accepted matches, each in stored order, and
    /// evaluation stops at the first pair it reports as inactive. An entry with no pending
    /// or accepted matches is fresh whenever the hashes agree.
    pub fn is_fresh_for(
        &self,
        current_text_hash: &str,
        is_version_active: impl Fn(&str, &str) -> bool,
    ) -> bool {
        if self.last_match_text_hash != current_text_hash {
            return false;
        }
        let pending = self
            .pending_matches
            .iter()
            .map(|m| (m.canon_id.as_str(), m.version.as_str()));
        let accepted = self
            .accepted_matches
            .iter()
            .map(|m| (m.canon_id.as_str(), m.version.as_str()));
        // Walk both lists lazily rather than collecting them into temporary vectors;
        // `all` short-circuits on the first inactive version.
        pending
            .chain(accepted)
            .all(|(canon_id, version)| is_version_active(canon_id, version))
    }

    /// Reports whether a rejection is recorded for exactly this `canon_id` / `text_hash`
    /// pair. The rejection's `version` and `reason` play no part in the lookup.
    pub fn is_rejected(&self, canon_id: &str, text_hash: &str) -> bool {
        self.rejected_matches
            .iter()
            .any(|r| r.canon_id == canon_id && r.text_hash == text_hash)
    }
}
// Unit tests for the cache file model: TOML round-trips, on-disk read/write behaviour, and
// the freshness / rejection lookups on `CacheEntry`.
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
// Parses `s` into an `AnnotationId`, panicking with the input and the error on failure.
fn aid(s: &str) -> AnnotationId {
AnnotationId::parse(s).unwrap_or_else(|e| panic!("parse {s:?}: {e}"))
}
// Fixture: a fully populated `PendingMatch` (Aristos tier, `Open` disposition, both
// optional fields set).
fn sample_pending() -> PendingMatch {
PendingMatch {
canon_id: "cell_written_exactly_once_per_page_edit".into(),
version: "v0.2.1".into(),
canonical_text: "edit_page writes each cell exactly once".into(),
canon_version: "v0.2.0".into(),
confidence: 0.92,
prefix_tier: PrefixTier::Aristos,
backed_by: Some("specialized neural checker".into()),
linked: Some("arta_a1b2c3d4".into()),
disposition: Disposition::Open,
found_at: "2026-06-15T09:14:22Z".into(),
found_by: "aristo stamp".into(),
}
}
// Fixture: a fully populated `AcceptedMatch` with the same `canon_id` and `version` as
// `sample_pending`.
fn sample_accepted() -> AcceptedMatch {
AcceptedMatch {
canon_id: "cell_written_exactly_once_per_page_edit".into(),
version: "v0.2.1".into(),
canonical_text: "edit_page writes each cell exactly once".into(),
canon_version: "v0.2.0".into(),
confidence: 1.0,
prefix_tier: PrefixTier::Aristos,
backed_by: Some("specialized neural checker".into()),
linked: Some("arta_a1b2c3d4".into()),
accepted_at: "2026-06-15T09:20:00Z".into(),
bound_at: "2026-06-15T09:20:00Z".into(),
}
}
// Fixture: a `RejectedMatch` with a reason attached.
fn sample_rejected() -> RejectedMatch {
RejectedMatch {
canon_id: "some_unrelated_entry".into(),
version: "v0.1.2".into(),
text_hash: "blake3:c2f7a912".into(),
rejected_at: "2026-06-13T11:48:00Z".into(),
reason: Some("intentionally narrower than canon entry".into()),
}
}
// A default file survives a TOML round-trip and keeps the current schema version.
#[test]
fn empty_file_round_trips() {
let f = CanonMatchesFile::default();
let s = toml::to_string(&f).unwrap();
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
assert_eq!(back.meta.schema_version, SCHEMA_VERSION);
}
// Every `Disposition` variant maps to its lowercase wire name and back (checked via JSON).
#[test]
fn disposition_all_variants_round_trip_as_kebab_case() {
for (variant, wire) in [
(Disposition::Open, "\"open\""),
(Disposition::Skipped, "\"skipped\""),
(Disposition::Accepted, "\"accepted\""),
] {
let s = serde_json::to_string(&variant).unwrap();
assert_eq!(s, wire, "variant {variant:?} should serialize as {wire}");
let back: Disposition = serde_json::from_str(wire).unwrap();
assert_eq!(back, variant);
}
}
// A pending match may carry the `Accepted` disposition and still round-trips through TOML.
#[test]
fn pending_match_with_accepted_disposition_round_trips() {
let mut p = sample_pending();
p.disposition = Disposition::Accepted;
let mut f = CanonMatchesFile::default();
f.entries.insert(
aid("edit_page_cell_write_invariant"),
CacheEntry {
last_match_text_hash: "blake3:7f3a9e2c".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![p],
accepted_matches: vec![],
rejected_matches: vec![],
},
);
let s = toml::to_string(&f).unwrap();
assert!(s.contains("disposition = \"accepted\""), "got: {s}");
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
}
// An Aristos-tier pending match round-trips with its tier and `backed_by` intact.
#[test]
fn pending_match_round_trips_with_aristos_tier() {
let mut f = CanonMatchesFile::default();
f.entries.insert(
aid("edit_page_cell_write_invariant"),
CacheEntry {
last_match_text_hash: "blake3:7f3a9e2c".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![sample_pending()],
accepted_matches: vec![],
rejected_matches: vec![],
},
);
let s = toml::to_string(&f).unwrap();
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
let entry = &back.entries[&aid("edit_page_cell_write_invariant")];
assert_eq!(entry.pending_matches[0].prefix_tier, PrefixTier::Aristos);
assert_eq!(
entry.pending_matches[0].backed_by.as_deref(),
Some("specialized neural checker")
);
}
// With `backed_by` set to `None`, the key is absent from the serialized TOML and reads
// back as `None`.
#[test]
fn kanon_tier_pending_omits_backed_by_in_serialized_form() {
let mut p = sample_pending();
p.prefix_tier = PrefixTier::Kanon;
p.backed_by = None;
let mut f = CanonMatchesFile::default();
f.entries.insert(
aid("foo"),
CacheEntry {
last_match_text_hash: "blake3:abc".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![p],
accepted_matches: vec![],
rejected_matches: vec![],
},
);
let s = toml::to_string(&f).unwrap();
assert!(!s.contains("backed_by"), "got: {s}");
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back.entries[&aid("foo")].pending_matches[0].backed_by, None);
}
// An accepted match with `linked` set writes that key and round-trips.
#[test]
fn accepted_match_round_trips_with_linked_field() {
let mut f = CanonMatchesFile::default();
f.entries.insert(
aid("aristos:cell_written_exactly_once_per_page_edit"),
CacheEntry {
last_match_text_hash: "blake3:9d4e2f01".into(),
canon_fetched_at: "2026-06-15T09:30:00Z".into(),
pending_matches: vec![],
accepted_matches: vec![sample_accepted()],
rejected_matches: vec![],
},
);
let s = toml::to_string(&f).unwrap();
assert!(s.contains("linked = \"arta_a1b2c3d4\""), "got: {s}");
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
}
// An accepted match with `linked == None` writes no `linked` key and still round-trips.
#[test]
fn accepted_match_without_linked_omits_field_on_disk() {
let mut acc = sample_accepted();
acc.linked = None;
let mut f = CanonMatchesFile::default();
f.entries.insert(
aid("aristos:foo"),
CacheEntry {
last_match_text_hash: "blake3:abc".into(),
canon_fetched_at: "2026-06-15T09:30:00Z".into(),
pending_matches: vec![],
accepted_matches: vec![acc],
rejected_matches: vec![],
},
);
let s = toml::to_string(&f).unwrap();
assert!(!s.contains("linked ="), "got: {s}");
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
}
// Annotation ids containing a `prefix:` are emitted as quoted TOML keys and round-trip.
#[test]
fn canon_prefixed_keys_round_trip() {
let mut f = CanonMatchesFile::default();
f.entries.insert(
aid("aristos:cell_written_exactly_once_per_page_edit"),
CacheEntry {
last_match_text_hash: "blake3:9d4e2f01".into(),
canon_fetched_at: "2026-06-15T09:30:00Z".into(),
pending_matches: vec![],
accepted_matches: vec![sample_accepted()],
rejected_matches: vec![],
},
);
f.entries.insert(
aid("kanon:checkout_total_non_negative"),
CacheEntry {
last_match_text_hash: "blake3:a4f721e8".into(),
canon_fetched_at: "2026-06-15T09:30:00Z".into(),
pending_matches: vec![],
accepted_matches: vec![AcceptedMatch {
canon_id: "checkout_total_non_negative".into(),
version: "v0.1.0".into(),
canonical_text: "checkout total is non-negative".into(),
canon_version: "v0.2.0".into(),
confidence: 0.94,
prefix_tier: PrefixTier::Kanon,
backed_by: None,
linked: None,
accepted_at: "2026-06-14T17:02:11Z".into(),
bound_at: "2026-06-14T17:02:11Z".into(),
}],
rejected_matches: vec![],
},
);
let s = toml::to_string(&f).unwrap();
assert!(s.contains("\"aristos:cell_written"), "got: {s}");
assert!(s.contains("\"kanon:checkout_total"), "got: {s}");
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
}
// An entry holding only a rejected match round-trips through TOML.
#[test]
fn rejected_match_round_trip() {
let mut f = CanonMatchesFile::default();
f.entries.insert(
aid("my_local_invariant"),
CacheEntry {
last_match_text_hash: "blake3:c2f7a912".into(),
canon_fetched_at: "2026-06-13T10:15:00Z".into(),
pending_matches: vec![],
accepted_matches: vec![],
rejected_matches: vec![sample_rejected()],
},
);
let s = toml::to_string(&f).unwrap();
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
}
// A metadata block with both optional fields set serializes them and round-trips.
#[test]
fn meta_block_round_trips_with_canon_version() {
let f = CanonMatchesFile {
meta: CacheMeta {
schema_version: 1,
canon_version: Some("v0.2.0".into()),
last_fetched: Some("2026-06-15T09:14:22Z".into()),
},
entries: BTreeMap::new(),
};
let s = toml::to_string(&f).unwrap();
assert!(s.contains("schema_version = 1"));
assert!(s.contains("canon_version = \"v0.2.0\""));
let back: CanonMatchesFile = toml::from_str(&s).unwrap();
assert_eq!(back, f);
}
// A pending match carrying an extra key must fail to parse.
// NOTE(review): the test only asserts that parsing fails; it does not check that the
// error is about `unknown_field` rather than some other part of the fixture.
#[test]
fn unknown_field_inside_pending_match_rejected() {
let toml_text = r#"
[__meta__]
schema_version = 1
[foo]
last_match_text_hash = "blake3:x"
canon_fetched_at = "2026-06-15T09:14:22Z"
[[foo.pending_matches]]
canon_id = "x"
version = "v0.1.0"
canonical_text = "y"
canon_version = "v0.2.0"
confidence = 0.9
prefix_tier = "aristos:"
linked = "arta_x"
disposition = "open"
found_at = "2026-06-15T09:14:22Z"
found_by = "aristo stamp"
unknown_field = "should reject"
"#;
let result: Result<CanonMatchesFile, _> = toml::from_str(toml_text);
assert!(result.is_err(), "expected deny_unknown_fields rejection");
}
// A hand-written sample document covering all three match kinds, plus plain and prefixed
// keys, must parse; spot-checks keys, tiers and optional fields afterwards.
#[test]
fn worked_example_matches_locked_sample_shape() {
let raw = r#"
[__meta__]
schema_version = 1
canon_version = "v0.2.0"
last_fetched = "2026-06-15T09:14:22Z"
[edit_page_cell_write_invariant]
last_match_text_hash = "blake3:7f3a9e2c..."
canon_fetched_at = "2026-06-15T09:14:22Z"
[[edit_page_cell_write_invariant.pending_matches]]
canon_id = "cell_written_exactly_once_per_page_edit"
version = "v0.2.1"
canonical_text = "edit_page writes each cell exactly once"
canon_version = "v0.2.0"
confidence = 0.92
prefix_tier = "aristos:"
backed_by = "specialized neural checker"
linked = "arta_a1b2c3d4..."
disposition = "open"
found_at = "2026-06-15T09:14:22Z"
found_by = "aristo stamp"
["aristos:cell_written_exactly_once_per_page_edit"]
last_match_text_hash = "blake3:9d4e2f01..."
canon_fetched_at = "2026-06-15T09:30:00Z"
[["aristos:cell_written_exactly_once_per_page_edit".accepted_matches]]
canon_id = "cell_written_exactly_once_per_page_edit"
version = "v0.2.1"
canonical_text = "edit_page writes each cell exactly once"
canon_version = "v0.2.0"
confidence = 1.0
prefix_tier = "aristos:"
backed_by = "specialized neural checker"
linked = "arta_a1b2c3d4..."
accepted_at = "2026-06-15T09:20:00Z"
bound_at = "2026-06-15T09:20:00Z"
["kanon:checkout_total_non_negative"]
last_match_text_hash = "blake3:a4f721e8..."
canon_fetched_at = "2026-06-15T09:30:00Z"
[["kanon:checkout_total_non_negative".accepted_matches]]
canon_id = "checkout_total_non_negative"
version = "v0.1.0"
canonical_text = "checkout total is non-negative"
canon_version = "v0.2.0"
confidence = 0.94
prefix_tier = "kanon:"
accepted_at = "2026-06-14T17:02:11Z"
bound_at = "2026-06-14T17:02:11Z"
[my_local_invariant]
last_match_text_hash = "blake3:c2f7a912..."
canon_fetched_at = "2026-06-13T10:15:00Z"
[[my_local_invariant.rejected_matches]]
canon_id = "some_unrelated_entry"
version = "v0.1.2"
text_hash = "blake3:c2f7a912..."
rejected_at = "2026-06-13T11:48:00Z"
reason = "intentionally narrower than canon entry"
"#;
let parsed: CanonMatchesFile =
toml::from_str(raw).unwrap_or_else(|e| panic!("locked sample shape must parse: {e}"));
assert!(parsed
.entries
.contains_key(&aid("edit_page_cell_write_invariant")));
assert!(parsed
.entries
.contains_key(&aid("aristos:cell_written_exactly_once_per_page_edit")));
assert!(parsed
.entries
.contains_key(&aid("kanon:checkout_total_non_negative")));
assert!(parsed.entries.contains_key(&aid("my_local_invariant")));
assert_eq!(parsed.meta.canon_version.as_deref(), Some("v0.2.0"));
let kanon_entry = &parsed.entries[&aid("kanon:checkout_total_non_negative")];
assert_eq!(kanon_entry.accepted_matches[0].backed_by, None);
assert_eq!(
kanon_entry.accepted_matches[0].prefix_tier,
PrefixTier::Kanon
);
let aristos_entry =
&parsed.entries[&aid("aristos:cell_written_exactly_once_per_page_edit")];
assert_eq!(
aristos_entry.accepted_matches[0].linked.as_deref(),
Some("arta_a1b2c3d4..."),
);
assert_eq!(kanon_entry.accepted_matches[0].linked, None);
}
// Reading a path that does not exist yields the default file rather than an error.
#[test]
fn read_missing_file_returns_default() {
let tmp = TempDir::new().unwrap();
let p = tmp.path().join("never-existed.toml");
let f = CanonMatchesFile::read(&p).unwrap();
assert_eq!(f, CanonMatchesFile::default());
}
// `write_atomic` followed by `read` reproduces the original value.
#[test]
fn read_then_write_round_trips_through_disk() {
let tmp = TempDir::new().unwrap();
let p = tmp.path().join(".aristo/canon-matches.toml");
let mut original = CanonMatchesFile::default();
original.entries.insert(
aid("foo"),
CacheEntry {
last_match_text_hash: "blake3:x".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![sample_pending()],
accepted_matches: vec![],
rejected_matches: vec![],
},
);
original.meta.canon_version = Some("v0.2.0".into());
original.write_atomic(&p).unwrap();
let loaded = CanonMatchesFile::read(&p).unwrap();
assert_eq!(loaded, original);
}
// `write_atomic` creates missing parent directories before writing.
#[test]
fn write_atomic_creates_parent_directory() {
let tmp = TempDir::new().unwrap();
let p = tmp.path().join("deep/nested/.aristo/canon-matches.toml");
assert!(!p.parent().unwrap().exists());
CanonMatchesFile::default().write_atomic(&p).unwrap();
assert!(p.exists());
}
// `write_atomic` overwrites whatever was previously at the target path.
#[test]
fn write_atomic_replaces_existing_file() {
let tmp = TempDir::new().unwrap();
let p = tmp.path().join("canon-matches.toml");
fs::write(&p, b"junk that should be overwritten").unwrap();
let f = CanonMatchesFile {
meta: CacheMeta {
schema_version: 1,
canon_version: Some("v0.2.0".into()),
last_fetched: None,
},
entries: BTreeMap::new(),
};
f.write_atomic(&p).unwrap();
let loaded = CanonMatchesFile::read(&p).unwrap();
assert_eq!(loaded, f);
}
// After a successful write the `.toml.tmp` staging file no longer exists.
#[test]
fn write_atomic_leaves_no_tmp_file_on_success() {
let tmp = TempDir::new().unwrap();
let p = tmp.path().join("canon-matches.toml");
CanonMatchesFile::default().write_atomic(&p).unwrap();
let tmp_path = p.with_extension("toml.tmp");
assert!(!tmp_path.exists(), "tmp file should have been renamed");
}
// Unparseable contents surface as an error mentioning "parse", not as a silent default.
#[test]
fn malformed_file_returns_io_error_not_default() {
let tmp = TempDir::new().unwrap();
let p = tmp.path().join("canon-matches.toml");
fs::write(&p, b"this is not TOML = = =").unwrap();
let err = CanonMatchesFile::read(&p).unwrap_err();
assert!(err.to_string().contains("parse"), "got: {err}");
}
// Matching hash plus an always-active version check means the entry is fresh.
#[test]
fn cache_hit_when_text_hash_matches_and_versions_active() {
let entry = CacheEntry {
last_match_text_hash: "blake3:x".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![sample_pending()],
accepted_matches: vec![],
rejected_matches: vec![],
};
let is_active = |_cid: &str, _v: &str| true;
assert!(entry.is_fresh_for("blake3:x", is_active));
}
// A different text hash makes the entry stale regardless of version activity.
#[test]
fn cache_miss_when_text_hash_changes() {
let entry = CacheEntry {
last_match_text_hash: "blake3:x".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![],
accepted_matches: vec![],
rejected_matches: vec![],
};
assert!(!entry.is_fresh_for("blake3:DIFFERENT", |_, _| true));
}
// A pending match whose version ("v0.2.1") is reported inactive makes the entry stale.
#[test]
fn cache_miss_when_pending_match_version_no_longer_active() {
let entry = CacheEntry {
last_match_text_hash: "blake3:x".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![sample_pending()], accepted_matches: vec![],
rejected_matches: vec![],
};
let is_active = |_cid: &str, version: &str| version != "v0.2.1";
assert!(!entry.is_fresh_for("blake3:x", is_active));
}
// An accepted match whose version ("v0.2.1") is reported inactive makes the entry stale.
#[test]
fn cache_miss_when_accepted_match_version_no_longer_active() {
let entry = CacheEntry {
last_match_text_hash: "blake3:x".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![],
accepted_matches: vec![sample_accepted()], rejected_matches: vec![],
};
let is_active = |_cid: &str, version: &str| version != "v0.2.1";
assert!(!entry.is_fresh_for("blake3:x", is_active));
}
// Rejected matches are not version-checked: with only a rejection present, the entry is
// fresh even when the activity check always returns false.
#[test]
fn rejected_matches_dont_affect_freshness() {
let entry = CacheEntry {
last_match_text_hash: "blake3:x".into(),
canon_fetched_at: "2026-06-15T09:14:22Z".into(),
pending_matches: vec![],
accepted_matches: vec![],
rejected_matches: vec![sample_rejected()],
};
let is_active = |_cid: &str, _v: &str| false;
assert!(entry.is_fresh_for("blake3:x", is_active));
}
// `is_rejected` requires both the canon id and the text hash to match.
#[test]
fn is_rejected_matches_canon_id_and_text_hash() {
let entry = CacheEntry {
last_match_text_hash: "blake3:x".into(),
canon_fetched_at: "2026-06-13T10:15:00Z".into(),
pending_matches: vec![],
accepted_matches: vec![],
rejected_matches: vec![sample_rejected()],
};
assert!(entry.is_rejected("some_unrelated_entry", "blake3:c2f7a912"));
assert!(!entry.is_rejected("other_id", "blake3:c2f7a912"));
assert!(!entry.is_rejected("some_unrelated_entry", "blake3:DIFFERENT"));
}
}