// spool/wiki_lint.rs

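//! Wiki lint — periodic consistency check for the knowledge base.
//!
//! This is spool's concrete take on the Lint stage of Karpathy's LLM Wiki: it
//! combines the existing `consolidation::detect_prune_candidates`
//! (staleness / expired / superseded) with the new orphan / broken cross-ref
//! detection into a single `LintReport` that the CLI, MCP and wakeup consume
//! in the same way.
//!
//! ## Design principles
//! - Read-only: never modifies the ledger and never writes to the vault. The
//!   consumer decides what to do with the findings.
//! - Reuses `consolidation::detect_prune_candidates` instead of building a
//!   second staleness implementation.
//! - Orphan and cross-ref detection are new in this module: orphan detection
//!   finds canonical notes in the vault that have no matching ledger record;
//!   broken cross-ref detection finds `related_records` / `supersedes` values
//!   that point at records which do not exist.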
14
15use crate::domain::MemoryLifecycleState;
16use crate::knowledge::cluster::{self as consolidation, PruneSuggestion};
17use crate::lifecycle_store::{LedgerEntry, LifecycleStore, latest_state_entries};
18use crate::vault_writer::{MEMORY_LEDGER_COMPILED_DIR, MEMORY_LEDGER_DIR};
19use anyhow::{Context, Result};
20use serde::Serialize;
21use std::collections::HashSet;
22use std::fs;
23use std::path::Path;
24use ts_rs::TS;
25
26#[derive(Debug, Clone, Serialize, TS)]
27#[ts(export, export_to = "../frontend/src/lib/types/generated/")]
28pub struct BrokenCrossRef {
29    pub record_id: String,
30    pub title: String,
31    /// 指向的目标 record_id (不存在于 ledger)
32    pub missing_target: String,
33    #[ts(type = "string")]
34    pub field: &'static str,
35}
36
37#[derive(Debug, Clone, Serialize, TS)]
38#[ts(export, export_to = "../frontend/src/lib/types/generated/")]
39pub struct OrphanNote {
40    /// 相对 vault_root 的路径 (如 "50-Memory-Ledger/Extracted/abc.md")
41    pub relative_path: String,
42    /// 从文件名反推的 record_id
43    pub record_id: String,
44}
45
46#[derive(Debug, Clone, Serialize, TS)]
47#[ts(export, export_to = "../frontend/src/lib/types/generated/")]
48pub struct LintReport {
49    pub total_active_records: usize,
50    pub prune_suggestions: Vec<PruneSuggestion>,
51    pub broken_cross_refs: Vec<BrokenCrossRef>,
52    pub orphan_notes: Vec<OrphanNote>,
53}
54
55impl LintReport {
56    pub fn is_clean(&self) -> bool {
57        self.prune_suggestions.is_empty()
58            && self.broken_cross_refs.is_empty()
59            && self.orphan_notes.is_empty()
60    }
61
62    pub fn summary_line(&self) -> String {
63        format!(
64            "lint: {} active records, {} prune, {} broken cross-refs, {} orphan notes",
65            self.total_active_records,
66            self.prune_suggestions.len(),
67            self.broken_cross_refs.len(),
68            self.orphan_notes.len(),
69        )
70    }
71}
72
73/// 从 config_path 加载,跑完整 lint pass。失败降级为 anyhow::Err 供调用方自行处理。
74pub fn run_lint_from_config(config_path: &Path) -> Result<LintReport> {
75    let config = crate::app::load(config_path)
76        .with_context(|| format!("failed to load config {}", config_path.display()))?;
77    let vault_root = crate::app::resolve_override_path(&config.vault.root, config_path)
78        .context("failed to resolve vault root")?;
79    let config_dir = config_path.parent().unwrap_or_else(|| Path::new("."));
80    let lifecycle_root = crate::lifecycle_store::lifecycle_root_from_config(config_dir);
81    let store = LifecycleStore::new(&lifecycle_root);
82    let entries = latest_state_entries(&store).context("failed to read ledger entries")?;
83    Ok(run_lint(&entries, &lifecycle_root, &vault_root))
84}
85
86/// 核心 lint 逻辑 — 纯函数,方便测试。
87pub fn run_lint(entries: &[LedgerEntry], lifecycle_root: &Path, vault_root: &Path) -> LintReport {
88    let total_active_records = entries
89        .iter()
90        .filter(|e| {
91            matches!(
92                e.record.state,
93                MemoryLifecycleState::Accepted | MemoryLifecycleState::Canonical
94            )
95        })
96        .count();
97
98    let prune_suggestions = consolidation::detect_prune_candidates(entries, lifecycle_root);
99    let broken_cross_refs = detect_broken_cross_refs(entries);
100    let orphan_notes = detect_orphan_notes(entries, vault_root).unwrap_or_default();
101
102    LintReport {
103        total_active_records,
104        prune_suggestions,
105        broken_cross_refs,
106        orphan_notes,
107    }
108}
109
110/// 扫 accepted / canonical 记录的 `related_records` / `supersedes` 字段,
111/// 找出指向不存在 record_id 的引用。
112fn detect_broken_cross_refs(entries: &[LedgerEntry]) -> Vec<BrokenCrossRef> {
113    let known_ids: HashSet<&str> = entries.iter().map(|e| e.record_id.as_str()).collect();
114    let mut broken = Vec::new();
115
116    for entry in entries {
117        if !matches!(
118            entry.record.state,
119            MemoryLifecycleState::Accepted | MemoryLifecycleState::Canonical
120        ) {
121            continue;
122        }
123        for target in &entry.record.related_records {
124            if !known_ids.contains(target.as_str()) {
125                broken.push(BrokenCrossRef {
126                    record_id: entry.record_id.clone(),
127                    title: entry.record.title.clone(),
128                    missing_target: target.clone(),
129                    field: "related_records",
130                });
131            }
132        }
133        if let Some(ref target) = entry.record.supersedes
134            && !known_ids.contains(target.as_str())
135        {
136            broken.push(BrokenCrossRef {
137                record_id: entry.record_id.clone(),
138                title: entry.record.title.clone(),
139                missing_target: target.clone(),
140                field: "supersedes",
141            });
142        }
143    }
144
145    broken
146}
147
148/// 扫 vault `50-Memory-Ledger/{Extracted,Compiled}/` 目录,找出没有对应
149/// ledger record 的 `.md` 文件 (文件名即 record_id)。
150fn detect_orphan_notes(entries: &[LedgerEntry], vault_root: &Path) -> Result<Vec<OrphanNote>> {
151    let known_ids: HashSet<&str> = entries.iter().map(|e| e.record_id.as_str()).collect();
152    let mut orphans = Vec::new();
153
154    for rel_dir in [MEMORY_LEDGER_DIR, MEMORY_LEDGER_COMPILED_DIR] {
155        let dir = vault_root.join(rel_dir);
156        if !dir.is_dir() {
157            continue;
158        }
159        let reader = match fs::read_dir(&dir) {
160            Ok(r) => r,
161            Err(_) => continue,
162        };
163        for entry in reader.flatten() {
164            let path = entry.path();
165            if path.extension().and_then(|s| s.to_str()) != Some("md") {
166                continue;
167            }
168            let record_id = match path.file_stem().and_then(|s| s.to_str()) {
169                Some(s) => s.to_string(),
170                None => continue,
171            };
172            if known_ids.contains(record_id.as_str()) {
173                continue;
174            }
175            let rel_path = relative_path(vault_root, &path);
176            orphans.push(OrphanNote {
177                relative_path: rel_path,
178                record_id,
179            });
180        }
181    }
182
183    Ok(orphans)
184}
185
/// Renders `absolute` relative to `base` as a `/`-separated string.
///
/// When `absolute` does not start with `base`, the whole of `absolute` is
/// rendered instead. The platform's main path separator is replaced by `/`.
fn relative_path(base: &Path, absolute: &Path) -> String {
    let shown = absolute.strip_prefix(base).unwrap_or(absolute);
    let rendered = shown.display().to_string();
    rendered.replace(std::path::MAIN_SEPARATOR, "/")
}
193
194/// Render as markdown summary for wakeup / CLI display.
195pub fn render_lint_markdown(report: &LintReport) -> String {
196    let mut out = String::new();
197    out.push_str("# Wiki Lint Report\n\n");
198    out.push_str(&format!("{}\n\n", report.summary_line()));
199
200    if report.is_clean() {
201        out.push_str("✓ 知识库干净,无需清理。\n");
202        return out;
203    }
204
205    if !report.prune_suggestions.is_empty() {
206        out.push_str("## 可归档\n\n");
207        for s in &report.prune_suggestions {
208            out.push_str(&format!(
209                "- `{}` {} — {:?}\n",
210                s.record_id, s.title, s.reason
211            ));
212        }
213        out.push('\n');
214    }
215
216    if !report.broken_cross_refs.is_empty() {
217        out.push_str("## 断链\n\n");
218        for b in &report.broken_cross_refs {
219            out.push_str(&format!(
220                "- `{}` {} → `{}` 缺失 (字段 {})\n",
221                b.record_id, b.title, b.missing_target, b.field
222            ));
223        }
224        out.push('\n');
225    }
226
227    if !report.orphan_notes.is_empty() {
228        out.push_str("## 孤儿 note\n\n");
229        for o in &report.orphan_notes {
230            out.push_str(&format!(
231                "- `{}` (record_id `{}`)\n",
232                o.relative_path, o.record_id
233            ));
234        }
235        out.push('\n');
236    }
237
238    out
239}
240
#[cfg(test)]
mod tests {
    use super::*;
    use crate::domain::{
        MemoryLedgerAction, MemoryLifecycleState, MemoryOrigin, MemoryRecord, MemoryScope,
        MemorySourceKind,
    };
    use crate::lifecycle_store::TransitionMetadata;
    use std::fs;
    use tempfile::tempdir;

    /// Builds a minimal manually-recorded ledger entry with the given id and
    /// lifecycle state; every optional or list field starts out empty.
    fn entry_with(record_id: &str, state: MemoryLifecycleState) -> LedgerEntry {
        let record = MemoryRecord {
            title: format!("title-{record_id}"),
            summary: "s".to_string(),
            memory_type: "preference".to_string(),
            scope: MemoryScope::User,
            state,
            origin: MemoryOrigin {
                source_kind: MemorySourceKind::Manual,
                source_ref: "m".to_string(),
            },
            project_id: None,
            user_id: None,
            sensitivity: None,
            entities: Vec::new(),
            tags: Vec::new(),
            triggers: Vec::new(),
            related_files: Vec::new(),
            related_records: Vec::new(),
            supersedes: None,
            applies_to: Vec::new(),
            valid_until: None,
        };

        LedgerEntry {
            schema_version: "memory-ledger.v1".to_string(),
            recorded_at: "unix:1".to_string(),
            record_id: record_id.to_string(),
            scope_key: "user:long".to_string(),
            action: MemoryLedgerAction::RecordManual,
            source_kind: MemorySourceKind::Manual,
            metadata: TransitionMetadata::default(),
            record,
        }
    }

    #[test]
    fn detect_broken_cross_refs_should_flag_missing_related_records_and_supersedes() {
        let referrer = {
            let mut entry = entry_with("rec-a", MemoryLifecycleState::Accepted);
            entry.record.related_records = vec!["rec-b".to_string(), "rec-missing".to_string()];
            entry.record.supersedes = Some("rec-also-missing".to_string());
            entry
        };
        let existing = entry_with("rec-b", MemoryLifecycleState::Accepted);
        // Candidate entries are not scanned, so their references are never reported.
        let candidate = {
            let mut entry = entry_with("rec-c", MemoryLifecycleState::Candidate);
            entry.record.related_records = vec!["rec-never".to_string()];
            entry
        };

        let broken = detect_broken_cross_refs(&[referrer, existing, candidate]);
        let missing: HashSet<&str> = broken.iter().map(|b| b.missing_target.as_str()).collect();

        for expected in ["rec-missing", "rec-also-missing"] {
            assert!(missing.contains(expected));
        }
        for unexpected in ["rec-never", "rec-b"] {
            assert!(!missing.contains(unexpected));
        }
    }

    #[test]
    fn detect_orphan_notes_should_find_md_files_without_matching_record() {
        let temp = tempdir().unwrap();
        let extracted = temp.path().join(MEMORY_LEDGER_DIR);
        let compiled = temp.path().join(MEMORY_LEDGER_COMPILED_DIR);
        for dir in [&extracted, &compiled] {
            fs::create_dir_all(dir).unwrap();
        }

        let fixtures = [
            (&extracted, "rec-known.md", "# known"),
            (&extracted, "rec-orphan-a.md", "# orphan"),
            (&compiled, "wiki-orphan.md", "# compiled orphan"),
            (&extracted, "not-markdown.txt", "skip"),
        ];
        for (dir, file_name, body) in fixtures {
            fs::write(dir.join(file_name), body).unwrap();
        }

        let entries = [entry_with("rec-known", MemoryLifecycleState::Accepted)];
        let orphans = detect_orphan_notes(&entries, temp.path()).unwrap();
        let ids: HashSet<&str> = orphans.iter().map(|o| o.record_id.as_str()).collect();

        for expected in ["rec-orphan-a", "wiki-orphan"] {
            assert!(ids.contains(expected));
        }
        for unexpected in ["rec-known", "not-markdown"] {
            assert!(!ids.contains(unexpected));
        }
    }

    #[test]
    fn run_lint_should_count_active_and_compose_sub_reports() {
        let temp = tempdir().unwrap();
        let lifecycle_root = temp.path().join(".spool");
        fs::create_dir_all(&lifecycle_root).unwrap();

        let entries = [
            entry_with("rec-a", MemoryLifecycleState::Accepted),
            entry_with("rec-b", MemoryLifecycleState::Canonical),
            entry_with("rec-c", MemoryLifecycleState::Candidate),
        ];

        let report = run_lint(&entries, &lifecycle_root, temp.path());
        assert_eq!(report.total_active_records, 2);
        assert!(report.prune_suggestions.is_empty());
        assert!(report.broken_cross_refs.is_empty());
        assert!(report.orphan_notes.is_empty());
        assert!(report.is_clean());
    }

    #[test]
    fn render_lint_markdown_should_highlight_issues() {
        let temp = tempdir().unwrap();
        let mut referrer = entry_with("rec-a", MemoryLifecycleState::Accepted);
        referrer.record.related_records = vec!["rec-gone".to_string()];

        let report = run_lint(&[referrer], temp.path(), temp.path());
        let rendered = render_lint_markdown(&report);
        for needle in ["断链", "rec-gone"] {
            assert!(rendered.contains(needle));
        }
    }
}
353        let report = run_lint(&[a], temp.path(), temp.path());
354        let md = render_lint_markdown(&report);
355        assert!(md.contains("断链"));
356        assert!(md.contains("rec-gone"));
357    }
358}