1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
use std::collections::{BTreeMap, BTreeSet};
use std::path::{Path, PathBuf};
use super::io::{read_codewiki_meta, safe_doc_path};
use super::{BuiltDoc, CODEWIKI_RENDER_VERSION, CodewikiDocMeta, SourceSpan};
use crate::index::hasher;
/// Decides whether a doc's previous content can be reused without any LLM
/// call: same AI mode, same render version, healthy, sources unchanged, and
/// still on disk (#681). Degraded docs are never reusable (#687).
///
/// Built once per run from the previous run's metadata; current file hashes
/// are computed lazily and memoized as docs are queried.
pub(crate) struct ReusePlan {
/// Root that recorded source-file paths are joined onto when hashing the
/// current content of a file.
project_root: PathBuf,
/// Directory the previous metadata is loaded from and reusable pages are
/// read back from.
out_dir: PathBuf,
/// AI mode of the current run; a doc recorded under a different mode is
/// never reused.
ai_mode: String,
/// Previous run's per-doc metadata, keyed by doc path.
docs: BTreeMap<String, CodewikiDocMeta>,
/// Lazy current-content hashes; `None` records an unhashable file so a
/// missing source is probed once and never reused.
current_hashes: BTreeMap<String, Option<String>>,
/// Files git reported as possibly-changed since the `--since` ref (Leaf H,
/// #893). When `Some`, a file outside this set keeps its recorded hash
/// without a disk read, so an unchanged page reuses without a re-hash;
/// dependents fall out naturally because a page is reused only when every
/// one of its own sources and neighbors hashes as recorded. `None` runs a
/// full content-hash scan (the from-scratch default).
since: Option<BTreeSet<String>>,
/// File → recorded content hash from the previous run's metadata, used to
/// answer `current_hash` for `--since`-unchanged files without disk I/O.
recorded_hashes: BTreeMap<String, String>,
}
impl ReusePlan {
/// Test-only constructor: loads the previous metadata with no `--since`
/// scoping, so every queried file is content-hashed from disk.
#[cfg(test)]
pub(crate) fn load(project_root: &Path, out_dir: &Path, ai_mode: &str) -> anyhow::Result<Self> {
    // `None` selects the full content-hash scan.
    let full_scan: Option<BTreeSet<String>> = None;
    Self::load_with_since(project_root, out_dir, ai_mode, full_scan)
}
/// Like [`ReusePlan::load`] but scoping the change set to the files git
/// reports changed since a ref. `None` is the full-scan default (Leaf H).
///
/// # Errors
///
/// Propagates any failure from reading the previous metadata under
/// `out_dir`.
pub(crate) fn load_with_since(
    project_root: &Path,
    out_dir: &Path,
    ai_mode: &str,
    since: Option<BTreeSet<String>>,
) -> anyhow::Result<Self> {
    let previous = read_codewiki_meta(out_dir)?;

    // Flatten every doc's source and neighbor hashes into one file → hash
    // table. A file recorded by several docs keeps the entry visited last.
    let mut recorded_hashes: BTreeMap<String, String> = BTreeMap::new();
    for meta in previous.docs.values() {
        let pairs = meta.source_hashes.iter().chain(&meta.neighbor_hashes);
        recorded_hashes.extend(pairs.map(|(file, hash)| (file.clone(), hash.clone())));
    }

    let plan = Self {
        project_root: project_root.to_path_buf(),
        out_dir: out_dir.to_path_buf(),
        ai_mode: ai_mode.to_string(),
        docs: previous.docs,
        // Filled on demand as files are first queried.
        current_hashes: BTreeMap::new(),
        since,
        recorded_hashes,
    };
    Ok(plan)
}
/// Returns the page currently on disk for `doc_path` when the doc is
/// reusable against `sources` (with no cross-file neighbors), or `None`
/// when it must be regenerated or cannot be read. Handing back the disk
/// content verbatim keeps a forced rewrite lossless.
pub(crate) fn reusable_page(
    &mut self,
    doc_path: &str,
    sources: &BTreeSet<String>,
) -> Option<String> {
    let no_neighbors = BTreeSet::new();
    if self.reusable(doc_path, sources, &no_neighbors) {
        let target = safe_doc_path(&self.out_dir, doc_path).ok()?;
        std::fs::read_to_string(target).ok()
    } else {
        None
    }
}
/// The on-disk page of a derived aggregate page (architecture,
/// infrastructure, …) that is keyed on a page-type invalidation digest
/// instead of a source-file set (Leaf H, #893).
///
/// The page is reused only when the recorded entry is healthy, was produced
/// under the current AI mode and render version, and its recorded digest
/// equals `invalidation_key` — so a model-irrelevant edit (a function body)
/// keeps the page even though source files changed. Returns `None` when any
/// of those checks fails or the page is missing/unreadable on disk.
pub(crate) fn reusable_page_keyed(
    &mut self,
    doc_path: &str,
    invalidation_key: &str,
) -> Option<String> {
    let entry = self.docs.get(doc_path)?;
    let still_valid = !entry.degraded
        && entry.ai_mode == self.ai_mode
        && entry.render_version == CODEWIKI_RENDER_VERSION
        && entry.invalidation_key.as_deref() == Some(invalidation_key);
    if !still_valid {
        return None;
    }

    let target = safe_doc_path(&self.out_dir, doc_path).ok()?;
    if target.exists() {
        std::fs::read_to_string(target).ok()
    } else {
        None
    }
}
/// The on-disk page of a doc that must satisfy both gates: its recorded
/// invalidation digest equals `invalidation_key`, and it passes the regular
/// source-hash reuse check against `sources` (with no cross-file
/// neighbors). Returns `None` when the doc is unknown, either gate fails,
/// or the page cannot be read.
pub(crate) fn reusable_page_keyed_with_sources(
    &mut self,
    doc_path: &str,
    invalidation_key: &str,
    sources: &BTreeSet<String>,
) -> Option<String> {
    let recorded_key = self.docs.get(doc_path)?.invalidation_key.as_deref();
    let key_matches = recorded_key == Some(invalidation_key);

    // The digest is checked first so a stale key never triggers file hashing.
    let no_neighbors = BTreeSet::new();
    if !key_matches || !self.reusable(doc_path, sources, &no_neighbors) {
        return None;
    }

    let target = safe_doc_path(&self.out_dir, doc_path).ok()?;
    std::fs::read_to_string(target).ok()
}
/// The on-disk page together with the recorded summary of a reusable doc,
/// treating the doc as having no cross-file neighbors.
pub(crate) fn reusable_page_with_summary(
    &mut self,
    doc_path: &str,
    sources: &BTreeSet<String>,
) -> Option<(String, String)> {
    let no_neighbors = BTreeSet::new();
    self.reusable_page_with_summary_and_neighbors(doc_path, sources, &no_neighbors)
}
/// Like [`ReusePlan::reusable_page_with_summary`] but the page is also
/// invalidated when a cross-file neighbor's content changed while the
/// page's own sources did not (#885, Leaf H) — so a caller edit refreshes
/// the callee's relationship narrative.
///
/// Returns `(page, summary)`; `None` when the doc is unknown, has no
/// recorded summary, is not reusable, or its page cannot be read.
pub(crate) fn reusable_page_with_summary_and_neighbors(
    &mut self,
    doc_path: &str,
    sources: &BTreeSet<String>,
    neighbors: &BTreeSet<String>,
) -> Option<(String, String)> {
    // A doc without a recorded summary is rejected before any hashing.
    let summary = self
        .docs
        .get(doc_path)
        .and_then(|entry| entry.summary.clone())?;
    if !self.reusable(doc_path, sources, neighbors) {
        return None;
    }

    let target = safe_doc_path(&self.out_dir, doc_path).ok()?;
    std::fs::read_to_string(target)
        .ok()
        .map(|page| (page, summary))
}
/// Rebuilds, from disk, every recorded doc whose path starts with one of
/// `prefixes`, using each doc's own recorded source set as its sources.
///
/// All-or-nothing: returns `None` when no recorded doc matches a prefix or
/// when any matching doc is not reusable. The returned docs are ordered by
/// path, are never degraded, and carry no neighbors or invalidation key.
pub(crate) fn reusable_pages_with_prefixes(
    &mut self,
    prefixes: &[&str],
) -> Option<Vec<BuiltDoc>> {
    // Snapshot the matching paths so `self` is free to be borrowed mutably
    // while each page is checked.
    let matching: Vec<String> = self
        .docs
        .keys()
        .filter(|path| prefixes.iter().any(|prefix| path.starts_with(*prefix)))
        .cloned()
        .collect();
    if matching.is_empty() {
        return None;
    }

    // Collecting into `Option` stops at the first doc that cannot be reused.
    let mut reused = matching
        .into_iter()
        .map(|path| {
            let entry = self.docs.get(&path)?;
            let sources: BTreeSet<String> = entry.source_hashes.keys().cloned().collect();
            let summary = entry.summary.clone();
            let content = self.reusable_page(&path, &sources)?;
            Some(BuiltDoc {
                path,
                content,
                degraded: false,
                summary,
                neighbors: BTreeSet::new(),
                invalidation_key: None,
                invalidation_key_requires_sources: false,
            })
        })
        .collect::<Option<Vec<_>>>()?;
    reused.sort_by(|a, b| a.path.cmp(&b.path));
    Some(reused)
}
fn reusable(
&mut self,
doc_path: &str,
sources: &BTreeSet<String>,
neighbors: &BTreeSet<String>,
) -> bool {
let Some(entry) = self.docs.get(doc_path) else {
return false;
};
// A degraded doc is never "unchanged" — re-runs must repair it even
// when its sources match (#687). An empty hash set cannot prove the
// doc unchanged (#672), and a mode change invalidates content that
// hashes cannot see (#677).
if entry.degraded
|| entry.ai_mode != self.ai_mode
|| entry.render_version != CODEWIKI_RENDER_VERSION
|| entry.source_hashes.is_empty()
{
return false;
}
// The recorded source set and the recorded cross-file neighbor set must
// both still match exactly — a new or dropped source/neighbor (e.g. an
// added caller, #885) is itself a change even before any hash differs.
if !set_matches(&entry.source_hashes, sources)
|| !set_matches(&entry.neighbor_hashes, neighbors)
{
return false;
}
let expected = entry
.source_hashes
.clone()
.into_iter()
.chain(entry.neighbor_hashes.clone())
.collect::<BTreeMap<_, _>>();
for (file, expected_hash) in &expected {
if self.current_hash(file).as_deref() != Some(expected_hash.as_str()) {
return false;
}
}
// Meta alone is not proof the page exists: deleting a page from disk
// must force regeneration, which is also the supported manual way to
// invalidate a single doc (#681).
let Ok(target) = safe_doc_path(&self.out_dir, doc_path) else {
return false;
};
target.exists()
}
fn current_hash(&mut self, file: &str) -> Option<String> {
if let Some(hash) = self.current_hashes.get(file) {
return hash.clone();
}
// Under `--since`, a file git did not report changed keeps its recorded
// hash without a disk read, so an unchanged page reuses without a
// re-hash and the rewrite set stays scoped to the diff + dependents.
let hash = if self
.since
.as_ref()
.is_some_and(|since| !since.contains(file))
{
self.recorded_hashes.get(file).cloned()
} else {
hasher::file_content_hash(&self.project_root.join(file)).ok()
};
self.current_hashes.insert(file.to_string(), hash.clone());
hash
}
}
/// Collects the distinct source files that a doc's spans cite — the
/// provenance set whose hashes decide reuse. Duplicate files collapse to a
/// single entry.
pub(crate) fn span_files(spans: &[SourceSpan]) -> BTreeSet<String> {
    let mut files = BTreeSet::new();
    for span in spans {
        files.insert(span.file.clone());
    }
    files
}
/// True when the keys of `recorded` are exactly the files in `current`, so
/// an added or dropped file fails the match. Recorded hash values are not
/// inspected here.
fn set_matches(recorded: &BTreeMap<String, String>, current: &BTreeSet<String>) -> bool {
    // Both collections iterate in ascending key order without duplicates,
    // so equal sequences mean equal sets (and therefore equal sizes).
    recorded.keys().eq(current)
}