Skip to main content

normalize_native_rules/
stale_doc.rs

//! `stale-doc` native rule — detects documentation files that are likely stale because
//! strongly co-changed code files have been updated more recently.
//!
//! Uses the `co_change_edges` table in the normalize index to find which code files
//! historically change together with each doc file, then compares last-commit timestamps.
6
7use normalize_output::diagnostics::{DiagnosticsReport, Issue, RelatedLocation, Severity};
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10
11/// Configurable options for the `stale-doc` rule.
12/// Deserialized from `extra` fields on the `RuleOverride` via `rule_config()`.
13#[derive(serde::Deserialize, Default)]
14pub struct StaleDocConfig {
15    /// Minimum co-change count to consider a pair coupled (default: 3).
16    #[serde(default)]
17    pub min_co_changes: Option<u64>,
18    /// Only flag if the code file was committed more recently by at least N days (default: 0).
19    #[serde(default)]
20    pub min_lag_days: Option<u64>,
21    /// Glob patterns for doc files to check (default: built-in list).
22    #[serde(default)]
23    pub doc_patterns: Vec<String>,
24}
25
/// Glob patterns that identify documentation files when the rule configuration
/// supplies no `doc_patterns` of its own.
const DEFAULT_DOC_PATTERNS: &[&str] = &[
    "**/*.md",
    "**/*.rst",
    "docs/**/*",
];

/// File names that are never reported by this rule, even when they match a doc
/// pattern (they are handled by other rules).
const EXCLUDED_FILENAMES: &[&str] = &[
    "SUMMARY.md",
];
31
32/// Returns true if the given relative path matches the doc patterns and is not excluded.
33fn is_doc_file(rel_path: &str, patterns: &[glob::Pattern]) -> bool {
34    let file_name = std::path::Path::new(rel_path)
35        .file_name()
36        .map(|n| n.to_string_lossy().into_owned())
37        .unwrap_or_default();
38    if EXCLUDED_FILENAMES.contains(&file_name.as_str()) {
39        return false;
40    }
41    patterns.iter().any(|p| p.matches(rel_path))
42}
43
44/// Open the gix repository at or containing `root`.
45fn gix_open(root: &Path) -> Option<gix::Repository> {
46    gix::discover(root).ok()
47}
48
49/// Returns the Unix timestamp (seconds) of the most recent commit that touches `rel_path`.
50///
51/// Walks commits newest-first, diffs each against its parent, and returns the committer
52/// timestamp of the first commit that includes `rel_path` in the changeset.
53pub fn git_last_commit_time(root: &Path, rel_path: &str) -> Option<i64> {
54    let repo = gix_open(root)?;
55    let head_id = repo.head_id().ok()?;
56    let walk = head_id
57        .ancestors()
58        .sorting(gix::revision::walk::Sorting::ByCommitTime(
59            gix::traverse::commit::simple::CommitTimeOrder::NewestFirst,
60        ))
61        .all()
62        .ok()?;
63
64    for info in walk {
65        let Ok(info) = info else { continue };
66        let Ok(commit) = info.object() else { continue };
67        let Ok(tree) = commit.tree() else { continue };
68        let parent_tree = info
69            .parent_ids()
70            .next()
71            .and_then(|pid| pid.object().ok())
72            .and_then(|obj| obj.into_commit().tree().ok());
73        let changes = match repo.diff_tree_to_tree(parent_tree.as_ref(), Some(&tree), None) {
74            Ok(c) => c,
75            Err(_) => continue,
76        };
77        let touches = changes.iter().any(|change| {
78            use gix::object::tree::diff::ChangeDetached;
79            let loc = match change {
80                ChangeDetached::Addition { location, .. }
81                | ChangeDetached::Deletion { location, .. }
82                | ChangeDetached::Modification { location, .. } => location.as_slice(),
83                ChangeDetached::Rewrite {
84                    source_location, ..
85                } => source_location.as_slice(),
86            };
87            loc == rel_path.as_bytes()
88        });
89        if touches {
90            return info.commit_time;
91        }
92    }
93    None
94}
95
96/// Build a `DiagnosticsReport` for the `stale-doc` rule.
97///
98/// For each documentation file matching the configured patterns, finds co-change partners
99/// (code files it historically changes with) and flags the doc file if any partner was
100/// committed more recently by at least `min_lag_days`.
101///
102/// Gracefully degrades if:
103/// - The index is not built → returns empty report with a tool error note.
104/// - The `co_change_edges` table is empty → returns empty report with a tool error note.
105/// - A file's last commit time cannot be determined → skips the comparison.
106pub fn build_stale_doc_report(
107    root: &Path,
108    config: StaleDocConfig,
109    files: Option<&[PathBuf]>,
110) -> DiagnosticsReport {
111    let min_co_changes = config.min_co_changes.unwrap_or(3) as usize;
112    let min_lag_secs = config.min_lag_days.unwrap_or(0) * 86400;
113
114    // Build glob patterns for doc file detection.
115    let raw_patterns: Vec<&str> = if config.doc_patterns.is_empty() {
116        DEFAULT_DOC_PATTERNS.to_vec()
117    } else {
118        config.doc_patterns.iter().map(String::as_str).collect()
119    };
120    let patterns: Vec<glob::Pattern> = raw_patterns
121        .iter()
122        .filter_map(|p| glob::Pattern::new(p).ok())
123        .collect();
124
125    // Open the index to get co-change edges.
126    let index_path = root.join(".normalize").join("index.sqlite");
127    if !index_path.exists() {
128        return DiagnosticsReport {
129            issues: vec![],
130            files_checked: 0,
131            sources_run: vec!["stale-doc".into()],
132            tool_errors: vec![normalize_output::diagnostics::ToolFailure {
133                tool: "stale-doc".into(),
134                message: "index not built — run `normalize structure rebuild` to enable stale-doc"
135                    .into(),
136            }],
137            daemon_cached: false,
138        };
139    }
140
141    // Query co-change edges synchronously by blocking on the async API.
142    let edges_result = {
143        let db_path = index_path.clone();
144        let root_path = root.to_path_buf();
145        std::thread::Builder::new()
146            .stack_size(8 * 1024 * 1024)
147            .spawn(move || {
148                let rt = tokio::runtime::Builder::new_current_thread()
149                    .enable_all()
150                    .build()
151                    .ok()?;
152                rt.block_on(async {
153                    let index = normalize_facts::FileIndex::open(&db_path, &root_path)
154                        .await
155                        .ok()?;
156                    index.query_co_change_edges(min_co_changes).await.ok()?
157                })
158            })
159            .ok()
160            .and_then(|h| h.join().ok())
161            .flatten()
162    };
163
164    let Some(edges) = edges_result else {
165        return DiagnosticsReport {
166            issues: vec![],
167            files_checked: 0,
168            sources_run: vec!["stale-doc".into()],
169            tool_errors: vec![normalize_output::diagnostics::ToolFailure {
170                tool: "stale-doc".into(),
171                message: "co_change_edges table is empty or index could not be read — run `normalize structure rebuild`".into(),
172            }],
173            daemon_cached: false,
174        };
175    };
176
177    if edges.is_empty() {
178        return DiagnosticsReport {
179            issues: vec![],
180            files_checked: 0,
181            sources_run: vec!["stale-doc".into()],
182            tool_errors: vec![normalize_output::diagnostics::ToolFailure {
183                tool: "stale-doc".into(),
184                message: "co_change_edges table is empty — run `normalize structure rebuild` to populate it".into(),
185            }],
186            daemon_cached: false,
187        };
188    }
189
190    // Build a map: doc_file -> list of (code_file, co_change_count)
191    // Each edge (file_a, file_b, count) can relate a doc file on either side.
192    let mut doc_to_partners: HashMap<String, Vec<(String, usize)>> = HashMap::new();
193    for (file_a, file_b, count) in &edges {
194        let a_is_doc = is_doc_file(file_a, &patterns);
195        let b_is_doc = is_doc_file(file_b, &patterns);
196        // Only record pairs where exactly one file is a doc (doc ↔ code coupling).
197        // Doc-to-doc coupling is not a signal for staleness.
198        if a_is_doc && !b_is_doc {
199            doc_to_partners
200                .entry(file_a.clone())
201                .or_default()
202                .push((file_b.clone(), *count));
203        } else if b_is_doc && !a_is_doc {
204            doc_to_partners
205                .entry(file_b.clone())
206                .or_default()
207                .push((file_a.clone(), *count));
208        }
209    }
210
211    if doc_to_partners.is_empty() {
212        return DiagnosticsReport {
213            issues: vec![],
214            files_checked: 0,
215            sources_run: vec!["stale-doc".into()],
216            tool_errors: vec![],
217            daemon_cached: false,
218        };
219    }
220
221    // Filter to requested files when --files was provided.
222    let doc_files: Vec<String> = if let Some(explicit_files) = files {
223        let explicit_rel: Vec<String> = explicit_files
224            .iter()
225            .filter_map(|p| {
226                p.strip_prefix(root)
227                    .ok()
228                    .map(|r| r.to_string_lossy().into_owned())
229            })
230            .collect();
231        doc_to_partners
232            .keys()
233            .filter(|k| explicit_rel.contains(k))
234            .cloned()
235            .collect()
236    } else {
237        doc_to_partners.keys().cloned().collect()
238    };
239
240    let files_checked = doc_files.len();
241
242    // Cache commit times to avoid redundant git walks.
243    let mut commit_time_cache: HashMap<String, Option<i64>> = HashMap::new();
244
245    let mut issues = Vec::new();
246
247    for doc_path in &doc_files {
248        // Skip doc files that don't exist on disk (deleted, renamed).
249        if !root.join(doc_path).exists() {
250            continue;
251        }
252
253        let doc_time = *commit_time_cache
254            .entry(doc_path.clone())
255            .or_insert_with(|| git_last_commit_time(root, doc_path));
256
257        let Some(doc_ts) = doc_time else {
258            // Can't determine doc's last commit time — skip.
259            continue;
260        };
261
262        let partners = &doc_to_partners[doc_path];
263
264        // Find the partner most recently committed after the doc.
265        let mut worst_partner: Option<(&str, usize, i64)> = None; // (file, count, ts)
266
267        for (partner_path, co_count) in partners {
268            // Skip partners that don't exist on disk.
269            if !root.join(partner_path).exists() {
270                continue;
271            }
272
273            let partner_time = *commit_time_cache
274                .entry(partner_path.clone())
275                .or_insert_with(|| git_last_commit_time(root, partner_path));
276
277            let Some(partner_ts) = partner_time else {
278                continue;
279            };
280
281            if partner_ts <= doc_ts {
282                continue;
283            }
284
285            let lag = (partner_ts - doc_ts) as u64;
286            if lag < min_lag_secs {
287                continue;
288            }
289
290            // Pick the partner with the largest lag (most behind).
291            let is_worse = worst_partner
292                .map(|(_, _, worst_ts)| partner_ts > worst_ts)
293                .unwrap_or(true);
294            if is_worse {
295                worst_partner = Some((partner_path.as_str(), *co_count, partner_ts));
296            }
297        }
298
299        if let Some((partner_path, co_count, partner_ts)) = worst_partner {
300            let lag_days = ((partner_ts - doc_ts) as u64) / 86400;
301            issues.push(Issue {
302                file: doc_path.clone(),
303                line: None,
304                column: None,
305                end_line: None,
306                end_column: None,
307                rule_id: "stale-doc".into(),
308                message: format!(
309                    "possibly stale — {partner_path} was updated {lag_days} day{} more recently (last co-changed {co_count} times)",
310                    if lag_days == 1 { "" } else { "s" }
311                ),
312                severity: Severity::Warning,
313                source: "stale-doc".into(),
314                related: vec![RelatedLocation {
315                    file: partner_path.to_string(),
316                    line: None,
317                    message: Some(format!("co-changed {co_count} times, updated {lag_days} day{} more recently than doc", if lag_days == 1 { "" } else { "s" })),
318                }],
319                suggestion: Some(format!(
320                    "review {doc_path} to ensure it reflects recent changes in {partner_path}"
321                )),
322            });
323        }
324    }
325
326    // Sort by file path for deterministic output.
327    issues.sort_by(|a, b| a.file.cmp(&b.file));
328
329    DiagnosticsReport {
330        issues,
331        files_checked,
332        sources_run: vec!["stale-doc".into()],
333        tool_errors: vec![],
334        daemon_cached: false,
335    }
336}