use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::rc::Rc;
use serde::{Deserialize, Serialize};
use tree_sitter::TreeCursor;
use crate::core::config::Config;
use crate::core::doc_pairs::DocPair;
use crate::core::finding::{Finding, IntoFindings, Location};
use crate::core::severity::Severity;
use crate::feature::{decorate, Family, Feature, FeatureKind, FeatureMeta, HotspotIndex};
use crate::observer::code::complexity::parse;
use crate::observer::shared::lang::Language;
#[derive(Debug, Clone, Default)]
pub struct DocDriftObserver {
pub enabled: bool,
pub pairs: Vec<DocPair>,
}
impl DocDriftObserver {
#[must_use]
pub fn from_config_and_pairs(cfg: &Config, pairs: Vec<DocPair>) -> Self {
Self {
enabled: cfg.features.docs.enabled,
pairs,
}
}
#[must_use]
pub fn scan(&self, root: &Path) -> DocDriftReport {
let mut report = DocDriftReport::default();
if !self.enabled || self.pairs.is_empty() {
return report;
}
let mut src_cache: HashMap<PathBuf, Rc<HashSet<String>>> = HashMap::new();
for pair in &self.pairs {
let doc_path = root.join(&pair.doc);
let Ok(doc_text) = std::fs::read_to_string(&doc_path) else {
continue;
};
let mentions = extract_inline_identifiers(&doc_text);
if mentions.is_empty() {
continue;
}
let mut combined: HashSet<String> = HashSet::new();
for src in &pair.srcs {
let key = PathBuf::from(src);
let set = src_cache
.entry(key.clone())
.or_insert_with(|| Rc::new(parse_src_identifiers(&root.join(src))));
combined.extend(set.iter().cloned());
}
for mention in mentions {
if mention_resolves(&combined, &mention.text) {
continue;
}
report.entries.push(DocDriftEntry {
doc_path: PathBuf::from(&pair.doc),
src_paths: pair.srcs.iter().map(PathBuf::from).collect(),
identifier: mention.text,
doc_line: mention.line,
});
}
}
report.totals = DocDriftTotals {
dangling_identifiers: report.entries.len(),
};
report.entries.sort_by(|a, b| {
a.doc_path
.cmp(&b.doc_path)
.then_with(|| a.doc_line.cmp(&b.doc_line))
.then_with(|| a.identifier.cmp(&b.identifier))
});
report
}
}
/// Returns the identifier-shaped leaf tokens found in the source file at
/// `src_path`.
///
/// The result is empty when the language cannot be determined from the path,
/// when the file cannot be read, or when parsing fails.
fn parse_src_identifiers(src_path: &Path) -> HashSet<String> {
    // Resolve the language first so files of unknown languages are never read.
    let maybe_parsed = Language::from_path(src_path).and_then(|lang| {
        let src_text = std::fs::read_to_string(src_path).ok()?;
        parse(src_text, lang).ok()
    });
    let Some(parsed) = maybe_parsed else {
        return HashSet::new();
    };

    let mut identifiers = HashSet::new();
    collect_identifier_tokens(&parsed.tree, parsed.source.as_bytes(), &mut identifiers);
    identifiers
}
/// Result of a doc-drift scan: the dangling mentions plus summary counts.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct DocDriftReport {
    /// One entry per unresolved inline mention.
    pub entries: Vec<DocDriftEntry>,
    /// Aggregate counts over `entries`.
    pub totals: DocDriftTotals,
}

impl DocDriftReport {
    /// Returns clones of at most the first `n` entries, in stored order.
    ///
    /// No ranking is applied here: the result simply follows the order in
    /// which `entries` is held. Only the returned entries are cloned, so the
    /// cost is proportional to `min(n, entries.len())` rather than to the
    /// whole report.
    #[must_use]
    pub fn worst_n(&self, n: usize) -> Vec<DocDriftEntry> {
        self.entries.iter().take(n).cloned().collect()
    }
}
/// A single inline-code mention in a document that did not resolve against
/// the identifiers of the document's paired source files.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct DocDriftEntry {
/// Path of the document containing the mention, as given by the pair.
pub doc_path: PathBuf,
/// Paths of every source file paired with that document.
pub src_paths: Vec<PathBuf>,
/// The text of the inline code span that failed to resolve.
pub identifier: String,
/// Line of the mention as reported by the prose-line iterator
/// (NOTE(review): presumably 1-based; confirm against `iter_prose_lines`).
pub doc_line: u32,
}
/// Summary counts for a doc-drift report.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
pub struct DocDriftTotals {
/// Number of unresolved mentions; the scan sets this to the entry count.
pub dangling_identifiers: usize,
}
/// An identifier-like inline code span extracted from a line of prose.
#[derive(Debug, Clone)]
struct InlineMention {
/// Text found between the pair of single backticks.
text: String,
/// Line number passed along from the prose-line iterator.
line: u32,
}
/// Collects every identifier-like inline code span from the prose lines of
/// `text`, in the order the lines are yielded by `iter_prose_lines`.
fn extract_inline_identifiers(text: &str) -> Vec<InlineMention> {
    let mut mentions = Vec::new();
    crate::observer::docs::markdown::iter_prose_lines(text)
        .into_iter()
        .for_each(|(line_no, line)| scan_line_for_inline(line, line_no, &mut mentions));
    mentions
}
/// Appends to `out` every single-backtick span on `line` that passes both
/// `is_identifier_shape` and `looks_like_definition_mention`.
///
/// Spans opened with two backticks are skipped up to the next pair of
/// backticks. An opener of either kind without a matching closer ends the
/// scan of the line.
fn scan_line_for_inline(line: &str, line_no: u32, out: &mut Vec<InlineMention>) {
    const TICK: char = '`';

    // `cursor` always sits at 0 or just past an ASCII backtick, so every
    // slice below starts on a character boundary.
    let mut cursor = 0;
    while let Some(offset) = line[cursor..].find(TICK) {
        let after_opener = cursor + offset + 1;

        if line[after_opener..].starts_with(TICK) {
            // Double-backtick span: jump past the closing pair, if any.
            let body = after_opener + 1;
            let Some(close) = line[body..].find("``") else {
                return;
            };
            cursor = body + close + 2;
            continue;
        }

        let Some(span_len) = line[after_opener..].find(TICK) else {
            return;
        };
        let span_end = after_opener + span_len;
        let span = &line[after_opener..span_end];
        if is_identifier_shape(span) && looks_like_definition_mention(span) {
            out.push(InlineMention {
                text: span.to_owned(),
                line: line_no,
            });
        }
        cursor = span_end + 1;
    }
}
/// Heuristic filter deciding whether an inline span plausibly names a
/// definition rather than a flag, file name, module path, config key, or
/// parameterized type.
///
/// Returns `false` when the span:
/// - contains `-`, `<`, or `>`, or ends with `:`;
/// - ends in a dot followed by 2 to 5 ASCII lowercase letters;
/// - contains a dot, has no `::`, and has no ASCII uppercase letter;
/// - contains `::` and every `::`-separated segment starts with an ASCII
///   lowercase letter.
///
/// Everything else is accepted.
fn looks_like_definition_mention(span: &str) -> bool {
    // A hyphen anywhere also covers spans that start with one.
    if span.contains(['-', '<', '>']) || span.ends_with(':') {
        return false;
    }

    let has_path_separator = span.contains("::");

    if let Some((_, suffix)) = span.rsplit_once('.') {
        let extension_like =
            (2..=5).contains(&suffix.len()) && suffix.chars().all(|c| c.is_ascii_lowercase());
        if extension_like {
            return false;
        }
        let has_uppercase = span.chars().any(|c| c.is_ascii_uppercase());
        if !has_path_separator && !has_uppercase {
            return false;
        }
    }

    let all_segments_lowercase_led = has_path_separator
        && span
            .split("::")
            .all(|segment| segment.chars().next().is_some_and(|c| c.is_ascii_lowercase()));

    !all_segments_lowercase_led
}
/// Reports whether `span` is shaped like a (possibly qualified) identifier.
///
/// A span qualifies when it has at least two characters, consists only of
/// ASCII alphanumerics and the punctuation `_ : . < > -`, and contains at
/// least one ASCII letter. Spans of four or more characters that are made up
/// entirely of hex digits and include a decimal digit are rejected.
fn is_identifier_shape(span: &str) -> bool {
    const PUNCTUATION: [char; 6] = ['_', ':', '.', '<', '>', '-'];

    // Fewer than two characters (including the empty string) never qualifies.
    if span.chars().nth(1).is_none() {
        return false;
    }

    let only_permitted = span
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || PUNCTUATION.contains(&c));
    let has_letter = span.chars().any(|c| c.is_ascii_alphabetic());
    if !only_permitted || !has_letter {
        return false;
    }

    // Hex-only text counts as a hash fragment only if it contains a digit,
    // so plain words made of the letters a-f are still accepted.
    let hex_fragment = span.len() >= 4
        && span.chars().any(|c| c.is_ascii_digit())
        && span.chars().all(|c| c.is_ascii_hexdigit());

    !hex_fragment
}
/// Decides whether `mention` is backed by the identifier set `combined`.
///
/// A mention resolves when it is present verbatim, or when splitting it on
/// `:` and `.` yields at least two non-empty segments that are all present.
fn mention_resolves(combined: &HashSet<String>, mention: &str) -> bool {
    if combined.contains(mention) {
        return true;
    }

    let mut segment_count = 0_usize;
    for segment in mention.split([':', '.']).filter(|part| !part.is_empty()) {
        if !combined.contains(segment) {
            return false;
        }
        segment_count += 1;
    }

    // A lone segment was already covered by the verbatim check above.
    segment_count > 1
}
/// Walks `tree` in pre-order and inserts into `out` the trimmed text of every
/// childless node that is neither an extra nor an error node and whose text
/// passes `is_identifier_shape`.
///
/// `source` must be the bytes the tree was parsed from; nodes whose text is
/// not valid UTF-8 are ignored.
fn collect_identifier_tokens(tree: &tree_sitter::Tree, source: &[u8], out: &mut HashSet<String>) {
    let mut cursor: TreeCursor<'_> = tree.walk();
    loop {
        let node = cursor.node();
        let is_plain_leaf = node.child_count() == 0 && !node.is_extra() && !node.is_error();
        if is_plain_leaf {
            let token = node
                .utf8_text(source)
                .ok()
                .map(str::trim)
                .filter(|text| is_identifier_shape(text));
            if let Some(token) = token {
                out.insert(token.to_owned());
            }
        }

        // Descend when possible; otherwise climb until a sibling exists.
        if cursor.goto_first_child() {
            continue;
        }
        while !cursor.goto_next_sibling() {
            if !cursor.goto_parent() {
                // Back at the root with nothing left to visit.
                return;
            }
        }
    }
}
impl IntoFindings for DocDriftReport {
    /// Converts each report entry into a `doc_drift` finding.
    ///
    /// The primary location is the document line of the mention; the paired
    /// source files are attached as additional locations. The seed passed to
    /// `Finding::new` is built from the document path and the identifier.
    fn into_findings(&self) -> Vec<Finding> {
        let mut findings = Vec::with_capacity(self.entries.len());
        for entry in &self.entries {
            let seed = format!(
                "doc_drift:{}:{}",
                entry.doc_path.to_string_lossy(),
                entry.identifier,
            );
            let summary = format!(
                "doc_drift: doc references `{}` but no paired src defines it",
                entry.identifier,
            );
            let primary = Location {
                file: entry.doc_path.clone(),
                line: Some(entry.doc_line),
                symbol: None,
            };
            let related: Vec<Location> = entry
                .src_paths
                .iter()
                .map(|path| Location::file(path.clone()))
                .collect();
            findings.push(Finding::new("doc_drift", primary, summary, &seed).with_locations(related));
        }
        findings
    }
}
/// `Feature` implementation that turns a doc-drift report into findings.
pub struct DocDriftFeature;

impl Feature for DocDriftFeature {
    /// Static metadata: name `doc_drift`, version 1, docs-scanner kind.
    fn meta(&self) -> FeatureMeta {
        FeatureMeta {
            name: "doc_drift",
            version: 1,
            kind: FeatureKind::DocsScanner,
        }
    }

    /// Enabled exactly when `cfg.features.docs.enabled` is set.
    fn enabled(&self, cfg: &Config) -> bool {
        cfg.features.docs.enabled
    }

    /// This feature belongs to the docs family.
    fn family(&self) -> Family {
        Family::Docs
    }

    /// Lowers the doc-drift report, if one is present in `reports`, into
    /// decorated findings; returns an empty list when there is no report.
    ///
    /// NOTE(review): every finding is decorated with `Severity::Critical`;
    /// confirm that this is the intended tier for documentation drift.
    fn lower(
        &self,
        reports: &crate::observers::ObserverReports,
        _cfg: &Config,
        _cal: &crate::core::calibration::Calibration,
        hotspot: &HotspotIndex,
    ) -> Vec<Finding> {
        reports.doc_drift.as_ref().map_or_else(Vec::new, |report| {
            report
                .into_findings()
                .into_iter()
                .map(|finding| decorate(finding, Severity::Critical, hotspot))
                .collect()
        })
    }
}
// Unit tests for inline-mention extraction, the shape/definition filters,
// and mention resolution.
#[cfg(test)]
mod tests {
use super::*;
// Helper: extracts mentions from `body` and keeps only their text, in order.
fn names(body: &str) -> Vec<String> {
extract_inline_identifiers(body)
.into_iter()
.map(|m| m.text)
.collect()
}
// Spans in prose are extracted; a span inside a fenced code block is not.
#[test]
fn extracts_inline_identifiers_outside_fences() {
let got = names("Use `Foo::bar` to do X.\n\n```rust\nlet `Baz` = 1;\n```\n\nSee `Qux`.");
assert!(got.contains(&"Foo::bar".to_string()));
assert!(got.contains(&"Qux".to_string()));
assert!(!got.contains(&"Baz".to_string()), "fenced span leaked");
}
// A double-backtick span is skipped whole, including backticks nested in it.
#[test]
fn skips_double_backtick_spans() {
assert_eq!(
names("Embed ``with `nested` backticks`` here, plus `Real`."),
vec!["Real".to_string()],
);
}
// Pure numbers, spans with spaces, and bare punctuation are not identifiers.
#[test]
fn ignores_non_identifier_shape() {
assert_eq!(
names("Numbers `123`, prose `hello world`, punct `()`. But `Real_id`."),
vec!["Real_id".to_string()],
);
}
// File names (dot + short lowercase extension) are dropped.
#[test]
fn filters_file_extension_mentions() {
assert_eq!(
names(
"Files: `config.toml`, `state.json`, `lib.rs`, `quick-start.mdx`. \
Real: `Config`."
),
vec!["Config".to_string()],
);
}
// Anything containing a hyphen (CLI flags, crate names) is dropped.
#[test]
fn filters_cli_flags_and_hyphenated_names() {
assert_eq!(
names(
"Flags: `--feature`, `-v`. Crates: `heal-cli`, `cargo-llvm-cov`. \
Real: `Cli`."
),
vec!["Cli".to_string()],
);
}
// `::` paths whose segments all start lowercase are treated as module
// paths and dropped; paths with an uppercase-led segment are kept.
#[test]
fn filters_module_paths_but_keeps_type_references() {
assert_eq!(
names(
"Module: `core::finding`, `observers::run_all`. \
Type: `core::Error`, `tree_sitter::Tree`. \
Method: `Foo::bar`."
),
vec![
"core::Error".to_string(),
"tree_sitter::Tree".to_string(),
"Foo::bar".to_string(),
],
);
}
// Dotted all-lowercase strings are dropped; dotted spans containing an
// uppercase letter are kept, as are undotted snake_case names.
#[test]
fn filters_metric_strings_but_keeps_field_references() {
assert_eq!(
names(
"Metric: `change_coupling.drift`, `doc_link_health`. \
TOML key: `features.docs`. \
Field: `Finding.workspace`, `LocReport.primary`."
),
vec![
"doc_link_health".to_string(),
"Finding.workspace".to_string(),
"LocReport.primary".to_string(),
],
);
}
// Spans ending in `:` and spans containing angle brackets are dropped.
#[test]
fn filters_yaml_keys_and_parameterized_types() {
assert_eq!(
names(
"Front matter: `title:`, `description:`. \
Generics: `Option<usize>`, `Vec<DocBody>`. \
Real: `Foo`, `Option`."
),
vec!["Foo".to_string(), "Option".to_string()],
);
}
// Single-character spans are never treated as identifiers.
#[test]
fn filters_single_char_placeholders() {
assert_eq!(
names(
"Pattern: `X` and `Y` form `Foo<X, Y>`. Loop var `i`. \
Real: `Foo`, `Map`."
),
vec!["Foo".to_string(), "Map".to_string()],
);
}
// A mention present verbatim in the set resolves; an absent one does not.
#[test]
fn mention_resolves_via_exact_match() {
let mut set = HashSet::new();
set.insert("Config".to_string());
assert!(mention_resolves(&set, "Config"));
assert!(!mention_resolves(&set, "Other"));
}
// Qualified mentions resolve when each `::`- or `.`-separated segment is
// present in the set on its own.
#[test]
fn mention_resolves_via_decomposed_qualified_identifier() {
let mut set = HashSet::new();
for s in [
"Severity",
"Medium",
"Feature",
"lower",
"Finding",
"workspace",
] {
set.insert(s.to_string());
}
assert!(
mention_resolves(&set, "Severity::Medium"),
"enum variant must resolve",
);
assert!(
mention_resolves(&set, "Feature::lower"),
"trait method must resolve",
);
assert!(
mention_resolves(&set, "Finding.workspace"),
"field access must resolve",
);
}
// One missing segment (either side) is enough to keep the finding.
#[test]
fn mention_resolves_requires_every_segment() {
let mut set = HashSet::new();
set.insert("Foo".to_string());
assert!(
!mention_resolves(&set, "Foo::bar"),
"missing member must keep finding",
);
let mut set = HashSet::new();
set.insert("bar".to_string());
assert!(
!mention_resolves(&set, "Foo::bar"),
"missing qualifier must keep finding",
);
}
// Decomposition is not limited to two segments.
#[test]
fn mention_resolves_handles_three_segment_paths() {
let mut set = HashSet::new();
for s in ["core", "Error", "Io"] {
set.insert(s.to_string());
}
assert!(mention_resolves(&set, "core::Error::Io"));
let mut partial = HashSet::new();
for s in ["core", "Error"] {
partial.insert(s.to_string());
}
assert!(!mention_resolves(&partial, "core::Error::Io"));
}
// Hex-only spans containing a digit are dropped as hash fragments, while
// hex-letter-only words such as `face` survive.
#[test]
fn filters_hex_sha_fragments_but_keeps_words() {
assert_eq!(
names(
"Commits: `89d849a`, `c455dba7`, `deadbeef0`. \
Words: `face`, `bead`. Real: `Config`."
),
vec!["face".to_string(), "bead".to_string(), "Config".to_string(),],
);
}
// Dotted attribute paths from several languages: all-lowercase paths are
// dropped, paths with an uppercase letter are kept.
#[test]
fn filters_module_attribute_paths_across_languages() {
assert_eq!(
names("Use `os.path.join` and `requests.get`. Class: `MyClass.method`."),
vec!["MyClass.method".to_string()],
);
assert_eq!(
names("Call `fmt.Println` for output. Internal: `pkg.helper`."),
vec!["fmt.Println".to_string()],
);
assert_eq!(
names("Reference `scala.collection.immutable.List`. Lowercase: `pkg.helper`."),
vec!["scala.collection.immutable.List".to_string()],
);
}
// Source-file names with common extensions are dropped, even when the
// stem starts with an uppercase letter (`Build.scala`).
#[test]
fn filters_extensions_for_six_languages() {
assert_eq!(
names(
"Files: `lib.rs`, `script.py`, `app.ts`, `index.tsx`, \
`main.go`, `Build.scala`, `module.js`, `mod.jsx`. \
Type: `Foo`."
),
vec!["Foo".to_string()],
);
}
}