use rumdl_lib::rule::LintWarning;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use std::sync::atomic::{AtomicU64, Ordering};
/// Version of this crate, captured at compile time from `CARGO_PKG_VERSION`.
///
/// It names the per-version cache subdirectory and is stored in, and checked
/// against, every cache entry.
const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Process-wide sequence number that gives every temporary file a distinct name.
static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Writes `bytes` to `target` by writing a sibling temporary file first and
/// then renaming it over `target`.
///
/// The temporary path is `target` with its extension replaced by
/// `tmp.<pid>.<counter>`. If the write or the rename fails, removal of the
/// temporary file is attempted (errors from that removal are ignored) and the
/// original error is returned.
fn atomic_write(target: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let counter = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
    let staging = target.with_extension(format!("tmp.{}.{counter}", std::process::id()));

    let outcome = fs::write(&staging, bytes).and_then(|()| fs::rename(&staging, target));
    if outcome.is_err() {
        // Best-effort cleanup; the caller needs the original error, not this one.
        let _ = fs::remove_file(&staging);
    }
    outcome
}
/// Why a cache lookup did not produce a usable entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CacheMissReason {
    /// The cache is turned off.
    Disabled,
    /// No entry file exists at `path`.
    MissingEntry { path: PathBuf },
    /// The entry file at `path` exists but could not be read.
    UnreadableEntry { path: PathBuf, error: String },
    /// The entry file at `path` could not be parsed.
    InvalidEntry { path: PathBuf, error: String },
    /// The stored file hash differs from the requested one.
    FileChanged,
    /// The stored configuration hash differs from the requested one.
    ConfigChanged,
    /// The stored rules hash differs from the requested one.
    RulesChanged,
    /// The entry was written by a different version than the one running.
    VersionChanged { cached: String, current: &'static str },
}

/// Human-readable, single-line description of each miss reason.
impl std::fmt::Display for CacheMissReason {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Reasons without payload are fixed messages.
            Self::Disabled => f.write_str("cache is disabled"),
            Self::FileChanged => f.write_str("file content hash changed"),
            Self::ConfigChanged => f.write_str("configuration hash changed"),
            Self::RulesChanged => f.write_str("enabled rules hash changed"),

            // Reasons tied to an entry file include its path.
            Self::MissingEntry { path } => {
                write!(f, "no cache entry at {}", path.display())
            }
            Self::UnreadableEntry { path, error } => {
                write!(f, "could not read cache entry at {}: {error}", path.display())
            }
            Self::InvalidEntry { path, error } => {
                write!(f, "cache entry at {} is invalid: {error}", path.display())
            }

            Self::VersionChanged { cached, current } => {
                write!(f, "rumdl version changed from {cached} to {current}")
            }
        }
    }
}
/// Counters describing how the cache has been used.
#[derive(Debug, Default, Clone)]
pub struct CacheStats {
    /// Number of lookups that returned a cached result.
    pub hits: usize,
    /// Number of lookups recorded as a miss.
    pub misses: usize,
    /// Number of entries written successfully.
    pub writes: usize,
}

impl CacheStats {
    /// Percentage (0.0 to 100.0) of recorded lookups that were hits.
    ///
    /// Returns 0.0 when no lookup has been recorded.
    #[cfg(test)]
    pub fn hit_rate(&self) -> f64 {
        match self.hits + self.misses {
            0 => 0.0,
            total => (self.hits as f64 / total as f64) * 100.0,
        }
    }
}
/// On-disk representation of one cached lint result, serialized as JSON.
#[derive(Debug, Serialize, Deserialize)]
struct CacheEntry {
/// Hash of the linted file's content; compared on lookup.
file_hash: String,
/// Hash of the configuration in effect when the entry was written; compared on lookup.
config_hash: String,
/// Hash of the enabled rules when the entry was written; compared on lookup.
rules_hash: String,
/// Crate version that wrote the entry; compared against `VERSION` on lookup.
version: String,
/// The cached lint warnings returned on a hit.
warnings: Vec<LintWarning>,
/// UTC Unix timestamp taken when the entry was created; not read back by the code in this file.
timestamp: i64,
}
/// File-based cache of lint warnings, with hit/miss/write statistics.
pub struct LintCache {
/// Root directory of the cache; entries live in a subdirectory named after `VERSION`.
cache_dir: PathBuf,
/// When `false`, lookups report `CacheMissReason::Disabled` and writes are skipped.
enabled: bool,
/// Usage counters, behind a mutex so they can be updated through `&self`.
stats: Mutex<CacheStats>,
}
impl LintCache {
/// Creates a cache rooted at `cache_dir` with all statistics at zero.
///
/// Nothing is touched on disk here. When `enabled` is `false`, lookups report
/// `CacheMissReason::Disabled` and writes are skipped.
pub fn new(cache_dir: PathBuf, enabled: bool) -> Self {
    let stats = Mutex::new(CacheStats::default());
    Self {
        cache_dir,
        enabled,
        stats,
    }
}
/// Increments the hit counter; does nothing if the stats mutex is poisoned.
fn record_hit(&self) {
    let Ok(mut counters) = self.stats.lock() else {
        return;
    };
    counters.hits += 1;
}
/// Increments the miss counter; does nothing if the stats mutex is poisoned.
fn record_miss(&self) {
    let Ok(mut counters) = self.stats.lock() else {
        return;
    };
    counters.misses += 1;
}
/// Increments the write counter; does nothing if the stats mutex is poisoned.
fn record_write(&self) {
    let Ok(mut counters) = self.stats.lock() else {
        return;
    };
    counters.writes += 1;
}
/// Returns the BLAKE3 hash of `content` as a hex string.
///
/// With the `profiling` feature enabled, the time spent hashing is recorded.
pub fn hash_content(content: &str) -> String {
    #[cfg(feature = "profiling")]
    let timer = std::time::Instant::now();

    let digest = blake3::hash(content.as_bytes());
    let hex = digest.to_hex().to_string();

    #[cfg(feature = "profiling")]
    rumdl_lib::profiling::record_duration("cache: hash content", timer.elapsed());

    hex
}
pub fn hash_config(config: &rumdl_lib::config::Config) -> String {
#[cfg(feature = "profiling")]
let start = std::time::Instant::now();
let config_json = serde_json::to_string(config).unwrap_or_default();
let hash = blake3::hash(config_json.as_bytes()).to_hex().to_string();
#[cfg(feature = "profiling")]
rumdl_lib::profiling::record_duration("cache: hash config", start.elapsed());
hash
}
/// Returns the BLAKE3 hash (hex) of the rule names in `rules`.
///
/// The names are sorted and joined with `,` before hashing, so the order of
/// `rules` does not affect the result. With the `profiling` feature enabled,
/// the time spent hashing is recorded.
pub fn hash_rules(rules: &[Box<dyn rumdl_lib::rule::Rule>]) -> String {
    #[cfg(feature = "profiling")]
    let timer = std::time::Instant::now();

    let mut names: Vec<&str> = Vec::with_capacity(rules.len());
    for rule in rules {
        names.push(rule.name());
    }
    names.sort_unstable();

    let joined = names.join(",");
    let digest = blake3::hash(joined.as_bytes());
    let hex = digest.to_hex().to_string();

    #[cfg(feature = "profiling")]
    rumdl_lib::profiling::record_duration("cache: hash rules", timer.elapsed());

    hex
}
/// Builds the path of the entry file for `file_hash` and `rules_hash`:
/// `<cache_dir>/<VERSION>/<file_hash>_<rules_hash prefix>.json`.
///
/// Only the first 16 bytes of `rules_hash` go into the file name. A hash that
/// is shorter than that (or that cannot be cut at byte 16 on a character
/// boundary) is used in full instead of panicking, since the public lookup
/// and store methods accept arbitrary `&str` hashes.
fn cache_file_path(&self, file_hash: &str, rules_hash: &str) -> PathBuf {
    // `get` returns `None` rather than panicking on out-of-range or
    // non-boundary slices.
    let short_rules_hash = rules_hash.get(..16).unwrap_or(rules_hash);
    self.cache_dir
        .join(VERSION)
        .join(format!("{file_hash}_{short_rules_hash}.json"))
}
/// Test helper: looks up cached warnings for `content`, discarding the miss reason.
#[cfg(test)]
pub fn get(&self, content: &str, config_hash: &str, rules_hash: &str) -> Option<Vec<LintWarning>> {
    match self.get_with_reason(content, config_hash, rules_hash) {
        Ok(warnings) => Some(warnings),
        Err(_) => None,
    }
}
/// Test helper: hashes `content` and looks the result up, reporting why a miss happened.
#[cfg(test)]
pub fn get_with_reason(
    &self,
    content: &str,
    config_hash: &str,
    rules_hash: &str,
) -> Result<Vec<LintWarning>, CacheMissReason> {
    self.get_with_reason_for_hash(&Self::hash_content(content), config_hash, rules_hash)
}
pub fn get_with_reason_for_hash(
&self,
file_hash: &str,
config_hash: &str,
rules_hash: &str,
) -> Result<Vec<LintWarning>, CacheMissReason> {
if !self.enabled {
return Err(CacheMissReason::Disabled);
}
let cache_path = self.cache_file_path(file_hash, rules_hash);
#[cfg(feature = "profiling")]
let start = std::time::Instant::now();
let cache_data = match fs::read_to_string(&cache_path) {
Ok(data) => data,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
#[cfg(feature = "profiling")]
rumdl_lib::profiling::record_duration("cache: read entry", start.elapsed());
self.record_miss();
return Err(CacheMissReason::MissingEntry { path: cache_path });
}
Err(e) => {
#[cfg(feature = "profiling")]
rumdl_lib::profiling::record_duration("cache: read entry", start.elapsed());
self.record_miss();
return Err(CacheMissReason::UnreadableEntry {
path: cache_path,
error: e.to_string(),
});
}
};
#[cfg(feature = "profiling")]
rumdl_lib::profiling::record_duration("cache: read entry", start.elapsed());
#[cfg(feature = "profiling")]
let start = std::time::Instant::now();
let entry: CacheEntry = match serde_json::from_str(&cache_data) {
Ok(entry) => entry,
Err(e) => {
#[cfg(feature = "profiling")]
rumdl_lib::profiling::record_duration("cache: parse entry", start.elapsed());
self.record_miss();
return Err(CacheMissReason::InvalidEntry {
path: cache_path,
error: e.to_string(),
});
}
};
#[cfg(feature = "profiling")]
rumdl_lib::profiling::record_duration("cache: parse entry", start.elapsed());
if entry.file_hash != file_hash {
self.record_miss();
return Err(CacheMissReason::FileChanged);
}
if entry.config_hash != config_hash {
self.record_miss();
return Err(CacheMissReason::ConfigChanged);
}
if entry.rules_hash != rules_hash {
self.record_miss();
return Err(CacheMissReason::RulesChanged);
}
if entry.version != VERSION {
self.record_miss();
return Err(CacheMissReason::VersionChanged {
cached: entry.version,
current: VERSION,
});
}
self.record_hit();
Ok(entry.warnings)
}
/// Test helper: hashes `content` and stores `warnings` under that hash.
#[cfg(test)]
pub fn set(&self, content: &str, config_hash: &str, rules_hash: &str, warnings: Vec<LintWarning>) {
    self.set_with_hash(&Self::hash_content(content), config_hash, rules_hash, warnings);
}
/// Stores `warnings` for an already-computed `file_hash`.
///
/// Does nothing when the cache is disabled. Otherwise the entry is
/// serialized as pretty-printed JSON and written with `atomic_write`.
/// Storing is best-effort: serialization and write failures are logged at
/// debug level and never propagated. The write counter is incremented only
/// after a successful write. With the `profiling` feature enabled, the
/// serialize and write phases are timed.
pub fn set_with_hash(&self, file_hash: &str, config_hash: &str, rules_hash: &str, warnings: Vec<LintWarning>) {
    if !self.enabled {
        return;
    }

    let cache_path = self.cache_file_path(file_hash, rules_hash);
    if let Some(parent) = cache_path.parent() {
        // Ignored on purpose: if the directory could not be created, the
        // write below fails and that failure is logged.
        let _ = fs::create_dir_all(parent);
    }

    let entry = CacheEntry {
        file_hash: file_hash.to_string(),
        config_hash: config_hash.to_string(),
        rules_hash: rules_hash.to_string(),
        version: VERSION.to_string(),
        warnings,
        timestamp: chrono::Utc::now().timestamp(),
    };

    #[cfg(feature = "profiling")]
    let start = std::time::Instant::now();
    let json = serde_json::to_string_pretty(&entry);
    #[cfg(feature = "profiling")]
    rumdl_lib::profiling::record_duration("cache: serialize entry", start.elapsed());

    let json = match json {
        Ok(json) => json,
        Err(e) => {
            // Previously dropped silently; report it the same way as a write failure.
            log::debug!("Cache serialization failed for {}: {}", cache_path.display(), e);
            return;
        }
    };

    #[cfg(feature = "profiling")]
    let start = std::time::Instant::now();
    match atomic_write(&cache_path, json.as_bytes()) {
        Ok(()) => self.record_write(),
        Err(e) => log::debug!("Cache write failed for {}: {}", cache_path.display(), e),
    }
    #[cfg(feature = "profiling")]
    rumdl_lib::profiling::record_duration("cache: write entry", start.elapsed());
}
/// Removes the whole cache directory and everything in it.
///
/// A directory that does not exist counts as already cleared. The removal is
/// attempted directly instead of checking for existence first, so a
/// directory that disappears concurrently does not turn into an error, and
/// other failures (for example missing permissions) are reported rather than
/// being mistaken for "nothing to clear".
///
/// # Errors
///
/// Returns any I/O error from the removal other than `NotFound`.
pub fn clear(&self) -> std::io::Result<()> {
    match fs::remove_dir_all(&self.cache_dir) {
        Ok(()) => Ok(()),
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
        Err(e) => Err(e),
    }
}
/// Prepares the cache directory for use; a no-op when the cache is disabled.
///
/// Creates the subdirectory for the current version, prunes directories left
/// by other versions, and writes the `.gitignore` and `CACHEDIR.TAG` marker
/// files if they are not already present.
///
/// # Errors
///
/// Returns the first I/O error from creating the directory, pruning, or
/// writing a marker file.
pub fn init(&self) -> std::io::Result<()> {
    if !self.enabled {
        return Ok(());
    }

    fs::create_dir_all(self.cache_dir.join(VERSION))?;
    self.prune_old_versions()?;

    // Marker files, written in this order and only when missing.
    let markers: [(&str, &str); 2] = [
        (".gitignore", "# Automatically created by rumdl.\n*\n"),
        (
            "CACHEDIR.TAG",
            "Signature: 8a477f597d28d172789f06886806bc55\n# This file is a cache directory tag created by rumdl.\n",
        ),
    ];
    for (file_name, contents) in markers {
        let marker_path = self.cache_dir.join(file_name);
        if !marker_path.exists() {
            fs::write(marker_path, contents)?;
        }
    }

    Ok(())
}
/// Deletes cache subdirectories that belong to other versions.
///
/// A subdirectory is pruned when its name starts with an ASCII digit and is
/// not equal to `VERSION`. Non-directories, names that are not valid UTF-8,
/// and directory entries that fail to read are skipped. A failed removal is
/// logged as a warning and does not abort the scan.
///
/// # Errors
///
/// Returns an error only if the cache directory exists but cannot be listed.
fn prune_old_versions(&self) -> std::io::Result<()> {
    if !self.cache_dir.exists() {
        return Ok(());
    }

    for entry in fs::read_dir(&self.cache_dir)?.flatten() {
        let candidate = entry.path();
        if !candidate.is_dir() {
            continue;
        }

        let Some(dir_name) = candidate.file_name().and_then(|n| n.to_str()) else {
            continue;
        };

        let starts_with_digit = dir_name.chars().next().is_some_and(|c| c.is_ascii_digit());
        if dir_name == VERSION || !starts_with_digit {
            continue;
        }

        log::info!("Pruning old cache version: {dir_name}");
        if let Err(e) = fs::remove_dir_all(&candidate) {
            log::warn!("Failed to prune old cache {dir_name}: {e}");
        }
    }

    Ok(())
}
/// Test helper: returns a snapshot of the counters, or zeroed counters if the mutex is poisoned.
#[cfg(test)]
pub fn stats(&self) -> CacheStats {
    match self.stats.lock() {
        Ok(counters) => counters.clone(),
        Err(_) => CacheStats::default(),
    }
}
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
/// A disabled cache never returns entries and never counts writes.
#[test]
fn test_cache_disabled() {
    let scratch = TempDir::new().unwrap();
    let disabled = LintCache::new(scratch.path().to_path_buf(), false);

    let (content, config_hash) = ("# Test", "abc123");
    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    let lookup = disabled.get(content, config_hash, rules_hash);
    assert!(lookup.is_none());

    disabled.set(content, config_hash, rules_hash, vec![]);
    let written = disabled.stats().writes;
    assert_eq!(written, 0);
}
/// Looking up an entry that was never stored counts as exactly one miss.
#[test]
fn test_cache_miss() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);

    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
    let lookup = cache.get("# Test", "abc123", rules_hash);
    assert!(lookup.is_none());

    let snapshot_misses = cache.stats().misses;
    let snapshot_hits = cache.stats().hits;
    assert_eq!(snapshot_misses, 1);
    assert_eq!(snapshot_hits, 0);
}
/// An empty cache reports `MissingEntry` with a readable message.
#[test]
fn test_cache_miss_reason_missing_entry() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);

    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";
    let lookup = cache.get_with_reason("# Test", "abc123", rules_hash);
    let reason = lookup.expect_err("empty cache should miss");

    assert!(matches!(reason, CacheMissReason::MissingEntry { .. }));
    let message = reason.to_string();
    assert!(message.contains("no cache entry at"));
    assert_eq!(cache.stats().misses, 1);
}
/// A stored entry is returned unchanged and counted as a hit.
#[test]
fn test_cache_hit() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);
    cache.init().unwrap();

    let (content, config_hash) = ("# Test", "abc123");
    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    let expected = vec![];
    cache.set(content, config_hash, rules_hash, expected.clone());

    let roundtrip = cache.get(content, config_hash, rules_hash);
    assert_eq!(roundtrip, Some(expected));
    assert_eq!(cache.stats().hits, 1);
}
/// An entry stored for one content is not returned for different content.
#[test]
fn test_cache_invalidation_on_content_change() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);
    cache.init().unwrap();

    let config_hash = "abc123";
    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    cache.set("# Test 1", config_hash, rules_hash, vec![]);

    let lookup = cache.get("# Test 2", config_hash, rules_hash);
    assert!(lookup.is_none());
}
/// An entry stored under one config hash is not returned for another.
#[test]
fn test_cache_invalidation_on_config_change() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);
    cache.init().unwrap();

    let content = "# Test";
    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    cache.set(content, "abc123", rules_hash, vec![]);

    let lookup = cache.get(content, "def456", rules_hash);
    assert!(lookup.is_none());
}
/// A differing config hash is reported as `ConfigChanged`.
#[test]
fn test_cache_miss_reason_config_changed() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);
    cache.init().unwrap();

    let content = "# Test";
    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    cache.set(content, "abc123", rules_hash, vec![]);

    let lookup = cache.get_with_reason(content, "def456", rules_hash);
    let reason = lookup.expect_err("changed config hash should miss");

    assert_eq!(reason, CacheMissReason::ConfigChanged);
    assert_eq!(reason.to_string(), "configuration hash changed");
}
/// Equal content hashes equally; different content hashes differently.
#[test]
fn test_hash_content() {
    let baseline = LintCache::hash_content("# Test");
    let repeated = LintCache::hash_content("# Test");
    let other = LintCache::hash_content("# Different");

    assert_eq!(baseline, repeated);
    assert_ne!(baseline, other);
}
/// Loading the same config file many times must always yield one config hash.
#[test]
fn test_hash_config_is_stable_across_repeated_config_loads() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join(".rumdl.toml");

    // A config with many per-file-ignore entries.
    let mut config_content = String::from(
        r#"
[global]
line-length = 100
[per-file-ignores]
"#,
    );
    let ignore_lines: String = (0..64)
        .map(|i| format!("\"docs/section-{i:02}/**/*.md\" = [\"MD013\", \"MD033\"]\n"))
        .collect();
    config_content.push_str(&ignore_lines);
    std::fs::write(&config_path, config_content).unwrap();

    let config_arg = config_path.to_str().unwrap();
    let hashes: std::collections::BTreeSet<String> = (0..128)
        .map(|_| {
            let sourced =
                rumdl_lib::config::SourcedConfig::load_with_discovery(Some(config_arg), None, true).unwrap();
            let config: rumdl_lib::config::Config = sourced.into_validated_unchecked().into();
            LintCache::hash_config(&config)
        })
        .collect();

    let unique_count = hashes.len();
    let sample: Vec<_> = hashes.iter().take(3).cloned().collect();
    assert_eq!(
        unique_count, 1,
        "loading the same config repeatedly must produce one stable config hash, got {unique_count} unique hashes; sample: {sample:?}",
    );
}
/// A config with many code-block-tools map entries must hash identically on every load.
#[test]
fn test_hash_config_is_stable_with_code_block_tools_maps() {
    let temp_dir = TempDir::new().unwrap();
    let config_path = temp_dir.path().join(".rumdl.toml");

    let mut config_content = String::from(
        r#"
[global]
line-length = 100
[code-block-tools]
enabled = true
[code-block-tools.language-aliases]
"#,
    );

    // Aliases first, then one language table and one tool table per index.
    let alias_lines: String = (0..32)
        .map(|i| format!("\"alias-{i:02}\" = \"lang-{i:02}\"\n"))
        .collect();
    config_content.push_str(&alias_lines);

    for i in 0..32 {
        let language_table = format!(
            "\n[code-block-tools.languages.\"lang-{i:02}\"]\nenabled = false\n",
        );
        let tool_table = format!(
            "\n[code-block-tools.tools.\"tool-{i:02}\"]\ncommand = [\"tool-{i:02}\"]\n",
        );
        config_content.push_str(&language_table);
        config_content.push_str(&tool_table);
    }
    std::fs::write(&config_path, config_content).unwrap();

    let config_arg = config_path.to_str().unwrap();
    let hashes: std::collections::BTreeSet<String> = (0..128)
        .map(|_| {
            let sourced =
                rumdl_lib::config::SourcedConfig::load_with_discovery(Some(config_arg), None, true).unwrap();
            let config: rumdl_lib::config::Config = sourced.into_validated_unchecked().into();
            LintCache::hash_config(&config)
        })
        .collect();

    let unique_count = hashes.len();
    let sample: Vec<_> = hashes.iter().take(3).cloned().collect();
    assert_eq!(
        unique_count, 1,
        "code-block-tools maps must serialize deterministically, got {unique_count} unique hashes; sample: {sample:?}",
    );
}
/// Miss, write and hit counters track a miss → store → hit sequence.
#[test]
fn test_cache_stats() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);
    cache.init().unwrap();

    let (content, config_hash) = ("# Test", "abc123");
    let rules_hash = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    // First lookup: nothing stored yet.
    let _ = cache.get(content, config_hash, rules_hash);
    let after_miss = cache.stats();
    assert_eq!(after_miss.misses, 1);
    assert_eq!(after_miss.hits, 0);

    // Store an entry.
    cache.set(content, config_hash, rules_hash, vec![]);
    assert_eq!(cache.stats().writes, 1);

    // Second lookup: served from the cache.
    let _ = cache.get(content, config_hash, rules_hash);
    let after_hit = cache.stats();
    assert_eq!(after_hit.hits, 1);
    assert_eq!(after_hit.hit_rate(), 50.0);
}
/// Clearing the cache removes the cache directory itself.
#[test]
fn test_cache_clear() {
    let scratch = TempDir::new().unwrap();
    let cache = LintCache::new(scratch.path().to_path_buf(), true);
    cache.init().unwrap();

    cache.set(
        "# Test",
        "abc",
        "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef",
        vec![],
    );

    cache.clear().unwrap();
    let still_there = cache.cache_dir.exists();
    assert!(!still_there);
}
/// `init` removes directories of other versions but leaves unrelated directories alone.
#[test]
fn test_prune_old_versions() {
    let scratch = TempDir::new().unwrap();
    let cache_dir = scratch.path().to_path_buf();

    // Directories left behind by older versions, two of them with content.
    let stale_versions = ["0.0.1", "0.0.50", "0.0.100"];
    for version in stale_versions {
        fs::create_dir_all(cache_dir.join(version)).unwrap();
    }
    for version in ["0.0.1", "0.0.50"] {
        fs::write(cache_dir.join(version).join("test.json"), "{}").unwrap();
    }
    // A directory that does not look like a version.
    fs::create_dir_all(cache_dir.join("some_other_dir")).unwrap();

    let cache = LintCache::new(cache_dir.clone(), true);
    cache.init().unwrap();

    assert!(cache_dir.join(VERSION).exists());
    for version in stale_versions {
        assert!(!cache_dir.join(version).exists());
    }
    assert!(cache_dir.join("some_other_dir").exists());
}
/// Many threads writing the same target must leave one writer's complete payload behind.
#[test]
fn test_atomic_write_concurrent_no_corruption() {
    use std::sync::Arc;
    use std::thread;

    let temp_dir = TempDir::new().unwrap();
    let target = Arc::new(temp_dir.path().join("entry.json"));

    // Each writer has a distinct fill byte and a distinct payload length.
    let writers: Vec<_> = (0..16u8)
        .map(|writer_id| {
            let target = Arc::clone(&target);
            thread::spawn(move || {
                let payload = vec![b'a' + writer_id; 4096 * (writer_id as usize + 1)];
                for _ in 0..32 {
                    atomic_write(&target, &payload).expect("atomic write succeeds");
                }
                payload
            })
        })
        .collect();

    let mut payloads: Vec<Vec<u8>> = Vec::with_capacity(writers.len());
    for writer in writers {
        payloads.push(writer.join().unwrap());
    }

    let final_bytes = fs::read(target.as_path()).expect("target file readable");
    assert!(
        payloads.contains(&final_bytes),
        "final cache file must equal exactly one writer's full payload, got {} bytes",
        final_bytes.len()
    );
}
}