use std::collections::HashSet;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use serde::Serialize;
use crate::ingest::{is_supported_file, sha256_hex};
use crate::store::Store;
/// Outcome of comparing the indexed `file://` sources with the filesystem, as
/// assembled by [`diff`].
#[derive(Debug, Clone, Serialize)]
pub struct DiffReport {
/// Canonicalized scan root rendered as a (lossy) string; `None` when `diff`
/// was called without a path.
pub scanned_path: Option<String>,
/// Indexed files whose current content hash equals the indexed hash.
pub unchanged: Vec<FileState>,
/// Indexed files that are readable but whose current hash differs from the
/// indexed hash.
pub changed: Vec<FileState>,
/// Indexed files that could not be read from disk.
pub missing: Vec<MissingFile>,
/// Supported files found under the scan root that are not among the indexed
/// paths; stays empty when no scan root was given.
pub unindexed: Vec<UnindexedFile>,
}
/// An indexed file that was successfully read, carrying both the indexed hash
/// and the hash of its current content.
#[derive(Debug, Clone, Serialize)]
pub struct FileState {
/// Value of the `id` column of the `sources` row.
pub source_id: String,
/// Value of the `uri` column of the `sources` row (starts with `file://`).
pub uri: String,
/// The URI with its `file://` prefix removed, rendered as a (lossy) string.
pub path: String,
/// Value of the `content_sha256` column of the `sources` row.
pub indexed_sha256: String,
/// `sha256_hex` of the bytes currently on disk.
pub current_sha256: String,
}
/// An indexed file that could not be read from disk.
#[derive(Debug, Clone, Serialize)]
pub struct MissingFile {
/// Value of the `id` column of the `sources` row.
pub source_id: String,
/// Value of the `uri` column of the `sources` row (starts with `file://`).
pub uri: String,
/// The URI with its `file://` prefix removed, rendered as a (lossy) string.
pub path: String,
/// Value of the `content_sha256` column of the `sources` row.
pub indexed_sha256: String,
/// `"not found"` when the read failed with `io::ErrorKind::NotFound`;
/// otherwise the display text of the I/O error.
pub reason: String,
}
/// A supported file found while walking the scan root that is not among the
/// indexed paths.
#[derive(Debug, Clone, Serialize)]
pub struct UnindexedFile {
/// Canonicalized path of the file, rendered as a (lossy) string.
pub path: String,
/// File length from filesystem metadata; `0` when the metadata lookup fails.
pub bytes: u64,
}
/// Compares the file-backed sources recorded in `store` with what is currently on disk.
///
/// Every row of the `sources` table whose `uri` starts with `file://` is re-read and
/// re-hashed with `sha256_hex`, then sorted into one of three buckets:
///
/// * `unchanged` — the current hash equals the indexed hash;
/// * `changed` — the file is readable but its hash differs;
/// * `missing` — the file could not be read (`reason` is `"not found"` for
///   `io::ErrorKind::NotFound`, otherwise the I/O error text).
///
/// When `scanned_path` is given it is canonicalized and used as the scan root:
/// indexed sources whose path does not start with the root are skipped, and the
/// tree below the root is walked (sorted by file name) to collect supported files
/// that are not among the indexed paths as `unindexed`. Each such file is reported
/// at most once. Without a `scanned_path` every `file://` source is checked and
/// `unindexed` stays empty.
///
/// # Errors
///
/// Fails when `scanned_path` cannot be canonicalized, when preparing or running the
/// query against `store` fails, or when the directory walk yields an error.
pub fn diff(store: &Store, scanned_path: Option<&Path>) -> Result<DiffReport> {
    let scan_root = scanned_path
        .map(|p| fs::canonicalize(p).with_context(|| format!("canonicalize {}", p.display())))
        .transpose()?;

    // (id, uri, content_sha256) for every file-backed source. The statement and
    // connection are scoped to this block so they are released before any
    // filesystem work starts.
    let indexed: Vec<(String, String, String)> = {
        let conn = store.conn();
        let mut stmt = conn.prepare(
            "SELECT id, uri, content_sha256 FROM sources\n\
             WHERE uri LIKE 'file://%'\n\
             ORDER BY id",
        )?;
        let rows = stmt.query_map([], |row| {
            Ok((
                row.get::<_, String>(0)?,
                row.get::<_, String>(1)?,
                row.get::<_, String>(2)?,
            ))
        })?;
        rows.collect::<Result<Vec<_>, _>>()?
    };

    let mut unchanged = Vec::new();
    let mut changed = Vec::new();
    let mut missing = Vec::new();
    // Paths of all indexed sources inside the scan scope, readable or not; the
    // walk below uses this set to decide what counts as unindexed.
    let mut indexed_abs: HashSet<PathBuf> = HashSet::new();

    for (source_id, uri, indexed_sha256) in indexed {
        // The SQL filter guarantees the prefix; fall back to the raw URI anyway.
        let abs = PathBuf::from(uri.strip_prefix("file://").unwrap_or(uri.as_str()));
        if scan_root.as_ref().is_some_and(|root| !abs.starts_with(root)) {
            continue;
        }
        let path = abs.to_string_lossy().into_owned();

        match fs::read(&abs) {
            Ok(bytes) => {
                let current_sha256 = sha256_hex(&bytes);
                // Compare before the hashes are moved into the record so that
                // neither has to be cloned.
                let is_unchanged = current_sha256 == indexed_sha256;
                let state = FileState {
                    source_id,
                    uri,
                    path,
                    indexed_sha256,
                    current_sha256,
                };
                if is_unchanged {
                    unchanged.push(state);
                } else {
                    changed.push(state);
                }
            }
            Err(err) => {
                let reason = if err.kind() == io::ErrorKind::NotFound {
                    "not found".to_string()
                } else {
                    err.to_string()
                };
                missing.push(MissingFile {
                    source_id,
                    uri,
                    path,
                    indexed_sha256,
                    reason,
                });
            }
        }

        indexed_abs.insert(abs);
    }

    let mut unindexed = Vec::new();
    if let Some(root) = &scan_root {
        // The walk does not follow symlinks, but `Path::is_file` does, so a
        // symlinked file and its target can canonicalize to the same path.
        // Remember what was already reported to avoid duplicate entries.
        let mut reported: HashSet<PathBuf> = HashSet::new();

        for entry in walkdir::WalkDir::new(root).sort_by_file_name() {
            let entry = entry.with_context(|| format!("walk {}", root.display()))?;
            let p = entry.path();
            if !p.is_file() || !is_supported_file(p) {
                continue;
            }
            // Best effort: an entry that cannot be canonicalized (for example
            // because it vanished mid-walk) is skipped rather than failing the diff.
            let Ok(canonical) = fs::canonicalize(p) else {
                continue;
            };
            if indexed_abs.contains(&canonical) || reported.contains(&canonical) {
                continue;
            }
            // Best effort as well: report a size of 0 when metadata is unavailable.
            let bytes = fs::metadata(&canonical).map(|m| m.len()).unwrap_or(0);
            unindexed.push(UnindexedFile {
                path: canonical.to_string_lossy().into_owned(),
                bytes,
            });
            reported.insert(canonical);
        }
    }

    Ok(DiffReport {
        scanned_path: scan_root.map(|p| p.to_string_lossy().into_owned()),
        unchanged,
        changed,
        missing,
        unindexed,
    })
}
/// Prints a human-readable rendering of `report` to standard output.
///
/// The output starts with the scan root (or `<store-wide>` when there is none)
/// and a one-line count of every bucket. A `changed:`, `missing:` and
/// `unindexed:` section follows for each bucket that is not empty. Hashes in the
/// `changed:` section are shortened to their first 12 characters.
pub fn print_text(report: &DiffReport) {
    /// Number of leading hash characters shown in the `changed:` section.
    const SHORT_HASH_CHARS: usize = 12;

    /// Returns at most the first `SHORT_HASH_CHARS` characters of `hash`.
    ///
    /// The cut is made on a character boundary, so a stored hash that is not
    /// plain ASCII hex cannot cause a slicing panic.
    fn short_hash(hash: &str) -> &str {
        hash.char_indices()
            .nth(SHORT_HASH_CHARS)
            .map_or(hash, |(end, _)| &hash[..end])
    }

    match &report.scanned_path {
        Some(p) => println!("scanned: {p}"),
        None => println!("scanned: <store-wide>"),
    }
    println!(
        "unchanged: {} changed: {} missing: {} unindexed: {}",
        report.unchanged.len(),
        report.changed.len(),
        report.missing.len(),
        report.unindexed.len(),
    );

    if !report.changed.is_empty() {
        println!();
        println!("changed:");
        for f in &report.changed {
            println!(
                " {id} indexed={old} -> current={new} {path}",
                id = f.source_id,
                old = short_hash(&f.indexed_sha256),
                new = short_hash(&f.current_sha256),
                path = f.path,
            );
        }
    }

    if !report.missing.is_empty() {
        println!();
        println!("missing:");
        for f in &report.missing {
            println!(
                " {id} {path} ({reason})",
                id = f.source_id,
                path = f.path,
                reason = f.reason,
            );
        }
    }

    if !report.unindexed.is_empty() {
        println!();
        println!("unindexed:");
        for f in &report.unindexed {
            println!(" {path} ({bytes} B)", path = f.path, bytes = f.bytes);
        }
    }
}
/// Writes `report` to standard output as pretty-printed JSON followed by a newline.
///
/// # Errors
///
/// Fails when `serde_json` cannot serialize the report.
pub fn print_json(report: &DiffReport) -> Result<()> {
    let rendered = serde_json::to_string_pretty(report)?;
    println!("{rendered}");
    Ok(())
}