cordance-scan 0.1.0

Cordance repository scanners. Deterministic surface classification.
Documentation
//! Directory walker. Honours `.gitignore` via the `ignore` crate.
//!
//! Blocked paths are emitted as `BlockedSurface` records without reading their
//! content. Non-blocked paths are hashed and classified.
//!
//! Hashing is the dominant cost for large repositories. The walker therefore
//! splits work into four phases:
//!
//! 1. **Discovery.** Walk the tree, drop directories, capture
//!    `(abs_path, rel_str, rel_path, metadata)` for every file. No hashing
//!    happens here.
//! 2. **Classification.** A pure function from `(rel_str, metadata)` to a
//!    partially-built record. Blocked paths short-circuit and are *not*
//!    hashed.
//! 3. **Hashing.** Performed per file, in the same function that classifies
//!    it. When the number of candidate files exceeds [`PARALLEL_THRESHOLD`],
//!    that function runs on the rayon thread pool via `into_par_iter`;
//!    otherwise it runs sequentially on the calling thread.
//! 4. **Sort.** Results are sorted by repo-relative path so output is
//!    deterministic regardless of parallelism.

use camino::Utf8PathBuf;
use chrono::{DateTime, Utc};
use cordance_core::source::{SourceClass, SourceRecord};
use rayon::prelude::*;
use tracing::warn;

use crate::ScanError;

/// File-count threshold for switching to rayon-parallel hashing.
///
/// Below this, the overhead of dispatching to a thread pool isn't worth it;
/// above it, sha256 of file bytes dominates and scales nearly linearly with
/// cores.
///
/// The comparison in [`walk`] is strict: exactly `PARALLEL_THRESHOLD`
/// discovered files are still processed sequentially; one more switches to
/// the parallel iterator.
pub const PARALLEL_THRESHOLD: usize = 256;

/// Discovery-phase tuple. Carries everything `process_entry` needs without
/// requiring a second `stat` call per file.
///
/// Fields, in order:
///
/// 0. the path as yielded by the walker (the scan root joined with the
///    entry), used to open the file for hashing;
/// 1. the repo-relative path as a `String` with `\` replaced by `/`, used
///    for blocking and classification;
/// 2. the same repo-relative path as a `Utf8PathBuf`, stored in the record;
/// 3. the entry's metadata, or `None` when reading it failed during
///    discovery (a warning is logged in that case).
type PreparedEntry = (Utf8PathBuf, String, Utf8PathBuf, Option<std::fs::Metadata>);

/// Scan the repository at `root` and produce one `SourceRecord` per file.
///
/// Discovery runs first and collects every file the `ignore` walker yields
/// (hidden files included, `.gitignore` rules applied). Each discovered file
/// is then turned into a record by `process_entry`, which classifies it and
/// hashes it unless it is blocked.
///
/// When more than [`PARALLEL_THRESHOLD`] files were discovered, the per-file
/// step is driven by rayon's `into_par_iter`; otherwise it runs on the
/// calling thread. Either way the records are sorted by their repo-relative
/// path before being returned, so the output order does not depend on which
/// route was taken.
///
/// # Errors
///
/// Propagates any error from discovery: `ScanError::Io` when the walker
/// reports a failure, or `ScanError::NonUtf8Path` when a path cannot be
/// represented as UTF-8.
pub fn walk(root: &Utf8PathBuf) -> Result<Vec<SourceRecord>, ScanError> {
    discover(root).map(|entries| {
        let mut records: Vec<SourceRecord> = if entries.len() <= PARALLEL_THRESHOLD {
            // Small batch: stay on the calling thread.
            entries.into_iter().map(process_entry).collect()
        } else {
            // Large batch: fan the per-file work out through rayon.
            entries.into_par_iter().map(process_entry).collect()
        };

        // Order by path so both routes yield the same sequence.
        records.sort_by(|lhs, rhs| lhs.path.cmp(&rhs.path));
        records
    })
}

/// Phase 1: walk the tree and capture per-file context.
///
/// Builds an `ignore` walker rooted at `root` with hidden files enabled,
/// skips directories (and entries that report no file type), and records a
/// [`PreparedEntry`] for every remaining entry. No file content is read
/// here.
///
/// A failure to read an entry's metadata is not fatal: it is logged and the
/// entry is kept with `None` in the metadata slot.
///
/// # Errors
///
/// * `ScanError::Io` when the walker yields an error. The reported `path` is
///   the scan root; the wrapped `std::io::Error` keeps the original
///   `ErrorKind` (when the walker error wraps an I/O error) and carries the
///   walker's full message.
/// * `ScanError::NonUtf8Path` when an entry's path is not valid UTF-8.
fn discover(root: &Utf8PathBuf) -> Result<Vec<PreparedEntry>, ScanError> {
    let walker = ignore::WalkBuilder::new(root.as_std_path())
        .hidden(false)
        .build();

    let mut prepared: Vec<PreparedEntry> = Vec::new();

    for entry in walker {
        let entry = entry.map_err(|e| {
            // `std::io::Error` cannot be cloned out of the walker error, so a
            // new one is built. Building it from the bare `ErrorKind` would
            // discard everything else the walker reported, so the walker's
            // rendered message is attached as the payload.
            let detail = e.to_string();
            let io = match e.io_error() {
                Some(inner) => std::io::Error::new(inner.kind(), detail),
                None => std::io::Error::other(detail),
            };
            ScanError::Io {
                path: root.clone(),
                source: io,
            }
        })?;

        // Skip directories (and entries with no file type); we only want
        // files.
        if entry.file_type().is_none_or(|ft| ft.is_dir()) {
            continue;
        }

        let abs_path: Utf8PathBuf = entry
            .path()
            .to_str()
            .ok_or_else(|| ScanError::NonUtf8Path(entry.path().to_string_lossy().into_owned()))
            .map(Utf8PathBuf::from)?;

        // Repo-relative path with forward slashes. `abs_path` is already
        // known to be UTF-8, so any sub-path of it is too and no second
        // UTF-8 check is needed. If the root is not a prefix, the full path
        // is used unchanged.
        // NOTE(review): the `\` -> `/` rewrite is unconditional, so on
        // platforms where `\` is a legal file-name character it is rewritten
        // as well — confirm this is intended.
        let rel_str = abs_path
            .strip_prefix(root.as_std_path())
            .unwrap_or(abs_path.as_path())
            .as_str()
            .replace('\\', "/");
        let rel_path = Utf8PathBuf::from(&rel_str);

        // Metadata is best-effort: log and carry `None` rather than abort
        // the whole scan for one unreadable entry.
        let meta = match entry.metadata() {
            Ok(m) => Some(m),
            Err(e) => {
                warn!("failed to read metadata for {abs_path}: {e}");
                None
            }
        };

        prepared.push((abs_path, rel_str, rel_path, meta));
    }

    Ok(prepared)
}

/// Phases 2+3: classify and (if not blocked) hash a single prepared entry.
///
/// Pure function — safe to call concurrently from a rayon parallel iterator.
/// All inputs are owned values; no shared mutable state.
fn process_entry(prepared: PreparedEntry) -> SourceRecord {
    let (abs_path, rel_str, rel_path, meta) = prepared;

    if crate::blocked::is_blocked(&rel_str) {
        let size_bytes = meta.as_ref().map_or(0, std::fs::Metadata::len);
        return SourceRecord {
            id: SourceRecord::stable_id(SourceClass::BlockedSurface, &rel_path),
            path: rel_path,
            class: SourceClass::BlockedSurface,
            sha256: String::new(),
            size_bytes,
            modified: None,
            blocked: true,
            blocked_reason: crate::blocked::block_reason(&rel_str).map(str::to_string),
        };
    }

    let sha256 = match crate::hasher::sha256_file(&abs_path) {
        Ok(h) => h,
        Err(e) => {
            // Hashing failed (permissions, transient I/O, etc.). Emit a
            // `BlockedSurface` record rather than a fake-empty sha256:
            // - keeps the record visible in reports for drift / audit;
            // - guarantees drift detection never compares against an empty
            //   hash that could collide with another unreadable file;
            // - downstream consumers (lock writers, receipt builders) can
            //   filter on `blocked` and skip these cleanly.
            //
            // The full error (which embeds the absolute on-disk path) is
            // logged for the operator, but the `blocked_reason` returned to
            // callers is path-free. `pack.json` and MCP responses can be
            // shared between users, so leaking an absolute filesystem path
            // there would be a low-grade information disclosure. (Round-2
            // redteam HIGH.)
            warn!(path = %abs_path, error = %e, "hash failed");
            let size_bytes = meta.as_ref().map_or(0, std::fs::Metadata::len);
            return SourceRecord {
                id: SourceRecord::stable_id(SourceClass::BlockedSurface, &rel_path),
                path: rel_path,
                class: SourceClass::BlockedSurface,
                sha256: String::new(),
                size_bytes,
                modified: None,
                blocked: true,
                blocked_reason: Some("hash failed (see logs)".into()),
            };
        }
    };

    let (size_bytes, modified) = meta.map_or((0, None), |m| {
        let modified: Option<DateTime<Utc>> = m.modified().ok().map(DateTime::from);
        (m.len(), modified)
    });

    let class = crate::classifier::classify(&rel_str);

    SourceRecord {
        id: SourceRecord::stable_id(class, &rel_path),
        path: rel_path,
        class,
        sha256,
        size_bytes,
        modified,
        blocked: false,
        blocked_reason: None,
    }
}