nyx-scanner 0.6.0

A multi-language static analysis tool for detecting security vulnerabilities
Documentation
//! Filesystem walker with batched path delivery.
//!
//! Builds an [`ignore`]-crate [`WalkBuilder`] from the config (respecting
//! `.gitignore`, excluded directories, and excluded extensions), then delivers
//! discovered paths to the analysis pipeline in batches over a crossbeam channel.
//! Batching amortizes channel overhead for large trees.
//!
//! All paths are checked via [`crate::utils::path::path_stays_within_root`]
//! before entering a batch, preventing traversal outside the scan root.

use crate::utils::Config;
use crate::utils::path::path_stays_within_root;
use crossbeam_channel::{Receiver, Sender, bounded};
use ignore::{WalkBuilder, WalkState, overrides::OverrideBuilder};
use std::thread::JoinHandle;
use std::{
    mem,
    path::{Path, PathBuf},
    thread,
};

// ---------------------------------------------------------------------------
// Internal constants / helpers
// ---------------------------------------------------------------------------

type Paths = Vec<PathBuf>;

struct BatchSender {
    tx: Sender<Paths>,
    batch: Paths,
    batch_size: usize,
}
impl BatchSender {
    fn new(tx: Sender<Paths>, batch_size: usize) -> Self {
        Self {
            tx,
            batch: Vec::with_capacity(batch_size),
            batch_size,
        }
    }

    fn push_path(&mut self, path: PathBuf) {
        self.batch.push(path);
        if self.batch.len() >= self.batch_size {
            self.flush();
        }
    }

    fn flush(&mut self) {
        if !self.batch.is_empty() {
            tracing::debug!(n_paths = self.batch.len(), "flushing batch");
            let _ = self.tx.send(mem::take(&mut self.batch));
        }
    }
}
impl Drop for BatchSender {
    fn drop(&mut self) {
        self.flush();
    }
}

fn build_overrides(root: &Path, cfg: &Config) -> ignore::overrides::Override {
    let mut ob = OverrideBuilder::new(root);

    for ext in &cfg.scanner.excluded_extensions {
        if let Err(e) = ob.add(&format!("!*.{ext}")) {
            tracing::warn!("invalid exclude‐extension pattern ‘{ext}’: {e}");
        }
    }
    for dir in &cfg.scanner.excluded_directories {
        if let Err(e) = ob.add(&format!("!**/{dir}/**")) {
            tracing::warn!("invalid exclude‐dir pattern ‘{dir}’: {e}");
        }
    }
    for file in &cfg.scanner.excluded_files {
        if let Err(e) = ob.add(&format!("!{file}")) {
            tracing::warn!("invalid exclude‐file pattern ‘{file}’: {e}");
        }
    }

    ob.build().unwrap_or_else(|e| {
        tracing::error!("failed to build ignore overrides: {e}");
        ignore::overrides::Override::empty()
    })
}

// ---------------------------------------------------------------------------
/// Walk `root` and send *batches* of paths through the returned channel.
pub fn spawn_file_walker(root: &Path, cfg: &Config) -> (Receiver<Paths>, JoinHandle<()>) {
    let _span = tracing::info_span!("spawn_file_walker", root = %root.display()).entered();
    let overrides = build_overrides(root, cfg);

    // ----- 2  channel & thread pool parameters -----------------------------
    let workers = cfg.performance.worker_threads.unwrap_or(num_cpus::get());
    let (tx, rx) = bounded::<Paths>(workers * cfg.performance.channel_multiplier);

    let root = root.to_path_buf();
    let canonical_root = std::fs::canonicalize(&root).ok();
    let scan_hidden = cfg.scanner.scan_hidden_files;
    let follow = cfg.scanner.follow_symlinks;
    let max_bytes = cfg.scanner.max_file_size_mb.unwrap_or(0) * 1_048_576;
    let batch_size = cfg.performance.batch_size;
    let max_depth = cfg.performance.max_depth;
    let same_file_system = cfg.scanner.one_file_system;
    let require_git = cfg.scanner.require_git_to_read_vcsignore;

    // ----- 3  the background walker thread ---------------------------------
    let handle = thread::spawn(move || {
        tracing::info!(
            root = ?root,
            workers = workers,
            scan_hidden = scan_hidden,
            follow_links = follow,
            max_bytes = max_bytes,
            batch_size = batch_size,
            "starting directory walk"
        );

        let mut builder = WalkBuilder::new(root);
        builder
            .hidden(!scan_hidden)
            .follow_links(follow)
            .threads(workers)
            .overrides(overrides)
            .same_file_system(same_file_system)
            .require_git(require_git);
        if let Some(depth) = max_depth {
            builder.max_depth(Some(depth));
        }
        builder
            .filter_entry(|e| {
                e.file_type()
                    .map(|ft| ft.is_dir() || ft.is_file())
                    .unwrap_or(true)
            })
            .build_parallel()
            .run(move || {
                let mut bs = BatchSender::new(tx.clone(), batch_size);
                let canonical_root = canonical_root.clone();

                Box::new(move |entry| {
                    if let Ok(e) = entry {
                        let metadata = match e.metadata() {
                            Ok(metadata) => metadata,
                            Err(_) => return WalkState::Continue,
                        };
                        let is_file = metadata.file_type().is_file();
                        let under_limit = max_bytes == 0 || metadata.len() <= max_bytes;
                        // Always canonicalize and verify containment, a symlink
                        // in the tree can escape the root even when follow=false
                        // if the walker resolves it at metadata time.
                        let path_allowed = canonical_root.as_ref().is_none_or(|root| {
                            path_stays_within_root(root, e.path()).unwrap_or(false)
                        });

                        if is_file && under_limit && path_allowed {
                            bs.push_path(e.into_path());
                        }
                    }
                    WalkState::Continue
                })
            });
        tracing::info!("directory walk complete");
    });

    (rx, handle)
}

#[test]
fn walker_respects_excluded_extensions() {
    let tmp = tempfile::tempdir().unwrap();
    std::fs::write(tmp.path().join("keep.rs"), "fn main(){}").unwrap(); // nyx:ignore cfg-unguarded-sink
    std::fs::write(tmp.path().join("skip.txt"), "ignored").unwrap(); // nyx:ignore cfg-unguarded-sink

    let mut cfg = Config::default();
    cfg.scanner.excluded_extensions = vec!["txt".into()];
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 2;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    if let Err(err) = handle.join() {
        tracing::error!("walker thread panicked: {:#?}", err);
    }

    let all: Vec<_> = rx.into_iter().flatten().collect();

    assert!(all.iter().any(|p| p.ends_with("keep.rs")));
    assert!(all.iter().all(|p| !p.ends_with("skip.txt")));
}

#[test]
fn walker_respects_excluded_directories() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();

    // Files at root level
    std::fs::write(root.join("keep.rs"), "fn main(){}").unwrap(); // nyx:ignore cfg-unguarded-sink
    // Files in excluded subdir
    let vendor = root.join("vendor");
    std::fs::create_dir(&vendor).unwrap();
    std::fs::write(vendor.join("dep.rs"), "fn dep(){}").unwrap(); // nyx:ignore cfg-unguarded-sink

    let mut cfg = Config::default();
    cfg.scanner.excluded_directories = vec!["vendor".into()];
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(root, &cfg);
    handle.join().ok();
    let all: Vec<_> = rx.into_iter().flatten().collect();

    assert!(all.iter().any(|p| p.ends_with("keep.rs")));
    assert!(
        all.iter().all(|p| !p.starts_with(&vendor)),
        "vendor dir files should be excluded: {all:?}"
    );
}

#[test]
fn walker_respects_excluded_files() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();

    std::fs::write(root.join("keep.rs"), "fn a(){}").unwrap(); // nyx:ignore cfg-unguarded-sink
    std::fs::write(root.join("skip.rs"), "fn b(){}").unwrap(); // nyx:ignore cfg-unguarded-sink

    let mut cfg = Config::default();
    cfg.scanner.excluded_files = vec!["skip.rs".into()];
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(root, &cfg);
    handle.join().ok();
    let all: Vec<_> = rx.into_iter().flatten().collect();

    assert!(all.iter().any(|p| p.ends_with("keep.rs")));
    assert!(all.iter().all(|p| !p.ends_with("skip.rs")));
}

#[test]
fn walker_respects_max_file_size() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();

    // Write a small file (a few bytes) and a large file (> 1 MB limit)
    std::fs::write(root.join("small.rs"), "fn s(){}").unwrap(); // nyx:ignore cfg-unguarded-sink
    let big_data = vec![b'x'; 2 * 1_048_576]; // 2 MB
    std::fs::write(root.join("big.rs"), big_data).unwrap(); // nyx:ignore cfg-unguarded-sink

    let mut cfg = Config::default();
    cfg.scanner.max_file_size_mb = Some(1); // 1 MB limit
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(root, &cfg);
    handle.join().ok();
    let all: Vec<_> = rx.into_iter().flatten().collect();

    assert!(all.iter().any(|p| p.ends_with("small.rs")));
    assert!(
        all.iter().all(|p| !p.ends_with("big.rs")),
        "file exceeding size limit should be excluded: {all:?}"
    );
}

#[test]
fn walker_returns_empty_on_empty_directory() {
    let tmp = tempfile::tempdir().unwrap();

    let mut cfg = Config::default();
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    handle.join().ok();
    let all: Vec<_> = rx.into_iter().flatten().collect();

    assert!(all.is_empty(), "empty directory should yield no files");
}

#[cfg(unix)]
#[test]
fn walker_follow_symlinks_does_not_escape_root() {
    use std::os::unix::fs::symlink;

    let tmp = tempfile::tempdir().unwrap();
    let outside = tempfile::tempdir().unwrap();
    let outside_file = outside.path().join("secret.rs");
    std::fs::write(&outside_file, "fn leaked() {}").unwrap();

    let link = tmp.path().join("escape.rs");
    symlink(&outside_file, &link).unwrap();

    let mut cfg = Config::default();
    cfg.scanner.follow_symlinks = true;
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    handle.join().ok();
    let all: Vec<_> = rx.into_iter().flatten().collect();

    assert!(
        all.iter().all(|path| path != &link),
        "symlink escapes must not be scanned: {all:?}"
    );
}

#[cfg(unix)]
#[test]
fn walker_no_follow_symlinks_still_rejects_outside_paths() {
    // Pre-existing symlink to an out-of-root file must be excluded even when
    // follow_symlinks=false, the walker may surface the resolved path on
    // some platforms.
    use std::os::unix::fs::symlink;

    let tmp = tempfile::tempdir().unwrap();
    let outside = tempfile::tempdir().unwrap();
    let outside_file = outside.path().join("secret.rs");
    std::fs::write(&outside_file, "fn leaked() {}").unwrap();

    let link = tmp.path().join("escape.rs");
    symlink(&outside_file, &link).unwrap();

    let mut cfg = Config::default();
    cfg.scanner.follow_symlinks = false;
    cfg.performance.worker_threads = Some(1);
    cfg.performance.channel_multiplier = 1;
    cfg.performance.batch_size = 4;

    let (rx, handle) = spawn_file_walker(tmp.path(), &cfg);
    handle.join().ok();
    let all: Vec<_> = rx.into_iter().flatten().collect();

    assert!(
        all.iter()
            .all(|path| !path.starts_with(outside.path()) && path != &link),
        "symlink target outside root must not be scanned: {all:?}"
    );
}