use codewalk::{CodeWalker, WalkConfig};
use keyhog_core::merkle_index::MerkleIndex;
use keyhog_core::{Chunk, ChunkMetadata, Source, SourceError};
use std::collections::HashSet;
use std::io::{Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
mod read;
/// File size in bytes at or above which a (non-windowed) file is read through
/// `read::read_file_mmap`; smaller files go through `read::read_file_buffered`.
/// On Unix targets the cut-over is 64 KiB.
#[cfg(unix)]
const MMAP_THRESHOLD: u64 = 64 * 1024;
/// Same cut-over as above for non-Unix targets, where it is 1 MiB.
#[cfg(not(unix))]
const MMAP_THRESHOLD: u64 = 1024 * 1024;
/// Default window size (64 MiB) installed by `FilesystemSource::new`. Files larger
/// than the configured window size are scanned as a sequence of windows instead of
/// being read in one piece.
const DEFAULT_WINDOW_SIZE: usize = 64 * 1024 * 1024;
/// Default number of bytes (4 KiB) shared between two consecutive windows, installed
/// by `FilesystemSource::new`.
const DEFAULT_WINDOW_OVERLAP: usize = 4 * 1024;
/// A scan source that walks a directory tree and yields file contents as chunks.
///
/// Built with [`FilesystemSource::new`] and customised through the `with_*` builder
/// methods; the actual walk happens in the `Source::chunks` implementation.
pub struct FilesystemSource {
// Directory the walk starts from; `new` stores the canonicalized path when
// canonicalization succeeds and the path as given otherwise.
root: PathBuf,
// Size cap in bytes: handed to the walker configuration, used as the per-entry cap
// for archive members, and (times four) as the total extraction budget.
max_file_size: u64,
// Extra ignore patterns; `walker_config` prefixes each with `!` unless it already
// starts with one.
ignore_paths: Vec<String>,
// When non-empty, only walked files whose canonical path equals one of these
// (canonicalized) paths are kept.
include_paths: Vec<PathBuf>,
// When false, the walker is told not to respect gitignore rules.
respect_gitignore: bool,
// Optional index consulted per file; a file whose mtime and size are reported as
// unchanged is skipped without being read.
merkle: Option<Arc<MerkleIndex>>,
// Shared counter incremented once for every file skipped via `merkle`.
skipped: Arc<AtomicUsize>,
// Files larger than this many bytes are split into windows.
window_size: usize,
// Overlap in bytes between consecutive windows.
window_overlap: usize,
}
impl FilesystemSource {
    /// Creates a source rooted at `root` with the default limits.
    ///
    /// The root is canonicalized when possible; if canonicalization fails the path
    /// is kept exactly as given. Defaults: 100 MiB file cap, no ignore or include
    /// paths, gitignore respected, no merkle index, default window geometry.
    pub fn new(root: PathBuf) -> Self {
        let root = match root.canonicalize() {
            Ok(resolved) => resolved,
            Err(_) => root,
        };
        Self {
            root,
            max_file_size: 100 * 1024 * 1024,
            ignore_paths: Vec::new(),
            include_paths: Vec::new(),
            respect_gitignore: true,
            merkle: None,
            skipped: Arc::new(AtomicUsize::new(0)),
            window_size: DEFAULT_WINDOW_SIZE,
            window_overlap: DEFAULT_WINDOW_OVERLAP,
        }
    }

    /// Overrides the window size and overlap used for files larger than one window.
    ///
    /// # Panics
    ///
    /// Panics when `window_size` is not strictly greater than `overlap`.
    pub fn with_window_config(self, window_size: usize, overlap: usize) -> Self {
        assert!(window_size > overlap, "window must exceed overlap");
        Self {
            window_size,
            window_overlap: overlap,
            ..self
        }
    }

    /// Installs a merkle index; files it reports as unchanged are skipped.
    pub fn with_merkle_skip(self, merkle: Arc<MerkleIndex>) -> Self {
        Self {
            merkle: Some(merkle),
            ..self
        }
    }

    /// Returns a handle to the shared counter of files skipped via the merkle index.
    pub fn skipped_counter(&self) -> Arc<AtomicUsize> {
        Arc::clone(&self.skipped)
    }

    /// Restricts the scan to the given paths (an empty list means no restriction).
    pub fn with_include_paths(self, paths: Vec<PathBuf>) -> Self {
        Self {
            include_paths: paths,
            ..self
        }
    }

    /// Sets the maximum file size, in bytes.
    pub fn with_max_file_size(self, bytes: u64) -> Self {
        Self {
            max_file_size: bytes,
            ..self
        }
    }

    /// Sets additional ignore patterns applied during the walk.
    pub fn with_ignore_paths(self, paths: Vec<String>) -> Self {
        Self {
            ignore_paths: paths,
            ..self
        }
    }

    /// Chooses whether gitignore rules are honoured during the walk.
    pub fn with_respect_gitignore(self, respect: bool) -> Self {
        Self {
            respect_gitignore: respect,
            ..self
        }
    }
}
/// Lower-case file extensions that are never scanned.
///
/// The list is used twice: `walker_config` passes it to the walker as excluded
/// extensions, and `chunks` re-checks each walked file's lower-cased extension
/// against it before doing any other work.
///
/// NOTE(review): `"zip"` is listed here, yet `chunks` also has an archive branch
/// for `ext == "zip"` that sits after the skip check, so that branch looks
/// unreachable for `.zip` files. Confirm which behaviour is intended.
const SKIP_EXTENSIONS: &[&str] = &[
// Images and icons.
"png",
"jpg",
"jpeg",
"gif",
"bmp",
"ico",
"cur",
"icns",
"webp",
"svg",
// Audio and video.
"mp3",
"mp4",
"avi",
"mov",
"mkv",
"flac",
"wav",
"ogg",
"webm",
// Archive and compression containers.
"tar",
"tgz",
"bz2",
"xz",
"rar",
"7z",
"zip",
// Executables, libraries, object files and compiled bytecode.
"exe",
"dll",
"so",
"dylib",
"o",
"a",
"lib",
"obj",
"class",
"wasm",
"pyc",
"pyo",
"elc",
"beam",
// Office and PDF documents.
"pdf",
"doc",
"docx",
"xls",
"xlsx",
"ppt",
"pptx",
// Fonts.
"ttf",
"otf",
"woff",
"woff2",
"eot",
// Database files.
"db",
"sqlite",
"sqlite3",
// Disk and firmware images, opaque binaries.
"iso",
"img",
"bin",
"rom",
// Serialized data and machine-learning model formats.
"pickle",
"npy",
"npz",
"onnx",
"pb",
"tflite",
"pt",
"safetensors",
];
/// Directory names handed to the walker as excluded directories by `walker_config`.
const SKIP_DIRS: &[&str] = &[
".git",
"node_modules",
"target",
"__pycache__",
".venv",
"venv",
".tox",
"dist",
"build",
".next",
".nuxt",
"vendor",
"swagger-ui",
"swagger",
];
impl Source for FilesystemSource {
    /// Returns the fixed identifier of this source: `"filesystem"`.
    fn name(&self) -> &str {
        "filesystem"
    }

    /// Walks the root directory and turns every scannable file into chunks.
    ///
    /// The directory walk is performed eagerly when this method is called (walk
    /// errors are logged and skipped); the per-file work runs lazily as the returned
    /// iterator is consumed. For each file, in order:
    ///
    /// 1. Files whose lower-cased extension is in `SKIP_EXTENSIONS` produce nothing.
    /// 2. If a merkle index is installed, the mtime is readable, and the index
    ///    reports the file's mtime and size as unchanged, the skip counter is
    ///    incremented and the file produces nothing.
    /// 3. `zip`/`apk`/`ipa`/`crx`/`jar` files are opened as archives; each member
    ///    becomes one chunk, subject to a per-member cap of `max_file_size` and a
    ///    total budget of four times that.
    /// 4. `gz`/`zst`/`lz4`/`sz` files are handed to `extract_compressed_chunks`.
    /// 5. Default-excluded names and minified/bundled assets produce nothing.
    /// 6. Files larger than the window size are split into overlapping windows
    ///    (mmap reader first, buffered reader as fallback).
    /// 7. Everything else is read whole; when no text is obtained, printable
    ///    strings are extracted from the raw bytes instead.
    ///
    /// Every item produced by the code visible here is `Ok`.
    ///
    /// NOTE(review): `"zip"` is also listed in `SKIP_EXTENSIONS`, so step 1 returns
    /// before step 3 can run for `.zip` files. Confirm which behaviour is intended.
    fn chunks(&self) -> Box<dyn Iterator<Item = Result<Chunk, SourceError>> + '_> {
        let max_size = self.max_file_size;
        let mut config = walker_config(self.max_file_size, &self.ignore_paths);
        if !self.respect_gitignore {
            config = config.respect_gitignore(false);
        }
        let walker = CodeWalker::new(&self.root, config);
        let mut entries: Vec<codewalk::FileEntry> = walker
            .walk_iter()
            .filter_map(|result| match result {
                Ok(entry) => Some(entry),
                Err(error) => {
                    tracing::warn!(
                        %error,
                        "skipping unreadable filesystem entry; scan continues"
                    );
                    None
                }
            })
            .collect();

        // An include list narrows the walk to exactly those files. Both sides are
        // canonicalized (best effort) so different spellings of a path still match.
        if !self.include_paths.is_empty() {
            let allowed: HashSet<PathBuf> = self
                .include_paths
                .iter()
                .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()))
                .collect();
            entries.retain(|e| {
                let canonical = e.path.canonicalize().unwrap_or_else(|_| e.path.clone());
                allowed.contains(&canonical)
            });
        }

        let merkle = self.merkle.clone();
        let skipped = self.skipped.clone();
        let window_size = self.window_size;
        let window_overlap = self.window_overlap;

        Box::new(entries.into_iter().flat_map(move |entry| {
            let path = entry.path;
            let file_size = entry.size;
            let ext = path
                .extension()
                .and_then(|e| e.to_str())
                .unwrap_or("")
                .to_lowercase();
            if SKIP_EXTENSIONS.contains(&ext.as_str()) {
                return vec![];
            }

            // Skip files the merkle index reports as unchanged, without reading them.
            let live_mtime_ns = file_mtime_ns(&path);
            if let (Some(idx), Some(mtime_ns)) = (merkle.as_ref(), live_mtime_ns) {
                if idx.metadata_unchanged(&path, mtime_ns, file_size) {
                    skipped.fetch_add(1, Ordering::Relaxed);
                    return vec![];
                }
            }

            if matches!(ext.as_str(), "zip" | "apk" | "ipa" | "crx" | "jar") {
                let mut archive_chunks = Vec::new();
                let mut total_uncompressed: u64 = 0;
                let total_budget: u64 = max_size.saturating_mul(4);
                if let Ok(pack) = openpack::OpenPack::open_default(&path) {
                    if let Ok(entries) = pack.entries() {
                        for archive_entry in entries {
                            if archive_entry.is_dir || is_default_excluded(&archive_entry.name) {
                                continue;
                            }
                            if archive_entry.uncompressed_size > max_size {
                                tracing::warn!(
                                    archive = %path.display(),
                                    entry = %archive_entry.name,
                                    size = archive_entry.uncompressed_size,
                                    "skipping archive entry: uncompressed size exceeds per-file cap"
                                );
                                continue;
                            }
                            total_uncompressed = total_uncompressed
                                .saturating_add(archive_entry.uncompressed_size);
                            if total_uncompressed > total_budget {
                                tracing::warn!(
                                    archive = %path.display(),
                                    "aborting archive extraction: total uncompressed size exceeds 4x file cap (zip-bomb guard)"
                                );
                                break;
                            }
                            if let Ok(content) = pack.read_entry(&archive_entry.name) {
                                let entry_path =
                                    format!("{}//{}", path.display(), archive_entry.name);
                                // Consume the buffer instead of cloning it; on invalid
                                // UTF-8 the bytes are recovered from the error.
                                match String::from_utf8(content) {
                                    Ok(text) => archive_chunks.push(Ok(Chunk {
                                        data: text.into(),
                                        metadata: ChunkMetadata {
                                            source_type: "filesystem/archive".into(),
                                            path: Some(entry_path),
                                            ..Default::default()
                                        },
                                    })),
                                    Err(not_utf8) => {
                                        let raw = not_utf8.into_bytes();
                                        let strings =
                                            crate::strings::extract_printable_strings(&raw, 8);
                                        if !strings.is_empty() {
                                            archive_chunks.push(Ok(Chunk {
                                                data: keyhog_core::SensitiveString::join(
                                                    &strings, "\n",
                                                ),
                                                metadata: ChunkMetadata {
                                                    source_type: "filesystem/archive-binary".into(),
                                                    path: Some(entry_path),
                                                    ..Default::default()
                                                },
                                            }));
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                return archive_chunks;
            }
            if matches!(ext.as_str(), "gz" | "zst" | "lz4" | "sz") {
                return extract_compressed_chunks(&path, max_size);
            }

            let filename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            if is_default_excluded(filename) {
                return vec![];
            }
            if filename.contains(".min.")
                || filename.contains(".bundle.")
                || filename.ends_with(".chunk.js")
            {
                return vec![];
            }

            if file_size > window_size as u64 {
                if let Some(windows) =
                    read::read_file_windowed_mmap(&path, window_size, window_overlap)
                {
                    return windows
                        .into_iter()
                        .map(|w| {
                            Ok(Chunk {
                                data: w.text.into(),
                                metadata: ChunkMetadata {
                                    source_type: "filesystem/windowed".to_string(),
                                    path: Some(path.display().to_string()),
                                    base_offset: w.offset,
                                    mtime_ns: live_mtime_ns,
                                    size_bytes: Some(file_size),
                                    ..Default::default()
                                },
                            })
                        })
                        .collect();
                }
                // The mmap reader declined; fall back to plain buffered reads.
                return read_windows_buffered(
                    &path,
                    window_size,
                    window_overlap,
                    live_mtime_ns,
                    file_size,
                );
            }

            let file_text = if file_size >= MMAP_THRESHOLD {
                read::read_file_mmap(&path)
            } else {
                read::read_file_buffered(&path)
            };
            let (content, source_type) = match file_text {
                Some(text) if !text.is_empty() => (text.into(), "filesystem"),
                _ => {
                    // No text came back: salvage printable runs from the raw bytes.
                    if let Ok(bytes) = read::read_file_safe(&path) {
                        let strings = crate::strings::extract_printable_strings(&bytes, 8);
                        if strings.is_empty() {
                            return vec![];
                        }
                        (
                            keyhog_core::SensitiveString::join(&strings, "\n"),
                            "filesystem:binary-strings",
                        )
                    } else {
                        return vec![];
                    }
                }
            };
            vec![Ok(Chunk {
                data: content,
                metadata: ChunkMetadata {
                    source_type: source_type.to_string(),
                    path: Some(path.display().to_string()),
                    mtime_ns: live_mtime_ns,
                    size_bytes: Some(file_size),
                    ..Default::default()
                },
            })]
        }))
    }

    /// Exposes the concrete type for downcasting.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

/// Reads into `buf` until it is full or the reader reports end-of-file.
///
/// A single `read` call may legally return fewer bytes than requested, so the call
/// is repeated; `Interrupted` errors are retried. Returns the number of bytes
/// placed in `buf`, which is smaller than `buf.len()` only at end-of-file.
fn fill_window<R: Read>(reader: &mut R, buf: &mut [u8]) -> std::io::Result<usize> {
    let mut filled = 0;
    while filled < buf.len() {
        match reader.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(n) => filled += n,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    }
    Ok(filled)
}

/// Buffered fallback that splits a large file into overlapping windows.
///
/// Each window holds up to `window_size` bytes, decoded lossily as UTF-8, and
/// consecutive windows share `window_overlap` bytes. A file that cannot be opened
/// yields no chunks; a read error mid-file is logged and ends the scan of that file,
/// keeping the windows produced so far.
fn read_windows_buffered(
    path: &Path,
    window_size: usize,
    window_overlap: usize,
    mtime_ns: Option<u64>,
    file_size: u64,
) -> Vec<Result<Chunk, SourceError>> {
    let mut window_chunks = Vec::new();
    let mut file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return window_chunks,
    };
    // Keep the overlap strictly below the window size so every iteration advances.
    let overlap = window_overlap.min(window_size.saturating_sub(1));
    let mut buffer = vec![0u8; window_size];
    let mut current_offset: usize = 0;
    loop {
        let filled = match fill_window(&mut file, &mut buffer) {
            Ok(n) => n,
            Err(error) => {
                tracing::warn!(
                    path = %path.display(),
                    %error,
                    "read failed during windowed scan; remaining bytes are not scanned"
                );
                break;
            }
        };
        if filled == 0 {
            break;
        }
        window_chunks.push(Ok(Chunk {
            data: String::from_utf8_lossy(&buffer[..filled]).into_owned().into(),
            metadata: ChunkMetadata {
                source_type: "filesystem/windowed".to_string(),
                path: Some(path.display().to_string()),
                base_offset: current_offset,
                mtime_ns,
                size_bytes: Some(file_size),
                ..Default::default()
            },
        }));
        // `fill_window` only comes up short at end-of-file.
        if filled < window_size {
            break;
        }
        // Step back so the next window overlaps this one. If the rewind fails the
        // next window simply starts where this one ended, and the offset says so.
        let rewound = i64::try_from(overlap)
            .ok()
            .and_then(|back| file.seek(SeekFrom::Current(-back)).ok())
            .is_some();
        current_offset += if rewound { filled - overlap } else { filled };
    }
    window_chunks
}
fn extract_compressed_chunks(path: &Path, max_size: u64) -> Vec<Result<Chunk, SourceError>> {
let ext = path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_lowercase();
let format = match ext.as_str() {
"gz" => ziftsieve::CompressionFormat::Gzip,
"zst" => ziftsieve::CompressionFormat::Zstd,
"lz4" => ziftsieve::CompressionFormat::Lz4,
_ => ziftsieve::CompressionFormat::Snappy,
};
let file_bytes = match read::read_file_for_compressed_input(path, max_size) {
Some(b) => b,
None => return Vec::new(),
};
let bytes = file_bytes.as_slice();
let total_budget: usize = max_size.saturating_mul(4) as usize;
let mut chunks = Vec::new();
if let Ok(blocks) = ziftsieve::extract_from_bytes(format, bytes) {
let mut current_chunk_literals = String::new();
let mut total_decompressed: usize = 0;
for block in blocks {
if let Ok(s) = std::str::from_utf8(block.literals()) {
total_decompressed = total_decompressed.saturating_add(s.len());
if total_decompressed > total_budget {
tracing::warn!(
path = %path.display(),
bytes = total_decompressed,
cap = total_budget,
"aborting compressed extraction: total decompressed size exceeds 4x file cap (gzip-bomb guard)"
);
break;
}
current_chunk_literals.push_str(s);
current_chunk_literals.push('\n');
}
if current_chunk_literals.len() > 8 * 1024 * 1024 {
chunks.push(Ok(Chunk {
data: std::mem::take(&mut current_chunk_literals).into(),
metadata: ChunkMetadata {
source_type: "filesystem/compressed".into(),
path: Some(path.display().to_string()),
..Default::default()
},
}));
}
}
if !current_chunk_literals.is_empty() {
chunks.push(Ok(Chunk {
data: current_chunk_literals.into(),
metadata: ChunkMetadata {
source_type: "filesystem/compressed".into(),
path: Some(path.display().to_string()),
..Default::default()
},
}));
}
}
chunks
}
/// Decides whether `path` is excluded from scanning by the built-in rules.
///
/// All comparisons ignore ASCII case. A path is excluded when any of these hold:
///
/// * the whole path ends with one of the noise suffixes (`.min.js`, `.bak`, ...);
/// * any `/`- or `\`-separated segment, including the last one, is a skipped
///   name such as `node_modules` or `dist`;
/// * the last non-empty segment is a known lock or generated file name;
/// * the last non-empty segment looks like `tsconfig*.json`.
fn is_default_excluded(path: &str) -> bool {
    const NOISE_SUFFIXES: [&str; 7] = [
        ".min.js", ".min.css", ".bak", ".swp", ".tmp", ".map", ".cache",
    ];
    const NOISE_SEGMENTS: [&str; 7] = [
        "node_modules",
        ".git",
        "__pycache__",
        "vendor",
        "dist",
        "build",
        "out",
    ];
    const NOISE_FILES: [&str; 8] = [
        "package-lock.json",
        "yarn.lock",
        "pnpm-lock.yaml",
        "cache.json",
        "cargo.lock",
        "go.sum",
        "gemfile.lock",
        "angular.json",
    ];
    const TSCONFIG_PREFIX: &str = "tsconfig";
    const JSON_SUFFIX: &str = ".json";

    fn is_separator(c: char) -> bool {
        c == '/' || c == '\\'
    }

    // Byte-level comparisons so non-ASCII paths can never hit a char-boundary panic.
    fn has_suffix_ci(haystack: &[u8], suffix: &str) -> bool {
        haystack
            .len()
            .checked_sub(suffix.len())
            .map_or(false, |start| {
                haystack[start..].eq_ignore_ascii_case(suffix.as_bytes())
            })
    }

    fn has_prefix_ci(haystack: &[u8], prefix: &str) -> bool {
        haystack
            .get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    }

    let raw = path.as_bytes();
    if NOISE_SUFFIXES.iter().any(|&sfx| has_suffix_ci(raw, sfx)) {
        return true;
    }

    let hits_noise_segment = path.split(is_separator).any(|part| {
        NOISE_SEGMENTS
            .iter()
            .any(|&noise| part.eq_ignore_ascii_case(noise))
    });
    if hits_noise_segment {
        return true;
    }

    // The file name is the last non-empty segment; a path with no such segment is
    // compared as a whole.
    let leaf = path
        .rsplit(is_separator)
        .find(|part| !part.is_empty())
        .unwrap_or(path);
    if NOISE_FILES
        .iter()
        .any(|&name| leaf.eq_ignore_ascii_case(name))
    {
        return true;
    }

    let leaf_bytes = leaf.as_bytes();
    leaf_bytes.len() >= TSCONFIG_PREFIX.len() + JSON_SUFFIX.len()
        && has_prefix_ci(leaf_bytes, TSCONFIG_PREFIX)
        && has_suffix_ci(leaf_bytes, JSON_SUFFIX)
}
/// Returns the modification time of `path` as nanoseconds since the Unix epoch.
///
/// Yields `None` when the metadata cannot be read, the platform reports no
/// modification time, or the timestamp lies before the epoch. A value too large
/// for `u64` saturates to `u64::MAX`.
fn file_mtime_ns(path: &Path) -> Option<u64> {
    let since_epoch = std::fs::metadata(path)
        .and_then(|meta| meta.modified())
        .ok()?
        .duration_since(std::time::UNIX_EPOCH)
        .ok()?;
    // `as_nanos` is whole seconds times 1e9 plus the sub-second nanoseconds, as u128.
    Some(u64::try_from(since_epoch.as_nanos()).unwrap_or(u64::MAX))
}
/// Builds the walker configuration shared by every filesystem scan.
///
/// The configuration excludes the extensions in `SKIP_EXTENSIONS` and the
/// directories in `SKIP_DIRS`, caps file size at `max_file_size`, does not follow
/// symlinks, respects gitignore, keeps hidden and binary files, and reads
/// `.keyhogignore` files. Each entry of `ignore_paths` becomes an ignore pattern
/// that starts with `!`; entries that already start with `!` are passed unchanged.
fn walker_config(max_file_size: u64, ignore_paths: &[String]) -> WalkConfig {
    let exclude_extensions: HashSet<String> = SKIP_EXTENSIONS
        .iter()
        .map(|ext| String::from(*ext))
        .collect();
    let exclude_dirs: HashSet<String> = SKIP_DIRS.iter().map(|dir| String::from(*dir)).collect();
    let ignore_overrides = ignore_paths
        .iter()
        .map(|pattern| match pattern.strip_prefix('!') {
            Some(_) => pattern.clone(),
            None => format!("!{pattern}"),
        })
        .collect();

    WalkConfig::default()
        .max_file_size(max_file_size)
        .follow_symlinks(false)
        .respect_gitignore(true)
        .skip_hidden(false)
        .skip_binary(false)
        .exclude_extensions(exclude_extensions)
        .exclude_dirs(exclude_dirs)
        .ignore_files(vec![".keyhogignore".to_string()])
        .ignore_patterns(ignore_overrides)
}