pub mod filter;
pub mod parallel;
pub mod traverse;
pub use traverse::CodeWalker;
use std::collections::HashSet;
use std::io::Read;
use std::path::PathBuf;
/// Value [`WalkConfig::default`] assigns to `max_symlink_depth`.
const DEFAULT_MAX_SYMLINK_DEPTH: usize = 16;
/// Buffer size in bytes (64 KiB) for each read performed by `FileContentChunks`;
/// also used to bound the initial buffer capacity in `FileEntry::content`.
pub(crate) const READ_CHUNK_SIZE: usize = 64 * 1024;
/// Options controlling a directory walk.
///
/// The type is serde-serializable; because of the container-level
/// `#[serde(default)]`, any field missing from deserialized input takes the
/// value produced by [`WalkConfig::default`].
///
/// NOTE(review): the code that applies these options lives in the walker
/// modules, which are not part of this file. Field descriptions marked
/// "presumably" are inferred from the field names and should be confirmed.
#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
#[serde(default)]
// The four boolean fields are independent on/off switches, so the lint is silenced.
#[allow(clippy::struct_excessive_bools)]
pub struct WalkConfig {
/// File size limit in bytes (default: 10 MiB). Presumably larger files are
/// skipped by the walker — confirm.
pub max_file_size: u64,
/// Default `true`. Presumably omits files detected as binary — confirm.
pub skip_binary: bool,
/// Default `true`. Presumably omits hidden (dot-prefixed) entries — confirm.
pub skip_hidden: bool,
/// Default `true`. Presumably honors `.gitignore` rules — confirm.
pub respect_gitignore: bool,
/// Default `false`. Presumably controls whether symlinks are traversed — confirm.
pub follow_symlinks: bool,
/// Default: empty. Presumably an allow-list of file extensions — confirm
/// how an empty set is interpreted.
pub include_extensions: HashSet<String>,
/// Default: empty. Presumably a deny-list of file extensions — confirm.
pub exclude_extensions: HashSet<String>,
/// Directory names to leave out; the default set contains common dependency,
/// cache and build-output directories such as `node_modules`, `.git` and `target`.
pub exclude_dirs: HashSet<String>,
/// Default: `[".keyhogignore"]`. Presumably names of additional ignore
/// files consulted during the walk — confirm.
pub ignore_files: Vec<String>,
/// Default: empty. Presumably extra ignore patterns; the pattern syntax is
/// not visible here — confirm.
pub ignore_patterns: Vec<String>,
/// Default: 16. Presumably bounds symlink nesting while following links — confirm.
pub max_symlink_depth: usize,
}
impl Default for WalkConfig {
    /// Builds the baseline configuration.
    ///
    /// The defaults are: a 10 MiB `max_file_size`, `skip_binary`,
    /// `skip_hidden` and `respect_gitignore` enabled, `follow_symlinks`
    /// disabled, empty extension filters, a built-in set of excluded
    /// directory names, `.keyhogignore` as the only ignore file, no extra
    /// ignore patterns, and `DEFAULT_MAX_SYMLINK_DEPTH` as the symlink depth.
    fn default() -> Self {
        // Dependency, cache and build-output directory names excluded by default.
        const DEFAULT_EXCLUDED_DIRS: &[&str] = &[
            "node_modules",
            ".git",
            "target",
            "__pycache__",
            ".venv",
            "venv",
            ".tox",
            ".mypy_cache",
            ".pytest_cache",
            "dist",
            "build",
            ".next",
            ".nuxt",
            "vendor",
            ".bundle",
            ".gradle",
            ".mvn",
            "Pods",
        ];

        Self {
            max_file_size: 10 * 1024 * 1024,
            skip_binary: true,
            skip_hidden: true,
            respect_gitignore: true,
            follow_symlinks: false,
            include_extensions: HashSet::new(),
            exclude_extensions: HashSet::new(),
            exclude_dirs: DEFAULT_EXCLUDED_DIRS
                .iter()
                .map(|name| String::from(*name))
                .collect(),
            ignore_files: vec![String::from(".keyhogignore")],
            ignore_patterns: Vec::new(),
            max_symlink_depth: DEFAULT_MAX_SYMLINK_DEPTH,
        }
    }
}
impl WalkConfig {
    /// Creates a configuration holding the default values.
    #[must_use]
    pub fn new() -> Self {
        Default::default()
    }

    /// Starting point for the chained setters below; identical to [`WalkConfig::new`].
    #[must_use]
    pub fn builder() -> Self {
        Self::new()
    }

    /// Default configuration with `skip_hidden` and `respect_gitignore`
    /// turned off and an empty `exclude_dirs` set.
    #[must_use]
    pub fn artifact_defaults() -> Self {
        Self::default()
            .skip_hidden(false)
            .respect_gitignore(false)
            .exclude_dirs(HashSet::new())
    }

    /// Reads the file at `path` and deserializes it as TOML.
    ///
    /// # Errors
    ///
    /// Returns an error when the file cannot be read as UTF-8 text or when
    /// its contents do not deserialize into a `WalkConfig`.
    pub fn load(path: &std::path::Path) -> Result<Self, Box<dyn std::error::Error>> {
        let raw = std::fs::read_to_string(path)?;
        let parsed = Self::from_toml(&raw)?;
        Ok(parsed)
    }

    /// Deserializes a configuration from TOML text.
    ///
    /// # Errors
    ///
    /// Returns the `toml` deserialization error when `toml_str` is not a
    /// valid representation of a `WalkConfig`.
    pub fn from_toml(toml_str: &str) -> Result<Self, toml::de::Error> {
        toml::from_str::<Self>(toml_str)
    }

    /// Returns the configuration with `max_file_size` replaced.
    #[must_use]
    pub fn max_file_size(self, max_file_size: u64) -> Self {
        Self {
            max_file_size,
            ..self
        }
    }

    /// Returns the configuration with `skip_binary` replaced.
    #[must_use]
    pub fn skip_binary(self, skip_binary: bool) -> Self {
        Self {
            skip_binary,
            ..self
        }
    }

    /// Returns the configuration with `skip_hidden` replaced.
    #[must_use]
    pub fn skip_hidden(self, skip_hidden: bool) -> Self {
        Self {
            skip_hidden,
            ..self
        }
    }

    /// Returns the configuration with `respect_gitignore` replaced.
    #[must_use]
    pub fn respect_gitignore(self, respect_gitignore: bool) -> Self {
        Self {
            respect_gitignore,
            ..self
        }
    }

    /// Returns the configuration with `follow_symlinks` replaced.
    #[must_use]
    pub fn follow_symlinks(self, follow_symlinks: bool) -> Self {
        Self {
            follow_symlinks,
            ..self
        }
    }

    /// Returns the configuration with `include_extensions` replaced.
    #[must_use]
    pub fn include_extensions(self, include_extensions: HashSet<String>) -> Self {
        Self {
            include_extensions,
            ..self
        }
    }

    /// Returns the configuration with `exclude_extensions` replaced.
    #[must_use]
    pub fn exclude_extensions(self, exclude_extensions: HashSet<String>) -> Self {
        Self {
            exclude_extensions,
            ..self
        }
    }

    /// Returns the configuration with `exclude_dirs` replaced.
    #[must_use]
    pub fn exclude_dirs(self, exclude_dirs: HashSet<String>) -> Self {
        Self {
            exclude_dirs,
            ..self
        }
    }

    /// Returns the configuration with `ignore_files` replaced.
    #[must_use]
    pub fn ignore_files(self, ignore_files: Vec<String>) -> Self {
        Self {
            ignore_files,
            ..self
        }
    }

    /// Returns the configuration with `ignore_patterns` replaced.
    #[must_use]
    pub fn ignore_patterns(self, ignore_patterns: Vec<String>) -> Self {
        Self {
            ignore_patterns,
            ..self
        }
    }

    /// Returns the configuration with `max_symlink_depth` replaced.
    #[must_use]
    pub fn max_symlink_depth(self, max_symlink_depth: usize) -> Self {
        Self {
            max_symlink_depth,
            ..self
        }
    }
}
/// A file on disk plus the metadata that [`FileEntry::content`] relies on.
///
/// NOTE(review): entries are presumably produced by the walker; how `size`
/// and `is_binary` are determined is not shown in this file.
#[derive(Clone, Debug)]
pub struct FileEntry {
/// Location of the file; opened by [`FileEntry::content_chunks`].
pub path: PathBuf,
/// Recorded size in bytes. [`FileEntry::content`] rejects entries above
/// 256 MiB and keeps at most this many bytes of the file.
pub size: u64,
/// When `true`, [`FileEntry::content`] returns the bytes as
/// [`FileContent::Binary`] without attempting UTF-8 decoding.
pub is_binary: bool,
}
impl FileEntry {
pub fn content(&self) -> crate::error::Result<FileContent> {
const MAX_CONTENT_AUTOLOAD: u64 = 256 * 1024 * 1024;
if self.size > MAX_CONTENT_AUTOLOAD {
return Err(crate::error::CodewalkError::FileTooLarge(self.size));
}
let read_limit = self.size;
let bounded_capacity = usize::try_from(read_limit).unwrap_or(READ_CHUNK_SIZE);
let mut bytes = Vec::with_capacity(bounded_capacity.min(READ_CHUNK_SIZE * 4));
for chunk in self.content_chunks()? {
let chunk = chunk?;
let remaining = usize::try_from(read_limit.saturating_sub(bytes.len() as u64))
.unwrap_or(0);
if remaining == 0 {
break;
}
let take = chunk.len().min(remaining);
bytes.extend_from_slice(&chunk[..take]);
}
if self.is_binary {
return Ok(FileContent::Binary(bytes));
}
match String::from_utf8(bytes) {
Ok(text) => Ok(FileContent::Text(text)),
Err(err) => Ok(FileContent::Unknown(err.into_bytes())),
}
}
pub fn content_chunks(&self) -> crate::error::Result<FileContentChunks> {
Ok(FileContentChunks {
file: std::fs::File::open(&self.path)?,
done: false,
})
}
pub fn content_str(&self) -> crate::error::Result<String> {
match self.content()? {
FileContent::Text(text) => Ok(text),
FileContent::Binary(bytes) | FileContent::Unknown(bytes) => {
Ok(String::from_utf8(bytes)?)
}
}
}
}
/// In-memory contents of a file, tagged by how the bytes were classified.
#[derive(Debug)]
pub enum FileContent {
    /// Contents that are valid UTF-8.
    Text(String),
    /// Raw bytes of a file treated as binary.
    Binary(Vec<u8>),
    /// Raw bytes that were expected to be text but are not valid UTF-8.
    Unknown(Vec<u8>),
}

impl std::fmt::Display for FileContent {
    /// Writes the lowercase variant name (`text`, `binary` or `unknown`),
    /// not the contents.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Text(_) => "text",
            Self::Binary(_) => "binary",
            Self::Unknown(_) => "unknown",
        };
        f.write_str(label)
    }
}

impl FileContent {
    /// Borrows the contents as raw bytes, whatever the variant.
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        match self {
            Self::Binary(raw) | Self::Unknown(raw) => raw.as_slice(),
            Self::Text(text) => text.as_bytes(),
        }
    }

    /// Borrows the contents as `&str`; `None` unless this is the `Text` variant.
    #[must_use]
    pub fn as_text(&self) -> Option<&str> {
        if let Self::Text(text) = self {
            Some(text.as_str())
        } else {
            None
        }
    }

    /// `true` for the `Text` variant.
    #[must_use]
    pub fn is_text(&self) -> bool {
        self.as_text().is_some()
    }

    /// `true` for the `Binary` variant.
    #[must_use]
    pub fn is_binary(&self) -> bool {
        matches!(self, Self::Binary(_))
    }

    /// `true` for the `Unknown` variant.
    #[must_use]
    pub fn is_unknown(&self) -> bool {
        matches!(self, Self::Unknown(_))
    }

    /// Number of bytes held.
    #[must_use]
    pub fn len(&self) -> usize {
        match self {
            Self::Text(text) => text.len(),
            Self::Binary(raw) | Self::Unknown(raw) => raw.len(),
        }
    }

    /// `true` when no bytes are held.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.as_bytes().is_empty()
    }
}

impl AsRef<[u8]> for FileContent {
    /// Same view as [`FileContent::as_bytes`].
    fn as_ref(&self) -> &[u8] {
        Self::as_bytes(self)
    }
}
/// Iterator over the bytes of an open file, yielding them in chunks of up to
/// `READ_CHUNK_SIZE` bytes.
///
/// Created by [`FileEntry::content_chunks`].
#[derive(Debug)]
pub struct FileContentChunks {
/// Open handle the chunks are read from.
file: std::fs::File,
/// Set once end-of-file or a read error has been seen; afterwards `next`
/// returns `None` without touching the file.
done: bool,
}
impl Iterator for FileContentChunks {
    type Item = crate::error::Result<Vec<u8>>;

    /// Reads the next chunk of up to `READ_CHUNK_SIZE` bytes.
    ///
    /// Yields `Some(Ok(bytes))` for every non-empty read, `None` at
    /// end-of-file, and `Some(Err(_))` once if a read fails. After
    /// end-of-file or an error every further call returns `None`.
    ///
    /// Reads that fail with `ErrorKind::Interrupted` are retried rather than
    /// reported, since that error kind is documented as non-fatal.
    fn next(&mut self) -> Option<Self::Item> {
        if self.done {
            return None;
        }

        let mut chunk = vec![0_u8; READ_CHUNK_SIZE];
        loop {
            match self.file.read(&mut chunk) {
                Ok(0) => {
                    self.done = true;
                    return None;
                }
                Ok(read) => {
                    // Keep only the bytes the read actually filled in.
                    chunk.truncate(read);
                    return Some(Ok(chunk));
                }
                // A signal arrived mid-read; nothing was consumed, so try again.
                Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
                Err(err) => {
                    self.done = true;
                    return Some(Err(err.into()));
                }
            }
        }
    }
}
#[cfg(test)]
pub(crate) mod test_utils {
    #![allow(clippy::unwrap_used)]
    use std::fs;

    /// Writes `body` to `relative` underneath `root`, panicking on failure.
    fn write_file(root: &std::path::Path, relative: &str, body: impl AsRef<[u8]>) {
        fs::write(root.join(relative), body).unwrap();
    }

    /// Creates the directory `relative` underneath `root`, panicking on failure.
    fn make_dir(root: &std::path::Path, relative: &str) {
        fs::create_dir(root.join(relative)).unwrap();
    }

    /// Creates a temporary directory populated with a small fixture tree:
    /// two Rust files and an ELF-like binary at the top level, a
    /// `node_modules` directory containing one file, and a `src` directory
    /// containing one Python file.
    pub(crate) fn setup_test_dir() -> tempfile::TempDir {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        write_file(root, "main.rs", "fn main() {}");
        write_file(root, "lib.rs", "pub fn hello() {}");
        write_file(root, "data.bin", b"\x7fELF\x00\x00\x00\x00");

        make_dir(root, "node_modules");
        write_file(root, "node_modules/junk.js", "// junk");

        make_dir(root, "src");
        write_file(root, "src/app.py", "print('hello')");

        dir
    }
}
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)]
    use super::*;
    use std::fs;

    /// A walked text file exposes exactly the bytes that were written to it.
    #[test]
    fn file_content_read() {
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("test.txt");
        fs::write(file_path, "hello world").unwrap();

        let config = WalkConfig::default().skip_binary(false);
        let walker = CodeWalker::new(dir.path(), config);
        let entries = walker.walk().unwrap();
        assert_eq!(entries.len(), 1);

        let content = entries[0].content().unwrap();
        assert_eq!(content.as_bytes(), b"hello world");
        assert_eq!(content.len(), 11);
        assert!(!content.is_empty());
    }

    /// `content_str` returns the file's text unchanged.
    #[test]
    fn file_content_str() {
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("test.rs");
        fs::write(file_path, "fn main() {}").unwrap();

        let walker = CodeWalker::new(dir.path(), WalkConfig::default());
        let entries = walker.walk().unwrap();
        let text = entries[0].content_str().unwrap();
        assert_eq!(text, "fn main() {}");
    }

    /// The default exclusion set contains the well-known dependency and
    /// build directories.
    #[test]
    fn default_config_excludes_common_dirs() {
        let excluded = WalkConfig::default().exclude_dirs;
        let expected = ["node_modules", ".git", "target", "__pycache__", "vendor"];
        for name in expected.iter() {
            assert!(excluded.contains(*name));
        }
    }
}