scanit/lib.rs
1#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)]
2#![allow(clippy::single_char_lifetime_names)]
3#![allow(clippy::single_call_fn)]
4#![allow(clippy::question_mark_used)]
5#![allow(clippy::too_many_arguments)]
6#![allow(clippy::fn_params_excessive_bools)]
7#![allow(clippy::struct_excessive_bools)]
8#![allow(clippy::multiple_crate_versions)]
9
//! CLI tool for searching file paths.
//!
//! A CLI tool for efficiently searching file paths in parallel.
12//!
13//! This crate provides functionality to:
14//! - Search files using regex patterns
15//! - Filter system paths
16//! - Handle both Unix and Windows paths
17//! - Process directories in parallel
18
19#[cfg(not(target_env = "msvc"))]
20#[global_allocator]
21static GLOBAL_ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
22
23#[cfg(target_env = "msvc")]
24#[global_allocator]
25static GLOBAL_ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
26
27use fnmatch_regex2::glob_to_regex;
28use ignore::{DirEntry, WalkBuilder, WalkState};
29use regex::{bytes::Regex, bytes::RegexBuilder};
30pub use std::ffi::OsString;
31use std::path::PathBuf;
32use std::process::exit as process_exit;
33pub use std::sync::mpsc::{channel as unbounded, Receiver,Sender};
34pub type BoxBytes = Box<[u8]>;
35use std::collections::HashSet;
36use std::sync::OnceLock;
37mod process_entries;
38use process_entries::{process_entry_fullpath, process_entry_shortpath};
39pub use process_entries::{FileNameBytes,AsBytes};
40mod config;
41mod constants;
42pub use config::SearchConfig;
43mod error;
44use constants::{AVOID, START_PREFIX};
45pub(crate) use constants::{DEPTH_CHECK, DOT_PATTERN};
46pub use error::ScanError;
47
48static AVOID_PATHS: OnceLock<HashSet<PathBuf>> = OnceLock::new();
49
50/// Checks if a given path should be excluded from system paths
51///
52/// # Arguments
53/// * `filepath` - The path to check
54///
55/// # Returns
56/// * `true` if the path should be included
57/// * `false` if the path should be excluded
58#[allow(clippy::inline_always)]
59#[inline(always)]
60fn avoid_sys_paths(path_entry: &DirEntry) -> bool {
61 if path_entry.depth() > DEPTH_CHECK {
62 return true;
63 }
64 let paths = AVOID_PATHS.get_or_init(|| AVOID.iter().map(PathBuf::from).collect::<HashSet<_>>());
65 !paths.contains(path_entry.path())
66}
67
68#[allow(clippy::missing_errors_doc)]
69#[must_use = "builds regex but modifies errors to map to custom error type"]
70fn build_regex(pattern: &str, case_sensitive: bool) -> Result<Regex, ScanError> {
71 RegexBuilder::new(pattern)
72 .case_insensitive(case_sensitive)
73 .build()
74 .map_err(ScanError::Regex)
75}
76
77#[must_use]
78fn process_glob_regex(glob_pattern: &str) -> String {
79 glob_to_regex(glob_pattern).map_or_else(
80 |_| {
81 eprintln!("This can't be processed as a glob pattern");
82 process_exit(1)
83 },
84 |good_pattern| good_pattern.as_str().into(),
85 )
86}
87
88/// Creates an iterator over files matching the given search configuration.
89///
90/// # Arguments
91///
92/// The search configuration (`SearchConfig`) contains:
93/// * `pattern` - A regex pattern (or a glob pattern if `use_glob` is true) to match against file paths.
94/// * `root` - The root directory from which to start the search.
95/// * `hide_hidden` - Whether to skip hidden files and directories.
96/// * `case_sensitive` - Whether regex matching should be case sensitive.
97/// * `thread_count` - Number of parallel threads to use during traversal.
98/// * `keep_dirs` - Whether to include directory paths in the output.
99/// * `keep_sys_paths` - Whether system paths should be included, overriding default filtering.
100/// * `max_depth` - Maximum directory depth to traverse.
101/// * `use_glob` - If true, the input pattern is treated as a glob pattern.
102/// * `full_path` - If true, matching is performed against the full file path instead of just the filename.
103///
104/// # Errors
105///
106/// Returns a `ScanError` if:
107/// * The regex (or glob-to-regex conversion) fails to compile.
108/// * Directory traversal fails.
109/// * File system access is denied.
110///
111/// # Returns
112///
113/// * `Result<Receiver<Box<[u8]>>, ScanError>` - An iterator over matched file paths represented as boxed bytes.
114///
115/// # Examples
116/// ```rust
117/// use scanit::{find_files_iter, SearchConfig, ScanError};
118///
119/// fn main() -> Result<(), ScanError> {
120/// let search_config = SearchConfig {
121/// pattern: r".*\.rs$".into(),
122/// root: ".".into(),
123/// hide_hidden: true,
124/// case_sensitive: false,
125/// thread_count: 4,
126/// keep_dirs: false,
127/// keep_sys_paths: false,
128/// max_depth: Some(5),
129/// use_glob: false,
130/// full_path: false,
131/// };
132///
133///
134///
135/// for path in find_files_iter(&search_config)?.iter() {
136/// println!("{:?}", &*path);
137/// }
138///
139/// Ok(())
140/// }
141/// ```
142#[inline]
143pub fn find_files_iter(search_config: &SearchConfig) -> Result<Receiver<BoxBytes>, ScanError> {
144 let (tx, rx) = unbounded::<BoxBytes>();
145
146 let pattern_to_use = if search_config.use_glob {
147 process_glob_regex(&search_config.pattern)
148 } else {
149 search_config.pattern.clone()
150 };
151
152 let re: Option<Regex> = if search_config.pattern == DOT_PATTERN {
153 None
154 } else {
155 Some(build_regex(&pattern_to_use, search_config.case_sensitive)?)
156 };
157
158 //This just avoids unnecessary boolean checks(trivial but good to do)
159 let conditional_check: bool =
160 search_config.root != START_PREFIX || search_config.keep_sys_paths;
161
162 //implementing this switch here improves performance.
163 let process_entry = if search_config.use_glob || search_config.full_path {
164 process_entry_fullpath
165 } else {
166 process_entry_shortpath
167 };
168
169 WalkBuilder::new(&search_config.root)
170 .hidden(!search_config.hide_hidden)
171 .filter_entry(move |entry| conditional_check || avoid_sys_paths(entry))
172 .git_global(false)
173 .git_ignore(false)
174 .git_exclude(false)
175 .ignore(false)
176 .max_depth(search_config.max_depth)
177 .threads(search_config.thread_count)
178 .build_parallel()
179 .run(|| {
180 Box::new(|entry| {
181 entry.map_or(WalkState::Continue, |entry_path| {
182 if !search_config.keep_dirs
183 && entry_path
184 .file_type()
185 .is_some_and(|filetype| filetype.is_dir())
186 {
187 return WalkState::Continue;
188 }
189
190 process_entry(&entry_path, re.as_ref(), &tx)
191 })
192 })
193 });
194 Ok(rx)
195}
196
197/// # Examples
198/// ```
199/// use scanit::{find_files, ScanError};
200///
201/// fn main() -> Result<(), ScanError> {
202/// // Find all Rust source files in current directory
203/// let files = find_files(
204/// r"\.rs$", // Match files ending in .rs
205/// ".", // Search in current directory
206/// true, // Skip hidden files
207/// false, // Case-insensitive matching
208/// 4, // Use 4 parallel threads
209/// false, // Don't include directory paths
210/// false, // Skip system paths
211/// Some(5), // Search up to 5 directories deep
212/// false, // Use regex (not glob) pattern
213/// false, // Match against filename only
214/// )?;
215///
216/// // Example output: ["main.rs", "lib.rs", "tests/common.rs"]
217/// for path in files {
218/// println!("{:?}", path);
219/// }
220/// Ok(())
221/// }
222/// ```
223///
224/// # Errors
225/// Returns `ScanError` if:
226/// * The regex pattern is invalid (`ScanError::Regex`)
227/// * Directory traversal fails (`ScanError::Walk`)
228/// * File system access is denied (`ScanError::Io`)
229/// * Memory allocation fails during path collection
230#[inline]
231pub fn find_files(
232 pattern: &str,
233 root: &str,
234 hide_hidden: bool,
235 case_sensitive: bool,
236 thread_count: usize,
237 keep_dirs: bool,
238 keep_sys_paths: bool,
239 max_depth: Option<usize>,
240 use_glob: bool,
241 full_path: bool,
242) -> Result<Vec<OsString>, ScanError> {
243 let search_config = SearchConfig {
244 pattern: pattern.to_string(),
245 root: root.into(),
246 hide_hidden,
247 case_sensitive,
248 thread_count,
249 keep_dirs,
250 keep_sys_paths,
251 max_depth,
252 use_glob,
253 full_path,
254 };
255
256 Ok(find_files_iter(&search_config)?
257 .iter()
258 .map(|arc_str| unsafe { OsString::from_encoded_bytes_unchecked(arc_str.into()) })
259 // SAFETY: The bytes in arc_str are guaranteed to be valid OsString data
260 // because they were originally created from OsStrings in process_entry_*.rs
261 // and have only been transmitted as raw bytes for performance reasons.
262 // This conversion is safe because:
263 // 1. The original data came from valid OsStrings
264 // 2. The bytes have not been modified during transmission
265 // 3. The platform encoding has not changed during the operation
266 .collect::<Vec<OsString>>())
267}