scanit/
lib.rs

1#![warn(clippy::all, clippy::pedantic, clippy::nursery, clippy::cargo)]
2#![allow(clippy::single_char_lifetime_names)]
3#![allow(clippy::single_call_fn)]
4#![allow(clippy::question_mark_used)]
5#![allow(clippy::too_many_arguments)]
6#![allow(clippy::fn_params_excessive_bools)]
7#![allow(clippy::struct_excessive_bools)]
8#![allow(clippy::multiple_crate_versions)]
9
//! CLI tool for searching file paths
11//! A CLI tool for efficiently searching file paths in parallel
12//!
13//! This crate provides functionality to:
14//! - Search files using regex patterns
15//! - Filter system paths
16//! - Handle both Unix and Windows paths
17//! - Process directories in parallel
18
19#[cfg(not(target_env = "msvc"))]
20#[global_allocator]
21static GLOBAL_ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
22
23#[cfg(target_env = "msvc")]
24#[global_allocator]
25static GLOBAL_ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
26
27use fnmatch_regex2::glob_to_regex;
28use ignore::{DirEntry, WalkBuilder, WalkState};
29use regex::{bytes::Regex, bytes::RegexBuilder};
30pub use std::ffi::OsString;
31use std::path::PathBuf;
32use std::process::exit as process_exit;
33pub use std::sync::mpsc::{channel as unbounded, Receiver,Sender};
34pub type BoxBytes = Box<[u8]>;
35use std::collections::HashSet;
36use std::sync::OnceLock;
37mod process_entries;
38use process_entries::{process_entry_fullpath, process_entry_shortpath};
39pub use process_entries::{FileNameBytes,AsBytes};
40mod config;
41mod constants;
42pub use config::SearchConfig;
43mod error;
44use constants::{AVOID, START_PREFIX};
45pub(crate) use constants::{DEPTH_CHECK, DOT_PATTERN};
46pub use error::ScanError;
47
48static AVOID_PATHS: OnceLock<HashSet<PathBuf>> = OnceLock::new();
49
50/// Checks if a given path should be excluded from system paths
51///
52/// # Arguments
53/// * `filepath` - The path to check
54///
55/// # Returns
56/// * `true` if the path should be included
57/// * `false` if the path should be excluded
58#[allow(clippy::inline_always)]
59#[inline(always)]
60fn avoid_sys_paths(path_entry: &DirEntry) -> bool {
61    if path_entry.depth() > DEPTH_CHECK {
62        return true;
63    }
64    let paths = AVOID_PATHS.get_or_init(|| AVOID.iter().map(PathBuf::from).collect::<HashSet<_>>());
65    !paths.contains(path_entry.path())
66}
67
68#[allow(clippy::missing_errors_doc)]
69#[must_use = "builds regex but modifies errors to map to custom error type"]
70fn build_regex(pattern: &str, case_sensitive: bool) -> Result<Regex, ScanError> {
71    RegexBuilder::new(pattern)
72        .case_insensitive(case_sensitive)
73        .build()
74        .map_err(ScanError::Regex)
75}
76
77#[must_use]
78fn process_glob_regex(glob_pattern: &str) -> String {
79    glob_to_regex(glob_pattern).map_or_else(
80        |_| {
81            eprintln!("This can't be processed as a glob pattern");
82            process_exit(1)
83        },
84        |good_pattern| good_pattern.as_str().into(),
85    )
86}
87
88/// Creates an iterator over files matching the given search configuration.
89///
90/// # Arguments
91///
92/// The search configuration (`SearchConfig`) contains:
93/// * `pattern` - A regex pattern (or a glob pattern if `use_glob` is true) to match against file paths.
94/// * `root` - The root directory from which to start the search.
95/// * `hide_hidden` - Whether to skip hidden files and directories.
96/// * `case_sensitive` - Whether regex matching should be case sensitive.
97/// * `thread_count` - Number of parallel threads to use during traversal.
98/// * `keep_dirs` - Whether to include directory paths in the output.
99/// * `keep_sys_paths` - Whether system paths should be included, overriding default filtering.
100/// * `max_depth` - Maximum directory depth to traverse.
101/// * `use_glob` - If true, the input pattern is treated as a glob pattern.
102/// * `full_path` - If true, matching is performed against the full file path instead of just the filename.
103///
104/// # Errors
105///
106/// Returns a `ScanError` if:
107/// * The regex (or glob-to-regex conversion) fails to compile.
108/// * Directory traversal fails.
109/// * File system access is denied.
110///
111/// # Returns
112///
113/// * `Result<Receiver<Box<[u8]>>, ScanError>` - An iterator over matched file paths represented as boxed bytes.
114///
115/// # Examples
116/// ```rust
117/// use scanit::{find_files_iter, SearchConfig, ScanError};
118///
119/// fn main() -> Result<(), ScanError> {
120///     let search_config = SearchConfig {
121///         pattern: r".*\.rs$".into(),
122///         root: ".".into(),
123///         hide_hidden: true,
124///         case_sensitive: false,
125///         thread_count: 4,
126///         keep_dirs: false,
127///         keep_sys_paths: false,
128///         max_depth: Some(5),
129///         use_glob: false,
130///         full_path: false,
131///     };
132///     
133///     
134///     
135///     for path in  find_files_iter(&search_config)?.iter() {
136///         println!("{:?}", &*path);
137///     }
138///     
139///     Ok(())
140/// }
141/// ```
142#[inline]
143pub fn find_files_iter(search_config: &SearchConfig) -> Result<Receiver<BoxBytes>, ScanError> {
144    let (tx, rx) = unbounded::<BoxBytes>();
145
146    let pattern_to_use = if search_config.use_glob {
147        process_glob_regex(&search_config.pattern)
148    } else {
149        search_config.pattern.clone()
150    };
151
152    let re: Option<Regex> = if search_config.pattern == DOT_PATTERN {
153        None
154    } else {
155        Some(build_regex(&pattern_to_use, search_config.case_sensitive)?)
156    };
157
158    //This just avoids unnecessary boolean checks(trivial but good to do)
159    let conditional_check: bool =
160        search_config.root != START_PREFIX || search_config.keep_sys_paths;
161
162    //implementing this switch here improves performance.
163    let process_entry = if search_config.use_glob || search_config.full_path {
164        process_entry_fullpath
165    } else {
166        process_entry_shortpath
167    };
168
169    WalkBuilder::new(&search_config.root)
170        .hidden(!search_config.hide_hidden)
171        .filter_entry(move |entry| conditional_check || avoid_sys_paths(entry))
172        .git_global(false)
173        .git_ignore(false)
174        .git_exclude(false)
175        .ignore(false)
176        .max_depth(search_config.max_depth)
177        .threads(search_config.thread_count)
178        .build_parallel()
179        .run(|| {
180            Box::new(|entry| {
181                entry.map_or(WalkState::Continue, |entry_path| {
182                    if !search_config.keep_dirs
183                        && entry_path
184                            .file_type()
185                            .is_some_and(|filetype| filetype.is_dir())
186                    {
187                        return WalkState::Continue;
188                    }
189
190                    process_entry(&entry_path, re.as_ref(), &tx)
191                })
192            })
193        });
194    Ok(rx)
195}
196
197/// # Examples
198/// ```
199/// use scanit::{find_files, ScanError};
200///
201/// fn main() -> Result<(), ScanError> {
202///     // Find all Rust source files in current directory
203///     let files = find_files(
204///         r"\.rs$",        // Match files ending in .rs
205///         ".",             // Search in current directory
206///         true,            // Skip hidden files
207///         false,           // Case-insensitive matching
208///         4,               // Use 4 parallel threads
209///         false,           // Don't include directory paths
210///         false,           // Skip system paths
211///         Some(5),         // Search up to 5 directories deep
212///         false,           // Use regex (not glob) pattern
213///         false,           // Match against filename only
214///     )?;
215///
216///     // Example output: ["main.rs", "lib.rs", "tests/common.rs"]
217///     for path in files {
218///         println!("{:?}", path);
219///     }
220///     Ok(())
221/// }
222/// ```
223///
224/// # Errors
225/// Returns `ScanError` if:
226/// * The regex pattern is invalid (`ScanError::Regex`)
227/// * Directory traversal fails (`ScanError::Walk`)
228/// * File system access is denied (`ScanError::Io`)
229/// * Memory allocation fails during path collection
230#[inline]
231pub fn find_files(
232    pattern: &str,
233    root: &str,
234    hide_hidden: bool,
235    case_sensitive: bool,
236    thread_count: usize,
237    keep_dirs: bool,
238    keep_sys_paths: bool,
239    max_depth: Option<usize>,
240    use_glob: bool,
241    full_path: bool,
242) -> Result<Vec<OsString>, ScanError> {
243    let search_config = SearchConfig {
244        pattern: pattern.to_string(),
245        root: root.into(),
246        hide_hidden,
247        case_sensitive,
248        thread_count,
249        keep_dirs,
250        keep_sys_paths,
251        max_depth,
252        use_glob,
253        full_path,
254    };
255
256    Ok(find_files_iter(&search_config)?
257        .iter()
258        .map(|arc_str| unsafe { OsString::from_encoded_bytes_unchecked(arc_str.into()) })
259        // SAFETY: The bytes in arc_str are guaranteed to be valid OsString data
260        // because they were originally created from OsStrings in process_entry_*.rs
261        // and have only been transmitted as raw bytes for performance reasons.
262        // This conversion is safe because:
263        // 1. The original data came from valid OsStrings
264        // 2. The bytes have not been modified during transmission
265        // 3. The platform encoding has not changed during the operation
266        .collect::<Vec<OsString>>())
267}