file_crawler/
lib.rs

1//! A customisable, multithreaded (optionally async) file crawler for local file systems
2//! # Getting Started
3//! It is recommended to
4//! - add it to your project
5//! ```text
6//! cargo add file-crawler
7//! ```
8//! - use the prelude
9//! ```rust
10//! use file_crawler::prelude::*;
11//! ```
12//! - and read the examples (or the [`Crawler`][crate::builder::Crawler] docs)!
13//!
14//! - While working with the library, refer to the `Crawler` documentation.
15//!
16//! # Examples
//! Below are some examples showing usage in different use cases. Reading these is enough to understand everything for most use cases.
18//! ### Example 1
19//! Here's how you create a synchronous, multithreaded `Crawler` that prints the file name of every file in a folder:
20//! ```rust,ignore
//! # fn main() -> Result<(), Box<dyn Error>> {
22//! use file_crawler::prelude::*;
23//!
24//! use std::path::PathBuf;
25//!
26//! Crawler::new()
27//!     .start_dir("C:\\user\\foo")
28//!     .run(|_, path: PathBuf| {
29//!         println!("{}", path.display());
30//!         //placeholder error type for now
31//!         Ok::<(), std::io::Error>(())
32//!     })?;
33//! Ok(())
34//! # }
35//! ```
36//! ### Example 2
37//! Actually, we left one argument out: the [`Context`][crate::builder::Crawler::context]!
38//! We didn't need it, but if we want to know how many files we have in our folder we can do this:
39//! ```rust,ignore
40//! # fn main() -> Result<(), Box<dyn Error>> {
41//! use file_crawler::prelude::*;
42//!
43//! use std::path::PathBuf;
44//! use std::sync::atomic::AtomicU32;
45//! use std::sync::{Arc, Mutex};
46//!
47//! //the context is later returned as the exact same type from the Crawler::run function
48//! //so we can bind it to a variable if needed
49//! let count=
50//! Crawler::new()
51//!     .start_dir("C:\\user\\foo")
52//!     //you can of course use atomic types, this makes more sense for numbers
53//!     .context(Mutex::new(0))
54//!     .run(|ctx: Arc<Mutex<u32>>, path: PathBuf| {
55//!         *ctx.lock().unwrap()+=1;
56//!         println!("{}", path.display());
57//!         Ok::<(), std::io::Error>(())
58//!     })?;
//!  println!("Total number of files in \"C:\\user\\foo\": {}", count.lock().unwrap());
60//! Ok(())
61//! # }
62//! ```
63//! ### Example 3
64//! Until now the `Ok()` was more mandatory than useful. Let's look at a use case where it is a big benefit,
65//! like counting the appearance of the letter '`a`' (assuming only text files are in the folder)
//! ```rust,ignore
//! # fn main() -> Result<(), Box<dyn Error>> {
//! use file_crawler::prelude::*;
//!
//! use std::fs::File;
//! use std::io::Read;
//! use std::path::PathBuf;
//! use std::sync::Arc;
//! use std::sync::atomic::{AtomicU32, Ordering};
//!
//! let a_count=
//! Crawler::new()
//!     .start_dir("C:\\user\\foo")
//!     .context(AtomicU32::new(0))
//!     .run(|ctx: Arc<AtomicU32>, path: PathBuf| {
//!         let mut contents=String::new();
//!         let mut file=File::open(path)?;
//!         //NOTE: this can cause an error for files not readable as UTF-8
//!         //which returns an error and therefore terminates the crawler
//!         file.read_to_string(&mut contents)?;
//!         contents.chars().for_each(|char| if char=='a' { ctx.fetch_add(1, Ordering::Relaxed); });
//!         Ok::<(),std::io::Error>(())
//!     })?;
//! println!("Appearance of the letter 'a' in \"C:\\user\\foo\": {}", a_count.load(Ordering::Relaxed));
//! Ok(())
//! # }
//! ```
91//! ### Example 4
//! Say you are looking for all `.txt` files in a folder that's probably very big and deeply nested, and you
//! don't want to spend all the computation power and time a full crawl would require. Then you can do something like this:
94//! ```rust,ignore
95//!  use file_crawler::prelude::*;
96//!
97//!  use std::path::PathBuf;
98//!
99//!  Crawler::new()
100//!     .start_dir("C:\\user\\probably_very_deep_folder")
101//!     //you can set a regex for every file / folder
102//!     //the closure you specify is only executed for a file if its name matches the regex
103//!     //this regex matches every single-line string ending in ".txt"
104//!     .file_regex(r"^.*\.txt$")
105//!     //sets a maximum depth (in terms of "folder layers" over each other)
106//!     .search_depth(3)
107//!     //you can also leave out the "PathBuf", before it was kept to make it easier to read
108//!     .run(|_, path| {
109//!         println!("{}", path.display());
110//!         Ok::<(), std::io::Error>(())
111//!     })?;
112//! ```
113//! You can also set a folder regex via [`Crawler::folder_regex`][crate::builder::Crawler::folder_regex], checking for the file regex
114//! in the closure is possible, but in the future declaring it on the [`Crawler`][crate::builder::Crawler] may enable further optimisations.
115//!
116//! ### Example 5
117//! A focus was also put on the laziness[^laziness_explanation] of the `Crawler`, so it is possible to create, store and *then* use one or more mostly without any heavy computations before running[^regex_compile_disclaimer]:
118//! ```rust,ignore
119//! # fn main() -> Result<(), Box<dyn Error>> {
120//! use file_crawler::prelude::*;
121//!
122//! use tokio::fs::File;
123//! use std::path::PathBuf;
124//! use std::sync::Arc;
125//! use std::sync::atomic::AtomicU32;
126//! use std::sync::atomic::Ordering;
127//! use tokio::io::AsyncReadExt;
128//!
129//! const START_DIR: &str="C:\\user\\foo";
130//!
131//! //the file types we are interested in
132//! let regexes = [
133//!                r"^.*\.txt$",
134//!                r"^.*\.elf$",
135//!                r"^.*\.png$"
136//!               ];
137//!
138//! //constructing them
139//! let crawlers = regexes.iter()
140//!                 .map(|regex|
141//!                     Crawler::new()
142//!                     .file_regex(regex)
143//!                     .start_dir(START_DIR)
144//!                 );
145//!
146//! //using them
147//! for crawler in crawlers {
148//!     crawler.run(|_, path| {
149//!         println!("{}", path.display());
150//!         Ok::<(), std::io::Error>(())
151//!     })?;
152//! }
153//! Ok(())
154//! # }
155//! ```
156//!
157//! ### Example 6
158//! Like with iterators in [`rayon`](https://crates.io/crates/rayon), you can simply exchange the [`Crawler::new`][crate::builder::Crawler::new] method with the [`Crawler::new_async`][crate::builder::Crawler::new_async]
159//! method to get an async crawler.
160//! ```rust,ignore
//! # #[tokio::main]
//! # async fn main() -> Result<(), Box<dyn Error>> {
//!  use file_crawler::prelude::*;
//!
//!  //we're using the tokio from the prelude here, no need to add it as an extra dependency
//!  use tokio::fs::File;
//!  use tokio::io::AsyncReadExt;
//!  use std::path::PathBuf;
//!  use std::sync::Arc;
//!  use std::sync::atomic::{AtomicU32, Ordering};
169//!
170//! //basically the same as example 3!
171//!  let a_count=
172//!  //only change required to make it async (except for the run(..) code)
173//!  //don't forget to enable the 'async' feature!
174//!  Crawler::new_async()
175//!     .start_dir("C:\\user\\foo")
176//!     .context(AtomicU32::new(0))
177//!     .run(async |ctx: Arc<AtomicU32>, path: PathBuf| {
178//!         let mut contents=String::new();
179//!         let mut file=File::open(path).await?;
180//!         //NOTE: this can cause an error for files not readable as UTF-8
181//!         //which returns an error and therefore terminates the crawler
182//!         file.read_to_string(&mut contents).await?;
183//!         contents.chars().for_each(|char| if char=='a' { ctx.fetch_add(1, Ordering::Relaxed); });
184//!         Ok::<(), std::io::Error>(())
185//!     }).await?;
//!  println!("Appearance of the letter 'a' in \"C:\\user\\foo\": {}", a_count.load(Ordering::Relaxed));
187//!  Ok(())
188//! # }
189//! ```
190//!
191//! # Features
192//! - **parallel**: enables non-async multithreaded Crawler execution via the [`rayon`](https://crates.io/crates/rayon) crate. *Enabled by default*.
193//! - **async**: enables asynchronous, multithreaded[^async_disclaimer] Crawler execution via [`tokio`](https://crates.io/crates/tokio).
194//! - **lazy_store**: enables creation of async and non-async `Crawler`s for later usage or interfacing with other crates, but not running them so tokio/rayon do not need to be compiled[^lazy_store_redundancy].
195//!
196//! # Planned Features
197//! - **chili**: [`chili`](https://crates.io/crates/chili) as an optional backend (instead of [`rayon`](https://crates.io/crates/rayon), [GitHub issue](https://github.com/HQ2000-Rust/Custom-File-Crawler/issues/1))
198//!
199//! # Panics
200//! In general - especially with the focus on the [`Crawler`][crate::builder::Crawler]'s laziness - it is desirable to have as many potential panics at creation, not at runtime (in terms of calling run on the `Crawler`).
201//! Panics can (for example) occur when setting the regex to an invalid string, this may be changed in the future. So, if the creation of the `Crawler` succeeds, running will most likely *not* cause a panic.
202//!
203//!
//! [^async_disclaimer]: Currently, the async version demands a tokio runtime with at least 2 threads. Running it in a single threaded runtime is theoretically possible, but causes indefinite execution, so a single-threaded (`current_thread`) runtime **won't work**.
205//! [^lazy_store_redundancy]: Not necessary if both the **parallel** and **async** feature are enabled.
206//! [^laziness_explanation]: [lazy evaluation](https://en.wikipedia.org/wiki/Lazy_evaluation).
207//! [^regex_compile_disclaimer]: one exception is setting a regex because it is compiled on setting it to emit an early panic.
208
209pub mod prelude;
210/// Building the crawler via the builder pattern, only way as of now
211pub mod builder {
212    use crate::builder::{
213        context::NoContext,
214        internal::{config::Config, utils::box_err},
215    };
216    #[cfg(any(feature = "parallel", feature= "lazy_store", doc))]
217    use crate::builder::{internal::par_run, marker::NonAsync};
218    #[cfg(any(feature = "async", feature = "lazy_store", doc))]
219    use crate::builder::{
220        internal::{async_run, utils::Execute},
221        marker::Async,
222    };
223
224    #[cfg(any(feature = "async", doc))]
225    use std::future::Future;
226    use std::{error::Error, marker::Send};
227    use regex::Regex;
228    use std::{
229        fmt::Debug,
230        marker::PhantomData,
231        path::{Path, PathBuf},
232        sync::Arc,
233    };
234    //to not get a compile error for docs
235    #[cfg(any(feature = "async", doc))]
236    use tokio::sync::mpsc::error::TryRecvError;
237
238    ///"Asyncness" markers. They do nothing.
239    pub mod marker {
240        #[cfg(any(feature = "parallel", feature = "lazy_store",doc))]
241        #[derive(Default, Copy, Clone, Debug)]
242        pub struct NonAsync;
243        #[cfg(any(feature = "async", feature = "lazy_store", doc))]
244        #[derive(Default, Copy, Clone, Debug)]
245        pub struct Async;
246    }
247    /// [`NoContext`] placeholder
248    pub mod context {
249        #[derive(Debug, Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
250        pub struct NoContext;
251    }
252
253    #[derive(Default, Clone, Debug)]
254    enum StartDir {
255        #[default]
256        Current,
257        Custom(PathBuf),
258    }
259
    ///The core of this library.
    /// Create one with [`Crawler::new`][crate::builder::Crawler::new] or [`Crawler::new_async`][crate::builder::Crawler::new_async]
    /// to get started. Also see the [examples][crate#Examples].
    ///
    /// `A` is the "asyncness" marker (it is only stored as [`PhantomData`]) and `C` is the type
    /// of the context value the crawler carries.
    #[derive(Debug, Clone, Default)]
    pub struct Crawler<A, C> {
        /// Directory the crawl starts in; `StartDir::Current` is resolved when the crawler is run.
        start_dir: StartDir,
        /// Regex checked against files, if one was set.
        file_regex: Option<Regex>,
        /// Regex checked against folders, if one was set.
        folder_regex: Option<Regex>,
        /// Maximum search depth, if one was set.
        max_depth: Option<u32>,
        /// The context value; it is wrapped in an `Arc` when the crawler is run.
        context: C,
        /// Zero-sized tag recording the marker type `A`.
        async_marker: PhantomData<A>,
    }
272
273
274    #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
275    impl Crawler<NonAsync, NoContext> {
276        ///Create a new non-async, parallel [`Crawler`] without any context
277        /// ```rust
278        /// # fn main() {
279        /// # use file_crawler::builder::Crawler;
280        /// let parallel_crawler=Crawler::new();
281        /// # }
282        /// ```
283        pub fn new() -> Self {
284            //if there are diverging attributes for the Sync/Async versions later
285            Self { ..Self::default() }
286        }
287    }
288
289
290
291
292
293    impl<C> Crawler<NonAsync, C>
294    where
295        C: Send + Sync,
296    {
297        ///Sets the directory the crawler should start in. Default is the current directory, resolved when
298        ///[`Crawler::run`][crate::builder::Crawler::run] is called, if that fails, it panics before doing anything.
299        ///```rust,ignore
300        /// # fn main() -> Result<(),Box<dyn Error>> {
301        /// # use file_crawler::builder::Crawler;
302        /// use std::collections::HashSet;
303        /// use std::path::PathBuf;
304        /// use std::sync::Mutex;
305        ///
306        /// //Assuming that the content of C:\foo isn't changing during execution
307        /// //and this program is executed in that same folder
308        /// let crawler_1_result =
309        /// Crawler::new()
310        ///    .start_dir("C:\\foo")
311        ///    .context(Mutex::new(Vec::new()))
312        ///    .run(|ctx: Mutex<Vec<String>>, path| {
313        ///        ctx.lock().unwrap().insert(path.display());
314        ///    });
315        /// let crawler_2_result =
316        /// Crawler::new()
317        ///    .context(Mutex::new(Vec::new()))
318        ///    .run(|ctx: Mutex<HashSet<String>>, path| {
319        ///        ctx.lock().unwrap().insert(path.display());
320        ///    })?;
321        ///
322        /// //then (not guaranteed when using a Vec instead of a HashSet by design)
323        /// assert_eq!(crawler_1_result, crawler_2_result);
324        /// # }
325        /// ```
326        #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
327        pub fn start_dir<P: AsRef<Path>>(self, path: P) -> Self {
328            self.start_dir_(path.as_ref())
329        }
330        ///Only applies the closure in [`run`][crate::builder::Crawler::run] to a file if the given regex matches
331        /// ```rust,ignore
332        /// # fn main() -> Result<(), Box<dyn Error>>{
333        /// # use file_crawler::builder::Crawler;
334        ///
335        /// //prints all text files in the current directory (and all its subfolders)
336        /// Crawler::new()
337        ///     .file_regex(r"^.*\.txt$")
338        ///     .run(|_, path| {
339        ///         println!("{}", path.display());
340        ///     })?;
341        /// # }
342        /// ```
343        #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
344        pub fn file_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
345            self.file_regex_(regex.as_ref())
346        }
347        ///Only go into a folder if matches the given regex (meaning all files and subfolders etc. will not be traversed)
348        /// ```rust,ignore
349        /// //given this folder structure:
350        /// //foo
351        /// // |--bar
352        /// // |   |--foo.txt
353        /// // |--foobar
354        /// // |    |---barbar
355        /// // |           |---baz.txt
356        /// // |--foo
357        /// //     |--baz.txt
358        /// # fn main() -> Result<(), Box<dyn Error>>{
359        /// # use file_crawler::builder::Crawler;
360        ///
361        /// //this prints *only* baz.txt because the regex matches "foo", but not "bar" or "barbar" AND "foobar"
362        /// Crawler::new()
363        ///     .start_dir("path\\to\\foo")
364        ///     .folder_regex("foo")
365        ///     .run(|_, path| {
366        ///         println!("{}", path.display());
367        ///     })?;
368        /// # }
369        /// ```
370        #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
371        pub fn folder_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
372            self.folder_regex_(regex.as_ref())
373        }
374        ///How deep (in terms of folder layers over each other) the [`Crawler`] should go
375        /// ```rust
376        /// # fn main() -> Result<(), Box<dyn Error>>{
377        /// # use file_crawler::builder::Crawler;
378        ///
379        /// //prints all text files in the current directory, but not its subfolders
380        /// Crawler::new()
381        ///     //exchanging the 0 with a 1 mean that it also traverses the subfolders, but not their subfolders
382        ///     .search_depth(0)
383        ///     .file_regex(r"^.*\.txt$")
384        ///     .run(|_, path| {
385        ///         println!("{}", path.display());
386        ///     })?;
387        /// # }
388        /// ```
389        #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
390        pub fn search_depth(self, depth: u32) -> Self {
391            self.search_depth_(depth)
392        }
393        ///Adds a context ( = a value that is passed to the closure on every invocation via an [`Arc`]) with the type `CNEW`.
394        /// It is returned from the [`run`][crate::builder::Crawler::run] function after execution.
395        /// Defaults to the zero-sized [`NoContext`].
396        ///```rust
397        /// # fn main() -> Result<(), Box<dyn Error>>{
398        /// # use file_crawler::builder::Crawler;
399        /// use std::sync::atomic::AtomicU16;
400        ///
401        /// //bind the context to a variable
402        /// let result =
403        /// Crawler::new()
404        ///    //adds a counter (for everything not representable with Atomics, a Mutex is recommended)
405        ///     .context(AtomicU16::new(0))
406        ///     .run(|_, path| {
407        ///         println!("{}", path.display());
408        ///     })?;
409        /// println!("{} files in the current directory")
410        /// # }
411        /// ```
412        #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
413        pub fn context<CNEW: Send + Sync>(self, context: CNEW) -> Crawler<NonAsync, CNEW> {
414            self.context_(context)
415        }
416
417        ///Runs the (modified) Crawler returned from [`Crawler::new`][crate::builder::Crawler::new], execution a closure that's passed
418        ///a [`Context`][crate::builder::Crawler::context] and the path of the file for every file in the specified directory. For exceptions,
419        ///see [`search_depth`][crate::builder::Crawler::search_depth], [`file_regex`][crate::builder::Crawler::file_regex] and [`folder_regex`][crate::builder::Crawler::folder_regex].
420        /// ```rust,ignore
421        /// # fn main() -> Result<Box<dyn Error>> {
422        ///  use file_crawler::prelude::*;
423        ///
424        ///  use std::path::PathBuf;
425        ///
426        ///  Crawler::new()
427        ///     .start_dir("C\\user\\foo")
428        ///     .file_regex(r"^.*\.txt$")
429        ///     .search_depth(3)
430        ///     .run(|_, path| {
431        ///         println!("{}", path.display());
432        ///         Ok(())
433        ///     })?;
434        /// # }
435        /// ```
436        #[cfg(any(feature = "parallel", doc))]
437        pub fn run<A, E>(self, action: A) -> Result<C, Box<dyn Error + Send + 'static>>
438        where
439            A: FnMut(Arc<C>, PathBuf) -> Result<(), E> + Clone + Send + Sync,
440            E: Error + Send + 'static,
441        {
442            let start_dir = match self.start_dir {
443                StartDir::Custom(path) => path,
444                StartDir::Current => std::env::current_dir().map_err(box_err)?,
445            };
446
447            let result = par_run::<A, E, C>(
448                action,
449                Config {
450                    start_dir,
451                    file_regex: self.file_regex,
452                    folder_regex: self.folder_regex,
453                    max_depth: self.max_depth,
454                    context: Arc::new(self.context),
455                },
456            )?;
457            Ok(Arc::into_inner(result).expect("Every other Arc should have been dropped by now"))
458        }
459    }
460
461    #[cfg(any(feature = "async", feature = "lazy_store", doc))]
462    impl Crawler<Async, NoContext> {
463        ///Create a new async (parallel)[^async_disclaimer] [`Crawler`] without any context
464        /// ```rust
465        /// # #[tokio::main]
466        /// # async fn main() {
467        /// # use file_crawler::builder::Crawler;
468        /// let async_crawler=Crawler::new_async();
469        /// # }
470        /// ```
471        pub fn new_async() -> Self {
472            //same as above
473            Self { ..Self::default() }
474        }
475    }
476
477    #[cfg(any(feature = "async", feature = "lazy_store", doc))]
478    impl<C> Crawler<Async, C>
479    where
480        C: Send + Sync + 'static,
481    {
482        /// See [`Crawler::search_depth`][crate::builder::Crawler::start_dir].
483        #[cfg(any(feature = "async", feature = "lazy_store", doc))]
484        pub fn start_dir<P: AsRef<Path>>(self, path: P) -> Self {
485            self.start_dir_(path.as_ref())
486        }
487        /// See [`Crawler::file_regex`][crate::builder::Crawler::file_regex].
488        #[cfg(any(feature = "async", feature = "lazy_store", doc))]
489        pub fn file_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
490            self.file_regex_(regex.as_ref())
491        }
492        /// See [`Crawler::folder_regex`][crate::builder::Crawler::folder_regex].
493        #[cfg(any(feature = "async", feature = "lazy_store", doc))]
494        pub fn folder_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
495            self.folder_regex_(regex.as_ref())
496        }
497        /// See [`Crawler::search_depth`][crate::builder::Crawler::search_depth].
498        #[cfg(any(feature = "async", feature = "lazy_store", doc))]
499        pub fn search_depth(self, depth: u32) -> Self {
500            self.search_depth_(depth)
501        }
502        /// See [`Crawler::context`][crate::builder::Crawler::context].
503        #[cfg(any(feature = "async", feature = "lazy_store", doc))]
504        pub fn context<CNEW: Send + Sync + 'static>(self, context: CNEW) -> Crawler<Async, CNEW> {
505            self.context_(context)
506        }
507        ///Runs a (modified) asynchronous file crawler from [`Crawler::new_async`][crate::builder::Crawler::new_async] using [`tokio`](https://crates.io/crates/tokio).
508        ///Requires an at least two-threaded runtime ([3][crate#fnref3]).
509        ///Otherwise, the same as the synchronous version. It is recommended to use the exposed `tokio` (through the `prelude`) dependency instead of `std` when possible.
510        /// ```rust,ignore
511        /// # fn main() -> Result<Box<dyn Error>> {
512        ///  use file_crawler::prelude::*;
513        ///
514        ///  use std::path::PathBuf;
515        ///
516        ///  Crawler::new()
517        ///     .start_dir("C\\user\\foo")
518        ///     .file_regex(r"^.*\.txt$")
519        ///     .run(|_, path| {
520        ///         let contents=String::new();
521        ///         let file=tokio::fs::File::open(&path).await?;
522        ///         file.read_to_string(&mut contents).await?;
523        ///         println!("{}:\n{}", path.display(), contents);
524        ///         Ok(())
525        ///     })?;
526        /// # }
527        #[cfg(any(feature = "async", doc))]
528        pub async fn run<Fun, Fut, E>(
529            self,
530            action: Fun,
531        ) -> Result<C, Box<dyn Error + Send + 'static>>
532        where
533            E: Send + Error + 'static,
534            Fun: Fn(Arc<C>, PathBuf) -> Fut + Send + 'static + Clone,
535            Fut: Future<Output = Result<(), E>> + Send + 'static,
536        {
537            let (task_authority_tx, mut task_authority_rx) =
538                tokio::sync::mpsc::unbounded_channel::<Execute<E>>();
539
540            let task_authority = tokio::task::spawn_blocking(
541                async move || -> Result<(), Box<dyn Error + Send + 'static>> {
542                    let mut recursion_tasks = tokio::task::JoinSet::new();
543                    let mut action_tasks = tokio::task::JoinSet::new();
544
545                    loop {
546                        match task_authority_rx.try_recv() {
547                            Ok(signal) => match signal {
548                                Execute::Recursion(task) => drop(recursion_tasks.spawn(task)),
549                                Execute::Action(task) => drop(action_tasks.spawn(task)),
550                            },
551                            Err(e) => match e {
552                                TryRecvError::Disconnected => {
553                                    unreachable!("Senders shouldn't be dropped by now");
554                                }
555                                //fall-through
556                                TryRecvError::Empty => {}
557                            },
558                        };
559                        match (recursion_tasks.is_empty(), action_tasks.is_empty()) {
560                            (true, true) => break,
561                            (rec, act) => {
562                                if !rec {
563                                    if let Some(result) = recursion_tasks.try_join_next() {
564                                        //unwrap to propagate panics
565                                        result.unwrap().map_err(box_err)?;
566                                    }
567                                }
568                                if !act {
569                                    if let Some(result) = action_tasks.try_join_next() {
570                                        //unwrap to propagate panics
571                                        result.unwrap().map_err(box_err)?;
572                                    }
573                                }
574                            }
575                        }
576                    }
577
578                    Ok::<(), Box<dyn Error + Send + 'static>>(())
579                },
580            );
581
582            let start_dir = match self.start_dir {
583                StartDir::Custom(path) => path,
584                StartDir::Current => std::env::current_dir().map_err(box_err)?,
585            };
586
587            let config = Config {
588                //this is the start of the invariant get_custom_dir() relies on through the whole execution
589                start_dir,
590                context: Arc::new(self.context),
591                max_depth: self.max_depth,
592                folder_regex: self.folder_regex,
593                file_regex: self.file_regex,
594            };
595
596            task_authority_tx
597                .send(Execute::Recursion(async_run(
598                    task_authority_tx.clone(),
599                    action,
600                    Config {
601                        context: Arc::clone(&config.context),
602                        ..config
603                    },
604                )))
605                .expect("The Reveiver should not have been dropped by now");
606
607            task_authority.await.unwrap().await?;
608
609            Ok(Arc::into_inner(config.context)
610                .expect("Every other clone should have been dropped by now"))
611        }
612    }
613
614    pub(in crate::builder) mod internal {
615        pub(crate) mod utils {
616
617            use std::error::Error;
618
619            pub(crate) fn box_err(
620                error: impl Error + Send + 'static,
621            ) -> Box<dyn Error + Send + 'static> {
622                Box::new(error)
623            }
624
625            #[cfg(any(feature = "async", doc))]
626            use std::pin::Pin;
627            #[cfg(any(feature = "async", doc))]
628            pub(in crate::builder) enum Execute<E> {
629                Recursion(Pin<Box<dyn Future<Output = Result<(), std::io::Error>> + Send>>),
630                Action(Pin<Box<dyn Future<Output = Result<(), E>> + Send>>),
631            }
632        }
633        pub(in crate::builder) mod config {
634            use crate::builder::{Crawler, StartDir};
635            use std::path::PathBuf;
636            use std::sync::Arc;
637
638            //I could add C: ?Sized, but that would make no difference because in the Crawler C: Sized...
639            #[derive(Debug, Clone)]
640            pub(in crate::builder) struct Config<C> {
641                pub(in crate::builder) start_dir: PathBuf,
642                pub(in crate::builder) file_regex: Option<regex::Regex>,
643                pub(in crate::builder) folder_regex: Option<regex::Regex>,
644                pub(in crate::builder) max_depth: Option<u32>,
645
646                pub(in crate::builder) context: Arc<C>,
647            }
648            impl<A, C: 'static> From<Crawler<A, C>> for Config<C> {
649                fn from(value: Crawler<A, C>) -> Self {
650                    Self {
651                        start_dir: match value.start_dir {
652                            StartDir::Custom(path) => path,
653                            StartDir::Current => unreachable!("Ensure that this isn't the case"),
654                        }
655                        .to_path_buf(),
656                        context: Arc::new(value.context),
657                        file_regex: value.file_regex,
658                        folder_regex: value.folder_regex,
659                        max_depth: value.max_depth,
660                    }
661                }
662            }
663        }
664        //this impl imposes minimal trait bounds for C, so I can have custom ones for async/non_async
665
666        //this prevents error with trait bounds when trying to run stored crawlers
667
668        //could make a feature gate here, but it makes no sense to disable both async and parallel
669        impl<A, C> Crawler<A, C> {
670            pub(in crate::builder) fn start_dir_(self, path: &Path) -> Self {
671                Crawler {
672                    start_dir: StartDir::Custom(path.to_path_buf()),
673                    ..self
674                }
675            }
676            pub(in crate::builder) fn file_regex_(self, regex: &str) -> Self {
677                Self {
678                    file_regex: match Regex::new(regex) {
679                        Ok(re) => Some(re),
680                        Err(e) => panic!("Error compiling file regex: {}", e),
681                    },
682                    ..self
683                }
684            }
685            pub(in crate::builder) fn folder_regex_(self, regex: &str) -> Self {
686                Self {
687                    folder_regex: match Regex::new(regex) {
688                        Ok(re) => Some(re),
689                        Err(e) => panic!("Error compiling folder regex: {}", e),
690                    },
691                    ..self
692                }
693            }
694            pub(in crate::builder) fn search_depth_(self, depth: u32) -> Self {
695                Self {
696                    max_depth: Some(depth),
697                    ..self
698                }
699            }
700            pub(in crate::builder) fn context_<CNEW>(self, context: CNEW) -> Crawler<A, CNEW> {
701                //sadly this is necessary because of the different types...
702                Crawler::<A, CNEW> {
703                    context,
704                    start_dir: self.start_dir,
705                    file_regex: self.file_regex,
706                    folder_regex: self.folder_regex,
707                    max_depth: self.max_depth,
708                    async_marker: self.async_marker,
709                }
710            }
711        }
712        pub(super) mod regex {
713            use crate::builder::internal::config::Config;
714
715            impl<C> Config<C> {
716                pub(in crate::builder) fn validate_folder_regex(&self, str: &str) -> bool {
717                    self.folder_regex
718                        .as_ref()
719                        .map_or(true, |regex| regex.is_match(str))
720                }
721                pub(in crate::builder) fn validate_file_regex(&self, str: &str) -> bool {
722                    self.file_regex
723                        .as_ref()
724                        .map_or(true, |regex| regex.is_match(str))
725                }
726            }
727        }
/// Walks `config.start_dir` in parallel and calls `action` for every accepted
/// non-directory entry.
///
/// The directory's entries are bridged onto rayon's parallel iterator. For each entry:
/// - a directory is descended into (by a recursive call with a cloned `action` and a
///   depth budget reduced by one) only if the depth budget is not `Some(0)` and the
///   folder regex accepts its path; otherwise the directory is skipped entirely and is
///   never handed to `action`;
/// - any other entry is passed to a clone of `action`, together with the shared
///   context, if the file regex accepts its path.
///
/// A `max_depth` of `None` means the depth is unlimited.
///
/// # Errors
/// Returns an error, converted with `box_err`, if the directory cannot be opened, an
/// entry cannot be read, or `action` fails. Errors from nested directories are
/// propagated unchanged.
///
/// # Returns
/// The shared context stored in `config` once every entry has been processed.
#[cfg(any(feature = "parallel", doc))]
pub(in crate::builder) fn par_run<A, E, C>(
    action: A,
    config: Config<C>,
) -> Result<Arc<C>, Box<dyn Error + Send + 'static>>
where
    A: FnMut(Arc<C>, PathBuf) -> Result<(), E> + Clone + Send + Sync,
    E: Error + Send + 'static,
    C: Send + Sync,
{
    use rayon::prelude::*;
    // a bare '?' doesn't work here (non-trivial trait bound conversions), hence `box_err`
    let entries = std::fs::read_dir(&config.start_dir).map_err(box_err)?;
    //could optimize that later with .filter()
    entries.par_bridge().try_for_each(|result| {
        let path = result.map_err(box_err)?.path();
        if path.is_dir() {
            // `Some(0)` means the depth budget is spent: the directory is neither
            // entered nor treated as a file.
            let depth_left = !matches!(config.max_depth, Some(0));
            if depth_left && config.validate_folder_regex(&path.to_string_lossy()) {
                let child = Config {
                    start_dir: path,
                    // cannot underflow: `Some(0)` was excluded above
                    max_depth: config.max_depth.map(|depth| depth - 1),
                    folder_regex: config.folder_regex.clone(),
                    file_regex: config.file_regex.clone(),
                    context: Arc::clone(&config.context),
                };
                par_run(action.clone(), child)?;
            }
        } else if config.validate_file_regex(&path.to_string_lossy()) {
            // `action` is `FnMut` but only shared here, so every call works on its own clone
            action.clone()(Arc::clone(&config.context), path).map_err(box_err)?;
        }
        // spell out the error type so the closure's return type is unambiguous
        Ok::<(), Box<dyn Error + Send>>(())
    })?;
    Ok(config.context)
}
765
766        #[cfg(any(feature = "async", doc))]
767        use crate::builder::internal::utils::Execute;
768        use crate::builder::{
769            Crawler, StartDir,
770            internal::{config::Config, utils::box_err},
771        };
772        use ::regex::Regex;
773        use std::{
774            error::Error,
775            path::{Path, PathBuf},
776            sync::Arc,
777        };
778
779        #[cfg(any(feature = "async", doc))]
780        use std::pin::Pin;
781        #[cfg(any(feature = "async", doc))]
782        use tokio::sync::mpsc::UnboundedSender;
783
/// Builds the future that scans `config.start_dir` and reports every piece of
/// follow-up work through `authority_sender`.
///
/// The future does not run actions or descend itself. For each directory entry:
/// - a directory is reported as `Execute::Recursion`, wrapping a nested `async_run`
///   future with a depth budget reduced by one, only if the depth budget is not
///   `Some(0)` and the folder regex accepts its path; otherwise the directory is
///   skipped entirely and is never handed to `action`;
/// - any other entry is reported as `Execute::Action`, wrapping the future returned by
///   a clone of `action` called with the shared context, if the file regex accepts
///   its path.
///
/// A `max_depth` of `None` means the depth is unlimited.
///
/// # Errors
/// The returned future resolves to the `std::io::Error` raised while opening the
/// directory or fetching its next entry.
///
/// # Panics
/// The returned future panics if sending on `authority_sender` fails.
#[cfg(any(feature = "async", doc))]
pub(in crate::builder) fn async_run<Fun, Fut, E, C>(
    authority_sender: UnboundedSender<Execute<E>>,
    action: Fun,
    config: Config<C>,
) -> Pin<Box<dyn Future<Output = Result<(), std::io::Error>> + Send>>
where
    E: Send + 'static + Error,
    Fun: Fn(Arc<C>, PathBuf) -> Fut + Send + 'static + Clone,
    Fut: Future<Output = Result<(), E>> + Send + 'static,
    C: Send + Sync + 'static,
{
    Box::pin(async move {
        //here, the Custom(_) invariant is important
        let mut entries = tokio::fs::read_dir(&config.start_dir).await?;

        while let Some(entry) = entries.next_entry().await? {
            let path = entry.path();
            // Async counterpart of `Path::is_dir`: follows symlinks and treats any
            // metadata error as "not a directory", without blocking the executor thread.
            let is_dir = tokio::fs::metadata(&path)
                .await
                .map(|meta| meta.is_dir())
                .unwrap_or(false);
            if is_dir {
                // `Some(0)` means the depth budget is spent: the directory is neither
                // entered nor treated as a file.
                let depth_left = !matches!(config.max_depth, Some(0));
                if depth_left && config.validate_folder_regex(&path.to_string_lossy()) {
                    let child = Config {
                        start_dir: path,
                        // cannot underflow: `Some(0)` was excluded above
                        max_depth: config.max_depth.map(|depth| depth - 1),
                        context: Arc::clone(&config.context),
                        file_regex: config.file_regex.clone(),
                        folder_regex: config.folder_regex.clone(),
                    };
                    authority_sender
                        .send(Execute::Recursion(Box::pin(
                            async_run::<Fun, Fut, E, C>(
                                authority_sender.clone(),
                                action.clone(),
                                child,
                            ),
                        )))
                        .expect("The Receiver should not have been dropped by now");
                }
            } else if config.validate_file_regex(&path.to_string_lossy()) {
                authority_sender
                    .send(Execute::Action(Box::pin(action.clone()(
                        Arc::clone(&config.context),
                        path,
                    ))))
                    .expect("The Receiver should not be dropped by now");
            }
        }
        Ok::<(), std::io::Error>(())
    })
}
839    }
840}