file_crawler/lib.rs
1//! A customisable, multithreaded (optionally async) file crawler for local file systems
2//! # Getting Started
3//! It is recommended to
4//! - add it to your project
5//! ```text
6//! cargo add file-crawler
7//! ```
8//! - use the prelude
9//! ```rust
10//! use file_crawler::prelude::*;
11//! ```
12//! - and read the examples (or the [`Crawler`][crate::builder::Crawler] docs)!
13//!
14//! - While working with the library, refer to the `Crawler` documentation.
15//!
16//! # Examples
//! Below are some examples showing usage in different use cases. Reading these is enough to understand everything for most use cases.
18//! ### Example 1
19//! Here's how you create a synchronous, multithreaded `Crawler` that prints the file name of every file in a folder:
20//! ```rust,ignore
//! # fn main() -> Result<(), Box<dyn Error>> {
22//! use file_crawler::prelude::*;
23//!
24//! use std::path::PathBuf;
25//!
26//! Crawler::new()
27//! .start_dir("C:\\user\\foo")
28//! .run(|_, path: PathBuf| {
29//! println!("{}", path.display());
30//! //placeholder error type for now
31//! Ok::<(), std::io::Error>(())
32//! })?;
33//! Ok(())
34//! # }
35//! ```
36//! ### Example 2
37//! Actually, we left one argument out: the [`Context`][crate::builder::Crawler::context]!
38//! We didn't need it, but if we want to know how many files we have in our folder we can do this:
39//! ```rust,ignore
40//! # fn main() -> Result<(), Box<dyn Error>> {
41//! use file_crawler::prelude::*;
42//!
43//! use std::path::PathBuf;
44//! use std::sync::atomic::AtomicU32;
45//! use std::sync::{Arc, Mutex};
46//!
47//! //the context is later returned as the exact same type from the Crawler::run function
48//! //so we can bind it to a variable if needed
49//! let count=
50//! Crawler::new()
51//! .start_dir("C:\\user\\foo")
52//! //you can of course use atomic types, this makes more sense for numbers
53//! .context(Mutex::new(0))
54//! .run(|ctx: Arc<Mutex<u32>>, path: PathBuf| {
55//! *ctx.lock().unwrap()+=1;
56//! println!("{}", path.display());
57//! Ok::<(), std::io::Error>(())
58//! })?;
//! println!("Total number of files in \"C:\\user\\foo\": {}", count.lock().unwrap());
60//! Ok(())
61//! # }
62//! ```
63//! ### Example 3
64//! Until now the `Ok()` was more mandatory than useful. Let's look at a use case where it is a big benefit,
65//! like counting the appearance of the letter '`a`' (assuming only text files are in the folder)
//! ```rust,ignore
//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error + Send>> {
//! use file_crawler::prelude::*;
//!
//! use std::fs::File;
//! use std::io::Read;
//! use std::path::PathBuf;
//! use std::sync::Arc;
//! use std::sync::atomic::{AtomicU32, Ordering};
//!
//! let a_count=
//!     Crawler::new()
//!         .start_dir("C:\\user\\foo")
//!         .context(AtomicU32::new(0))
//!         .run(|ctx: Arc<AtomicU32>, path: PathBuf| {
//!             let mut contents=String::new();
//!             let mut file=File::open(path)?;
//!             //NOTE: this can cause an error for files not readable as UTF-8
//!             //which returns an error and therefore terminates the crawler
//!             file.read_to_string(&mut contents)?;
//!             contents.chars().for_each(|char| if char=='a' { ctx.fetch_add(1, Ordering::Relaxed); });
//!             Ok::<(),std::io::Error>(())
//!         })?;
//! println!("Appearance of the letter 'a' in \"C:\\user\\foo\": {}", a_count.load(Ordering::Relaxed));
//! Ok(())
//! # }
//! ```
91//! ### Example 4
//! Say you are looking for all `.txt` files in a folder that's probably very big and deeply nested, and
//! you don't want to spend all the computation power and time a full traversal would require. Then you can do something like this:
94//! ```rust,ignore
95//! use file_crawler::prelude::*;
96//!
97//! use std::path::PathBuf;
98//!
99//! Crawler::new()
100//! .start_dir("C:\\user\\probably_very_deep_folder")
101//! //you can set a regex for every file / folder
102//! //the closure you specify is only executed for a file if its name matches the regex
103//! //this regex matches every single-line string ending in ".txt"
104//! .file_regex(r"^.*\.txt$")
105//! //sets a maximum depth (in terms of "folder layers" over each other)
106//! .search_depth(3)
107//! //you can also leave out the "PathBuf", before it was kept to make it easier to read
108//! .run(|_, path| {
109//! println!("{}", path.display());
110//! Ok::<(), std::io::Error>(())
111//! })?;
112//! ```
113//! You can also set a folder regex via [`Crawler::folder_regex`][crate::builder::Crawler::folder_regex], checking for the file regex
114//! in the closure is possible, but in the future declaring it on the [`Crawler`][crate::builder::Crawler] may enable further optimisations.
115//!
116//! ### Example 5
117//! A focus was also put on the laziness[^laziness_explanation] of the `Crawler`, so it is possible to create, store and *then* use one or more mostly without any heavy computations before running[^regex_compile_disclaimer]:
118//! ```rust,ignore
119//! # fn main() -> Result<(), Box<dyn Error>> {
120//! use file_crawler::prelude::*;
121//!
122//! use tokio::fs::File;
123//! use std::path::PathBuf;
124//! use std::sync::Arc;
125//! use std::sync::atomic::AtomicU32;
126//! use std::sync::atomic::Ordering;
127//! use tokio::io::AsyncReadExt;
128//!
129//! const START_DIR: &str="C:\\user\\foo";
130//!
131//! //the file types we are interested in
132//! let regexes = [
133//! r"^.*\.txt$",
134//! r"^.*\.elf$",
135//! r"^.*\.png$"
136//! ];
137//!
138//! //constructing them
139//! let crawlers = regexes.iter()
140//! .map(|regex|
141//! Crawler::new()
142//! .file_regex(regex)
143//! .start_dir(START_DIR)
144//! );
145//!
146//! //using them
147//! for crawler in crawlers {
148//! crawler.run(|_, path| {
149//! println!("{}", path.display());
150//! Ok::<(), std::io::Error>(())
151//! })?;
152//! }
153//! Ok(())
154//! # }
155//! ```
156//!
157//! ### Example 6
158//! Like with iterators in [`rayon`](https://crates.io/crates/rayon), you can simply exchange the [`Crawler::new`][crate::builder::Crawler::new] method with the [`Crawler::new_async`][crate::builder::Crawler::new_async]
159//! method to get an async crawler.
//! ```rust,ignore
//! # use std::error::Error;
//! # #[tokio::main]
//! # async fn main() -> Result<(), Box<dyn Error + Send>> {
//! use file_crawler::prelude::*;
//!
//! //we're using the tokio from the prelude here, no need to add it as an extra dependency
//! use tokio::fs::File;
//! use tokio::io::AsyncReadExt;
//! use std::path::PathBuf;
//! use std::sync::Arc;
//! use std::sync::atomic::{AtomicU32, Ordering};
//!
//! //basically the same as example 3!
//! let a_count=
//!     //only change required to make it async (except for the run(..) code)
//!     //don't forget to enable the 'async' feature!
//!     Crawler::new_async()
//!         .start_dir("C:\\user\\foo")
//!         .context(AtomicU32::new(0))
//!         .run(async |ctx: Arc<AtomicU32>, path: PathBuf| {
//!             let mut contents=String::new();
//!             let mut file=File::open(path).await?;
//!             //NOTE: this can cause an error for files not readable as UTF-8
//!             //which returns an error and therefore terminates the crawler
//!             file.read_to_string(&mut contents).await?;
//!             contents.chars().for_each(|char| if char=='a' { ctx.fetch_add(1, Ordering::Relaxed); });
//!             Ok::<(), std::io::Error>(())
//!         }).await?;
//! println!("Appearance of the letter 'a' in \"C:\\user\\foo\": {}", a_count.load(Ordering::Relaxed));
//! Ok(())
//! # }
//! ```
190//!
191//! # Features
192//! - **parallel**: enables non-async multithreaded Crawler execution via the [`rayon`](https://crates.io/crates/rayon) crate. *Enabled by default*.
193//! - **async**: enables asynchronous, multithreaded[^async_disclaimer] Crawler execution via [`tokio`](https://crates.io/crates/tokio).
194//! - **lazy_store**: enables creation of async and non-async `Crawler`s for later usage or interfacing with other crates, but not running them so tokio/rayon do not need to be compiled[^lazy_store_redundancy].
195//!
196//! # Planned Features
197//! - **chili**: [`chili`](https://crates.io/crates/chili) as an optional backend (instead of [`rayon`](https://crates.io/crates/rayon), [GitHub issue](https://github.com/HQ2000-Rust/Custom-File-Crawler/issues/1))
198//!
199//! # Panics
200//! In general - especially with the focus on the [`Crawler`][crate::builder::Crawler]'s laziness - it is desirable to have as many potential panics at creation, not at runtime (in terms of calling run on the `Crawler`).
201//! Panics can (for example) occur when setting the regex to an invalid string, this may be changed in the future. So, if the creation of the `Crawler` succeeds, running will most likely *not* cause a panic.
202//!
203//!
//! [^async_disclaimer]: Currently, the async version demands a tokio runtime with at least 2 threads. Running it in a single threaded runtime is theoretically possible, but causes indefinite execution, so running it on a single-threaded (`current_thread`) runtime **won't work**.
205//! [^lazy_store_redundancy]: Not necessary if both the **parallel** and **async** feature are enabled.
206//! [^laziness_explanation]: [lazy evaluation](https://en.wikipedia.org/wiki/Lazy_evaluation).
207//! [^regex_compile_disclaimer]: one exception is setting a regex because it is compiled on setting it to emit an early panic.
208
209pub mod prelude;
210/// Building the crawler via the builder pattern, only way as of now
211pub mod builder {
212 use crate::builder::{
213 context::NoContext,
214 internal::{config::Config, utils::box_err},
215 };
216 #[cfg(any(feature = "parallel", feature= "lazy_store", doc))]
217 use crate::builder::{internal::par_run, marker::NonAsync};
218 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
219 use crate::builder::{
220 internal::{async_run, utils::Execute},
221 marker::Async,
222 };
223
224 #[cfg(any(feature = "async", doc))]
225 use std::future::Future;
226 use std::{error::Error, marker::Send};
227 use regex::Regex;
228 use std::{
229 fmt::Debug,
230 marker::PhantomData,
231 path::{Path, PathBuf},
232 sync::Arc,
233 };
234 //to not get a compile error for docs
235 #[cfg(any(feature = "async", doc))]
236 use tokio::sync::mpsc::error::TryRecvError;
237
238 ///"Asyncness" markers. They do nothing.
239 pub mod marker {
240 #[cfg(any(feature = "parallel", feature = "lazy_store",doc))]
241 #[derive(Default, Copy, Clone, Debug)]
242 pub struct NonAsync;
243 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
244 #[derive(Default, Copy, Clone, Debug)]
245 pub struct Async;
246 }
247 /// [`NoContext`] placeholder
248 pub mod context {
249 #[derive(Debug, Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
250 pub struct NoContext;
251 }
252
253 #[derive(Default, Clone, Debug)]
254 enum StartDir {
255 #[default]
256 Current,
257 Custom(PathBuf),
258 }
259
260 ///The core of this library.
261 /// Create one with [`Crawler::new`][crate::builder::Crawler::new] or [`Crawler::new_async`][crate::builder::Crawler::new_async]
262 /// to get started. Also see the [examples][crate#Examples].
263 #[derive(Debug, Clone, Default)]
264 pub struct Crawler<A, C> {
265 start_dir: StartDir,
266 file_regex: Option<Regex>,
267 folder_regex: Option<Regex>,
268 max_depth: Option<u32>,
269 context: C,
270 async_marker: PhantomData<A>,
271 }
272
273
274 #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
275 impl Crawler<NonAsync, NoContext> {
276 ///Create a new non-async, parallel [`Crawler`] without any context
277 /// ```rust
278 /// # fn main() {
279 /// # use file_crawler::builder::Crawler;
280 /// let parallel_crawler=Crawler::new();
281 /// # }
282 /// ```
283 pub fn new() -> Self {
284 //if there are diverging attributes for the Sync/Async versions later
285 Self { ..Self::default() }
286 }
287 }
288
289
290
291
292
293 impl<C> Crawler<NonAsync, C>
294 where
295 C: Send + Sync,
296 {
297 ///Sets the directory the crawler should start in. Default is the current directory, resolved when
298 ///[`Crawler::run`][crate::builder::Crawler::run] is called, if that fails, it panics before doing anything.
299 ///```rust,ignore
300 /// # fn main() -> Result<(),Box<dyn Error>> {
301 /// # use file_crawler::builder::Crawler;
302 /// use std::collections::HashSet;
303 /// use std::path::PathBuf;
304 /// use std::sync::Mutex;
305 ///
306 /// //Assuming that the content of C:\foo isn't changing during execution
307 /// //and this program is executed in that same folder
308 /// let crawler_1_result =
309 /// Crawler::new()
310 /// .start_dir("C:\\foo")
311 /// .context(Mutex::new(Vec::new()))
312 /// .run(|ctx: Mutex<Vec<String>>, path| {
313 /// ctx.lock().unwrap().insert(path.display());
314 /// });
315 /// let crawler_2_result =
316 /// Crawler::new()
317 /// .context(Mutex::new(Vec::new()))
318 /// .run(|ctx: Mutex<HashSet<String>>, path| {
319 /// ctx.lock().unwrap().insert(path.display());
320 /// })?;
321 ///
322 /// //then (not guaranteed when using a Vec instead of a HashSet by design)
323 /// assert_eq!(crawler_1_result, crawler_2_result);
324 /// # }
325 /// ```
326 #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
327 pub fn start_dir<P: AsRef<Path>>(self, path: P) -> Self {
328 self.start_dir_(path.as_ref())
329 }
330 ///Only applies the closure in [`run`][crate::builder::Crawler::run] to a file if the given regex matches
331 /// ```rust,ignore
332 /// # fn main() -> Result<(), Box<dyn Error>>{
333 /// # use file_crawler::builder::Crawler;
334 ///
335 /// //prints all text files in the current directory (and all its subfolders)
336 /// Crawler::new()
337 /// .file_regex(r"^.*\.txt$")
338 /// .run(|_, path| {
339 /// println!("{}", path.display());
340 /// })?;
341 /// # }
342 /// ```
343 #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
344 pub fn file_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
345 self.file_regex_(regex.as_ref())
346 }
347 ///Only go into a folder if matches the given regex (meaning all files and subfolders etc. will not be traversed)
348 /// ```rust,ignore
349 /// //given this folder structure:
350 /// //foo
351 /// // |--bar
352 /// // | |--foo.txt
353 /// // |--foobar
354 /// // | |---barbar
355 /// // | |---baz.txt
356 /// // |--foo
357 /// // |--baz.txt
358 /// # fn main() -> Result<(), Box<dyn Error>>{
359 /// # use file_crawler::builder::Crawler;
360 ///
361 /// //this prints *only* baz.txt because the regex matches "foo", but not "bar" or "barbar" AND "foobar"
362 /// Crawler::new()
363 /// .start_dir("path\\to\\foo")
364 /// .folder_regex("foo")
365 /// .run(|_, path| {
366 /// println!("{}", path.display());
367 /// })?;
368 /// # }
369 /// ```
370 #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
371 pub fn folder_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
372 self.folder_regex_(regex.as_ref())
373 }
374 ///How deep (in terms of folder layers over each other) the [`Crawler`] should go
375 /// ```rust
376 /// # fn main() -> Result<(), Box<dyn Error>>{
377 /// # use file_crawler::builder::Crawler;
378 ///
379 /// //prints all text files in the current directory, but not its subfolders
380 /// Crawler::new()
381 /// //exchanging the 0 with a 1 mean that it also traverses the subfolders, but not their subfolders
382 /// .search_depth(0)
383 /// .file_regex(r"^.*\.txt$")
384 /// .run(|_, path| {
385 /// println!("{}", path.display());
386 /// })?;
387 /// # }
388 /// ```
389 #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
390 pub fn search_depth(self, depth: u32) -> Self {
391 self.search_depth_(depth)
392 }
393 ///Adds a context ( = a value that is passed to the closure on every invocation via an [`Arc`]) with the type `CNEW`.
394 /// It is returned from the [`run`][crate::builder::Crawler::run] function after execution.
395 /// Defaults to the zero-sized [`NoContext`].
396 ///```rust
397 /// # fn main() -> Result<(), Box<dyn Error>>{
398 /// # use file_crawler::builder::Crawler;
399 /// use std::sync::atomic::AtomicU16;
400 ///
401 /// //bind the context to a variable
402 /// let result =
403 /// Crawler::new()
404 /// //adds a counter (for everything not representable with Atomics, a Mutex is recommended)
405 /// .context(AtomicU16::new(0))
406 /// .run(|_, path| {
407 /// println!("{}", path.display());
408 /// })?;
409 /// println!("{} files in the current directory")
410 /// # }
411 /// ```
412 #[cfg(any(feature = "parallel", feature = "lazy_store", doc))]
413 pub fn context<CNEW: Send + Sync>(self, context: CNEW) -> Crawler<NonAsync, CNEW> {
414 self.context_(context)
415 }
416
417 ///Runs the (modified) Crawler returned from [`Crawler::new`][crate::builder::Crawler::new], execution a closure that's passed
418 ///a [`Context`][crate::builder::Crawler::context] and the path of the file for every file in the specified directory. For exceptions,
419 ///see [`search_depth`][crate::builder::Crawler::search_depth], [`file_regex`][crate::builder::Crawler::file_regex] and [`folder_regex`][crate::builder::Crawler::folder_regex].
420 /// ```rust,ignore
421 /// # fn main() -> Result<Box<dyn Error>> {
422 /// use file_crawler::prelude::*;
423 ///
424 /// use std::path::PathBuf;
425 ///
426 /// Crawler::new()
427 /// .start_dir("C\\user\\foo")
428 /// .file_regex(r"^.*\.txt$")
429 /// .search_depth(3)
430 /// .run(|_, path| {
431 /// println!("{}", path.display());
432 /// Ok(())
433 /// })?;
434 /// # }
435 /// ```
436 #[cfg(any(feature = "parallel", doc))]
437 pub fn run<A, E>(self, action: A) -> Result<C, Box<dyn Error + Send + 'static>>
438 where
439 A: FnMut(Arc<C>, PathBuf) -> Result<(), E> + Clone + Send + Sync,
440 E: Error + Send + 'static,
441 {
442 let start_dir = match self.start_dir {
443 StartDir::Custom(path) => path,
444 StartDir::Current => std::env::current_dir().map_err(box_err)?,
445 };
446
447 let result = par_run::<A, E, C>(
448 action,
449 Config {
450 start_dir,
451 file_regex: self.file_regex,
452 folder_regex: self.folder_regex,
453 max_depth: self.max_depth,
454 context: Arc::new(self.context),
455 },
456 )?;
457 Ok(Arc::into_inner(result).expect("Every other Arc should have been dropped by now"))
458 }
459 }
460
461 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
462 impl Crawler<Async, NoContext> {
463 ///Create a new async (parallel)[^async_disclaimer] [`Crawler`] without any context
464 /// ```rust
465 /// # #[tokio::main]
466 /// # async fn main() {
467 /// # use file_crawler::builder::Crawler;
468 /// let async_crawler=Crawler::new_async();
469 /// # }
470 /// ```
471 pub fn new_async() -> Self {
472 //same as above
473 Self { ..Self::default() }
474 }
475 }
476
477 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
478 impl<C> Crawler<Async, C>
479 where
480 C: Send + Sync + 'static,
481 {
482 /// See [`Crawler::search_depth`][crate::builder::Crawler::start_dir].
483 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
484 pub fn start_dir<P: AsRef<Path>>(self, path: P) -> Self {
485 self.start_dir_(path.as_ref())
486 }
487 /// See [`Crawler::file_regex`][crate::builder::Crawler::file_regex].
488 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
489 pub fn file_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
490 self.file_regex_(regex.as_ref())
491 }
492 /// See [`Crawler::folder_regex`][crate::builder::Crawler::folder_regex].
493 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
494 pub fn folder_regex<STR: AsRef<str>>(self, regex: STR) -> Self {
495 self.folder_regex_(regex.as_ref())
496 }
497 /// See [`Crawler::search_depth`][crate::builder::Crawler::search_depth].
498 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
499 pub fn search_depth(self, depth: u32) -> Self {
500 self.search_depth_(depth)
501 }
502 /// See [`Crawler::context`][crate::builder::Crawler::context].
503 #[cfg(any(feature = "async", feature = "lazy_store", doc))]
504 pub fn context<CNEW: Send + Sync + 'static>(self, context: CNEW) -> Crawler<Async, CNEW> {
505 self.context_(context)
506 }
507 ///Runs a (modified) asynchronous file crawler from [`Crawler::new_async`][crate::builder::Crawler::new_async] using [`tokio`](https://crates.io/crates/tokio).
508 ///Requires an at least two-threaded runtime ([3][crate#fnref3]).
509 ///Otherwise, the same as the synchronous version. It is recommended to use the exposed `tokio` (through the `prelude`) dependency instead of `std` when possible.
510 /// ```rust,ignore
511 /// # fn main() -> Result<Box<dyn Error>> {
512 /// use file_crawler::prelude::*;
513 ///
514 /// use std::path::PathBuf;
515 ///
516 /// Crawler::new()
517 /// .start_dir("C\\user\\foo")
518 /// .file_regex(r"^.*\.txt$")
519 /// .run(|_, path| {
520 /// let contents=String::new();
521 /// let file=tokio::fs::File::open(&path).await?;
522 /// file.read_to_string(&mut contents).await?;
523 /// println!("{}:\n{}", path.display(), contents);
524 /// Ok(())
525 /// })?;
526 /// # }
527 #[cfg(any(feature = "async", doc))]
528 pub async fn run<Fun, Fut, E>(
529 self,
530 action: Fun,
531 ) -> Result<C, Box<dyn Error + Send + 'static>>
532 where
533 E: Send + Error + 'static,
534 Fun: Fn(Arc<C>, PathBuf) -> Fut + Send + 'static + Clone,
535 Fut: Future<Output = Result<(), E>> + Send + 'static,
536 {
537 let (task_authority_tx, mut task_authority_rx) =
538 tokio::sync::mpsc::unbounded_channel::<Execute<E>>();
539
540 let task_authority = tokio::task::spawn_blocking(
541 async move || -> Result<(), Box<dyn Error + Send + 'static>> {
542 let mut recursion_tasks = tokio::task::JoinSet::new();
543 let mut action_tasks = tokio::task::JoinSet::new();
544
545 loop {
546 match task_authority_rx.try_recv() {
547 Ok(signal) => match signal {
548 Execute::Recursion(task) => drop(recursion_tasks.spawn(task)),
549 Execute::Action(task) => drop(action_tasks.spawn(task)),
550 },
551 Err(e) => match e {
552 TryRecvError::Disconnected => {
553 unreachable!("Senders shouldn't be dropped by now");
554 }
555 //fall-through
556 TryRecvError::Empty => {}
557 },
558 };
559 match (recursion_tasks.is_empty(), action_tasks.is_empty()) {
560 (true, true) => break,
561 (rec, act) => {
562 if !rec {
563 if let Some(result) = recursion_tasks.try_join_next() {
564 //unwrap to propagate panics
565 result.unwrap().map_err(box_err)?;
566 }
567 }
568 if !act {
569 if let Some(result) = action_tasks.try_join_next() {
570 //unwrap to propagate panics
571 result.unwrap().map_err(box_err)?;
572 }
573 }
574 }
575 }
576 }
577
578 Ok::<(), Box<dyn Error + Send + 'static>>(())
579 },
580 );
581
582 let start_dir = match self.start_dir {
583 StartDir::Custom(path) => path,
584 StartDir::Current => std::env::current_dir().map_err(box_err)?,
585 };
586
587 let config = Config {
588 //this is the start of the invariant get_custom_dir() relies on through the whole execution
589 start_dir,
590 context: Arc::new(self.context),
591 max_depth: self.max_depth,
592 folder_regex: self.folder_regex,
593 file_regex: self.file_regex,
594 };
595
596 task_authority_tx
597 .send(Execute::Recursion(async_run(
598 task_authority_tx.clone(),
599 action,
600 Config {
601 context: Arc::clone(&config.context),
602 ..config
603 },
604 )))
605 .expect("The Reveiver should not have been dropped by now");
606
607 task_authority.await.unwrap().await?;
608
609 Ok(Arc::into_inner(config.context)
610 .expect("Every other clone should have been dropped by now"))
611 }
612 }
613
614 pub(in crate::builder) mod internal {
615 pub(crate) mod utils {
616
617 use std::error::Error;
618
619 pub(crate) fn box_err(
620 error: impl Error + Send + 'static,
621 ) -> Box<dyn Error + Send + 'static> {
622 Box::new(error)
623 }
624
625 #[cfg(any(feature = "async", doc))]
626 use std::pin::Pin;
627 #[cfg(any(feature = "async", doc))]
628 pub(in crate::builder) enum Execute<E> {
629 Recursion(Pin<Box<dyn Future<Output = Result<(), std::io::Error>> + Send>>),
630 Action(Pin<Box<dyn Future<Output = Result<(), E>> + Send>>),
631 }
632 }
633 pub(in crate::builder) mod config {
634 use crate::builder::{Crawler, StartDir};
635 use std::path::PathBuf;
636 use std::sync::Arc;
637
638 //I could add C: ?Sized, but that would make no difference because in the Crawler C: Sized...
639 #[derive(Debug, Clone)]
640 pub(in crate::builder) struct Config<C> {
641 pub(in crate::builder) start_dir: PathBuf,
642 pub(in crate::builder) file_regex: Option<regex::Regex>,
643 pub(in crate::builder) folder_regex: Option<regex::Regex>,
644 pub(in crate::builder) max_depth: Option<u32>,
645
646 pub(in crate::builder) context: Arc<C>,
647 }
648 impl<A, C: 'static> From<Crawler<A, C>> for Config<C> {
649 fn from(value: Crawler<A, C>) -> Self {
650 Self {
651 start_dir: match value.start_dir {
652 StartDir::Custom(path) => path,
653 StartDir::Current => unreachable!("Ensure that this isn't the case"),
654 }
655 .to_path_buf(),
656 context: Arc::new(value.context),
657 file_regex: value.file_regex,
658 folder_regex: value.folder_regex,
659 max_depth: value.max_depth,
660 }
661 }
662 }
663 }
664 //this impl imposes minimal trait bounds for C, so I can have custom ones for async/non_async
665
666 //this prevents error with trait bounds when trying to run stored crawlers
667
668 //could make a feature gate here, but it makes no sense to disable both async and parallel
669 impl<A, C> Crawler<A, C> {
670 pub(in crate::builder) fn start_dir_(self, path: &Path) -> Self {
671 Crawler {
672 start_dir: StartDir::Custom(path.to_path_buf()),
673 ..self
674 }
675 }
676 pub(in crate::builder) fn file_regex_(self, regex: &str) -> Self {
677 Self {
678 file_regex: match Regex::new(regex) {
679 Ok(re) => Some(re),
680 Err(e) => panic!("Error compiling file regex: {}", e),
681 },
682 ..self
683 }
684 }
685 pub(in crate::builder) fn folder_regex_(self, regex: &str) -> Self {
686 Self {
687 folder_regex: match Regex::new(regex) {
688 Ok(re) => Some(re),
689 Err(e) => panic!("Error compiling folder regex: {}", e),
690 },
691 ..self
692 }
693 }
694 pub(in crate::builder) fn search_depth_(self, depth: u32) -> Self {
695 Self {
696 max_depth: Some(depth),
697 ..self
698 }
699 }
700 pub(in crate::builder) fn context_<CNEW>(self, context: CNEW) -> Crawler<A, CNEW> {
701 //sadly this is necessary because of the different types...
702 Crawler::<A, CNEW> {
703 context,
704 start_dir: self.start_dir,
705 file_regex: self.file_regex,
706 folder_regex: self.folder_regex,
707 max_depth: self.max_depth,
708 async_marker: self.async_marker,
709 }
710 }
711 }
712 pub(super) mod regex {
713 use crate::builder::internal::config::Config;
714
715 impl<C> Config<C> {
716 pub(in crate::builder) fn validate_folder_regex(&self, str: &str) -> bool {
717 self.folder_regex
718 .as_ref()
719 .map_or(true, |regex| regex.is_match(str))
720 }
721 pub(in crate::builder) fn validate_file_regex(&self, str: &str) -> bool {
722 self.file_regex
723 .as_ref()
724 .map_or(true, |regex| regex.is_match(str))
725 }
726 }
727 }
728 #[cfg(any(feature = "parallel", doc))]
729 pub(in crate::builder) fn par_run<A, E, C>(
730 action: A,
731 config: Config<C>, //1
732 ) -> Result<Arc<C>, Box<dyn Error + Send + 'static>>
733 where
734 A: FnMut(Arc<C>, PathBuf) -> Result<(), E> + Clone + Send + Sync,
735 E: Error + Send + 'static,
736 C: Send + Sync,
737 {
738 use rayon::prelude::*;
739 //'?' doesn't work here (because of the non-trivial trait bound conversions)
740 let entries = std::fs::read_dir(&config.start_dir).map_err(box_err)?;
741 //could optimize that later with .filter()
742 entries.into_iter().par_bridge().try_for_each(|result| {
743 let path = result.map_err(box_err)?.path();
744 if path.is_dir() && !matches!(config.max_depth, Some(0)) {
745 if config.validate_folder_regex(&path.to_string_lossy()) {
746 let config = Config {
747 start_dir: path,
748 max_depth: config.max_depth.and_then(|depth| Some(depth - 1)),
749 folder_regex: config.folder_regex.clone(),
750 file_regex: config.file_regex.clone(),
751 context: Arc::clone(&config.context),
752 };
753 par_run(action.clone(), config)?;
754 }
755 } else {
756 if config.validate_file_regex(&path.to_string_lossy()) {
757 action.clone()(config.context.clone(), path).map_err(box_err)?;
758 }
759 }
760 //just to be sure
761 Ok::<(), Box<dyn Error + Send>>(())
762 })?;
763 Ok(config.context)
764 }
765
766 #[cfg(any(feature = "async", doc))]
767 use crate::builder::internal::utils::Execute;
768 use crate::builder::{
769 Crawler, StartDir,
770 internal::{config::Config, utils::box_err},
771 };
772 use ::regex::Regex;
773 use std::{
774 error::Error,
775 path::{Path, PathBuf},
776 sync::Arc,
777 };
778
779 #[cfg(any(feature = "async", doc))]
780 use std::pin::Pin;
781 #[cfg(any(feature = "async", doc))]
782 use tokio::sync::mpsc::UnboundedSender;
783
784 #[cfg(any(feature = "async", doc))]
785 pub(in crate::builder) fn async_run<Fun, Fut, E, C>(
786 authority_sender: UnboundedSender<Execute<E>>,
787 action: Fun,
788 config: Config<C>,
789 ) -> Pin<Box<dyn Future<Output = Result<(), std::io::Error>> + Send>>
790 where
791 E: Send + 'static + Error,
792 Fun: Fn(Arc<C>, PathBuf) -> Fut + Send + 'static + Clone,
793 Fut: Future<Output = Result<(), E>> + Send + 'static,
794 C: Send + Sync + 'static,
795 {
796 Box::pin(async move {
797 //here, the Custom(_) invariant is important
798 let mut entries = tokio::fs::read_dir(&config.start_dir).await?;
799
800 loop {
801 if let Some(entry) = entries.next_entry().await? {
802 let path = entry.path();
803 if path.is_dir() && !matches!(config.max_depth, Some(0)) {
804 if config.validate_folder_regex(&path.to_string_lossy()) {
805 let config = Config {
806 start_dir: path,
807 max_depth: config.max_depth.and_then(|depth| Some(depth - 1)),
808 context: Arc::clone(&config.context),
809 file_regex: config.file_regex.clone(),
810 folder_regex: config.folder_regex.clone(),
811 };
812 authority_sender
813 .send(Execute::Recursion(Box::pin(
814 async_run::<Fun, Fut, E, C>(
815 authority_sender.clone(),
816 action.clone(),
817 config,
818 ),
819 )))
820 .expect("The Receiver should not have been dropped by now");
821 }
822 } else {
823 if config.validate_file_regex(&path.to_string_lossy()) {
824 authority_sender
825 .send(Execute::Action(Box::pin(action.clone()(
826 Arc::clone(&config.context),
827 path,
828 ))))
829 .expect("The Receiver should not be dropped by now");
830 }
831 }
832 //saving the else branch
833 continue;
834 }
835 break Ok(());
836 }
837 })
838 }
839 }
840}