// mago_database/loader.rs

1use std::borrow::Cow;
2use std::collections::HashSet;
3use std::ffi::OsString;
4use std::path::PathBuf;
5
6use globset::Glob;
7use globset::GlobSet;
8use globset::GlobSetBuilder;
9use rayon::iter::IntoParallelIterator;
10use rayon::iter::ParallelIterator;
11use walkdir::WalkDir;
12
13use crate::Database;
14use crate::error::DatabaseError;
15use crate::exclusion::Exclusion;
16use crate::file::File;
17use crate::file::FileType;
18use crate::utils::read_file;
19
/// Configures and builds a `Database` by scanning the filesystem and memory.
pub struct DatabaseLoader {
    // Pre-existing database to extend; `load` creates a fresh one when `None`.
    database: Option<Database>,
    // Workspace root; passed to `read_file` — presumably used to derive logical
    // file names relative to it (see `add_memory_source` docs). TODO confirm.
    workspace: PathBuf,
    // Root paths scanned for `FileType::Host` files.
    paths: Vec<PathBuf>,
    // Root paths scanned for `FileType::Vendored` files.
    includes: Vec<PathBuf>,
    // Exclusions applied while scanning: concrete paths and glob patterns.
    excludes: Vec<Exclusion>,
    // In-memory sources registered via `add_memory_source`: (name, contents, type).
    memory_sources: Vec<(&'static str, &'static str, FileType)>,
    // Accepted file extensions, matched against `Path::extension` during the scan.
    extensions: Vec<String>,
}
30
31impl DatabaseLoader {
32    /// Creates a new loader with the given configuration.
33    #[allow(clippy::too_many_arguments)]
34    pub fn new(
35        workspace: PathBuf,
36        paths: Vec<PathBuf>,
37        includes: Vec<PathBuf>,
38        excludes: Vec<Exclusion>,
39        extensions: Vec<String>,
40    ) -> Self {
41        Self { workspace, paths, includes, excludes, memory_sources: vec![], extensions, database: None }
42    }
43
44    /// Adds a memory source to the loader.
45    ///
46    /// This allows you to include files that are not on the filesystem but should be part of the database.
47    ///
48    /// # Arguments
49    ///
50    /// * `name` - The logical name of the file, typically its path relative to the workspace.
51    /// * `contents` - The contents of the file as a string.
52    /// * `file_type` - The type of the file, indicating whether it's a host file or a vendored file.
53    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
54        self.memory_sources.push((name, contents, file_type));
55    }
56
57    /// Scans sources according to the configuration and builds a `Database`.
58    ///
59    /// This is the main entry point that orchestrates the entire loading process.
60    /// It returns a `Result` as some pre-processing, like compiling globs, can fail.
61    pub fn load(mut self) -> Result<Database, DatabaseError> {
62        let mut db = if let Some(existing_db) = self.database.take() { existing_db } else { Database::new() };
63
64        let extensions_set: HashSet<OsString> = self.extensions.iter().map(OsString::from).collect();
65
66        let mut glob_builder = GlobSetBuilder::new();
67        for ex in &self.excludes {
68            if let Exclusion::Pattern(pat) = ex {
69                glob_builder.add(Glob::new(pat)?);
70            }
71        }
72
73        let glob_excludes = glob_builder.build()?;
74        let host_files = self.load_paths(&self.paths, FileType::Host, &extensions_set, &glob_excludes)?;
75        let vendored_files = self.load_paths(&self.includes, FileType::Vendored, &extensions_set, &glob_excludes)?;
76
77        for file in host_files.into_iter().chain(vendored_files.into_iter()) {
78            db.add(file);
79        }
80
81        for (name, contents, file_type) in self.memory_sources {
82            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
83
84            db.add(file);
85        }
86
87        Ok(db)
88    }
89
90    /// Discovers and reads all files from a set of root paths in parallel.
91    fn load_paths(
92        &self,
93        roots: &[PathBuf],
94        file_type: FileType,
95        extensions: &HashSet<OsString>,
96        glob_excludes: &GlobSet,
97    ) -> Result<Vec<File>, DatabaseError> {
98        // 2. Discover all file paths first. This part is still synchronous and fast.
99        let path_excludes: HashSet<_> = self
100            .excludes
101            .iter()
102            .filter_map(|ex| match ex {
103                Exclusion::Path(p) => p.canonicalize().ok(),
104                _ => None,
105            })
106            .collect();
107
108        let mut paths_to_process = Vec::new();
109        for root in roots {
110            for entry in WalkDir::new(root).into_iter().filter_map(Result::ok) {
111                if entry.file_type().is_file() {
112                    paths_to_process.push(entry.into_path());
113                }
114            }
115        }
116
117        // 3. Use a parallel iterator to process all discovered paths.
118        let files: Vec<File> = paths_to_process
119            .into_par_iter() // This is the magic from rayon!
120            .filter_map(|path| {
121                // Apply filters in parallel
122                if glob_excludes.is_match(&path) {
123                    return None;
124                }
125                if let Ok(p) = path.canonicalize()
126                    && path_excludes.contains(&p)
127                {
128                    return None;
129                }
130                if let Some(ext) = path.extension() {
131                    if !extensions.contains(ext) {
132                        return None;
133                    }
134                } else {
135                    return None;
136                }
137
138                // Read the file. `read_file` is a blocking operation, but since it's
139                // running in a limited number of threads, it's efficient.
140                match read_file(&self.workspace, &path, file_type) {
141                    Ok(file) => Some(Ok(file)),
142                    Err(e) => Some(Err(e)),
143                }
144            })
145            .collect::<Result<Vec<File>, _>>()?; // Collect results, propagating any errors.
146
147        Ok(files)
148    }
149}