Skip to main content

mago_database/
loader.rs

1//! Database loader for scanning and loading project files.
2
3use std::borrow::Cow;
4use std::collections::hash_map::Entry;
5use std::ffi::OsString;
6use std::path::Path;
7
8use foldhash::HashMap;
9use foldhash::HashSet;
10use globset::GlobBuilder;
11use globset::GlobSet;
12use globset::GlobSetBuilder;
13use rayon::prelude::*;
14use walkdir::WalkDir;
15
16use crate::Database;
17use crate::DatabaseConfiguration;
18use crate::error::DatabaseError;
19use crate::exclusion::Exclusion;
20use crate::file::File;
21use crate::file::FileId;
22use crate::file::FileType;
23use crate::utils::read_file;
24
25/// Holds a file along with the specificity of the pattern that matched it.
26///
27/// Specificity is used to resolve conflicts when a file matches both `paths` and `includes`.
28/// Higher specificity values indicate more specific matches (e.g., exact file paths have higher
29/// specificity than directory patterns).
30#[derive(Debug)]
31struct FileWithSpecificity {
32    file: File,
33    specificity: usize,
34}
35
36/// Builder for loading files into a Database from the filesystem and memory.
37pub struct DatabaseLoader<'a> {
38    database: Option<Database<'a>>,
39    configuration: DatabaseConfiguration<'a>,
40    memory_sources: Vec<(&'static str, &'static str, FileType)>,
41    /// When set, content for this file (by logical name) is taken from here instead of disk.
42    /// Used for editor integrations: read content from stdin but use the given path for baseline and reporting.
43    stdin_override: Option<(Cow<'a, str>, String)>,
44}
45
46impl<'a> DatabaseLoader<'a> {
47    #[must_use]
48    pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
49        Self { configuration, memory_sources: vec![], database: None, stdin_override: None }
50    }
51
52    #[must_use]
53    pub fn with_database(mut self, database: Database<'a>) -> Self {
54        self.database = Some(database);
55        self
56    }
57
58    /// When set, the file with this logical name (workspace-relative path) will use the given
59    /// content instead of being read from disk. The logical name is used for baseline and reporting.
60    #[must_use]
61    pub fn with_stdin_override(mut self, logical_name: impl Into<Cow<'a, str>>, content: String) -> Self {
62        self.stdin_override = Some((logical_name.into(), content));
63        self
64    }
65
66    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
67        self.memory_sources.push((name, contents, file_type));
68    }
69
70    /// Loads files from disk into the database.
71    ///
72    /// # Errors
73    ///
74    /// Returns a [`DatabaseError`] if:
75    /// - A glob pattern is invalid
76    /// - File system operations fail (reading directories, files)
77    /// - File content cannot be read as valid UTF-8
78    pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
79        let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
80
81        // Update database configuration to use the loader's configuration
82        // (fixes workspace path when merging with prelude database)
83        db.configuration = self.configuration.clone();
84
85        let extensions_set: HashSet<OsString> =
86            self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
87
88        let glob_settings = &self.configuration.glob;
89        let mut glob_builder = GlobSetBuilder::new();
90        for ex in &self.configuration.excludes {
91            if let Exclusion::Pattern(pat) = ex {
92                let glob = GlobBuilder::new(pat)
93                    .case_insensitive(glob_settings.case_insensitive)
94                    .literal_separator(glob_settings.literal_separator)
95                    .backslash_escape(glob_settings.backslash_escape)
96                    .empty_alternates(glob_settings.empty_alternates)
97                    .build()?;
98
99                glob_builder.add(glob);
100            }
101        }
102
103        let glob_excludes = glob_builder.build()?;
104
105        let path_excludes: HashSet<_> = self
106            .configuration
107            .excludes
108            .iter()
109            .filter_map(|ex| match ex {
110                Exclusion::Path(p) => Some(p),
111                _ => None,
112            })
113            .collect();
114
115        let host_files_with_spec = self.load_paths(
116            &self.configuration.paths,
117            FileType::Host,
118            &extensions_set,
119            &glob_excludes,
120            &path_excludes,
121        )?;
122        let vendored_files_with_spec = self.load_paths(
123            &self.configuration.includes,
124            FileType::Vendored,
125            &extensions_set,
126            &glob_excludes,
127            &path_excludes,
128        )?;
129
130        let mut all_files: HashMap<FileId, File> = HashMap::default();
131        let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
132
133        // Process host files (from paths)
134        for file_with_spec in host_files_with_spec {
135            let file_id = file_with_spec.file.id;
136            let specificity = file_with_spec.specificity;
137
138            all_files.insert(file_id, file_with_spec.file);
139            file_decisions.insert(file_id, (FileType::Host, specificity));
140        }
141
142        // When stdin override is set, ensure that the file is in the database
143        // (covers new/unsaved files, not on disk)
144        if let Some((ref name, ref content)) = self.stdin_override {
145            let file = File::ephemeral(Cow::Owned(name.as_ref().to_string()), Cow::Owned(content.clone()));
146            let file_id = file.id;
147            if let Entry::Vacant(e) = all_files.entry(file_id) {
148                e.insert(file);
149
150                file_decisions.insert(file_id, (FileType::Host, usize::MAX));
151            }
152        }
153
154        for file_with_spec in vendored_files_with_spec {
155            let file_id = file_with_spec.file.id;
156            let vendored_specificity = file_with_spec.specificity;
157
158            all_files.entry(file_id).or_insert(file_with_spec.file);
159
160            match file_decisions.get(&file_id) {
161                Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
162                    // Keep Host
163                }
164                _ => {
165                    file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
166                }
167            }
168        }
169
170        db.reserve(file_decisions.len() + self.memory_sources.len());
171
172        for (file_id, (final_type, _)) in file_decisions {
173            if let Some(mut file) = all_files.remove(&file_id) {
174                file.file_type = final_type;
175                db.add(file);
176            }
177        }
178
179        for (name, contents, file_type) in self.memory_sources {
180            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
181
182            db.add(file);
183        }
184
185        Ok(db)
186    }
187
188    /// Discovers and reads all files from a set of root paths or glob patterns in parallel.
189    ///
190    /// Supports both:
191    /// - Directory paths (e.g., "src", "tests") - recursively walks all files
192    /// - Glob patterns (e.g., "src/**/*.php", "tests/Unit/*Test.php") - matches files using glob syntax
193    ///
194    /// Returns files along with their pattern specificity for conflict resolution.
195    fn load_paths(
196        &self,
197        roots: &[Cow<'a, str>],
198        file_type: FileType,
199        extensions: &HashSet<OsString>,
200        glob_excludes: &GlobSet,
201        path_excludes: &HashSet<&Cow<'a, Path>>,
202    ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
203        // Canonicalize the workspace once.  All WalkDir roots are canonicalized
204        // before traversal so their paths inherit the canonical prefix without
205        // any per-file syscalls.
206        let canonical_workspace =
207            self.configuration.workspace.canonicalize().unwrap_or_else(|_| self.configuration.workspace.to_path_buf());
208
209        // Pre-canonicalize path excludes once as strings.  A plain byte-string
210        // prefix check is then sufficient in the parallel section, replacing the
211        // per-file canonicalize() + Path::starts_with (Components iteration).
212        let canonical_excludes: Vec<String> = path_excludes
213            .iter()
214            .filter_map(|ex| {
215                let p = if Path::new(ex.as_ref()).is_absolute() {
216                    ex.as_ref().to_path_buf()
217                } else {
218                    self.configuration.workspace.join(ex.as_ref())
219                };
220
221                p.canonicalize().ok()?.into_os_string().into_string().ok()
222            })
223            .collect();
224
225        let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
226
227        for root in roots {
228            // Check if this is a glob pattern (contains glob metacharacters).
229            // First check if it's an actual file/directory on disk. if so, treat it
230            // as a literal path even if the name contains glob metacharacters like `[]`.
231            let resolved_path = if Path::new(root.as_ref()).is_absolute() {
232                Path::new(root.as_ref()).to_path_buf()
233            } else {
234                self.configuration.workspace.join(root.as_ref())
235            };
236
237            let is_glob_pattern = !resolved_path.exists()
238                && (root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{'));
239
240            let specificity = Self::calculate_pattern_specificity(root.as_ref());
241            if is_glob_pattern {
242                // Handle as glob pattern
243                let pattern = if Path::new(root.as_ref()).is_absolute() {
244                    root.to_string()
245                } else {
246                    // Make relative patterns absolute by prepending workspace
247                    self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
248                };
249
250                match glob::glob(&pattern) {
251                    Ok(entries) => {
252                        for entry in entries {
253                            match entry {
254                                Ok(path) => {
255                                    if path.is_file() {
256                                        // Canonicalize so the path shares the same prefix as
257                                        // `canonical_workspace` (important on macOS where
258                                        // TempDir / glob return /var/… but canonicalize gives
259                                        // /private/var/…).  Fall back to the original on error.
260                                        let canonical = path.canonicalize().unwrap_or(path);
261                                        paths_to_process.push((canonical, specificity));
262                                    }
263                                }
264                                Err(e) => {
265                                    tracing::warn!("Failed to read glob entry: {}", e);
266                                }
267                            }
268                        }
269                    }
270                    Err(e) => {
271                        return Err(DatabaseError::Glob(e.to_string()));
272                    }
273                }
274            } else {
275                // Canonicalize the root once.  WalkDir does not follow symlinks, so
276                // every path it yields under a canonical root is itself canonical.
277                let canonical_root = resolved_path.canonicalize().unwrap_or(resolved_path);
278                for entry in WalkDir::new(&canonical_root).into_iter().filter_map(Result::ok) {
279                    if entry.file_type().is_file() {
280                        paths_to_process.push((entry.into_path(), specificity));
281                    }
282                }
283            }
284        }
285
286        let has_path_excludes = !canonical_excludes.is_empty();
287        let files: Vec<FileWithSpecificity> = paths_to_process
288            .into_par_iter()
289            .filter_map(|(path, specificity)| {
290                if glob_excludes.is_match(&path) {
291                    return None;
292                }
293
294                let ext = path.extension()?;
295                if !extensions.contains(ext) {
296                    return None;
297                }
298
299                if has_path_excludes {
300                    let excluded = path.to_str().is_some_and(|s| {
301                        canonical_excludes.iter().any(|excl| {
302                            s.starts_with(excl.as_str())
303                                && matches!(s.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
304                        })
305                    });
306
307                    if excluded {
308                        return None;
309                    }
310                }
311
312                let workspace = canonical_workspace.as_path();
313                #[cfg(windows)]
314                let logical_name = path
315                    .strip_prefix(workspace)
316                    .unwrap_or_else(|_| path.as_path())
317                    .to_string_lossy()
318                    .replace('\\', "/");
319                #[cfg(not(windows))]
320                let logical_name =
321                    path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();
322
323                if let Some((ref override_name, ref override_content)) = self.stdin_override
324                    && override_name.as_ref() == logical_name
325                {
326                    let file = File::new(
327                        Cow::Owned(logical_name),
328                        file_type,
329                        Some(path.clone()),
330                        Cow::Owned(override_content.clone()),
331                    );
332
333                    return Some(Ok(FileWithSpecificity { file, specificity }));
334                }
335
336                match read_file(workspace, &path, file_type) {
337                    Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
338                    Err(e) => Some(Err(e)),
339                }
340            })
341            .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
342
343        Ok(files)
344    }
345
346    /// Calculates how specific a pattern is for a given file path.
347    ///
348    /// Examples:
349    ///
350    /// - "src/b.php" matching src/b.php: ~2000 (exact file, 2 components)
351    /// - "src/" matching src/b.php: ~100 (directory, 1 component)
352    /// - "src" matching src/b.php: ~100 (directory, 1 component)
353    fn calculate_pattern_specificity(pattern: &str) -> usize {
354        let pattern_path = Path::new(pattern);
355
356        let component_count = pattern_path.components().count();
357        let is_glob = pattern.contains('*') || pattern.contains('?') || pattern.contains('[') || pattern.contains('{');
358
359        if is_glob {
360            let non_wildcard_components = pattern_path
361                .components()
362                .filter(|c| {
363                    let s = c.as_os_str().to_string_lossy();
364                    !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
365                })
366                .count();
367            non_wildcard_components * 10
368        } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
369            component_count * 1000
370        } else {
371            component_count * 100
372        }
373    }
374}
375
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use crate::GlobSettings;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that loads `.php` files from the given
    /// `paths` (host) and `includes` (vendored) roots, with no exclusions.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        // Roots are written with `/` in the tests; convert them to the platform separator.
        fn to_native(raw: &str) -> Cow<'static, str> {
            Cow::Owned(raw.replace('/', std::path::MAIN_SEPARATOR_STR))
        }

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(to_native).collect(),
            includes: includes.into_iter().map(to_native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
            glob: GlobSettings::default(),
        }
    }

    /// Writes `content` to `relative_path` under `temp_dir`, creating parent directories.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }
        std::fs::write(target, content).unwrap();
    }

    /// Loads a database from `temp_dir` using the given `paths` and `includes` roots.
    fn load_database(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> Database<'static> {
        DatabaseLoader::new(create_test_config(temp_dir, paths, includes)).load().unwrap()
    }

    /// Returns the file type of the first loaded file whose name contains `needle`.
    fn file_type_of(db: &Database<'_>, needle: &str) -> FileType {
        let file = db.files().find(|f| f.name.contains(needle)).unwrap();
        file.file_type
    }

    /// An exact file path with two components scores at least 2000.
    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");
        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
    }

    /// A single directory scores between the glob and file ranges.
    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");
        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
    }

    /// A glob pattern scores below any directory.
    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");
        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
    }

    /// Deeper directories score higher than shallower ones.
    #[test]
    fn test_specificity_calculation_deeper_path() {
        let deep_spec = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");
        let shallow_spec = DatabaseLoader::calculate_pattern_specificity("src/");
        assert!(deep_spec > shallow_spec, "Deeper path should have higher specificity");
    }

    /// An exact file in `paths` stays Host even when its directory is in `includes`.
    #[test]
    fn test_exact_file_vs_directory() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/b.php", "<?php");
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/"]);

        assert_eq!(
            file_type_of(&db, "b.php"),
            FileType::Host,
            "src/b.php should be Host (exact file beats directory)"
        );
        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "src/a.php should be Vendored");
    }

    /// A deeper directory in `paths` beats a shallower directory in `includes`.
    #[test]
    fn test_deeper_vs_shallower_directory() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/foo/bar.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/foo/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "bar.php"), FileType::Host, "Deeper directory pattern should win");
    }

    /// An exact file in `paths` beats a glob in `includes`.
    #[test]
    fn test_exact_file_vs_glob() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/b.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/*.php"]);

        assert_eq!(file_type_of(&db, "b.php"), FileType::Host, "Exact file should beat glob pattern");
    }

    /// When both sides match with the same specificity, `includes` wins.
    #[test]
    fn test_equal_specificity_includes_wins() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "Equal specificity: includes should win");
    }

    /// Mixed layout: one exact host file surrounded by vendored directories.
    #[test]
    fn test_complex_scenario_from_bug_report() {
        let temp_dir = TempDir::new().unwrap();
        for relative_path in
            ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"]
        {
            create_test_file(&temp_dir, relative_path, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);

        let b_file = db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap();
        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        assert_eq!(file_type_of(&db, "d.php"), FileType::Vendored, "src/c/d.php should be Vendored");
        assert_eq!(file_type_of(&db, "lib1.php"), FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    /// A file reachable only through `paths` is Host.
    #[test]
    fn test_files_only_in_paths() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec![]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Host, "File only in paths should be Host");
    }

    /// A file reachable only through `includes` is Vendored.
    #[test]
    fn test_files_only_in_includes() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "vendor/lib.php", "<?php");

        let db = load_database(&temp_dir, vec![], vec!["vendor/"]);

        assert_eq!(file_type_of(&db, "lib.php"), FileType::Vendored, "File only in includes should be Vendored");
    }

    /// The stdin override replaces the on-disk content of an existing file.
    #[test]
    fn test_stdin_override_replaces_file_content() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/foo.php", "<?php\n// on disk");

        let db = DatabaseLoader::new(create_test_config(&temp_dir, vec!["src/"], vec![]))
            .with_stdin_override("src/foo.php", "<?php\n// from stdin".to_string())
            .load()
            .unwrap();

        let file = db.files().find(|f| f.name.contains("foo.php")).unwrap();
        assert_eq!(
            file.contents.as_ref(),
            "<?php\n// from stdin",
            "stdin override content should be used instead of disk"
        );
    }

    /// The stdin override adds the file as Host when it does not exist on disk.
    #[test]
    fn test_stdin_override_adds_file_when_not_on_disk() {
        let temp_dir = TempDir::new().unwrap();
        // Only create the directory; src/unsaved.php must not exist on disk.
        create_test_file(&temp_dir, "src/.gitkeep", "");

        let db = DatabaseLoader::new(create_test_config(&temp_dir, vec!["src/"], vec![]))
            .with_stdin_override("src/unsaved.php", "<?php\n// unsaved buffer".to_string())
            .load()
            .unwrap();

        let file = db.files().find(|f| f.name.contains("unsaved.php")).unwrap();
        assert_eq!(file.file_type, FileType::Host);
        assert_eq!(file.contents.as_ref(), "<?php\n// unsaved buffer");
    }
}