// mago_database/loader.rs

//! Database loader for scanning and loading project files.
2
3use std::borrow::Cow;
4use std::collections::hash_map::Entry;
5use std::ffi::OsString;
6use std::path::Path;
7
8use foldhash::HashMap;
9use foldhash::HashSet;
10use globset::Glob;
11use globset::GlobSet;
12use globset::GlobSetBuilder;
13use rayon::prelude::*;
14use walkdir::WalkDir;
15
16use crate::Database;
17use crate::DatabaseConfiguration;
18use crate::error::DatabaseError;
19use crate::exclusion::Exclusion;
20use crate::file::File;
21use crate::file::FileId;
22use crate::file::FileType;
23use crate::utils::read_file;
24
25/// Holds a file along with the specificity of the pattern that matched it.
26///
27/// Specificity is used to resolve conflicts when a file matches both `paths` and `includes`.
28/// Higher specificity values indicate more specific matches (e.g., exact file paths have higher
29/// specificity than directory patterns).
30#[derive(Debug)]
31struct FileWithSpecificity {
32    file: File,
33    specificity: usize,
34}
35
36/// Builder for loading files into a Database from the filesystem and memory.
37pub struct DatabaseLoader<'a> {
38    database: Option<Database<'a>>,
39    configuration: DatabaseConfiguration<'a>,
40    memory_sources: Vec<(&'static str, &'static str, FileType)>,
41    /// When set, content for this file (by logical name) is taken from here instead of disk.
42    /// Used for editor integrations: read content from stdin but use the given path for baseline and reporting.
43    stdin_override: Option<(Cow<'a, str>, String)>,
44}
45
46impl<'a> DatabaseLoader<'a> {
47    #[must_use]
48    pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
49        Self { configuration, memory_sources: vec![], database: None, stdin_override: None }
50    }
51
52    #[must_use]
53    pub fn with_database(mut self, database: Database<'a>) -> Self {
54        self.database = Some(database);
55        self
56    }
57
58    /// When set, the file with this logical name (workspace-relative path) will use the given
59    /// content instead of being read from disk. The logical name is used for baseline and reporting.
60    #[must_use]
61    pub fn with_stdin_override(mut self, logical_name: impl Into<Cow<'a, str>>, content: String) -> Self {
62        self.stdin_override = Some((logical_name.into(), content));
63        self
64    }
65
66    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
67        self.memory_sources.push((name, contents, file_type));
68    }
69
70    /// Loads files from disk into the database.
71    ///
72    /// # Errors
73    ///
74    /// Returns a [`DatabaseError`] if:
75    /// - A glob pattern is invalid
76    /// - File system operations fail (reading directories, files)
77    /// - File content cannot be read as valid UTF-8
78    pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
79        let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
80
81        // Update database configuration to use the loader's configuration
82        // (fixes workspace path when merging with prelude database)
83        db.configuration = self.configuration.clone();
84
85        let extensions_set: HashSet<OsString> =
86            self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
87
88        let mut glob_builder = GlobSetBuilder::new();
89        for ex in &self.configuration.excludes {
90            if let Exclusion::Pattern(pat) = ex {
91                glob_builder.add(Glob::new(pat)?);
92            }
93        }
94
95        let glob_excludes = glob_builder.build()?;
96
97        let path_excludes: HashSet<_> = self
98            .configuration
99            .excludes
100            .iter()
101            .filter_map(|ex| match ex {
102                Exclusion::Path(p) => Some(p),
103                _ => None,
104            })
105            .collect();
106
107        let host_files_with_spec = self.load_paths(
108            &self.configuration.paths,
109            FileType::Host,
110            &extensions_set,
111            &glob_excludes,
112            &path_excludes,
113        )?;
114        let vendored_files_with_spec = self.load_paths(
115            &self.configuration.includes,
116            FileType::Vendored,
117            &extensions_set,
118            &glob_excludes,
119            &path_excludes,
120        )?;
121
122        let mut all_files: HashMap<FileId, File> = HashMap::default();
123        let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
124
125        // Process host files (from paths)
126        for file_with_spec in host_files_with_spec {
127            let file_id = file_with_spec.file.id;
128            let specificity = file_with_spec.specificity;
129
130            all_files.insert(file_id, file_with_spec.file);
131            file_decisions.insert(file_id, (FileType::Host, specificity));
132        }
133
134        // When stdin override is set, ensure that the file is in the database
135        // (covers new/unsaved files, not on disk)
136        if let Some((ref name, ref content)) = self.stdin_override {
137            let file = File::ephemeral(Cow::Owned(name.as_ref().to_string()), Cow::Owned(content.clone()));
138            let file_id = file.id;
139            if let Entry::Vacant(e) = all_files.entry(file_id) {
140                e.insert(file);
141
142                file_decisions.insert(file_id, (FileType::Host, usize::MAX));
143            }
144        }
145
146        for file_with_spec in vendored_files_with_spec {
147            let file_id = file_with_spec.file.id;
148            let vendored_specificity = file_with_spec.specificity;
149
150            all_files.entry(file_id).or_insert(file_with_spec.file);
151
152            match file_decisions.get(&file_id) {
153                Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
154                    // Keep Host
155                }
156                _ => {
157                    file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
158                }
159            }
160        }
161
162        for (file_id, (final_type, _)) in file_decisions {
163            if let Some(mut file) = all_files.remove(&file_id) {
164                file.file_type = final_type;
165                db.add(file);
166            }
167        }
168
169        for (name, contents, file_type) in self.memory_sources {
170            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
171
172            db.add(file);
173        }
174
175        Ok(db)
176    }
177
178    /// Discovers and reads all files from a set of root paths or glob patterns in parallel.
179    ///
180    /// Supports both:
181    /// - Directory paths (e.g., "src", "tests") - recursively walks all files
182    /// - Glob patterns (e.g., "src/**/*.php", "tests/Unit/*Test.php") - matches files using glob syntax
183    ///
184    /// Returns files along with their pattern specificity for conflict resolution.
185    fn load_paths(
186        &self,
187        roots: &[Cow<'a, str>],
188        file_type: FileType,
189        extensions: &HashSet<OsString>,
190        glob_excludes: &GlobSet,
191        path_excludes: &HashSet<&Cow<'a, Path>>,
192    ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
193        let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
194
195        for root in roots {
196            // Check if this is a glob pattern (contains glob metacharacters).
197            // First check if it's an actual file/directory on disk. if so, treat it
198            // as a literal path even if the name contains glob metacharacters like `[]`.
199            let resolved_path = if Path::new(root.as_ref()).is_absolute() {
200                Path::new(root.as_ref()).to_path_buf()
201            } else {
202                self.configuration.workspace.join(root.as_ref())
203            };
204
205            let is_glob_pattern = !resolved_path.exists()
206                && (root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{'));
207
208            let specificity = Self::calculate_pattern_specificity(root.as_ref());
209            if is_glob_pattern {
210                // Handle as glob pattern
211                let pattern = if Path::new(root.as_ref()).is_absolute() {
212                    root.to_string()
213                } else {
214                    // Make relative patterns absolute by prepending workspace
215                    self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
216                };
217
218                match glob::glob(&pattern) {
219                    Ok(entries) => {
220                        for entry in entries {
221                            match entry {
222                                Ok(path) => {
223                                    if path.is_file() {
224                                        paths_to_process.push((path, specificity));
225                                    }
226                                }
227                                Err(e) => {
228                                    tracing::warn!("Failed to read glob entry: {}", e);
229                                }
230                            }
231                        }
232                    }
233                    Err(e) => {
234                        return Err(DatabaseError::Glob(e.to_string()));
235                    }
236                }
237            } else {
238                for entry in WalkDir::new(&resolved_path).into_iter().filter_map(Result::ok) {
239                    if entry.file_type().is_file() {
240                        paths_to_process.push((entry.into_path(), specificity));
241                    }
242                }
243            }
244        }
245
246        let has_path_excludes = !path_excludes.is_empty();
247        let files: Vec<FileWithSpecificity> = paths_to_process
248            .into_par_iter()
249            .filter_map(|(path, specificity)| {
250                if glob_excludes.is_match(&path) {
251                    return None;
252                }
253
254                let ext = path.extension()?;
255                if !extensions.contains(ext) {
256                    return None;
257                }
258
259                if has_path_excludes
260                    && let Ok(canonical_path) = path.canonicalize()
261                    && path_excludes.iter().any(|excluded| canonical_path.starts_with(excluded))
262                {
263                    return None;
264                }
265
266                let workspace = self.configuration.workspace.as_ref();
267                #[cfg(windows)]
268                let logical_name = path
269                    .strip_prefix(workspace)
270                    .unwrap_or_else(|_| path.as_path())
271                    .to_string_lossy()
272                    .replace('\\', "/");
273                #[cfg(not(windows))]
274                let logical_name =
275                    path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();
276
277                if let Some((ref override_name, ref override_content)) = self.stdin_override
278                    && override_name.as_ref() == logical_name
279                {
280                    let file = File::new(
281                        Cow::Owned(logical_name),
282                        file_type,
283                        Some(path.clone()),
284                        Cow::Owned(override_content.clone()),
285                    );
286
287                    return Some(Ok(FileWithSpecificity { file, specificity }));
288                }
289
290                match read_file(workspace, &path, file_type) {
291                    Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
292                    Err(e) => Some(Err(e)),
293                }
294            })
295            .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
296
297        Ok(files)
298    }
299
300    /// Calculates how specific a pattern is for a given file path.
301    ///
302    /// Examples:
303    ///
304    /// - "src/b.php" matching src/b.php: ~2000 (exact file, 2 components)
305    /// - "src/" matching src/b.php: ~100 (directory, 1 component)
306    /// - "src" matching src/b.php: ~100 (directory, 1 component)
307    fn calculate_pattern_specificity(pattern: &str) -> usize {
308        let pattern_path = Path::new(pattern);
309
310        let component_count = pattern_path.components().count();
311        let is_glob = pattern.contains('*') || pattern.contains('?') || pattern.contains('[') || pattern.contains('{');
312
313        if is_glob {
314            let non_wildcard_components = pattern_path
315                .components()
316                .filter(|c| {
317                    let s = c.as_os_str().to_string_lossy();
318                    !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
319                })
320                .count();
321            non_wildcard_components * 10
322        } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
323            component_count * 1000
324        } else {
325            component_count * 100
326        }
327    }
328}
329
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that only accepts `php` files and has no
    /// excludes. Roots are given with `/` separators and converted to the platform separator.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        let to_native = |root: &str| -> Cow<'static, str> { Cow::Owned(root.replace('/', std::path::MAIN_SEPARATOR_STR)) };

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(to_native).collect(),
            includes: includes.into_iter().map(to_native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
        }
    }

    /// Writes `content` to `relative_path` inside `temp_dir`, creating parent directories first.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }

        std::fs::write(target, content).unwrap();
    }

    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");

        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");

        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let deep_spec = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");
        let shallow_spec = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!(deep_spec > shallow_spec, "Deeper path should have higher specificity");
    }

    #[test]
    fn test_exact_file_vs_directory() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/b.php", "<?php");
        create_test_file(&workspace, "src/a.php", "<?php");

        // `src/b.php` is named exactly in paths, everything under `src/` is included.
        let db = DatabaseLoader::new(create_test_config(&workspace, vec!["src/b.php"], vec!["src/"])).load().unwrap();

        let exact = db.files().find(|file| file.name.contains("b.php")).unwrap();
        assert_eq!(exact.file_type, FileType::Host, "src/b.php should be Host (exact file beats directory)");

        let sibling = db.files().find(|file| file.name.contains("a.php")).unwrap();
        assert_eq!(sibling.file_type, FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/foo/bar.php", "<?php");

        let db = DatabaseLoader::new(create_test_config(&workspace, vec!["src/foo/"], vec!["src/"])).load().unwrap();

        let nested = db.files().find(|file| file.name.contains("bar.php")).unwrap();
        assert_eq!(nested.file_type, FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/b.php", "<?php");

        let db =
            DatabaseLoader::new(create_test_config(&workspace, vec!["src/b.php"], vec!["src/*.php"])).load().unwrap();

        let exact = db.files().find(|file| file.name.contains("b.php")).unwrap();
        assert_eq!(exact.file_type, FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        // The same root on both sides: the tie goes to includes.
        let db = DatabaseLoader::new(create_test_config(&workspace, vec!["src/"], vec!["src/"])).load().unwrap();

        let tied = db.files().find(|file| file.name.contains("a.php")).unwrap();
        assert_eq!(tied.file_type, FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let workspace = TempDir::new().unwrap();
        for relative_path in
            ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"]
        {
            create_test_file(&workspace, relative_path, "<?php");
        }

        let config = create_test_config(&workspace, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let host =
            db.files().find(|file| file.name.contains("src/b.php") || file.name.ends_with("b.php")).unwrap();
        assert_eq!(host.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        let nested = db.files().find(|file| file.name.contains("d.php")).unwrap();
        assert_eq!(nested.file_type, FileType::Vendored, "src/c/d.php should be Vendored");

        let library = db.files().find(|file| file.name.contains("lib1.php")).unwrap();
        assert_eq!(library.file_type, FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        let db = DatabaseLoader::new(create_test_config(&workspace, vec!["src/"], vec![])).load().unwrap();

        let only_host = db.files().find(|file| file.name.contains("a.php")).unwrap();
        assert_eq!(only_host.file_type, FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "vendor/lib.php", "<?php");

        let db = DatabaseLoader::new(create_test_config(&workspace, vec![], vec!["vendor/"])).load().unwrap();

        let only_vendored = db.files().find(|file| file.name.contains("lib.php")).unwrap();
        assert_eq!(only_vendored.file_type, FileType::Vendored, "File only in includes should be Vendored");
    }

    #[test]
    fn test_stdin_override_replaces_file_content() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/foo.php", "<?php\n// on disk");

        let db = DatabaseLoader::new(create_test_config(&workspace, vec!["src/"], vec![]))
            .with_stdin_override("src/foo.php", "<?php\n// from stdin".to_string())
            .load()
            .unwrap();

        let overridden = db.files().find(|file| file.name.contains("foo.php")).unwrap();
        assert_eq!(
            overridden.contents.as_ref(),
            "<?php\n// from stdin",
            "stdin override content should be used instead of disk"
        );
    }

    #[test]
    fn test_stdin_override_adds_file_when_not_on_disk() {
        let workspace = TempDir::new().unwrap();
        // Only a placeholder exists under `src/`; `src/unsaved.php` is never written to disk.
        create_test_file(&workspace, "src/.gitkeep", "");

        let db = DatabaseLoader::new(create_test_config(&workspace, vec!["src/"], vec![]))
            .with_stdin_override("src/unsaved.php", "<?php\n// unsaved buffer".to_string())
            .load()
            .unwrap();

        let unsaved = db.files().find(|file| file.name.contains("unsaved.php")).unwrap();
        assert_eq!(unsaved.file_type, FileType::Host);
        assert_eq!(unsaved.contents.as_ref(), "<?php\n// unsaved buffer");
    }
}