Skip to main content

mago_database/
loader.rs

1//! Database loader for scanning and loading project files.
2
3use std::borrow::Cow;
4use std::ffi::OsString;
5use std::path::Path;
6
7use foldhash::HashMap;
8use foldhash::HashSet;
9use globset::Glob;
10use globset::GlobSet;
11use globset::GlobSetBuilder;
12use rayon::prelude::*;
13use walkdir::WalkDir;
14
15use crate::Database;
16use crate::DatabaseConfiguration;
17use crate::error::DatabaseError;
18use crate::exclusion::Exclusion;
19use crate::file::File;
20use crate::file::FileId;
21use crate::file::FileType;
22use crate::utils::read_file;
23
/// Holds a file along with the specificity of the pattern that matched it.
///
/// Specificity is used to resolve conflicts when a file matches both `paths` and `includes`.
/// Higher specificity values indicate more specific matches (e.g., exact file paths have higher
/// specificity than directory patterns).
#[derive(Debug)]
struct FileWithSpecificity {
    /// The file that was read from disk.
    file: File,
    /// Score of the root path or glob pattern that produced this file, as computed by
    /// `DatabaseLoader::calculate_pattern_specificity`.
    specificity: usize,
}
34
/// Builder for loading files into a Database from the filesystem and memory.
pub struct DatabaseLoader<'a> {
    /// Optional pre-existing database to add files to; when `None`, `load` creates a new one.
    database: Option<Database<'a>>,
    /// Configuration driving the scan (workspace, paths, includes, excludes, extensions).
    configuration: DatabaseConfiguration<'a>,
    /// In-memory sources as `(name, contents, file type)` tuples, added after the disk files.
    memory_sources: Vec<(&'static str, &'static str, FileType)>,
}
41
42impl<'a> DatabaseLoader<'a> {
43    #[must_use]
44    pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
45        Self { configuration, memory_sources: vec![], database: None }
46    }
47
48    #[must_use]
49    pub fn with_database(mut self, database: Database<'a>) -> Self {
50        self.database = Some(database);
51        self
52    }
53
54    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
55        self.memory_sources.push((name, contents, file_type));
56    }
57
58    /// Loads files from disk into the database.
59    ///
60    /// # Errors
61    ///
62    /// Returns a [`DatabaseError`] if:
63    /// - A glob pattern is invalid
64    /// - File system operations fail (reading directories, files)
65    /// - File content cannot be read as valid UTF-8
66    pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
67        let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
68
69        // Update database configuration to use the loader's configuration
70        // (fixes workspace path when merging with prelude database)
71        db.configuration = self.configuration.clone();
72
73        let extensions_set: HashSet<OsString> =
74            self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
75
76        let mut glob_builder = GlobSetBuilder::new();
77        for ex in &self.configuration.excludes {
78            if let Exclusion::Pattern(pat) = ex {
79                glob_builder.add(Glob::new(pat)?);
80            }
81        }
82
83        let glob_excludes = glob_builder.build()?;
84
85        let path_excludes: HashSet<_> = self
86            .configuration
87            .excludes
88            .iter()
89            .filter_map(|ex| match ex {
90                Exclusion::Path(p) => Some(p),
91                _ => None,
92            })
93            .collect();
94
95        let host_files_with_spec = self.load_paths(
96            &self.configuration.paths,
97            FileType::Host,
98            &extensions_set,
99            &glob_excludes,
100            &path_excludes,
101        )?;
102        let vendored_files_with_spec = self.load_paths(
103            &self.configuration.includes,
104            FileType::Vendored,
105            &extensions_set,
106            &glob_excludes,
107            &path_excludes,
108        )?;
109
110        let mut all_files: HashMap<FileId, File> = HashMap::default();
111        let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
112
113        // Process host files (from paths)
114        for file_with_spec in host_files_with_spec {
115            let file_id = file_with_spec.file.id;
116            let specificity = file_with_spec.specificity;
117
118            all_files.insert(file_id, file_with_spec.file);
119            file_decisions.insert(file_id, (FileType::Host, specificity));
120        }
121
122        for file_with_spec in vendored_files_with_spec {
123            let file_id = file_with_spec.file.id;
124            let vendored_specificity = file_with_spec.specificity;
125
126            all_files.entry(file_id).or_insert(file_with_spec.file);
127
128            match file_decisions.get(&file_id) {
129                Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
130                    // Keep Host
131                }
132                _ => {
133                    file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
134                }
135            }
136        }
137
138        for (file_id, (final_type, _)) in file_decisions {
139            if let Some(mut file) = all_files.remove(&file_id) {
140                file.file_type = final_type;
141                db.add(file);
142            }
143        }
144
145        for (name, contents, file_type) in self.memory_sources {
146            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
147
148            db.add(file);
149        }
150
151        Ok(db)
152    }
153
154    /// Discovers and reads all files from a set of root paths or glob patterns in parallel.
155    ///
156    /// Supports both:
157    /// - Directory paths (e.g., "src", "tests") - recursively walks all files
158    /// - Glob patterns (e.g., "src/**/*.php", "tests/Unit/*Test.php") - matches files using glob syntax
159    ///
160    /// Returns files along with their pattern specificity for conflict resolution.
161    fn load_paths(
162        &self,
163        roots: &[Cow<'a, str>],
164        file_type: FileType,
165        extensions: &HashSet<OsString>,
166        glob_excludes: &GlobSet,
167        path_excludes: &HashSet<&Cow<'a, Path>>,
168    ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
169        let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
170
171        for root in roots {
172            // Check if this is a glob pattern (contains glob metacharacters)
173            let is_glob_pattern = root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{');
174
175            let specificity = Self::calculate_pattern_specificity(root.as_ref());
176            if is_glob_pattern {
177                // Handle as glob pattern
178                let pattern = if Path::new(root.as_ref()).is_absolute() {
179                    root.to_string()
180                } else {
181                    // Make relative patterns absolute by prepending workspace
182                    self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
183                };
184
185                match glob::glob(&pattern) {
186                    Ok(entries) => {
187                        for entry in entries {
188                            match entry {
189                                Ok(path) => {
190                                    if path.is_file() {
191                                        paths_to_process.push((path, specificity));
192                                    }
193                                }
194                                Err(e) => {
195                                    tracing::warn!("Failed to read glob entry: {}", e);
196                                }
197                            }
198                        }
199                    }
200                    Err(e) => {
201                        return Err(DatabaseError::Glob(e.to_string()));
202                    }
203                }
204            } else {
205                // Handle as directory path (existing logic)
206                let dir_path = if Path::new(root.as_ref()).is_absolute() {
207                    Path::new(root.as_ref()).to_path_buf()
208                } else {
209                    self.configuration.workspace.join(root.as_ref())
210                };
211
212                for entry in WalkDir::new(&dir_path).into_iter().filter_map(Result::ok) {
213                    if entry.file_type().is_file() {
214                        paths_to_process.push((entry.into_path(), specificity));
215                    }
216                }
217            }
218        }
219
220        let has_path_excludes = !path_excludes.is_empty();
221        let files: Vec<FileWithSpecificity> = paths_to_process
222            .into_par_iter()
223            .filter_map(|(path, specificity)| {
224                if glob_excludes.is_match(&path) {
225                    return None;
226                }
227
228                let ext = path.extension()?;
229                if !extensions.contains(ext) {
230                    return None;
231                }
232
233                if has_path_excludes
234                    && let Ok(canonical_path) = path.canonicalize()
235                    && path_excludes.iter().any(|excluded| canonical_path.starts_with(excluded))
236                {
237                    return None;
238                }
239
240                match read_file(self.configuration.workspace.as_ref(), &path, file_type) {
241                    Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
242                    Err(e) => Some(Err(e)),
243                }
244            })
245            .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
246
247        Ok(files)
248    }
249
250    /// Calculates how specific a pattern is for a given file path.
251    ///
252    /// Examples:
253    ///
254    /// - "src/b.php" matching src/b.php: ~2000 (exact file, 2 components)
255    /// - "src/" matching src/b.php: ~100 (directory, 1 component)
256    /// - "src" matching src/b.php: ~100 (directory, 1 component)
257    fn calculate_pattern_specificity(pattern: &str) -> usize {
258        let pattern_path = Path::new(pattern);
259
260        let component_count = pattern_path.components().count();
261        let is_glob = pattern.contains('*') || pattern.contains('?') || pattern.contains('[') || pattern.contains('{');
262
263        if is_glob {
264            let non_wildcard_components = pattern_path
265                .components()
266                .filter(|c| {
267                    let s = c.as_os_str().to_string_lossy();
268                    !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
269                })
270                .count();
271            non_wildcard_components * 10
272        } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
273            component_count * 1000
274        } else {
275            component_count * 100
276        }
277    }
278}
279
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that only considers `php` files and has no
    /// excludes. Patterns are written with `/` and converted to the platform separator.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        let native =
            |pattern: &str| -> Cow<'static, str> { Cow::Owned(pattern.replace('/', std::path::MAIN_SEPARATOR_STR)) };

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(native).collect(),
            includes: includes.into_iter().map(native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
        }
    }

    /// Writes `content` to `relative_path` inside `temp_dir`, creating parent directories.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }
        std::fs::write(target, content).unwrap();
    }

    /// Loads a database for `temp_dir` using the given `paths` and `includes`.
    fn load_database(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> Database<'static> {
        DatabaseLoader::new(create_test_config(temp_dir, paths, includes)).load().unwrap()
    }

    /// Returns the type of the first file whose name contains `name_fragment`.
    fn file_type_of(db: &Database<'_>, name_fragment: &str) -> FileType {
        db.files().find(|f| f.name.contains(name_fragment)).unwrap().file_type
    }

    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");
        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");
        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");
        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let shallow_spec = DatabaseLoader::calculate_pattern_specificity("src/");
        let deep_spec = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");
        assert!(deep_spec > shallow_spec, "Deeper path should have higher specificity");
    }

    #[test]
    fn test_exact_file_vs_directory() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/b.php", "<?php");
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/"]);

        assert_eq!(
            file_type_of(&db, "b.php"),
            FileType::Host,
            "src/b.php should be Host (exact file beats directory)"
        );
        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/foo/bar.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/foo/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "bar.php"), FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/b.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/*.php"]);

        assert_eq!(file_type_of(&db, "b.php"), FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let temp_dir = TempDir::new().unwrap();
        for relative_path in
            ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"]
        {
            create_test_file(&temp_dir, relative_path, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);

        let b_file = db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap();
        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        assert_eq!(file_type_of(&db, "d.php"), FileType::Vendored, "src/c/d.php should be Vendored");
        assert_eq!(file_type_of(&db, "lib1.php"), FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec![]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "vendor/lib.php", "<?php");

        let db = load_database(&temp_dir, vec![], vec!["vendor/"]);

        assert_eq!(file_type_of(&db, "lib.php"), FileType::Vendored, "File only in includes should be Vendored");
    }
}
445}