mago_database/
loader.rs

1//! Database loader for scanning and loading project files.
2
3use std::borrow::Cow;
4use std::ffi::OsString;
5use std::path::Path;
6
7use ahash::HashMap;
8use ahash::HashSet;
9use globset::Glob;
10use globset::GlobSet;
11use globset::GlobSetBuilder;
12use rayon::prelude::*;
13use walkdir::WalkDir;
14
15use crate::Database;
16use crate::DatabaseConfiguration;
17use crate::error::DatabaseError;
18use crate::exclusion::Exclusion;
19use crate::file::File;
20use crate::file::FileId;
21use crate::file::FileType;
22use crate::utils::read_file;
23
/// Holds a file along with the specificity of the pattern that matched it.
///
/// Specificity is used to resolve conflicts when a file matches both `paths` and `includes`.
/// Higher specificity values indicate more specific matches (e.g., exact file paths have higher
/// specificity than directory patterns).
#[derive(Debug)]
struct FileWithSpecificity {
    /// The file that was read from disk.
    file: File,
    /// Score of the configured root pattern through which `file` was discovered;
    /// a larger value means a more specific pattern.
    specificity: usize,
}
34
/// Builder for loading files into a Database from the filesystem and memory.
pub struct DatabaseLoader<'a> {
    /// Optional pre-existing database to add files to; when `None`, `load` creates a new one
    /// from `configuration`.
    database: Option<Database<'a>>,
    /// Configuration consulted by `load` (workspace, `paths`, `includes`, `excludes`, `extensions`).
    configuration: DatabaseConfiguration<'a>,
    /// In-memory sources as `(name, contents, file_type)` tuples; `load` adds them to the
    /// database after the files read from disk.
    memory_sources: Vec<(&'static str, &'static str, FileType)>,
}
41
42impl<'a> DatabaseLoader<'a> {
43    #[must_use]
44    pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
45        Self { configuration, memory_sources: vec![], database: None }
46    }
47
48    #[must_use]
49    pub fn with_database(mut self, database: Database<'a>) -> Self {
50        self.database = Some(database);
51        self
52    }
53
54    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
55        self.memory_sources.push((name, contents, file_type));
56    }
57
58    /// Loads files from disk into the database.
59    ///
60    /// # Errors
61    ///
62    /// Returns a [`DatabaseError`] if:
63    /// - A glob pattern is invalid
64    /// - File system operations fail (reading directories, files)
65    /// - File content cannot be read as valid UTF-8
66    pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
67        let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
68
69        // Update database configuration to use the loader's configuration
70        // (fixes workspace path when merging with prelude database)
71        db.configuration = self.configuration.clone();
72
73        let extensions_set: HashSet<OsString> =
74            self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
75
76        let mut glob_builder = GlobSetBuilder::new();
77        for ex in &self.configuration.excludes {
78            if let Exclusion::Pattern(pat) = ex {
79                glob_builder.add(Glob::new(pat)?);
80            }
81        }
82
83        let glob_excludes = glob_builder.build()?;
84
85        let path_excludes: HashSet<_> = self
86            .configuration
87            .excludes
88            .iter()
89            .filter_map(|ex| match ex {
90                Exclusion::Path(p) => Some(p),
91                _ => None,
92            })
93            .collect();
94
95        let host_files_with_spec = self.load_paths(
96            &self.configuration.paths,
97            FileType::Host,
98            &extensions_set,
99            &glob_excludes,
100            &path_excludes,
101        )?;
102        let vendored_files_with_spec = self.load_paths(
103            &self.configuration.includes,
104            FileType::Vendored,
105            &extensions_set,
106            &glob_excludes,
107            &path_excludes,
108        )?;
109
110        let mut all_files: HashMap<FileId, File> = HashMap::default();
111        let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
112
113        // Process host files (from paths)
114        for file_with_spec in host_files_with_spec {
115            let file_id = file_with_spec.file.id;
116            let specificity = file_with_spec.specificity;
117
118            all_files.insert(file_id, file_with_spec.file);
119            file_decisions.insert(file_id, (FileType::Host, specificity));
120        }
121
122        for file_with_spec in vendored_files_with_spec {
123            let file_id = file_with_spec.file.id;
124            let vendored_specificity = file_with_spec.specificity;
125
126            all_files.entry(file_id).or_insert(file_with_spec.file);
127
128            match file_decisions.get(&file_id) {
129                Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
130                    // Keep Host
131                }
132                _ => {
133                    file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
134                }
135            }
136        }
137
138        for (file_id, (final_type, _)) in file_decisions {
139            if let Some(mut file) = all_files.remove(&file_id) {
140                file.file_type = final_type;
141                db.add(file);
142            }
143        }
144
145        for (name, contents, file_type) in self.memory_sources {
146            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
147
148            db.add(file);
149        }
150
151        Ok(db)
152    }
153
154    /// Discovers and reads all files from a set of root paths or glob patterns in parallel.
155    ///
156    /// Supports both:
157    /// - Directory paths (e.g., "src", "tests") - recursively walks all files
158    /// - Glob patterns (e.g., "src/**/*.php", "tests/Unit/*Test.php") - matches files using glob syntax
159    ///
160    /// Returns files along with their pattern specificity for conflict resolution.
161    fn load_paths(
162        &self,
163        roots: &[Cow<'a, str>],
164        file_type: FileType,
165        extensions: &HashSet<OsString>,
166        glob_excludes: &GlobSet,
167        path_excludes: &HashSet<&Cow<'a, Path>>,
168    ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
169        let mut paths_to_process: Vec<(std::path::PathBuf, String, usize)> = Vec::new();
170
171        for root in roots {
172            // Check if this is a glob pattern (contains glob metacharacters)
173            let is_glob_pattern = root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{');
174
175            let specificity = Self::calculate_pattern_specificity(root.as_ref());
176            if is_glob_pattern {
177                // Handle as glob pattern
178                let pattern = if Path::new(root.as_ref()).is_absolute() {
179                    root.to_string()
180                } else {
181                    // Make relative patterns absolute by prepending workspace
182                    self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
183                };
184
185                match glob::glob(&pattern) {
186                    Ok(entries) => {
187                        for entry in entries {
188                            match entry {
189                                Ok(path) => {
190                                    if path.is_file() {
191                                        paths_to_process.push((path, root.to_string(), specificity));
192                                    }
193                                }
194                                Err(e) => {
195                                    tracing::warn!("Failed to read glob entry: {}", e);
196                                }
197                            }
198                        }
199                    }
200                    Err(e) => {
201                        return Err(DatabaseError::Glob(e.to_string()));
202                    }
203                }
204            } else {
205                // Handle as directory path (existing logic)
206                let dir_path = if Path::new(root.as_ref()).is_absolute() {
207                    Path::new(root.as_ref()).to_path_buf()
208                } else {
209                    self.configuration.workspace.join(root.as_ref())
210                };
211
212                for entry in WalkDir::new(&dir_path).into_iter().filter_map(Result::ok) {
213                    if entry.file_type().is_file() {
214                        paths_to_process.push((entry.into_path(), root.to_string(), specificity));
215                    }
216                }
217            }
218        }
219
220        let files: Vec<FileWithSpecificity> = paths_to_process
221            .into_par_iter()
222            .filter_map(|(path, _pattern, specificity)| {
223                if glob_excludes.is_match(&path) {
224                    return None;
225                }
226
227                if let Ok(canonical_path) = path.canonicalize()
228                    && path_excludes.iter().any(|excluded| canonical_path.starts_with(excluded))
229                {
230                    return None;
231                }
232
233                if let Some(ext) = path.extension() {
234                    if !extensions.contains(ext) {
235                        return None;
236                    }
237                } else {
238                    return None;
239                }
240
241                match read_file(self.configuration.workspace.as_ref(), &path, file_type) {
242                    Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
243                    Err(e) => Some(Err(e)),
244                }
245            })
246            .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
247
248        Ok(files)
249    }
250
251    /// Calculates how specific a pattern is for a given file path.
252    ///
253    /// Examples:
254    ///
255    /// - "src/b.php" matching src/b.php: ~2000 (exact file, 2 components)
256    /// - "src/" matching src/b.php: ~100 (directory, 1 component)
257    /// - "src" matching src/b.php: ~100 (directory, 1 component)
258    fn calculate_pattern_specificity(pattern: &str) -> usize {
259        let pattern_path = Path::new(pattern);
260
261        let component_count = pattern_path.components().count();
262        let is_glob = pattern.contains('*') || pattern.contains('?') || pattern.contains('[') || pattern.contains('{');
263
264        if is_glob {
265            let non_wildcard_components = pattern_path
266                .components()
267                .filter(|c| {
268                    let s = c.as_os_str().to_string_lossy();
269                    !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
270                })
271                .count();
272            non_wildcard_components * 10
273        } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
274            component_count * 1000
275        } else {
276            component_count * 100
277        }
278    }
279}
280
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that only accepts `php` files and has no
    /// exclusions, using the given `paths` and `includes`.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        // Tests spell patterns with `/`; convert them to the platform-specific separator.
        fn to_native(pattern: &str) -> Cow<'static, str> {
            Cow::Owned(pattern.replace('/', std::path::MAIN_SEPARATOR_STR))
        }

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(to_native).collect(),
            includes: includes.into_iter().map(to_native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
        }
    }

    /// Writes `content` to `relative_path` inside `temp_dir`, creating parent directories.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }
        std::fs::write(target, content).unwrap();
    }

    /// Loads a database from `temp_dir` using the given `paths` and `includes`.
    fn load_database(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> Database<'static> {
        DatabaseLoader::new(create_test_config(temp_dir, paths, includes)).load().unwrap()
    }

    /// Asserts that the first file whose name contains `name_fragment` has type `expected`.
    fn assert_file_type(db: &Database<'static>, name_fragment: &str, expected: FileType, message: &str) {
        let file = db.files().find(|f| f.name.contains(name_fragment)).unwrap();
        assert_eq!(file.file_type, expected, "{message}");
    }

    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");
        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");
        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");
        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let deep_spec = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");
        let shallow_spec = DatabaseLoader::calculate_pattern_specificity("src/");
        assert!(deep_spec > shallow_spec, "Deeper path should have higher specificity");
    }

    #[test]
    fn test_exact_file_vs_directory() {
        let temp_dir = TempDir::new().unwrap();
        for path in ["src/b.php", "src/a.php"] {
            create_test_file(&temp_dir, path, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/"]);

        assert_file_type(&db, "b.php", FileType::Host, "src/b.php should be Host (exact file beats directory)");
        assert_file_type(&db, "a.php", FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/foo/bar.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/foo/"], vec!["src/"]);

        assert_file_type(&db, "bar.php", FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/b.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/*.php"]);

        assert_file_type(&db, "b.php", FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec!["src/"]);

        assert_file_type(&db, "a.php", FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let temp_dir = TempDir::new().unwrap();
        for path in ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"] {
            create_test_file(&temp_dir, path, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);

        let b_file = db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap();
        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        assert_file_type(&db, "d.php", FileType::Vendored, "src/c/d.php should be Vendored");
        assert_file_type(&db, "lib1.php", FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec![]);

        assert_file_type(&db, "a.php", FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "vendor/lib.php", "<?php");

        let db = load_database(&temp_dir, vec![], vec!["vendor/"]);

        assert_file_type(&db, "lib.php", FileType::Vendored, "File only in includes should be Vendored");
    }
}