mago_database/
loader.rs

1//! Database loader for scanning and loading project files.
2
3use std::borrow::Cow;
4use std::ffi::OsString;
5use std::path::Path;
6
7use ahash::HashMap;
8use ahash::HashSet;
9use globset::Glob;
10use globset::GlobSet;
11use globset::GlobSetBuilder;
12use rayon::prelude::*;
13use walkdir::WalkDir;
14
15use crate::Database;
16use crate::DatabaseConfiguration;
17use crate::error::DatabaseError;
18use crate::exclusion::Exclusion;
19use crate::file::File;
20use crate::file::FileId;
21use crate::file::FileType;
22use crate::utils::read_file;
23
24/// Holds a file along with the specificity of the pattern that matched it.
25///
26/// Specificity is used to resolve conflicts when a file matches both `paths` and `includes`.
27/// Higher specificity values indicate more specific matches (e.g., exact file paths have higher
28/// specificity than directory patterns).
29#[derive(Debug)]
30struct FileWithSpecificity {
31    file: File,
32    specificity: usize,
33}
34
35/// Builder for loading files into a Database from the filesystem and memory.
36pub struct DatabaseLoader<'a> {
37    database: Option<Database<'a>>,
38    configuration: DatabaseConfiguration<'a>,
39    memory_sources: Vec<(&'static str, &'static str, FileType)>,
40}
41
42impl<'a> DatabaseLoader<'a> {
43    pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
44        Self { configuration, memory_sources: vec![], database: None }
45    }
46
47    pub fn with_database(mut self, database: Database<'a>) -> Self {
48        self.database = Some(database);
49        self
50    }
51
52    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
53        self.memory_sources.push((name, contents, file_type));
54    }
55
56    pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
57        let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
58
59        // Update database configuration to use the loader's configuration
60        // (fixes workspace path when merging with prelude database)
61        db.configuration = self.configuration.clone();
62
63        let extensions_set: HashSet<OsString> =
64            self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
65
66        let mut glob_builder = GlobSetBuilder::new();
67        for ex in &self.configuration.excludes {
68            if let Exclusion::Pattern(pat) = ex {
69                glob_builder.add(Glob::new(pat)?);
70            }
71        }
72
73        let glob_excludes = glob_builder.build()?;
74
75        let path_excludes: HashSet<_> = self
76            .configuration
77            .excludes
78            .iter()
79            .filter_map(|ex| match ex {
80                Exclusion::Path(p) => Some(p),
81                _ => None,
82            })
83            .collect();
84
85        let host_files_with_spec = self.load_paths(
86            &self.configuration.paths,
87            FileType::Host,
88            &extensions_set,
89            &glob_excludes,
90            &path_excludes,
91        )?;
92        let vendored_files_with_spec = self.load_paths(
93            &self.configuration.includes,
94            FileType::Vendored,
95            &extensions_set,
96            &glob_excludes,
97            &path_excludes,
98        )?;
99
100        let mut all_files: HashMap<FileId, File> = HashMap::default();
101        let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
102
103        // Process host files (from paths)
104        for file_with_spec in host_files_with_spec {
105            let file_id = file_with_spec.file.id;
106            let specificity = file_with_spec.specificity;
107
108            all_files.insert(file_id, file_with_spec.file);
109            file_decisions.insert(file_id, (FileType::Host, specificity));
110        }
111
112        for file_with_spec in vendored_files_with_spec {
113            let file_id = file_with_spec.file.id;
114            let vendored_specificity = file_with_spec.specificity;
115
116            all_files.entry(file_id).or_insert(file_with_spec.file);
117
118            match file_decisions.get(&file_id) {
119                Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
120                    // Keep Host
121                }
122                _ => {
123                    file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
124                }
125            }
126        }
127
128        for (file_id, (final_type, _)) in file_decisions {
129            if let Some(mut file) = all_files.remove(&file_id) {
130                file.file_type = final_type;
131                db.add(file);
132            }
133        }
134
135        for (name, contents, file_type) in self.memory_sources {
136            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
137
138            db.add(file);
139        }
140
141        Ok(db)
142    }
143
144    /// Discovers and reads all files from a set of root paths or glob patterns in parallel.
145    ///
146    /// Supports both:
147    /// - Directory paths (e.g., "src", "tests") - recursively walks all files
148    /// - Glob patterns (e.g., "src/**/*.php", "tests/Unit/*Test.php") - matches files using glob syntax
149    ///
150    /// Returns files along with their pattern specificity for conflict resolution.
151    fn load_paths(
152        &self,
153        roots: &[Cow<'a, str>],
154        file_type: FileType,
155        extensions: &HashSet<OsString>,
156        glob_excludes: &GlobSet,
157        path_excludes: &HashSet<&Cow<'a, Path>>,
158    ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
159        let mut paths_to_process: Vec<(std::path::PathBuf, String, usize)> = Vec::new();
160
161        for root in roots {
162            // Check if this is a glob pattern (contains glob metacharacters)
163            let is_glob_pattern = root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{');
164
165            let specificity = Self::calculate_pattern_specificity(root.as_ref());
166            if is_glob_pattern {
167                // Handle as glob pattern
168                let pattern = if Path::new(root.as_ref()).is_absolute() {
169                    root.to_string()
170                } else {
171                    // Make relative patterns absolute by prepending workspace
172                    self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
173                };
174
175                match glob::glob(&pattern) {
176                    Ok(entries) => {
177                        for entry in entries {
178                            match entry {
179                                Ok(path) => {
180                                    if path.is_file() {
181                                        paths_to_process.push((path, root.to_string(), specificity));
182                                    }
183                                }
184                                Err(e) => {
185                                    tracing::warn!("Failed to read glob entry: {}", e);
186                                }
187                            }
188                        }
189                    }
190                    Err(e) => {
191                        return Err(DatabaseError::Glob(e.to_string()));
192                    }
193                }
194            } else {
195                // Handle as directory path (existing logic)
196                let dir_path = if Path::new(root.as_ref()).is_absolute() {
197                    Path::new(root.as_ref()).to_path_buf()
198                } else {
199                    self.configuration.workspace.join(root.as_ref())
200                };
201
202                for entry in WalkDir::new(&dir_path).into_iter().filter_map(Result::ok) {
203                    if entry.file_type().is_file() {
204                        paths_to_process.push((entry.into_path(), root.to_string(), specificity));
205                    }
206                }
207            }
208        }
209
210        let files: Vec<FileWithSpecificity> = paths_to_process
211            .into_par_iter()
212            .filter_map(|(path, _pattern, specificity)| {
213                if glob_excludes.is_match(&path) {
214                    return None;
215                }
216
217                if let Ok(canonical_path) = path.canonicalize()
218                    && path_excludes.iter().any(|excluded| canonical_path.starts_with(excluded))
219                {
220                    return None;
221                }
222
223                if let Some(ext) = path.extension() {
224                    if !extensions.contains(ext) {
225                        return None;
226                    }
227                } else {
228                    return None;
229                }
230
231                match read_file(self.configuration.workspace.as_ref(), &path, file_type) {
232                    Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
233                    Err(e) => Some(Err(e)),
234                }
235            })
236            .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
237
238        Ok(files)
239    }
240
241    /// Calculates how specific a pattern is for a given file path.
242    ///
243    /// Examples:
244    ///
245    /// - "src/b.php" matching src/b.php: ~2000 (exact file, 2 components)
246    /// - "src/" matching src/b.php: ~100 (directory, 1 component)
247    /// - "src" matching src/b.php: ~100 (directory, 1 component)
248    fn calculate_pattern_specificity(pattern: &str) -> usize {
249        let pattern_path = Path::new(pattern);
250
251        let component_count = pattern_path.components().count();
252        let is_glob = pattern.contains('*') || pattern.contains('?') || pattern.contains('[') || pattern.contains('{');
253
254        if is_glob {
255            let non_wildcard_components = pattern_path
256                .components()
257                .filter(|c| {
258                    let s = c.as_os_str().to_string_lossy();
259                    !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
260                })
261                .count();
262            non_wildcard_components * 10
263        } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
264            component_count * 1000
265        } else {
266            component_count * 100
267        }
268    }
269}
270
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that only considers `.php` files.
    ///
    /// Entries are written with `/` in the tests and converted to the platform separator here.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        fn to_native(entry: &str) -> Cow<'static, str> {
            Cow::Owned(entry.replace('/', std::path::MAIN_SEPARATOR_STR))
        }

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(to_native).collect(),
            includes: includes.into_iter().map(to_native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
        }
    }

    /// Writes `content` to `relative_path` inside `temp_dir`, creating parent directories.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }

        std::fs::write(target, content).unwrap();
    }

    /// Runs a loader over `temp_dir` with the given `paths` and `includes`.
    fn load_database(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> Database<'static> {
        DatabaseLoader::new(create_test_config(temp_dir, paths, includes)).load().unwrap()
    }

    /// Returns the type of the first loaded file whose name contains `needle`.
    fn file_type_of(db: &Database<'static>, needle: &str) -> FileType {
        db.files().find(|f| f.name.contains(needle)).unwrap().file_type
    }

    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");

        assert!(spec >= 2000, "Exact file should have high specificity, got {}", spec);
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {}", spec);
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");

        assert!(spec < 100, "Glob pattern should have low specificity, got {}", spec);
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let deep_spec = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");
        let shallow_spec = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!(deep_spec > shallow_spec, "Deeper path should have higher specificity");
    }

    #[test]
    fn test_exact_file_vs_directory() {
        let temp_dir = TempDir::new().unwrap();
        for name in ["src/b.php", "src/a.php"] {
            create_test_file(&temp_dir, name, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/"]);

        assert_eq!(
            file_type_of(&db, "b.php"),
            FileType::Host,
            "src/b.php should be Host (exact file beats directory)"
        );
        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/foo/bar.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/foo/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "bar.php"), FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/b.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/*.php"]);

        assert_eq!(file_type_of(&db, "b.php"), FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let temp_dir = TempDir::new().unwrap();
        for name in ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"] {
            create_test_file(&temp_dir, name, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);

        let b_file = db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap();
        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        assert_eq!(file_type_of(&db, "d.php"), FileType::Vendored, "src/c/d.php should be Vendored");
        assert_eq!(file_type_of(&db, "lib1.php"), FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec![]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "vendor/lib.php", "<?php");

        let db = load_database(&temp_dir, vec![], vec!["vendor/"]);

        assert_eq!(file_type_of(&db, "lib.php"), FileType::Vendored, "File only in includes should be Vendored");
    }
}