Skip to main content

mago_database/
loader.rs

1//! Database loader for scanning and loading project files.
2
3use std::borrow::Cow;
4use std::collections::hash_map::Entry;
5use std::ffi::OsString;
6use std::path::Path;
7use std::path::PathBuf;
8
9use foldhash::HashMap;
10use foldhash::HashSet;
11use globset::GlobSet;
12use rayon::prelude::*;
13use walkdir::WalkDir;
14
15use crate::Database;
16use crate::DatabaseConfiguration;
17use crate::error::DatabaseError;
18use crate::exclusion::Exclusion;
19use crate::file::File;
20use crate::file::FileId;
21use crate::file::FileType;
22use crate::matcher::build_glob_set;
23use crate::utils::bytes_to_os_str;
24use crate::utils::bytes_to_path;
25use crate::utils::bytes_to_string_lossy;
26use crate::utils::read_file;
27
/// Holds a file along with the specificity of the pattern that matched it.
///
/// Specificity is used to resolve conflicts when a file matches both `paths` and `includes`.
/// Higher specificity values indicate more specific matches (e.g., exact file paths have higher
/// specificity than directory patterns).
#[derive(Debug)]
struct FileWithSpecificity {
    /// The file that was discovered and loaded.
    file: File,
    /// Specificity score of the root pattern that discovered `file`; larger means more specific.
    specificity: usize,
}
38
/// Builder for loading files into a Database from the filesystem and memory.
///
/// Configure the loader with the `with_*` / `add_*` methods, then call `load` to consume it.
pub struct DatabaseLoader<'config> {
    /// Existing database to load into; when `None`, `load` creates a new one.
    database: Option<Database<'config>>,
    /// Configuration supplying the workspace, root paths, includes, excludes, and extensions.
    configuration: DatabaseConfiguration<'config>,
    /// In-memory sources as `(name, contents, file type)` triples registered via `add_memory_source`.
    memory_sources: Vec<(&'static [u8], &'static [u8], FileType)>,
    /// Optional `(logical name, content)` pair used instead of the on-disk content of that file.
    stdin_override: Option<(Cow<'config, [u8]>, Vec<u8>)>,
}
46
47impl<'config> DatabaseLoader<'config> {
48    #[inline]
49    #[must_use]
50    pub fn new(configuration: DatabaseConfiguration<'config>) -> Self {
51        Self { configuration, memory_sources: vec![], database: None, stdin_override: None }
52    }
53
54    #[inline]
55    #[must_use]
56    pub fn with_database(mut self, database: Database<'config>) -> Self {
57        self.database = Some(database);
58        self
59    }
60
61    /// When set, the file with this logical name (workspace-relative path) will use the given
62    /// content instead of being read from disk. The logical name is used for baseline and reporting.
63    ///
64    /// `content` is raw bytes: PHP source is binary-safe, so a buffer piped in via `--stdin-input`
65    /// may not be valid UTF-8.
66    #[inline]
67    #[must_use]
68    pub fn with_stdin_override(mut self, logical_name: impl AsRef<[u8]>, content: Vec<u8>) -> Self {
69        self.stdin_override = Some((Cow::Owned(logical_name.as_ref().to_vec()), content));
70        self
71    }
72
73    #[inline]
74    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
75        self.memory_sources.push((name.as_bytes(), contents.as_bytes(), file_type));
76    }
77
    /// Loads files from disk into the database.
    ///
    /// Files discovered through the configured `paths` are loaded as host files and files
    /// discovered through `includes` as vendored files. When the same file is discovered through
    /// both, the host classification is kept only if the host pattern is strictly more specific;
    /// on a tie the file becomes vendored. The stdin override (when set and not excluded) and the
    /// registered memory sources are added as well.
    ///
    /// # Errors
    ///
    /// Returns a [`DatabaseError`] if:
    /// - A glob pattern is invalid
    /// - File system operations fail (reading directories, files)
    /// - A file exceeds the maximum supported size
    #[inline]
    pub fn load(mut self) -> Result<Database<'config>, DatabaseError> {
        // Start from the database supplied via `with_database`, or from a new one.
        let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));

        // Update database configuration to use the loader's configuration
        // (fixes workspace path when merging with prelude database)
        db.configuration = self.configuration.clone();

        // Allowed file extensions, converted once so they can be compared with `Path::extension`.
        let extensions_set: HashSet<OsString> =
            self.configuration.extensions.iter().map(|s| bytes_to_os_str(s.as_ref()).into_owned()).collect();

        // Split the excludes: glob patterns are collected here, literal paths further below.
        let glob_exclude_patterns: Vec<&str> = self
            .configuration
            .excludes
            .iter()
            .filter_map(|ex| match ex {
                Exclusion::Pattern(pat) => Some(pat.as_ref()),
                Exclusion::Path(_) => None,
            })
            .collect();

        let glob_excludes = build_glob_set(glob_exclude_patterns.iter().copied(), self.configuration.glob)?;

        // Exclude globs ending in `/**/*`, `/**` or `/*` are stripped of that suffix to obtain
        // directory-level patterns, which `load_paths` uses to skip whole directories while
        // walking. Patterns that would reduce to nothing or to a bare wildcard are left out.
        let dir_prune_patterns: Vec<&str> = glob_exclude_patterns
            .iter()
            .filter_map(|pat| {
                let stripped =
                    pat.strip_suffix("/**/*").or_else(|| pat.strip_suffix("/**")).or_else(|| pat.strip_suffix("/*"))?;
                if stripped.is_empty() || stripped == "*" || stripped == "**" {
                    return None;
                }
                Some(stripped)
            })
            .collect();

        let dir_prune_globs = build_glob_set(dir_prune_patterns.iter().copied(), self.configuration.glob)?;

        // Literal (non-glob) path excludes.
        let path_excludes: HashSet<_> = self
            .configuration
            .excludes
            .iter()
            .filter_map(|ex| match ex {
                Exclusion::Path(p) => Some(p),
                Exclusion::Pattern(_) => None,
            })
            .collect();

        let host_files_with_spec = self.load_paths(
            &self.configuration.paths,
            FileType::Host,
            &extensions_set,
            &glob_excludes,
            &dir_prune_globs,
            &path_excludes,
        )?;

        let vendored_files_with_spec = self.load_paths(
            &self.configuration.includes,
            FileType::Vendored,
            &extensions_set,
            &glob_excludes,
            &dir_prune_globs,
            &path_excludes,
        )?;

        // `all_files` holds one `File` per id; `file_decisions` records the type currently
        // chosen for that id together with the specificity of the pattern that chose it.
        let mut all_files: HashMap<FileId, File> = HashMap::default();
        let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();

        // Process host files (from paths)
        for file_with_spec in host_files_with_spec {
            let file_id = file_with_spec.file.id;
            let specificity = file_with_spec.specificity;

            all_files.insert(file_id, file_with_spec.file);
            file_decisions.insert(file_id, (FileType::Host, specificity));
        }

        // When stdin override is set, ensure that the file is in the database
        // (covers new/unsaved files, not on disk). Excluded paths are skipped
        // so that editor integrations using `--stdin-input` honor the same
        // exclude rules as a regular filesystem scan.
        if let Some((name, content)) = &self.stdin_override {
            // Resolve the logical name against the workspace; fall back to the non-canonical
            // path when canonicalization fails (e.g. the file does not exist on disk).
            let virtual_path = self.configuration.workspace.join(bytes_to_path(name.as_ref()).as_ref());
            let virtual_path_canonical = virtual_path.canonicalize().unwrap_or_else(|_| virtual_path.clone());
            let virtual_path_str = virtual_path_canonical.to_string_lossy();

            // Glob excludes are tested against both the resolved path and the raw logical name.
            let matched_glob = !glob_excludes.is_empty()
                && (glob_excludes.is_match(virtual_path_canonical.as_path())
                    || glob_excludes.is_match(bytes_to_path(name.as_ref()).as_ref()));

            let matched_path = path_excludes.iter().any(|excl| {
                let canonical = if Path::new(excl.as_ref()).is_absolute() {
                    excl.as_ref().to_path_buf()
                } else {
                    self.configuration.workspace.join(excl.as_ref())
                };
                let canonical = canonical.canonicalize().unwrap_or(canonical);
                let canonical_str = canonical.to_string_lossy();

                // Prefix match that must end on a path-separator boundary (or be an exact
                // match), so that excluding `src/foo` does not also exclude `src/foobar`.
                virtual_path_str.starts_with(canonical_str.as_ref())
                    && matches!(virtual_path_str.as_bytes().get(canonical_str.len()), None | Some(&b'/' | &b'\\'))
            });

            if !matched_glob && !matched_path {
                let file = File::ephemeral(Cow::Owned(name.as_ref().to_vec()), Cow::Owned(content.clone()));
                let file_id = file.id;
                // Only add the ephemeral file when the host scan did not already produce this id.
                if let Entry::Vacant(e) = all_files.entry(file_id) {
                    e.insert(file);

                    // `usize::MAX` ensures no vendored pattern can outrank this entry below.
                    file_decisions.insert(file_id, (FileType::Host, usize::MAX));
                }
            }
        }

        // Process vendored files (from includes): an existing host decision survives only when
        // the host pattern is strictly more specific than the vendored one.
        for file_with_spec in vendored_files_with_spec {
            let file_id = file_with_spec.file.id;
            let vendored_specificity = file_with_spec.specificity;

            all_files.entry(file_id).or_insert(file_with_spec.file);

            match file_decisions.get(&file_id) {
                Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
                    // Keep Host
                }
                _ => {
                    file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
                }
            }
        }

        db.reserve(file_decisions.len() + self.memory_sources.len());

        // Apply the final type to each file and move it into the database.
        for (file_id, (final_type, _)) in file_decisions {
            if let Some(mut file) = all_files.remove(&file_id) {
                file.file_type = final_type;
                db.add(file);
            }
        }

        // Memory sources are added last, borrowing their static name and contents.
        for (name, contents, file_type) in self.memory_sources {
            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));

            db.add(file);
        }

        Ok(db)
    }
232
233    /// Discovers and reads all files from a set of root paths or glob patterns in parallel.
234    ///
235    /// Supports both:
236    /// - Directory paths (e.g., "src", "tests") - recursively walks all files
237    /// - Glob patterns (e.g., "src/**/*.php", "tests/Unit/*Test.php") - matches files using glob syntax
238    ///
239    /// Returns files along with their pattern specificity for conflict resolution.
240    fn load_paths(
241        &self,
242        roots: &[Cow<'config, [u8]>],
243        file_type: FileType,
244        extensions: &HashSet<OsString>,
245        glob_excludes: &GlobSet,
246        dir_prune_globs: &GlobSet,
247        path_excludes: &HashSet<&Cow<'config, Path>>,
248    ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
249        // Canonicalize the workspace once.  All WalkDir roots are canonicalized
250        // before traversal so their paths inherit the canonical prefix without
251        // any per-file syscalls.
252        let canonical_workspace =
253            self.configuration.workspace.canonicalize().unwrap_or_else(|_| self.configuration.workspace.to_path_buf());
254
255        // Pre-canonicalize path excludes once as strings.  A plain byte-string
256        // prefix check is then sufficient in the parallel section, replacing the
257        // per-file canonicalize() + Path::starts_with (Components iteration).
258        let canonical_excludes: Vec<String> = path_excludes
259            .iter()
260            .filter_map(|ex| {
261                let p = if Path::new(ex.as_ref()).is_absolute() {
262                    ex.as_ref().to_path_buf()
263                } else {
264                    self.configuration.workspace.join(ex.as_ref())
265                };
266
267                p.canonicalize().ok()?.into_os_string().into_string().ok()
268            })
269            .collect();
270
271        let workspace_relative_str = |path: &Path| -> String {
272            let rel = path.strip_prefix(canonical_workspace.as_path()).unwrap_or(path);
273            let s = rel.to_string_lossy();
274            #[cfg(windows)]
275            {
276                s.replace('\\', "/")
277            }
278            #[cfg(not(windows))]
279            {
280                s.into_owned()
281            }
282        };
283
284        let mut paths_to_process: Vec<(PathBuf, usize)> = Vec::new();
285
286        for root in roots {
287            // Check if this is a glob pattern (contains glob metacharacters).
288            // First check if it's an actual file/directory on disk. if so, treat it
289            // as a literal path even if the name contains glob metacharacters like `[]`.
290            let root_path = bytes_to_path(root.as_ref());
291            let resolved_path = if root_path.is_absolute() {
292                root_path.as_ref().to_path_buf()
293            } else {
294                self.configuration.workspace.join(root_path.as_ref())
295            };
296
297            let is_glob_pattern = !resolved_path.exists()
298                && (root.contains(&b'*') || root.contains(&b'?') || root.contains(&b'[') || root.contains(&b'{'));
299
300            let specificity = Self::calculate_pattern_specificity(root.as_ref());
301            if is_glob_pattern {
302                // Handle as glob pattern
303                let pattern = if root_path.is_absolute() {
304                    bytes_to_string_lossy(root.as_ref()).into_owned()
305                } else {
306                    // Make relative patterns absolute by prepending workspace
307                    self.configuration.workspace.join(root_path.as_ref()).to_string_lossy().to_string()
308                };
309
310                match glob::glob(&pattern) {
311                    Ok(entries) => {
312                        for entry in entries {
313                            match entry {
314                                Ok(path) => {
315                                    if path.is_file() {
316                                        // Canonicalize so the path shares the same prefix as
317                                        // `canonical_workspace` (important on macOS where
318                                        // TempDir / glob return /var/… but canonicalize gives
319                                        // /private/var/…).  Fall back to the original on error.
320                                        let canonical = path.canonicalize().unwrap_or(path);
321                                        paths_to_process.push((canonical, specificity));
322                                    }
323                                }
324                                Err(e) => {
325                                    tracing::warn!("Failed to read glob entry: {}", e);
326                                }
327                            }
328                        }
329                    }
330                    Err(e) => {
331                        return Err(DatabaseError::Glob(e.to_string()));
332                    }
333                }
334            } else {
335                let canonical_root = resolved_path.canonicalize().unwrap_or(resolved_path);
336                let has_dir_prunes = !dir_prune_globs.is_empty();
337                let has_path_prunes = !canonical_excludes.is_empty();
338                let walker = WalkDir::new(&canonical_root).follow_links(true).into_iter().filter_entry(|entry| {
339                    if entry.depth() == 0 || !entry.file_type().is_dir() {
340                        return true;
341                    }
342
343                    let path = entry.path();
344
345                    if has_path_prunes
346                        && let Some(p) = path.to_str()
347                        && canonical_excludes.iter().any(|excl| {
348                            p.starts_with(excl.as_str())
349                                && matches!(p.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
350                        })
351                    {
352                        return false;
353                    }
354
355                    if has_dir_prunes
356                        && (dir_prune_globs.is_match(path) || dir_prune_globs.is_match(workspace_relative_str(path)))
357                    {
358                        return false;
359                    }
360
361                    true
362                });
363
364                for entry in walker {
365                    match entry {
366                        Ok(entry) => {
367                            if !entry.file_type().is_dir() {
368                                paths_to_process.push((entry.into_path(), specificity));
369                            }
370                        }
371                        Err(err) => {
372                            let path = err.path().unwrap_or(canonical_root.as_path()).display();
373                            if let Some(ancestor) = err.loop_ancestor() {
374                                tracing::warn!(
375                                    "Skipping symlink loop at `{path}`: link cycles back to `{}`.",
376                                    ancestor.display(),
377                                );
378                            } else {
379                                tracing::warn!("Failed to walk `{path}`: {err}. Entry will be skipped.");
380                            }
381                        }
382                    }
383                }
384            }
385        }
386
387        let has_path_excludes = !canonical_excludes.is_empty();
388        let has_glob_excludes = !glob_excludes.is_empty();
389        let files: Vec<FileWithSpecificity> = paths_to_process
390            .into_par_iter()
391            .filter_map(|(path, specificity)| {
392                if has_glob_excludes
393                    && (glob_excludes.is_match(&path) || glob_excludes.is_match(workspace_relative_str(&path)))
394                {
395                    return None;
396                }
397
398                let ext = path.extension()?;
399                if !extensions.contains(ext) {
400                    return None;
401                }
402
403                if has_path_excludes {
404                    let excluded = path.to_str().is_some_and(|s| {
405                        canonical_excludes.iter().any(|excl| {
406                            s.starts_with(excl.as_str())
407                                && matches!(s.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
408                        })
409                    });
410
411                    if excluded {
412                        return None;
413                    }
414                }
415
416                let workspace = canonical_workspace.as_path();
417                #[cfg(windows)]
418                let logical_name =
419                    path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().replace('\\', "/");
420                #[cfg(not(windows))]
421                let logical_name =
422                    path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();
423
424                if let Some((override_name, override_content)) = &self.stdin_override
425                    && override_name.as_ref() == logical_name.as_bytes()
426                {
427                    let file = File::new(
428                        Cow::Owned(logical_name.into_bytes()),
429                        file_type,
430                        Some(path.clone()),
431                        Cow::Owned(override_content.clone()),
432                    );
433
434                    return Some(Ok(FileWithSpecificity { file, specificity }));
435                }
436
437                match read_file(workspace, &path, file_type) {
438                    Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
439                    Err(e) => Some(Err(e)),
440                }
441            })
442            .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
443
444        Ok(files)
445    }
446
447    /// Calculates how specific a pattern is for a given file path.
448    ///
449    /// Examples:
450    ///
451    /// - "src/b.php" matching src/b.php: ~2000 (exact file, 2 components)
452    /// - "src/" matching src/b.php: ~100 (directory, 1 component)
453    /// - "src" matching src/b.php: ~100 (directory, 1 component)
454    fn calculate_pattern_specificity(pattern: &[u8]) -> usize {
455        let pattern_path = bytes_to_path(pattern);
456
457        let component_count = pattern_path.components().count();
458        let is_glob =
459            pattern.contains(&b'*') || pattern.contains(&b'?') || pattern.contains(&b'[') || pattern.contains(&b'{');
460
461        if is_glob {
462            let non_wildcard_components = pattern_path
463                .components()
464                .filter(|c| {
465                    let s = c.as_os_str().to_string_lossy();
466                    !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
467                })
468                .count();
469            non_wildcard_components * 10
470        } else if pattern_path.is_file()
471            || pattern_path.extension().is_some()
472            || pattern.rsplit(|&b| b == b'.').next().is_some_and(|ext| ext.eq_ignore_ascii_case(b"php"))
473        {
474            component_count * 1000
475        } else {
476            component_count * 100
477        }
478    }
479}
480
481#[cfg(test)]
482#[allow(clippy::unwrap_used)]
483mod tests {
484    use super::*;
485    use crate::DatabaseReader;
486    use crate::GlobSettings;
487    use std::borrow::Cow;
488    use tempfile::TempDir;
489
490    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
491        // Normalize path separators to platform-specific separators
492        let normalize = |s: &str| s.replace('/', std::path::MAIN_SEPARATOR_STR);
493
494        DatabaseConfiguration {
495            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
496            paths: paths.into_iter().map(|s| Cow::Owned(normalize(s).into_bytes())).collect(),
497            includes: includes.into_iter().map(|s| Cow::Owned(normalize(s).into_bytes())).collect(),
498            excludes: vec![],
499            extensions: vec![Cow::Borrowed(b"php")],
500            glob: GlobSettings::default(),
501        }
502    }
503
504    /// Returns the file's logical name as a lossy UTF-8 string for assertion matching.
505    fn name_str(name: &[u8]) -> std::borrow::Cow<'_, str> {
506        String::from_utf8_lossy(name)
507    }
508
509    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
510        let file_path = temp_dir.path().join(relative_path);
511        if let Some(parent) = file_path.parent() {
512            std::fs::create_dir_all(parent).unwrap();
513        }
514        std::fs::write(file_path, content).unwrap();
515    }
516
517    #[test]
518    fn test_specificity_calculation_exact_file() {
519        let spec = DatabaseLoader::calculate_pattern_specificity(b"src/b.php");
520        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
521    }
522
523    #[test]
524    fn test_specificity_calculation_directory() {
525        let spec = DatabaseLoader::calculate_pattern_specificity(b"src/");
526        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
527    }
528
529    #[test]
530    fn test_specificity_calculation_glob() {
531        let spec = DatabaseLoader::calculate_pattern_specificity(b"src/*.php");
532        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
533    }
534
535    #[test]
536    fn test_specificity_calculation_deeper_path() {
537        let shallow_spec = DatabaseLoader::calculate_pattern_specificity(b"src/");
538        let deep_spec = DatabaseLoader::calculate_pattern_specificity(b"src/foo/bar/");
539        assert!(deep_spec > shallow_spec, "Deeper path should have higher specificity");
540    }
541
542    #[test]
543    fn test_exact_file_vs_directory() {
544        let temp_dir = TempDir::new().unwrap();
545
546        create_test_file(&temp_dir, "src/b.php", "<?php");
547        create_test_file(&temp_dir, "src/a.php", "<?php");
548
549        let config = create_test_config(&temp_dir, vec!["src/b.php"], vec!["src/"]);
550        let loader = DatabaseLoader::new(config);
551        let db = loader.load().unwrap();
552
553        let b_file = db.files().find(|f| name_str(&f.name).contains("b.php")).unwrap();
554        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host (exact file beats directory)");
555
556        let a_file = db.files().find(|f| name_str(&f.name).contains("a.php")).unwrap();
557        assert_eq!(a_file.file_type, FileType::Vendored, "src/a.php should be Vendored");
558    }
559
560    #[test]
561    fn test_deeper_vs_shallower_directory() {
562        let temp_dir = TempDir::new().unwrap();
563
564        create_test_file(&temp_dir, "src/foo/bar.php", "<?php");
565
566        let config = create_test_config(&temp_dir, vec!["src/foo/"], vec!["src/"]);
567        let loader = DatabaseLoader::new(config);
568        let db = loader.load().unwrap();
569
570        let file = db.files().find(|f| name_str(&f.name).contains("bar.php")).unwrap();
571        assert_eq!(file.file_type, FileType::Host, "Deeper directory pattern should win");
572    }
573
574    #[test]
575    fn test_exact_file_vs_glob() {
576        let temp_dir = TempDir::new().unwrap();
577
578        create_test_file(&temp_dir, "src/b.php", "<?php");
579
580        let config = create_test_config(&temp_dir, vec!["src/b.php"], vec!["src/*.php"]);
581        let loader = DatabaseLoader::new(config);
582        let db = loader.load().unwrap();
583
584        let file = db.files().find(|f| name_str(&f.name).contains("b.php")).unwrap();
585        assert_eq!(file.file_type, FileType::Host, "Exact file should beat glob pattern");
586    }
587
588    #[test]
589    fn test_equal_specificity_includes_wins() {
590        let temp_dir = TempDir::new().unwrap();
591
592        create_test_file(&temp_dir, "src/a.php", "<?php");
593
594        let config = create_test_config(&temp_dir, vec!["src/"], vec!["src/"]);
595        let loader = DatabaseLoader::new(config);
596        let db = loader.load().unwrap();
597
598        let file = db.files().find(|f| name_str(&f.name).contains("a.php")).unwrap();
599        assert_eq!(file.file_type, FileType::Vendored, "Equal specificity: includes should win");
600    }
601
602    #[test]
603    fn test_complex_scenario_from_bug_report() {
604        let temp_dir = TempDir::new().unwrap();
605
606        create_test_file(&temp_dir, "src/a.php", "<?php");
607        create_test_file(&temp_dir, "src/b.php", "<?php");
608        create_test_file(&temp_dir, "src/c/d.php", "<?php");
609        create_test_file(&temp_dir, "src/c/e.php", "<?php");
610        create_test_file(&temp_dir, "vendor/lib1.php", "<?php");
611        create_test_file(&temp_dir, "vendor/lib2.php", "<?php");
612
613        let config = create_test_config(&temp_dir, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);
614        let loader = DatabaseLoader::new(config);
615        let db = loader.load().unwrap();
616
617        let b_file = db
618            .files()
619            .find(|f| name_str(&f.name).contains("src/b.php") || name_str(&f.name).ends_with("b.php"))
620            .unwrap();
621        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host in bug scenario");
622
623        let d_file = db.files().find(|f| name_str(&f.name).contains("d.php")).unwrap();
624        assert_eq!(d_file.file_type, FileType::Vendored, "src/c/d.php should be Vendored");
625
626        let lib_file = db.files().find(|f| name_str(&f.name).contains("lib1.php")).unwrap();
627        assert_eq!(lib_file.file_type, FileType::Vendored, "vendor/lib1.php should be Vendored");
628    }
629
630    #[test]
631    fn test_files_only_in_paths() {
632        let temp_dir = TempDir::new().unwrap();
633
634        create_test_file(&temp_dir, "src/a.php", "<?php");
635
636        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
637        let loader = DatabaseLoader::new(config);
638        let db = loader.load().unwrap();
639
640        let file = db.files().find(|f| name_str(&f.name).contains("a.php")).unwrap();
641        assert_eq!(file.file_type, FileType::Host, "File only in paths should be Host");
642    }
643
644    #[test]
645    fn test_files_only_in_includes() {
646        let temp_dir = TempDir::new().unwrap();
647
648        create_test_file(&temp_dir, "vendor/lib.php", "<?php");
649
650        let config = create_test_config(&temp_dir, vec![], vec!["vendor/"]);
651        let loader = DatabaseLoader::new(config);
652        let db = loader.load().unwrap();
653
654        let file = db.files().find(|f| name_str(&f.name).contains("lib.php")).unwrap();
655        assert_eq!(file.file_type, FileType::Vendored, "File only in includes should be Vendored");
656    }
657
658    #[test]
659    fn test_stdin_override_replaces_file_content() {
660        let temp_dir = TempDir::new().unwrap();
661        create_test_file(&temp_dir, "src/foo.php", "<?php\n// on disk");
662
663        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
664        let loader = DatabaseLoader::new(config).with_stdin_override("src/foo.php", b"<?php\n// from stdin".to_vec());
665        let db = loader.load().unwrap();
666
667        let file = db.files().find(|f| name_str(&f.name).contains("foo.php")).unwrap();
668        assert_eq!(
669            file.contents.as_ref(),
670            b"<?php\n// from stdin",
671            "stdin override content should be used instead of disk"
672        );
673    }
674
675    #[test]
676    fn test_glob_excludes_match_workspace_relative_paths() {
677        let temp_dir = TempDir::new().unwrap();
678
679        create_test_file(&temp_dir, "src/Absences/Foo/Foo.php", "<?php");
680        create_test_file(&temp_dir, "src/Absences/Test/Faker/Provider/AbsencesProvider.php", "<?php");
681        create_test_file(&temp_dir, "src/Calendar/Test/Helper.php", "<?php");
682
683        let mut config = create_test_config(&temp_dir, vec!["src"], vec![]);
684        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("src/*/Test/**"))];
685
686        let loader = DatabaseLoader::new(config);
687        let db = loader.load().unwrap();
688
689        let names: Vec<String> = db.files().map(|f| name_str(&f.name).into_owned()).collect();
690        assert!(names.iter().any(|n| n.ends_with("src/Absences/Foo/Foo.php")), "non-Test file should be loaded");
691        assert!(
692            !names.iter().any(|n| n.contains("src/Absences/Test/")),
693            "files under src/*/Test/** should be excluded, got {names:?}"
694        );
695        assert!(
696            !names.iter().any(|n| n.contains("src/Calendar/Test/")),
697            "files under src/*/Test/** should be excluded, got {names:?}"
698        );
699    }
700
701    #[test]
702    fn test_glob_excludes_match_legacy_absolute_prefix_patterns() {
703        let temp_dir = TempDir::new().unwrap();
704
705        create_test_file(&temp_dir, "packages/foo/src/main.php", "<?php");
706        create_test_file(&temp_dir, "packages/foo/vendor/lib.php", "<?php");
707
708        let mut config = create_test_config(&temp_dir, vec!["packages"], vec![]);
709        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("*/packages/**/vendor/*"))];
710
711        let loader = DatabaseLoader::new(config);
712        let db = loader.load().unwrap();
713
714        let names: Vec<String> = db.files().map(|f| name_str(&f.name).into_owned()).collect();
715        assert!(names.iter().any(|n| n.ends_with("packages/foo/src/main.php")));
716        assert!(
717            !names.iter().any(|n| n.contains("/vendor/")),
718            "legacy `*/packages/**/vendor/*` style should still exclude vendor files, got {names:?}"
719        );
720    }
721
722    #[test]
723    fn test_glob_dir_prune_skips_relative_directories() {
724        let temp_dir = TempDir::new().unwrap();
725
726        create_test_file(&temp_dir, "vendor/slevomat/coding-standard/main.php", "<?php");
727        create_test_file(&temp_dir, "vendor/slevomat/coding-standard/tests/Sniffs/Foo.php", "<?php");
728        create_test_file(&temp_dir, "vendor/another/lib.php", "<?php");
729
730        let mut config = create_test_config(&temp_dir, vec![], vec!["vendor"]);
731        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("vendor/**/tests/**"))];
732
733        let loader = DatabaseLoader::new(config);
734        let db = loader.load().unwrap();
735
736        let names: Vec<String> = db.files().map(|f| name_str(&f.name).into_owned()).collect();
737        assert!(names.iter().any(|n| n.ends_with("vendor/slevomat/coding-standard/main.php")));
738        assert!(names.iter().any(|n| n.ends_with("vendor/another/lib.php")));
739        assert!(
740            !names.iter().any(|n| n.contains("/tests/")),
741            "files under vendor/**/tests/** should be pruned, got {names:?}"
742        );
743    }
744
745    #[test]
746    fn test_stdin_override_adds_file_when_not_on_disk() {
747        let temp_dir = TempDir::new().unwrap();
748        create_test_file(&temp_dir, "src/.gitkeep", "");
749
750        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
751        let loader =
752            DatabaseLoader::new(config).with_stdin_override("src/unsaved.php", b"<?php\n// unsaved buffer".to_vec());
753        let db = loader.load().unwrap();
754
755        let file = db.files().find(|f| name_str(&f.name).contains("unsaved.php")).unwrap();
756        assert_eq!(file.file_type, FileType::Host);
757        assert_eq!(file.contents.as_ref(), b"<?php\n// unsaved buffer");
758    }
759
760    #[test]
761    fn test_stdin_override_accepts_non_utf8_content() {
762        let temp_dir = TempDir::new().unwrap();
763        create_test_file(&temp_dir, "src/.gitkeep", "");
764
765        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
766        // PHP identifiers are binary-safe, so a buffer piped in via `--stdin-input` may not
767        // be valid UTF-8. The loaded file must carry those bytes through verbatim.
768        let content = b"<?php\n\nfunction f\xC9\xFF(): void {}\n".to_vec();
769        assert!(std::str::from_utf8(&content).is_err(), "test buffer must contain non-UTF-8 bytes");
770
771        let loader = DatabaseLoader::new(config).with_stdin_override("src/buffer.php", content.clone());
772        let db = loader.load().unwrap();
773
774        let file = db.files().find(|f| name_str(&f.name).contains("buffer.php")).unwrap();
775        assert_eq!(file.contents.as_ref(), content.as_slice());
776    }
777
778    #[cfg(unix)]
779    #[test]
780    fn test_symlinked_file_under_include_is_loaded() {
781        let temp_dir = TempDir::new().unwrap();
782        let external = TempDir::new().unwrap();
783
784        create_test_file(&external, "Bar.php", "<?php class Bar {}\n");
785        std::fs::create_dir_all(temp_dir.path().join("vendor")).unwrap();
786        std::os::unix::fs::symlink(external.path().join("Bar.php"), temp_dir.path().join("vendor/Bar.php")).unwrap();
787
788        let config = create_test_config(&temp_dir, vec![], vec!["vendor/"]);
789        let db = DatabaseLoader::new(config).load().unwrap();
790
791        let bar = db.files().find(|f| name_str(&f.name).contains("Bar.php"));
792        assert!(bar.is_some(), "symlinked Bar.php should be loaded via include = ['vendor/']");
793    }
794
795    #[cfg(unix)]
796    #[test]
797    fn test_symlinked_directory_under_include_is_descended() {
798        let temp_dir = TempDir::new().unwrap();
799        let external = TempDir::new().unwrap();
800
801        create_test_file(&external, "src/Foo.php", "<?php class Foo {}\n");
802        create_test_file(&external, "src/Bar.php", "<?php class Bar {}\n");
803
804        std::fs::create_dir_all(temp_dir.path().join("vendor")).unwrap();
805        std::os::unix::fs::symlink(external.path(), temp_dir.path().join("vendor/example-package")).unwrap();
806
807        let config = create_test_config(&temp_dir, vec![], vec!["vendor/"]);
808        let db = DatabaseLoader::new(config).load().unwrap();
809
810        assert!(db.files().any(|f| name_str(&f.name).contains("Foo.php")), "Foo.php inside symlinked dir not found");
811        assert!(db.files().any(|f| name_str(&f.name).contains("Bar.php")), "Bar.php inside symlinked dir not found");
812    }
813
814    #[cfg(unix)]
815    #[test]
816    fn test_symlink_cycle_is_warned_and_skipped() {
817        let temp_dir = TempDir::new().unwrap();
818        create_test_file(&temp_dir, "src/Real.php", "<?php class Real {}\n");
819        std::os::unix::fs::symlink(temp_dir.path().join("src"), temp_dir.path().join("src/loop")).unwrap();
820
821        let config = create_test_config(&temp_dir, vec![], vec!["src/"]);
822        let db = DatabaseLoader::new(config).load().expect("symlink cycle should not abort the load");
823
824        assert!(
825            db.files().any(|f| name_str(&f.name).contains("Real.php")),
826            "Real.php still reachable despite the loop"
827        );
828    }
829}