Skip to main content

mago_database/
loader.rs

1//! Database loader for scanning and loading project files.
2
3use std::borrow::Cow;
4use std::collections::hash_map::Entry;
5use std::ffi::OsString;
6use std::path::Path;
7use std::path::PathBuf;
8
9use foldhash::HashMap;
10use foldhash::HashSet;
11use globset::GlobSet;
12use rayon::prelude::*;
13use walkdir::WalkDir;
14
15use crate::Database;
16use crate::DatabaseConfiguration;
17use crate::error::DatabaseError;
18use crate::exclusion::Exclusion;
19use crate::file::File;
20use crate::file::FileId;
21use crate::file::FileType;
22use crate::matcher::build_glob_set;
23use crate::utils::bytes_to_os_str;
24use crate::utils::bytes_to_path;
25use crate::utils::bytes_to_string_lossy;
26use crate::utils::read_file;
27
/// Holds a file along with the specificity of the pattern that matched it.
///
/// Specificity is used to resolve conflicts when a file matches both `paths` and `includes`.
/// Higher specificity values indicate more specific matches (e.g., exact file paths have higher
/// specificity than directory patterns).
#[derive(Debug)]
struct FileWithSpecificity {
    /// The loaded file.
    file: File,
    /// Score of the configured root that produced this file, as computed by
    /// `calculate_pattern_specificity`.
    specificity: usize,
}
38
/// Builder for loading files into a Database from the filesystem and memory.
///
/// Configure it with the `with_*` / `add_*` methods, then call `load` to produce the database.
pub struct DatabaseLoader<'config> {
    /// Existing database to load into; when `None`, `load` creates a fresh one.
    database: Option<Database<'config>>,
    /// Workspace, roots, excludes and extensions that drive the scan.
    configuration: DatabaseConfiguration<'config>,
    /// In-memory sources as `(name, contents, file type)`, added after the disk scan.
    memory_sources: Vec<(&'static [u8], &'static [u8], FileType)>,
    /// `(logical name, content)` used in place of the on-disk content of the matching file.
    stdin_override: Option<(Cow<'config, [u8]>, Vec<u8>)>,
}
46
47impl<'config> DatabaseLoader<'config> {
48    #[inline]
49    #[must_use]
50    pub fn new(configuration: DatabaseConfiguration<'config>) -> Self {
51        Self { configuration, memory_sources: vec![], database: None, stdin_override: None }
52    }
53
54    #[inline]
55    #[must_use]
56    pub fn with_database(mut self, database: Database<'config>) -> Self {
57        self.database = Some(database);
58        self
59    }
60
61    /// When set, the file with this logical name (workspace-relative path) will use the given
62    /// content instead of being read from disk. The logical name is used for baseline and reporting.
63    ///
64    /// `content` is raw bytes: PHP source is binary-safe, so a buffer piped in via `--stdin-input`
65    /// may not be valid UTF-8.
66    #[inline]
67    #[must_use]
68    pub fn with_stdin_override(mut self, logical_name: impl AsRef<[u8]>, content: Vec<u8>) -> Self {
69        self.stdin_override = Some((Cow::Owned(logical_name.as_ref().to_vec()), content));
70        self
71    }
72
73    #[inline]
74    pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
75        self.memory_sources.push((name.as_bytes(), contents.as_bytes(), file_type));
76    }
77
    /// Loads files from disk into the database.
    ///
    /// Scans the configured `paths` (host), `includes` (vendored) and `patches` roots, merges the
    /// results by [`FileId`], picks each file's final [`FileType`] with `resolve_file_type`, and
    /// finally adds the registered in-memory sources. If a database was supplied through
    /// `with_database`, files are added to it and its configuration is replaced by the loader's.
    ///
    /// # Errors
    ///
    /// Returns a [`DatabaseError`] if:
    /// - A glob pattern is invalid
    /// - File system operations fail (reading directories, files)
    /// - A file exceeds the maximum supported size
    #[inline]
    pub fn load(mut self) -> Result<Database<'config>, DatabaseError> {
        // Reuse the caller-supplied database if there is one; otherwise start from a new one.
        let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));

        // Update database configuration to use the loader's configuration
        // (fixes workspace path when merging with prelude database)
        db.configuration = self.configuration.clone();

        // Allowed extensions as `OsString`s so they can be compared with `Path::extension()`.
        let extensions_set: HashSet<OsString> =
            self.configuration.extensions.iter().map(|s| bytes_to_os_str(s.as_ref()).into_owned()).collect();

        // Pattern-style excludes only; literal path excludes are collected further down.
        let glob_exclude_patterns: Vec<&str> = self
            .configuration
            .excludes
            .iter()
            .filter_map(|ex| match ex {
                Exclusion::Pattern(pat) => Some(pat.as_ref()),
                Exclusion::Path(_) => None,
            })
            .collect();

        let glob_excludes = build_glob_set(glob_exclude_patterns.iter().copied(), self.configuration.glob)?;
        // Derive directory-level patterns by stripping a trailing `/**/*`, `/**` or `/*`;
        // `load_paths` uses them to skip whole directories while walking. Prefixes that are
        // empty or a bare `*` / `**` are dropped.
        let dir_prune_patterns: Vec<&str> = glob_exclude_patterns
            .iter()
            .filter_map(|pat| {
                let stripped =
                    pat.strip_suffix("/**/*").or_else(|| pat.strip_suffix("/**")).or_else(|| pat.strip_suffix("/*"))?;
                if stripped.is_empty() || stripped == "*" || stripped == "**" {
                    return None;
                }
                Some(stripped)
            })
            .collect();

        let dir_prune_globs = build_glob_set(dir_prune_patterns.iter().copied(), self.configuration.glob)?;

        // Literal path excludes (as opposed to glob patterns).
        let path_excludes: HashSet<_> = self
            .configuration
            .excludes
            .iter()
            .filter_map(|ex| match ex {
                Exclusion::Path(p) => Some(p),
                Exclusion::Pattern(_) => None,
            })
            .collect();

        // Scan each tier with the same filters; only the roots and the file type differ.
        let host_files_with_spec = self.load_paths(
            &self.configuration.paths,
            FileType::Host,
            &extensions_set,
            &glob_excludes,
            &dir_prune_globs,
            &path_excludes,
        )?;

        let vendored_files_with_spec = self.load_paths(
            &self.configuration.includes,
            FileType::Vendored,
            &extensions_set,
            &glob_excludes,
            &dir_prune_globs,
            &path_excludes,
        )?;

        let patch_files_with_spec = self.load_paths(
            &self.configuration.patches,
            FileType::Patch,
            &extensions_set,
            &glob_excludes,
            &dir_prune_globs,
            &path_excludes,
        )?;

        let mut all_files: HashMap<FileId, File> = HashMap::default();
        // Per-file maximum specificity for each tier the file matched. `None` in a slot means
        // the file did not match any configured pattern in that tier; otherwise it carries the
        // best specificity score that tier could offer, scored by `calculate_pattern_specificity`.
        // Slot order is (host, vendored, patch).
        type TierSpecs = (Option<usize>, Option<usize>, Option<usize>);
        let mut tier_specs: HashMap<FileId, TierSpecs> = HashMap::default();

        // Process host files (from paths)
        for file_with_spec in host_files_with_spec {
            let file_id = file_with_spec.file.id;
            let specificity = file_with_spec.specificity;

            all_files.insert(file_id, file_with_spec.file);
            bump_spec(&mut tier_specs.entry(file_id).or_insert((None, None, None)).0, specificity);
        }

        // When stdin override is set, ensure that the file is in the database
        // (covers new/unsaved files, not on disk). Excluded paths are skipped
        // so that editor integrations using `--stdin-input` honor the same
        // exclude rules as a regular filesystem scan.
        if let Some((name, content)) = &self.stdin_override {
            let virtual_path = self.configuration.workspace.join(bytes_to_path(name.as_ref()).as_ref());
            // Fall back to the non-canonical path when canonicalization fails.
            let virtual_path_canonical = virtual_path.canonicalize().unwrap_or_else(|_| virtual_path.clone());
            let virtual_path_str = virtual_path_canonical.to_string_lossy();

            // Test both the resolved path and the logical name against the glob excludes.
            let matched_glob = !glob_excludes.is_empty()
                && (glob_excludes.is_match(virtual_path_canonical.as_path())
                    || glob_excludes.is_match(bytes_to_path(name.as_ref()).as_ref()));

            let matched_path = path_excludes.iter().any(|excl| {
                let canonical = if Path::new(excl.as_ref()).is_absolute() {
                    excl.as_ref().to_path_buf()
                } else {
                    self.configuration.workspace.join(excl.as_ref())
                };
                let canonical = canonical.canonicalize().unwrap_or(canonical);
                let canonical_str = canonical.to_string_lossy();

                // String-prefix match that only counts when the prefix ends exactly at the end
                // of the path or at a separator.
                virtual_path_str.starts_with(canonical_str.as_ref())
                    && matches!(virtual_path_str.as_bytes().get(canonical_str.len()), None | Some(&b'/' | &b'\\'))
            });

            if !matched_glob && !matched_path {
                let file = File::ephemeral(Cow::Owned(name.as_ref().to_vec()), Cow::Owned(content.clone()));
                let file_id = file.id;
                // Only add the ephemeral file when no scanned host file already has this id.
                if let Entry::Vacant(e) = all_files.entry(file_id) {
                    e.insert(file);

                    // `usize::MAX` gives the override the highest possible host specificity.
                    bump_spec(&mut tier_specs.entry(file_id).or_insert((None, None, None)).0, usize::MAX);
                }
            }
        }

        // Vendored and patch entries keep any file already stored under the same id; they only
        // contribute their tier's specificity.
        for file_with_spec in vendored_files_with_spec {
            let file_id = file_with_spec.file.id;
            let vendored_specificity = file_with_spec.specificity;

            all_files.entry(file_id).or_insert(file_with_spec.file);
            bump_spec(&mut tier_specs.entry(file_id).or_insert((None, None, None)).1, vendored_specificity);
        }

        for file_with_spec in patch_files_with_spec {
            let file_id = file_with_spec.file.id;
            let specificity = file_with_spec.specificity;
            all_files.entry(file_id).or_insert(file_with_spec.file);
            bump_spec(&mut tier_specs.entry(file_id).or_insert((None, None, None)).2, specificity);
        }

        db.reserve(tier_specs.len() + self.memory_sources.len());

        // Assign the resolved file type to every merged file and hand it to the database.
        for (file_id, (host_spec, vendored_spec, patch_spec)) in tier_specs {
            if let Some(mut file) = all_files.remove(&file_id) {
                file.file_type = resolve_file_type(host_spec, vendored_spec, patch_spec);
                db.add(file);
            }
        }

        // In-memory sources keep the file type they were registered with and have no path.
        for (name, contents, file_type) in self.memory_sources {
            let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));

            db.add(file);
        }

        Ok(db)
    }
244
    /// Discovers and reads all files from a set of root paths or glob patterns in parallel.
    ///
    /// Supports both:
    /// - Directory paths (e.g., "src", "tests") - recursively walks all files
    /// - Glob patterns (e.g., "src/**/*.php", "tests/Unit/*Test.php") - matches files using glob syntax
    ///
    /// A root that resolves to a regular file is loaded as-is, without the extension filter.
    /// Every loaded file is created with `file_type`; files matching `glob_excludes` or lying
    /// under one of `path_excludes` are dropped, and directories matching `dir_prune_globs`
    /// (or lying under a path exclude) are not descended into.
    ///
    /// Returns files along with their pattern specificity for conflict resolution.
    ///
    /// # Errors
    ///
    /// Returns [`DatabaseError::Glob`] when a glob root cannot be parsed, and propagates any
    /// error returned by `read_file`. Directory-walk and glob-entry errors are logged and skipped.
    fn load_paths(
        &self,
        roots: &[Cow<'config, [u8]>],
        file_type: FileType,
        extensions: &HashSet<OsString>,
        glob_excludes: &GlobSet,
        dir_prune_globs: &GlobSet,
        path_excludes: &HashSet<&Cow<'config, Path>>,
    ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
        // Canonicalize the workspace once.  All WalkDir roots are canonicalized
        // before traversal so their paths inherit the canonical prefix without
        // any per-file syscalls.
        let canonical_workspace =
            self.configuration.workspace.canonicalize().unwrap_or_else(|_| self.configuration.workspace.to_path_buf());

        // Pre-canonicalize path excludes once as strings.  A plain byte-string
        // prefix check is then sufficient in the parallel section, replacing the
        // per-file canonicalize() + Path::starts_with (Components iteration).
        // Excludes that cannot be canonicalized or are not valid UTF-8 are dropped here.
        let canonical_excludes: Vec<String> = path_excludes
            .iter()
            .filter_map(|ex| {
                let p = if Path::new(ex.as_ref()).is_absolute() {
                    ex.as_ref().to_path_buf()
                } else {
                    self.configuration.workspace.join(ex.as_ref())
                };

                p.canonicalize().ok()?.into_os_string().into_string().ok()
            })
            .collect();

        // Workspace-relative form of `path` with `/` separators on every platform; falls back
        // to the full path when it does not live under the canonical workspace.
        let workspace_relative_str = |path: &Path| -> String {
            let rel = path.strip_prefix(canonical_workspace.as_path()).unwrap_or(path);
            let s = rel.to_string_lossy();
            #[cfg(windows)]
            {
                s.replace('\\', "/")
            }
            #[cfg(not(windows))]
            {
                s.into_owned()
            }
        };

        // The bool flags a path that was named exactly (a literal file on disk) rather than
        // discovered by walking a configured directory. Such paths bypass the extension filter.
        let mut paths_to_process: Vec<(PathBuf, usize, bool)> = Vec::new();

        for root in roots {
            // Check if this is a glob pattern (contains glob metacharacters).
            // First check if it's an actual file/directory on disk. if so, treat it
            // as a literal path even if the name contains glob metacharacters like `[]`.
            let root_path = bytes_to_path(root.as_ref());
            let resolved_path = if root_path.is_absolute() {
                root_path.as_ref().to_path_buf()
            } else {
                self.configuration.workspace.join(root_path.as_ref())
            };

            let is_glob_pattern = !resolved_path.exists()
                && (root.contains(&b'*') || root.contains(&b'?') || root.contains(&b'[') || root.contains(&b'{'));

            // Every file found through this root shares the root's specificity score.
            let specificity = calculate_pattern_specificity(root.as_ref());
            if is_glob_pattern {
                // Handle as glob pattern
                let pattern = if root_path.is_absolute() {
                    bytes_to_string_lossy(root.as_ref()).into_owned()
                } else {
                    // Make relative patterns absolute by prepending workspace
                    self.configuration.workspace.join(root_path.as_ref()).to_string_lossy().to_string()
                };

                match glob::glob(&pattern) {
                    Ok(entries) => {
                        for entry in entries {
                            match entry {
                                Ok(path) => {
                                    if path.is_file() {
                                        // Canonicalize so the path shares the same prefix as
                                        // `canonical_workspace` (important on macOS where
                                        // TempDir / glob return /var/… but canonicalize gives
                                        // /private/var/…).  Fall back to the original on error.
                                        let canonical = path.canonicalize().unwrap_or(path);
                                        paths_to_process.push((canonical, specificity, false));
                                    }
                                }
                                Err(e) => {
                                    tracing::warn!("Failed to read glob entry: {}", e);
                                }
                            }
                        }
                    }
                    Err(e) => {
                        return Err(DatabaseError::Glob(e.to_string()));
                    }
                }
            } else {
                let canonical_root = resolved_path.canonicalize().unwrap_or(resolved_path);

                // A path that resolves to a regular file was named explicitly rather than
                // discovered by walking a directory. Honor it verbatim, bypassing the extension
                // filter so extensionless PHP files (e.g. `bin/console`) can be loaded.
                if canonical_root.is_file() {
                    paths_to_process.push((canonical_root, specificity, true));
                    continue;
                }

                let has_dir_prunes = !dir_prune_globs.is_empty();
                let has_path_prunes = !canonical_excludes.is_empty();
                // `filter_entry` returning `false` for a directory stops the walk from
                // descending into it. The root itself and non-directory entries always pass.
                let walker = WalkDir::new(&canonical_root).follow_links(true).into_iter().filter_entry(|entry| {
                    if entry.depth() == 0 || !entry.file_type().is_dir() {
                        return true;
                    }

                    let path = entry.path();

                    // Prune directories that are, or lie beneath, a literal path exclude.
                    if has_path_prunes
                        && let Some(p) = path.to_str()
                        && canonical_excludes.iter().any(|excl| {
                            p.starts_with(excl.as_str())
                                && matches!(p.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
                        })
                    {
                        return false;
                    }

                    // Prune directories matched by a directory-level glob, tested against both
                    // the absolute and the workspace-relative path.
                    if has_dir_prunes
                        && (dir_prune_globs.is_match(path) || dir_prune_globs.is_match(workspace_relative_str(path)))
                    {
                        return false;
                    }

                    true
                });

                for entry in walker {
                    match entry {
                        Ok(entry) => {
                            if !entry.file_type().is_dir() {
                                paths_to_process.push((entry.into_path(), specificity, false));
                            }
                        }
                        Err(err) => {
                            let path = err.path().unwrap_or(canonical_root.as_path()).display();
                            if let Some(ancestor) = err.loop_ancestor() {
                                tracing::warn!(
                                    "Skipping symlink loop at `{path}`: link cycles back to `{}`.",
                                    ancestor.display(),
                                );
                            } else {
                                tracing::warn!("Failed to walk `{path}`: {err}. Entry will be skipped.");
                            }
                        }
                    }
                }
            }
        }

        let has_path_excludes = !canonical_excludes.is_empty();
        let has_glob_excludes = !glob_excludes.is_empty();
        // Filter and read the candidates in parallel; the first read error aborts the load.
        let files: Vec<FileWithSpecificity> = paths_to_process
            .into_par_iter()
            .filter_map(|(path, specificity, skip_ext_check)| {
                if has_glob_excludes
                    && (glob_excludes.is_match(&path) || glob_excludes.is_match(workspace_relative_str(&path)))
                {
                    return None;
                }

                // Discovered files must carry an allowed extension; a file without any
                // extension is dropped by the `?`. Explicitly named files skip this check.
                if !skip_ext_check {
                    let ext = path.extension()?;
                    if !extensions.contains(ext) {
                        return None;
                    }
                }

                if has_path_excludes {
                    let excluded = path.to_str().is_some_and(|s| {
                        canonical_excludes.iter().any(|excl| {
                            s.starts_with(excl.as_str())
                                && matches!(s.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
                        })
                    });

                    if excluded {
                        return None;
                    }
                }

                let workspace = canonical_workspace.as_path();
                #[cfg(windows)]
                let logical_name =
                    path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().replace('\\', "/");
                #[cfg(not(windows))]
                let logical_name =
                    path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();

                // Use the stdin override's content in place of the on-disk content, while
                // keeping the real path and this tier's file type.
                if let Some((override_name, override_content)) = &self.stdin_override
                    && override_name.as_ref() == logical_name.as_bytes()
                {
                    let file = File::new(
                        Cow::Owned(logical_name.into_bytes()),
                        file_type,
                        Some(path),
                        Cow::Owned(override_content.clone()),
                    );

                    return Some(Ok(FileWithSpecificity { file, specificity }));
                }

                match read_file(workspace, &path, file_type) {
                    Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
                    Err(e) => Some(Err(e)),
                }
            })
            .collect::<Result<Vec<FileWithSpecificity>, _>>()?;

        Ok(files)
    }
471}
472
/// Raises `slot` to `s` when the slot is empty or currently holds a smaller value, so the
/// slot always ends up holding the maximum specificity seen so far.
fn bump_spec(slot: &mut Option<usize>, s: usize) {
    let raised = match *slot {
        Some(current) if current > s => current,
        _ => s,
    };

    *slot = Some(raised);
}
476
477/// Picks the final [`FileType`] for a file that matched configured base paths in one or
478/// more tiers.
479///
480/// Each argument carries the maximum specificity of any matching base path in that tier, or
481/// `None` if no base path in that tier matched. Vendored wins over Host at equal-or-more
482/// specificity; Host wins only when strictly more specific. Patch beats Vendored
483/// unconditionally; Patch beats Host only when strictly more specific. When nothing matches
484/// the result is `Host`.
485pub(crate) fn resolve_file_type(
486    host_spec: Option<usize>,
487    vendored_spec: Option<usize>,
488    patch_spec: Option<usize>,
489) -> FileType {
490    let mut decision: Option<(FileType, usize)> = host_spec.map(|s| (FileType::Host, s));
491
492    if let Some(v) = vendored_spec {
493        decision = match decision {
494            Some((FileType::Host, h)) if v < h => decision,
495            _ => Some((FileType::Vendored, v)),
496        };
497    }
498
499    if let Some(p) = patch_spec {
500        decision = match decision {
501            Some((FileType::Host | FileType::Patch, e)) if p <= e => decision,
502            _ => Some((FileType::Patch, p)),
503        };
504    }
505
506    decision.map(|(ft, _)| ft).unwrap_or(FileType::Host)
507}
508
509/// Calculates how specific a configured base path or glob pattern is for conflict resolution.
510///
511/// Examples:
512///
513/// - "src/b.php" matching src/b.php: ~2000 (exact file, 2 components)
514/// - "src/" matching src/b.php: ~100 (directory, 1 component)
515/// - "src" matching src/b.php: ~100 (directory, 1 component)
516pub(crate) fn calculate_pattern_specificity(pattern: &[u8]) -> usize {
517    let pattern_path = bytes_to_path(pattern);
518
519    let component_count = pattern_path.components().count();
520    let is_glob =
521        pattern.contains(&b'*') || pattern.contains(&b'?') || pattern.contains(&b'[') || pattern.contains(&b'{');
522
523    if is_glob {
524        let non_wildcard_components = pattern_path
525            .components()
526            .filter(|c| {
527                let s = c.as_os_str().to_string_lossy();
528                !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
529            })
530            .count();
531        non_wildcard_components * 10
532    } else if pattern_path.is_file()
533        || pattern_path.extension().is_some()
534        || pattern.rsplit(|&b| b == b'.').next().is_some_and(|ext| ext.eq_ignore_ascii_case(b"php"))
535    {
536        component_count * 1000
537    } else {
538        component_count * 100
539    }
540}
541
/// Unit tests for the pure tier-resolution helpers (`resolve_file_type` and
/// `calculate_pattern_specificity`); none of them touch the filesystem layout.
#[cfg(test)]
mod resolution_tests {
    use super::*;

    #[test]
    fn defaults_to_host_when_nothing_matches() {
        assert_eq!(resolve_file_type(None, None, None), FileType::Host);
    }

    #[test]
    fn host_only_match_yields_host() {
        assert_eq!(resolve_file_type(Some(100), None, None), FileType::Host);
    }

    #[test]
    fn vendored_only_match_yields_vendored() {
        assert_eq!(resolve_file_type(None, Some(100), None), FileType::Vendored);
    }

    #[test]
    fn patch_only_match_yields_patch() {
        assert_eq!(resolve_file_type(None, None, Some(100)), FileType::Patch);
    }

    #[test]
    fn vendored_beats_host_at_equal_specificity() {
        assert_eq!(resolve_file_type(Some(100), Some(100), None), FileType::Vendored);
    }

    #[test]
    fn vendored_beats_host_when_more_specific() {
        assert_eq!(resolve_file_type(Some(100), Some(2000), None), FileType::Vendored);
    }

    #[test]
    fn host_beats_vendored_only_when_strictly_more_specific() {
        assert_eq!(resolve_file_type(Some(2000), Some(100), None), FileType::Host);
    }

    #[test]
    fn patch_beats_vendored_unconditionally() {
        assert_eq!(resolve_file_type(None, Some(2000), Some(100)), FileType::Patch);
    }

    #[test]
    fn host_beats_patch_at_equal_specificity() {
        assert_eq!(resolve_file_type(Some(100), None, Some(100)), FileType::Host);
    }

    #[test]
    fn patch_beats_host_when_strictly_more_specific() {
        assert_eq!(resolve_file_type(Some(100), None, Some(2000)), FileType::Patch);
    }

    // Vendored (2000) wins over Host (100) first; the less specific Patch (50) still
    // replaces it because Patch beats Vendored regardless of specificity.
    #[test]
    fn patch_beats_vendored_that_won_over_host() {
        assert_eq!(resolve_file_type(Some(100), Some(2000), Some(50)), FileType::Patch);
    }

    // Host (2000) wins over Vendored (100) first; Patch then needs strictly more than 2000.
    #[test]
    fn patch_beats_host_that_won_over_vendored() {
        assert_eq!(resolve_file_type(Some(2000), Some(100), Some(3000)), FileType::Patch);
    }

    #[test]
    fn host_that_won_over_vendored_holds_against_less_specific_patch() {
        assert_eq!(resolve_file_type(Some(2000), Some(100), Some(50)), FileType::Host);
    }

    #[test]
    fn exact_file_path_beats_directory_at_same_component_count() {
        assert!(calculate_pattern_specificity(b"src/foo.php") > calculate_pattern_specificity(b"src/foo"));
    }

    #[test]
    fn directory_beats_glob_at_same_non_wildcard_count() {
        assert!(calculate_pattern_specificity(b"src/") > calculate_pattern_specificity(b"src/**/*.php"));
    }

    #[test]
    fn deeper_path_beats_shallower_at_same_kind() {
        assert!(calculate_pattern_specificity(b"src/inner/") > calculate_pattern_specificity(b"src/"));
    }

    #[test]
    fn php_extension_case_does_not_affect_specificity() {
        assert_eq!(calculate_pattern_specificity(b"src/foo.PHP"), calculate_pattern_specificity(b"src/foo.php"));
    }

    // Regression guard: a dot-less name that merely spells `php` is not a file extension.
    // It must score the same as any other single dot-less component.
    #[test]
    fn dotless_php_name_is_scored_like_other_dotless_names() {
        assert_eq!(calculate_pattern_specificity(b"php"), calculate_pattern_specificity(b"src"));
    }
}
621
622#[cfg(test)]
623#[allow(clippy::unwrap_used, clippy::expect_used)]
624mod tests {
625    use super::*;
626    use crate::DatabaseReader;
627    use crate::GlobSettings;
628    use std::borrow::Cow;
629    use tempfile::TempDir;
630
631    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
632        create_test_config_with_patches(temp_dir, paths, includes, vec![])
633    }
634
635    fn create_test_config_with_patches(
636        temp_dir: &TempDir,
637        paths: Vec<&str>,
638        includes: Vec<&str>,
639        patches: Vec<&str>,
640    ) -> DatabaseConfiguration<'static> {
641        // Normalize path separators to platform-specific separators
642        let normalize = |s: &str| s.replace('/', std::path::MAIN_SEPARATOR_STR);
643
644        DatabaseConfiguration {
645            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
646            paths: paths.into_iter().map(|s| Cow::Owned(normalize(s).into_bytes())).collect(),
647            includes: includes.into_iter().map(|s| Cow::Owned(normalize(s).into_bytes())).collect(),
648            patches: patches.into_iter().map(|s| Cow::Owned(normalize(s).into_bytes())).collect(),
649            excludes: vec![],
650            extensions: vec![Cow::Borrowed(b"php")],
651            glob: GlobSettings::default(),
652        }
653    }
654
655    /// Returns the file's logical name as a lossy UTF-8 string for assertion matching.
656    fn name_str(name: &[u8]) -> std::borrow::Cow<'_, str> {
657        String::from_utf8_lossy(name)
658    }
659
660    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
661        let file_path = temp_dir.path().join(relative_path);
662        if let Some(parent) = file_path.parent() {
663            std::fs::create_dir_all(parent).unwrap();
664        }
665        std::fs::write(file_path, content).unwrap();
666    }
667
668    #[test]
669    fn test_exact_file_vs_directory() {
670        let temp_dir = TempDir::new().unwrap();
671
672        create_test_file(&temp_dir, "src/b.php", "<?php");
673        create_test_file(&temp_dir, "src/a.php", "<?php");
674
675        let config = create_test_config(&temp_dir, vec!["src/b.php"], vec!["src/"]);
676        let loader = DatabaseLoader::new(config);
677        let db = loader.load().unwrap();
678
679        let b_file = db.files().find(|f| name_str(&f.name).contains("b.php")).unwrap();
680        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host (exact file beats directory)");
681
682        let a_file = db.files().find(|f| name_str(&f.name).contains("a.php")).unwrap();
683        assert_eq!(a_file.file_type, FileType::Vendored, "src/a.php should be Vendored");
684    }
685
686    #[test]
687    fn test_deeper_vs_shallower_directory() {
688        let temp_dir = TempDir::new().unwrap();
689
690        create_test_file(&temp_dir, "src/foo/bar.php", "<?php");
691
692        let config = create_test_config(&temp_dir, vec!["src/foo/"], vec!["src/"]);
693        let loader = DatabaseLoader::new(config);
694        let db = loader.load().unwrap();
695
696        let file = db.files().find(|f| name_str(&f.name).contains("bar.php")).unwrap();
697        assert_eq!(file.file_type, FileType::Host, "Deeper directory pattern should win");
698    }
699
700    #[test]
701    fn test_exact_file_vs_glob() {
702        let temp_dir = TempDir::new().unwrap();
703
704        create_test_file(&temp_dir, "src/b.php", "<?php");
705
706        let config = create_test_config(&temp_dir, vec!["src/b.php"], vec!["src/*.php"]);
707        let loader = DatabaseLoader::new(config);
708        let db = loader.load().unwrap();
709
710        let file = db.files().find(|f| name_str(&f.name).contains("b.php")).unwrap();
711        assert_eq!(file.file_type, FileType::Host, "Exact file should beat glob pattern");
712    }
713
714    #[test]
715    fn test_equal_specificity_includes_wins() {
716        let temp_dir = TempDir::new().unwrap();
717
718        create_test_file(&temp_dir, "src/a.php", "<?php");
719
720        let config = create_test_config(&temp_dir, vec!["src/"], vec!["src/"]);
721        let loader = DatabaseLoader::new(config);
722        let db = loader.load().unwrap();
723
724        let file = db.files().find(|f| name_str(&f.name).contains("a.php")).unwrap();
725        assert_eq!(file.file_type, FileType::Vendored, "Equal specificity: includes should win");
726    }
727
728    #[test]
729    fn test_complex_scenario_from_bug_report() {
730        let temp_dir = TempDir::new().unwrap();
731
732        create_test_file(&temp_dir, "src/a.php", "<?php");
733        create_test_file(&temp_dir, "src/b.php", "<?php");
734        create_test_file(&temp_dir, "src/c/d.php", "<?php");
735        create_test_file(&temp_dir, "src/c/e.php", "<?php");
736        create_test_file(&temp_dir, "vendor/lib1.php", "<?php");
737        create_test_file(&temp_dir, "vendor/lib2.php", "<?php");
738
739        let config = create_test_config(&temp_dir, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);
740        let loader = DatabaseLoader::new(config);
741        let db = loader.load().unwrap();
742
743        let b_file = db
744            .files()
745            .find(|f| name_str(&f.name).contains("src/b.php") || name_str(&f.name).ends_with("b.php"))
746            .unwrap();
747        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host in bug scenario");
748
749        let d_file = db.files().find(|f| name_str(&f.name).contains("d.php")).unwrap();
750        assert_eq!(d_file.file_type, FileType::Vendored, "src/c/d.php should be Vendored");
751
752        let lib_file = db.files().find(|f| name_str(&f.name).contains("lib1.php")).unwrap();
753        assert_eq!(lib_file.file_type, FileType::Vendored, "vendor/lib1.php should be Vendored");
754    }
755
756    #[test]
757    fn test_files_only_in_paths() {
758        let temp_dir = TempDir::new().unwrap();
759
760        create_test_file(&temp_dir, "src/a.php", "<?php");
761
762        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
763        let loader = DatabaseLoader::new(config);
764        let db = loader.load().unwrap();
765
766        let file = db.files().find(|f| name_str(&f.name).contains("a.php")).unwrap();
767        assert_eq!(file.file_type, FileType::Host, "File only in paths should be Host");
768    }
769
770    #[test]
771    fn test_files_only_in_includes() {
772        let temp_dir = TempDir::new().unwrap();
773
774        create_test_file(&temp_dir, "vendor/lib.php", "<?php");
775
776        let config = create_test_config(&temp_dir, vec![], vec!["vendor/"]);
777        let loader = DatabaseLoader::new(config);
778        let db = loader.load().unwrap();
779
780        let file = db.files().find(|f| name_str(&f.name).contains("lib.php")).unwrap();
781        assert_eq!(file.file_type, FileType::Vendored, "File only in includes should be Vendored");
782    }
783
784    #[test]
785    fn test_stdin_override_replaces_file_content() {
786        let temp_dir = TempDir::new().unwrap();
787        create_test_file(&temp_dir, "src/foo.php", "<?php\n// on disk");
788
789        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
790        let loader = DatabaseLoader::new(config).with_stdin_override("src/foo.php", b"<?php\n// from stdin".to_vec());
791        let db = loader.load().unwrap();
792
793        let file = db.files().find(|f| name_str(&f.name).contains("foo.php")).unwrap();
794        assert_eq!(
795            file.contents.as_ref(),
796            b"<?php\n// from stdin",
797            "stdin override content should be used instead of disk"
798        );
799    }
800
801    #[test]
802    fn test_glob_excludes_match_workspace_relative_paths() {
803        let temp_dir = TempDir::new().unwrap();
804
805        create_test_file(&temp_dir, "src/Absences/Foo/Foo.php", "<?php");
806        create_test_file(&temp_dir, "src/Absences/Test/Faker/Provider/AbsencesProvider.php", "<?php");
807        create_test_file(&temp_dir, "src/Calendar/Test/Helper.php", "<?php");
808
809        let mut config = create_test_config(&temp_dir, vec!["src"], vec![]);
810        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("src/*/Test/**"))];
811
812        let loader = DatabaseLoader::new(config);
813        let db = loader.load().unwrap();
814
815        let names: Vec<String> = db.files().map(|f| name_str(&f.name).into_owned()).collect();
816        assert!(names.iter().any(|n| n.ends_with("src/Absences/Foo/Foo.php")), "non-Test file should be loaded");
817        assert!(
818            !names.iter().any(|n| n.contains("src/Absences/Test/")),
819            "files under src/*/Test/** should be excluded, got {names:?}"
820        );
821        assert!(
822            !names.iter().any(|n| n.contains("src/Calendar/Test/")),
823            "files under src/*/Test/** should be excluded, got {names:?}"
824        );
825    }
826
827    #[test]
828    fn test_glob_excludes_match_legacy_absolute_prefix_patterns() {
829        let temp_dir = TempDir::new().unwrap();
830
831        create_test_file(&temp_dir, "packages/foo/src/main.php", "<?php");
832        create_test_file(&temp_dir, "packages/foo/vendor/lib.php", "<?php");
833
834        let mut config = create_test_config(&temp_dir, vec!["packages"], vec![]);
835        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("*/packages/**/vendor/*"))];
836
837        let loader = DatabaseLoader::new(config);
838        let db = loader.load().unwrap();
839
840        let names: Vec<String> = db.files().map(|f| name_str(&f.name).into_owned()).collect();
841        assert!(names.iter().any(|n| n.ends_with("packages/foo/src/main.php")));
842        assert!(
843            !names.iter().any(|n| n.contains("/vendor/")),
844            "legacy `*/packages/**/vendor/*` style should still exclude vendor files, got {names:?}"
845        );
846    }
847
848    #[test]
849    fn test_glob_dir_prune_skips_relative_directories() {
850        let temp_dir = TempDir::new().unwrap();
851
852        create_test_file(&temp_dir, "vendor/slevomat/coding-standard/main.php", "<?php");
853        create_test_file(&temp_dir, "vendor/slevomat/coding-standard/tests/Sniffs/Foo.php", "<?php");
854        create_test_file(&temp_dir, "vendor/another/lib.php", "<?php");
855
856        let mut config = create_test_config(&temp_dir, vec![], vec!["vendor"]);
857        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("vendor/**/tests/**"))];
858
859        let loader = DatabaseLoader::new(config);
860        let db = loader.load().unwrap();
861
862        let names: Vec<String> = db.files().map(|f| name_str(&f.name).into_owned()).collect();
863        assert!(names.iter().any(|n| n.ends_with("vendor/slevomat/coding-standard/main.php")));
864        assert!(names.iter().any(|n| n.ends_with("vendor/another/lib.php")));
865        assert!(
866            !names.iter().any(|n| n.contains("/tests/")),
867            "files under vendor/**/tests/** should be pruned, got {names:?}"
868        );
869    }
870
871    #[test]
872    fn test_stdin_override_adds_file_when_not_on_disk() {
873        let temp_dir = TempDir::new().unwrap();
874        create_test_file(&temp_dir, "src/.gitkeep", "");
875
876        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
877        let loader =
878            DatabaseLoader::new(config).with_stdin_override("src/unsaved.php", b"<?php\n// unsaved buffer".to_vec());
879        let db = loader.load().unwrap();
880
881        let file = db.files().find(|f| name_str(&f.name).contains("unsaved.php")).unwrap();
882        assert_eq!(file.file_type, FileType::Host);
883        assert_eq!(file.contents.as_ref(), b"<?php\n// unsaved buffer");
884    }
885
886    #[test]
887    fn test_stdin_override_accepts_non_utf8_content() {
888        let temp_dir = TempDir::new().unwrap();
889        create_test_file(&temp_dir, "src/.gitkeep", "");
890
891        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
892        // PHP identifiers are binary-safe, so a buffer piped in via `--stdin-input` may not
893        // be valid UTF-8. The loaded file must carry those bytes through verbatim.
894        let content = b"<?php\n\nfunction f\xC9\xFF(): void {}\n".to_vec();
895        assert!(std::str::from_utf8(&content).is_err(), "test buffer must contain non-UTF-8 bytes");
896
897        let loader = DatabaseLoader::new(config).with_stdin_override("src/buffer.php", content.clone());
898        let db = loader.load().unwrap();
899
900        let file = db.files().find(|f| name_str(&f.name).contains("buffer.php")).unwrap();
901        assert_eq!(file.contents.as_ref(), content.as_slice());
902    }
903
904    #[cfg(unix)]
905    #[test]
906    fn test_symlinked_file_under_include_is_loaded() {
907        let temp_dir = TempDir::new().unwrap();
908        let external = TempDir::new().unwrap();
909
910        create_test_file(&external, "Bar.php", "<?php class Bar {}\n");
911        std::fs::create_dir_all(temp_dir.path().join("vendor")).unwrap();
912        std::os::unix::fs::symlink(external.path().join("Bar.php"), temp_dir.path().join("vendor/Bar.php")).unwrap();
913
914        let config = create_test_config(&temp_dir, vec![], vec!["vendor/"]);
915        let db = DatabaseLoader::new(config).load().unwrap();
916
917        let bar = db.files().find(|f| name_str(&f.name).contains("Bar.php"));
918        assert!(bar.is_some(), "symlinked Bar.php should be loaded via include = ['vendor/']");
919    }
920
921    #[cfg(unix)]
922    #[test]
923    fn test_symlinked_directory_under_include_is_descended() {
924        let temp_dir = TempDir::new().unwrap();
925        let external = TempDir::new().unwrap();
926
927        create_test_file(&external, "src/Foo.php", "<?php class Foo {}\n");
928        create_test_file(&external, "src/Bar.php", "<?php class Bar {}\n");
929
930        std::fs::create_dir_all(temp_dir.path().join("vendor")).unwrap();
931        std::os::unix::fs::symlink(external.path(), temp_dir.path().join("vendor/example-package")).unwrap();
932
933        let config = create_test_config(&temp_dir, vec![], vec!["vendor/"]);
934        let db = DatabaseLoader::new(config).load().unwrap();
935
936        assert!(db.files().any(|f| name_str(&f.name).contains("Foo.php")), "Foo.php inside symlinked dir not found");
937        assert!(db.files().any(|f| name_str(&f.name).contains("Bar.php")), "Bar.php inside symlinked dir not found");
938    }
939
940    #[cfg(unix)]
941    #[test]
942    fn test_symlink_cycle_is_warned_and_skipped() {
943        let temp_dir = TempDir::new().unwrap();
944        create_test_file(&temp_dir, "src/Real.php", "<?php class Real {}\n");
945        std::os::unix::fs::symlink(temp_dir.path().join("src"), temp_dir.path().join("src/loop")).unwrap();
946
947        let config = create_test_config(&temp_dir, vec![], vec!["src/"]);
948        let db = DatabaseLoader::new(config).load().expect("symlink cycle should not abort the load");
949
950        assert!(
951            db.files().any(|f| name_str(&f.name).contains("Real.php")),
952            "Real.php still reachable despite the loop"
953        );
954    }
955
956    #[test]
957    fn test_exact_extensionless_file_is_loaded() {
958        let temp_dir = TempDir::new().unwrap();
959        create_test_file(&temp_dir, "bin/console", "<?php\n// entrypoint");
960
961        // `bin/console` has no extension, so it would be filtered out when discovered by
962        // walking a directory. Naming it exactly must bypass the extension requirement.
963        let config = create_test_config(&temp_dir, vec!["bin/console"], vec![]);
964        let db = DatabaseLoader::new(config).load().unwrap();
965
966        let file = db.files().find(|f| name_str(&f.name).ends_with("bin/console")).unwrap();
967        assert_eq!(file.file_type, FileType::Host);
968        assert_eq!(file.contents.as_ref(), b"<?php\n// entrypoint");
969    }
970
971    #[test]
972    fn test_extensionless_file_in_directory_is_skipped() {
973        let temp_dir = TempDir::new().unwrap();
974        create_test_file(&temp_dir, "bin/console", "<?php");
975        create_test_file(&temp_dir, "bin/run.php", "<?php");
976
977        // Walking the directory must still honor the extension filter: only `run.php` loads.
978        let config = create_test_config(&temp_dir, vec!["bin"], vec![]);
979        let db = DatabaseLoader::new(config).load().unwrap();
980
981        let names: Vec<String> = db.files().map(|f| name_str(&f.name).into_owned()).collect();
982        assert!(names.iter().any(|n| n.ends_with("bin/run.php")), "run.php should be loaded, got {names:?}");
983        assert!(!names.iter().any(|n| n.ends_with("bin/console")), "extensionless console should be skipped");
984    }
985
986    #[test]
987    fn test_patch_beats_vendored_at_equal_specificity() {
988        // A file covered by both patches and includes at the same directory-level specificity
989        // should be classified as Patch, not Vendored.
990        let temp_dir = TempDir::new().unwrap();
991        create_test_file(&temp_dir, "lib/Foo.php", "<?php");
992
993        let config = create_test_config_with_patches(&temp_dir, vec![], vec!["lib/"], vec!["lib/"]);
994        let db = DatabaseLoader::new(config).load().unwrap();
995
996        let file = db.files().find(|f| String::from_utf8_lossy(&f.name).contains("Foo.php")).unwrap();
997        assert_eq!(file.file_type, FileType::Patch, "patch should beat vendored at equal specificity");
998    }
999
1000    #[test]
1001    fn test_host_beats_patch_at_equal_specificity() {
1002        // When a file is covered by both paths and patches at the same directory-level specificity,
1003        // the host (paths) classification wins.  Patches only override host when strictly more specific.
1004        let temp_dir = TempDir::new().unwrap();
1005        create_test_file(&temp_dir, "src/Foo.php", "<?php");
1006
1007        let config = create_test_config_with_patches(&temp_dir, vec!["src/"], vec![], vec!["src/"]);
1008        let db = DatabaseLoader::new(config).load().unwrap();
1009
1010        let file = db.files().find(|f| String::from_utf8_lossy(&f.name).contains("Foo.php")).unwrap();
1011        assert_eq!(file.file_type, FileType::Host, "host should beat patch at equal specificity");
1012    }
1013
1014    #[test]
1015    fn test_patch_beats_host_when_strictly_more_specific() {
1016        // An exact-file patch pattern has higher specificity than a directory paths pattern,
1017        // so the patch wins and the file is treated as Patch rather than Host.
1018        let temp_dir = TempDir::new().unwrap();
1019        create_test_file(&temp_dir, "src/Foo.php", "<?php");
1020        create_test_file(&temp_dir, "src/Bar.php", "<?php");
1021
1022        // Patch covers only Foo.php exactly; paths covers the whole directory.
1023        let config = create_test_config_with_patches(&temp_dir, vec!["src/"], vec![], vec!["src/Foo.php"]);
1024        let db = DatabaseLoader::new(config).load().unwrap();
1025
1026        let foo = db.files().find(|f| String::from_utf8_lossy(&f.name).contains("Foo.php")).unwrap();
1027        assert_eq!(foo.file_type, FileType::Patch, "exact-file patch should beat directory-level host pattern");
1028
1029        let bar = db.files().find(|f| String::from_utf8_lossy(&f.name).contains("Bar.php")).unwrap();
1030        assert_eq!(bar.file_type, FileType::Host, "file not covered by patch should remain Host");
1031    }
1032}