1use std::borrow::Cow;
4use std::collections::hash_map::Entry;
5use std::ffi::OsString;
6use std::path::Path;
7
8use foldhash::HashMap;
9use foldhash::HashSet;
10use globset::GlobSet;
11use rayon::prelude::*;
12use walkdir::WalkDir;
13
14use crate::Database;
15use crate::DatabaseConfiguration;
16use crate::error::DatabaseError;
17use crate::exclusion::Exclusion;
18use crate::file::File;
19use crate::file::FileId;
20use crate::file::FileType;
21use crate::matcher::build_glob_set;
22use crate::utils::read_file;
23
24#[derive(Debug)]
30struct FileWithSpecificity {
31 file: File,
32 specificity: usize,
33}
34
35pub struct DatabaseLoader<'a> {
37 database: Option<Database<'a>>,
38 configuration: DatabaseConfiguration<'a>,
39 memory_sources: Vec<(&'static str, &'static str, FileType)>,
40 stdin_override: Option<(Cow<'a, str>, String)>,
43}
44
45impl<'a> DatabaseLoader<'a> {
46 #[must_use]
47 pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
48 Self { configuration, memory_sources: vec![], database: None, stdin_override: None }
49 }
50
51 #[must_use]
52 pub fn with_database(mut self, database: Database<'a>) -> Self {
53 self.database = Some(database);
54 self
55 }
56
57 #[must_use]
60 pub fn with_stdin_override(mut self, logical_name: impl Into<Cow<'a, str>>, content: String) -> Self {
61 self.stdin_override = Some((logical_name.into(), content));
62 self
63 }
64
65 pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
66 self.memory_sources.push((name, contents, file_type));
67 }
68
69 pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
78 let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
79
80 db.configuration = self.configuration.clone();
83
84 let extensions_set: HashSet<OsString> =
85 self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
86
87 let glob_exclude_patterns: Vec<&str> = self
88 .configuration
89 .excludes
90 .iter()
91 .filter_map(|ex| match ex {
92 Exclusion::Pattern(pat) => Some(pat.as_ref()),
93 Exclusion::Path(_) => None,
94 })
95 .collect();
96
97 let glob_excludes = build_glob_set(glob_exclude_patterns.iter().copied(), self.configuration.glob)?;
98 let dir_prune_patterns: Vec<&str> = glob_exclude_patterns
99 .iter()
100 .filter_map(|pat| {
101 let stripped =
102 pat.strip_suffix("/**/*").or_else(|| pat.strip_suffix("/**")).or_else(|| pat.strip_suffix("/*"))?;
103 if stripped.is_empty() || stripped == "*" || stripped == "**" {
104 return None;
105 }
106 Some(stripped)
107 })
108 .collect();
109
110 let dir_prune_globs = build_glob_set(dir_prune_patterns.iter().copied(), self.configuration.glob)?;
111
112 let path_excludes: HashSet<_> = self
113 .configuration
114 .excludes
115 .iter()
116 .filter_map(|ex| match ex {
117 Exclusion::Path(p) => Some(p),
118 _ => None,
119 })
120 .collect();
121
122 let host_files_with_spec = self.load_paths(
123 &self.configuration.paths,
124 FileType::Host,
125 &extensions_set,
126 &glob_excludes,
127 &dir_prune_globs,
128 &path_excludes,
129 )?;
130
131 let vendored_files_with_spec = self.load_paths(
132 &self.configuration.includes,
133 FileType::Vendored,
134 &extensions_set,
135 &glob_excludes,
136 &dir_prune_globs,
137 &path_excludes,
138 )?;
139
140 let mut all_files: HashMap<FileId, File> = HashMap::default();
141 let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
142
143 for file_with_spec in host_files_with_spec {
145 let file_id = file_with_spec.file.id;
146 let specificity = file_with_spec.specificity;
147
148 all_files.insert(file_id, file_with_spec.file);
149 file_decisions.insert(file_id, (FileType::Host, specificity));
150 }
151
152 if let Some((ref name, ref content)) = self.stdin_override {
155 let file = File::ephemeral(Cow::Owned(name.as_ref().to_string()), Cow::Owned(content.clone()));
156 let file_id = file.id;
157 if let Entry::Vacant(e) = all_files.entry(file_id) {
158 e.insert(file);
159
160 file_decisions.insert(file_id, (FileType::Host, usize::MAX));
161 }
162 }
163
164 for file_with_spec in vendored_files_with_spec {
165 let file_id = file_with_spec.file.id;
166 let vendored_specificity = file_with_spec.specificity;
167
168 all_files.entry(file_id).or_insert(file_with_spec.file);
169
170 match file_decisions.get(&file_id) {
171 Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
172 }
174 _ => {
175 file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
176 }
177 }
178 }
179
180 db.reserve(file_decisions.len() + self.memory_sources.len());
181
182 for (file_id, (final_type, _)) in file_decisions {
183 if let Some(mut file) = all_files.remove(&file_id) {
184 file.file_type = final_type;
185 db.add(file);
186 }
187 }
188
189 for (name, contents, file_type) in self.memory_sources {
190 let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
191
192 db.add(file);
193 }
194
195 Ok(db)
196 }
197
198 fn load_paths(
206 &self,
207 roots: &[Cow<'a, str>],
208 file_type: FileType,
209 extensions: &HashSet<OsString>,
210 glob_excludes: &GlobSet,
211 dir_prune_globs: &GlobSet,
212 path_excludes: &HashSet<&Cow<'a, Path>>,
213 ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
214 let canonical_workspace =
218 self.configuration.workspace.canonicalize().unwrap_or_else(|_| self.configuration.workspace.to_path_buf());
219
220 let canonical_excludes: Vec<String> = path_excludes
224 .iter()
225 .filter_map(|ex| {
226 let p = if Path::new(ex.as_ref()).is_absolute() {
227 ex.as_ref().to_path_buf()
228 } else {
229 self.configuration.workspace.join(ex.as_ref())
230 };
231
232 p.canonicalize().ok()?.into_os_string().into_string().ok()
233 })
234 .collect();
235
236 let workspace_relative_str = |path: &Path| -> String {
237 let rel = path.strip_prefix(canonical_workspace.as_path()).unwrap_or(path);
238 let s = rel.to_string_lossy();
239 #[cfg(windows)]
240 {
241 s.replace('\\', "/")
242 }
243 #[cfg(not(windows))]
244 {
245 s.into_owned()
246 }
247 };
248
249 let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
250
251 for root in roots {
252 let resolved_path = if Path::new(root.as_ref()).is_absolute() {
256 Path::new(root.as_ref()).to_path_buf()
257 } else {
258 self.configuration.workspace.join(root.as_ref())
259 };
260
261 let is_glob_pattern = !resolved_path.exists()
262 && (root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{'));
263
264 let specificity = Self::calculate_pattern_specificity(root.as_ref());
265 if is_glob_pattern {
266 let pattern = if Path::new(root.as_ref()).is_absolute() {
268 root.to_string()
269 } else {
270 self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
272 };
273
274 match glob::glob(&pattern) {
275 Ok(entries) => {
276 for entry in entries {
277 match entry {
278 Ok(path) => {
279 if path.is_file() {
280 let canonical = path.canonicalize().unwrap_or(path);
285 paths_to_process.push((canonical, specificity));
286 }
287 }
288 Err(e) => {
289 tracing::warn!("Failed to read glob entry: {}", e);
290 }
291 }
292 }
293 }
294 Err(e) => {
295 return Err(DatabaseError::Glob(e.to_string()));
296 }
297 }
298 } else {
299 let canonical_root = resolved_path.canonicalize().unwrap_or(resolved_path);
300 let has_dir_prunes = !dir_prune_globs.is_empty();
301 let has_path_prunes = !canonical_excludes.is_empty();
302 let walker = WalkDir::new(&canonical_root).into_iter().filter_entry(|entry| {
303 if entry.depth() == 0 || !entry.file_type().is_dir() {
304 return true;
305 }
306
307 let path = entry.path();
308
309 if has_path_prunes
310 && let Some(p) = path.to_str()
311 && canonical_excludes.iter().any(|excl| {
312 p.starts_with(excl.as_str())
313 && matches!(p.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
314 })
315 {
316 return false;
317 }
318
319 if has_dir_prunes
320 && (dir_prune_globs.is_match(path) || dir_prune_globs.is_match(workspace_relative_str(path)))
321 {
322 return false;
323 }
324
325 true
326 });
327
328 for entry in walker.filter_map(Result::ok) {
329 if entry.file_type().is_file() {
330 paths_to_process.push((entry.into_path(), specificity));
331 }
332 }
333 }
334 }
335
336 let has_path_excludes = !canonical_excludes.is_empty();
337 let has_glob_excludes = !glob_excludes.is_empty();
338 let files: Vec<FileWithSpecificity> = paths_to_process
339 .into_par_iter()
340 .filter_map(|(path, specificity)| {
341 if has_glob_excludes
342 && (glob_excludes.is_match(&path) || glob_excludes.is_match(workspace_relative_str(&path)))
343 {
344 return None;
345 }
346
347 let ext = path.extension()?;
348 if !extensions.contains(ext) {
349 return None;
350 }
351
352 if has_path_excludes {
353 let excluded = path.to_str().is_some_and(|s| {
354 canonical_excludes.iter().any(|excl| {
355 s.starts_with(excl.as_str())
356 && matches!(s.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
357 })
358 });
359
360 if excluded {
361 return None;
362 }
363 }
364
365 let workspace = canonical_workspace.as_path();
366 #[cfg(windows)]
367 let logical_name = path
368 .strip_prefix(workspace)
369 .unwrap_or_else(|_| path.as_path())
370 .to_string_lossy()
371 .replace('\\', "/");
372 #[cfg(not(windows))]
373 let logical_name =
374 path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();
375
376 if let Some((ref override_name, ref override_content)) = self.stdin_override
377 && override_name.as_ref() == logical_name
378 {
379 let file = File::new(
380 Cow::Owned(logical_name),
381 file_type,
382 Some(path.clone()),
383 Cow::Owned(override_content.clone()),
384 );
385
386 return Some(Ok(FileWithSpecificity { file, specificity }));
387 }
388
389 match read_file(workspace, &path, file_type) {
390 Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
391 Err(e) => Some(Err(e)),
392 }
393 })
394 .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
395
396 Ok(files)
397 }
398
/// Scores how specific a configured root pattern is; higher means more specific.
///
/// * Glob patterns score `10` per path component that contains no glob
///   metacharacter (`*`, `?`, `[`, `{`).
/// * Patterns that name a file (an existing file, a path with an extension, or a
///   path ending in `.php`) score `1000` per component.
/// * Everything else is treated as a directory and scores `100` per component.
///
/// `.` components are not counted, so `./src` and `src` score the same and `.`
/// scores lower than any named directory.
///
/// Note that `Path::is_file` is evaluated for `pattern` as given, i.e. relative
/// patterns are checked against the process working directory.
fn calculate_pattern_specificity(pattern: &str) -> usize {
    let has_glob_chars = |text: &str| text.contains(|c: char| matches!(c, '*' | '?' | '[' | '{'));

    let pattern_path = Path::new(pattern);

    // `.` refers to the directory already named by the rest of the path, so it
    // must not make a pattern look more specific.
    let counted_components =
        || pattern_path.components().filter(|c| !matches!(c, std::path::Component::CurDir));

    if has_glob_chars(pattern) {
        let literal_components =
            counted_components().filter(|c| !has_glob_chars(&c.as_os_str().to_string_lossy())).count();

        literal_components * 10
    } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
        counted_components().count() * 1000
    } else {
        counted_components().count() * 100
    }
}
427}
428
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use crate::GlobSettings;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that loads `.php` files, with the given
    /// host `paths` and vendored `includes` converted to the platform's path separator.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        let to_native = |entry: &str| Cow::Owned(entry.replace('/', std::path::MAIN_SEPARATOR_STR));

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(to_native).collect(),
            includes: includes.into_iter().map(to_native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
            glob: GlobSettings::default(),
        }
    }

    /// Writes `content` to `relative_path` inside `temp_dir`, creating parent directories.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(dir) = target.parent() {
            std::fs::create_dir_all(dir).unwrap();
        }

        std::fs::write(target, content).unwrap();
    }

    /// Runs a fresh loader for `config`, panicking if loading fails.
    fn load_database(config: DatabaseConfiguration<'static>) -> Database<'static> {
        DatabaseLoader::new(config).load().unwrap()
    }

    /// Returns the type of the first loaded file whose name contains `needle`.
    fn file_type_of(db: &Database<'static>, needle: &str) -> FileType {
        let kind = db.files().find(|f| f.name.contains(needle)).unwrap().file_type;

        kind
    }

    /// Collects the names of all loaded files.
    fn file_names(db: &Database<'static>) -> Vec<String> {
        db.files().map(|f| f.name.to_string()).collect()
    }

    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");

        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");

        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let shallow = DatabaseLoader::calculate_pattern_specificity("src/");
        let deep = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");

        assert!(deep > shallow, "Deeper path should have higher specificity");
    }

    #[test]
    fn test_exact_file_vs_directory() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/b.php", "<?php");
        create_test_file(&workspace, "src/a.php", "<?php");

        let db = load_database(create_test_config(&workspace, vec!["src/b.php"], vec!["src/"]));

        assert_eq!(
            file_type_of(&db, "b.php"),
            FileType::Host,
            "src/b.php should be Host (exact file beats directory)"
        );
        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/foo/bar.php", "<?php");

        let db = load_database(create_test_config(&workspace, vec!["src/foo/"], vec!["src/"]));

        assert_eq!(file_type_of(&db, "bar.php"), FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/b.php", "<?php");

        let db = load_database(create_test_config(&workspace, vec!["src/b.php"], vec!["src/*.php"]));

        assert_eq!(file_type_of(&db, "b.php"), FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        let db = load_database(create_test_config(&workspace, vec!["src/"], vec!["src/"]));

        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let workspace = TempDir::new().unwrap();
        for relative in
            ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"]
        {
            create_test_file(&workspace, relative, "<?php");
        }

        let db = load_database(create_test_config(&workspace, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]));

        let b_type =
            db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap().file_type;
        assert_eq!(b_type, FileType::Host, "src/b.php should be Host in bug scenario");

        assert_eq!(file_type_of(&db, "d.php"), FileType::Vendored, "src/c/d.php should be Vendored");
        assert_eq!(file_type_of(&db, "lib1.php"), FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        let db = load_database(create_test_config(&workspace, vec!["src/"], vec![]));

        assert_eq!(file_type_of(&db, "a.php"), FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "vendor/lib.php", "<?php");

        let db = load_database(create_test_config(&workspace, vec![], vec!["vendor/"]));

        assert_eq!(file_type_of(&db, "lib.php"), FileType::Vendored, "File only in includes should be Vendored");
    }

    #[test]
    fn test_stdin_override_replaces_file_content() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/foo.php", "<?php\n// on disk");

        let config = create_test_config(&workspace, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(config)
            .with_stdin_override("src/foo.php", "<?php\n// from stdin".to_string())
            .load()
            .unwrap();

        let file = db.files().find(|f| f.name.contains("foo.php")).unwrap();
        assert_eq!(
            file.contents.as_ref(),
            "<?php\n// from stdin",
            "stdin override content should be used instead of disk"
        );
    }

    #[test]
    fn test_glob_excludes_match_workspace_relative_paths() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/Absences/Foo/Foo.php", "<?php");
        create_test_file(&workspace, "src/Absences/Test/Faker/Provider/AbsencesProvider.php", "<?php");
        create_test_file(&workspace, "src/Calendar/Test/Helper.php", "<?php");

        let mut config = create_test_config(&workspace, vec!["src"], vec![]);
        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("src/*/Test/**"))];

        let db = load_database(config);
        let names = file_names(&db);

        assert!(names.iter().any(|n| n.ends_with("src/Absences/Foo/Foo.php")), "non-Test file should be loaded");
        assert!(
            !names.iter().any(|n| n.contains("src/Absences/Test/")),
            "files under src/*/Test/** should be excluded, got {names:?}"
        );
        assert!(
            !names.iter().any(|n| n.contains("src/Calendar/Test/")),
            "files under src/*/Test/** should be excluded, got {names:?}"
        );
    }

    #[test]
    fn test_glob_excludes_match_legacy_absolute_prefix_patterns() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "packages/foo/src/main.php", "<?php");
        create_test_file(&workspace, "packages/foo/vendor/lib.php", "<?php");

        let mut config = create_test_config(&workspace, vec!["packages"], vec![]);
        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("*/packages/**/vendor/*"))];

        let db = load_database(config);
        let names = file_names(&db);

        assert!(names.iter().any(|n| n.ends_with("packages/foo/src/main.php")));
        assert!(
            !names.iter().any(|n| n.contains("/vendor/")),
            "legacy `*/packages/**/vendor/*` style should still exclude vendor files, got {names:?}"
        );
    }

    #[test]
    fn test_glob_dir_prune_skips_relative_directories() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "vendor/slevomat/coding-standard/main.php", "<?php");
        create_test_file(&workspace, "vendor/slevomat/coding-standard/tests/Sniffs/Foo.php", "<?php");
        create_test_file(&workspace, "vendor/another/lib.php", "<?php");

        let mut config = create_test_config(&workspace, vec![], vec!["vendor"]);
        config.excludes = vec![Exclusion::Pattern(Cow::Borrowed("vendor/**/tests/**"))];

        let db = load_database(config);
        let names = file_names(&db);

        assert!(names.iter().any(|n| n.ends_with("vendor/slevomat/coding-standard/main.php")));
        assert!(names.iter().any(|n| n.ends_with("vendor/another/lib.php")));
        assert!(
            !names.iter().any(|n| n.contains("/tests/")),
            "files under vendor/**/tests/** should be pruned, got {names:?}"
        );
    }

    #[test]
    fn test_stdin_override_adds_file_when_not_on_disk() {
        let workspace = TempDir::new().unwrap();
        // Non-PHP placeholder so that `src/` exists but contains no loadable file.
        create_test_file(&workspace, "src/.gitkeep", "");

        let config = create_test_config(&workspace, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(config)
            .with_stdin_override("src/unsaved.php", "<?php\n// unsaved buffer".to_string())
            .load()
            .unwrap();

        let file = db.files().find(|f| f.name.contains("unsaved.php")).unwrap();
        assert_eq!(file.file_type, FileType::Host);
        assert_eq!(file.contents.as_ref(), "<?php\n// unsaved buffer");
    }
}