1use std::borrow::Cow;
4use std::collections::hash_map::Entry;
5use std::ffi::OsString;
6use std::path::Path;
7
8use foldhash::HashMap;
9use foldhash::HashSet;
10use globset::GlobBuilder;
11use globset::GlobSet;
12use globset::GlobSetBuilder;
13use rayon::prelude::*;
14use walkdir::WalkDir;
15
16use crate::Database;
17use crate::DatabaseConfiguration;
18use crate::error::DatabaseError;
19use crate::exclusion::Exclusion;
20use crate::file::File;
21use crate::file::FileId;
22use crate::file::FileType;
23use crate::utils::read_file;
24
/// A loaded [`File`] paired with the specificity score of the configured root
/// (a `paths` or `includes` entry) through which it was discovered.
///
/// Values are produced by `DatabaseLoader::load_paths` and consumed by
/// `DatabaseLoader::load`, which compares the scores to decide whether a file
/// reachable through both lists is classified as host or vendored.
#[derive(Debug)]
struct FileWithSpecificity {
    /// The file as returned by `read_file`, or built from the stdin override.
    file: File,
    /// Score of the root that matched this file, as computed by
    /// `DatabaseLoader::calculate_pattern_specificity`.
    specificity: usize,
}
35
/// Builder that gathers files from the configured roots, in-memory sources and
/// an optional stdin override, and assembles them into a [`Database`].
pub struct DatabaseLoader<'a> {
    /// Database to populate; when `None`, `load` creates a new one from `configuration`.
    database: Option<Database<'a>>,
    /// Workspace, roots (`paths` / `includes`), excludes, extensions and glob settings.
    configuration: DatabaseConfiguration<'a>,
    /// `(name, contents, file_type)` triples that `load` adds after the files found on disk.
    memory_sources: Vec<(&'static str, &'static str, FileType)>,
    /// `(logical_name, content)`: content used in place of what is on disk for the file
    /// with that logical name, or added as an ephemeral host file when no such file exists.
    stdin_override: Option<(Cow<'a, str>, String)>,
}
45
46impl<'a> DatabaseLoader<'a> {
47 #[must_use]
48 pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
49 Self { configuration, memory_sources: vec![], database: None, stdin_override: None }
50 }
51
52 #[must_use]
53 pub fn with_database(mut self, database: Database<'a>) -> Self {
54 self.database = Some(database);
55 self
56 }
57
58 #[must_use]
61 pub fn with_stdin_override(mut self, logical_name: impl Into<Cow<'a, str>>, content: String) -> Self {
62 self.stdin_override = Some((logical_name.into(), content));
63 self
64 }
65
66 pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
67 self.memory_sources.push((name, contents, file_type));
68 }
69
70 pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
79 let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
80
81 db.configuration = self.configuration.clone();
84
85 let extensions_set: HashSet<OsString> =
86 self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
87
88 let glob_settings = &self.configuration.glob;
89 let mut glob_builder = GlobSetBuilder::new();
90 for ex in &self.configuration.excludes {
91 if let Exclusion::Pattern(pat) = ex {
92 let glob = GlobBuilder::new(pat)
93 .case_insensitive(glob_settings.case_insensitive)
94 .literal_separator(glob_settings.literal_separator)
95 .backslash_escape(glob_settings.backslash_escape)
96 .empty_alternates(glob_settings.empty_alternates)
97 .build()?;
98
99 glob_builder.add(glob);
100 }
101 }
102
103 let glob_excludes = glob_builder.build()?;
104
105 let path_excludes: HashSet<_> = self
106 .configuration
107 .excludes
108 .iter()
109 .filter_map(|ex| match ex {
110 Exclusion::Path(p) => Some(p),
111 _ => None,
112 })
113 .collect();
114
115 let host_files_with_spec = self.load_paths(
116 &self.configuration.paths,
117 FileType::Host,
118 &extensions_set,
119 &glob_excludes,
120 &path_excludes,
121 )?;
122 let vendored_files_with_spec = self.load_paths(
123 &self.configuration.includes,
124 FileType::Vendored,
125 &extensions_set,
126 &glob_excludes,
127 &path_excludes,
128 )?;
129
130 let mut all_files: HashMap<FileId, File> = HashMap::default();
131 let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
132
133 for file_with_spec in host_files_with_spec {
135 let file_id = file_with_spec.file.id;
136 let specificity = file_with_spec.specificity;
137
138 all_files.insert(file_id, file_with_spec.file);
139 file_decisions.insert(file_id, (FileType::Host, specificity));
140 }
141
142 if let Some((ref name, ref content)) = self.stdin_override {
145 let file = File::ephemeral(Cow::Owned(name.as_ref().to_string()), Cow::Owned(content.clone()));
146 let file_id = file.id;
147 if let Entry::Vacant(e) = all_files.entry(file_id) {
148 e.insert(file);
149
150 file_decisions.insert(file_id, (FileType::Host, usize::MAX));
151 }
152 }
153
154 for file_with_spec in vendored_files_with_spec {
155 let file_id = file_with_spec.file.id;
156 let vendored_specificity = file_with_spec.specificity;
157
158 all_files.entry(file_id).or_insert(file_with_spec.file);
159
160 match file_decisions.get(&file_id) {
161 Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
162 }
164 _ => {
165 file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
166 }
167 }
168 }
169
170 db.reserve(file_decisions.len() + self.memory_sources.len());
171
172 for (file_id, (final_type, _)) in file_decisions {
173 if let Some(mut file) = all_files.remove(&file_id) {
174 file.file_type = final_type;
175 db.add(file);
176 }
177 }
178
179 for (name, contents, file_type) in self.memory_sources {
180 let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
181
182 db.add(file);
183 }
184
185 Ok(db)
186 }
187
188 fn load_paths(
196 &self,
197 roots: &[Cow<'a, str>],
198 file_type: FileType,
199 extensions: &HashSet<OsString>,
200 glob_excludes: &GlobSet,
201 path_excludes: &HashSet<&Cow<'a, Path>>,
202 ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
203 let canonical_workspace =
207 self.configuration.workspace.canonicalize().unwrap_or_else(|_| self.configuration.workspace.to_path_buf());
208
209 let canonical_excludes: Vec<String> = path_excludes
213 .iter()
214 .filter_map(|ex| {
215 let p = if Path::new(ex.as_ref()).is_absolute() {
216 ex.as_ref().to_path_buf()
217 } else {
218 self.configuration.workspace.join(ex.as_ref())
219 };
220
221 p.canonicalize().ok()?.into_os_string().into_string().ok()
222 })
223 .collect();
224
225 let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
226
227 for root in roots {
228 let resolved_path = if Path::new(root.as_ref()).is_absolute() {
232 Path::new(root.as_ref()).to_path_buf()
233 } else {
234 self.configuration.workspace.join(root.as_ref())
235 };
236
237 let is_glob_pattern = !resolved_path.exists()
238 && (root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{'));
239
240 let specificity = Self::calculate_pattern_specificity(root.as_ref());
241 if is_glob_pattern {
242 let pattern = if Path::new(root.as_ref()).is_absolute() {
244 root.to_string()
245 } else {
246 self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
248 };
249
250 match glob::glob(&pattern) {
251 Ok(entries) => {
252 for entry in entries {
253 match entry {
254 Ok(path) => {
255 if path.is_file() {
256 let canonical = path.canonicalize().unwrap_or(path);
261 paths_to_process.push((canonical, specificity));
262 }
263 }
264 Err(e) => {
265 tracing::warn!("Failed to read glob entry: {}", e);
266 }
267 }
268 }
269 }
270 Err(e) => {
271 return Err(DatabaseError::Glob(e.to_string()));
272 }
273 }
274 } else {
275 let canonical_root = resolved_path.canonicalize().unwrap_or(resolved_path);
278 for entry in WalkDir::new(&canonical_root).into_iter().filter_map(Result::ok) {
279 if entry.file_type().is_file() {
280 paths_to_process.push((entry.into_path(), specificity));
281 }
282 }
283 }
284 }
285
286 let has_path_excludes = !canonical_excludes.is_empty();
287 let files: Vec<FileWithSpecificity> = paths_to_process
288 .into_par_iter()
289 .filter_map(|(path, specificity)| {
290 if glob_excludes.is_match(&path) {
291 return None;
292 }
293
294 let ext = path.extension()?;
295 if !extensions.contains(ext) {
296 return None;
297 }
298
299 if has_path_excludes {
300 let excluded = path.to_str().is_some_and(|s| {
301 canonical_excludes.iter().any(|excl| {
302 s.starts_with(excl.as_str())
303 && matches!(s.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
304 })
305 });
306
307 if excluded {
308 return None;
309 }
310 }
311
312 let workspace = canonical_workspace.as_path();
313 #[cfg(windows)]
314 let logical_name = path
315 .strip_prefix(workspace)
316 .unwrap_or_else(|_| path.as_path())
317 .to_string_lossy()
318 .replace('\\', "/");
319 #[cfg(not(windows))]
320 let logical_name =
321 path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();
322
323 if let Some((ref override_name, ref override_content)) = self.stdin_override
324 && override_name.as_ref() == logical_name
325 {
326 let file = File::new(
327 Cow::Owned(logical_name),
328 file_type,
329 Some(path.clone()),
330 Cow::Owned(override_content.clone()),
331 );
332
333 return Some(Ok(FileWithSpecificity { file, specificity }));
334 }
335
336 match read_file(workspace, &path, file_type) {
337 Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
338 Err(e) => Some(Err(e)),
339 }
340 })
341 .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
342
343 Ok(files)
344 }
345
346 fn calculate_pattern_specificity(pattern: &str) -> usize {
354 let pattern_path = Path::new(pattern);
355
356 let component_count = pattern_path.components().count();
357 let is_glob = pattern.contains('*') || pattern.contains('?') || pattern.contains('[') || pattern.contains('{');
358
359 if is_glob {
360 let non_wildcard_components = pattern_path
361 .components()
362 .filter(|c| {
363 let s = c.as_os_str().to_string_lossy();
364 !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
365 })
366 .count();
367 non_wildcard_components * 10
368 } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
369 component_count * 1000
370 } else {
371 component_count * 100
372 }
373 }
374}
375
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use crate::GlobSettings;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that only considers `php`
    /// files, has no excludes and uses default glob settings.
    ///
    /// `paths` and `includes` are written with `/` and converted to the
    /// platform separator here.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        let to_native =
            |pattern: &str| -> Cow<'static, str> { Cow::Owned(pattern.replace('/', std::path::MAIN_SEPARATOR_STR)) };

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(to_native).collect(),
            includes: includes.into_iter().map(to_native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
            glob: GlobSettings::default(),
        }
    }

    /// Writes `content` to `relative_path` inside `temp_dir`, creating any
    /// missing parent directories first.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }

        std::fs::write(target, content).unwrap();
    }

    // --- Specificity scoring ---------------------------------------------

    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");

        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");

        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let shallow = DatabaseLoader::calculate_pattern_specificity("src/");
        let deep = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");

        assert!(deep > shallow, "Deeper path should have higher specificity");
    }

    // --- Host vs. vendored classification ---------------------------------

    #[test]
    fn test_exact_file_vs_directory() {
        let workspace = TempDir::new().unwrap();
        for relative in ["src/b.php", "src/a.php"] {
            create_test_file(&workspace, relative, "<?php");
        }

        let config = create_test_config(&workspace, vec!["src/b.php"], vec!["src/"]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let exact = db.files().find(|f| f.name.contains("b.php")).unwrap();
        assert_eq!(exact.file_type, FileType::Host, "src/b.php should be Host (exact file beats directory)");

        let sibling = db.files().find(|f| f.name.contains("a.php")).unwrap();
        assert_eq!(sibling.file_type, FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/foo/bar.php", "<?php");

        let config = create_test_config(&workspace, vec!["src/foo/"], vec!["src/"]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let found = db.files().find(|f| f.name.contains("bar.php")).unwrap();
        assert_eq!(found.file_type, FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/b.php", "<?php");

        let config = create_test_config(&workspace, vec!["src/b.php"], vec!["src/*.php"]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let found = db.files().find(|f| f.name.contains("b.php")).unwrap();
        assert_eq!(found.file_type, FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        let config = create_test_config(&workspace, vec!["src/"], vec!["src/"]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let found = db.files().find(|f| f.name.contains("a.php")).unwrap();
        assert_eq!(found.file_type, FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let workspace = TempDir::new().unwrap();
        let fixtures =
            ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"];
        for relative in fixtures {
            create_test_file(&workspace, relative, "<?php");
        }

        let config = create_test_config(&workspace, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let host = db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap();
        assert_eq!(host.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        let nested = db.files().find(|f| f.name.contains("d.php")).unwrap();
        assert_eq!(nested.file_type, FileType::Vendored, "src/c/d.php should be Vendored");

        let library = db.files().find(|f| f.name.contains("lib1.php")).unwrap();
        assert_eq!(library.file_type, FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        let config = create_test_config(&workspace, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let found = db.files().find(|f| f.name.contains("a.php")).unwrap();
        assert_eq!(found.file_type, FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "vendor/lib.php", "<?php");

        let config = create_test_config(&workspace, vec![], vec!["vendor/"]);
        let db = DatabaseLoader::new(config).load().unwrap();

        let found = db.files().find(|f| f.name.contains("lib.php")).unwrap();
        assert_eq!(found.file_type, FileType::Vendored, "File only in includes should be Vendored");
    }

    // --- Stdin override -----------------------------------------------------

    #[test]
    fn test_stdin_override_replaces_file_content() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/foo.php", "<?php\n// on disk");

        let config = create_test_config(&workspace, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(config)
            .with_stdin_override("src/foo.php", "<?php\n// from stdin".to_string())
            .load()
            .unwrap();

        let found = db.files().find(|f| f.name.contains("foo.php")).unwrap();
        assert_eq!(
            found.contents.as_ref(),
            "<?php\n// from stdin",
            "stdin override content should be used instead of disk"
        );
    }

    #[test]
    fn test_stdin_override_adds_file_when_not_on_disk() {
        let workspace = TempDir::new().unwrap();
        // Ensures `src/` exists while containing no PHP file.
        create_test_file(&workspace, "src/.gitkeep", "");

        let config = create_test_config(&workspace, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(config)
            .with_stdin_override("src/unsaved.php", "<?php\n// unsaved buffer".to_string())
            .load()
            .unwrap();

        let found = db.files().find(|f| f.name.contains("unsaved.php")).unwrap();
        assert_eq!(found.file_type, FileType::Host);
        assert_eq!(found.contents.as_ref(), "<?php\n// unsaved buffer");
    }
}