1use std::borrow::Cow;
4use std::collections::hash_map::Entry;
5use std::ffi::OsString;
6use std::path::Path;
7
8use foldhash::HashMap;
9use foldhash::HashSet;
10use globset::Glob;
11use globset::GlobSet;
12use globset::GlobSetBuilder;
13use rayon::prelude::*;
14use walkdir::WalkDir;
15
16use crate::Database;
17use crate::DatabaseConfiguration;
18use crate::error::DatabaseError;
19use crate::exclusion::Exclusion;
20use crate::file::File;
21use crate::file::FileId;
22use crate::file::FileType;
23use crate::utils::read_file;
24
/// A loaded [`File`] paired with the specificity score of the configured root
/// pattern that produced it.
///
/// `DatabaseLoader::load_paths` fills in the score from
/// `DatabaseLoader::calculate_pattern_specificity`, and `DatabaseLoader::load`
/// compares the scores to decide whether a file matched by both `paths` and
/// `includes` is classified as host or vendored.
#[derive(Debug)]
struct FileWithSpecificity {
    /// The file, either read from disk or built from the stdin override content.
    file: File,
    /// Score of the root pattern that matched this file; larger values win
    /// when host and vendored patterns compete for the same file.
    specificity: usize,
}
35
/// Builder that scans the configured paths and assembles a [`Database`].
///
/// Configure it with `with_database`, `with_stdin_override` and
/// `add_memory_source`, then call `load` to consume the loader.
pub struct DatabaseLoader<'a> {
    /// Existing database to populate; when `None`, `load` creates a new one
    /// from `configuration`.
    database: Option<Database<'a>>,
    /// Workspace, paths, includes, excludes and extensions that drive the scan.
    configuration: DatabaseConfiguration<'a>,
    /// In-memory sources as `(name, contents, file_type)`; `load` adds each one
    /// to the database after the on-disk files.
    memory_sources: Vec<(&'static str, &'static str, FileType)>,
    /// Optional `(logical_name, content)` pair whose content replaces the
    /// on-disk content of the file with that logical name.
    stdin_override: Option<(Cow<'a, str>, String)>,
}
45
46impl<'a> DatabaseLoader<'a> {
47 #[must_use]
48 pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
49 Self { configuration, memory_sources: vec![], database: None, stdin_override: None }
50 }
51
52 #[must_use]
53 pub fn with_database(mut self, database: Database<'a>) -> Self {
54 self.database = Some(database);
55 self
56 }
57
58 #[must_use]
61 pub fn with_stdin_override(mut self, logical_name: impl Into<Cow<'a, str>>, content: String) -> Self {
62 self.stdin_override = Some((logical_name.into(), content));
63 self
64 }
65
66 pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
67 self.memory_sources.push((name, contents, file_type));
68 }
69
70 pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
79 let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
80
81 db.configuration = self.configuration.clone();
84
85 let extensions_set: HashSet<OsString> =
86 self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
87
88 let mut glob_builder = GlobSetBuilder::new();
89 for ex in &self.configuration.excludes {
90 if let Exclusion::Pattern(pat) = ex {
91 glob_builder.add(Glob::new(pat)?);
92 }
93 }
94
95 let glob_excludes = glob_builder.build()?;
96
97 let path_excludes: HashSet<_> = self
98 .configuration
99 .excludes
100 .iter()
101 .filter_map(|ex| match ex {
102 Exclusion::Path(p) => Some(p),
103 _ => None,
104 })
105 .collect();
106
107 let host_files_with_spec = self.load_paths(
108 &self.configuration.paths,
109 FileType::Host,
110 &extensions_set,
111 &glob_excludes,
112 &path_excludes,
113 )?;
114 let vendored_files_with_spec = self.load_paths(
115 &self.configuration.includes,
116 FileType::Vendored,
117 &extensions_set,
118 &glob_excludes,
119 &path_excludes,
120 )?;
121
122 let mut all_files: HashMap<FileId, File> = HashMap::default();
123 let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
124
125 for file_with_spec in host_files_with_spec {
127 let file_id = file_with_spec.file.id;
128 let specificity = file_with_spec.specificity;
129
130 all_files.insert(file_id, file_with_spec.file);
131 file_decisions.insert(file_id, (FileType::Host, specificity));
132 }
133
134 if let Some((ref name, ref content)) = self.stdin_override {
137 let file = File::ephemeral(Cow::Owned(name.as_ref().to_string()), Cow::Owned(content.clone()));
138 let file_id = file.id;
139 if let Entry::Vacant(e) = all_files.entry(file_id) {
140 e.insert(file);
141
142 file_decisions.insert(file_id, (FileType::Host, usize::MAX));
143 }
144 }
145
146 for file_with_spec in vendored_files_with_spec {
147 let file_id = file_with_spec.file.id;
148 let vendored_specificity = file_with_spec.specificity;
149
150 all_files.entry(file_id).or_insert(file_with_spec.file);
151
152 match file_decisions.get(&file_id) {
153 Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
154 }
156 _ => {
157 file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
158 }
159 }
160 }
161
162 for (file_id, (final_type, _)) in file_decisions {
163 if let Some(mut file) = all_files.remove(&file_id) {
164 file.file_type = final_type;
165 db.add(file);
166 }
167 }
168
169 for (name, contents, file_type) in self.memory_sources {
170 let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
171
172 db.add(file);
173 }
174
175 Ok(db)
176 }
177
178 fn load_paths(
186 &self,
187 roots: &[Cow<'a, str>],
188 file_type: FileType,
189 extensions: &HashSet<OsString>,
190 glob_excludes: &GlobSet,
191 path_excludes: &HashSet<&Cow<'a, Path>>,
192 ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
193 let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
194
195 for root in roots {
196 let resolved_path = if Path::new(root.as_ref()).is_absolute() {
200 Path::new(root.as_ref()).to_path_buf()
201 } else {
202 self.configuration.workspace.join(root.as_ref())
203 };
204
205 let is_glob_pattern = !resolved_path.exists()
206 && (root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{'));
207
208 let specificity = Self::calculate_pattern_specificity(root.as_ref());
209 if is_glob_pattern {
210 let pattern = if Path::new(root.as_ref()).is_absolute() {
212 root.to_string()
213 } else {
214 self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
216 };
217
218 match glob::glob(&pattern) {
219 Ok(entries) => {
220 for entry in entries {
221 match entry {
222 Ok(path) => {
223 if path.is_file() {
224 paths_to_process.push((path, specificity));
225 }
226 }
227 Err(e) => {
228 tracing::warn!("Failed to read glob entry: {}", e);
229 }
230 }
231 }
232 }
233 Err(e) => {
234 return Err(DatabaseError::Glob(e.to_string()));
235 }
236 }
237 } else {
238 for entry in WalkDir::new(&resolved_path).into_iter().filter_map(Result::ok) {
239 if entry.file_type().is_file() {
240 paths_to_process.push((entry.into_path(), specificity));
241 }
242 }
243 }
244 }
245
246 let has_path_excludes = !path_excludes.is_empty();
247 let files: Vec<FileWithSpecificity> = paths_to_process
248 .into_par_iter()
249 .filter_map(|(path, specificity)| {
250 if glob_excludes.is_match(&path) {
251 return None;
252 }
253
254 let ext = path.extension()?;
255 if !extensions.contains(ext) {
256 return None;
257 }
258
259 if has_path_excludes
260 && let Ok(canonical_path) = path.canonicalize()
261 && path_excludes.iter().any(|excluded| canonical_path.starts_with(excluded))
262 {
263 return None;
264 }
265
266 let workspace = self.configuration.workspace.as_ref();
267 #[cfg(windows)]
268 let logical_name = path
269 .strip_prefix(workspace)
270 .unwrap_or_else(|_| path.as_path())
271 .to_string_lossy()
272 .replace('\\', "/");
273 #[cfg(not(windows))]
274 let logical_name =
275 path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();
276
277 if let Some((ref override_name, ref override_content)) = self.stdin_override
278 && override_name.as_ref() == logical_name
279 {
280 let file = File::new(
281 Cow::Owned(logical_name),
282 file_type,
283 Some(path.clone()),
284 Cow::Owned(override_content.clone()),
285 );
286
287 return Some(Ok(FileWithSpecificity { file, specificity }));
288 }
289
290 match read_file(workspace, &path, file_type) {
291 Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
292 Err(e) => Some(Err(e)),
293 }
294 })
295 .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
296
297 Ok(files)
298 }
299
300 fn calculate_pattern_specificity(pattern: &str) -> usize {
308 let pattern_path = Path::new(pattern);
309
310 let component_count = pattern_path.components().count();
311 let is_glob = pattern.contains('*') || pattern.contains('?') || pattern.contains('[') || pattern.contains('{');
312
313 if is_glob {
314 let non_wildcard_components = pattern_path
315 .components()
316 .filter(|c| {
317 let s = c.as_os_str().to_string_lossy();
318 !s.contains('*') && !s.contains('?') && !s.contains('[') && !s.contains('{')
319 })
320 .count();
321 non_wildcard_components * 10
322 } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
323 component_count * 1000
324 } else {
325 component_count * 100
326 }
327 }
328}
329
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a configuration rooted at `temp_dir` that only accepts `php` files.
    ///
    /// Forward slashes in `paths` and `includes` are rewritten to the platform
    /// separator so the same literals work on every OS.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        let to_native = |entry: &str| entry.replace('/', std::path::MAIN_SEPARATOR_STR);

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(|entry| Cow::Owned(to_native(entry))).collect(),
            includes: includes.into_iter().map(|entry| Cow::Owned(to_native(entry))).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
        }
    }

    /// Writes `content` to `relative_path` under `temp_dir`, creating parent directories.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }

        std::fs::write(target, content).unwrap();
    }

    /// Loads a database from `temp_dir` with the given host `paths` and vendored `includes`.
    fn load_database(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> Database<'static> {
        DatabaseLoader::new(create_test_config(temp_dir, paths, includes)).load().unwrap()
    }

    /// Returns the type of the first loaded file whose name contains `needle`.
    fn file_type_of(db: &Database<'_>, needle: &str) -> FileType {
        db.files().find(|f| f.name.contains(needle)).unwrap().file_type
    }

    #[test]
    fn test_specificity_calculation_exact_file() {
        let score = DatabaseLoader::calculate_pattern_specificity("src/b.php");

        assert!(score >= 2000, "Exact file should have high specificity, got {score}");
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let score = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!((100..1000).contains(&score), "Directory should have moderate specificity, got {score}");
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let score = DatabaseLoader::calculate_pattern_specificity("src/*.php");

        assert!(score < 100, "Glob pattern should have low specificity, got {score}");
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let shallow = DatabaseLoader::calculate_pattern_specificity("src/");
        let deep = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");

        assert!(deep > shallow, "Deeper path should have higher specificity");
    }

    #[test]
    fn test_exact_file_vs_directory() {
        let temp_dir = TempDir::new().unwrap();
        for name in ["src/b.php", "src/a.php"] {
            create_test_file(&temp_dir, name, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/"]);

        assert_eq!(
            file_type_of(&db, "b.php"),
            FileType::Host,
            "src/b.php should be Host (exact file beats directory)"
        );
        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/foo/bar.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/foo/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "bar.php"), FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/b.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["src/*.php"]);

        assert_eq!(file_type_of(&db, "b.php"), FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec!["src/"]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let temp_dir = TempDir::new().unwrap();
        let fixtures =
            ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"];
        for name in fixtures {
            create_test_file(&temp_dir, name, "<?php");
        }

        let db = load_database(&temp_dir, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);

        let b_file = db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap();
        assert_eq!(b_file.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        assert_eq!(file_type_of(&db, "d.php"), FileType::Vendored, "src/c/d.php should be Vendored");
        assert_eq!(file_type_of(&db, "lib1.php"), FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/a.php", "<?php");

        let db = load_database(&temp_dir, vec!["src/"], vec![]);

        assert_eq!(file_type_of(&db, "a.php"), FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "vendor/lib.php", "<?php");

        let db = load_database(&temp_dir, vec![], vec!["vendor/"]);

        assert_eq!(file_type_of(&db, "lib.php"), FileType::Vendored, "File only in includes should be Vendored");
    }

    #[test]
    fn test_stdin_override_replaces_file_content() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(&temp_dir, "src/foo.php", "<?php\n// on disk");

        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(config)
            .with_stdin_override("src/foo.php", "<?php\n// from stdin".to_string())
            .load()
            .unwrap();

        let loaded = db.files().find(|f| f.name.contains("foo.php")).unwrap();
        assert_eq!(
            loaded.contents.as_ref(),
            "<?php\n// from stdin",
            "stdin override content should be used instead of disk"
        );
    }

    #[test]
    fn test_stdin_override_adds_file_when_not_on_disk() {
        let temp_dir = TempDir::new().unwrap();
        // Creates `src/` without putting any PHP file in it.
        create_test_file(&temp_dir, "src/.gitkeep", "");

        let config = create_test_config(&temp_dir, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(config)
            .with_stdin_override("src/unsaved.php", "<?php\n// unsaved buffer".to_string())
            .load()
            .unwrap();

        let loaded = db.files().find(|f| f.name.contains("unsaved.php")).unwrap();
        assert_eq!(loaded.file_type, FileType::Host);
        assert_eq!(loaded.contents.as_ref(), "<?php\n// unsaved buffer");
    }
}