1use std::borrow::Cow;
4use std::ffi::OsString;
5use std::path::Path;
6
7use foldhash::HashMap;
8use foldhash::HashSet;
9use globset::Glob;
10use globset::GlobSet;
11use globset::GlobSetBuilder;
12use rayon::prelude::*;
13use walkdir::WalkDir;
14
15use crate::Database;
16use crate::DatabaseConfiguration;
17use crate::error::DatabaseError;
18use crate::exclusion::Exclusion;
19use crate::file::File;
20use crate::file::FileId;
21use crate::file::FileType;
22use crate::utils::read_file;
23
/// A loaded file paired with the specificity score of the configured root
/// (plain path or glob pattern) that discovered it.
#[derive(Debug)]
struct FileWithSpecificity {
    /// The file as returned by `read_file`.
    file: File,
    /// Score produced by `DatabaseLoader::calculate_pattern_specificity` for the
    /// root that yielded this file; `DatabaseLoader::load` compares these scores
    /// to choose between the host and vendored classification.
    specificity: usize,
}
34
/// Builds a [`Database`] from the files selected by a [`DatabaseConfiguration`],
/// plus any in-memory sources registered on the loader.
pub struct DatabaseLoader<'a> {
    /// Existing database to load into, set through `with_database`; when `None`,
    /// `load` creates a new one from `configuration`.
    database: Option<Database<'a>>,
    /// Supplies the workspace, root lists, exclusions and extensions read by `load`.
    configuration: DatabaseConfiguration<'a>,
    /// `(name, contents, file_type)` entries queued by `add_memory_source`.
    memory_sources: Vec<(&'static str, &'static str, FileType)>,
}
41
42impl<'a> DatabaseLoader<'a> {
43 #[must_use]
44 pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
45 Self { configuration, memory_sources: vec![], database: None }
46 }
47
48 #[must_use]
49 pub fn with_database(mut self, database: Database<'a>) -> Self {
50 self.database = Some(database);
51 self
52 }
53
54 pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
55 self.memory_sources.push((name, contents, file_type));
56 }
57
58 pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
67 let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
68
69 db.configuration = self.configuration.clone();
72
73 let extensions_set: HashSet<OsString> =
74 self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
75
76 let mut glob_builder = GlobSetBuilder::new();
77 for ex in &self.configuration.excludes {
78 if let Exclusion::Pattern(pat) = ex {
79 glob_builder.add(Glob::new(pat)?);
80 }
81 }
82
83 let glob_excludes = glob_builder.build()?;
84
85 let path_excludes: HashSet<_> = self
86 .configuration
87 .excludes
88 .iter()
89 .filter_map(|ex| match ex {
90 Exclusion::Path(p) => Some(p),
91 _ => None,
92 })
93 .collect();
94
95 let host_files_with_spec = self.load_paths(
96 &self.configuration.paths,
97 FileType::Host,
98 &extensions_set,
99 &glob_excludes,
100 &path_excludes,
101 )?;
102 let vendored_files_with_spec = self.load_paths(
103 &self.configuration.includes,
104 FileType::Vendored,
105 &extensions_set,
106 &glob_excludes,
107 &path_excludes,
108 )?;
109
110 let mut all_files: HashMap<FileId, File> = HashMap::default();
111 let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
112
113 for file_with_spec in host_files_with_spec {
115 let file_id = file_with_spec.file.id;
116 let specificity = file_with_spec.specificity;
117
118 all_files.insert(file_id, file_with_spec.file);
119 file_decisions.insert(file_id, (FileType::Host, specificity));
120 }
121
122 for file_with_spec in vendored_files_with_spec {
123 let file_id = file_with_spec.file.id;
124 let vendored_specificity = file_with_spec.specificity;
125
126 all_files.entry(file_id).or_insert(file_with_spec.file);
127
128 match file_decisions.get(&file_id) {
129 Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
130 }
132 _ => {
133 file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
134 }
135 }
136 }
137
138 for (file_id, (final_type, _)) in file_decisions {
139 if let Some(mut file) = all_files.remove(&file_id) {
140 file.file_type = final_type;
141 db.add(file);
142 }
143 }
144
145 for (name, contents, file_type) in self.memory_sources {
146 let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
147
148 db.add(file);
149 }
150
151 Ok(db)
152 }
153
154 fn load_paths(
162 &self,
163 roots: &[Cow<'a, str>],
164 file_type: FileType,
165 extensions: &HashSet<OsString>,
166 glob_excludes: &GlobSet,
167 path_excludes: &HashSet<&Cow<'a, Path>>,
168 ) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
169 let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
170
171 for root in roots {
172 let is_glob_pattern = root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{');
174
175 let specificity = Self::calculate_pattern_specificity(root.as_ref());
176 if is_glob_pattern {
177 let pattern = if Path::new(root.as_ref()).is_absolute() {
179 root.to_string()
180 } else {
181 self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
183 };
184
185 match glob::glob(&pattern) {
186 Ok(entries) => {
187 for entry in entries {
188 match entry {
189 Ok(path) => {
190 if path.is_file() {
191 paths_to_process.push((path, specificity));
192 }
193 }
194 Err(e) => {
195 tracing::warn!("Failed to read glob entry: {}", e);
196 }
197 }
198 }
199 }
200 Err(e) => {
201 return Err(DatabaseError::Glob(e.to_string()));
202 }
203 }
204 } else {
205 let dir_path = if Path::new(root.as_ref()).is_absolute() {
207 Path::new(root.as_ref()).to_path_buf()
208 } else {
209 self.configuration.workspace.join(root.as_ref())
210 };
211
212 for entry in WalkDir::new(&dir_path).into_iter().filter_map(Result::ok) {
213 if entry.file_type().is_file() {
214 paths_to_process.push((entry.into_path(), specificity));
215 }
216 }
217 }
218 }
219
220 let has_path_excludes = !path_excludes.is_empty();
221 let files: Vec<FileWithSpecificity> = paths_to_process
222 .into_par_iter()
223 .filter_map(|(path, specificity)| {
224 if glob_excludes.is_match(&path) {
225 return None;
226 }
227
228 let ext = path.extension()?;
229 if !extensions.contains(ext) {
230 return None;
231 }
232
233 if has_path_excludes
234 && let Ok(canonical_path) = path.canonicalize()
235 && path_excludes.iter().any(|excluded| canonical_path.starts_with(excluded))
236 {
237 return None;
238 }
239
240 match read_file(self.configuration.workspace.as_ref(), &path, file_type) {
241 Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
242 Err(e) => Some(Err(e)),
243 }
244 })
245 .collect::<Result<Vec<FileWithSpecificity>, _>>()?;
246
247 Ok(files)
248 }
249
/// Scores how specific a configured root is; a higher score wins.
///
/// Only named path segments are counted. `.`, `..`, the root directory and
/// any drive prefix are ignored, so `./src` and `src` score the same.
///
/// * Glob pattern (contains `*`, `?`, `[` or `{`): 10 per named segment that
///   has no glob metacharacter.
/// * File-like pattern (is an existing file, has an extension, or ends with
///   `.php`): 1000 per named segment.
/// * Anything else (treated as a directory): 100 per named segment.
///
/// NOTE: the `is_file` probe resolves a relative `pattern` against the process
/// working directory. The tiers also are not strictly disjoint: ten or more
/// counted segments reach the next tier's range.
fn calculate_pattern_specificity(pattern: &str) -> usize {
    const GLOB_META: [char; 4] = ['*', '?', '[', '{'];

    let has_meta = |text: &str| text.contains(GLOB_META);

    let pattern_path = Path::new(pattern);
    let named_segments =
        || pattern_path.components().filter(|c| matches!(c, std::path::Component::Normal(_)));

    if has_meta(pattern) {
        let literal_segments =
            named_segments().filter(|segment| !has_meta(&segment.as_os_str().to_string_lossy())).count();

        literal_segments * 10
    } else if pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php") {
        named_segments().count() * 1000
    } else {
        named_segments().count() * 100
    }
}
278}
279
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DatabaseReader;
    use std::borrow::Cow;
    use tempfile::TempDir;

    /// Builds a PHP-only configuration rooted at `temp_dir`, with no exclusions.
    /// Forward slashes in `paths` and `includes` become the platform separator.
    fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
        fn to_native(entry: &str) -> Cow<'static, str> {
            Cow::Owned(entry.replace('/', std::path::MAIN_SEPARATOR_STR))
        }

        DatabaseConfiguration {
            workspace: Cow::Owned(temp_dir.path().to_path_buf()),
            paths: paths.into_iter().map(to_native).collect(),
            includes: includes.into_iter().map(to_native).collect(),
            excludes: vec![],
            extensions: vec![Cow::Borrowed("php")],
        }
    }

    /// Writes `content` to `relative_path` under `temp_dir`, creating parent
    /// directories as needed.
    fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
        let target = temp_dir.path().join(relative_path);
        if let Some(directory) = target.parent() {
            std::fs::create_dir_all(directory).unwrap();
        }

        std::fs::write(target, content).unwrap();
    }

    #[test]
    fn test_specificity_calculation_exact_file() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/b.php");

        assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_directory() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/");

        assert!((100..1000).contains(&spec), "Directory should have moderate specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_glob() {
        let spec = DatabaseLoader::calculate_pattern_specificity("src/*.php");

        assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
    }

    #[test]
    fn test_specificity_calculation_deeper_path() {
        let shallow = DatabaseLoader::calculate_pattern_specificity("src/");
        let deep = DatabaseLoader::calculate_pattern_specificity("src/foo/bar/");

        assert!(deep > shallow, "Deeper path should have higher specificity");
    }

    #[test]
    fn test_exact_file_vs_directory() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/b.php", "<?php");
        create_test_file(&workspace, "src/a.php", "<?php");

        // Host root is the exact file; the include covers its whole directory.
        let configuration = create_test_config(&workspace, vec!["src/b.php"], vec!["src/"]);
        let db = DatabaseLoader::new(configuration).load().unwrap();

        let exact = db.files().find(|file| file.name.contains("b.php")).unwrap();
        assert_eq!(exact.file_type, FileType::Host, "src/b.php should be Host (exact file beats directory)");

        let sibling = db.files().find(|file| file.name.contains("a.php")).unwrap();
        assert_eq!(sibling.file_type, FileType::Vendored, "src/a.php should be Vendored");
    }

    #[test]
    fn test_deeper_vs_shallower_directory() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/foo/bar.php", "<?php");

        let configuration = create_test_config(&workspace, vec!["src/foo/"], vec!["src/"]);
        let db = DatabaseLoader::new(configuration).load().unwrap();

        let nested = db.files().find(|file| file.name.contains("bar.php")).unwrap();
        assert_eq!(nested.file_type, FileType::Host, "Deeper directory pattern should win");
    }

    #[test]
    fn test_exact_file_vs_glob() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/b.php", "<?php");

        let configuration = create_test_config(&workspace, vec!["src/b.php"], vec!["src/*.php"]);
        let db = DatabaseLoader::new(configuration).load().unwrap();

        let exact = db.files().find(|file| file.name.contains("b.php")).unwrap();
        assert_eq!(exact.file_type, FileType::Host, "Exact file should beat glob pattern");
    }

    #[test]
    fn test_equal_specificity_includes_wins() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        // The same root appears in both lists, so the scores tie.
        let configuration = create_test_config(&workspace, vec!["src/"], vec!["src/"]);
        let db = DatabaseLoader::new(configuration).load().unwrap();

        let tied = db.files().find(|file| file.name.contains("a.php")).unwrap();
        assert_eq!(tied.file_type, FileType::Vendored, "Equal specificity: includes should win");
    }

    #[test]
    fn test_complex_scenario_from_bug_report() {
        let workspace = TempDir::new().unwrap();
        for relative in
            ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"]
        {
            create_test_file(&workspace, relative, "<?php");
        }

        let configuration = create_test_config(&workspace, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);
        let db = DatabaseLoader::new(configuration).load().unwrap();

        let exact =
            db.files().find(|file| file.name.contains("src/b.php") || file.name.ends_with("b.php")).unwrap();
        assert_eq!(exact.file_type, FileType::Host, "src/b.php should be Host in bug scenario");

        let nested = db.files().find(|file| file.name.contains("d.php")).unwrap();
        assert_eq!(nested.file_type, FileType::Vendored, "src/c/d.php should be Vendored");

        let library = db.files().find(|file| file.name.contains("lib1.php")).unwrap();
        assert_eq!(library.file_type, FileType::Vendored, "vendor/lib1.php should be Vendored");
    }

    #[test]
    fn test_files_only_in_paths() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "src/a.php", "<?php");

        let configuration = create_test_config(&workspace, vec!["src/"], vec![]);
        let db = DatabaseLoader::new(configuration).load().unwrap();

        let only_host = db.files().find(|file| file.name.contains("a.php")).unwrap();
        assert_eq!(only_host.file_type, FileType::Host, "File only in paths should be Host");
    }

    #[test]
    fn test_files_only_in_includes() {
        let workspace = TempDir::new().unwrap();
        create_test_file(&workspace, "vendor/lib.php", "<?php");

        let configuration = create_test_config(&workspace, vec![], vec!["vendor/"]);
        let db = DatabaseLoader::new(configuration).load().unwrap();

        let only_vendored = db.files().find(|file| file.name.contains("lib.php")).unwrap();
        assert_eq!(only_vendored.file_type, FileType::Vendored, "File only in includes should be Vendored");
    }
}