use std::borrow::Cow;
use std::collections::hash_map::Entry;
use std::ffi::OsString;
use std::path::Path;
use foldhash::HashMap;
use foldhash::HashSet;
use globset::GlobSet;
use rayon::prelude::*;
use walkdir::WalkDir;
use crate::Database;
use crate::DatabaseConfiguration;
use crate::error::DatabaseError;
use crate::exclusion::Exclusion;
use crate::file::File;
use crate::file::FileId;
use crate::file::FileType;
use crate::matcher::build_glob_set;
use crate::utils::read_file;
/// A loaded [`File`] paired with the specificity score of the configured root that produced it.
///
/// `DatabaseLoader::load_paths` attaches the score returned by
/// `DatabaseLoader::calculate_pattern_specificity` for the root pattern, and
/// `DatabaseLoader::load` compares the scores to decide whether a file that is reachable from
/// both `paths` and `includes` ends up as host or vendored.
#[derive(Debug)]
struct FileWithSpecificity {
/// The file read from disk, or built from the stdin override content.
file: File,
/// Score of the root pattern that yielded this file; larger values win over smaller ones.
specificity: usize,
}
/// Builder that scans the configured roots and produces a populated [`Database`].
///
/// Configure it with [`DatabaseLoader::new`] and the `with_*` / `add_*` methods, then call
/// [`DatabaseLoader::load`], which consumes the loader.
pub struct DatabaseLoader<'a> {
/// Optional existing database to fill; when absent, `load` creates a new one.
database: Option<Database<'a>>,
/// Workspace, roots, exclusions, extensions and glob settings driving the scan.
configuration: DatabaseConfiguration<'a>,
/// `(name, contents, file_type)` triples added to the database at the end of `load`.
memory_sources: Vec<(&'static str, &'static str, FileType)>,
/// `(logical name, content)` that replaces the on-disk content of the file with that
/// workspace-relative name, or is added as an ephemeral host file if no host file matches.
stdin_override: Option<(Cow<'a, str>, String)>,
}
impl<'a> DatabaseLoader<'a> {
/// Creates a loader for `configuration` with no pre-existing database, no in-memory
/// sources and no stdin override.
#[must_use]
pub fn new(configuration: DatabaseConfiguration<'a>) -> Self {
    Self {
        database: None,
        configuration,
        memory_sources: Vec::new(),
        stdin_override: None,
    }
}
#[must_use]
pub fn with_database(mut self, database: Database<'a>) -> Self {
self.database = Some(database);
self
}
#[must_use]
pub fn with_stdin_override(mut self, logical_name: impl Into<Cow<'a, str>>, content: String) -> Self {
self.stdin_override = Some((logical_name.into(), content));
self
}
/// Queues a static in-memory file to be added to the database at the end of
/// [`DatabaseLoader::load`].
pub fn add_memory_source(&mut self, name: &'static str, contents: &'static str, file_type: FileType) {
    let source = (name, contents, file_type);
    self.memory_sources.push(source);
}
pub fn load(mut self) -> Result<Database<'a>, DatabaseError> {
let mut db = self.database.take().unwrap_or_else(|| Database::new(self.configuration.clone()));
db.configuration = self.configuration.clone();
let extensions_set: HashSet<OsString> =
self.configuration.extensions.iter().map(|s| OsString::from(s.as_ref())).collect();
let glob_exclude_patterns: Vec<&str> = self
.configuration
.excludes
.iter()
.filter_map(|ex| match ex {
Exclusion::Pattern(pat) => Some(pat.as_ref()),
Exclusion::Path(_) => None,
})
.collect();
let glob_excludes = build_glob_set(glob_exclude_patterns.iter().copied(), self.configuration.glob)?;
let dir_prune_patterns: Vec<&str> = glob_exclude_patterns
.iter()
.filter_map(|pat| {
let stripped =
pat.strip_suffix("/**/*").or_else(|| pat.strip_suffix("/**")).or_else(|| pat.strip_suffix("/*"))?;
if stripped.is_empty() || stripped == "*" || stripped == "**" {
return None;
}
Some(stripped)
})
.collect();
let dir_prune_globs = build_glob_set(dir_prune_patterns.iter().copied(), self.configuration.glob)?;
let path_excludes: HashSet<_> = self
.configuration
.excludes
.iter()
.filter_map(|ex| match ex {
Exclusion::Path(p) => Some(p),
_ => None,
})
.collect();
let host_files_with_spec = self.load_paths(
&self.configuration.paths,
FileType::Host,
&extensions_set,
&glob_excludes,
&dir_prune_globs,
&path_excludes,
)?;
let vendored_files_with_spec = self.load_paths(
&self.configuration.includes,
FileType::Vendored,
&extensions_set,
&glob_excludes,
&dir_prune_globs,
&path_excludes,
)?;
let mut all_files: HashMap<FileId, File> = HashMap::default();
let mut file_decisions: HashMap<FileId, (FileType, usize)> = HashMap::default();
for file_with_spec in host_files_with_spec {
let file_id = file_with_spec.file.id;
let specificity = file_with_spec.specificity;
all_files.insert(file_id, file_with_spec.file);
file_decisions.insert(file_id, (FileType::Host, specificity));
}
if let Some((ref name, ref content)) = self.stdin_override {
let file = File::ephemeral(Cow::Owned(name.as_ref().to_string()), Cow::Owned(content.clone()));
let file_id = file.id;
if let Entry::Vacant(e) = all_files.entry(file_id) {
e.insert(file);
file_decisions.insert(file_id, (FileType::Host, usize::MAX));
}
}
for file_with_spec in vendored_files_with_spec {
let file_id = file_with_spec.file.id;
let vendored_specificity = file_with_spec.specificity;
all_files.entry(file_id).or_insert(file_with_spec.file);
match file_decisions.get(&file_id) {
Some((FileType::Host, host_specificity)) if vendored_specificity < *host_specificity => {
}
_ => {
file_decisions.insert(file_id, (FileType::Vendored, vendored_specificity));
}
}
}
db.reserve(file_decisions.len() + self.memory_sources.len());
for (file_id, (final_type, _)) in file_decisions {
if let Some(mut file) = all_files.remove(&file_id) {
file.file_type = final_type;
db.add(file);
}
}
for (name, contents, file_type) in self.memory_sources {
let file = File::new(Cow::Borrowed(name), file_type, None, Cow::Borrowed(contents));
db.add(file);
}
Ok(db)
}
fn load_paths(
&self,
roots: &[Cow<'a, str>],
file_type: FileType,
extensions: &HashSet<OsString>,
glob_excludes: &GlobSet,
dir_prune_globs: &GlobSet,
path_excludes: &HashSet<&Cow<'a, Path>>,
) -> Result<Vec<FileWithSpecificity>, DatabaseError> {
let canonical_workspace =
self.configuration.workspace.canonicalize().unwrap_or_else(|_| self.configuration.workspace.to_path_buf());
let canonical_excludes: Vec<String> = path_excludes
.iter()
.filter_map(|ex| {
let p = if Path::new(ex.as_ref()).is_absolute() {
ex.as_ref().to_path_buf()
} else {
self.configuration.workspace.join(ex.as_ref())
};
p.canonicalize().ok()?.into_os_string().into_string().ok()
})
.collect();
let workspace_relative_str = |path: &Path| -> String {
let rel = path.strip_prefix(canonical_workspace.as_path()).unwrap_or(path);
let s = rel.to_string_lossy();
#[cfg(windows)]
{
s.replace('\\', "/")
}
#[cfg(not(windows))]
{
s.into_owned()
}
};
let mut paths_to_process: Vec<(std::path::PathBuf, usize)> = Vec::new();
for root in roots {
let resolved_path = if Path::new(root.as_ref()).is_absolute() {
Path::new(root.as_ref()).to_path_buf()
} else {
self.configuration.workspace.join(root.as_ref())
};
let is_glob_pattern = !resolved_path.exists()
&& (root.contains('*') || root.contains('?') || root.contains('[') || root.contains('{'));
let specificity = Self::calculate_pattern_specificity(root.as_ref());
if is_glob_pattern {
let pattern = if Path::new(root.as_ref()).is_absolute() {
root.to_string()
} else {
self.configuration.workspace.join(root.as_ref()).to_string_lossy().to_string()
};
match glob::glob(&pattern) {
Ok(entries) => {
for entry in entries {
match entry {
Ok(path) => {
if path.is_file() {
let canonical = path.canonicalize().unwrap_or(path);
paths_to_process.push((canonical, specificity));
}
}
Err(e) => {
tracing::warn!("Failed to read glob entry: {}", e);
}
}
}
}
Err(e) => {
return Err(DatabaseError::Glob(e.to_string()));
}
}
} else {
let canonical_root = resolved_path.canonicalize().unwrap_or(resolved_path);
let has_dir_prunes = !dir_prune_globs.is_empty();
let has_path_prunes = !canonical_excludes.is_empty();
let walker = WalkDir::new(&canonical_root).into_iter().filter_entry(|entry| {
if entry.depth() == 0 || !entry.file_type().is_dir() {
return true;
}
let path = entry.path();
if has_path_prunes
&& let Some(p) = path.to_str()
&& canonical_excludes.iter().any(|excl| {
p.starts_with(excl.as_str())
&& matches!(p.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
})
{
return false;
}
if has_dir_prunes
&& (dir_prune_globs.is_match(path) || dir_prune_globs.is_match(workspace_relative_str(path)))
{
return false;
}
true
});
for entry in walker.filter_map(Result::ok) {
if entry.file_type().is_file() {
paths_to_process.push((entry.into_path(), specificity));
}
}
}
}
let has_path_excludes = !canonical_excludes.is_empty();
let has_glob_excludes = !glob_excludes.is_empty();
let files: Vec<FileWithSpecificity> = paths_to_process
.into_par_iter()
.filter_map(|(path, specificity)| {
if has_glob_excludes
&& (glob_excludes.is_match(&path) || glob_excludes.is_match(workspace_relative_str(&path)))
{
return None;
}
let ext = path.extension()?;
if !extensions.contains(ext) {
return None;
}
if has_path_excludes {
let excluded = path.to_str().is_some_and(|s| {
canonical_excludes.iter().any(|excl| {
s.starts_with(excl.as_str())
&& matches!(s.as_bytes().get(excl.len()), None | Some(&b'/' | &b'\\'))
})
});
if excluded {
return None;
}
}
let workspace = canonical_workspace.as_path();
#[cfg(windows)]
let logical_name = path
.strip_prefix(workspace)
.unwrap_or_else(|_| path.as_path())
.to_string_lossy()
.replace('\\', "/");
#[cfg(not(windows))]
let logical_name =
path.strip_prefix(workspace).unwrap_or(path.as_path()).to_string_lossy().into_owned();
if let Some((ref override_name, ref override_content)) = self.stdin_override
&& override_name.as_ref() == logical_name
{
let file = File::new(
Cow::Owned(logical_name),
file_type,
Some(path.clone()),
Cow::Owned(override_content.clone()),
);
return Some(Ok(FileWithSpecificity { file, specificity }));
}
match read_file(workspace, &path, file_type) {
Ok(file) => Some(Ok(FileWithSpecificity { file, specificity })),
Err(e) => Some(Err(e)),
}
})
.collect::<Result<Vec<FileWithSpecificity>, _>>()?;
Ok(files)
}
/// Scores how precisely `pattern` targets a file; a higher score is more specific.
///
/// * Glob patterns score `10` per path component that contains no glob metacharacter.
/// * Patterns that name a file score `1000` per path component.
/// * Everything else is treated as a directory and scores `100` per path component.
///
/// A pattern ending in a path separator always names a directory, even when its last
/// component contains a dot (`src/v1.2/`).
fn calculate_pattern_specificity(pattern: &str) -> usize {
    const GLOB_METACHARACTERS: [char; 4] = ['*', '?', '[', '{'];

    let pattern_path = Path::new(pattern);

    if pattern.contains(GLOB_METACHARACTERS) {
        let literal_components = pattern_path
            .components()
            .filter(|component| !component.as_os_str().to_string_lossy().contains(GLOB_METACHARACTERS))
            .count();
        return literal_components * 10;
    }

    let component_count = pattern_path.components().count();

    // Without this guard, `extension()` would report `2` for `src/v1.2/` and the directory
    // would be scored as an exact file.
    let names_directory = pattern.ends_with(std::path::is_separator);
    // NOTE: `is_file()` on a relative pattern is resolved against the process working
    // directory; the extension checks are what identify files in the common case.
    let names_file = !names_directory
        && (pattern_path.is_file() || pattern_path.extension().is_some() || pattern.ends_with(".php"));

    if names_file { component_count * 1000 } else { component_count * 100 }
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::DatabaseReader;
use crate::GlobSettings;
use std::borrow::Cow;
use tempfile::TempDir;
/// Builds a configuration rooted at `temp_dir` that scans `paths` as host roots and
/// `includes` as vendored roots, accepts only `php` files and has no exclusions.
///
/// Forward slashes in the given roots are replaced by the platform's main separator.
fn create_test_config(temp_dir: &TempDir, paths: Vec<&str>, includes: Vec<&str>) -> DatabaseConfiguration<'static> {
    fn to_native(entry: &str) -> Cow<'static, str> {
        Cow::Owned(entry.replace('/', std::path::MAIN_SEPARATOR_STR))
    }

    let workspace = temp_dir.path().to_path_buf();
    DatabaseConfiguration {
        workspace: Cow::Owned(workspace),
        paths: paths.into_iter().map(to_native).collect(),
        includes: includes.into_iter().map(to_native).collect(),
        excludes: vec![],
        extensions: vec![Cow::Borrowed("php")],
        glob: GlobSettings::default(),
    }
}
/// Writes `content` to `relative_path` inside `temp_dir`, creating missing parent
/// directories first. Panics on any I/O failure.
fn create_test_file(temp_dir: &TempDir, relative_path: &str, content: &str) {
    let target = temp_dir.path().join(relative_path);
    if let Some(directory) = target.parent() {
        std::fs::create_dir_all(directory).unwrap();
    }
    std::fs::write(&target, content).unwrap();
}
/// A two-component exact file pattern scores at least 2000.
#[test]
fn test_specificity_calculation_exact_file() {
    let pattern = "src/b.php";
    let spec = DatabaseLoader::calculate_pattern_specificity(pattern);
    assert!(spec >= 2000, "Exact file should have high specificity, got {spec}");
}
/// A directory pattern scores in the hundreds, below any exact file.
#[test]
fn test_specificity_calculation_directory() {
    let pattern = "src/";
    let spec = DatabaseLoader::calculate_pattern_specificity(pattern);
    let moderate = 100..1000;
    assert!(moderate.contains(&spec), "Directory should have moderate specificity, got {spec}");
}
/// A glob pattern scores below any directory pattern.
#[test]
fn test_specificity_calculation_glob() {
    let pattern = "src/*.php";
    let spec = DatabaseLoader::calculate_pattern_specificity(pattern);
    assert!(spec < 100, "Glob pattern should have low specificity, got {spec}");
}
/// More path components mean a higher score for directory patterns.
#[test]
fn test_specificity_calculation_deeper_path() {
    let score = DatabaseLoader::calculate_pattern_specificity;
    let (shallow_spec, deep_spec) = (score("src/"), score("src/foo/bar/"));
    assert!(deep_spec > shallow_spec, "Deeper path should have higher specificity");
}
/// An exact file in `paths` stays host even though its directory is listed in `includes`;
/// its sibling becomes vendored.
#[test]
fn test_exact_file_vs_directory() {
    let workspace = TempDir::new().unwrap();
    for relative in ["src/b.php", "src/a.php"] {
        create_test_file(&workspace, relative, "<?php");
    }

    let config = create_test_config(&workspace, vec!["src/b.php"], vec!["src/"]);
    let db = DatabaseLoader::new(config).load().unwrap();
    let file_type_of = |needle: &str| db.files().find(|f| f.name.contains(needle)).unwrap().file_type;

    assert_eq!(file_type_of("b.php"), FileType::Host, "src/b.php should be Host (exact file beats directory)");
    assert_eq!(file_type_of("a.php"), FileType::Vendored, "src/a.php should be Vendored");
}
/// A deeper host directory beats a shallower vendored directory.
#[test]
fn test_deeper_vs_shallower_directory() {
    let workspace = TempDir::new().unwrap();
    create_test_file(&workspace, "src/foo/bar.php", "<?php");

    let config = create_test_config(&workspace, vec!["src/foo/"], vec!["src/"]);
    let db = DatabaseLoader::new(config).load().unwrap();

    let resolved = db.files().find(|f| f.name.contains("bar.php")).unwrap().file_type;
    assert_eq!(resolved, FileType::Host, "Deeper directory pattern should win");
}
/// An exact host file beats a vendored glob that also matches it.
#[test]
fn test_exact_file_vs_glob() {
    let workspace = TempDir::new().unwrap();
    create_test_file(&workspace, "src/b.php", "<?php");

    let config = create_test_config(&workspace, vec!["src/b.php"], vec!["src/*.php"]);
    let db = DatabaseLoader::new(config).load().unwrap();

    let resolved = db.files().find(|f| f.name.contains("b.php")).unwrap().file_type;
    assert_eq!(resolved, FileType::Host, "Exact file should beat glob pattern");
}
/// When the same directory is listed in both `paths` and `includes`, the file is vendored.
#[test]
fn test_equal_specificity_includes_wins() {
    let workspace = TempDir::new().unwrap();
    create_test_file(&workspace, "src/a.php", "<?php");

    let config = create_test_config(&workspace, vec!["src/"], vec!["src/"]);
    let db = DatabaseLoader::new(config).load().unwrap();

    let resolved = db.files().find(|f| f.name.contains("a.php")).unwrap().file_type;
    assert_eq!(resolved, FileType::Vendored, "Equal specificity: includes should win");
}
/// One exact host file alongside several overlapping vendored directories: the exact file
/// stays host, everything else is vendored.
#[test]
fn test_complex_scenario_from_bug_report() {
    let workspace = TempDir::new().unwrap();
    let fixtures =
        ["src/a.php", "src/b.php", "src/c/d.php", "src/c/e.php", "vendor/lib1.php", "vendor/lib2.php"];
    for relative in fixtures {
        create_test_file(&workspace, relative, "<?php");
    }

    let config = create_test_config(&workspace, vec!["src/b.php"], vec!["vendor", "src/c", "src/"]);
    let db = DatabaseLoader::new(config).load().unwrap();

    let b_type =
        db.files().find(|f| f.name.contains("src/b.php") || f.name.ends_with("b.php")).unwrap().file_type;
    assert_eq!(b_type, FileType::Host, "src/b.php should be Host in bug scenario");

    let file_type_of = |needle: &str| db.files().find(|f| f.name.contains(needle)).unwrap().file_type;
    assert_eq!(file_type_of("d.php"), FileType::Vendored, "src/c/d.php should be Vendored");
    assert_eq!(file_type_of("lib1.php"), FileType::Vendored, "vendor/lib1.php should be Vendored");
}
/// A file reachable only through `paths` is host.
#[test]
fn test_files_only_in_paths() {
    let workspace = TempDir::new().unwrap();
    create_test_file(&workspace, "src/a.php", "<?php");

    let config = create_test_config(&workspace, vec!["src/"], vec![]);
    let db = DatabaseLoader::new(config).load().unwrap();

    let resolved = db.files().find(|f| f.name.contains("a.php")).unwrap().file_type;
    assert_eq!(resolved, FileType::Host, "File only in paths should be Host");
}
/// A file reachable only through `includes` is vendored.
#[test]
fn test_files_only_in_includes() {
    let workspace = TempDir::new().unwrap();
    create_test_file(&workspace, "vendor/lib.php", "<?php");

    let config = create_test_config(&workspace, vec![], vec!["vendor/"]);
    let db = DatabaseLoader::new(config).load().unwrap();

    let resolved = db.files().find(|f| f.name.contains("lib.php")).unwrap().file_type;
    assert_eq!(resolved, FileType::Vendored, "File only in includes should be Vendored");
}
/// The stdin override content replaces the on-disk content of the matching file.
#[test]
fn test_stdin_override_replaces_file_content() {
    let workspace = TempDir::new().unwrap();
    create_test_file(&workspace, "src/foo.php", "<?php\n// on disk");

    let config = create_test_config(&workspace, vec!["src/"], vec![]);
    let db = DatabaseLoader::new(config)
        .with_stdin_override("src/foo.php", String::from("<?php\n// from stdin"))
        .load()
        .unwrap();

    let loaded = db.files().find(|f| f.name.contains("foo.php")).unwrap();
    assert_eq!(
        loaded.contents.as_ref(),
        "<?php\n// from stdin",
        "stdin override content should be used instead of disk"
    );
}
/// A workspace-relative exclude glob removes matching files found under an absolute walk root.
#[test]
fn test_glob_excludes_match_workspace_relative_paths() {
    let workspace = TempDir::new().unwrap();
    let fixtures = [
        "src/Absences/Foo/Foo.php",
        "src/Absences/Test/Faker/Provider/AbsencesProvider.php",
        "src/Calendar/Test/Helper.php",
    ];
    for relative in fixtures {
        create_test_file(&workspace, relative, "<?php");
    }

    let mut config = create_test_config(&workspace, vec!["src"], vec![]);
    config.excludes.push(Exclusion::Pattern(Cow::Borrowed("src/*/Test/**")));
    let db = DatabaseLoader::new(config).load().unwrap();

    let names: Vec<String> = db.files().map(|f| f.name.to_string()).collect();
    assert!(names.iter().any(|n| n.ends_with("src/Absences/Foo/Foo.php")), "non-Test file should be loaded");
    for excluded_dir in ["src/Absences/Test/", "src/Calendar/Test/"] {
        assert!(
            !names.iter().any(|n| n.contains(excluded_dir)),
            "files under src/*/Test/** should be excluded, got {names:?}"
        );
    }
}
/// Exclude globs written against the absolute path (leading `*/`) keep working.
#[test]
fn test_glob_excludes_match_legacy_absolute_prefix_patterns() {
    let workspace = TempDir::new().unwrap();
    for relative in ["packages/foo/src/main.php", "packages/foo/vendor/lib.php"] {
        create_test_file(&workspace, relative, "<?php");
    }

    let mut config = create_test_config(&workspace, vec!["packages"], vec![]);
    config.excludes.push(Exclusion::Pattern(Cow::Borrowed("*/packages/**/vendor/*")));
    let db = DatabaseLoader::new(config).load().unwrap();

    let names: Vec<String> = db.files().map(|f| f.name.to_string()).collect();
    let main_loaded = names.iter().any(|n| n.ends_with("packages/foo/src/main.php"));
    assert!(main_loaded);
    assert!(
        !names.iter().any(|n| n.contains("/vendor/")),
        "legacy `*/packages/**/vendor/*` style should still exclude vendor files, got {names:?}"
    );
}
/// A `dir/**` style exclude removes every file below the matching directories while
/// leaving their siblings loaded.
#[test]
fn test_glob_dir_prune_skips_relative_directories() {
    let workspace = TempDir::new().unwrap();
    let fixtures = [
        "vendor/slevomat/coding-standard/main.php",
        "vendor/slevomat/coding-standard/tests/Sniffs/Foo.php",
        "vendor/another/lib.php",
    ];
    for relative in fixtures {
        create_test_file(&workspace, relative, "<?php");
    }

    let mut config = create_test_config(&workspace, vec![], vec!["vendor"]);
    config.excludes.push(Exclusion::Pattern(Cow::Borrowed("vendor/**/tests/**")));
    let db = DatabaseLoader::new(config).load().unwrap();

    let names: Vec<String> = db.files().map(|f| f.name.to_string()).collect();
    for kept in ["vendor/slevomat/coding-standard/main.php", "vendor/another/lib.php"] {
        assert!(names.iter().any(|n| n.ends_with(kept)));
    }
    assert!(
        !names.iter().any(|n| n.contains("/tests/")),
        "files under vendor/**/tests/** should be pruned, got {names:?}"
    );
}
/// A stdin override whose name matches no file on disk is still added, as a host file.
#[test]
fn test_stdin_override_adds_file_when_not_on_disk() {
    let workspace = TempDir::new().unwrap();
    create_test_file(&workspace, "src/.gitkeep", "");

    let config = create_test_config(&workspace, vec!["src/"], vec![]);
    let buffer = String::from("<?php\n// unsaved buffer");
    let db = DatabaseLoader::new(config).with_stdin_override("src/unsaved.php", buffer).load().unwrap();

    let unsaved = db.files().find(|f| f.name.contains("unsaved.php")).unwrap();
    assert_eq!(unsaved.file_type, FileType::Host);
    assert_eq!(unsaved.contents.as_ref(), "<?php\n// unsaved buffer");
}
}