impl FileClassifier {
#[must_use]
pub fn new() -> Self {
Self::default()
}
#[must_use]
pub fn should_parse_with_options(
&self,
path: &Path,
content: &[u8],
include_large_files: bool,
) -> ParseDecision {
if content.is_empty() {
return ParseDecision::Skip(SkipReason::EmptyFile);
}
if content.len() > self.max_file_size {
return ParseDecision::Skip(SkipReason::FileTooLarge);
}
if !include_large_files && content.len() > LARGE_FILE_THRESHOLD {
return ParseDecision::Skip(SkipReason::LargeFile);
}
if self.is_build_artifact(path) {
return ParseDecision::Skip(SkipReason::BuildArtifact);
}
if self.skip_vendor && self.is_vendor_path(path) {
return ParseDecision::Skip(SkipReason::VendorDirectory);
}
let sample = &content[..content.len().min(1024)];
if self.is_binary(sample) {
return ParseDecision::Skip(SkipReason::BinaryContent);
}
if let Ok(text) = std::str::from_utf8(content) {
if text.lines().any(|l| l.len() > self.max_line_length) {
return ParseDecision::Skip(SkipReason::LineTooLong);
}
}
if self.is_minified(sample) {
return ParseDecision::Skip(SkipReason::MinifiedContent);
}
ParseDecision::Parse
}
#[must_use]
pub fn should_parse(&self, path: &Path, content: &[u8]) -> ParseDecision {
self.should_parse_with_options(path, content, false)
}
fn is_vendor_path(&self, path: &Path) -> bool {
let path_str = path.to_string_lossy();
if self
.vendor_patterns
.iter()
.any(|pattern| path_str.contains(pattern))
{
return true;
}
if let Some(name) = path.file_name() {
let name_str = name.to_string_lossy();
for pattern in &VENDOR_RULES.file_patterns {
if let Ok(re) = Regex::new(pattern) {
if re.is_match(&name_str) {
return true;
}
}
}
}
false
}
fn is_binary(&self, sample: &[u8]) -> bool {
if sample.contains(&0) {
return true;
}
let non_printable = sample
.iter()
.filter(|&&b| b < 32 && b != b'\n' && b != b'\r' && b != b'\t')
.count();
non_printable as f64 / sample.len() as f64 > 0.3
}
fn is_minified(&self, sample: &[u8]) -> bool {
for sig in &VENDOR_RULES.content_signatures {
if sample.starts_with(sig) {
return true;
}
}
let entropy = calculate_shannon_entropy(sample);
let newline_count = sample.iter().filter(|&&b| b == b'\n').count();
let newline_ratio = newline_count as f64 / sample.len() as f64;
entropy > MINIFIED_ENTROPY_THRESHOLD || newline_ratio < 0.001
}
fn is_build_artifact(&self, path: &Path) -> bool {
let path_str = path.to_string_lossy();
BUILD_PATTERNS
.iter()
.any(|pattern| path_str.contains(pattern))
}
}
/// Shannon entropy of `data` in bits per byte.
///
/// An empty slice yields 0.0 (no symbols, no information).
fn calculate_shannon_entropy(data: &[u8]) -> f64 {
    // Build a byte-value histogram.
    let mut counts = [0u32; 256];
    for &byte in data {
        counts[usize::from(byte)] += 1;
    }
    let total = data.len() as f64;
    // Sum -p * log2(p) over the byte values that actually occur.
    counts
        .iter()
        .filter(|&&count| count > 0)
        .map(|&count| {
            let p = f64::from(count) / total;
            -p * p.log2()
        })
        .sum()
}