#![cfg_attr(coverage_nightly, coverage(off))]
use anyhow::Result;
use ignore::{DirEntry, WalkBuilder, WalkState};
use lazy_static::lazy_static;
use regex::RegexSet;
use serde::{Deserialize, Serialize};
use std::{
path::{Path, PathBuf},
sync::Arc,
};
use tracing::{debug, trace};
use crate::services::file_classifier::FileClassifier;
/// Category labels for files handled by the discovery service.
///
/// The code that assigns a category to a path is not part of this section of
/// the file; the per-variant notes below are inferred from the variant names
/// and should be confirmed against the categorization logic.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileCategory {
    /// Program source files (presumably).
    SourceCode,
    /// Documentation treated as essential (presumably README-style files).
    EssentialDoc,
    /// Build or project configuration files (presumably).
    BuildConfig,
    /// Machine-generated output (presumably).
    GeneratedOutput,
    /// Documentation aimed at developers (presumably).
    DevelopmentDoc,
    /// Artifacts produced by tests (presumably).
    TestArtifact,
}
lazy_static! {
    /// Regular expressions that flag a path string as an external repository.
    ///
    /// The set is compiled on first access. Compilation failure panics with
    /// "Invalid regex patterns"; the patterns are fixed literals, so that
    /// would indicate a programming error in this list.
    static ref EXTERNAL_REPO_PATTERNS: RegexSet = RegexSet::new([
        // Presumably repository URLs whose separators were flattened to underscores.
        r"https?___",
        r".*___github_com_.*",
        r".*___gitlab_com_.*",
        r".*___bitbucket_org_.*",
        // Literal `$$external$$` marker anywhere in the path.
        r".*\$\$external\$\$.*",
        // Directories named for vendored or external code.
        r".*/external_deps/.*",
        r".*/third_party_repos/.*",
    ])
    .expect("Invalid regex patterns");

    /// Extra directory fragments and file-name globs to ignore.
    ///
    /// NOTE(review): the matching code is not in this section of the file, so
    /// whether these are treated as globs, substrings, or override rules
    /// should be confirmed where this list is consumed.
    static ref ADDITIONAL_IGNORE_PATTERNS: Vec<&'static str> = vec![
        // Rust toolchain and dependency caches.
        "/.cargo/registry/",
        "/.cargo/git/",
        "/.rustup/",
        // Python environments and caches.
        "/site-packages/",
        "/.venv/",
        "/venv/",
        "/.tox/",
        "/__pycache__/",
        "/.mypy_cache/",
        "/.pytest_cache/",
        // JVM build tool directories.
        "/.gradle/",
        "/gradle/",
        "/.m2/",
        "/.ivy2/",
        "/.sbt/",
        "/.coursier/",
        // Other build outputs and caches.
        "/bazel-*/",
        "/.ccache/",
        "/.cache/",
        // Minified, bundled, or production-built web assets.
        "*.min.js",
        "*.min.css",
        "*.bundle.js",
        "*-bundle.js",
        "*.production.js",
        "*.prod.js",
        "*-min.js",
        "*.packed.js",
        "*.dist.js",
    ];
}
/// Serializable settings for a file discovery run.
///
/// The code that consumes these fields is not in this section of the file, so
/// the field notes describe intent as suggested by the names and should be
/// confirmed against the walker and filter implementations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileDiscoveryConfig {
    /// Optional cap on directory traversal depth (presumably `None` = unlimited).
    pub max_depth: Option<usize>,
    /// Whether symbolic links should be followed (presumably during the walk).
    pub follow_links: bool,
    /// Whether `.gitignore` rules should be honoured (presumably by the walker).
    pub respect_gitignore: bool,
    /// Whether paths that look like external repositories should be filtered out.
    pub filter_external_repos: bool,
    /// Caller-supplied ignore patterns; pattern syntax is defined by the consumer.
    pub custom_ignore_patterns: Vec<String>,
    /// Optional cap on the number of files (presumably `None` = unlimited).
    pub max_files: Option<usize>,
}
impl Default for FileDiscoveryConfig {
    /// Builds the default configuration: depth capped at 15, symlinks not
    /// followed, gitignore respected, external repositories filtered, no
    /// custom ignore patterns, and at most 50,000 files.
    fn default() -> Self {
        Self {
            max_depth: Some(15),
            follow_links: false,
            respect_gitignore: true,
            filter_external_repos: true,
            custom_ignore_patterns: Vec::new(),
            max_files: Some(50_000),
        }
    }
}
/// Holds the state for discovering files under a project root.
///
/// The discovery methods themselves are not defined in this section of the
/// file; only construction and the builder-style setters are.
pub struct ProjectFileDiscovery {
    /// Root directory supplied at construction.
    root: PathBuf,
    /// Active discovery settings.
    config: FileDiscoveryConfig,
    /// Shared file classifier, reference-counted so it can be reused by callers.
    classifier: Arc<FileClassifier>,
}
impl ProjectFileDiscovery {
    /// Creates a discovery instance for `root` with the default
    /// configuration and a default-constructed classifier.
    #[must_use]
    pub fn new(root: PathBuf) -> Self {
        let config = FileDiscoveryConfig::default();
        let classifier = Arc::new(FileClassifier::default());
        Self {
            root,
            config,
            classifier,
        }
    }

    /// Returns this instance with its configuration replaced by `config`.
    #[must_use]
    pub fn with_config(self, config: FileDiscoveryConfig) -> Self {
        Self { config, ..self }
    }

    /// Returns this instance with its classifier replaced by `classifier`.
    #[must_use]
    pub fn with_classifier(self, classifier: Arc<FileClassifier>) -> Self {
        Self { classifier, ..self }
    }
}
/// Serializable summary of a discovery run.
///
/// The code that fills in these fields is not in this section of the file;
/// the field notes follow the field names and should be confirmed there.
/// The derived `Default` yields zero counts and empty collections.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct DiscoveryStats {
    /// Number of files counted.
    pub total_files: usize,
    /// File counts keyed by a string (presumably the file extension).
    pub files_by_extension: std::collections::HashMap<String, usize>,
    /// File counts keyed by a string (presumably a category name).
    pub files_by_category: std::collections::HashMap<String, usize>,
    /// Paths collected by the run.
    pub discovered_paths: Vec<PathBuf>,
}
/// Tests directory entries against a set of external-repository path patterns.
pub struct ExternalRepoFilter {
    /// Compiled pattern set that paths are matched against.
    patterns: RegexSet,
}
impl ExternalRepoFilter {
    /// Creates a filter holding its own clone of the shared
    /// `EXTERNAL_REPO_PATTERNS` set.
    #[must_use]
    pub fn new() -> Self {
        let patterns = EXTERNAL_REPO_PATTERNS.clone();
        Self { patterns }
    }

    /// Returns `true` when the entry's path matches at least one pattern.
    ///
    /// The path is converted to text lossily, so non-UTF-8 bytes are replaced
    /// before matching.
    #[must_use]
    pub fn is_external_dependency(&self, entry: &DirEntry) -> bool {
        let candidate = entry.path();
        self.patterns.is_match(&candidate.to_string_lossy())
    }
}
impl Default for ExternalRepoFilter {
    /// Equivalent to calling `new()` on this type.
    fn default() -> Self {
        Self::new()
    }
}
// The rest of this module is split across sibling files that are textually
// spliced in here by `include!`, so their contents share this file's imports
// and can access the private items defined above. Paths are resolved relative
// to this source file.
include!("file_discovery_walker.rs");
include!("file_discovery_filters.rs");
include!("file_discovery_categorization.rs");
include!("file_discovery_tests.rs");