agnix_core/
pipeline.rs

1//! Validation pipeline: file and project validation.
2
3#[cfg(feature = "filesystem")]
4use std::collections::HashMap;
5use std::collections::HashSet;
6use std::path::Path;
7#[cfg(feature = "filesystem")]
8use std::path::PathBuf;
9#[cfg(feature = "filesystem")]
10use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
11
12#[cfg(feature = "filesystem")]
13use rayon::iter::ParallelBridge;
14#[cfg(feature = "filesystem")]
15use rayon::prelude::*;
16#[cfg(feature = "filesystem")]
17use rust_i18n::t;
18
19use crate::config::LintConfig;
20use crate::diagnostics::Diagnostic;
21#[cfg(feature = "filesystem")]
22use crate::diagnostics::{ConfigError, CoreError, LintResult, ValidationError, ValidationOutcome};
23use crate::file_types::{FileType, detect_file_type};
24#[cfg(feature = "filesystem")]
25use crate::file_utils;
26use crate::parsers::frontmatter::normalize_line_endings;
27use crate::registry::ValidatorRegistry;
28#[cfg(feature = "filesystem")]
29use crate::rules::project_level::run_project_level_checks;
30#[cfg(feature = "filesystem")]
31use crate::schemas;
32
/// Result of validating a project, including diagnostics and metadata.
///
/// All fields are public. Use [`ValidationResult::new`] for convenient construction when only
/// `diagnostics` and `files_checked` are known; struct literal construction is also supported.
/// Note: adding a new public field in the future would be a breaking change for struct literals
/// and exhaustive destructuring patterns. For forward-compatible code, prefer
/// [`ValidationResult::new`] and use `..` in destructure patterns.
#[derive(Debug, Clone)]
pub struct ValidationResult {
    /// Diagnostics found during validation.
    pub diagnostics: Vec<Diagnostic>,
    /// Number of files that were checked (excludes Unknown file types).
    pub files_checked: usize,
    /// Wall-clock time spent in validation, in milliseconds.
    ///
    /// `None` until a timing is recorded, e.g. via [`ValidationResult::with_timing`].
    pub validation_time_ms: Option<u64>,
    /// Number of validator instances registered in the registry (not the count of validators executed).
    /// The field name uses "factories" for backward compatibility; since v0.12.2 this counts
    /// pre-built cached instances rather than factory invocations.
    ///
    /// Starts at `0` when the value is built through [`ValidationResult::new`].
    pub validator_factories_registered: usize,
}
53
54impl ValidationResult {
55    /// Create a new `ValidationResult` with the given diagnostics and file count.
56    ///
57    /// Metadata fields (`validation_time_ms`, `validator_factories_registered`) default to
58    /// `None` / `0` and can be set with the builder-style helpers.
59    pub fn new(diagnostics: Vec<Diagnostic>, files_checked: usize) -> Self {
60        Self {
61            diagnostics,
62            files_checked,
63            validation_time_ms: None,
64            validator_factories_registered: 0,
65        }
66    }
67
68    /// Set the wall-clock validation time (builder pattern).
69    pub fn with_timing(mut self, ms: u64) -> Self {
70        self.validation_time_ms = Some(ms);
71        self
72    }
73
74    /// Set the total number of validator factories registered (builder pattern).
75    pub fn with_validator_factories_registered(mut self, count: usize) -> Self {
76        self.validator_factories_registered = count;
77        self
78    }
79}
80
/// Pre-compiled file inclusion/exclusion patterns for efficient matching.
///
/// Used internally by `validate_project_with_registry` to avoid re-compiling
/// glob patterns for every file during parallel validation.
#[derive(Default)]
pub(crate) struct CompiledFilesConfig {
    /// Patterns whose matches are resolved as [`FileType::ClaudeMd`].
    include_as_memory: Vec<glob::Pattern>,
    /// Patterns whose matches are resolved as [`FileType::GenericMarkdown`].
    include_as_generic: Vec<glob::Pattern>,
    /// Patterns whose matches are resolved as [`FileType::Unknown`]; these are
    /// consulted before either include list.
    exclude: Vec<glob::Pattern>,
}
91
92impl CompiledFilesConfig {
93    fn is_empty(&self) -> bool {
94        self.include_as_memory.is_empty()
95            && self.include_as_generic.is_empty()
96            && self.exclude.is_empty()
97    }
98}
99
/// Compile glob patterns, reporting every invalid pattern as a [`Diagnostic`]
/// warning rather than printing to stderr.
///
/// The first element of the returned tuple holds the patterns that compiled
/// successfully, in input order. The second holds one warning (rule
/// `config::glob`) per pattern that failed to compile; those patterns are left
/// out of the compiled list.
///
/// `config_file` becomes the diagnostic's `file` field - callers should pass an
/// absolute path (e.g. `root_dir.join(".agnix.toml")`) so these warnings line up
/// with other config-level diagnostics in the pipeline.
#[cfg(feature = "filesystem")]
fn compile_patterns_with_diagnostics(
    patterns: &[String],
    config_file: &Path,
) -> (Vec<glob::Pattern>, Vec<Diagnostic>) {
    let mut compiled = Vec::with_capacity(patterns.len());
    let mut diagnostics = Vec::new();

    for raw in patterns {
        // Accept Windows-style separators in the configured pattern.
        let candidate = raw.replace('\\', "/");
        let err = match glob::Pattern::new(&candidate) {
            Ok(pattern) => {
                compiled.push(pattern);
                continue;
            }
            Err(err) => err,
        };

        // The warning quotes the pattern exactly as it was written in the config.
        let warning = Diagnostic::warning(
            config_file.to_path_buf(),
            1,
            0,
            "config::glob",
            t!(
                "rules.invalid_glob_pattern",
                pattern = raw,
                error = err.to_string()
            ),
        )
        .with_suggestion(t!("rules.invalid_glob_pattern_suggestion"));
        diagnostics.push(warning);
    }

    (compiled, diagnostics)
}
141
142/// Compile glob patterns leniently, discarding diagnostics for invalid patterns.
143///
144/// Used in code paths where diagnostics cannot be surfaced (e.g. the public
145/// [`resolve_file_type`] API, which must not change its return type). Invalid
146/// patterns are silently skipped.
147fn compile_patterns_lenient(patterns: &[String]) -> Vec<glob::Pattern> {
148    patterns
149        .iter()
150        .filter_map(|p| {
151            let normalized = p.replace('\\', "/");
152            glob::Pattern::new(&normalized).ok()
153        })
154        .collect()
155}
156
157fn compile_files_config(files: &crate::config::FilesConfig) -> CompiledFilesConfig {
158    CompiledFilesConfig {
159        include_as_memory: compile_patterns_lenient(&files.include_as_memory),
160        include_as_generic: compile_patterns_lenient(&files.include_as_generic),
161        exclude: compile_patterns_lenient(&files.exclude),
162    }
163}
164
/// Compile `[files]` config patterns and report malformed ones as diagnostics.
///
/// Used by [`validate_project_with_registry`], which can hand diagnostics back
/// to its caller. The returned diagnostics are ordered by list:
/// `include_as_memory` first, then `include_as_generic`, then `exclude`.
///
/// `config_file` is passed through to [`compile_patterns_with_diagnostics`] and
/// ends up in each diagnostic's `file` field.
#[cfg(feature = "filesystem")]
fn compile_files_config_with_diagnostics(
    files: &crate::config::FilesConfig,
    config_file: &Path,
) -> (CompiledFilesConfig, Vec<Diagnostic>) {
    // The first list's diagnostics seed the combined vector; the others are appended.
    let (include_as_memory, mut all_diagnostics) =
        compile_patterns_with_diagnostics(&files.include_as_memory, config_file);
    let (include_as_generic, generic_diags) =
        compile_patterns_with_diagnostics(&files.include_as_generic, config_file);
    let (exclude, exclude_diags) = compile_patterns_with_diagnostics(&files.exclude, config_file);

    all_diagnostics.extend(generic_diags);
    all_diagnostics.extend(exclude_diags);

    let compiled = CompiledFilesConfig {
        include_as_memory,
        include_as_generic,
        exclude,
    };
    (compiled, all_diagnostics)
}
200
/// Match options for file inclusion/exclusion glob patterns.
///
/// `require_literal_separator` is `true` so that `*` only matches within a
/// single path component. Users must use `**` for recursive matching (e.g.
/// `dir/**/*.md` instead of `dir/*.md` to match nested files).
///
/// Matching is case-sensitive. `require_literal_leading_dot` is `false`, so
/// wildcards are also allowed to match path components that begin with a `.`.
const FILES_MATCH_OPTIONS: glob::MatchOptions = glob::MatchOptions {
    case_sensitive: true,
    require_literal_separator: true,
    require_literal_leading_dot: false,
};
211
212fn resolve_with_compiled(
213    path: &Path,
214    root_dir: Option<&Path>,
215    compiled: &CompiledFilesConfig,
216) -> FileType {
217    if compiled.is_empty() {
218        return detect_file_type(path);
219    }
220
221    let rel_path = if let Some(root) = root_dir {
222        normalize_rel_path(path, root)
223    } else {
224        // No root_dir: use filename only
225        path.file_name()
226            .and_then(|n| n.to_str())
227            .unwrap_or("")
228            .to_string()
229    };
230
231    // Priority: exclude > include_as_memory > include_as_generic > detect
232    for pattern in &compiled.exclude {
233        if pattern.matches_with(&rel_path, FILES_MATCH_OPTIONS) {
234            return FileType::Unknown;
235        }
236    }
237    for pattern in &compiled.include_as_memory {
238        if pattern.matches_with(&rel_path, FILES_MATCH_OPTIONS) {
239            return FileType::ClaudeMd;
240        }
241    }
242    for pattern in &compiled.include_as_generic {
243        if pattern.matches_with(&rel_path, FILES_MATCH_OPTIONS) {
244            return FileType::GenericMarkdown;
245        }
246    }
247
248    detect_file_type(path)
249}
250
251/// Resolve file type with config-based overrides.
252///
253/// Applies `[files]` config patterns on top of [`detect_file_type`]:
254/// - `files.exclude` patterns map to [`FileType::Unknown`] (skip validation)
255/// - `files.include_as_memory` patterns map to [`FileType::ClaudeMd`]
256/// - `files.include_as_generic` patterns map to [`FileType::GenericMarkdown`]
257/// - Otherwise falls through to [`detect_file_type`]
258///
259/// Priority: exclude > include_as_memory > include_as_generic > built-in detection.
260///
261/// When no `[files]` patterns are configured, this is equivalent to
262/// calling `detect_file_type(path)` directly.
263pub fn resolve_file_type(path: &Path, config: &LintConfig) -> FileType {
264    let files = config.files_config();
265    if files.include_as_memory.is_empty()
266        && files.include_as_generic.is_empty()
267        && files.exclude.is_empty()
268    {
269        return detect_file_type(path);
270    }
271
272    // Compile patterns on-demand for single-file validation. Invalid patterns
273    // are silently skipped (no diagnostics) because this public API returns only
274    // a FileType. Use validate_project() for diagnostic surfacing, or
275    // LintConfigBuilder::build() / LintConfig::validate() at config load time
276    // for strict validation.
277    let compiled = compile_files_config(files);
278    resolve_with_compiled(path, config.root_dir().map(|p| p.as_path()), &compiled)
279}
280
/// Validate a single file using a freshly built default registry.
///
/// Yields [`ValidationOutcome::Success`] carrying the diagnostics when
/// validation runs, [`ValidationOutcome::IoError`] when the file cannot be
/// read, and [`ValidationOutcome::Skipped`] when the file type is unknown.
///
/// The `Err` path is reserved for config-level errors only (e.g. those that
/// occur during `LintConfig` construction).
///
/// Note: a new [`ValidatorRegistry`] is created on every call. When validating
/// many files, build one registry up front and call
/// [`validate_file_with_registry()`] instead for significantly better
/// performance. Unlike [`validate_file_with_registry()`], this function also
/// applies `config.rules().disabled_validators` to the new registry when it
/// is constructed.
#[cfg(feature = "filesystem")]
pub fn validate_file(path: &Path, config: &LintConfig) -> LintResult<ValidationOutcome> {
    let mut registry = ValidatorRegistry::with_defaults();
    config.rules().disabled_validators.iter().for_each(|name| {
        registry.disable_validator_owned(name);
    });
    validate_file_with_registry(path, config, &registry)
}
304
/// Validate a single file against a caller-supplied validator registry.
///
/// Yields [`ValidationOutcome::Success`] carrying the diagnostics when
/// validation runs, [`ValidationOutcome::IoError`] when the file cannot be
/// read, and [`ValidationOutcome::Skipped`] when the file type is unknown.
///
/// The `Err` path is reserved for config-level errors only.
///
/// `config.rules().disabled_validators` is honoured at runtime, so one
/// `ValidatorRegistry` can be shared across configs that differ only in
/// their disabled-validator sets (e.g. the LSP path). This is consistent
/// with [`validate_content()`].
#[cfg(feature = "filesystem")]
pub fn validate_file_with_registry(
    path: &Path,
    config: &LintConfig,
    registry: &ValidatorRegistry,
) -> LintResult<ValidationOutcome> {
    // Resolve the type first (config overrides included), then hand off.
    validate_file_with_type(path, resolve_file_type(path, config), config, registry)
}
326
/// Validate a single file whose [`FileType`] has already been resolved.
///
/// Skipping the resolution step avoids re-compiling `[files]` glob patterns
/// when the caller already knows the type (e.g. `validate_project_with_registry`,
/// which pre-compiles the patterns once for the whole walk).
///
/// Unknown file types are skipped without reading the file. A file-level read
/// failure is reported as [`ValidationOutcome::IoError`]; any other error from
/// the read is returned as `Err`.
#[cfg(feature = "filesystem")]
fn validate_file_with_type(
    path: &Path,
    file_type: FileType,
    config: &LintConfig,
    registry: &ValidatorRegistry,
) -> LintResult<ValidationOutcome> {
    if file_type == FileType::Unknown {
        return Ok(ValidationOutcome::Skipped);
    }

    let raw_content = match file_utils::safe_read_file(path) {
        Err(CoreError::File(file_error)) => return Ok(ValidationOutcome::IoError(file_error)),
        Err(other) => return Err(other),
        Ok(text) => text,
    };
    let content = normalize_line_endings(&raw_content);

    // Names of validators switched off in this config. An empty set skips
    // nothing, so every registered validator for the type runs.
    let skipped: HashSet<&str> = config
        .rules()
        .disabled_validators
        .iter()
        .map(|s| s.as_str())
        .collect();

    let mut diagnostics = Vec::new();
    for validator in registry.validators_for(file_type) {
        if !skipped.contains(validator.name()) {
            diagnostics.extend(validator.validate(path, &content, config));
        }
    }

    Ok(ValidationOutcome::Success(diagnostics))
}
372
373/// Validate in-memory content for a given path.
374///
375/// This function performs no filesystem I/O -- the content is provided directly.
376/// File type is resolved from the path using [`resolve_file_type`], then all
377/// matching validators are run against the content.
378///
379/// Returns an empty `Vec` if the file type is unknown.
380pub fn validate_content(
381    path: &Path,
382    content: &str,
383    config: &LintConfig,
384    registry: &ValidatorRegistry,
385) -> Vec<Diagnostic> {
386    let file_type = resolve_file_type(path, config);
387    if file_type == FileType::Unknown {
388        return vec![];
389    }
390
391    let content = normalize_line_endings(content);
392
393    let validators = registry.validators_for(file_type);
394    let disabled = &config.rules().disabled_validators;
395    let mut diagnostics = Vec::new();
396
397    // Runtime disabled_validators check: honours per-config disabled_validators
398    // without requiring them to be pre-applied to the registry. The LSP creates
399    // a single shared registry via with_defaults() and relies on this check to
400    // respect per-workspace disabled_validators from the user's LintConfig.
401    if disabled.is_empty() {
402        for validator in validators {
403            diagnostics.extend(validator.validate(path, &content, config));
404        }
405    } else {
406        let disabled_set: HashSet<&str> = disabled.iter().map(|s| s.as_str()).collect();
407        for validator in validators {
408            if disabled_set.contains(validator.name()) {
409                continue;
410            }
411            diagnostics.extend(validator.validate(path, &content, config));
412        }
413    }
414
415    diagnostics
416}
417
/// Main entry point for validating a project.
///
/// Builds a default [`ValidatorRegistry`], disables every validator named in
/// `config.rules().disabled_validators`, and delegates to
/// [`validate_project_with_registry`].
#[cfg(feature = "filesystem")]
pub fn validate_project(path: &Path, config: &LintConfig) -> LintResult<ValidationResult> {
    let mut registry = ValidatorRegistry::with_defaults();
    config.rules().disabled_validators.iter().for_each(|name| {
        registry.disable_validator_owned(name);
    });
    validate_project_with_registry(path, config, &registry)
}
427
/// A compiled entry from the config's exclude list, together with the hints
/// used when pruning directories and filtering files during the walk.
#[cfg(feature = "filesystem")]
struct ExcludePattern {
    /// Compiled glob. A configured pattern ending in `/` is compiled as `<prefix>/**`.
    pattern: glob::Pattern,
    /// `Some(prefix)` when the configured pattern ended with `/`, where `prefix`
    /// is the pattern without that trailing slash; `None` otherwise.
    dir_only_prefix: Option<String>,
    /// Whether directory pruning may also test a synthetic child path against
    /// `pattern`. Set for trailing-`/` patterns and for globs containing `**`.
    allow_probe: bool,
}
434
/// Express `entry_path` relative to `root` as a `/`-separated string.
///
/// When `entry_path` is not under `root`, the path is used as given. Backslashes
/// are converted to `/`, and a single leading `./` is removed.
fn normalize_rel_path(entry_path: &Path, root: &Path) -> String {
    let relative = match entry_path.strip_prefix(root) {
        Ok(inside_root) => inside_root,
        Err(_) => entry_path,
    };
    let unified = relative.to_string_lossy().replace('\\', "/");
    if let Some(rest) = unified.strip_prefix("./") {
        return rest.to_owned();
    }
    unified
}
443
/// Compile the configured exclude globs into [`ExcludePattern`]s.
///
/// Backslashes in a pattern are treated as `/`. A pattern ending in `/` is
/// directory-only: it is compiled as `<prefix>/**` and its prefix is recorded.
/// Directory-only patterns and patterns containing `**` are marked as eligible
/// for the probe-path check used when pruning directories.
///
/// Fails on the first pattern that does not compile, returning
/// `ConfigError::InvalidExcludePattern` with the pattern as originally written.
#[cfg(feature = "filesystem")]
fn compile_exclude_patterns(excludes: &[String]) -> LintResult<Vec<ExcludePattern>> {
    let mut compiled = Vec::with_capacity(excludes.len());

    for raw in excludes {
        let normalized = raw.replace('\\', "/");
        let dir_only_prefix = normalized.strip_suffix('/').map(str::to_owned);
        let glob_str = match &dir_only_prefix {
            Some(prefix) => format!("{}/**", prefix),
            None => normalized,
        };
        let allow_probe = dir_only_prefix.is_some() || glob_str.contains("**");

        let pattern = glob::Pattern::new(&glob_str).map_err(|e| {
            CoreError::Config(ConfigError::InvalidExcludePattern {
                pattern: raw.clone(),
                message: e.to_string(),
            })
        })?;

        compiled.push(ExcludePattern {
            pattern,
            dir_only_prefix,
            allow_probe,
        });
    }

    Ok(compiled)
}
470
/// Decide whether the walk should skip the directory at `rel_dir` entirely.
///
/// The root itself (empty relative path) is never pruned. A directory is
/// pruned when an exclude pattern matches its relative path, or when a
/// probe-enabled pattern matches a synthetic file path inside it.
#[cfg(feature = "filesystem")]
fn should_prune_dir(rel_dir: &str, exclude_patterns: &[ExcludePattern]) -> bool {
    if rel_dir.is_empty() {
        return false;
    }

    // Synthetic child path: lets patterns that target a directory's contents
    // (e.g. ** or a dir-only prefix) prune the directory itself. Only
    // patterns flagged `allow_probe` are tested against it.
    let probe = format!("{}/__agnix_probe__", rel_dir.trim_end_matches('/'));

    for candidate in exclude_patterns {
        if candidate.pattern.matches(rel_dir) {
            return true;
        }
        if candidate.allow_probe && candidate.pattern.matches(&probe) {
            return true;
        }
    }
    false
}
483
/// Report whether the file at relative path `path_str` is excluded.
///
/// A file is excluded when some pattern matches its path, except that a
/// directory-only pattern never excludes a file whose path is exactly that
/// pattern's prefix.
#[cfg(feature = "filesystem")]
fn is_excluded_file(path_str: &str, exclude_patterns: &[ExcludePattern]) -> bool {
    exclude_patterns.iter().any(|candidate| {
        let is_dir_prefix_itself = candidate.dir_only_prefix.as_deref() == Some(path_str);
        !is_dir_prefix_itself && candidate.pattern.matches(path_str)
    })
}
490
/// Run only project-level validation checks without per-file validation.
///
/// This is a lightweight alternative to [`validate_project`] that only runs
/// cross-file analysis rules (AGM-006, XP-004/005/006, VER-001). It does
/// not validate individual file contents.
///
/// Designed for the LSP server to provide project-level diagnostics that
/// require workspace-wide analysis, without the overhead of full per-file
/// validation (which the LSP handles incrementally via `did_open`/`did_change`).
///
/// The walk collects `AGENTS.md` files and instruction files, skipping
/// anything matched by the config's exclude patterns, and passes the sorted
/// path lists to the project-level checks.
///
/// # Errors
///
/// - Propagates any error from resolving the validation root.
/// - Returns a config error when an exclude pattern is not a valid glob.
/// - Returns `ValidationError::TooManyFiles` when the walk encounters more
///   files than `max_files_to_validate` allows. Every file yielded by the walk
///   counts towards the limit, including files that are later skipped by an
///   exclude pattern; `count` is the number of files seen when the limit was hit.
#[cfg(feature = "filesystem")]
pub fn validate_project_rules(root: &Path, config: &LintConfig) -> LintResult<Vec<Diagnostic>> {
    use ignore::WalkBuilder;
    use std::sync::Arc;

    let root_dir = resolve_validation_root(root)?;
    let mut config = config.clone();
    config.set_root_dir(root_dir.clone());

    // Pre-compile exclude patterns once (Arc for filter_entry 'static bound)
    let exclude_patterns = Arc::new(compile_exclude_patterns(config.exclude())?);

    // Fall back to the path as given when it cannot be canonicalized.
    let walk_root = std::fs::canonicalize(root).unwrap_or_else(|_| root.to_path_buf());
    let root_path = root_dir.clone();

    let mut agents_md_paths: Vec<PathBuf> = Vec::new();
    let mut instruction_file_paths: Vec<PathBuf> = Vec::new();
    let max_files = config.max_files_to_validate();

    // Walk directory tree collecting only paths relevant to project-level checks.
    // No per-file validation is performed -- this walk is lightweight.
    // Respects the same max_files_to_validate limit as validate_project_with_registry
    // to prevent unbounded directory traversal in large workspaces.
    for (files_seen, entry) in WalkBuilder::new(&walk_root)
        .hidden(false)
        .git_ignore(true)
        .git_exclude(false)
        .filter_entry({
            let exclude_patterns = Arc::clone(&exclude_patterns);
            let root_path = root_path.clone();
            move |entry| {
                let entry_path = entry.path();
                // Always descend into the root itself.
                if entry_path == root_path {
                    return true;
                }
                // Excluded directories are pruned so their contents are never walked.
                if entry.file_type().is_some_and(|ft| ft.is_dir()) {
                    let rel_path = normalize_rel_path(entry_path, &root_path);
                    return !should_prune_dir(&rel_path, exclude_patterns.as_slice());
                }
                true
            }
        })
        .build()
        .filter_map(|entry| entry.ok())
        .filter(|entry| entry.path().is_file())
        .enumerate()
    {
        // Enforce file count limit to prevent unbounded traversal
        if let Some(limit) = max_files {
            if files_seen >= limit {
                return Err(CoreError::Validation(ValidationError::TooManyFiles {
                    // `files_seen` is a zero-based index, so the number of files
                    // encountered (including this one) is one more. Reporting the
                    // raw index would always yield `count == limit`.
                    count: files_seen + 1,
                    limit,
                }));
            }
        }
        let file_path = entry.path().to_path_buf();

        let path_str = normalize_rel_path(&file_path, &root_path);
        if is_excluded_file(&path_str, exclude_patterns.as_slice()) {
            continue;
        }

        // Collect AGENTS.md paths for AGM-006 check
        if file_path.file_name().and_then(|n| n.to_str()) == Some("AGENTS.md") {
            agents_md_paths.push(file_path.clone());
        }

        // Collect instruction file paths for XP-004/005/006 checks
        if schemas::cross_platform::is_instruction_file(&file_path) {
            instruction_file_paths.push(file_path);
        }
    }

    // Sort for deterministic ordering
    agents_md_paths.sort();
    instruction_file_paths.sort();

    Ok(run_project_level_checks(
        &agents_md_paths,
        &instruction_file_paths,
        &config,
        &root_dir,
    ))
}
585
586/// Main entry point for validating a project with a custom validator registry
587#[cfg(feature = "filesystem")]
588pub fn validate_project_with_registry(
589    path: &Path,
590    config: &LintConfig,
591    registry: &ValidatorRegistry,
592) -> LintResult<ValidationResult> {
593    use ignore::WalkBuilder;
594    use std::sync::Arc;
595    use std::time::Instant;
596
597    let validation_start = Instant::now();
598
599    let root_dir = resolve_validation_root(path)?;
600    let mut config = config.clone();
601    config.set_root_dir(root_dir.clone());
602
603    // Initialize shared import cache for project-level validation (if not already set).
604    // This cache is shared across all file validations, allowing the ImportsValidator
605    // to avoid redundant parsing when traversing import chains that reference the same files.
606    if config.get_import_cache().is_none() {
607        let import_cache: crate::parsers::ImportCache =
608            std::sync::Arc::new(std::sync::RwLock::new(HashMap::new()));
609        config.set_import_cache(import_cache);
610    }
611
612    // Pre-compile exclude patterns once (avoids N+1 pattern compilation)
613    let exclude_patterns = compile_exclude_patterns(config.exclude())?;
614    let exclude_patterns = Arc::new(exclude_patterns);
615
616    // Pre-compile files config patterns once for the parallel walk.
617    // Invalid patterns produce Warning diagnostics that are prepended to results.
618    let config_file = root_dir.join(".agnix.toml");
619    let (compiled_files_inner, config_diags) =
620        compile_files_config_with_diagnostics(config.files_config(), &config_file);
621    let compiled_files = Arc::new(compiled_files_inner);
622
623    let root_path = root_dir.clone();
624
625    // Fallback to relative path is safe: symlink checks and size limits still apply per-file
626    let walk_root = std::fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
627
628    // Shared atomic state for file-limit enforcement across parallel workers.
629    // These must remain atomic (not fold/reduce) because the limit check must
630    // be visible immediately to all threads to stop work promptly.
631    let files_checked = Arc::new(AtomicUsize::new(0));
632    let limit_exceeded = Arc::new(AtomicBool::new(false));
633
634    // Get the file limit from config (None means no limit)
635    let max_files = config.max_files_to_validate();
636
637    // Stream file walk directly into parallel validation (no intermediate Vec)
638    // Note: hidden(false) includes .github, .codex, .claude, .cursor directories
639    // Note: git_exclude(false) prevents .git/info/exclude from hiding config dirs
640    //       that users may locally exclude (e.g. .codex/) but still need linting.
641    //       Trade-off: this may surface files the user intentionally excluded locally,
642    //       but security is still enforced via symlink rejection (file_utils::safe_read)
643    //       and file size limits, so the exposure is limited to lint noise, not unsafe I/O.
644    //
645    // Uses fold/reduce instead of Mutex-protected Vecs to accumulate paths and
646    // diagnostics thread-locally, eliminating lock contention in the hot loop.
647    let (mut diagnostics, mut agents_md_paths, mut instruction_file_paths) =
648        WalkBuilder::new(&walk_root)
649            .hidden(false)
650            .git_ignore(true)
651            .git_exclude(false)
652            .filter_entry({
653                let exclude_patterns = Arc::clone(&exclude_patterns);
654                let root_path = root_path.clone();
655                move |entry| {
656                    let entry_path = entry.path();
657                    if entry_path == root_path {
658                        return true;
659                    }
660                    if entry.file_type().is_some_and(|ft| ft.is_dir()) {
661                        let rel_path = normalize_rel_path(entry_path, &root_path);
662                        return !should_prune_dir(&rel_path, exclude_patterns.as_slice());
663                    }
664                    true
665                }
666            })
667            .build()
668            .filter_map(|entry| entry.ok())
669            .filter(|entry| entry.path().is_file())
670            .filter(|entry| {
671                let entry_path = entry.path();
672                let path_str = normalize_rel_path(entry_path, &root_path);
673                !is_excluded_file(&path_str, exclude_patterns.as_slice())
674            })
675            .map(|entry| entry.path().to_path_buf())
676            .par_bridge()
677            .fold(
678                || {
679                    (
680                        Vec::<Diagnostic>::new(),
681                        Vec::<PathBuf>::new(),
682                        Vec::<PathBuf>::new(),
683                    )
684                },
685                |(mut diags, mut agents, mut instructions), file_path| {
686                    // Security: Check if file limit has been exceeded
687                    // Once exceeded, skip processing additional files
688                    // Use SeqCst ordering for consistency with store operations
689                    if limit_exceeded.load(Ordering::SeqCst) {
690                        return (diags, agents, instructions);
691                    }
692
693                    // Count recognized files (resolve_with_compiled is string-only, no I/O)
694                    let file_type =
695                        resolve_with_compiled(&file_path, Some(&root_path), &compiled_files);
696                    if file_type != FileType::Unknown {
697                        let count = files_checked.fetch_add(1, Ordering::SeqCst) + 1;
698                        // Security: Enforce file count limit to prevent DoS
699                        if let Some(limit) = max_files {
700                            if count > limit {
701                                limit_exceeded.store(true, Ordering::SeqCst);
702                                return (diags, agents, instructions);
703                            }
704                        }
705                    }
706
707                    // Collect AGENTS.md paths for AGM-006 check (thread-local, no lock).
708                    if file_path.file_name().and_then(|n| n.to_str()) == Some("AGENTS.md") {
709                        agents.push(file_path.clone());
710                    }
711
712                    // Collect instruction file paths for XP-004/005/006 checks (thread-local, no lock).
713                    if schemas::cross_platform::is_instruction_file(&file_path) {
714                        instructions.push(file_path.clone());
715                    }
716
717                    // Validate the file using the pre-resolved file_type to avoid
718                    // re-compiling [files] glob patterns for every file.
719                    match validate_file_with_type(&file_path, file_type, &config, registry) {
720                        Ok(ValidationOutcome::Success(file_diagnostics)) => {
721                            diags.extend(file_diagnostics);
722                        }
723                        Ok(ValidationOutcome::IoError(file_error)) => {
724                            diags.push(
725                                Diagnostic::error(
726                                    file_path,
727                                    0,
728                                    0,
729                                    "file::read",
730                                    t!("rules.file_read_error", error = file_error.to_string()),
731                                )
732                                .with_suggestion(t!("rules.file_read_error_suggestion")),
733                            );
734                        }
735                        Ok(ValidationOutcome::Skipped) => {
736                            // File type unknown - no validation needed
737                        }
738                        Err(e) => {
739                            diags.push(
740                                Diagnostic::error(
741                                    file_path,
742                                    0,
743                                    0,
744                                    "file::read",
745                                    t!("rules.file_read_error", error = e.to_string()),
746                                )
747                                .with_suggestion(t!("rules.file_read_error_suggestion")),
748                            );
749                        }
750                    }
751
752                    (diags, agents, instructions)
753                },
754            )
755            .reduce(
756                || (Vec::new(), Vec::new(), Vec::new()),
757                |(mut d1, mut a1, mut i1), (d2, a2, i2)| {
758                    d1.extend(d2);
759                    a1.extend(a2);
760                    i1.extend(i2);
761                    (d1, a1, i1)
762                },
763            );
764
765    // Surface config-level diagnostics (e.g. invalid glob patterns in [files])
766    // before the TooManyFiles check so they are included on successful validation.
767    diagnostics.extend(config_diags);
768
769    // Check if limit was exceeded and return error
770    if limit_exceeded.load(Ordering::Relaxed) {
771        if let Some(limit) = max_files {
772            return Err(CoreError::Validation(ValidationError::TooManyFiles {
773                count: files_checked.load(Ordering::Relaxed),
774                limit,
775            }));
776        }
777    }
778
779    // Run project-level checks (AGM-006, XP-004/005/006, VER-001)
780    {
781        agents_md_paths.sort();
782        instruction_file_paths.sort();
783
784        diagnostics.extend(run_project_level_checks(
785            &agents_md_paths,
786            &instruction_file_paths,
787            &config,
788            &root_dir,
789        ));
790    }
791
792    // Sort by severity (errors first), then by file path, then by line/rule for full determinism
793    diagnostics.sort_by(|a, b| {
794        a.level
795            .cmp(&b.level)
796            .then_with(|| a.file.cmp(&b.file))
797            .then_with(|| a.line.cmp(&b.line))
798            .then_with(|| a.rule.cmp(&b.rule))
799    });
800
801    // Extract final count from atomic counter
802    let files_checked = files_checked.load(Ordering::Relaxed);
803
804    // as_millis() returns u128; clamp to u64 for the public API contract.
805    let elapsed_ms = validation_start.elapsed().as_millis().min(u64::MAX as u128) as u64;
806    let validator_factories_registered = registry.total_validator_count();
807
808    Ok(ValidationResult::new(diagnostics, files_checked)
809        .with_timing(elapsed_ms)
810        .with_validator_factories_registered(validator_factories_registered))
811}
812
/// Resolves the directory used as the root of a validation run.
///
/// When `path` refers to a regular file, its parent directory is used;
/// otherwise `path` itself is used. The chosen directory is canonicalized
/// when possible; if canonicalization fails, the un-canonicalized candidate
/// is returned as-is.
///
/// # Errors
///
/// Returns [`ValidationError::RootNotFound`] (wrapped in
/// [`CoreError::Validation`]) when the metadata of `path` cannot be read.
#[cfg(feature = "filesystem")]
fn resolve_validation_root(path: &Path) -> LintResult<PathBuf> {
    // Any I/O failure (not found, permission denied, etc.) is treated
    // uniformly as RootNotFound. For a local linter running as the
    // invoking user, the distinction is not actionable at this level.
    let metadata = path.metadata().map_err(|_| {
        CoreError::Validation(ValidationError::RootNotFound {
            path: path.to_path_buf(),
        })
    })?;

    let candidate = if metadata.is_file() {
        // `Path::parent` yields `Some("")` (not `None`) for a bare relative
        // file name such as `CLAUDE.md`. An empty path cannot be
        // canonicalized and would otherwise be returned verbatim as the
        // root, so it is mapped to the current directory like the `None` case.
        path.parent()
            .filter(|parent| !parent.as_os_str().is_empty())
            .unwrap_or(Path::new("."))
    } else {
        path
    };

    Ok(std::fs::canonicalize(candidate).unwrap_or_else(|_| candidate.to_path_buf()))
}
833
834#[cfg(test)]
835mod validate_content_tests {
836    use super::*;
837    use crate::config::LintConfig;
838    use crate::registry::ValidatorRegistry;
839
840    #[test]
841    fn returns_diagnostics_for_known_file_type() {
842        let config = LintConfig::default();
843        let registry = ValidatorRegistry::with_defaults();
844        let path = Path::new("CLAUDE.md");
845        let content = "<unclosed>";
846        let diags = validate_content(path, content, &config, &registry);
847        assert!(
848            !diags.is_empty(),
849            "Should find diagnostics for unclosed XML tag"
850        );
851    }
852
853    #[test]
854    fn returns_empty_for_unknown_file_type() {
855        let config = LintConfig::default();
856        let registry = ValidatorRegistry::with_defaults();
857        let path = Path::new("main.rs");
858        let diags = validate_content(path, "", &config, &registry);
859        assert!(
860            diags.is_empty(),
861            "Unknown file type should produce no diagnostics"
862        );
863    }
864
865    #[test]
866    fn returns_empty_for_empty_content_with_known_type() {
867        let config = LintConfig::default();
868        let registry = ValidatorRegistry::with_defaults();
869        let path = Path::new("CLAUDE.md");
870        let diags = validate_content(path, "", &config, &registry);
871        // Empty CLAUDE.md is valid (no content to violate rules).
872        assert!(
873            diags.is_empty(),
874            "Empty content for a known file type should not produce diagnostics"
875        );
876    }
877
878    #[test]
879    fn respects_tool_filter() {
880        let config = LintConfig::builder()
881            .tools(vec!["cursor".to_string()])
882            .build()
883            .unwrap();
884        let registry = ValidatorRegistry::with_defaults();
885        let path = Path::new("CLAUDE.md");
886        let content = "# Project\n\nSome instructions.";
887        // Should not panic with tool filter
888        let _ = validate_content(path, content, &config, &registry);
889    }
890
891    #[test]
892    fn crlf_content_produces_same_diagnostics_as_lf() {
893        let config = LintConfig::default();
894        let registry = ValidatorRegistry::with_defaults();
895        let path = Path::new("skill.md");
896
897        let lf_content =
898            "---\nname: test-skill\ndescription: A test\n---\n\n# Instructions\n\n<unclosed>\n";
899        let crlf_content = "---\r\nname: test-skill\r\ndescription: A test\r\n---\r\n\r\n# Instructions\r\n\r\n<unclosed>\r\n";
900
901        let lf_diags = validate_content(path, lf_content, &config, &registry);
902        let crlf_diags = validate_content(path, crlf_content, &config, &registry);
903
904        assert_eq!(
905            lf_diags.len(),
906            crlf_diags.len(),
907            "CRLF and LF content should produce the same number of diagnostics.\nLF: {:?}\nCRLF: {:?}",
908            lf_diags
909                .iter()
910                .map(|d| (&d.rule, d.line, d.column))
911                .collect::<Vec<_>>(),
912            crlf_diags
913                .iter()
914                .map(|d| (&d.rule, d.line, d.column))
915                .collect::<Vec<_>>(),
916        );
917
918        for (lf_d, crlf_d) in lf_diags.iter().zip(crlf_diags.iter()) {
919            assert_eq!(
920                lf_d.rule, crlf_d.rule,
921                "Same rules should fire for LF and CRLF content"
922            );
923            assert_eq!(
924                lf_d.line, crlf_d.line,
925                "Line numbers should match between LF and CRLF for rule {}",
926                lf_d.rule
927            );
928            assert_eq!(
929                lf_d.column, crlf_d.column,
930                "Column numbers should match between LF and CRLF for rule {}",
931                lf_d.rule
932            );
933        }
934    }
935
936    #[test]
937    fn lf_validation_is_stable() {
938        let config = LintConfig::default();
939        let registry = ValidatorRegistry::with_defaults();
940        let path = Path::new("CLAUDE.md");
941
942        // Already-normalized content should produce the same result on repeated calls.
943        let content = "# Project\n\nInstructions here.\n";
944        let diags1 = validate_content(path, content, &config, &registry);
945        let diags2 = validate_content(path, content, &config, &registry);
946
947        assert_eq!(
948            diags1.len(),
949            diags2.len(),
950            "Repeated validation of LF content should be stable"
951        );
952    }
953
954    #[test]
955    fn crlf_validation_is_idempotent() {
956        let config = LintConfig::default();
957        let registry = ValidatorRegistry::with_defaults();
958        let path = Path::new("skill.md");
959
960        // Validating CRLF content twice should produce identical diagnostics each time.
961        let crlf_content =
962            "---\r\nname: test-skill\r\ndescription: A test\r\n---\r\n\r\n# Instructions\r\n";
963        let diags1 = validate_content(path, crlf_content, &config, &registry);
964        let diags2 = validate_content(path, crlf_content, &config, &registry);
965
966        assert_eq!(
967            diags1.len(),
968            diags2.len(),
969            "Repeated validation of CRLF content should be stable"
970        );
971        for (d1, d2) in diags1.iter().zip(diags2.iter()) {
972            assert_eq!(d1.rule, d2.rule);
973            assert_eq!(d1.line, d2.line);
974            assert_eq!(d1.column, d2.column);
975        }
976    }
977
978    #[test]
979    fn lone_cr_content_produces_same_diagnostics_as_lf() {
980        // Lone CR (\r without following \n) is the old Mac line ending format.
981        // normalize_line_endings handles it in its single-pass char iterator,
982        // which converts any bare \r (not followed by \n) to \n.
983        let config = LintConfig::default();
984        let registry = ValidatorRegistry::with_defaults();
985        let path = Path::new("skill.md");
986
987        let lf_content = "---\nname: test-skill\ndescription: A test\n---\n\n# Instructions\n";
988        // Same content with lone CR instead of LF
989        let cr_content = "---\rname: test-skill\rdescription: A test\r---\r\r# Instructions\r";
990
991        let lf_diags = validate_content(path, lf_content, &config, &registry);
992        let cr_diags = validate_content(path, cr_content, &config, &registry);
993
994        assert_eq!(
995            lf_diags.len(),
996            cr_diags.len(),
997            "Lone-CR and LF content should produce the same number of diagnostics.\nLF: {:?}\nCR: {:?}",
998            lf_diags
999                .iter()
1000                .map(|d| (&d.rule, d.line, d.column))
1001                .collect::<Vec<_>>(),
1002            cr_diags
1003                .iter()
1004                .map(|d| (&d.rule, d.line, d.column))
1005                .collect::<Vec<_>>(),
1006        );
1007        for (lf_d, cr_d) in lf_diags.iter().zip(cr_diags.iter()) {
1008            assert_eq!(lf_d.rule, cr_d.rule);
1009            assert_eq!(lf_d.line, cr_d.line);
1010            assert_eq!(lf_d.column, cr_d.column);
1011        }
1012    }
1013}
1014
1015#[cfg(all(test, feature = "filesystem"))]
1016mod tests {
1017    use super::*;
1018
1019    #[test]
1020    fn test_should_prune_dir_with_globbed_patterns() {
1021        let patterns =
1022            compile_exclude_patterns(&vec!["target/**".to_string(), "**/target/**".to_string()])
1023                .unwrap();
1024        assert!(
1025            should_prune_dir("target", &patterns),
1026            "Expected target/** to prune target directory"
1027        );
1028        assert!(
1029            should_prune_dir("sub/target", &patterns),
1030            "Expected **/target/** to prune nested target directory"
1031        );
1032    }
1033
1034    #[test]
1035    fn test_should_prune_dir_for_bare_pattern() {
1036        let patterns = compile_exclude_patterns(&vec!["target".to_string()]).unwrap();
1037        assert!(
1038            should_prune_dir("target", &patterns),
1039            "Bare pattern should prune directory"
1040        );
1041        assert!(
1042            !should_prune_dir("sub/target", &patterns),
1043            "Bare pattern should not prune nested directories"
1044        );
1045    }
1046
1047    #[test]
1048    fn test_should_prune_dir_for_trailing_slash_pattern() {
1049        let patterns = compile_exclude_patterns(&vec!["target/".to_string()]).unwrap();
1050        assert!(
1051            should_prune_dir("target", &patterns),
1052            "Trailing slash pattern should prune directory"
1053        );
1054    }
1055
1056    #[test]
1057    fn test_should_not_prune_root_dir() {
1058        let patterns = compile_exclude_patterns(&vec!["target/**".to_string()]).unwrap();
1059        assert!(
1060            !should_prune_dir("", &patterns),
1061            "Root directory should never be pruned"
1062        );
1063    }
1064
1065    #[test]
1066    fn test_should_not_prune_dir_for_single_level_glob() {
1067        let patterns = compile_exclude_patterns(&vec!["target/*".to_string()]).unwrap();
1068        assert!(
1069            !should_prune_dir("target", &patterns),
1070            "Single-level glob should not prune directory"
1071        );
1072    }
1073
1074    #[test]
1075    fn test_dir_only_pattern_does_not_exclude_file_named_dir() {
1076        let patterns = compile_exclude_patterns(&vec!["target/".to_string()]).unwrap();
1077        assert!(
1078            !is_excluded_file("target", &patterns),
1079            "Directory-only pattern should not exclude a file named target"
1080        );
1081    }
1082
1083    #[test]
1084    fn test_dir_only_pattern_excludes_files_under_dir() {
1085        let patterns = compile_exclude_patterns(&vec!["target/".to_string()]).unwrap();
1086        assert!(
1087            is_excluded_file("target/file.txt", &patterns),
1088            "Directory-only pattern should exclude files under target/"
1089        );
1090    }
1091
1092    #[test]
1093    fn test_compile_exclude_patterns_invalid_pattern_returns_error() {
1094        let result = compile_exclude_patterns(&vec!["[".to_string()]);
1095        assert!(matches!(
1096            result,
1097            Err(CoreError::Config(ConfigError::InvalidExcludePattern { .. }))
1098        ));
1099    }
1100
1101    // ===== compile_patterns_with_diagnostics tests =====
1102
1103    #[test]
1104    fn compile_patterns_with_diagnostics_all_valid() {
1105        let patterns = vec!["*.md".to_string(), "src/**/*.rs".to_string()];
1106        let config_file = Path::new(".agnix.toml");
1107        let (compiled, diags) = compile_patterns_with_diagnostics(&patterns, config_file);
1108        assert_eq!(compiled.len(), 2, "All valid patterns should compile");
1109        assert!(
1110            diags.is_empty(),
1111            "No diagnostics expected for valid patterns, got: {diags:?}"
1112        );
1113    }
1114
1115    #[test]
1116    fn compile_patterns_with_diagnostics_invalid_pattern() {
1117        let patterns = vec!["[invalid".to_string()];
1118        let config_file = Path::new(".agnix.toml");
1119        let (compiled, diags) = compile_patterns_with_diagnostics(&patterns, config_file);
1120        assert!(
1121            compiled.is_empty(),
1122            "Invalid pattern should not produce a compiled pattern"
1123        );
1124        assert_eq!(
1125            diags.len(),
1126            1,
1127            "Expected exactly one diagnostic for the invalid pattern"
1128        );
1129        assert_eq!(
1130            diags[0].level,
1131            crate::DiagnosticLevel::Warning,
1132            "Invalid glob diagnostic should be Warning level"
1133        );
1134        assert_eq!(
1135            diags[0].rule, "config::glob",
1136            "Invalid glob diagnostic should use rule config::glob"
1137        );
1138        assert!(
1139            diags[0].suggestion.is_some(),
1140            "Diagnostic should include a suggestion"
1141        );
1142        assert!(
1143            diags[0].message.contains("[invalid"),
1144            "diagnostic message should include the pattern"
1145        );
1146    }
1147
1148    #[test]
1149    fn compile_patterns_with_diagnostics_mixed_valid_and_invalid() {
1150        let patterns = vec![
1151            "*.md".to_string(),
1152            "[bad".to_string(),
1153            "src/**/*.rs".to_string(),
1154            "[also-bad".to_string(),
1155        ];
1156        let config_file = Path::new(".agnix.toml");
1157        let (compiled, diags) = compile_patterns_with_diagnostics(&patterns, config_file);
1158        assert_eq!(
1159            compiled.len(),
1160            2,
1161            "Only valid patterns should compile, got {}",
1162            compiled.len()
1163        );
1164        assert_eq!(
1165            diags.len(),
1166            2,
1167            "Expected 2 diagnostics for 2 invalid patterns, got {}",
1168            diags.len()
1169        );
1170        for d in &diags {
1171            assert_eq!(d.rule, "config::glob");
1172            assert_eq!(d.level, crate::DiagnosticLevel::Warning);
1173        }
1174    }
1175
1176    #[test]
1177    fn compile_patterns_with_diagnostics_empty_input() {
1178        let patterns: Vec<String> = vec![];
1179        let config_file = Path::new(".agnix.toml");
1180        let (compiled, diags) = compile_patterns_with_diagnostics(&patterns, config_file);
1181        assert!(compiled.is_empty());
1182        assert!(diags.is_empty());
1183    }
1184
1185    #[test]
1186    fn compile_files_config_with_diagnostics_aggregates_all_lists() {
1187        use crate::config::FilesConfig;
1188
1189        let files = FilesConfig {
1190            include_as_memory: vec!["*.md".to_string(), "[bad-memory".to_string()],
1191            include_as_generic: vec!["[bad-generic".to_string()],
1192            exclude: vec!["valid/**".to_string(), "[bad-exclude".to_string()],
1193        };
1194        let config_file = Path::new(".agnix.toml");
1195        let (compiled, diags) = compile_files_config_with_diagnostics(&files, config_file);
1196        // Valid patterns: *.md (memory), valid/** (exclude) = 2 compiled total
1197        assert_eq!(compiled.include_as_memory.len(), 1);
1198        assert_eq!(compiled.include_as_generic.len(), 0);
1199        assert_eq!(compiled.exclude.len(), 1);
1200        // Invalid patterns: [bad-memory, [bad-generic, [bad-exclude = 3 diagnostics
1201        assert_eq!(
1202            diags.len(),
1203            3,
1204            "Expected 3 diagnostics from all 3 pattern lists, got: {diags:?}"
1205        );
1206        for d in &diags {
1207            assert_eq!(d.rule, "config::glob");
1208        }
1209    }
1210
1211    #[test]
1212    fn crlf_file_on_disk_produces_same_diagnostics_as_lf() {
1213        // validate_file() reads from disk and normalizes CRLF in validate_file_with_type.
1214        // Verify the on-disk path produces the same diagnostics as the in-memory path.
1215        use crate::diagnostics::ValidationOutcome;
1216
1217        let temp = tempfile::TempDir::new().unwrap();
1218        let lf_path = temp.path().join("skill_lf.md");
1219        let crlf_path = temp.path().join("skill_crlf.md");
1220
1221        let lf_content =
1222            "---\nname: test-skill\ndescription: A test\n---\n\n# Instructions\n\n<unclosed>\n";
1223        let crlf_content = "---\r\nname: test-skill\r\ndescription: A test\r\n---\r\n\r\n# Instructions\r\n\r\n<unclosed>\r\n";
1224
1225        std::fs::write(&lf_path, lf_content).unwrap();
1226        std::fs::write(&crlf_path, crlf_content).unwrap();
1227
1228        let config = LintConfig::default();
1229
1230        let lf_outcome = validate_file(&lf_path, &config).unwrap();
1231        let crlf_outcome = validate_file(&crlf_path, &config).unwrap();
1232
1233        let lf_diags = match lf_outcome {
1234            ValidationOutcome::Success(d) => d,
1235            other => panic!("Expected Success, got {other:?}"),
1236        };
1237        let crlf_diags = match crlf_outcome {
1238            ValidationOutcome::Success(d) => d,
1239            other => panic!("Expected Success, got {other:?}"),
1240        };
1241
1242        assert_eq!(
1243            lf_diags.len(),
1244            crlf_diags.len(),
1245            "On-disk CRLF file should produce same diagnostic count as LF file.\nLF: {:?}\nCRLF: {:?}",
1246            lf_diags
1247                .iter()
1248                .map(|d| (&d.rule, d.line, d.column))
1249                .collect::<Vec<_>>(),
1250            crlf_diags
1251                .iter()
1252                .map(|d| (&d.rule, d.line, d.column))
1253                .collect::<Vec<_>>(),
1254        );
1255        for (lf_d, crlf_d) in lf_diags.iter().zip(crlf_diags.iter()) {
1256            assert_eq!(lf_d.rule, crlf_d.rule, "Same rules should fire");
1257            assert_eq!(
1258                lf_d.line, crlf_d.line,
1259                "Line numbers should match for rule {}",
1260                lf_d.rule
1261            );
1262            assert_eq!(
1263                lf_d.column, crlf_d.column,
1264                "Column numbers should match for rule {}",
1265                lf_d.rule
1266            );
1267        }
1268    }
1269}
agnix_core/pipeline.rs

agnix_core/
pipeline.rs