// lintel_validate/validate.rs

1use alloc::collections::BTreeMap;
2use std::collections::HashMap;
3use std::fs;
4use std::path::{Path, PathBuf};
5
6use anyhow::{Context, Result};
7use glob::glob;
8use serde_json::Value;
9
10use crate::catalog;
11use lintel_schema_cache::{CacheStatus, SchemaCache};
12use lintel_validation_cache::{ValidationCacheStatus, ValidationError};
13use schema_catalog::CompiledCatalog;
14
15use crate::diagnostics::{DEFAULT_LABEL, find_instance_path_span, format_label};
16use crate::discover;
17use crate::parsers::{self, Parser};
18use crate::registry;
19
/// Conservative limit for concurrent file reads to avoid exhausting file
/// descriptors. 128 is well below the default soft limit on macOS (256) and
/// Linux (1024) while still providing good throughput.
///
/// Used as the permit count for the tokio semaphore in `parse_and_group_files`.
const FD_CONCURRENCY_LIMIT: usize = 128;
24
/// Options for a validation run; typically populated from CLI flags.
pub struct ValidateArgs {
    /// Glob patterns to find files (empty = auto-discover from the current directory)
    pub globs: Vec<String>,

    /// Exclude files matching these globs (repeatable)
    pub exclude: Vec<String>,

    /// Cache directory for remote schemas; `None` uses the cache's default location
    pub cache_dir: Option<String>,

    /// Bypass schema cache reads (still writes fetched schemas to cache)
    pub force_schema_fetch: bool,

    /// Bypass validation cache reads (still writes results to cache)
    pub force_validation: bool,

    /// Disable `SchemaStore` catalog matching
    pub no_catalog: bool,

    /// Directory to search for `lintel.toml` (defaults to cwd)
    pub config_dir: Option<PathBuf>,

    /// TTL for cached schemas. `None` means no expiry.
    pub schema_cache_ttl: Option<core::time::Duration>,
}
50
/// Re-exported from [`crate::diagnostics::LintError`] so callers can use
/// `lintel_validate::validate::LintError` without importing diagnostics.
pub use crate::diagnostics::LintError;
54
/// A file that was checked and the schema it resolved to.
pub struct CheckedFile {
    /// Display path of the checked file.
    pub path: String,
    /// The schema URI the file was validated against.
    pub schema: String,
    /// `None` for local schemas and builtins; `Some` for remote schemas.
    pub cache_status: Option<CacheStatus>,
    /// `None` when validation caching is not applicable; `Some` for validation cache hits/misses.
    pub validation_cache_status: Option<ValidationCacheStatus>,
}
64
/// Result of a validation run.
pub struct ValidateResult {
    /// All diagnostics produced (I/O, parse, fetch, compile, validation).
    pub errors: Vec<LintError>,
    /// Every file that was checked, with its resolved schema and cache statuses.
    pub checked: Vec<CheckedFile>,
}
70
71impl ValidateResult {
72    pub fn has_errors(&self) -> bool {
73        !self.errors.is_empty()
74    }
75
76    pub fn files_checked(&self) -> usize {
77        self.checked.len()
78    }
79}
80
81// ---------------------------------------------------------------------------
82// Internal types
83// ---------------------------------------------------------------------------
84
/// A file that has been parsed and matched to a schema URI.
struct ParsedFile {
    /// Display path of the file (from `Path::display`).
    path: String,
    /// Raw file text; kept for span lookups and validation-cache keys.
    content: String,
    /// The file's contents parsed into a JSON value.
    instance: Value,
    /// Original schema URI before rewrites (for override matching).
    original_schema_uri: String,
}
93
94// ---------------------------------------------------------------------------
95// Config loading
96// ---------------------------------------------------------------------------
97
98/// Locate `lintel.toml`, load the full config, and return the config directory.
99/// Returns `(config, config_dir, config_path)`.  When no config is found or
100/// cwd is unavailable the config is default and `config_path` is `None`.
101#[tracing::instrument(skip_all)]
102pub fn load_config(search_dir: Option<&Path>) -> (lintel_config::Config, PathBuf, Option<PathBuf>) {
103    let start_dir = match search_dir {
104        Some(d) => d.to_path_buf(),
105        None => match std::env::current_dir() {
106            Ok(d) => d,
107            Err(_) => return (lintel_config::Config::default(), PathBuf::from("."), None),
108        },
109    };
110
111    let Some(config_path) = lintel_config::find_config_path(&start_dir) else {
112        return (lintel_config::Config::default(), start_dir, None);
113    };
114
115    let dir = config_path.parent().unwrap_or(&start_dir).to_path_buf();
116    let cfg = lintel_config::find_and_load(&start_dir)
117        .ok()
118        .flatten()
119        .unwrap_or_default();
120    (cfg, dir, Some(config_path))
121}
122
123// ---------------------------------------------------------------------------
124// File collection
125// ---------------------------------------------------------------------------
126
127/// Collect input files from globs/directories, applying exclude filters.
128///
129/// # Errors
130///
131/// Returns an error if a glob pattern is invalid or a directory cannot be walked.
132#[tracing::instrument(skip_all, fields(glob_count = globs.len(), exclude_count = exclude.len()))]
133pub fn collect_files(globs: &[String], exclude: &[String]) -> Result<Vec<PathBuf>> {
134    if globs.is_empty() {
135        return discover::discover_files(".", exclude);
136    }
137
138    let mut result = Vec::new();
139    for pattern in globs {
140        let path = Path::new(pattern);
141        if path.is_dir() {
142            result.extend(discover::discover_files(pattern, exclude)?);
143        } else {
144            for entry in glob(pattern).with_context(|| format!("invalid glob: {pattern}"))? {
145                let path = entry?;
146                if path.is_file() && !is_excluded(&path, exclude) {
147                    result.push(path);
148                }
149            }
150        }
151    }
152    Ok(result)
153}
154
155fn is_excluded(path: &Path, excludes: &[String]) -> bool {
156    let path_str = match path.to_str() {
157        Some(s) => s.strip_prefix("./").unwrap_or(s),
158        None => return false,
159    };
160    excludes
161        .iter()
162        .any(|pattern| glob_match::glob_match(pattern, path_str))
163}
164
165// ---------------------------------------------------------------------------
166// Phase 1: Parse files and resolve schema URIs
167// ---------------------------------------------------------------------------
168
169/// Try parsing content with each known format, returning the first success.
170///
171/// JSONC is tried first (superset of JSON, handles comments), then YAML and
172/// TOML which cover the most common config formats, followed by the rest.
173pub fn try_parse_all(content: &str, file_name: &str) -> Option<(parsers::FileFormat, Value)> {
174    use parsers::FileFormat::{Json, Json5, Jsonc, Markdown, Toml, Yaml};
175    const FORMATS: [parsers::FileFormat; 6] = [Jsonc, Yaml, Toml, Json, Json5, Markdown];
176
177    for fmt in FORMATS {
178        let parser = parsers::parser_for(fmt);
179        if let Ok(val) = parser.parse(content, file_name) {
180            return Some((fmt, val));
181        }
182    }
183    None
184}
185
/// Result of processing a single file: either a parsed file with its schema URI,
/// a lint error, or nothing (file was skipped).
enum FileResult {
    /// File parsed successfully and resolved to a schema.
    Parsed {
        /// Fully-resolved schema URI (after rewrites and path resolution).
        schema_uri: String,
        parsed: ParsedFile,
    },
    /// Parsing failed in a recognized format; carries the diagnostic.
    Error(LintError),
    /// File is not a validation candidate (unparseable or no schema match).
    Skip,
}
196
/// Process a single file's already-read content: parse and resolve schema URI.
///
/// Returns [`FileResult::Skip`] for files that cannot be parsed or match no
/// schema, [`FileResult::Error`] for parse failures in a recognized format,
/// and [`FileResult::Parsed`] with the resolved schema URI otherwise.
#[allow(clippy::too_many_arguments)]
fn process_one_file(
    path: &Path,
    content: String,
    config: &lintel_config::Config,
    config_dir: &Path,
    compiled_catalogs: &[CompiledCatalog],
) -> FileResult {
    let path_str = path.display().to_string();
    // Bare file name for config/catalog matching; falls back to the full
    // display path when the name is not valid UTF-8.
    let file_name = path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or(&path_str);

    let detected_format = parsers::detect_format(path);

    // For unrecognized extensions, only proceed if a catalog or config mapping matches.
    if detected_format.is_none() {
        let has_match = config.find_schema_mapping(&path_str, file_name).is_some()
            || compiled_catalogs
                .iter()
                .any(|cat| cat.find_schema(&path_str, file_name).is_some());
        if !has_match {
            return FileResult::Skip;
        }
    }

    // Parse the file content. Known extensions use their dedicated parser and
    // surface parse errors; unknown extensions try all formats and skip on failure.
    let (parser, instance): (Box<dyn Parser>, Value) = if let Some(fmt) = detected_format {
        let parser = parsers::parser_for(fmt);
        match parser.parse(&content, &path_str) {
            Ok(val) => (parser, val),
            Err(parse_err) => return FileResult::Error(parse_err.into()),
        }
    } else {
        match try_parse_all(&content, &path_str) {
            Some((fmt, val)) => (parsers::parser_for(fmt), val),
            None => return FileResult::Skip,
        }
    };

    // Skip markdown files with no frontmatter
    if instance.is_null() {
        return FileResult::Skip;
    }

    // Schema resolution priority:
    // 1. Inline $schema / YAML modeline (always wins)
    // 2. Custom schema mappings from lintel.toml [schemas]
    // 3. Catalog matching (custom registries > Lintel catalog > SchemaStore)
    let schema_uri = parser
        .extract_schema_uri(&content, &instance)
        .or_else(|| {
            config
                .find_schema_mapping(&path_str, file_name)
                .map(str::to_string)
        })
        .or_else(|| {
            compiled_catalogs
                .iter()
                .find_map(|cat| cat.find_schema(&path_str, file_name))
                .map(str::to_string)
        });

    let Some(schema_uri) = schema_uri else {
        return FileResult::Skip;
    };

    // Keep original URI for override matching (before rewrites)
    let original_schema_uri = schema_uri.clone();

    // Apply rewrite rules, then resolve // paths relative to lintel.toml
    let schema_uri = lintel_config::apply_rewrites(&schema_uri, &config.rewrite);
    let schema_uri = lintel_config::resolve_double_slash(&schema_uri, config_dir);

    // Resolve relative local paths against the file's parent directory.
    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
    let schema_uri = if is_remote {
        schema_uri
    } else {
        path.parent()
            .map(|parent| parent.join(&schema_uri).to_string_lossy().to_string())
            .unwrap_or(schema_uri)
    };

    FileResult::Parsed {
        schema_uri,
        parsed: ParsedFile {
            path: path_str,
            content,
            instance,
            original_schema_uri,
        },
    }
}
293
294/// Read each file concurrently with tokio, parse its content, extract its
295/// schema URI, apply rewrites, and group by resolved schema URI.
296#[tracing::instrument(skip_all, fields(file_count = files.len()))]
297#[allow(clippy::too_many_arguments)]
298async fn parse_and_group_files(
299    files: &[PathBuf],
300    config: &lintel_config::Config,
301    config_dir: &Path,
302    compiled_catalogs: &[CompiledCatalog],
303    errors: &mut Vec<LintError>,
304) -> BTreeMap<String, Vec<ParsedFile>> {
305    // Read all files concurrently using tokio async I/O, with a semaphore
306    // to avoid exhausting file descriptors on large directories.
307    let semaphore = alloc::sync::Arc::new(tokio::sync::Semaphore::new(FD_CONCURRENCY_LIMIT));
308    let mut read_set = tokio::task::JoinSet::new();
309    for path in files {
310        let path = path.clone();
311        let sem = semaphore.clone();
312        read_set.spawn(async move {
313            let _permit = sem.acquire().await.expect("semaphore closed");
314            let result = tokio::fs::read_to_string(&path).await;
315            (path, result)
316        });
317    }
318
319    let mut file_contents = Vec::with_capacity(files.len());
320    while let Some(result) = read_set.join_next().await {
321        match result {
322            Ok(item) => file_contents.push(item),
323            Err(e) => tracing::warn!("file read task panicked: {e}"),
324        }
325    }
326
327    // Process files: parse content and resolve schema URIs.
328    let mut schema_groups: BTreeMap<String, Vec<ParsedFile>> = BTreeMap::new();
329    for (path, content_result) in file_contents {
330        let content = match content_result {
331            Ok(c) => c,
332            Err(e) => {
333                errors.push(LintError::Io {
334                    path: path.display().to_string(),
335                    message: format!("failed to read: {e}"),
336                });
337                continue;
338            }
339        };
340        let result = process_one_file(&path, content, config, config_dir, compiled_catalogs);
341        match result {
342            FileResult::Parsed { schema_uri, parsed } => {
343                schema_groups.entry(schema_uri).or_default().push(parsed);
344            }
345            FileResult::Error(e) => errors.push(e),
346            FileResult::Skip => {}
347        }
348    }
349
350    schema_groups
351}
352
353// ---------------------------------------------------------------------------
354// Phase 2: Schema fetching, compilation, and instance validation
355// ---------------------------------------------------------------------------
356
357/// Fetch a schema by URI, returning its parsed JSON and cache status.
358///
359/// For remote URIs, checks the prefetched map first; for local URIs, reads
360/// from disk (with in-memory caching to avoid redundant I/O for shared schemas).
361#[allow(clippy::too_many_arguments)]
362async fn fetch_schema_from_prefetched(
363    schema_uri: &str,
364    prefetched: &HashMap<String, Result<(Value, CacheStatus), String>>,
365    local_cache: &mut HashMap<String, Value>,
366    group: &[ParsedFile],
367    errors: &mut Vec<LintError>,
368    checked: &mut Vec<CheckedFile>,
369    on_check: &mut impl FnMut(&CheckedFile),
370) -> Option<(Value, Option<CacheStatus>)> {
371    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
372
373    let result: Result<(Value, Option<CacheStatus>), String> = if is_remote {
374        match prefetched.get(schema_uri) {
375            Some(Ok((v, status))) => Ok((v.clone(), Some(*status))),
376            Some(Err(e)) => Err(format!("failed to fetch schema: {schema_uri}: {e}")),
377            None => Err(format!("schema not prefetched: {schema_uri}")),
378        }
379    } else if let Some(cached) = local_cache.get(schema_uri) {
380        Ok((cached.clone(), None))
381    } else {
382        tokio::fs::read_to_string(schema_uri)
383            .await
384            .map_err(|e| format!("failed to read local schema {schema_uri}: {e}"))
385            .and_then(|content| {
386                serde_json::from_str::<Value>(&content)
387                    .map(|v| {
388                        local_cache.insert(schema_uri.to_string(), v.clone());
389                        (v, None)
390                    })
391                    .map_err(|e| format!("failed to parse local schema {schema_uri}: {e}"))
392            })
393    };
394
395    match result {
396        Ok(value) => Some(value),
397        Err(message) => {
398            report_group_error(
399                |path| LintError::SchemaFetch {
400                    path: path.to_string(),
401                    message: message.clone(),
402                },
403                schema_uri,
404                None,
405                group,
406                errors,
407                checked,
408                on_check,
409            );
410            None
411        }
412    }
413}
414
415/// Report the same error for every file in a schema group.
416#[allow(clippy::too_many_arguments)]
417fn report_group_error<P: alloc::borrow::Borrow<ParsedFile>>(
418    make_error: impl Fn(&str) -> LintError,
419    schema_uri: &str,
420    cache_status: Option<CacheStatus>,
421    group: &[P],
422    errors: &mut Vec<LintError>,
423    checked: &mut Vec<CheckedFile>,
424    on_check: &mut impl FnMut(&CheckedFile),
425) {
426    for item in group {
427        let pf = item.borrow();
428        let cf = CheckedFile {
429            path: pf.path.clone(),
430            schema: schema_uri.to_string(),
431            cache_status,
432            validation_cache_status: None,
433        };
434        on_check(&cf);
435        checked.push(cf);
436        errors.push(make_error(&pf.path));
437    }
438}
439
440/// Mark every file in a group as checked (no errors).
441#[allow(clippy::too_many_arguments)]
442fn mark_group_checked<P: alloc::borrow::Borrow<ParsedFile>>(
443    schema_uri: &str,
444    cache_status: Option<CacheStatus>,
445    validation_cache_status: Option<ValidationCacheStatus>,
446    group: &[P],
447    checked: &mut Vec<CheckedFile>,
448    on_check: &mut impl FnMut(&CheckedFile),
449) {
450    for item in group {
451        let pf = item.borrow();
452        let cf = CheckedFile {
453            path: pf.path.clone(),
454            schema: schema_uri.to_string(),
455            cache_status,
456            validation_cache_status,
457        };
458        on_check(&cf);
459        checked.push(cf);
460    }
461}
462
/// Clean up error messages from the `jsonschema` crate.
///
/// For `anyOf`/`oneOf` failures the crate dumps the entire JSON value into the
/// message (e.g. `{...} is not valid under any of the schemas listed in the 'oneOf' keyword`).
/// The source snippet already shows the value, so we strip the redundant prefix
/// and keep only `"not valid under any of the schemas listed in the 'oneOf' keyword"`.
///
/// All other messages are returned unchanged.
fn clean_error_message(msg: String) -> String {
    const MARKER: &str = " is not valid under any of the schemas listed in the '";
    // Skip the leading value plus " is " so the message starts at "not valid".
    // The marker and " is " are pure ASCII, so the byte-index slice below
    // cannot split a UTF-8 character.
    match msg.find(MARKER) {
        Some(pos) => msg[pos + " is ".len()..].to_string(),
        None => msg,
    }
}
479
480/// Convert [`ValidationError`]s into [`LintError::Validation`] diagnostics.
481fn push_validation_errors(
482    pf: &ParsedFile,
483    schema_url: &str,
484    validation_errors: &[ValidationError],
485    errors: &mut Vec<LintError>,
486) {
487    for ve in validation_errors {
488        let span = find_instance_path_span(&pf.content, &ve.instance_path);
489        let instance_path = if ve.instance_path.is_empty() {
490            DEFAULT_LABEL.to_string()
491        } else {
492            ve.instance_path.clone()
493        };
494        let label = format_label(&instance_path, &ve.schema_path);
495        let source_span: miette::SourceSpan = span.into();
496        errors.push(LintError::Validation {
497            src: miette::NamedSource::new(&pf.path, pf.content.clone()),
498            span: source_span,
499            schema_span: source_span,
500            path: pf.path.clone(),
501            instance_path,
502            label,
503            message: ve.message.clone(),
504            schema_url: schema_url.to_string(),
505            schema_path: ve.schema_path.clone(),
506        });
507    }
508}
509
/// Validate all files in a group against an already-compiled validator and store
/// results in the validation cache.
///
/// Every file here has already missed the validation cache, so each result is
/// written back to the cache and reported with [`ValidationCacheStatus::Miss`].
#[tracing::instrument(skip_all, fields(schema_uri, file_count = group.len()))]
#[allow(clippy::too_many_arguments)]
async fn validate_group<P: alloc::borrow::Borrow<ParsedFile>>(
    validator: &jsonschema::Validator,
    schema_uri: &str,
    schema_hash: &str,
    validate_formats: bool,
    cache_status: Option<CacheStatus>,
    group: &[P],
    vcache: &lintel_validation_cache::ValidationCache,
    errors: &mut Vec<LintError>,
    checked: &mut Vec<CheckedFile>,
    on_check: &mut impl FnMut(&CheckedFile),
) {
    for item in group {
        let pf = item.borrow();
        // Collect every validation error for this file, not just the first.
        let file_errors: Vec<ValidationError> = validator
            .iter_errors(&pf.instance)
            .map(|error| ValidationError {
                instance_path: error.instance_path().to_string(),
                message: clean_error_message(error.to_string()),
                schema_path: error.schema_path().to_string(),
            })
            .collect();

        // Store results (even an empty error list) so an identical
        // file/schema/format combination can skip validation next run.
        vcache
            .store(
                &lintel_validation_cache::CacheKey {
                    file_content: &pf.content,
                    schema_hash,
                    validate_formats,
                },
                &file_errors,
            )
            .await;
        push_validation_errors(pf, schema_uri, &file_errors, errors);

        let cf = CheckedFile {
            path: pf.path.clone(),
            schema: schema_uri.to_string(),
            cache_status,
            validation_cache_status: Some(ValidationCacheStatus::Miss),
        };
        on_check(&cf);
        checked.push(cf);
    }
}
559
560// ---------------------------------------------------------------------------
561// Public API
562// ---------------------------------------------------------------------------
563
/// Fetch and compile all schema catalogs (default, `SchemaStore`, and custom registries).
///
/// Returns a list of compiled catalogs, printing warnings for any that fail to fetch.
/// When `no_catalog` is set, returns an empty list without fetching anything.
pub async fn fetch_compiled_catalogs(
    retriever: &SchemaCache,
    config: &lintel_config::Config,
    no_catalog: bool,
) -> Vec<CompiledCatalog> {
    let mut compiled_catalogs = Vec::new();

    if !no_catalog {
        let catalog_span = tracing::info_span!("fetch_catalogs").entered();

        // Catalogs are fetched concurrently but sorted by priority so that
        // the Lintel catalog wins over custom registries, which win over
        // SchemaStore.  The `order` field encodes this precedence.
        #[allow(clippy::items_after_statements)]
        type CatalogResult = (
            usize, // priority (lower = higher precedence)
            String,
            Result<CompiledCatalog, Box<dyn core::error::Error + Send + Sync>>,
        );
        let mut catalog_tasks: tokio::task::JoinSet<CatalogResult> = tokio::task::JoinSet::new();

        // Custom registries from lintel.toml (highest precedence among catalogs)
        for (i, registry_url) in config.registries.iter().enumerate() {
            let r = retriever.clone();
            let url = registry_url.clone();
            let label = format!("registry {url}");
            catalog_tasks.spawn(async move {
                let result = registry::fetch(&r, &url)
                    .await
                    .map(|cat| CompiledCatalog::compile(&cat));
                (i, label, result)
            });
        }

        // Lintel catalog (skipped when the config opts out of the default)
        let lintel_order = config.registries.len();
        if !config.no_default_catalog {
            let r = retriever.clone();
            let label = format!("default catalog {}", registry::DEFAULT_REGISTRY);
            catalog_tasks.spawn(async move {
                let result = registry::fetch(&r, registry::DEFAULT_REGISTRY)
                    .await
                    .map(|cat| CompiledCatalog::compile(&cat));
                (lintel_order, label, result)
            });
        }

        // SchemaStore catalog (lowest precedence)
        let schemastore_order = config.registries.len() + 1;
        let r = retriever.clone();
        catalog_tasks.spawn(async move {
            let result = catalog::fetch_catalog(&r)
                .await
                .map(|cat| CompiledCatalog::compile(&cat));
            (schemastore_order, "SchemaStore catalog".to_string(), result)
        });

        // Failures are warnings, not errors: validation proceeds with
        // whatever catalogs did load.
        let mut results: Vec<(usize, CompiledCatalog)> = Vec::new();
        while let Some(result) = catalog_tasks.join_next().await {
            match result {
                Ok((order, _, Ok(compiled))) => results.push((order, compiled)),
                Ok((_, label, Err(e))) => eprintln!("warning: failed to fetch {label}: {e}"),
                Err(e) => eprintln!("warning: catalog fetch task failed: {e}"),
            }
        }
        // Tasks complete in arbitrary order; re-sort by precedence.
        results.sort_by_key(|(order, _)| *order);
        compiled_catalogs.extend(results.into_iter().map(|(_, cat)| cat));

        drop(catalog_span);
    }

    compiled_catalogs
}
640
/// Run a validation pass with a default schema cache and no progress callback.
/// See [`run_with`] for the streaming variant.
///
/// # Errors
///
/// Returns an error if file collection or schema validation encounters an I/O error.
pub async fn run(args: &ValidateArgs) -> Result<ValidateResult> {
    run_with(args, None, |_| {}).await
}
647
648/// Like [`run`], but calls `on_check` each time a file is checked, allowing
649/// callers to stream progress (e.g. verbose output) as files are processed.
650///
651/// # Errors
652///
653/// Returns an error if file collection or schema validation encounters an I/O error.
654#[tracing::instrument(skip_all, name = "validate")]
655#[allow(clippy::too_many_lines)]
656pub async fn run_with(
657    args: &ValidateArgs,
658    cache: Option<SchemaCache>,
659    mut on_check: impl FnMut(&CheckedFile),
660) -> Result<ValidateResult> {
661    let retriever = if let Some(c) = cache {
662        c
663    } else {
664        let mut builder = SchemaCache::builder().force_fetch(args.force_schema_fetch);
665        if let Some(dir) = &args.cache_dir {
666            let path = PathBuf::from(dir);
667            let _ = fs::create_dir_all(&path);
668            builder = builder.cache_dir(path);
669        }
670        if let Some(ttl) = args.schema_cache_ttl {
671            builder = builder.ttl(ttl);
672        }
673        builder.build()
674    };
675
676    let (config, config_dir, _config_path) = load_config(args.config_dir.as_deref());
677    let files = collect_files(&args.globs, &args.exclude)?;
678    tracing::info!(file_count = files.len(), "collected files");
679
680    let compiled_catalogs = fetch_compiled_catalogs(&retriever, &config, args.no_catalog).await;
681
682    let mut errors: Vec<LintError> = Vec::new();
683    let mut checked: Vec<CheckedFile> = Vec::new();
684
685    // Phase 1: Parse files and resolve schema URIs
686    let schema_groups = parse_and_group_files(
687        &files,
688        &config,
689        &config_dir,
690        &compiled_catalogs,
691        &mut errors,
692    )
693    .await;
694    tracing::info!(
695        schema_count = schema_groups.len(),
696        total_files = schema_groups.values().map(Vec::len).sum::<usize>(),
697        "grouped files by schema"
698    );
699
700    // Create validation cache
701    let vcache = lintel_validation_cache::ValidationCache::new(
702        lintel_validation_cache::ensure_cache_dir(),
703        args.force_validation,
704    );
705
706    // Prefetch all remote schemas in parallel
707    let remote_uris: Vec<&String> = schema_groups
708        .keys()
709        .filter(|uri| uri.starts_with("http://") || uri.starts_with("https://"))
710        .collect();
711
712    let prefetched = {
713        let _prefetch_span =
714            tracing::info_span!("prefetch_schemas", count = remote_uris.len()).entered();
715
716        let mut schema_tasks = tokio::task::JoinSet::new();
717        for uri in remote_uris {
718            let r = retriever.clone();
719            let u = uri.clone();
720            schema_tasks.spawn(async move {
721                let result = r.fetch(&u).await;
722                (u, result)
723            });
724        }
725
726        let mut prefetched: HashMap<String, Result<(Value, CacheStatus), String>> = HashMap::new();
727        while let Some(result) = schema_tasks.join_next().await {
728            match result {
729                Ok((uri, fetch_result)) => {
730                    prefetched.insert(uri, fetch_result.map_err(|e| e.to_string()));
731                }
732                Err(e) => eprintln!("warning: schema prefetch task failed: {e}"),
733            }
734        }
735
736        prefetched
737    };
738
739    // Phase 2: Compile each schema once and validate all matching files
740    let mut local_schema_cache: HashMap<String, Value> = HashMap::new();
741    let mut fetch_time = core::time::Duration::ZERO;
742    let mut hash_time = core::time::Duration::ZERO;
743    let mut vcache_time = core::time::Duration::ZERO;
744    let mut compile_time = core::time::Duration::ZERO;
745    let mut validate_time = core::time::Duration::ZERO;
746
747    for (schema_uri, group) in &schema_groups {
748        let _group_span = tracing::debug_span!(
749            "schema_group",
750            schema = schema_uri.as_str(),
751            files = group.len(),
752        )
753        .entered();
754
755        // If ANY file in the group matches a `validate_formats = false` override,
756        // disable format validation for the whole group (they share one compiled validator).
757        let validate_formats = group.iter().all(|pf| {
758            config
759                .should_validate_formats(&pf.path, &[&pf.original_schema_uri, schema_uri.as_str()])
760        });
761
762        // Remote schemas were prefetched in parallel above; local schemas are
763        // read from disk here (with in-memory caching).
764        let t = std::time::Instant::now();
765        let Some((schema_value, cache_status)) = fetch_schema_from_prefetched(
766            schema_uri,
767            &prefetched,
768            &mut local_schema_cache,
769            group,
770            &mut errors,
771            &mut checked,
772            &mut on_check,
773        )
774        .await
775        else {
776            fetch_time += t.elapsed();
777            continue;
778        };
779        fetch_time += t.elapsed();
780
781        // Pre-compute schema hash once for the entire group.
782        let t = std::time::Instant::now();
783        let schema_hash = lintel_validation_cache::schema_hash(&schema_value);
784        hash_time += t.elapsed();
785
786        // Split the group into validation cache hits and misses.
787        let mut cache_misses: Vec<&ParsedFile> = Vec::new();
788
789        let t = std::time::Instant::now();
790        for pf in group {
791            let (cached, vcache_status) = vcache
792                .lookup(&lintel_validation_cache::CacheKey {
793                    file_content: &pf.content,
794                    schema_hash: &schema_hash,
795                    validate_formats,
796                })
797                .await;
798
799            if let Some(cached_errors) = cached {
800                push_validation_errors(pf, schema_uri, &cached_errors, &mut errors);
801                let cf = CheckedFile {
802                    path: pf.path.clone(),
803                    schema: schema_uri.clone(),
804                    cache_status,
805                    validation_cache_status: Some(vcache_status),
806                };
807                on_check(&cf);
808                checked.push(cf);
809            } else {
810                cache_misses.push(pf);
811            }
812        }
813        vcache_time += t.elapsed();
814
815        tracing::debug!(
816            cache_hits = group.len() - cache_misses.len(),
817            cache_misses = cache_misses.len(),
818            "validation cache"
819        );
820
821        // If all files hit the validation cache, skip schema compilation entirely.
822        if cache_misses.is_empty() {
823            continue;
824        }
825
826        // Compile the schema for cache misses.
827        let t = std::time::Instant::now();
828        let validator = {
829            // Set base URI for remote schemas so relative $ref values
830            // (e.g. "./rule.json") resolve correctly.
831            let is_remote_schema =
832                schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
833            let opts = jsonschema::async_options()
834                .with_retriever(retriever.clone())
835                .should_validate_formats(validate_formats);
836            let opts = if is_remote_schema {
837                opts.with_base_uri(schema_uri.clone())
838            } else {
839                opts
840            };
841            match opts.build(&schema_value).await {
842                Ok(v) => v,
843                Err(e) => {
844                    compile_time += t.elapsed();
845                    // When format validation is disabled and the compilation error
846                    // is a uri-reference issue (e.g. Rust-style $ref paths in
847                    // vector.json), skip validation silently.
848                    if !validate_formats && e.to_string().contains("uri-reference") {
849                        mark_group_checked(
850                            schema_uri,
851                            cache_status,
852                            Some(ValidationCacheStatus::Miss),
853                            &cache_misses,
854                            &mut checked,
855                            &mut on_check,
856                        );
857                        continue;
858                    }
859                    let msg = format!("failed to compile schema: {e}");
860                    report_group_error(
861                        |path| LintError::SchemaCompile {
862                            path: path.to_string(),
863                            message: msg.clone(),
864                        },
865                        schema_uri,
866                        cache_status,
867                        &cache_misses,
868                        &mut errors,
869                        &mut checked,
870                        &mut on_check,
871                    );
872                    continue;
873                }
874            }
875        };
876        compile_time += t.elapsed();
877
878        let t = std::time::Instant::now();
879        validate_group(
880            &validator,
881            schema_uri,
882            &schema_hash,
883            validate_formats,
884            cache_status,
885            &cache_misses,
886            &vcache,
887            &mut errors,
888            &mut checked,
889            &mut on_check,
890        )
891        .await;
892        validate_time += t.elapsed();
893    }
894
895    #[allow(clippy::cast_possible_truncation)]
896    {
897        tracing::info!(
898            fetch_ms = fetch_time.as_millis() as u64,
899            hash_ms = hash_time.as_millis() as u64,
900            vcache_ms = vcache_time.as_millis() as u64,
901            compile_ms = compile_time.as_millis() as u64,
902            validate_ms = validate_time.as_millis() as u64,
903            "phase2 breakdown"
904        );
905    }
906
907    // Sort errors for deterministic output (by path, then by span offset)
908    errors.sort_by(|a, b| {
909        a.path()
910            .cmp(b.path())
911            .then_with(|| a.offset().cmp(&b.offset()))
912    });
913
914    Ok(ValidateResult { errors, checked })
915}
916
917#[cfg(test)]
918mod tests {
919    use super::*;
920    use lintel_schema_cache::SchemaCache;
921    use std::path::Path;
922
923    fn mock(entries: &[(&str, &str)]) -> SchemaCache {
924        let cache = SchemaCache::memory();
925        for (uri, body) in entries {
926            cache.insert(
927                uri,
928                serde_json::from_str(body).expect("test mock: invalid JSON"),
929            );
930        }
931        cache
932    }
933
934    fn testdata() -> PathBuf {
935        Path::new(env!("CARGO_MANIFEST_DIR")).join("testdata")
936    }
937
938    /// Build glob patterns that scan one or more testdata directories for all supported file types.
939    fn scenario_globs(dirs: &[&str]) -> Vec<String> {
940        dirs.iter()
941            .flat_map(|dir| {
942                let base = testdata().join(dir);
943                vec![
944                    base.join("*.json").to_string_lossy().to_string(),
945                    base.join("*.yaml").to_string_lossy().to_string(),
946                    base.join("*.yml").to_string_lossy().to_string(),
947                    base.join("*.json5").to_string_lossy().to_string(),
948                    base.join("*.jsonc").to_string_lossy().to_string(),
949                    base.join("*.toml").to_string_lossy().to_string(),
950                ]
951            })
952            .collect()
953    }
954
955    fn args_for_dirs(dirs: &[&str]) -> ValidateArgs {
956        ValidateArgs {
957            globs: scenario_globs(dirs),
958            exclude: vec![],
959            cache_dir: None,
960            force_schema_fetch: true,
961            force_validation: true,
962            no_catalog: true,
963            config_dir: None,
964            schema_cache_ttl: None,
965        }
966    }
967
    /// Minimal shared test schema: an object with a required string `name` property.
    const SCHEMA: &str =
        r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
970
    /// Cache containing only [`SCHEMA`] served at `https://example.com/schema.json`.
    fn schema_mock() -> SchemaCache {
        mock(&[("https://example.com/schema.json", SCHEMA)])
    }
974
975    // --- Directory scanning tests ---
976
977    #[tokio::test]
978    async fn no_matching_files() -> anyhow::Result<()> {
979        let tmp = tempfile::tempdir()?;
980        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
981        let c = ValidateArgs {
982            globs: vec![pattern],
983            exclude: vec![],
984            cache_dir: None,
985            force_schema_fetch: true,
986            force_validation: true,
987            no_catalog: true,
988            config_dir: None,
989            schema_cache_ttl: None,
990        };
991        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
992        assert!(!result.has_errors());
993        Ok(())
994    }
995
996    #[tokio::test]
997    async fn dir_all_valid() -> anyhow::Result<()> {
998        let c = args_for_dirs(&["positive_tests"]);
999        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1000        assert!(!result.has_errors());
1001        Ok(())
1002    }
1003
1004    #[tokio::test]
1005    async fn dir_all_invalid() -> anyhow::Result<()> {
1006        let c = args_for_dirs(&["negative_tests"]);
1007        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1008        assert!(result.has_errors());
1009        Ok(())
1010    }
1011
1012    #[tokio::test]
1013    async fn dir_mixed_valid_and_invalid() -> anyhow::Result<()> {
1014        let c = args_for_dirs(&["positive_tests", "negative_tests"]);
1015        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1016        assert!(result.has_errors());
1017        Ok(())
1018    }
1019
1020    #[tokio::test]
1021    async fn dir_no_schemas_skipped() -> anyhow::Result<()> {
1022        let c = args_for_dirs(&["no_schema"]);
1023        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1024        assert!(!result.has_errors());
1025        Ok(())
1026    }
1027
1028    #[tokio::test]
1029    async fn dir_valid_with_no_schema_files() -> anyhow::Result<()> {
1030        let c = args_for_dirs(&["positive_tests", "no_schema"]);
1031        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1032        assert!(!result.has_errors());
1033        Ok(())
1034    }
1035
1036    // --- Directory as positional arg ---
1037
1038    #[tokio::test]
1039    async fn directory_arg_discovers_files() -> anyhow::Result<()> {
1040        let dir = testdata().join("positive_tests");
1041        let c = ValidateArgs {
1042            globs: vec![dir.to_string_lossy().to_string()],
1043            exclude: vec![],
1044            cache_dir: None,
1045            force_schema_fetch: true,
1046            force_validation: true,
1047            no_catalog: true,
1048            config_dir: None,
1049            schema_cache_ttl: None,
1050        };
1051        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1052        assert!(!result.has_errors());
1053        assert!(result.files_checked() > 0);
1054        Ok(())
1055    }
1056
1057    #[tokio::test]
1058    async fn multiple_directory_args() -> anyhow::Result<()> {
1059        let pos_dir = testdata().join("positive_tests");
1060        let no_schema_dir = testdata().join("no_schema");
1061        let c = ValidateArgs {
1062            globs: vec![
1063                pos_dir.to_string_lossy().to_string(),
1064                no_schema_dir.to_string_lossy().to_string(),
1065            ],
1066            exclude: vec![],
1067            cache_dir: None,
1068            force_schema_fetch: true,
1069            force_validation: true,
1070            no_catalog: true,
1071            config_dir: None,
1072            schema_cache_ttl: None,
1073        };
1074        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1075        assert!(!result.has_errors());
1076        Ok(())
1077    }
1078
1079    #[tokio::test]
1080    async fn mix_directory_and_glob_args() -> anyhow::Result<()> {
1081        let dir = testdata().join("positive_tests");
1082        let glob_pattern = testdata()
1083            .join("no_schema")
1084            .join("*.json")
1085            .to_string_lossy()
1086            .to_string();
1087        let c = ValidateArgs {
1088            globs: vec![dir.to_string_lossy().to_string(), glob_pattern],
1089            exclude: vec![],
1090            cache_dir: None,
1091            force_schema_fetch: true,
1092            force_validation: true,
1093            no_catalog: true,
1094            config_dir: None,
1095            schema_cache_ttl: None,
1096        };
1097        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1098        assert!(!result.has_errors());
1099        Ok(())
1100    }
1101
1102    #[tokio::test]
1103    async fn malformed_json_parse_error() -> anyhow::Result<()> {
1104        let base = testdata().join("malformed");
1105        let c = ValidateArgs {
1106            globs: vec![base.join("*.json").to_string_lossy().to_string()],
1107            exclude: vec![],
1108            cache_dir: None,
1109            force_schema_fetch: true,
1110            force_validation: true,
1111            no_catalog: true,
1112            config_dir: None,
1113            schema_cache_ttl: None,
1114        };
1115        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1116        assert!(result.has_errors());
1117        Ok(())
1118    }
1119
1120    #[tokio::test]
1121    async fn malformed_yaml_parse_error() -> anyhow::Result<()> {
1122        let base = testdata().join("malformed");
1123        let c = ValidateArgs {
1124            globs: vec![base.join("*.yaml").to_string_lossy().to_string()],
1125            exclude: vec![],
1126            cache_dir: None,
1127            force_schema_fetch: true,
1128            force_validation: true,
1129            no_catalog: true,
1130            config_dir: None,
1131            schema_cache_ttl: None,
1132        };
1133        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1134        assert!(result.has_errors());
1135        Ok(())
1136    }
1137
1138    // --- Exclude filter ---
1139
1140    #[tokio::test]
1141    async fn exclude_filters_files_in_dir() -> anyhow::Result<()> {
1142        let base = testdata().join("negative_tests");
1143        let c = ValidateArgs {
1144            globs: scenario_globs(&["positive_tests", "negative_tests"]),
1145            exclude: vec![
1146                base.join("missing_name.json").to_string_lossy().to_string(),
1147                base.join("missing_name.toml").to_string_lossy().to_string(),
1148                base.join("missing_name.yaml").to_string_lossy().to_string(),
1149            ],
1150            cache_dir: None,
1151            force_schema_fetch: true,
1152            force_validation: true,
1153            no_catalog: true,
1154            config_dir: None,
1155            schema_cache_ttl: None,
1156        };
1157        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1158        assert!(!result.has_errors());
1159        Ok(())
1160    }
1161
1162    // --- Cache options ---
1163
1164    #[tokio::test]
1165    async fn custom_cache_dir() -> anyhow::Result<()> {
1166        let c = ValidateArgs {
1167            globs: scenario_globs(&["positive_tests"]),
1168            exclude: vec![],
1169            cache_dir: None,
1170            force_schema_fetch: true,
1171            force_validation: true,
1172            no_catalog: true,
1173            config_dir: None,
1174            schema_cache_ttl: None,
1175        };
1176        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1177        assert!(!result.has_errors());
1178        Ok(())
1179    }
1180
1181    // --- Local schema ---
1182
1183    #[tokio::test]
1184    async fn json_valid_with_local_schema() -> anyhow::Result<()> {
1185        let tmp = tempfile::tempdir()?;
1186        let schema_path = tmp.path().join("schema.json");
1187        fs::write(&schema_path, SCHEMA)?;
1188
1189        let f = tmp.path().join("valid.json");
1190        fs::write(
1191            &f,
1192            format!(
1193                r#"{{"$schema":"{}","name":"hello"}}"#,
1194                schema_path.to_string_lossy()
1195            ),
1196        )?;
1197
1198        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1199        let c = ValidateArgs {
1200            globs: vec![pattern],
1201            exclude: vec![],
1202            cache_dir: None,
1203            force_schema_fetch: true,
1204            force_validation: true,
1205            no_catalog: true,
1206            config_dir: None,
1207            schema_cache_ttl: None,
1208        };
1209        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1210        assert!(!result.has_errors());
1211        Ok(())
1212    }
1213
1214    #[tokio::test]
1215    async fn yaml_valid_with_local_schema() -> anyhow::Result<()> {
1216        let tmp = tempfile::tempdir()?;
1217        let schema_path = tmp.path().join("schema.json");
1218        fs::write(&schema_path, SCHEMA)?;
1219
1220        let f = tmp.path().join("valid.yaml");
1221        fs::write(
1222            &f,
1223            format!(
1224                "# yaml-language-server: $schema={}\nname: hello\n",
1225                schema_path.to_string_lossy()
1226            ),
1227        )?;
1228
1229        let pattern = tmp.path().join("*.yaml").to_string_lossy().to_string();
1230        let c = ValidateArgs {
1231            globs: vec![pattern],
1232            exclude: vec![],
1233            cache_dir: None,
1234            force_schema_fetch: true,
1235            force_validation: true,
1236            no_catalog: true,
1237            config_dir: None,
1238            schema_cache_ttl: None,
1239        };
1240        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1241        assert!(!result.has_errors());
1242        Ok(())
1243    }
1244
1245    #[tokio::test]
1246    async fn missing_local_schema_errors() -> anyhow::Result<()> {
1247        let tmp = tempfile::tempdir()?;
1248        let f = tmp.path().join("ref.json");
1249        fs::write(&f, r#"{"$schema":"/nonexistent/schema.json"}"#)?;
1250
1251        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1252        let c = ValidateArgs {
1253            globs: vec![pattern],
1254            exclude: vec![],
1255            cache_dir: None,
1256            force_schema_fetch: true,
1257            force_validation: true,
1258            no_catalog: true,
1259            config_dir: None,
1260            schema_cache_ttl: None,
1261        };
1262        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1263        assert!(result.has_errors());
1264        Ok(())
1265    }
1266
1267    // --- JSON5 / JSONC tests ---
1268
1269    #[tokio::test]
1270    async fn json5_valid_with_schema() -> anyhow::Result<()> {
1271        let tmp = tempfile::tempdir()?;
1272        let schema_path = tmp.path().join("schema.json");
1273        fs::write(&schema_path, SCHEMA)?;
1274
1275        let f = tmp.path().join("config.json5");
1276        fs::write(
1277            &f,
1278            format!(
1279                r#"{{
1280  // JSON5 comment
1281  "$schema": "{}",
1282  name: "hello",
1283}}"#,
1284                schema_path.to_string_lossy()
1285            ),
1286        )?;
1287
1288        let pattern = tmp.path().join("*.json5").to_string_lossy().to_string();
1289        let c = ValidateArgs {
1290            globs: vec![pattern],
1291            exclude: vec![],
1292            cache_dir: None,
1293            force_schema_fetch: true,
1294            force_validation: true,
1295            no_catalog: true,
1296            config_dir: None,
1297            schema_cache_ttl: None,
1298        };
1299        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1300        assert!(!result.has_errors());
1301        Ok(())
1302    }
1303
1304    #[tokio::test]
1305    async fn jsonc_valid_with_schema() -> anyhow::Result<()> {
1306        let tmp = tempfile::tempdir()?;
1307        let schema_path = tmp.path().join("schema.json");
1308        fs::write(&schema_path, SCHEMA)?;
1309
1310        let f = tmp.path().join("config.jsonc");
1311        fs::write(
1312            &f,
1313            format!(
1314                r#"{{
1315  /* JSONC comment */
1316  "$schema": "{}",
1317  "name": "hello"
1318}}"#,
1319                schema_path.to_string_lossy()
1320            ),
1321        )?;
1322
1323        let pattern = tmp.path().join("*.jsonc").to_string_lossy().to_string();
1324        let c = ValidateArgs {
1325            globs: vec![pattern],
1326            exclude: vec![],
1327            cache_dir: None,
1328            force_schema_fetch: true,
1329            force_validation: true,
1330            no_catalog: true,
1331            config_dir: None,
1332            schema_cache_ttl: None,
1333        };
1334        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1335        assert!(!result.has_errors());
1336        Ok(())
1337    }
1338
1339    // --- Catalog-based schema matching ---
1340
    /// Simplified GitHub Actions workflow schema requiring `on` and `jobs`.
    const GH_WORKFLOW_SCHEMA: &str = r#"{
        "type": "object",
        "properties": {
            "name": { "type": "string" },
            "on": {},
            "jobs": { "type": "object" }
        },
        "required": ["on", "jobs"]
    }"#;
1350
    /// SchemaStore-style catalog JSON that maps `.github/workflows/*.yml|yaml`
    /// files to the GitHub workflow schema URL.
    fn gh_catalog_json() -> String {
        r#"{"version":1,"schemas":[{
            "name": "GitHub Workflow",
            "description": "GitHub Actions workflow",
            "url": "https://www.schemastore.org/github-workflow.json",
            "fileMatch": [
                "**/.github/workflows/*.yml",
                "**/.github/workflows/*.yaml"
            ]
        }]}"#
            .to_string()
    }
1363
1364    #[tokio::test]
1365    async fn catalog_matches_github_workflow_valid() -> anyhow::Result<()> {
1366        let tmp = tempfile::tempdir()?;
1367        let cache_tmp = tempfile::tempdir()?;
1368        let wf_dir = tmp.path().join(".github/workflows");
1369        fs::create_dir_all(&wf_dir)?;
1370        fs::write(
1371            wf_dir.join("ci.yml"),
1372            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1373        )?;
1374
1375        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1376        let client = mock(&[
1377            (
1378                "https://www.schemastore.org/api/json/catalog.json",
1379                &gh_catalog_json(),
1380            ),
1381            (
1382                "https://www.schemastore.org/github-workflow.json",
1383                GH_WORKFLOW_SCHEMA,
1384            ),
1385        ]);
1386        let c = ValidateArgs {
1387            globs: vec![pattern],
1388            exclude: vec![],
1389            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1390            force_schema_fetch: true,
1391            force_validation: true,
1392            no_catalog: false,
1393            config_dir: None,
1394            schema_cache_ttl: None,
1395        };
1396        let result = run_with(&c, Some(client), |_| {}).await?;
1397        assert!(!result.has_errors());
1398        Ok(())
1399    }
1400
1401    #[tokio::test]
1402    async fn catalog_matches_github_workflow_invalid() -> anyhow::Result<()> {
1403        let tmp = tempfile::tempdir()?;
1404        let cache_tmp = tempfile::tempdir()?;
1405        let wf_dir = tmp.path().join(".github/workflows");
1406        fs::create_dir_all(&wf_dir)?;
1407        fs::write(wf_dir.join("bad.yml"), "name: Broken\n")?;
1408
1409        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1410        let client = mock(&[
1411            (
1412                "https://www.schemastore.org/api/json/catalog.json",
1413                &gh_catalog_json(),
1414            ),
1415            (
1416                "https://www.schemastore.org/github-workflow.json",
1417                GH_WORKFLOW_SCHEMA,
1418            ),
1419        ]);
1420        let c = ValidateArgs {
1421            globs: vec![pattern],
1422            exclude: vec![],
1423            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1424            force_schema_fetch: true,
1425            force_validation: true,
1426            no_catalog: false,
1427            config_dir: None,
1428            schema_cache_ttl: None,
1429        };
1430        let result = run_with(&c, Some(client), |_| {}).await?;
1431        assert!(result.has_errors());
1432        Ok(())
1433    }
1434
1435    #[tokio::test]
1436    async fn auto_discover_finds_github_workflows() -> anyhow::Result<()> {
1437        let tmp = tempfile::tempdir()?;
1438        let cache_tmp = tempfile::tempdir()?;
1439        let wf_dir = tmp.path().join(".github/workflows");
1440        fs::create_dir_all(&wf_dir)?;
1441        fs::write(
1442            wf_dir.join("ci.yml"),
1443            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1444        )?;
1445
1446        let client = mock(&[
1447            (
1448                "https://www.schemastore.org/api/json/catalog.json",
1449                &gh_catalog_json(),
1450            ),
1451            (
1452                "https://www.schemastore.org/github-workflow.json",
1453                GH_WORKFLOW_SCHEMA,
1454            ),
1455        ]);
1456        let c = ValidateArgs {
1457            globs: vec![],
1458            exclude: vec![],
1459            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1460            force_schema_fetch: true,
1461            force_validation: true,
1462            no_catalog: false,
1463            config_dir: None,
1464            schema_cache_ttl: None,
1465        };
1466
1467        let orig_dir = std::env::current_dir()?;
1468        std::env::set_current_dir(tmp.path())?;
1469        let result = run_with(&c, Some(client), |_| {}).await?;
1470        std::env::set_current_dir(orig_dir)?;
1471
1472        assert!(!result.has_errors());
1473        Ok(())
1474    }
1475
1476    // --- TOML tests ---
1477
1478    #[tokio::test]
1479    async fn toml_valid_with_schema() -> anyhow::Result<()> {
1480        let tmp = tempfile::tempdir()?;
1481        let schema_path = tmp.path().join("schema.json");
1482        fs::write(&schema_path, SCHEMA)?;
1483
1484        let f = tmp.path().join("config.toml");
1485        fs::write(
1486            &f,
1487            format!(
1488                "# :schema {}\nname = \"hello\"\n",
1489                schema_path.to_string_lossy()
1490            ),
1491        )?;
1492
1493        let pattern = tmp.path().join("*.toml").to_string_lossy().to_string();
1494        let c = ValidateArgs {
1495            globs: vec![pattern],
1496            exclude: vec![],
1497            cache_dir: None,
1498            force_schema_fetch: true,
1499            force_validation: true,
1500            no_catalog: true,
1501            config_dir: None,
1502            schema_cache_ttl: None,
1503        };
1504        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1505        assert!(!result.has_errors());
1506        Ok(())
1507    }
1508
1509    // --- Rewrite rules + // resolution ---
1510
1511    #[tokio::test]
1512    async fn rewrite_rule_with_double_slash_resolves_schema() -> anyhow::Result<()> {
1513        let tmp = tempfile::tempdir()?;
1514
1515        let schemas_dir = tmp.path().join("schemas");
1516        fs::create_dir_all(&schemas_dir)?;
1517        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1518
1519        fs::write(
1520            tmp.path().join("lintel.toml"),
1521            r#"
1522[rewrite]
1523"http://localhost:9000/" = "//schemas/"
1524"#,
1525        )?;
1526
1527        let f = tmp.path().join("config.json");
1528        fs::write(
1529            &f,
1530            r#"{"$schema":"http://localhost:9000/test.json","name":"hello"}"#,
1531        )?;
1532
1533        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1534        let c = ValidateArgs {
1535            globs: vec![pattern],
1536            exclude: vec![],
1537            cache_dir: None,
1538            force_schema_fetch: true,
1539            force_validation: true,
1540            no_catalog: true,
1541            config_dir: Some(tmp.path().to_path_buf()),
1542            schema_cache_ttl: None,
1543        };
1544
1545        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1546        assert!(!result.has_errors());
1547        assert_eq!(result.files_checked(), 1);
1548        Ok(())
1549    }
1550
1551    #[tokio::test]
1552    async fn double_slash_schema_resolves_relative_to_config() -> anyhow::Result<()> {
1553        let tmp = tempfile::tempdir()?;
1554
1555        let schemas_dir = tmp.path().join("schemas");
1556        fs::create_dir_all(&schemas_dir)?;
1557        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1558
1559        fs::write(tmp.path().join("lintel.toml"), "")?;
1560
1561        let sub = tmp.path().join("deeply/nested");
1562        fs::create_dir_all(&sub)?;
1563        let f = sub.join("config.json");
1564        fs::write(&f, r#"{"$schema":"//schemas/test.json","name":"hello"}"#)?;
1565
1566        let pattern = sub.join("*.json").to_string_lossy().to_string();
1567        let c = ValidateArgs {
1568            globs: vec![pattern],
1569            exclude: vec![],
1570            cache_dir: None,
1571            force_schema_fetch: true,
1572            force_validation: true,
1573            no_catalog: true,
1574            config_dir: Some(tmp.path().to_path_buf()),
1575            schema_cache_ttl: None,
1576        };
1577
1578        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1579        assert!(!result.has_errors());
1580        Ok(())
1581    }
1582
1583    // --- Format validation override ---
1584
    /// Schema whose `link` property carries a `uri-reference` format constraint,
    /// used to exercise the per-file `validate_formats` override.
    const FORMAT_SCHEMA: &str = r#"{
        "type": "object",
        "properties": {
            "link": { "type": "string", "format": "uri-reference" }
        }
    }"#;
1591
1592    #[tokio::test]
1593    async fn format_errors_reported_without_override() -> anyhow::Result<()> {
1594        let tmp = tempfile::tempdir()?;
1595        let schema_path = tmp.path().join("schema.json");
1596        fs::write(&schema_path, FORMAT_SCHEMA)?;
1597
1598        let f = tmp.path().join("data.json");
1599        fs::write(
1600            &f,
1601            format!(
1602                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1603                schema_path.to_string_lossy()
1604            ),
1605        )?;
1606
1607        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1608        let c = ValidateArgs {
1609            globs: vec![pattern],
1610            exclude: vec![],
1611            cache_dir: None,
1612            force_schema_fetch: true,
1613            force_validation: true,
1614            no_catalog: true,
1615            config_dir: Some(tmp.path().to_path_buf()),
1616            schema_cache_ttl: None,
1617        };
1618        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1619        assert!(
1620            result.has_errors(),
1621            "expected format error without override"
1622        );
1623        Ok(())
1624    }
1625
1626    #[tokio::test]
1627    async fn format_errors_suppressed_with_override() -> anyhow::Result<()> {
1628        let tmp = tempfile::tempdir()?;
1629        let schema_path = tmp.path().join("schema.json");
1630        fs::write(&schema_path, FORMAT_SCHEMA)?;
1631
1632        let f = tmp.path().join("data.json");
1633        fs::write(
1634            &f,
1635            format!(
1636                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1637                schema_path.to_string_lossy()
1638            ),
1639        )?;
1640
1641        // Use **/data.json to match the absolute path from the tempdir.
1642        fs::write(
1643            tmp.path().join("lintel.toml"),
1644            r#"
1645[[override]]
1646files = ["**/data.json"]
1647validate_formats = false
1648"#,
1649        )?;
1650
1651        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1652        let c = ValidateArgs {
1653            globs: vec![pattern],
1654            exclude: vec![],
1655            cache_dir: None,
1656            force_schema_fetch: true,
1657            force_validation: true,
1658            no_catalog: true,
1659            config_dir: Some(tmp.path().to_path_buf()),
1660            schema_cache_ttl: None,
1661        };
1662        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1663        assert!(
1664            !result.has_errors(),
1665            "expected no errors with validate_formats = false override"
1666        );
1667        Ok(())
1668    }
1669
1670    // --- Unrecognized extension handling ---
1671
1672    #[tokio::test]
1673    async fn unrecognized_extension_skipped_without_catalog() -> anyhow::Result<()> {
1674        let tmp = tempfile::tempdir()?;
1675        fs::write(tmp.path().join("config.nix"), r#"{"name":"hello"}"#)?;
1676
1677        let pattern = tmp.path().join("config.nix").to_string_lossy().to_string();
1678        let c = ValidateArgs {
1679            globs: vec![pattern],
1680            exclude: vec![],
1681            cache_dir: None,
1682            force_schema_fetch: true,
1683            force_validation: true,
1684            no_catalog: true,
1685            config_dir: Some(tmp.path().to_path_buf()),
1686            schema_cache_ttl: None,
1687        };
1688        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1689        assert!(!result.has_errors());
1690        assert_eq!(result.files_checked(), 0);
1691        Ok(())
1692    }
1693
1694    #[tokio::test]
1695    async fn unrecognized_extension_parsed_when_catalog_matches() -> anyhow::Result<()> {
1696        let tmp = tempfile::tempdir()?;
1697        let cache_tmp = tempfile::tempdir()?;
1698        // File has .cfg extension (unrecognized) but content is valid JSON
1699        fs::write(
1700            tmp.path().join("myapp.cfg"),
1701            r#"{"name":"hello","on":"push","jobs":{"build":{}}}"#,
1702        )?;
1703
1704        let catalog_json = r#"{"version":1,"schemas":[{
1705            "name": "MyApp Config",
1706            "description": "MyApp configuration",
1707            "url": "https://example.com/myapp.schema.json",
1708            "fileMatch": ["*.cfg"]
1709        }]}"#;
1710        let schema =
1711            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1712
1713        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1714        let client = mock(&[
1715            (
1716                "https://www.schemastore.org/api/json/catalog.json",
1717                catalog_json,
1718            ),
1719            ("https://example.com/myapp.schema.json", schema),
1720        ]);
1721        let c = ValidateArgs {
1722            globs: vec![pattern],
1723            exclude: vec![],
1724            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1725            force_schema_fetch: true,
1726            force_validation: true,
1727            no_catalog: false,
1728            config_dir: Some(tmp.path().to_path_buf()),
1729            schema_cache_ttl: None,
1730        };
1731        let result = run_with(&c, Some(client), |_| {}).await?;
1732        assert!(!result.has_errors());
1733        assert_eq!(result.files_checked(), 1);
1734        Ok(())
1735    }
1736
1737    #[tokio::test]
1738    async fn unrecognized_extension_unparseable_skipped() -> anyhow::Result<()> {
1739        let tmp = tempfile::tempdir()?;
1740        let cache_tmp = tempfile::tempdir()?;
1741        // File matches catalog but content isn't parseable by any format
1742        fs::write(
1743            tmp.path().join("myapp.cfg"),
1744            "{ pkgs, ... }: { packages = [ pkgs.git ]; }",
1745        )?;
1746
1747        let catalog_json = r#"{"version":1,"schemas":[{
1748            "name": "MyApp Config",
1749            "description": "MyApp configuration",
1750            "url": "https://example.com/myapp.schema.json",
1751            "fileMatch": ["*.cfg"]
1752        }]}"#;
1753
1754        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1755        let client = mock(&[(
1756            "https://www.schemastore.org/api/json/catalog.json",
1757            catalog_json,
1758        )]);
1759        let c = ValidateArgs {
1760            globs: vec![pattern],
1761            exclude: vec![],
1762            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1763            force_schema_fetch: true,
1764            force_validation: true,
1765            no_catalog: false,
1766            config_dir: Some(tmp.path().to_path_buf()),
1767            schema_cache_ttl: None,
1768        };
1769        let result = run_with(&c, Some(client), |_| {}).await?;
1770        assert!(!result.has_errors());
1771        assert_eq!(result.files_checked(), 0);
1772        Ok(())
1773    }
1774
1775    #[tokio::test]
1776    async fn unrecognized_extension_invalid_against_schema() -> anyhow::Result<()> {
1777        let tmp = tempfile::tempdir()?;
1778        let cache_tmp = tempfile::tempdir()?;
1779        // File has .cfg extension, content is valid JSON but fails schema validation
1780        fs::write(tmp.path().join("myapp.cfg"), r#"{"wrong":"field"}"#)?;
1781
1782        let catalog_json = r#"{"version":1,"schemas":[{
1783            "name": "MyApp Config",
1784            "description": "MyApp configuration",
1785            "url": "https://example.com/myapp.schema.json",
1786            "fileMatch": ["*.cfg"]
1787        }]}"#;
1788        let schema =
1789            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1790
1791        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1792        let client = mock(&[
1793            (
1794                "https://www.schemastore.org/api/json/catalog.json",
1795                catalog_json,
1796            ),
1797            ("https://example.com/myapp.schema.json", schema),
1798        ]);
1799        let c = ValidateArgs {
1800            globs: vec![pattern],
1801            exclude: vec![],
1802            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1803            force_schema_fetch: true,
1804            force_validation: true,
1805            no_catalog: false,
1806            config_dir: Some(tmp.path().to_path_buf()),
1807            schema_cache_ttl: None,
1808        };
1809        let result = run_with(&c, Some(client), |_| {}).await?;
1810        assert!(result.has_errors());
1811        assert_eq!(result.files_checked(), 1);
1812        Ok(())
1813    }
1814
1815    // --- Validation cache ---
1816
1817    #[tokio::test]
1818    async fn validation_cache_hit_skips_revalidation() -> anyhow::Result<()> {
1819        let tmp = tempfile::tempdir()?;
1820        let schema_path = tmp.path().join("schema.json");
1821        fs::write(&schema_path, SCHEMA)?;
1822
1823        let f = tmp.path().join("valid.json");
1824        fs::write(
1825            &f,
1826            format!(
1827                r#"{{"$schema":"{}","name":"hello"}}"#,
1828                schema_path.to_string_lossy()
1829            ),
1830        )?;
1831
1832        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1833
1834        // First run: force_validation = false so results get cached
1835        let c = ValidateArgs {
1836            globs: vec![pattern.clone()],
1837            exclude: vec![],
1838            cache_dir: None,
1839            force_schema_fetch: true,
1840            force_validation: false,
1841            no_catalog: true,
1842            config_dir: None,
1843            schema_cache_ttl: None,
1844        };
1845        let mut first_statuses = Vec::new();
1846        let result = run_with(&c, Some(mock(&[])), |cf| {
1847            first_statuses.push(cf.validation_cache_status);
1848        })
1849        .await?;
1850        assert!(!result.has_errors());
1851        assert!(result.files_checked() > 0);
1852
1853        // Verify the first run recorded a validation cache miss
1854        assert!(
1855            first_statuses.contains(&Some(ValidationCacheStatus::Miss)),
1856            "expected at least one validation cache miss on first run"
1857        );
1858
1859        // Second run: same file, same schema — should hit validation cache
1860        let mut second_statuses = Vec::new();
1861        let result = run_with(&c, Some(mock(&[])), |cf| {
1862            second_statuses.push(cf.validation_cache_status);
1863        })
1864        .await?;
1865        assert!(!result.has_errors());
1866
1867        // Verify the second run got a validation cache hit
1868        assert!(
1869            second_statuses.contains(&Some(ValidationCacheStatus::Hit)),
1870            "expected at least one validation cache hit on second run"
1871        );
1872        Ok(())
1873    }
1874
1875    // --- clean_error_message ---
1876
1877    #[test]
1878    fn clean_strips_anyof_value() {
1879        let msg =
1880            r#"{"type":"bad"} is not valid under any of the schemas listed in the 'anyOf' keyword"#;
1881        assert_eq!(
1882            clean_error_message(msg.to_string()),
1883            "not valid under any of the schemas listed in the 'anyOf' keyword"
1884        );
1885    }
1886
1887    #[test]
1888    fn clean_strips_oneof_value() {
1889        let msg = r#"{"runs-on":"ubuntu-latest","steps":[]} is not valid under any of the schemas listed in the 'oneOf' keyword"#;
1890        assert_eq!(
1891            clean_error_message(msg.to_string()),
1892            "not valid under any of the schemas listed in the 'oneOf' keyword"
1893        );
1894    }
1895
1896    #[test]
1897    fn clean_strips_long_value() {
1898        let long_value = "x".repeat(5000);
1899        let suffix = " is not valid under any of the schemas listed in the 'anyOf' keyword";
1900        let msg = format!("{long_value}{suffix}");
1901        assert_eq!(
1902            clean_error_message(msg),
1903            "not valid under any of the schemas listed in the 'anyOf' keyword"
1904        );
1905    }
1906
1907    #[test]
1908    fn clean_preserves_type_error() {
1909        let msg = r#"12345 is not of types "null", "string""#;
1910        assert_eq!(clean_error_message(msg.to_string()), msg);
1911    }
1912
1913    #[test]
1914    fn clean_preserves_required_property() {
1915        let msg = "\"name\" is a required property";
1916        assert_eq!(clean_error_message(msg.to_string()), msg);
1917    }
1918}