Skip to main content

lintel_validate/
validate.rs

1use alloc::collections::BTreeMap;
2use std::collections::HashMap;
3use std::fs;
4use std::path::{Path, PathBuf};
5
6use anyhow::{Context, Result};
7use glob::glob;
8use serde_json::Value;
9
10use crate::catalog;
11use lintel_schema_cache::{CacheStatus, SchemaCache};
12use lintel_validation_cache::{ValidationCacheStatus, ValidationError};
13use schemastore::CompiledCatalog;
14
15use crate::diagnostics::{DEFAULT_LABEL, find_instance_path_span, format_label};
16use crate::discover;
17use crate::parsers::{self, FileFormat, JsoncParser, Parser};
18use crate::registry;
19
/// Conservative limit for concurrent file reads to avoid exhausting file
/// descriptors. 128 is well below the default soft limit on macOS (256) and
/// Linux (1024) while still providing good throughput.
///
/// Used as the permit count of the semaphore that gates file reads in
/// `parse_and_group_files`.
const FD_CONCURRENCY_LIMIT: usize = 128;
24
/// Options for a validation run, consumed by [`run`] and [`run_with`].
pub struct ValidateArgs {
    /// Glob patterns to find files (empty = auto-discover)
    pub globs: Vec<String>,

    /// Exclude files matching these globs (repeatable)
    pub exclude: Vec<String>,

    /// Cache directory for remote schemas
    pub cache_dir: Option<String>,

    /// Bypass schema cache reads (still writes fetched schemas to cache)
    pub force_schema_fetch: bool,

    /// Bypass validation cache reads (still writes results to cache)
    pub force_validation: bool,

    /// Disable `SchemaStore` catalog matching
    pub no_catalog: bool,

    /// Directory to search for `lintel.toml` (defaults to cwd)
    pub config_dir: Option<PathBuf>,

    /// TTL for cached schemas. `None` means no expiry.
    pub schema_cache_ttl: Option<core::time::Duration>,
}
50
51/// Re-exported from [`crate::diagnostics::LintError`] so callers can use
52/// `lintel_validate::validate::LintError` without importing diagnostics.
53pub use crate::diagnostics::LintError;
54
/// A file that was checked and the schema it resolved to.
pub struct CheckedFile {
    /// Path of the checked file.
    pub path: String,
    /// URI (or local path) of the schema the file was checked against.
    pub schema: String,
    /// `None` for local schemas and builtins; `Some` for remote schemas.
    pub cache_status: Option<CacheStatus>,
    /// `None` when validation caching is not applicable; `Some` for validation cache hits/misses.
    pub validation_cache_status: Option<ValidationCacheStatus>,
}
64
/// Result of a validation run.
pub struct ValidateResult {
    /// Errors collected during the run.
    pub errors: Vec<LintError>,
    /// Files that were checked, together with the schema each resolved to.
    pub checked: Vec<CheckedFile>,
}

impl ValidateResult {
    /// Returns `true` if at least one error was recorded.
    pub fn has_errors(&self) -> bool {
        !self.errors.is_empty()
    }

    /// Returns the number of entries in `checked`.
    pub fn files_checked(&self) -> usize {
        self.checked.len()
    }
}
80
81// ---------------------------------------------------------------------------
82// Internal types
83// ---------------------------------------------------------------------------
84
/// A file that has been parsed and matched to a schema URI.
struct ParsedFile {
    /// Display form of the file's path.
    path: String,
    /// Raw text of the file as it was read.
    content: String,
    /// Parsed document that gets validated against the schema.
    instance: Value,
    /// Original schema URI before rewrites (for override matching).
    original_schema_uri: String,
}
93
94// ---------------------------------------------------------------------------
95// Config loading
96// ---------------------------------------------------------------------------
97
98/// Locate `lintel.toml`, load the full config, and return the config directory.
99/// Returns `(config, config_dir, config_path)`.  When no config is found or
100/// cwd is unavailable the config is default and `config_path` is `None`.
101#[tracing::instrument(skip_all)]
102pub fn load_config(search_dir: Option<&Path>) -> (lintel_config::Config, PathBuf, Option<PathBuf>) {
103    let start_dir = match search_dir {
104        Some(d) => d.to_path_buf(),
105        None => match std::env::current_dir() {
106            Ok(d) => d,
107            Err(_) => return (lintel_config::Config::default(), PathBuf::from("."), None),
108        },
109    };
110
111    let Some(config_path) = lintel_config::find_config_path(&start_dir) else {
112        return (lintel_config::Config::default(), start_dir, None);
113    };
114
115    let dir = config_path.parent().unwrap_or(&start_dir).to_path_buf();
116    let cfg = lintel_config::find_and_load(&start_dir)
117        .ok()
118        .flatten()
119        .unwrap_or_default();
120    (cfg, dir, Some(config_path))
121}
122
123// ---------------------------------------------------------------------------
124// File collection
125// ---------------------------------------------------------------------------
126
127/// Collect input files from globs/directories, applying exclude filters.
128///
129/// # Errors
130///
131/// Returns an error if a glob pattern is invalid or a directory cannot be walked.
132#[tracing::instrument(skip_all, fields(glob_count = globs.len(), exclude_count = exclude.len()))]
133pub fn collect_files(globs: &[String], exclude: &[String]) -> Result<Vec<PathBuf>> {
134    if globs.is_empty() {
135        return discover::discover_files(".", exclude);
136    }
137
138    let mut result = Vec::new();
139    for pattern in globs {
140        let path = Path::new(pattern);
141        if path.is_dir() {
142            result.extend(discover::discover_files(pattern, exclude)?);
143        } else {
144            for entry in glob(pattern).with_context(|| format!("invalid glob: {pattern}"))? {
145                let path = entry?;
146                if path.is_file() && !is_excluded(&path, exclude) {
147                    result.push(path);
148                }
149            }
150        }
151    }
152    Ok(result)
153}
154
155fn is_excluded(path: &Path, excludes: &[String]) -> bool {
156    let path_str = match path.to_str() {
157        Some(s) => s.strip_prefix("./").unwrap_or(s),
158        None => return false,
159    };
160    excludes
161        .iter()
162        .any(|pattern| glob_match::glob_match(pattern, path_str))
163}
164
165// ---------------------------------------------------------------------------
166// Phase 1: Parse files and resolve schema URIs
167// ---------------------------------------------------------------------------
168
169/// Try parsing content with each known format, returning the first success.
170///
171/// JSONC is tried first (superset of JSON, handles comments), then YAML and
172/// TOML which cover the most common config formats, followed by the rest.
173pub fn try_parse_all(content: &str, file_name: &str) -> Option<(parsers::FileFormat, Value)> {
174    use parsers::FileFormat::{Json, Json5, Jsonc, Markdown, Toml, Yaml};
175    const FORMATS: [parsers::FileFormat; 6] = [Jsonc, Yaml, Toml, Json, Json5, Markdown];
176
177    for fmt in FORMATS {
178        let parser = parsers::parser_for(fmt);
179        if let Ok(val) = parser.parse(content, file_name) {
180            return Some((fmt, val));
181        }
182    }
183    None
184}
185
/// Result of processing a single file: either a parsed file with its schema URI,
/// a lint error, or nothing (file was skipped).
enum FileResult {
    /// The file parsed successfully and a schema was resolved for it.
    Parsed {
        /// Resolved schema URI (after rewrites and relative-path resolution);
        /// used as the grouping key.
        schema_uri: String,
        /// The parsed file itself.
        parsed: ParsedFile,
    },
    /// The file could not be parsed; carries the diagnostic to report.
    Error(LintError),
    /// The file is not subject to validation (e.g. no schema applies to it).
    Skip,
}
196
197/// Process a single file's already-read content: parse and resolve schema URI.
198#[allow(clippy::too_many_arguments)]
199fn process_one_file(
200    path: &Path,
201    content: String,
202    config: &lintel_config::Config,
203    config_dir: &Path,
204    compiled_catalogs: &[CompiledCatalog],
205) -> FileResult {
206    let path_str = path.display().to_string();
207    let file_name = path
208        .file_name()
209        .and_then(|n| n.to_str())
210        .unwrap_or(&path_str);
211
212    let detected_format = parsers::detect_format(path);
213
214    // For unrecognized extensions, only proceed if a catalog or config mapping matches.
215    if detected_format.is_none() {
216        let has_match = config.find_schema_mapping(&path_str, file_name).is_some()
217            || compiled_catalogs
218                .iter()
219                .any(|cat| cat.find_schema(&path_str, file_name).is_some());
220        if !has_match {
221            return FileResult::Skip;
222        }
223    }
224
225    // Parse the file content.
226    let (parser, instance): (Box<dyn Parser>, Value) = if let Some(fmt) = detected_format {
227        let parser = parsers::parser_for(fmt);
228        match parser.parse(&content, &path_str) {
229            Ok(val) => (parser, val),
230            Err(parse_err) => {
231                // JSONC fallback for .json files that match a catalog entry.
232                if fmt == FileFormat::Json
233                    && compiled_catalogs
234                        .iter()
235                        .any(|cat| cat.find_schema(&path_str, file_name).is_some())
236                {
237                    match JsoncParser.parse(&content, &path_str) {
238                        Ok(val) => (parsers::parser_for(FileFormat::Jsonc), val),
239                        Err(jsonc_err) => return FileResult::Error(jsonc_err.into()),
240                    }
241                } else {
242                    return FileResult::Error(parse_err.into());
243                }
244            }
245        }
246    } else {
247        match try_parse_all(&content, &path_str) {
248            Some((fmt, val)) => (parsers::parser_for(fmt), val),
249            None => return FileResult::Skip,
250        }
251    };
252
253    // Skip markdown files with no frontmatter
254    if instance.is_null() {
255        return FileResult::Skip;
256    }
257
258    // Schema resolution priority:
259    // 1. Inline $schema / YAML modeline (always wins)
260    // 2. Custom schema mappings from lintel.toml [schemas]
261    // 3. Catalog matching (custom registries > Lintel catalog > SchemaStore)
262    let schema_uri = parser
263        .extract_schema_uri(&content, &instance)
264        .or_else(|| {
265            config
266                .find_schema_mapping(&path_str, file_name)
267                .map(str::to_string)
268        })
269        .or_else(|| {
270            compiled_catalogs
271                .iter()
272                .find_map(|cat| cat.find_schema(&path_str, file_name))
273                .map(str::to_string)
274        });
275
276    let Some(schema_uri) = schema_uri else {
277        return FileResult::Skip;
278    };
279
280    // Keep original URI for override matching (before rewrites)
281    let original_schema_uri = schema_uri.clone();
282
283    // Apply rewrite rules, then resolve // paths relative to lintel.toml
284    let schema_uri = lintel_config::apply_rewrites(&schema_uri, &config.rewrite);
285    let schema_uri = lintel_config::resolve_double_slash(&schema_uri, config_dir);
286
287    // Resolve relative local paths against the file's parent directory.
288    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
289    let schema_uri = if is_remote {
290        schema_uri
291    } else {
292        path.parent()
293            .map(|parent| parent.join(&schema_uri).to_string_lossy().to_string())
294            .unwrap_or(schema_uri)
295    };
296
297    FileResult::Parsed {
298        schema_uri,
299        parsed: ParsedFile {
300            path: path_str,
301            content,
302            instance,
303            original_schema_uri,
304        },
305    }
306}
307
308/// Read each file concurrently with tokio, parse its content, extract its
309/// schema URI, apply rewrites, and group by resolved schema URI.
310#[tracing::instrument(skip_all, fields(file_count = files.len()))]
311#[allow(clippy::too_many_arguments)]
312async fn parse_and_group_files(
313    files: &[PathBuf],
314    config: &lintel_config::Config,
315    config_dir: &Path,
316    compiled_catalogs: &[CompiledCatalog],
317    errors: &mut Vec<LintError>,
318) -> BTreeMap<String, Vec<ParsedFile>> {
319    // Read all files concurrently using tokio async I/O, with a semaphore
320    // to avoid exhausting file descriptors on large directories.
321    let semaphore = alloc::sync::Arc::new(tokio::sync::Semaphore::new(FD_CONCURRENCY_LIMIT));
322    let mut read_set = tokio::task::JoinSet::new();
323    for path in files {
324        let path = path.clone();
325        let sem = semaphore.clone();
326        read_set.spawn(async move {
327            let _permit = sem.acquire().await.expect("semaphore closed");
328            let result = tokio::fs::read_to_string(&path).await;
329            (path, result)
330        });
331    }
332
333    let mut file_contents = Vec::with_capacity(files.len());
334    while let Some(result) = read_set.join_next().await {
335        match result {
336            Ok(item) => file_contents.push(item),
337            Err(e) => tracing::warn!("file read task panicked: {e}"),
338        }
339    }
340
341    // Process files: parse content and resolve schema URIs.
342    let mut schema_groups: BTreeMap<String, Vec<ParsedFile>> = BTreeMap::new();
343    for (path, content_result) in file_contents {
344        let content = match content_result {
345            Ok(c) => c,
346            Err(e) => {
347                errors.push(LintError::Io {
348                    path: path.display().to_string(),
349                    message: format!("failed to read: {e}"),
350                });
351                continue;
352            }
353        };
354        let result = process_one_file(&path, content, config, config_dir, compiled_catalogs);
355        match result {
356            FileResult::Parsed { schema_uri, parsed } => {
357                schema_groups.entry(schema_uri).or_default().push(parsed);
358            }
359            FileResult::Error(e) => errors.push(e),
360            FileResult::Skip => {}
361        }
362    }
363
364    schema_groups
365}
366
367// ---------------------------------------------------------------------------
368// Phase 2: Schema fetching, compilation, and instance validation
369// ---------------------------------------------------------------------------
370
371/// Fetch a schema by URI, returning its parsed JSON and cache status.
372///
373/// For remote URIs, checks the prefetched map first; for local URIs, reads
374/// from disk (with in-memory caching to avoid redundant I/O for shared schemas).
375#[allow(clippy::too_many_arguments)]
376async fn fetch_schema_from_prefetched(
377    schema_uri: &str,
378    prefetched: &HashMap<String, Result<(Value, CacheStatus), String>>,
379    local_cache: &mut HashMap<String, Value>,
380    group: &[ParsedFile],
381    errors: &mut Vec<LintError>,
382    checked: &mut Vec<CheckedFile>,
383    on_check: &mut impl FnMut(&CheckedFile),
384) -> Option<(Value, Option<CacheStatus>)> {
385    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
386
387    let result: Result<(Value, Option<CacheStatus>), String> = if is_remote {
388        match prefetched.get(schema_uri) {
389            Some(Ok((v, status))) => Ok((v.clone(), Some(*status))),
390            Some(Err(e)) => Err(format!("failed to fetch schema: {schema_uri}: {e}")),
391            None => Err(format!("schema not prefetched: {schema_uri}")),
392        }
393    } else if let Some(cached) = local_cache.get(schema_uri) {
394        Ok((cached.clone(), None))
395    } else {
396        tokio::fs::read_to_string(schema_uri)
397            .await
398            .map_err(|e| format!("failed to read local schema {schema_uri}: {e}"))
399            .and_then(|content| {
400                serde_json::from_str::<Value>(&content)
401                    .map(|v| {
402                        local_cache.insert(schema_uri.to_string(), v.clone());
403                        (v, None)
404                    })
405                    .map_err(|e| format!("failed to parse local schema {schema_uri}: {e}"))
406            })
407    };
408
409    match result {
410        Ok(value) => Some(value),
411        Err(message) => {
412            report_group_error(
413                |path| LintError::SchemaFetch {
414                    path: path.to_string(),
415                    message: message.clone(),
416                },
417                schema_uri,
418                None,
419                group,
420                errors,
421                checked,
422                on_check,
423            );
424            None
425        }
426    }
427}
428
429/// Report the same error for every file in a schema group.
430#[allow(clippy::too_many_arguments)]
431fn report_group_error<P: alloc::borrow::Borrow<ParsedFile>>(
432    make_error: impl Fn(&str) -> LintError,
433    schema_uri: &str,
434    cache_status: Option<CacheStatus>,
435    group: &[P],
436    errors: &mut Vec<LintError>,
437    checked: &mut Vec<CheckedFile>,
438    on_check: &mut impl FnMut(&CheckedFile),
439) {
440    for item in group {
441        let pf = item.borrow();
442        let cf = CheckedFile {
443            path: pf.path.clone(),
444            schema: schema_uri.to_string(),
445            cache_status,
446            validation_cache_status: None,
447        };
448        on_check(&cf);
449        checked.push(cf);
450        errors.push(make_error(&pf.path));
451    }
452}
453
454/// Mark every file in a group as checked (no errors).
455#[allow(clippy::too_many_arguments)]
456fn mark_group_checked<P: alloc::borrow::Borrow<ParsedFile>>(
457    schema_uri: &str,
458    cache_status: Option<CacheStatus>,
459    validation_cache_status: Option<ValidationCacheStatus>,
460    group: &[P],
461    checked: &mut Vec<CheckedFile>,
462    on_check: &mut impl FnMut(&CheckedFile),
463) {
464    for item in group {
465        let pf = item.borrow();
466        let cf = CheckedFile {
467            path: pf.path.clone(),
468            schema: schema_uri.to_string(),
469            cache_status,
470            validation_cache_status,
471        };
472        on_check(&cf);
473        checked.push(cf);
474    }
475}
476
/// Clean up error messages from the `jsonschema` crate.
///
/// For `anyOf`/`oneOf` failures the crate dumps the entire JSON value into the
/// message (e.g. `{...} is not valid under any of the schemas listed in the 'oneOf' keyword`).
/// The source snippet already shows the value, so we strip the redundant prefix
/// and keep only `"not valid under any of the schemas listed in the 'oneOf' keyword"`.
///
/// The dumped value comes from the file being validated and may itself
/// contain the marker text, so the *last* occurrence of the marker is used:
/// that is the one written by the crate, after the value.
///
/// All other messages are returned unchanged.
fn clean_error_message(mut msg: String) -> String {
    const MARKER: &str = " is not valid under any of the schemas listed in the '";
    // Leading part of MARKER that is dropped together with the dumped value,
    // so the result starts at "not valid...".
    const LEAD_IN: &str = " is ";

    if let Some(pos) = msg.rfind(MARKER) {
        // `pos + LEAD_IN.len()` is a char boundary: LEAD_IN is ASCII and is a
        // prefix of the matched MARKER. Draining reuses the existing buffer.
        msg.drain(..pos + LEAD_IN.len());
    }
    msg
}
493
494/// Convert [`ValidationError`]s into [`LintError::Validation`] diagnostics.
495fn push_validation_errors(
496    pf: &ParsedFile,
497    schema_url: &str,
498    validation_errors: &[ValidationError],
499    errors: &mut Vec<LintError>,
500) {
501    for ve in validation_errors {
502        let span = find_instance_path_span(&pf.content, &ve.instance_path);
503        let instance_path = if ve.instance_path.is_empty() {
504            DEFAULT_LABEL.to_string()
505        } else {
506            ve.instance_path.clone()
507        };
508        let label = format_label(&instance_path, &ve.schema_path);
509        let source_span: miette::SourceSpan = span.into();
510        errors.push(LintError::Validation {
511            src: miette::NamedSource::new(&pf.path, pf.content.clone()),
512            span: source_span,
513            schema_span: source_span,
514            path: pf.path.clone(),
515            instance_path,
516            label,
517            message: ve.message.clone(),
518            schema_url: schema_url.to_string(),
519            schema_path: ve.schema_path.clone(),
520        });
521    }
522}
523
524/// Validate all files in a group against an already-compiled validator and store
525/// results in the validation cache.
526#[tracing::instrument(skip_all, fields(schema_uri, file_count = group.len()))]
527#[allow(clippy::too_many_arguments)]
528async fn validate_group<P: alloc::borrow::Borrow<ParsedFile>>(
529    validator: &jsonschema::Validator,
530    schema_uri: &str,
531    schema_hash: &str,
532    validate_formats: bool,
533    cache_status: Option<CacheStatus>,
534    group: &[P],
535    vcache: &lintel_validation_cache::ValidationCache,
536    errors: &mut Vec<LintError>,
537    checked: &mut Vec<CheckedFile>,
538    on_check: &mut impl FnMut(&CheckedFile),
539) {
540    for item in group {
541        let pf = item.borrow();
542        let file_errors: Vec<ValidationError> = validator
543            .iter_errors(&pf.instance)
544            .map(|error| ValidationError {
545                instance_path: error.instance_path().to_string(),
546                message: clean_error_message(error.to_string()),
547                schema_path: error.schema_path().to_string(),
548            })
549            .collect();
550
551        vcache
552            .store(
553                &lintel_validation_cache::CacheKey {
554                    file_content: &pf.content,
555                    schema_hash,
556                    validate_formats,
557                },
558                &file_errors,
559            )
560            .await;
561        push_validation_errors(pf, schema_uri, &file_errors, errors);
562
563        let cf = CheckedFile {
564            path: pf.path.clone(),
565            schema: schema_uri.to_string(),
566            cache_status,
567            validation_cache_status: Some(ValidationCacheStatus::Miss),
568        };
569        on_check(&cf);
570        checked.push(cf);
571    }
572}
573
574// ---------------------------------------------------------------------------
575// Public API
576// ---------------------------------------------------------------------------
577
578/// Fetch and compile all schema catalogs (default, `SchemaStore`, and custom registries).
579///
580/// Returns a list of compiled catalogs, printing warnings for any that fail to fetch.
581pub async fn fetch_compiled_catalogs(
582    retriever: &SchemaCache,
583    config: &lintel_config::Config,
584    no_catalog: bool,
585) -> Vec<CompiledCatalog> {
586    let mut compiled_catalogs = Vec::new();
587
588    if !no_catalog {
589        let catalog_span = tracing::info_span!("fetch_catalogs").entered();
590
591        // Catalogs are fetched concurrently but sorted by priority so that
592        // the Lintel catalog wins over custom registries, which win over
593        // SchemaStore.  The `order` field encodes this precedence.
594        #[allow(clippy::items_after_statements)]
595        type CatalogResult = (
596            usize, // priority (lower = higher precedence)
597            String,
598            Result<CompiledCatalog, Box<dyn core::error::Error + Send + Sync>>,
599        );
600        let mut catalog_tasks: tokio::task::JoinSet<CatalogResult> = tokio::task::JoinSet::new();
601
602        // Custom registries from lintel.toml (highest precedence among catalogs)
603        for (i, registry_url) in config.registries.iter().enumerate() {
604            let r = retriever.clone();
605            let url = registry_url.clone();
606            let label = format!("registry {url}");
607            catalog_tasks.spawn(async move {
608                let result = registry::fetch(&r, &url)
609                    .await
610                    .map(|cat| CompiledCatalog::compile(&cat));
611                (i, label, result)
612            });
613        }
614
615        // Lintel catalog
616        let lintel_order = config.registries.len();
617        if !config.no_default_catalog {
618            let r = retriever.clone();
619            let label = format!("default catalog {}", registry::DEFAULT_REGISTRY);
620            catalog_tasks.spawn(async move {
621                let result = registry::fetch(&r, registry::DEFAULT_REGISTRY)
622                    .await
623                    .map(|cat| CompiledCatalog::compile(&cat));
624                (lintel_order, label, result)
625            });
626        }
627
628        // SchemaStore catalog (lowest precedence)
629        let schemastore_order = config.registries.len() + 1;
630        let r = retriever.clone();
631        catalog_tasks.spawn(async move {
632            let result = catalog::fetch_catalog(&r)
633                .await
634                .map(|cat| CompiledCatalog::compile(&cat));
635            (schemastore_order, "SchemaStore catalog".to_string(), result)
636        });
637
638        let mut results: Vec<(usize, CompiledCatalog)> = Vec::new();
639        while let Some(result) = catalog_tasks.join_next().await {
640            match result {
641                Ok((order, _, Ok(compiled))) => results.push((order, compiled)),
642                Ok((_, label, Err(e))) => eprintln!("warning: failed to fetch {label}: {e}"),
643                Err(e) => eprintln!("warning: catalog fetch task failed: {e}"),
644            }
645        }
646        results.sort_by_key(|(order, _)| *order);
647        compiled_catalogs.extend(results.into_iter().map(|(_, cat)| cat));
648
649        drop(catalog_span);
650    }
651
652    compiled_catalogs
653}
654
/// Run validation with no pre-built schema cache and no progress callback.
///
/// Equivalent to calling [`run_with`] with `cache = None` and a no-op
/// `on_check`.
///
/// # Errors
///
/// Returns an error if file collection or schema validation encounters an I/O error.
pub async fn run(args: &ValidateArgs) -> Result<ValidateResult> {
    run_with(args, None, |_| {}).await
}
661
662/// Like [`run`], but calls `on_check` each time a file is checked, allowing
663/// callers to stream progress (e.g. verbose output) as files are processed.
664///
665/// # Errors
666///
667/// Returns an error if file collection or schema validation encounters an I/O error.
668#[tracing::instrument(skip_all, name = "validate")]
669#[allow(clippy::too_many_lines)]
670pub async fn run_with(
671    args: &ValidateArgs,
672    cache: Option<SchemaCache>,
673    mut on_check: impl FnMut(&CheckedFile),
674) -> Result<ValidateResult> {
675    let retriever = if let Some(c) = cache {
676        c
677    } else {
678        let mut builder = SchemaCache::builder().force_fetch(args.force_schema_fetch);
679        if let Some(dir) = &args.cache_dir {
680            let path = PathBuf::from(dir);
681            let _ = fs::create_dir_all(&path);
682            builder = builder.cache_dir(path);
683        }
684        if let Some(ttl) = args.schema_cache_ttl {
685            builder = builder.ttl(ttl);
686        }
687        builder.build()
688    };
689
690    let (config, config_dir, _config_path) = load_config(args.config_dir.as_deref());
691    let files = collect_files(&args.globs, &args.exclude)?;
692    tracing::info!(file_count = files.len(), "collected files");
693
694    let compiled_catalogs = fetch_compiled_catalogs(&retriever, &config, args.no_catalog).await;
695
696    let mut errors: Vec<LintError> = Vec::new();
697    let mut checked: Vec<CheckedFile> = Vec::new();
698
699    // Phase 1: Parse files and resolve schema URIs
700    let schema_groups = parse_and_group_files(
701        &files,
702        &config,
703        &config_dir,
704        &compiled_catalogs,
705        &mut errors,
706    )
707    .await;
708    tracing::info!(
709        schema_count = schema_groups.len(),
710        total_files = schema_groups.values().map(Vec::len).sum::<usize>(),
711        "grouped files by schema"
712    );
713
714    // Create validation cache
715    let vcache = lintel_validation_cache::ValidationCache::new(
716        lintel_validation_cache::ensure_cache_dir(),
717        args.force_validation,
718    );
719
720    // Prefetch all remote schemas in parallel
721    let remote_uris: Vec<&String> = schema_groups
722        .keys()
723        .filter(|uri| uri.starts_with("http://") || uri.starts_with("https://"))
724        .collect();
725
726    let prefetched = {
727        let _prefetch_span =
728            tracing::info_span!("prefetch_schemas", count = remote_uris.len()).entered();
729
730        let mut schema_tasks = tokio::task::JoinSet::new();
731        for uri in remote_uris {
732            let r = retriever.clone();
733            let u = uri.clone();
734            schema_tasks.spawn(async move {
735                let result = r.fetch(&u).await;
736                (u, result)
737            });
738        }
739
740        let mut prefetched: HashMap<String, Result<(Value, CacheStatus), String>> = HashMap::new();
741        while let Some(result) = schema_tasks.join_next().await {
742            match result {
743                Ok((uri, fetch_result)) => {
744                    prefetched.insert(uri, fetch_result.map_err(|e| e.to_string()));
745                }
746                Err(e) => eprintln!("warning: schema prefetch task failed: {e}"),
747            }
748        }
749
750        prefetched
751    };
752
753    // Phase 2: Compile each schema once and validate all matching files
754    let mut local_schema_cache: HashMap<String, Value> = HashMap::new();
755    let mut fetch_time = core::time::Duration::ZERO;
756    let mut hash_time = core::time::Duration::ZERO;
757    let mut vcache_time = core::time::Duration::ZERO;
758    let mut compile_time = core::time::Duration::ZERO;
759    let mut validate_time = core::time::Duration::ZERO;
760
761    for (schema_uri, group) in &schema_groups {
762        let _group_span = tracing::debug_span!(
763            "schema_group",
764            schema = schema_uri.as_str(),
765            files = group.len(),
766        )
767        .entered();
768
769        // If ANY file in the group matches a `validate_formats = false` override,
770        // disable format validation for the whole group (they share one compiled validator).
771        let validate_formats = group.iter().all(|pf| {
772            config
773                .should_validate_formats(&pf.path, &[&pf.original_schema_uri, schema_uri.as_str()])
774        });
775
776        // Remote schemas were prefetched in parallel above; local schemas are
777        // read from disk here (with in-memory caching).
778        let t = std::time::Instant::now();
779        let Some((schema_value, cache_status)) = fetch_schema_from_prefetched(
780            schema_uri,
781            &prefetched,
782            &mut local_schema_cache,
783            group,
784            &mut errors,
785            &mut checked,
786            &mut on_check,
787        )
788        .await
789        else {
790            fetch_time += t.elapsed();
791            continue;
792        };
793        fetch_time += t.elapsed();
794
795        // Pre-compute schema hash once for the entire group.
796        let t = std::time::Instant::now();
797        let schema_hash = lintel_validation_cache::schema_hash(&schema_value);
798        hash_time += t.elapsed();
799
800        // Split the group into validation cache hits and misses.
801        let mut cache_misses: Vec<&ParsedFile> = Vec::new();
802
803        let t = std::time::Instant::now();
804        for pf in group {
805            let (cached, vcache_status) = vcache
806                .lookup(&lintel_validation_cache::CacheKey {
807                    file_content: &pf.content,
808                    schema_hash: &schema_hash,
809                    validate_formats,
810                })
811                .await;
812
813            if let Some(cached_errors) = cached {
814                push_validation_errors(pf, schema_uri, &cached_errors, &mut errors);
815                let cf = CheckedFile {
816                    path: pf.path.clone(),
817                    schema: schema_uri.clone(),
818                    cache_status,
819                    validation_cache_status: Some(vcache_status),
820                };
821                on_check(&cf);
822                checked.push(cf);
823            } else {
824                cache_misses.push(pf);
825            }
826        }
827        vcache_time += t.elapsed();
828
829        tracing::debug!(
830            cache_hits = group.len() - cache_misses.len(),
831            cache_misses = cache_misses.len(),
832            "validation cache"
833        );
834
835        // If all files hit the validation cache, skip schema compilation entirely.
836        if cache_misses.is_empty() {
837            continue;
838        }
839
840        // Compile the schema for cache misses.
841        let t = std::time::Instant::now();
842        let validator = {
843            match jsonschema::async_options()
844                .with_retriever(retriever.clone())
845                .should_validate_formats(validate_formats)
846                .build(&schema_value)
847                .await
848            {
849                Ok(v) => v,
850                Err(e) => {
851                    compile_time += t.elapsed();
852                    // When format validation is disabled and the compilation error
853                    // is a uri-reference issue (e.g. Rust-style $ref paths in
854                    // vector.json), skip validation silently.
855                    if !validate_formats && e.to_string().contains("uri-reference") {
856                        mark_group_checked(
857                            schema_uri,
858                            cache_status,
859                            Some(ValidationCacheStatus::Miss),
860                            &cache_misses,
861                            &mut checked,
862                            &mut on_check,
863                        );
864                        continue;
865                    }
866                    let msg = format!("failed to compile schema: {e}");
867                    report_group_error(
868                        |path| LintError::SchemaCompile {
869                            path: path.to_string(),
870                            message: msg.clone(),
871                        },
872                        schema_uri,
873                        cache_status,
874                        &cache_misses,
875                        &mut errors,
876                        &mut checked,
877                        &mut on_check,
878                    );
879                    continue;
880                }
881            }
882        };
883        compile_time += t.elapsed();
884
885        let t = std::time::Instant::now();
886        validate_group(
887            &validator,
888            schema_uri,
889            &schema_hash,
890            validate_formats,
891            cache_status,
892            &cache_misses,
893            &vcache,
894            &mut errors,
895            &mut checked,
896            &mut on_check,
897        )
898        .await;
899        validate_time += t.elapsed();
900    }
901
902    #[allow(clippy::cast_possible_truncation)]
903    {
904        tracing::info!(
905            fetch_ms = fetch_time.as_millis() as u64,
906            hash_ms = hash_time.as_millis() as u64,
907            vcache_ms = vcache_time.as_millis() as u64,
908            compile_ms = compile_time.as_millis() as u64,
909            validate_ms = validate_time.as_millis() as u64,
910            "phase2 breakdown"
911        );
912    }
913
914    // Sort errors for deterministic output (by path, then by span offset)
915    errors.sort_by(|a, b| {
916        a.path()
917            .cmp(b.path())
918            .then_with(|| a.offset().cmp(&b.offset()))
919    });
920
921    Ok(ValidateResult { errors, checked })
922}
923
924#[cfg(test)]
925mod tests {
926    use super::*;
927    use lintel_schema_cache::SchemaCache;
928    use std::path::Path;
929
930    fn mock(entries: &[(&str, &str)]) -> SchemaCache {
931        let cache = SchemaCache::memory();
932        for (uri, body) in entries {
933            cache.insert(
934                uri,
935                serde_json::from_str(body).expect("test mock: invalid JSON"),
936            );
937        }
938        cache
939    }
940
941    fn testdata() -> PathBuf {
942        Path::new(env!("CARGO_MANIFEST_DIR")).join("testdata")
943    }
944
945    /// Build glob patterns that scan one or more testdata directories for all supported file types.
946    fn scenario_globs(dirs: &[&str]) -> Vec<String> {
947        dirs.iter()
948            .flat_map(|dir| {
949                let base = testdata().join(dir);
950                vec![
951                    base.join("*.json").to_string_lossy().to_string(),
952                    base.join("*.yaml").to_string_lossy().to_string(),
953                    base.join("*.yml").to_string_lossy().to_string(),
954                    base.join("*.json5").to_string_lossy().to_string(),
955                    base.join("*.jsonc").to_string_lossy().to_string(),
956                    base.join("*.toml").to_string_lossy().to_string(),
957                ]
958            })
959            .collect()
960    }
961
962    fn args_for_dirs(dirs: &[&str]) -> ValidateArgs {
963        ValidateArgs {
964            globs: scenario_globs(dirs),
965            exclude: vec![],
966            cache_dir: None,
967            force_schema_fetch: true,
968            force_validation: true,
969            no_catalog: true,
970            config_dir: None,
971            schema_cache_ttl: None,
972        }
973    }
974
975    const SCHEMA: &str =
976        r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
977
978    fn schema_mock() -> SchemaCache {
979        mock(&[("https://example.com/schema.json", SCHEMA)])
980    }
981
982    // --- Directory scanning tests ---
983
984    #[tokio::test]
985    async fn no_matching_files() -> anyhow::Result<()> {
986        let tmp = tempfile::tempdir()?;
987        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
988        let c = ValidateArgs {
989            globs: vec![pattern],
990            exclude: vec![],
991            cache_dir: None,
992            force_schema_fetch: true,
993            force_validation: true,
994            no_catalog: true,
995            config_dir: None,
996            schema_cache_ttl: None,
997        };
998        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
999        assert!(!result.has_errors());
1000        Ok(())
1001    }
1002
1003    #[tokio::test]
1004    async fn dir_all_valid() -> anyhow::Result<()> {
1005        let c = args_for_dirs(&["positive_tests"]);
1006        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1007        assert!(!result.has_errors());
1008        Ok(())
1009    }
1010
1011    #[tokio::test]
1012    async fn dir_all_invalid() -> anyhow::Result<()> {
1013        let c = args_for_dirs(&["negative_tests"]);
1014        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1015        assert!(result.has_errors());
1016        Ok(())
1017    }
1018
1019    #[tokio::test]
1020    async fn dir_mixed_valid_and_invalid() -> anyhow::Result<()> {
1021        let c = args_for_dirs(&["positive_tests", "negative_tests"]);
1022        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1023        assert!(result.has_errors());
1024        Ok(())
1025    }
1026
1027    #[tokio::test]
1028    async fn dir_no_schemas_skipped() -> anyhow::Result<()> {
1029        let c = args_for_dirs(&["no_schema"]);
1030        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1031        assert!(!result.has_errors());
1032        Ok(())
1033    }
1034
1035    #[tokio::test]
1036    async fn dir_valid_with_no_schema_files() -> anyhow::Result<()> {
1037        let c = args_for_dirs(&["positive_tests", "no_schema"]);
1038        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1039        assert!(!result.has_errors());
1040        Ok(())
1041    }
1042
1043    // --- Directory as positional arg ---
1044
1045    #[tokio::test]
1046    async fn directory_arg_discovers_files() -> anyhow::Result<()> {
1047        let dir = testdata().join("positive_tests");
1048        let c = ValidateArgs {
1049            globs: vec![dir.to_string_lossy().to_string()],
1050            exclude: vec![],
1051            cache_dir: None,
1052            force_schema_fetch: true,
1053            force_validation: true,
1054            no_catalog: true,
1055            config_dir: None,
1056            schema_cache_ttl: None,
1057        };
1058        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1059        assert!(!result.has_errors());
1060        assert!(result.files_checked() > 0);
1061        Ok(())
1062    }
1063
1064    #[tokio::test]
1065    async fn multiple_directory_args() -> anyhow::Result<()> {
1066        let pos_dir = testdata().join("positive_tests");
1067        let no_schema_dir = testdata().join("no_schema");
1068        let c = ValidateArgs {
1069            globs: vec![
1070                pos_dir.to_string_lossy().to_string(),
1071                no_schema_dir.to_string_lossy().to_string(),
1072            ],
1073            exclude: vec![],
1074            cache_dir: None,
1075            force_schema_fetch: true,
1076            force_validation: true,
1077            no_catalog: true,
1078            config_dir: None,
1079            schema_cache_ttl: None,
1080        };
1081        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1082        assert!(!result.has_errors());
1083        Ok(())
1084    }
1085
1086    #[tokio::test]
1087    async fn mix_directory_and_glob_args() -> anyhow::Result<()> {
1088        let dir = testdata().join("positive_tests");
1089        let glob_pattern = testdata()
1090            .join("no_schema")
1091            .join("*.json")
1092            .to_string_lossy()
1093            .to_string();
1094        let c = ValidateArgs {
1095            globs: vec![dir.to_string_lossy().to_string(), glob_pattern],
1096            exclude: vec![],
1097            cache_dir: None,
1098            force_schema_fetch: true,
1099            force_validation: true,
1100            no_catalog: true,
1101            config_dir: None,
1102            schema_cache_ttl: None,
1103        };
1104        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1105        assert!(!result.has_errors());
1106        Ok(())
1107    }
1108
1109    #[tokio::test]
1110    async fn malformed_json_parse_error() -> anyhow::Result<()> {
1111        let base = testdata().join("malformed");
1112        let c = ValidateArgs {
1113            globs: vec![base.join("*.json").to_string_lossy().to_string()],
1114            exclude: vec![],
1115            cache_dir: None,
1116            force_schema_fetch: true,
1117            force_validation: true,
1118            no_catalog: true,
1119            config_dir: None,
1120            schema_cache_ttl: None,
1121        };
1122        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1123        assert!(result.has_errors());
1124        Ok(())
1125    }
1126
1127    #[tokio::test]
1128    async fn malformed_yaml_parse_error() -> anyhow::Result<()> {
1129        let base = testdata().join("malformed");
1130        let c = ValidateArgs {
1131            globs: vec![base.join("*.yaml").to_string_lossy().to_string()],
1132            exclude: vec![],
1133            cache_dir: None,
1134            force_schema_fetch: true,
1135            force_validation: true,
1136            no_catalog: true,
1137            config_dir: None,
1138            schema_cache_ttl: None,
1139        };
1140        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1141        assert!(result.has_errors());
1142        Ok(())
1143    }
1144
1145    // --- Exclude filter ---
1146
1147    #[tokio::test]
1148    async fn exclude_filters_files_in_dir() -> anyhow::Result<()> {
1149        let base = testdata().join("negative_tests");
1150        let c = ValidateArgs {
1151            globs: scenario_globs(&["positive_tests", "negative_tests"]),
1152            exclude: vec![
1153                base.join("missing_name.json").to_string_lossy().to_string(),
1154                base.join("missing_name.toml").to_string_lossy().to_string(),
1155                base.join("missing_name.yaml").to_string_lossy().to_string(),
1156            ],
1157            cache_dir: None,
1158            force_schema_fetch: true,
1159            force_validation: true,
1160            no_catalog: true,
1161            config_dir: None,
1162            schema_cache_ttl: None,
1163        };
1164        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1165        assert!(!result.has_errors());
1166        Ok(())
1167    }
1168
1169    // --- Cache options ---
1170
1171    #[tokio::test]
1172    async fn custom_cache_dir() -> anyhow::Result<()> {
1173        let c = ValidateArgs {
1174            globs: scenario_globs(&["positive_tests"]),
1175            exclude: vec![],
1176            cache_dir: None,
1177            force_schema_fetch: true,
1178            force_validation: true,
1179            no_catalog: true,
1180            config_dir: None,
1181            schema_cache_ttl: None,
1182        };
1183        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1184        assert!(!result.has_errors());
1185        Ok(())
1186    }
1187
1188    // --- Local schema ---
1189
1190    #[tokio::test]
1191    async fn json_valid_with_local_schema() -> anyhow::Result<()> {
1192        let tmp = tempfile::tempdir()?;
1193        let schema_path = tmp.path().join("schema.json");
1194        fs::write(&schema_path, SCHEMA)?;
1195
1196        let f = tmp.path().join("valid.json");
1197        fs::write(
1198            &f,
1199            format!(
1200                r#"{{"$schema":"{}","name":"hello"}}"#,
1201                schema_path.to_string_lossy()
1202            ),
1203        )?;
1204
1205        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1206        let c = ValidateArgs {
1207            globs: vec![pattern],
1208            exclude: vec![],
1209            cache_dir: None,
1210            force_schema_fetch: true,
1211            force_validation: true,
1212            no_catalog: true,
1213            config_dir: None,
1214            schema_cache_ttl: None,
1215        };
1216        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1217        assert!(!result.has_errors());
1218        Ok(())
1219    }
1220
1221    #[tokio::test]
1222    async fn yaml_valid_with_local_schema() -> anyhow::Result<()> {
1223        let tmp = tempfile::tempdir()?;
1224        let schema_path = tmp.path().join("schema.json");
1225        fs::write(&schema_path, SCHEMA)?;
1226
1227        let f = tmp.path().join("valid.yaml");
1228        fs::write(
1229            &f,
1230            format!(
1231                "# yaml-language-server: $schema={}\nname: hello\n",
1232                schema_path.to_string_lossy()
1233            ),
1234        )?;
1235
1236        let pattern = tmp.path().join("*.yaml").to_string_lossy().to_string();
1237        let c = ValidateArgs {
1238            globs: vec![pattern],
1239            exclude: vec![],
1240            cache_dir: None,
1241            force_schema_fetch: true,
1242            force_validation: true,
1243            no_catalog: true,
1244            config_dir: None,
1245            schema_cache_ttl: None,
1246        };
1247        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1248        assert!(!result.has_errors());
1249        Ok(())
1250    }
1251
1252    #[tokio::test]
1253    async fn missing_local_schema_errors() -> anyhow::Result<()> {
1254        let tmp = tempfile::tempdir()?;
1255        let f = tmp.path().join("ref.json");
1256        fs::write(&f, r#"{"$schema":"/nonexistent/schema.json"}"#)?;
1257
1258        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1259        let c = ValidateArgs {
1260            globs: vec![pattern],
1261            exclude: vec![],
1262            cache_dir: None,
1263            force_schema_fetch: true,
1264            force_validation: true,
1265            no_catalog: true,
1266            config_dir: None,
1267            schema_cache_ttl: None,
1268        };
1269        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1270        assert!(result.has_errors());
1271        Ok(())
1272    }
1273
1274    // --- JSON5 / JSONC tests ---
1275
1276    #[tokio::test]
1277    async fn json5_valid_with_schema() -> anyhow::Result<()> {
1278        let tmp = tempfile::tempdir()?;
1279        let schema_path = tmp.path().join("schema.json");
1280        fs::write(&schema_path, SCHEMA)?;
1281
1282        let f = tmp.path().join("config.json5");
1283        fs::write(
1284            &f,
1285            format!(
1286                r#"{{
1287  // JSON5 comment
1288  "$schema": "{}",
1289  name: "hello",
1290}}"#,
1291                schema_path.to_string_lossy()
1292            ),
1293        )?;
1294
1295        let pattern = tmp.path().join("*.json5").to_string_lossy().to_string();
1296        let c = ValidateArgs {
1297            globs: vec![pattern],
1298            exclude: vec![],
1299            cache_dir: None,
1300            force_schema_fetch: true,
1301            force_validation: true,
1302            no_catalog: true,
1303            config_dir: None,
1304            schema_cache_ttl: None,
1305        };
1306        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1307        assert!(!result.has_errors());
1308        Ok(())
1309    }
1310
1311    #[tokio::test]
1312    async fn jsonc_valid_with_schema() -> anyhow::Result<()> {
1313        let tmp = tempfile::tempdir()?;
1314        let schema_path = tmp.path().join("schema.json");
1315        fs::write(&schema_path, SCHEMA)?;
1316
1317        let f = tmp.path().join("config.jsonc");
1318        fs::write(
1319            &f,
1320            format!(
1321                r#"{{
1322  /* JSONC comment */
1323  "$schema": "{}",
1324  "name": "hello"
1325}}"#,
1326                schema_path.to_string_lossy()
1327            ),
1328        )?;
1329
1330        let pattern = tmp.path().join("*.jsonc").to_string_lossy().to_string();
1331        let c = ValidateArgs {
1332            globs: vec![pattern],
1333            exclude: vec![],
1334            cache_dir: None,
1335            force_schema_fetch: true,
1336            force_validation: true,
1337            no_catalog: true,
1338            config_dir: None,
1339            schema_cache_ttl: None,
1340        };
1341        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1342        assert!(!result.has_errors());
1343        Ok(())
1344    }
1345
1346    // --- Catalog-based schema matching ---
1347
1348    const GH_WORKFLOW_SCHEMA: &str = r#"{
1349        "type": "object",
1350        "properties": {
1351            "name": { "type": "string" },
1352            "on": {},
1353            "jobs": { "type": "object" }
1354        },
1355        "required": ["on", "jobs"]
1356    }"#;
1357
1358    fn gh_catalog_json() -> String {
1359        r#"{"schemas":[{
1360            "name": "GitHub Workflow",
1361            "url": "https://www.schemastore.org/github-workflow.json",
1362            "fileMatch": [
1363                "**/.github/workflows/*.yml",
1364                "**/.github/workflows/*.yaml"
1365            ]
1366        }]}"#
1367            .to_string()
1368    }
1369
1370    #[tokio::test]
1371    async fn catalog_matches_github_workflow_valid() -> anyhow::Result<()> {
1372        let tmp = tempfile::tempdir()?;
1373        let cache_tmp = tempfile::tempdir()?;
1374        let wf_dir = tmp.path().join(".github/workflows");
1375        fs::create_dir_all(&wf_dir)?;
1376        fs::write(
1377            wf_dir.join("ci.yml"),
1378            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1379        )?;
1380
1381        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1382        let client = mock(&[
1383            (
1384                "https://www.schemastore.org/api/json/catalog.json",
1385                &gh_catalog_json(),
1386            ),
1387            (
1388                "https://www.schemastore.org/github-workflow.json",
1389                GH_WORKFLOW_SCHEMA,
1390            ),
1391        ]);
1392        let c = ValidateArgs {
1393            globs: vec![pattern],
1394            exclude: vec![],
1395            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1396            force_schema_fetch: true,
1397            force_validation: true,
1398            no_catalog: false,
1399            config_dir: None,
1400            schema_cache_ttl: None,
1401        };
1402        let result = run_with(&c, Some(client), |_| {}).await?;
1403        assert!(!result.has_errors());
1404        Ok(())
1405    }
1406
1407    #[tokio::test]
1408    async fn catalog_matches_github_workflow_invalid() -> anyhow::Result<()> {
1409        let tmp = tempfile::tempdir()?;
1410        let cache_tmp = tempfile::tempdir()?;
1411        let wf_dir = tmp.path().join(".github/workflows");
1412        fs::create_dir_all(&wf_dir)?;
1413        fs::write(wf_dir.join("bad.yml"), "name: Broken\n")?;
1414
1415        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1416        let client = mock(&[
1417            (
1418                "https://www.schemastore.org/api/json/catalog.json",
1419                &gh_catalog_json(),
1420            ),
1421            (
1422                "https://www.schemastore.org/github-workflow.json",
1423                GH_WORKFLOW_SCHEMA,
1424            ),
1425        ]);
1426        let c = ValidateArgs {
1427            globs: vec![pattern],
1428            exclude: vec![],
1429            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1430            force_schema_fetch: true,
1431            force_validation: true,
1432            no_catalog: false,
1433            config_dir: None,
1434            schema_cache_ttl: None,
1435        };
1436        let result = run_with(&c, Some(client), |_| {}).await?;
1437        assert!(result.has_errors());
1438        Ok(())
1439    }
1440
1441    #[tokio::test]
1442    async fn auto_discover_finds_github_workflows() -> anyhow::Result<()> {
1443        let tmp = tempfile::tempdir()?;
1444        let cache_tmp = tempfile::tempdir()?;
1445        let wf_dir = tmp.path().join(".github/workflows");
1446        fs::create_dir_all(&wf_dir)?;
1447        fs::write(
1448            wf_dir.join("ci.yml"),
1449            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1450        )?;
1451
1452        let client = mock(&[
1453            (
1454                "https://www.schemastore.org/api/json/catalog.json",
1455                &gh_catalog_json(),
1456            ),
1457            (
1458                "https://www.schemastore.org/github-workflow.json",
1459                GH_WORKFLOW_SCHEMA,
1460            ),
1461        ]);
1462        let c = ValidateArgs {
1463            globs: vec![],
1464            exclude: vec![],
1465            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1466            force_schema_fetch: true,
1467            force_validation: true,
1468            no_catalog: false,
1469            config_dir: None,
1470            schema_cache_ttl: None,
1471        };
1472
1473        let orig_dir = std::env::current_dir()?;
1474        std::env::set_current_dir(tmp.path())?;
1475        let result = run_with(&c, Some(client), |_| {}).await?;
1476        std::env::set_current_dir(orig_dir)?;
1477
1478        assert!(!result.has_errors());
1479        Ok(())
1480    }
1481
1482    // --- TOML tests ---
1483
1484    #[tokio::test]
1485    async fn toml_valid_with_schema() -> anyhow::Result<()> {
1486        let tmp = tempfile::tempdir()?;
1487        let schema_path = tmp.path().join("schema.json");
1488        fs::write(&schema_path, SCHEMA)?;
1489
1490        let f = tmp.path().join("config.toml");
1491        fs::write(
1492            &f,
1493            format!(
1494                "# :schema {}\nname = \"hello\"\n",
1495                schema_path.to_string_lossy()
1496            ),
1497        )?;
1498
1499        let pattern = tmp.path().join("*.toml").to_string_lossy().to_string();
1500        let c = ValidateArgs {
1501            globs: vec![pattern],
1502            exclude: vec![],
1503            cache_dir: None,
1504            force_schema_fetch: true,
1505            force_validation: true,
1506            no_catalog: true,
1507            config_dir: None,
1508            schema_cache_ttl: None,
1509        };
1510        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1511        assert!(!result.has_errors());
1512        Ok(())
1513    }
1514
1515    // --- Rewrite rules + // resolution ---
1516
1517    #[tokio::test]
1518    async fn rewrite_rule_with_double_slash_resolves_schema() -> anyhow::Result<()> {
1519        let tmp = tempfile::tempdir()?;
1520
1521        let schemas_dir = tmp.path().join("schemas");
1522        fs::create_dir_all(&schemas_dir)?;
1523        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1524
1525        fs::write(
1526            tmp.path().join("lintel.toml"),
1527            r#"
1528[rewrite]
1529"http://localhost:9000/" = "//schemas/"
1530"#,
1531        )?;
1532
1533        let f = tmp.path().join("config.json");
1534        fs::write(
1535            &f,
1536            r#"{"$schema":"http://localhost:9000/test.json","name":"hello"}"#,
1537        )?;
1538
1539        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1540        let c = ValidateArgs {
1541            globs: vec![pattern],
1542            exclude: vec![],
1543            cache_dir: None,
1544            force_schema_fetch: true,
1545            force_validation: true,
1546            no_catalog: true,
1547            config_dir: Some(tmp.path().to_path_buf()),
1548            schema_cache_ttl: None,
1549        };
1550
1551        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1552        assert!(!result.has_errors());
1553        assert_eq!(result.files_checked(), 1);
1554        Ok(())
1555    }
1556
1557    #[tokio::test]
1558    async fn double_slash_schema_resolves_relative_to_config() -> anyhow::Result<()> {
1559        let tmp = tempfile::tempdir()?;
1560
1561        let schemas_dir = tmp.path().join("schemas");
1562        fs::create_dir_all(&schemas_dir)?;
1563        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1564
1565        fs::write(tmp.path().join("lintel.toml"), "")?;
1566
1567        let sub = tmp.path().join("deeply/nested");
1568        fs::create_dir_all(&sub)?;
1569        let f = sub.join("config.json");
1570        fs::write(&f, r#"{"$schema":"//schemas/test.json","name":"hello"}"#)?;
1571
1572        let pattern = sub.join("*.json").to_string_lossy().to_string();
1573        let c = ValidateArgs {
1574            globs: vec![pattern],
1575            exclude: vec![],
1576            cache_dir: None,
1577            force_schema_fetch: true,
1578            force_validation: true,
1579            no_catalog: true,
1580            config_dir: Some(tmp.path().to_path_buf()),
1581            schema_cache_ttl: None,
1582        };
1583
1584        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1585        assert!(!result.has_errors());
1586        Ok(())
1587    }
1588
1589    // --- Format validation override ---
1590
1591    const FORMAT_SCHEMA: &str = r#"{
1592        "type": "object",
1593        "properties": {
1594            "link": { "type": "string", "format": "uri-reference" }
1595        }
1596    }"#;
1597
1598    #[tokio::test]
1599    async fn format_errors_reported_without_override() -> anyhow::Result<()> {
1600        let tmp = tempfile::tempdir()?;
1601        let schema_path = tmp.path().join("schema.json");
1602        fs::write(&schema_path, FORMAT_SCHEMA)?;
1603
1604        let f = tmp.path().join("data.json");
1605        fs::write(
1606            &f,
1607            format!(
1608                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1609                schema_path.to_string_lossy()
1610            ),
1611        )?;
1612
1613        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1614        let c = ValidateArgs {
1615            globs: vec![pattern],
1616            exclude: vec![],
1617            cache_dir: None,
1618            force_schema_fetch: true,
1619            force_validation: true,
1620            no_catalog: true,
1621            config_dir: Some(tmp.path().to_path_buf()),
1622            schema_cache_ttl: None,
1623        };
1624        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1625        assert!(
1626            result.has_errors(),
1627            "expected format error without override"
1628        );
1629        Ok(())
1630    }
1631
1632    #[tokio::test]
1633    async fn format_errors_suppressed_with_override() -> anyhow::Result<()> {
1634        let tmp = tempfile::tempdir()?;
1635        let schema_path = tmp.path().join("schema.json");
1636        fs::write(&schema_path, FORMAT_SCHEMA)?;
1637
1638        let f = tmp.path().join("data.json");
1639        fs::write(
1640            &f,
1641            format!(
1642                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1643                schema_path.to_string_lossy()
1644            ),
1645        )?;
1646
1647        // Use **/data.json to match the absolute path from the tempdir.
1648        fs::write(
1649            tmp.path().join("lintel.toml"),
1650            r#"
1651[[override]]
1652files = ["**/data.json"]
1653validate_formats = false
1654"#,
1655        )?;
1656
1657        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1658        let c = ValidateArgs {
1659            globs: vec![pattern],
1660            exclude: vec![],
1661            cache_dir: None,
1662            force_schema_fetch: true,
1663            force_validation: true,
1664            no_catalog: true,
1665            config_dir: Some(tmp.path().to_path_buf()),
1666            schema_cache_ttl: None,
1667        };
1668        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1669        assert!(
1670            !result.has_errors(),
1671            "expected no errors with validate_formats = false override"
1672        );
1673        Ok(())
1674    }
1675
1676    // --- Unrecognized extension handling ---
1677
1678    #[tokio::test]
1679    async fn unrecognized_extension_skipped_without_catalog() -> anyhow::Result<()> {
1680        let tmp = tempfile::tempdir()?;
1681        fs::write(tmp.path().join("config.nix"), r#"{"name":"hello"}"#)?;
1682
1683        let pattern = tmp.path().join("config.nix").to_string_lossy().to_string();
1684        let c = ValidateArgs {
1685            globs: vec![pattern],
1686            exclude: vec![],
1687            cache_dir: None,
1688            force_schema_fetch: true,
1689            force_validation: true,
1690            no_catalog: true,
1691            config_dir: Some(tmp.path().to_path_buf()),
1692            schema_cache_ttl: None,
1693        };
1694        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1695        assert!(!result.has_errors());
1696        assert_eq!(result.files_checked(), 0);
1697        Ok(())
1698    }
1699
1700    #[tokio::test]
1701    async fn unrecognized_extension_parsed_when_catalog_matches() -> anyhow::Result<()> {
1702        let tmp = tempfile::tempdir()?;
1703        let cache_tmp = tempfile::tempdir()?;
1704        // File has .cfg extension (unrecognized) but content is valid JSON
1705        fs::write(
1706            tmp.path().join("myapp.cfg"),
1707            r#"{"name":"hello","on":"push","jobs":{"build":{}}}"#,
1708        )?;
1709
1710        let catalog_json = r#"{"schemas":[{
1711            "name": "MyApp Config",
1712            "url": "https://example.com/myapp.schema.json",
1713            "fileMatch": ["*.cfg"]
1714        }]}"#;
1715        let schema =
1716            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1717
1718        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1719        let client = mock(&[
1720            (
1721                "https://www.schemastore.org/api/json/catalog.json",
1722                catalog_json,
1723            ),
1724            ("https://example.com/myapp.schema.json", schema),
1725        ]);
1726        let c = ValidateArgs {
1727            globs: vec![pattern],
1728            exclude: vec![],
1729            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1730            force_schema_fetch: true,
1731            force_validation: true,
1732            no_catalog: false,
1733            config_dir: Some(tmp.path().to_path_buf()),
1734            schema_cache_ttl: None,
1735        };
1736        let result = run_with(&c, Some(client), |_| {}).await?;
1737        assert!(!result.has_errors());
1738        assert_eq!(result.files_checked(), 1);
1739        Ok(())
1740    }
1741
1742    #[tokio::test]
1743    async fn unrecognized_extension_unparseable_skipped() -> anyhow::Result<()> {
1744        let tmp = tempfile::tempdir()?;
1745        let cache_tmp = tempfile::tempdir()?;
1746        // File matches catalog but content isn't parseable by any format
1747        fs::write(
1748            tmp.path().join("myapp.cfg"),
1749            "{ pkgs, ... }: { packages = [ pkgs.git ]; }",
1750        )?;
1751
1752        let catalog_json = r#"{"schemas":[{
1753            "name": "MyApp Config",
1754            "url": "https://example.com/myapp.schema.json",
1755            "fileMatch": ["*.cfg"]
1756        }]}"#;
1757
1758        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1759        let client = mock(&[(
1760            "https://www.schemastore.org/api/json/catalog.json",
1761            catalog_json,
1762        )]);
1763        let c = ValidateArgs {
1764            globs: vec![pattern],
1765            exclude: vec![],
1766            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1767            force_schema_fetch: true,
1768            force_validation: true,
1769            no_catalog: false,
1770            config_dir: Some(tmp.path().to_path_buf()),
1771            schema_cache_ttl: None,
1772        };
1773        let result = run_with(&c, Some(client), |_| {}).await?;
1774        assert!(!result.has_errors());
1775        assert_eq!(result.files_checked(), 0);
1776        Ok(())
1777    }
1778
1779    #[tokio::test]
1780    async fn unrecognized_extension_invalid_against_schema() -> anyhow::Result<()> {
1781        let tmp = tempfile::tempdir()?;
1782        let cache_tmp = tempfile::tempdir()?;
1783        // File has .cfg extension, content is valid JSON but fails schema validation
1784        fs::write(tmp.path().join("myapp.cfg"), r#"{"wrong":"field"}"#)?;
1785
1786        let catalog_json = r#"{"schemas":[{
1787            "name": "MyApp Config",
1788            "url": "https://example.com/myapp.schema.json",
1789            "fileMatch": ["*.cfg"]
1790        }]}"#;
1791        let schema =
1792            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1793
1794        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1795        let client = mock(&[
1796            (
1797                "https://www.schemastore.org/api/json/catalog.json",
1798                catalog_json,
1799            ),
1800            ("https://example.com/myapp.schema.json", schema),
1801        ]);
1802        let c = ValidateArgs {
1803            globs: vec![pattern],
1804            exclude: vec![],
1805            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1806            force_schema_fetch: true,
1807            force_validation: true,
1808            no_catalog: false,
1809            config_dir: Some(tmp.path().to_path_buf()),
1810            schema_cache_ttl: None,
1811        };
1812        let result = run_with(&c, Some(client), |_| {}).await?;
1813        assert!(result.has_errors());
1814        assert_eq!(result.files_checked(), 1);
1815        Ok(())
1816    }
1817
1818    // --- Validation cache ---
1819
1820    #[tokio::test]
1821    async fn validation_cache_hit_skips_revalidation() -> anyhow::Result<()> {
1822        let tmp = tempfile::tempdir()?;
1823        let schema_path = tmp.path().join("schema.json");
1824        fs::write(&schema_path, SCHEMA)?;
1825
1826        let f = tmp.path().join("valid.json");
1827        fs::write(
1828            &f,
1829            format!(
1830                r#"{{"$schema":"{}","name":"hello"}}"#,
1831                schema_path.to_string_lossy()
1832            ),
1833        )?;
1834
1835        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1836
1837        // First run: force_validation = false so results get cached
1838        let c = ValidateArgs {
1839            globs: vec![pattern.clone()],
1840            exclude: vec![],
1841            cache_dir: None,
1842            force_schema_fetch: true,
1843            force_validation: false,
1844            no_catalog: true,
1845            config_dir: None,
1846            schema_cache_ttl: None,
1847        };
1848        let mut first_statuses = Vec::new();
1849        let result = run_with(&c, Some(mock(&[])), |cf| {
1850            first_statuses.push(cf.validation_cache_status);
1851        })
1852        .await?;
1853        assert!(!result.has_errors());
1854        assert!(result.files_checked() > 0);
1855
1856        // Verify the first run recorded a validation cache miss
1857        assert!(
1858            first_statuses.contains(&Some(ValidationCacheStatus::Miss)),
1859            "expected at least one validation cache miss on first run"
1860        );
1861
1862        // Second run: same file, same schema — should hit validation cache
1863        let mut second_statuses = Vec::new();
1864        let result = run_with(&c, Some(mock(&[])), |cf| {
1865            second_statuses.push(cf.validation_cache_status);
1866        })
1867        .await?;
1868        assert!(!result.has_errors());
1869
1870        // Verify the second run got a validation cache hit
1871        assert!(
1872            second_statuses.contains(&Some(ValidationCacheStatus::Hit)),
1873            "expected at least one validation cache hit on second run"
1874        );
1875        Ok(())
1876    }
1877
1878    // --- clean_error_message ---
1879
/// The offending value and the following " is " are expected to be dropped
/// from an `anyOf` failure message, leaving only the explanation.
#[test]
fn clean_strips_anyof_value() {
    let raw = String::from(
        r#"{"type":"bad"} is not valid under any of the schemas listed in the 'anyOf' keyword"#,
    );
    let cleaned = clean_error_message(raw);
    assert_eq!(
        cleaned,
        "not valid under any of the schemas listed in the 'anyOf' keyword"
    );
}
1889
/// The offending value and the following " is " are expected to be dropped
/// from a `oneOf` failure message, leaving only the explanation.
#[test]
fn clean_strips_oneof_value() {
    let raw = String::from(
        r#"{"runs-on":"ubuntu-latest","steps":[]} is not valid under any of the schemas listed in the 'oneOf' keyword"#,
    );
    let cleaned = clean_error_message(raw);
    assert_eq!(
        cleaned,
        "not valid under any of the schemas listed in the 'oneOf' keyword"
    );
}
1898
/// A 5000-character leading value is expected to be stripped just like a
/// short one.
#[test]
fn clean_strips_long_value() {
    // Build "<5000 x's> is not valid ..." by appending to the filler in place.
    let mut raw = "x".repeat(5000);
    raw.push_str(" is not valid under any of the schemas listed in the 'anyOf' keyword");

    let cleaned = clean_error_message(raw);
    assert_eq!(
        cleaned,
        "not valid under any of the schemas listed in the 'anyOf' keyword"
    );
}
1909
/// A type-mismatch message is expected to pass through unchanged.
#[test]
fn clean_preserves_type_error() {
    let original = r#"12345 is not of types "null", "string""#;
    let cleaned = clean_error_message(String::from(original));
    assert_eq!(cleaned, original);
}
1915
/// A required-property message is expected to pass through unchanged.
#[test]
fn clean_preserves_required_property() {
    let original = "\"name\" is a required property";
    let cleaned = clean_error_message(String::from(original));
    assert_eq!(cleaned, original);
}
1921}