Skip to main content

lintel_validate/
validate.rs

1use alloc::collections::BTreeMap;
2use std::collections::HashMap;
3use std::fs;
4use std::path::{Path, PathBuf};
5
6use anyhow::{Context, Result};
7use glob::glob;
8use serde_json::Value;
9
10use crate::catalog;
11use lintel_schema_cache::{CacheStatus, SchemaCache};
12use lintel_validation_cache::{ValidationCacheStatus, ValidationError};
13use schemastore::CompiledCatalog;
14
15use crate::diagnostics::{DEFAULT_LABEL, find_instance_path_span, format_label};
16use crate::discover;
17use crate::parsers::{self, FileFormat, JsoncParser, Parser};
18use crate::registry;
19
/// Conservative limit for concurrent file reads to avoid exhausting file
/// descriptors. 128 is well below the default soft limit on macOS (256) and
/// Linux (1024) while still providing good throughput.
///
/// Used as the permit count of the semaphore that gates the per-file read
/// tasks spawned in `parse_and_group_files`.
const FD_CONCURRENCY_LIMIT: usize = 128;
24
/// Options for a validation run, consumed by [`run`] and [`run_with`].
pub struct ValidateArgs {
    /// Glob patterns to find files (empty = auto-discover).
    ///
    /// An entry that names an existing directory is walked with file
    /// discovery instead of being expanded as a glob.
    pub globs: Vec<String>,

    /// Exclude files matching these globs (repeatable)
    pub exclude: Vec<String>,

    /// Cache directory for remote schemas.
    ///
    /// Only consulted when no pre-built `SchemaCache` is handed to
    /// [`run_with`]; the directory is created on a best-effort basis.
    pub cache_dir: Option<String>,

    /// Bypass schema cache reads (still writes fetched schemas to cache)
    pub force_schema_fetch: bool,

    /// Bypass validation cache reads (still writes results to cache)
    pub force_validation: bool,

    /// Disable `SchemaStore` catalog matching
    pub no_catalog: bool,

    /// Directory to search for `lintel.toml` (defaults to cwd)
    pub config_dir: Option<PathBuf>,

    /// TTL for cached schemas. `None` means no expiry.
    pub schema_cache_ttl: Option<core::time::Duration>,
}
50
51/// Re-exported from [`crate::diagnostics::LintError`] so callers can use
52/// `lintel_validate::validate::LintError` without importing diagnostics.
53pub use crate::diagnostics::LintError;
54
/// A file that was checked and the schema it resolved to.
pub struct CheckedFile {
    /// Path of the checked file, rendered as a display string.
    pub path: String,
    /// Schema URI (or local schema path) the file was grouped under.
    pub schema: String,
    /// `None` for local schemas and builtins; `Some` for remote schemas.
    pub cache_status: Option<CacheStatus>,
    /// `None` when validation caching is not applicable; `Some` for validation cache hits/misses.
    pub validation_cache_status: Option<ValidationCacheStatus>,
}
64
/// Result of a validation run.
pub struct ValidateResult {
    /// Every diagnostic collected during the run (I/O, parse, schema fetch
    /// and validation errors).
    pub errors: Vec<LintError>,
    /// One entry per file that was matched to a schema and checked.
    pub checked: Vec<CheckedFile>,
}
70
impl ValidateResult {
    /// Returns `true` when at least one diagnostic was collected.
    pub fn has_errors(&self) -> bool {
        !self.errors.is_empty()
    }

    /// Number of files recorded in `checked`.
    pub fn files_checked(&self) -> usize {
        self.checked.len()
    }
}
80
81// ---------------------------------------------------------------------------
82// Internal types
83// ---------------------------------------------------------------------------
84
/// A file that has been parsed and matched to a schema URI.
struct ParsedFile {
    /// Path of the file, rendered as a display string.
    path: String,
    /// Raw text of the file as read from disk.
    content: String,
    /// Parsed document that is validated against the schema.
    instance: Value,
    /// Original schema URI before rewrites (for override matching).
    original_schema_uri: String,
}
93
94// ---------------------------------------------------------------------------
95// Config loading
96// ---------------------------------------------------------------------------
97
98/// Locate `lintel.toml`, load the full config, and return the config directory.
99/// Returns `(config, config_dir, config_path)`.  When no config is found or
100/// cwd is unavailable the config is default and `config_path` is `None`.
101#[tracing::instrument(skip_all)]
102pub fn load_config(search_dir: Option<&Path>) -> (lintel_config::Config, PathBuf, Option<PathBuf>) {
103    let start_dir = match search_dir {
104        Some(d) => d.to_path_buf(),
105        None => match std::env::current_dir() {
106            Ok(d) => d,
107            Err(_) => return (lintel_config::Config::default(), PathBuf::from("."), None),
108        },
109    };
110
111    let Some(config_path) = lintel_config::find_config_path(&start_dir) else {
112        return (lintel_config::Config::default(), start_dir, None);
113    };
114
115    let dir = config_path.parent().unwrap_or(&start_dir).to_path_buf();
116    let cfg = lintel_config::find_and_load(&start_dir)
117        .ok()
118        .flatten()
119        .unwrap_or_default();
120    (cfg, dir, Some(config_path))
121}
122
123// ---------------------------------------------------------------------------
124// File collection
125// ---------------------------------------------------------------------------
126
127/// Collect input files from globs/directories, applying exclude filters.
128///
129/// # Errors
130///
131/// Returns an error if a glob pattern is invalid or a directory cannot be walked.
132#[tracing::instrument(skip_all, fields(glob_count = globs.len(), exclude_count = exclude.len()))]
133pub fn collect_files(globs: &[String], exclude: &[String]) -> Result<Vec<PathBuf>> {
134    if globs.is_empty() {
135        return discover::discover_files(".", exclude);
136    }
137
138    let mut result = Vec::new();
139    for pattern in globs {
140        let path = Path::new(pattern);
141        if path.is_dir() {
142            result.extend(discover::discover_files(pattern, exclude)?);
143        } else {
144            for entry in glob(pattern).with_context(|| format!("invalid glob: {pattern}"))? {
145                let path = entry?;
146                if path.is_file() && !is_excluded(&path, exclude) {
147                    result.push(path);
148                }
149            }
150        }
151    }
152    Ok(result)
153}
154
155fn is_excluded(path: &Path, excludes: &[String]) -> bool {
156    let path_str = match path.to_str() {
157        Some(s) => s.strip_prefix("./").unwrap_or(s),
158        None => return false,
159    };
160    excludes
161        .iter()
162        .any(|pattern| glob_match::glob_match(pattern, path_str))
163}
164
165// ---------------------------------------------------------------------------
166// Phase 1: Parse files and resolve schema URIs
167// ---------------------------------------------------------------------------
168
169/// Try parsing content with each known format, returning the first success.
170///
171/// JSONC is tried first (superset of JSON, handles comments), then YAML and
172/// TOML which cover the most common config formats, followed by the rest.
173pub fn try_parse_all(content: &str, file_name: &str) -> Option<(parsers::FileFormat, Value)> {
174    use parsers::FileFormat::{Json, Json5, Jsonc, Markdown, Toml, Yaml};
175    const FORMATS: [parsers::FileFormat; 6] = [Jsonc, Yaml, Toml, Json, Json5, Markdown];
176
177    for fmt in FORMATS {
178        let parser = parsers::parser_for(fmt);
179        if let Ok(val) = parser.parse(content, file_name) {
180            return Some((fmt, val));
181        }
182    }
183    None
184}
185
/// Result of processing a single file: either a parsed file with its schema URI,
/// a lint error, or nothing (file was skipped).
enum FileResult {
    /// The file parsed and a schema was resolved for it.
    Parsed {
        /// Resolved schema URI (after rewrites and path resolution); used as
        /// the grouping key.
        schema_uri: String,
        /// The parsed file, including its pre-rewrite schema URI.
        parsed: ParsedFile,
    },
    /// The file could not be parsed.
    Error(LintError),
    /// The file is not subject to validation (no format, no content, or no schema).
    Skip,
}
196
197/// Process a single file's already-read content: parse and resolve schema URI.
198#[allow(clippy::too_many_arguments)]
199fn process_one_file(
200    path: &Path,
201    content: String,
202    config: &lintel_config::Config,
203    config_dir: &Path,
204    compiled_catalogs: &[CompiledCatalog],
205) -> FileResult {
206    let path_str = path.display().to_string();
207    let file_name = path
208        .file_name()
209        .and_then(|n| n.to_str())
210        .unwrap_or(&path_str);
211
212    let detected_format = parsers::detect_format(path);
213
214    // For unrecognized extensions, only proceed if a catalog or config mapping matches.
215    if detected_format.is_none() {
216        let has_match = config.find_schema_mapping(&path_str, file_name).is_some()
217            || compiled_catalogs
218                .iter()
219                .any(|cat| cat.find_schema(&path_str, file_name).is_some());
220        if !has_match {
221            return FileResult::Skip;
222        }
223    }
224
225    // Parse the file content.
226    let (parser, instance): (Box<dyn Parser>, Value) = if let Some(fmt) = detected_format {
227        let parser = parsers::parser_for(fmt);
228        match parser.parse(&content, &path_str) {
229            Ok(val) => (parser, val),
230            Err(parse_err) => {
231                // JSONC fallback for .json files that match a catalog entry.
232                if fmt == FileFormat::Json
233                    && compiled_catalogs
234                        .iter()
235                        .any(|cat| cat.find_schema(&path_str, file_name).is_some())
236                {
237                    match JsoncParser.parse(&content, &path_str) {
238                        Ok(val) => (parsers::parser_for(FileFormat::Jsonc), val),
239                        Err(jsonc_err) => return FileResult::Error(jsonc_err.into()),
240                    }
241                } else {
242                    return FileResult::Error(parse_err.into());
243                }
244            }
245        }
246    } else {
247        match try_parse_all(&content, &path_str) {
248            Some((fmt, val)) => (parsers::parser_for(fmt), val),
249            None => return FileResult::Skip,
250        }
251    };
252
253    // Skip markdown files with no frontmatter
254    if instance.is_null() {
255        return FileResult::Skip;
256    }
257
258    // Schema resolution priority:
259    // 1. Inline $schema / YAML modeline (always wins)
260    // 2. Custom schema mappings from lintel.toml [schemas]
261    // 3. Catalog matching (custom registries > Lintel catalog > SchemaStore)
262    let schema_uri = parser
263        .extract_schema_uri(&content, &instance)
264        .or_else(|| {
265            config
266                .find_schema_mapping(&path_str, file_name)
267                .map(str::to_string)
268        })
269        .or_else(|| {
270            compiled_catalogs
271                .iter()
272                .find_map(|cat| cat.find_schema(&path_str, file_name))
273                .map(str::to_string)
274        });
275
276    let Some(schema_uri) = schema_uri else {
277        return FileResult::Skip;
278    };
279
280    // Keep original URI for override matching (before rewrites)
281    let original_schema_uri = schema_uri.clone();
282
283    // Apply rewrite rules, then resolve // paths relative to lintel.toml
284    let schema_uri = lintel_config::apply_rewrites(&schema_uri, &config.rewrite);
285    let schema_uri = lintel_config::resolve_double_slash(&schema_uri, config_dir);
286
287    // Resolve relative local paths against the file's parent directory.
288    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
289    let schema_uri = if is_remote {
290        schema_uri
291    } else {
292        path.parent()
293            .map(|parent| parent.join(&schema_uri).to_string_lossy().to_string())
294            .unwrap_or(schema_uri)
295    };
296
297    FileResult::Parsed {
298        schema_uri,
299        parsed: ParsedFile {
300            path: path_str,
301            content,
302            instance,
303            original_schema_uri,
304        },
305    }
306}
307
308/// Read each file concurrently with tokio, parse its content, extract its
309/// schema URI, apply rewrites, and group by resolved schema URI.
310#[tracing::instrument(skip_all, fields(file_count = files.len()))]
311#[allow(clippy::too_many_arguments)]
312async fn parse_and_group_files(
313    files: &[PathBuf],
314    config: &lintel_config::Config,
315    config_dir: &Path,
316    compiled_catalogs: &[CompiledCatalog],
317    errors: &mut Vec<LintError>,
318) -> BTreeMap<String, Vec<ParsedFile>> {
319    // Read all files concurrently using tokio async I/O, with a semaphore
320    // to avoid exhausting file descriptors on large directories.
321    let semaphore = alloc::sync::Arc::new(tokio::sync::Semaphore::new(FD_CONCURRENCY_LIMIT));
322    let mut read_set = tokio::task::JoinSet::new();
323    for path in files {
324        let path = path.clone();
325        let sem = semaphore.clone();
326        read_set.spawn(async move {
327            let _permit = sem.acquire().await.expect("semaphore closed");
328            let result = tokio::fs::read_to_string(&path).await;
329            (path, result)
330        });
331    }
332
333    let mut file_contents = Vec::with_capacity(files.len());
334    while let Some(result) = read_set.join_next().await {
335        match result {
336            Ok(item) => file_contents.push(item),
337            Err(e) => tracing::warn!("file read task panicked: {e}"),
338        }
339    }
340
341    // Process files: parse content and resolve schema URIs.
342    let mut schema_groups: BTreeMap<String, Vec<ParsedFile>> = BTreeMap::new();
343    for (path, content_result) in file_contents {
344        let content = match content_result {
345            Ok(c) => c,
346            Err(e) => {
347                errors.push(LintError::Io {
348                    path: path.display().to_string(),
349                    message: format!("failed to read: {e}"),
350                });
351                continue;
352            }
353        };
354        let result = process_one_file(&path, content, config, config_dir, compiled_catalogs);
355        match result {
356            FileResult::Parsed { schema_uri, parsed } => {
357                schema_groups.entry(schema_uri).or_default().push(parsed);
358            }
359            FileResult::Error(e) => errors.push(e),
360            FileResult::Skip => {}
361        }
362    }
363
364    schema_groups
365}
366
367// ---------------------------------------------------------------------------
368// Phase 2: Schema fetching, compilation, and instance validation
369// ---------------------------------------------------------------------------
370
371/// Fetch a schema by URI, returning its parsed JSON and cache status.
372///
373/// For remote URIs, checks the prefetched map first; for local URIs, reads
374/// from disk (with in-memory caching to avoid redundant I/O for shared schemas).
375#[allow(clippy::too_many_arguments)]
376async fn fetch_schema_from_prefetched(
377    schema_uri: &str,
378    prefetched: &HashMap<String, Result<(Value, CacheStatus), String>>,
379    local_cache: &mut HashMap<String, Value>,
380    group: &[ParsedFile],
381    errors: &mut Vec<LintError>,
382    checked: &mut Vec<CheckedFile>,
383    on_check: &mut impl FnMut(&CheckedFile),
384) -> Option<(Value, Option<CacheStatus>)> {
385    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
386
387    let result: Result<(Value, Option<CacheStatus>), String> = if is_remote {
388        match prefetched.get(schema_uri) {
389            Some(Ok((v, status))) => Ok((v.clone(), Some(*status))),
390            Some(Err(e)) => Err(format!("failed to fetch schema: {schema_uri}: {e}")),
391            None => Err(format!("schema not prefetched: {schema_uri}")),
392        }
393    } else if let Some(cached) = local_cache.get(schema_uri) {
394        Ok((cached.clone(), None))
395    } else {
396        tokio::fs::read_to_string(schema_uri)
397            .await
398            .map_err(|e| format!("failed to read local schema {schema_uri}: {e}"))
399            .and_then(|content| {
400                serde_json::from_str::<Value>(&content)
401                    .map(|v| {
402                        local_cache.insert(schema_uri.to_string(), v.clone());
403                        (v, None)
404                    })
405                    .map_err(|e| format!("failed to parse local schema {schema_uri}: {e}"))
406            })
407    };
408
409    match result {
410        Ok(value) => Some(value),
411        Err(message) => {
412            report_group_error(
413                |path| LintError::SchemaFetch {
414                    path: path.to_string(),
415                    message: message.clone(),
416                },
417                schema_uri,
418                None,
419                group,
420                errors,
421                checked,
422                on_check,
423            );
424            None
425        }
426    }
427}
428
429/// Report the same error for every file in a schema group.
430#[allow(clippy::too_many_arguments)]
431fn report_group_error<P: alloc::borrow::Borrow<ParsedFile>>(
432    make_error: impl Fn(&str) -> LintError,
433    schema_uri: &str,
434    cache_status: Option<CacheStatus>,
435    group: &[P],
436    errors: &mut Vec<LintError>,
437    checked: &mut Vec<CheckedFile>,
438    on_check: &mut impl FnMut(&CheckedFile),
439) {
440    for item in group {
441        let pf = item.borrow();
442        let cf = CheckedFile {
443            path: pf.path.clone(),
444            schema: schema_uri.to_string(),
445            cache_status,
446            validation_cache_status: None,
447        };
448        on_check(&cf);
449        checked.push(cf);
450        errors.push(make_error(&pf.path));
451    }
452}
453
454/// Mark every file in a group as checked (no errors).
455#[allow(clippy::too_many_arguments)]
456fn mark_group_checked<P: alloc::borrow::Borrow<ParsedFile>>(
457    schema_uri: &str,
458    cache_status: Option<CacheStatus>,
459    validation_cache_status: Option<ValidationCacheStatus>,
460    group: &[P],
461    checked: &mut Vec<CheckedFile>,
462    on_check: &mut impl FnMut(&CheckedFile),
463) {
464    for item in group {
465        let pf = item.borrow();
466        let cf = CheckedFile {
467            path: pf.path.clone(),
468            schema: schema_uri.to_string(),
469            cache_status,
470            validation_cache_status,
471        };
472        on_check(&cf);
473        checked.push(cf);
474    }
475}
476
/// Clean up error messages from the `jsonschema` crate.
///
/// For `anyOf`/`oneOf` failures the crate dumps the entire JSON value into the
/// message (e.g. `{...} is not valid under any of the schemas listed in the 'oneOf' keyword`).
/// The source snippet already shows the value, so we strip the redundant prefix
/// and keep only `"not valid under any of the schemas listed in the 'oneOf' keyword"`.
///
/// The marker is searched from the end of the message: the dumped value comes
/// first and may itself contain the marker text, so only the last occurrence
/// is the real one.
///
/// All other messages are returned unchanged.
fn clean_error_message(mut msg: String) -> String {
    const MARKER: &str = " is not valid under any of the schemas listed in the '";
    // Leading part of MARKER that is dropped so the result starts at "not valid...".
    const LEAD: &str = " is ";

    if let Some(pos) = msg.rfind(MARKER) {
        // MARKER is ASCII, so `pos + LEAD.len()` is always a char boundary.
        // Trimming in place reuses the existing allocation.
        msg.replace_range(..pos + LEAD.len(), "");
    }
    msg
}
493
494/// Convert [`ValidationError`]s into [`LintError::Validation`] diagnostics.
495fn push_validation_errors(
496    pf: &ParsedFile,
497    schema_url: &str,
498    validation_errors: &[ValidationError],
499    errors: &mut Vec<LintError>,
500) {
501    for ve in validation_errors {
502        let span = find_instance_path_span(&pf.content, &ve.instance_path);
503        let instance_path = if ve.instance_path.is_empty() {
504            DEFAULT_LABEL.to_string()
505        } else {
506            ve.instance_path.clone()
507        };
508        let label = format_label(&instance_path, &ve.schema_path);
509        let source_span: miette::SourceSpan = span.into();
510        errors.push(LintError::Validation {
511            src: miette::NamedSource::new(&pf.path, pf.content.clone()),
512            span: source_span,
513            schema_span: source_span,
514            path: pf.path.clone(),
515            instance_path,
516            label,
517            message: ve.message.clone(),
518            schema_url: schema_url.to_string(),
519            schema_path: ve.schema_path.clone(),
520        });
521    }
522}
523
524/// Validate all files in a group against an already-compiled validator and store
525/// results in the validation cache.
526#[tracing::instrument(skip_all, fields(schema_uri, file_count = group.len()))]
527#[allow(clippy::too_many_arguments)]
528async fn validate_group<P: alloc::borrow::Borrow<ParsedFile>>(
529    validator: &jsonschema::Validator,
530    schema_uri: &str,
531    schema_hash: &str,
532    validate_formats: bool,
533    cache_status: Option<CacheStatus>,
534    group: &[P],
535    vcache: &lintel_validation_cache::ValidationCache,
536    errors: &mut Vec<LintError>,
537    checked: &mut Vec<CheckedFile>,
538    on_check: &mut impl FnMut(&CheckedFile),
539) {
540    for item in group {
541        let pf = item.borrow();
542        let file_errors: Vec<ValidationError> = validator
543            .iter_errors(&pf.instance)
544            .map(|error| ValidationError {
545                instance_path: error.instance_path().to_string(),
546                message: clean_error_message(error.to_string()),
547                schema_path: error.schema_path().to_string(),
548            })
549            .collect();
550
551        vcache
552            .store(
553                &lintel_validation_cache::CacheKey {
554                    file_content: &pf.content,
555                    schema_hash,
556                    validate_formats,
557                },
558                &file_errors,
559            )
560            .await;
561        push_validation_errors(pf, schema_uri, &file_errors, errors);
562
563        let cf = CheckedFile {
564            path: pf.path.clone(),
565            schema: schema_uri.to_string(),
566            cache_status,
567            validation_cache_status: Some(ValidationCacheStatus::Miss),
568        };
569        on_check(&cf);
570        checked.push(cf);
571    }
572}
573
574// ---------------------------------------------------------------------------
575// Public API
576// ---------------------------------------------------------------------------
577
578/// Fetch and compile all schema catalogs (default, `SchemaStore`, and custom registries).
579///
580/// Returns a list of compiled catalogs, printing warnings for any that fail to fetch.
581pub async fn fetch_compiled_catalogs(
582    retriever: &SchemaCache,
583    config: &lintel_config::Config,
584    no_catalog: bool,
585) -> Vec<CompiledCatalog> {
586    let mut compiled_catalogs = Vec::new();
587
588    if !no_catalog {
589        let catalog_span = tracing::info_span!("fetch_catalogs").entered();
590
591        // Catalogs are fetched concurrently but sorted by priority so that
592        // the Lintel catalog wins over custom registries, which win over
593        // SchemaStore.  The `order` field encodes this precedence.
594        #[allow(clippy::items_after_statements)]
595        type CatalogResult = (
596            usize, // priority (lower = higher precedence)
597            String,
598            Result<CompiledCatalog, Box<dyn core::error::Error + Send + Sync>>,
599        );
600        let mut catalog_tasks: tokio::task::JoinSet<CatalogResult> = tokio::task::JoinSet::new();
601
602        // Custom registries from lintel.toml (highest precedence among catalogs)
603        for (i, registry_url) in config.registries.iter().enumerate() {
604            let r = retriever.clone();
605            let url = registry_url.clone();
606            let label = format!("registry {url}");
607            catalog_tasks.spawn(async move {
608                let result = registry::fetch(&r, &url)
609                    .await
610                    .map(|cat| CompiledCatalog::compile(&cat));
611                (i, label, result)
612            });
613        }
614
615        // Lintel catalog
616        let lintel_order = config.registries.len();
617        if !config.no_default_catalog {
618            let r = retriever.clone();
619            let label = format!("default catalog {}", registry::DEFAULT_REGISTRY);
620            catalog_tasks.spawn(async move {
621                let result = registry::fetch(&r, registry::DEFAULT_REGISTRY)
622                    .await
623                    .map(|cat| CompiledCatalog::compile(&cat));
624                (lintel_order, label, result)
625            });
626        }
627
628        // SchemaStore catalog (lowest precedence)
629        let schemastore_order = config.registries.len() + 1;
630        let r = retriever.clone();
631        catalog_tasks.spawn(async move {
632            let result = catalog::fetch_catalog(&r)
633                .await
634                .map(|cat| CompiledCatalog::compile(&cat));
635            (schemastore_order, "SchemaStore catalog".to_string(), result)
636        });
637
638        let mut results: Vec<(usize, CompiledCatalog)> = Vec::new();
639        while let Some(result) = catalog_tasks.join_next().await {
640            match result {
641                Ok((order, _, Ok(compiled))) => results.push((order, compiled)),
642                Ok((_, label, Err(e))) => eprintln!("warning: failed to fetch {label}: {e}"),
643                Err(e) => eprintln!("warning: catalog fetch task failed: {e}"),
644            }
645        }
646        results.sort_by_key(|(order, _)| *order);
647        compiled_catalogs.extend(results.into_iter().map(|(_, cat)| cat));
648
649        drop(catalog_span);
650    }
651
652    compiled_catalogs
653}
654
655/// # Errors
656///
657/// Returns an error if file collection or schema validation encounters an I/O error.
658pub async fn run(args: &ValidateArgs) -> Result<ValidateResult> {
659    run_with(args, None, |_| {}).await
660}
661
662/// Like [`run`], but calls `on_check` each time a file is checked, allowing
663/// callers to stream progress (e.g. verbose output) as files are processed.
664///
665/// # Errors
666///
667/// Returns an error if file collection or schema validation encounters an I/O error.
668#[tracing::instrument(skip_all, name = "validate")]
669#[allow(clippy::too_many_lines)]
670pub async fn run_with(
671    args: &ValidateArgs,
672    cache: Option<SchemaCache>,
673    mut on_check: impl FnMut(&CheckedFile),
674) -> Result<ValidateResult> {
675    let retriever = if let Some(c) = cache {
676        c
677    } else {
678        let mut builder = SchemaCache::builder().force_fetch(args.force_schema_fetch);
679        if let Some(dir) = &args.cache_dir {
680            let path = PathBuf::from(dir);
681            let _ = fs::create_dir_all(&path);
682            builder = builder.cache_dir(path);
683        }
684        if let Some(ttl) = args.schema_cache_ttl {
685            builder = builder.ttl(ttl);
686        }
687        builder.build()
688    };
689
690    let (config, config_dir, _config_path) = load_config(args.config_dir.as_deref());
691    let files = collect_files(&args.globs, &args.exclude)?;
692    tracing::info!(file_count = files.len(), "collected files");
693
694    let compiled_catalogs = fetch_compiled_catalogs(&retriever, &config, args.no_catalog).await;
695
696    let mut errors: Vec<LintError> = Vec::new();
697    let mut checked: Vec<CheckedFile> = Vec::new();
698
699    // Phase 1: Parse files and resolve schema URIs
700    let schema_groups = parse_and_group_files(
701        &files,
702        &config,
703        &config_dir,
704        &compiled_catalogs,
705        &mut errors,
706    )
707    .await;
708    tracing::info!(
709        schema_count = schema_groups.len(),
710        total_files = schema_groups.values().map(Vec::len).sum::<usize>(),
711        "grouped files by schema"
712    );
713
714    // Create validation cache
715    let vcache = lintel_validation_cache::ValidationCache::new(
716        lintel_validation_cache::ensure_cache_dir(),
717        args.force_validation,
718    );
719
720    // Prefetch all remote schemas in parallel
721    let remote_uris: Vec<&String> = schema_groups
722        .keys()
723        .filter(|uri| uri.starts_with("http://") || uri.starts_with("https://"))
724        .collect();
725
726    let prefetched = {
727        let _prefetch_span =
728            tracing::info_span!("prefetch_schemas", count = remote_uris.len()).entered();
729
730        let mut schema_tasks = tokio::task::JoinSet::new();
731        for uri in remote_uris {
732            let r = retriever.clone();
733            let u = uri.clone();
734            schema_tasks.spawn(async move {
735                let result = r.fetch(&u).await;
736                (u, result)
737            });
738        }
739
740        let mut prefetched: HashMap<String, Result<(Value, CacheStatus), String>> = HashMap::new();
741        while let Some(result) = schema_tasks.join_next().await {
742            match result {
743                Ok((uri, fetch_result)) => {
744                    prefetched.insert(uri, fetch_result.map_err(|e| e.to_string()));
745                }
746                Err(e) => eprintln!("warning: schema prefetch task failed: {e}"),
747            }
748        }
749
750        prefetched
751    };
752
753    // Phase 2: Compile each schema once and validate all matching files
754    let mut local_schema_cache: HashMap<String, Value> = HashMap::new();
755    let mut fetch_time = core::time::Duration::ZERO;
756    let mut hash_time = core::time::Duration::ZERO;
757    let mut vcache_time = core::time::Duration::ZERO;
758    let mut compile_time = core::time::Duration::ZERO;
759    let mut validate_time = core::time::Duration::ZERO;
760
761    for (schema_uri, group) in &schema_groups {
762        let _group_span = tracing::debug_span!(
763            "schema_group",
764            schema = schema_uri.as_str(),
765            files = group.len(),
766        )
767        .entered();
768
769        // If ANY file in the group matches a `validate_formats = false` override,
770        // disable format validation for the whole group (they share one compiled validator).
771        let validate_formats = group.iter().all(|pf| {
772            config
773                .should_validate_formats(&pf.path, &[&pf.original_schema_uri, schema_uri.as_str()])
774        });
775
776        // Remote schemas were prefetched in parallel above; local schemas are
777        // read from disk here (with in-memory caching).
778        let t = std::time::Instant::now();
779        let Some((schema_value, cache_status)) = fetch_schema_from_prefetched(
780            schema_uri,
781            &prefetched,
782            &mut local_schema_cache,
783            group,
784            &mut errors,
785            &mut checked,
786            &mut on_check,
787        )
788        .await
789        else {
790            fetch_time += t.elapsed();
791            continue;
792        };
793        fetch_time += t.elapsed();
794
795        // Pre-compute schema hash once for the entire group.
796        let t = std::time::Instant::now();
797        let schema_hash = lintel_validation_cache::schema_hash(&schema_value);
798        hash_time += t.elapsed();
799
800        // Split the group into validation cache hits and misses.
801        let mut cache_misses: Vec<&ParsedFile> = Vec::new();
802
803        let t = std::time::Instant::now();
804        for pf in group {
805            let (cached, vcache_status) = vcache
806                .lookup(&lintel_validation_cache::CacheKey {
807                    file_content: &pf.content,
808                    schema_hash: &schema_hash,
809                    validate_formats,
810                })
811                .await;
812
813            if let Some(cached_errors) = cached {
814                push_validation_errors(pf, schema_uri, &cached_errors, &mut errors);
815                let cf = CheckedFile {
816                    path: pf.path.clone(),
817                    schema: schema_uri.clone(),
818                    cache_status,
819                    validation_cache_status: Some(vcache_status),
820                };
821                on_check(&cf);
822                checked.push(cf);
823            } else {
824                cache_misses.push(pf);
825            }
826        }
827        vcache_time += t.elapsed();
828
829        tracing::debug!(
830            cache_hits = group.len() - cache_misses.len(),
831            cache_misses = cache_misses.len(),
832            "validation cache"
833        );
834
835        // If all files hit the validation cache, skip schema compilation entirely.
836        if cache_misses.is_empty() {
837            continue;
838        }
839
840        // Compile the schema for cache misses.
841        let t = std::time::Instant::now();
842        let validator = {
843            // Set base URI for remote schemas so relative $ref values
844            // (e.g. "./rule.json") resolve correctly.
845            let is_remote_schema =
846                schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
847            let opts = jsonschema::async_options()
848                .with_retriever(retriever.clone())
849                .should_validate_formats(validate_formats);
850            let opts = if is_remote_schema {
851                opts.with_base_uri(schema_uri.clone())
852            } else {
853                opts
854            };
855            match opts.build(&schema_value).await {
856                Ok(v) => v,
857                Err(e) => {
858                    compile_time += t.elapsed();
859                    // When format validation is disabled and the compilation error
860                    // is a uri-reference issue (e.g. Rust-style $ref paths in
861                    // vector.json), skip validation silently.
862                    if !validate_formats && e.to_string().contains("uri-reference") {
863                        mark_group_checked(
864                            schema_uri,
865                            cache_status,
866                            Some(ValidationCacheStatus::Miss),
867                            &cache_misses,
868                            &mut checked,
869                            &mut on_check,
870                        );
871                        continue;
872                    }
873                    let msg = format!("failed to compile schema: {e}");
874                    report_group_error(
875                        |path| LintError::SchemaCompile {
876                            path: path.to_string(),
877                            message: msg.clone(),
878                        },
879                        schema_uri,
880                        cache_status,
881                        &cache_misses,
882                        &mut errors,
883                        &mut checked,
884                        &mut on_check,
885                    );
886                    continue;
887                }
888            }
889        };
890        compile_time += t.elapsed();
891
892        let t = std::time::Instant::now();
893        validate_group(
894            &validator,
895            schema_uri,
896            &schema_hash,
897            validate_formats,
898            cache_status,
899            &cache_misses,
900            &vcache,
901            &mut errors,
902            &mut checked,
903            &mut on_check,
904        )
905        .await;
906        validate_time += t.elapsed();
907    }
908
909    #[allow(clippy::cast_possible_truncation)]
910    {
911        tracing::info!(
912            fetch_ms = fetch_time.as_millis() as u64,
913            hash_ms = hash_time.as_millis() as u64,
914            vcache_ms = vcache_time.as_millis() as u64,
915            compile_ms = compile_time.as_millis() as u64,
916            validate_ms = validate_time.as_millis() as u64,
917            "phase2 breakdown"
918        );
919    }
920
921    // Sort errors for deterministic output (by path, then by span offset)
922    errors.sort_by(|a, b| {
923        a.path()
924            .cmp(b.path())
925            .then_with(|| a.offset().cmp(&b.offset()))
926    });
927
928    Ok(ValidateResult { errors, checked })
929}
930
931#[cfg(test)]
932mod tests {
933    use super::*;
934    use lintel_schema_cache::SchemaCache;
935    use std::path::Path;
936
937    fn mock(entries: &[(&str, &str)]) -> SchemaCache {
938        let cache = SchemaCache::memory();
939        for (uri, body) in entries {
940            cache.insert(
941                uri,
942                serde_json::from_str(body).expect("test mock: invalid JSON"),
943            );
944        }
945        cache
946    }
947
948    fn testdata() -> PathBuf {
949        Path::new(env!("CARGO_MANIFEST_DIR")).join("testdata")
950    }
951
952    /// Build glob patterns that scan one or more testdata directories for all supported file types.
953    fn scenario_globs(dirs: &[&str]) -> Vec<String> {
954        dirs.iter()
955            .flat_map(|dir| {
956                let base = testdata().join(dir);
957                vec![
958                    base.join("*.json").to_string_lossy().to_string(),
959                    base.join("*.yaml").to_string_lossy().to_string(),
960                    base.join("*.yml").to_string_lossy().to_string(),
961                    base.join("*.json5").to_string_lossy().to_string(),
962                    base.join("*.jsonc").to_string_lossy().to_string(),
963                    base.join("*.toml").to_string_lossy().to_string(),
964                ]
965            })
966            .collect()
967    }
968
969    fn args_for_dirs(dirs: &[&str]) -> ValidateArgs {
970        ValidateArgs {
971            globs: scenario_globs(dirs),
972            exclude: vec![],
973            cache_dir: None,
974            force_schema_fetch: true,
975            force_validation: true,
976            no_catalog: true,
977            config_dir: None,
978            schema_cache_ttl: None,
979        }
980    }
981
/// JSON Schema used by the fixture-driven tests: an object whose `name`
/// property is a string and is required.
const SCHEMA: &str =
    r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
984
985    fn schema_mock() -> SchemaCache {
986        mock(&[("https://example.com/schema.json", SCHEMA)])
987    }
988
989    // --- Directory scanning tests ---
990
991    #[tokio::test]
992    async fn no_matching_files() -> anyhow::Result<()> {
993        let tmp = tempfile::tempdir()?;
994        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
995        let c = ValidateArgs {
996            globs: vec![pattern],
997            exclude: vec![],
998            cache_dir: None,
999            force_schema_fetch: true,
1000            force_validation: true,
1001            no_catalog: true,
1002            config_dir: None,
1003            schema_cache_ttl: None,
1004        };
1005        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1006        assert!(!result.has_errors());
1007        Ok(())
1008    }
1009
1010    #[tokio::test]
1011    async fn dir_all_valid() -> anyhow::Result<()> {
1012        let c = args_for_dirs(&["positive_tests"]);
1013        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1014        assert!(!result.has_errors());
1015        Ok(())
1016    }
1017
1018    #[tokio::test]
1019    async fn dir_all_invalid() -> anyhow::Result<()> {
1020        let c = args_for_dirs(&["negative_tests"]);
1021        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1022        assert!(result.has_errors());
1023        Ok(())
1024    }
1025
1026    #[tokio::test]
1027    async fn dir_mixed_valid_and_invalid() -> anyhow::Result<()> {
1028        let c = args_for_dirs(&["positive_tests", "negative_tests"]);
1029        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1030        assert!(result.has_errors());
1031        Ok(())
1032    }
1033
1034    #[tokio::test]
1035    async fn dir_no_schemas_skipped() -> anyhow::Result<()> {
1036        let c = args_for_dirs(&["no_schema"]);
1037        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1038        assert!(!result.has_errors());
1039        Ok(())
1040    }
1041
1042    #[tokio::test]
1043    async fn dir_valid_with_no_schema_files() -> anyhow::Result<()> {
1044        let c = args_for_dirs(&["positive_tests", "no_schema"]);
1045        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1046        assert!(!result.has_errors());
1047        Ok(())
1048    }
1049
1050    // --- Directory as positional arg ---
1051
1052    #[tokio::test]
1053    async fn directory_arg_discovers_files() -> anyhow::Result<()> {
1054        let dir = testdata().join("positive_tests");
1055        let c = ValidateArgs {
1056            globs: vec![dir.to_string_lossy().to_string()],
1057            exclude: vec![],
1058            cache_dir: None,
1059            force_schema_fetch: true,
1060            force_validation: true,
1061            no_catalog: true,
1062            config_dir: None,
1063            schema_cache_ttl: None,
1064        };
1065        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1066        assert!(!result.has_errors());
1067        assert!(result.files_checked() > 0);
1068        Ok(())
1069    }
1070
1071    #[tokio::test]
1072    async fn multiple_directory_args() -> anyhow::Result<()> {
1073        let pos_dir = testdata().join("positive_tests");
1074        let no_schema_dir = testdata().join("no_schema");
1075        let c = ValidateArgs {
1076            globs: vec![
1077                pos_dir.to_string_lossy().to_string(),
1078                no_schema_dir.to_string_lossy().to_string(),
1079            ],
1080            exclude: vec![],
1081            cache_dir: None,
1082            force_schema_fetch: true,
1083            force_validation: true,
1084            no_catalog: true,
1085            config_dir: None,
1086            schema_cache_ttl: None,
1087        };
1088        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1089        assert!(!result.has_errors());
1090        Ok(())
1091    }
1092
1093    #[tokio::test]
1094    async fn mix_directory_and_glob_args() -> anyhow::Result<()> {
1095        let dir = testdata().join("positive_tests");
1096        let glob_pattern = testdata()
1097            .join("no_schema")
1098            .join("*.json")
1099            .to_string_lossy()
1100            .to_string();
1101        let c = ValidateArgs {
1102            globs: vec![dir.to_string_lossy().to_string(), glob_pattern],
1103            exclude: vec![],
1104            cache_dir: None,
1105            force_schema_fetch: true,
1106            force_validation: true,
1107            no_catalog: true,
1108            config_dir: None,
1109            schema_cache_ttl: None,
1110        };
1111        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1112        assert!(!result.has_errors());
1113        Ok(())
1114    }
1115
1116    #[tokio::test]
1117    async fn malformed_json_parse_error() -> anyhow::Result<()> {
1118        let base = testdata().join("malformed");
1119        let c = ValidateArgs {
1120            globs: vec![base.join("*.json").to_string_lossy().to_string()],
1121            exclude: vec![],
1122            cache_dir: None,
1123            force_schema_fetch: true,
1124            force_validation: true,
1125            no_catalog: true,
1126            config_dir: None,
1127            schema_cache_ttl: None,
1128        };
1129        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1130        assert!(result.has_errors());
1131        Ok(())
1132    }
1133
1134    #[tokio::test]
1135    async fn malformed_yaml_parse_error() -> anyhow::Result<()> {
1136        let base = testdata().join("malformed");
1137        let c = ValidateArgs {
1138            globs: vec![base.join("*.yaml").to_string_lossy().to_string()],
1139            exclude: vec![],
1140            cache_dir: None,
1141            force_schema_fetch: true,
1142            force_validation: true,
1143            no_catalog: true,
1144            config_dir: None,
1145            schema_cache_ttl: None,
1146        };
1147        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1148        assert!(result.has_errors());
1149        Ok(())
1150    }
1151
1152    // --- Exclude filter ---
1153
1154    #[tokio::test]
1155    async fn exclude_filters_files_in_dir() -> anyhow::Result<()> {
1156        let base = testdata().join("negative_tests");
1157        let c = ValidateArgs {
1158            globs: scenario_globs(&["positive_tests", "negative_tests"]),
1159            exclude: vec![
1160                base.join("missing_name.json").to_string_lossy().to_string(),
1161                base.join("missing_name.toml").to_string_lossy().to_string(),
1162                base.join("missing_name.yaml").to_string_lossy().to_string(),
1163            ],
1164            cache_dir: None,
1165            force_schema_fetch: true,
1166            force_validation: true,
1167            no_catalog: true,
1168            config_dir: None,
1169            schema_cache_ttl: None,
1170        };
1171        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1172        assert!(!result.has_errors());
1173        Ok(())
1174    }
1175
1176    // --- Cache options ---
1177
1178    #[tokio::test]
1179    async fn custom_cache_dir() -> anyhow::Result<()> {
1180        let c = ValidateArgs {
1181            globs: scenario_globs(&["positive_tests"]),
1182            exclude: vec![],
1183            cache_dir: None,
1184            force_schema_fetch: true,
1185            force_validation: true,
1186            no_catalog: true,
1187            config_dir: None,
1188            schema_cache_ttl: None,
1189        };
1190        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1191        assert!(!result.has_errors());
1192        Ok(())
1193    }
1194
1195    // --- Local schema ---
1196
1197    #[tokio::test]
1198    async fn json_valid_with_local_schema() -> anyhow::Result<()> {
1199        let tmp = tempfile::tempdir()?;
1200        let schema_path = tmp.path().join("schema.json");
1201        fs::write(&schema_path, SCHEMA)?;
1202
1203        let f = tmp.path().join("valid.json");
1204        fs::write(
1205            &f,
1206            format!(
1207                r#"{{"$schema":"{}","name":"hello"}}"#,
1208                schema_path.to_string_lossy()
1209            ),
1210        )?;
1211
1212        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1213        let c = ValidateArgs {
1214            globs: vec![pattern],
1215            exclude: vec![],
1216            cache_dir: None,
1217            force_schema_fetch: true,
1218            force_validation: true,
1219            no_catalog: true,
1220            config_dir: None,
1221            schema_cache_ttl: None,
1222        };
1223        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1224        assert!(!result.has_errors());
1225        Ok(())
1226    }
1227
1228    #[tokio::test]
1229    async fn yaml_valid_with_local_schema() -> anyhow::Result<()> {
1230        let tmp = tempfile::tempdir()?;
1231        let schema_path = tmp.path().join("schema.json");
1232        fs::write(&schema_path, SCHEMA)?;
1233
1234        let f = tmp.path().join("valid.yaml");
1235        fs::write(
1236            &f,
1237            format!(
1238                "# yaml-language-server: $schema={}\nname: hello\n",
1239                schema_path.to_string_lossy()
1240            ),
1241        )?;
1242
1243        let pattern = tmp.path().join("*.yaml").to_string_lossy().to_string();
1244        let c = ValidateArgs {
1245            globs: vec![pattern],
1246            exclude: vec![],
1247            cache_dir: None,
1248            force_schema_fetch: true,
1249            force_validation: true,
1250            no_catalog: true,
1251            config_dir: None,
1252            schema_cache_ttl: None,
1253        };
1254        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1255        assert!(!result.has_errors());
1256        Ok(())
1257    }
1258
1259    #[tokio::test]
1260    async fn missing_local_schema_errors() -> anyhow::Result<()> {
1261        let tmp = tempfile::tempdir()?;
1262        let f = tmp.path().join("ref.json");
1263        fs::write(&f, r#"{"$schema":"/nonexistent/schema.json"}"#)?;
1264
1265        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1266        let c = ValidateArgs {
1267            globs: vec![pattern],
1268            exclude: vec![],
1269            cache_dir: None,
1270            force_schema_fetch: true,
1271            force_validation: true,
1272            no_catalog: true,
1273            config_dir: None,
1274            schema_cache_ttl: None,
1275        };
1276        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1277        assert!(result.has_errors());
1278        Ok(())
1279    }
1280
1281    // --- JSON5 / JSONC tests ---
1282
1283    #[tokio::test]
1284    async fn json5_valid_with_schema() -> anyhow::Result<()> {
1285        let tmp = tempfile::tempdir()?;
1286        let schema_path = tmp.path().join("schema.json");
1287        fs::write(&schema_path, SCHEMA)?;
1288
1289        let f = tmp.path().join("config.json5");
1290        fs::write(
1291            &f,
1292            format!(
1293                r#"{{
1294  // JSON5 comment
1295  "$schema": "{}",
1296  name: "hello",
1297}}"#,
1298                schema_path.to_string_lossy()
1299            ),
1300        )?;
1301
1302        let pattern = tmp.path().join("*.json5").to_string_lossy().to_string();
1303        let c = ValidateArgs {
1304            globs: vec![pattern],
1305            exclude: vec![],
1306            cache_dir: None,
1307            force_schema_fetch: true,
1308            force_validation: true,
1309            no_catalog: true,
1310            config_dir: None,
1311            schema_cache_ttl: None,
1312        };
1313        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1314        assert!(!result.has_errors());
1315        Ok(())
1316    }
1317
1318    #[tokio::test]
1319    async fn jsonc_valid_with_schema() -> anyhow::Result<()> {
1320        let tmp = tempfile::tempdir()?;
1321        let schema_path = tmp.path().join("schema.json");
1322        fs::write(&schema_path, SCHEMA)?;
1323
1324        let f = tmp.path().join("config.jsonc");
1325        fs::write(
1326            &f,
1327            format!(
1328                r#"{{
1329  /* JSONC comment */
1330  "$schema": "{}",
1331  "name": "hello"
1332}}"#,
1333                schema_path.to_string_lossy()
1334            ),
1335        )?;
1336
1337        let pattern = tmp.path().join("*.jsonc").to_string_lossy().to_string();
1338        let c = ValidateArgs {
1339            globs: vec![pattern],
1340            exclude: vec![],
1341            cache_dir: None,
1342            force_schema_fetch: true,
1343            force_validation: true,
1344            no_catalog: true,
1345            config_dir: None,
1346            schema_cache_ttl: None,
1347        };
1348        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1349        assert!(!result.has_errors());
1350        Ok(())
1351    }
1352
1353    // --- Catalog-based schema matching ---
1354
/// Cut-down stand-in for the GitHub workflow schema served by the mocked cache:
/// an object that requires `on` and `jobs`, with `name` a string and `jobs` an
/// object when present.
const GH_WORKFLOW_SCHEMA: &str = r#"{
        "type": "object",
        "properties": {
            "name": { "type": "string" },
            "on": {},
            "jobs": { "type": "object" }
        },
        "required": ["on", "jobs"]
    }"#;
1364
/// Catalog document with a single entry mapping `.github/workflows/*.yml` and
/// `*.yaml` files to the GitHub workflow schema URL.
fn gh_catalog_json() -> String {
    String::from(
        r#"{"schemas":[{
            "name": "GitHub Workflow",
            "url": "https://www.schemastore.org/github-workflow.json",
            "fileMatch": [
                "**/.github/workflows/*.yml",
                "**/.github/workflows/*.yaml"
            ]
        }]}"#,
    )
}
1376
1377    #[tokio::test]
1378    async fn catalog_matches_github_workflow_valid() -> anyhow::Result<()> {
1379        let tmp = tempfile::tempdir()?;
1380        let cache_tmp = tempfile::tempdir()?;
1381        let wf_dir = tmp.path().join(".github/workflows");
1382        fs::create_dir_all(&wf_dir)?;
1383        fs::write(
1384            wf_dir.join("ci.yml"),
1385            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1386        )?;
1387
1388        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1389        let client = mock(&[
1390            (
1391                "https://www.schemastore.org/api/json/catalog.json",
1392                &gh_catalog_json(),
1393            ),
1394            (
1395                "https://www.schemastore.org/github-workflow.json",
1396                GH_WORKFLOW_SCHEMA,
1397            ),
1398        ]);
1399        let c = ValidateArgs {
1400            globs: vec![pattern],
1401            exclude: vec![],
1402            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1403            force_schema_fetch: true,
1404            force_validation: true,
1405            no_catalog: false,
1406            config_dir: None,
1407            schema_cache_ttl: None,
1408        };
1409        let result = run_with(&c, Some(client), |_| {}).await?;
1410        assert!(!result.has_errors());
1411        Ok(())
1412    }
1413
1414    #[tokio::test]
1415    async fn catalog_matches_github_workflow_invalid() -> anyhow::Result<()> {
1416        let tmp = tempfile::tempdir()?;
1417        let cache_tmp = tempfile::tempdir()?;
1418        let wf_dir = tmp.path().join(".github/workflows");
1419        fs::create_dir_all(&wf_dir)?;
1420        fs::write(wf_dir.join("bad.yml"), "name: Broken\n")?;
1421
1422        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1423        let client = mock(&[
1424            (
1425                "https://www.schemastore.org/api/json/catalog.json",
1426                &gh_catalog_json(),
1427            ),
1428            (
1429                "https://www.schemastore.org/github-workflow.json",
1430                GH_WORKFLOW_SCHEMA,
1431            ),
1432        ]);
1433        let c = ValidateArgs {
1434            globs: vec![pattern],
1435            exclude: vec![],
1436            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1437            force_schema_fetch: true,
1438            force_validation: true,
1439            no_catalog: false,
1440            config_dir: None,
1441            schema_cache_ttl: None,
1442        };
1443        let result = run_with(&c, Some(client), |_| {}).await?;
1444        assert!(result.has_errors());
1445        Ok(())
1446    }
1447
1448    #[tokio::test]
1449    async fn auto_discover_finds_github_workflows() -> anyhow::Result<()> {
1450        let tmp = tempfile::tempdir()?;
1451        let cache_tmp = tempfile::tempdir()?;
1452        let wf_dir = tmp.path().join(".github/workflows");
1453        fs::create_dir_all(&wf_dir)?;
1454        fs::write(
1455            wf_dir.join("ci.yml"),
1456            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1457        )?;
1458
1459        let client = mock(&[
1460            (
1461                "https://www.schemastore.org/api/json/catalog.json",
1462                &gh_catalog_json(),
1463            ),
1464            (
1465                "https://www.schemastore.org/github-workflow.json",
1466                GH_WORKFLOW_SCHEMA,
1467            ),
1468        ]);
1469        let c = ValidateArgs {
1470            globs: vec![],
1471            exclude: vec![],
1472            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1473            force_schema_fetch: true,
1474            force_validation: true,
1475            no_catalog: false,
1476            config_dir: None,
1477            schema_cache_ttl: None,
1478        };
1479
1480        let orig_dir = std::env::current_dir()?;
1481        std::env::set_current_dir(tmp.path())?;
1482        let result = run_with(&c, Some(client), |_| {}).await?;
1483        std::env::set_current_dir(orig_dir)?;
1484
1485        assert!(!result.has_errors());
1486        Ok(())
1487    }
1488
1489    // --- TOML tests ---
1490
1491    #[tokio::test]
1492    async fn toml_valid_with_schema() -> anyhow::Result<()> {
1493        let tmp = tempfile::tempdir()?;
1494        let schema_path = tmp.path().join("schema.json");
1495        fs::write(&schema_path, SCHEMA)?;
1496
1497        let f = tmp.path().join("config.toml");
1498        fs::write(
1499            &f,
1500            format!(
1501                "# :schema {}\nname = \"hello\"\n",
1502                schema_path.to_string_lossy()
1503            ),
1504        )?;
1505
1506        let pattern = tmp.path().join("*.toml").to_string_lossy().to_string();
1507        let c = ValidateArgs {
1508            globs: vec![pattern],
1509            exclude: vec![],
1510            cache_dir: None,
1511            force_schema_fetch: true,
1512            force_validation: true,
1513            no_catalog: true,
1514            config_dir: None,
1515            schema_cache_ttl: None,
1516        };
1517        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1518        assert!(!result.has_errors());
1519        Ok(())
1520    }
1521
1522    // --- Rewrite rules + // resolution ---
1523
1524    #[tokio::test]
1525    async fn rewrite_rule_with_double_slash_resolves_schema() -> anyhow::Result<()> {
1526        let tmp = tempfile::tempdir()?;
1527
1528        let schemas_dir = tmp.path().join("schemas");
1529        fs::create_dir_all(&schemas_dir)?;
1530        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1531
1532        fs::write(
1533            tmp.path().join("lintel.toml"),
1534            r#"
1535[rewrite]
1536"http://localhost:9000/" = "//schemas/"
1537"#,
1538        )?;
1539
1540        let f = tmp.path().join("config.json");
1541        fs::write(
1542            &f,
1543            r#"{"$schema":"http://localhost:9000/test.json","name":"hello"}"#,
1544        )?;
1545
1546        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1547        let c = ValidateArgs {
1548            globs: vec![pattern],
1549            exclude: vec![],
1550            cache_dir: None,
1551            force_schema_fetch: true,
1552            force_validation: true,
1553            no_catalog: true,
1554            config_dir: Some(tmp.path().to_path_buf()),
1555            schema_cache_ttl: None,
1556        };
1557
1558        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1559        assert!(!result.has_errors());
1560        assert_eq!(result.files_checked(), 1);
1561        Ok(())
1562    }
1563
1564    #[tokio::test]
1565    async fn double_slash_schema_resolves_relative_to_config() -> anyhow::Result<()> {
1566        let tmp = tempfile::tempdir()?;
1567
1568        let schemas_dir = tmp.path().join("schemas");
1569        fs::create_dir_all(&schemas_dir)?;
1570        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1571
1572        fs::write(tmp.path().join("lintel.toml"), "")?;
1573
1574        let sub = tmp.path().join("deeply/nested");
1575        fs::create_dir_all(&sub)?;
1576        let f = sub.join("config.json");
1577        fs::write(&f, r#"{"$schema":"//schemas/test.json","name":"hello"}"#)?;
1578
1579        let pattern = sub.join("*.json").to_string_lossy().to_string();
1580        let c = ValidateArgs {
1581            globs: vec![pattern],
1582            exclude: vec![],
1583            cache_dir: None,
1584            force_schema_fetch: true,
1585            force_validation: true,
1586            no_catalog: true,
1587            config_dir: Some(tmp.path().to_path_buf()),
1588            schema_cache_ttl: None,
1589        };
1590
1591        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1592        assert!(!result.has_errors());
1593        Ok(())
1594    }
1595
1596    // --- Format validation override ---
1597
/// Schema for the format-validation tests: an object whose optional `link`
/// property is a string carrying the `uri-reference` format.
const FORMAT_SCHEMA: &str = r#"{
        "type": "object",
        "properties": {
            "link": { "type": "string", "format": "uri-reference" }
        }
    }"#;
1604
1605    #[tokio::test]
1606    async fn format_errors_reported_without_override() -> anyhow::Result<()> {
1607        let tmp = tempfile::tempdir()?;
1608        let schema_path = tmp.path().join("schema.json");
1609        fs::write(&schema_path, FORMAT_SCHEMA)?;
1610
1611        let f = tmp.path().join("data.json");
1612        fs::write(
1613            &f,
1614            format!(
1615                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1616                schema_path.to_string_lossy()
1617            ),
1618        )?;
1619
1620        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1621        let c = ValidateArgs {
1622            globs: vec![pattern],
1623            exclude: vec![],
1624            cache_dir: None,
1625            force_schema_fetch: true,
1626            force_validation: true,
1627            no_catalog: true,
1628            config_dir: Some(tmp.path().to_path_buf()),
1629            schema_cache_ttl: None,
1630        };
1631        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1632        assert!(
1633            result.has_errors(),
1634            "expected format error without override"
1635        );
1636        Ok(())
1637    }
1638
1639    #[tokio::test]
1640    async fn format_errors_suppressed_with_override() -> anyhow::Result<()> {
1641        let tmp = tempfile::tempdir()?;
1642        let schema_path = tmp.path().join("schema.json");
1643        fs::write(&schema_path, FORMAT_SCHEMA)?;
1644
1645        let f = tmp.path().join("data.json");
1646        fs::write(
1647            &f,
1648            format!(
1649                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1650                schema_path.to_string_lossy()
1651            ),
1652        )?;
1653
1654        // Use **/data.json to match the absolute path from the tempdir.
1655        fs::write(
1656            tmp.path().join("lintel.toml"),
1657            r#"
1658[[override]]
1659files = ["**/data.json"]
1660validate_formats = false
1661"#,
1662        )?;
1663
1664        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1665        let c = ValidateArgs {
1666            globs: vec![pattern],
1667            exclude: vec![],
1668            cache_dir: None,
1669            force_schema_fetch: true,
1670            force_validation: true,
1671            no_catalog: true,
1672            config_dir: Some(tmp.path().to_path_buf()),
1673            schema_cache_ttl: None,
1674        };
1675        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1676        assert!(
1677            !result.has_errors(),
1678            "expected no errors with validate_formats = false override"
1679        );
1680        Ok(())
1681    }
1682
1683    // --- Unrecognized extension handling ---
1684
1685    #[tokio::test]
1686    async fn unrecognized_extension_skipped_without_catalog() -> anyhow::Result<()> {
1687        let tmp = tempfile::tempdir()?;
1688        fs::write(tmp.path().join("config.nix"), r#"{"name":"hello"}"#)?;
1689
1690        let pattern = tmp.path().join("config.nix").to_string_lossy().to_string();
1691        let c = ValidateArgs {
1692            globs: vec![pattern],
1693            exclude: vec![],
1694            cache_dir: None,
1695            force_schema_fetch: true,
1696            force_validation: true,
1697            no_catalog: true,
1698            config_dir: Some(tmp.path().to_path_buf()),
1699            schema_cache_ttl: None,
1700        };
1701        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1702        assert!(!result.has_errors());
1703        assert_eq!(result.files_checked(), 0);
1704        Ok(())
1705    }
1706
1707    #[tokio::test]
1708    async fn unrecognized_extension_parsed_when_catalog_matches() -> anyhow::Result<()> {
1709        let tmp = tempfile::tempdir()?;
1710        let cache_tmp = tempfile::tempdir()?;
1711        // File has .cfg extension (unrecognized) but content is valid JSON
1712        fs::write(
1713            tmp.path().join("myapp.cfg"),
1714            r#"{"name":"hello","on":"push","jobs":{"build":{}}}"#,
1715        )?;
1716
1717        let catalog_json = r#"{"schemas":[{
1718            "name": "MyApp Config",
1719            "url": "https://example.com/myapp.schema.json",
1720            "fileMatch": ["*.cfg"]
1721        }]}"#;
1722        let schema =
1723            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1724
1725        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1726        let client = mock(&[
1727            (
1728                "https://www.schemastore.org/api/json/catalog.json",
1729                catalog_json,
1730            ),
1731            ("https://example.com/myapp.schema.json", schema),
1732        ]);
1733        let c = ValidateArgs {
1734            globs: vec![pattern],
1735            exclude: vec![],
1736            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1737            force_schema_fetch: true,
1738            force_validation: true,
1739            no_catalog: false,
1740            config_dir: Some(tmp.path().to_path_buf()),
1741            schema_cache_ttl: None,
1742        };
1743        let result = run_with(&c, Some(client), |_| {}).await?;
1744        assert!(!result.has_errors());
1745        assert_eq!(result.files_checked(), 1);
1746        Ok(())
1747    }
1748
1749    #[tokio::test]
1750    async fn unrecognized_extension_unparseable_skipped() -> anyhow::Result<()> {
1751        let tmp = tempfile::tempdir()?;
1752        let cache_tmp = tempfile::tempdir()?;
1753        // File matches catalog but content isn't parseable by any format
1754        fs::write(
1755            tmp.path().join("myapp.cfg"),
1756            "{ pkgs, ... }: { packages = [ pkgs.git ]; }",
1757        )?;
1758
1759        let catalog_json = r#"{"schemas":[{
1760            "name": "MyApp Config",
1761            "url": "https://example.com/myapp.schema.json",
1762            "fileMatch": ["*.cfg"]
1763        }]}"#;
1764
1765        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1766        let client = mock(&[(
1767            "https://www.schemastore.org/api/json/catalog.json",
1768            catalog_json,
1769        )]);
1770        let c = ValidateArgs {
1771            globs: vec![pattern],
1772            exclude: vec![],
1773            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1774            force_schema_fetch: true,
1775            force_validation: true,
1776            no_catalog: false,
1777            config_dir: Some(tmp.path().to_path_buf()),
1778            schema_cache_ttl: None,
1779        };
1780        let result = run_with(&c, Some(client), |_| {}).await?;
1781        assert!(!result.has_errors());
1782        assert_eq!(result.files_checked(), 0);
1783        Ok(())
1784    }
1785
1786    #[tokio::test]
1787    async fn unrecognized_extension_invalid_against_schema() -> anyhow::Result<()> {
1788        let tmp = tempfile::tempdir()?;
1789        let cache_tmp = tempfile::tempdir()?;
1790        // File has .cfg extension, content is valid JSON but fails schema validation
1791        fs::write(tmp.path().join("myapp.cfg"), r#"{"wrong":"field"}"#)?;
1792
1793        let catalog_json = r#"{"schemas":[{
1794            "name": "MyApp Config",
1795            "url": "https://example.com/myapp.schema.json",
1796            "fileMatch": ["*.cfg"]
1797        }]}"#;
1798        let schema =
1799            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1800
1801        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1802        let client = mock(&[
1803            (
1804                "https://www.schemastore.org/api/json/catalog.json",
1805                catalog_json,
1806            ),
1807            ("https://example.com/myapp.schema.json", schema),
1808        ]);
1809        let c = ValidateArgs {
1810            globs: vec![pattern],
1811            exclude: vec![],
1812            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1813            force_schema_fetch: true,
1814            force_validation: true,
1815            no_catalog: false,
1816            config_dir: Some(tmp.path().to_path_buf()),
1817            schema_cache_ttl: None,
1818        };
1819        let result = run_with(&c, Some(client), |_| {}).await?;
1820        assert!(result.has_errors());
1821        assert_eq!(result.files_checked(), 1);
1822        Ok(())
1823    }
1824
1825    // --- Validation cache ---
1826
1827    #[tokio::test]
1828    async fn validation_cache_hit_skips_revalidation() -> anyhow::Result<()> {
1829        let tmp = tempfile::tempdir()?;
1830        let schema_path = tmp.path().join("schema.json");
1831        fs::write(&schema_path, SCHEMA)?;
1832
1833        let f = tmp.path().join("valid.json");
1834        fs::write(
1835            &f,
1836            format!(
1837                r#"{{"$schema":"{}","name":"hello"}}"#,
1838                schema_path.to_string_lossy()
1839            ),
1840        )?;
1841
1842        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1843
1844        // First run: force_validation = false so results get cached
1845        let c = ValidateArgs {
1846            globs: vec![pattern.clone()],
1847            exclude: vec![],
1848            cache_dir: None,
1849            force_schema_fetch: true,
1850            force_validation: false,
1851            no_catalog: true,
1852            config_dir: None,
1853            schema_cache_ttl: None,
1854        };
1855        let mut first_statuses = Vec::new();
1856        let result = run_with(&c, Some(mock(&[])), |cf| {
1857            first_statuses.push(cf.validation_cache_status);
1858        })
1859        .await?;
1860        assert!(!result.has_errors());
1861        assert!(result.files_checked() > 0);
1862
1863        // Verify the first run recorded a validation cache miss
1864        assert!(
1865            first_statuses.contains(&Some(ValidationCacheStatus::Miss)),
1866            "expected at least one validation cache miss on first run"
1867        );
1868
1869        // Second run: same file, same schema — should hit validation cache
1870        let mut second_statuses = Vec::new();
1871        let result = run_with(&c, Some(mock(&[])), |cf| {
1872            second_statuses.push(cf.validation_cache_status);
1873        })
1874        .await?;
1875        assert!(!result.has_errors());
1876
1877        // Verify the second run got a validation cache hit
1878        assert!(
1879            second_statuses.contains(&Some(ValidationCacheStatus::Hit)),
1880            "expected at least one validation cache hit on second run"
1881        );
1882        Ok(())
1883    }
1884
1885    // --- clean_error_message ---
1886
1887    #[test]
1888    fn clean_strips_anyof_value() {
1889        let msg =
1890            r#"{"type":"bad"} is not valid under any of the schemas listed in the 'anyOf' keyword"#;
1891        assert_eq!(
1892            clean_error_message(msg.to_string()),
1893            "not valid under any of the schemas listed in the 'anyOf' keyword"
1894        );
1895    }
1896
1897    #[test]
1898    fn clean_strips_oneof_value() {
1899        let msg = r#"{"runs-on":"ubuntu-latest","steps":[]} is not valid under any of the schemas listed in the 'oneOf' keyword"#;
1900        assert_eq!(
1901            clean_error_message(msg.to_string()),
1902            "not valid under any of the schemas listed in the 'oneOf' keyword"
1903        );
1904    }
1905
1906    #[test]
1907    fn clean_strips_long_value() {
1908        let long_value = "x".repeat(5000);
1909        let suffix = " is not valid under any of the schemas listed in the 'anyOf' keyword";
1910        let msg = format!("{long_value}{suffix}");
1911        assert_eq!(
1912            clean_error_message(msg),
1913            "not valid under any of the schemas listed in the 'anyOf' keyword"
1914        );
1915    }
1916
1917    #[test]
1918    fn clean_preserves_type_error() {
1919        let msg = r#"12345 is not of types "null", "string""#;
1920        assert_eq!(clean_error_message(msg.to_string()), msg);
1921    }
1922
1923    #[test]
1924    fn clean_preserves_required_property() {
1925        let msg = "\"name\" is a required property";
1926        assert_eq!(clean_error_message(msg.to_string()), msg);
1927    }
1928}