Skip to main content

lintel_validate/
validate.rs

1use alloc::collections::BTreeMap;
2use std::collections::HashMap;
3use std::fs;
4use std::path::{Path, PathBuf};
5
6use anyhow::{Context, Result};
7use glob::glob;
8use serde_json::Value;
9
10use crate::catalog;
11use lintel_schema_cache::{CacheStatus, SchemaCache};
12use lintel_validation_cache::{ValidationCacheStatus, ValidationError};
13use schemastore::CompiledCatalog;
14
15use crate::diagnostics::{DEFAULT_LABEL, find_instance_path_span, format_label};
16use crate::discover;
17use crate::parsers::{self, FileFormat, JsoncParser, Parser};
18use crate::registry;
19
/// Upper bound on the number of files read at the same time.
///
/// Kept deliberately low so that concurrent reads never run the process out
/// of file descriptors: 128 sits well under the default soft limits on macOS
/// (256) and Linux (1024) yet still keeps I/O throughput high.
const FD_CONCURRENCY_LIMIT: usize = 128;
24
/// Options controlling a validation run.
///
/// The [`Default`] value has every list empty, every flag `false` and every
/// optional setting `None`, which (per the field docs below) means
/// auto-discovering files with no excludes.
#[derive(Debug, Clone, Default)]
pub struct ValidateArgs {
    /// Glob patterns to find files (empty = auto-discover)
    pub globs: Vec<String>,

    /// Exclude files matching these globs (repeatable)
    pub exclude: Vec<String>,

    /// Cache directory for remote schemas
    pub cache_dir: Option<String>,

    /// Bypass schema cache reads (still writes fetched schemas to cache)
    pub force_schema_fetch: bool,

    /// Bypass validation cache reads (still writes results to cache)
    pub force_validation: bool,

    /// Disable `SchemaStore` catalog matching
    pub no_catalog: bool,

    /// Directory to search for `lintel.toml` (defaults to cwd)
    pub config_dir: Option<PathBuf>,

    /// TTL for cached schemas. `None` means no expiry.
    pub schema_cache_ttl: Option<core::time::Duration>,
}
50
51/// Re-exported from [`crate::diagnostics::LintError`] so callers can use
52/// `lintel_validate::validate::LintError` without importing diagnostics.
53pub use crate::diagnostics::LintError;
54
55/// A file that was checked and the schema it resolved to.
56pub struct CheckedFile {
57    pub path: String,
58    pub schema: String,
59    /// `None` for local schemas and builtins; `Some` for remote schemas.
60    pub cache_status: Option<CacheStatus>,
61    /// `None` when validation caching is not applicable; `Some` for validation cache hits/misses.
62    pub validation_cache_status: Option<ValidationCacheStatus>,
63}
64
65/// Result of a validation run.
66pub struct ValidateResult {
67    pub errors: Vec<LintError>,
68    pub checked: Vec<CheckedFile>,
69}
70
71impl ValidateResult {
72    pub fn has_errors(&self) -> bool {
73        !self.errors.is_empty()
74    }
75
76    pub fn files_checked(&self) -> usize {
77        self.checked.len()
78    }
79}
80
81// ---------------------------------------------------------------------------
82// Internal types
83// ---------------------------------------------------------------------------
84
85/// A file that has been parsed and matched to a schema URI.
86struct ParsedFile {
87    path: String,
88    content: String,
89    instance: Value,
90    /// Original schema URI before rewrites (for override matching).
91    original_schema_uri: String,
92}
93
94// ---------------------------------------------------------------------------
95// Config loading
96// ---------------------------------------------------------------------------
97
98/// Locate `lintel.toml`, load the full config, and return the config directory.
99/// Returns `(config, config_dir, config_path)`.  When no config is found or
100/// cwd is unavailable the config is default and `config_path` is `None`.
101#[tracing::instrument(skip_all)]
102pub fn load_config(search_dir: Option<&Path>) -> (lintel_config::Config, PathBuf, Option<PathBuf>) {
103    let start_dir = match search_dir {
104        Some(d) => d.to_path_buf(),
105        None => match std::env::current_dir() {
106            Ok(d) => d,
107            Err(_) => return (lintel_config::Config::default(), PathBuf::from("."), None),
108        },
109    };
110
111    let Some(config_path) = lintel_config::find_config_path(&start_dir) else {
112        return (lintel_config::Config::default(), start_dir, None);
113    };
114
115    let dir = config_path.parent().unwrap_or(&start_dir).to_path_buf();
116    let cfg = lintel_config::find_and_load(&start_dir)
117        .ok()
118        .flatten()
119        .unwrap_or_default();
120    (cfg, dir, Some(config_path))
121}
122
123// ---------------------------------------------------------------------------
124// File collection
125// ---------------------------------------------------------------------------
126
127/// Collect input files from globs/directories, applying exclude filters.
128///
129/// # Errors
130///
131/// Returns an error if a glob pattern is invalid or a directory cannot be walked.
132#[tracing::instrument(skip_all, fields(glob_count = globs.len(), exclude_count = exclude.len()))]
133pub fn collect_files(globs: &[String], exclude: &[String]) -> Result<Vec<PathBuf>> {
134    if globs.is_empty() {
135        return discover::discover_files(".", exclude);
136    }
137
138    let mut result = Vec::new();
139    for pattern in globs {
140        let path = Path::new(pattern);
141        if path.is_dir() {
142            result.extend(discover::discover_files(pattern, exclude)?);
143        } else {
144            for entry in glob(pattern).with_context(|| format!("invalid glob: {pattern}"))? {
145                let path = entry?;
146                if path.is_file() && !is_excluded(&path, exclude) {
147                    result.push(path);
148                }
149            }
150        }
151    }
152    Ok(result)
153}
154
155fn is_excluded(path: &Path, excludes: &[String]) -> bool {
156    let path_str = match path.to_str() {
157        Some(s) => s.strip_prefix("./").unwrap_or(s),
158        None => return false,
159    };
160    excludes
161        .iter()
162        .any(|pattern| glob_match::glob_match(pattern, path_str))
163}
164
165// ---------------------------------------------------------------------------
166// Phase 1: Parse files and resolve schema URIs
167// ---------------------------------------------------------------------------
168
169/// Try parsing content with each known format, returning the first success.
170///
171/// JSONC is tried first (superset of JSON, handles comments), then YAML and
172/// TOML which cover the most common config formats, followed by the rest.
173pub fn try_parse_all(content: &str, file_name: &str) -> Option<(parsers::FileFormat, Value)> {
174    use parsers::FileFormat::{Json, Json5, Jsonc, Markdown, Toml, Yaml};
175    const FORMATS: [parsers::FileFormat; 6] = [Jsonc, Yaml, Toml, Json, Json5, Markdown];
176
177    for fmt in FORMATS {
178        let parser = parsers::parser_for(fmt);
179        if let Ok(val) = parser.parse(content, file_name) {
180            return Some((fmt, val));
181        }
182    }
183    None
184}
185
186/// Result of processing a single file: either a parsed file with its schema URI,
187/// a lint error, or nothing (file was skipped).
188enum FileResult {
189    Parsed {
190        schema_uri: String,
191        parsed: ParsedFile,
192    },
193    Error(LintError),
194    Skip,
195}
196
197/// Process a single file's already-read content: parse and resolve schema URI.
198#[allow(clippy::too_many_arguments)]
199fn process_one_file(
200    path: &Path,
201    content: String,
202    config: &lintel_config::Config,
203    config_dir: &Path,
204    compiled_catalogs: &[CompiledCatalog],
205) -> FileResult {
206    let path_str = path.display().to_string();
207    let file_name = path
208        .file_name()
209        .and_then(|n| n.to_str())
210        .unwrap_or(&path_str);
211
212    let detected_format = parsers::detect_format(path);
213
214    // For unrecognized extensions, only proceed if a catalog or config mapping matches.
215    if detected_format.is_none() {
216        let has_match = config.find_schema_mapping(&path_str, file_name).is_some()
217            || compiled_catalogs
218                .iter()
219                .any(|cat| cat.find_schema(&path_str, file_name).is_some());
220        if !has_match {
221            return FileResult::Skip;
222        }
223    }
224
225    // Parse the file content.
226    let (parser, instance): (Box<dyn Parser>, Value) = if let Some(fmt) = detected_format {
227        let parser = parsers::parser_for(fmt);
228        match parser.parse(&content, &path_str) {
229            Ok(val) => (parser, val),
230            Err(parse_err) => {
231                // JSONC fallback for .json files that match a catalog entry.
232                if fmt == FileFormat::Json
233                    && compiled_catalogs
234                        .iter()
235                        .any(|cat| cat.find_schema(&path_str, file_name).is_some())
236                {
237                    match JsoncParser.parse(&content, &path_str) {
238                        Ok(val) => (parsers::parser_for(FileFormat::Jsonc), val),
239                        Err(jsonc_err) => return FileResult::Error(jsonc_err.into()),
240                    }
241                } else {
242                    return FileResult::Error(parse_err.into());
243                }
244            }
245        }
246    } else {
247        match try_parse_all(&content, &path_str) {
248            Some((fmt, val)) => (parsers::parser_for(fmt), val),
249            None => return FileResult::Skip,
250        }
251    };
252
253    // Skip markdown files with no frontmatter
254    if instance.is_null() {
255        return FileResult::Skip;
256    }
257
258    // Schema resolution priority:
259    // 1. Inline $schema / YAML modeline (always wins)
260    // 2. Custom schema mappings from lintel.toml [schemas]
261    // 3. Catalog matching (custom registries > Lintel catalog > SchemaStore)
262    let schema_uri = parser
263        .extract_schema_uri(&content, &instance)
264        .or_else(|| {
265            config
266                .find_schema_mapping(&path_str, file_name)
267                .map(str::to_string)
268        })
269        .or_else(|| {
270            compiled_catalogs
271                .iter()
272                .find_map(|cat| cat.find_schema(&path_str, file_name))
273                .map(str::to_string)
274        });
275
276    let Some(schema_uri) = schema_uri else {
277        return FileResult::Skip;
278    };
279
280    // Keep original URI for override matching (before rewrites)
281    let original_schema_uri = schema_uri.clone();
282
283    // Apply rewrite rules, then resolve // paths relative to lintel.toml
284    let schema_uri = lintel_config::apply_rewrites(&schema_uri, &config.rewrite);
285    let schema_uri = lintel_config::resolve_double_slash(&schema_uri, config_dir);
286
287    // Resolve relative local paths against the file's parent directory.
288    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
289    let schema_uri = if is_remote {
290        schema_uri
291    } else {
292        path.parent()
293            .map(|parent| parent.join(&schema_uri).to_string_lossy().to_string())
294            .unwrap_or(schema_uri)
295    };
296
297    FileResult::Parsed {
298        schema_uri,
299        parsed: ParsedFile {
300            path: path_str,
301            content,
302            instance,
303            original_schema_uri,
304        },
305    }
306}
307
308/// Read each file concurrently with tokio, parse its content, extract its
309/// schema URI, apply rewrites, and group by resolved schema URI.
310#[tracing::instrument(skip_all, fields(file_count = files.len()))]
311#[allow(clippy::too_many_arguments)]
312async fn parse_and_group_files(
313    files: &[PathBuf],
314    config: &lintel_config::Config,
315    config_dir: &Path,
316    compiled_catalogs: &[CompiledCatalog],
317    errors: &mut Vec<LintError>,
318) -> BTreeMap<String, Vec<ParsedFile>> {
319    // Read all files concurrently using tokio async I/O, with a semaphore
320    // to avoid exhausting file descriptors on large directories.
321    let semaphore = alloc::sync::Arc::new(tokio::sync::Semaphore::new(FD_CONCURRENCY_LIMIT));
322    let mut read_set = tokio::task::JoinSet::new();
323    for path in files {
324        let path = path.clone();
325        let sem = semaphore.clone();
326        read_set.spawn(async move {
327            let _permit = sem.acquire().await.expect("semaphore closed");
328            let result = tokio::fs::read_to_string(&path).await;
329            (path, result)
330        });
331    }
332
333    let mut file_contents = Vec::with_capacity(files.len());
334    while let Some(result) = read_set.join_next().await {
335        match result {
336            Ok(item) => file_contents.push(item),
337            Err(e) => tracing::warn!("file read task panicked: {e}"),
338        }
339    }
340
341    // Process files: parse content and resolve schema URIs.
342    let mut schema_groups: BTreeMap<String, Vec<ParsedFile>> = BTreeMap::new();
343    for (path, content_result) in file_contents {
344        let content = match content_result {
345            Ok(c) => c,
346            Err(e) => {
347                errors.push(LintError::Io {
348                    path: path.display().to_string(),
349                    message: format!("failed to read: {e}"),
350                });
351                continue;
352            }
353        };
354        let result = process_one_file(&path, content, config, config_dir, compiled_catalogs);
355        match result {
356            FileResult::Parsed { schema_uri, parsed } => {
357                schema_groups.entry(schema_uri).or_default().push(parsed);
358            }
359            FileResult::Error(e) => errors.push(e),
360            FileResult::Skip => {}
361        }
362    }
363
364    schema_groups
365}
366
367// ---------------------------------------------------------------------------
368// Phase 2: Schema fetching, compilation, and instance validation
369// ---------------------------------------------------------------------------
370
371/// Fetch a schema by URI, returning its parsed JSON and cache status.
372///
373/// For remote URIs, checks the prefetched map first; for local URIs, reads
374/// from disk (with in-memory caching to avoid redundant I/O for shared schemas).
375#[allow(clippy::too_many_arguments)]
376async fn fetch_schema_from_prefetched(
377    schema_uri: &str,
378    prefetched: &HashMap<String, Result<(Value, CacheStatus), String>>,
379    local_cache: &mut HashMap<String, Value>,
380    group: &[ParsedFile],
381    errors: &mut Vec<LintError>,
382    checked: &mut Vec<CheckedFile>,
383    on_check: &mut impl FnMut(&CheckedFile),
384) -> Option<(Value, Option<CacheStatus>)> {
385    let is_remote = schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
386
387    let result: Result<(Value, Option<CacheStatus>), String> = if is_remote {
388        match prefetched.get(schema_uri) {
389            Some(Ok((v, status))) => Ok((v.clone(), Some(*status))),
390            Some(Err(e)) => Err(format!("failed to fetch schema: {schema_uri}: {e}")),
391            None => Err(format!("schema not prefetched: {schema_uri}")),
392        }
393    } else if let Some(cached) = local_cache.get(schema_uri) {
394        Ok((cached.clone(), None))
395    } else {
396        tokio::fs::read_to_string(schema_uri)
397            .await
398            .map_err(|e| format!("failed to read local schema {schema_uri}: {e}"))
399            .and_then(|content| {
400                serde_json::from_str::<Value>(&content)
401                    .map(|v| {
402                        local_cache.insert(schema_uri.to_string(), v.clone());
403                        (v, None)
404                    })
405                    .map_err(|e| format!("failed to parse local schema {schema_uri}: {e}"))
406            })
407    };
408
409    match result {
410        Ok(value) => Some(value),
411        Err(message) => {
412            report_group_error(
413                |path| LintError::SchemaFetch {
414                    path: path.to_string(),
415                    message: message.clone(),
416                },
417                schema_uri,
418                None,
419                group,
420                errors,
421                checked,
422                on_check,
423            );
424            None
425        }
426    }
427}
428
429/// Report the same error for every file in a schema group.
430#[allow(clippy::too_many_arguments)]
431fn report_group_error<P: alloc::borrow::Borrow<ParsedFile>>(
432    make_error: impl Fn(&str) -> LintError,
433    schema_uri: &str,
434    cache_status: Option<CacheStatus>,
435    group: &[P],
436    errors: &mut Vec<LintError>,
437    checked: &mut Vec<CheckedFile>,
438    on_check: &mut impl FnMut(&CheckedFile),
439) {
440    for item in group {
441        let pf = item.borrow();
442        let cf = CheckedFile {
443            path: pf.path.clone(),
444            schema: schema_uri.to_string(),
445            cache_status,
446            validation_cache_status: None,
447        };
448        on_check(&cf);
449        checked.push(cf);
450        errors.push(make_error(&pf.path));
451    }
452}
453
454/// Mark every file in a group as checked (no errors).
455#[allow(clippy::too_many_arguments)]
456fn mark_group_checked<P: alloc::borrow::Borrow<ParsedFile>>(
457    schema_uri: &str,
458    cache_status: Option<CacheStatus>,
459    validation_cache_status: Option<ValidationCacheStatus>,
460    group: &[P],
461    checked: &mut Vec<CheckedFile>,
462    on_check: &mut impl FnMut(&CheckedFile),
463) {
464    for item in group {
465        let pf = item.borrow();
466        let cf = CheckedFile {
467            path: pf.path.clone(),
468            schema: schema_uri.to_string(),
469            cache_status,
470            validation_cache_status,
471        };
472        on_check(&cf);
473        checked.push(cf);
474    }
475}
476
/// Clean up error messages from the `jsonschema` crate.
///
/// For `anyOf`/`oneOf` failures the crate dumps the entire JSON value into the
/// message (e.g. `{...} is not valid under any of the schemas listed in the 'oneOf' keyword`).
/// The source snippet already shows the value, so we strip the redundant prefix
/// and keep only `"not valid under any of the schemas listed in the 'oneOf' keyword"`.
///
/// The dumped value comes first and may itself contain the marker text (for
/// example inside a string), so the *last* occurrence of the marker is the one
/// that is used.
///
/// All other messages are returned unchanged.
fn clean_error_message(msg: String) -> String {
    const MARKER: &str = " is not valid under any of the schemas listed in the '";
    // Length of the leading " is " that is dropped so the result starts at "not valid...".
    const SKIP: usize = " is ".len();

    match msg.rfind(MARKER) {
        // MARKER is ASCII, so `pos + SKIP` is always a char boundary.
        Some(pos) => msg[pos + SKIP..].to_string(),
        None => msg,
    }
}
493
494/// Convert [`ValidationError`]s into [`LintError::Validation`] diagnostics.
495fn push_validation_errors(
496    pf: &ParsedFile,
497    schema_url: &str,
498    validation_errors: &[ValidationError],
499    errors: &mut Vec<LintError>,
500) {
501    for ve in validation_errors {
502        let span = find_instance_path_span(&pf.content, &ve.instance_path);
503        let instance_path = if ve.instance_path.is_empty() {
504            DEFAULT_LABEL.to_string()
505        } else {
506            ve.instance_path.clone()
507        };
508        let label = format_label(&instance_path, &ve.schema_path);
509        let source_span: miette::SourceSpan = span.into();
510        errors.push(LintError::Validation {
511            src: miette::NamedSource::new(&pf.path, pf.content.clone()),
512            span: source_span,
513            schema_span: source_span,
514            path: pf.path.clone(),
515            instance_path,
516            label,
517            message: ve.message.clone(),
518            schema_url: schema_url.to_string(),
519            schema_path: ve.schema_path.clone(),
520        });
521    }
522}
523
524/// Validate all files in a group against an already-compiled validator and store
525/// results in the validation cache.
526#[tracing::instrument(skip_all, fields(schema_uri, file_count = group.len()))]
527#[allow(clippy::too_many_arguments)]
528async fn validate_group<P: alloc::borrow::Borrow<ParsedFile>>(
529    validator: &jsonschema::Validator,
530    schema_uri: &str,
531    schema_hash: &str,
532    validate_formats: bool,
533    cache_status: Option<CacheStatus>,
534    group: &[P],
535    vcache: &lintel_validation_cache::ValidationCache,
536    errors: &mut Vec<LintError>,
537    checked: &mut Vec<CheckedFile>,
538    on_check: &mut impl FnMut(&CheckedFile),
539) {
540    for item in group {
541        let pf = item.borrow();
542        let file_errors: Vec<ValidationError> = validator
543            .iter_errors(&pf.instance)
544            .map(|error| ValidationError {
545                instance_path: error.instance_path().to_string(),
546                message: clean_error_message(error.to_string()),
547                schema_path: error.schema_path().to_string(),
548            })
549            .collect();
550
551        vcache
552            .store(
553                &lintel_validation_cache::CacheKey {
554                    file_content: &pf.content,
555                    schema_hash,
556                    validate_formats,
557                },
558                &file_errors,
559            )
560            .await;
561        push_validation_errors(pf, schema_uri, &file_errors, errors);
562
563        let cf = CheckedFile {
564            path: pf.path.clone(),
565            schema: schema_uri.to_string(),
566            cache_status,
567            validation_cache_status: Some(ValidationCacheStatus::Miss),
568        };
569        on_check(&cf);
570        checked.push(cf);
571    }
572}
573
574// ---------------------------------------------------------------------------
575// Public API
576// ---------------------------------------------------------------------------
577
578/// Fetch and compile all schema catalogs (default, `SchemaStore`, and custom registries).
579///
580/// Returns a list of compiled catalogs, printing warnings for any that fail to fetch.
581pub async fn fetch_compiled_catalogs(
582    retriever: &SchemaCache,
583    config: &lintel_config::Config,
584    no_catalog: bool,
585) -> Vec<CompiledCatalog> {
586    let mut compiled_catalogs = Vec::new();
587
588    if !no_catalog {
589        let catalog_span = tracing::info_span!("fetch_catalogs").entered();
590
591        // Catalogs are fetched concurrently but sorted by priority so that
592        // the Lintel catalog wins over custom registries, which win over
593        // SchemaStore.  The `order` field encodes this precedence.
594        #[allow(clippy::items_after_statements)]
595        type CatalogResult = (
596            usize, // priority (lower = higher precedence)
597            String,
598            Result<CompiledCatalog, Box<dyn core::error::Error + Send + Sync>>,
599        );
600        let mut catalog_tasks: tokio::task::JoinSet<CatalogResult> = tokio::task::JoinSet::new();
601
602        // Custom registries from lintel.toml (highest precedence among catalogs)
603        for (i, registry_url) in config.registries.iter().enumerate() {
604            let r = retriever.clone();
605            let url = registry_url.clone();
606            let label = format!("registry {url}");
607            catalog_tasks.spawn(async move {
608                let result = registry::fetch(&r, &url)
609                    .await
610                    .map(|cat| CompiledCatalog::compile(&cat));
611                (i, label, result)
612            });
613        }
614
615        // Lintel catalog
616        let lintel_order = config.registries.len();
617        if !config.no_default_catalog {
618            let r = retriever.clone();
619            let label = format!("default catalog {}", registry::DEFAULT_REGISTRY);
620            catalog_tasks.spawn(async move {
621                let result = registry::fetch(&r, registry::DEFAULT_REGISTRY)
622                    .await
623                    .map(|cat| CompiledCatalog::compile(&cat));
624                (lintel_order, label, result)
625            });
626        }
627
628        // SchemaStore catalog (lowest precedence)
629        let schemastore_order = config.registries.len() + 1;
630        let r = retriever.clone();
631        catalog_tasks.spawn(async move {
632            let result = catalog::fetch_catalog(&r)
633                .await
634                .map(|cat| CompiledCatalog::compile(&cat));
635            (schemastore_order, "SchemaStore catalog".to_string(), result)
636        });
637
638        let mut results: Vec<(usize, CompiledCatalog)> = Vec::new();
639        while let Some(result) = catalog_tasks.join_next().await {
640            match result {
641                Ok((order, _, Ok(compiled))) => results.push((order, compiled)),
642                Ok((_, label, Err(e))) => eprintln!("warning: failed to fetch {label}: {e}"),
643                Err(e) => eprintln!("warning: catalog fetch task failed: {e}"),
644            }
645        }
646        results.sort_by_key(|(order, _)| *order);
647        compiled_catalogs.extend(results.into_iter().map(|(_, cat)| cat));
648
649        drop(catalog_span);
650    }
651
652    compiled_catalogs
653}
654
655/// # Errors
656///
657/// Returns an error if file collection or schema validation encounters an I/O error.
658pub async fn run(args: &ValidateArgs) -> Result<ValidateResult> {
659    run_with(args, None, |_| {}).await
660}
661
662/// Like [`run`], but calls `on_check` each time a file is checked, allowing
663/// callers to stream progress (e.g. verbose output) as files are processed.
664///
665/// # Errors
666///
667/// Returns an error if file collection or schema validation encounters an I/O error.
668#[tracing::instrument(skip_all, name = "validate")]
669#[allow(clippy::too_many_lines)]
670pub async fn run_with(
671    args: &ValidateArgs,
672    cache: Option<SchemaCache>,
673    mut on_check: impl FnMut(&CheckedFile),
674) -> Result<ValidateResult> {
675    let retriever = if let Some(c) = cache {
676        c
677    } else {
678        let mut builder = SchemaCache::builder().force_fetch(args.force_schema_fetch);
679        if let Some(dir) = &args.cache_dir {
680            let path = PathBuf::from(dir);
681            let _ = fs::create_dir_all(&path);
682            builder = builder.cache_dir(path);
683        }
684        if let Some(ttl) = args.schema_cache_ttl {
685            builder = builder.ttl(ttl);
686        }
687        builder.build()
688    };
689
690    let (config, config_dir, _config_path) = load_config(args.config_dir.as_deref());
691    let files = collect_files(&args.globs, &args.exclude)?;
692    tracing::info!(file_count = files.len(), "collected files");
693
694    let compiled_catalogs = fetch_compiled_catalogs(&retriever, &config, args.no_catalog).await;
695
696    let mut errors: Vec<LintError> = Vec::new();
697    let mut checked: Vec<CheckedFile> = Vec::new();
698
699    // Phase 1: Parse files and resolve schema URIs
700    let schema_groups = parse_and_group_files(
701        &files,
702        &config,
703        &config_dir,
704        &compiled_catalogs,
705        &mut errors,
706    )
707    .await;
708    tracing::info!(
709        schema_count = schema_groups.len(),
710        total_files = schema_groups.values().map(Vec::len).sum::<usize>(),
711        "grouped files by schema"
712    );
713
714    // Create validation cache
715    let vcache = lintel_validation_cache::ValidationCache::new(
716        lintel_validation_cache::ensure_cache_dir(),
717        args.force_validation,
718    );
719
720    // Prefetch all remote schemas in parallel
721    let remote_uris: Vec<&String> = schema_groups
722        .keys()
723        .filter(|uri| uri.starts_with("http://") || uri.starts_with("https://"))
724        .collect();
725
726    let prefetched = {
727        let _prefetch_span =
728            tracing::info_span!("prefetch_schemas", count = remote_uris.len()).entered();
729
730        let mut schema_tasks = tokio::task::JoinSet::new();
731        for uri in remote_uris {
732            let r = retriever.clone();
733            let u = uri.clone();
734            schema_tasks.spawn(async move {
735                let result = r.fetch(&u).await;
736                (u, result)
737            });
738        }
739
740        let mut prefetched: HashMap<String, Result<(Value, CacheStatus), String>> = HashMap::new();
741        while let Some(result) = schema_tasks.join_next().await {
742            match result {
743                Ok((uri, fetch_result)) => {
744                    prefetched.insert(uri, fetch_result.map_err(|e| e.to_string()));
745                }
746                Err(e) => eprintln!("warning: schema prefetch task failed: {e}"),
747            }
748        }
749
750        prefetched
751    };
752
753    // Phase 2: Compile each schema once and validate all matching files
754    let mut local_schema_cache: HashMap<String, Value> = HashMap::new();
755    let mut fetch_time = core::time::Duration::ZERO;
756    let mut hash_time = core::time::Duration::ZERO;
757    let mut vcache_time = core::time::Duration::ZERO;
758    let mut compile_time = core::time::Duration::ZERO;
759    let mut validate_time = core::time::Duration::ZERO;
760
761    for (schema_uri, group) in &schema_groups {
762        let _group_span = tracing::debug_span!(
763            "schema_group",
764            schema = schema_uri.as_str(),
765            files = group.len(),
766        )
767        .entered();
768
769        // If ANY file in the group matches a `validate_formats = false` override,
770        // disable format validation for the whole group (they share one compiled validator).
771        let validate_formats = group.iter().all(|pf| {
772            config
773                .should_validate_formats(&pf.path, &[&pf.original_schema_uri, schema_uri.as_str()])
774        });
775
776        // Remote schemas were prefetched in parallel above; local schemas are
777        // read from disk here (with in-memory caching).
778        let t = std::time::Instant::now();
779        let Some((schema_value, cache_status)) = fetch_schema_from_prefetched(
780            schema_uri,
781            &prefetched,
782            &mut local_schema_cache,
783            group,
784            &mut errors,
785            &mut checked,
786            &mut on_check,
787        )
788        .await
789        else {
790            fetch_time += t.elapsed();
791            continue;
792        };
793        fetch_time += t.elapsed();
794
795        // Pre-compute schema hash once for the entire group.
796        let t = std::time::Instant::now();
797        let schema_hash = lintel_validation_cache::schema_hash(&schema_value);
798        hash_time += t.elapsed();
799
800        // Split the group into validation cache hits and misses.
801        let mut cache_misses: Vec<&ParsedFile> = Vec::new();
802
803        let t = std::time::Instant::now();
804        for pf in group {
805            let (cached, vcache_status) = vcache
806                .lookup(&lintel_validation_cache::CacheKey {
807                    file_content: &pf.content,
808                    schema_hash: &schema_hash,
809                    validate_formats,
810                })
811                .await;
812
813            if let Some(cached_errors) = cached {
814                push_validation_errors(pf, schema_uri, &cached_errors, &mut errors);
815                let cf = CheckedFile {
816                    path: pf.path.clone(),
817                    schema: schema_uri.clone(),
818                    cache_status,
819                    validation_cache_status: Some(vcache_status),
820                };
821                on_check(&cf);
822                checked.push(cf);
823            } else {
824                cache_misses.push(pf);
825            }
826        }
827        vcache_time += t.elapsed();
828
829        tracing::debug!(
830            cache_hits = group.len() - cache_misses.len(),
831            cache_misses = cache_misses.len(),
832            "validation cache"
833        );
834
835        // If all files hit the validation cache, skip schema compilation entirely.
836        if cache_misses.is_empty() {
837            continue;
838        }
839
840        // Compile the schema for cache misses.
841        let t = std::time::Instant::now();
842        let validator = {
843            // Set base URI for remote schemas so relative $ref values
844            // (e.g. "./rule.json") resolve correctly.
845            let is_remote_schema =
846                schema_uri.starts_with("http://") || schema_uri.starts_with("https://");
847            let opts = jsonschema::async_options()
848                .with_retriever(retriever.clone())
849                .should_validate_formats(validate_formats);
850            let opts = if is_remote_schema {
851                opts.with_base_uri(schema_uri.clone())
852            } else {
853                opts
854            };
855            match opts.build(&schema_value).await {
856                Ok(v) => v,
857                Err(e) => {
858                    compile_time += t.elapsed();
859                    // When format validation is disabled and the compilation error
860                    // is a uri-reference issue (e.g. Rust-style $ref paths in
861                    // vector.json), skip validation silently.
862                    if !validate_formats && e.to_string().contains("uri-reference") {
863                        mark_group_checked(
864                            schema_uri,
865                            cache_status,
866                            Some(ValidationCacheStatus::Miss),
867                            &cache_misses,
868                            &mut checked,
869                            &mut on_check,
870                        );
871                        continue;
872                    }
873                    let msg = format!("failed to compile schema: {e}");
874                    report_group_error(
875                        |path| LintError::SchemaCompile {
876                            path: path.to_string(),
877                            message: msg.clone(),
878                        },
879                        schema_uri,
880                        cache_status,
881                        &cache_misses,
882                        &mut errors,
883                        &mut checked,
884                        &mut on_check,
885                    );
886                    continue;
887                }
888            }
889        };
890        compile_time += t.elapsed();
891
892        let t = std::time::Instant::now();
893        validate_group(
894            &validator,
895            schema_uri,
896            &schema_hash,
897            validate_formats,
898            cache_status,
899            &cache_misses,
900            &vcache,
901            &mut errors,
902            &mut checked,
903            &mut on_check,
904        )
905        .await;
906        validate_time += t.elapsed();
907    }
908
909    #[allow(clippy::cast_possible_truncation)]
910    {
911        tracing::info!(
912            fetch_ms = fetch_time.as_millis() as u64,
913            hash_ms = hash_time.as_millis() as u64,
914            vcache_ms = vcache_time.as_millis() as u64,
915            compile_ms = compile_time.as_millis() as u64,
916            validate_ms = validate_time.as_millis() as u64,
917            "phase2 breakdown"
918        );
919    }
920
921    // Sort errors for deterministic output (by path, then by span offset)
922    errors.sort_by(|a, b| {
923        a.path()
924            .cmp(b.path())
925            .then_with(|| a.offset().cmp(&b.offset()))
926    });
927
928    Ok(ValidateResult { errors, checked })
929}
930
931#[cfg(test)]
932mod tests {
933    use super::*;
934    use lintel_schema_cache::SchemaCache;
935    use std::path::Path;
936
937    fn mock(entries: &[(&str, &str)]) -> SchemaCache {
938        let cache = SchemaCache::memory();
939        for (uri, body) in entries {
940            cache.insert(
941                uri,
942                serde_json::from_str(body).expect("test mock: invalid JSON"),
943            );
944        }
945        cache
946    }
947
948    fn testdata() -> PathBuf {
949        Path::new(env!("CARGO_MANIFEST_DIR")).join("testdata")
950    }
951
952    /// Build glob patterns that scan one or more testdata directories for all supported file types.
953    fn scenario_globs(dirs: &[&str]) -> Vec<String> {
954        dirs.iter()
955            .flat_map(|dir| {
956                let base = testdata().join(dir);
957                vec![
958                    base.join("*.json").to_string_lossy().to_string(),
959                    base.join("*.yaml").to_string_lossy().to_string(),
960                    base.join("*.yml").to_string_lossy().to_string(),
961                    base.join("*.json5").to_string_lossy().to_string(),
962                    base.join("*.jsonc").to_string_lossy().to_string(),
963                    base.join("*.toml").to_string_lossy().to_string(),
964                ]
965            })
966            .collect()
967    }
968
969    fn args_for_dirs(dirs: &[&str]) -> ValidateArgs {
970        ValidateArgs {
971            globs: scenario_globs(dirs),
972            exclude: vec![],
973            cache_dir: None,
974            force_schema_fetch: true,
975            force_validation: true,
976            no_catalog: true,
977            config_dir: None,
978            schema_cache_ttl: None,
979        }
980    }
981
    /// JSON Schema shared by the tests: an object whose `name` property must
    /// be a string and is required.
    const SCHEMA: &str =
        r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
984
985    fn schema_mock() -> SchemaCache {
986        mock(&[("https://example.com/schema.json", SCHEMA)])
987    }
988
989    // --- Directory scanning tests ---
990
991    #[tokio::test]
992    async fn no_matching_files() -> anyhow::Result<()> {
993        let tmp = tempfile::tempdir()?;
994        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
995        let c = ValidateArgs {
996            globs: vec![pattern],
997            exclude: vec![],
998            cache_dir: None,
999            force_schema_fetch: true,
1000            force_validation: true,
1001            no_catalog: true,
1002            config_dir: None,
1003            schema_cache_ttl: None,
1004        };
1005        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1006        assert!(!result.has_errors());
1007        Ok(())
1008    }
1009
1010    #[tokio::test]
1011    async fn dir_all_valid() -> anyhow::Result<()> {
1012        let c = args_for_dirs(&["positive_tests"]);
1013        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1014        assert!(!result.has_errors());
1015        Ok(())
1016    }
1017
1018    #[tokio::test]
1019    async fn dir_all_invalid() -> anyhow::Result<()> {
1020        let c = args_for_dirs(&["negative_tests"]);
1021        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1022        assert!(result.has_errors());
1023        Ok(())
1024    }
1025
1026    #[tokio::test]
1027    async fn dir_mixed_valid_and_invalid() -> anyhow::Result<()> {
1028        let c = args_for_dirs(&["positive_tests", "negative_tests"]);
1029        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1030        assert!(result.has_errors());
1031        Ok(())
1032    }
1033
1034    #[tokio::test]
1035    async fn dir_no_schemas_skipped() -> anyhow::Result<()> {
1036        let c = args_for_dirs(&["no_schema"]);
1037        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1038        assert!(!result.has_errors());
1039        Ok(())
1040    }
1041
1042    #[tokio::test]
1043    async fn dir_valid_with_no_schema_files() -> anyhow::Result<()> {
1044        let c = args_for_dirs(&["positive_tests", "no_schema"]);
1045        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1046        assert!(!result.has_errors());
1047        Ok(())
1048    }
1049
1050    // --- Directory as positional arg ---
1051
1052    #[tokio::test]
1053    async fn directory_arg_discovers_files() -> anyhow::Result<()> {
1054        let dir = testdata().join("positive_tests");
1055        let c = ValidateArgs {
1056            globs: vec![dir.to_string_lossy().to_string()],
1057            exclude: vec![],
1058            cache_dir: None,
1059            force_schema_fetch: true,
1060            force_validation: true,
1061            no_catalog: true,
1062            config_dir: None,
1063            schema_cache_ttl: None,
1064        };
1065        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1066        assert!(!result.has_errors());
1067        assert!(result.files_checked() > 0);
1068        Ok(())
1069    }
1070
1071    #[tokio::test]
1072    async fn multiple_directory_args() -> anyhow::Result<()> {
1073        let pos_dir = testdata().join("positive_tests");
1074        let no_schema_dir = testdata().join("no_schema");
1075        let c = ValidateArgs {
1076            globs: vec![
1077                pos_dir.to_string_lossy().to_string(),
1078                no_schema_dir.to_string_lossy().to_string(),
1079            ],
1080            exclude: vec![],
1081            cache_dir: None,
1082            force_schema_fetch: true,
1083            force_validation: true,
1084            no_catalog: true,
1085            config_dir: None,
1086            schema_cache_ttl: None,
1087        };
1088        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1089        assert!(!result.has_errors());
1090        Ok(())
1091    }
1092
1093    #[tokio::test]
1094    async fn mix_directory_and_glob_args() -> anyhow::Result<()> {
1095        let dir = testdata().join("positive_tests");
1096        let glob_pattern = testdata()
1097            .join("no_schema")
1098            .join("*.json")
1099            .to_string_lossy()
1100            .to_string();
1101        let c = ValidateArgs {
1102            globs: vec![dir.to_string_lossy().to_string(), glob_pattern],
1103            exclude: vec![],
1104            cache_dir: None,
1105            force_schema_fetch: true,
1106            force_validation: true,
1107            no_catalog: true,
1108            config_dir: None,
1109            schema_cache_ttl: None,
1110        };
1111        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1112        assert!(!result.has_errors());
1113        Ok(())
1114    }
1115
1116    #[tokio::test]
1117    async fn malformed_json_parse_error() -> anyhow::Result<()> {
1118        let base = testdata().join("malformed");
1119        let c = ValidateArgs {
1120            globs: vec![base.join("*.json").to_string_lossy().to_string()],
1121            exclude: vec![],
1122            cache_dir: None,
1123            force_schema_fetch: true,
1124            force_validation: true,
1125            no_catalog: true,
1126            config_dir: None,
1127            schema_cache_ttl: None,
1128        };
1129        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1130        assert!(result.has_errors());
1131        Ok(())
1132    }
1133
1134    #[tokio::test]
1135    async fn malformed_yaml_parse_error() -> anyhow::Result<()> {
1136        let base = testdata().join("malformed");
1137        let c = ValidateArgs {
1138            globs: vec![base.join("*.yaml").to_string_lossy().to_string()],
1139            exclude: vec![],
1140            cache_dir: None,
1141            force_schema_fetch: true,
1142            force_validation: true,
1143            no_catalog: true,
1144            config_dir: None,
1145            schema_cache_ttl: None,
1146        };
1147        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1148        assert!(result.has_errors());
1149        Ok(())
1150    }
1151
1152    // --- Exclude filter ---
1153
1154    #[tokio::test]
1155    async fn exclude_filters_files_in_dir() -> anyhow::Result<()> {
1156        let base = testdata().join("negative_tests");
1157        let c = ValidateArgs {
1158            globs: scenario_globs(&["positive_tests", "negative_tests"]),
1159            exclude: vec![
1160                base.join("missing_name.json").to_string_lossy().to_string(),
1161                base.join("missing_name.toml").to_string_lossy().to_string(),
1162                base.join("missing_name.yaml").to_string_lossy().to_string(),
1163            ],
1164            cache_dir: None,
1165            force_schema_fetch: true,
1166            force_validation: true,
1167            no_catalog: true,
1168            config_dir: None,
1169            schema_cache_ttl: None,
1170        };
1171        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1172        assert!(!result.has_errors());
1173        Ok(())
1174    }
1175
1176    // --- Cache options ---
1177
1178    #[tokio::test]
1179    async fn custom_cache_dir() -> anyhow::Result<()> {
1180        let c = ValidateArgs {
1181            globs: scenario_globs(&["positive_tests"]),
1182            exclude: vec![],
1183            cache_dir: None,
1184            force_schema_fetch: true,
1185            force_validation: true,
1186            no_catalog: true,
1187            config_dir: None,
1188            schema_cache_ttl: None,
1189        };
1190        let result = run_with(&c, Some(schema_mock()), |_| {}).await?;
1191        assert!(!result.has_errors());
1192        Ok(())
1193    }
1194
1195    // --- Local schema ---
1196
1197    #[tokio::test]
1198    async fn json_valid_with_local_schema() -> anyhow::Result<()> {
1199        let tmp = tempfile::tempdir()?;
1200        let schema_path = tmp.path().join("schema.json");
1201        fs::write(&schema_path, SCHEMA)?;
1202
1203        let f = tmp.path().join("valid.json");
1204        fs::write(
1205            &f,
1206            format!(
1207                r#"{{"$schema":"{}","name":"hello"}}"#,
1208                schema_path.to_string_lossy()
1209            ),
1210        )?;
1211
1212        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1213        let c = ValidateArgs {
1214            globs: vec![pattern],
1215            exclude: vec![],
1216            cache_dir: None,
1217            force_schema_fetch: true,
1218            force_validation: true,
1219            no_catalog: true,
1220            config_dir: None,
1221            schema_cache_ttl: None,
1222        };
1223        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1224        assert!(!result.has_errors());
1225        Ok(())
1226    }
1227
1228    #[tokio::test]
1229    async fn yaml_valid_with_local_schema() -> anyhow::Result<()> {
1230        let tmp = tempfile::tempdir()?;
1231        let schema_path = tmp.path().join("schema.json");
1232        fs::write(&schema_path, SCHEMA)?;
1233
1234        let f = tmp.path().join("valid.yaml");
1235        fs::write(
1236            &f,
1237            format!(
1238                "# yaml-language-server: $schema={}\nname: hello\n",
1239                schema_path.to_string_lossy()
1240            ),
1241        )?;
1242
1243        let pattern = tmp.path().join("*.yaml").to_string_lossy().to_string();
1244        let c = ValidateArgs {
1245            globs: vec![pattern],
1246            exclude: vec![],
1247            cache_dir: None,
1248            force_schema_fetch: true,
1249            force_validation: true,
1250            no_catalog: true,
1251            config_dir: None,
1252            schema_cache_ttl: None,
1253        };
1254        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1255        assert!(!result.has_errors());
1256        Ok(())
1257    }
1258
1259    #[tokio::test]
1260    async fn missing_local_schema_errors() -> anyhow::Result<()> {
1261        let tmp = tempfile::tempdir()?;
1262        let f = tmp.path().join("ref.json");
1263        fs::write(&f, r#"{"$schema":"/nonexistent/schema.json"}"#)?;
1264
1265        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1266        let c = ValidateArgs {
1267            globs: vec![pattern],
1268            exclude: vec![],
1269            cache_dir: None,
1270            force_schema_fetch: true,
1271            force_validation: true,
1272            no_catalog: true,
1273            config_dir: None,
1274            schema_cache_ttl: None,
1275        };
1276        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1277        assert!(result.has_errors());
1278        Ok(())
1279    }
1280
1281    // --- JSON5 / JSONC tests ---
1282
1283    #[tokio::test]
1284    async fn json5_valid_with_schema() -> anyhow::Result<()> {
1285        let tmp = tempfile::tempdir()?;
1286        let schema_path = tmp.path().join("schema.json");
1287        fs::write(&schema_path, SCHEMA)?;
1288
1289        let f = tmp.path().join("config.json5");
1290        fs::write(
1291            &f,
1292            format!(
1293                r#"{{
1294  // JSON5 comment
1295  "$schema": "{}",
1296  name: "hello",
1297}}"#,
1298                schema_path.to_string_lossy()
1299            ),
1300        )?;
1301
1302        let pattern = tmp.path().join("*.json5").to_string_lossy().to_string();
1303        let c = ValidateArgs {
1304            globs: vec![pattern],
1305            exclude: vec![],
1306            cache_dir: None,
1307            force_schema_fetch: true,
1308            force_validation: true,
1309            no_catalog: true,
1310            config_dir: None,
1311            schema_cache_ttl: None,
1312        };
1313        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1314        assert!(!result.has_errors());
1315        Ok(())
1316    }
1317
1318    #[tokio::test]
1319    async fn jsonc_valid_with_schema() -> anyhow::Result<()> {
1320        let tmp = tempfile::tempdir()?;
1321        let schema_path = tmp.path().join("schema.json");
1322        fs::write(&schema_path, SCHEMA)?;
1323
1324        let f = tmp.path().join("config.jsonc");
1325        fs::write(
1326            &f,
1327            format!(
1328                r#"{{
1329  /* JSONC comment */
1330  "$schema": "{}",
1331  "name": "hello"
1332}}"#,
1333                schema_path.to_string_lossy()
1334            ),
1335        )?;
1336
1337        let pattern = tmp.path().join("*.jsonc").to_string_lossy().to_string();
1338        let c = ValidateArgs {
1339            globs: vec![pattern],
1340            exclude: vec![],
1341            cache_dir: None,
1342            force_schema_fetch: true,
1343            force_validation: true,
1344            no_catalog: true,
1345            config_dir: None,
1346            schema_cache_ttl: None,
1347        };
1348        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1349        assert!(!result.has_errors());
1350        Ok(())
1351    }
1352
1353    // --- Catalog-based schema matching ---
1354
    /// Minimal stand-in for the GitHub workflow schema used by the catalog
    /// tests: an object that requires `on` and `jobs`, where `jobs` must be an
    /// object and the optional `name` must be a string.
    const GH_WORKFLOW_SCHEMA: &str = r#"{
        "type": "object",
        "properties": {
            "name": { "type": "string" },
            "on": {},
            "jobs": { "type": "object" }
        },
        "required": ["on", "jobs"]
    }"#;
1364
1365    fn gh_catalog_json() -> String {
1366        r#"{"version":1,"schemas":[{
1367            "name": "GitHub Workflow",
1368            "description": "GitHub Actions workflow",
1369            "url": "https://www.schemastore.org/github-workflow.json",
1370            "fileMatch": [
1371                "**/.github/workflows/*.yml",
1372                "**/.github/workflows/*.yaml"
1373            ]
1374        }]}"#
1375            .to_string()
1376    }
1377
1378    #[tokio::test]
1379    async fn catalog_matches_github_workflow_valid() -> anyhow::Result<()> {
1380        let tmp = tempfile::tempdir()?;
1381        let cache_tmp = tempfile::tempdir()?;
1382        let wf_dir = tmp.path().join(".github/workflows");
1383        fs::create_dir_all(&wf_dir)?;
1384        fs::write(
1385            wf_dir.join("ci.yml"),
1386            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1387        )?;
1388
1389        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1390        let client = mock(&[
1391            (
1392                "https://www.schemastore.org/api/json/catalog.json",
1393                &gh_catalog_json(),
1394            ),
1395            (
1396                "https://www.schemastore.org/github-workflow.json",
1397                GH_WORKFLOW_SCHEMA,
1398            ),
1399        ]);
1400        let c = ValidateArgs {
1401            globs: vec![pattern],
1402            exclude: vec![],
1403            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1404            force_schema_fetch: true,
1405            force_validation: true,
1406            no_catalog: false,
1407            config_dir: None,
1408            schema_cache_ttl: None,
1409        };
1410        let result = run_with(&c, Some(client), |_| {}).await?;
1411        assert!(!result.has_errors());
1412        Ok(())
1413    }
1414
1415    #[tokio::test]
1416    async fn catalog_matches_github_workflow_invalid() -> anyhow::Result<()> {
1417        let tmp = tempfile::tempdir()?;
1418        let cache_tmp = tempfile::tempdir()?;
1419        let wf_dir = tmp.path().join(".github/workflows");
1420        fs::create_dir_all(&wf_dir)?;
1421        fs::write(wf_dir.join("bad.yml"), "name: Broken\n")?;
1422
1423        let pattern = wf_dir.join("*.yml").to_string_lossy().to_string();
1424        let client = mock(&[
1425            (
1426                "https://www.schemastore.org/api/json/catalog.json",
1427                &gh_catalog_json(),
1428            ),
1429            (
1430                "https://www.schemastore.org/github-workflow.json",
1431                GH_WORKFLOW_SCHEMA,
1432            ),
1433        ]);
1434        let c = ValidateArgs {
1435            globs: vec![pattern],
1436            exclude: vec![],
1437            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1438            force_schema_fetch: true,
1439            force_validation: true,
1440            no_catalog: false,
1441            config_dir: None,
1442            schema_cache_ttl: None,
1443        };
1444        let result = run_with(&c, Some(client), |_| {}).await?;
1445        assert!(result.has_errors());
1446        Ok(())
1447    }
1448
1449    #[tokio::test]
1450    async fn auto_discover_finds_github_workflows() -> anyhow::Result<()> {
1451        let tmp = tempfile::tempdir()?;
1452        let cache_tmp = tempfile::tempdir()?;
1453        let wf_dir = tmp.path().join(".github/workflows");
1454        fs::create_dir_all(&wf_dir)?;
1455        fs::write(
1456            wf_dir.join("ci.yml"),
1457            "name: CI\non: push\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps: []\n",
1458        )?;
1459
1460        let client = mock(&[
1461            (
1462                "https://www.schemastore.org/api/json/catalog.json",
1463                &gh_catalog_json(),
1464            ),
1465            (
1466                "https://www.schemastore.org/github-workflow.json",
1467                GH_WORKFLOW_SCHEMA,
1468            ),
1469        ]);
1470        let c = ValidateArgs {
1471            globs: vec![],
1472            exclude: vec![],
1473            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1474            force_schema_fetch: true,
1475            force_validation: true,
1476            no_catalog: false,
1477            config_dir: None,
1478            schema_cache_ttl: None,
1479        };
1480
1481        let orig_dir = std::env::current_dir()?;
1482        std::env::set_current_dir(tmp.path())?;
1483        let result = run_with(&c, Some(client), |_| {}).await?;
1484        std::env::set_current_dir(orig_dir)?;
1485
1486        assert!(!result.has_errors());
1487        Ok(())
1488    }
1489
1490    // --- TOML tests ---
1491
1492    #[tokio::test]
1493    async fn toml_valid_with_schema() -> anyhow::Result<()> {
1494        let tmp = tempfile::tempdir()?;
1495        let schema_path = tmp.path().join("schema.json");
1496        fs::write(&schema_path, SCHEMA)?;
1497
1498        let f = tmp.path().join("config.toml");
1499        fs::write(
1500            &f,
1501            format!(
1502                "# :schema {}\nname = \"hello\"\n",
1503                schema_path.to_string_lossy()
1504            ),
1505        )?;
1506
1507        let pattern = tmp.path().join("*.toml").to_string_lossy().to_string();
1508        let c = ValidateArgs {
1509            globs: vec![pattern],
1510            exclude: vec![],
1511            cache_dir: None,
1512            force_schema_fetch: true,
1513            force_validation: true,
1514            no_catalog: true,
1515            config_dir: None,
1516            schema_cache_ttl: None,
1517        };
1518        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1519        assert!(!result.has_errors());
1520        Ok(())
1521    }
1522
1523    // --- Rewrite rules + // resolution ---
1524
1525    #[tokio::test]
1526    async fn rewrite_rule_with_double_slash_resolves_schema() -> anyhow::Result<()> {
1527        let tmp = tempfile::tempdir()?;
1528
1529        let schemas_dir = tmp.path().join("schemas");
1530        fs::create_dir_all(&schemas_dir)?;
1531        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1532
1533        fs::write(
1534            tmp.path().join("lintel.toml"),
1535            r#"
1536[rewrite]
1537"http://localhost:9000/" = "//schemas/"
1538"#,
1539        )?;
1540
1541        let f = tmp.path().join("config.json");
1542        fs::write(
1543            &f,
1544            r#"{"$schema":"http://localhost:9000/test.json","name":"hello"}"#,
1545        )?;
1546
1547        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1548        let c = ValidateArgs {
1549            globs: vec![pattern],
1550            exclude: vec![],
1551            cache_dir: None,
1552            force_schema_fetch: true,
1553            force_validation: true,
1554            no_catalog: true,
1555            config_dir: Some(tmp.path().to_path_buf()),
1556            schema_cache_ttl: None,
1557        };
1558
1559        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1560        assert!(!result.has_errors());
1561        assert_eq!(result.files_checked(), 1);
1562        Ok(())
1563    }
1564
1565    #[tokio::test]
1566    async fn double_slash_schema_resolves_relative_to_config() -> anyhow::Result<()> {
1567        let tmp = tempfile::tempdir()?;
1568
1569        let schemas_dir = tmp.path().join("schemas");
1570        fs::create_dir_all(&schemas_dir)?;
1571        fs::write(schemas_dir.join("test.json"), SCHEMA)?;
1572
1573        fs::write(tmp.path().join("lintel.toml"), "")?;
1574
1575        let sub = tmp.path().join("deeply/nested");
1576        fs::create_dir_all(&sub)?;
1577        let f = sub.join("config.json");
1578        fs::write(&f, r#"{"$schema":"//schemas/test.json","name":"hello"}"#)?;
1579
1580        let pattern = sub.join("*.json").to_string_lossy().to_string();
1581        let c = ValidateArgs {
1582            globs: vec![pattern],
1583            exclude: vec![],
1584            cache_dir: None,
1585            force_schema_fetch: true,
1586            force_validation: true,
1587            no_catalog: true,
1588            config_dir: Some(tmp.path().to_path_buf()),
1589            schema_cache_ttl: None,
1590        };
1591
1592        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1593        assert!(!result.has_errors());
1594        Ok(())
1595    }
1596
1597    // --- Format validation override ---
1598
    /// Schema for the format-validation tests: an object whose optional `link`
    /// property is a string declared with `"format": "uri-reference"`.
    const FORMAT_SCHEMA: &str = r#"{
        "type": "object",
        "properties": {
            "link": { "type": "string", "format": "uri-reference" }
        }
    }"#;
1605
1606    #[tokio::test]
1607    async fn format_errors_reported_without_override() -> anyhow::Result<()> {
1608        let tmp = tempfile::tempdir()?;
1609        let schema_path = tmp.path().join("schema.json");
1610        fs::write(&schema_path, FORMAT_SCHEMA)?;
1611
1612        let f = tmp.path().join("data.json");
1613        fs::write(
1614            &f,
1615            format!(
1616                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1617                schema_path.to_string_lossy()
1618            ),
1619        )?;
1620
1621        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1622        let c = ValidateArgs {
1623            globs: vec![pattern],
1624            exclude: vec![],
1625            cache_dir: None,
1626            force_schema_fetch: true,
1627            force_validation: true,
1628            no_catalog: true,
1629            config_dir: Some(tmp.path().to_path_buf()),
1630            schema_cache_ttl: None,
1631        };
1632        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1633        assert!(
1634            result.has_errors(),
1635            "expected format error without override"
1636        );
1637        Ok(())
1638    }
1639
1640    #[tokio::test]
1641    async fn format_errors_suppressed_with_override() -> anyhow::Result<()> {
1642        let tmp = tempfile::tempdir()?;
1643        let schema_path = tmp.path().join("schema.json");
1644        fs::write(&schema_path, FORMAT_SCHEMA)?;
1645
1646        let f = tmp.path().join("data.json");
1647        fs::write(
1648            &f,
1649            format!(
1650                r#"{{"$schema":"{}","link":"not a valid {{uri}}"}}"#,
1651                schema_path.to_string_lossy()
1652            ),
1653        )?;
1654
1655        // Use **/data.json to match the absolute path from the tempdir.
1656        fs::write(
1657            tmp.path().join("lintel.toml"),
1658            r#"
1659[[override]]
1660files = ["**/data.json"]
1661validate_formats = false
1662"#,
1663        )?;
1664
1665        let pattern = tmp.path().join("data.json").to_string_lossy().to_string();
1666        let c = ValidateArgs {
1667            globs: vec![pattern],
1668            exclude: vec![],
1669            cache_dir: None,
1670            force_schema_fetch: true,
1671            force_validation: true,
1672            no_catalog: true,
1673            config_dir: Some(tmp.path().to_path_buf()),
1674            schema_cache_ttl: None,
1675        };
1676        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1677        assert!(
1678            !result.has_errors(),
1679            "expected no errors with validate_formats = false override"
1680        );
1681        Ok(())
1682    }
1683
1684    // --- Unrecognized extension handling ---
1685
1686    #[tokio::test]
1687    async fn unrecognized_extension_skipped_without_catalog() -> anyhow::Result<()> {
1688        let tmp = tempfile::tempdir()?;
1689        fs::write(tmp.path().join("config.nix"), r#"{"name":"hello"}"#)?;
1690
1691        let pattern = tmp.path().join("config.nix").to_string_lossy().to_string();
1692        let c = ValidateArgs {
1693            globs: vec![pattern],
1694            exclude: vec![],
1695            cache_dir: None,
1696            force_schema_fetch: true,
1697            force_validation: true,
1698            no_catalog: true,
1699            config_dir: Some(tmp.path().to_path_buf()),
1700            schema_cache_ttl: None,
1701        };
1702        let result = run_with(&c, Some(mock(&[])), |_| {}).await?;
1703        assert!(!result.has_errors());
1704        assert_eq!(result.files_checked(), 0);
1705        Ok(())
1706    }
1707
1708    #[tokio::test]
1709    async fn unrecognized_extension_parsed_when_catalog_matches() -> anyhow::Result<()> {
1710        let tmp = tempfile::tempdir()?;
1711        let cache_tmp = tempfile::tempdir()?;
1712        // File has .cfg extension (unrecognized) but content is valid JSON
1713        fs::write(
1714            tmp.path().join("myapp.cfg"),
1715            r#"{"name":"hello","on":"push","jobs":{"build":{}}}"#,
1716        )?;
1717
1718        let catalog_json = r#"{"version":1,"schemas":[{
1719            "name": "MyApp Config",
1720            "description": "MyApp configuration",
1721            "url": "https://example.com/myapp.schema.json",
1722            "fileMatch": ["*.cfg"]
1723        }]}"#;
1724        let schema =
1725            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1726
1727        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1728        let client = mock(&[
1729            (
1730                "https://www.schemastore.org/api/json/catalog.json",
1731                catalog_json,
1732            ),
1733            ("https://example.com/myapp.schema.json", schema),
1734        ]);
1735        let c = ValidateArgs {
1736            globs: vec![pattern],
1737            exclude: vec![],
1738            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1739            force_schema_fetch: true,
1740            force_validation: true,
1741            no_catalog: false,
1742            config_dir: Some(tmp.path().to_path_buf()),
1743            schema_cache_ttl: None,
1744        };
1745        let result = run_with(&c, Some(client), |_| {}).await?;
1746        assert!(!result.has_errors());
1747        assert_eq!(result.files_checked(), 1);
1748        Ok(())
1749    }
1750
1751    #[tokio::test]
1752    async fn unrecognized_extension_unparseable_skipped() -> anyhow::Result<()> {
1753        let tmp = tempfile::tempdir()?;
1754        let cache_tmp = tempfile::tempdir()?;
1755        // File matches catalog but content isn't parseable by any format
1756        fs::write(
1757            tmp.path().join("myapp.cfg"),
1758            "{ pkgs, ... }: { packages = [ pkgs.git ]; }",
1759        )?;
1760
1761        let catalog_json = r#"{"version":1,"schemas":[{
1762            "name": "MyApp Config",
1763            "description": "MyApp configuration",
1764            "url": "https://example.com/myapp.schema.json",
1765            "fileMatch": ["*.cfg"]
1766        }]}"#;
1767
1768        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1769        let client = mock(&[(
1770            "https://www.schemastore.org/api/json/catalog.json",
1771            catalog_json,
1772        )]);
1773        let c = ValidateArgs {
1774            globs: vec![pattern],
1775            exclude: vec![],
1776            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1777            force_schema_fetch: true,
1778            force_validation: true,
1779            no_catalog: false,
1780            config_dir: Some(tmp.path().to_path_buf()),
1781            schema_cache_ttl: None,
1782        };
1783        let result = run_with(&c, Some(client), |_| {}).await?;
1784        assert!(!result.has_errors());
1785        assert_eq!(result.files_checked(), 0);
1786        Ok(())
1787    }
1788
1789    #[tokio::test]
1790    async fn unrecognized_extension_invalid_against_schema() -> anyhow::Result<()> {
1791        let tmp = tempfile::tempdir()?;
1792        let cache_tmp = tempfile::tempdir()?;
1793        // File has .cfg extension, content is valid JSON but fails schema validation
1794        fs::write(tmp.path().join("myapp.cfg"), r#"{"wrong":"field"}"#)?;
1795
1796        let catalog_json = r#"{"version":1,"schemas":[{
1797            "name": "MyApp Config",
1798            "description": "MyApp configuration",
1799            "url": "https://example.com/myapp.schema.json",
1800            "fileMatch": ["*.cfg"]
1801        }]}"#;
1802        let schema =
1803            r#"{"type":"object","properties":{"name":{"type":"string"}},"required":["name"]}"#;
1804
1805        let pattern = tmp.path().join("myapp.cfg").to_string_lossy().to_string();
1806        let client = mock(&[
1807            (
1808                "https://www.schemastore.org/api/json/catalog.json",
1809                catalog_json,
1810            ),
1811            ("https://example.com/myapp.schema.json", schema),
1812        ]);
1813        let c = ValidateArgs {
1814            globs: vec![pattern],
1815            exclude: vec![],
1816            cache_dir: Some(cache_tmp.path().to_string_lossy().to_string()),
1817            force_schema_fetch: true,
1818            force_validation: true,
1819            no_catalog: false,
1820            config_dir: Some(tmp.path().to_path_buf()),
1821            schema_cache_ttl: None,
1822        };
1823        let result = run_with(&c, Some(client), |_| {}).await?;
1824        assert!(result.has_errors());
1825        assert_eq!(result.files_checked(), 1);
1826        Ok(())
1827    }
1828
1829    // --- Validation cache ---
1830
1831    #[tokio::test]
1832    async fn validation_cache_hit_skips_revalidation() -> anyhow::Result<()> {
1833        let tmp = tempfile::tempdir()?;
1834        let schema_path = tmp.path().join("schema.json");
1835        fs::write(&schema_path, SCHEMA)?;
1836
1837        let f = tmp.path().join("valid.json");
1838        fs::write(
1839            &f,
1840            format!(
1841                r#"{{"$schema":"{}","name":"hello"}}"#,
1842                schema_path.to_string_lossy()
1843            ),
1844        )?;
1845
1846        let pattern = tmp.path().join("*.json").to_string_lossy().to_string();
1847
1848        // First run: force_validation = false so results get cached
1849        let c = ValidateArgs {
1850            globs: vec![pattern.clone()],
1851            exclude: vec![],
1852            cache_dir: None,
1853            force_schema_fetch: true,
1854            force_validation: false,
1855            no_catalog: true,
1856            config_dir: None,
1857            schema_cache_ttl: None,
1858        };
1859        let mut first_statuses = Vec::new();
1860        let result = run_with(&c, Some(mock(&[])), |cf| {
1861            first_statuses.push(cf.validation_cache_status);
1862        })
1863        .await?;
1864        assert!(!result.has_errors());
1865        assert!(result.files_checked() > 0);
1866
1867        // Verify the first run recorded a validation cache miss
1868        assert!(
1869            first_statuses.contains(&Some(ValidationCacheStatus::Miss)),
1870            "expected at least one validation cache miss on first run"
1871        );
1872
1873        // Second run: same file, same schema — should hit validation cache
1874        let mut second_statuses = Vec::new();
1875        let result = run_with(&c, Some(mock(&[])), |cf| {
1876            second_statuses.push(cf.validation_cache_status);
1877        })
1878        .await?;
1879        assert!(!result.has_errors());
1880
1881        // Verify the second run got a validation cache hit
1882        assert!(
1883            second_statuses.contains(&Some(ValidationCacheStatus::Hit)),
1884            "expected at least one validation cache hit on second run"
1885        );
1886        Ok(())
1887    }
1888
1889    // --- clean_error_message ---
1890
1891    #[test]
1892    fn clean_strips_anyof_value() {
1893        let msg =
1894            r#"{"type":"bad"} is not valid under any of the schemas listed in the 'anyOf' keyword"#;
1895        assert_eq!(
1896            clean_error_message(msg.to_string()),
1897            "not valid under any of the schemas listed in the 'anyOf' keyword"
1898        );
1899    }
1900
1901    #[test]
1902    fn clean_strips_oneof_value() {
1903        let msg = r#"{"runs-on":"ubuntu-latest","steps":[]} is not valid under any of the schemas listed in the 'oneOf' keyword"#;
1904        assert_eq!(
1905            clean_error_message(msg.to_string()),
1906            "not valid under any of the schemas listed in the 'oneOf' keyword"
1907        );
1908    }
1909
1910    #[test]
1911    fn clean_strips_long_value() {
1912        let long_value = "x".repeat(5000);
1913        let suffix = " is not valid under any of the schemas listed in the 'anyOf' keyword";
1914        let msg = format!("{long_value}{suffix}");
1915        assert_eq!(
1916            clean_error_message(msg),
1917            "not valid under any of the schemas listed in the 'anyOf' keyword"
1918        );
1919    }
1920
1921    #[test]
1922    fn clean_preserves_type_error() {
1923        let msg = r#"12345 is not of types "null", "string""#;
1924        assert_eq!(clean_error_message(msg.to_string()), msg);
1925    }
1926
1927    #[test]
1928    fn clean_preserves_required_property() {
1929        let msg = "\"name\" is a required property";
1930        assert_eq!(clean_error_message(msg.to_string()), msg);
1931    }
1932}