gts_validator/lib.rs
1//! # gts-validator
2//!
3//! GTS identifier validator for documentation and configuration files.
4//!
5//! This crate provides a clean separation between the **core validation engine**
6//! (input-agnostic) and **input strategies** (starting with filesystem scanning).
7//!
8//! ## Quick Start
9//!
10//! ```rust,no_run
11//! use std::path::PathBuf;
12//! use gts_validator::{validate_fs, FsSourceConfig, ValidationConfig, VendorPolicy};
13//!
14//! let mut fs_config = FsSourceConfig::default();
15//! fs_config.paths = vec![PathBuf::from("docs"), PathBuf::from("modules")];
16//! fs_config.exclude = vec!["target/*".to_owned()];
17//!
18//! let mut validation_config = ValidationConfig::default();
19//! validation_config.vendor_policy = VendorPolicy::MustMatch("x".to_owned());
20//!
21//! let report = validate_fs(&fs_config, &validation_config).unwrap();
22//! println!("Files scanned: {}", report.scanned_files);
23//! println!("Validation errors: {}", report.errors_count());
24//! println!("Scan errors: {}", report.scan_errors.len());
25//! println!("OK: {}", report.ok);
26//! ```
27
28mod config;
29mod error;
30mod format;
31mod normalize;
32pub mod output;
33mod report;
34mod strategy;
35mod validator;
36
37pub use config::{DiscoveryMode, FsSourceConfig, ValidationConfig, VendorPolicy};
38pub use error::{ScanError, ScanErrorKind, ValidationError};
39pub use report::ValidationReport;
40
41use strategy::ContentFormat;
42use strategy::fs::{ScanResult, content_format_for, find_files, read_file_bounded};
43
44/// Validate GTS identifiers in files on disk.
45///
46/// This is the primary public API.
47///
48/// # Arguments
49///
50/// * `fs_config` - Filesystem-specific source options (paths, exclude, max file size, limits)
51/// * `validation_config` - Core validation config (vendor policy, `scan_keys`, discovery mode)
52///
53/// # Errors
54///
55/// Returns an error if `fs_config.paths` is empty or if any provided path does not exist.
56/// Returns `Ok` with `scanned_files: 0` if paths exist but contain no scannable files.
57/// Scan failures (unreadable files, parse errors, etc.) are reported in `report.scan_errors`
58/// and never silently discarded.
59pub fn validate_fs(
60 fs_config: &FsSourceConfig,
61 validation_config: &ValidationConfig,
62) -> anyhow::Result<ValidationReport> {
63 if fs_config.paths.is_empty() {
64 anyhow::bail!("No paths provided for validation");
65 }
66
67 for path in &fs_config.paths {
68 if !path.exists() {
69 anyhow::bail!("Path does not exist: {}", path.display());
70 }
71 }
72
73 let (files, mut scan_errors) = find_files(fs_config);
74
75 if files.is_empty() && scan_errors.is_empty() {
76 return Ok(ValidationReport {
77 scanned_files: 0,
78 failed_files: 0,
79 ok: true,
80 validation_errors: vec![],
81 scan_errors: vec![],
82 });
83 }
84
85 let heuristic = validation_config.discovery_mode == DiscoveryMode::Heuristic;
86 // For AllowList, pass a sentinel vendor that no real GTS ID can match.
87 // This causes validate_candidate to emit "Vendor mismatch" for every non-example
88 // vendor, and apply_allow_list_filter then removes the allowed ones — leaving only
89 // genuinely disallowed vendors as errors.
90 let effective_vendor = effective_vendor_for_scanning(&validation_config.vendor_policy);
91
92 let mut validation_errors = Vec::new();
93 let mut scanned_files: usize = 0;
94 // Discovery-stage failures (walk errors, boundary violations, canonicalization errors)
95 // are already in scan_errors from find_files. Count them as failed files upfront.
96 let mut failed_files: usize = scan_errors.len();
97 let mut total_bytes: u64 = 0;
98
99 'files: for file_path in &files {
100 if scanned_files + failed_files >= fs_config.max_files {
101 scan_errors.push(ScanError {
102 file: file_path.clone(),
103 kind: ScanErrorKind::LimitExceeded,
104 message: format!(
105 "Scan aborted: max_files limit ({}) reached; remaining files not scanned",
106 fs_config.max_files
107 ),
108 });
109 failed_files += 1;
110 break;
111 }
112
113 let content = match read_file_bounded(file_path, fs_config.max_file_size) {
114 ScanResult::Ok(c) => c,
115 ScanResult::Err(e) => {
116 scan_errors.push(e);
117 failed_files += 1;
118 continue;
119 }
120 };
121
122 let file_bytes = content.len() as u64;
123 if total_bytes.saturating_add(file_bytes) > fs_config.max_total_bytes {
124 scan_errors.push(ScanError {
125 file: file_path.clone(),
126 kind: ScanErrorKind::LimitExceeded,
127 message: format!(
128 "Scan aborted: max_total_bytes limit ({}) reached; remaining files not scanned",
129 fs_config.max_total_bytes
130 ),
131 });
132 failed_files += 1;
133 break;
134 }
135 total_bytes = total_bytes.saturating_add(file_bytes);
136
137 let vendor = effective_vendor.as_deref();
138 let file_errors = match content_format_for(file_path) {
139 Some(ContentFormat::Markdown) => format::markdown::scan_markdown_content(
140 &content,
141 file_path,
142 vendor,
143 heuristic,
144 &validation_config.skip_tokens,
145 ),
146 Some(ContentFormat::Json) => {
147 match format::json::scan_json_content(
148 &content,
149 file_path,
150 vendor,
151 validation_config.scan_keys,
152 ) {
153 Ok(errs) => errs,
154 Err(scan_err) => {
155 scan_errors.push(scan_err);
156 failed_files += 1;
157 continue 'files;
158 }
159 }
160 }
161 Some(ContentFormat::Yaml) => {
162 let (val_errs, yaml_scan_errs) = format::yaml::scan_yaml_content(
163 &content,
164 file_path,
165 vendor,
166 validation_config.scan_keys,
167 );
168 if !yaml_scan_errs.is_empty() {
169 failed_files += 1;
170 scan_errors.extend(yaml_scan_errs);
171 }
172 val_errs
173 }
174 None => continue,
175 };
176
177 scanned_files += 1;
178
179 // For AllowList: filter out errors where the vendor IS in the allow list.
180 // The sentinel vendor caused mismatches for all vendors; remove the allowed ones.
181 let file_errors = apply_allow_list_filter(file_errors, &validation_config.vendor_policy);
182 validation_errors.extend(file_errors);
183 }
184
185 let ok = validation_errors.is_empty() && scan_errors.is_empty();
186 Ok(ValidationReport {
187 scanned_files,
188 failed_files,
189 ok,
190 validation_errors,
191 scan_errors,
192 })
193}
194
195/// Determine the effective vendor string to pass to scanners for a given policy.
196///
197/// - `Any` → `None` (no vendor enforcement).
198/// - `MustMatch(v)` → `Some(v)` (scanner enforces exact match directly).
199/// - `AllowList(_)` → `Some("\x00")` (sentinel that no real GTS vendor can match).
200/// GTS vendors must be lowercase alphanumeric, so `\x00` is guaranteed to never
201/// equal any real vendor. This causes `validate_candidate` to emit "Vendor mismatch"
202/// for every non-example vendor, and `apply_allow_list_filter` then removes the
203/// vendors that are in the allow list — leaving only genuinely disallowed vendors.
204fn effective_vendor_for_scanning(policy: &VendorPolicy) -> Option<String> {
205 match policy {
206 VendorPolicy::Any => None,
207 VendorPolicy::MustMatch(v) => Some(v.clone()),
208 VendorPolicy::AllowList(_) => Some("\x00".to_owned()),
209 }
210}
211
212/// For `VendorPolicy::AllowList`, remove validation errors whose vendor IS in the list.
213///
214/// Scanners run with a sentinel vendor (`\x00`) that generates "Vendor mismatch" for
215/// every non-example vendor. This function retains only errors where the vendor is NOT
216/// in the allow list — i.e., genuinely disallowed vendors produce errors.
217fn apply_allow_list_filter(
218 errors: Vec<ValidationError>,
219 policy: &VendorPolicy,
220) -> Vec<ValidationError> {
221 let VendorPolicy::AllowList(allowed) = policy else {
222 return errors;
223 };
224
225 errors
226 .into_iter()
227 .filter(|e| {
228 // Keep the error only if it is NOT a vendor-mismatch for an allowed vendor.
229 // Vendor-mismatch errors contain "Vendor mismatch" in the message.
230 // Extract the actual vendor from normalized_id (first segment before '.').
231 if !e.error.contains("Vendor mismatch") {
232 return true; // non-vendor errors always kept
233 }
234 // normalized_id format: "gts.<vendor>.<rest>..."
235 // The vendor is the second dot-separated segment (index 1).
236 let id_vendor = e.normalized_id.split('.').nth(1).unwrap_or("");
237 !allowed.iter().any(|a| a == id_vendor)
238 })
239 .collect()
240}