// lintel_validate/parsers/mod.rs

1mod json;
2mod json5;
3mod jsonc;
4pub mod jsonl;
5mod markdown;
6mod toml_parser;
7mod yaml;
8
9use std::path::Path;
10
11use schema_catalog::FileFormat;
12use serde_json::Value;
13
14use crate::diagnostics::ParseDiagnostic;
15
16pub use self::json::JsonParser;
17pub use self::json5::Json5Parser;
18pub use self::jsonc::JsoncParser;
19pub use self::jsonl::JsonlParser;
20pub use self::markdown::MarkdownParser;
21pub use self::toml_parser::TomlParser;
22pub use self::yaml::YamlParser;
23
24/// Parse file content into a `serde_json::Value`.
25///
26/// Implementations must produce a [`ParseDiagnostic`] with an accurate source
27/// span when parsing fails.
28pub trait Parser {
29    /// # Errors
30    ///
31    /// Returns a [`ParseDiagnostic`] with an accurate source span when parsing fails.
32    fn parse(&self, content: &str, file_name: &str) -> Result<Value, ParseDiagnostic>;
33
34    /// Extract the `$schema` URI from file content and/or parsed value.
35    ///
36    /// The default implementation reads `value["$schema"]`, which works for
37    /// JSON, JSON5, and JSONC. YAML and TOML override this to handle their
38    /// format-specific conventions (modeline comments, etc.).
39    fn extract_schema_uri(&self, _content: &str, value: &Value) -> Option<String> {
40        value
41            .get("$schema")
42            .and_then(Value::as_str)
43            .map(String::from)
44    }
45
46    /// Insert a schema annotation into the file content.
47    ///
48    /// Returns `Some(annotated_content)` if the format supports inline schema
49    /// annotations, or `None` if it does not (e.g. Markdown).
50    fn annotate(&self, _content: &str, _schema_url: &str) -> Option<String> {
51        None
52    }
53
54    /// Remove an existing schema annotation from the file content.
55    ///
56    /// Returns the content with the annotation stripped. If no annotation is
57    /// found, returns the content unchanged.
58    fn strip_annotation(&self, content: &str) -> String {
59        content.to_string()
60    }
61}
62
63/// Detect file format from extension. Returns `None` for unrecognized extensions.
64pub fn detect_format(path: &Path) -> Option<FileFormat> {
65    match path.extension().and_then(|e| e.to_str()) {
66        Some("yaml" | "yml") => Some(FileFormat::Yaml),
67        Some("json5") => Some(FileFormat::Json5),
68        Some("jsonl" | "ndjson") => Some(FileFormat::Jsonl),
69        Some("json" | "jsonc") => Some(FileFormat::Jsonc),
70        Some("toml") => Some(FileFormat::Toml),
71        Some("md" | "mdx") => Some(FileFormat::Markdown),
72        _ => None,
73    }
74}
75
76/// Return a boxed parser for the given format.
77pub fn parser_for(format: FileFormat) -> Box<dyn Parser> {
78    match format {
79        FileFormat::Json => Box::new(JsonParser),
80        FileFormat::Jsonl => Box::new(JsonlParser),
81        FileFormat::Json5 => Box::new(Json5Parser),
82        FileFormat::Jsonc => Box::new(JsoncParser),
83        FileFormat::Toml => Box::new(TomlParser),
84        FileFormat::Yaml => Box::new(YamlParser),
85        FileFormat::Markdown => Box::new(MarkdownParser),
86    }
87}
88
/// Insert `"$schema": "URL"` as the first property after `{` in a JSON object.
///
/// Uses string manipulation (not parse+reserialize) to preserve formatting:
///
/// * If the first `{` is followed by a newline before any other content, the
///   property is written on its own line, indented like the first property.
/// * Otherwise the property is written compactly right after the brace.
/// * If the object is empty (the next non-whitespace character is `}`), no
///   trailing comma is emitted, so the result stays valid JSON.
///
/// Content without any `{` is returned unchanged. `schema_url` is inserted
/// verbatim (it is not JSON-escaped).
pub(crate) fn annotate_json_content(content: &str, schema_url: &str) -> String {
    let Some(brace_pos) = content.find('{') else {
        return content.to_string();
    };
    // `{` is a single byte, so `brace_pos + 1` is always a char boundary.
    let (head, after_brace) = (&content[..brace_pos], &content[brace_pos + 1..]);

    // Pretty-printed means: a newline occurs between the brace and the first
    // non-whitespace character that follows it.
    let next_non_ws = after_brace.find(|c: char| !c.is_ascii_whitespace());
    let is_pretty = after_brace[..next_non_ws.unwrap_or(0)].contains('\n');
    let is_empty_object = next_non_ws.is_some_and(|i| after_brace[i..].starts_with('}'));

    match (is_pretty, is_empty_object) {
        (true, false) => {
            let indent = detect_json_indent(after_brace);
            format!("{head}{{\n{indent}\"$schema\": \"{schema_url}\",{after_brace}")
        }
        (true, true) => {
            // The first content line is the closing brace itself, so nest the
            // new property one level (two spaces) deeper than that brace.
            let closing_indent = detect_json_indent(after_brace);
            format!("{head}{{\n{closing_indent}  \"$schema\": \"{schema_url}\"{after_brace}")
        }
        (false, false) => format!(
            "{head}{{\"$schema\":\"{schema_url}\",{}",
            after_brace.trim_start()
        ),
        (false, true) => format!(
            "{head}{{\"$schema\":\"{schema_url}\"{}",
            after_brace.trim_start()
        ),
    }
}

/// Detect the indentation used in a JSON string: the leading whitespace of
/// the first non-blank line after the opening brace.
///
/// Falls back to two spaces when every line is blank.
fn detect_json_indent(after_brace: &str) -> String {
    after_brace
        .lines()
        .find(|line| !line.trim().is_empty())
        .map_or_else(
            || "  ".to_string(),
            |line| {
                let indent_len = line.len() - line.trim_start().len();
                line[..indent_len].to_string()
            },
        )
}
134
/// Remove the `"$schema"` property from a JSON string.
///
/// Uses string manipulation (not parse+reserialize) to preserve formatting.
/// The first textual occurrence of `"$schema"` is targeted; it must be
/// followed (allowing spaces/tabs) by `:` and a complete string value.
///
/// The content is returned unchanged when:
/// * no `"$schema"` key is present,
/// * the key is not followed by `:` and an opening quote, or
/// * the string value is never closed (malformed input).
///
/// Removal rules:
/// * A comma following the value is removed with the property; otherwise a
///   comma preceding the key is removed instead.
/// * When the key is the first thing on its line, the whole line is removed,
///   including the line break before it (`\n` or `\r\n`).
/// * When other text shares the line with the key, only the property itself
///   is removed so the neighbouring properties are kept.
pub(crate) fn strip_json_schema_property(content: &str) -> String {
    const KEY: &str = "\"$schema\"";

    /// Index of the first byte at or after `from` that is not a space or tab.
    fn skip_inline_ws(bytes: &[u8], from: usize) -> usize {
        from + bytes[from..]
            .iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count()
    }

    let bytes = content.as_bytes();
    let Some(key_start) = content.find(KEY) else {
        return content.to_string();
    };

    // key, optional spaces/tabs, ':'
    let mut pos = skip_inline_ws(bytes, key_start + KEY.len());
    if bytes.get(pos) != Some(&b':') {
        return content.to_string();
    }
    // optional spaces/tabs, opening quote
    pos = skip_inline_ws(bytes, pos + 1);
    if bytes.get(pos) != Some(&b'"') {
        return content.to_string();
    }
    pos += 1;

    // Scan to the closing quote, stepping over backslash escapes. Scanning
    // bytes is safe: `"` and `\` never occur inside a multi-byte UTF-8
    // sequence, so the closing quote always sits on a char boundary.
    let mut closed = false;
    while pos < bytes.len() {
        match bytes[pos] {
            b'\\' => pos += 2,
            b'"' => {
                pos += 1;
                closed = true;
                break;
            }
            _ => pos += 1,
        }
    }
    if !closed {
        // Unterminated value: `pos` may even lie past the end of the buffer
        // (trailing backslash), so bail out instead of slicing.
        return content.to_string();
    }
    let value_end = pos;

    let before = &content[..key_start];
    // Where to cut if the property occupies its own line: the position of the
    // preceding line break, but only when nothing except whitespace separates
    // that line break from the key.
    let line_cut = before.rfind('\n').and_then(|nl| {
        before[nl + 1..]
            .chars()
            .all(char::is_whitespace)
            .then(|| if before[..nl].ends_with('\r') { nl - 1 } else { nl })
    });

    let after_value = skip_inline_ws(bytes, value_end);
    if bytes.get(after_value) == Some(&b',') {
        let remove_end = after_value + 1; // past the comma
        return match line_cut {
            // Own line: drop the line break, the indentation and the property.
            Some(cut) => format!("{}{}", &content[..cut], &content[remove_end..]),
            // Shared line / compact: drop the property, its comma and the
            // spaces/tabs that followed the comma.
            None => {
                let resume = skip_inline_ws(bytes, remove_end);
                format!("{before}{}", &content[resume..])
            }
        };
    }

    // No trailing comma: `$schema` is the only or the last property.
    let tail = &content[value_end..];
    if let Some(without_comma) = before.trim_end().strip_suffix(',') {
        // Last property: the comma that preceded it goes away too.
        format!("{without_comma}{tail}")
    } else if let Some(cut) = line_cut {
        // Only property, on its own line.
        format!("{}{tail}", &content[..cut])
    } else {
        // Only property, compact or sharing its line.
        format!("{before}{tail}")
    }
}
222
/// Convert 1-based line and column to a byte offset in content.
///
/// Lines are delimited by `\n`; a preceding `\r` (CRLF files) is counted as
/// part of the line, so offsets stay exact regardless of the line-ending
/// style. `col` is added as a byte count (`col - 1`, with `0` treated like
/// `1`).
///
/// The result never exceeds `content.len()`: a column past the end of the
/// content is clamped, and a `line` that does not exist (including `0`)
/// yields `content.len()`.
pub fn line_col_to_offset(content: &str, line: usize, col: usize) -> usize {
    let mut offset = 0;
    // `split_inclusive` keeps each terminator, so every piece's length is the
    // exact number of bytes that line occupies (`\n` and `\r\n` alike).
    for (idx, raw_line) in content.split_inclusive('\n').enumerate() {
        if idx + 1 == line {
            return (offset + col.saturating_sub(1)).min(content.len());
        }
        offset += raw_line.len();
    }
    offset
}
234
#[cfg(test)]
mod tests {
    use super::*;

    /// Schema URL that most extraction tests expect to get back.
    const EXAMPLE_SCHEMA: &str = "https://example.com/s.json";

    /// The `{"key": "value"}` document used throughout these tests.
    fn key_value_doc() -> Value {
        serde_json::json!({"key": "value"})
    }

    /// A document whose only property is `$schema`, set to [`EXAMPLE_SCHEMA`].
    fn schema_only_doc() -> Value {
        serde_json::json!({"$schema": EXAMPLE_SCHEMA})
    }

    /// Shorthand for running `detect_format` on a file name.
    fn format_of(file_name: &str) -> Option<FileFormat> {
        detect_format(Path::new(file_name))
    }

    /// Shorthand for YAML schema extraction.
    fn yaml_uri(content: &str, value: &Value) -> Option<String> {
        YamlParser.extract_schema_uri(content, value)
    }

    /// Shorthand for TOML schema extraction.
    fn toml_uri(content: &str, value: &Value) -> Option<String> {
        TomlParser.extract_schema_uri(content, value)
    }

    // --- detect_format ---

    #[test]
    fn detect_format_json() {
        assert_eq!(format_of("foo.json"), Some(FileFormat::Jsonc));
    }

    #[test]
    fn detect_format_yaml() {
        assert_eq!(format_of("foo.yaml"), Some(FileFormat::Yaml));
        assert_eq!(format_of("foo.yml"), Some(FileFormat::Yaml));
    }

    #[test]
    fn detect_format_json5() {
        assert_eq!(format_of("foo.json5"), Some(FileFormat::Json5));
    }

    #[test]
    fn detect_format_jsonc() {
        assert_eq!(format_of("foo.jsonc"), Some(FileFormat::Jsonc));
    }

    #[test]
    fn detect_format_toml() {
        assert_eq!(format_of("foo.toml"), Some(FileFormat::Toml));
    }

    #[test]
    fn detect_format_jsonl() {
        assert_eq!(format_of("foo.jsonl"), Some(FileFormat::Jsonl));
    }

    #[test]
    fn detect_format_ndjson() {
        assert_eq!(format_of("foo.ndjson"), Some(FileFormat::Jsonl));
    }

    #[test]
    fn detect_format_unknown_returns_none() {
        assert_eq!(format_of("foo.txt"), None);
        assert_eq!(format_of("foo"), None);
        assert_eq!(format_of("devenv.nix"), None);
    }

    // --- extract_schema_uri via trait ---

    #[test]
    fn extract_schema_json_with_schema() {
        let found = JsonParser.extract_schema_uri("", &schema_only_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_json_without_schema() {
        let found = JsonParser.extract_schema_uri("", &key_value_doc());
        assert!(found.is_none());
    }

    #[test]
    fn extract_schema_json5_with_schema() {
        let found = Json5Parser.extract_schema_uri("", &schema_only_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_jsonc_with_schema() {
        let found = JsoncParser.extract_schema_uri("", &schema_only_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_yaml_modeline() {
        let text = "# yaml-language-server: $schema=https://example.com/s.json\nkey: value\n";
        let found = yaml_uri(text, &key_value_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_yaml_modeline_with_leading_blank_lines() {
        let text = "\n# yaml-language-server: $schema=https://example.com/s.json\nkey: value\n";
        let found = yaml_uri(text, &key_value_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_yaml_modeline_after_other_comment() {
        let text = "# some comment\n# yaml-language-server: $schema=https://example.com/s.json\nkey: value\n";
        let found = yaml_uri(text, &key_value_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_yaml_modeline_not_in_body() {
        let text = "key: value\n# yaml-language-server: $schema=https://example.com/s.json\n";
        let found = yaml_uri(text, &key_value_doc());
        assert!(found.is_none());
    }

    #[test]
    fn extract_schema_yaml_top_level_property() {
        let text = "$schema: https://example.com/s.json\nkey: value\n";
        let doc = serde_json::json!({"$schema": "https://example.com/s.json", "key": "value"});
        let found = yaml_uri(text, &doc);
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_yaml_modeline_takes_priority() {
        let text = "# yaml-language-server: $schema=https://modeline.com/s.json\n$schema: https://property.com/s.json\n";
        let doc = serde_json::json!({"$schema": "https://property.com/s.json"});
        let found = yaml_uri(text, &doc);
        assert_eq!(found.as_deref(), Some("https://modeline.com/s.json"));
    }

    #[test]
    fn extract_schema_yaml_none() {
        let found = yaml_uri("key: value\n", &key_value_doc());
        assert!(found.is_none());
    }

    // --- TOML schema extraction ---

    #[test]
    fn extract_schema_toml_comment() {
        let text = "# :schema https://example.com/s.json\nkey = \"value\"\n";
        let found = toml_uri(text, &key_value_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_toml_with_leading_blank_lines() {
        let text = "\n# :schema https://example.com/s.json\nkey = \"value\"\n";
        let found = toml_uri(text, &key_value_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    #[test]
    fn extract_schema_toml_not_in_body() {
        let text = "key = \"value\"\n# :schema https://example.com/s.json\n";
        let found = toml_uri(text, &key_value_doc());
        assert!(found.is_none());
    }

    #[test]
    fn extract_schema_toml_none() {
        let found = toml_uri("key = \"value\"\n", &key_value_doc());
        assert!(found.is_none());
    }

    #[test]
    fn extract_schema_toml_legacy_dollar_schema() {
        let text = "# $schema: https://example.com/s.json\nkey = \"value\"\n";
        let found = toml_uri(text, &key_value_doc());
        assert_eq!(found.as_deref(), Some(EXAMPLE_SCHEMA));
    }

    // --- line_col_to_offset ---

    #[test]
    fn line_col_to_offset_first_line() {
        let text = "hello\nworld";
        for (col, expected) in [(1, 0), (3, 2)] {
            assert_eq!(line_col_to_offset(text, 1, col), expected);
        }
    }

    #[test]
    fn line_col_to_offset_second_line() {
        let text = "hello\nworld";
        for (col, expected) in [(1, 6), (3, 8)] {
            assert_eq!(line_col_to_offset(text, 2, col), expected);
        }
    }

    // --- parser_for round-trip ---

    #[test]
    fn parser_for_json_parses() -> anyhow::Result<()> {
        let parsed = parser_for(FileFormat::Json).parse(r#"{"key":"value"}"#, "test.json")?;
        assert_eq!(parsed, key_value_doc());
        Ok(())
    }

    #[test]
    fn parser_for_yaml_parses() -> anyhow::Result<()> {
        let parsed = parser_for(FileFormat::Yaml).parse("key: value\n", "test.yaml")?;
        assert_eq!(parsed, key_value_doc());
        Ok(())
    }

    #[test]
    fn parser_for_json5_parses() -> anyhow::Result<()> {
        let parsed = parser_for(FileFormat::Json5).parse(r#"{key: "value"}"#, "test.json5")?;
        assert_eq!(parsed, key_value_doc());
        Ok(())
    }

    #[test]
    fn parser_for_jsonc_parses() -> anyhow::Result<()> {
        let parsed = parser_for(FileFormat::Jsonc)
            .parse(r#"{"key": "value" /* comment */}"#, "test.jsonc")?;
        assert_eq!(parsed, key_value_doc());
        Ok(())
    }

    #[test]
    fn parser_for_toml_parses() -> anyhow::Result<()> {
        let parsed = parser_for(FileFormat::Toml).parse("key = \"value\"\n", "test.toml")?;
        assert_eq!(parsed, key_value_doc());
        Ok(())
    }
}