Skip to main content

anomalyx_normalize/parsers/
yaml.rs

//! YAML parser — Kubernetes manifests and CI configs.
//!
//! Each YAML document becomes a record (a sequence document expands to one row
//! per element, like a JSON array), deserialized into a [`serde_json::Value`]
//! and lowered through the same union-key [`TableBuilder`] path as JSON — so a
//! field present in one manifest but absent in another is an explicit `Null`,
//! which is exactly what `struct.schema --baseline` reads as added/removed keys.
//! Multi-document streams (`---` separators) are fully supported; an empty
//! document produces no row.
10
11use crate::parser::{Confidence, FormatParser, TEXT};
12use crate::table::TableBuilder;
13use ax_core::{AxError, Column};
14use serde::Deserialize;
15
/// Stateless parser for YAML input; every value of this type is interchangeable.
#[derive(Clone, Debug, Default)]
pub struct YamlParser;
18
/// Reports whether `line` has the shape of a YAML `key:` mapping entry.
///
/// The text before the *first* `:` must be a non-empty bareword made only of
/// ASCII letters, digits, `.`, `_` or `-`, and the colon must be followed by
/// either nothing or a space. That is strict enough to turn away `12:00`
/// (no space after the colon), keys containing spaces, and CSV rows.
fn is_mapping_key(line: &str) -> bool {
    let (key, rest) = match line.split_once(':') {
        Some(parts) => parts,
        None => return false,
    };
    // Any byte of a non-ASCII character fails the ASCII test below, so checking
    // bytes rejects exactly the same keys as checking chars.
    let is_bareword = !key.is_empty()
        && key
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'_' | b'-'));
    let colon_terminates_key = rest.is_empty() || rest.starts_with(' ');
    is_bareword && colon_terminates_key
}
35
/// Reports whether `line` opens a block-sequence entry: a lone `-`, or a `-`
/// immediately followed by a space (and then, optionally, the item's content).
fn is_list_item(line: &str) -> bool {
    match line.strip_prefix('-') {
        Some(rest) => rest.is_empty() || rest.starts_with(' '),
        None => false,
    }
}
40
41impl YamlParser {
42    fn err(&self, msg: impl std::fmt::Display) -> AxError {
43        AxError::Parse {
44            format: self.id().to_string(),
45            message: msg.to_string(),
46        }
47    }
48}
49
50impl FormatParser for YamlParser {
51    fn id(&self) -> &'static str {
52        "yaml"
53    }
54    fn extensions(&self) -> &'static [&'static str] {
55        &["yaml", "yml"]
56    }
57    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
58        let text = std::str::from_utf8(bytes).ok()?;
59        for line in text.lines() {
60            if line.trim().is_empty() {
61                continue;
62            }
63            let lt = line.trim_start();
64            if lt.starts_with('#') {
65                continue; // YAML comment — keep looking
66            }
67            // The first meaningful line decides: a document marker, a mapping
68            // key, or a list item is YAML; anything else clearly is not.
69            let yaml_like =
70                lt == "---" || lt.starts_with("--- ") || is_mapping_key(lt) || is_list_item(lt);
71            return yaml_like.then_some(TEXT);
72        }
73        None
74    }
75    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
76        let mut builder = TableBuilder::new();
77        for document in serde_yaml::Deserializer::from_slice(bytes) {
78            let val = serde_json::Value::deserialize(document).map_err(|e| self.err(e))?;
79            match val {
80                serde_json::Value::Array(items) => {
81                    for item in items {
82                        builder.push_value(item);
83                    }
84                }
85                serde_json::Value::Null => {} // empty document → no row
86                other => builder.push_value(other),
87            }
88        }
89        Ok(builder.finish())
90    }
91}
92
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::{ColType, Value};

    /// A small single-document manifest shared by several tests.
    const MANIFEST: &str = "apiVersion: apps/v1\nkind: Deployment\nreplicas: 3\n";

    /// Parses `yaml` with a `-` source name, panicking on a parse error.
    fn columns_of(yaml: &str) -> Vec<Column> {
        YamlParser.parse("-", yaml.as_bytes()).unwrap()
    }

    /// Looks up the column called `name`, panicking if it is absent.
    fn column<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        match cols.iter().find(|c| c.name == name) {
            Some(found) => found,
            None => panic!("missing column {name}"),
        }
    }

    #[test]
    fn parses_a_mapping_document_with_typed_cells() {
        let cols = columns_of(MANIFEST);

        let kind = column(&cols, "kind");
        assert_eq!(kind.cells[0], Value::Str("Deployment".into()));

        let replicas = column(&cols, "replicas");
        assert_eq!(replicas.ty, ColType::Int);
        assert_eq!(replicas.cells[0], Value::Int(3));

        let api_version = column(&cols, "apiVersion");
        assert_eq!(api_version.cells[0], Value::Str("apps/v1".into()));
    }

    #[test]
    fn multi_document_stream_is_one_row_per_doc() {
        // What `struct.schema --baseline` relies on across manifests: a key that
        // one document has and the next lacks is padded with Null.
        let cols = columns_of("kind: A\nfoo: 1\n---\nkind: B\n");

        let kind = column(&cols, "kind");
        assert_eq!(kind.cells.len(), 2);
        assert_eq!(kind.cells[0], Value::Str("A".into()));
        assert_eq!(kind.cells[1], Value::Str("B".into()));

        let foo = column(&cols, "foo");
        assert_eq!(foo.cells[1], Value::Null, "absent in doc 2");
    }

    #[test]
    fn sequence_document_expands_to_rows() {
        let cols = columns_of("- x: 1\n- x: 2\n");
        assert_eq!(
            column(&cols, "x").cells,
            vec![Value::Int(1), Value::Int(2)]
        );
    }

    #[test]
    fn empty_document_produces_no_row() {
        // The trailing `---` opens an empty document, which must not add a row.
        let cols = columns_of("kind: A\n---\n");
        assert_eq!(column(&cols, "kind").cells.len(), 1);
    }

    #[test]
    fn malformed_yaml_errors() {
        // An inline mapping as a mapping value is not valid YAML.
        let outcome = YamlParser.parse("-", b"a: b: c\n");
        assert!(matches!(outcome, Err(AxError::Parse { .. })));
    }

    #[test]
    fn mapping_key_classification() {
        // Accepted: plain key, punctuated key, key with an empty value.
        for accepted in ["apiVersion: v1", "a.b-c_d: x", "kind:"] {
            assert!(is_mapping_key(accepted));
        }
        // Rejected: no space after colon, empty key, no colon, space in key.
        for rejected in ["12:00", ": x", "no colon here", "foo bar: x"] {
            assert!(!is_mapping_key(rejected));
        }
    }

    #[test]
    fn list_item_classification() {
        for accepted in ["- item", "-"] {
            assert!(is_list_item(accepted));
        }
        for rejected in ["-nospace", "notalist"] {
            assert!(!is_list_item(rejected));
        }
    }

    #[test]
    fn sniff_recognizes_yaml_shapes() {
        let yaml_inputs: [&[u8]; 6] = [
            MANIFEST.as_bytes(),
            b"---\nkind: Pod\n",      // doc marker
            b"--- {inline: 1}\n",     // inline marker
            b"- a\n- b\n",            // sequence
            b"# header\nkind: Pod\n", // comment first
            b"\n\nkind: Pod\n",       // blank lines first
        ];
        for input in yaml_inputs {
            assert_eq!(YamlParser.sniff(input), Some(TEXT));
        }
    }

    #[test]
    fn sniff_rejects_non_yaml() {
        let non_yaml_inputs: [&[u8]; 5] = [
            b"a,b,c\n1,2,3",       // CSV
            b"k=1 v=2\n",          // logfmt
            b"{\"a\":1}",          // JSON object
            b"12:00 something\n",  // not a key
            b"hello world\n",      // prose
        ];
        for input in non_yaml_inputs {
            assert_eq!(YamlParser.sniff(input), None);
        }
        assert_eq!(
            YamlParser.sniff(b"hello world\nkind: Pod\n"),
            None,
            "a non-YAML first line is decisive; we do not scan past it"
        );
    }

    #[test]
    fn claims_yaml_extensions() {
        let claimed = YamlParser.extensions();
        assert_eq!(claimed, &["yaml", "yml"]);
    }

    #[test]
    fn resolves_by_extension_and_content() {
        let reg = crate::parser::ParserRegistry::default();
        for file_name in ["deploy.yaml", "deploy.yml"] {
            assert_eq!(reg.resolve(file_name, b"x: 1").unwrap().id(), "yaml");
        }
        assert_eq!(
            reg.resolve("-", MANIFEST.as_bytes()).unwrap().id(),
            "yaml",
            "routed by content sniff"
        );
    }
}