Skip to main content

anomalyx_normalize/parsers/
toml.rs

//! TOML and INI config parsers — config drift between environments.
//!
//! Both collapse a config into a **single row** whose columns are the config's
//! keys, so `struct.schema --baseline` reads two configs as schemas and surfaces
//! unexpected/added/removed keys and type changes — the drift use case.
//!
//! - **TOML** is parsed with the `toml` crate into a `toml::Value`, converted to
//!   a `serde_json::Value`, and lowered through the same union-key path as JSON
//!   (nested tables become their canonical JSON string, exactly like JSON/YAML).
//! - **INI** is hand-rolled: `[section]` headers namespace keys (`section.key`),
//!   `key = value` / `key : value` lines are type-inferred, `;`/`#` lines are
//!   comments. INI accepts bare (unquoted) values that TOML rejects, which keeps
//!   the two formats cleanly separable.
14
15use crate::infer;
16use crate::parser::{Confidence, FormatParser, STRONG, TEXT};
17use crate::table::TableBuilder;
18use ax_core::{AxError, Column, Value};
19use std::collections::BTreeMap;
20
// ---------------------------------------------------------------- TOML --------
22
/// Stateless parser for TOML documents; all behavior lives in its
/// `FormatParser` implementation.
#[derive(Clone, Debug, Default)]
pub struct TomlParser;
25
26/// Converts a `toml::Value` into the equivalent `serde_json::Value`. Datetimes
27/// become their canonical string (deterministic, no wall-clock); non-finite
28/// floats become `Null` since they cannot enter a deterministic reduction.
29fn toml_to_json(v: toml::Value) -> serde_json::Value {
30    use serde_json::Value as J;
31    use toml::Value as T;
32    match v {
33        T::String(s) => J::String(s),
34        T::Integer(i) => J::Number(i.into()),
35        T::Float(f) => serde_json::Number::from_f64(f).map_or(J::Null, J::Number),
36        T::Boolean(b) => J::Bool(b),
37        T::Datetime(dt) => J::String(dt.to_string()),
38        T::Array(a) => J::Array(a.into_iter().map(toml_to_json).collect()),
39        T::Table(t) => J::Object(t.into_iter().map(|(k, v)| (k, toml_to_json(v))).collect()),
40    }
41}
42
43impl TomlParser {
44    fn err(&self, msg: impl std::fmt::Display) -> AxError {
45        AxError::Parse {
46            format: self.id().to_string(),
47            message: msg.to_string(),
48        }
49    }
50}
51
52impl FormatParser for TomlParser {
53    fn id(&self) -> &'static str {
54        "toml"
55    }
56    fn extensions(&self) -> &'static [&'static str] {
57        &["toml"]
58    }
59    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
60        let text = std::str::from_utf8(bytes).ok()?;
61        // Confirm by parsing: TOML's top level is always a table, and we require
62        // at least one key so an empty/comment-only file is not claimed. This
63        // cleanly rejects a JSON array (`[1,2,3]` is not a valid TOML document).
64        let parsed = toml::from_str::<toml::Value>(text).ok()?;
65        let nonempty = parsed.as_table().is_some_and(|t| !t.is_empty());
66        nonempty.then_some(STRONG)
67    }
68    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
69        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
70        let value = toml::from_str::<toml::Value>(text).map_err(|e| self.err(e))?;
71        let mut builder = TableBuilder::new();
72        builder.push_value(toml_to_json(value));
73        Ok(builder.finish())
74    }
75}
76
// ----------------------------------------------------------------- INI --------
78
/// Stateless parser for INI-style configs; all behavior lives in its
/// `FormatParser` implementation.
#[derive(Clone, Debug, Default)]
pub struct IniParser;
81
/// True when the line's first character is a comment introducer (`;` or `#`).
/// No trimming is done here, so leading whitespace must already be removed.
fn ini_is_comment(line: &str) -> bool {
    matches!(line.chars().next(), Some(';' | '#'))
}
86
/// A `[section]` header: the line starts with `[`, ends with `]`, and the name
/// between the brackets is non-empty *after trimming*.
///
/// The trimmed check matters because the section name is trimmed before use:
/// a whitespace-only header such as `[ ]` would otherwise be accepted and
/// yield an empty name, silently dropping later keys back to the top-level
/// namespace. Rejecting it here lets such a line be reported as malformed.
/// No trimming of the line itself is done; pass an already-trimmed line.
fn ini_is_section(line: &str) -> bool {
    line.strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
        .is_some_and(|name| !name.trim().is_empty())
}
91
/// Cuts a `key = value` / `key : value` line at its first `=` or `:`.
///
/// Returns the trimmed key and the untrimmed remainder after the separator,
/// or `None` when the line has no separator or the key is blank. Section
/// headers are filtered out by callers before this runs, so a `[a:b]` line is
/// never split here as a key/value.
fn ini_kv_split(line: &str) -> Option<(&str, &str)> {
    let (lhs, rhs) = line.split_once(['=', ':'])?;
    match lhs.trim() {
        "" => None,
        key => Some((key, rhs)),
    }
}
100
101/// A quoted INI value is a verbatim string; otherwise it is type-inferred.
102fn parse_ini_value(raw: &str) -> Value {
103    let quoted = raw.len() >= 2
104        && ((raw.starts_with('"') && raw.ends_with('"'))
105            || (raw.starts_with('\'') && raw.ends_with('\'')));
106    if quoted {
107        Value::Str(raw[1..raw.len() - 1].to_string())
108    } else {
109        infer::infer_scalar(raw)
110    }
111}
112
113impl IniParser {
114    fn err(&self, msg: impl std::fmt::Display) -> AxError {
115        AxError::Parse {
116            format: self.id().to_string(),
117            message: msg.to_string(),
118        }
119    }
120}
121
122impl FormatParser for IniParser {
123    fn id(&self) -> &'static str {
124        "ini"
125    }
126    fn extensions(&self) -> &'static [&'static str] {
127        &["ini", "cfg", "conf"]
128    }
129    fn sniff(&self, bytes: &[u8]) -> Option<Confidence> {
130        let text = std::str::from_utf8(bytes).ok()?;
131        let mut first: Option<&str> = None;
132        let mut has_section = false;
133        let mut has_kv = false;
134        for raw in text.lines() {
135            let l = raw.trim();
136            if l.is_empty() || ini_is_comment(l) {
137                continue;
138            }
139            if first.is_none() {
140                first = Some(l);
141            }
142            if ini_is_section(l) {
143                has_section = true;
144            } else if ini_kv_split(l).is_some() {
145                has_kv = true;
146            }
147        }
148        let first = first?;
149        if has_section && has_kv {
150            // A `[section]` plus a `key = value` is unmistakably an INI config —
151            // strong enough to win a bare `[...]` line away from JSON.
152            return Some(STRONG);
153        }
154        (ini_is_section(first) || ini_kv_split(first).is_some()).then_some(TEXT)
155    }
156    fn parse(&self, _source: &str, bytes: &[u8]) -> Result<Vec<Column>, AxError> {
157        let text = std::str::from_utf8(bytes).map_err(|e| self.err(e))?;
158        let mut section = String::new();
159        let mut row: BTreeMap<String, Value> = BTreeMap::new();
160        for raw in text.lines() {
161            let l = raw.trim();
162            if l.is_empty() || ini_is_comment(l) {
163                continue;
164            }
165            if ini_is_section(l) {
166                section = l[1..l.len() - 1].trim().to_string();
167                continue;
168            }
169            match ini_kv_split(l) {
170                Some((key, val)) => {
171                    let column = if section.is_empty() {
172                        key.to_string()
173                    } else {
174                        format!("{section}.{key}")
175                    };
176                    row.insert(column, parse_ini_value(val.trim()));
177                }
178                None => return Err(self.err(format!("malformed INI line: {l}"))),
179            }
180        }
181        let mut builder = TableBuilder::new();
182        builder.push_row(row);
183        Ok(builder.finish())
184    }
185}
186
// Unit tests for the TOML and INI parsers: value typing, single-row shape,
// sniffing confidence, error reporting, and registry resolution.
#[cfg(test)]
mod tests {
    use super::*;
    use ax_core::ColType;

    /// Looks up a column by name, panicking with the missing name if absent.
    fn col<'a>(cols: &'a [Column], name: &str) -> &'a Column {
        cols.iter()
            .find(|c| c.name == name)
            .unwrap_or_else(|| panic!("missing column {name}"))
    }

    // ------------------------------------------------------------ TOML -------

    /// TOML fixture covering every scalar kind, a non-finite float, an array,
    /// a datetime, and a nested table.
    const CONFIG: &str = r#"
title = "anomalyx"
retries = 3
ratio = 0.5
enabled = true
notanum = nan
tags = ["a", "b"]
created = 2024-01-02T03:04:05Z

[server]
host = "localhost"
port = 8080
"#;

    /// Parses `s` as TOML, panicking on any parse error.
    fn toml_parse(s: &str) -> Vec<Column> {
        TomlParser.parse("-", s.as_bytes()).unwrap()
    }

    #[test]
    fn toml_typed_scalars() {
        let cols = toml_parse(CONFIG);
        assert_eq!(col(&cols, "title").cells[0], Value::Str("anomalyx".into()));
        assert_eq!(col(&cols, "retries").ty, ColType::Int);
        assert_eq!(col(&cols, "retries").cells[0], Value::Int(3));
        assert_eq!(col(&cols, "ratio").cells[0], Value::Float(0.5));
        assert_eq!(col(&cols, "enabled").cells[0], Value::Bool(true));
        // A non-finite float cannot enter a reduction → honest Null.
        assert_eq!(col(&cols, "notanum").cells[0], Value::Null);
    }

    #[test]
    fn toml_datetime_array_and_nested_table_are_strings() {
        let cols = toml_parse(CONFIG);
        // Datetime → canonical string.
        match &col(&cols, "created").cells[0] {
            Value::Str(s) => assert!(s.contains("2024-01-02"), "got {s}"),
            other => panic!("expected Str datetime, got {other:?}"),
        }
        // Array → canonical JSON string.
        assert_eq!(
            col(&cols, "tags").cells[0],
            Value::Str("[\"a\",\"b\"]".into())
        );
        // Nested table → canonical JSON string (sorted keys, deterministic).
        assert_eq!(
            col(&cols, "server").cells[0],
            Value::Str("{\"host\":\"localhost\",\"port\":8080}".into())
        );
    }

    #[test]
    fn toml_is_a_single_row() {
        assert_eq!(col(&toml_parse(CONFIG), "title").cells.len(), 1);
    }

    #[test]
    fn toml_sniff_confirms_by_parsing() {
        assert_eq!(TomlParser.sniff(CONFIG.as_bytes()), Some(STRONG));
        assert_eq!(TomlParser.sniff(b"key = \"v\"\n"), Some(STRONG));
        assert_eq!(TomlParser.sniff(b"key=1\n"), Some(STRONG)); // valid TOML, no spaces

        // Not claimed: empty / comment-only (empty table), and non-TOML shapes.
        assert_eq!(TomlParser.sniff(b""), None);
        assert_eq!(TomlParser.sniff(b"# just a comment\n"), None);
        assert_eq!(TomlParser.sniff(b"[1,2,3]"), None); // JSON array, not a TOML table
        assert_eq!(TomlParser.sniff(b"a,b,c\n1,2,3"), None); // CSV
        assert_eq!(TomlParser.sniff(b"k=1 v=2\n"), None); // logfmt
        assert_eq!(TomlParser.sniff(b"kind: Pod\n"), None); // YAML
    }

    #[test]
    fn toml_malformed_errors() {
        assert!(matches!(
            TomlParser.parse("-", b"a = \n"),
            Err(AxError::Parse { .. })
        ));
        assert!(matches!(
            TomlParser.parse("-", b"= 5\n"),
            Err(AxError::Parse { .. })
        ));
    }

    #[test]
    fn toml_resolves_by_extension_and_content() {
        let reg = crate::parser::ParserRegistry::default();
        assert_eq!(reg.resolve("app.toml", b"x = 1").unwrap().id(), "toml");
        // A `[section]` document beats JSON's bare-`[` sniff via STRONG.
        assert_eq!(
            reg.resolve("-", b"[server]\nhost = \"x\"\n").unwrap().id(),
            "toml"
        );
    }

    // ------------------------------------------------------------- INI -------

    /// INI fixture: a comment, two top-level keys, and one `[database]`
    /// section holding a string, a bool, and an int.
    const INI: &str = "\
; a comment
host = localhost
port = 8080

[database]
name = mydb
ssl = true
timeout = 30
";

    /// Parses `s` as INI, panicking on any parse error.
    fn ini_parse(s: &str) -> Vec<Column> {
        IniParser.parse("-", s.as_bytes()).unwrap()
    }

    #[test]
    fn ini_flattens_sections_and_infers_types() {
        let cols = ini_parse(INI);
        assert_eq!(col(&cols, "host").cells[0], Value::Str("localhost".into()));
        assert_eq!(col(&cols, "port").cells[0], Value::Int(8080));
        assert_eq!(
            col(&cols, "database.name").cells[0],
            Value::Str("mydb".into())
        );
        assert_eq!(col(&cols, "database.ssl").cells[0], Value::Bool(true));
        assert_eq!(col(&cols, "database.timeout").cells[0], Value::Int(30));
        assert_eq!(col(&cols, "host").cells.len(), 1, "one row per config");
    }

    #[test]
    fn ini_quotes_colons_and_empties() {
        let cols = ini_parse("a = \"123\"\nb : bare\nc =\n");
        assert_eq!(col(&cols, "a").cells[0], Value::Str("123".into())); // quoted → string
        assert_eq!(col(&cols, "b").cells[0], Value::Str("bare".into())); // colon separator
        assert_eq!(col(&cols, "c").cells[0], Value::Null); // empty value → null
    }

    #[test]
    fn ini_malformed_line_errors() {
        assert!(matches!(
            IniParser.parse("-", b"no separator here\n"),
            Err(AxError::Parse { .. })
        ));
    }

    #[test]
    fn ini_helper_classification() {
        assert!(ini_is_comment("; x"));
        assert!(ini_is_comment("# x"));
        assert!(!ini_is_comment("k = v"));
        assert!(ini_is_section("[db]"));
        assert!(!ini_is_section("[]")); // empty inner
        assert!(!ini_is_section("[unclosed"));
        assert!(!ini_is_section("k = v"));
        assert_eq!(ini_kv_split("k = v"), Some(("k", " v")));
        assert_eq!(ini_kv_split("k : v"), Some(("k", " v")));
        assert_eq!(ini_kv_split("= v"), None); // empty key
        assert_eq!(ini_kv_split("no sep"), None);
        assert_eq!(parse_ini_value("'q'"), Value::Str("q".into()));
        assert_eq!(parse_ini_value("42"), Value::Int(42));
        // An unbalanced quote is NOT a quoted string — both ends must match, so
        // it stays a literal (and is type-inferred).
        assert_eq!(parse_ini_value("\"abc"), Value::Str("\"abc".into()));
        assert_eq!(parse_ini_value("'abc"), Value::Str("'abc".into()));
    }

    #[test]
    fn ini_sniff() {
        assert_eq!(IniParser.sniff(INI.as_bytes()), Some(STRONG)); // section + kv
        assert_eq!(
            IniParser.sniff(b"host = localhost\nport = 8080\n"),
            Some(TEXT)
        ); // kv, no section

        // A leading comment is skipped: the first *meaningful* line decides.
        assert_eq!(IniParser.sniff(b"; c\nhost = localhost\n"), Some(TEXT));
        assert_eq!(IniParser.sniff(b"[only_section]\n"), Some(TEXT)); // section, no kv
        assert_eq!(IniParser.sniff(b"a,b,c\n1,2,3"), None); // CSV
        assert_eq!(IniParser.sniff(b"hello world\n"), None); // prose
        assert_eq!(IniParser.sniff(b"; only a comment\n"), None); // no meaningful line
    }

    #[test]
    fn ini_resolves_by_extension() {
        let reg = crate::parser::ParserRegistry::default();
        assert_eq!(reg.resolve("app.ini", b"x = y").unwrap().id(), "ini");
        assert_eq!(reg.resolve("app.cfg", b"x = y").unwrap().id(), "ini");
        assert_eq!(reg.resolve("app.conf", b"x = y").unwrap().id(), "ini");
        // A sectioned bare-value config (invalid TOML) routes to INI by content.
        assert_eq!(
            reg.resolve("-", b"[db]\nhost = localhost\n").unwrap().id(),
            "ini"
        );
    }

    #[test]
    fn parsers_claim_their_extensions() {
        assert_eq!(TomlParser.extensions(), &["toml"]);
        assert_eq!(IniParser.extensions(), &["ini", "cfg", "conf"]);
    }
}
393}