// haystack_core/codecs/trio/parser.rs

// Trio format parser — record-per-entity text format.
2
3use crate::codecs::CodecError;
4use crate::codecs::zinc::ZincParser;
5use crate::data::{HCol, HDict, HGrid};
6use crate::kinds::Kind;
7
8/// Parse Trio-formatted text into an HGrid.
9///
10/// Each record becomes a row in the grid. Records are separated by `---`
11/// (three or more dashes). Columns are derived from all unique tag names
12/// across all records.
13pub fn decode_grid(input: &str) -> Result<HGrid, CodecError> {
14    let records = parse_records(input)?;
15
16    if records.is_empty() {
17        return Ok(HGrid::new());
18    }
19
20    // Derive columns from all unique tag names, preserving insertion order
21    let mut col_names: Vec<String> = Vec::new();
22    let mut seen = std::collections::HashSet::new();
23    for rec in &records {
24        // Sort tag names for deterministic column order within each record
25        let mut names: Vec<&str> = rec.tag_names().collect();
26        names.sort();
27        for name in names {
28            if seen.insert(name.to_string()) {
29                col_names.push(name.to_string());
30            }
31        }
32    }
33
34    let cols: Vec<HCol> = col_names.iter().map(HCol::new).collect();
35    Ok(HGrid::from_parts(HDict::new(), cols, records))
36}
37
38/// Parse the input text into a list of HDict records.
39fn parse_records(input: &str) -> Result<Vec<HDict>, CodecError> {
40    let mut records: Vec<HDict> = Vec::new();
41    let mut current_tags: Vec<(String, Kind)> = Vec::new();
42    let mut multiline_name: Option<String> = None;
43    let mut multiline_lines: Vec<String> = Vec::new();
44
45    for line in input.split('\n') {
46        let stripped = line.trim();
47
48        // Record separator: line of three or more dashes
49        if is_record_separator(stripped) {
50            // Flush multiline string if active
51            if let Some(name) = multiline_name.take() {
52                current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
53                multiline_lines.clear();
54            }
55            // Flush current record
56            if !current_tags.is_empty() {
57                records.push(tags_to_dict(current_tags));
58                current_tags = Vec::new();
59            }
60            continue;
61        }
62
63        // Comment line
64        if stripped.starts_with("//") {
65            continue;
66        }
67
68        // In multiline string mode
69        if multiline_name.is_some() {
70            if let Some(content) = line.strip_prefix("  ").or_else(|| line.strip_prefix('\t')) {
71                // Indented continuation line
72                multiline_lines.push(content.to_string());
73                continue;
74            } else {
75                // Non-indented line ends the multiline
76                let name = multiline_name.take().unwrap();
77                current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
78                multiline_lines.clear();
79                // Fall through to parse this line normally
80            }
81        }
82
83        // Skip empty lines
84        if stripped.is_empty() {
85            continue;
86        }
87
88        // Parse name:value or marker-only line
89        match stripped.find(':') {
90            None => {
91                // Marker tag (just a name)
92                current_tags.push((stripped.to_string(), Kind::Marker));
93            }
94            Some(colon_idx) => {
95                let name = stripped[..colon_idx].trim().to_string();
96                let rest = &stripped[colon_idx + 1..];
97
98                if rest.trim().is_empty() {
99                    // Empty after colon -> multiline string starts on next line
100                    multiline_name = Some(name);
101                    multiline_lines.clear();
102                } else {
103                    // Value follows colon
104                    let val_str = rest.trim();
105                    let val = parse_scalar_value(val_str);
106                    current_tags.push((name, val));
107                }
108            }
109        }
110    }
111
112    // Flush final multiline
113    if let Some(name) = multiline_name.take() {
114        current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
115    }
116
117    // Flush final record
118    if !current_tags.is_empty() {
119        records.push(tags_to_dict(current_tags));
120    }
121
122    Ok(records)
123}
124
125/// Try to parse a value string as a Zinc scalar.
126/// If parsing fails or the parser doesn't consume all input, treat as a plain string.
127fn parse_scalar_value(val_str: &str) -> Kind {
128    let mut parser = ZincParser::new(val_str);
129    match parser.parse_scalar() {
130        Ok(val) => {
131            if parser.at_end() {
132                val
133            } else {
134                // Parser didn't consume all input, treat as plain string
135                Kind::Str(val_str.to_string())
136            }
137        }
138        Err(_) => {
139            // Unparseable as Zinc scalar, treat as plain string
140            Kind::Str(val_str.to_string())
141        }
142    }
143}
144
/// Report whether `stripped` is a record separator line.
///
/// A separator consists of dashes only and is at least three characters long.
/// The caller is expected to pass an already-trimmed line; surrounding
/// whitespace is not ignored here.
fn is_record_separator(stripped: &str) -> bool {
    // '-' is ASCII, so comparing bytes is equivalent to comparing chars:
    // every byte of a non-ASCII character is >= 0x80 and can never match.
    let dashes_only = stripped.bytes().all(|byte| byte == b'-');
    dashes_only && stripped.len() >= 3
}
149
150/// Convert an ordered list of (name, value) pairs into an HDict.
151fn tags_to_dict(tags: Vec<(String, Kind)>) -> HDict {
152    let mut dict = HDict::new();
153    for (name, val) in tags {
154        dict.set(name, val);
155    }
156    dict
157}
158
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codecs::trio::{TrioCodec, encode_grid};
    use crate::codecs::{Codec, codec_for};
    use crate::kinds::{Coord, HRef, Number, Uri};
    use chrono::NaiveDate;

    // ---- helpers -----------------------------------------------------

    /// Decode `input`, panicking if the decoder reports an error.
    fn decode(input: &str) -> HGrid {
        decode_grid(input).unwrap()
    }

    /// Shorthand for a string value.
    fn text(s: &str) -> Kind {
        Kind::Str(s.into())
    }

    /// Shorthand for a ref value.
    fn reference(id: &str) -> Kind {
        Kind::Ref(HRef::from_val(id))
    }

    /// Shorthand for a number carrying a unit.
    fn quantity(val: f64, unit: &str) -> Kind {
        Kind::Number(Number::new(val, Some(unit.into())))
    }

    // ---- empty / trivial input ---------------------------------------

    #[test]
    fn parse_empty_input() {
        let grid = decode("");
        assert!(grid.is_empty());
        assert_eq!(grid.num_cols(), 0);
    }

    #[test]
    fn parse_whitespace_only() {
        let grid = decode("   \n  \n  ");
        assert!(grid.is_empty());
    }

    // ---- single and multiple records ---------------------------------

    #[test]
    fn parse_single_record_with_markers_and_values() {
        let grid = decode("dis: \"Site 1\"\nsite\narea: 3702ft\u{00B2}\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("dis"), Some(&text("Site 1")));
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
        assert_eq!(rec.get("area"), Some(&quantity(3702.0, "ft\u{00B2}")));
    }

    #[test]
    fn parse_multiple_records() {
        let grid = decode("dis: \"Site A\"\nsite\n---\ndis: \"Site B\"\nsite\n");
        assert_eq!(grid.len(), 2);

        assert_eq!(grid.row(0).unwrap().get("dis"), Some(&text("Site A")));
        assert_eq!(grid.row(1).unwrap().get("dis"), Some(&text("Site B")));
    }

    #[test]
    fn parse_comments_skipped() {
        let grid = decode("// This is a comment\ndis: \"Site\"\nsite\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("dis"), Some(&text("Site")));
        assert!(rec.missing("//"));
    }

    // ---- multiline strings -------------------------------------------

    #[test]
    fn parse_multiline_string() {
        let grid = decode("dis: \"Test\"\ndoc:\n  This is line 1\n  This is line 2\nsite\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("doc"), Some(&text("This is line 1\nThis is line 2")));
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_multiline_string_with_tab_indent() {
        let grid = decode("doc:\n\tLine A\n\tLine B\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("doc"), Some(&text("Line A\nLine B")));
    }

    #[test]
    fn parse_multiline_string_at_end_of_input() {
        let grid = decode("doc:\n  Last line");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("doc"), Some(&text("Last line")));
    }

    // ---- markers and blank lines -------------------------------------

    #[test]
    fn parse_markers_alone() {
        let grid = decode("site\nequip\nahu\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        for marker in ["site", "equip", "ahu"] {
            assert_eq!(rec.get(marker), Some(&Kind::Marker));
        }
    }

    #[test]
    fn parse_blank_lines_between_tags() {
        let grid = decode("dis: \"Test\"\n\nsite\n\narea: 100\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("dis"), Some(&text("Test")));
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
        assert_eq!(
            rec.get("area"),
            Some(&Kind::Number(Number::unitless(100.0)))
        );
    }

    // ---- scalar kinds ------------------------------------------------

    #[test]
    fn parse_ref_values() {
        let grid = decode("id: @site-1\nsiteRef: @alpha\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("id"), Some(&reference("site-1")));
        assert_eq!(rec.get("siteRef"), Some(&reference("alpha")));
    }

    #[test]
    fn parse_date_value() {
        let grid = decode("installed: 2024-03-13\n");
        let rec = grid.row(0).unwrap();
        let expected = Kind::Date(NaiveDate::from_ymd_opt(2024, 3, 13).unwrap());
        assert_eq!(rec.get("installed"), Some(&expected));
    }

    #[test]
    fn parse_coord_value() {
        let grid = decode("geoCoord: C(37.5458,-77.4491)\n");
        let rec = grid.row(0).unwrap();
        let expected = Kind::Coord(Coord::new(37.5458, -77.4491));
        assert_eq!(rec.get("geoCoord"), Some(&expected));
    }

    #[test]
    fn parse_bool_values() {
        let grid = decode("active: T\ndeleted: F\n");
        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("active"), Some(&Kind::Bool(true)));
        assert_eq!(rec.get("deleted"), Some(&Kind::Bool(false)));
    }

    #[test]
    fn parse_number_with_unit() {
        let grid = decode("temp: 72.5\u{00B0}F\nflow: 350gal/min\n");
        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("temp"), Some(&quantity(72.5, "\u{00B0}F")));
        assert_eq!(rec.get("flow"), Some(&quantity(350.0, "gal/min")));
    }

    #[test]
    fn parse_uri_value() {
        let grid = decode("href: `http://example.com/api`\n");
        let rec = grid.row(0).unwrap();
        let expected = Kind::Uri(Uri::new("http://example.com/api"));
        assert_eq!(rec.get("href"), Some(&expected));
    }

    // ---- separators and columns --------------------------------------

    #[test]
    fn parse_separator_with_more_dashes() {
        let grid = decode("site\n-----\nequip\n");
        assert_eq!(grid.len(), 2);
        assert_eq!(grid.row(0).unwrap().get("site"), Some(&Kind::Marker));
        assert_eq!(grid.row(1).unwrap().get("equip"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_columns_derived_from_all_records() {
        let grid = decode("dis: \"A\"\nsite\n---\ndis: \"B\"\narea: 100\n");

        // Columns should include tags from both records
        let names: Vec<&str> = grid.col_names().collect();
        assert!(names.contains(&"dis"));
        assert!(names.contains(&"site"));
        assert!(names.contains(&"area"));
    }

    #[test]
    fn parse_complex_trio_file() {
        let input = concat!(
            "// Alpha Office\n",
            "id: @alpha\n",
            "dis: \"Alpha Office\"\n",
            "site\n",
            "geoAddr: \"600 N 2nd St, Richmond VA 23219\"\n",
            "geoCoord: C(37.5407,-77.4360)\n",
            "area: 120000ft\u{00B2}\n",
            "---\n",
            "// Floor 1\n",
            "id: @floor1\n",
            "dis: \"Floor 1\"\n",
            "floor\n",
            "siteRef: @alpha\n",
            "---\n",
            "id: @ahu1\n",
            "dis: \"AHU-1\"\n",
            "equip\n",
            "ahu\n",
            "siteRef: @alpha\n",
            "floorRef: @floor1\n",
        );
        let grid = decode(input);
        assert_eq!(grid.len(), 3);

        let site = grid.row(0).unwrap();
        assert_eq!(site.get("dis"), Some(&text("Alpha Office")));
        assert_eq!(site.get("site"), Some(&Kind::Marker));
        assert_eq!(site.get("id"), Some(&reference("alpha")));
        assert_eq!(site.get("area"), Some(&quantity(120000.0, "ft\u{00B2}")));

        let floor = grid.row(1).unwrap();
        assert_eq!(floor.get("dis"), Some(&text("Floor 1")));
        assert_eq!(floor.get("floor"), Some(&Kind::Marker));

        let ahu = grid.row(2).unwrap();
        assert_eq!(ahu.get("dis"), Some(&text("AHU-1")));
        assert_eq!(ahu.get("equip"), Some(&Kind::Marker));
        assert_eq!(ahu.get("ahu"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_multiline_between_records() {
        let grid = decode("dis: \"A\"\ndoc:\n  Hello world\n  Second line\n---\ndis: \"B\"\n");
        assert_eq!(grid.len(), 2);

        assert_eq!(
            grid.row(0).unwrap().get("doc"),
            Some(&text("Hello world\nSecond line"))
        );
        assert_eq!(grid.row(1).unwrap().get("dis"), Some(&text("B")));
    }

    // ---- encode/decode round trips -----------------------------------

    #[test]
    fn roundtrip_encode_decode() {
        let cols = vec![
            HCol::new("area"),
            HCol::new("dis"),
            HCol::new("id"),
            HCol::new("site"),
        ];

        let mut site = HDict::new();
        site.set("dis", text("My Site"));
        site.set("site", Kind::Marker);
        site.set("area", quantity(1000.0, "ft\u{00B2}"));
        site.set("id", reference("site-1"));

        let mut ahu = HDict::new();
        ahu.set("dis", text("AHU-1"));
        ahu.set("id", reference("ahu-1"));

        let original = HGrid::from_parts(HDict::new(), cols, vec![site, ahu]);
        let encoded = encode_grid(&original).unwrap();
        let decoded = decode(&encoded);

        assert_eq!(decoded.len(), 2);

        let first = decoded.row(0).unwrap();
        assert_eq!(first.get("dis"), Some(&text("My Site")));
        assert_eq!(first.get("site"), Some(&Kind::Marker));
        assert_eq!(first.get("area"), Some(&quantity(1000.0, "ft\u{00B2}")));
        assert_eq!(first.get("id"), Some(&reference("site-1")));

        let second = decoded.row(1).unwrap();
        assert_eq!(second.get("dis"), Some(&text("AHU-1")));
        assert_eq!(second.get("id"), Some(&reference("ahu-1")));
    }

    #[test]
    fn roundtrip_multiline_string() {
        let cols = vec![HCol::new("dis"), HCol::new("doc")];
        let mut rec = HDict::new();
        rec.set("dis", text("Test"));
        rec.set("doc", text("Line 1\nLine 2\nLine 3"));

        let original = HGrid::from_parts(HDict::new(), cols, vec![rec]);
        let encoded = encode_grid(&original).unwrap();
        let decoded = decode(&encoded);

        assert_eq!(decoded.len(), 1);
        let back = decoded.row(0).unwrap();
        assert_eq!(back.get("dis"), Some(&text("Test")));
        assert_eq!(back.get("doc"), Some(&text("Line 1\nLine 2\nLine 3")));
    }

    // ---- codec registry / trait --------------------------------------

    #[test]
    fn codec_for_registry() {
        let trio = codec_for("text/trio").expect("trio codec should be registered");
        assert_eq!(trio.mime_type(), "text/trio");

        let zinc = codec_for("text/zinc").expect("zinc codec should be registered");
        assert_eq!(zinc.mime_type(), "text/zinc");

        assert!(codec_for("text/json").is_none());
    }

    #[test]
    fn trio_codec_trait_impl() {
        let codec = TrioCodec;
        assert_eq!(codec.mime_type(), "text/trio");

        // Scalar encoding/decoding delegates to Zinc
        let forty_two = Kind::Number(Number::unitless(42.0));
        let encoded = codec.encode_scalar(&forty_two).unwrap();
        assert_eq!(encoded, "42");
        let decoded = codec.decode_scalar(&encoded).unwrap();
        assert_eq!(decoded, forty_two);
    }
}