Skip to main content

haystack_core/codecs/trio/
parser.rs

1// Trio format parser — record-per-entity text format.
2
3use crate::codecs::CodecError;
4use crate::codecs::zinc::ZincParser;
5use crate::data::{HCol, HDict, HGrid};
6use crate::kinds::Kind;
7
8/// Parse Trio-formatted text into an HGrid.
9///
10/// Each record becomes a row in the grid. Records are separated by `---`
11/// (three or more dashes). Columns are derived from all unique tag names
12/// across all records.
13pub fn decode_grid(input: &str) -> Result<HGrid, CodecError> {
14    let records = parse_records(input)?;
15
16    if records.is_empty() {
17        return Ok(HGrid::new());
18    }
19
20    // Derive columns from all unique tag names, preserving insertion order
21    let mut col_names: Vec<String> = Vec::new();
22    let mut seen = std::collections::HashSet::new();
23    for rec in &records {
24        // Sort tag names for deterministic column order within each record
25        let mut names: Vec<&str> = rec.tag_names().collect();
26        names.sort();
27        for name in names {
28            if seen.insert(name.to_string()) {
29                col_names.push(name.to_string());
30            }
31        }
32    }
33
34    let cols: Vec<HCol> = col_names.iter().map(HCol::new).collect();
35    Ok(HGrid::from_parts(HDict::new(), cols, records))
36}
37
38/// Parse the input text into a list of HDict records.
39fn parse_records(input: &str) -> Result<Vec<HDict>, CodecError> {
40    let mut records: Vec<HDict> = Vec::new();
41    let mut current_tags: Vec<(String, Kind)> = Vec::new();
42    let mut multiline_name: Option<String> = None;
43    let mut multiline_lines: Vec<String> = Vec::new();
44
45    for line in input.split('\n') {
46        let stripped = line.trim();
47
48        // Record separator: line of three or more dashes
49        if is_record_separator(stripped) {
50            // Flush multiline string if active
51            if let Some(name) = multiline_name.take() {
52                current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
53                multiline_lines.clear();
54            }
55            // Flush current record
56            if !current_tags.is_empty() {
57                records.push(tags_to_dict(current_tags));
58                current_tags = Vec::new();
59            }
60            continue;
61        }
62
63        // Comment line
64        if stripped.starts_with("//") {
65            continue;
66        }
67
68        // In multiline string mode
69        if multiline_name.is_some() {
70            if let Some(content) = line.strip_prefix("  ").or_else(|| line.strip_prefix('\t')) {
71                // Indented continuation line
72                multiline_lines.push(content.to_string());
73                continue;
74            } else {
75                // Non-indented line ends the multiline
76                if let Some(name) = multiline_name.take() {
77                    current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
78                }
79                multiline_lines.clear();
80                // Fall through to parse this line normally
81            }
82        }
83
84        // Skip empty lines
85        if stripped.is_empty() {
86            continue;
87        }
88
89        // Parse name:value or marker-only line
90        match stripped.find(':') {
91            None => {
92                // Marker tag (just a name)
93                current_tags.push((stripped.to_string(), Kind::Marker));
94            }
95            Some(colon_idx) => {
96                let name = stripped[..colon_idx].trim().to_string();
97                let rest = &stripped[colon_idx + 1..];
98
99                if rest.trim().is_empty() {
100                    // Empty after colon -> multiline string starts on next line
101                    multiline_name = Some(name);
102                    multiline_lines.clear();
103                } else {
104                    // Value follows colon
105                    let val_str = rest.trim();
106                    let val = parse_scalar_value(val_str);
107                    current_tags.push((name, val));
108                }
109            }
110        }
111    }
112
113    // Flush final multiline
114    if let Some(name) = multiline_name.take() {
115        current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
116    }
117
118    // Flush final record
119    if !current_tags.is_empty() {
120        records.push(tags_to_dict(current_tags));
121    }
122
123    Ok(records)
124}
125
126/// Try to parse a value string as a Zinc scalar.
127/// If parsing fails or the parser doesn't consume all input, treat as a plain string.
128fn parse_scalar_value(val_str: &str) -> Kind {
129    let mut parser = ZincParser::new(val_str);
130    match parser.parse_scalar() {
131        Ok(val) => {
132            if parser.at_end() {
133                val
134            } else {
135                // Parser didn't consume all input, treat as plain string
136                Kind::Str(val_str.to_string())
137            }
138        }
139        Err(_) => {
140            // Unparseable as Zinc scalar, treat as plain string
141            Kind::Str(val_str.to_string())
142        }
143    }
144}
145
/// Report whether an already-trimmed line is a record separator, i.e. it is
/// at least three characters long and consists solely of `-`.
fn is_record_separator(stripped: &str) -> bool {
    // `-` is ASCII, so comparing bytes is equivalent to comparing chars and
    // the byte length equals the dash count.
    stripped.len() >= 3 && stripped.bytes().all(|b| b == b'-')
}
150
151/// Convert an ordered list of (name, value) pairs into an HDict.
152fn tags_to_dict(tags: Vec<(String, Kind)>) -> HDict {
153    let mut dict = HDict::new();
154    for (name, val) in tags {
155        dict.set(name, val);
156    }
157    dict
158}
159
#[cfg(test)]
mod tests {
    use super::*;
    use crate::codecs::trio::{encode_grid, TrioCodec};
    use crate::codecs::{codec_for, Codec};
    use crate::kinds::{Coord, HRef, Number, Uri};
    use chrono::NaiveDate;

    /// Decode `input`, panicking if the decoder reports an error.
    fn grid_from(input: &str) -> HGrid {
        decode_grid(input).unwrap()
    }

    /// Shorthand for a string-valued `Kind`.
    fn text(value: &str) -> Kind {
        Kind::Str(value.into())
    }

    /// Shorthand for a ref-valued `Kind`.
    fn reference(id: &str) -> Kind {
        Kind::Ref(HRef::from_val(id))
    }

    // ---- empty / trivial input ----

    #[test]
    fn parse_empty_input() {
        let empty = grid_from("");
        assert!(empty.is_empty());
        assert_eq!(empty.num_cols(), 0);
    }

    #[test]
    fn parse_whitespace_only() {
        let blank = grid_from("   \n  \n  ");
        assert!(blank.is_empty());
    }

    // ---- records, separators and comments ----

    #[test]
    fn parse_single_record_with_markers_and_values() {
        let decoded = grid_from("dis: \"Site 1\"\nsite\narea: 3702ft\u{00B2}\n");
        assert_eq!(decoded.len(), 1);

        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("dis"), Some(&text("Site 1")));
        assert_eq!(rec.get("site"), Some(&Kind::Marker));

        let expected_area = Kind::Number(Number::new(3702.0, Some("ft\u{00B2}".into())));
        assert_eq!(rec.get("area"), Some(&expected_area));
    }

    #[test]
    fn parse_multiple_records() {
        let decoded = grid_from("dis: \"Site A\"\nsite\n---\ndis: \"Site B\"\nsite\n");
        assert_eq!(decoded.len(), 2);

        assert_eq!(decoded.row(0).unwrap().get("dis"), Some(&text("Site A")));
        assert_eq!(decoded.row(1).unwrap().get("dis"), Some(&text("Site B")));
    }

    #[test]
    fn parse_comments_skipped() {
        let decoded = grid_from("// This is a comment\ndis: \"Site\"\nsite\n");
        assert_eq!(decoded.len(), 1);
        assert_eq!(decoded.row(0).unwrap().get("dis"), Some(&text("Site")));
        // The comment must not have been turned into a tag.
        assert!(decoded.row(0).unwrap().missing("//"));
    }

    // ---- multi-line strings ----

    #[test]
    fn parse_multiline_string() {
        let decoded =
            grid_from("dis: \"Test\"\ndoc:\n  This is line 1\n  This is line 2\nsite\n");
        assert_eq!(decoded.len(), 1);

        let rec = decoded.row(0).unwrap();
        assert_eq!(
            rec.get("doc"),
            Some(&text("This is line 1\nThis is line 2"))
        );
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_multiline_string_with_tab_indent() {
        let decoded = grid_from("doc:\n\tLine A\n\tLine B\n");
        assert_eq!(decoded.len(), 1);

        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("doc"), Some(&text("Line A\nLine B")));
    }

    #[test]
    fn parse_multiline_string_at_end_of_input() {
        let decoded = grid_from("doc:\n  Last line");
        assert_eq!(decoded.len(), 1);

        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("doc"), Some(&text("Last line")));
    }

    // ---- markers and blank lines ----

    #[test]
    fn parse_markers_alone() {
        let decoded = grid_from("site\nequip\nahu\n");
        assert_eq!(decoded.len(), 1);

        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
        assert_eq!(rec.get("equip"), Some(&Kind::Marker));
        assert_eq!(rec.get("ahu"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_blank_lines_between_tags() {
        let decoded = grid_from("dis: \"Test\"\n\nsite\n\narea: 100\n");
        assert_eq!(decoded.len(), 1);

        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("dis"), Some(&text("Test")));
        assert_eq!(rec.get("site"), Some(&Kind::Marker));

        let expected_area = Kind::Number(Number::unitless(100.0));
        assert_eq!(rec.get("area"), Some(&expected_area));
    }

    // ---- scalar value kinds ----

    #[test]
    fn parse_ref_values() {
        let decoded = grid_from("id: @site-1\nsiteRef: @alpha\n");
        assert_eq!(decoded.len(), 1);

        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("id"), Some(&reference("site-1")));
        assert_eq!(rec.get("siteRef"), Some(&reference("alpha")));
    }

    #[test]
    fn parse_date_value() {
        let decoded = grid_from("installed: 2024-03-13\n");
        let rec = decoded.row(0).unwrap();

        let expected = Kind::Date(NaiveDate::from_ymd_opt(2024, 3, 13).unwrap());
        assert_eq!(rec.get("installed"), Some(&expected));
    }

    #[test]
    fn parse_coord_value() {
        let decoded = grid_from("geoCoord: C(37.5458,-77.4491)\n");
        let rec = decoded.row(0).unwrap();

        let expected = Kind::Coord(Coord::new(37.5458, -77.4491));
        assert_eq!(rec.get("geoCoord"), Some(&expected));
    }

    #[test]
    fn parse_bool_values() {
        let decoded = grid_from("active: T\ndeleted: F\n");
        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("active"), Some(&Kind::Bool(true)));
        assert_eq!(rec.get("deleted"), Some(&Kind::Bool(false)));
    }

    #[test]
    fn parse_number_with_unit() {
        let decoded = grid_from("temp: 72.5\u{00B0}F\nflow: 350gal/min\n");
        let rec = decoded.row(0).unwrap();

        let expected_temp = Kind::Number(Number::new(72.5, Some("\u{00B0}F".into())));
        assert_eq!(rec.get("temp"), Some(&expected_temp));

        let expected_flow = Kind::Number(Number::new(350.0, Some("gal/min".into())));
        assert_eq!(rec.get("flow"), Some(&expected_flow));
    }

    #[test]
    fn parse_uri_value() {
        let decoded = grid_from("href: `http://example.com/api`\n");
        let rec = decoded.row(0).unwrap();

        let expected = Kind::Uri(Uri::new("http://example.com/api"));
        assert_eq!(rec.get("href"), Some(&expected));
    }

    // ---- grid structure ----

    #[test]
    fn parse_separator_with_more_dashes() {
        let decoded = grid_from("site\n-----\nequip\n");
        assert_eq!(decoded.len(), 2);
        assert_eq!(decoded.row(0).unwrap().get("site"), Some(&Kind::Marker));
        assert_eq!(decoded.row(1).unwrap().get("equip"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_columns_derived_from_all_records() {
        let decoded = grid_from("dis: \"A\"\nsite\n---\ndis: \"B\"\narea: 100\n");

        // Tags from both records must show up as columns.
        let names: Vec<&str> = decoded.col_names().collect();
        assert!(names.contains(&"dis"));
        assert!(names.contains(&"site"));
        assert!(names.contains(&"area"));
    }

    #[test]
    fn parse_complex_trio_file() {
        let source = "\
// Alpha Office
id: @alpha
dis: \"Alpha Office\"
site
geoAddr: \"600 N 2nd St, Richmond VA 23219\"
geoCoord: C(37.5407,-77.4360)
area: 120000ft\u{00B2}
---
// Floor 1
id: @floor1
dis: \"Floor 1\"
floor
siteRef: @alpha
---
id: @ahu1
dis: \"AHU-1\"
equip
ahu
siteRef: @alpha
floorRef: @floor1
";
        let decoded = grid_from(source);
        assert_eq!(decoded.len(), 3);

        let site_rec = decoded.row(0).unwrap();
        assert_eq!(site_rec.get("dis"), Some(&text("Alpha Office")));
        assert_eq!(site_rec.get("site"), Some(&Kind::Marker));
        assert_eq!(site_rec.get("id"), Some(&reference("alpha")));
        let expected_area = Kind::Number(Number::new(120000.0, Some("ft\u{00B2}".into())));
        assert_eq!(site_rec.get("area"), Some(&expected_area));

        let floor_rec = decoded.row(1).unwrap();
        assert_eq!(floor_rec.get("dis"), Some(&text("Floor 1")));
        assert_eq!(floor_rec.get("floor"), Some(&Kind::Marker));

        let ahu_rec = decoded.row(2).unwrap();
        assert_eq!(ahu_rec.get("dis"), Some(&text("AHU-1")));
        assert_eq!(ahu_rec.get("equip"), Some(&Kind::Marker));
        assert_eq!(ahu_rec.get("ahu"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_multiline_between_records() {
        let decoded =
            grid_from("dis: \"A\"\ndoc:\n  Hello world\n  Second line\n---\ndis: \"B\"\n");
        assert_eq!(decoded.len(), 2);

        assert_eq!(
            decoded.row(0).unwrap().get("doc"),
            Some(&text("Hello world\nSecond line"))
        );
        assert_eq!(decoded.row(1).unwrap().get("dis"), Some(&text("B")));
    }

    // ---- encode -> decode round trips ----

    #[test]
    fn roundtrip_encode_decode() {
        let cols = vec![
            HCol::new("area"),
            HCol::new("dis"),
            HCol::new("id"),
            HCol::new("site"),
        ];

        let mut site_row = HDict::new();
        site_row.set("dis", text("My Site"));
        site_row.set("site", Kind::Marker);
        site_row.set(
            "area",
            Kind::Number(Number::new(1000.0, Some("ft\u{00B2}".into()))),
        );
        site_row.set("id", reference("site-1"));

        let mut ahu_row = HDict::new();
        ahu_row.set("dis", text("AHU-1"));
        ahu_row.set("id", reference("ahu-1"));

        let original = HGrid::from_parts(HDict::new(), cols, vec![site_row, ahu_row]);
        let trio_text = encode_grid(&original).unwrap();
        let decoded = grid_from(&trio_text);

        assert_eq!(decoded.len(), 2);

        let first = decoded.row(0).unwrap();
        assert_eq!(first.get("dis"), Some(&text("My Site")));
        assert_eq!(first.get("site"), Some(&Kind::Marker));
        let expected_area = Kind::Number(Number::new(1000.0, Some("ft\u{00B2}".into())));
        assert_eq!(first.get("area"), Some(&expected_area));
        assert_eq!(first.get("id"), Some(&reference("site-1")));

        let second = decoded.row(1).unwrap();
        assert_eq!(second.get("dis"), Some(&text("AHU-1")));
        assert_eq!(second.get("id"), Some(&reference("ahu-1")));
    }

    #[test]
    fn roundtrip_multiline_string() {
        let cols = vec![HCol::new("dis"), HCol::new("doc")];

        let mut doc_row = HDict::new();
        doc_row.set("dis", text("Test"));
        doc_row.set("doc", text("Line 1\nLine 2\nLine 3"));

        let original = HGrid::from_parts(HDict::new(), cols, vec![doc_row]);
        let trio_text = encode_grid(&original).unwrap();
        let decoded = grid_from(&trio_text);

        assert_eq!(decoded.len(), 1);
        let rec = decoded.row(0).unwrap();
        assert_eq!(rec.get("dis"), Some(&text("Test")));
        assert_eq!(rec.get("doc"), Some(&text("Line 1\nLine 2\nLine 3")));
    }

    // ---- codec registry and trait plumbing ----

    #[test]
    fn codec_for_registry() {
        let trio = codec_for("text/trio").expect("trio codec should be registered");
        assert_eq!(trio.mime_type(), "text/trio");

        let zinc = codec_for("text/zinc").expect("zinc codec should be registered");
        assert_eq!(zinc.mime_type(), "text/zinc");

        assert!(codec_for("text/json").is_none());
    }

    #[test]
    fn trio_codec_trait_impl() {
        let codec = TrioCodec;
        assert_eq!(codec.mime_type(), "text/trio");

        // A scalar must survive an encode/decode round trip through the codec.
        let forty_two = Kind::Number(Number::unitless(42.0));
        let wire = codec.encode_scalar(&forty_two).unwrap();
        assert_eq!(wire, "42");
        let back = codec.decode_scalar(&wire).unwrap();
        assert_eq!(back, forty_two);
    }
}