Skip to main content

haystack_core/codecs/trio/
parser.rs

1// Trio format parser — record-per-entity text format.
2
3use crate::codecs::CodecError;
4use crate::codecs::zinc::ZincParser;
5use crate::data::{HCol, HDict, HGrid};
6use crate::kinds::Kind;
7
8/// Parse Trio-formatted text into an HGrid.
9///
10/// Each record becomes a row in the grid. Records are separated by `---`
11/// (three or more dashes). Columns are derived from all unique tag names
12/// across all records.
13pub fn decode_grid(input: &str) -> Result<HGrid, CodecError> {
14    let records = parse_records(input)?;
15
16    if records.is_empty() {
17        return Ok(HGrid::new());
18    }
19
20    // Derive columns from all unique tag names, preserving insertion order
21    let mut col_names: Vec<String> = Vec::new();
22    let mut seen = std::collections::HashSet::new();
23    for rec in &records {
24        // Sort tag names for deterministic column order within each record
25        let mut names: Vec<&str> = rec.tag_names().collect();
26        names.sort();
27        for name in names {
28            if seen.insert(name.to_string()) {
29                col_names.push(name.to_string());
30            }
31        }
32    }
33
34    let cols: Vec<HCol> = col_names.iter().map(HCol::new).collect();
35    Ok(HGrid::from_parts(HDict::new(), cols, records))
36}
37
38/// Parse the input text into a list of HDict records.
39fn parse_records(input: &str) -> Result<Vec<HDict>, CodecError> {
40    let mut records: Vec<HDict> = Vec::new();
41    let mut current_tags: Vec<(String, Kind)> = Vec::new();
42    let mut multiline_name: Option<String> = None;
43    let mut multiline_lines: Vec<String> = Vec::new();
44
45    for line in input.split('\n') {
46        let stripped = line.trim();
47
48        // Record separator: line of three or more dashes
49        if is_record_separator(stripped) {
50            // Flush multiline string if active
51            if let Some(name) = multiline_name.take() {
52                current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
53                multiline_lines.clear();
54            }
55            // Flush current record
56            if !current_tags.is_empty() {
57                records.push(tags_to_dict(current_tags));
58                current_tags = Vec::new();
59            }
60            continue;
61        }
62
63        // Comment line
64        if stripped.starts_with("//") {
65            continue;
66        }
67
68        // In multiline string mode
69        if multiline_name.is_some() {
70            if let Some(content) = line.strip_prefix("  ").or_else(|| line.strip_prefix('\t')) {
71                // Indented continuation line
72                multiline_lines.push(content.to_string());
73                continue;
74            } else {
75                // Non-indented line ends the multiline
76                if let Some(name) = multiline_name.take() {
77                    current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
78                }
79                multiline_lines.clear();
80                // Fall through to parse this line normally
81            }
82        }
83
84        // Skip empty lines
85        if stripped.is_empty() {
86            continue;
87        }
88
89        // Parse name:value or marker-only line
90        match stripped.find(':') {
91            None => {
92                // Marker tag (just a name)
93                current_tags.push((stripped.to_string(), Kind::Marker));
94            }
95            Some(colon_idx) => {
96                let name = stripped[..colon_idx].trim().to_string();
97                let rest = &stripped[colon_idx + 1..];
98
99                if rest.trim().is_empty() {
100                    // Empty after colon -> multiline string starts on next line
101                    multiline_name = Some(name);
102                    multiline_lines.clear();
103                } else {
104                    // Value follows colon
105                    let val_str = rest.trim();
106                    let val = parse_scalar_value(val_str);
107                    current_tags.push((name, val));
108                }
109            }
110        }
111    }
112
113    // Flush final multiline
114    if let Some(name) = multiline_name.take() {
115        current_tags.push((name, Kind::Str(multiline_lines.join("\n"))));
116    }
117
118    // Flush final record
119    if !current_tags.is_empty() {
120        records.push(tags_to_dict(current_tags));
121    }
122
123    Ok(records)
124}
125
126/// Try to parse a value string as a Zinc scalar.
127/// If parsing fails or the parser doesn't consume all input, treat as a plain string.
128///
129/// The fallback-to-string behavior is by-design for the Trio format: values that
130/// cannot be parsed as Zinc scalars (e.g., unrecognized keywords, partial input,
131/// or free-form text) are intentionally treated as plain strings. This allows Trio
132/// files to contain arbitrary text values without requiring quoting, and provides
133/// forward-compatibility when new scalar types are added to the Zinc grammar.
134fn parse_scalar_value(val_str: &str) -> Kind {
135    let mut parser = ZincParser::new(val_str);
136    match parser.parse_scalar() {
137        Ok(val) => {
138            if parser.at_end() {
139                val
140            } else {
141                // Parser didn't consume all input, treat as plain string
142                Kind::Str(val_str.to_string())
143            }
144        }
145        Err(_) => {
146            // Unparseable as Zinc scalar, treat as plain string
147            Kind::Str(val_str.to_string())
148        }
149    }
150}
151
/// Report whether an already-trimmed line is a record separator, i.e. it is
/// made up of nothing but dashes and has at least three of them.
fn is_record_separator(stripped: &str) -> bool {
    // '-' is ASCII, so a byte-wise comparison is equivalent to comparing chars.
    stripped.len() >= 3 && stripped.bytes().all(|byte| byte == b'-')
}
156
157/// Convert an ordered list of (name, value) pairs into an HDict.
158fn tags_to_dict(tags: Vec<(String, Kind)>) -> HDict {
159    let mut dict = HDict::new();
160    for (name, val) in tags {
161        dict.set(name, val);
162    }
163    dict
164}
165
#[cfg(test)]
mod tests {
    use super::*;
    use crate::kinds::{Coord, HRef, Number};
    use chrono::NaiveDate;

    /// Decode `input`, panicking on a codec error so the tests stay terse.
    fn decode(input: &str) -> HGrid {
        decode_grid(input).unwrap()
    }

    /// Shorthand for an expected `Kind::Str` value.
    fn text(value: &str) -> Kind {
        Kind::Str(value.into())
    }

    // ---- empty / trivial input -------------------------------------------

    #[test]
    fn parse_empty_input() {
        let grid = decode("");
        assert!(grid.is_empty());
        assert_eq!(grid.num_cols(), 0);
    }

    #[test]
    fn parse_whitespace_only() {
        assert!(decode("   \n  \n  ").is_empty());
    }

    // ---- records, markers, comments --------------------------------------

    #[test]
    fn parse_single_record_with_markers_and_values() {
        let grid = decode("dis: \"Site 1\"\nsite\narea: 3702ft\u{00B2}\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        let expected_area = Kind::Number(Number::new(3702.0, Some("ft\u{00B2}".into())));
        assert_eq!(rec.get("dis"), Some(&text("Site 1")));
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
        assert_eq!(rec.get("area"), Some(&expected_area));
    }

    #[test]
    fn parse_multiple_records() {
        let grid = decode("dis: \"Site A\"\nsite\n---\ndis: \"Site B\"\nsite\n");
        assert_eq!(grid.len(), 2);

        let first = grid.row(0).unwrap();
        let second = grid.row(1).unwrap();
        assert_eq!(first.get("dis"), Some(&text("Site A")));
        assert_eq!(second.get("dis"), Some(&text("Site B")));
    }

    #[test]
    fn parse_comments_skipped() {
        let grid = decode("// This is a comment\ndis: \"Site\"\nsite\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("dis"), Some(&text("Site")));
        assert!(rec.missing("//"));
    }

    // ---- multiline strings -----------------------------------------------

    #[test]
    fn parse_multiline_string() {
        let grid = decode("dis: \"Test\"\ndoc:\n  This is line 1\n  This is line 2\nsite\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(
            rec.get("doc"),
            Some(&text("This is line 1\nThis is line 2"))
        );
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_multiline_string_with_tab_indent() {
        let grid = decode("doc:\n\tLine A\n\tLine B\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("doc"), Some(&text("Line A\nLine B")));
    }

    #[test]
    fn parse_multiline_string_at_end_of_input() {
        let grid = decode("doc:\n  Last line");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("doc"), Some(&text("Last line")));
    }

    #[test]
    fn parse_markers_alone() {
        let grid = decode("site\nequip\nahu\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
        assert_eq!(rec.get("equip"), Some(&Kind::Marker));
        assert_eq!(rec.get("ahu"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_blank_lines_between_tags() {
        let grid = decode("dis: \"Test\"\n\nsite\n\narea: 100\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        let expected_area = Kind::Number(Number::unitless(100.0));
        assert_eq!(rec.get("dis"), Some(&text("Test")));
        assert_eq!(rec.get("site"), Some(&Kind::Marker));
        assert_eq!(rec.get("area"), Some(&expected_area));
    }

    // ---- scalar kinds ----------------------------------------------------

    #[test]
    fn parse_ref_values() {
        let grid = decode("id: @site-1\nsiteRef: @alpha\n");
        assert_eq!(grid.len(), 1);

        let rec = grid.row(0).unwrap();
        let expected_id = Kind::Ref(HRef::from_val("site-1"));
        let expected_site_ref = Kind::Ref(HRef::from_val("alpha"));
        assert_eq!(rec.get("id"), Some(&expected_id));
        assert_eq!(rec.get("siteRef"), Some(&expected_site_ref));
    }

    #[test]
    fn parse_date_value() {
        let grid = decode("installed: 2024-03-13\n");
        let rec = grid.row(0).unwrap();

        let expected = Kind::Date(NaiveDate::from_ymd_opt(2024, 3, 13).unwrap());
        assert_eq!(rec.get("installed"), Some(&expected));
    }

    #[test]
    fn parse_coord_value() {
        let grid = decode("geoCoord: C(37.5458,-77.4491)\n");
        let rec = grid.row(0).unwrap();

        let expected = Kind::Coord(Coord::new(37.5458, -77.4491));
        assert_eq!(rec.get("geoCoord"), Some(&expected));
    }

    #[test]
    fn parse_bool_values() {
        let grid = decode("active: T\ndeleted: F\n");
        let rec = grid.row(0).unwrap();

        assert_eq!(rec.get("active"), Some(&Kind::Bool(true)));
        assert_eq!(rec.get("deleted"), Some(&Kind::Bool(false)));
    }

    #[test]
    fn parse_number_with_unit() {
        let grid = decode("temp: 72.5\u{00B0}F\nflow: 350gal/min\n");
        let rec = grid.row(0).unwrap();

        let expected_temp = Kind::Number(Number::new(72.5, Some("\u{00B0}F".into())));
        let expected_flow = Kind::Number(Number::new(350.0, Some("gal/min".into())));
        assert_eq!(rec.get("temp"), Some(&expected_temp));
        assert_eq!(rec.get("flow"), Some(&expected_flow));
    }

    #[test]
    fn parse_uri_value() {
        use crate::kinds::Uri;

        let grid = decode("href: `http://example.com/api`\n");
        let rec = grid.row(0).unwrap();

        let expected = Kind::Uri(Uri::new("http://example.com/api"));
        assert_eq!(rec.get("href"), Some(&expected));
    }

    // ---- separators and columns ------------------------------------------

    #[test]
    fn parse_separator_with_more_dashes() {
        let grid = decode("site\n-----\nequip\n");
        assert_eq!(grid.len(), 2);

        assert_eq!(grid.row(0).unwrap().get("site"), Some(&Kind::Marker));
        assert_eq!(grid.row(1).unwrap().get("equip"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_columns_derived_from_all_records() {
        let grid = decode("dis: \"A\"\nsite\n---\ndis: \"B\"\narea: 100\n");

        // The column set is the union of the tags of both records.
        let columns: Vec<&str> = grid.col_names().collect();
        assert!(columns.contains(&"dis"));
        assert!(columns.contains(&"site"));
        assert!(columns.contains(&"area"));
    }

    #[test]
    fn parse_complex_trio_file() {
        let grid = decode(
            "\
// Alpha Office
id: @alpha
dis: \"Alpha Office\"
site
geoAddr: \"600 N 2nd St, Richmond VA 23219\"
geoCoord: C(37.5407,-77.4360)
area: 120000ft\u{00B2}
---
// Floor 1
id: @floor1
dis: \"Floor 1\"
floor
siteRef: @alpha
---
id: @ahu1
dis: \"AHU-1\"
equip
ahu
siteRef: @alpha
floorRef: @floor1
",
        );
        assert_eq!(grid.len(), 3);

        let site = grid.row(0).unwrap();
        let expected_id = Kind::Ref(HRef::from_val("alpha"));
        let expected_area = Kind::Number(Number::new(120000.0, Some("ft\u{00B2}".into())));
        assert_eq!(site.get("dis"), Some(&text("Alpha Office")));
        assert_eq!(site.get("site"), Some(&Kind::Marker));
        assert_eq!(site.get("id"), Some(&expected_id));
        assert_eq!(site.get("area"), Some(&expected_area));

        let floor = grid.row(1).unwrap();
        assert_eq!(floor.get("dis"), Some(&text("Floor 1")));
        assert_eq!(floor.get("floor"), Some(&Kind::Marker));

        let ahu = grid.row(2).unwrap();
        assert_eq!(ahu.get("dis"), Some(&text("AHU-1")));
        assert_eq!(ahu.get("equip"), Some(&Kind::Marker));
        assert_eq!(ahu.get("ahu"), Some(&Kind::Marker));
    }

    #[test]
    fn parse_multiline_between_records() {
        let grid = decode("dis: \"A\"\ndoc:\n  Hello world\n  Second line\n---\ndis: \"B\"\n");
        assert_eq!(grid.len(), 2);

        let first = grid.row(0).unwrap();
        let second = grid.row(1).unwrap();
        assert_eq!(first.get("doc"), Some(&text("Hello world\nSecond line")));
        assert_eq!(second.get("dis"), Some(&text("B")));
    }

    // ---- encode -> decode round trips ------------------------------------

    #[test]
    fn roundtrip_encode_decode() {
        use crate::codecs::trio::encode_grid;
        use crate::data::HCol;

        let columns = vec![
            HCol::new("area"),
            HCol::new("dis"),
            HCol::new("id"),
            HCol::new("site"),
        ];

        let mut site = HDict::new();
        site.set("dis", text("My Site"));
        site.set("site", Kind::Marker);
        site.set(
            "area",
            Kind::Number(Number::new(1000.0, Some("ft\u{00B2}".into()))),
        );
        site.set("id", Kind::Ref(HRef::from_val("site-1")));

        let mut ahu = HDict::new();
        ahu.set("dis", text("AHU-1"));
        ahu.set("id", Kind::Ref(HRef::from_val("ahu-1")));

        let original = HGrid::from_parts(HDict::new(), columns, vec![site, ahu]);
        let wire = encode_grid(&original).unwrap();
        let decoded = decode(&wire);

        assert_eq!(decoded.len(), 2);

        let got_site = decoded.row(0).unwrap();
        let expected_area = Kind::Number(Number::new(1000.0, Some("ft\u{00B2}".into())));
        let expected_site_id = Kind::Ref(HRef::from_val("site-1"));
        assert_eq!(got_site.get("dis"), Some(&text("My Site")));
        assert_eq!(got_site.get("site"), Some(&Kind::Marker));
        assert_eq!(got_site.get("area"), Some(&expected_area));
        assert_eq!(got_site.get("id"), Some(&expected_site_id));

        let got_ahu = decoded.row(1).unwrap();
        let expected_ahu_id = Kind::Ref(HRef::from_val("ahu-1"));
        assert_eq!(got_ahu.get("dis"), Some(&text("AHU-1")));
        assert_eq!(got_ahu.get("id"), Some(&expected_ahu_id));
    }

    #[test]
    fn roundtrip_multiline_string() {
        use crate::codecs::trio::encode_grid;
        use crate::data::HCol;

        let columns = vec![HCol::new("dis"), HCol::new("doc")];
        let mut rec = HDict::new();
        rec.set("dis", text("Test"));
        rec.set("doc", text("Line 1\nLine 2\nLine 3"));

        let original = HGrid::from_parts(HDict::new(), columns, vec![rec]);
        let wire = encode_grid(&original).unwrap();
        let decoded = decode(&wire);

        assert_eq!(decoded.len(), 1);
        let got = decoded.row(0).unwrap();
        assert_eq!(got.get("dis"), Some(&text("Test")));
        assert_eq!(got.get("doc"), Some(&text("Line 1\nLine 2\nLine 3")));
    }

    // ---- codec registry / trait ------------------------------------------

    #[test]
    fn codec_for_registry() {
        use crate::codecs::codec_for;

        let trio = codec_for("text/trio").expect("trio codec should be registered");
        assert_eq!(trio.mime_type(), "text/trio");

        let zinc = codec_for("text/zinc").expect("zinc codec should be registered");
        assert_eq!(zinc.mime_type(), "text/zinc");

        assert!(codec_for("text/json").is_none());
    }

    #[test]
    fn trio_codec_trait_impl() {
        use crate::codecs::Codec;
        use crate::codecs::trio::TrioCodec;

        let codec = TrioCodec;
        assert_eq!(codec.mime_type(), "text/trio");

        // Scalar encoding and decoding should round-trip a plain number.
        let original = Kind::Number(Number::unitless(42.0));
        let wire = codec.encode_scalar(&original).unwrap();
        assert_eq!(wire, "42");
        let restored = codec.decode_scalar(&wire).unwrap();
        assert_eq!(restored, original);
    }
}