// sas/parser.rs

1use crate::error::ParseError;
2use crate::value::{Object, Value};
3
// ── Public entry point ────────────────────────────────────────────────────────
5
6/// Parse a SAS 1.1 document string into a [`Value::Object`].
7pub fn parse(source: &str) -> Result<Value, ParseError> {
8    Parser::new(source).parse()
9}
10
// ── Frame ─────────────────────────────────────────────────────────────────────
12
13#[derive(Debug)]
14enum FrameContent {
15    Object(Object),
16    Array(Vec<Value>),
17}
18
19#[derive(Debug)]
20struct Frame {
21    key:     String,
22    content: FrameContent,
23    is_anon: bool,
24}
25
26impl Frame {
27    fn new_object(key: impl Into<String>) -> Self {
28        Self { key: key.into(), content: FrameContent::Object(Object::new()), is_anon: false }
29    }
30    fn new_anon() -> Self {
31        Self { key: "-".into(), content: FrameContent::Object(Object::new()), is_anon: true }
32    }
33
34    fn is_array(&self) -> bool {
35        matches!(self.content, FrameContent::Array(_))
36    }
37
38    #[allow(dead_code)]
39    fn obj_len(&self) -> usize {
40        match &self.content {
41            FrameContent::Object(o) => o.len(),
42            FrameContent::Array(_)  => 0,
43        }
44    }
45
46    fn to_value(self) -> Value {
47        match self.content {
48            FrameContent::Object(o) => Value::Object(o),
49            FrameContent::Array(a)  => Value::Array(a),
50        }
51    }
52}
53
// ── Parser ────────────────────────────────────────────────────────────────────
55
56struct Parser<'a> {
57    lines:           Vec<&'a str>,
58    line_num:        usize,
59    stack:           Vec<Frame>,
60    in_multiline:    bool,
61    multiline_key:   String,
62    multiline_lines: Vec<String>,
63}
64
65impl<'a> Parser<'a> {
66    fn new(source: &'a str) -> Self {
67        let lines: Vec<&str> = source.split('\n').collect();
68        Self {
69            lines,
70            line_num: 0,
71            stack: Vec::new(),
72            in_multiline: false,
73            multiline_key: String::new(),
74            multiline_lines: Vec::new(),
75        }
76    }
77
78    fn parse(mut self) -> Result<Value, ParseError> {
79        self.stack.push(Frame::new_object("__root__"));
80
81        let lines: Vec<String> = self.lines.iter()
82            .map(|l| l.trim_end_matches('\r').to_string())
83            .collect();
84
85        for (i, raw) in lines.iter().enumerate() {
86            self.line_num = i + 1;
87
88            if self.in_multiline {
89                self.process_multiline_line(raw)?;
90                continue;
91            }
92            self.process_line(raw)?;
93        }
94
95        if self.in_multiline {
96            return Err(self.err("E03: Unexpected end of document inside multiline string"));
97        }
98        if self.stack.len() > 1 {
99            let top_key = self.stack.last().unwrap().key.clone();
100            return Err(ParseError::new(
101                lines.len(),
102                format!("E03: Unexpected end of document — unclosed block {:?}", top_key),
103            ));
104        }
105
106        let root = self.stack.pop().unwrap().to_value();
107        Ok(root)
108    }
109
    // ── Line dispatch ─────────────────────────────────────────────────────────
111
112    fn process_line(&mut self, raw: &str) -> Result<(), ParseError> {
113        let line = raw.trim();
114
115        if line.is_empty() || line.starts_with('#') {
116            return Ok(());
117        }
118
119        // Block closer: ":: key" or ":: -"
120        if let Some(closer) = line.strip_prefix(":: ") {
121            if closer.is_empty() {
122                return Err(self.err("E02: Block closer missing identifier after \"::\""));
123            }
124            return self.close_block(closer);
125        }
126        if line == "::" {
127            return Err(self.err("E02: Bare \"::\" not permitted in SAS 1.1; use \":: key\" or \":: -\""));
128        }
129
130        // Anonymous block opener: "- ::"
131        if line == "- ::" {
132            return self.open_anon_block();
133        }
134
135        // Array item: "- value"
136        if let Some(rest) = line.strip_prefix("- ") {
137            let val = self.parse_value(rest)?;
138            return self.add_array_item(val);
139        }
140
141        // Key-based lines
142        let key_end = line.find(|c: char| !c.is_alphanumeric() && c != '_' && c != '-')
143            .unwrap_or(line.len());
144
145        if key_end == 0 {
146            if line.starts_with('-') && line.len() > 1 && line.chars().nth(1).map_or(false, |c| c.is_alphanumeric() || c == '_') {
147                return Err(self.errf(format!("E13: Key must not begin with \"-\": {:?}", line.split_whitespace().next().unwrap_or(line))));
148            }
149            return Err(self.errf(format!("Unexpected token: {:?}", line)));
150        }
151
152        // Keys must not start with '-'
153        if line.starts_with('-') {
154            return Err(self.errf(format!("E13: Key must not begin with \"-\": {:?}", &line[..key_end])));
155        }
156
157        let key = &line[..key_end];
158        let rest = &line[key_end..];
159
160        // Block opener: " ::"
161        if rest == " ::" {
162            return self.open_block(key);
163        }
164
165        // Key-value pair: " -> value"
166        if let Some(value_str) = rest.strip_prefix(" -> ") {
167            if value_str.is_empty() {
168                return Err(self.errf(format!("Missing value for key {:?}", key)));
169            }
170            self.check_no_inline_comment(value_str)?;
171            if value_str == "\"\"\"" {
172                return self.start_multiline(key);
173            }
174            let val = self.parse_value(value_str)?;
175            return self.assign_to_frame(key, val);
176        }
177
178        if rest.contains("->") || line.contains("->") {
179            return Err(self.err("E08: Missing spaces around \"->\"; expected \" -> \""));
180        }
181
182        Err(self.errf(format!("Unexpected token after key {:?}: {:?}", key, rest)))
183    }
184
    // ── Multiline strings ─────────────────────────────────────────────────────
186
187    fn process_multiline_line(&mut self, raw: &str) -> Result<(), ParseError> {
188        if raw.trim_end() == "\"\"\"" {
189            let value = if self.multiline_lines.is_empty() {
190                String::new()
191            } else {
192                self.multiline_lines.join("\n") + "\n"
193            };
194            let key = std::mem::take(&mut self.multiline_key);
195            self.assign_to_frame(&key, Value::String(value))?;
196            self.in_multiline = false;
197            self.multiline_lines.clear();
198            Ok(())
199        } else {
200            self.multiline_lines.push(raw.to_string());
201            Ok(())
202        }
203    }
204
205    fn start_multiline(&mut self, key: &str) -> Result<(), ParseError> {
206        let frame = self.current_frame_mut();
207        if frame.is_array() {
208            return Err(ParseError::new(self.line_num, "E14: Key-value pair inside array block"));
209        }
210        if let FrameContent::Object(ref obj) = frame.content {
211            if obj.contains_key(key) {
212                return Err(self.errf(format!("E01: Duplicate key {:?}", key)));
213            }
214        }
215        self.in_multiline = true;
216        self.multiline_key = key.to_string();
217        self.multiline_lines.clear();
218        Ok(())
219    }
220
    // ── Block management ──────────────────────────────────────────────────────
222
223    fn open_block(&mut self, key: &str) -> Result<(), ParseError> {
224        {
225            let parent = self.current_frame();
226            if parent.is_array() {
227                return Err(self.errf(format!(
228                    "E14: Named block opener {:?} inside array block; use \"- ::\" for anonymous elements",
229                    format!("{} ::", key)
230                )));
231            }
232            if let FrameContent::Object(ref obj) = parent.content {
233                if obj.contains_key(key) {
234                    return Err(self.errf(format!("E01: Duplicate key {:?}", key)));
235                }
236            }
237        }
238        self.stack.push(Frame::new_object(key));
239        Ok(())
240    }
241
242    fn open_anon_block(&mut self) -> Result<(), ParseError> {
243        {
244            let parent = self.current_frame();
245            if let FrameContent::Object(ref obj) = parent.content {
246                if obj.len() > 0 {
247                    return Err(self.err("E14: Anonymous block \"- ::\" inside object block (mixed block content)"));
248                }
249            }
250        }
251        // Convert object → array if needed
252        {
253            let parent = self.current_frame_mut();
254            if let FrameContent::Object(_) = &parent.content {
255                parent.content = FrameContent::Array(Vec::new());
256            }
257        }
258        if !self.current_frame().is_array() {
259            return Err(self.err("E15: Anonymous block opener \"- ::\" only valid inside array block"));
260        }
261        self.stack.push(Frame::new_anon());
262        Ok(())
263    }
264
265    fn close_block(&mut self, closer: &str) -> Result<(), ParseError> {
266        if self.stack.len() <= 1 {
267            return Err(self.errf(format!("E02: Unexpected block closer {:?} at top level", format!(":: {}", closer))));
268        }
269
270        let frame_key  = self.stack.last().unwrap().key.clone();
271        let frame_anon = self.stack.last().unwrap().is_anon;
272
273        if closer == "-" {
274            if !frame_anon {
275                return Err(self.errf(format!(
276                    "E15: Anonymous closer \":: -\" used to close named block {:?}", frame_key
277                )));
278            }
279            let frame = self.stack.pop().unwrap();
280            let val = frame.to_value();
281            // Push into parent array
282            let parent = self.current_frame_mut();
283            if let FrameContent::Array(ref mut arr) = parent.content {
284                arr.push(val);
285            }
286            return Ok(());
287        }
288
289        if frame_key != closer {
290            return Err(self.errf(format!(
291                "E02: Block closer {:?} does not match opener {:?}",
292                format!(":: {}", closer),
293                format!(":: {}", frame_key),
294            )));
295        }
296
297        let frame = self.stack.pop().unwrap();
298        let val = frame.to_value();
299
300        let parent = self.current_frame_mut();
301        match &mut parent.content {
302            FrameContent::Array(arr) => arr.push(val),
303            FrameContent::Object(obj) => {
304                obj.insert(frame_key, val);
305            }
306        }
307        Ok(())
308    }
309
    // ── Value assignment ──────────────────────────────────────────────────────
311
312    fn assign_to_frame(&mut self, key: &str, val: Value) -> Result<(), ParseError> {
313        let frame = self.current_frame_mut();
314        if frame.is_array() {
315            return Err(ParseError::new(self.line_num, "E14: Key-value pair inside array block"));
316        }
317        if let FrameContent::Object(ref mut obj) = frame.content {
318            if !obj.insert(key.to_string(), val) {
319                return Err(self.errf(format!("E01: Duplicate key {:?}", key)));
320            }
321        }
322        Ok(())
323    }
324
325    fn add_array_item(&mut self, val: Value) -> Result<(), ParseError> {
326        let frame = self.current_frame_mut();
327        if let FrameContent::Object(ref obj) = frame.content {
328            if obj.len() > 0 {
329                return Err(ParseError::new(self.line_num, "E14: Array item inside object block (mixed block content)"));
330            }
331        }
332        if let FrameContent::Object(_) = &frame.content {
333            frame.content = FrameContent::Array(Vec::new());
334        }
335        if let FrameContent::Array(ref mut arr) = frame.content {
336            arr.push(val);
337        }
338        Ok(())
339    }
340
341    fn current_frame(&self) -> &Frame {
342        self.stack.last().unwrap()
343    }
344
345    fn current_frame_mut(&mut self) -> &mut Frame {
346        self.stack.last_mut().unwrap()
347    }
348
    // ── Value parsing ─────────────────────────────────────────────────────────
350
351    fn parse_value(&self, raw: &str) -> Result<Value, ParseError> {
352        let s = raw.trim();
353
354        match s {
355            "null"  => return Ok(Value::Null),
356            "true"  => return Ok(Value::Bool(true)),
357            "false" => return Ok(Value::Bool(false)),
358            _ => {}
359        }
360
361        // E06: wrong-case boolean/null
362        if matches!(s, "True" | "TRUE" | "False" | "FALSE" | "Null" | "NULL") {
363            return Err(self.errf(format!("E06: Boolean and null must be lowercase; got {:?}", s)));
364        }
365
366        // E05: NaN / Infinity
367        let s_lower = s.to_lowercase();
368        if s_lower == "nan" || s_lower == "infinity" || s_lower == "inf"
369            || s_lower == "+nan" || s_lower == "+infinity"
370            || s_lower == "-nan" || s_lower == "-infinity"
371        {
372            return Err(self.err("E05: NaN and Infinity are not valid SAS number values"));
373        }
374
375        // E05: leading +
376        if s.starts_with('+') {
377            return Err(self.errf(format!("E05: Numbers must not have a leading \"+\": {:?}", s)));
378        }
379
380        if s.starts_with('[') { return self.parse_inline_array(s); }
381        if s.starts_with('{') { return self.parse_inline_object(s); }
382        if s.starts_with('"') { return self.parse_string(s).map(Value::String); }
383        if s.starts_with('-') || s.starts_with(|c: char| c.is_ascii_digit()) {
384            return self.parse_number(s);
385        }
386
387        Err(self.errf(format!("Unknown value: {:?}", s)))
388    }
389
    // ── String parsing ────────────────────────────────────────────────────────
391
392    fn parse_string(&self, raw: &str) -> Result<String, ParseError> {
393        if !raw.starts_with('"') || !raw.ends_with('"') || raw.len() < 2 {
394            return Err(self.errf(format!("Malformed string: {}", raw)));
395        }
396        self.process_escapes(&raw[1..raw.len() - 1])
397    }
398
399    fn process_escapes(&self, s: &str) -> Result<String, ParseError> {
400        let mut result = String::with_capacity(s.len());
401        let chars: Vec<char> = s.chars().collect();
402        let mut i = 0;
403        while i < chars.len() {
404            let ch = chars[i];
405            if ch == '\\' {
406                i += 1;
407                if i >= chars.len() {
408                    return Err(self.err("E04: Invalid escape sequence at end of string"));
409                }
410                match chars[i] {
411                    '"'  => result.push('"'),
412                    '\\' => result.push('\\'),
413                    'n'  => result.push('\n'),
414                    't'  => result.push('\t'),
415                    'r'  => result.push('\r'),
416                    'u'  => {
417                        if i + 4 >= chars.len() {
418                            return Err(self.err("E04: Invalid \\u escape: insufficient digits"));
419                        }
420                        let hex: String = chars[i + 1..=i + 4].iter().collect();
421                        if !hex.chars().all(|c| c.is_ascii_hexdigit()) {
422                            return Err(self.errf(format!("E04: Invalid \\u escape: \"\\u{}\"", hex)));
423                        }
424                        let codepoint = u32::from_str_radix(&hex, 16).unwrap();
425                        let ch = char::from_u32(codepoint)
426                            .ok_or_else(|| self.errf(format!("E04: Invalid Unicode codepoint U+{}", hex)))?;
427                        result.push(ch);
428                        i += 4;
429                    }
430                    c => return Err(self.errf(format!("E04: Invalid escape sequence \"\\{}\"", c))),
431                }
432            } else if ch == '"' {
433                return Err(self.err("E04: Unescaped double-quote inside string"));
434            } else {
435                result.push(ch);
436            }
437            i += 1;
438        }
439        Ok(result)
440    }
441
    // ── Number parsing ────────────────────────────────────────────────────────
443
444    fn parse_number(&self, s: &str) -> Result<Value, ParseError> {
445        // Validate format
446        if !is_valid_number(s) {
447            return Err(self.errf(format!("E05: Invalid number format: {:?}", s)));
448        }
449        if s.contains('.') || s.contains('e') || s.contains('E') {
450            let f: f64 = s.parse().map_err(|_| self.errf(format!("E05: Number out of range: {:?}", s)))?;
451            if f.is_infinite() || f.is_nan() {
452                return Err(self.errf(format!("E05: Number out of range: {:?}", s)));
453            }
454            Ok(Value::Float(f))
455        } else {
456            let n: i64 = s.parse().map_err(|_| self.errf(format!("E05: Integer out of range: {:?}", s)))?;
457            Ok(Value::Int(n))
458        }
459    }
460
    // ── Inline array ──────────────────────────────────────────────────────────
462
463    fn parse_inline_array(&self, s: &str) -> Result<Value, ParseError> {
464        if !s.starts_with('[') || !s.ends_with(']') {
465            return Err(self.errf(format!("Malformed inline array: {:?}", s)));
466        }
467        let inner = s[1..s.len() - 1].trim();
468        if inner.is_empty() {
469            return Ok(Value::Array(Vec::new()));
470        }
471        if inner.ends_with(" |") || inner.ends_with('\t') {
472            return Err(self.err("E10: Trailing \"|\" in inline array"));
473        }
474        self.check_pipe_syntax(inner, "inline array")?;
475        let parts = split_by_pipe(inner);
476        let mut result = Vec::with_capacity(parts.len());
477        for part in parts {
478            let val = self.parse_value(part.trim())?;
479            if !val.is_scalar() {
480                return Err(self.err("E11: Inline array elements must be scalar (string, number, boolean, null)"));
481            }
482            result.push(val);
483        }
484        Ok(Value::Array(result))
485    }
486
    // ── Inline object ─────────────────────────────────────────────────────────
488
489    fn parse_inline_object(&self, s: &str) -> Result<Value, ParseError> {
490        if !s.starts_with('{') || !s.ends_with('}') {
491            return Err(self.errf(format!("Malformed inline object: {:?}", s)));
492        }
493        let inner = s[1..s.len() - 1].trim();
494        if inner.is_empty() {
495            return Ok(Value::Object(Object::new()));
496        }
497        if inner.ends_with(" |") {
498            return Err(self.err("E10: Trailing \"|\" in inline object"));
499        }
500        self.check_pipe_syntax(inner, "inline object")?;
501
502        let mut obj = Object::new();
503        for part in split_by_pipe(inner) {
504            let part = part.trim();
505            let arrow = part.find(" -> ")
506                .ok_or_else(|| self.errf(format!("Invalid field in inline object: {:?}", part)))?;
507            let k = &part[..arrow];
508            let v_str = &part[arrow + 4..];
509
510            if !is_valid_key(k) {
511                return Err(self.errf(format!("Invalid key in inline object: {:?}", k)));
512            }
513            if obj.contains_key(k) {
514                return Err(self.errf(format!("E01: Duplicate key {:?} in inline object", k)));
515            }
516            if v_str.trim().starts_with('{') {
517                return Err(self.err("E12: Nested inline objects are not permitted"));
518            }
519            let val = self.parse_value(v_str.trim())?;
520            if !val.is_scalar() {
521                return Err(self.err("E11: Inline object values must be scalar"));
522            }
523            obj.insert(k.to_string(), val);
524        }
525        Ok(Value::Object(obj))
526    }
527
    // ── Pipe / comment helpers ────────────────────────────────────────────────
529
530    fn check_pipe_syntax(&self, inner: &str, context: &str) -> Result<(), ParseError> {
531        let chars: Vec<char> = inner.chars().collect();
532        let mut in_str = false;
533        for (i, &ch) in chars.iter().enumerate() {
534            if ch == '"' { in_str = !in_str; continue; }
535            if !in_str && ch == '|' {
536                let before = if i > 0 { chars[i - 1] } else { '\0' };
537                let after  = if i + 1 < chars.len() { chars[i + 1] } else { '\0' };
538                if before != ' ' || after != ' ' {
539                    return Err(self.errf(format!(
540                        "E09: \"|\" in {} must be surrounded by single spaces", context
541                    )));
542                }
543            }
544        }
545        Ok(())
546    }
547
548    fn check_no_inline_comment(&self, value_str: &str) -> Result<(), ParseError> {
549        let mut in_str = false;
550        for ch in value_str.chars() {
551            if ch == '"' { in_str = !in_str; continue; }
552            if !in_str && ch == '#' {
553                return Err(self.err("E07: Inline comments are not permitted"));
554            }
555        }
556        Ok(())
557    }
558
    // ── Error helpers ─────────────────────────────────────────────────────────
560
561    fn err(&self, msg: &str) -> ParseError {
562        ParseError::new(self.line_num, msg)
563    }
564
565    fn errf(&self, msg: String) -> ParseError {
566        ParseError::new(self.line_num, msg)
567    }
568}
569
// ── Utilities ─────────────────────────────────────────────────────────────────
571
/// Splits `s` on the separator `" | "`, ignoring separators inside quoted strings.
///
/// Inside a string, a backslash escapes the next byte, so `\"` does not end
/// the string and a `" | "` that follows it is still string content. The
/// returned slices borrow from `s` and are not trimmed. An empty trailing
/// segment is not reported, so `""` yields an empty vector.
fn split_by_pipe(s: &str) -> Vec<&str> {
    let bytes = s.as_bytes();
    let mut parts = Vec::new();
    let mut start = 0;
    let mut in_str = false;
    let mut i = 0;

    while i < bytes.len() {
        let b = bytes[i];
        if in_str {
            if b == b'\\' {
                // Skip the escaped byte. Only ASCII bytes are ever inspected
                // and slices are cut at spaces, so UTF-8 boundaries stay valid.
                i += 2;
                continue;
            }
            if b == b'"' {
                in_str = false;
            }
        } else if b == b'"' {
            in_str = true;
        } else if bytes[i..].starts_with(b" | ") {
            parts.push(&s[start..i]);
            i += 3;
            start = i;
            continue;
        }
        i += 1;
    }

    if start < s.len() {
        parts.push(&s[start..]);
    }
    parts
}
593
/// Checks whether `s` has the shape of a number literal.
///
/// Accepted form: an optional single leading `-`, an integer part of ASCII
/// digits without a leading zero (except the literal `0`), an optional
/// `.digits` fraction, and an optional `e`/`E` exponent with an optional sign
/// and at least one digit. Only the syntax is checked, not the numeric range.
fn is_valid_number(s: &str) -> bool {
    // Non-empty and made of ASCII digits only.
    let all_digits = |text: &str| !text.is_empty() && text.bytes().all(|b| b.is_ascii_digit());

    let unsigned = s.strip_prefix('-').unwrap_or(s);
    if unsigned.is_empty() {
        return false;
    }

    // Peel off the exponent, if any, and validate it.
    let mantissa = match unsigned.find(|c: char| c == 'e' || c == 'E') {
        Some(pos) => {
            let exp = &unsigned[pos + 1..];
            let exp_digits = exp
                .strip_prefix('+')
                .or_else(|| exp.strip_prefix('-'))
                .unwrap_or(exp);
            if !all_digits(exp_digits) {
                return false;
            }
            &unsigned[..pos]
        }
        None => unsigned,
    };

    // Peel off the fraction, if any, and validate it.
    let int_part = match mantissa.find('.') {
        Some(pos) => {
            if !all_digits(&mantissa[pos + 1..]) {
                return false;
            }
            &mantissa[..pos]
        }
        None => mantissa,
    };

    // Integer part: no leading zeros (except literal "0")
    if int_part.len() > 1 && int_part.starts_with('0') {
        return false;
    }
    all_digits(int_part)
}
628
/// Checks whether `s` is an acceptable key: non-empty, not starting with `-`,
/// and made only of alphanumeric characters, `_` and `-`.
fn is_valid_key(s: &str) -> bool {
    match s.chars().next() {
        None | Some('-') => false,
        Some(_) => s
            .chars()
            .all(|c| matches!(c, '_' | '-') || c.is_alphanumeric()),
    }
}