Skip to main content

ooxml_codegen/
parser.rs

1//! Parser for RELAX NG Compact syntax.
2
3use crate::ast::{DatatypeParam, Definition, NameClass, Namespace, Pattern, QName, Schema};
4use crate::lexer::Token;
5
6/// Parser state.
7pub struct Parser {
8    tokens: Vec<Token>,
9    pos: usize,
10}
11
12impl Parser {
13    pub fn new(tokens: Vec<Token>) -> Self {
14        Self { tokens, pos: 0 }
15    }
16
17    pub fn parse(mut self) -> Result<Schema, ParseError> {
18        let mut namespaces = Vec::new();
19        let mut definitions = Vec::new();
20
21        while !self.at_end() {
22            if self.check(&Token::Namespace) || self.check(&Token::Default) {
23                namespaces.push(self.parse_namespace()?);
24            } else if let Some(Token::Ident(_)) = self.peek() {
25                definitions.push(self.parse_definition()?);
26            } else {
27                return Err(self.error("expected namespace or definition"));
28            }
29        }
30
31        Ok(Schema {
32            namespaces,
33            definitions,
34        })
35    }
36
37    fn parse_namespace(&mut self) -> Result<Namespace, ParseError> {
38        let is_default = self.check(&Token::Default);
39        if is_default {
40            self.advance();
41        }
42        self.expect(&Token::Namespace)?;
43
44        // Handle both forms:
45        // - `namespace prefix = "uri"`
46        // - `default namespace = "uri"` (no prefix for default namespace)
47        let prefix = if self.check(&Token::Equals) {
48            // No prefix - use empty string to indicate default namespace
49            String::new()
50        } else {
51            self.expect_ident()?
52        };
53        self.expect(&Token::Equals)?;
54        let uri = self.expect_string()?;
55
56        Ok(Namespace {
57            prefix,
58            uri,
59            is_default,
60        })
61    }
62
63    fn parse_definition(&mut self) -> Result<Definition, ParseError> {
64        let name = self.expect_ident()?;
65        self.expect(&Token::Equals)?;
66        let pattern = self.parse_pattern()?;
67
68        Ok(Definition {
69            name,
70            pattern,
71            doc_comment: None,
72        })
73    }
74
75    fn parse_pattern(&mut self) -> Result<Pattern, ParseError> {
76        self.parse_interleave()
77    }
78
79    // Interleave has lowest precedence: a & b & c
80    fn parse_interleave(&mut self) -> Result<Pattern, ParseError> {
81        let mut left = self.parse_choice()?;
82
83        while self.check(&Token::Ampersand) {
84            self.advance();
85            let right = self.parse_choice()?;
86            left = match left {
87                Pattern::Interleave(mut v) => {
88                    v.push(right);
89                    Pattern::Interleave(v)
90                }
91                _ => Pattern::Interleave(vec![left, right]),
92            };
93        }
94
95        Ok(left)
96    }
97
98    // Choice: a | b | c
99    fn parse_choice(&mut self) -> Result<Pattern, ParseError> {
100        let mut left = self.parse_sequence()?;
101
102        while self.check(&Token::Pipe) {
103            self.advance();
104            let right = self.parse_sequence()?;
105            left = match left {
106                Pattern::Choice(mut v) => {
107                    v.push(right);
108                    Pattern::Choice(v)
109                }
110                _ => Pattern::Choice(vec![left, right]),
111            };
112        }
113
114        Ok(left)
115    }
116
117    // Sequence: a, b, c
118    fn parse_sequence(&mut self) -> Result<Pattern, ParseError> {
119        let mut left = self.parse_postfix()?;
120
121        while self.check(&Token::Comma) {
122            self.advance();
123            let right = self.parse_postfix()?;
124            left = match left {
125                Pattern::Sequence(mut v) => {
126                    v.push(right);
127                    Pattern::Sequence(v)
128                }
129                _ => Pattern::Sequence(vec![left, right]),
130            };
131        }
132
133        Ok(left)
134    }
135
136    // Postfix operators: ?, *, +
137    fn parse_postfix(&mut self) -> Result<Pattern, ParseError> {
138        let mut pattern = self.parse_primary()?;
139
140        loop {
141            if self.check(&Token::Question) {
142                self.advance();
143                pattern = Pattern::Optional(Box::new(pattern));
144            } else if self.check(&Token::Star) {
145                self.advance();
146                pattern = Pattern::ZeroOrMore(Box::new(pattern));
147            } else if self.check(&Token::Plus) {
148                self.advance();
149                pattern = Pattern::OneOrMore(Box::new(pattern));
150            } else {
151                break;
152            }
153        }
154
155        Ok(pattern)
156    }
157
158    // Primary patterns
159    fn parse_primary(&mut self) -> Result<Pattern, ParseError> {
160        if self.check(&Token::Empty) {
161            self.advance();
162            return Ok(Pattern::Empty);
163        }
164
165        if self.check(&Token::String) {
166            self.advance();
167            let value = self.expect_string()?;
168            return Ok(Pattern::StringLiteral(value));
169        }
170
171        if self.check(&Token::Element) {
172            return self.parse_element();
173        }
174
175        if self.check(&Token::Attribute) {
176            return self.parse_attribute();
177        }
178
179        if self.check(&Token::Mixed) {
180            self.advance();
181            self.expect(&Token::LBrace)?;
182            let inner = self.parse_pattern()?;
183            self.expect(&Token::RBrace)?;
184            return Ok(Pattern::Mixed(Box::new(inner)));
185        }
186
187        if self.check(&Token::List) {
188            self.advance();
189            self.expect(&Token::LBrace)?;
190            let inner = self.parse_pattern()?;
191            self.expect(&Token::RBrace)?;
192            return Ok(Pattern::List(Box::new(inner)));
193        }
194
195        if self.check(&Token::Text) {
196            self.advance();
197            return Ok(Pattern::Text);
198        }
199
200        // Bare string literal (RELAX NG value pattern): "literal"
201        if let Some(Token::QuotedString(_)) = self.peek() {
202            let value = self.expect_string()?;
203            return Ok(Pattern::StringLiteral(value));
204        }
205
206        if self.check(&Token::LParen) {
207            self.advance();
208            let inner = self.parse_pattern()?;
209            self.expect(&Token::RParen)?;
210            return Ok(Pattern::Group(Box::new(inner)));
211        }
212
213        // Identifier - could be a reference or datatype
214        if let Some(Token::Ident(_)) = self.peek() {
215            let name = self.expect_ident()?;
216
217            // Check for datatype with colon (e.g., xsd:integer, xsd:string)
218            if self.check(&Token::Colon) {
219                self.advance();
220                let type_name = self.expect_ident_or_keyword()?;
221
222                // Check for params { ... } or literal value "..."
223                if self.check(&Token::LBrace) {
224                    let params = self.parse_datatype_params()?;
225                    return Ok(Pattern::Datatype {
226                        library: name,
227                        name: type_name,
228                        params,
229                    });
230                } else if let Some(Token::QuotedString(_)) = self.peek() {
231                    // Datatype with literal value pattern: xsd:int "255"
232                    let value = self.expect_string()?;
233                    return Ok(Pattern::Datatype {
234                        library: name,
235                        name: type_name,
236                        params: vec![DatatypeParam {
237                            name: "pattern".to_string(),
238                            value,
239                        }],
240                    });
241                } else {
242                    return Ok(Pattern::Datatype {
243                        library: name,
244                        name: type_name,
245                        params: vec![],
246                    });
247                }
248            }
249
250            return Ok(Pattern::Ref(name));
251        }
252
253        Err(self.error("expected pattern"))
254    }
255
256    fn parse_element(&mut self) -> Result<Pattern, ParseError> {
257        self.expect(&Token::Element)?;
258        let name_class = self.parse_name_class()?;
259        self.expect(&Token::LBrace)?;
260        let pattern = self.parse_pattern()?;
261        self.expect(&Token::RBrace)?;
262
263        // Convert NameClass to QName for simple cases, use placeholder for wildcards
264        let name = match name_class {
265            NameClass::Name(qn) => qn,
266            _ => QName {
267                prefix: None,
268                local: "_any".to_string(),
269            },
270        };
271
272        Ok(Pattern::Element {
273            name,
274            pattern: Box::new(pattern),
275        })
276    }
277
278    fn parse_attribute(&mut self) -> Result<Pattern, ParseError> {
279        self.expect(&Token::Attribute)?;
280        let name_class = self.parse_name_class()?;
281        self.expect(&Token::LBrace)?;
282        let pattern = self.parse_pattern()?;
283        self.expect(&Token::RBrace)?;
284
285        // Convert NameClass to QName for simple cases, use placeholder for wildcards
286        let name = match name_class {
287            NameClass::Name(qn) => qn,
288            _ => QName {
289                prefix: None,
290                local: "_any".to_string(),
291            },
292        };
293
294        Ok(Pattern::Attribute {
295            name,
296            pattern: Box::new(pattern),
297        })
298    }
299
300    /// Parse a name class (handles wildcards, namespace wildcards, and exclusions).
301    fn parse_name_class(&mut self) -> Result<NameClass, ParseError> {
302        let left = self.parse_name_class_primary()?;
303
304        // Check for subtraction: `nc - nc`
305        if self.check(&Token::Minus) {
306            self.advance();
307            let right = self.parse_name_class_primary()?;
308            return Ok(NameClass::Except(Box::new(left), Box::new(right)));
309        }
310
311        Ok(left)
312    }
313
314    fn parse_name_class_primary(&mut self) -> Result<NameClass, ParseError> {
315        // Check for wildcard `*`
316        if self.check(&Token::Star) {
317            self.advance();
318            return Ok(NameClass::AnyName);
319        }
320
321        // Check for parenthesized name class (for choices/groups)
322        if self.check(&Token::LParen) {
323            self.advance();
324            let mut choices = vec![self.parse_name_class()?];
325            while self.check(&Token::Pipe) {
326                self.advance();
327                choices.push(self.parse_name_class()?);
328            }
329            self.expect(&Token::RParen)?;
330            if choices.len() == 1 {
331                return Ok(choices.pop().unwrap());
332            }
333            return Ok(NameClass::Choice(choices));
334        }
335
336        // Otherwise, parse as a QName (possibly with ns:* wildcard)
337        let qname = self.parse_qname()?;
338
339        // Check if it's a namespace wildcard: `ns:*`
340        if qname.local == "*" {
341            if let Some(prefix) = qname.prefix {
342                return Ok(NameClass::NsName(prefix));
343            }
344            return Ok(NameClass::AnyName);
345        }
346
347        Ok(NameClass::Name(qname))
348    }
349
350    fn parse_qname(&mut self) -> Result<QName, ParseError> {
351        // Element/attribute names can be keywords (e.g., "default", "string")
352        let first = self.expect_name()?;
353
354        if self.check(&Token::Colon) {
355            self.advance();
356            // After colon, could be a name or `*` for namespace wildcard
357            let local = if self.check(&Token::Star) {
358                self.advance();
359                "*".to_string()
360            } else {
361                self.expect_name()?
362            };
363            Ok(QName {
364                prefix: Some(first),
365                local,
366            })
367        } else {
368            Ok(QName {
369                prefix: None,
370                local: first,
371            })
372        }
373    }
374
375    /// Accept any token that can be a name (identifier or keyword).
376    fn expect_name(&mut self) -> Result<String, ParseError> {
377        match self.peek() {
378            Some(Token::Ident(s)) => {
379                let s = s.clone();
380                self.advance();
381                Ok(s)
382            }
383            Some(Token::String) => {
384                self.advance();
385                Ok("string".to_string())
386            }
387            Some(Token::Default) => {
388                self.advance();
389                Ok("default".to_string())
390            }
391            Some(Token::Element) => {
392                self.advance();
393                Ok("element".to_string())
394            }
395            Some(Token::Attribute) => {
396                self.advance();
397                Ok("attribute".to_string())
398            }
399            Some(Token::Namespace) => {
400                self.advance();
401                Ok("namespace".to_string())
402            }
403            Some(Token::Empty) => {
404                self.advance();
405                Ok("empty".to_string())
406            }
407            Some(Token::Mixed) => {
408                self.advance();
409                Ok("mixed".to_string())
410            }
411            Some(Token::List) => {
412                self.advance();
413                Ok("list".to_string())
414            }
415            Some(Token::Text) => {
416                self.advance();
417                Ok("text".to_string())
418            }
419            _ => Err(self.error("expected name")),
420        }
421    }
422
423    fn parse_datatype_params(&mut self) -> Result<Vec<DatatypeParam>, ParseError> {
424        if !self.check(&Token::LBrace) {
425            return Ok(Vec::new());
426        }
427        self.advance();
428
429        let mut params = Vec::new();
430        while !self.check(&Token::RBrace) {
431            let name = self.expect_ident()?;
432            self.expect(&Token::Equals)?;
433            let value = self.expect_string()?;
434            params.push(DatatypeParam { name, value });
435        }
436        self.expect(&Token::RBrace)?;
437
438        Ok(params)
439    }
440
441    // Helper methods
442
443    fn peek(&self) -> Option<&Token> {
444        self.tokens.get(self.pos)
445    }
446
447    fn check(&self, token: &Token) -> bool {
448        self.peek()
449            .is_some_and(|t| std::mem::discriminant(t) == std::mem::discriminant(token))
450    }
451
452    fn at_end(&self) -> bool {
453        matches!(self.peek(), Some(Token::Eof) | None)
454    }
455
456    fn advance(&mut self) -> Option<&Token> {
457        if !self.at_end() {
458            self.pos += 1;
459        }
460        self.tokens.get(self.pos - 1)
461    }
462
463    fn expect(&mut self, expected: &Token) -> Result<(), ParseError> {
464        if self.check(expected) {
465            self.advance();
466            Ok(())
467        } else {
468            Err(self.error(&format!("expected {:?}", expected)))
469        }
470    }
471
472    fn expect_ident(&mut self) -> Result<String, ParseError> {
473        match self.peek() {
474            Some(Token::Ident(s)) => {
475                let s = s.clone();
476                self.advance();
477                Ok(s)
478            }
479            _ => Err(self.error("expected identifier")),
480        }
481    }
482
483    /// Like expect_ident but also accepts keywords that can be type names (e.g., "string" in xsd:string).
484    fn expect_ident_or_keyword(&mut self) -> Result<String, ParseError> {
485        match self.peek() {
486            Some(Token::Ident(s)) => {
487                let s = s.clone();
488                self.advance();
489                Ok(s)
490            }
491            Some(Token::String) => {
492                self.advance();
493                Ok("string".to_string())
494            }
495            _ => Err(self.error("expected identifier or type name")),
496        }
497    }
498
499    fn expect_string(&mut self) -> Result<String, ParseError> {
500        match self.peek() {
501            Some(Token::QuotedString(s)) => {
502                let s = s.clone();
503                self.advance();
504                Ok(s)
505            }
506            _ => Err(self.error("expected quoted string")),
507        }
508    }
509
510    fn error(&self, msg: &str) -> ParseError {
511        ParseError {
512            message: msg.to_string(),
513            position: self.pos,
514            token: self.peek().cloned(),
515        }
516    }
517}
518
519#[derive(Debug, thiserror::Error)]
520#[error("parse error at position {position}: {message} (found {:?})", token)]
521pub struct ParseError {
522    pub message: String,
523    pub position: usize,
524    pub token: Option<Token>,
525}
526
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lexer::Lexer;

    /// Tokenizes and parses `input`, panicking if either stage fails.
    fn parse(input: &str) -> Schema {
        let tokens = Lexer::new(input).tokenize().unwrap();
        let parser = Parser::new(tokens);
        parser.parse().unwrap()
    }

    /// A definition whose body is the `empty` keyword.
    #[test]
    fn test_empty_definition() {
        let schema = parse("w_CT_Empty = empty");
        assert_eq!(schema.definitions.len(), 1);

        let definition = &schema.definitions[0];
        assert_eq!(definition.name, "w_CT_Empty");
        assert!(matches!(definition.pattern, Pattern::Empty));
    }

    /// Three `|`-separated alternatives end up in one flat choice.
    #[test]
    fn test_choice() {
        let schema = parse(r#"w_ST_Foo = string "a" | string "b" | string "c""#);
        assert_eq!(schema.definitions.len(), 1);

        let Pattern::Choice(alternatives) = &schema.definitions[0].pattern else {
            panic!("expected choice");
        };
        assert_eq!(alternatives.len(), 3);
    }

    /// An optional attribute keeps both the prefix and local part of its name.
    #[test]
    fn test_attribute() {
        let schema = parse("w_CT_OnOff = attribute w:val { s_ST_OnOff }?");
        assert_eq!(schema.definitions.len(), 1);

        let Pattern::Optional(inner) = &schema.definitions[0].pattern else {
            panic!("expected optional");
        };
        let Pattern::Attribute { name, .. } = inner.as_ref() else {
            panic!("expected attribute");
        };
        assert_eq!(name.prefix, Some(String::from("w")));
        assert_eq!(name.local, "val");
    }

    /// `default namespace w = "..."` is marked default and keeps its prefix.
    #[test]
    fn test_namespace() {
        let schema = parse(r#"default namespace w = "http://example.com""#);
        assert_eq!(schema.namespaces.len(), 1);

        let namespace = &schema.namespaces[0];
        assert!(namespace.is_default);
        assert_eq!(namespace.prefix, "w");
    }
}