// ooxml_codegen/src/lexer.rs
1//! Lexer for RELAX NG Compact syntax.
2
3use std::iter::Peekable;
4use std::str::Chars;
5
/// Token types for RNC (RELAX NG Compact syntax).
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` since every
/// payload type (`String`) supports them; this lets tokens be used as
/// map/set keys and in exhaustive equality checks.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    // Keywords
    /// `namespace` declaration keyword.
    Namespace,
    /// `default` (as in `default namespace`).
    Default,
    /// `element` pattern keyword.
    Element,
    /// `attribute` pattern keyword.
    Attribute,
    /// `empty` content keyword.
    Empty,
    /// `string` datatype keyword.
    String,
    /// `mixed` content keyword.
    Mixed,
    /// `list` pattern keyword.
    List,
    /// `text` pattern keyword.
    Text,
    // Identifiers and literals
    /// A name that is not a reserved keyword (e.g. `w_CT_Empty`).
    Ident(String),
    /// A double-quoted string literal, with escapes already resolved.
    QuotedString(String),
    // Symbols
    /// `=`
    Equals,
    /// `,`
    Comma,
    /// `|` (choice)
    Pipe,
    /// `&` (interleave)
    Ampersand,
    /// `?` (optional)
    Question,
    /// `*` (zero or more)
    Star,
    /// `+` (one or more)
    Plus,
    /// `-` (exception)
    Minus,
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `:` (namespace-qualified names)
    Colon,
    // Documentation
    /// A `##` documentation comment.
    /// NOTE(review): no lexer path currently produces this variant — confirm
    /// whether doc-comment capture is planned.
    DocComment(String),
    // End of file
    /// End of input; always the final token of a stream.
    Eof,
}
41
/// Lexer state.
///
/// Wraps a peekable character iterator over the source text and tracks the
/// current line number for error reporting.
pub struct Lexer<'a> {
    // Remaining input; `peek()` provides the one-character lookahead used
    // throughout the lexer.
    input: Peekable<Chars<'a>>,
    // 1-based line counter, advanced each time a '\n' is consumed.
    current_line: usize,
}
47
48impl<'a> Lexer<'a> {
49    pub fn new(input: &'a str) -> Self {
50        Self {
51            input: input.chars().peekable(),
52            current_line: 1,
53        }
54    }
55
56    pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
57        let mut tokens = Vec::new();
58        loop {
59            let token = self.next_token()?;
60            if token == Token::Eof {
61                tokens.push(token);
62                break;
63            }
64            tokens.push(token);
65        }
66        Ok(tokens)
67    }
68
69    fn next_token(&mut self) -> Result<Token, LexError> {
70        self.skip_whitespace_and_comments();
71
72        let Some(ch) = self.input.peek().copied() else {
73            return Ok(Token::Eof);
74        };
75
76        match ch {
77            '=' => {
78                self.input.next();
79                Ok(Token::Equals)
80            }
81            ',' => {
82                self.input.next();
83                Ok(Token::Comma)
84            }
85            '|' => {
86                self.input.next();
87                Ok(Token::Pipe)
88            }
89            '&' => {
90                self.input.next();
91                Ok(Token::Ampersand)
92            }
93            '?' => {
94                self.input.next();
95                Ok(Token::Question)
96            }
97            '*' => {
98                self.input.next();
99                Ok(Token::Star)
100            }
101            '+' => {
102                self.input.next();
103                Ok(Token::Plus)
104            }
105            '-' => {
106                self.input.next();
107                Ok(Token::Minus)
108            }
109            '{' => {
110                self.input.next();
111                Ok(Token::LBrace)
112            }
113            '}' => {
114                self.input.next();
115                Ok(Token::RBrace)
116            }
117            '(' => {
118                self.input.next();
119                Ok(Token::LParen)
120            }
121            ')' => {
122                self.input.next();
123                Ok(Token::RParen)
124            }
125            ':' => {
126                self.input.next();
127                Ok(Token::Colon)
128            }
129            '"' => self.read_quoted_string(),
130            _ if ch.is_alphabetic() || ch == '_' => self.read_ident(),
131            _ => Err(LexError::UnexpectedChar(ch, self.current_line)),
132        }
133    }
134
135    fn skip_whitespace_and_comments(&mut self) {
136        loop {
137            // Skip whitespace
138            while let Some(&ch) = self.input.peek() {
139                if ch == '\n' {
140                    self.current_line += 1;
141                    self.input.next();
142                } else if ch.is_whitespace() {
143                    self.input.next();
144                } else {
145                    break;
146                }
147            }
148
149            // Check for comments (# or ##)
150            if self.input.peek() == Some(&'#') {
151                self.input.next(); // consume first #
152                // Check for doc comment (##)
153                let _is_doc = self.input.peek() == Some(&'#');
154                if _is_doc {
155                    self.input.next();
156                }
157                // Skip to end of line
158                while let Some(&ch) = self.input.peek() {
159                    if ch == '\n' {
160                        self.current_line += 1;
161                        self.input.next();
162                        break;
163                    }
164                    self.input.next();
165                }
166            } else {
167                break;
168            }
169        }
170    }
171
172    fn read_quoted_string(&mut self) -> Result<Token, LexError> {
173        self.input.next(); // consume opening quote
174        let mut s = String::new();
175        loop {
176            match self.input.next() {
177                Some('"') => break,
178                Some('\\') => {
179                    // Handle escape sequences
180                    match self.input.next() {
181                        Some('n') => s.push('\n'),
182                        Some('t') => s.push('\t'),
183                        Some('\\') => s.push('\\'),
184                        Some('"') => s.push('"'),
185                        Some(ch) => s.push(ch),
186                        None => return Err(LexError::UnterminatedString(self.current_line)),
187                    }
188                }
189                Some('\n') => {
190                    self.current_line += 1;
191                    s.push('\n');
192                }
193                Some(ch) => s.push(ch),
194                None => return Err(LexError::UnterminatedString(self.current_line)),
195            }
196        }
197        Ok(Token::QuotedString(s))
198    }
199
200    fn read_ident(&mut self) -> Result<Token, LexError> {
201        let mut s = String::new();
202        while let Some(&ch) = self.input.peek() {
203            if ch.is_alphanumeric() || ch == '_' || ch == '-' {
204                s.push(ch);
205                self.input.next();
206            } else {
207                break;
208            }
209        }
210        let token = match s.as_str() {
211            "namespace" => Token::Namespace,
212            "default" => Token::Default,
213            "element" => Token::Element,
214            "attribute" => Token::Attribute,
215            "empty" => Token::Empty,
216            "string" => Token::String,
217            "mixed" => Token::Mixed,
218            "list" => Token::List,
219            "text" => Token::Text,
220            _ => Token::Ident(s),
221        };
222        Ok(token)
223    }
224}
225
/// Errors produced while lexing RNC input. Each variant carries the
/// 1-based line number at which the problem was detected.
#[derive(Debug, thiserror::Error)]
pub enum LexError {
    /// A character that cannot start any token (not a symbol, quote,
    /// letter, or underscore).
    #[error("unexpected character '{0}' at line {1}")]
    UnexpectedChar(char, usize),
    /// Input ended inside a quoted string (including mid-escape).
    #[error("unterminated string at line {0}")]
    UnterminatedString(usize),
}
233
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the lexer over `input`, panicking on any lex error.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input).tokenize().expect("lexing should succeed")
    }

    #[test]
    fn test_simple_definition() {
        let tokens = lex("w_CT_Empty = empty");
        let expected = vec![
            Token::Ident("w_CT_Empty".to_string()),
            Token::Equals,
            Token::Empty,
            Token::Eof,
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_attribute() {
        let tokens = lex("attribute w:val { s_ST_String }");
        let expected = vec![
            Token::Attribute,
            Token::Ident("w".to_string()),
            Token::Colon,
            Token::Ident("val".to_string()),
            Token::LBrace,
            Token::Ident("s_ST_String".to_string()),
            Token::RBrace,
            Token::Eof,
        ];
        assert_eq!(tokens, expected);
    }

    #[test]
    fn test_choice() {
        let tokens = lex("string \"foo\" | string \"bar\"");
        let expected = vec![
            Token::String,
            Token::QuotedString("foo".to_string()),
            Token::Pipe,
            Token::String,
            Token::QuotedString("bar".to_string()),
            Token::Eof,
        ];
        assert_eq!(tokens, expected);
    }
}