sgf_parse/
lexer.rs

1pub fn tokenize(
2    text: &str,
3) -> impl Iterator<Item = Result<(Token, std::ops::Range<usize>), LexerError>> + '_ {
4    Lexer { text, cursor: 0 }
5}
6
/// A single lexical token of an SGF document.
// `Clone` and `Eq` added for consistency with `LexerError` (which derives
// `Clone, Copy, PartialEq, Eq`); `Copy` is impossible because `Property`
// owns `String`s. Both additions are backward compatible.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A `(` — opens a game tree.
    StartGameTree,
    /// A `)` — closes a game tree.
    EndGameTree,
    /// A `;` — starts a new node.
    StartNode,
    /// A property: its identifier and the list of its `[...]` values.
    Property((String, Vec<String>)),
}
14
/// Error type for failures to tokenize text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerError {
    /// An unexpected character was found where a property identifier was expected.
    UnexpectedPropertyIdentifier,
    /// The input ended (or a value began) with no property identifier present.
    MissingPropertyIdentifier,
    /// The input ended inside a property before its value(s) were terminated.
    UnexpectedEndOfProperty,
}
22
23impl std::fmt::Display for LexerError {
24    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
25        match self {
26            LexerError::UnexpectedPropertyIdentifier => {
27                write!(f, "Unexpected property identifier value")
28            }
29            LexerError::MissingPropertyIdentifier => {
30                write!(f, "Missing property identifier")
31            }
32            LexerError::UnexpectedEndOfProperty => write!(f, "Unexpected end of property"),
33        }
34    }
35}
36
// Marker impl: `LexerError` wraps no source error, so the default
// `std::error::Error` methods suffice.
impl std::error::Error for LexerError {}
38
/// Internal lexer state: the input text plus a byte-offset cursor.
struct Lexer<'a> {
    /// The full text being tokenized.
    text: &'a str,
    /// Byte offset of the next unread character in `text`.
    cursor: usize,
}
43
44impl Lexer<'_> {
45    fn trim_leading_whitespace(&mut self) {
46        while self.cursor < self.text.len()
47            && (self.text.as_bytes()[self.cursor] as char).is_ascii_whitespace()
48        {
49            self.cursor += 1;
50        }
51    }
52
53    fn get_char(&mut self) -> Option<char> {
54        let result = self.text[self.cursor..].chars().next();
55        result.iter().for_each(|c| self.cursor += c.len_utf8());
56
57        result
58    }
59
60    fn peek_char(&self) -> Option<char> {
61        self.text[self.cursor..].chars().next()
62    }
63
64    fn get_property(&mut self) -> Result<(String, Vec<String>), LexerError> {
65        Ok((self.get_prop_ident()?, self.get_prop_values()?))
66    }
67
68    fn get_prop_ident(&mut self) -> Result<String, LexerError> {
69        let mut prop_ident = vec![];
70        loop {
71            match self.peek_char() {
72                Some('[') => break,
73                Some(c) if c.is_ascii() => {
74                    self.cursor += 1;
75                    prop_ident.push(c);
76                }
77                Some(_c) => return Err(LexerError::UnexpectedEndOfProperty),
78                None => return Err(LexerError::MissingPropertyIdentifier),
79            }
80        }
81
82        Ok(prop_ident.iter().collect())
83    }
84
85    fn get_prop_values(&mut self) -> Result<Vec<String>, LexerError> {
86        let mut prop_values = vec![];
87        loop {
88            self.trim_leading_whitespace();
89            match self.peek_char() {
90                Some('[') => {
91                    self.cursor += 1;
92                    prop_values.push(self.get_prop_value()?);
93                }
94                _ => break,
95            }
96        }
97
98        Ok(prop_values)
99    }
100
101    fn get_prop_value(&mut self) -> Result<String, LexerError> {
102        let mut prop_value = vec![];
103        let mut escaped = false;
104        loop {
105            match self.get_char() {
106                Some(']') if !escaped => break,
107                Some('\\') if !escaped => escaped = true,
108                Some(c) => {
109                    escaped = false;
110                    prop_value.push(c);
111                }
112                None => return Err(LexerError::UnexpectedEndOfProperty),
113            }
114        }
115
116        Ok(prop_value.iter().collect())
117    }
118}
119
120impl Iterator for Lexer<'_> {
121    type Item = Result<(Token, std::ops::Range<usize>), LexerError>;
122
123    fn next(&mut self) -> Option<Self::Item> {
124        let span_start = self.cursor;
125        let token = match self.peek_char() {
126            Some('(') => {
127                self.cursor += 1;
128                Token::StartGameTree
129            }
130            Some(')') => {
131                self.cursor += 1;
132                Token::EndGameTree
133            }
134            Some(';') => {
135                self.cursor += 1;
136                Token::StartNode
137            }
138            None => return None,
139            _ => match self.get_property() {
140                Ok(property) => Token::Property(property),
141                Err(e) => return Some(Err(e)),
142            },
143        };
144        let span = span_start..self.cursor;
145        self.trim_leading_whitespace();
146
147        Some(Ok((token, span)))
148    }
149}
150
#[cfg(test)]
mod test {
    use super::tokenize;
    use super::Token::*;

    #[test]
    fn lexer() {
        let sgf = "(;SZ[9]C[Some comment];B[de];W[fe])(;B[de];W[ff])";
        // Unwrap each token individually; any lexer error fails the test.
        let lexed: Vec<_> = tokenize(sgf).map(|result| result.unwrap()).collect();

        assert_eq!(
            lexed,
            vec![
                (StartGameTree, 0..1),
                (StartNode, 1..2),
                (Property(("SZ".to_string(), vec!["9".to_string()])), 2..7),
                (
                    Property(("C".to_string(), vec!["Some comment".to_string()])),
                    7..22,
                ),
                (StartNode, 22..23),
                (Property(("B".to_string(), vec!["de".to_string()])), 23..28),
                (StartNode, 28..29),
                (Property(("W".to_string(), vec!["fe".to_string()])), 29..34),
                (EndGameTree, 34..35),
                (StartGameTree, 35..36),
                (StartNode, 36..37),
                (Property(("B".to_string(), vec!["de".to_string()])), 37..42),
                (StartNode, 42..43),
                (Property(("W".to_string(), vec!["ff".to_string()])), 43..48),
                (EndGameTree, 48..49),
            ]
        );
    }

    #[test]
    fn handles_old_style_properties() {
        // Pre-FF[4] identifiers may contain lowercase letters.
        let sgf = "(;CoPyright[text])";
        let lexed: Vec<_> = tokenize(sgf).map(|result| result.unwrap()).collect();

        assert_eq!(
            lexed,
            vec![
                (StartGameTree, 0..1),
                (StartNode, 1..2),
                (
                    Property(("CoPyright".to_string(), vec!["text".to_string()])),
                    2..17,
                ),
                (EndGameTree, 17..18),
            ]
        );
    }
}