// sgf_parse/lexer.rs

1pub fn tokenize(
2    text: &str,
3) -> impl Iterator<Item = Result<(Token, std::ops::Range<usize>), LexerError>> + '_ {
4    Lexer { text, cursor: 0 }
5}
6
/// A lexical unit of SGF text.
#[derive(Debug, PartialEq)]
pub enum Token {
    /// Produced for a `(` character.
    StartGameTree,
    /// Produced for a `)` character.
    EndGameTree,
    /// Produced for a `;` character.
    StartNode,
    /// A property: its identifier, followed by each bracketed value attached to it in order.
    Property((String, Vec<String>)),
}
14
/// Describes why text could not be tokenized.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerError {
    /// Displayed as "Unexpected property identifier value".
    ///
    /// NOTE(review): not constructed anywhere in the lexer code visible in this file.
    UnexpectedPropertyIdentifier,
    /// The input ended while a property identifier was still being read (no `[` was found).
    MissingPropertyIdentifier,
    /// A non-ASCII character was found while reading a property identifier.
    UnexpectedEndOfPropertyIdentifier,
    /// The input ended before the `]` that closes a property value.
    UnexpectedEndOfPropertyValue,
}
23
24impl std::fmt::Display for LexerError {
25    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
26        match self {
27            LexerError::UnexpectedPropertyIdentifier => {
28                write!(f, "Unexpected property identifier value")
29            }
30            LexerError::MissingPropertyIdentifier => {
31                write!(f, "Missing property identifier")
32            }
33            LexerError::UnexpectedEndOfPropertyIdentifier => {
34                write!(f, "Unexpected end of property identifier")
35            }
36            LexerError::UnexpectedEndOfPropertyValue => {
37                write!(f, "Unexpected end of property value")
38            }
39        }
40    }
41}
42
43impl std::error::Error for LexerError {}
44
/// Tokenizer state: the borrowed input plus the position reached so far.
struct Lexer<'a> {
    /// The complete text being tokenized.
    text: &'a str,
    /// Byte offset into `text` of the next unread character.
    cursor: usize,
}
49
50impl Lexer<'_> {
51    fn trim_leading_whitespace(&mut self) {
52        while self.cursor < self.text.len()
53            && (self.text.as_bytes()[self.cursor] as char).is_ascii_whitespace()
54        {
55            self.cursor += 1;
56        }
57    }
58
59    fn get_char(&mut self) -> Option<char> {
60        let result = self.text[self.cursor..].chars().next();
61        result.iter().for_each(|c| self.cursor += c.len_utf8());
62
63        result
64    }
65
66    fn peek_char(&self) -> Option<char> {
67        self.text[self.cursor..].chars().next()
68    }
69
70    fn get_property(&mut self) -> Result<(String, Vec<String>), LexerError> {
71        Ok((self.get_prop_ident()?, self.get_prop_values()?))
72    }
73
74    fn get_prop_ident(&mut self) -> Result<String, LexerError> {
75        let mut prop_ident = vec![];
76        loop {
77            match self.peek_char() {
78                Some('[') => break,
79                Some(c) if c.is_ascii() => {
80                    self.cursor += 1;
81                    prop_ident.push(c);
82                }
83                Some(_c) => return Err(LexerError::UnexpectedEndOfPropertyIdentifier),
84                None => return Err(LexerError::MissingPropertyIdentifier),
85            }
86        }
87
88        Ok(prop_ident.iter().collect())
89    }
90
91    fn get_prop_values(&mut self) -> Result<Vec<String>, LexerError> {
92        let mut prop_values = vec![];
93        loop {
94            self.trim_leading_whitespace();
95            match self.peek_char() {
96                Some('[') => {
97                    self.cursor += 1;
98                    prop_values.push(self.get_prop_value()?);
99                }
100                _ => break,
101            }
102        }
103
104        Ok(prop_values)
105    }
106
107    fn get_prop_value(&mut self) -> Result<String, LexerError> {
108        let mut prop_value = vec![];
109        let mut escaped = false;
110        loop {
111            match self.get_char() {
112                Some(']') if !escaped => break,
113                Some('\\') if !escaped => escaped = true,
114                Some(c) => {
115                    escaped = false;
116                    prop_value.push(c);
117                }
118                None => return Err(LexerError::UnexpectedEndOfPropertyValue),
119            }
120        }
121
122        Ok(prop_value.iter().collect())
123    }
124}
125
126impl Iterator for Lexer<'_> {
127    type Item = Result<(Token, std::ops::Range<usize>), LexerError>;
128
129    fn next(&mut self) -> Option<Self::Item> {
130        let span_start = self.cursor;
131        let token = match self.peek_char() {
132            Some('(') => {
133                self.cursor += 1;
134                Token::StartGameTree
135            }
136            Some(')') => {
137                self.cursor += 1;
138                Token::EndGameTree
139            }
140            Some(';') => {
141                self.cursor += 1;
142                Token::StartNode
143            }
144            None => return None,
145            _ => match self.get_property() {
146                Ok(property) => Token::Property(property),
147                Err(e) => return Some(Err(e)),
148            },
149        };
150        let span = span_start..self.cursor;
151        self.trim_leading_whitespace();
152
153        Some(Ok((token, span)))
154    }
155}
156
#[cfg(test)]
mod test {
    use super::tokenize;
    use super::Token::*;

    /// Expected `Property` token for an identifier carrying exactly one value.
    fn prop(ident: &str, value: &str) -> super::Token {
        Property((ident.to_string(), vec![value.to_string()]))
    }

    /// Tokenizes `sgf`, panicking if the lexer reports an error.
    fn lex(sgf: &str) -> Vec<(super::Token, std::ops::Range<usize>)> {
        tokenize(sgf).collect::<Result<Vec<_>, _>>().unwrap()
    }

    /// Two sibling game trees: checks every token and its byte span.
    #[test]
    fn lexer() {
        let expected = vec![
            (StartGameTree, 0..1),
            (StartNode, 1..2),
            (prop("SZ", "9"), 2..7),
            (prop("C", "Some comment"), 7..22),
            (StartNode, 22..23),
            (prop("B", "de"), 23..28),
            (StartNode, 28..29),
            (prop("W", "fe"), 29..34),
            (EndGameTree, 34..35),
            (StartGameTree, 35..36),
            (StartNode, 36..37),
            (prop("B", "de"), 37..42),
            (StartNode, 42..43),
            (prop("W", "ff"), 43..48),
            (EndGameTree, 48..49),
        ];

        let tokens = lex("(;SZ[9]C[Some comment];B[de];W[fe])(;B[de];W[ff])");

        assert_eq!(tokens, expected);
    }

    /// Identifiers containing lowercase letters are passed through verbatim.
    #[test]
    fn handles_old_style_properties() {
        let expected = vec![
            (StartGameTree, 0..1),
            (StartNode, 1..2),
            (prop("CoPyright", "text"), 2..17),
            (EndGameTree, 17..18),
        ];

        let tokens = lex("(;CoPyright[text])");

        assert_eq!(tokens, expected);
    }
}