1pub fn tokenize(
2 text: &str,
3) -> impl Iterator<Item = Result<(Token, std::ops::Range<usize>), LexerError>> + '_ {
4 Lexer { text, cursor: 0 }
5}
6
/// A lexical token produced by the lexer.
///
/// `Clone` and `Eq` are derived in addition to `Debug`/`PartialEq` so that
/// callers can copy tokens and use them wherever full equality is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Emitted for a `(` character.
    StartGameTree,
    /// Emitted for a `)` character.
    EndGameTree,
    /// Emitted for a `;` character.
    StartNode,
    /// A property: its identifier followed by the text of each bracketed
    /// value, in order of appearance.
    Property((String, Vec<String>)),
}
14
/// Errors reported while lexing a property.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LexerError {
    /// Displayed as "Unexpected property identifier value".
    UnexpectedPropertyIdentifier,
    /// Displayed as "Missing property identifier".
    MissingPropertyIdentifier,
    /// Displayed as "Unexpected end of property".
    UnexpectedEndOfProperty,
}

impl std::fmt::Display for LexerError {
    /// Writes the fixed, human-readable message for this error.
    ///
    /// Width/alignment flags on the formatter are not applied; the message is
    /// written verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Self::UnexpectedPropertyIdentifier => "Unexpected property identifier value",
            Self::MissingPropertyIdentifier => "Missing property identifier",
            Self::UnexpectedEndOfProperty => "Unexpected end of property",
        };
        f.write_str(message)
    }
}

impl std::error::Error for LexerError {}
38
39struct Lexer<'a> {
40 text: &'a str,
41 cursor: usize,
42}
43
44impl Lexer<'_> {
45 fn trim_leading_whitespace(&mut self) {
46 while self.cursor < self.text.len()
47 && (self.text.as_bytes()[self.cursor] as char).is_ascii_whitespace()
48 {
49 self.cursor += 1;
50 }
51 }
52
53 fn get_char(&mut self) -> Option<char> {
54 let result = self.text[self.cursor..].chars().next();
55 result.iter().for_each(|c| self.cursor += c.len_utf8());
56
57 result
58 }
59
60 fn peek_char(&self) -> Option<char> {
61 self.text[self.cursor..].chars().next()
62 }
63
64 fn get_property(&mut self) -> Result<(String, Vec<String>), LexerError> {
65 Ok((self.get_prop_ident()?, self.get_prop_values()?))
66 }
67
68 fn get_prop_ident(&mut self) -> Result<String, LexerError> {
69 let mut prop_ident = vec![];
70 loop {
71 match self.peek_char() {
72 Some('[') => break,
73 Some(c) if c.is_ascii() => {
74 self.cursor += 1;
75 prop_ident.push(c);
76 }
77 Some(_c) => return Err(LexerError::UnexpectedEndOfProperty),
78 None => return Err(LexerError::MissingPropertyIdentifier),
79 }
80 }
81
82 Ok(prop_ident.iter().collect())
83 }
84
85 fn get_prop_values(&mut self) -> Result<Vec<String>, LexerError> {
86 let mut prop_values = vec![];
87 loop {
88 self.trim_leading_whitespace();
89 match self.peek_char() {
90 Some('[') => {
91 self.cursor += 1;
92 prop_values.push(self.get_prop_value()?);
93 }
94 _ => break,
95 }
96 }
97
98 Ok(prop_values)
99 }
100
101 fn get_prop_value(&mut self) -> Result<String, LexerError> {
102 let mut prop_value = vec![];
103 let mut escaped = false;
104 loop {
105 match self.get_char() {
106 Some(']') if !escaped => break,
107 Some('\\') if !escaped => escaped = true,
108 Some(c) => {
109 escaped = false;
110 prop_value.push(c);
111 }
112 None => return Err(LexerError::UnexpectedEndOfProperty),
113 }
114 }
115
116 Ok(prop_value.iter().collect())
117 }
118}
119
120impl Iterator for Lexer<'_> {
121 type Item = Result<(Token, std::ops::Range<usize>), LexerError>;
122
123 fn next(&mut self) -> Option<Self::Item> {
124 let span_start = self.cursor;
125 let token = match self.peek_char() {
126 Some('(') => {
127 self.cursor += 1;
128 Token::StartGameTree
129 }
130 Some(')') => {
131 self.cursor += 1;
132 Token::EndGameTree
133 }
134 Some(';') => {
135 self.cursor += 1;
136 Token::StartNode
137 }
138 None => return None,
139 _ => match self.get_property() {
140 Ok(property) => Token::Property(property),
141 Err(e) => return Some(Err(e)),
142 },
143 };
144 let span = span_start..self.cursor;
145 self.trim_leading_whitespace();
146
147 Some(Ok((token, span)))
148 }
149}
150
#[cfg(test)]
mod test {
    use super::tokenize;
    use super::Token::{self, *};

    /// Builds a `Property` token carrying exactly one value.
    fn prop(ident: &str, value: &str) -> Token {
        Property((ident.to_string(), vec![value.to_string()]))
    }

    /// Tokenizes `sgf`, panicking if the lexer reports any error.
    fn lex_all(sgf: &str) -> Vec<(Token, std::ops::Range<usize>)> {
        tokenize(sgf).collect::<Result<_, _>>().unwrap()
    }

    /// Two consecutive game trees: checks every token and its byte span.
    #[test]
    fn lexer() {
        let tokens = lex_all("(;SZ[9]C[Some comment];B[de];W[fe])(;B[de];W[ff])");

        let expected = vec![
            (StartGameTree, 0..1),
            (StartNode, 1..2),
            (prop("SZ", "9"), 2..7),
            (prop("C", "Some comment"), 7..22),
            (StartNode, 22..23),
            (prop("B", "de"), 23..28),
            (StartNode, 28..29),
            (prop("W", "fe"), 29..34),
            (EndGameTree, 34..35),
            (StartGameTree, 35..36),
            (StartNode, 36..37),
            (prop("B", "de"), 37..42),
            (StartNode, 42..43),
            (prop("W", "ff"), 43..48),
            (EndGameTree, 48..49),
        ];
        assert_eq!(tokens, expected);
    }

    /// Mixed-case identifiers are passed through with their case untouched.
    #[test]
    fn handles_old_style_properties() {
        let tokens = lex_all("(;CoPyright[text])");

        let expected = vec![
            (StartGameTree, 0..1),
            (StartNode, 1..2),
            (prop("CoPyright", "text"), 2..17),
            (EndGameTree, 17..18),
        ];
        assert_eq!(tokens, expected);
    }
}