1pub fn tokenize(
2 text: &str,
3) -> impl Iterator<Item = Result<(Token, std::ops::Range<usize>), LexerError>> + '_ {
4 Lexer { text, cursor: 0 }
5}
6
/// A lexical token read from the input text.
///
/// `Clone` and `Eq` are derived so tokens can be duplicated, buffered and compared freely;
/// every payload type (`String`, `Vec<String>`) already supports both.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A `(` character.
    StartGameTree,
    /// A `)` character.
    EndGameTree,
    /// A `;` character.
    StartNode,
    /// A property: the identifier text followed by the text of each bracketed value.
    Property((String, Vec<String>)),
}
14
/// Reasons the lexer can fail while reading a property.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LexerError {
    /// Displayed as "Unexpected property identifier value".
    ///
    /// NOTE(review): no lexer code path visible alongside this enum constructs this variant.
    UnexpectedPropertyIdentifier,
    /// The input ended while a property identifier was being read, before any `[`.
    MissingPropertyIdentifier,
    /// A non-ASCII character was found while a property identifier was being read.
    UnexpectedEndOfPropertyIdentifier,
    /// The input ended inside a property value, before its closing `]`.
    UnexpectedEndOfPropertyValue,
}
23
24impl std::fmt::Display for LexerError {
25 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
26 match self {
27 LexerError::UnexpectedPropertyIdentifier => {
28 write!(f, "Unexpected property identifier value")
29 }
30 LexerError::MissingPropertyIdentifier => {
31 write!(f, "Missing property identifier")
32 }
33 LexerError::UnexpectedEndOfPropertyIdentifier => {
34 write!(f, "Unexpected end of property identifier")
35 }
36 LexerError::UnexpectedEndOfPropertyValue => {
37 write!(f, "Unexpected end of property value")
38 }
39 }
40 }
41}
42
// Empty impl: the trait's default methods are used as-is, since every `LexerError`
// variant is a plain unit value with no underlying source error to expose.
impl std::error::Error for LexerError {}
44
/// Cursor-based scanner over a borrowed input string.
struct Lexer<'src> {
    /// The complete input being tokenized.
    text: &'src str,
    /// Byte offset into `text` of the next unread character.
    cursor: usize,
}
49
50impl Lexer<'_> {
51 fn trim_leading_whitespace(&mut self) {
52 while self.cursor < self.text.len()
53 && (self.text.as_bytes()[self.cursor] as char).is_ascii_whitespace()
54 {
55 self.cursor += 1;
56 }
57 }
58
59 fn get_char(&mut self) -> Option<char> {
60 let result = self.text[self.cursor..].chars().next();
61 result.iter().for_each(|c| self.cursor += c.len_utf8());
62
63 result
64 }
65
66 fn peek_char(&self) -> Option<char> {
67 self.text[self.cursor..].chars().next()
68 }
69
70 fn get_property(&mut self) -> Result<(String, Vec<String>), LexerError> {
71 Ok((self.get_prop_ident()?, self.get_prop_values()?))
72 }
73
74 fn get_prop_ident(&mut self) -> Result<String, LexerError> {
75 let mut prop_ident = vec![];
76 loop {
77 match self.peek_char() {
78 Some('[') => break,
79 Some(c) if c.is_ascii() => {
80 self.cursor += 1;
81 prop_ident.push(c);
82 }
83 Some(_c) => return Err(LexerError::UnexpectedEndOfPropertyIdentifier),
84 None => return Err(LexerError::MissingPropertyIdentifier),
85 }
86 }
87
88 Ok(prop_ident.iter().collect())
89 }
90
91 fn get_prop_values(&mut self) -> Result<Vec<String>, LexerError> {
92 let mut prop_values = vec![];
93 loop {
94 self.trim_leading_whitespace();
95 match self.peek_char() {
96 Some('[') => {
97 self.cursor += 1;
98 prop_values.push(self.get_prop_value()?);
99 }
100 _ => break,
101 }
102 }
103
104 Ok(prop_values)
105 }
106
107 fn get_prop_value(&mut self) -> Result<String, LexerError> {
108 let mut prop_value = vec![];
109 let mut escaped = false;
110 loop {
111 match self.get_char() {
112 Some(']') if !escaped => break,
113 Some('\\') if !escaped => escaped = true,
114 Some(c) => {
115 escaped = false;
116 prop_value.push(c);
117 }
118 None => return Err(LexerError::UnexpectedEndOfPropertyValue),
119 }
120 }
121
122 Ok(prop_value.iter().collect())
123 }
124}
125
126impl Iterator for Lexer<'_> {
127 type Item = Result<(Token, std::ops::Range<usize>), LexerError>;
128
129 fn next(&mut self) -> Option<Self::Item> {
130 let span_start = self.cursor;
131 let token = match self.peek_char() {
132 Some('(') => {
133 self.cursor += 1;
134 Token::StartGameTree
135 }
136 Some(')') => {
137 self.cursor += 1;
138 Token::EndGameTree
139 }
140 Some(';') => {
141 self.cursor += 1;
142 Token::StartNode
143 }
144 None => return None,
145 _ => match self.get_property() {
146 Ok(property) => Token::Property(property),
147 Err(e) => return Some(Err(e)),
148 },
149 };
150 let span = span_start..self.cursor;
151 self.trim_leading_whitespace();
152
153 Some(Ok((token, span)))
154 }
155}
156
#[cfg(test)]
mod test {
    use super::tokenize;
    use super::Token::*;

    /// Builds the expected `Property` token for an identifier and its values.
    fn property(ident: &str, values: &[&str]) -> super::Token {
        let values = values.iter().map(|value| value.to_string()).collect();
        Property((ident.to_string(), values))
    }

    /// Two game trees in a row: checks every token and its byte span.
    #[test]
    fn lexer() {
        let sgf = "(;SZ[9]C[Some comment];B[de];W[fe])(;B[de];W[ff])";
        let expected = vec![
            (StartGameTree, 0..1),
            (StartNode, 1..2),
            (property("SZ", &["9"]), 2..7),
            (property("C", &["Some comment"]), 7..22),
            (StartNode, 22..23),
            (property("B", &["de"]), 23..28),
            (StartNode, 28..29),
            (property("W", &["fe"]), 29..34),
            (EndGameTree, 34..35),
            (StartGameTree, 35..36),
            (StartNode, 36..37),
            (property("B", &["de"]), 37..42),
            (StartNode, 42..43),
            (property("W", &["ff"]), 43..48),
            (EndGameTree, 48..49),
        ];

        let tokens = tokenize(sgf).collect::<Result<Vec<_>, _>>().unwrap();

        assert_eq!(tokens, expected);
    }

    /// Mixed-case identifiers are passed through to the token unchanged.
    #[test]
    fn handles_old_style_properties() {
        let sgf = "(;CoPyright[text])";
        let expected = vec![
            (StartGameTree, 0..1),
            (StartNode, 1..2),
            (property("CoPyright", &["text"]), 2..17),
            (EndGameTree, 17..18),
        ];

        let tokens = tokenize(sgf).collect::<Result<Vec<_>, _>>().unwrap();

        assert_eq!(tokens, expected);
    }
}