1use alloc::string::String;
2use alloc::string::ToString;
3use alloc::vec::Vec;
4use core::str;
5
/// Byte offset into the input being tokenized.
pub type Pos = usize;
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub struct Span {
12 pub start: Pos,
14 pub len: usize,
16}
17
18impl Span {
19 pub fn new(start: Pos, len: usize) -> Self {
21 Span { start, len }
22 }
23 pub fn start(&self) -> Pos {
25 self.start
26 }
27 pub fn len(&self) -> usize {
29 self.len
30 }
31 pub fn is_empty(&self) -> bool {
33 self.len == 0
34 }
35 pub fn end(&self) -> Pos {
37 self.start + self.len
38 }
39}
40
41#[derive(Debug, Clone, PartialEq, Eq)]
43pub struct Spanned<T> {
44 pub node: T,
46 pub span: Span,
48}
49
50#[derive(Debug, Clone, PartialEq)]
52pub struct TokenError {
53 pub kind: TokenErrorKind,
55 pub span: Span,
57}
58
/// The different ways tokenizing can fail.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenErrorKind {
    /// A character that cannot appear at this point in the input.
    UnexpectedCharacter(char),
    /// The input ended too early; the payload names the context
    /// (for example `"in string literal"`).
    UnexpectedEof(&'static str),
    /// Text that is not valid UTF-8; the payload is a human-readable detail message.
    InvalidUtf8(String),
    /// A number that does not fit its target type; the payload is its
    /// floating-point approximation.
    NumberOutOfRange(f64),
}
71
72impl Display for TokenErrorKind {
73 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
74 match self {
75 TokenErrorKind::UnexpectedCharacter(c) => write!(f, "unexpected character: '{}'", c),
76 TokenErrorKind::UnexpectedEof(context) => write!(f, "unexpected EOF {}", context),
77 TokenErrorKind::InvalidUtf8(detail) => write!(f, "invalid UTF-8: {}", detail),
78 TokenErrorKind::NumberOutOfRange(n) => write!(f, "number out of range: {}", n),
79 }
80 }
81}
82
83pub type TokenizeResult = Result<Spanned<Token>, TokenError>;
85
/// A single lexical token.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// `{`
    LBrace,
    /// `}`
    RBrace,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// A string literal; the payload is its content without the surrounding quotes.
    String(String),
    /// A number written with a fraction and/or an exponent.
    F64(f64),
    /// An integer written with a leading minus sign.
    I64(i64),
    /// An integer written without a sign.
    U64(u64),
    /// The literal `true`.
    True,
    /// The literal `false`.
    False,
    /// The literal `null`.
    Null,
    /// End of input.
    EOF,
}
118
119use core::fmt::{self, Display, Formatter};
120
121impl Display for Token {
122 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
123 match self {
124 Token::LBrace => write!(f, "{{"),
125 Token::RBrace => write!(f, "}}"),
126 Token::LBracket => write!(f, "["),
127 Token::RBracket => write!(f, "]"),
128 Token::Colon => write!(f, ":"),
129 Token::Comma => write!(f, ","),
130 Token::String(s) => write!(f, "\"{}\"", s),
131 Token::F64(n) => write!(f, "{}", n),
132 Token::I64(n) => write!(f, "{}", n),
133 Token::U64(n) => write!(f, "{}", n),
134 Token::True => write!(f, "true"),
135 Token::False => write!(f, "false"),
136 Token::Null => write!(f, "null"),
137 Token::EOF => write!(f, "EOF"),
138 }
139 }
140}
141
142pub struct Tokenizer<'input> {
144 input: &'input [u8],
145 pos: Pos,
146}
147
148impl<'input> Tokenizer<'input> {
149 pub fn new(input: &'input [u8]) -> Self {
151 Tokenizer { input, pos: 0 }
152 }
153
154 pub fn position(&self) -> Pos {
156 self.pos
157 }
158
159 pub fn next_token(&mut self) -> TokenizeResult {
161 self.skip_whitespace();
162 let start = self.pos;
163 let c = match self.input.get(self.pos).copied() {
164 Some(c) => c,
165 None => {
166 let span = Span::new(self.pos, 0);
168 return Ok(Spanned {
169 node: Token::EOF,
170 span,
171 });
172 }
173 };
174 let sp = match c {
175 b'{' => {
176 self.pos += 1;
177 Spanned {
178 node: Token::LBrace,
179 span: Span::new(start, 1),
180 }
181 }
182 b'}' => {
183 self.pos += 1;
184 Spanned {
185 node: Token::RBrace,
186 span: Span::new(start, 1),
187 }
188 }
189 b'[' => {
190 self.pos += 1;
191 Spanned {
192 node: Token::LBracket,
193 span: Span::new(start, 1),
194 }
195 }
196 b']' => {
197 self.pos += 1;
198 Spanned {
199 node: Token::RBracket,
200 span: Span::new(start, 1),
201 }
202 }
203 b':' => {
204 self.pos += 1;
205 Spanned {
206 node: Token::Colon,
207 span: Span::new(start, 1),
208 }
209 }
210 b',' => {
211 self.pos += 1;
212 Spanned {
213 node: Token::Comma,
214 span: Span::new(start, 1),
215 }
216 }
217 b'"' => return self.parse_string(start),
218 b'-' | b'0'..=b'9' => return self.parse_number(start),
219 b't' => return self.parse_literal(start, b"true", || Token::True),
220 b'f' => return self.parse_literal(start, b"false", || Token::False),
221 b'n' => return self.parse_literal(start, b"null", || Token::Null),
222 _ => {
223 return Err(TokenError {
224 kind: TokenErrorKind::UnexpectedCharacter(c as char),
225 span: Span::new(start, 1),
226 });
227 }
228 };
229 Ok(sp)
230 }
231
232 fn skip_whitespace(&mut self) {
234 while let Some(&b) = self.input.get(self.pos) {
235 match b {
236 b' ' | b'\t' | b'\n' | b'\r' => self.pos += 1,
237 _ => break,
238 }
239 }
240 }
241
242 fn parse_string(&mut self, start: Pos) -> TokenizeResult {
243 self.pos += 1;
245 let mut buf = Vec::new();
246 let content_start = self.pos;
247
248 while let Some(&b) = self.input.get(self.pos) {
249 match b {
250 b'"' => {
251 self.pos += 1;
252 break;
253 }
254 b'\\' => {
255 self.pos += 1;
256 if let Some(&esc) = self.input.get(self.pos) {
257 match esc {
258 b'"' | b'\\' | b'/' => buf.push(esc),
259 b'b' => buf.push(b'\x08'), b'f' => buf.push(b'\x0C'), b'n' => buf.push(b'\n'), b'r' => buf.push(b'\r'), b't' => buf.push(b'\t'), _ => buf.push(esc), }
266 self.pos += 1;
267 } else {
268 return Err(TokenError {
269 kind: TokenErrorKind::UnexpectedEof("in string escape"),
270 span: Span::new(self.pos, 0),
271 });
272 }
273 }
274 _ => {
275 buf.push(b);
276 self.pos += 1;
277 }
278 }
279 }
280
281 if self.pos > self.input.len()
283 || (self.pos == self.input.len() && self.input[self.pos - 1] != b'"')
284 {
285 return Err(TokenError {
286 kind: TokenErrorKind::UnexpectedEof("in string literal"),
287 span: Span::new(start, self.pos - start),
288 });
289 }
290
291 let s = match str::from_utf8(&buf) {
292 Ok(st) => st.to_string(),
293 Err(e) => {
294 return Err(TokenError {
295 kind: TokenErrorKind::InvalidUtf8(e.to_string()),
296 span: Span::new(content_start, buf.len()),
297 });
298 }
299 };
300
301 let len = self.pos - start;
302 let span = Span::new(start, len);
303 Ok(Spanned {
304 node: Token::String(s),
305 span,
306 })
307 }
308
309 fn parse_number(&mut self, start: Pos) -> TokenizeResult {
310 let mut end = self.pos;
311 if self.input[end] == b'-' {
312 end += 1;
313 }
314 while end < self.input.len() && self.input[end].is_ascii_digit() {
315 end += 1;
316 }
317 if end < self.input.len() && self.input[end] == b'.' {
318 end += 1;
319 while end < self.input.len() && self.input[end].is_ascii_digit() {
320 end += 1;
321 }
322 }
323 if end < self.input.len() && (self.input[end] == b'e' || self.input[end] == b'E') {
324 end += 1;
325 if end < self.input.len() && (self.input[end] == b'+' || self.input[end] == b'-') {
326 end += 1;
327 }
328 while end < self.input.len() && self.input[end].is_ascii_digit() {
329 end += 1;
330 }
331 }
332 let slice = &self.input[start..end];
333 let span = Span::new(start, end - start);
334
335 let text = match str::from_utf8(slice) {
336 Ok(t) => t,
337 Err(e) => {
338 return Err(TokenError {
339 kind: TokenErrorKind::InvalidUtf8(e.to_string()),
340 span,
341 });
342 }
343 };
344
345 let token = if text.contains('.') || text.contains('e') || text.contains('E') {
346 match text.parse::<f64>() {
348 Ok(n) => Token::F64(n),
349 Err(_) => {
350 return Err(TokenError {
351 kind: TokenErrorKind::NumberOutOfRange(0.0),
352 span,
353 });
354 }
355 }
356 } else if text.starts_with('-') {
357 match text.parse::<i64>() {
359 Ok(n) => Token::I64(n),
360 Err(_) => {
361 let num = text.parse::<f64>().unwrap_or(0.0);
363 return Err(TokenError {
364 kind: TokenErrorKind::NumberOutOfRange(num),
365 span,
366 });
367 }
368 }
369 } else {
370 match text.parse::<u64>() {
372 Ok(n) => Token::U64(n),
373 Err(_) => {
374 let num = text.parse::<f64>().unwrap_or(0.0);
376 return Err(TokenError {
377 kind: TokenErrorKind::NumberOutOfRange(num),
378 span,
379 });
380 }
381 }
382 };
383
384 self.pos = end;
385 Ok(Spanned { node: token, span })
386 }
387
388 fn parse_literal<F>(&mut self, start: Pos, pat: &[u8], ctor: F) -> TokenizeResult
389 where
390 F: FnOnce() -> Token,
391 {
392 let end = start + pat.len();
393 if end <= self.input.len() && &self.input[start..end] == pat {
394 self.pos = end;
395 let span = Span::new(start, pat.len());
396 Ok(Spanned { node: ctor(), span })
397 } else {
398 let actual_len = self.input.len().saturating_sub(start).min(pat.len());
400 let span = Span::new(start, actual_len.max(1)); let got = self.input.get(start).copied().unwrap_or(b'?') as char;
403 Err(TokenError {
404 kind: TokenErrorKind::UnexpectedCharacter(got),
405 span,
406 })
407 }
408 }
409}