//! json_parser/tokenizer.rs — tokenizer that lexes a JSON source string into `Token`s.

1use crate::error::Error;
2use crate::result::Result;
3use regex::Regex;
4use std::str::FromStr;
5use std::vec::IntoIter;
6use std::iter::{Iterator, Peekable};
7
/// Peekable iterator over owned items; used here for one-`char` lookahead.
pub type PeekableIter<T> = Peekable<IntoIter<T>>;
9
/// A single lexical token of a JSON document.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// The `,` separator.
    /// NOTE(review): kept as `Coma` (sic) for backward compatibility with
    /// existing callers; `Comma` would be the correct spelling.
    Coma,
    /// The `:` key/value separator.
    Colon,
    /// `{`
    CurlyOpen,
    /// `}`
    CurlyClose,
    /// `[`
    SquareOpen,
    /// `]`
    SquareClose,
    /// A string literal, with the surrounding quotes stripped.
    StringValue(String),
    /// A numeric literal; JSON numbers are represented as `f64`.
    NumberValue(f64),
    /// `true` or `false`.
    BoolValue(bool),
    /// `null`.
    NullValue,
}
23
/// Streaming tokenizer that converts a JSON source string into `Token`s.
pub struct Tokenizer {
    // Peekable character stream; the one-char lookahead drives dispatch
    // in `tokenize` without consuming the character prematurely.
    char_stream: PeekableIter<char>,
}
27
28impl Tokenizer {
29    pub fn new(s: &str) -> Tokenizer {
30        let vec: Vec<char> = s.chars().collect();
31        let char_stream = vec.into_iter().peekable();
32        Tokenizer { char_stream }
33    }
34
35    fn take_until(&mut self, predicate: fn(char) -> bool) -> Result<Vec<char>> {
36        let mut res: Vec<char> = vec![];
37        while let Some(c) = self.char_stream.next() {
38            if !predicate(c) {
39                res.push(c);
40            } else {
41                return Ok(res);
42            }
43        }
44        let s: String = res.iter().collect();
45        Err(Error::Tokenize(
46            format!("unterminated token `{}`", s).into(),
47        ))
48    }
49
50    fn take_while(&mut self, predicate: fn(char) -> bool) -> Result<Vec<char>> {
51        let mut res = vec![];
52        while let Some(&c) = self.char_stream.peek() {
53            if predicate(c) {
54                self.char_stream.next();
55                res.push(c);
56            } else {
57                return Ok(res);
58            }
59        }
60        Ok(res)
61    }
62
63    fn skip(&mut self, ch: char) -> Result<()> {
64        match self.char_stream.next() {
65            Some(c) if c == ch => Ok(()),
66            _ => Err(Error::Tokenize(format!("expected token `{}`", ch).into())),
67        }
68    }
69
70    fn string_token(&mut self) -> Result<Token> {
71        self.skip('"')?;
72        let chars = self.take_until(|c| c == '"')?;
73        Ok(Token::StringValue(chars.iter().collect()))
74    }
75
76    fn number_token(&mut self) -> Result<Token> {
77        let chars = self.take_while(|c| Regex::new(r"^\d$").unwrap().is_match(&c.to_string()))?;
78        let num_string: String = chars.iter().collect();
79        match num_string.parse() {
80            Ok(num) => Ok(Token::NumberValue(num)),
81            Err(pfe) => Err(Error::Tokenize(pfe.to_string())),
82        }
83    }
84
85    fn keyword_token(&mut self) -> Result<Token> {
86        let chars = self.take_while(|c| {
87            Regex::new(r"^[a-zA-Z_\d]$")
88                .unwrap()
89                .is_match(&c.to_string())
90        })?;
91        let token: String = chars.iter().collect();
92        match &token[..] {
93            "true" => Ok(Token::BoolValue(true)),
94            "false" => Ok(Token::BoolValue(false)),
95            "null" => Ok(Token::NullValue),
96            _ => Err(Error::Tokenize(
97                format!("unrecognized token {}", token).into(),
98            )),
99        }
100    }
101
102    pub fn tokenize(&mut self) -> Result<Vec<Token>> {
103        let mut v: Vec<Token> = vec![];
104        while let Some(c) = self.char_stream.peek() {
105            match c {
106                ' ' | '\n' | '\t' => {
107                    self.char_stream.next();
108                }
109                '{' => {
110                    v.push(Token::CurlyOpen);
111                    self.char_stream.next();
112                }
113                '}' => {
114                    v.push(Token::CurlyClose);
115                    self.char_stream.next();
116                }
117                '[' => {
118                    v.push(Token::SquareOpen);
119                    self.char_stream.next();
120                }
121                ']' => {
122                    v.push(Token::SquareClose);
123                    self.char_stream.next();
124                }
125                ',' => {
126                    v.push(Token::Coma);
127                    self.char_stream.next();
128                }
129                ':' => {
130                    v.push(Token::Colon);
131                    self.char_stream.next();
132                }
133                '"' => v.push(self.string_token()?),
134                '0'...'9' => v.push(self.number_token()?),
135                _ => v.push(self.keyword_token()?),
136            }
137        }
138        Ok(v)
139    }
140}
141
142impl FromStr for Tokenizer {
143    type Err = Error;
144    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
145        Ok(Tokenizer::new(s))
146    }
147}
148
#[test]
fn test_string_token() {
    // Exercises FromStr construction plus string lexing: the quotes
    // must be stripped from the resulting StringValue.
    let mut tok: Tokenizer = r#""hello""#.parse().unwrap();
    assert_eq!(
        tok.string_token().unwrap(),
        Token::StringValue(String::from("hello"))
    );
}
155
#[test]
fn test_number_token() {
    // An integer literal lexes to an f64-backed NumberValue.
    let mut tok = Tokenizer::new("123");
    assert_eq!(tok.number_token().unwrap(), Token::NumberValue(123.0));
}
162
#[test]
fn test_true_token() {
    // The bare word `true` lexes to BoolValue(true).
    let mut tok = Tokenizer::new("true");
    assert_eq!(tok.keyword_token().unwrap(), Token::BoolValue(true));
}
169
#[test]
fn test_false_token() {
    // The bare word `false` lexes to BoolValue(false).
    let mut tok = Tokenizer::new("false");
    assert_eq!(tok.keyword_token().unwrap(), Token::BoolValue(false));
}
176
#[test]
fn test_null_token() {
    // The bare word `null` lexes to NullValue.
    let mut tok = Tokenizer::new("null");
    assert_eq!(tok.keyword_token().unwrap(), Token::NullValue);
}
183
#[test]
fn test_tokenize_token() {
    // End-to-end: a small object with string, number and array values
    // produces the full flat token sequence.
    let source = r#"{"str": "hello", "num": 123, "array":[true, false, null]}"#;
    let mut tokenizer = Tokenizer::new(source);
    let expected = vec![
        Token::CurlyOpen,
        Token::StringValue(String::from("str")),
        Token::Colon,
        Token::StringValue(String::from("hello")),
        Token::Coma,
        Token::StringValue(String::from("num")),
        Token::Colon,
        Token::NumberValue(123.0),
        Token::Coma,
        Token::StringValue(String::from("array")),
        Token::Colon,
        Token::SquareOpen,
        Token::BoolValue(true),
        Token::Coma,
        Token::BoolValue(false),
        Token::Coma,
        Token::NullValue,
        Token::SquareClose,
        Token::CurlyClose,
    ];
    assert_eq!(tokenizer.tokenize().unwrap(), expected);
}