conf_json/
parser.rs

//! The parser that does the real work
//!
3
4use std::io;
5use std::fmt::{self, Display, Debug};
6use std::error::Error;
7use std::str::FromStr;
8
9use crate::value::{ArrayType, ObjectType, Value};
10
/// An error type that records where parsing went wrong.
///
/// It carries a brief description together with the line and column
/// counters of the parser at the moment the error was raised.
#[derive(Debug, Clone)]
pub struct ParseError {
	/// Brief, human readable description of the failure
	msg: String,
	/// Line counter of the parser when the error was raised
	line: usize,
	/// Column counter of the parser when the error was raised
	col: usize,
}

impl Display for ParseError {
	/// Render the error as `"<msg>" at line <line> col <col>`
	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
		let ParseError { msg, line, col } = self;
		write!(f, "\"{}\" at line {} col {}", msg, line, col)
	}
}

// The default `Error` methods are enough: a `ParseError` has no field that
// could hold an underlying source error.
impl Error for ParseError {}

/// Alias for a parsing `Result` that yields nothing on success and a
/// `ParseError` on failure
pub type ParseResult = Result<(), ParseError>;
29
/// Size in bytes of the parser's inner buffer.
///
/// `N` must be able to hold the longest keyword. Standard json has the
/// keywords `null`, `true` and `false`, hence the length of `false` (5) is
/// good enough.
const N: usize = b"false".len();
33
34#[derive(Debug)]
35pub struct Parser<T> {
36	buf: [u8; N],
37	cur_pos: usize,
38	end_pos: usize,
39	eof: bool,
40	lines: usize,
41	cols: usize,
42	src: T
43}
44
45impl<T: io::Read + Debug> Parser<T> {
46	pub fn new(src: T) -> Self {
47		Parser {
48			buf: [b'\0'; N],
49			cur_pos: 0,
50			end_pos: 0,
51			lines: 0,
52			cols: 0,
53			eof: false,
54			src: src
55		}
56	}
57
58	/// Raise a ParseError when parsing goes wrong with a brief description and
59	/// position to tell why and where
60	fn raise(&self, msg: &str) -> ParseError {
61		ParseError {
62			msg: msg.to_string(),
63			line: self.lines,
64			col: self.cols
65		}
66	}
67
68	/// The loaded but not-yet parsed data size in inner buffer
69	fn available(&self) -> usize {
70		self.end_pos - self.cur_pos
71	}
72
73	/// If there is not enough data in inner buffer, pump as much as possible
74	/// from underlying data source
75	fn pump(&mut self, n: usize) {
76		if self.available() < n {
77			// the implement will make sure following assertion pass
78			assert!(n <= self.buf.len());
79
80			if !self.eof {
81				// roll remaining data to the begining of inner buffer
82				self.buf.rotate_left(self.cur_pos);
83				self.end_pos -= self.cur_pos;
84				self.cur_pos = 0;
85
86				// try to fill inner buffer, pump as much as possible
87				let r = self.src.read(&mut self.buf[self.end_pos..]);
88				if let Ok(rd) = r {
89					if rd == 0 {
90						self.eof = true;
91					} else {
92						self.end_pos += rd;
93					}
94				} else {
95					panic!("data source reading error");
96				}
97			}
98		}
99	}
100
101	/// Pop parsed char, this is the only way to move internal cursor to next char
102	/// it will also update the current lines and columns numbers so we can know
103	/// when and where if parsing gose wrong
104	fn pop(&mut self, n: usize) {
105		// the implement will make sure following assertion pass
106		assert!(n <= self.available());
107
108		// this is the only way to `mark` each char as parsed, thus it's a good place
109		// to record line and column numbers
110		for i in self.cur_pos..self.cur_pos + n {
111			if self.buf[i] == b'\n' {
112				self.lines += 1;
113				self.cols = 0;
114			} else {
115				self.cols += 1;
116			}
117		}
118
119		self.cur_pos += n;
120	}
121
122	/// Get current char without moving the cursor, `None` if reach EOF
123	fn peek(&mut self) -> Option<u8> {
124		self.pump(1);
125		if self.cur_pos == self.end_pos {
126			None
127		} else {
128			Some(self.buf[self.cur_pos])
129		}
130	}
131
132	/// Compare a ascii string in inner buffer, it won't move the cursor
133	fn peek_match(&mut self, v: &[u8]) -> bool {
134		self.pump(v.len());
135		self.available() >= v.len() && v == &self.buf[self.cur_pos..self.cur_pos + v.len()]
136	}
137
138	/// Skip chars by predicate `F`, it will move the cursor by calling `pop()`
139	fn skip_by<F>(&mut self, test: F) where F: Fn(u8) -> bool {
140		while let Some(c) = self.peek() {
141			if !test(c) {
142				break
143			} else {
144				self.pop(1);
145			}
146		}
147	}
148
149	/// Skip chars until we meat the first `stop` char
150	fn skip_to(&mut self, stop: u8) {
151		self.skip_by(|c| { c != stop })
152	}
153
154	/// Skip spaces and comments to next meaningful char
155	fn skip_to_next(&mut self) {
156		loop {
157			// skip spaces
158			self.skip_by(|c| { c.is_ascii_whitespace() });
159
160			// check the first char after spaces
161			// '#' means comments so we skip to the end of current line
162			if let Some(c) = self.peek() {
163				if c == b'#' {
164					self.skip_to(b'\n');
165				} else {
166					break;
167				}
168			} else {
169				break;
170			}
171		}
172		// now, cur_pos points to next valid char or EOF
173	}
174
175	/// This is the main entrance of parsing, by checking the leading char,
176	/// different parsing method for each `Value` type were called respectively
177	pub fn parse(&mut self) -> Result<Value, ParseError> {
178		self.skip_to_next();
179		if let Some(c) = self.peek() {
180			match c {
181				b'{' => self.parse_object(),
182				b'[' => self.parse_array(),
183				b'\'' | b'"' => self.parse_string(),
184				_ => {
185					if self.peek_match(b"true") {
186						self.pop(4);
187						Ok(Value::Bool(true))
188					} else if self.peek_match(b"false") {
189						self.pop(5);
190						Ok(Value::Bool(false))
191					} else if self.peek_match(b"null") {
192						self.pop(4);
193						Ok(Value::Null)
194					} else {
195						self.parse_number()
196					}
197				}
198			}
199		} else {
200			Err(self.raise("not enough data"))
201		}
202	}
203
204	/// Parse `Value::Object`, the trailing comma is allowed
205	fn parse_object(&mut self) -> Result<Value, ParseError> {
206		assert_eq!(self.peek(), Some(b'{'));
207		self.pop(1);
208
209		let mut obj = ObjectType::new();
210		loop {
211			// read item key
212			self.skip_to_next();
213			if let Some(c) = self.peek() {
214				if c == b'}' {
215					self.pop(1);
216					return Ok(Value::Object(obj));
217				} else if c == b'\'' || c == b'"' {
218				} else {
219					return Err(self.raise("object: key expecting ' or \""));
220				}
221			} else {
222				return Err(self.raise("object: key expecting more data"));
223			}
224			let k = self.parse_string_raw()?;
225
226			// read kv delimiter :
227			self.skip_to_next();
228			if self.peek() != Some(b':') {
229				return Err(self.raise("object: expecting \":\""));
230			}
231			self.pop(1);
232
233			// read item value
234			self.skip_to_next();
235			let v = self.parse()?;
236
237			obj.insert(k, v);
238
239			// read item delimiter , or end }
240			// trailing , allowed, eg { k: v, kk: vv, } but not {,}
241			self.skip_to_next();
242			if let Some(c) = self.peek() {
243				if c == b',' {
244					self.pop(1);
245				} else if c == b'}' {
246					self.pop(1);
247					return Ok(Value::Object(obj));
248				} else {
249					return Err(self.raise("object: bad item delimeter, expecting , or }"));
250				}
251			} else {
252				return Err(self.raise("object: expecting , or }"));
253			}
254		}
255	}
256
257	/// Parse `Value::Array`, the trailing comma is allowed
258	fn parse_array(&mut self) -> Result<Value, ParseError> {
259		assert_eq!(self.peek(), Some(b'['));
260		self.pop(1);
261
262		let mut arr = ArrayType::new();
263		loop {
264			self.skip_to_next();
265			if Some(b']') == self.peek() {
266				self.pop(1);
267				return Ok(Value::Array(arr));
268			}
269			arr.push(self.parse()?);
270
271			// read array item delimiter , or end ]
272			// trailing , allowed eg [aaa, bbb, ], but not [,]
273			self.skip_to_next();
274			if let Some(c) = self.peek() {
275				if c == b',' {
276					self.pop(1);
277				} else if c == b']' {
278					self.pop(1);
279					return Ok(Value::Array(arr));
280				}
281			} else {
282				return Err(self.raise("array: expecting , or ]"));
283			}
284		}
285	}
286
287	/// Extract `String`, this method can be called by `parse_object()` for pasing object item key name
288	/// and `parse_string()`, both singal and double quotation marks are allowed
289	fn parse_string_raw(&mut self) -> Result<String, ParseError> {
290		// save quotation mark ' or "
291		assert!(self.peek() == Some(b'"') || self.peek() == Some(b'\''));
292		let quoter = self.peek().unwrap();
293		self.pop(1);
294
295		let mut v: Vec<u8> = Vec::new();
296		let mut esc = false;
297		while let Some(c) = self.peek() {
298			if esc {
299				match c {
300					b't' => v.push(b'\t'),
301					b'r' => v.push(b'\r'),
302					b'n' => v.push(b'\n'),
303					_ => v.push(c),
304				}
305				esc = false;
306			} else if c == b'\\' {
307				esc = true;
308			} else if c == quoter {
309				self.pop(1);
310				return if let Ok(s) = String::from_utf8(v) {
311					Ok(s)
312				} else {
313					Err(self.raise("string: bad utf-8 encode"))
314				}
315			} else {
316				v.push(c);
317			}
318			self.pop(1)
319		}
320		Err(self.raise("string: expecting more data"))
321	}
322
323	/// Parse `Value::String` by calling `parse_string_raw()`
324	fn parse_string(&mut self) -> Result<Value, ParseError> {
325		Ok(Value::String(self.parse_string_raw()?))
326	}
327
328	/// Check number car validation according to radix, for radix 2, '0' and '1' are allowed,
329	/// for radix 8, '0' ~ '7', etc
330	fn is_valid_number_char(c: u8, radix: u32) -> bool {
331		match radix {
332			2 => c >= b'0' && c <= b'1',
333			8 => c >= b'0' && c <= b'7',
334			10 => c >= b'0' && c <= b'9',
335			16 => c >= b'0' && c <= b'9' || c >= b'a' && c <= b'f' || c >= b'A' && c <= b'F',
336			_ => false,
337		}
338	}
339
340	/// Parse `Value::Number` they were store with type f64, thers are some valid forms:
341	///
342	/// 123, -123, 123.456, 0x00ff, 0XAA, 123E4, 123e-4, 123E+4
343	fn parse_number(&mut self) -> Result<Value, ParseError> {
344		#[derive(PartialEq)]
345		enum Phase {
346			Sign,
347			Radix,
348			Int,
349			Float,
350			SciSign,
351			Sci
352		}
353
354		let mut v: Vec<u8> = Vec::new();
355		let mut ph = Phase::Sign;
356		let mut radix = 10;
357
358		while let Some(c) = self.peek() {
359			match ph {
360				Phase::Sign => {
361					if b'-' == c || b'+' == c {
362						v.push(c);
363						self.pop(1);
364					} else if c.is_ascii_digit() {
365					} else {
366						return Err(self.raise("number: bad leading char, expecting \"+-[0-9]\""))
367					}
368					ph = Phase::Radix;
369				}
370				Phase::Radix => {
371					if self.peek_match(b"0b") || self.peek_match(b"0B") {
372						self.pop(2);
373						radix = 2;
374					} else if self.peek_match(b"0o") || self.peek_match(b"0O") {
375						self.pop(2);
376						radix = 8;
377					} else if self.peek_match(b"0x") || self.peek_match(b"0X") {
378						self.pop(2);
379						radix = 16;
380					}
381					ph = Phase::Int;
382				}
383				Phase::Int => {
384					if b'.' == c {
385						if radix != 10 {
386							return Err(self.raise(&format!("number: bad float parts for radix {}", radix)))
387						}
388						v.push(c);
389						self.pop(1);
390						ph = Phase::Float;
391					} else if Self::is_valid_number_char(c, radix) {
392						v.push(c);
393						self.pop(1);
394					} else if radix == 10 && (b'e' == c || b'E' == c) {
395						v.push(c);
396						self.pop(1);
397						ph = Phase::SciSign;
398					} else {
399						break;
400					}
401				}
402				Phase::Float => {
403					assert_eq!(radix, 10);
404					if Self::is_valid_number_char(c, 10) {
405						v.push(c);
406						self.pop(1);
407					} else if b'e' == c || b'E' == c {
408						v.push(c);
409						self.pop(1);
410						ph = Phase::SciSign;
411					} else {
412						break;
413					}
414				}
415				Phase::SciSign => {
416					if b'-' == c || b'+' == c {
417						v.push(c);
418						self.pop(1);
419					}
420					ph = Phase::Sci;
421				}
422				Phase::Sci => {
423					if Self::is_valid_number_char(c, 10) {
424						v.push(c);
425						self.pop(1);
426					} else {
427						break;
428					}
429				}
430			}
431		}
432
433		if v.is_empty() {
434			Err(self.raise("number: expecting more data"))
435		} else {
436			if let Ok(s) = String::from_utf8(v) {
437				if radix != 10 {
438					if let Ok(v) = i64::from_str_radix(&s, radix) {
439						Ok(Value::Number(v as f64))
440					} else {
441						Err(self.raise("number: bad i64 string"))
442					}
443				} else {
444					if let Ok(v) = f64::from_str(&s) {
445						Ok(Value::Number(v))
446					} else {
447						Err(self.raise("number: bad f64 string"))
448					}
449				}
450			} else {
451				Err(self.raise("number: bad utf-8 encoding"))
452			}
453		}
454	}
455}