1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
// Validate that JSON is well structured
// This is done using a state machine pattern
use crate::error::Reason;
use crate::iter::Format;
use crate::raw_token::RawToken;
// Encodes the current state of the JSON structure being parsed.
//
// Every variant is named after the token that was most recently accepted. The
// transitions between states live in `JsonStructure::transition`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
enum State {
// Initial state: no token of the current top-level value has been accepted
// yet. Also re-entered by `JsonStructure::reset`, and between top-level
// values when reading `Format::Concatenated` input.
Begin,
// End state: an `Eof` token was accepted after a complete top-level value
End,
// We just saw a top-level json primitive
TopLevelPrimitive,
// We just saw the start of an object
ObjStart,
// We just saw the end of an object
ObjEnd,
// We just saw the object's key
ObjKey,
// We just saw a colon, indicating a value comes next
ObjColon,
// We just saw the object's value (primitive values only; a nested array or
// object moves to `ArrStart`/`ObjStart` instead)
ObjValue,
// We just saw a comma, indicating a key must come next
ObjComma,
// We just saw the start of an array
ArrStart,
// We just saw the end of an array
ArrEnd,
// We just saw an array value (primitive values only; a nested array or
// object moves to `ArrStart`/`ObjStart` instead)
ArrValue,
// We just saw a comma, indicating a new value must come next
ArrComma,
}
// The type of json container currently being looked at.
//
// `JsonStructure` pushes one marker onto its stack whenever it enters
// `State::ArrStart` or `State::ObjStart`, and pops one whenever it enters
// `State::ArrEnd` or `State::ObjEnd`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
enum ContainerMarker {
// An array (`[ ... ]`) that has been opened
Array,
// An object (`{ ... }`) that has been opened
Object,
}
// Validates, one token at a time, that a sequence of tokens forms well
// structured JSON. Tokens are fed in through `validate`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct JsonStructure {
// The state reached after the most recently accepted token
state: State,
// The input format. With `Format::Concatenated`, a new top-level value may
// directly follow a completed one.
format: Format,
// State machines cannot validate that nested structures are properly matched up. So i use a
// stack to keep track of where we are in object/array nesting. The last element is the
// innermost container that has not been closed yet.
stack: Vec<ContainerMarker>,
}
impl JsonStructure {
// Figure out what the next state would be given `next_token`
#[inline]
fn transition(&mut self, next_token: &RawToken) -> Result<State, Reason> {
use State::*;
loop {
return match (&self.state, next_token) {
// ==
// == BEGIN
// ==
// Json can only begin with an array, object or primitive value
(Begin, RawToken::ArrayStart) => Ok(ArrStart),
(Begin, RawToken::ObjectStart) => Ok(ObjStart),
(Begin, t) if t.is_primitive_value() => Ok(TopLevelPrimitive),
// ==
// == TopLevelPrimitive
// ==
// Generally, nothing can come after a top level primitive.
(TopLevelPrimitive, RawToken::Eof) => Ok(End),
// Unless we are configured to read concatenated JSON
(TopLevelPrimitive, _) if self.format == Format::Concatenated => {
self.state = Begin;
continue;
}
// ==
// == Array tokens ==
// ==
// After an open bracket `[`, we only expect to see `[`, `]`, `{` or a primitive
// value
(ArrStart, RawToken::ArrayStart) => Ok(ArrStart),
(ArrStart, RawToken::ArrayEnd) => Ok(ArrEnd),
(ArrStart, RawToken::ObjectStart) => Ok(ObjStart),
(ArrStart, t) if t.is_primitive_value() => Ok(ArrValue),
// After an comma in an array, we only expect to see `[`, `{` or a primitive value
(ArrComma, RawToken::ArrayStart) => Ok(ArrStart),
(ArrComma, RawToken::ObjectStart) => Ok(ObjStart),
(ArrComma, t) if t.is_primitive_value() => Ok(ArrValue),
// After seeing a primitive array value, the only tokens we expect are `]` or `,`
(ArrValue, RawToken::Comma) => Ok(ArrComma),
(ArrValue, RawToken::ArrayEnd) => Ok(ArrEnd),
// ==
// == Object tokens ==
// ==
// After an open brace `{`, we only expect to see `}` or a string (the object's
// first key)
(ObjStart, RawToken::ObjectEnd) => Ok(ObjEnd),
(ObjStart, RawToken::String(_)) => Ok(ObjKey),
// After a comma in an object, we only expected to see a string
(ObjComma, RawToken::String(_)) => Ok(ObjKey),
// After seeing an object key, the only allowed token is a colon
(ObjKey, RawToken::Colon) => Ok(ObjColon),
// After seeing a colon, the only allowed tokens are `[`, `{`, or a primitive value
(ObjColon, RawToken::ArrayStart) => Ok(ArrStart),
(ObjColon, RawToken::ObjectStart) => Ok(ObjStart),
(ObjColon, t) if t.is_primitive_value() => Ok(ObjValue),
// After seeing a primitive object value, the only allowed tokens are `}` or a
// `,`
(ObjValue, RawToken::Comma) => Ok(ObjComma),
(ObjValue, RawToken::ObjectEnd) => Ok(ObjEnd),
// ==
// == Shared transitions for object & array tokens
// ==
// If we just closed an array or object, and are at the end of input, we can go
// to the `End` state. All structures need to be closed by this point
(ArrEnd | ObjEnd, RawToken::Eof) => {
if !self.stack.is_empty() {
return Err(Reason::UnexpectedEof);
}
Ok(End)
}
// If we just closed an array or object and then saw a comma, the next state
// depends on the current container
(ArrEnd | ObjEnd, RawToken::Comma) => {
if let Some(ContainerMarker::Array) = self.stack.last() {
return Ok(ArrComma);
}
if let Some(ContainerMarker::Object) = self.stack.last() {
return Ok(ObjComma);
}
Err(Reason::UnexpectedChar)
}
// Seeing the end of an array after a `}` or `]` is only valid if
// the current unclosed structure is an array
(ArrEnd | ObjEnd, RawToken::ArrayEnd) => {
if let Some(ContainerMarker::Array) = self.stack.last() {
return Ok(ArrEnd);
}
return Err(Reason::UnexpectedChar);
}
// Seeing the end of an object after a `}` or `]` is only valid if
// the current unclosed structure is an object
(ArrEnd | ObjEnd, RawToken::ObjectEnd) => {
if let Some(ContainerMarker::Object) = self.stack.last() {
return Ok(ObjEnd);
}
return Err(Reason::UnexpectedChar);
}
// If we are configured to recognized concatenated JSON, and there are no unclosed
// structures, we basically start validating a new json structure
(ArrEnd | ObjEnd, _)
if self.format == Format::Concatenated && self.stack.is_empty() =>
{
self.state = Begin;
continue;
}
// ==
// == The end
// ==
// Seeing an eof token after we've fully parsed a json structure is a no-op
(End, RawToken::Eof) => Ok(End),
// Seeing an eof token any other time is unexpected
(_, RawToken::Eof) => Err(Reason::UnexpectedEof),
// Everything that has not been explicitely handled is unexpected
_ => Err(Reason::UnexpectedChar),
};
}
}
// Any work that needs to be done after entering into a new state
#[inline]
fn on_enter_state(&mut self) {
use State::*;
match &self.state {
ArrStart => {
self.stack.push(ContainerMarker::Array);
}
ObjStart => {
self.stack.push(ContainerMarker::Object);
}
ObjEnd | ArrEnd => {
self.stack.pop();
}
_ => {}
};
}
pub fn new(format: Format) -> Self {
Self {
state: State::Begin,
format,
stack: vec![],
}
}
pub fn reset(&mut self) {
self.state = State::Begin;
self.stack.clear();
}
#[inline]
pub fn validate<'t>(&mut self, token: RawToken<'t>) -> Result<RawToken<'t>, Reason> {
self.state = self.transition(&token)?;
self.on_enter_state();
// Not sure where to put this yet: If the current state is `Object::Key`, we should spit out
// an `ObjectKey token instead of a String one
let token = match (self.state, &token) {
(State::ObjKey, RawToken::String(s)) => RawToken::ObjectKey(s),
_ => token,
};
Ok(token)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::input::Input;
    use crate::scan::Scanner;

    // Scans `json` and validates it with the given `format`, asserting that the
    // validated tokens are exactly `expected` and that nothing but eof remains
    // afterwards.
    #[track_caller]
    fn pass<I>(format: Format, json: &str, expected: I)
    where
        I: IntoIterator<Item = RawToken<'static>>,
    {
        let mut input = Input::new(json.as_bytes());
        let mut scanner = Scanner::new();
        let mut structure = JsonStructure::new(format);

        let wanted: Vec<RawToken> = expected.into_iter().collect();
        for want in wanted {
            let scanned = scanner.read_token(&mut input);
            let got = scanned
                .and_then(|token| structure.validate(token))
                .expect("failed to parse json");
            assert_eq!(want, got);
        }

        // The scanner must have nothing left to hand out
        assert_eq!(scanner.read_token(&mut input), Ok(RawToken::Eof));
    }

    // Scans `json` and validates it with the given `format`, asserting that
    // validation stops with the `expected` reason. Panics when the whole input,
    // including the eof token, is accepted.
    #[track_caller]
    fn fail(format: Format, json: &str, expected: Reason) {
        let mut input = Input::new(json.as_bytes());
        let mut scanner = Scanner::new();
        let mut structure = JsonStructure::new(format);

        let reason = loop {
            let token = scanner
                .read_token(&mut input)
                .expect("scanning should succeed");
            match structure.validate(token) {
                Ok(RawToken::Eof) => panic!("json was unexpectedly valid: {}", json),
                Ok(_) => continue,
                Err(reason) => break reason,
            }
        };
        assert_eq!(reason, expected);
    }

    #[test]
    fn nesting_tests() {
        // Unclosed containers run into the end of the input
        fail(Format::Regular, "[[]", Reason::UnexpectedEof);
        fail(Format::Regular, "[null", Reason::UnexpectedEof);
        // Closing more containers than were opened is rejected
        fail(Format::Regular, "[]]", Reason::UnexpectedChar);
        fail(Format::Regular, "{}}", Reason::UnexpectedChar);
    }

    #[test]
    fn streaming_json() {
        // In streaming mode, a finished top-level container or primitive can be
        // followed by any other json value
        let expected = [
            RawToken::ArrayStart,
            RawToken::ArrayEnd,
            RawToken::ObjectStart,
            RawToken::ObjectEnd,
            RawToken::Bool(true),
            RawToken::Null,
            RawToken::Number(b"1"),
            RawToken::String("hello"),
            RawToken::ArrayStart,
            RawToken::ArrayEnd,
            RawToken::Eof,
        ];
        pass(
            Format::Concatenated,
            "[]{}truenull1\"hello\"[]",
            expected,
        );
    }
}