Skip to main content

couchbase_core/httpx/
scanner.rs

/*
 *
 *  * Copyright (c) 2025 Couchbase, Inc.
 *  *
 *  * Licensed under the Apache License, Version 2.0 (the "License");
 *  * you may not use this file except in compliance with the License.
 *  * You may obtain a copy of the License at
 *  *
 *  *    http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  * Unless required by applicable law or agreed to in writing, software
 *  * distributed under the License is distributed on an "AS IS" BASIS,
 *  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  * See the License for the specific language governing permissions and
 *  * limitations under the License.
 *
 */
18
19use crate::httpx::error;
20use std::str;
21
/// Event produced by the scanner for each byte it consumes.
///
/// The explicit discriminants are part of the type's contract and are kept
/// stable; the derived ordering follows them.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum ScanState {
    /// The byte was consumed without any structural event (e.g. literal interior).
    Continue = 0,
    /// The byte starts a literal: a string, number, `true`, `false` or `null`.
    BeginLiteral = 1,
    /// The byte was `{`, opening an object.
    BeginObject = 2,
    /// The byte was the `:` that follows an object key.
    ObjectKey = 3,
    /// The byte was the `,` that follows an object value.
    ObjectValue = 4,
    /// The byte was `}`, closing an object.
    EndObject = 5,
    /// The byte was `[`, opening an array.
    BeginArray = 6,
    /// The byte was the `,` that follows an array element.
    ArrayValue = 7,
    /// The byte was `]`, closing an array.
    EndArray = 8,
    /// The byte was insignificant whitespace.
    SkipSpace = 9,

    // Stop.
    /// The top-level value has ended.
    End = 10,
    /// A syntax error was encountered.
    Error = 11,
}
39
/// Kind of container position tracked on the scanner's nesting stack.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum ParseState {
    /// Inside an object, at (or just after) a key.
    ObjectKey = 0,
    /// Inside an object, at (or just after) the value that follows a `:`.
    ObjectValue = 1,
    /// Inside an array, at (or just after) an element.
    ArrayValue = 2,
}
46
47pub(crate) struct Scanner {
48    step: fn(&mut Scanner, u8) -> ScanState,
49    end_top: bool,
50    parse_state: Vec<ParseState>,
51    err: Option<error::Error>,
52    bytes: usize,
53}
54
55impl Scanner {
56    pub fn new() -> Self {
57        Scanner {
58            step: Scanner::state_begin_value,
59            end_top: false,
60            parse_state: Vec::new(),
61            err: None,
62            bytes: 0,
63        }
64    }
65
66    pub fn step(&mut self, step: u8) -> ScanState {
67        (self.step)(self, step)
68    }
69
70    pub fn incr_bytes(&mut self, incr: isize) {
71        self.bytes = (self.bytes as isize + incr) as usize;
72    }
73
74    pub fn err(&self) -> Option<&error::Error> {
75        self.err.as_ref()
76    }
77
78    pub fn reset(&mut self) {
79        self.step = Scanner::state_begin_value;
80        self.parse_state.clear();
81        self.err = None;
82        self.end_top = false;
83    }
84
85    fn eof(&mut self) -> ScanState {
86        if self.err.is_some() {
87            return ScanState::Error;
88        }
89        if self.end_top {
90            return ScanState::End;
91        }
92        (self.step)(self, b' ')
93    }
94
95    fn state_begin_value(s: &mut Scanner, c: u8) -> ScanState {
96        if c.is_ascii_whitespace() {
97            return ScanState::SkipSpace;
98        }
99        match c {
100            b'{' => {
101                s.step = Scanner::state_begin_string_or_empty;
102                s.push_parse_state(ParseState::ObjectKey, ScanState::BeginObject)
103            }
104            b'[' => {
105                s.step = Scanner::state_begin_value_or_empty;
106                s.push_parse_state(ParseState::ArrayValue, ScanState::BeginArray)
107            }
108            b'"' => {
109                s.step = Scanner::state_in_string;
110                ScanState::BeginLiteral
111            }
112            b'-' => {
113                s.step = Scanner::state_neg;
114                ScanState::BeginLiteral
115            }
116            b'0' => {
117                s.step = Scanner::state0;
118                ScanState::BeginLiteral
119            }
120            b't' => {
121                s.step = Scanner::state_t;
122                ScanState::BeginLiteral
123            }
124            b'f' => {
125                s.step = Scanner::state_f;
126                ScanState::BeginLiteral
127            }
128            b'n' => {
129                s.step = Scanner::state_n;
130                ScanState::BeginLiteral
131            }
132            _ if c.is_ascii_digit() => {
133                s.step = Scanner::state1;
134                ScanState::BeginLiteral
135            }
136            _ => s.error(c, "looking for beginning of value"),
137        }
138    }
139
140    fn state_begin_value_or_empty(s: &mut Scanner, c: u8) -> ScanState {
141        if c.is_ascii_whitespace() {
142            return ScanState::SkipSpace;
143        }
144        if c == b']' {
145            return Scanner::state_end_value(s, c);
146        }
147        Scanner::state_begin_value(s, c)
148    }
149
150    fn state_begin_string_or_empty(s: &mut Scanner, c: u8) -> ScanState {
151        if c.is_ascii_whitespace() {
152            return ScanState::SkipSpace;
153        }
154        if c == b'}' {
155            let n = s.parse_state.len();
156            s.parse_state[n - 1] = ParseState::ObjectValue;
157            return Scanner::state_end_value(s, c);
158        }
159        Scanner::state_begin_string(s, c)
160    }
161
162    fn state_begin_string(s: &mut Scanner, c: u8) -> ScanState {
163        if c.is_ascii_whitespace() {
164            return ScanState::SkipSpace;
165        }
166        if c == b'"' {
167            s.step = Scanner::state_in_string;
168            return ScanState::BeginLiteral;
169        }
170        s.error(c, "looking for beginning of object key string")
171    }
172
173    fn state_end_value(s: &mut Scanner, c: u8) -> ScanState {
174        let n = s.parse_state.len();
175        if n == 0 {
176            s.step = Scanner::state_end_top;
177            s.end_top = true;
178            return Scanner::state_end_top(s, c);
179        }
180        if c.is_ascii_whitespace() {
181            s.step = Scanner::state_end_value;
182            return ScanState::SkipSpace;
183        }
184        let ps = s.parse_state[n - 1];
185        match ps {
186            ParseState::ObjectKey => {
187                if c == b':' {
188                    s.parse_state[n - 1] = ParseState::ObjectValue;
189                    s.step = Scanner::state_begin_value;
190                    return ScanState::ObjectKey;
191                }
192                s.error(c, "after object key")
193            }
194            ParseState::ObjectValue => {
195                if c == b',' {
196                    s.parse_state[n - 1] = ParseState::ObjectKey;
197                    s.step = Scanner::state_begin_string;
198                    return ScanState::ObjectValue;
199                }
200                if c == b'}' {
201                    s.pop_parse_state();
202                    return ScanState::EndObject;
203                }
204                s.error(c, "after object key:value pair")
205            }
206            ParseState::ArrayValue => {
207                if c == b',' {
208                    s.step = Scanner::state_begin_value;
209                    return ScanState::ArrayValue;
210                }
211                if c == b']' {
212                    s.pop_parse_state();
213                    return ScanState::EndArray;
214                }
215                s.error(c, "after array element")
216            }
217        }
218    }
219
220    fn state_end_top(s: &mut Scanner, c: u8) -> ScanState {
221        if !c.is_ascii_whitespace() {
222            s.error(c, "after top-level value");
223        }
224
225        ScanState::End
226    }
227
228    fn state_in_string(s: &mut Scanner, c: u8) -> ScanState {
229        if c == b'"' {
230            s.step = Scanner::state_end_value;
231            return ScanState::Continue;
232        }
233        if c == b'\\' {
234            s.step = Scanner::state_in_string_esc;
235            return ScanState::Continue;
236        }
237        if c < 0x20 {
238            return s.error(c, "in string literal");
239        }
240        ScanState::Continue
241    }
242
243    fn state_in_string_esc(s: &mut Scanner, c: u8) -> ScanState {
244        match c {
245            b'b' | b'f' | b'n' | b'r' | b't' | b'\\' | b'/' | b'"' => {
246                s.step = Scanner::state_in_string;
247                ScanState::Continue
248            }
249            b'u' => {
250                s.step = Scanner::state_in_string_esc_u;
251                ScanState::Continue
252            }
253            _ => s.error(c, "in string escape code"),
254        }
255    }
256
257    fn state_in_string_esc_u(s: &mut Scanner, c: u8) -> ScanState {
258        if c.is_ascii_hexdigit() {
259            s.step = Scanner::state_in_string_esc_u1;
260            ScanState::Continue
261        } else {
262            s.error(c, "in \\u hexadecimal character escape")
263        }
264    }
265
266    fn state_in_string_esc_u1(s: &mut Scanner, c: u8) -> ScanState {
267        if c.is_ascii_hexdigit() {
268            s.step = Scanner::state_in_string_esc_u12;
269            ScanState::Continue
270        } else {
271            s.error(c, "in \\u hexadecimal character escape")
272        }
273    }
274
275    fn state_in_string_esc_u12(s: &mut Scanner, c: u8) -> ScanState {
276        if c.is_ascii_hexdigit() {
277            s.step = Scanner::state_in_string_esc_u123;
278            ScanState::Continue
279        } else {
280            s.error(c, "in \\u hexadecimal character escape")
281        }
282    }
283
284    fn state_in_string_esc_u123(s: &mut Scanner, c: u8) -> ScanState {
285        if c.is_ascii_hexdigit() {
286            s.step = Scanner::state_in_string;
287            ScanState::Continue
288        } else {
289            s.error(c, "in \\u hexadecimal character escape")
290        }
291    }
292
293    fn state_neg(s: &mut Scanner, c: u8) -> ScanState {
294        if c == b'0' {
295            s.step = Scanner::state0;
296            ScanState::Continue
297        } else if c.is_ascii_digit() {
298            s.step = Scanner::state1;
299            ScanState::Continue
300        } else {
301            s.error(c, "in numeric literal")
302        }
303    }
304
305    fn state1(s: &mut Scanner, c: u8) -> ScanState {
306        if c.is_ascii_digit() {
307            s.step = Scanner::state1;
308            ScanState::Continue
309        } else {
310            Scanner::state0(s, c)
311        }
312    }
313
314    fn state0(s: &mut Scanner, c: u8) -> ScanState {
315        if c == b'.' {
316            s.step = Scanner::state_dot;
317            ScanState::Continue
318        } else if c == b'e' || c == b'E' {
319            s.step = Scanner::state_e;
320            ScanState::Continue
321        } else {
322            Scanner::state_end_value(s, c)
323        }
324    }
325
326    fn state_dot(s: &mut Scanner, c: u8) -> ScanState {
327        if c.is_ascii_digit() {
328            s.step = Scanner::state_dot0;
329            ScanState::Continue
330        } else {
331            s.error(c, "after decimal point in numeric literal")
332        }
333    }
334
335    fn state_dot0(s: &mut Scanner, c: u8) -> ScanState {
336        if c.is_ascii_digit() {
337            ScanState::Continue
338        } else if c == b'e' || c == b'E' {
339            s.step = Scanner::state_e;
340            ScanState::Continue
341        } else {
342            Scanner::state_end_value(s, c)
343        }
344    }
345
346    fn state_e(s: &mut Scanner, c: u8) -> ScanState {
347        if c == b'+' || c == b'-' {
348            s.step = Scanner::state_e_sign;
349            ScanState::Continue
350        } else {
351            Scanner::state_e_sign(s, c)
352        }
353    }
354
355    fn state_e_sign(s: &mut Scanner, c: u8) -> ScanState {
356        if c.is_ascii_digit() {
357            s.step = Scanner::state_e0;
358            ScanState::Continue
359        } else {
360            s.error(c, "in exponent of numeric literal")
361        }
362    }
363
364    fn state_e0(s: &mut Scanner, c: u8) -> ScanState {
365        if c.is_ascii_digit() {
366            ScanState::Continue
367        } else {
368            Scanner::state_end_value(s, c)
369        }
370    }
371
372    fn state_t(s: &mut Scanner, c: u8) -> ScanState {
373        if c == b'r' {
374            s.step = Scanner::state_tr;
375            ScanState::Continue
376        } else {
377            s.error(c, "in literal true (expecting 'r')")
378        }
379    }
380
381    fn state_tr(s: &mut Scanner, c: u8) -> ScanState {
382        if c == b'u' {
383            s.step = Scanner::state_tru;
384            ScanState::Continue
385        } else {
386            s.error(c, "in literal true (expecting 'u')")
387        }
388    }
389
390    fn state_tru(s: &mut Scanner, c: u8) -> ScanState {
391        if c == b'e' {
392            s.step = Scanner::state_end_value;
393            ScanState::Continue
394        } else {
395            s.error(c, "in literal true (expecting 'e')")
396        }
397    }
398
399    fn state_f(s: &mut Scanner, c: u8) -> ScanState {
400        if c == b'a' {
401            s.step = Scanner::state_fa;
402            ScanState::Continue
403        } else {
404            s.error(c, "in literal false (expecting 'a')")
405        }
406    }
407
408    fn state_fa(s: &mut Scanner, c: u8) -> ScanState {
409        if c == b'l' {
410            s.step = Scanner::state_fal;
411            ScanState::Continue
412        } else {
413            s.error(c, "in literal false (expecting 'l')")
414        }
415    }
416
417    fn state_fal(s: &mut Scanner, c: u8) -> ScanState {
418        if c == b's' {
419            s.step = Scanner::state_fals;
420            ScanState::Continue
421        } else {
422            s.error(c, "in literal false (expecting 's')")
423        }
424    }
425
426    fn state_fals(s: &mut Scanner, c: u8) -> ScanState {
427        if c == b'e' {
428            s.step = Scanner::state_end_value;
429            ScanState::Continue
430        } else {
431            s.error(c, "in literal false (expecting 'e')")
432        }
433    }
434
435    fn state_n(s: &mut Scanner, c: u8) -> ScanState {
436        if c == b'u' {
437            s.step = Scanner::state_nu;
438            ScanState::Continue
439        } else {
440            s.error(c, "in literal null (expecting 'u')")
441        }
442    }
443
444    fn state_nu(s: &mut Scanner, c: u8) -> ScanState {
445        if c == b'l' {
446            s.step = Scanner::state_nul;
447            ScanState::Continue
448        } else {
449            s.error(c, "in literal null (expecting 'l')")
450        }
451    }
452
453    fn state_nul(s: &mut Scanner, c: u8) -> ScanState {
454        if c == b'l' {
455            s.step = Scanner::state_end_value;
456            ScanState::Continue
457        } else {
458            s.error(c, "in literal null (expecting 'l')")
459        }
460    }
461
462    fn error(&mut self, c: u8, context: &str) -> ScanState {
463        self.step = Scanner::state_error;
464        self.err = Some(error::Error::new_message_error(format!(
465            "invalid character {} {}",
466            Scanner::quote_char(c),
467            context
468        )));
469        ScanState::Error
470    }
471
472    fn state_error(_s: &mut Scanner, _c: u8) -> ScanState {
473        ScanState::Error
474    }
475
476    pub fn quote_char(c: u8) -> String {
477        match c {
478            b'\'' => "'\\''".to_string(),
479            b'"' => "'\"'".to_string(),
480            _ => format!("'{}'", c as char),
481        }
482    }
483
484    fn push_parse_state(
485        &mut self,
486        new_parse_state: ParseState,
487        success_state: ScanState,
488    ) -> ScanState {
489        self.parse_state.push(new_parse_state);
490        success_state
491    }
492
493    fn pop_parse_state(&mut self) {
494        self.parse_state.pop();
495        if self.parse_state.is_empty() {
496            self.step = Scanner::state_end_top;
497            self.end_top = true;
498        } else {
499            self.step = Scanner::state_end_value;
500        }
501    }
502}
503
504fn valid(data: &[u8]) -> bool {
505    let mut scan = Scanner::new();
506    for &c in data {
507        scan.bytes += 1;
508        if (scan.step)(&mut scan, c) == ScanState::Error {
509            return false;
510        }
511    }
512    scan.eof() != ScanState::Error
513}
514
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven check of `valid` against well-formed and malformed documents.
    #[test]
    fn test_valid() {
        // (input, expected verdict)
        let cases: &[(&str, bool)] = &[
            ("foo", false),
            ("}{", false),
            ("{]", false),
            ("{}", true),
            ("[]", true),
            ("[1,2,3]", true),
            ("[1,2,3,]", false),
            (r#"{"foo":"bar"}"#, true),
            (r#"{"foo": "bar",}"#, false),
            (r#"{"foo": "bar", "baz":}"#, false),
            (r#"{"foo": "bar", "baz": 123,}"#, false),
            (r#"{"foo":"bar","bar":{"baz":["qux"]}}"#, true),
            (r#"{"foo": "bar", "baz": 123, "qux":}"#, false),
        ];

        for &(data, expected) in cases.iter() {
            assert_eq!(valid(data.as_bytes()), expected, "data: {data}");
        }
    }
}