// sfv/parser.rs

1use std::{borrow::Cow, string::String as StdString};
2
3use crate::{
4    error, utils,
5    visitor::{
6        DictionaryVisitor, EntryVisitor, InnerListVisitor, ItemVisitor, ListVisitor,
7        ParameterVisitor,
8    },
9    BareItemFromInput, Date, Decimal, Integer, KeyRef, Num, SFVResult, String, StringRef, TokenRef,
10    Version,
11};
12
13fn parse_item<'de>(
14    parser: &mut Parser<'de>,
15    visitor: impl ItemVisitor<'de>,
16) -> Result<(), error::Repr> {
17    // https://httpwg.org/specs/rfc9651.html#parse-item
18    let param_visitor = visitor.bare_item(parser.parse_bare_item()?)?;
19    parser.parse_parameters(param_visitor)
20}
21
22fn parse_comma_separated<'de>(
23    parser: &mut Parser<'de>,
24    mut parse_member: impl FnMut(&mut Parser<'de>) -> Result<(), error::Repr>,
25) -> Result<(), error::Repr> {
26    while parser.peek().is_some() {
27        parse_member(parser)?;
28
29        parser.consume_ows_chars();
30
31        if parser.peek().is_none() {
32            return Ok(());
33        }
34
35        let comma_index = parser.index;
36
37        if let Some(c) = parser.peek() {
38            if c != b',' {
39                return Err(error::Repr::TrailingCharactersAfterMember(parser.index));
40            }
41            parser.next();
42        }
43
44        parser.consume_ows_chars();
45
46        if parser.peek().is_none() {
47            // Report the error at the position of the comma itself, rather
48            // than at the end of input.
49            return Err(error::Repr::TrailingComma(comma_index));
50        }
51    }
52
53    Ok(())
54}
55
/// Exposes methods for parsing input into a structured field value.
///
/// A parser is created over a borrowed byte slice with [`Parser::new`] and is
/// consumed by whichever public `parse*` method is called on it.
#[must_use]
pub struct Parser<'de> {
    // The complete input being parsed.
    input: &'de [u8],
    // Offset into `input` of the next byte to be read.
    index: usize,
    // Specification version in effect; consulted when parsing dates and
    // display strings, which are rejected under `Version::Rfc8941`.
    version: Version,
}
63
64impl<'de> Parser<'de> {
65    /// Creates a parser from the given input with [`Version::Rfc9651`].
66    pub fn new(input: &'de (impl ?Sized + AsRef<[u8]>)) -> Self {
67        Self {
68            input: input.as_ref(),
69            index: 0,
70            version: Version::Rfc9651,
71        }
72    }
73
74    /// Sets the parser's version and returns it.
75    pub fn with_version(mut self, version: Version) -> Self {
76        self.version = version;
77        self
78    }
79
80    /// Parses a structured field value.
81    ///
82    /// # Errors
83    /// When the parsing process is unsuccessful.
84    #[cfg(feature = "parsed-types")]
85    pub fn parse<T: crate::FieldType>(self) -> SFVResult<T> {
86        T::parse(self)
87    }
88
89    /// Parses input into a structured field value of `Dictionary` type, using
90    /// the given visitor.
91    #[cfg_attr(
92        feature = "parsed-types",
93        doc = r#"
94
95This can also be used to parse a dictionary that is split into multiple lines by merging
96them into an existing structure:
97
98```
99# use sfv::{Dictionary, FieldType, Parser};
100# fn main() -> Result<(), sfv::Error> {
101let mut dict: Dictionary = Parser::new("a=1").parse()?;
102
103Parser::new("b=2").parse_dictionary_with_visitor(&mut dict)?;
104
105assert_eq!(
106    dict.serialize().as_deref(),
107    Some("a=1, b=2"),
108);
109# Ok(())
110# }
111```
112"#
113    )]
114    ///
115    /// # Errors
116    /// When the parsing process is unsuccessful, including any error raised by a visitor.
117    pub fn parse_dictionary_with_visitor(
118        self,
119        visitor: &mut (impl ?Sized + DictionaryVisitor<'de>),
120    ) -> SFVResult<()> {
121        // https://httpwg.org/specs/rfc9651.html#parse-dictionary
122        self.parse_internal(move |parser| {
123            parse_comma_separated(parser, |parser| {
124                // Note: It is up to the visitor to properly handle duplicate keys.
125                let entry_visitor = visitor.entry(parser.parse_key()?)?;
126
127                if let Some(b'=') = parser.peek() {
128                    parser.next();
129                    parser.parse_list_entry(entry_visitor)
130                } else {
131                    let param_visitor = entry_visitor.bare_item(BareItemFromInput::from(true))?;
132                    parser.parse_parameters(param_visitor)
133                }
134            })
135        })
136    }
137
138    /// Parses input into a structured field value of `List` type, using the
139    /// given visitor.
140    #[allow(clippy::needless_raw_string_hashes)] // false positive: https://github.com/rust-lang/rust-clippy/issues/11737
141    #[cfg_attr(
142        feature = "parsed-types",
143        doc = r##"
144
145This can also be used to parse a list that is split into multiple lines by merging them
146into an existing structure:
147```
148# use sfv::{FieldType, List, Parser};
149# fn main() -> Result<(), sfv::Error> {
150let mut list: List = Parser::new("11, (12 13)").parse()?;
151
152Parser::new(r#""foo",        "bar""#).parse_list_with_visitor(&mut list)?;
153
154assert_eq!(
155    list.serialize().as_deref(),
156    Some(r#"11, (12 13), "foo", "bar""#),
157);
158# Ok(())
159# }
160```
161"##
162    )]
163    ///
164    /// # Errors
165    /// When the parsing process is unsuccessful, including any error raised by a visitor.
166    pub fn parse_list_with_visitor(
167        self,
168        visitor: &mut (impl ?Sized + ListVisitor<'de>),
169    ) -> SFVResult<()> {
170        // https://httpwg.org/specs/rfc9651.html#parse-list
171        self.parse_internal(|parser| {
172            parse_comma_separated(parser, |parser| parser.parse_list_entry(visitor.entry()?))
173        })
174    }
175
176    /// Parses input into a structured field value of `Item` type, using the
177    /// given visitor.
178    ///
179    /// # Errors
180    /// When the parsing process is unsuccessful, including any error raised by a visitor.
181    pub fn parse_item_with_visitor(self, visitor: impl ItemVisitor<'de>) -> SFVResult<()> {
182        self.parse_internal(|parser| parse_item(parser, visitor))
183    }
184
185    fn peek(&self) -> Option<u8> {
186        self.input.get(self.index).copied()
187    }
188
189    fn next(&mut self) -> Option<u8> {
190        self.peek().inspect(|_| self.index += 1)
191    }
192
193    // Generic parse method for checking input before parsing
194    // and handling trailing text error
195    fn parse_internal(
196        mut self,
197        f: impl FnOnce(&mut Self) -> Result<(), error::Repr>,
198    ) -> SFVResult<()> {
199        // https://httpwg.org/specs/rfc9651.html#text-parse
200
201        self.consume_sp_chars();
202
203        f(&mut self)?;
204
205        self.consume_sp_chars();
206
207        if self.peek().is_some() {
208            return Err(error::Repr::TrailingCharactersAfterParsedValue(self.index).into());
209        }
210
211        Ok(())
212    }
213
214    fn parse_list_entry(&mut self, visitor: impl EntryVisitor<'de>) -> Result<(), error::Repr> {
215        // https://httpwg.org/specs/rfc9651.html#parse-item-or-list
216        // ListEntry represents a tuple (item_or_inner_list, parameters)
217
218        match self.peek() {
219            Some(b'(') => self.parse_inner_list(visitor.inner_list()?),
220            _ => parse_item(self, visitor),
221        }
222    }
223
224    pub(crate) fn parse_inner_list(
225        &mut self,
226        mut visitor: impl InnerListVisitor<'de>,
227    ) -> Result<(), error::Repr> {
228        // https://httpwg.org/specs/rfc9651.html#parse-innerlist
229
230        if Some(b'(') != self.peek() {
231            return Err(error::Repr::ExpectedStartOfInnerList(self.index));
232        }
233
234        self.next();
235
236        while self.peek().is_some() {
237            self.consume_sp_chars();
238
239            if Some(b')') == self.peek() {
240                self.next();
241                let param_visitor = visitor.finish()?;
242                return self.parse_parameters(param_visitor);
243            }
244
245            parse_item(self, visitor.item()?)?;
246
247            if let Some(c) = self.peek() {
248                if c != b' ' && c != b')' {
249                    return Err(error::Repr::ExpectedInnerListDelimiter(self.index));
250                }
251            }
252        }
253
254        Err(error::Repr::UnterminatedInnerList(self.index))
255    }
256
257    pub(crate) fn parse_bare_item(&mut self) -> Result<BareItemFromInput<'de>, error::Repr> {
258        // https://httpwg.org/specs/rfc9651.html#parse-bare-item
259
260        Ok(match self.peek() {
261            Some(b'?') => BareItemFromInput::Boolean(self.parse_bool()?),
262            Some(b'"') => BareItemFromInput::String(self.parse_string()?),
263            Some(b':') => BareItemFromInput::ByteSequence(self.parse_byte_sequence()?),
264            Some(b'@') => BareItemFromInput::Date(self.parse_date()?),
265            Some(b'%') => BareItemFromInput::DisplayString(self.parse_display_string()?),
266            Some(c) if utils::is_allowed_start_token_char(c) => {
267                BareItemFromInput::Token(self.parse_token()?)
268            }
269            Some(c) if c == b'-' || c.is_ascii_digit() => match self.parse_number()? {
270                Num::Decimal(val) => BareItemFromInput::Decimal(val),
271                Num::Integer(val) => BareItemFromInput::Integer(val),
272            },
273            _ => return Err(error::Repr::ExpectedStartOfBareItem(self.index)),
274        })
275    }
276
277    pub(crate) fn parse_bool(&mut self) -> Result<bool, error::Repr> {
278        // https://httpwg.org/specs/rfc9651.html#parse-boolean
279
280        if self.peek() != Some(b'?') {
281            return Err(error::Repr::ExpectedStartOfBoolean(self.index));
282        }
283
284        self.next();
285
286        match self.peek() {
287            Some(b'0') => {
288                self.next();
289                Ok(false)
290            }
291            Some(b'1') => {
292                self.next();
293                Ok(true)
294            }
295            _ => Err(error::Repr::ExpectedBoolean(self.index)),
296        }
297    }
298
    /// Parses a double-quoted string.
    ///
    /// The result borrows directly from the input as long as no backslash
    /// escape has been seen; the first escape switches to an owned buffer
    /// holding the unescaped bytes.
    ///
    /// # Errors
    /// - `ExpectedStartOfString` when the input does not start with `"`.
    /// - `InvalidStringCharacter` for a byte in `0x00..=0x1f` or `0x7f..=0xff`.
    /// - `UnterminatedEscapeSequence` when the input ends right after a
    ///   backslash.
    /// - `InvalidEscapeSequence` when a backslash is followed by anything
    ///   other than `\` or `"`.
    /// - `UnterminatedString` when the input ends before the closing quote.
    pub(crate) fn parse_string(&mut self) -> Result<Cow<'de, StringRef>, error::Repr> {
        // https://httpwg.org/specs/rfc9651.html#parse-string

        if self.peek() != Some(b'"') {
            return Err(error::Repr::ExpectedStartOfString(self.index));
        }

        self.next();

        // `start` marks the first content byte; while `output` is borrowed it
        // is always the slice `input[start..index]`.
        let start = self.index;
        let mut output = Cow::Borrowed(&[] as &[u8]);

        while let Some(curr_char) = self.peek() {
            match curr_char {
                b'"' => {
                    // Closing quote: consume it and convert the collected bytes.
                    self.next();
                    // TODO: The UTF-8 validation is redundant with the preceding character checks, but
                    // its removal is only possible with unsafe code.
                    return Ok(match output {
                        Cow::Borrowed(output) => {
                            let output = std::str::from_utf8(output).unwrap();
                            Cow::Borrowed(StringRef::from_str(output).unwrap())
                        }
                        Cow::Owned(output) => {
                            let output = StdString::from_utf8(output).unwrap();
                            Cow::Owned(String::from_string(output).unwrap())
                        }
                    });
                }
                // Control characters, DEL and non-ASCII bytes are rejected.
                0x00..=0x1f | 0x7f..=0xff => {
                    return Err(error::Repr::InvalidStringCharacter(self.index));
                }
                b'\\' => {
                    self.next();
                    match self.peek() {
                        Some(c @ (b'\\' | b'"')) => {
                            self.next();
                            // `to_mut` copies the bytes borrowed so far on the
                            // first escape; only the escaped byte is stored,
                            // not the backslash.
                            output.to_mut().push(c);
                        }
                        None => return Err(error::Repr::UnterminatedEscapeSequence(self.index)),
                        Some(_) => return Err(error::Repr::InvalidEscapeSequence(self.index)),
                    }
                }
                _ => {
                    self.next();
                    match output {
                        // No escape seen yet: just widen the borrowed slice.
                        Cow::Borrowed(ref mut output) => *output = &self.input[start..self.index],
                        Cow::Owned(ref mut output) => output.push(curr_char),
                    }
                }
            }
        }
        Err(error::Repr::UnterminatedString(self.index))
    }
353
354    fn parse_non_empty_str(
355        &mut self,
356        is_allowed_start_char: impl FnOnce(u8) -> bool,
357        is_allowed_inner_char: impl Fn(u8) -> bool,
358    ) -> Option<&'de str> {
359        let start = self.index;
360
361        match self.peek() {
362            Some(c) if is_allowed_start_char(c) => {
363                self.next();
364            }
365            _ => return None,
366        }
367
368        loop {
369            match self.peek() {
370                Some(c) if is_allowed_inner_char(c) => {
371                    self.next();
372                }
373                // TODO: The UTF-8 validation is redundant with the preceding character checks, but
374                // its removal is only possible with unsafe code.
375                _ => return Some(std::str::from_utf8(&self.input[start..self.index]).unwrap()),
376            }
377        }
378    }
379
380    pub(crate) fn parse_token(&mut self) -> Result<&'de TokenRef, error::Repr> {
381        // https://httpwg.org/specs/9651.html#parse-token
382
383        match self.parse_non_empty_str(
384            utils::is_allowed_start_token_char,
385            utils::is_allowed_inner_token_char,
386        ) {
387            None => Err(error::Repr::ExpectedStartOfToken(self.index)),
388            Some(str) => Ok(TokenRef::from_validated_str(str)),
389        }
390    }
391
392    pub(crate) fn parse_byte_sequence(&mut self) -> Result<Vec<u8>, error::Repr> {
393        // https://httpwg.org/specs/rfc9651.html#parse-binary
394
395        if self.peek() != Some(b':') {
396            return Err(error::Repr::ExpectedStartOfByteSequence(self.index));
397        }
398
399        self.next();
400        let start = self.index;
401
402        loop {
403            match self.next() {
404                Some(b':') => break,
405                Some(_) => {}
406                None => return Err(error::Repr::UnterminatedByteSequence(self.index)),
407            }
408        }
409
410        let colon_index = self.index - 1;
411
412        match base64::Engine::decode(&utils::BASE64, &self.input[start..colon_index]) {
413            Ok(content) => Ok(content),
414            Err(err) => {
415                let index = match err {
416                    base64::DecodeError::InvalidByte(offset, _)
417                    | base64::DecodeError::InvalidLastSymbol(offset, _) => start + offset,
418                    // Report these two at the position of the last base64
419                    // character, since they correspond to errors in the input
420                    // as a whole.
421                    base64::DecodeError::InvalidLength(_) | base64::DecodeError::InvalidPadding => {
422                        colon_index - 1
423                    }
424                };
425
426                Err(error::Repr::InvalidByteSequence(index))
427            }
428        }
429    }
430
431    pub(crate) fn parse_number(&mut self) -> Result<Num, error::Repr> {
432        // https://httpwg.org/specs/rfc9651.html#parse-number
433
434        fn char_to_i64(c: u8) -> i64 {
435            i64::from(c - b'0')
436        }
437
438        let sign = if let Some(b'-') = self.peek() {
439            self.next();
440            -1
441        } else {
442            1
443        };
444
445        let mut magnitude = match self.peek() {
446            Some(c @ b'0'..=b'9') => {
447                self.next();
448                char_to_i64(c)
449            }
450            _ => return Err(error::Repr::ExpectedDigit(self.index)),
451        };
452
453        let mut digits = 1;
454
455        loop {
456            match self.peek() {
457                Some(b'.') => {
458                    if digits > 12 {
459                        return Err(error::Repr::TooManyDigitsBeforeDecimalPoint(self.index));
460                    }
461                    self.next();
462                    break;
463                }
464                Some(c @ b'0'..=b'9') => {
465                    digits += 1;
466                    if digits > 15 {
467                        return Err(error::Repr::TooManyDigits(self.index));
468                    }
469                    self.next();
470                    magnitude = magnitude * 10 + char_to_i64(c);
471                }
472                _ => return Ok(Num::Integer(Integer::try_from(sign * magnitude).unwrap())),
473            }
474        }
475
476        magnitude *= 1000;
477        let mut scale = 100;
478
479        while let Some(c @ b'0'..=b'9') = self.peek() {
480            if scale == 0 {
481                return Err(error::Repr::TooManyDigitsAfterDecimalPoint(self.index));
482            }
483
484            self.next();
485            magnitude += char_to_i64(c) * scale;
486            scale /= 10;
487        }
488
489        if scale == 100 {
490            // Report the error at the position of the decimal itself, rather
491            // than the next position.
492            Err(error::Repr::TrailingDecimalPoint(self.index - 1))
493        } else {
494            Ok(Num::Decimal(Decimal::from_integer_scaled_1000(
495                Integer::try_from(sign * magnitude).unwrap(),
496            )))
497        }
498    }
499
500    pub(crate) fn parse_date(&mut self) -> Result<Date, error::Repr> {
501        // https://httpwg.org/specs/rfc9651.html#parse-date
502
503        if self.peek() != Some(b'@') {
504            return Err(error::Repr::ExpectedStartOfDate(self.index));
505        }
506
507        match self.version {
508            Version::Rfc8941 => return Err(error::Repr::Rfc8941Date(self.index)),
509            Version::Rfc9651 => {}
510        }
511
512        let start = self.index;
513        self.next();
514
515        match self.parse_number()? {
516            Num::Integer(seconds) => Ok(Date::from_unix_seconds(seconds)),
517            Num::Decimal(_) => Err(error::Repr::NonIntegerDate(start)),
518        }
519    }
520
    /// Parses a display string: `%"` ... `"`, where `%xx` (two lowercase hex
    /// digits) encodes one byte and the decoded bytes must form valid UTF-8.
    ///
    /// The result borrows directly from the input as long as no `%` escape
    /// has been seen; the first escape switches to an owned buffer holding
    /// the decoded bytes.
    ///
    /// # Errors
    /// - `ExpectedStartOfDisplayString` when the input does not start with `%`.
    /// - `Rfc8941DisplayString` when the parser version is `Version::Rfc8941`.
    /// - `ExpectedQuote` when the `%` is not followed by `"`.
    /// - `InvalidDisplayStringCharacter` for a literal byte in `0x00..=0x1f`
    ///   or `0x7f..=0xff`.
    /// - `UnterminatedEscapeSequence` when the input ends inside a `%` escape.
    /// - `InvalidEscapeSequence` when a `%` escape contains a byte other than
    ///   `0-9` or `a-f`.
    /// - `InvalidUtf8InDisplayString` when the decoded bytes are not UTF-8.
    /// - `UnterminatedDisplayString` when the input ends before the closing
    ///   quote.
    pub(crate) fn parse_display_string(&mut self) -> Result<Cow<'de, str>, error::Repr> {
        // https://httpwg.org/specs/rfc9651.html#parse-display

        if self.peek() != Some(b'%') {
            return Err(error::Repr::ExpectedStartOfDisplayString(self.index));
        }

        // Display strings are only accepted under `Version::Rfc9651`.
        match self.version {
            Version::Rfc8941 => return Err(error::Repr::Rfc8941DisplayString(self.index)),
            Version::Rfc9651 => {}
        }

        self.next();

        if self.peek() != Some(b'"') {
            return Err(error::Repr::ExpectedQuote(self.index));
        }

        self.next();

        // `start` marks the first content byte; while `output` is borrowed it
        // is always the slice `input[start..index]`.
        let start = self.index;
        let mut output = Cow::Borrowed(&[] as &[u8]);

        while let Some(curr_char) = self.peek() {
            match curr_char {
                b'"' => {
                    // Closing quote: consume it and validate the collected
                    // bytes as UTF-8.
                    self.next();
                    return match output {
                        Cow::Borrowed(output) => match std::str::from_utf8(output) {
                            Ok(output) => Ok(Cow::Borrowed(output)),
                            Err(err) => Err(error::Repr::InvalidUtf8InDisplayString(
                                start + err.valid_up_to(),
                            )),
                        },
                        // NOTE(review): here `valid_up_to` is an offset into
                        // the decoded bytes, where each `%xx` escape is one
                        // byte rather than three, so `start + offset` may not
                        // line up with the offending input position — confirm
                        // whether that is intended.
                        Cow::Owned(output) => match StdString::from_utf8(output) {
                            Ok(output) => Ok(Cow::Owned(output)),
                            Err(err) => Err(error::Repr::InvalidUtf8InDisplayString(
                                start + err.utf8_error().valid_up_to(),
                            )),
                        },
                    };
                }
                // Control characters, DEL and non-ASCII bytes may not appear
                // literally.
                0x00..=0x1f | 0x7f..=0xff => {
                    return Err(error::Repr::InvalidDisplayStringCharacter(self.index));
                }
                b'%' => {
                    self.next();

                    let mut octet = 0;

                    // Accumulate exactly two hex digits, high nibble first;
                    // uppercase `A-F` is not accepted.
                    for _ in 0..2 {
                        octet = (octet << 4)
                            + match self.peek() {
                                Some(c @ b'0'..=b'9') => {
                                    self.next();
                                    c - b'0'
                                }
                                Some(c @ b'a'..=b'f') => {
                                    self.next();
                                    c - b'a' + 10
                                }
                                None => {
                                    return Err(error::Repr::UnterminatedEscapeSequence(self.index))
                                }
                                Some(_) => {
                                    return Err(error::Repr::InvalidEscapeSequence(self.index))
                                }
                            };
                    }

                    // `to_mut` copies the bytes borrowed so far on the first
                    // escape.
                    output.to_mut().push(octet);
                }
                _ => {
                    self.next();
                    match output {
                        // No escape seen yet: just widen the borrowed slice.
                        Cow::Borrowed(ref mut output) => *output = &self.input[start..self.index],
                        Cow::Owned(ref mut output) => output.push(curr_char),
                    }
                }
            }
        }
        Err(error::Repr::UnterminatedDisplayString(self.index))
    }
604
605    pub(crate) fn parse_parameters(
606        &mut self,
607        mut visitor: impl ParameterVisitor<'de>,
608    ) -> Result<(), error::Repr> {
609        // https://httpwg.org/specs/rfc9651.html#parse-param
610
611        while let Some(b';') = self.peek() {
612            self.next();
613            self.consume_sp_chars();
614
615            let param_name = self.parse_key()?;
616            let param_value = match self.peek() {
617                Some(b'=') => {
618                    self.next();
619                    self.parse_bare_item()?
620                }
621                _ => BareItemFromInput::Boolean(true),
622            };
623            // Note: It is up to the visitor to properly handle duplicate keys.
624            visitor.parameter(param_name, param_value)?;
625        }
626
627        visitor.finish()?;
628        Ok(())
629    }
630
631    pub(crate) fn parse_key(&mut self) -> Result<&'de KeyRef, error::Repr> {
632        // https://httpwg.org/specs/rfc9651.html#parse-key
633
634        match self.parse_non_empty_str(
635            utils::is_allowed_start_key_char,
636            utils::is_allowed_inner_key_char,
637        ) {
638            None => Err(error::Repr::ExpectedStartOfKey(self.index)),
639            Some(str) => Ok(KeyRef::from_validated_str(str)),
640        }
641    }
642
643    fn consume_ows_chars(&mut self) {
644        while let Some(b' ' | b'\t') = self.peek() {
645            self.next();
646        }
647    }
648
649    fn consume_sp_chars(&mut self) {
650        while let Some(b' ') = self.peek() {
651            self.next();
652        }
653    }
654
655    #[cfg(test)]
656    pub(crate) fn remaining(&self) -> &[u8] {
657        &self.input[self.index..]
658    }
659}