tor_netdoc/parse2/
lex.rs

1//! Lexing of netdoc elements
2
3use super::*;
4
/// Linear whitespace as defined by torspec
///
/// A space or a tab.
/// Used in this file as a `str` pattern,
/// with `split_once` and `trim_start_matches`.
// Only pub via internal_prelude, for benefit of macros
pub const WS: &[char] = &[' ', '\t'];
8
define_derive_deftly! {
    /// Define `parse_options` accessor
    ///
    /// The driver must have a lifetime named `'s`, which is suitable for the returned
    /// `&'s ParseOptions`.
    ///
    /// # Top-level attributes:
    ///
    ///  * **`#[deftly(parse_options(field = ".field.field"))]`**, default `.options`
    ///
    ///    The value is a field path, including its leading `.`,
    ///    which the template places directly after `&self`.
    ParseOptions beta_deftly, expect items:

    impl<$tgens> $ttype {
        /// Examine the parsing options
        pub fn parse_options(&self) -> &'s ParseOptions {
            // `&self` followed by the configured field path,
            // or by `.options` if the driver gave no `parse_options(field)` attribute.
            &self
                ${tmeta(parse_options(field))
                  as token_stream,
                  default { .options }}
        }
    }
}
30
/// Top-level reader: Netdoc text interpreted as a stream of items
///
/// Items are obtained with `next_item` (or through the `Iterator` impl);
/// the keyword of the upcoming item can be inspected first with `peek_keyword`.
#[derive(Debug, Clone, Deftly)]
#[derive_deftly(ParseOptions)]
pub struct ItemStream<'s> {
    /// The whole document.  Used for signature hashing.
    ///
    /// `byte_position` is an offset into this string.
    /// (While `parse_signed` runs, its inner stream has this field narrowed
    /// so that it starts at the sub-document.)
    whole_for_signatures: &'s str,
    /// Remaining document, as a stream of lines
    lines: Lines<'s>,
    /// If we have peeked ahead, what we discovered
    peeked: PeekState<'s>,
    /// Parsing options.
    options: &'s ParseOptions,
}
44
/// Whether an `ItemStream` has peeked ahead, and if so what it discovered
#[derive(Debug, Clone)]
enum PeekState<'s> {
    /// We've peeked a line
    ///
    /// The peeked line is still pending; `next_item` is what consumes it.
    Some(ItemStreamPeeked<'s>),
    /// We've not peeked, or peeking gave `None`
    None {
        /// Line number of the last item we yielded.
        ///
        /// `0` at the start.
        ///
        /// This is what `lno_for_error` reports while nothing is peeked.
        yielded_item_lno: usize,
    },
}
58
/// If an `ItemStream` has peeked ahead, what it discovered
#[derive(Debug, Clone)]
struct ItemStreamPeeked<'s> {
    /// The next keyword
    keyword: KeywordRef<'s>,
    /// Token for the peeked line, as returned by `Lines::peek`
    ///
    /// Handed back to `Lines::peeked_line` to look at the line again,
    /// and to `Lines::consume_peeked` when the item is actually yielded.
    line: lines::Peeked,
    /// Length of the suffix of the line that is the arguments rather than the keyword
    ///
    /// Does not include the first whitespace, that terminated the keyword.
    ///
    /// Measured in bytes.  `0` if nothing on the line follows the keyword.
    args_len: usize,
}
71
/// An Item that has been lexed but not parsed
///
/// Consists of the keyword, the not-yet-parsed arguments from the keyword line,
/// and the Object, if one followed the keyword line.
#[derive(Debug, Clone, amplify::Getters, Deftly)]
#[derive_deftly(ParseOptions)]
// This struct has no `options` field of its own;
// `parse_options()` is served from the options held by `args`.
#[deftly(parse_options(field = ".args.options"))]
pub struct UnparsedItem<'s> {
    /// The item's Keyword
    #[getter(as_copy)]
    keyword: KeywordRef<'s>,
    /// The Item's Arguments
    // No derived getter: access is via the hand-written
    // `args`, `args_mut` and `args_copy` methods.
    #[getter(skip)]
    args: ArgumentStream<'s>,
    /// The Item's Object, if there was one
    #[getter(as_clone)]
    object: Option<UnparsedObject<'s>>,
}
87
/// Reader for arguments on an Item
///
/// Represents the (remaining) arguments.
///
/// Implements `Iterator`, yielding one whitespace-separated argument at a time.
#[derive(Debug, Clone, Deftly)]
#[derive_deftly(ParseOptions)]
pub struct ArgumentStream<'s> {
    /// The remaining unparsed arguments
    ///
    /// Can start with WS, which is usually trimmed
    rest: &'s str,

    /// Original line length
    ///
    /// Used for reporting column of argument errors.
    whole_line_len: usize,

    /// Remaining length *before* we last yielded.
    ///
    /// Starts out equal to `whole_line_len`.
    /// `prev_arg_column` derives its answer from this.
    previous_rest_len: usize,

    /// Parsing options.
    options: &'s ParseOptions,
}
110
/// An Object that has been lexed but not parsed
///
/// The data has not been base64-decoded yet; see `decode_data`.
#[derive(Debug, Clone, amplify::Getters, Deftly)]
#[derive_deftly(ParseOptions)]
pub struct UnparsedObject<'s> {
    /// The Label
    ///
    /// Taken from the delimiter lines.
    /// The lexer has already checked that header and footer carry the same label.
    #[getter(as_copy)]
    label: &'s str,

    /// The portion of the input document which is base64 data (and newlines)
    // No derived getter: callers obtain the decoded bytes via `decode_data`.
    #[getter(skip)]
    data_b64: &'s str,

    /// Parsing options.
    options: &'s ParseOptions,
}
126
127impl<'s> ItemStream<'s> {
128    /// Start reading a network document as a series of Items
129    pub fn new(input: &'s ParseInput<'s>) -> Result<Self, ParseError> {
130        Ok(ItemStream {
131            whole_for_signatures: input.input,
132            lines: Lines::new(input.input),
133            peeked: PeekState::None {
134                yielded_item_lno: 0,
135            },
136            options: &input.options,
137        })
138    }
139
140    /// Line number for reporting an error we have just discovered
141    ///
142    /// If we have recent peeked, we report the line number of the peeked keyword line.
143    ///
144    /// Otherwise, we report the line number of the most-recently yielded item.
145    pub fn lno_for_error(&self) -> usize {
146        match self.peeked {
147            PeekState::Some { .. } => {
148                // The error was presumably caused by whatever was seen in the peek.
149                // That's the current line number.
150                self.lines.peek_lno()
151            }
152            PeekState::None { yielded_item_lno } => {
153                // The error was presumably caused by the results of next_item().
154                yielded_item_lno
155            }
156        }
157    }
158
159    /// Core of peeking.  Tries to make `.peeked` be `Some`.
160    fn peek_internal<'i>(&'i mut self) -> Result<(), EP> {
161        if matches!(self.peeked, PeekState::None { .. }) {
162            let Some(peeked) = self.lines.peek() else {
163                return Ok(());
164            };
165
166            let peeked_line = self.lines.peeked_line(&peeked);
167
168            let (keyword, args) = peeked_line.split_once(WS).unwrap_or((peeked_line, ""));
169            let keyword = KeywordRef::new(keyword)?;
170
171            self.peeked = PeekState::Some(ItemStreamPeeked {
172                keyword,
173                line: peeked,
174                args_len: args.len(),
175            });
176        }
177
178        Ok(())
179    }
180
181    /// Peek the next keyword
182    pub fn peek_keyword(&mut self) -> Result<Option<KeywordRef<'s>>, EP> {
183        self.peek_internal()?;
184        let PeekState::Some(peeked) = &self.peeked else {
185            return Ok(None);
186        };
187        Ok(Some(peeked.keyword))
188    }
189
190    /// Obtain the body so far, suitable for hashing for a Regular signature
191    pub fn body_sofar_for_signature(&self) -> SignedDocumentBody<'s> {
192        let body = &self.whole_for_signatures[0..self.byte_position()];
193        SignedDocumentBody { body }
194    }
195
196    /// Byte position, pointing to the start of the next item to yield
197    ///
198    /// Offset in bytes from the start of the original input string
199    /// to the "current" position,
200    /// ie to just after the item we yielded and just before the next item (or EOF).
201    pub fn byte_position(&self) -> usize {
202        self.whole_for_signatures.len() - self.lines.remaining().len()
203    }
204
205    /// Parse a (sub-)document with its own signatures
206    pub fn parse_signed<
207        B: NetdocParseable,
208        S: NetdocParseable,
209        O: NetdocSigned<Body = B, Signatures = S>,
210    >(
211        &mut self,
212        outer_stop: stop_at!(),
213    ) -> Result<O, EP> {
214        let mut input = ItemStream {
215            whole_for_signatures: &self.whole_for_signatures
216                [self.whole_for_signatures.len() - self.lines.remaining().len()..],
217            ..self.clone()
218        };
219        let r = (|| {
220            let inner_always_stop = outer_stop | StopAt::doc_intro::<B>();
221            let body = B::from_items(&mut input, inner_always_stop | StopAt::doc_intro::<S>())?;
222            let signatures = S::from_items(&mut input, inner_always_stop)?;
223            let signed = O::from_parts(body, signatures);
224            Ok(signed)
225        })(); // don't exit here
226
227        *self = ItemStream {
228            whole_for_signatures: self.whole_for_signatures,
229            ..input
230        };
231
232        r
233    }
234
235    /// Obtain the inputs that would be needed to hash any (even Irregular) signature
236    ///
237    /// These are the hash inputs which would be needed for the next item,
238    /// assuming it's a signature keyword.
239    pub fn peek_signature_hash_inputs(
240        &mut self,
241        body: SignedDocumentBody<'s>,
242    ) -> Result<Option<SignatureHashInputs<'s>>, EP> {
243        self.peek_internal()?;
244        let PeekState::Some(peeked) = &self.peeked else {
245            return Ok(None);
246        };
247        let signature_item_line = self.lines.peeked_line(&peeked.line);
248        let signature_item_kw_spc = signature_item_line.strip_end_counted(peeked.args_len);
249        Ok(Some(SignatureHashInputs {
250            body,
251            signature_item_kw_spc,
252            signature_item_line,
253        }))
254    }
255
256    /// Yield the next item.
257    pub fn next_item(&mut self) -> Result<Option<UnparsedItem<'s>>, EP> {
258        self.peek_internal()?;
259        let peeked = match self.peeked {
260            PeekState::None { .. } => return Ok(None),
261            PeekState::Some { .. } => match mem::replace(
262                &mut self.peeked,
263                PeekState::None {
264                    yielded_item_lno: self.lines.peek_lno(),
265                },
266            ) {
267                PeekState::Some(peeked) => peeked,
268                PeekState::None { .. } => panic!("it was Some just now"),
269            },
270        };
271
272        let keyword = peeked.keyword;
273        let line = self.lines.consume_peeked(peeked.line);
274        let args = &line[keyword.len()..];
275        let options = self.options;
276        let args = ArgumentStream::new(args, line.len(), options);
277
278        let object = if self.lines.remaining().starts_with('-') {
279            fn pem_delimiter<'s>(lines: &mut Lines<'s>, start: &str) -> Result<&'s str, EP> {
280                let line = lines.next().ok_or(
281                    // If this is the *header*, we already know there's a line,
282                    // so this error path is only for footers.
283                    EP::ObjectMissingFooter,
284                )?;
285                let label = line
286                    .strip_prefix(start)
287                    .ok_or(EP::InvalidObjectDelimiters)?
288                    .strip_suffix(PEM_AFTER_LABEL)
289                    .ok_or(EP::InvalidObjectDelimiters)?;
290                Ok(label)
291            }
292
293            let label1 = pem_delimiter(&mut self.lines, PEM_HEADER_START)?;
294            let base64_start_remaining = self.lines.remaining();
295            while !self.lines.remaining().starts_with('-') {
296                let _: &str = self.lines.next().ok_or(EP::ObjectMissingFooter)?;
297            }
298            let data_b64 = base64_start_remaining.strip_end_counted(self.lines.remaining().len());
299            let label2 = pem_delimiter(&mut self.lines, PEM_FOOTER_START)?;
300            let label = [label1, label2]
301                .into_iter()
302                .all_equal_value()
303                .map_err(|_| EP::ObjectMismatchedLabels)?;
304            Some(UnparsedObject {
305                label,
306                data_b64,
307                options,
308            })
309        } else {
310            None
311        };
312
313        Ok(Some(UnparsedItem {
314            keyword,
315            args,
316            object,
317        }))
318    }
319}
320
321impl<'s> UnparsedItem<'s> {
322    /// Access the arguments, mutably (for consuming and parsing them)
323    pub fn args_mut(&mut self) -> &mut ArgumentStream<'s> {
324        &mut self.args
325    }
326    /// Access a copy of the arguments
327    ///
328    /// When using this, be careful not to process any arguments twice.
329    pub fn args_copy(&self) -> ArgumentStream<'s> {
330        self.args.clone()
331    }
332
333    /// Access the arguments (readonly)
334    ///
335    /// When using this, be careful not to process any arguments twice.
336    pub fn args(&self) -> &ArgumentStream<'s> {
337        &self.args
338    }
339
340    /// Check that this item has no Object.
341    pub fn check_no_object(&self) -> Result<(), EP> {
342        if self.object.is_some() {
343            return Err(EP::ObjectUnexpected);
344        }
345        Ok(())
346    }
347    /// Convenience method for handling an error parsing an arguemnt
348    ///
349    /// Returns a closure that converts every error into [`ArgumentError::Invalid`]
350    /// and then to an [`ErrorProblem`] using
351    /// [`.args().handle_error()`](ArgumentStream::handle_error).
352    ///
353    /// Useful in manual `ItemValueParseable` impls, when parsing arguments ad-hoc.
354    pub fn invalid_argument_handler<E>(
355        &self,
356        field: &'static str,
357    ) -> impl FnOnce(E) -> ErrorProblem {
358        let error = self.args().handle_error(field, AE::Invalid);
359        move |_any_error| error
360    }
361}
362
363/// End of an argument list that does not accept any further (unknown) arguments
364///
365/// Implements `ItemArgumentParseable`.  Parses successfully iff the argument list is empty.
366#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
367#[allow(clippy::exhaustive_structs)]
368pub struct NoFurtherArguments;
369
370impl ItemArgumentParseable for NoFurtherArguments {
371    fn from_args(args: &mut ArgumentStream) -> Result<Self, AE> {
372        Ok(args.reject_extra_args()?)
373    }
374}
375
376impl<'s> Iterator for ItemStream<'s> {
377    type Item = Result<UnparsedItem<'s>, EP>;
378    fn next(&mut self) -> Option<Result<UnparsedItem<'s>, EP>> {
379        self.next_item().transpose()
380    }
381}
382
383impl<'s> ArgumentStream<'s> {
384    /// Make a new `ArgumentStream` from a string
385    ///
386    /// The string may start with whitespace (which will be ignored).
387    pub fn new(rest: &'s str, whole_line_len: usize, options: &'s ParseOptions) -> Self {
388        let previous_rest_len = whole_line_len;
389        ArgumentStream {
390            rest,
391            whole_line_len,
392            previous_rest_len,
393            options,
394        }
395    }
396
397    /// Consume this whole `ArgumnetStream`, giving the remaining arguments as a string
398    ///
399    /// The returned string won't start with whitespace.
400    //
401    /// `self` will be empty on return.
402    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
403    pub fn into_remaining(&mut self) -> &'s str {
404        self.prep_yield();
405        mem::take(&mut self.rest)
406    }
407
408    /// Return the component parts of this `ArgumnetStream`
409    ///
410    /// The returned string might start with whitespace.
411    pub fn whole_line_len(&self) -> usize {
412        self.whole_line_len
413    }
414
415    /// Prepares to yield an argument (or the rest)
416    ///
417    ///  * Trims leading WS from `rest`.
418    ///  * Records the `previous_rest_len`
419    fn prep_yield(&mut self) {
420        self.rest = self.rest.trim_start_matches(WS);
421        self.previous_rest_len = self.rest.len();
422    }
423
424    /// Prepares to yield, and then determines if there *is* anything to yield.
425    ///
426    ///  * Trim leading whitespace
427    ///  * Records the `previous_rest_len`
428    ///  * See if we're now empty
429    pub fn something_to_yield(&mut self) -> bool {
430        self.prep_yield();
431        !self.rest.is_empty()
432    }
433
434    /// Throw and error if there are further arguments
435    //
436    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
437    pub fn reject_extra_args(&mut self) -> Result<NoFurtherArguments, UnexpectedArgument> {
438        if self.something_to_yield() {
439            let column = self.next_arg_column();
440            Err(UnexpectedArgument { column })
441        } else {
442            Ok(NoFurtherArguments)
443        }
444    }
445
446    /// Convert a "length of `rest`" into the corresponding column number.
447    fn arg_column_from_rest_len(&self, rest_len: usize) -> usize {
448        // Can't underflow since rest is always part of the whole.
449        // Can't overflow since that would mean the document was as big as the address space.
450        self.whole_line_len - rest_len + 1
451    }
452
453    /// Obtain the column number of the previously yielded argument.
454    ///
455    /// (After `into_remaining`, gives the column number
456    /// of the start of the returned remaining argument string.)
457    pub fn prev_arg_column(&self) -> usize {
458        self.arg_column_from_rest_len(self.previous_rest_len)
459    }
460
461    /// Obtains the column number of the *next* argument.
462    ///
463    /// Should be called after `something_to_yield`; otherwise the returned value
464    /// may point to whitespace which is going to be skipped.
465    // ^ this possible misuse doesn't seem worth defending against with type-fu,
466    //   for a private function with few call sites.
467    fn next_arg_column(&self) -> usize {
468        self.arg_column_from_rest_len(self.rest.len())
469    }
470
471    /// Convert an `ArgumentError` to an `ErrorProblem`.
472    ///
473    /// The caller must supply the field name.
474    pub fn handle_error(&self, field: &'static str, ae: ArgumentError) -> ErrorProblem {
475        self.error_handler(field)(ae)
476    }
477
478    /// Return a converter from `ArgumentError` to `ErrorProblem`.
479    ///
480    /// Useful in `.map_err`.
481    pub fn error_handler(
482        &self,
483        field: &'static str,
484    ) -> impl Fn(ArgumentError) -> ErrorProblem + 'static {
485        let column = self.prev_arg_column();
486        move |ae| match ae {
487            AE::Missing => EP::MissingArgument { field },
488            AE::Invalid => EP::InvalidArgument { field, column },
489            AE::Unexpected => EP::UnexpectedArgument { column },
490        }
491    }
492}
493
494impl<'s> Iterator for ArgumentStream<'s> {
495    type Item = &'s str;
496    fn next(&mut self) -> Option<&'s str> {
497        if !self.something_to_yield() {
498            return None;
499        }
500        let arg;
501        (arg, self.rest) = self.rest.split_once(WS).unwrap_or((self.rest, ""));
502        Some(arg)
503    }
504}
505
506impl<'s> UnparsedObject<'s> {
507    /// Obtain the Object data, as decoded bytes
508    pub fn decode_data(&self) -> Result<Vec<u8>, EP> {
509        crate::parse::tokenize::base64_decode_multiline(self.data_b64)
510            .map_err(|_e| EP::ObjectInvalidBase64)
511    }
512}