tor_netdoc/parse2/
lex.rs

1//! Lexing of netdoc elements
2
3use super::*;
4
/// The characters that count as linear whitespace in torspec: space and tab
// Only pub via internal_prelude, for the benefit of macros.
pub const WS: &[char] = &[' ', '\t'];
8
9define_derive_deftly! {
10    /// Define `parse_options` accessor
11    ///
12    /// The driver must have a lifetime named `'s`, which is suitable for the returned
13    /// `&'s ParseOptions`.
14    ///
15    /// # Top-level attributes:
16    ///
17    ///  * **`#[deftly(parse_options(field = ".field.field"))]`**, default `.options`
18    ParseOptions beta_deftly, expect items:
19
20    impl<$tgens> $ttype {
21        /// Examine the parsing options
22        pub fn parse_options(&self) -> &'s ParseOptions {
23            &self
24                ${tmeta(parse_options(field))
25                  as token_stream,
26                  default { .options }}
27        }
28    }
29}
30
31/// Top-level reader: Netdoc text interpreted as a stream of items
32#[derive(Debug, Clone, Deftly)]
33#[derive_deftly(ParseOptions)]
34pub struct ItemStream<'s> {
35    /// The whole document.  Used for signature hashing.
36    whole_for_signatures: &'s str,
37    /// Remaining document, as a stream of lines
38    lines: Lines<'s>,
39    /// If we have peeked ahead, what we discovered
40    peeked: PeekState<'s>,
41    /// Parsing options.
42    options: &'s ParseOptions,
43}
44
45/// Whether an `ItemStream` has peeked ahead, and if so what it discovered
46#[derive(Debug, Clone)]
47enum PeekState<'s> {
48    /// We've peeked a line
49    Some(ItemStreamPeeked<'s>),
50    /// We've not peeked, or peeking gave `None`
51    None {
52        /// Line number of the last item we yielded.
53        ///
54        /// `0` at the start.
55        yielded_item_lno: usize,
56    },
57}
58
59/// If an `ItemStream` has peeked ahead, what it discovered
60#[derive(Debug, Clone)]
61struct ItemStreamPeeked<'s> {
62    /// The next keyword
63    keyword: KeywordRef<'s>,
64    /// Token proving that we
65    line: lines::Peeked,
66    /// Length of the suffix of the line that is the arguments rather than the keyword
67    ///
68    /// Does not include the first whitespace, that terminated the keyword.
69    args_len: usize,
70}
71
72/// An Item that has been lexed but not parsed
73#[derive(Debug, Clone, amplify::Getters, Deftly)]
74#[derive_deftly(ParseOptions)]
75#[deftly(parse_options(field = ".args.options"))]
76pub struct UnparsedItem<'s> {
77    /// The item's Keyword
78    #[getter(as_copy)]
79    keyword: KeywordRef<'s>,
80    /// The Item's Arguments
81    #[getter(skip)]
82    args: ArgumentStream<'s>,
83    /// The Item's Object, if there was one
84    #[getter(as_clone)]
85    object: Option<UnparsedObject<'s>>,
86}
87
88/// Reader for arguments on an Item
89///
90/// Represents the (remaining) arguments.
91#[derive(Debug, Clone, Deftly)]
92#[derive_deftly(ParseOptions)]
93pub struct ArgumentStream<'s> {
94    /// The remaining unparsed arguments
95    ///
96    /// Can start with WS, which is usually trimmed
97    rest: &'s str,
98
99    /// Original line length
100    ///
101    /// Used for reporting column of argument errors.
102    whole_line_len: usize,
103
104    /// Remaining length *before* we last yielded.
105    previous_rest_len: usize,
106
107    /// Parsing options.
108    options: &'s ParseOptions,
109}
110
111/// An Object that has been lexed but not parsed
112#[derive(Debug, Clone, amplify::Getters, Deftly)]
113#[derive_deftly(ParseOptions)]
114pub struct UnparsedObject<'s> {
115    /// The Label
116    #[getter(as_copy)]
117    label: &'s str,
118
119    /// The portion of the input document which is base64 data (and newlines)
120    #[getter(skip)]
121    data_b64: &'s str,
122
123    /// Parsing options.
124    options: &'s ParseOptions,
125}
126
127impl<'s> ItemStream<'s> {
128    /// Start reading a network document as a series of Items
129    pub fn new(input: &'s ParseInput<'s>) -> Result<Self, ParseError> {
130        Ok(ItemStream {
131            whole_for_signatures: input.input,
132            lines: Lines::new(input.input),
133            peeked: PeekState::None {
134                yielded_item_lno: 0,
135            },
136            options: &input.options,
137        })
138    }
139
140    /// Line number for reporting an error we have just discovered
141    ///
142    /// If we have recent peeked, we report the line number of the peeked keyword line.
143    ///
144    /// Otherwise, we report the line number of the most-recently yielded item.
145    pub fn lno_for_error(&self) -> usize {
146        match self.peeked {
147            PeekState::Some { .. } => {
148                // The error was presumably caused by whatever was seen in the peek.
149                // That's the current line number.
150                self.lines.peek_lno()
151            }
152            PeekState::None { yielded_item_lno } => {
153                // The error was presumably caused by the results of next_item().
154                yielded_item_lno
155            }
156        }
157    }
158
159    /// Core of peeking.  Tries to make `.peeked` be `Some`.
160    fn peek_internal<'i>(&'i mut self) -> Result<(), EP> {
161        if matches!(self.peeked, PeekState::None { .. }) {
162            let Some(peeked) = self.lines.peek() else {
163                return Ok(());
164            };
165
166            let peeked_line = self.lines.peeked_line(&peeked);
167
168            let (keyword, args) = peeked_line.split_once(WS).unwrap_or((peeked_line, ""));
169            let keyword = KeywordRef::new(keyword)?;
170
171            self.peeked = PeekState::Some(ItemStreamPeeked {
172                keyword,
173                line: peeked,
174                args_len: args.len(),
175            });
176        }
177
178        Ok(())
179    }
180
181    /// Peek the next keyword
182    pub fn peek_keyword(&mut self) -> Result<Option<KeywordRef<'s>>, EP> {
183        self.peek_internal()?;
184        let PeekState::Some(peeked) = &self.peeked else {
185            return Ok(None);
186        };
187        Ok(Some(peeked.keyword))
188    }
189
190    /// Obtain the body so far, suitable for hashing for a Regular signature
191    pub fn body_sofar_for_signature(&self) -> SignedDocumentBody<'s> {
192        let body = self
193            .whole_for_signatures
194            .strip_end_counted(self.lines.remaining().len());
195        SignedDocumentBody { body }
196    }
197
198    /// Parse a (sub-)document with its own signatures
199    pub fn parse_signed<
200        B: NetdocParseable,
201        S: NetdocParseable,
202        O: NetdocSigned<Body = B, Signatures = S>,
203    >(
204        &mut self,
205        outer_stop: stop_at!(),
206    ) -> Result<O, EP> {
207        let mut input = ItemStream {
208            whole_for_signatures: &self.whole_for_signatures
209                [self.whole_for_signatures.len() - self.lines.remaining().len()..],
210            ..self.clone()
211        };
212        let r = (|| {
213            let inner_always_stop = outer_stop | StopAt::doc_intro::<B>();
214            let body = B::from_items(&mut input, inner_always_stop | StopAt::doc_intro::<S>())?;
215            let signatures = S::from_items(&mut input, inner_always_stop)?;
216            let signed = O::from_parts(body, signatures);
217            Ok(signed)
218        })(); // don't exit here
219
220        *self = ItemStream {
221            whole_for_signatures: self.whole_for_signatures,
222            ..input
223        };
224
225        r
226    }
227
228    /// Obtain the inputs that would be needed to hash any (even Irregular) signature
229    ///
230    /// These are the hash inputs which would be needed for the next item,
231    /// assuming it's a signature keyword.
232    pub fn peek_signature_hash_inputs(
233        &mut self,
234        body: SignedDocumentBody<'s>,
235    ) -> Result<Option<SignatureHashInputs<'s>>, EP> {
236        self.peek_internal()?;
237        let PeekState::Some(peeked) = &self.peeked else {
238            return Ok(None);
239        };
240        let signature_item_line = self.lines.peeked_line(&peeked.line);
241        let signature_item_kw_spc = signature_item_line.strip_end_counted(peeked.args_len);
242        Ok(Some(SignatureHashInputs {
243            body,
244            signature_item_kw_spc,
245            signature_item_line,
246        }))
247    }
248
249    /// Yield the next item.
250    pub fn next_item(&mut self) -> Result<Option<UnparsedItem<'s>>, EP> {
251        self.peek_internal()?;
252        let peeked = match self.peeked {
253            PeekState::None { .. } => return Ok(None),
254            PeekState::Some { .. } => match mem::replace(
255                &mut self.peeked,
256                PeekState::None {
257                    yielded_item_lno: self.lines.peek_lno(),
258                },
259            ) {
260                PeekState::Some(peeked) => peeked,
261                PeekState::None { .. } => panic!("it was Some just now"),
262            },
263        };
264
265        let keyword = peeked.keyword;
266        let line = self.lines.consume_peeked(peeked.line);
267        let args = &line[keyword.len()..];
268        let options = self.options;
269        let args = ArgumentStream::new(args, line.len(), options);
270
271        let object = if self.lines.remaining().starts_with('-') {
272            fn pem_delimiter<'s>(lines: &mut Lines<'s>, start: &str) -> Result<&'s str, EP> {
273                let line = lines.next().ok_or(
274                    // If this is the *header*, we already know there's a line,
275                    // so this error path is only for footers.
276                    EP::ObjectMissingFooter,
277                )?;
278                let label = line
279                    .strip_prefix(start)
280                    .ok_or(EP::InvalidObjectDelimiters)?
281                    .strip_suffix(PEM_AFTER_LABEL)
282                    .ok_or(EP::InvalidObjectDelimiters)?;
283                Ok(label)
284            }
285
286            let label1 = pem_delimiter(&mut self.lines, PEM_HEADER_START)?;
287            let base64_start_remaining = self.lines.remaining();
288            while !self.lines.remaining().starts_with('-') {
289                let _: &str = self.lines.next().ok_or(EP::ObjectMissingFooter)?;
290            }
291            let data_b64 = base64_start_remaining.strip_end_counted(self.lines.remaining().len());
292            let label2 = pem_delimiter(&mut self.lines, PEM_FOOTER_START)?;
293            let label = [label1, label2]
294                .into_iter()
295                .all_equal_value()
296                .map_err(|_| EP::ObjectMismatchedLabels)?;
297            Some(UnparsedObject {
298                label,
299                data_b64,
300                options,
301            })
302        } else {
303            None
304        };
305
306        Ok(Some(UnparsedItem {
307            keyword,
308            args,
309            object,
310        }))
311    }
312}
313
314impl<'s> UnparsedItem<'s> {
315    /// Access the arguments, mutably (for consuming and parsing them)
316    pub fn args_mut(&mut self) -> &mut ArgumentStream<'s> {
317        &mut self.args
318    }
319    /// Access a copy of the arguments
320    ///
321    /// When using this, be careful not to process any arguments twice.
322    pub fn args_copy(&self) -> ArgumentStream<'s> {
323        self.args.clone()
324    }
325
326    /// Access the arguments (readonly)
327    ///
328    /// When using this, be careful not to process any arguments twice.
329    pub fn args(&self) -> &ArgumentStream<'s> {
330        &self.args
331    }
332
333    /// Check that this item has no Object.
334    pub fn check_no_object(&self) -> Result<(), EP> {
335        if self.object.is_some() {
336            return Err(EP::ObjectUnexpected);
337        }
338        Ok(())
339    }
340    /// Convenience method for handling an error parsing an arguemnt
341    ///
342    /// Returns a closure that converts every error into [`ArgumentError::Invalid`]
343    /// and then to an [`ErrorProblem`] using
344    /// [`.args().handle_error()`](ArgumentStream::handle_error).
345    ///
346    /// Useful in manual `ItemValueParseable` impls, when parsing arguments ad-hoc.
347    pub fn invalid_argument_handler<E>(
348        &self,
349        field: &'static str,
350    ) -> impl FnOnce(E) -> ErrorProblem {
351        let error = self.args().handle_error(field, AE::Invalid);
352        move |_any_error| error
353    }
354}
355
/// Marks the end of an argument list which accepts no further (unknown) arguments
///
/// Implements `ItemArgumentParseable`.  Parsing succeeds iff no arguments remain.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[allow(clippy::exhaustive_structs)]
pub struct NoFurtherArguments;
362
363impl ItemArgumentParseable for NoFurtherArguments {
364    fn from_args(args: &mut ArgumentStream) -> Result<Self, AE> {
365        Ok(args.reject_extra_args()?)
366    }
367}
368
369impl<'s> Iterator for ItemStream<'s> {
370    type Item = Result<UnparsedItem<'s>, EP>;
371    fn next(&mut self) -> Option<Result<UnparsedItem<'s>, EP>> {
372        self.next_item().transpose()
373    }
374}
375
376impl<'s> ArgumentStream<'s> {
377    /// Make a new `ArgumentStream` from a string
378    ///
379    /// The string may start with whitespace (which will be ignored).
380    pub fn new(rest: &'s str, whole_line_len: usize, options: &'s ParseOptions) -> Self {
381        let previous_rest_len = whole_line_len;
382        ArgumentStream {
383            rest,
384            whole_line_len,
385            previous_rest_len,
386            options,
387        }
388    }
389
390    /// Consume this whole `ArgumnetStream`, giving the remaining arguments as a string
391    ///
392    /// The returned string won't start with whitespace.
393    //
394    /// `self` will be empty on return.
395    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
396    pub fn into_remaining(&mut self) -> &'s str {
397        self.prep_yield();
398        mem::take(&mut self.rest)
399    }
400
401    /// Return the component parts of this `ArgumnetStream`
402    ///
403    /// The returned string might start with whitespace.
404    pub fn whole_line_len(&self) -> usize {
405        self.whole_line_len
406    }
407
408    /// Prepares to yield an argument (or the rest)
409    ///
410    ///  * Trims leading WS from `rest`.
411    ///  * Records the `previous_rest_len`
412    fn prep_yield(&mut self) {
413        self.rest = self.rest.trim_start_matches(WS);
414        self.previous_rest_len = self.rest.len();
415    }
416
417    /// Prepares to yield, and then determines if there *is* anything to yield.
418    ///
419    ///  * Trim leading whitespace
420    ///  * Records the `previous_rest_len`
421    ///  * See if we're now empty
422    pub fn something_to_yield(&mut self) -> bool {
423        self.prep_yield();
424        !self.rest.is_empty()
425    }
426
427    /// Throw and error if there are further arguments
428    //
429    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
430    pub fn reject_extra_args(&mut self) -> Result<NoFurtherArguments, UnexpectedArgument> {
431        if self.something_to_yield() {
432            let column = self.next_arg_column();
433            Err(UnexpectedArgument { column })
434        } else {
435            Ok(NoFurtherArguments)
436        }
437    }
438
439    /// Convert a "length of `rest`" into the corresponding column number.
440    fn arg_column_from_rest_len(&self, rest_len: usize) -> usize {
441        // Can't underflow since rest is always part of the whole.
442        // Can't overflow since that would mean the document was as big as the address space.
443        self.whole_line_len - rest_len + 1
444    }
445
446    /// Obtain the column number of the previously yielded argument.
447    ///
448    /// (After `into_remaining`, gives the column number
449    /// of the start of the returned remaining argument string.)
450    pub fn prev_arg_column(&self) -> usize {
451        self.arg_column_from_rest_len(self.previous_rest_len)
452    }
453
454    /// Obtains the column number of the *next* argument.
455    ///
456    /// Should be called after `something_to_yield`; otherwise the returned value
457    /// may point to whitespace which is going to be skipped.
458    // ^ this possible misuse doesn't seem worth defending against with type-fu,
459    //   for a private function with few call sites.
460    fn next_arg_column(&self) -> usize {
461        self.arg_column_from_rest_len(self.rest.len())
462    }
463
464    /// Convert an `ArgumentError` to an `ErrorProblem`.
465    ///
466    /// The caller must supply the field name.
467    pub fn handle_error(&self, field: &'static str, ae: ArgumentError) -> ErrorProblem {
468        self.error_handler(field)(ae)
469    }
470
471    /// Return a converter from `ArgumentError` to `ErrorProblem`.
472    ///
473    /// Useful in `.map_err`.
474    pub fn error_handler(
475        &self,
476        field: &'static str,
477    ) -> impl Fn(ArgumentError) -> ErrorProblem + 'static {
478        let column = self.prev_arg_column();
479        move |ae| match ae {
480            AE::Missing => EP::MissingArgument { field },
481            AE::Invalid => EP::InvalidArgument { field, column },
482            AE::Unexpected => EP::UnexpectedArgument { column },
483        }
484    }
485}
486
487impl<'s> Iterator for ArgumentStream<'s> {
488    type Item = &'s str;
489    fn next(&mut self) -> Option<&'s str> {
490        if !self.something_to_yield() {
491            return None;
492        }
493        let arg;
494        (arg, self.rest) = self.rest.split_once(WS).unwrap_or((self.rest, ""));
495        Some(arg)
496    }
497}
498
499impl<'s> UnparsedObject<'s> {
500    /// Obtain the Object data, as decoded bytes
501    pub fn decode_data(&self) -> Result<Vec<u8>, EP> {
502        crate::parse::tokenize::base64_decode_multiline(self.data_b64)
503            .map_err(|_e| EP::ObjectInvalidBase64)
504    }
505}