Skip to main content

tor_netdoc/parse2/
lex.rs

1//! Lexing of netdoc elements
2
3use super::*;
4
/// The characters that torspec treats as linear whitespace: space and horizontal tab
// This is `pub` only so that macros can reach it through internal_prelude.
pub const WS: &[char] = &[' ', '\t'];
8
9define_derive_deftly! {
10    /// Define `parse_options` accessor
11    ///
12    /// The driver must have a lifetime named `'s`, which is suitable for the returned
13    /// `&'s ParseOptions`.
14    ///
15    /// # Top-level attributes:
16    ///
17    ///  * **`#[deftly(parse_options(field = ".field.field"))]`**, default `.options`
18    ParseOptions beta_deftly, expect items:
19
20    impl<$tgens> $ttype {
21        /// Examine the parsing options
22        pub fn parse_options(&self) -> &'s ParseOptions {
23            &self
24                ${tmeta(parse_options(field))
25                  as token_stream,
26                  default { .options }}
27        }
28    }
29}
30
31/// Top-level reader: Netdoc text interpreted as a stream of items
32#[derive(Debug, Clone, Deftly)]
33#[derive_deftly(ParseOptions)]
34pub struct ItemStream<'s> {
35    /// The whole input document.
36    whole_input: &'s str,
37    /// Remaining document, as a stream of lines
38    lines: Lines<'s>,
39    /// If we have peeked ahead, what we discovered
40    peeked: PeekState<'s>,
41    /// Parsing options.
42    options: &'s ParseOptions,
43}
44
45/// Whether an `ItemStream` has peeked ahead, and if so what it discovered
46#[derive(Debug, Clone)]
47enum PeekState<'s> {
48    /// We've peeked a line
49    Some(ItemStreamPeeked<'s>),
50    /// We've not peeked, or peeking gave `None`
51    None {
52        /// Line number of the last item we yielded.
53        ///
54        /// `0` at the start.
55        yielded_item_lno: usize,
56    },
57}
58
59/// If an `ItemStream` has peeked ahead, what it discovered
60#[derive(Debug, Clone)]
61struct ItemStreamPeeked<'s> {
62    /// The next keyword
63    keyword: KeywordRef<'s>,
64    /// Token proving that we
65    line: lines::Peeked,
66    /// Length of the suffix of the line that is the arguments rather than the keyword
67    ///
68    /// Does not include the first whitespace, that terminated the keyword.
69    args_len: usize,
70}
71
72/// An Item that has been lexed but not parsed
73#[derive(Debug, Clone, amplify::Getters, Deftly)]
74#[derive_deftly(ParseOptions)]
75#[deftly(parse_options(field = ".args.options"))]
76pub struct UnparsedItem<'s> {
77    /// The item's Keyword
78    #[getter(as_copy)]
79    keyword: KeywordRef<'s>,
80    /// The Item's Arguments
81    #[getter(skip)]
82    args: ArgumentStream<'s>,
83    /// The Item's Object, if there was one
84    #[getter(as_clone)]
85    object: Option<UnparsedObject<'s>>,
86}
87
88/// Reader for arguments on an Item
89///
90/// Represents the (remaining) arguments.
91#[derive(Debug, Clone, Deftly)]
92#[derive_deftly(ParseOptions)]
93pub struct ArgumentStream<'s> {
94    /// The remaining unparsed arguments
95    ///
96    /// Can start with WS, which is usually trimmed
97    rest: &'s str,
98
99    /// Original line length
100    ///
101    /// Used for reporting column of argument errors.
102    whole_line_len: usize,
103
104    /// Remaining length *before* we last yielded.
105    previous_rest_len: usize,
106
107    /// Parsing options.
108    options: &'s ParseOptions,
109}
110
111/// An Object that has been lexed but not parsed
112#[derive(Debug, Clone, amplify::Getters, Deftly)]
113#[derive_deftly(ParseOptions)]
114pub struct UnparsedObject<'s> {
115    /// The Label
116    #[getter(as_copy)]
117    label: &'s str,
118
119    /// The portion of the input document which is base64 data (and newlines)
120    #[getter(skip)]
121    data_b64: &'s str,
122
123    /// Parsing options.
124    options: &'s ParseOptions,
125}
126
127impl<'s> ItemStream<'s> {
128    /// Start reading a network document as a series of Items
129    pub fn new(input: &'s ParseInput<'s>) -> Result<Self, ParseError> {
130        Ok(ItemStream {
131            whole_input: input.input,
132            lines: Lines::new(input.input),
133            peeked: PeekState::None {
134                yielded_item_lno: 0,
135            },
136            options: &input.options,
137        })
138    }
139
140    /// Line number for reporting an error we have just discovered
141    ///
142    /// If we have recent peeked, we report the line number of the peeked keyword line.
143    ///
144    /// Otherwise, we report the line number of the most-recently yielded item.
145    pub fn lno_for_error(&self) -> usize {
146        match self.peeked {
147            PeekState::Some { .. } => {
148                // The error was presumably caused by whatever was seen in the peek.
149                // That's the current line number.
150                self.lines.peek_lno()
151            }
152            PeekState::None { yielded_item_lno } => {
153                // The error was presumably caused by the results of next_item().
154                yielded_item_lno
155            }
156        }
157    }
158
159    /// Core of peeking.  Tries to make `.peeked` be `Some`.
160    fn peek_internal<'i>(&'i mut self) -> Result<(), EP> {
161        if matches!(self.peeked, PeekState::None { .. }) {
162            let Some(peeked) = self.lines.peek() else {
163                return Ok(());
164            };
165
166            let peeked_line = self.lines.peeked_line(&peeked);
167
168            let (keyword, args) = peeked_line.split_once(WS).unwrap_or((peeked_line, ""));
169            let keyword = KeywordRef::new(keyword)?;
170
171            self.peeked = PeekState::Some(ItemStreamPeeked {
172                keyword,
173                line: peeked,
174                args_len: args.len(),
175            });
176        }
177
178        Ok(())
179    }
180
181    /// Peek the next keyword
182    pub fn peek_keyword(&mut self) -> Result<Option<KeywordRef<'s>>, EP> {
183        self.peek_internal()?;
184        let PeekState::Some(peeked) = &self.peeked else {
185            return Ok(None);
186        };
187        Ok(Some(peeked.keyword))
188    }
189
190    /// Obtain the body so far, suitable for hashing for an Orderly signature
191    pub fn body_sofar_for_signature(&self) -> SignedDocumentBody<'s> {
192        let body = &self.whole_input[0..self.byte_position()];
193        SignedDocumentBody { body }
194    }
195
196    /// Byte position, pointing to the start of the next item to yield
197    ///
198    /// Offset in bytes from the start of the original input string
199    /// to the "current" position,
200    /// ie to just after the item we yielded and just before the next item (or EOF).
201    pub fn byte_position(&self) -> usize {
202        self.whole_input.len() - self.lines.remaining().len()
203    }
204
205    /// Access for the entire input string
206    ///
207    /// The original `input: &str` argument to [`ParseInput::new`].
208    ///
209    /// Includes both yielded and unyielded items.
210    pub fn whole_input(&self) -> &'s str {
211        self.whole_input
212    }
213
214    /// Parse a (sub-)document with its own signatures
215    ///
216    /// Used (mostly) by the
217    /// [`NetdocParseableUnverified`](derive_deftly_template_NetdocParseableUnverified)
218    /// derive macro.
219    ///
220    /// Generic parameters:
221    ///
222    ///  * **`B`**: the body type: the type to which `NetdocParseableUnverified` is applied.
223    ///  * **`S`**: the signatures section type.
224    ///  * **`O`**: the `FooUnverified` type, which embodies the parsed body and signatures.
225    pub fn parse_signed<
226        B: HasUnverifiedParsedBody,
227        S: NetdocParseableSignatures,
228        O: NetdocUnverified<Body = B, Signatures = S>,
229    >(
230        &mut self,
231        outer_stop: stop_at!(),
232    ) -> Result<O, EP> {
233        let mut input = ItemStream {
234            whole_input: &self.whole_input[self.whole_input.len() - self.lines.remaining().len()..],
235            ..self.clone()
236        };
237        let r = (|| {
238            let inner_always_stop = outer_stop | StopAt::doc_intro::<B::UnverifiedParsedBody>();
239            let body = B::UnverifiedParsedBody::from_items(
240                &mut input,
241                inner_always_stop | StopAt(S::is_item_keyword),
242            )?;
243            let signed_doc_body = input.body_sofar_for_signature();
244            let unsigned_body_len = signed_doc_body.body().len();
245            let mut hashes = S::HashesAccu::default();
246            let sigs = S::from_items(&mut input, signed_doc_body, &mut hashes, inner_always_stop)?;
247            let sigs = SignaturesData {
248                sigs,
249                unsigned_body_len,
250                hashes,
251            };
252            // SECURITY
253            // We unwrap the UnverifiedParsedBody and immediately wrap it up again
254            // in FooUnverified, passing on the obligation to verify the signatures,
255            // and still enforcing that with a newtype.
256            let signed = O::from_parts(B::unverified_into_inner_unchecked(body), sigs);
257            Ok(signed)
258        })(); // don't exit here
259
260        *self = ItemStream {
261            whole_input: self.whole_input,
262            ..input
263        };
264
265        r
266    }
267
268    /// Obtain the inputs that would be needed to hash any (even Disorderly) signature
269    ///
270    /// These are the hash inputs which would be needed for the next item,
271    /// assuming it's a signature keyword.
272    pub fn peek_signature_hash_inputs(
273        &mut self,
274        body: SignedDocumentBody<'s>,
275    ) -> Result<Option<SignatureHashInputs<'s>>, EP> {
276        self.peek_internal()?;
277        let PeekState::Some(peeked) = &self.peeked else {
278            return Ok(None);
279        };
280        let document_sofar = self.body_sofar_for_signature().body();
281        let signature_item_line = self.lines.peeked_line(&peeked.line);
282        let signature_item_kw_spc = signature_item_line.strip_end_counted(peeked.args_len);
283        Ok(Some(SignatureHashInputs {
284            body,
285            document_sofar,
286            signature_item_kw_spc,
287            signature_item_line,
288        }))
289    }
290
291    /// Yield the next item.
292    pub fn next_item(&mut self) -> Result<Option<UnparsedItem<'s>>, EP> {
293        self.peek_internal()?;
294        let peeked = match self.peeked {
295            PeekState::None { .. } => return Ok(None),
296            PeekState::Some { .. } => match mem::replace(
297                &mut self.peeked,
298                PeekState::None {
299                    yielded_item_lno: self.lines.peek_lno(),
300                },
301            ) {
302                PeekState::Some(peeked) => peeked,
303                PeekState::None { .. } => panic!("it was Some just now"),
304            },
305        };
306
307        let keyword = peeked.keyword;
308        let line = self.lines.consume_peeked(peeked.line);
309        let args = &line[keyword.len()..];
310        let options = self.options;
311        let args = ArgumentStream::new(args, line.len(), options);
312
313        let object = if self.lines.remaining().starts_with('-') {
314            fn pem_delimiter<'s>(lines: &mut Lines<'s>, start: &str) -> Result<&'s str, EP> {
315                let line = lines.next().ok_or(
316                    // If this is the *header*, we already know there's a line,
317                    // so this error path is only for footers.
318                    EP::ObjectMissingFooter,
319                )?;
320                let label = line
321                    .strip_prefix(start)
322                    .ok_or(EP::InvalidObjectDelimiters)?
323                    .strip_suffix(PEM_AFTER_LABEL)
324                    .ok_or(EP::InvalidObjectDelimiters)?;
325                Ok(label)
326            }
327
328            let label1 = pem_delimiter(&mut self.lines, PEM_HEADER_START)?;
329            let base64_start_remaining = self.lines.remaining();
330            while !self.lines.remaining().starts_with('-') {
331                let _: &str = self.lines.next().ok_or(EP::ObjectMissingFooter)?;
332            }
333            let data_b64 = base64_start_remaining.strip_end_counted(self.lines.remaining().len());
334            let label2 = pem_delimiter(&mut self.lines, PEM_FOOTER_START)?;
335            let label = [label1, label2]
336                .into_iter()
337                .all_equal_value()
338                .map_err(|_| EP::ObjectMismatchedLabels)?;
339            Some(UnparsedObject {
340                label,
341                data_b64,
342                options,
343            })
344        } else {
345            None
346        };
347
348        Ok(Some(UnparsedItem {
349            keyword,
350            args,
351            object,
352        }))
353    }
354}
355
356impl<'s> UnparsedItem<'s> {
357    /// Access the arguments, mutably (for consuming and parsing them)
358    pub fn args_mut(&mut self) -> &mut ArgumentStream<'s> {
359        &mut self.args
360    }
361    /// Access a copy of the arguments
362    ///
363    /// When using this, be careful not to process any arguments twice.
364    pub fn args_copy(&self) -> ArgumentStream<'s> {
365        self.args.clone()
366    }
367
368    /// Access the arguments (readonly)
369    ///
370    /// When using this, be careful not to process any arguments twice.
371    pub fn args(&self) -> &ArgumentStream<'s> {
372        &self.args
373    }
374
375    /// Check that this item has no Object.
376    pub fn check_no_object(&self) -> Result<(), EP> {
377        if self.object.is_some() {
378            return Err(EP::ObjectUnexpected);
379        }
380        Ok(())
381    }
382    /// Convenience method for handling an error parsing an argument
383    ///
384    /// Returns a closure that converts every error into [`ArgumentError::Invalid`]
385    /// and then to an [`ErrorProblem`] using
386    /// [`.args().handle_error()`](ArgumentStream::handle_error).
387    ///
388    /// Useful in manual `ItemValueParseable` impls, when parsing arguments ad-hoc.
389    pub fn invalid_argument_handler<E>(
390        &self,
391        field: &'static str,
392    ) -> impl FnOnce(E) -> ErrorProblem {
393        let error = self.args().handle_error(field, AE::Invalid);
394        move |_any_error| error
395    }
396}
397
/// Marks the end of an argument list which must not contain further (unknown) arguments
///
/// Implements `ItemArgumentParseable`.  Parsing succeeds iff no arguments remain.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[allow(clippy::exhaustive_structs)]
pub struct NoFurtherArguments;
404
405impl ItemArgumentParseable for NoFurtherArguments {
406    fn from_args(args: &mut ArgumentStream) -> Result<Self, AE> {
407        Ok(args.reject_extra_args()?)
408    }
409}
410
411impl<'s> Iterator for ItemStream<'s> {
412    type Item = Result<UnparsedItem<'s>, EP>;
413    fn next(&mut self) -> Option<Result<UnparsedItem<'s>, EP>> {
414        self.next_item().transpose()
415    }
416}
417
418impl<'s> ArgumentStream<'s> {
419    /// Make a new `ArgumentStream` from a string
420    ///
421    /// The string may start with whitespace (which will be ignored).
422    pub fn new(rest: &'s str, whole_line_len: usize, options: &'s ParseOptions) -> Self {
423        let previous_rest_len = whole_line_len;
424        ArgumentStream {
425            rest,
426            whole_line_len,
427            previous_rest_len,
428            options,
429        }
430    }
431
432    /// Consume this whole `ArgumentStream`, giving the remaining arguments as a string
433    ///
434    /// The returned string won't start with whitespace.
435    //
436    /// `self` will be empty on return.
437    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
438    pub fn into_remaining(&mut self) -> &'s str {
439        self.prep_yield();
440        mem::take(&mut self.rest)
441    }
442
443    /// Return the component parts of this `ArgumentStream`
444    ///
445    /// The returned string might start with whitespace.
446    pub fn whole_line_len(&self) -> usize {
447        self.whole_line_len
448    }
449
450    /// Prepares to yield an argument (or the rest)
451    ///
452    ///  * Trims leading WS from `rest`.
453    ///  * Records the `previous_rest_len`
454    fn prep_yield(&mut self) {
455        self.rest = self.rest.trim_start_matches(WS);
456        self.previous_rest_len = self.rest.len();
457    }
458
459    /// Prepares to yield, and then determines if there *is* anything to yield.
460    ///
461    ///  * Trim leading whitespace
462    ///  * Records the `previous_rest_len`
463    ///  * See if we're now empty
464    pub fn something_to_yield(&mut self) -> bool {
465        self.prep_yield();
466        !self.rest.is_empty()
467    }
468
469    /// Throw and error if there are further arguments
470    //
471    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
472    pub fn reject_extra_args(&mut self) -> Result<NoFurtherArguments, UnexpectedArgument> {
473        if self.something_to_yield() {
474            let column = self.next_arg_column();
475            Err(UnexpectedArgument { column })
476        } else {
477            Ok(NoFurtherArguments)
478        }
479    }
480
481    /// Convert a "length of `rest`" into the corresponding column number.
482    fn arg_column_from_rest_len(&self, rest_len: usize) -> usize {
483        // Can't underflow since rest is always part of the whole.
484        // Can't overflow since that would mean the document was as big as the address space.
485        self.whole_line_len - rest_len + 1
486    }
487
488    /// Obtain the column number of the previously yielded argument.
489    ///
490    /// (After `into_remaining`, gives the column number
491    /// of the start of the returned remaining argument string.)
492    pub fn prev_arg_column(&self) -> usize {
493        self.arg_column_from_rest_len(self.previous_rest_len)
494    }
495
496    /// Obtains the column number of the *next* argument.
497    ///
498    /// Should be called after `something_to_yield`; otherwise the returned value
499    /// may point to whitespace which is going to be skipped.
500    // ^ this possible misuse doesn't seem worth defending against with type-fu,
501    //   for a private function with few call sites.
502    fn next_arg_column(&self) -> usize {
503        self.arg_column_from_rest_len(self.rest.len())
504    }
505
506    /// Convert an `ArgumentError` to an `ErrorProblem`.
507    ///
508    /// The caller must supply the field name.
509    pub fn handle_error(&self, field: &'static str, ae: ArgumentError) -> ErrorProblem {
510        self.error_handler(field)(ae)
511    }
512
513    /// Return a converter from `ArgumentError` to `ErrorProblem`.
514    ///
515    /// Useful in `.map_err`.
516    pub fn error_handler(
517        &self,
518        field: &'static str,
519    ) -> impl Fn(ArgumentError) -> ErrorProblem + 'static {
520        let column = self.prev_arg_column();
521        move |ae| match ae {
522            AE::Missing => EP::MissingArgument { field },
523            AE::Invalid => EP::InvalidArgument { field, column },
524            AE::Unexpected => EP::UnexpectedArgument { column },
525        }
526    }
527}
528
529impl<'s> Iterator for ArgumentStream<'s> {
530    type Item = &'s str;
531    fn next(&mut self) -> Option<&'s str> {
532        if !self.something_to_yield() {
533            return None;
534        }
535        let arg;
536        (arg, self.rest) = self.rest.split_once(WS).unwrap_or((self.rest, ""));
537        Some(arg)
538    }
539}
540
541impl<'s> UnparsedObject<'s> {
542    /// Obtain the Object data, as decoded bytes
543    pub fn decode_data(&self) -> Result<Vec<u8>, EP> {
544        crate::parse::tokenize::base64_decode_multiline(self.data_b64)
545            .map_err(|_e| EP::ObjectInvalidBase64)
546    }
547}