Skip to main content

tor_netdoc/parse2/
lex.rs

1//! Lexing of netdoc elements
2
3use super::*;
4
/// The characters torspec treats as linear whitespace: space and horizontal tab
// `pub` only so that macros can reach it via internal_prelude.
pub const WS: &[char] = &[' ', '\t'];
8
9define_derive_deftly! {
10    /// Define `parse_options` accessor
11    ///
12    /// The driver must have a lifetime named `'s`, which is suitable for the returned
13    /// `&'s ParseOptions`.
14    ///
15    /// # Top-level attributes:
16    ///
17    ///  * **`#[deftly(parse_options(field = ".field.field"))]`**, default `.options`
18    ParseOptions beta_deftly, expect items:
19
20    impl<$tgens> $ttype {
21        /// Examine the parsing options
22        pub fn parse_options(&self) -> &'s ParseOptions {
23            &self
24                ${tmeta(parse_options(field))
25                  as token_stream,
26                  default { .options }}
27        }
28    }
29}
30
31/// Top-level reader: Netdoc text interpreted as a stream of items
32#[derive(Debug, Clone, Deftly)]
33#[derive_deftly(ParseOptions)]
34pub struct ItemStream<'s> {
35    /// The whole input document.
36    whole_input: &'s str,
37    /// Remaining document, as a stream of lines
38    lines: Lines<'s>,
39    /// If we have peeked ahead, what we discovered
40    peeked: PeekState<'s>,
41    /// Parsing options.
42    options: &'s ParseOptions,
43}
44
45/// Whether an `ItemStream` has peeked ahead, and if so what it discovered
46#[derive(Debug, Clone)]
47enum PeekState<'s> {
48    /// We've peeked a line
49    Some(ItemStreamPeeked<'s>),
50    /// We've not peeked, or peeking gave `None`
51    None {
52        /// Line number of the last item we yielded.
53        ///
54        /// `0` at the start.
55        yielded_item_lno: usize,
56    },
57}
58
59/// If an `ItemStream` has peeked ahead, what it discovered
60#[derive(Debug, Clone)]
61struct ItemStreamPeeked<'s> {
62    /// The next keyword
63    keyword: KeywordRef<'s>,
64    /// Token proving that we
65    line: lines::Peeked,
66    /// Length of the suffix of the line that is the arguments rather than the keyword
67    ///
68    /// Does not include the first whitespace, that terminated the keyword.
69    args_len: usize,
70}
71
72/// An Item that has been lexed but not parsed
73#[derive(Debug, Clone, amplify::Getters, Deftly)]
74#[derive_deftly(ParseOptions)]
75#[deftly(parse_options(field = ".args.options"))]
76pub struct UnparsedItem<'s> {
77    /// The item's Keyword
78    #[getter(as_copy)]
79    keyword: KeywordRef<'s>,
80    /// The Item's Arguments
81    #[getter(skip)]
82    args: ArgumentStream<'s>,
83    /// The Item's Object, if there was one
84    #[getter(as_clone)]
85    object: Option<UnparsedObject<'s>>,
86}
87
88/// Reader for arguments on an Item
89///
90/// Represents the (remaining) arguments.
91#[derive(Debug, Clone, Deftly)]
92#[derive_deftly(ParseOptions)]
93pub struct ArgumentStream<'s> {
94    /// The remaining unparsed arguments
95    ///
96    /// Can start with WS, which is usually trimmed
97    rest: &'s str,
98
99    /// Original line length
100    ///
101    /// Used for reporting column of argument errors.
102    whole_line_len: usize,
103
104    /// Remaining length *before* we last yielded.
105    previous_rest_len: usize,
106
107    /// Parsing options.
108    options: &'s ParseOptions,
109}
110
111/// An Object that has been lexed but not parsed
112#[derive(Debug, Clone, amplify::Getters, Deftly)]
113#[derive_deftly(ParseOptions)]
114pub struct UnparsedObject<'s> {
115    /// The Label
116    #[getter(as_copy)]
117    label: &'s str,
118
119    /// The portion of the input document which is base64 data (and newlines)
120    #[getter(skip)]
121    data_b64: &'s str,
122
123    /// Parsing options.
124    options: &'s ParseOptions,
125}
126
127impl<'s> ItemStream<'s> {
128    /// Start reading a network document as a series of Items
129    pub fn new(input: &'s ParseInput<'s>) -> Result<Self, ParseError> {
130        Ok(ItemStream {
131            whole_input: input.input,
132            lines: Lines::new(input.input),
133            peeked: PeekState::None {
134                yielded_item_lno: 0,
135            },
136            options: &input.options,
137        })
138    }
139
140    /// Line number for reporting an error we have just discovered
141    ///
142    /// If we have recent peeked, we report the line number of the peeked keyword line.
143    ///
144    /// Otherwise, we report the line number of the most-recently yielded item.
145    pub fn lno_for_error(&self) -> usize {
146        match self.peeked {
147            PeekState::Some { .. } => {
148                // The error was presumably caused by whatever was seen in the peek.
149                // That's the current line number.
150                self.lines.peek_lno()
151            }
152            PeekState::None { yielded_item_lno } => {
153                // The error was presumably caused by the results of next_item().
154                yielded_item_lno
155            }
156        }
157    }
158
159    /// Core of peeking.  Tries to make `.peeked` be `Some`.
160    fn peek_internal<'i>(&'i mut self) -> Result<(), EP> {
161        if matches!(self.peeked, PeekState::None { .. }) {
162            let Some(peeked) = self.lines.peek() else {
163                return Ok(());
164            };
165
166            let peeked_line = self.lines.peeked_line(&peeked);
167
168            let (keyword, args) = peeked_line.split_once(WS).unwrap_or((peeked_line, ""));
169            let keyword = KeywordRef::new(keyword)?;
170
171            self.peeked = PeekState::Some(ItemStreamPeeked {
172                keyword,
173                line: peeked,
174                args_len: args.len(),
175            });
176        }
177
178        Ok(())
179    }
180
181    /// Peek the next keyword
182    pub fn peek_keyword(&mut self) -> Result<Option<KeywordRef<'s>>, EP> {
183        self.peek_internal()?;
184        let PeekState::Some(peeked) = &self.peeked else {
185            return Ok(None);
186        };
187        Ok(Some(peeked.keyword))
188    }
189
190    /// Obtain the body so far, suitable for hashing for an Orderly signature
191    #[allow(clippy::string_slice)] // TODO
192    pub fn body_sofar_for_signature(&self) -> SignedDocumentBody<'s> {
193        let body = &self.whole_input[0..self.byte_position()];
194        SignedDocumentBody { body }
195    }
196
197    /// Byte position, pointing to the start of the next item to yield
198    ///
199    /// Offset in bytes from the start of the original input string
200    /// to the "current" position,
201    /// ie to just after the item we yielded and just before the next item (or EOF).
202    pub fn byte_position(&self) -> usize {
203        self.whole_input.len() - self.lines.remaining().len()
204    }
205
206    /// Access for the entire input string
207    ///
208    /// The original `input: &str` argument to [`ParseInput::new`].
209    ///
210    /// Includes both yielded and unyielded items.
211    pub fn whole_input(&self) -> &'s str {
212        self.whole_input
213    }
214
215    /// Parse a (sub-)document with its own signatures
216    ///
217    /// Used (mostly) by the
218    /// [`NetdocParseableUnverified`](derive_deftly_template_NetdocParseableUnverified)
219    /// derive macro.
220    ///
221    /// Generic parameters:
222    ///
223    ///  * **`B`**: the body type: the type to which `NetdocParseableUnverified` is applied.
224    ///  * **`S`**: the signatures section type.
225    ///  * **`O`**: the `FooUnverified` type, which embodies the parsed body and signatures.
226    #[allow(clippy::string_slice)] // TODO
227    pub fn parse_signed<
228        B: HasUnverifiedParsedBody,
229        S: NetdocParseableSignatures,
230        O: NetdocParseableUnverified<Body = B, Signatures = S>,
231    >(
232        &mut self,
233        outer_stop: stop_at!(),
234    ) -> Result<O, EP> {
235        let mut input = ItemStream {
236            whole_input: &self.whole_input[self.whole_input.len() - self.lines.remaining().len()..],
237            ..self.clone()
238        };
239        let r = (|| {
240            let inner_always_stop = outer_stop | StopAt::doc_intro::<B::UnverifiedParsedBody>();
241            let body = B::UnverifiedParsedBody::from_items(
242                &mut input,
243                inner_always_stop | StopAt(S::is_item_keyword),
244            )?;
245            let signed_doc_body = input.body_sofar_for_signature();
246            let unsigned_body_len = signed_doc_body.body().len();
247            let mut hashes = S::HashesAccu::default();
248            let sigs = S::from_items(&mut input, signed_doc_body, &mut hashes, inner_always_stop)?;
249            let sigs = SignaturesData {
250                sigs,
251                unsigned_body_len,
252                hashes,
253            };
254            // SECURITY
255            // We unwrap the UnverifiedParsedBody and immediately wrap it up again
256            // in FooUnverified, passing on the obligation to verify the signatures,
257            // and still enforcing that with a newtype.
258            let signed = O::from_parts(B::unverified_into_inner_unchecked(body), sigs);
259            Ok(signed)
260        })(); // don't exit here
261
262        *self = ItemStream {
263            whole_input: self.whole_input,
264            ..input
265        };
266
267        r
268    }
269
270    /// Obtain the inputs that would be needed to hash any (even Disorderly) signature
271    ///
272    /// These are the hash inputs which would be needed for the next item,
273    /// assuming it's a signature keyword.
274    pub fn peek_signature_hash_inputs(
275        &mut self,
276        body: SignedDocumentBody<'s>,
277    ) -> Result<Option<SignatureHashInputs<'s>>, EP> {
278        self.peek_internal()?;
279        let PeekState::Some(peeked) = &self.peeked else {
280            return Ok(None);
281        };
282        let document_sofar = self.body_sofar_for_signature().body();
283        let signature_item_line = self.lines.peeked_line(&peeked.line);
284        let signature_item_kw_spc = signature_item_line.strip_end_counted(peeked.args_len);
285        Ok(Some(SignatureHashInputs {
286            body,
287            document_sofar,
288            signature_item_kw_spc,
289            signature_item_line,
290        }))
291    }
292
293    /// Yield the next item.
294    #[allow(clippy::string_slice)] // TODO
295    pub fn next_item(&mut self) -> Result<Option<UnparsedItem<'s>>, EP> {
296        self.peek_internal()?;
297        let peeked = match self.peeked {
298            PeekState::None { .. } => return Ok(None),
299            PeekState::Some { .. } => match mem::replace(
300                &mut self.peeked,
301                PeekState::None {
302                    yielded_item_lno: self.lines.peek_lno(),
303                },
304            ) {
305                PeekState::Some(peeked) => peeked,
306                PeekState::None { .. } => panic!("it was Some just now"),
307            },
308        };
309
310        let keyword = peeked.keyword;
311        let line = self.lines.consume_peeked(peeked.line);
312        let args = &line[keyword.len()..];
313        let options = self.options;
314        let args = ArgumentStream::new(args, line.len(), options);
315
316        let object = if self.lines.remaining().starts_with('-') {
317            fn pem_delimiter<'s>(lines: &mut Lines<'s>, start: &str) -> Result<&'s str, EP> {
318                let line = lines.next().ok_or(
319                    // If this is the *header*, we already know there's a line,
320                    // so this error path is only for footers.
321                    EP::ObjectMissingFooter,
322                )?;
323                let label = line
324                    .strip_prefix(start)
325                    .ok_or(EP::InvalidObjectDelimiters)?
326                    .strip_suffix(PEM_AFTER_LABEL)
327                    .ok_or(EP::InvalidObjectDelimiters)?;
328                Ok(label)
329            }
330
331            let label1 = pem_delimiter(&mut self.lines, PEM_HEADER_START)?;
332            let base64_start_remaining = self.lines.remaining();
333            while !self.lines.remaining().starts_with('-') {
334                let _: &str = self.lines.next().ok_or(EP::ObjectMissingFooter)?;
335            }
336            let data_b64 = base64_start_remaining.strip_end_counted(self.lines.remaining().len());
337            let label2 = pem_delimiter(&mut self.lines, PEM_FOOTER_START)?;
338            let label = [label1, label2]
339                .into_iter()
340                .all_equal_value()
341                .map_err(|_| EP::ObjectMismatchedLabels)?;
342            Some(UnparsedObject {
343                label,
344                data_b64,
345                options,
346            })
347        } else {
348            None
349        };
350
351        Ok(Some(UnparsedItem {
352            keyword,
353            args,
354            object,
355        }))
356    }
357}
358
359impl<'s> UnparsedItem<'s> {
360    /// Access the arguments, mutably (for consuming and parsing them)
361    pub fn args_mut(&mut self) -> &mut ArgumentStream<'s> {
362        &mut self.args
363    }
364    /// Access a copy of the arguments
365    ///
366    /// When using this, be careful not to process any arguments twice.
367    pub fn args_copy(&self) -> ArgumentStream<'s> {
368        self.args.clone()
369    }
370
371    /// Access the arguments (readonly)
372    ///
373    /// When using this, be careful not to process any arguments twice.
374    pub fn args(&self) -> &ArgumentStream<'s> {
375        &self.args
376    }
377
378    /// Check that this item has no Object.
379    pub fn check_no_object(&self) -> Result<(), EP> {
380        if self.object.is_some() {
381            return Err(EP::ObjectUnexpected);
382        }
383        Ok(())
384    }
385    /// Convenience method for handling an error parsing an argument
386    ///
387    /// Returns a closure that converts every error into [`ArgumentError::Invalid`]
388    /// and then to an [`ErrorProblem`] using
389    /// [`.args().handle_error()`](ArgumentStream::handle_error).
390    ///
391    /// Useful in manual `ItemValueParseable` impls, when parsing arguments ad-hoc.
392    pub fn invalid_argument_handler<E>(
393        &self,
394        field: &'static str,
395    ) -> impl FnOnce(E) -> ErrorProblem {
396        let error = self.args().handle_error(field, AE::Invalid);
397        move |_any_error| error
398    }
399}
400
401#[deprecated = "use types::NoFurtherArguments"]
402pub use crate::types::NoMoreArguments as NoFurtherArguments;
403
404impl<'s> Iterator for ItemStream<'s> {
405    type Item = Result<UnparsedItem<'s>, EP>;
406    fn next(&mut self) -> Option<Result<UnparsedItem<'s>, EP>> {
407        self.next_item().transpose()
408    }
409}
410
411impl<'s> ArgumentStream<'s> {
412    /// Make a new `ArgumentStream` from a string
413    ///
414    /// The string may start with whitespace (which will be ignored).
415    pub fn new(rest: &'s str, whole_line_len: usize, options: &'s ParseOptions) -> Self {
416        let previous_rest_len = whole_line_len;
417        ArgumentStream {
418            rest,
419            whole_line_len,
420            previous_rest_len,
421            options,
422        }
423    }
424
425    /// Consume this whole `ArgumentStream`, giving the remaining arguments as a string
426    ///
427    /// The returned string won't start with whitespace.
428    //
429    /// `self` will be empty on return.
430    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
431    pub fn into_remaining(&mut self) -> &'s str {
432        self.prep_yield();
433        mem::take(&mut self.rest)
434    }
435
436    /// Return the component parts of this `ArgumentStream`
437    ///
438    /// The returned string might start with whitespace.
439    pub fn whole_line_len(&self) -> usize {
440        self.whole_line_len
441    }
442
443    /// Prepares to yield an argument (or the rest)
444    ///
445    ///  * Trims leading WS from `rest`.
446    ///  * Records the `previous_rest_len`
447    fn prep_yield(&mut self) {
448        self.rest = self.rest.trim_start_matches(WS);
449        self.previous_rest_len = self.rest.len();
450    }
451
452    /// Prepares to yield, and then determines if there *is* anything to yield.
453    ///
454    ///  * Trim leading whitespace
455    ///  * Records the `previous_rest_len`
456    ///  * See if we're now empty
457    pub fn something_to_yield(&mut self) -> bool {
458        self.prep_yield();
459        !self.rest.is_empty()
460    }
461
462    /// Throw and error if there are further arguments
463    //
464    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
465    pub fn reject_extra_args(&mut self) -> Result<NoFurtherArguments, UnexpectedArgument> {
466        if self.something_to_yield() {
467            let column = self.next_arg_column();
468            Err(UnexpectedArgument { column })
469        } else {
470            Ok(NoFurtherArguments)
471        }
472    }
473
474    /// Convert a "length of `rest`" into the corresponding column number.
475    fn arg_column_from_rest_len(&self, rest_len: usize) -> usize {
476        // Can't underflow since rest is always part of the whole.
477        // Can't overflow since that would mean the document was as big as the address space.
478        self.whole_line_len - rest_len + 1
479    }
480
481    /// Obtain the column number of the previously yielded argument.
482    ///
483    /// (After `into_remaining`, gives the column number
484    /// of the start of the returned remaining argument string.)
485    pub fn prev_arg_column(&self) -> usize {
486        self.arg_column_from_rest_len(self.previous_rest_len)
487    }
488
489    /// Obtains the column number of the *next* argument.
490    ///
491    /// Should be called after `something_to_yield`; otherwise the returned value
492    /// may point to whitespace which is going to be skipped.
493    // ^ this possible misuse doesn't seem worth defending against with type-fu,
494    //   for a private function with few call sites.
495    fn next_arg_column(&self) -> usize {
496        self.arg_column_from_rest_len(self.rest.len())
497    }
498
499    /// Convert an `ArgumentError` to an `ErrorProblem`.
500    ///
501    /// The caller must supply the field name.
502    pub fn handle_error(&self, field: &'static str, ae: ArgumentError) -> ErrorProblem {
503        self.error_handler(field)(ae)
504    }
505
506    /// Return a converter from `ArgumentError` to `ErrorProblem`.
507    ///
508    /// Useful in `.map_err`.
509    pub fn error_handler(
510        &self,
511        field: &'static str,
512    ) -> impl Fn(ArgumentError) -> ErrorProblem + 'static {
513        let column = self.prev_arg_column();
514        move |ae| match ae {
515            AE::Missing => EP::MissingArgument { field },
516            AE::Invalid => EP::InvalidArgument { field, column },
517            AE::Unexpected => EP::UnexpectedArgument { column },
518        }
519    }
520}
521
522impl<'s> Iterator for ArgumentStream<'s> {
523    type Item = &'s str;
524    fn next(&mut self) -> Option<&'s str> {
525        if !self.something_to_yield() {
526            return None;
527        }
528        let arg;
529        (arg, self.rest) = self.rest.split_once(WS).unwrap_or((self.rest, ""));
530        Some(arg)
531    }
532}
533
534impl<'s> UnparsedObject<'s> {
535    /// Obtain the Object data, as decoded bytes
536    pub fn decode_data(&self) -> Result<Vec<u8>, EP> {
537        crate::parse::tokenize::base64_decode_multiline(self.data_b64)
538            .map_err(|_e| EP::ObjectInvalidBase64)
539    }
540}