// tor_netdoc/parse2/lex.rs

//! Lexing of netdoc elements

use super::*;

/// Linear whitespace as defined by torspec
///
/// Space and tab.  This is the separator set used to split a keyword from its
/// arguments, and to split arguments from each other.
// Only pub via internal_prelude, for benefit of macros
pub const WS: &[char] = &[' ', '\t'];

9define_derive_deftly! {
10    /// Define `parse_options` accessor
11    ///
12    /// The driver must have a lifetime named `'s`, which is suitable for the returned
13    /// `&'s ParseOptions`.
14    ///
15    /// # Top-level attributes:
16    ///
17    ///  * **`#[deftly(parse_options(field = ".field.field"))]`**, default `.options`
18    ParseOptions beta_deftly, expect items:
19
20    impl<$tgens> $ttype {
21        /// Examine the parsing options
22        pub fn parse_options(&self) -> &'s ParseOptions {
23            &self
24                ${tmeta(parse_options(field))
25                  as token_stream,
26                  default { .options }}
27        }
28    }
29}
30
31/// Top-level reader: Netdoc text interpreted as a stream of items
32#[derive(Debug, Clone, Deftly)]
33#[derive_deftly(ParseOptions)]
34pub struct ItemStream<'s> {
35    /// The whole input document.
36    whole_input: &'s str,
37    /// Remaining document, as a stream of lines
38    lines: Lines<'s>,
39    /// If we have peeked ahead, what we discovered
40    peeked: PeekState<'s>,
41    /// Parsing options.
42    options: &'s ParseOptions,
43}
44
45/// Whether an `ItemStream` has peeked ahead, and if so what it discovered
46#[derive(Debug, Clone)]
47enum PeekState<'s> {
48    /// We've peeked a line
49    Some(ItemStreamPeeked<'s>),
50    /// We've not peeked, or peeking gave `None`
51    None {
52        /// Line number of the last item we yielded.
53        ///
54        /// `0` at the start.
55        yielded_item_lno: usize,
56    },
57}
58
59/// If an `ItemStream` has peeked ahead, what it discovered
60#[derive(Debug, Clone)]
61struct ItemStreamPeeked<'s> {
62    /// The next keyword
63    keyword: KeywordRef<'s>,
64    /// Token proving that we
65    line: lines::Peeked,
66    /// Length of the suffix of the line that is the arguments rather than the keyword
67    ///
68    /// Does not include the first whitespace, that terminated the keyword.
69    args_len: usize,
70}
71
72/// An Item that has been lexed but not parsed
73#[derive(Debug, Clone, amplify::Getters, Deftly)]
74#[derive_deftly(ParseOptions)]
75#[deftly(parse_options(field = ".args.options"))]
76pub struct UnparsedItem<'s> {
77    /// The item's Keyword
78    #[getter(as_copy)]
79    keyword: KeywordRef<'s>,
80    /// The Item's Arguments
81    #[getter(skip)]
82    args: ArgumentStream<'s>,
83    /// The Item's Object, if there was one
84    #[getter(as_clone)]
85    object: Option<UnparsedObject<'s>>,
86}
87
88/// Reader for arguments on an Item
89///
90/// Represents the (remaining) arguments.
91#[derive(Debug, Clone, Deftly)]
92#[derive_deftly(ParseOptions)]
93pub struct ArgumentStream<'s> {
94    /// The remaining unparsed arguments
95    ///
96    /// Can start with WS, which is usually trimmed
97    rest: &'s str,
98
99    /// Original line length
100    ///
101    /// Used for reporting column of argument errors.
102    whole_line_len: usize,
103
104    /// Remaining length *before* we last yielded.
105    previous_rest_len: usize,
106
107    /// Parsing options.
108    options: &'s ParseOptions,
109}
110
111/// An Object that has been lexed but not parsed
112#[derive(Debug, Clone, amplify::Getters, Deftly)]
113#[derive_deftly(ParseOptions)]
114pub struct UnparsedObject<'s> {
115    /// The Label
116    #[getter(as_copy)]
117    label: &'s str,
118
119    /// The portion of the input document which is base64 data (and newlines)
120    #[getter(skip)]
121    data_b64: &'s str,
122
123    /// Parsing options.
124    options: &'s ParseOptions,
125}
126
127impl<'s> ItemStream<'s> {
128    /// Start reading a network document as a series of Items
129    pub fn new(input: &'s ParseInput<'s>) -> Result<Self, ParseError> {
130        Ok(ItemStream {
131            whole_input: input.input,
132            lines: Lines::new(input.input),
133            peeked: PeekState::None {
134                yielded_item_lno: 0,
135            },
136            options: &input.options,
137        })
138    }
139
140    /// Line number for reporting an error we have just discovered
141    ///
142    /// If we have recent peeked, we report the line number of the peeked keyword line.
143    ///
144    /// Otherwise, we report the line number of the most-recently yielded item.
145    pub fn lno_for_error(&self) -> usize {
146        match self.peeked {
147            PeekState::Some { .. } => {
148                // The error was presumably caused by whatever was seen in the peek.
149                // That's the current line number.
150                self.lines.peek_lno()
151            }
152            PeekState::None { yielded_item_lno } => {
153                // The error was presumably caused by the results of next_item().
154                yielded_item_lno
155            }
156        }
157    }
158
159    /// Core of peeking.  Tries to make `.peeked` be `Some`.
160    fn peek_internal<'i>(&'i mut self) -> Result<(), EP> {
161        if matches!(self.peeked, PeekState::None { .. }) {
162            let Some(peeked) = self.lines.peek() else {
163                return Ok(());
164            };
165
166            let peeked_line = self.lines.peeked_line(&peeked);
167
168            let (keyword, args) = peeked_line.split_once(WS).unwrap_or((peeked_line, ""));
169            let keyword = KeywordRef::new(keyword)?;
170
171            self.peeked = PeekState::Some(ItemStreamPeeked {
172                keyword,
173                line: peeked,
174                args_len: args.len(),
175            });
176        }
177
178        Ok(())
179    }
180
181    /// Peek the next keyword
182    pub fn peek_keyword(&mut self) -> Result<Option<KeywordRef<'s>>, EP> {
183        self.peek_internal()?;
184        let PeekState::Some(peeked) = &self.peeked else {
185            return Ok(None);
186        };
187        Ok(Some(peeked.keyword))
188    }
189
190    /// Obtain the body so far, suitable for hashing for a Regular signature
191    pub fn body_sofar_for_signature(&self) -> SignedDocumentBody<'s> {
192        let body = &self.whole_input[0..self.byte_position()];
193        SignedDocumentBody { body }
194    }
195
196    /// Byte position, pointing to the start of the next item to yield
197    ///
198    /// Offset in bytes from the start of the original input string
199    /// to the "current" position,
200    /// ie to just after the item we yielded and just before the next item (or EOF).
201    pub fn byte_position(&self) -> usize {
202        self.whole_input.len() - self.lines.remaining().len()
203    }
204
205    /// Access for the entire input string
206    ///
207    /// The original `input: &str` argument to [`ParseInput::new`].
208    ///
209    /// Includes both yielded and unyielded items.
210    pub fn whole_input(&self) -> &'s str {
211        self.whole_input
212    }
213
214    /// Parse a (sub-)document with its own signatures
215    pub fn parse_signed<
216        B: NetdocParseable,
217        S: NetdocParseable,
218        O: NetdocSigned<Body = B, Signatures = S>,
219    >(
220        &mut self,
221        outer_stop: stop_at!(),
222    ) -> Result<O, EP> {
223        let mut input = ItemStream {
224            whole_input: &self.whole_input[self.whole_input.len() - self.lines.remaining().len()..],
225            ..self.clone()
226        };
227        let r = (|| {
228            let inner_always_stop = outer_stop | StopAt::doc_intro::<B>();
229            let body = B::from_items(&mut input, inner_always_stop | StopAt::doc_intro::<S>())?;
230            let signatures = S::from_items(&mut input, inner_always_stop)?;
231            let signed = O::from_parts(body, signatures);
232            Ok(signed)
233        })(); // don't exit here
234
235        *self = ItemStream {
236            whole_input: self.whole_input,
237            ..input
238        };
239
240        r
241    }
242
243    /// Obtain the inputs that would be needed to hash any (even Irregular) signature
244    ///
245    /// These are the hash inputs which would be needed for the next item,
246    /// assuming it's a signature keyword.
247    pub fn peek_signature_hash_inputs(
248        &mut self,
249        body: SignedDocumentBody<'s>,
250    ) -> Result<Option<SignatureHashInputs<'s>>, EP> {
251        self.peek_internal()?;
252        let PeekState::Some(peeked) = &self.peeked else {
253            return Ok(None);
254        };
255        let signature_item_line = self.lines.peeked_line(&peeked.line);
256        let signature_item_kw_spc = signature_item_line.strip_end_counted(peeked.args_len);
257        Ok(Some(SignatureHashInputs {
258            body,
259            signature_item_kw_spc,
260            signature_item_line,
261        }))
262    }
263
264    /// Yield the next item.
265    pub fn next_item(&mut self) -> Result<Option<UnparsedItem<'s>>, EP> {
266        self.peek_internal()?;
267        let peeked = match self.peeked {
268            PeekState::None { .. } => return Ok(None),
269            PeekState::Some { .. } => match mem::replace(
270                &mut self.peeked,
271                PeekState::None {
272                    yielded_item_lno: self.lines.peek_lno(),
273                },
274            ) {
275                PeekState::Some(peeked) => peeked,
276                PeekState::None { .. } => panic!("it was Some just now"),
277            },
278        };
279
280        let keyword = peeked.keyword;
281        let line = self.lines.consume_peeked(peeked.line);
282        let args = &line[keyword.len()..];
283        let options = self.options;
284        let args = ArgumentStream::new(args, line.len(), options);
285
286        let object = if self.lines.remaining().starts_with('-') {
287            fn pem_delimiter<'s>(lines: &mut Lines<'s>, start: &str) -> Result<&'s str, EP> {
288                let line = lines.next().ok_or(
289                    // If this is the *header*, we already know there's a line,
290                    // so this error path is only for footers.
291                    EP::ObjectMissingFooter,
292                )?;
293                let label = line
294                    .strip_prefix(start)
295                    .ok_or(EP::InvalidObjectDelimiters)?
296                    .strip_suffix(PEM_AFTER_LABEL)
297                    .ok_or(EP::InvalidObjectDelimiters)?;
298                Ok(label)
299            }
300
301            let label1 = pem_delimiter(&mut self.lines, PEM_HEADER_START)?;
302            let base64_start_remaining = self.lines.remaining();
303            while !self.lines.remaining().starts_with('-') {
304                let _: &str = self.lines.next().ok_or(EP::ObjectMissingFooter)?;
305            }
306            let data_b64 = base64_start_remaining.strip_end_counted(self.lines.remaining().len());
307            let label2 = pem_delimiter(&mut self.lines, PEM_FOOTER_START)?;
308            let label = [label1, label2]
309                .into_iter()
310                .all_equal_value()
311                .map_err(|_| EP::ObjectMismatchedLabels)?;
312            Some(UnparsedObject {
313                label,
314                data_b64,
315                options,
316            })
317        } else {
318            None
319        };
320
321        Ok(Some(UnparsedItem {
322            keyword,
323            args,
324            object,
325        }))
326    }
327}
328
329impl<'s> UnparsedItem<'s> {
330    /// Access the arguments, mutably (for consuming and parsing them)
331    pub fn args_mut(&mut self) -> &mut ArgumentStream<'s> {
332        &mut self.args
333    }
334    /// Access a copy of the arguments
335    ///
336    /// When using this, be careful not to process any arguments twice.
337    pub fn args_copy(&self) -> ArgumentStream<'s> {
338        self.args.clone()
339    }
340
341    /// Access the arguments (readonly)
342    ///
343    /// When using this, be careful not to process any arguments twice.
344    pub fn args(&self) -> &ArgumentStream<'s> {
345        &self.args
346    }
347
348    /// Check that this item has no Object.
349    pub fn check_no_object(&self) -> Result<(), EP> {
350        if self.object.is_some() {
351            return Err(EP::ObjectUnexpected);
352        }
353        Ok(())
354    }
355    /// Convenience method for handling an error parsing an arguemnt
356    ///
357    /// Returns a closure that converts every error into [`ArgumentError::Invalid`]
358    /// and then to an [`ErrorProblem`] using
359    /// [`.args().handle_error()`](ArgumentStream::handle_error).
360    ///
361    /// Useful in manual `ItemValueParseable` impls, when parsing arguments ad-hoc.
362    pub fn invalid_argument_handler<E>(
363        &self,
364        field: &'static str,
365    ) -> impl FnOnce(E) -> ErrorProblem {
366        let error = self.args().handle_error(field, AE::Invalid);
367        move |_any_error| error
368    }
369}
370
/// End of an argument list that does not accept any further (unknown) arguments
///
/// Implements `ItemArgumentParseable`.  Parses successfully iff the argument list is empty.
///
/// Also returned by [`ArgumentStream::reject_extra_args`] on success.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[allow(clippy::exhaustive_structs)]
pub struct NoFurtherArguments;

378impl ItemArgumentParseable for NoFurtherArguments {
379    fn from_args(args: &mut ArgumentStream) -> Result<Self, AE> {
380        Ok(args.reject_extra_args()?)
381    }
382}
383
384impl<'s> Iterator for ItemStream<'s> {
385    type Item = Result<UnparsedItem<'s>, EP>;
386    fn next(&mut self) -> Option<Result<UnparsedItem<'s>, EP>> {
387        self.next_item().transpose()
388    }
389}
390
391impl<'s> ArgumentStream<'s> {
392    /// Make a new `ArgumentStream` from a string
393    ///
394    /// The string may start with whitespace (which will be ignored).
395    pub fn new(rest: &'s str, whole_line_len: usize, options: &'s ParseOptions) -> Self {
396        let previous_rest_len = whole_line_len;
397        ArgumentStream {
398            rest,
399            whole_line_len,
400            previous_rest_len,
401            options,
402        }
403    }
404
405    /// Consume this whole `ArgumnetStream`, giving the remaining arguments as a string
406    ///
407    /// The returned string won't start with whitespace.
408    //
409    /// `self` will be empty on return.
410    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
411    pub fn into_remaining(&mut self) -> &'s str {
412        self.prep_yield();
413        mem::take(&mut self.rest)
414    }
415
416    /// Return the component parts of this `ArgumnetStream`
417    ///
418    /// The returned string might start with whitespace.
419    pub fn whole_line_len(&self) -> usize {
420        self.whole_line_len
421    }
422
423    /// Prepares to yield an argument (or the rest)
424    ///
425    ///  * Trims leading WS from `rest`.
426    ///  * Records the `previous_rest_len`
427    fn prep_yield(&mut self) {
428        self.rest = self.rest.trim_start_matches(WS);
429        self.previous_rest_len = self.rest.len();
430    }
431
432    /// Prepares to yield, and then determines if there *is* anything to yield.
433    ///
434    ///  * Trim leading whitespace
435    ///  * Records the `previous_rest_len`
436    ///  * See if we're now empty
437    pub fn something_to_yield(&mut self) -> bool {
438        self.prep_yield();
439        !self.rest.is_empty()
440    }
441
442    /// Throw and error if there are further arguments
443    //
444    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
445    pub fn reject_extra_args(&mut self) -> Result<NoFurtherArguments, UnexpectedArgument> {
446        if self.something_to_yield() {
447            let column = self.next_arg_column();
448            Err(UnexpectedArgument { column })
449        } else {
450            Ok(NoFurtherArguments)
451        }
452    }
453
454    /// Convert a "length of `rest`" into the corresponding column number.
455    fn arg_column_from_rest_len(&self, rest_len: usize) -> usize {
456        // Can't underflow since rest is always part of the whole.
457        // Can't overflow since that would mean the document was as big as the address space.
458        self.whole_line_len - rest_len + 1
459    }
460
461    /// Obtain the column number of the previously yielded argument.
462    ///
463    /// (After `into_remaining`, gives the column number
464    /// of the start of the returned remaining argument string.)
465    pub fn prev_arg_column(&self) -> usize {
466        self.arg_column_from_rest_len(self.previous_rest_len)
467    }
468
469    /// Obtains the column number of the *next* argument.
470    ///
471    /// Should be called after `something_to_yield`; otherwise the returned value
472    /// may point to whitespace which is going to be skipped.
473    // ^ this possible misuse doesn't seem worth defending against with type-fu,
474    //   for a private function with few call sites.
475    fn next_arg_column(&self) -> usize {
476        self.arg_column_from_rest_len(self.rest.len())
477    }
478
479    /// Convert an `ArgumentError` to an `ErrorProblem`.
480    ///
481    /// The caller must supply the field name.
482    pub fn handle_error(&self, field: &'static str, ae: ArgumentError) -> ErrorProblem {
483        self.error_handler(field)(ae)
484    }
485
486    /// Return a converter from `ArgumentError` to `ErrorProblem`.
487    ///
488    /// Useful in `.map_err`.
489    pub fn error_handler(
490        &self,
491        field: &'static str,
492    ) -> impl Fn(ArgumentError) -> ErrorProblem + 'static {
493        let column = self.prev_arg_column();
494        move |ae| match ae {
495            AE::Missing => EP::MissingArgument { field },
496            AE::Invalid => EP::InvalidArgument { field, column },
497            AE::Unexpected => EP::UnexpectedArgument { column },
498        }
499    }
500}
501
502impl<'s> Iterator for ArgumentStream<'s> {
503    type Item = &'s str;
504    fn next(&mut self) -> Option<&'s str> {
505        if !self.something_to_yield() {
506            return None;
507        }
508        let arg;
509        (arg, self.rest) = self.rest.split_once(WS).unwrap_or((self.rest, ""));
510        Some(arg)
511    }
512}
513
514impl<'s> UnparsedObject<'s> {
515    /// Obtain the Object data, as decoded bytes
516    pub fn decode_data(&self) -> Result<Vec<u8>, EP> {
517        crate::parse::tokenize::base64_decode_multiline(self.data_b64)
518            .map_err(|_e| EP::ObjectInvalidBase64)
519    }
520}