tor_netdoc/parse2/
lex.rs

1//! Lexing of netdoc elements
2
3use super::*;
4
/// Linear whitespace as defined by torspec: space and horizontal tab
///
/// In this module it is the separator between an item's keyword and its arguments,
/// and between successive arguments.
// Only pub via internal_prelude, for benefit of macros
pub const WS: &[char] = &[' ', '\t'];
8
/// Top-level reader: Netdoc text interpreted as a stream of items
///
/// Items are obtained with `next_item` (or via the `Iterator` impl).
/// The keyword of the upcoming item can be inspected beforehand with `peek_keyword`.
#[derive(Debug, Clone)]
pub struct ItemStream<'s> {
    /// The whole document.  Used for signature hashing.
    ///
    /// For the temporary stream used inside `parse_signed`,
    /// this instead starts at the position where the sub-document began.
    whole_for_signatures: &'s str,
    /// Remaining document, as a stream of lines
    lines: Lines<'s>,
    /// If we have peeked ahead, what we discovered
    peeked: PeekState<'s>,
}
19
/// Whether an `ItemStream` has peeked ahead, and if so what it discovered
#[derive(Debug, Clone)]
enum PeekState<'s> {
    /// We've peeked a line
    ///
    /// The line has had its keyword split off, but has not been consumed yet.
    Some(ItemStreamPeeked<'s>),
    /// We've not peeked, or peeking gave `None`
    None {
        /// Line number of the last item we yielded.
        ///
        /// `0` at the start.
        ///
        /// This is what `lno_for_error` reports while we are in this state.
        yielded_item_lno: usize,
    },
}
33
/// If an `ItemStream` has peeked ahead, what it discovered
#[derive(Debug, Clone)]
struct ItemStreamPeeked<'s> {
    /// The next keyword
    keyword: KeywordRef<'s>,
    /// Token proving that we have peeked a line
    ///
    /// Obtained from `Lines::peek`.
    /// Needed to look at the peeked line's text again, and to consume the line.
    line: lines::Peeked,
    /// Length of the suffix of the line that is the arguments rather than the keyword
    ///
    /// Does not include the first whitespace, that terminated the keyword.
    ///
    /// `0` if the line consists of just the keyword.
    args_len: usize,
}
46
/// An Item that has been lexed but not parsed
///
/// Consists of the keyword, the not-yet-parsed arguments from the keyword line,
/// and the Object (if any) that followed the keyword line.
#[derive(Debug, Clone, amplify::Getters)]
pub struct UnparsedItem<'s> {
    /// The item's Keyword
    #[getter(as_copy)]
    keyword: KeywordRef<'s>,
    /// The Item's Arguments
    // No derived getter; access is via the hand-written
    // `args`, `args_mut` and `args_copy` methods.
    #[getter(skip)]
    args: ArgumentStream<'s>,
    /// The Item's Object, if there was one
    #[getter(as_clone)]
    object: Option<UnparsedObject<'s>>,
}
60
/// Reader for arguments on an Item
///
/// Represents the (remaining) arguments.
///
/// Arguments are separated by linear whitespace ([`WS`]).
/// The `Iterator` impl yields them one at a time.
#[derive(Debug, Clone)]
pub struct ArgumentStream<'s> {
    /// The remaining unparsed arguments
    ///
    /// Can start with WS, which is usually trimmed
    rest: &'s str,

    /// Original line length
    ///
    /// Used for reporting column of argument errors.
    whole_line_len: usize,

    /// Remaining length *before* we last yielded.
    ///
    /// That is, the length of `rest` after trimming leading whitespace,
    /// at the point just before the most recent argument (or remainder) was handed out.
    /// Initially equal to `whole_line_len`.
    previous_rest_len: usize,
}
79
/// An Object that has been lexed but not parsed
///
/// The delimiter lines have been checked and stripped;
/// the base64 data has *not* been decoded or validated yet (see `decode_data`).
#[derive(Debug, Clone, amplify::Getters)]
pub struct UnparsedObject<'s> {
    /// The Label
    ///
    /// Taken from the header delimiter line; the footer's label was checked to be equal.
    #[getter(as_copy)]
    label: &'s str,
    /// The portion of the input document which is base64 data (and newlines)
    ///
    /// This is the text between the header and footer delimiter lines.
    #[getter(skip)]
    data_b64: &'s str,
}
90
91impl<'s> ItemStream<'s> {
92    /// Start reading a network document as a series of Items
93    pub fn new(s: &'s str) -> Result<Self, ParseError> {
94        Ok(ItemStream {
95            whole_for_signatures: s,
96            lines: Lines::new(s),
97            peeked: PeekState::None {
98                yielded_item_lno: 0,
99            },
100        })
101    }
102
103    /// Line number for reporting an error we have just discovered
104    ///
105    /// If we have recent peeked, we report the line number of the peeked keyword line.
106    ///
107    /// Otherwise, we report the line number of the most-recently yielded item.
108    pub fn lno_for_error(&self) -> usize {
109        match self.peeked {
110            PeekState::Some { .. } => {
111                // The error was presumably caused by whatever was seen in the peek.
112                // That's the current line number.
113                self.lines.peek_lno()
114            }
115            PeekState::None { yielded_item_lno } => {
116                // The error was presumably caused by the results of next_item().
117                yielded_item_lno
118            }
119        }
120    }
121
122    /// Core of peeking.  Tries to make `.peeked` be `Some`.
123    fn peek_internal<'i>(&'i mut self) -> Result<(), EP> {
124        if matches!(self.peeked, PeekState::None { .. }) {
125            let Some(peeked) = self.lines.peek() else {
126                return Ok(());
127            };
128
129            let peeked_line = self.lines.peeked_line(&peeked);
130
131            let (keyword, args) = peeked_line.split_once(WS).unwrap_or((peeked_line, ""));
132            let keyword = KeywordRef::new(keyword)?;
133
134            self.peeked = PeekState::Some(ItemStreamPeeked {
135                keyword,
136                line: peeked,
137                args_len: args.len(),
138            });
139        }
140
141        Ok(())
142    }
143
144    /// Peek the next keyword
145    pub fn peek_keyword(&mut self) -> Result<Option<KeywordRef<'s>>, EP> {
146        self.peek_internal()?;
147        let PeekState::Some(peeked) = &self.peeked else {
148            return Ok(None);
149        };
150        Ok(Some(peeked.keyword))
151    }
152
153    /// Obtain the body so far, suitable for hashing for a Regular signature
154    pub fn body_sofar_for_signature(&self) -> SignedDocumentBody<'s> {
155        let body = self
156            .whole_for_signatures
157            .strip_end_counted(self.lines.remaining().len());
158        SignedDocumentBody { body }
159    }
160
    /// Parse a (sub-)document with its own signatures
    ///
    /// The body (`B`) is parsed first, then the signatures (`S`),
    /// and the two are combined using `O::from_parts`.
    ///
    /// `outer_stop` is passed on to both the body parser and the signatures parser,
    /// in each case combined with additional stop conditions (see the code).
    ///
    /// On return — successful or not — `self` has advanced past
    /// whatever the body and signatures parsers consumed.
    pub fn parse_signed<
        B: NetdocParseable,
        S: NetdocParseable,
        O: NetdocSigned<Body = B, Signatures = S>,
    >(
        &mut self,
        outer_stop: stop_at!(),
    ) -> Result<O, EP> {
        // Work on a copy of this stream whose text-for-signature-hashing starts
        // at our current position (ie, where the unconsumed remainder begins),
        // rather than at the start of the enclosing document.
        let mut input = ItemStream {
            whole_for_signatures: &self.whole_for_signatures
                [self.whole_for_signatures.len() - self.lines.remaining().len()..],
            ..self.clone()
        };
        // The parsing is done in a closure so that `?` leaves only the closure,
        // not this function: the write-back to `*self` below must always run.
        let r = (|| {
            // Both parts stop at whatever the caller asked for, and at the start of another `B`.
            let inner_always_stop = outer_stop | StopAt::doc_intro::<B>();
            // The body additionally stops where the signatures start.
            let body = B::from_items(&mut input, inner_always_stop | StopAt::doc_intro::<S>())?;
            let signatures = S::from_items(&mut input, inner_always_stop)?;
            let signed = O::from_parts(body, signatures);
            Ok(signed)
        })(); // don't exit here

        // Adopt the sub-stream's position and peek state,
        // but keep our own (outer) text for signature hashing.
        *self = ItemStream {
            whole_for_signatures: self.whole_for_signatures,
            ..input
        };

        r
    }
190
191    /// Obtain the inputs that would be needed to hash any (even Irregular) signature
192    ///
193    /// These are the hash inputs which would be needed for the next item,
194    /// assuming it's a signature keyword.
195    pub fn peek_signature_hash_inputs(
196        &mut self,
197        body: SignedDocumentBody<'s>,
198    ) -> Result<Option<SignatureHashInputs<'s>>, EP> {
199        self.peek_internal()?;
200        let PeekState::Some(peeked) = &self.peeked else {
201            return Ok(None);
202        };
203        let signature_item_line = self.lines.peeked_line(&peeked.line);
204        let signature_item_kw_spc = signature_item_line.strip_end_counted(peeked.args_len);
205        Ok(Some(SignatureHashInputs {
206            body,
207            signature_item_kw_spc,
208            signature_item_line,
209        }))
210    }
211
212    /// Yield the next item.
213    pub fn next_item(&mut self) -> Result<Option<UnparsedItem<'s>>, EP> {
214        self.peek_internal()?;
215        let peeked = match self.peeked {
216            PeekState::None { .. } => return Ok(None),
217            PeekState::Some { .. } => match mem::replace(
218                &mut self.peeked,
219                PeekState::None {
220                    yielded_item_lno: self.lines.peek_lno(),
221                },
222            ) {
223                PeekState::Some(peeked) => peeked,
224                PeekState::None { .. } => panic!("it was Some just now"),
225            },
226        };
227
228        let keyword = peeked.keyword;
229        let line = self.lines.consume_peeked(peeked.line);
230        let args = &line[keyword.len()..];
231        let args = ArgumentStream::new(args, line.len());
232
233        let object = if self.lines.remaining().starts_with('-') {
234            fn pem_delimiter<'s>(lines: &mut Lines<'s>, start: &str) -> Result<&'s str, EP> {
235                let line = lines.next().ok_or(
236                    // If this is the *header*, we already know there's a line,
237                    // so this error path is only for footers.
238                    EP::ObjectMissingFooter,
239                )?;
240                let label = line
241                    .strip_prefix(start)
242                    .ok_or(EP::InvalidObjectDelimiters)?
243                    .strip_suffix(PEM_AFTER_LABEL)
244                    .ok_or(EP::InvalidObjectDelimiters)?;
245                Ok(label)
246            }
247
248            let label1 = pem_delimiter(&mut self.lines, PEM_HEADER_START)?;
249            let base64_start_remaining = self.lines.remaining();
250            while !self.lines.remaining().starts_with('-') {
251                let _: &str = self.lines.next().ok_or(EP::ObjectMissingFooter)?;
252            }
253            let data_b64 = base64_start_remaining.strip_end_counted(self.lines.remaining().len());
254            let label2 = pem_delimiter(&mut self.lines, PEM_FOOTER_START)?;
255            let label = [label1, label2]
256                .into_iter()
257                .all_equal_value()
258                .map_err(|_| EP::ObjectMismatchedLabels)?;
259            Some(UnparsedObject { label, data_b64 })
260        } else {
261            None
262        };
263
264        Ok(Some(UnparsedItem {
265            keyword,
266            args,
267            object,
268        }))
269    }
270}
271
272impl<'s> UnparsedItem<'s> {
273    /// Access the arguments, mutably (for consuming and parsing them)
274    pub fn args_mut(&mut self) -> &mut ArgumentStream<'s> {
275        &mut self.args
276    }
277    /// Access a copy of the arguments
278    ///
279    /// When using this, be careful not to process any arguments twice.
280    pub fn args_copy(&self) -> ArgumentStream<'s> {
281        self.args.clone()
282    }
283
284    /// Access the arguments (readonly)
285    ///
286    /// When using this, be careful not to process any arguments twice.
287    pub fn args(&self) -> &ArgumentStream<'s> {
288        &self.args
289    }
290
291    /// Check that this item has no Object.
292    pub fn check_no_object(&self) -> Result<(), EP> {
293        if self.object.is_some() {
294            return Err(EP::ObjectUnexpected);
295        }
296        Ok(())
297    }
298    /// Convenience method for handling an error parsing an arguemnt
299    ///
300    /// Returns a closure that converts every error into [`ArgumentError::Invalid`]
301    /// and then to an [`ErrorProblem`] using
302    /// [`.args().handle_error()`](ArgumentStream::handle_error).
303    ///
304    /// Useful in manual `ItemValueParseable` impls, when parsing arguments ad-hoc.
305    pub fn invalid_argument_handler<E>(
306        &self,
307        field: &'static str,
308    ) -> impl FnOnce(E) -> ErrorProblem {
309        let error = self.args().handle_error(field, AE::Invalid);
310        move |_any_error| error
311    }
312}
313
/// End of an argument list that does not accept any further (unknown) arguments
///
/// Implements `ItemArgumentParseable`.  Parses successfully iff the argument list is empty.
///
/// Also the success value of `ArgumentStream::reject_extra_args`.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[allow(clippy::exhaustive_structs)]
pub struct NoFurtherArguments;
320
321impl ItemArgumentParseable for NoFurtherArguments {
322    fn from_args(args: &mut ArgumentStream) -> Result<Self, AE> {
323        Ok(args.reject_extra_args()?)
324    }
325}
326
327impl<'s> Iterator for ItemStream<'s> {
328    type Item = Result<UnparsedItem<'s>, EP>;
329    fn next(&mut self) -> Option<Result<UnparsedItem<'s>, EP>> {
330        self.next_item().transpose()
331    }
332}
333
334impl<'s> ArgumentStream<'s> {
335    /// Make a new `ArgumentStream` from a string
336    ///
337    /// The string may start with whitespace (which will be ignored).
338    pub fn new(rest: &'s str, whole_line_len: usize) -> Self {
339        let previous_rest_len = whole_line_len;
340        ArgumentStream {
341            rest,
342            whole_line_len,
343            previous_rest_len,
344        }
345    }
346
347    /// Consume this whole `ArgumnetStream`, giving the remaining arguments as a string
348    ///
349    /// The returned string won't start with whitespace.
350    //
351    /// `self` will be empty on return.
352    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
353    pub fn into_remaining(&mut self) -> &'s str {
354        self.prep_yield();
355        mem::take(&mut self.rest)
356    }
357
358    /// Return the component parts of this `ArgumnetStream`
359    ///
360    /// The returned string might start with whitespace.
361    pub fn whole_line_len(&self) -> usize {
362        self.whole_line_len
363    }
364
365    /// Prepares to yield an argument (or the rest)
366    ///
367    ///  * Trims leading WS from `rest`.
368    ///  * Records the `previous_rest_len`
369    fn prep_yield(&mut self) {
370        self.rest = self.rest.trim_start_matches(WS);
371        self.previous_rest_len = self.rest.len();
372    }
373
374    /// Prepares to yield, and then determines if there *is* anything to yield.
375    ///
376    ///  * Trim leading whitespace
377    ///  * Records the `previous_rest_len`
378    ///  * See if we're now empty
379    pub fn something_to_yield(&mut self) -> bool {
380        self.prep_yield();
381        !self.rest.is_empty()
382    }
383
384    /// Throw and error if there are further arguments
385    //
386    // (We don't take `self` by value because that makes use with `UnparsedItem` annoying.)
387    pub fn reject_extra_args(&mut self) -> Result<NoFurtherArguments, UnexpectedArgument> {
388        if self.something_to_yield() {
389            let column = self.next_arg_column();
390            Err(UnexpectedArgument { column })
391        } else {
392            Ok(NoFurtherArguments)
393        }
394    }
395
396    /// Convert a "length of `rest`" into the corresponding column number.
397    fn arg_column_from_rest_len(&self, rest_len: usize) -> usize {
398        // Can't underflow since rest is always part of the whole.
399        // Can't overflow since that would mean the document was as big as the address space.
400        self.whole_line_len - rest_len + 1
401    }
402
403    /// Obtain the column number of the previously yielded argument.
404    ///
405    /// (After `into_remaining`, gives the column number
406    /// of the start of the returned remaining argument string.)
407    pub fn prev_arg_column(&self) -> usize {
408        self.arg_column_from_rest_len(self.previous_rest_len)
409    }
410
411    /// Obtains the column number of the *next* argument.
412    ///
413    /// Should be called after `something_to_yield`; otherwise the returned value
414    /// may point to whitespace which is going to be skipped.
415    // ^ this possible misuse doesn't seem worth defending against with type-fu,
416    //   for a private function with few call sites.
417    fn next_arg_column(&self) -> usize {
418        self.arg_column_from_rest_len(self.rest.len())
419    }
420
421    /// Convert an `ArgumentError` to an `ErrorProblem`.
422    ///
423    /// The caller must supply the field name.
424    pub fn handle_error(&self, field: &'static str, ae: ArgumentError) -> ErrorProblem {
425        self.error_handler(field)(ae)
426    }
427
428    /// Return a converter from `ArgumentError` to `ErrorProblem`.
429    ///
430    /// Useful in `.map_err`.
431    pub fn error_handler(
432        &self,
433        field: &'static str,
434    ) -> impl Fn(ArgumentError) -> ErrorProblem + 'static {
435        let column = self.prev_arg_column();
436        move |ae| match ae {
437            AE::Missing => EP::MissingArgument { field },
438            AE::Invalid => EP::InvalidArgument { field, column },
439            AE::Unexpected => EP::UnexpectedArgument { column },
440        }
441    }
442}
443
444impl<'s> Iterator for ArgumentStream<'s> {
445    type Item = &'s str;
446    fn next(&mut self) -> Option<&'s str> {
447        if !self.something_to_yield() {
448            return None;
449        }
450        let arg;
451        (arg, self.rest) = self.rest.split_once(WS).unwrap_or((self.rest, ""));
452        Some(arg)
453    }
454}
455
456impl<'s> UnparsedObject<'s> {
457    /// Obtain the Object data, as decoded bytes
458    pub fn decode_data(&self) -> Result<Vec<u8>, EP> {
459        crate::parse::tokenize::base64_decode_multiline(self.data_b64)
460            .map_err(|_e| EP::ObjectInvalidBase64)
461    }
462}