Skip to main content

tor_netdoc/
parse2.rs

1//! New netdoc parsing arrangements, with `derive`
2//!
3//! # Parsing principles
4//!
//! A parseable network document is a type implementing [`NetdocParseable`],
//! usually via the
//! [`NetdocParseable` derive-deftly macro](crate::derive_deftly_template_NetdocParseable).
8//!
9//! A document type is responsible for recognising its own heading item.
//! Its parser will also be told of other structural items that it should not consume.
11//! The structural lines can then be used to pass control to the appropriate parser.
12//!
//! A "structural item" is a netdoc item that defines the structure of the document.
14//! This includes the intro items for whole documents,
15//! the items that introduce document sections
16//! (which we model by treating the section as a sub-document)
17//! and signature items (which introduce the signatures at the end of the document,
18//! and after which no non-signature items may appear).
19//!
20//! # Ordering
21//!
22//! We don't always parse things into a sorted order.
23//! Sorting will be done when assembling documents, before outputting.
24//!
25//! # Types, and signature handling
26//!
27//! Most top-level network documents are signed somehow.
28//! In this case there are three types:
29//!
30//!   * **`FooUnverified`**: a signed `Foo`, with its signatures, not yet verified.
31//!     Implements [`NetdocParseableUnverified`],
32//!     typically by invoking the
//!     [`NetdocParseableUnverified` derive macro](crate::derive_deftly_template_NetdocParseableUnverified)
34//!     on `Foo`.
35//!
36//!     Type-specific methods are provided for verification,
37//!     to obtain a `Foo`.
38//!
39//!   * **`Foo`**: the body data for the document.
40//!     This doesn't contain any signatures.
41//!     Having one of these to play with means signatures have already been validated.
42//!     Can be parsed as part of the signed document,
43//!     via the `NetdocParseable` implementation on `FooUnverified`,
//!     and then obtained via `.verify_...` method(s) on `FooUnverified`.
45//!
46//!   * **`FooSignatures`**: the signatures for a `Foo`.
47//!     Implements `NetdocParseableSignatures`, via
48//!     [derive](crate::derive_deftly_template_NetdocParseableSignatures),
49//!     with `#[deftly(netdoc(signatures))]`.
50//!
51//! # Relationship to tor_netdoc::parse
52//!
53//! This is a completely new parsing approach, based on different principles.
54//! The key principle is the recognition of "structural keywords",
//! recursively within a parsing stack, via the [`NetdocParseable`] trait.
56//!
57//! This allows the parser to be derived.  We have type-driven parsing
58//! of whole Documents, Items, and their Arguments and Objects,
59//! including of their multiplicity.
60//!
61//! The different keyword handling means we can't use most of the existing lexer,
62//! and need new item parsing API:
63//!
64//!  * [`NetdocParseable`] trait.
65//!  * [`KeywordRef`] type.
66//!  * [`ItemStream`], [`UnparsedItem`], [`ArgumentStream`], [`UnparsedObject`].
67//!
68//! The different error handling means we have our own error types.
//! (The crate's existing parse error type has information that we don't track,
70//! and is also a portmanteau error for parsing, writing, and other functions.)
71//!
72//! Document signing is handled in a more abstract way.
73//!
74//! Some old netdoc constructs are not supported.
75//! For example, the obsolete `opt` prefix on safe-to-ignore Items.
76//! The parser may make different decisions about netdocs with anomalous item ordering.
77
78#[doc(hidden)]
79#[macro_use]
80pub mod internal_prelude;
81
82#[macro_use]
83mod structural;
84
85#[macro_use]
86mod derive;
87
88mod error;
89mod impls;
90pub mod keyword;
91mod lex;
92mod lines;
93pub mod multiplicity;
94mod signatures;
95mod traits;
96
97#[cfg(feature = "incomplete")]
98pub mod poc;
99
100use internal_prelude::*;
101
102pub use error::{ArgumentError, ErrorProblem, ParseError, UnexpectedArgument, VerifyFailed};
103pub use impls::times::NdaSystemTimeDeprecatedSyntax;
104pub use keyword::KeywordRef;
105pub use lex::{ArgumentStream, ItemStream, NoFurtherArguments, UnparsedItem, UnparsedObject};
106pub use lines::{Lines, Peeked, StrExt};
107pub use signatures::{
108    HasUnverifiedParsedBody, NetdocParseableSignatures, NetdocParseableUnverified,
109    SignatureHashInputs, SignatureHashesAccumulator, SignatureItemParseable, SignaturesData,
110    sig_hashes,
111};
112#[allow(deprecated)]
113#[deprecated]
114pub use signatures::{check_validity_time, check_validity_time_tolerance};
115pub use structural::{StopAt, StopPredicate};
116pub use traits::{
117    IsStructural, ItemArgumentParseable, ItemObjectParseable, ItemValueParseable, NetdocParseable,
118    NetdocParseableFields,
119};
120
121#[doc(hidden)]
122pub use derive::netdoc_parseable_derive_debug;
123
124pub(crate) use internal_prelude::EP;
125
126//---------- input ----------
127
/// Options for parsing
///
/// Specific document and type parsing methods may use these parameters
/// to control their parsing behaviour at run-time.
///
/// `Default` is derived via `educe`; each field's default is given by its
/// own `#[educe(Default(...))]` attribute where present.
#[derive(educe::Educe, Debug, Clone)]
// The private unit field at the end of the struct is a deliberate, hand-written
// stand-in for `#[non_exhaustive]`, so silence the lint that would suggest the attribute.
#[allow(clippy::manual_non_exhaustive)]
#[educe(Default)]
pub struct ParseOptions {
    /// Retain unknown values?
    ///
    /// Some field types, especially for flags fields, have the capability to retain
    /// unknown flags.  But, whereas known flags can be represented as single bits,
    /// representing unknown flags involves allocating and copying strings.
    /// Unless the document is to be reproduced, this is a waste of effort.
    ///
    /// Each document field type affected by this option should store the unknowns
    /// as `Unknown<HashSet<String>>` or similar.
    ///
    /// This feature should only be used where performance is important.
    /// For example, it is useful for types that appear in md consensus routerdescs,
    /// but less useful for types that appear only in a netstatus preamble.
    ///
    /// This is currently used for router flags.
    ///
    /// The default value is `Unknown::new_discard()`.
    #[educe(Default(expression = "Unknown::new_discard()"))]
    pub retain_unknown_values: Unknown<()>,

    // Like `#[non_exhaustive]`, but doesn't prevent use of struct display syntax with `..`
    //
    // NOTE(review): "struct display syntax" presumably means struct update syntax
    // (`Struct { field, ..base }`) - confirm the intended term.
    #[doc(hidden)]
    _private_non_exhaustive: (),
}
158
/// Input to a network document top-level parsing operation
///
/// Bundles the document text, the name of the file it came from,
/// and the [`ParseOptions`] to parse it with.
///
/// `'s` is the lifetime of the borrowed document text and filename.
///
/// Accessor methods are generated by `amplify::Getters`,
/// as directed by the `#[getter(...)]` attribute on each field.
#[derive(Debug, Clone, amplify::Getters)]
pub struct ParseInput<'s> {
    /// The actual document text
    #[getter(as_copy)]
    input: &'s str,

    /// Filename (for error reporting)
    ///
    /// Copied into the `file` field of any [`ParseError`] that is generated.
    #[getter(as_copy)]
    file: &'s str,

    /// Parsing options
    ///
    /// Initialised to `ParseOptions::default()` by [`ParseInput::new`];
    /// adjust them afterwards through the mutable getter (`options_mut`).
    #[getter(as_ref, as_mut)]
    options: ParseOptions,
}
174
175impl<'s> ParseInput<'s> {
176    /// Prepare to parse an input string
177    pub fn new(input: &'s str, file: &'s str) -> Self {
178        ParseInput {
179            input,
180            file,
181            options: ParseOptions::default(),
182        }
183    }
184
185    /// Enable retention of unknown values during parsing
186    ///
187    /// Convenience method to set
188    /// [`.options_mut().retain_unknown_values`](ParseOptions::retain_unknown_values)
189    /// to [`Unknown::Retained`].
190    #[cfg(feature = "retain-unknown")]
191    pub fn retain_unknown_values(&mut self) {
192        self.options_mut().retain_unknown_values = Unknown::Retained(());
193    }
194}
195
196//---------- parser ----------
197
198/// Common code for `parse_netdoc` and `parse_netdoc_multiple`
199///
200/// Creates the `ItemStream`, calls `parse_completely`, and handles errors.
201fn parse_internal<T, D: NetdocParseable>(
202    input: &ParseInput<'_>,
203    parse_completely: impl FnOnce(&mut ItemStream) -> Result<T, ErrorProblem>,
204) -> Result<T, ParseError> {
205    let mut items = ItemStream::new(input)?;
206    parse_completely(&mut items).map_err(|problem| ParseError {
207        problem,
208        doctype: D::doctype_for_error(),
209        file: input.file.to_owned(),
210        lno: items.lno_for_error(),
211        column: problem.column(),
212    })
213}
214
215/// Parse a network document - **toplevel entrypoint**
216pub fn parse_netdoc<D: NetdocParseable>(input: &ParseInput<'_>) -> Result<D, ParseError> {
217    parse_internal::<_, D>(input, |items| {
218        let doc = D::from_items(items, StopAt(false))?;
219        if let Some(_kw) = items.peek_keyword()? {
220            return Err(EP::MultipleDocuments);
221        }
222        Ok(doc)
223    })
224}
225
226/// Parse multiple concatenated network documents - **toplevel entrypoint**
227pub fn parse_netdoc_multiple<D: NetdocParseable>(
228    input: &ParseInput<'_>,
229) -> Result<Vec<D>, ParseError> {
230    parse_internal::<_, D>(input, |items| {
231        let mut docs = vec![];
232        while items.peek_keyword()?.is_some() {
233            let doc = D::from_items(items, StopAt(false))?;
234            docs.push(doc);
235        }
236        Ok(docs)
237    })
238}
239
240/// Parse multiple network documents, also returning their offsets  - **toplevel entrypoint**
241///
242/// Each returned document is accompanied by the byte offsets of its start and end.
243///
244/// (The netdoc metaformat does not allow anything in between subsequent documents in a file,
245/// so the end of one document is the start of the next.)
246///
247/// This returns byte offsets rather than string slices,
248/// because the caller can always convert the offsets into string slices,
249/// but it is not straightforward to convert string slices borrowed from some input string
250/// into offsets, in a way that is obviously correct without nightly `str::substr_range`.
251///
252/// Interfacing code can assume that slicing the input string with the returned
253/// [`usize`] values will not cause an out-of-bounds error, meaning runtime
254/// checks are not necessary there.
255pub fn parse_netdoc_multiple_with_offsets<D: NetdocParseable>(
256    input: &ParseInput<'_>,
257) -> Result<Vec<(D, usize, usize)>, ParseError> {
258    parse_internal::<_, D>(input, |items| {
259        let mut docs = vec![];
260        while items.peek_keyword()?.is_some() {
261            let start_pos = items.byte_position();
262            let doc = D::from_items(items, StopAt(false))?;
263            let end_pos = items.byte_position();
264
265            // Check start_pos and end_pos are in range.
266            if input.input.get(start_pos..end_pos).is_none() {
267                return Err(ErrorProblem::Internal("out-of-bounds bug?"));
268            }
269
270            docs.push((doc, start_pos, end_pos));
271        }
272        Ok(docs)
273    })
274}