tor_netdoc/parse2.rs
1//! New netdoc parsing arrangements, with `derive`
2//!
3//! # Parsing principles
4//!
//! A parseable network document is a type implementing [`NetdocParseable`],
//! usually via the
//! [`NetdocParseable` derive-deftly macro](crate::derive_deftly_template_NetdocParseable).
8//!
9//! A document type is responsible for recognising its own heading item.
//! Its parser will also be told of other structural items that it should not consume.
11//! The structural lines can then be used to pass control to the appropriate parser.
12//!
//! A "structural item" is a netdoc item that defines the structure of the document.
14//! This includes the intro items for whole documents,
15//! the items that introduce document sections
16//! (which we model by treating the section as a sub-document)
17//! and signature items (which introduce the signatures at the end of the document,
18//! and after which no non-signature items may appear).
19//!
20//! # Ordering
21//!
22//! We don't always parse things into a sorted order.
23//! Sorting will be done when assembling documents, before outputting.
24// TODO we don't implement deriving output yet.
25//!
26//! # Types, and signature handling
27//!
28//! Most top-level network documents are signed somehow.
29//! In this case there are three types:
30//!
31//! * **`FooUnverified`**: a signed `Foo`, with its signatures, not yet verified.
32//! Implements [`NetdocUnverified`],
33//! typically by invoking the
//! [`NetdocParseableUnverified` derive macro](crate::derive_deftly_template_NetdocParseableUnverified)
35//! on `Foo`.
36//!
37//! Type-specific methods are provided for verification,
38//! to obtain a `Foo`.
39//!
40//! * **`Foo`**: the body data for the document.
41//! This doesn't contain any signatures.
42//! Having one of these to play with means signatures have already been validated.
43//! Can be parsed as part of the signed document,
44//! via the `NetdocParseable` implementation on `FooUnverified`,
45//! and then obtained via `.verify_...` method(s) on `FooUnverified`,
46//!
47//! * **`FooSignatures`**: the signatures for a `Foo`.
48//! Implements `NetdocParseableSignatures`, via
49//! [derive](crate::derive_deftly_template_NetdocParseableSignatures),
50//! with `#[deftly(netdoc(signatures))]`.
51//!
52//! # Relationship to tor_netdoc::parse
53//!
54//! This is a completely new parsing approach, based on different principles.
55//! The key principle is the recognition of "structural keywords",
//! recursively within a parsing stack, via the [`NetdocParseable`] trait.
57//!
58//! This allows the parser to be derived. We have type-driven parsing
59//! of whole Documents, Items, and their Arguments and Objects,
60//! including of their multiplicity.
61//!
62//! The different keyword handling means we can't use most of the existing lexer,
63//! and need new item parsing API:
64//!
65//! * [`NetdocParseable`] trait.
66//! * [`KeywordRef`] type.
67//! * [`ItemStream`], [`UnparsedItem`], [`ArgumentStream`], [`UnparsedObject`].
68//!
69//! The different error handling means we have our own error types.
70//! (The crate's existing parse errors have information that we don't track,
71//! and is also a portmanteau error for parsing, writing, and other functions.)
72//!
73//! Document signing is handled in a more abstract way.
74//!
75//! Some old netdoc constructs are not supported.
76//! For example, the obsolete `opt` prefix on safe-to-ignore Items.
77//! The parser may make different decisions about netdocs with anomalous item ordering.
78
79#[doc(hidden)]
80#[macro_use]
81pub mod internal_prelude;
82
83#[macro_use]
84mod structural;
85
86#[macro_use]
87mod derive;
88
89mod error;
90mod impls;
91pub mod keyword;
92mod lex;
93mod lines;
94pub mod multiplicity;
95mod signatures;
96mod traits;
97
98#[cfg(feature = "plain-consensus")]
99pub mod poc;
100
101use internal_prelude::*;
102
103pub use error::{ArgumentError, ErrorProblem, ParseError, UnexpectedArgument, VerifyFailed};
104pub use impls::raw_data_object;
105pub use impls::times::NdaSystemTimeDeprecatedSyntax;
106pub use keyword::KeywordRef;
107pub use lex::{ArgumentStream, ItemStream, NoFurtherArguments, UnparsedItem, UnparsedObject};
108pub use lines::{Lines, Peeked, StrExt};
109pub use signatures::{
110 HasUnverifiedParsedBody, NetdocParseableSignatures, NetdocUnverified, SignatureHashInputs,
111 SignatureHashesAccumulator, SignatureItemParseable, SignaturesData, check_validity_time,
112 check_validity_time_tolerance, sig_hashes,
113};
114pub use structural::{StopAt, StopPredicate};
115pub use traits::{
116 IsStructural, ItemArgumentParseable, ItemObjectParseable, ItemValueParseable, NetdocParseable,
117 NetdocParseableFields,
118};
119
120#[doc(hidden)]
121pub use derive::netdoc_parseable_derive_debug;
122
123pub(crate) use internal_prelude::EP;
124
125//---------- input ----------
126
/// Options for parsing
///
/// Specific document and type parsing methods may use these parameters
/// to control their parsing behaviour at run-time.
///
/// Obtain a default-configured value via `ParseOptions::default()`.
#[derive(educe::Educe, Debug, Clone)]
// The `_private_non_exhaustive` field makes clippy think we meant
// `#[non_exhaustive]`; we deliberately didn't (see the field's comment).
#[allow(clippy::manual_non_exhaustive)]
#[educe(Default)]
pub struct ParseOptions {
    /// Retain unknown values?
    ///
    /// Some field types, especially for flags fields, have the capability to retain
    /// unknown flags. But, whereas known flags can be represented as single bits,
    /// representing unknown flags involves allocating and copying strings.
    /// Unless the document is to be reproduced, this is a waste of effort.
    ///
    /// Each document field type affected by this option should store the unknowns
    /// as `Unknown<HashSet<String>>` or similar.
    ///
    /// This feature should only be used where performance is important.
    /// For example, it is useful for types that appear in md consensus routerdescs,
    /// but less useful for types that appear only in a netstatus preamble.
    ///
    /// This is currently used for router flags.
    #[educe(Default(expression = "Unknown::new_discard()"))]
    pub retain_unknown_values: Unknown<()>,

    // Like `#[non_exhaustive]`, but doesn't prevent use of struct display syntax with `..`
    // (callers can still write `ParseOptions { retain_unknown_values, ..Default::default() }`).
    #[doc(hidden)]
    _private_non_exhaustive: (),
}
157
/// Input to a network document top-level parsing operation
///
/// Construct one with [`ParseInput::new`].
pub struct ParseInput<'s> {
    /// The actual document text
    input: &'s str,
    /// Filename (for error reporting only; no file I/O is performed here)
    file: &'s str,
    /// Parsing options
    options: ParseOptions,
}
167
168impl<'s> ParseInput<'s> {
169 /// Prepare to parse an input string
170 pub fn new(input: &'s str, file: &'s str) -> Self {
171 ParseInput {
172 input,
173 file,
174 options: ParseOptions::default(),
175 }
176 }
177}
178
179//---------- parser ----------
180
181/// Common code for `parse_netdoc` and `parse_netdoc_multiple`
182///
183/// Creates the `ItemStream`, calls `parse_completely`, and handles errors.
184fn parse_internal<T, D: NetdocParseable>(
185 input: &ParseInput<'_>,
186 parse_completely: impl FnOnce(&mut ItemStream) -> Result<T, ErrorProblem>,
187) -> Result<T, ParseError> {
188 let mut items = ItemStream::new(input)?;
189 parse_completely(&mut items).map_err(|problem| ParseError {
190 problem,
191 doctype: D::doctype_for_error(),
192 file: input.file.to_owned(),
193 lno: items.lno_for_error(),
194 column: problem.column(),
195 })
196}
197
198/// Parse a network document - **toplevel entrypoint**
199pub fn parse_netdoc<D: NetdocParseable>(input: &ParseInput<'_>) -> Result<D, ParseError> {
200 parse_internal::<_, D>(input, |items| {
201 let doc = D::from_items(items, StopAt(false))?;
202 if let Some(_kw) = items.peek_keyword()? {
203 return Err(EP::MultipleDocuments);
204 }
205 Ok(doc)
206 })
207}
208
209/// Parse multiple concatenated network documents - **toplevel entrypoint**
210pub fn parse_netdoc_multiple<D: NetdocParseable>(
211 input: &ParseInput<'_>,
212) -> Result<Vec<D>, ParseError> {
213 parse_internal::<_, D>(input, |items| {
214 let mut docs = vec![];
215 while items.peek_keyword()?.is_some() {
216 let doc = D::from_items(items, StopAt(false))?;
217 docs.push(doc);
218 }
219 Ok(docs)
220 })
221}
222
223/// Parse multiple network documents, also returning their offsets - **toplevel entrypoint**
224///
225/// Each returned document is accompanied by the byte offsets of its start and end.
226///
227/// (The netdoc metaformat does not allow anything in between subsequent documents in a file,
228/// so the end of one document is the start of the next.)
229///
230/// This returns byte offsets rather than string slices,
231/// because the caller can always convert the offsets into string slices,
232/// but it is not straightforward to convert string slices borrowed from some input string
233/// into offsets, in a way that is obviously correct without nightly `str::substr_range`.
234///
235/// Interfacing code can assume that slicing the input string with the returned
236/// [`usize`] values will not cause an out-of-bounds error, meaning runtime
237/// checks are not necessary there.
238pub fn parse_netdoc_multiple_with_offsets<D: NetdocParseable>(
239 input: &ParseInput<'_>,
240) -> Result<Vec<(D, usize, usize)>, ParseError> {
241 parse_internal::<_, D>(input, |items| {
242 let mut docs = vec![];
243 while items.peek_keyword()?.is_some() {
244 let start_pos = items.byte_position();
245 let doc = D::from_items(items, StopAt(false))?;
246 let end_pos = items.byte_position();
247
248 // Check start_pos and end_pos are in range.
249 if input.input.get(start_pos..end_pos).is_none() {
250 return Err(ErrorProblem::Internal("out-of-bounds bug?"));
251 }
252
253 docs.push((doc, start_pos, end_pos));
254 }
255 Ok(docs)
256 })
257}