libreda_stream_parser/lib.rs

// SPDX-FileCopyrightText: 2023 Thomas Kramer
// SPDX-License-Identifier: GPL-3.0-or-later

//! A simple library for parsing data streams.
//!
//! Parsing is split into two tasks:
//! * Splitting an iterator into tokens. This is done by a [`Lexer`].
//! * Processing the tokens: the [`Tokenized`] struct provides helper functions for processing the stream of tokens.
//!
//! # Example
//! ```
//! use itertools::{Itertools, PeekingNext};
//! use libreda_stream_parser::*;
//!
//! struct ArrayLexer {}
//!
//! impl Lexer for ArrayLexer {
//!     type Char = char;
//!
//!     fn consume_next_token(
//!         &mut self,
//!         input: &mut (impl Iterator<Item = Self::Char> + PeekingNext),
//!         mut output: impl FnMut(Self::Char),
//!     ) -> Result<(), ParserError<char>> {
//!         // Skip whitespace.
//!         let _n = input.peeking_take_while(|c| c.is_whitespace()).count();
//!
//!         let is_terminal_char = |c: char| -> bool {
//!             let terminals = "[],";
//!             c.is_whitespace() || terminals.contains(c)
//!         };
//!
//!         if let Some(c) = input.next() {
//!             output(c);
//!             // Continue reading the token if `c` was not a terminal character.
//!             if !is_terminal_char(c) {
//!                 input
//!                     .peeking_take_while(|&c| !is_terminal_char(c))
//!                     .for_each(output);
//!             }
//!         }
//!
//!         Ok(())
//!     }
//! }
//!
//! /// Parse an array of the form `[1.0, 2, 3.1324]`.
//! fn parse_array(data: &str) -> Result<Vec<f64>, ParserError<char>> {
//!     let mut tk = tokenize(data.chars(), ArrayLexer {});
//!
//!     tk.advance()?;
//!
//!     let mut arr: Vec<f64> = vec![];
//!
//!     tk.expect_str("[")?;
//!
//!     loop {
//!         if tk.test_str("]")? {
//!             break;
//!         }
//!
//!         let num = tk.take_and_parse()?;
//!         arr.push(num);
//!
//!         tk.expect_str(",")?;
//!     }
//!
//!     Ok(arr)
//! }
//!
//! let data = r#"
//!     [
//!         1.23,
//!         2.34,
//!         3.456,
//!     ]
//! "#;
//!
//! let arr = parse_array(data).expect("parsing failed");
//!
//! assert_eq!(arr, vec![1.23, 2.34, 3.456]);
//! ```

#![deny(missing_docs)]

use std::error::Error;
use std::fmt;
use std::iter::Peekable;
use std::num::ParseIntError;
use std::str::FromStr;

use itertools::PeekingNext;

/// Partition an input stream into tokens.
/// The lexer consumes one token from the input stream in each call of `consume_next_token`.
/// Emitting no characters signals the end of the stream to [`Tokenized`].
pub trait Lexer {
    /// Character datatype used by this lexer. Typically, this might be `char` or `u8`.
    type Char;

    /// Consume the next token from the iterator.
    fn consume_next_token(
        &mut self,
        input: &mut (impl Iterator<Item = Self::Char> + PeekingNext),
        output: impl FnMut(Self::Char),
    ) -> Result<(), ParserError<Self::Char>>;
}

/// Provide sequential access to tokens that are created on the fly
/// by a [`Lexer`] which splits the underlying character stream.
pub struct Tokenized<I, L>
where
    I: Iterator,
{
    /// Underlying iterator over characters.
    iter: I,
    /// Tokenizer/lexer.
    lexer: L,
    /// Tells whether `current_token` currently holds a valid token.
    has_current: bool,
    /// Buffer holding the most recently read token.
    current_token: Option<Vec<I::Item>>,
}

// TODO: Implementing `StreamingIterator` from `streaming_iterator` could be a good fit.
impl<I, L> Iterator for Tokenized<I, L>
where
    I: Iterator + PeekingNext,
    L: Lexer<Char = I::Item>,
    I::Item: PartialEq + Eq + Clone + Copy + 'static,
{
    type Item = Vec<I::Item>;

    fn next(&mut self) -> Option<Self::Item> {
        self.next_ref().map(|e| e.to_vec())
    }
}

impl<I, L> Tokenized<I, L>
where
    I: Iterator + PeekingNext,
    L: Lexer<Char = I::Item>,
    I::Item: PartialEq + Eq + Clone + Copy + 'static,
{
    /// Go to the next token and return a reference to it, if any.
    /// Lexer errors are swallowed and treated like the end of the stream.
    pub fn next_ref(&mut self) -> Option<&[I::Item]> {
        self.advance().ok().and_then(|_| self.current_token_ref())
    }

    /// Consume the current token and return it.
    /// Note that the current token is undefined before calling `advance` the first time.
    pub fn take(&mut self) -> Result<Vec<I::Item>, ParserError<I::Item>> {
        let s = self.current_token();
        self.advance()?;
        if let Some(s) = s {
            Ok(s)
        } else {
            Err(ParserError::UnexpectedEndOfFile)
        }
    }

    /// Advance to the next token.
    pub fn advance(&mut self) -> Result<(), ParserError<I::Item>> {
        // Reuse the allocation of the previous token as the output buffer.
        let mut buffer = self.current_token.take().unwrap_or_default();
        buffer.clear();

        self.lexer
            .consume_next_token(&mut self.iter, |c| buffer.push(c))?;

        // An empty buffer means the lexer produced no token: end of input.
        let has_next = !buffer.is_empty();

        if has_next {
            self.current_token = Some(buffer);
        }

        self.has_current = has_next;
        Ok(())
    }

    /// Access the current token by reference without consuming it.
    pub fn current_token_ref(&self) -> Option<&[I::Item]> {
        if self.has_current {
            self.current_token.as_deref()
        } else {
            None
        }
    }

    /// Get a clone of the current token without consuming it.
    pub fn current_token(&self) -> Option<Vec<I::Item>> {
        self.current_token_ref().map(|s| s.to_vec())
    }

    /// Test if the current token equals the expected token.
    /// If it matches, consume it and return `Ok(())`.
    /// Otherwise return a [`ParserError::UnexpectedToken`] containing the expected and the actual token.
    /// Note that the current token is undefined before calling `advance` the first time.
    pub fn expect(
        &mut self,
        s: impl IntoIterator<Item = I::Item> + Clone,
    ) -> Result<(), ParserError<I::Item>> {
        match &self.current_token {
            None => Err(ParserError::UnexpectedEndOfFile)?,
            Some(token) => {
                if token.iter().copied().eq(s.clone()) {
                    self.advance()?;
                    Ok(())
                } else {
                    Err(ParserError::UnexpectedToken(
                        s.into_iter().collect(),
                        token.clone(),
                    ))
                }
            }
        }
    }

    /// Test if the current token equals the expected token.
    /// The token is consumed only if it matches.
    /// Note that the current token is undefined before calling `advance` the first time.
    pub fn test(&mut self, s: &[I::Item]) -> Result<bool, ParserError<I::Item>> {
        let result = self.peeking_test(s)?;
        if result {
            self.advance()?;
        }
        Ok(result)
    }

    /// Test if the current token equals the expected token.
    /// The token is not consumed.
    /// Note that the current token is undefined before calling `advance` the first time.
    pub fn peeking_test(&mut self, s: &[I::Item]) -> Result<bool, ParserError<I::Item>> {
        if self.current_token.is_none() {
            Err(ParserError::UnexpectedEndOfFile)?;
        }

        Ok(self.current_token_ref() == Some(s))
    }

    /// Consume all tokens up to and including `s`.
    pub fn skip_until(&mut self, s: &[I::Item]) -> Result<(), ParserError<I::Item>> {
        while !self.test(s)? {
            self.advance()?;
        }
        Ok(())
    }
}

impl<I, L> Tokenized<I, L>
where
    I: Iterator<Item = char> + PeekingNext,
    L: Lexer<Char = I::Item>,
{
    /// Get a clone of the current token without consuming it.
    /// Note that the current token is undefined before calling `advance` the first time.
    pub fn current_token_str(&self) -> Option<String> {
        self.current_token_ref().map(|s| s.iter().collect())
    }

    /// Consume the current token, convert it to a string and return it.
    pub fn take_str(&mut self) -> Result<String, ParserError<I::Item>> {
        let s = self.current_token_str();
        self.advance()?;
        if let Some(s) = s {
            Ok(s)
        } else {
            Err(ParserError::UnexpectedEndOfFile)
        }
    }

    /// Consume a token and try to convert it to `F` using `FromStr`.
    /// Note that the current token is undefined before calling `advance` the first time.
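    ///
    /// # Example
    ///
    /// A minimal sketch, assuming a simple whitespace-splitting lexer
    /// (its implementation is hidden here; `WhitespaceLexer` is not part of this crate):
    ///
    /// ```
    /// # use itertools::PeekingNext;
    /// # use libreda_stream_parser::*;
    /// # struct WhitespaceLexer;
    /// # impl Lexer for WhitespaceLexer {
    /// #     type Char = char;
    /// #     fn consume_next_token(
    /// #         &mut self,
    /// #         input: &mut (impl Iterator<Item = Self::Char> + PeekingNext),
    /// #         mut output: impl FnMut(Self::Char),
    /// #     ) -> Result<(), ParserError<char>> {
    /// #         // Skip whitespace, then emit one whitespace-delimited token.
    /// #         while input.peeking_next(|c| c.is_whitespace()).is_some() {}
    /// #         while let Some(c) = input.peeking_next(|c| !c.is_whitespace()) {
    /// #             output(c);
    /// #         }
    /// #         Ok(())
    /// #     }
    /// # }
    /// let mut tk = tokenize("42 x".chars(), WhitespaceLexer);
    /// tk.advance().unwrap();
    ///
    /// // "42" parses as an integer.
    /// let num: u32 = tk.take_and_parse().unwrap();
    /// assert_eq!(num, 42);
    ///
    /// // "x" does not: the token is rejected as an invalid literal.
    /// assert!(tk.take_and_parse::<u32>().is_err());
    /// ```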
    pub fn take_and_parse<F: FromStr>(&mut self) -> Result<F, ParserError<I::Item>> {
        let result = if let Some(token) = self.current_token_ref() {
            let string: String = token.iter().collect();

            if let Ok(parsed) = string.parse::<F>() {
                Ok(parsed)
            } else {
                Err(ParserError::InvalidLiteral(token.to_vec()))
            }
        } else {
            Err(ParserError::UnexpectedEndOfFile)
        };

        self.advance()?;

        result
    }

    /// Test if the current token equals the expected string.
    /// If it matches, consume it and return `Ok(())`.
    /// Otherwise return a [`ParserError::UnexpectedToken`] containing the expected and the actual token.
    /// Note that the current token is undefined before calling `advance` the first time.
    pub fn expect_str(&mut self, s: &str) -> Result<(), ParserError<I::Item>> {
        match &self.current_token {
            None => Err(ParserError::UnexpectedEndOfFile)?,
            Some(token) => {
                if token.iter().copied().eq(s.chars()) {
                    self.advance()?;
                    Ok(())
                } else {
                    Err(ParserError::UnexpectedToken(
                        s.chars().collect(),
                        token.clone(),
                    ))
                }
            }
        }
    }

    /// Test if the current token matches the string.
    /// The token is consumed only if it matches.
    pub fn test_str(&mut self, s: &str) -> Result<bool, ParserError<I::Item>> {
        let result = self.peeking_test_str(s)?;
        if result {
            self.advance()?;
        }
        Ok(result)
    }

    /// Test if the current token matches the string.
    /// The token is not consumed.
    pub fn peeking_test_str(&mut self, s: &str) -> Result<bool, ParserError<I::Item>> {
        match &self.current_token {
            None => Err(ParserError::UnexpectedEndOfFile)?,
            Some(token) => Ok(token.iter().copied().eq(s.chars())),
        }
    }

    /// Consume all tokens up to and including `s`.
    pub fn skip_until_str(&mut self, s: &str) -> Result<(), ParserError<I::Item>> {
        while !self.test_str(s)? {
            self.advance()?;
        }
        Ok(())
    }
}

/// Split a stream of characters into tokens.
/// How tokens are delimited (and whether comments are skipped) is defined by the `lexer`.
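///
/// # Example
///
/// A minimal sketch, assuming a simple whitespace-splitting lexer
/// (its implementation is hidden here; `WhitespaceLexer` is not part of this crate):
///
/// ```
/// # use itertools::PeekingNext;
/// # use libreda_stream_parser::*;
/// # struct WhitespaceLexer;
/// # impl Lexer for WhitespaceLexer {
/// #     type Char = char;
/// #     fn consume_next_token(
/// #         &mut self,
/// #         input: &mut (impl Iterator<Item = Self::Char> + PeekingNext),
/// #         mut output: impl FnMut(Self::Char),
/// #     ) -> Result<(), ParserError<char>> {
/// #         // Skip whitespace, then emit one whitespace-delimited token.
/// #         while input.peeking_next(|c| c.is_whitespace()).is_some() {}
/// #         while let Some(c) = input.peeking_next(|c| !c.is_whitespace()) {
/// #             output(c);
/// #         }
/// #         Ok(())
/// #     }
/// # }
/// let tk = tokenize("a b c".chars(), WhitespaceLexer);
///
/// // `Tokenized` also implements `Iterator` over the tokens.
/// let tokens: Vec<String> = tk.map(|t| t.into_iter().collect()).collect();
/// assert_eq!(tokens, vec!["a", "b", "c"]);
/// ```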
pub fn tokenize<I, L>(iter: I, lexer: L) -> Tokenized<Peekable<I>, L>
where
    I: Iterator<Item = char>,
{
    Tokenized {
        iter: iter.peekable(),
        lexer,
        has_current: false,
        current_token: None,
    }
}

/// Error type issued by the lexer and parser.
#[derive(Clone, Debug)]
pub enum ParserError<C: 'static> {
    /// Reached the end of the input before parsing was complete.
    UnexpectedEndOfFile,
    /// Expected and actual token.
    UnexpectedToken(Vec<C>, Vec<C>),
    /// Literal which could not be parsed. The literal is given as a sequence of characters.
    InvalidLiteral(Vec<C>),
    /// Failed to parse an integer.
    ParseIntError(ParseIntError),
}

impl<C: 'static + fmt::Display + fmt::Debug> Error for ParserError<C> {}

impl<C: fmt::Display + fmt::Debug> fmt::Display for ParserError<C> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            ParserError::UnexpectedEndOfFile => write!(f, "Unexpected end of file."),
            // The variant stores the expected token first, then the actual one.
            ParserError::UnexpectedToken(expected, actual) => {
                write!(f, "Unexpected token: '{actual:?}' instead of '{expected:?}'")
            }
            ParserError::InvalidLiteral(n) => write!(f, "Invalid literal: '{n:?}'."),
            ParserError::ParseIntError(e) => write!(f, "Illegal integer: '{e:?}'"),
        }
    }
}

impl<C> From<ParseIntError> for ParserError<C> {
    fn from(e: ParseIntError) -> Self {
        Self::ParseIntError(e)
    }
}

#[test]
fn test_tokenize_simple() {
    use itertools::Itertools;

    struct MyLexer {}

    impl Lexer for MyLexer {
        type Char = char;

        fn consume_next_token(
            &mut self,
            input: &mut (impl Iterator<Item = Self::Char> + PeekingNext),
            mut output: impl FnMut(Self::Char),
        ) -> Result<(), ParserError<char>> {
            if let Some(c) = input.next() {
                output(c);
                let take_whitespace = c.is_whitespace();

                // Extend the token with characters of the same kind:
                // either all whitespace or all non-whitespace.
                input
                    .peeking_take_while(|c| c.is_whitespace() == take_whitespace)
                    .for_each(output);
            }

            Ok(())
        }
    }

    let data = "here \n are \t some words  ";

    let mut tk = tokenize(data.chars(), MyLexer {});

    // This lexer also emits runs of whitespace as tokens,
    // hence the `tk.next()` calls below to skip them.
    tk.advance().unwrap();
    tk.expect_str("here").unwrap();
    tk.next();
    tk.expect_str("are").unwrap();
    tk.next();
    tk.expect_str("some").unwrap();
    tk.next();
    tk.expect_str("words").unwrap();
}
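
// An additional sketch exercising `peeking_test_str`, `test_str` and `skip_until_str`,
// with a minimal whitespace-splitting lexer defined locally for self-containment.
#[test]
fn test_peek_and_skip() {
    struct WhitespaceLexer;

    impl Lexer for WhitespaceLexer {
        type Char = char;

        fn consume_next_token(
            &mut self,
            input: &mut (impl Iterator<Item = Self::Char> + PeekingNext),
            mut output: impl FnMut(Self::Char),
        ) -> Result<(), ParserError<char>> {
            // Skip leading whitespace, then emit one whitespace-delimited token.
            while input.peeking_next(|c| c.is_whitespace()).is_some() {}
            while let Some(c) = input.peeking_next(|c| !c.is_whitespace()) {
                output(c);
            }
            Ok(())
        }
    }

    let mut tk = tokenize("a b c d".chars(), WhitespaceLexer);
    tk.advance().unwrap();

    // Peeking does not consume the token.
    assert!(tk.peeking_test_str("a").unwrap());
    assert_eq!(tk.current_token_str().as_deref(), Some("a"));

    // `test_str` consumes the token only if it matches.
    assert!(!tk.test_str("b").unwrap());
    assert!(tk.test_str("a").unwrap());

    // Skip everything up to and including "c".
    tk.skip_until_str("c").unwrap();
    tk.expect_str("d").unwrap();
}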