noa_parser/bytes/components/
groups.rs

1//! Group components
2
3use crate::bytes::token::Token;
4use crate::errors::ParseResult;
5use crate::peek::{PeekResult, Peekable};
6use crate::recognizer::Recognizable;
7use crate::scanner::Scanner;
8
9/// Try to recognize either a start group or an end group token.
10///
11/// If the start group token is recognized, increment the balancing counter.
12/// If the end group token is recognized, decrement the balancing counter.
13/// If neither is recognized, move the tokenizer by one byte.
14///
15/// # Arguments
16///
17/// * `tokenizer` - The tokenizer to use
18/// * `balance` - A mutable reference to the balancing counter
19/// * `start` - The start group token to recognize
20/// * `end` - The end group token to recognize
21///
22/// # Errors
23///
24/// Returns `Err(ParseError)` if the tokenizer encounters an error.
25///
26/// # Examples
27///
28///
29pub fn match_for_balanced_group<'a, V1, T1, V2, T2>(
30    tokenizer: &mut Scanner<'a, u8>,
31    balance: &mut usize,
32    start: T1,
33    end: T2,
34) -> ParseResult<()>
35where
36    T1: Recognizable<'a, u8, V1> + Copy,
37    T2: Recognizable<'a, u8, V2> + Copy,
38{
39    // try to recognize start group token
40    match start.recognize(tokenizer)? {
41        // if not start token try to recognize end group token
42        None => match end.recognize(tokenizer)? {
43            // if end group token decrement balancing counter
44            Some(_end_token) => *balance -= 1,
45            // if neither, move by one byte
46            None => {
47                tokenizer.bump_by(1);
48                return Ok(());
49            }
50        },
51        // if start group token increment balancing counter
52        Some(_start_token) => *balance += 1,
53    };
54
55    Ok(())
56}
57
58/// A closure that takes a slice of bytes and returns a `PeekResult` indicating
59/// whether the slice matches a balanced group.
60///
61/// A balanced group is a sequence of bytes that has the same number of start
62/// and end group tokens. The start group token is recognized by the `start`
63/// parameter and the end group token is recognized by the `end` parameter.
64///
65/// The closure returns `Ok(PeekResult::Found { end_slice, start, end })` if the
66/// slice matches a balanced group, `Ok(PeekResult::NotFound)` if the slice
67/// does not match a balanced group, and `Err(ParseError)` if there is an error
68/// recognizing the tokens.
69///
70/// # Arguments
71///
72/// * `start` - The start group token to recognize
73/// * `end` - The end group token to recognize
74///
75/// # Returns
76///
77/// A closure that takes a slice of bytes and returns a `PeekResult` indicating
78/// whether the slice matches a balanced group.
79pub fn match_group<'a, V1, T1, V2, T2>(
80    start: T1,
81    end: T2,
82) -> impl Fn(&'a [u8]) -> ParseResult<PeekResult<T1, T2>> + 'a
83where
84    T1: Recognizable<'a, u8, V1> + Copy + 'a,
85    T2: Recognizable<'a, u8, V2> + Copy + 'a,
86{
87    move |input: &'a [u8]| {
88        // 0 if number of start token equals number of end token
89        // i.e: ( 5 + 3 - ( 10 * 8 ) ) => 2 "(" and 2 ")" => balanced
90        //      ( 5 + 3 - ( 10 * 8 )   => 2 "(" and 1 ")" => unbalanced
91        let mut balance = 0;
92
93        let mut tokenizer = Scanner::new(input);
94
95        loop {
96            match_for_balanced_group(&mut tokenizer, &mut balance, start, end)?;
97            // if balancing is 0 then either there is no group at all or is balanced
98            if balance == 0 {
99                break;
100            }
101        }
102
103        // not enough bytes to create a group
104        if tokenizer.current_position() == 1 {
105            return Ok(PeekResult::NotFound);
106        }
107
108        Ok(PeekResult::Found {
109            end_slice: tokenizer.current_position(),
110            start,
111            end,
112        })
113    }
114}
115
/// A closure that takes a slice of bytes and returns a `PeekResult` indicating
/// whether the slice matches a delimited group.
///
/// A delimited group is a sequence of bytes that starts and ends with the same
/// token and has no other occurrence of that token in between. The token is
/// recognized by the `token` parameter and the escape token is recognized by the
/// `escape_token` parameter.
///
/// The closure returns `Ok(PeekResult::Found { end_slice, start, end })` if the
/// slice matches a delimited group, `Ok(PeekResult::NotFound)` if the slice
/// does not match a delimited group, and `Err(ParseError)` if there is an error
/// recognizing the tokens.
///
/// # Arguments
///
/// * `token` - The token to recognize at the start and end of the group
/// * `escape_token` - The escape token to recognize and ignore in the group
///
/// # Returns
///
/// A closure that takes a slice of bytes and returns a `PeekResult` indicating
/// whether the slice matches a delimited group.
pub fn match_for_delimited_group<'a, V, T, V2, T2>(
    token: T,
    escape_token: T2,
) -> impl Fn(&'a [u8]) -> ParseResult<PeekResult<T, T>> + 'a
where
    T: Recognizable<'a, u8, V> + Copy + 'a,
    T2: Recognizable<'a, u8, V2> + Copy + 'a,
{
    move |input: &'a [u8]| {
        // The group must be at least two tokens long (opening + closing).
        if input.len() < token.size() * 2 {
            return Ok(PeekResult::NotFound);
        }

        // Build a scanner over the input data.
        let mut tokenizer = Scanner::new(input);

        // The group must start with the delimiter token.
        if token.recognize(&mut tokenizer)?.is_none() {
            return Ok(PeekResult::NotFound);
        }
        // Advance past the recognized opening token.
        tokenizer.bump_by(token.size());

        // Set to true once the closing delimiter has been found.
        let mut found = false;

        // While the slice still contains bytes, try to recognize the
        // closing delimiter.
        while !tokenizer.remaining().is_empty() {
            // The delimiter was recognized somewhere in the slice.
            if token.recognize(&mut tokenizer)?.is_some() {
                // Build a new scanner positioned one token plus one escape
                // token back, to inspect the byte(s) just before the delimiter.
                // NOTE(review): this subtraction assumes
                // `current_position() >= token.size() + escape_token.size()`
                // at this point; otherwise it underflows — confirm against the
                // positioning guarantees of Scanner/Recognizable.
                let mut rewind_tokenizer = Scanner::new(
                    &tokenizer.data()
                        [tokenizer.current_position() - token.size() - escape_token.size()..],
                );
                // Try to recognize the escape token (e.g. `\`).
                if escape_token.recognize(&mut rewind_tokenizer)?.is_some() {
                    // The escape token is present: the delimiter is escaped,
                    // keep scanning.
                    continue;
                }
                // Otherwise the end of the group was reached: success.
                found = true;
                break;
            }
            // No delimiter here: advance by one byte.
            tokenizer.bump_by(1);
        }

        // The closing delimiter was never found.
        if !found {
            return Ok(PeekResult::NotFound);
        }

        Ok(PeekResult::Found {
            end_slice: tokenizer.current_position(),
            start: token,
            end: token,
        })
    }
}
199
/// Types of groups
///
/// This enum is used to specify the type of a group in a matcher.
// Standard derives added: a public enum of unit variants should be
// `Debug` for diagnostics and is trivially `Copy`/comparable/hashable.
// This is purely additive and backward-compatible for callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GroupKind {
    /// A group enclosed in parentheses
    Parenthesis,
    /// A group enclosed in single quotes
    Quotes,
    /// A group enclosed in double quotes
    DoubleQuotes,
}
211
212impl GroupKind {
213    fn matcher<'a>(&self) -> Box<dyn Fn(&'a [u8]) -> ParseResult<PeekResult<Token, Token>> + 'a>
214where {
215        match self {
216            GroupKind::Parenthesis => Box::new(match_group(Token::OpenParen, Token::CloseParen)),
217            GroupKind::Quotes => {
218                Box::new(match_for_delimited_group(Token::Quote, Token::Backslash))
219            }
220            GroupKind::DoubleQuotes => Box::new(match_for_delimited_group(
221                Token::DoubleQuote,
222                Token::Backslash,
223            )),
224        }
225    }
226}
227
228impl<'a> Peekable<'a, u8, Token, Token> for GroupKind {
229    fn peek(&self, data: &Scanner<'a, u8>) -> ParseResult<PeekResult<Token, Token>> {
230        self.matcher()(data.remaining())
231    }
232}
233
#[cfg(test)]
mod tests {
    use crate::bytes::components::groups::{GroupKind, match_for_delimited_group, match_group};
    use crate::bytes::token::Token;
    use crate::peek::{PeekResult, Peeking, peek};
    use crate::scanner::Scanner;

    #[test]
    fn test_match_group() {
        // A balanced parenthesized expression followed by trailing input.
        let input = b"( 5 + 3 - ( 10 * 8 ) ) + 54";
        let outcome =
            match_group(Token::OpenParen, Token::CloseParen)(input).expect("failed to parse");
        let expected = PeekResult::Found {
            end_slice: 22,
            start: Token::OpenParen,
            end: Token::CloseParen,
        };
        assert_eq!(outcome, expected);
        // The reported end offset covers exactly the balanced group.
        assert_eq!(&input[..22], b"( 5 + 3 - ( 10 * 8 ) )");
    }

    #[test]
    fn test_match_group_delimited() {
        // Same input, exercised through the `Peekable` implementation.
        let input = b"( 5 + 3 - ( 10 * 8 ) ) + 54";
        let mut scanner = Scanner::new(input);
        let outcome = peek(GroupKind::Parenthesis, &mut scanner).expect("failed to parse");
        let expected = Peeking {
            start: Token::OpenParen,
            end: Token::CloseParen,
            data: &input[0..22],
            end_slice: 22,
        };
        assert_eq!(outcome, Some(expected));
        assert_eq!(&input[..22], b"( 5 + 3 - ( 10 * 8 ) )");
    }

    #[test]
    fn test_match_quotes() {
        // Plain single-quoted group.
        let input = b"'hello world' data";
        let outcome = match_for_delimited_group(Token::Quote, Token::Backslash)(input)
            .expect("failed to parse");
        let expected = PeekResult::Found {
            end_slice: 13,
            start: Token::Quote,
            end: Token::Quote,
        };
        assert_eq!(outcome, expected);
        assert_eq!(&input[..13], b"'hello world'");

        // Single-quoted group containing a backslash-escaped quote.
        let input = r#"'hello world l\'éléphant' data"#;
        let outcome = match_for_delimited_group(Token::Quote, Token::Backslash)(input.as_bytes())
            .expect("failed to parse");
        let expected = PeekResult::Found {
            end_slice: 27,
            start: Token::Quote,
            end: Token::Quote,
        };
        assert_eq!(outcome, expected);
        assert_eq!(&input[..27], r#"'hello world l\'éléphant'"#);

        // Double-quoted group (escaped-string literal form).
        let input = "\"hello world\" data";
        let outcome =
            match_for_delimited_group(Token::DoubleQuote, Token::Backslash)(input.as_bytes())
                .expect("failed to parse");
        let expected = PeekResult::Found {
            end_slice: 13,
            start: Token::DoubleQuote,
            end: Token::DoubleQuote,
        };
        assert_eq!(outcome, expected);
        assert_eq!(&input[..13], "\"hello world\"");

        // Double-quoted group (raw-string literal form, identical bytes).
        let input = r#""hello world" data"#;
        let outcome =
            match_for_delimited_group(Token::DoubleQuote, Token::Backslash)(input.as_bytes())
                .expect("failed to parse");
        let expected = PeekResult::Found {
            end_slice: 13,
            start: Token::DoubleQuote,
            end: Token::DoubleQuote,
        };
        assert_eq!(outcome, expected);
        assert_eq!(&input[..13], r#""hello world""#);
    }
}