// bracoxide/tokenizer.rs

/*
 * This file is part of bracoxide.
 *
 * bracoxide is under MIT license.
 *
 * Copyright (c) 2023 A. Taha Baki <atahabaki@pm.me>
 */
//! Provides functionality for tokenizing input strings. It defines the
//! [Token] enum, which represents individual tokens produced during tokenization, and the
//! [tokenize] function, which converts an input string into a sequence of tokens.
//!
//! ## Usage
//!
//! To tokenize a string, use the [tokenize] function. It takes an input string as a parameter
//! and returns a `Result<Vec<Token>, TokenizationError>`. If successful, it returns a vector
//! of tokens representing the input string. If an error occurs during tokenization, it returns
//! a [TokenizationError] indicating the specific error encountered.
//!
//! The [Token] enum represents different types of tokens, such as opening braces, closing braces,
//! commas, text, numbers, and ranges. Each variant of the enum provides additional information
//! related to the token, such as the position of the token in the input string.
use std::sync::Arc;

/// A single lexical unit produced while tokenizing a brace-expansion string.
///
/// Every variant records the position at which the token starts in the
/// input; `Text` and `Number` additionally carry the matched value wrapped
/// in an [`Arc`] so tokens can be cloned cheaply.
#[derive(Debug, PartialEq, Clone)]
pub enum Token {
    /// An opening brace `{` at the given position.
    OBra(usize),
    /// A closing brace `}` at the given position.
    CBra(usize),
    /// A comma `,` at the given position.
    Comma(usize),
    /// A run of non-numeric text at the given position.
    ///
    /// The associated `String` holds the text value.
    Text(Arc<String>, usize),
    /// A run of digits at the given position.
    ///
    /// The associated `String` holds the numeric value.
    Number(Arc<String>, usize),
    /// The range operator `..` at the given position.
    Range(usize),
}
/// The errors that can occur during tokenization.
///
/// # Example
///
/// ```rust
/// use bracoxide::tokenizer::TokenizationError;
///
/// let content = "{a, b, c, d";
/// let tokenization_result = bracoxide::tokenizer::tokenize(content);
/// assert_eq!(tokenization_result, Err(TokenizationError::FormatNotSupported));
/// ```
#[derive(Debug, PartialEq, Clone)]
pub enum TokenizationError {
    /// The content to be tokenized is empty.
    EmptyContent,
    /// The input content has an unsupported format (e.g., only an opening
    /// brace or only a closing brace).
    FormatNotSupported,
    /// The input content does not contain any braces.
    NoBraces,
}

impl std::fmt::Display for TokenizationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the message first, then write it once.
        let message = match self {
            TokenizationError::EmptyContent => "Content is empty.",
            TokenizationError::FormatNotSupported => {
                "Only opening brace or closing brace is used."
            }
            TokenizationError::NoBraces => "No braces have been used.",
        };
        write!(f, "{}", message)
    }
}

impl std::error::Error for TokenizationError {}
90/// Tokenizes the provided content string and produces a vector of tokens.
91///
92/// This function is part of the `bracoxide` crate and is used to tokenize a given string `content`.
93/// The tokenization process splits the string into meaningful units called tokens, which can be
94/// further processed or analyzed as needed.
95///
96/// # Arguments
97///
98/// * `content` - The string to be tokenized.
99///
100/// # Returns
101///
102/// * `Result<Vec<Token>, TokenizationError>` - A result that contains a vector of tokens if the tokenization
103///   is successful, or a [TokenizationError] if an error occurs during the tokenization process.
104///
105/// # Errors
106///
107/// The function can return the following errors:
108///
109/// * [TokenizationError::EmptyContent] - If the `content` string is empty.
110/// * [TokenizationError::NoBraces] - If the `content` string does not contain any braces.
111/// * [TokenizationError::FormatNotSupported] - If the `content` string has an unsupported format, such as
112///   only an opening brace or closing brace without a corresponding pair.
113///
114/// # Examples
115///
116/// ```
117/// use bracoxide::tokenizer::{Token, TokenizationError, tokenize};
118///
119/// let content = "{1, 2, 3}";
120/// let tokens = tokenize(content);
121///
122/// match tokens {
123///     Ok(tokens) => {
124///         println!("Tokenization successful!");
125///         for token in tokens {
126///             println!("{:?}", token);
127///         }
128///     }
129///     Err(error) => {
130///         eprintln!("Tokenization failed: {:?}", error);
131///     }
132/// }
133/// ```
134///
135/// In this example, the `tokenize` function from the `bracoxide` crate is used to tokenize the content string "{1, 2, 3}".
136/// If the tokenization is successful, the resulting tokens are printed. Otherwise, the corresponding error is displayed.
137pub fn tokenize(content: &str) -> Result<Vec<Token>, TokenizationError> {
138    if content.is_empty() {
139        return Err(TokenizationError::EmptyContent);
140    }
141    let mut tokens = Vec::<Token>::new();
142    let mut is_escape = false;
143    // opening, closing
144    let mut count = (0_usize, 0_usize);
145    // text_buffer, number_buffer
146    let mut buffers = (String::new(), String::new());
147    let mut iter = content.chars().enumerate();
148    let tokenize_text_buffer = |tokens: &mut Vec<Token>, buffers: &mut (String, String), i| {
149        if !buffers.0.is_empty() {
150            tokens.push(Token::Text(
151                Arc::new(buffers.0.clone()),
152                i - buffers.0.len(),
153            ));
154            buffers.0.clear();
155        }
156    };
157    let tokenize_number_buffer = |tokens: &mut Vec<Token>, buffers: &mut (String, String), i| {
158        if !buffers.1.is_empty() {
159            tokens.push(Token::Number(
160                Arc::new(buffers.1.clone()),
161                i - buffers.1.len(),
162            ));
163            buffers.1.clear();
164        }
165    };
166    // Push buffers into tokens.
167    let tokenize_buffers = |tokens: &mut Vec<Token>, buffers: &mut (String, String), i| {
168        tokenize_text_buffer(tokens, buffers, i);
169        tokenize_number_buffer(tokens, buffers, i);
170    };
171    while let Some((i, c)) = iter.next() {
172        match (c, is_escape) {
173            (_, true) => {
174                if !buffers.1.is_empty() {
175                    buffers.0.push_str(&buffers.1);
176                    buffers.1.clear();
177                }
178                buffers.0.push(c);
179                buffers.1.clear();
180                is_escape = false;
181            }
182            ('\\', false) => is_escape = true,
183            // @1: COMMENT
184            // Look it is '{' OR '}' OR ','
185            // No other c value can pass this match ARM
186            // And now look to @2
187            ('{' | '}' | ',', _) => {
188                tokenize_buffers(&mut tokens, &mut buffers, i);
189                match c {
190                    '{' => {
191                        count.0 += 1;
192                        tokens.push(Token::OBra(i));
193                    }
194                    '}' => {
195                        count.1 += 1;
196                        tokens.push(Token::CBra(i));
197                    }
198                    ',' => tokens.push(Token::Comma(i)),
199                    // @2: COMMENT
200                    // Look @1 the above catch, you see
201                    // c can be just '{' OR '}' OR ','.
202                    // AND Why the god damn rust wants me to handle all cases,
203                    // Where I got covered all cases above.
204                    _ => unreachable!(),
205                }
206            }
207            ('.', _) => {
208                let mut r_iter = iter.clone();
209                if let Some((_ix, cx)) = r_iter.next() {
210                    match cx {
211                        '.' if count.0 == count.1 => {
212                            buffers.0.push(c);
213                            buffers.0.push(cx);
214                            tokenize_buffers(&mut tokens, &mut buffers, i + 2);
215                            iter = r_iter;
216                        }
217                        '.' => {
218                            tokenize_buffers(&mut tokens, &mut buffers, i);
219                            tokens.push(Token::Range(i));
220                            iter = r_iter;
221                            continue;
222                        }
223                        _ => {
224                            tokenize_number_buffer(&mut tokens, &mut buffers, i);
225                            buffers.0.push(c);
226                        }
227                    }
228                } else {
229                    buffers.0.push(c);
230                }
231            }
232            ('0'..='9', _) => {
233                tokenize_text_buffer(&mut tokens, &mut buffers, i);
234                buffers.1.push(c);
235            }
236            _ => {
237                tokenize_number_buffer(&mut tokens, &mut buffers, i);
238                buffers.0.push(c);
239            }
240        }
241    }
242    match count {
243        (0, 0) => return Err(TokenizationError::NoBraces),
244        (0, _) | (_, 0) => return Err(TokenizationError::FormatNotSupported),
245        (_, _) => (),
246    }
247    tokenize_buffers(&mut tokens, &mut buffers, content.len());
248    Ok(tokens)
249}
250
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building a `Token::Text`.
    fn text(s: &str, at: usize) -> Token {
        Token::Text(Arc::new(s.to_string()), at)
    }

    /// Shorthand for building a `Token::Number`.
    fn number(s: &str, at: usize) -> Token {
        Token::Number(Arc::new(s.to_string()), at)
    }

    #[test]
    fn test_empty_content() {
        assert_eq!(tokenize(""), Err(TokenizationError::EmptyContent));
        assert_eq!(
            tokenize(String::new().as_str()),
            Err(TokenizationError::EmptyContent)
        );
    }

    #[test]
    fn test_double_dots_noerror() {
        // A leading ".." outside braces is plain text, not a range operator.
        assert_eq!(
            tokenize("..{a,b}"),
            Ok(vec![
                text("..", 0),
                Token::OBra(2),
                text("a", 3),
                Token::Comma(4),
                text("b", 5),
                Token::CBra(6),
            ])
        )
    }

    #[test]
    fn test_no_braces() {
        assert_eq!(tokenize("a"), Err(TokenizationError::NoBraces));
        assert_eq!(tokenize("1..3"), Err(TokenizationError::NoBraces));
        assert_eq!(tokenize("a,b"), Err(TokenizationError::NoBraces));
        assert_eq!(
            tokenize("arst1..3.(arst)xt"),
            Err(TokenizationError::NoBraces)
        );
    }

    #[test]
    fn test_format_not_supported() {
        // Braces on only one side are rejected.
        assert_eq!(
            tokenize("{a, b, c, d"),
            Err(TokenizationError::FormatNotSupported)
        );
        assert_eq!(
            tokenize("{{a, b, c, d"),
            Err(TokenizationError::FormatNotSupported)
        );
        assert_eq!(
            tokenize("a, b, c, d}}"),
            Err(TokenizationError::FormatNotSupported)
        );
        assert_eq!(
            tokenize("a{, b{, c{, d{"),
            Err(TokenizationError::FormatNotSupported)
        );
    }

    #[test]
    fn test_tokenize_single_brace_expansion() {
        assert_eq!(
            tokenize("A{1..3}"),
            Ok(vec![
                text("A", 0),
                Token::OBra(1),
                number("1", 2),
                Token::Range(3),
                number("3", 5),
                Token::CBra(6),
            ])
        );
        assert_eq!(
            tokenize("{AB12}"),
            Ok(vec![
                Token::OBra(0),
                text("AB", 1),
                number("12", 3),
                Token::CBra(5),
            ])
        );
        assert_eq!(
            tokenize("{12AB}"),
            Ok(vec![
                Token::OBra(0),
                number("12", 1),
                text("AB", 3),
                Token::CBra(5),
            ])
        );
    }

    #[test]
    fn test_tokenize_multiple_brace_expansions() {
        assert_eq!(
            tokenize("A{1,2}..B{3,4}"),
            Ok(vec![
                text("A", 0),
                Token::OBra(1),
                number("1", 2),
                Token::Comma(3),
                number("2", 4),
                Token::CBra(5),
                text("..", 6),
                text("B", 8),
                Token::OBra(9),
                number("3", 10),
                Token::Comma(11),
                number("4", 12),
                Token::CBra(13),
            ])
        );
    }

    #[test]
    fn test_tokenize() {
        // Numeric range.
        assert_eq!(
            tokenize("{1..3}"),
            Ok(vec![
                Token::OBra(0),
                number("1", 1),
                Token::Range(2),
                number("3", 4),
                Token::CBra(5)
            ])
        );

        // Comma-separated list.
        assert_eq!(
            tokenize("{a,b,c}"),
            Ok(vec![
                Token::OBra(0),
                text("a", 1),
                Token::Comma(2),
                text("b", 3),
                Token::Comma(4),
                text("c", 5),
                Token::CBra(6)
            ])
        );

        // Two expansions joined by a textual "..".
        assert_eq!(
            tokenize("A{1..3}..B{2,5}"),
            Ok(vec![
                text("A", 0),
                Token::OBra(1),
                number("1", 2),
                Token::Range(3),
                number("3", 5),
                Token::CBra(6),
                text("..", 7),
                text("B", 9),
                Token::OBra(10),
                number("2", 11),
                Token::Comma(12),
                number("5", 13),
                Token::CBra(14)
            ])
        );
    }

    #[test]
    fn test_dots() {
        assert_eq!(
            tokenize("{1..3}"),
            Ok(vec![
                Token::OBra(0),
                number("1", 1),
                Token::Range(2),
                number("3", 4),
                Token::CBra(5),
            ])
        );
        // Single dots separate number runs as text.
        assert_eq!(
            tokenize("{1.2.3,b}"),
            Ok(vec![
                Token::OBra(0),
                number("1", 1),
                text(".", 2),
                number("2", 3),
                text(".", 4),
                number("3", 5),
                Token::Comma(6),
                text("b", 7),
                Token::CBra(8),
            ])
        );
        // Single dots between letters stay inside one text run.
        assert_eq!(
            tokenize("{a.b.c,d}"),
            Ok(vec![
                Token::OBra(0),
                text("a.b.c", 1),
                Token::Comma(6),
                text("d", 7),
                Token::CBra(8),
            ])
        );
    }

    #[test]
    fn test_numbers_with_proceeding_escapees_are_text_now() {
        assert_eq!(
            tokenize("1\\\\{a,b}"),
            Ok(vec![
                text("1\\", 1),
                Token::OBra(3),
                text("a", 4),
                Token::Comma(5),
                text("b", 6),
                Token::CBra(7),
            ])
        );
        assert_eq!(
            tokenize("1\\a{b,c}"),
            Ok(vec![
                text("1a", 1),
                Token::OBra(3),
                text("b", 4),
                Token::Comma(5),
                text("c", 6),
                Token::CBra(7),
            ])
        );
        assert_eq!(
            tokenize("{1\\2,3\\\\{4\\5,6\\7}}"),
            Ok(vec![
                Token::OBra(0),
                text("12", 2),
                Token::Comma(4),
                text("3\\", 6),
                Token::OBra(8),
                text("45", 10),
                Token::Comma(12),
                text("67", 14),
                Token::CBra(16),
                Token::CBra(17),
            ])
        );
    }
}