// asn1rs_model/parser.rs
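//! Tokenizer for ASN.1 definitions: splits the input into `Token::Text` and
//! `Token::Separator` values tagged with source locations, skipping `--` line
//! comments and nested `/* ... */` block comments.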

use std::fmt::{Display, Formatter};

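/// Line/column position of a token within the parsed input; the `Tokenizer`
/// emits 1-based values.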
#[derive(Debug, Default, Copy, Clone, PartialOrd, PartialEq, Eq)]
pub struct Location {
    line: usize,
    column: usize,
}

impl Location {
    pub const fn at(line: usize, column: usize) -> Location {
        Self { line, column }
    }

    pub const fn line(&self) -> usize {
        self.line
    }

    pub const fn column(&self) -> usize {
        self.column
    }
}

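/// A lexical unit produced by the `Tokenizer`: either a run of non-separator
/// text or a single separator character, each carrying its `Location`.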
#[derive(Debug, PartialOrd, PartialEq, Eq, Clone)]
pub enum Token {
    Text(Location, String),
    Separator(Location, char),
}

impl From<char> for Token {
    fn from(separator: char) -> Self {
        Token::Separator(Location::default(), separator)
    }
}

impl From<String> for Token {
    fn from(text: String) -> Self {
        Token::Text(Location::default(), text)
    }
}

impl Display for Token {
    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
        match self {
            Token::Text(_, text) => write!(f, "\"{}\"", text),
            Token::Separator(_, separator) => write!(f, "\'{}\'", separator),
        }
    }
}

impl Token {
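    /// Merges two adjacent `Text` tokens into one, keeping the location of the
    /// first; any other pairing is returned unchanged as `(self, Some(other))`.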
    fn append(self, other: Token) -> (Token, Option<Token>) {
        match (self, other) {
            (Token::Text(location, mut text), Token::Text(_, other)) => (
                Token::Text(location, {
                    text.push_str(&other);
                    text
                }),
                None,
            ),
            (a, b) => (a, Some(b)),
        }
    }

    pub fn location(&self) -> Location {
        match self {
            Token::Text(location, _) => *location,
            Token::Separator(location, _) => *location,
        }
    }

    pub fn eq_text(&self, text: &str) -> bool {
        self.text().map(|t| t.eq(text)).unwrap_or(false)
    }

    pub fn eq_text_ignore_ascii_case(&self, text: &str) -> bool {
        self.text()
            .map(|t| t.eq_ignore_ascii_case(text))
            .unwrap_or(false)
    }

    pub fn eq_separator(&self, separator: char) -> bool {
        self.separator().map(|s| s == separator).unwrap_or(false)
    }

    pub fn text(&self) -> Option<&str> {
        match self {
            Token::Text(_, text) => Some(text),
            _ => None,
        }
    }

    pub fn separator(&self) -> Option<char> {
        match self {
            Token::Separator(_, char) => Some(*char),
            _ => None,
        }
    }

    pub fn is_text(&self) -> bool {
        self.text().is_some()
    }

    pub fn is_separator(&self) -> bool {
        self.separator().is_some()
    }

    pub fn into_text(self) -> Option<String> {
        if let Token::Text(_, text) = self {
            Some(text)
        } else {
            None
        }
    }

    pub fn into_text_or_else<E, F: FnOnce(Token) -> E>(self, f: F) -> Result<String, E> {
        match self {
            Token::Text(_, text) => Ok(text),
            token => Err(f(token)),
        }
    }

    pub fn into_separator_or_else<E, F: FnOnce(Token) -> E>(self, f: F) -> Result<char, E> {
        match self {
            Token::Separator(_, separator) => Ok(separator),
            token => Err(f(token)),
        }
    }
}

#[derive(Default)]
pub struct Tokenizer;

impl Tokenizer {
    /// Tokenizes the given ASN.1 string, parsing it line by line and character
    /// by character.
    ///
    /// Comments are excluded as defined in clauses 12.6.2 to 12.6.4 of
    /// ITU-T Rec. X.680 (02/2021): single-line comments introduced by `--` are
    /// ignored up to the end of the line, and `/* ... */` block comments are
    /// ignored as well. A block comment only terminates once a matching `*/`
    /// has been found for every `/*`.
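    ///
    /// # Example
    ///
    /// A minimal usage sketch (marked `ignore` because the public path under
    /// which `Tokenizer` is re-exported is not shown here):
    ///
    /// ```ignore
    /// let tokens = Tokenizer::default().parse("Name ::= SEQUENCE { value INTEGER }");
    /// assert!(tokens[0].eq_text("Name"));
    /// assert!(tokens[1].eq_separator(':'));
    /// assert!(tokens[4].eq_text("SEQUENCE"));
    /// assert!(tokens[8].eq_separator('}'));
    /// ```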
    pub fn parse(&self, asn: &str) -> Vec<Token> {
        let mut previous = None; // token currently being assembled
        let mut tokens = Vec::new();
        let mut nest_lvl = 0; // Nest level of the comments

        for (line_0, line) in asn.lines().enumerate() {
            let mut token = None;
            let mut content_iterator = line.chars().enumerate().peekable();

            while let Some((column_0, char)) = content_iterator.next() {
                if nest_lvl > 0 {
                    match char {
                        '*' => {
                            if let Some((_, '/')) = content_iterator.peek() {
                                nest_lvl -= 1;
                                content_iterator.next(); // remove closing '/'
                            }
                        }
                        '/' => {
                            if let Some((_, '*')) = content_iterator.peek() {
                                nest_lvl += 1;
                                content_iterator.next(); // remove opening '*'
                            }
                        }
                        _ => {
                            if content_iterator.peek().is_none()
                                && line_0 == asn.lines().count() - 1
                            {
                                panic!("The file has unclosed comment blocks. Nested comment blocks are counted.");
                            } else {
                                continue;
                            }
                        }
                    }
                    continue;
                }
                // Get rid of one-line comments. Can also happen immediately after closing block comment
                if nest_lvl == 0
                    && char == '-'
                    && content_iterator.peek().map(|&(_, ch)| ch) == Some('-')
                {
                    content_iterator.next(); // remove second '-'
                    break; // ignore rest of the line
                }
                match char {
                    '/' if content_iterator.peek().map(|&(_, ch)| ch) == Some('*') => {
                        content_iterator.next(); // remove opening '*'
                        nest_lvl += 1;
                    }
                    // asn syntax
                    ':' | ';' | '=' | '(' | ')' | '{' | '}' | '.' | ',' | '[' | ']' | '\''
                    | '"' => {
                        token = Some(Token::Separator(
                            Location::at(line_0 + 1, column_0 + 1),
                            char,
                        ))
                    }
                    // text
                    c if !c.is_control() && c != ' ' => {
                        token = Some(Token::Text(
                            Location::at(line_0 + 1, column_0 + 1),
                            format!("{}", c),
                        ));
                    }
                    // text separator
                    ' ' | '\r' | '\n' | '\t' => {
                        if let Some(token) = previous.take() {
                            tokens.push(token);
                        }
                    }
                    c => eprintln!(
                        "Ignoring unexpected character: {}-0x{:02x}-{:03}",
                        c, c as u8, c as u8
                    ),
                }

                // merge the new token into the one being assembled; text runs
                // coalesce, anything else flushes the previous token
                if let Some(token) = token.take() {
                    previous = match previous {
                        None => Some(token),
                        Some(current) => {
                            let (token, second) = current.append(token);
                            match second {
                                None => Some(token),
                                Some(next) => {
                                    tokens.push(token);
                                    Some(next)
                                }
                            }
                        }
                    }
                }
            }

            if let Some(token) = previous.take() {
                tokens.push(token);
            }
        }

        if let Some(token) = previous {
            tokens.push(token);
        }

        tokens
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    pub fn test_separator_tokens_not_merged() {
        let result = Tokenizer.parse(":;=(){}.,[]");
        let mut iter = result.into_iter();
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator(';'));
        assert!(iter.next().unwrap().eq_separator('='));
        assert!(iter.next().unwrap().eq_separator('('));
        assert!(iter.next().unwrap().eq_separator(')'));
        assert!(iter.next().unwrap().eq_separator('{'));
        assert!(iter.next().unwrap().eq_separator('}'));
        assert!(iter.next().unwrap().eq_separator('.'));
        assert!(iter.next().unwrap().eq_separator(','));
        assert!(iter.next().unwrap().eq_separator('['));
        assert!(iter.next().unwrap().eq_separator(']'));
        assert!(iter.next().is_none());
    }

    #[test]
    pub fn test_text_between_separators_is_represented_as_one_text_token() {
        let result = Tokenizer.parse("::=ASN{");
        let mut iter = result.into_iter();
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator('='));
        assert!(iter.next().unwrap().eq_text("ASN"));
        assert!(iter.next().unwrap().eq_separator('{'));
        assert!(iter.next().is_none());
    }

    #[test]
    pub fn test_invisible_separator_characters() {
        let result = Tokenizer.parse("a b\rc\nd\te AB\rCD\nEF\tGH aa  bb\r\rcc\n\ndd\t\tee");
        let mut iter = result.into_iter();
        assert!(iter.next().unwrap().eq_text("a"));
        assert!(iter.next().unwrap().eq_text("b"));
        assert!(iter.next().unwrap().eq_text("c"));
        assert!(iter.next().unwrap().eq_text("d"));
        assert!(iter.next().unwrap().eq_text("e"));
        assert!(iter.next().unwrap().eq_text("AB"));
        assert!(iter.next().unwrap().eq_text("CD"));
        assert!(iter.next().unwrap().eq_text("EF"));
        assert!(iter.next().unwrap().eq_text("GH"));
        assert!(iter.next().unwrap().eq_text("aa"));
        assert!(iter.next().unwrap().eq_text("bb"));
        assert!(iter.next().unwrap().eq_text("cc"));
        assert!(iter.next().unwrap().eq_text("dd"));
        assert!(iter.next().unwrap().eq_text("ee"));
        assert!(iter.next().is_none());
    }

    #[test]
    pub fn test_token_text() {
        let token = Token::from("some text".to_string());
        assert_eq!(token.text(), Some("some text"));
        assert_eq!(token.separator(), None);
    }

    #[test]
    pub fn test_control_char_is_ignored() {
        let result = Tokenizer.parse("AS\x00N");
        let mut iter = result.into_iter();
        assert!(iter.next().unwrap().eq_text("ASN"));
        assert!(iter.next().is_none());
    }

    #[test]
    pub fn test_token_separator() {
        let token = Token::from(':');
        assert_eq!(token.text(), None);
        assert_eq!(token.separator(), Some(':'));
    }

    #[test]
    pub fn test_ignores_line_comments() {
        let result = Tokenizer::default().parse(
            r"
                Some ::= None -- very clever
                        -- ignore true ::= false
        ",
        );
        let mut iter = result.into_iter();
        assert!(iter.next().unwrap().eq_text("Some"));
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator('='));
        assert!(iter.next().unwrap().eq_text("None"));
        assert!(iter.next().is_none());
    }

    #[test]
    pub fn test_ignores_multiline_comments() {
        let result = Tokenizer::default().parse(
            r"
            ASN1 DEFINITION ::= BEGIN
            /* This is a comment */
            -- This is also a comment
            SomeTypeDef ::= SEQUENCE {
            /* Nested comment level 1
               /* Nested comment -- level 2 */
            still in level 1 comment */
            integer INTEGER
            }
            END",
        );
        let mut iter = result.into_iter();
        assert!(iter.next().unwrap().eq_text("ASN1"));
        assert!(iter.next().unwrap().eq_text("DEFINITION"));
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator('='));
        assert!(iter.next().unwrap().eq_text("BEGIN"));
        assert!(iter.next().unwrap().eq_text("SomeTypeDef"));
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator(':'));
        assert!(iter.next().unwrap().eq_separator('='));
        assert!(iter.next().unwrap().eq_text("SEQUENCE"));
        assert!(iter.next().unwrap().eq_separator('{'));
        assert!(iter.next().unwrap().eq_text("integer"));
        assert!(iter.next().unwrap().eq_text("INTEGER"));
        assert!(iter.next().unwrap().eq_separator('}'));
        assert!(iter.next().unwrap().eq_text("END"));
        assert!(iter.next().is_none());
    }

    #[test]
    #[should_panic(
        expected = "The file has unclosed comment blocks. Nested comment blocks are counted."
    )]
    pub fn test_unclosed_comment() {
        let _ = Tokenizer::default().parse(
            r"
            ASN1 DEFINITION ::= BEGIN
            /* This is a comment
            SomeTypeDef ::= SEQUENCE {
            /* Nested comment level 1
               /* Nested comment -- level 2 */
            still in level 1 comment */
            integer INTEGER
            }
            END",
        );
    }

    #[test]
    pub fn test_token_is_separator() {
        assert!(Token::Separator(Location::default(), ',').is_separator());
    }

    #[test]
    pub fn test_token_is_text() {
        assert!(Token::Text(Location::default(), String::default()).is_text());
    }

    #[test]
    pub fn test_token_location_separator() {
        let location = Location::at(42, 1337);
        assert_eq!(location, Token::Separator(location, ',').location());
    }

    #[test]
    pub fn test_token_location_text() {
        let location = Location::at(42, 1337);
        assert_eq!(
            location,
            Token::Text(location, String::default()).location()
        );
    }

    #[test]
    pub fn test_token_eq_text() {
        assert!(Token::Text(Location::default(), "aBc".to_string()).eq_text("aBc"));
        assert!(!Token::Text(Location::default(), "aBc".to_string()).eq_text("abc"));
        assert!(!Token::Text(Location::default(), "aBc".to_string()).eq_text("cde"));
    }

    #[test]
    pub fn test_token_eq_text_ignore_ascii_case() {
        assert!(
            Token::Text(Location::default(), "aBc".to_string()).eq_text_ignore_ascii_case("aBc")
        );
        assert!(
            Token::Text(Location::default(), "aBc".to_string()).eq_text_ignore_ascii_case("abc")
        );
        assert!(
            !Token::Text(Location::default(), "aBc".to_string()).eq_text_ignore_ascii_case("cde")
        );
    }

    #[test]
    pub fn test_token_display_text() {
        assert_eq!(
            "\"The text\"",
            format!(
                "{}",
                Token::Text(Location::default(), "The text".to_string())
            )
        );
    }

    #[test]
    pub fn test_token_display_separator() {
        assert_eq!(
            "'.'",
            format!("{}", Token::Separator(Location::default(), '.'))
        );
    }

    #[test]
    pub fn test_token_into_text_none() {
        assert_eq!(None, Token::Separator(Location::default(), '.').into_text());
    }

    #[test]
    pub fn test_token_into_text_or_else_succeed() {
        assert_eq!(
            Ok("SEQUENCE".to_string()),
            Token::Text(Location::default(), "SEQUENCE".to_string())
                .into_text_or_else(|_| unreachable!())
        );
    }

    #[test]
    pub fn test_token_into_text_or_else_fail() {
        assert_eq!(
            Err(()),
            Token::Separator(Location::default(), '.').into_text_or_else(|_| ())
        );
    }

    #[test]
    pub fn test_token_into_separator_or_else_succeed() {
        assert_eq!(
            Ok('.'),
            Token::Separator(Location::default(), '.').into_separator_or_else(|_| unreachable!())
        );
    }

    #[test]
    pub fn test_token_into_separator_or_else_fail() {
        assert_eq!(
            Err(()),
            Token::Text(Location::default(), String::default()).into_separator_or_else(|_| ())
        );
    }
}