asn1_compiler/tokenizer/mod.rs

#![allow(unused_variables)]
//! Tokenizer for an ASN.1 module

#[macro_use]
pub(crate) mod types;

use crate::error::Error;
use anyhow::Result;

use types::TokenType;

// Keywords
const KEYWORDS: &[&str] = &[
    "ABSENT",
    "ABSTRACT-SYNTAX",
    "ALL",
    "APPLICATION",
    "AUTOMATIC",
    "BEGIN",
    "BIT",
    "BMPString",
    "BOOLEAN",
    "BY",
    "CHARACTER",
    "CHOICE",
    "CLASS",
    "COMPONENT",
    "COMPONENTS",
    "CONSTRAINED",
    "CONTAINING",
    "DEFAULT",
    "DEFINITIONS",
    "EMBEDDED",
    "ENCODED",
    "END",
    "ENUMERATED",
    "EXCEPT",
    "EXPLICIT",
    "EXPORTS",
    "EXTENSIBILITY",
    "EXTERNAL",
    "FALSE",
    "FROM",
    "GeneralizedTime",
    "GeneralString",
    "GraphicString",
    "IA5String",
    "IDENTIFIER",
    "IMPLIED",
    "IMPLICIT",
    "IMPORTS",
    "INCLUDES",
    "INSTANCE",
    "INTEGER",
    "INTERSECTION",
    "ISO646String",
    "MAX",
    "MIN",
    "MINUS-INFINITY",
    "NULL",
    "NumericString",
    "OBJECT",
    "ObjectDescriptor",
    "OCTET",
    "OF",
    "OPTIONAL",
    "PATTERN",
    "PDV",
69    "Plus-Infinity",
70    "PRESENT",
71    "PrintableString",
72    "PRIVATE",
73    "REAL",
74    "RELATIVE-OID",
75    "SEQUENCE",
76    "SET",
77    "SIZE",
78    "STRING",
79    "SYNTAX",
80    "T61String",
81    "TAGS",
82    "TeletexString",
83    "TRUE",
84    "TYPE-IDENTIFIER",
85    "UNION",
86    "UNIQUE",
87    "UNIVERSAL",
88    "UniversalString",
89    "UTCTime",
90    "UTF8String",
91    "VideotexString",
92    "VisibleString",
93    "WITH",
94];
95
// FIXME: Add other types
const BASE_TYPES: &[&str] = &[
    "INTEGER",
    "BOOLEAN",
    "ENUMERATED",
    "NULL",
    "UTF8String",
    "IA5String",
    "PrintableString",
    "VisibleString",
    "UTCTime",
    "GeneralizedTime",
    // Spliced types (Note: the actual ASN.1 type names are different.)
109    "OBJECT",
110    "OCTET",
111    "BIT",
112    "CHARACTER",
113    "REAL",
114];
115
116const CONSTRUCTED_TYPES: &[&str] = &["SEQUENCE", "SET", "CHOICE"];
117
118const WITH_SYNTAX_RESERVED_WORDS: &[&str] = &[
119    "BIT",
120    "BOOLEAN",
121    "CHARACTER",
122    "CHOICE",
123    "EMBEDDED",
124    "END",
125    "ENUMERATED",
126    "EXTERNAL",
127    "FALSE",
128    "INSTANCE",
129    "INTEGER",
130    "INTERSECTION",
131    "MINUS-INFINITY",
132    "NULL",
133    "OBJECT",
134    "PLUS-INFINITY",
135    "REAL",
136    "RELATIVE-OID",
137    "SEQUENCE",
138    "SET",
139    "TRUE",
140    "UNION",
141];
142
143/// Line and Column in the source where the token begins.
144#[derive(Debug, PartialEq, Copy, Clone)]
145pub(crate) struct LineColumn {
146    line: usize,
147    column: usize,
148}
149
150impl std::fmt::Display for LineColumn {
151    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
152        write!(f, "Line: {}, Column: {}", self.line, self.column)
153    }
154}
155
156impl LineColumn {
157    fn new(line: usize, column: usize) -> Self {
158        LineColumn { line, column }
159    }
160}
161
162/// Span of a Token in the ASN Source file.
163#[derive(Debug, Clone)]
164pub(crate) struct Span {
165    start: LineColumn,
166    end: LineColumn,
167}
168
169impl std::fmt::Display for Span {
170    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
171        write!(f, "Start:: {}, End:: {}", self.start, self.end)
172    }
173}
174
175impl Span {
176    fn new(start: LineColumn, end: LineColumn) -> Self {
177        Span { start, end }
178    }
179
180    pub(crate) fn start(&self) -> LineColumn {
181        self.start
182    }
183}
184
185/// A parsed token before AST is created.
186///
187/// Going through an ASN.1 module source results in a vector of parsed tokens of appropriate types.
188/// Each parsed token contains the 'type', where it is found in the source ('span') and the actual
189/// token string.
190///
/// The tokens are then used by the Parser to 'resolve' type and value definitions, which
/// generates the AST.
#[derive(Debug, Clone)]
pub struct Token {
    pub(crate) r#type: TokenType,
    pub(crate) span: Span,
    pub(crate) text: String,
}

impl Token {
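    // `create_is_tokentype_fns!` (from the `types` module) generates one `is_*` predicate
    // per (method name, `TokenType`) pair listed below.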
    create_is_tokentype_fns! {
        (is_curly_begin, TokenType::CurlyBegin),
        (is_curly_end, TokenType::CurlyEnd),
        (is_round_begin, TokenType::RoundBegin),
        (is_round_end, TokenType::RoundEnd),
        (is_exception_marker, TokenType::ExceptionMarker),
        (is_square_begin, TokenType::SquareBegin),
        (is_square_end, TokenType::SquareEnd),
        (is_addition_groups_begin, TokenType::AdditionGroupsBegin),
        (is_addition_groups_end, TokenType::AdditionGroupsEnd),
        (is_extension, TokenType::Extension),
        (is_range_separator, TokenType::RangeSeparator),
        (is_assignment, TokenType::Assignment),
        (is_colon, TokenType::Colon),
        (is_semicolon, TokenType::SemiColon),
        (is_identifier, TokenType::Identifier),
        (is_keyword, TokenType::Keyword),
        (is_comment, TokenType::Comment),
        (is_and_identifier, TokenType::AndIdentifier),
        (is_numeric, TokenType::NumberInt),
        (is_bitstring, TokenType::BitString),
        (is_hexstring, TokenType::HexString),
        (is_tstring, TokenType::TString),
        (is_dot, TokenType::Dot),
        (is_comma, TokenType::Comma),
        (is_set_union_token, TokenType::SetUnionToken),
        (is_set_intersection_token, TokenType::SetIntersectionToken),
        (is_at_component_list, TokenType::AtComponentIdList),
        (is_less_than, TokenType::LessThan),
    }

    // Checkers for ASN.1 Lexical Token types
    //
    /// Checks whether the current token is a 'valuereference'
    pub(crate) fn is_value_reference(&self) -> bool {
        self.is_identifier() && self.text.starts_with(char::is_lowercase)
    }

    /// Checks whether the current token is a 'typereference'
    pub(crate) fn is_type_reference(&self) -> bool {
        self.is_identifier() && self.text.starts_with(char::is_uppercase)
    }

    /// Checks whether the given token is a 'modulereference'
    pub(crate) fn is_module_reference(&self) -> bool {
        self.is_type_reference()
    }

    /// Checks whether the given token is an Object Class Reference
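    ///
    /// For example, `ATTRIBUTE` or `SOME-OBJECT-CLASS` would qualify, while `MyClass` would not,
    /// since it contains lowercase letters (illustrative examples).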
    pub(crate) fn is_object_class_reference(&self) -> bool {
        self.is_type_reference()
            && self
                .text
                .chars()
                .all(|c| matches!(c, 'A'..='Z' | '0'..='9' | '-'))
    }

    /// Checks whether the given token is an Object Reference
    pub(crate) fn is_object_reference(&self) -> bool {
        self.is_value_reference()
    }

    /// Checks whether the given token is an Object Set Reference
    pub(crate) fn is_object_set_reference(&self) -> bool {
        self.is_type_reference()
    }

    /// Checks whether the given identifier is a Type Field Reference
    pub(crate) fn is_type_field_reference(&self) -> bool {
        self.is_and_identifier() && self.text[1..].starts_with(char::is_uppercase)
    }

    /// Checks whether the given token is a Value Field Reference
    pub(crate) fn is_value_field_reference(&self) -> bool {
        self.is_and_identifier() && self.text[1..].starts_with(char::is_lowercase)
    }

    #[allow(dead_code)]
    /// Checks whether the given token is a Value Set field reference (same as Type Field reference.)
    pub(crate) fn is_value_set_field_reference(&self) -> bool {
        self.is_type_field_reference()
    }

    #[allow(dead_code)]
    /// Checks whether the given token is an Object Field Reference (same as Value Field Reference.)
    pub(crate) fn is_object_field_reference(&self) -> bool {
        self.is_value_field_reference()
    }

    #[allow(dead_code)]
    /// Checks whether the given token is an Object Set Field Reference
    pub(crate) fn is_object_set_field_reference(&self) -> bool {
        self.is_type_field_reference()
    }

    /// Checks whether the given token is a particular keyword.
    pub(crate) fn is_given_keyword(&self, keyword: &str) -> bool {
        self.is_keyword() && self.text == keyword
    }

    /// Checks whether the given token is a builtin type.
    pub(crate) fn is_asn_builtin_type(&self) -> bool {
        BASE_TYPES.iter().any(|&t| t == self.text.as_str())
            || CONSTRUCTED_TYPES.iter().any(|&t| t == self.text.as_str())
    }

    /// Checks whether a given token is a with syntax reserved word
    pub(crate) fn is_with_syntax_reserved_word(&self) -> bool {
        WITH_SYNTAX_RESERVED_WORDS
            .iter()
            .any(|&t| t == self.text.as_str())
    }

    /// Returns the 'span' of the current token.
    pub(crate) fn span(&self) -> Span {
        self.span.clone()
    }

    /// Returns the `String` obtained by concatenating the tokens' text, joined by `joinstr`.
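    ///
    /// For instance, joining with `"."` would rebuild a dotted reference such as
    /// `ModuleName.Type` from its individual tokens (an illustrative use, not the only one).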
    pub(crate) fn concat(tokens: &[Token], joinstr: &str) -> String {
        tokens
            .iter()
            .map(|x| x.text.clone())
            .collect::<Vec<String>>()
            .join(joinstr)
    }

    /// Returns whether the given token is a Set 'intersection'
    pub(crate) fn is_set_intersection(&self) -> bool {
        self.is_set_intersection_token() || self.is_given_keyword("INTERSECTION")
    }

    /// Returns whether the given token is a Set 'union'
    pub(crate) fn is_set_union(&self) -> bool {
        self.is_set_union_token() || self.is_given_keyword("UNION")
    }
}

// Get string token.
fn get_string_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize, usize, usize)> {
    let mut last: Option<usize> = None;

    if chars.len() == 1 {
        return Err(Error::TokenizeError(0, line, begin).into());
    }

    let mut i = 1;
    loop {
        // " " " " .
        // 0 1 2 3 4 (len = 5)
        // i = 1
        // i = 3
        //
        // " a " . .
        // 0 1 2 3 4 (len = 5)
        // i = 1
        // i = 2
        //
        // " " " .
        // 0 1 2 3 (len = 4)
        // i = 1
        // i = 3
        if i >= chars.len() - 1 {
            if i == chars.len() - 1 && chars[i] == '"' {
                last = Some(i);
            }
            break;
        }
        if chars[i] == '"' {
            if chars[i + 1] == '"' {
                i += 2;
            } else {
                last = Some(i);
                break;
            }
        } else {
            i += 1;
        }
    }

    // If we didn't find the last '"'
    if last.is_none() {
        return Err(Error::TokenizeError(5, line, begin).into());
    }

    let consumed = last.unwrap() + 1;

    let mut text = chars[..consumed].iter().collect::<String>();
    let lines = text.lines().count() - 1;
    let last_line = text.lines().last().unwrap();
    let end_column = if lines > 0 {
        last_line.len()
    } else {
        begin + consumed
    };
    text = text
        .lines()
        .map(|line| line.trim())
        .collect::<Vec<&str>>()
        .join("");

    Ok((
        Token {
            r#type: TokenType::TString,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line + lines, end_column),
            ),
            text,
        },
        consumed,
        lines,
        end_column,
    ))
}

// Get bit string or hex string
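// e.g. '01101'B (bstring) or 'DEADBEEF'H (hstring); note that the suffix is matched
// case-insensitively below, so a lowercase 'b' or 'h' is accepted as well.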
fn get_bit_or_hex_string_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize, usize, usize)> {
    if chars.len() == 1 {
        return Err(Error::TokenizeError(6, line, begin).into());
    }

    let last = chars[1..].iter().position(|&c| c == '\'');
    if last.is_none() {
        // No matching '\'' found till the end of the string. Clearly an error.
        return Err(Error::TokenizeError(7, line, begin).into());
    }
    let mut consumed = last.unwrap() + 1 + 1;
    if consumed == chars.len() {
        // Matching '\'' found, but the string ends right here. Error.
        return Err(Error::TokenizeError(8, line, begin).into());
    }

    let c = chars[consumed];
    let token_type = match c.to_lowercase().to_string().as_str() {
        "h" => TokenType::HexString,
        "b" => TokenType::BitString,
        _ => {
            return Err(Error::TokenizeError(9, line, begin).into());
        }
    };

    let mut text = chars[..consumed].iter().collect::<String>();
    let lines = text.lines().count() - 1;
    let last_line = text.lines().last().unwrap();
    let end_column = if lines > 0 {
        last_line.len()
    } else {
        begin + consumed
    };
    text = text.replace(char::is_whitespace, "");

    if token_type == TokenType::BitString && !text.replace(&['0', '1', '\''][..], "").is_empty() {
        return Err(Error::TokenizeError(10, line, begin).into());
    }

    if token_type == TokenType::HexString
        && !text.chars().all(|c| c.is_ascii_hexdigit() || c == '\'')
    {
        return Err(Error::TokenizeError(11, line, begin).into());
    }

    consumed += 1; // last 'h' or 'b'

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line + lines, end_column), // FIXME: This span may be wrong, but ignore right now
            ),
            text,
        },
        consumed,
        lines,
        end_column,
    ))
}

// Get an '@' component ID list, something like @.id or @component.id
fn get_at_component_id_list(chars: &[char], line: usize, begin: usize) -> Result<(Token, usize)> {
    if chars.len() == 1 {
        return Err(Error::TokenizeError(12, line, begin).into());
    }

    let mut consumed = 1;
    let last = chars[1..]
        .iter()
        .position(|&x| !(x.is_ascii_alphanumeric() || x == '-' || x == '.'));
    if let Some(lst) = last {
        consumed += lst;
    } else {
        consumed += chars[1..].len();
    }

    // The component ID list should not end with a '.' or '-'
    if ['.', '-'].iter().any(|&c| c == chars[consumed - 1]) {
        return Err(Error::TokenizeError(13, line, begin).into());
    }
    Ok((
        Token {
            r#type: TokenType::AtComponentIdList,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(), // include the '@' as well
        },
        consumed,
    ))
}

// Get token for a number: Integer or Real
// This function will try to tokenize strings of the following forms -
// 1..2 => Will return 1 as a number
// 1.2 => Will return 1.2 as a number
// -1.2.3 => Will return Error
fn get_number_token(chars: &[char], line: usize, begin: usize) -> Result<(Token, usize)> {
    let neg = (chars[0] == '-') as usize;

    if neg > 0 && chars.len() == 1 {
        return Err(Error::TokenizeError(14, line, begin).into());
    }

    let mut consumed = neg;
    let last = chars[neg..]
        .iter()
        .position(|&x| !(x.is_numeric() || x == '.'));
    if let Some(lst) = last {
        consumed += lst;
    } else {
        consumed += chars[neg..].len();
    }

    let text = chars[..consumed].iter().collect::<String>();
    if text.parse::<f32>().is_err() {
        let dot_index = chars[neg..].iter().position(|&x| x == '.');
        if let Some(index) = dot_index {
            // `index` is relative to `chars[neg..]`; `dot` is the absolute index of the first '.'.
            let dot = index + neg;
            if dot == chars.len() - 1 {
                // Error (trailing '.')
                Err(Error::TokenizeError(14, line, begin).into())
            } else if chars[dot + 1] == '.' {
                // At least two consecutive '.'s: return the integer part as the number; the
                // following ".." (or "...") is tokenized separately and may become a parse
                // error later on.
                Ok((
                    Token {
                        r#type: TokenType::NumberInt,
                        span: Span::new(
                            LineColumn::new(line, begin),
                            LineColumn::new(line, begin + dot),
                        ),
                        text: chars[..dot].iter().collect::<String>(), // include the sign as well
                    },
                    dot,
                ))
            } else {
                // Error: something in a weird form like 3.14.159
                Err(Error::TokenizeError(14, line, begin).into())
            }
        } else {
            unreachable!();
        }
    } else {
        Ok((
            Token {
                r#type: TokenType::NumberInt,
                span: Span::new(
                    LineColumn::new(line, begin),
                    LineColumn::new(line, begin + consumed),
                ),
                text, // include the sign as well
            },
            consumed,
        ))
    }
}

// Get token for an Identifier or a Keyword
//
// This parses all types of identifiers including references and ASN.1 keywords. Returns the
// appropriate type of the token and the number of characters consumed.
// This also processes identifiers of the form `&identifier` or `&Identifier`.
fn get_identifier_or_keyword_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let and = (chars[0] == '&') as usize;

    if and > 0 && chars.len() == 1 {
        return Err(Error::TokenizeError(15, line, begin).into());
    }

    let mut consumed = and;
    let last = chars[and..]
        .iter()
        .position(|&x| !(x.is_ascii_alphanumeric() || x == '-'));

    if let Some(lst) = last {
        consumed += lst;
    } else {
        consumed += chars[and..].len();
    }

    // Identifier should not end with a '-'
    if chars[consumed - 1] == '-' {
        return Err(Error::TokenizeError(16, line, begin).into());
    }

    // A free-standing '&' is an error.
    if and > 0 && consumed == 1 {
        return Err(Error::TokenizeError(17, line, begin).into());
    }

    let text = chars[..consumed].iter().collect::<String>();
    if text.contains("--") {
        return Err(Error::TokenizeError(18, line, begin).into());
    }

    let token_type = if and > 0 {
        TokenType::AndIdentifier
    } else if KEYWORDS.iter().any(|&kw| text == kw) {
        TokenType::Keyword
    } else {
        TokenType::Identifier
    };

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text,
        },
        consumed,
    ))
}

// Get token for Range ".." or Extension "..."
fn get_range_or_extension_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let (token_type, consumed) = if chars.len() == 1 {
        (TokenType::Dot, 1)
    } else if chars.len() == 2 {
        if chars[1] == '.' {
            (TokenType::RangeSeparator, 2)
        } else {
            (TokenType::Dot, 1)
        }
    } else if chars[1] == '.' {
        if chars[2] == '.' {
            (TokenType::Extension, 3)
        } else {
            (TokenType::RangeSeparator, 2)
        }
    } else {
        (TokenType::Dot, 1)
    };

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(),
        },
        consumed,
    ))
}

// Parse either an assignment token "::=" or a single ':'
fn get_assignment_or_colon_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let (token_type, consumed) = if chars.len() == 1 {
        (TokenType::Colon, 1)
    } else if chars.len() == 2 {
        if chars[1] == ':' {
            return Err(Error::TokenizeError(19, line, begin).into());
        } else {
            (TokenType::Colon, 1)
        }
    } else if chars[1] == ':' {
        if chars[2] == '=' {
            (TokenType::Assignment, 3)
        } else {
            return Err(Error::TokenizeError(20, line, begin).into());
        }
    } else {
        (TokenType::Colon, 1)
    };

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(),
        },
        consumed,
    ))
}

// Gets either a Square bracket or Sequence Extension
//
// This gives all the tokens '[[' or ']]' or '[' or ']'
fn get_seq_extension_or_square_brackets_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let (token_type, consumed) = if chars[0] == '[' {
        if chars.len() > 1 && chars[1] == '[' {
            (TokenType::AdditionGroupsBegin, 2)
        } else {
            (TokenType::SquareBegin, 1)
        }
    } else if chars.len() > 1 && chars[1] == ']' {
        (TokenType::AdditionGroupsEnd, 2)
    } else {
        (TokenType::SquareEnd, 1)
    };
    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(),
        },
        consumed,
    ))
}

// Gets Begin/End of round/curly brackets.
//
// Note: square brackets need a special treatment due to "[[" and "]]"
fn get_single_char_token(token: char, line: usize, begin: usize) -> Result<Token> {
    let token_type: TokenType = match token {
        '{' => TokenType::CurlyBegin,
        '}' => TokenType::CurlyEnd,
        '(' => TokenType::RoundBegin,
        ')' => TokenType::RoundEnd,
        '!' => TokenType::ExceptionMarker,
        ';' => TokenType::SemiColon,
        ',' => TokenType::Comma,
        '|' => TokenType::SetUnionToken,
        '^' => TokenType::SetIntersectionToken,
        '<' => TokenType::LessThan,
        _ => return Err(Error::TokenizeError(21, line, begin).into()),
    };
    Ok(Token {
        r#type: token_type,
        span: Span::new(
            LineColumn::new(line, begin),
            LineColumn::new(line, begin + 1),
        ),
        text: token.to_string(),
    })
}

// Gets a comment. The comment will be of the form -
// -- Some Comment \n or
// -- Some Comment -- or
// -- Some Comment EOF (Note: the last is a special case, not exactly conforming to the standard.)
//
// A cleverly crafted, pathologically long line could cause this function to panic,
// which we don't expect to see in practice.
fn get_maybe_comment_token(
    chars: &[char], // From the first "--"
    line: usize,
    begin: usize,
) -> Result<(Option<Token>, usize)> {
    if chars.len() == 1 || chars[1] != '-' {
        return Ok((None, 0));
    }
    let mut consumed = 2; // initial "--"
    let mut last_idx: Option<usize> = None;

    // Search for Either '\n' or "--"
    for (idx, window) in chars[2..].windows(2).enumerate() {
        if window[0] == '\n' {
            last_idx = Some(idx);
            consumed += idx;
            break;
        }
        if window[0] == '-' && window[1] == '-' {
            last_idx = Some(idx);
            consumed += idx + 2; // --123-- : idx: 3 consumed: 7
            break;
        }
    }

    // Neither "--" nor '\n' found. consume everything. (may be last line.)
    if last_idx.is_none() {
        consumed = chars.len();
    }

    let text = chars[..consumed].iter().collect::<String>();
    //.trim_start_matches("--")
    //.trim_end_matches("--")
    //.trim()

    Ok((
        Some(Token {
            r#type: TokenType::Comment,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, consumed),
            ),
            text,
        }),
        consumed,
    ))
}

/// Tokenize an ASN.1 file.
///
/// This function works on any input that implements the `std::io::Read` trait, though it is
/// mostly used with files, since it reads the input to the end before tokenizing. We look at the
/// first character of every non-whitespace sequence and tokenize it into the appropriate token.
838pub fn tokenize<T>(mut input: T) -> Result<Vec<Token>>
839where
840    T: std::io::Read,
841{
842    let mut buffer = Vec::new();
843    let _ = input.read_to_end(&mut buffer).unwrap();
844    let buffer = String::from_utf8(buffer).unwrap();
845
846    tokenize_string(&buffer)
847}

/// Tokenize a String.
///
/// Tokenizes a given `&str` into ASN.1 tokens. This API can be used, for example, to write
/// simple test cases for ASN.1 modules.
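///
/// A minimal usage sketch (illustrative; not compiled as a doctest):
///
/// ```ignore
/// let tokens = tokenize_string("Age ::= INTEGER (0..150)")?;
/// assert_eq!(tokens.len(), 8); // Age, ::=, INTEGER, (, 0, .., 150, )
/// ```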
pub fn tokenize_string(buffer: &str) -> Result<Vec<Token>> {
    let chars: Vec<char> = buffer.chars().collect();

    let mut column = 0_usize;
    let mut processed = 0;
    let total_read = chars.len();

    let mut line = 1;
    let mut tokens: Vec<Token> = Vec::new();
    while processed < total_read {
        let c = chars[processed];
        match c {
            ' ' | '\t' => {
                processed += 1;
                column += 1;
            }
            '\n' => {
                line += 1;
                processed += 1;
                column = 0;
            }
            '-' => {
                let (token, consumed) = get_maybe_comment_token(&chars[processed..], line, column)?;
                match token {
                    Some(tok) => {
                        tokens.push(tok);
                        column += consumed;
                        processed += consumed;
                    }
                    None => {
                        let (token, consumed) =
                            get_number_token(&chars[processed..], line, column)?;
                        tokens.push(token);
                        column += consumed;
                        processed += consumed;
                    }
                }
            }
            '{' | '}' | '(' | ')' | '!' | ';' | ',' | '|' | '^' | '<' => {
                let token = get_single_char_token(chars[processed], line, column)?;
                tokens.push(token);
                column += 1;
                processed += 1;
            }
            '[' | ']' => {
                let (token, consumed) =
                    get_seq_extension_or_square_brackets_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            ':' => {
                let (token, consumed) =
                    get_assignment_or_colon_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '.' => {
                let (token, consumed) =
                    get_range_or_extension_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '&' | 'a'..='z' | 'A'..='Z' => {
                let (token, consumed) =
                    get_identifier_or_keyword_token(&chars[processed..], line, column)?;
                tokens.push(token);

                column += consumed;
                processed += consumed;
            }
            '0'..='9' => {
                let (token, consumed) = get_number_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '@' => {
                let (token, consumed) =
                    get_at_component_id_list(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '\'' => {
                let (token, consumed, l, c) =
                    get_bit_or_hex_string_token(&chars[processed..], line, column)?;
                tokens.push(token);
                processed += consumed;
                if l > 0 {
                    column = c;
                } else {
                    column += consumed;
                }
                line += l;
            }
            '"' => {
                let (token, consumed, l, c) = get_string_token(&chars[processed..], line, column)?;
                tokens.push(token);
                processed += consumed;
                if l > 0 {
                    column = c;
                } else {
                    column += consumed;
                }
                line += l;
            }
            // Zero-width no-break space (Byte Order Mark): skip it.
            '\u{feff}' => {
                processed += 1;
            }
            '\r' => {
                processed += 1;
            }
            _ => {
                panic!(
                    "Unsupported First character for a token: '{:?}'. Line: {}, Column: {}",
                    chars[processed], line, column
                );
            }
        }
    }
    Ok(tokens)
}

#[cfg(test)]
mod tests {

    use super::*;

    #[test]
    fn tokenize_identifier_tokens() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b"Hello World!"));
        let result = tokenize(reader);
        assert!(result.is_ok(), "{:#?}", result.err().unwrap());
        let tokens = result.unwrap();
        assert!(tokens.len() == 3, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_and_tokens() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b"&Id &id-IDentifier"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_comment_two_lines() {
        let reader =
            std::io::BufReader::new(std::io::Cursor::new(b"Hello World!\n-- Some comment --\n"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 4, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_two_comments() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(
            b" -- Hello World!\n-- Some comment --\n",
        ));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_comment_no_trailing_newline() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b" -- Hello World!"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 1, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_keywords() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b"  INTEGER ENUMERATED "));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
        assert!(tokens.iter().all(|t| t.is_keyword()));
    }

    #[test]
    fn tokenize_at_component_list() {
        let reader =
            std::io::BufReader::new(std::io::Cursor::new(b"@component.id-List @.another "));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_numbers() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b" 123456789 -123"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
        assert!(tokens.iter().all(|t| t.is_numeric()), "{:#?}", tokens);
    }

    #[test]
    fn tokenize_keyword_dot_andkeyword() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(
            b"ATTRIBUTE.&equality-match.&AssertionType",
        ));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 5, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_range() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b" -123456789..-123"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 3, "{:#?}", tokens);
        assert!(tokens[0].is_numeric(), "{:#?}", tokens[0]);
        assert!(tokens[1].is_range_separator(), "{:#?}", tokens[1]);
        assert!(tokens[2].is_numeric(), "{:#?}", tokens[2]);
    }

    #[test]
    fn tokenize_bitstring() {
        struct BitHexStringTestCase<'t> {
            input: &'t [u8],
            success: bool,
            span_end_line: usize,
        }
        let test_cases = vec![
            BitHexStringTestCase {
                input: b"'010101'b",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'010101'",
                success: false,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'010101'h",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'01 0101'b",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'01 0101'h",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'01 0101\n\t0101\n00'h",
                success: true,
                span_end_line: 3,
            },
        ];
        for t in test_cases {
            let reader = std::io::BufReader::new(std::io::Cursor::new(t.input));
            let result = tokenize(reader);
            assert_eq!(result.is_ok(), t.success, "{:#?}", result.unwrap()[0]);
            if result.is_ok() {
                let tokens = result.unwrap();
                assert!(tokens.len() == 1, "{:#?}", tokens[0]);
                let token = &tokens[0];
                assert!(
                    token.span.end.line == t.span_end_line,
                    "input: {:#?}, token end: {}, tc: span_end_line {}",
                    t.input,
                    token.span.end.line,
                    t.span_end_line
                );
            }
        }
    }

    #[test]
    fn tokenize_string() {
        struct TestTokenizeString<'t> {
            input: &'t [u8],
            len: usize,
            success: bool,
        }
        let test_cases = vec![
            TestTokenizeString {
                input: b"\"Foo Bar\n\tFoo-baz\"",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                input: b"\"",
                len: 1,
                success: false,
            },
            TestTokenizeString {
                input: b"\"\"",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                input: b"\"\"\"",
                len: 1,
                success: false,
            },
            TestTokenizeString {
                input: b"\"\"\"\" ",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                //input: b"\"\"Some Quoted String\"\"x",
                input: b"\"\"\"a\"\"x\"",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                input: b"\"a\"..\"z\"",
                len: 3,
                success: true,
            },
        ];
        for test_case in test_cases {
            let reader = std::io::BufReader::new(std::io::Cursor::new(test_case.input));
            let result = tokenize(reader);
            assert_eq!(
                result.is_ok(),
                test_case.success,
                "{}",
                result.err().unwrap()
            );
            if result.is_ok() {
                let tokens = result.unwrap();
                assert!(tokens.len() == test_case.len, "{:#?}", tokens);
            }
        }
    }

    #[test]
    fn tokenizer_test_object_class_reference() {
        let reader = std::io::BufReader::new(std::io::Cursor::new("SOME-OBJECT-CLASS"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let result = result.unwrap();
        assert!(result.len() == 1);

        assert!(result[0].is_object_class_reference());
    }

    #[test]
    fn tokenize_small_tokens() {
        struct SmallTokenTestCase<'t> {
            input: &'t [u8],
            count: usize,
            success: bool,
        }
        let test_cases = vec![
            SmallTokenTestCase {
                input: b"{{}}",
                count: 4,
                success: true,
            },
            SmallTokenTestCase {
                input: b"[[{}]}",
                count: 5,
                success: true,
            },
            SmallTokenTestCase {
                input: b"[[]]",
                count: 2,
                success: true,
            },
            SmallTokenTestCase {
                input: b"..{...}",
                count: 4,
                success: true,
            },
            SmallTokenTestCase {
                input: b":(::=)",
                count: 4,
                success: true,
            },
            SmallTokenTestCase {
                input: b": ::=",
                count: 2,
                success: true,
            },
            SmallTokenTestCase {
                input: b": :: ",
                count: 2,
                success: false,
            },
            SmallTokenTestCase {
                input: b".",
                count: 1,
                success: true,
            },
            SmallTokenTestCase {
                input: b"..",
                count: 1,
                success: true,
            },
            SmallTokenTestCase {
                input: b". ",
                count: 1,
                success: true,
            },
            SmallTokenTestCase {
                input: b". . .. ",
                count: 3,
                success: true,
            },
            SmallTokenTestCase {
                input: b"...",
                count: 1,
                success: true,
            },
        ];
        for test_case in test_cases {
            let reader = std::io::BufReader::new(std::io::Cursor::new(test_case.input));
            let result = tokenize(reader);
            assert_eq!(
                result.is_ok(),
                test_case.success,
                "{}",
                String::from_utf8(test_case.input.to_vec()).unwrap()
            );
            if result.is_ok() {
                let tokens = result.unwrap();
                assert!(tokens.len() == test_case.count, "{:#?}", tokens);
            }
        }
    }
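
    // Illustrative end-to-end check: a simple type assignment is expected to produce eight
    // tokens (Age, ::=, INTEGER, (, 0, .., 150, )), with "::=" recognized as the assignment.
    #[test]
    fn tokenize_simple_assignment() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b"Age ::= INTEGER (0..150)"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 8, "{:#?}", tokens);
        assert!(tokens[1].is_assignment(), "{:#?}", tokens[1]);
    }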
}