#![allow(unused_variables)]
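//! A tokenizer for ASN.1 module definitions.
//!
//! The `tokenize`/`tokenize_string` entry points turn raw input text into a
//! flat sequence of [`Token`]s that the parsing layers above consume.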
#[macro_use]
pub(crate) mod types;

use crate::error::Error;
use anyhow::Result;

use types::TokenType;

const KEYWORDS: &[&str] = &[
    "ABSENT",
    "ABSTRACT-SYNTAX",
    "ALL",
    "APPLICATION",
    "AUTOMATIC",
    "BEGIN",
    "BIT",
    "BMPString",
    "BOOLEAN",
    "BY",
    "CHARACTER",
    "CHOICE",
    "CLASS",
    "COMPONENT",
    "COMPONENTS",
    "CONSTRAINED",
    "CONTAINING",
    "DEFAULT",
    "DEFINITIONS",
    "EMBEDDED",
    "ENCODED",
    "END",
    "ENUMERATED",
    "EXCEPT",
    "EXPLICIT",
    "EXPORTS",
    "EXTENSIBILITY",
    "EXTERNAL",
    "FALSE",
    "FROM",
    "GeneralizedTime",
    "GeneralString",
    "GraphicString",
    "IA5String",
    "IDENTIFIER",
    "IMPLIED",
    "IMPLICIT",
    "IMPORTS",
    "INCLUDES",
    "INSTANCE",
    "INTEGER",
    "INTERSECTION",
    "ISO646String",
    "MAX",
    "MIN",
    "MINUS-INFINITY",
    "NULL",
    "NumericString",
    "OBJECT",
    "ObjectDescriptor",
    "OCTET",
    "OF",
    "OPTIONAL",
    "PATTERN",
    "PDV",
69 "Plus-Infinity",
70 "PRESENT",
71 "PrintableString",
72 "PRIVATE",
73 "REAL",
74 "RELATIVE-OID",
75 "SEQUENCE",
76 "SET",
77 "SIZE",
78 "STRING",
79 "SYNTAX",
80 "T61String",
81 "TAGS",
82 "TeletexString",
83 "TRUE",
84 "TYPE-IDENTIFIER",
85 "UNION",
86 "UNIQUE",
87 "UNIVERSAL",
88 "UniversalString",
89 "UTCTime",
90 "UTF8String",
91 "VideotexString",
92 "VisibleString",
93 "WITH",
94];

const BASE_TYPES: &[&str] = &[
    "INTEGER",
    "BOOLEAN",
    "ENUMERATED",
    "NULL",
    "UTF8String",
    "IA5String",
    "PrintableString",
    "VisibleString",
    "UTCTime",
    "GeneralizedTime",
    "OBJECT",
    "OCTET",
    "BIT",
    "CHARACTER",
    "REAL",
];

const CONSTRUCTED_TYPES: &[&str] = &["SEQUENCE", "SET", "CHOICE"];

const WITH_SYNTAX_RESERVED_WORDS: &[&str] = &[
    "BIT",
    "BOOLEAN",
    "CHARACTER",
    "CHOICE",
    "EMBEDDED",
    "END",
    "ENUMERATED",
    "EXTERNAL",
    "FALSE",
    "INSTANCE",
    "INTEGER",
    "INTERSECTION",
    "MINUS-INFINITY",
    "NULL",
    "OBJECT",
    "PLUS-INFINITY",
    "REAL",
    "RELATIVE-OID",
    "SEQUENCE",
    "SET",
    "TRUE",
    "UNION",
];

#[derive(Debug, PartialEq, Copy, Clone)]
pub(crate) struct LineColumn {
    line: usize,
    column: usize,
}

impl std::fmt::Display for LineColumn {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Line: {}, Column: {}", self.line, self.column)
    }
}

impl LineColumn {
    fn new(line: usize, column: usize) -> Self {
        LineColumn { line, column }
    }
}

#[derive(Debug, Clone)]
pub(crate) struct Span {
    start: LineColumn,
    end: LineColumn,
}

impl std::fmt::Display for Span {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Start: {}, End: {}", self.start, self.end)
    }
}

impl Span {
    fn new(start: LineColumn, end: LineColumn) -> Self {
        Span { start, end }
    }

    pub(crate) fn start(&self) -> LineColumn {
        self.start
    }
}

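/// A single lexical token: its [`TokenType`], the source [`Span`] it covers,
/// and the raw text it was built from.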
#[derive(Debug, Clone)]
pub struct Token {
    pub(crate) r#type: TokenType,
    pub(crate) span: Span,
    pub(crate) text: String,
}

impl Token {
    create_is_tokentype_fns! {
        (is_curly_begin, TokenType::CurlyBegin),
        (is_curly_end, TokenType::CurlyEnd),
        (is_round_begin, TokenType::RoundBegin),
        (is_round_end, TokenType::RoundEnd),
        (is_exception_marker, TokenType::ExceptionMarker),
        (is_square_begin, TokenType::SquareBegin),
        (is_square_end, TokenType::SquareEnd),
        (is_addition_groups_begin, TokenType::AdditionGroupsBegin),
        (is_addition_groups_end, TokenType::AdditionGroupsEnd),
        (is_extension, TokenType::Extension),
        (is_range_separator, TokenType::RangeSeparator),
        (is_assignment, TokenType::Assignment),
        (is_colon, TokenType::Colon),
        (is_semicolon, TokenType::SemiColon),
        (is_identifier, TokenType::Identifier),
        (is_keyword, TokenType::Keyword),
        (is_comment, TokenType::Comment),
        (is_and_identifier, TokenType::AndIdentifier),
        (is_numeric, TokenType::NumberInt),
        (is_bitstring, TokenType::BitString),
        (is_hexstring, TokenType::HexString),
        (is_tstring, TokenType::TString),
        (is_dot, TokenType::Dot),
        (is_comma, TokenType::Comma),
        (is_set_union_token, TokenType::SetUnionToken),
        (is_set_intersection_token, TokenType::SetIntersectionToken),
        (is_at_component_list, TokenType::AtComponentIdList),
        (is_less_than, TokenType::LessThan),
    }

    pub(crate) fn is_value_reference(&self) -> bool {
        self.is_identifier() && self.text.starts_with(char::is_lowercase)
    }

    pub(crate) fn is_type_reference(&self) -> bool {
        self.is_identifier() && self.text.starts_with(char::is_uppercase)
    }

    pub(crate) fn is_module_reference(&self) -> bool {
        self.is_type_reference()
    }

    pub(crate) fn is_object_class_reference(&self) -> bool {
        self.is_type_reference()
            && self
                .text
                .chars()
                .all(|c| matches!(c, 'A'..='Z' | '0'..='9' | '-'))
    }

    pub(crate) fn is_object_reference(&self) -> bool {
        self.is_value_reference()
    }

    pub(crate) fn is_object_set_reference(&self) -> bool {
        self.is_type_reference()
    }

    pub(crate) fn is_type_field_reference(&self) -> bool {
        self.is_and_identifier() && self.text[1..].starts_with(char::is_uppercase)
    }

    pub(crate) fn is_value_field_reference(&self) -> bool {
        self.is_and_identifier() && self.text[1..].starts_with(char::is_lowercase)
    }

    #[allow(dead_code)]
    pub(crate) fn is_value_set_field_reference(&self) -> bool {
        self.is_type_field_reference()
    }

    #[allow(dead_code)]
    pub(crate) fn is_object_field_reference(&self) -> bool {
        self.is_value_field_reference()
    }

    #[allow(dead_code)]
    pub(crate) fn is_object_set_field_reference(&self) -> bool {
        self.is_type_field_reference()
    }

    pub(crate) fn is_given_keyword(&self, keyword: &str) -> bool {
        self.is_keyword() && self.text == keyword
    }

    pub(crate) fn is_asn_builtin_type(&self) -> bool {
        BASE_TYPES.iter().any(|&t| t == self.text.as_str())
            || CONSTRUCTED_TYPES.iter().any(|&t| t == self.text.as_str())
    }

    pub(crate) fn is_with_syntax_reserved_word(&self) -> bool {
        WITH_SYNTAX_RESERVED_WORDS
            .iter()
            .any(|&t| t == self.text.as_str())
    }

    pub(crate) fn span(&self) -> Span {
        self.span.clone()
    }

    pub(crate) fn concat(tokens: &[Token], joinstr: &str) -> String {
        tokens
            .iter()
            .map(|x| x.text.clone())
            .collect::<Vec<String>>()
            .join(joinstr)
    }

    pub(crate) fn is_set_intersection(&self) -> bool {
        self.is_set_intersection_token() || self.is_given_keyword("INTERSECTION")
    }

    pub(crate) fn is_set_union(&self) -> bool {
        self.is_set_union_token() || self.is_given_keyword("UNION")
    }
}

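/// Parses a character string (`"..."`) token, with `chars` positioned at the
/// opening quote. An embedded `""` is treated as an escaped quote, and the
/// string may span multiple lines (each line is trimmed and joined). Returns
/// the token together with the consumed character count, the number of extra
/// lines spanned, and the end column.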
fn get_string_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize, usize, usize)> {
    let mut last: Option<usize> = None;

    if chars.len() == 1 {
        return Err(Error::TokenizeError(0, line, begin).into());
    }

    let mut i = 1;
    loop {
        if i >= chars.len() - 1 {
            if i == chars.len() - 1 && chars[i] == '"' {
                last = Some(i);
            }
            break;
        }
        if chars[i] == '"' {
            if chars[i + 1] == '"' {
                i += 2;
            } else {
                last = Some(i);
                break;
            }
        } else {
            i += 1;
        }
    }

    if last.is_none() {
        return Err(Error::TokenizeError(5, line, begin).into());
    }

    let consumed = last.unwrap() + 1;

    let mut text = chars[..consumed].iter().collect::<String>();
    let lines = text.lines().count() - 1;
    let last_line = text.lines().last().unwrap();
    let end_column = if lines > 0 {
        last_line.len()
    } else {
        begin + consumed
    };
    text = text
        .lines()
        .map(|line| line.trim())
        .collect::<Vec<&str>>()
        .join("");

    Ok((
        Token {
            r#type: TokenType::TString,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line + lines, end_column),
            ),
            text,
        },
        consumed,
        lines,
        end_column,
    ))
}
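
/// Parses a bit string or hex string (`'...'B` / `'...'H`) token, with
/// `chars` positioned at the opening `'`. The suffix after the closing quote
/// selects the token type (case-insensitive), the digits are validated, and
/// any embedded whitespace is stripped from the token text.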
fn get_bit_or_hex_string_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize, usize, usize)> {
    if chars.len() == 1 {
        return Err(Error::TokenizeError(6, line, begin).into());
    }

    let last = chars[1..].iter().position(|&c| c == '\'');
    if last.is_none() {
        return Err(Error::TokenizeError(7, line, begin).into());
    }
    let mut consumed = last.unwrap() + 1 + 1;
    if consumed == chars.len() {
        return Err(Error::TokenizeError(8, line, begin).into());
    }

    let c = chars[consumed];
    let token_type = match c.to_lowercase().to_string().as_str() {
        "h" => TokenType::HexString,
        "b" => TokenType::BitString,
        _ => {
            return Err(Error::TokenizeError(9, line, begin).into());
        }
    };

    let mut text = chars[..consumed].iter().collect::<String>();
    let lines = text.lines().count() - 1;
    let last_line = text.lines().last().unwrap();
    let end_column = if lines > 0 {
        last_line.len()
    } else {
        begin + consumed
    };
    text = text.replace(char::is_whitespace, "");

    if token_type == TokenType::BitString && !text.replace(&['0', '1', '\''][..], "").is_empty() {
        return Err(Error::TokenizeError(10, line, begin).into());
    }

    if token_type == TokenType::HexString
        && !text.chars().all(|c| c.is_ascii_hexdigit() || c == '\'')
    {
        return Err(Error::TokenizeError(11, line, begin).into());
    }

    consumed += 1;

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line + lines, end_column),
            ),
            text,
        },
        consumed,
        lines,
        end_column,
    ))
}

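/// Parses an `@component.id` list token, with `chars` positioned at the `@`.
/// Consumes ASCII alphanumerics, `-` and `.`; the token must not end in a
/// `.` or `-`.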
fn get_at_component_id_list(chars: &[char], line: usize, begin: usize) -> Result<(Token, usize)> {
    if chars.len() == 1 {
        return Err(Error::TokenizeError(12, line, begin).into());
    }

    let mut consumed = 1;
    let last = chars[1..]
        .iter()
        .position(|&x| !(x.is_ascii_alphanumeric() || x == '-' || x == '.'));
    if let Some(lst) = last {
        consumed += lst;
    } else {
        consumed += chars[1..].len();
    }

    if ['.', '-'].iter().any(|&c| c == chars[consumed - 1]) {
        return Err(Error::TokenizeError(13, line, begin).into());
    }

    Ok((
        Token {
            r#type: TokenType::AtComponentIdList,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(),
        },
        consumed,
    ))
}
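
/// Parses an (optionally negative) integer token. When the digits are
/// immediately followed by a range separator (as in `123..456`), only the
/// integer part before the `..` is consumed.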
fn get_number_token(chars: &[char], line: usize, begin: usize) -> Result<(Token, usize)> {
    let neg = (chars[0] == '-') as usize;

    if neg > 0 && chars.len() == 1 {
        return Err(Error::TokenizeError(14, line, begin).into());
    }

    let mut consumed = neg;
    let last = chars[neg..]
        .iter()
        .position(|&x| !(x.is_numeric() || x == '.'));
    if let Some(lst) = last {
        consumed += lst;
    } else {
        consumed += chars[neg..].len();
    }

    let text = chars[..consumed].iter().collect::<String>();
    if text.parse::<f32>().is_err() {
        // Not a valid number on its own. The only recoverable case is an
        // integer immediately followed by a range separator (e.g. `1..10`):
        // emit just the integer part and let the caller tokenize the `..`.
        let dot_index = chars[neg..].iter().position(|&x| x == '.');
        if let Some(index) = dot_index {
            let dot = index + neg;
            if dot + 1 < chars.len() && chars[dot + 1] == '.' {
                Ok((
                    Token {
                        r#type: TokenType::NumberInt,
                        span: Span::new(
                            LineColumn::new(line, begin),
                            LineColumn::new(line, begin + dot),
                        ),
                        text: chars[..dot].iter().collect::<String>(),
                    },
                    dot,
                ))
            } else {
                Err(Error::TokenizeError(14, line, begin).into())
            }
        } else {
            Err(Error::TokenizeError(14, line, begin).into())
        }
    } else {
        Ok((
            Token {
                r#type: TokenType::NumberInt,
                span: Span::new(
                    LineColumn::new(line, begin),
                    LineColumn::new(line, begin + consumed),
                ),
                text,
            },
            consumed,
        ))
    }
}

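/// Parses an identifier, a keyword, or an `&identifier` field reference.
/// Hyphens are allowed inside an identifier, but it must not end in a hyphen
/// or contain `--`.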
fn get_identifier_or_keyword_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let and = (chars[0] == '&') as usize;

    if and > 0 && chars.len() == 1 {
        return Err(Error::TokenizeError(15, line, begin).into());
    }

    let mut consumed = and;
    let last = chars[and..]
        .iter()
        .position(|&x| !(x.is_ascii_alphanumeric() || x == '-'));

    if let Some(lst) = last {
        consumed += lst;
    } else {
        consumed += chars[and..].len();
    }

    if chars[consumed - 1] == '-' {
        return Err(Error::TokenizeError(16, line, begin).into());
    }

    if and > 0 && consumed == 1 {
        return Err(Error::TokenizeError(17, line, begin).into());
    }

    let text = chars[..consumed].iter().collect::<String>();
    if text.contains("--") {
        return Err(Error::TokenizeError(18, line, begin).into());
    }

    let token_type = if and > 0 {
        TokenType::AndIdentifier
    } else if KEYWORDS.iter().any(|&kw| text == kw) {
        TokenType::Keyword
    } else {
        TokenType::Identifier
    };

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text,
        },
        consumed,
    ))
}

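/// Disambiguates the dot-based tokens: `.` (dot), `..` (range separator) and
/// `...` (extension marker).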
fn get_range_or_extension_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let (token_type, consumed) = if chars.len() == 1 {
        (TokenType::Dot, 1)
    } else if chars.len() == 2 {
        if chars[1] == '.' {
            (TokenType::RangeSeparator, 2)
        } else {
            (TokenType::Dot, 1)
        }
    } else if chars[1] == '.' {
        if chars[2] == '.' {
            (TokenType::Extension, 3)
        } else {
            (TokenType::RangeSeparator, 2)
        }
    } else {
        (TokenType::Dot, 1)
    };

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(),
        },
        consumed,
    ))
}

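/// Disambiguates `:` (colon) from `::=` (assignment); a bare `::` is a
/// tokenization error.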
fn get_assignment_or_colon_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let (token_type, consumed) = if chars.len() == 1 {
        (TokenType::Colon, 1)
    } else if chars.len() == 2 {
        if chars[1] == ':' {
            return Err(Error::TokenizeError(19, line, begin).into());
        } else {
            (TokenType::Colon, 1)
        }
    } else if chars[1] == ':' {
        if chars[2] == '=' {
            (TokenType::Assignment, 3)
        } else {
            return Err(Error::TokenizeError(20, line, begin).into());
        }
    } else {
        (TokenType::Colon, 1)
    };

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(),
        },
        consumed,
    ))
}

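/// Disambiguates square-bracket tokens: `[` / `]` and the version-addition
/// group markers `[[` / `]]`.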
fn get_seq_extension_or_square_brackets_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Token, usize)> {
    let (token_type, consumed) = if chars[0] == '[' {
        if chars.len() > 1 && chars[1] == '[' {
            (TokenType::AdditionGroupsBegin, 2)
        } else {
            (TokenType::SquareBegin, 1)
        }
    } else if chars.len() > 1 && chars[1] == ']' {
        (TokenType::AdditionGroupsEnd, 2)
    } else {
        (TokenType::SquareEnd, 1)
    };

    Ok((
        Token {
            r#type: token_type,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text: chars[..consumed].iter().collect::<String>(),
        },
        consumed,
    ))
}

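/// Maps a single punctuation character (`{`, `}`, `(`, `)`, `!`, `;`, `,`,
/// `|`, `^`, `<`) to its token.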
fn get_single_char_token(token: char, line: usize, begin: usize) -> Result<Token> {
    let token_type: TokenType = match token {
        '{' => TokenType::CurlyBegin,
        '}' => TokenType::CurlyEnd,
        '(' => TokenType::RoundBegin,
        ')' => TokenType::RoundEnd,
        '!' => TokenType::ExceptionMarker,
        ';' => TokenType::SemiColon,
        ',' => TokenType::Comma,
        '|' => TokenType::SetUnionToken,
        '^' => TokenType::SetIntersectionToken,
        '<' => TokenType::LessThan,
        _ => return Err(Error::TokenizeError(21, line, begin).into()),
    };

    Ok(Token {
        r#type: token_type,
        span: Span::new(
            LineColumn::new(line, begin),
            LineColumn::new(line, begin + 1),
        ),
        text: token.to_string(),
    })
}

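/// Called when the current character is a `-`. If it starts a `--` comment,
/// returns the comment token, consumed up to the terminating `--`, the end of
/// the line, or the end of input; otherwise returns `None` so the caller can
/// try tokenizing a negative number instead.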
fn get_maybe_comment_token(
    chars: &[char],
    line: usize,
    begin: usize,
) -> Result<(Option<Token>, usize)> {
    if chars.len() < 2 || chars[1] != '-' {
        return Ok((None, 0));
    }

    let mut consumed = 2;
    let mut last_idx: Option<usize> = None;

    for (idx, window) in chars[2..].windows(2).enumerate() {
        if window[0] == '\n' {
            last_idx = Some(idx);
            consumed += idx;
            break;
        }
        if window[0] == '-' && window[1] == '-' {
            last_idx = Some(idx);
            consumed += idx + 2;
            break;
        }
    }

    if last_idx.is_none() {
        consumed = chars.len();
    }

    let text = chars[..consumed].iter().collect::<String>();

    Ok((
        Some(Token {
            r#type: TokenType::Comment,
            span: Span::new(
                LineColumn::new(line, begin),
                LineColumn::new(line, begin + consumed),
            ),
            text,
        }),
        consumed,
    ))
}

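/// Tokenizes any `std::io::Read` input. The entire input is read into memory
/// and must be valid UTF-8.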
833pub fn tokenize<T>(mut input: T) -> Result<Vec<Token>>
839where
840 T: std::io::Read,
841{
842 let mut buffer = Vec::new();
843 let _ = input.read_to_end(&mut buffer).unwrap();
844 let buffer = String::from_utf8(buffer).unwrap();
845
846 tokenize_string(&buffer)
847}
848
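/// Tokenizes an input string: the main loop dispatches on the first character
/// of each token and tracks line and column so every token carries an
/// accurate [`Span`]. Malformed tokens are reported as
/// `Error::TokenizeError`; an unsupported leading character currently panics.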
pub fn tokenize_string(buffer: &str) -> Result<Vec<Token>> {
    let chars: Vec<char> = buffer.chars().collect();

    let mut column = 0_usize;
    let mut processed = 0;
    let total_read = chars.len();

    let mut line = 1;
    let mut tokens: Vec<Token> = Vec::new();

    // An empty input simply produces no tokens.
    if chars.is_empty() {
        return Ok(tokens);
    }

    loop {
        let c = chars[processed];
        match c {
            ' ' | '\t' => {
                processed += 1;
                column += 1;
            }
            '\n' => {
                line += 1;
                processed += 1;
                column = 0;
            }
            '-' => {
                let (token, consumed) = get_maybe_comment_token(&chars[processed..], line, column)?;
                match token {
                    Some(tok) => {
                        tokens.push(tok);
                        column += consumed;
                        processed += consumed;
                    }
                    None => {
                        let (token, consumed) =
                            get_number_token(&chars[processed..], line, column)?;
                        tokens.push(token);
                        column += consumed;
                        processed += consumed;
                    }
                }
            }
            '{' | '}' | '(' | ')' | '!' | ';' | ',' | '|' | '^' | '<' => {
                let token = get_single_char_token(chars[processed], line, column)?;
                tokens.push(token);
                column += 1;
                processed += 1;
            }
            '[' | ']' => {
                let (token, consumed) =
                    get_seq_extension_or_square_brackets_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            ':' => {
                let (token, consumed) =
                    get_assignment_or_colon_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '.' => {
                let (token, consumed) =
                    get_range_or_extension_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '&' | 'a'..='z' | 'A'..='Z' => {
                let (token, consumed) =
                    get_identifier_or_keyword_token(&chars[processed..], line, column)?;
                tokens.push(token);

                column += consumed;
                processed += consumed;
            }
            '0'..='9' => {
                let (token, consumed) = get_number_token(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '@' => {
                let (token, consumed) =
                    get_at_component_id_list(&chars[processed..], line, column)?;
                tokens.push(token);
                column += consumed;
                processed += consumed;
            }
            '\'' => {
                let (token, consumed, l, c) =
                    get_bit_or_hex_string_token(&chars[processed..], line, column)?;
                tokens.push(token);
                processed += consumed;
                if l > 0 {
                    column = c;
                } else {
                    column += consumed;
                }
                line += l;
            }
            '"' => {
                let (token, consumed, l, c) = get_string_token(&chars[processed..], line, column)?;
                tokens.push(token);
                processed += consumed;
                if l > 0 {
                    column = c;
                } else {
                    column += consumed;
                }
                line += l;
            }
            '\u{feff}' => {
                processed += 1;
            }
            '\r' => {
                processed += 1;
            }
            _ => {
                panic!(
                    "Unsupported first character for a token: {:?}. Line: {}, Column: {}",
                    chars[processed], line, column
                );
            }
        }
        if processed == total_read {
            break;
        }
    }
    Ok(tokens)
}

#[cfg(test)]
mod tests {

    use super::*;

    #[test]
    fn tokenize_identifier_tokens() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b"Hello World!"));
        let result = tokenize(reader);
        assert!(result.is_ok(), "{:#?}", result.err().unwrap());
        let tokens = result.unwrap();
        assert!(tokens.len() == 3, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_and_tokens() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b"&Id &id-IDentifier"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_comment_two_lines() {
        let reader =
            std::io::BufReader::new(std::io::Cursor::new(b"Hello World!\n-- Some comment --\n"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 4, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_two_comments() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(
            b" -- Hello World!\n-- Some comment --\n",
        ));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_comment_no_trailing_newline() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b" -- Hello World!"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 1, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_keywords() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b" INTEGER ENUMERATED "));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
        assert!(tokens.iter().all(|t| t.is_keyword()));
    }

    #[test]
    fn tokenize_at_component_list() {
        let reader =
            std::io::BufReader::new(std::io::Cursor::new(b"@component.id-List @.another "));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_numbers() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b" 123456789 -123"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 2, "{:#?}", tokens);
        assert!(tokens.iter().all(|t| t.is_numeric()), "{:#?}", tokens);
    }

    #[test]
    fn tokenize_keyword_dot_andkeyword() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(
            b"ATTRIBUTE.&equality-match.&AssertionType",
        ));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 5, "{:#?}", tokens);
    }

    #[test]
    fn tokenize_range() {
        let reader = std::io::BufReader::new(std::io::Cursor::new(b" -123456789..-123"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        assert!(tokens.len() == 3, "{:#?}", tokens);
        assert!(tokens[0].is_numeric(), "{:#?}", tokens[0]);
        assert!(tokens[1].is_range_separator(), "{:#?}", tokens[1]);
        assert!(tokens[2].is_numeric(), "{:#?}", tokens[2]);
    }

    #[test]
    fn tokenize_bitstring() {
        struct BitHexStringTestCase<'t> {
            input: &'t [u8],
            success: bool,
            span_end_line: usize,
        }
        let test_cases = vec![
            BitHexStringTestCase {
                input: b"'010101'b",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'010101'",
                success: false,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'010101'h",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'01 0101'b",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'01 0101'h",
                success: true,
                span_end_line: 1,
            },
            BitHexStringTestCase {
                input: b"'01 0101\n\t0101\n00'h",
                success: true,
                span_end_line: 3,
            },
        ];
        for t in test_cases {
            let reader = std::io::BufReader::new(std::io::Cursor::new(t.input));
            let result = tokenize(reader);
            assert_eq!(result.is_ok(), t.success, "{:#?}", result);
            if result.is_ok() {
                let tokens = result.unwrap();
                assert!(tokens.len() == 1, "{:#?}", tokens[0]);
                let token = &tokens[0];
                assert!(
                    token.span.end.line == t.span_end_line,
                    "input: {:#?}, token end: {}, tc: span_end_line {}",
                    t.input,
                    token.span.end.line,
                    t.span_end_line
                );
            }
        }
    }

    #[test]
    fn tokenize_string() {
        struct TestTokenizeString<'t> {
            input: &'t [u8],
            len: usize,
            success: bool,
        }
        let test_cases = vec![
            TestTokenizeString {
                input: b"\"Foo Bar\n\tFoo-baz\"",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                input: b"\"",
                len: 1,
                success: false,
            },
            TestTokenizeString {
                input: b"\"\"",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                input: b"\"\"\"",
                len: 1,
                success: false,
            },
            TestTokenizeString {
                input: b"\"\"\"\" ",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                input: b"\"\"\"a\"\"x\"",
                len: 1,
                success: true,
            },
            TestTokenizeString {
                input: b"\"a\"..\"z\"",
                len: 3,
                success: true,
            },
        ];
        for test_case in test_cases {
            let reader = std::io::BufReader::new(std::io::Cursor::new(test_case.input));
            let result = tokenize(reader);
            assert_eq!(result.is_ok(), test_case.success, "{:#?}", result);
            if result.is_ok() {
                let tokens = result.unwrap();
                assert!(tokens.len() == test_case.len, "{:#?}", tokens);
            }
        }
    }

    #[test]
    fn tokenizer_test_object_class_reference() {
        let reader = std::io::BufReader::new(std::io::Cursor::new("SOME-OBJECT-CLASS"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let result = result.unwrap();
        assert!(result.len() == 1);

        assert!(result[0].is_object_class_reference());
    }

    #[test]
    fn tokenize_small_tokens() {
        struct SmallTokenTestCase<'t> {
            input: &'t [u8],
            count: usize,
            success: bool,
        }
        let test_cases = vec![
            SmallTokenTestCase {
                input: b"{{}}",
                count: 4,
                success: true,
            },
            SmallTokenTestCase {
                input: b"[[{}]}",
                count: 5,
                success: true,
            },
            SmallTokenTestCase {
                input: b"[[]]",
                count: 2,
                success: true,
            },
            SmallTokenTestCase {
                input: b"..{...}",
                count: 4,
                success: true,
            },
            SmallTokenTestCase {
                input: b":(::=)",
                count: 4,
                success: true,
            },
            SmallTokenTestCase {
                input: b": ::=",
                count: 2,
                success: true,
            },
            SmallTokenTestCase {
                input: b": :: ",
                count: 2,
                success: false,
            },
            SmallTokenTestCase {
                input: b".",
                count: 1,
                success: true,
            },
            SmallTokenTestCase {
                input: b"..",
                count: 1,
                success: true,
            },
            SmallTokenTestCase {
                input: b". ",
                count: 1,
                success: true,
            },
            SmallTokenTestCase {
                input: b". . .. ",
                count: 3,
                success: true,
            },
            SmallTokenTestCase {
                input: b"...",
                count: 1,
                success: true,
            },
        ];
        for test_case in test_cases {
            let reader = std::io::BufReader::new(std::io::Cursor::new(test_case.input));
            let result = tokenize(reader);
            assert_eq!(
                result.is_ok(),
                test_case.success,
                "{}",
                String::from_utf8(test_case.input.to_vec()).unwrap()
            );
            if result.is_ok() {
                let tokens = result.unwrap();
                assert!(tokens.len() == test_case.count, "{:#?}", tokens);
            }
        }
    }
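
    // An illustrative sketch, not part of the original suite: a minimal
    // end-to-end check that a simple ASN.1 type assignment tokenizes into
    // the expected sequence of token types.
    #[test]
    fn tokenize_simple_assignment() {
        let reader =
            std::io::BufReader::new(std::io::Cursor::new(b"Age ::= INTEGER (0..150)"));
        let result = tokenize(reader);
        assert!(result.is_ok());
        let tokens = result.unwrap();
        // Expected: `Age`, `::=`, `INTEGER`, `(`, `0`, `..`, `150`, `)`.
        assert_eq!(tokens.len(), 8, "{:#?}", tokens);
        assert!(tokens[0].is_type_reference());
        assert!(tokens[1].is_assignment());
        assert!(tokens[2].is_given_keyword("INTEGER"));
        assert!(tokens[5].is_range_separator());
    }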
}