// b2c2_tokenizer/lib.rs

// b2c2-tokenizer crate
// author: Leonardone @ NEETSDKASU
3
4use b2c2_common::*;
5use std::io::{self, BufRead};
6use std::result;
7
8#[cfg(test)]
9mod test;
10
11type Result = result::Result<(usize, Vec<(usize, Token)>), SyntaxError>;
12
/// Line-oriented tokenizer that wraps a reader.
///
/// When `R` implements `BufRead`, the tokenizer is an iterator that yields one
/// result per line read from the reader.
pub struct Tokenizer<R> {
    /// Number of the line most recently requested; starts at 0 and is
    /// incremented on every call to `next`.
    line_number: usize,
    /// Source the lines are read from.
    reader: R,
}
17
18impl<R> Tokenizer<R> {
19    pub fn new(reader: R) -> Self {
20        Self {
21            line_number: 0,
22            reader,
23        }
24    }
25}
26
27impl<R: BufRead> Iterator for Tokenizer<R> {
28    type Item = io::Result<Result>;
29    fn next(&mut self) -> Option<Self::Item> {
30        self.line_number += 1;
31        let mut line = String::new();
32        match self.reader.read_line(&mut line) {
33            Err(error) => Some(Err(error)),
34            Ok(0) => None,
35            Ok(_) => Some(Ok(self.parse_line(&line))),
36        }
37    }
38}
39
40impl<R> Tokenizer<R> {
41    fn parse_line(&self, src: &str) -> Result {
42        let mut line = src.trim_start();
43        let mut ret: Vec<(usize, Token)> = vec![];
44        while !line.is_empty() && !is_comment(line, ret.is_empty()) {
45            let pos = src.len() - line.len() + 1;
46            match take_token(line) {
47                Some((Token::Keyword(Keyword::Mid), rest)) if !ret.is_empty() => {
48                    ret.push((pos, Token::Function(Function::Mid)));
49                    line = rest.trim_start();
50                }
51                Some((token, rest)) => {
52                    ret.push((pos, token));
53                    line = rest.trim_start();
54                }
55                None => {
56                    return Err(SyntaxError::new(
57                        self.line_number,
58                        pos,
59                        format!("不正なトークンです: {}", line),
60                    ))
61                }
62            }
63        }
64        Ok((self.line_number, ret))
65    }
66}
67
68fn is_comment(line: &str, is_toplevel: bool) -> bool {
69    line.starts_with('\'')
70        || (is_toplevel
71            && take_word(line)
72                .filter(|(word, _)| "Rem".eq_ignore_ascii_case(word))
73                .is_some())
74}
75
76fn take_token(line: &str) -> Option<(Token, &str)> {
77    [
78        take_word_token,
79        take_hex_integer_token,
80        take_integer_token,
81        take_char_token,
82        take_string_token,
83        take_operator_token,
84    ]
85    .iter()
86    .find_map(|f| f(line))
87}
88
89fn take_hex_integer_token(s: &str) -> Option<(Token, &str)> {
90    let mut char_indices = s.char_indices();
91    char_indices.next().filter(|(_, ch)| *ch == '&')?;
92    char_indices
93        .next()
94        .filter(|(_, ch)| 'H'.eq_ignore_ascii_case(ch))?;
95    let mut char_indices = char_indices.peekable();
96    let &(prefix_position, _) = char_indices.peek()?;
97    let split_position = char_indices
98        .find(|(_, ch)| !ch.is_ascii_hexdigit())
99        .map_or(s.len(), |(p, _)| p);
100    let (num, rest) = s.split_at(split_position);
101    let (_prefix, num) = num.split_at(prefix_position);
102    u16::from_str_radix(num, 16)
103        .ok()
104        .map(|n| (Token::Integer(n as i16 as i32), rest))
105}
106
107fn take_integer_token(s: &str) -> Option<(Token, &str)> {
108    let split_position = s
109        .char_indices()
110        .find(|(_, ch)| !ch.is_ascii_digit())
111        .map_or(s.len(), |(p, _)| p);
112    let (number, rest) = s.split_at(split_position);
113    number
114        .parse::<i32>()
115        .ok()
116        .filter(|n| *n <= (i16::MIN as i32).abs())
117        .map(|n| (Token::Integer(n), rest))
118}
119
120fn take_char_token(s: &str) -> Option<(Token, &str)> {
121    let mut char_indices = s.char_indices();
122    char_indices.next().filter(|(_, ch)| *ch == '"')?;
123    let mut quotation = false;
124    let mut split_position = s.len();
125    let mut text: Option<char> = None;
126    for (p, ch) in char_indices {
127        if quotation {
128            if ch == '"' {
129                if text.is_some() {
130                    return None;
131                }
132                quotation = false;
133                text = Some('"');
134            } else if text.is_some() {
135                split_position = p;
136                break;
137            } else {
138                return None;
139            }
140        } else if ch == '"' {
141            quotation = true;
142        } else if text.is_none() {
143            text = Some(ch);
144        } else {
145            return None;
146        }
147    }
148    if !quotation {
149        return None;
150    }
151    let ch = text.take()?;
152    let (_, rest) = s.split_at(split_position);
153    let (suffix, rest) = take_word(rest)?;
154    if !"c".eq_ignore_ascii_case(suffix) {
155        return None;
156    }
157    Some((Token::Character(ch), rest))
158}
159
160fn take_string_token(s: &str) -> Option<(Token, &str)> {
161    let mut char_indices = s.char_indices();
162    char_indices.next().filter(|(_, ch)| *ch == '"')?;
163    let mut quotation = false;
164    let mut split_position = s.len();
165    let mut text = String::new();
166    for (p, ch) in char_indices {
167        if quotation {
168            if ch == '"' {
169                quotation = false;
170                text.push('"');
171            } else {
172                split_position = p;
173                break;
174            }
175        } else if ch == '"' {
176            quotation = true;
177        } else {
178            text.push(ch);
179        }
180    }
181    if !quotation || text.chars().count() > 256 {
182        return None;
183    }
184    let (_, rest) = s.split_at(split_position);
185    Some((Token::String(text), rest))
186}
187
188fn take_operator_token(s: &str) -> Option<(Token, &str)> {
189    let mut char_indices = s.char_indices().take(5);
190    char_indices
191        .next()
192        .filter(|(_, ch)| !ch.is_ascii_alphanumeric())?;
193    char_indices
194        .chain(vec![(s.len(), '\n')])
195        .filter_map(|(p, _)| {
196            let (word, rest) = s.split_at(p);
197            Operator::parse(word).map(|token| (token, rest))
198        })
199        .last()
200}
201
/// Splits a leading word off `s`.
///
/// A word starts with an ASCII letter and continues with ASCII letters, digits
/// or underscores. Returns the word and the remaining input, or `None` when
/// `s` does not start with an ASCII letter or the word is longer than 30
/// characters.
fn take_word(s: &str) -> Option<(&str, &str)> {
    const MAX_WORD_LEN: usize = 30;

    if !s.starts_with(|ch: char| ch.is_ascii_alphabetic()) {
        return None;
    }
    let end = s
        .find(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_'))
        .unwrap_or(s.len());
    // The word is pure ASCII, so its byte length equals its character count.
    if end > MAX_WORD_LEN {
        return None;
    }
    Some(s.split_at(end))
}
212
213fn parse_boolean(token: &str) -> Option<Token> {
214    token
215        .to_ascii_lowercase()
216        .parse::<bool>()
217        .map(Token::Boolean)
218        .ok()
219}
220
221fn take_word_token(s: &str) -> Option<(Token, &str)> {
222    take_word(s).and_then(|(word, rest)| {
223        [
224            parse_boolean,
225            Keyword::parse,
226            TypeName::parse,
227            Function::parse,
228            Operator::parse,
229        ]
230        .iter()
231        .find_map(|f| f(word))
232        .or_else(|| Some(Token::Name(word.into())))
233        .map(|token| (token, rest))
234    })
235}
236
237#[derive(PartialEq, Eq, Clone, Debug)]
238pub enum Token {
239    Name(String),
240    Keyword(Keyword),
241    Function(Function),
242    TypeName(TypeName),
243    Operator(Operator),
244    String(String),
245    Integer(i32),
246    Boolean(bool),
247    Character(char),
248}
249
// Defines a token-category enum together with its lookup table and
// conversions.
//
// Invocation forms:
//   enumdef!(Name; TABLE; A, B, ...);   also generates `to_str`, which returns
//                                       each variant's own identifier.
//   enumdef!(Name; TABLE;; A, B, ...);  the caller supplies `to_str` itself.
//
// Both forms generate: the enum, a static `TABLE` of all variants in
// declaration order, a case-insensitive `TryFrom<&str>`, `From<Name> for
// Token` (wrapping in the `Token` variant of the same name), a private
// `parse` returning `Option<Token>`, and `Display` (the `to_str` text).
macro_rules! enumdef {
    // Counting helper: expands to the number of identifiers given.
    ($v:ident,) => (1);
    ($v:ident, $($vs:ident,)*) => (1 + enumdef!($($vs,)*));
    // Form without a generated `to_str`.
    ($name:ident; $array:ident;; $($value:ident,)* ) => {
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        pub enum $name {
            $($value,)*
        }

        static $array: [$name; enumdef!($($value,)*)] = [
            $($name::$value,)*
        ];

        impl std::convert::TryFrom<&str> for $name {
            type Error = ();

            fn try_from(token: &str) -> std::result::Result<Self, Self::Error> {
                // First variant whose text equals `token`, ignoring ASCII case.
                for candidate in $array.iter() {
                    if candidate.to_str().eq_ignore_ascii_case(token) {
                        return Ok(*candidate);
                    }
                }
                Err(())
            }
        }

        impl From<$name> for Token {
            fn from(v: $name) -> Token {
                Token::$name(v)
            }
        }

        impl $name {
            fn parse(token: &str) -> Option<Token> {
                // Fully qualified so that no `TryFrom` import is needed.
                <$name as std::convert::TryFrom<&str>>::try_from(token)
                    .ok()
                    .map(Token::from)
            }
        }

        impl std::fmt::Display for $name {
            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
                // Delegates to `str`'s `Display`, so width and padding apply.
                std::fmt::Display::fmt(self.to_str(), f)
            }
        }
    };
    // Form that also generates `to_str` from the variant identifiers.
    ($name:ident; $array:ident; $($value:ident,)* ) => {
        enumdef!($name; $array;; $($value,)*);

        impl $name {
            fn to_str(self) -> &'static str {
                match self {
                    $($name::$value => stringify!($value),)*
                }
            }
        }
    };
}
299
300enumdef!(
301    Keyword;
302    KEYWORDS;
303    Argument,
304    As,
305    ByRef,
306    ByVal,
307    Call,
308    Case,
309    Continue,
310    Else,
311    ElseIf,
312    End,
313    Exit,
314    Extern,
315    Dim,
316    Do,
317    Fill,
318    For,
319    From,
320    If,
321    Input,
322    Loop,
323    Mid,
324    Next,
325    Option,
326    Print,
327    Rem,
328    Select,
329    Step,
330    Sub,
331    Then,
332    To,
333    Until,
334    While,
335    With,
336);
337
338enumdef!(
339    Function;
340    FUNCTIONS;
341    Abs,
342    Array,
343    Asc,
344    CArray,
345    CBool,
346    Chr,
347    CInt,
348    CStr,
349    Eof,
350    Len,
351    Max,
352    Mid,
353    Min,
354    Space,
355    String,
356    SubArray,
357);
358
359enumdef!(
360    TypeName;
361    TYPE_NAMES;
362    Boolean,
363    Integer,
364    String,
365);
366
367enumdef!(
368    Operator;
369    OPERATORS;;
370    Mod,
371    Not,
372    And,
373    Xor,
374    Or,
375    NotEqual,
376    LessOrEequal,
377    GreaterOrEqual,
378    ShiftLeftArithmetic,
379    ShiftRightArithmetic,
380    ShiftLeftLogical,
381    ShiftRightLogical,
382    AddInto,
383    SubInto,
384    Equal,
385    LessThan,
386    GreaterThan,
387    Add,
388    Sub,
389    Mul,
390    Div,
391    Concat,
392    OpenBracket,
393    CloseBracket,
394    Comma,
395);
396
397impl Operator {
398    fn to_str(self) -> &'static str {
399        use Operator::*;
400        match self {
401            Mod => "Mod",
402            Not => "Not",
403            And => "And",
404            Xor => "Xor",
405            Or => "Or",
406            NotEqual => "<>",
407            LessOrEequal => "<=",
408            GreaterOrEqual => ">=",
409            ShiftLeftArithmetic => "<<",
410            ShiftRightArithmetic => ">>",
411            ShiftLeftLogical => "<<<",
412            ShiftRightLogical => ">>>",
413            AddInto => "+=",
414            SubInto => "-=",
415            Equal => "=",
416            LessThan => "<",
417            GreaterThan => ">",
418            Add => "+",
419            Sub => "-",
420            Mul => "*",
421            Div => "\\",
422            Concat => "&",
423            OpenBracket => "(",
424            CloseBracket => ")",
425            Comma => ",",
426        }
427    }
428}