excel_lib/lexer/mod.rs

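//! Byte-level lexer for Excel formula strings, built on nom combinators.
//! `Lexer::lex_tokens` turns a raw `&[u8]` into a `Vec<Token>` terminated by
//! `Token::EOF`.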
use nom::branch::*;
use nom::bytes::complete::{tag, take, take_while, take_while1};
use nom::character::complete::{alpha1, alphanumeric1, digit1, multispace0};
use nom::combinator::{map, map_res, recognize, opt};
use nom::multi::many0;
use nom::sequence::{terminated, delimited, separated_pair, pair};
use nom::*;

use std::str;
use std::str::FromStr;
use std::str::Utf8Error;

pub mod token;
use crate::lexer::token::*;
use crate::errors::Error;

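/// Generates a parser that matches a literal tag and maps it to a fixed token,
/// e.g. `syntax! {plus, "+", Token::Plus}` defines `fn plus` recognizing `+`.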
macro_rules! syntax {
    ($func_name: ident, $tag_string: literal, $output_token: expr) => {
        fn $func_name<'a>(s: &'a [u8]) -> IResult<&[u8], Token> {
            map(tag($tag_string), |_| $output_token)(s)
        }
    };
}

// Syntax
syntax! {null_err, "#NULL!", Token::Null}
syntax! {div_err, "#DIV/0!", Token::Div}
syntax! {value_err, "#VALUE!", Token::Value}
syntax! {ref_err, "#REF!", Token::Ref}
syntax! {name_err, "#NAME?", Token::Name} // Excel spells this error with a trailing '?'.
syntax! {num_err, "#NUM!", Token::Num}
syntax! {na_err, "#N/A", Token::NA}
syntax! {getting_data_err, "#GETTING_DATA", Token::GettingData}
syntax! {plus, "+", Token::Plus}
syntax! {minus, "-", Token::Minus}
syntax! {divide, "/", Token::Divide}
syntax! {multiply, "*", Token::Multiply}
syntax! {exponent, "^", Token::Exponent}
syntax! {ampersand, "&", Token::Ampersand}
syntax! {equal, "=", Token::Equal}
syntax! {comma, ",", Token::Comma}
syntax! {period, ".", Token::Period}
syntax! {colon, ":", Token::Colon}
syntax! {semicolon, ";", Token::SemiColon}
syntax! {langle, "<", Token::LAngle}
syntax! {rangle, ">", Token::RAngle}
syntax! {lparen, "(", Token::LParen}
syntax! {rparen, ")", Token::RParen}
syntax! {lbrace, "{", Token::LBrace}
syntax! {rbrace, "}", Token::RBrace}
syntax! {lbracket, "[", Token::LBracket}
syntax! {rbracket, "]", Token::RBracket}
syntax! {true_bool, "TRUE", Token::Boolean(true)}
syntax! {false_bool, "FALSE", Token::Boolean(false)}

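/// Lex any single-token syntax element. The parsers are grouped into nested
/// `alt`s because nom's `alt` only accepts a bounded tuple of parsers (21 in
/// recent versions) and there are more than that here.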
pub fn lex_syntax(input: &[u8]) -> IResult<&[u8], Token> {
    alt((
        alt((
            null_err,
            div_err,
            value_err,
            ref_err,
            name_err,
            num_err,
            na_err,
            getting_data_err
        )),
        alt((
            plus,
            minus,
            divide,
            multiply,
            exponent
        )),
        alt((
            ampersand,
            equal,
            comma,
            colon,
            period,
            semicolon,
            langle,
            rangle,
            lparen,
            rparen,
            lbrace,
            rbrace,
            lbracket,
            rbracket
        )),
        alt((
            true_bool,
            false_bool
        ))
    ))(input)
}

// String
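/// Consume the interior of a double-quoted string, one byte at a time:
/// a backslash escapes the next byte (the backslash itself is dropped), and an
/// unescaped `"` terminates the string *without* being consumed, so the outer
/// `delimited` parser in `string` can match the closing quote.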
fn pis(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
    let (i1, c1) = take(1usize)(input)?;
    match c1.as_bytes() {
        // Closing quote: stop here and leave it unconsumed for `delimited`.
        b"\"" => Ok((input, vec![])),
        // Escape: keep the byte that follows the backslash and keep going.
        b"\\" => {
            let (i2, c2) = take(1usize)(i1)?;
            pis(i2).map(|(slice, done)| (slice, concat_slice_vec(c2, done)))
        }
        // Any other byte is part of the string.
        c => pis(i1).map(|(slice, done)| (slice, concat_slice_vec(c, done))),
    }
}

fn concat_slice_vec(c: &[u8], done: Vec<u8>) -> Vec<u8> {
    let mut new_vec = c.to_vec();
    new_vec.extend(&done);
    new_vec
}

fn convert_vec_utf8(v: Vec<u8>) -> Result<String, Utf8Error> {
    let slice = v.as_slice();
    str::from_utf8(slice).map(|s| s.to_owned())
}

fn complete_byte_slice_str_from_utf8(c: &[u8]) -> Result<&str, Utf8Error> {
    str::from_utf8(c)
}

fn string(input: &[u8]) -> IResult<&[u8], String> {
    delimited(tag("\""), map_res(pis, convert_vec_utf8), tag("\""))(input)
}

fn lex_string(input: &[u8]) -> IResult<&[u8], Token> {
    map(string, Token::Text)(input)
}

// References
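/// A whole-column range such as `A:A` or `$A:$A`, captured verbatim.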
fn lex_vrange(input: &[u8]) -> IResult<&[u8], Token> {
    let vrange_token = recognize(separated_pair(
        pair(opt(tag("$")), alpha1),
        tag(":"),
        pair(opt(tag("$")), alpha1),
    ));
    map_res(
        vrange_token,
        |s| {
            let c = complete_byte_slice_str_from_utf8(s);
            c.map(|syntax| Token::VRange(syntax.to_string()))
        }
    )(input)
}

/// A whole-row range such as `1:1` or `$1:$1`, captured verbatim.
fn lex_hrange(input: &[u8]) -> IResult<&[u8], Token> {
    let hrange_token = recognize(separated_pair(
        pair(opt(tag("$")), digit1),
        tag(":"),
        pair(opt(tag("$")), digit1),
    ));
    map_res(
        hrange_token,
        |s| {
            let c = complete_byte_slice_str_from_utf8(s);
            c.map(|syntax| Token::HRange(syntax.to_string()))
        }
    )(input)
}

// Characters allowed in a single-quoted sheet name (quotes permit spaces and
// more punctuation than bare names do).
fn in_quote_sheet_name(chr: u8) -> bool {
    let is_special = b"`~@#$%^&-_=+{}|;,<.>() ".contains(&chr);
    is_digit_or_alpha(chr) || is_special
}

// Characters allowed in an unquoted sheet name.
fn in_sheet_name(chr: u8) -> bool {
    let is_special = b"`~@#$%^_{}|;.".contains(&chr);
    is_digit_or_alpha(chr) || is_special
}

fn is_digit(chr: u8) -> bool {
    chr.is_ascii_digit()
}

fn is_alpha(chr: u8) -> bool {
    chr.is_ascii_alphabetic()
}

fn is_digit_or_alpha(chr: u8) -> bool {
    is_digit(chr) || is_alpha(chr)
}

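/// A sheet name, either bare (`Sheet1`) or single-quoted (`'My Sheet'`);
/// the quoted form is recognized including its surrounding quotes.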
fn lex_sheet_name(input: &[u8]) -> IResult<&[u8], &[u8]> {
    alt((
        take_while1(in_sheet_name),
        recognize(delimited(tag("'"), take_while(in_quote_sheet_name), tag("'")))
    ))(input)
}

fn lex_sheet(input: &[u8]) -> IResult<&[u8], Token> {
    map_res(
        alt((
            terminated(lex_sheet_name, tag("\\!")), // Workaround for shell expansion of `!`.
            terminated(lex_sheet_name, tag("!")),
        )),
        |s| {
            let c = complete_byte_slice_str_from_utf8(s);
            // Strip the surrounding quotes from quoted sheet names.
            c.map(|syntax| Token::Sheet(syntax.replace('\'', "")))
        }
    )(input)
}

fn lex_multisheet(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        terminated(recognize(separated_pair(lex_sheet_name, tag(":"), lex_sheet_name)), tag("!")),
        |a| {
            // Sheet names are restricted to ASCII bytes, so this conversion cannot fail.
            let x = complete_byte_slice_str_from_utf8(a).unwrap();
            Token::MultiSheet(x.to_string())
        }
    )(input)
}

// A single cell reference such as `A1`, `$A1`, `A$1`, or `$A$1`.
fn lex_cell(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        recognize(pair(pair(opt(tag("$")), alpha1), pair(opt(tag("$")), digit1))),
        |c| {
            let s = complete_byte_slice_str_from_utf8(c).unwrap();
            Token::Cell(s.to_string())
        }
    )(input)
}

// A cell range such as `A1:B2`; formatting relies on `Token`'s `Display` impl.
fn lex_range(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        separated_pair(lex_cell, tag(":"), lex_cell),
        |(a, b)| {
            Token::Range(format!("{}:{}", a, b))
        }
    )(input)
}

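/// Try reference forms from most to least specific, so that e.g.
/// `Sheet1:Sheet2!` is lexed as one `MultiSheet` rather than a `Sheet`, and
/// `A1:B2` as one `Range` rather than two `Cell`s.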
fn lex_references(input: &[u8]) -> IResult<&[u8], Token> {
    alt((
        lex_multisheet,
        lex_sheet,
        lex_hrange,
        lex_vrange,
        lex_range,
        lex_cell
    ))(input)
}

// Numbers
fn complete_str_from_str<F: FromStr>(c: &str) -> Result<F, F::Err> {
    FromStr::from_str(c)
}

fn lex_int(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        map_res(
            map_res(digit1, complete_byte_slice_str_from_utf8),
            complete_str_from_str,
        ),
        Token::Integer,
    )(input)
}

fn lex_float(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        map_res(
            recognize(separated_pair(digit1, period, digit1)),
            complete_byte_slice_str_from_utf8
        ),
        |c: &str| {
            // `digit1 "." digit1` is always a valid f64 literal, so this cannot panic.
            Token::Float(c.parse::<f64>().unwrap())
        }
    )(input)
}

// Ident
fn lex_ident(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        map_res(
            map_res(alphanumeric1, complete_byte_slice_str_from_utf8),
            complete_str_from_str,
        ),
        Token::Ident,
    )(input)
}

// Tokens
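/// Lex a single token. Order matters: syntax and references are tried before
/// numbers and identifiers, and `lex_float` runs before `lex_int` so that
/// `12.30` becomes one `Float` instead of `Integer`, `Period`, `Integer`.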
fn lex_token(input: &[u8]) -> IResult<&[u8], Token> {
    alt((
        lex_syntax,
        lex_string,
        lex_references,
        lex_float,
        lex_int,
        lex_ident,
    ))(input)
}

fn lex_tokens(input: &[u8]) -> IResult<&[u8], Vec<Token>> {
    many0(delimited(multispace0, lex_token, multispace0))(input)
}

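/// Public entry point: lex a byte slice into a token stream terminated by
/// `Token::EOF`. Note that `many0` stops at the first byte it cannot lex, so
/// any unlexable trailing input is silently ignored rather than reported.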
pub struct Lexer;
impl Lexer {
    pub fn lex_tokens(bytes: &[u8]) -> Result<Vec<Token>, Error> {
        match lex_tokens(bytes) {
            Ok((_, mut tokens)) => {
                tokens.push(Token::EOF);
                Ok(tokens)
            }
            // Lossy conversion: report unlexable, possibly non-UTF-8 input
            // without panicking.
            _ => Err(Error::UnableToLex(String::from_utf8_lossy(bytes).into_owned())),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::errors::Error;

    fn lex(b: &[u8]) -> Result<Vec<Token>, Error> {
        Lexer::lex_tokens(b)
    }

    #[test]
    fn test_symbols() -> Result<(), Error> {
        assert_eq!(lex(b"=+(){},;")?, vec![
            Token::Equal,
            Token::Plus,
            Token::LParen,
            Token::RParen,
            Token::LBrace,
            Token::RBrace,
            Token::Comma,
            Token::SemiColon,
            Token::EOF,
        ]);
        Ok(())
    }

    #[test]
    fn test_strings() -> Result<(), Error> {
        assert_eq!(lex(b"\"this is a test\"")?, vec![
            Token::Text(String::from("this is a test")), Token::EOF
        ]);
        assert_eq!(lex(b"\"this\", \"is\" \"a\" \"test\"")?, vec![
            Token::Text(String::from("this")),
            Token::Comma,
            Token::Text(String::from("is")),
            Token::Text(String::from("a")),
            Token::Text(String::from("test")),
            Token::EOF,
        ]);
        Ok(())
    }

    #[test]
    fn test_numbers() -> Result<(), Error> {
        assert_eq!(lex(b"123")?, vec![
            Token::Integer(123), Token::EOF
        ]);
        assert_eq!(lex(b"0.05")?, vec![
            Token::Float(0.05), Token::EOF
        ]);
        assert_eq!(lex(b"12.30")?, vec![
            Token::Float(12.30), Token::EOF
        ]);
        Ok(())
    }

    #[test]
    fn test_errors() -> Result<(), Error> {
        assert_eq!(lex(b"#NUM!")?, vec![Token::Num, Token::EOF]);
        assert_eq!(lex(b"#DIV/0!")?, vec![Token::Div, Token::EOF]);
        assert_eq!(lex(b"#VALUE!")?, vec![Token::Value, Token::EOF]);
        assert_eq!(lex(b"#REF!")?, vec![Token::Ref, Token::EOF]);
378        assert_eq!(lex(b"#NAME!")?, vec![Token::Name, Token::EOF]); 
379        assert_eq!(lex(b"#N/A")?, vec![Token::NA, Token::EOF]); 
380        assert_eq!(lex(b"#GETTING_DATA")?, vec![Token::GettingData, Token::EOF]); 
381        Ok(())
382    }

    #[test]
    fn test_bool() -> Result<(), Error> {
        assert_eq!(lex(b"TRUE")?, vec![Token::Boolean(true), Token::EOF]);
        assert_eq!(lex(b"FALSE")?, vec![Token::Boolean(false), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_multisheet() -> Result<(), Error> {
        assert_eq!(lex(b"test:test!")?, vec![Token::MultiSheet(String::from("test:test")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_sheet() -> Result<(), Error> {
        assert_eq!(lex(b"'Test'!")?, vec![Token::Sheet(String::from("Test")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_vrange() -> Result<(), Error> {
        assert_eq!(lex(b"A:A")?, vec![Token::VRange(String::from("A:A")), Token::EOF]);
        assert_eq!(lex(b"$A:$A")?, vec![Token::VRange(String::from("$A:$A")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_hrange() -> Result<(), Error> {
        assert_eq!(lex(b"1:1")?, vec![Token::HRange(String::from("1:1")), Token::EOF]);
        assert_eq!(lex(b"$1:$1")?, vec![Token::HRange(String::from("$1:$1")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_range() -> Result<(), Error> {
        assert_eq!(lex(b"A1:A1")?, vec![Token::Range(String::from("A1:A1")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_cell() -> Result<(), Error> {
        assert_eq!(lex(b"A1")?, vec![Token::Cell(String::from("A1")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_ident() -> Result<(), Error> {
        assert_eq!(lex(b"test")?, vec![Token::Ident("test".to_string()), Token::EOF]);
        Ok(())
    }
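
    // Added example (not in the original suite): a small formula fragment that
    // exercises the ident, parenthesis, and range lexers together. The expected
    // token sequence follows directly from the parsers above.
    #[test]
    fn test_formula_fragment() -> Result<(), Error> {
        assert_eq!(lex(b"SUM(A1:B2)")?, vec![
            Token::Ident("SUM".to_string()),
            Token::LParen,
            Token::Range(String::from("A1:B2")),
            Token::RParen,
            Token::EOF,
        ]);
        Ok(())
    }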
}