use nom::branch::*;
use nom::bytes::complete::{tag, take, take_while, take_while1};
use nom::character::complete::{alpha1, alphanumeric1, digit1, multispace0};
use nom::combinator::{map, map_res, recognize, opt};
use nom::multi::many0;
use nom::sequence::{terminated, delimited, separated_pair, pair};
use nom::*;

use std::str;
use std::str::FromStr;
use std::str::Utf8Error;

pub mod token;
use crate::lexer::token::*;
use crate::errors::Error;

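/// Generates a parser that matches a fixed literal tag and maps it to the
/// given `Token` variant.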
macro_rules! syntax {
    ($func_name: ident, $tag_string: literal, $output_token: expr) => {
        fn $func_name<'a>(s: &'a [u8]) -> IResult<&[u8], Token> {
            map(tag($tag_string), |_| $output_token)(s)
        }
    };
}

syntax! {null_err, "#NULL!", Token::Null}
syntax! {div_err, "#DIV/0!", Token::Div}
syntax! {value_err, "#VALUE!", Token::Value}
syntax! {ref_err, "#REF!", Token::Ref}
syntax! {name_err, "#NAME!", Token::Name}
syntax! {num_err, "#NUM!", Token::Num}
syntax! {na_err, "#N/A", Token::NA}
syntax! {getting_data_err, "#GETTING_DATA", Token::GettingData}
syntax! {plus, "+", Token::Plus}
syntax! {minus, "-", Token::Minus}
syntax! {divide, "/", Token::Divide}
syntax! {multiply, "*", Token::Multiply}
syntax! {exponent, "^", Token::Exponent}
syntax! {ampersand, "&", Token::Ampersand}
syntax! {equal, "=", Token::Equal}
syntax! {comma, ",", Token::Comma}
syntax! {period, ".", Token::Period}
syntax! {colon, ":", Token::Colon}
syntax! {semicolon, ";", Token::SemiColon}
syntax! {langle, "<", Token::LAngle}
syntax! {rangle, ">", Token::RAngle}
syntax! {lparen, "(", Token::LParen}
syntax! {rparen, ")", Token::RParen}
syntax! {lbrace, "{", Token::LBrace}
syntax! {rbrace, "}", Token::RBrace}
syntax! {lbracket, "[", Token::LBracket}
syntax! {rbracket, "]", Token::RBracket}
syntax! {true_bool, "TRUE", Token::Boolean(true)}
syntax! {false_bool, "FALSE", Token::Boolean(false)}

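/// Any fixed-text token: error literals, operators, punctuation, and the
/// boolean literals. The nested `alt` groups keep each tuple within the
/// arity that `alt` accepts.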
pub fn lex_syntax(input: &[u8]) -> IResult<&[u8], Token> {
    alt((
        alt((
            null_err,
            div_err,
            value_err,
            ref_err,
            name_err,
            num_err,
            na_err,
            getting_data_err,
        )),
        alt((
            plus,
            minus,
            divide,
            multiply,
            exponent,
        )),
        alt((
            ampersand,
            equal,
            comma,
            colon,
            period,
            semicolon,
            langle,
            rangle,
            lparen,
            rparen,
            lbrace,
            rbrace,
            lbracket,
            rbracket,
        )),
        alt((
            true_bool,
            false_bool,
        )),
    ))(input)
}

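/// "Parse inner string": consumes the contents of a double-quoted string up
/// to (but not including) the closing quote, treating a backslash as an
/// escape for the byte that follows it.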
fn pis(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
    use std::result::Result::*;

    let (i1, c1) = take(1usize)(input)?;
    match c1.as_bytes() {
        b"\"" => Ok((input, vec![])),
        b"\\" => {
            let (i2, c2) = take(1usize)(i1)?;
            pis(i2).map(|(slice, done)| (slice, concat_slice_vec(c2, done)))
        }
        c => pis(i1).map(|(slice, done)| (slice, concat_slice_vec(c, done))),
    }
}

fn concat_slice_vec(c: &[u8], done: Vec<u8>) -> Vec<u8> {
    let mut new_vec = c.to_vec();
    new_vec.extend(&done);
    new_vec
}

fn convert_vec_utf8(v: Vec<u8>) -> Result<String, Utf8Error> {
    let slice = v.as_slice();
    str::from_utf8(slice).map(|s| s.to_owned())
}

fn complete_byte_slice_str_from_utf8(c: &[u8]) -> Result<&str, Utf8Error> {
    str::from_utf8(c)
}

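/// A complete double-quoted string literal; the surrounding quotes are
/// consumed and the unescaped contents are returned as UTF-8.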
fn string(input: &[u8]) -> IResult<&[u8], String> {
    delimited(tag("\""), map_res(pis, convert_vec_utf8), tag("\""))(input)
}

fn lex_string(input: &[u8]) -> IResult<&[u8], Token> {
    map(string, Token::Text)(input)
}

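/// A whole-column (vertical) range such as `A:A` or `$A:$C`.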
fn lex_vrange(input: &[u8]) -> IResult<&[u8], Token> {
    let vrange_token = recognize(separated_pair(
        pair(opt(tag("$")), alpha1),
        tag(":"),
        pair(opt(tag("$")), alpha1),
    ));
    map_res(vrange_token, |s| {
        let c = complete_byte_slice_str_from_utf8(s);
        c.map(|syntax| Token::VRange(syntax.to_string()))
    })(input)
}

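/// A whole-row (horizontal) range such as `1:1` or `$1:$3`.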
fn lex_hrange(input: &[u8]) -> IResult<&[u8], Token> {
    let hrange_token = recognize(separated_pair(
        pair(opt(tag("$")), digit1),
        tag(":"),
        pair(opt(tag("$")), digit1),
    ));
    map_res(hrange_token, |s| {
        let c = complete_byte_slice_str_from_utf8(s);
        c.map(|syntax| Token::HRange(syntax.to_string()))
    })(input)
}

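/// Bytes permitted inside a single-quoted sheet name (spaces and most
/// punctuation are allowed).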
fn in_quote_sheet_name(chr: u8) -> bool {
    let is_special = b"`~@#$%^&-_=+{}|;,<.>() ".contains(&chr);
    is_digit_or_alpha(chr) || is_special
}

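/// Bytes permitted in an unquoted sheet name (a narrower set than the
/// quoted form).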
fn in_sheet_name(chr: u8) -> bool {
    let is_special = b"`~@#$%^_{}|;.".contains(&chr);
    is_digit_or_alpha(chr) || is_special
}

fn is_digit(chr: u8) -> bool {
    (b'0'..=b'9').contains(&chr)
}

fn is_alpha(chr: u8) -> bool {
    (b'A'..=b'Z').contains(&chr) || (b'a'..=b'z').contains(&chr)
}

fn is_digit_or_alpha(chr: u8) -> bool {
    is_digit(chr) || is_alpha(chr)
}

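/// A sheet name, either bare (`Sheet1`) or single-quoted (`'My Sheet'`);
/// the quoted form is recognized with its quotes intact.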
fn lex_sheet_name(input: &[u8]) -> IResult<&[u8], &[u8]> {
    alt((
        take_while1(in_sheet_name),
        recognize(delimited(tag("'"), take_while(in_quote_sheet_name), tag("'"))),
    ))(input)
}

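/// A sheet reference terminated by `\!` or `!`; any single quotes are
/// stripped from the resulting token.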
fn lex_sheet(input: &[u8]) -> IResult<&[u8], Token> {
    map_res(
        alt((
            terminated(lex_sheet_name, tag("\\!")),
            terminated(lex_sheet_name, tag("!")),
        )),
        |s| {
            let c = complete_byte_slice_str_from_utf8(s);
            c.map(|syntax| Token::Sheet(syntax.replace('\'', "")))
        },
    )(input)
}

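/// A multi-sheet span such as `Sheet1:Sheet3!`, kept as a single token with
/// the trailing `!` dropped.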
fn lex_multisheet(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        terminated(
            recognize(separated_pair(lex_sheet_name, tag(":"), lex_sheet_name)),
            tag("!"),
        ),
        |a| {
            let x = complete_byte_slice_str_from_utf8(a).unwrap();
            Token::MultiSheet(x.to_string())
        },
    )(input)
}

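/// A single cell reference such as `A1`, `$A1`, or `$A$1`.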
fn lex_cell(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        recognize(pair(pair(opt(tag("$")), alpha1), pair(opt(tag("$")), digit1))),
        |c| {
            let s = complete_byte_slice_str_from_utf8(c).unwrap();
            Token::Cell(s.to_string())
        },
    )(input)
}

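/// A rectangular cell range such as `A1:B2`.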
fn lex_range(input: &[u8]) -> IResult<&[u8], Token> {
    map(separated_pair(lex_cell, tag(":"), lex_cell), |(a, b)| {
        Token::Range(format!("{}:{}", a, b))
    })(input)
}

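/// Any reference form. More specific alternatives come first so that, for
/// example, `A1:A1` lexes as a range rather than as a lone cell.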
fn lex_references(input: &[u8]) -> IResult<&[u8], Token> {
    alt((
        lex_multisheet,
        lex_sheet,
        lex_hrange,
        lex_vrange,
        lex_range,
        lex_cell,
    ))(input)
}

fn complete_str_from_str<F: FromStr>(c: &str) -> Result<F, F::Err> {
    FromStr::from_str(c)
}

fn lex_int(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        map_res(
            map_res(digit1, complete_byte_slice_str_from_utf8),
            complete_str_from_str,
        ),
        Token::Integer,
    )(input)
}

fn lex_float(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        map_res(
            recognize(separated_pair(digit1, period, digit1)),
            complete_byte_slice_str_from_utf8,
        ),
        |c: &str| Token::Float(c.parse::<f64>().unwrap()),
    )(input)
}

fn lex_ident(input: &[u8]) -> IResult<&[u8], Token> {
    map(
        map_res(
            map_res(alphanumeric1, complete_byte_slice_str_from_utf8),
            complete_str_from_str,
        ),
        Token::Ident,
    )(input)
}

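/// A single token. Fixed syntax is tried first, then strings, references,
/// numbers (floats before integers), and finally bare identifiers.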
fn lex_token(input: &[u8]) -> IResult<&[u8], Token> {
    alt((
        lex_syntax,
        lex_string,
        lex_references,
        lex_float,
        lex_int,
        lex_ident,
    ))(input)
}

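/// A whitespace-separated stream of tokens.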
fn lex_tokens(input: &[u8]) -> IResult<&[u8], Vec<Token>> {
    many0(delimited(multispace0, lex_token, multispace0))(input)
}

pub struct Lexer;
impl Lexer {
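    /// Lexes the input into a token stream terminated by `Token::EOF`, or
    /// returns `Error::UnableToLex` with the offending input.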
    pub fn lex_tokens(bytes: &[u8]) -> Result<Vec<Token>, Error> {
        match lex_tokens(bytes) {
            Ok((_, mut tokens)) => {
                tokens.push(Token::EOF);
                Ok(tokens)
            }
            _ => Err(Error::UnableToLex(
                String::from_utf8_lossy(bytes).into_owned(),
            )),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::errors::Error;

    fn lex(b: &[u8]) -> Result<Vec<Token>, Error> {
        Lexer::lex_tokens(b)
    }

    #[test]
    fn test_symbols() -> Result<(), Error> {
        assert_eq!(lex(b"=+(){},;")?, vec![
            Token::Equal,
            Token::Plus,
            Token::LParen,
            Token::RParen,
            Token::LBrace,
            Token::RBrace,
            Token::Comma,
            Token::SemiColon,
            Token::EOF,
        ]);
        Ok(())
    }

    #[test]
    fn test_strings() -> Result<(), Error> {
        assert_eq!(lex(b"\"this is a test\"")?, vec![
            Token::Text(String::from("this is a test")),
            Token::EOF,
        ]);
        assert_eq!(lex(b"\"this\", \"is\" \"a\" \"test\"")?, vec![
            Token::Text(String::from("this")),
            Token::Comma,
            Token::Text(String::from("is")),
            Token::Text(String::from("a")),
            Token::Text(String::from("test")),
            Token::EOF,
        ]);
        Ok(())
    }

    #[test]
    fn test_ints() -> Result<(), Error> {
        assert_eq!(lex(b"123")?, vec![Token::Integer(123), Token::EOF]);
        assert_eq!(lex(b"0.05")?, vec![Token::Float(0.05), Token::EOF]);
        assert_eq!(lex(b"12.30")?, vec![Token::Float(12.30), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_errors() -> Result<(), Error> {
        assert_eq!(lex(b"#NUM!")?, vec![Token::Num, Token::EOF]);
        assert_eq!(lex(b"#DIV/0!")?, vec![Token::Div, Token::EOF]);
        assert_eq!(lex(b"#VALUE!")?, vec![Token::Value, Token::EOF]);
        assert_eq!(lex(b"#REF!")?, vec![Token::Ref, Token::EOF]);
        assert_eq!(lex(b"#NAME!")?, vec![Token::Name, Token::EOF]);
        assert_eq!(lex(b"#N/A")?, vec![Token::NA, Token::EOF]);
        assert_eq!(lex(b"#GETTING_DATA")?, vec![Token::GettingData, Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_bool() -> Result<(), Error> {
        assert_eq!(lex(b"TRUE")?, vec![Token::Boolean(true), Token::EOF]);
        assert_eq!(lex(b"FALSE")?, vec![Token::Boolean(false), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_multisheet() -> Result<(), Error> {
        assert_eq!(lex(b"test:test!")?, vec![
            Token::MultiSheet(String::from("test:test")),
            Token::EOF,
        ]);
        Ok(())
    }

    #[test]
    fn test_sheet() -> Result<(), Error> {
        assert_eq!(lex(b"'Test'!")?, vec![Token::Sheet(String::from("Test")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_vrange() -> Result<(), Error> {
        assert_eq!(lex(b"A:A")?, vec![Token::VRange(String::from("A:A")), Token::EOF]);
        assert_eq!(lex(b"$A:$A")?, vec![Token::VRange(String::from("$A:$A")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_hrange() -> Result<(), Error> {
        assert_eq!(lex(b"1:1")?, vec![Token::HRange(String::from("1:1")), Token::EOF]);
        assert_eq!(lex(b"$1:$1")?, vec![Token::HRange(String::from("$1:$1")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_range() -> Result<(), Error> {
        assert_eq!(lex(b"A1:A1")?, vec![Token::Range(String::from("A1:A1")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_cell() -> Result<(), Error> {
        assert_eq!(lex(b"A1")?, vec![Token::Cell(String::from("A1")), Token::EOF]);
        Ok(())
    }

    #[test]
    fn test_ident() -> Result<(), Error> {
        assert_eq!(lex(b"test")?, vec![Token::Ident("test".to_string()), Token::EOF]);
        Ok(())
    }
}