Skip to main content

zql_cli/core/
lexer.rs

1use crate::error::{EngineError, MyError, MyResult};
2use crate::util::regex::get_match;
3use crate::{regex, regex_insensitive};
4use itertools::Itertools;
5use regex::{Regex, RegexBuilder};
6use std::cmp::{min, Ordering};
7use std::collections::VecDeque;
8use std::io::{BufRead, BufReader, Lines, Read};
9
/// Lexical event found while scanning a line.
///
/// Variants that carry an offset store it relative to the tail of the line
/// that was searched; `Delimiter` additionally stores the match length.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
    Initial,
    Delimiter(usize, usize),
    CommentLine(usize),
    BlockBegin(usize),
    BlockMiddle,
    BlockEnd(usize),
    QuoteBegin(usize),
    QuoteEnd(usize),
}

impl State {
    /// Sort key `(offset, length, variant rank)` used to order states.
    ///
    /// Variants without an offset or length use `usize::MAX` in that slot,
    /// so they sort after any variant that has a real value there.
    fn index(&self) -> (usize, usize, usize) {
        const NONE: usize = usize::MAX;
        match *self {
            State::Initial => (NONE, NONE, 0),
            State::Delimiter(offset, length) => (offset, length, 1),
            State::CommentLine(offset) => (offset, NONE, 2),
            State::BlockBegin(offset) => (offset, NONE, 3),
            State::BlockMiddle => (NONE, NONE, 4),
            State::BlockEnd(offset) => (offset, NONE, 5),
            State::QuoteBegin(offset) => (offset, 1, 6),
            State::QuoteEnd(offset) => (offset, 1, 7),
        }
    }
}

impl Ord for State {
    /// Orders states lexicographically by their `index()` key.
    fn cmp(&self, other: &Self) -> Ordering {
        let (lhs, rhs) = (self.index(), other.index());
        lhs.cmp(&rhs)
    }
}

impl PartialOrd for State {
    /// Total order, delegating to `Ord::cmp`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
48
/// Tracks whether the lexer should publish one empty placeholder statement
/// when the input ends without any statement having been published.
///
/// `Lexer::new` starts in `Unknown` for test/debug builds and in `Disabled`
/// for all other builds.
#[derive(PartialEq)]
enum Debug {
    Unknown, // published nothing (batch missing): no delimiter match seen yet
    Enabled, // published nothing (batch found): a delimiter matched, placeholder still owed
    Disabled, // published something: a statement or the placeholder was returned
}
55
/// Splits text read from `R` into statements, yielded one at a time through
/// the `Iterator` implementation.
pub struct Lexer<R: Read> {
    // Line iterator over the buffered input.
    reader: Lines<BufReader<R>>,
    // Queue of trimmed text fragments; the flag marks a fragment that ends a statement.
    segments: VecDeque<(String, bool)>,
    // Compiled, case-insensitive pattern that recognises the statement delimiter.
    batch: Regex,
    // Scanner state, carried from one line to the next (e.g. inside a block comment).
    state: State,
    // Controls the empty placeholder statement (see `Debug`).
    debug: Debug,
    // When true, a blank line terminates the current statement.
    interact: bool,
}
64
65impl<R: Read> Lexer<R> {
66    pub fn new(reader: R, batch: &str, interact: bool) -> MyResult<Self> {
67        let reader = BufReader::new(reader);
68        let reader = reader.lines();
69        let segments = VecDeque::new();
70        let batch = Self::build_regex(batch)?;
71        let state = State::Initial;
72        #[cfg(any(test, debug_assertions))]
73        let debug = Debug::Unknown;
74        #[cfg(not(any(test, debug_assertions)))]
75        let debug = Debug::Disabled;
76        let lexer = Self { reader, segments, batch, state, debug, interact };
77        Ok(lexer)
78    }
79
80    fn build_regex(batch: &str) -> MyResult<Regex> {
81        let starting = regex!(r"^\w");
82        let ending = regex!(r"\w$");
83        let batch = batch.trim();
84        let prefix = if starting.is_match(batch) { r"\b" } else { "" };
85        let suffix = if ending.is_match(batch) { r"\b" } else { "" };
86        let batch = format!("{}{}{}", prefix, batch, suffix);
87        let regex = RegexBuilder::new(&batch).case_insensitive(true).build()?;
88        Ok(regex)
89    }
90
91    fn read_next(&mut self) -> MyResult<Option<String>> {
92        let line = self.read_line()?;
93        if let Some(line) = line {
94            self.debug = Debug::Disabled;
95            Ok(Some(line))
96        } else if self.debug == Debug::Enabled {
97            self.debug = Debug::Disabled;
98            Ok(Some(String::new()))
99        } else {
100            Ok(None)
101        }
102    }
103
104    fn read_line(&mut self) -> MyResult<Option<String>> {
105        if let Some(joined) = self.join_terminated() {
106            return Ok(Some(joined));
107        }
108        while let Some(line) = self.reader.next() {
109            let line = line?;
110            let term = self.parse_line(line)?;
111            if term {
112                if let Some(joined) = self.join_terminated() {
113                    return Ok(Some(joined));
114                }
115            }
116        }
117        if let Some(joined) = self.join_remaining() {
118            return Ok(Some(joined));
119        }
120        Ok(None)
121    }
122
123    fn parse_line(&mut self, line: String) -> MyResult<bool> {
124        let regex = regex_insensitive!(r"^DELIMITER\s+(\S+)");
125        let line = line.trim();
126        if self.interact && line.is_empty() {
127            self.push_segment("", true);
128            Ok(true)
129        } else if let Some(captures) = regex.captures(line) {
130            let batch = get_match(&captures, 1);
131            self.batch = Self::build_regex(batch)?;
132            Ok(true)
133        } else {
134            self.parse_segments(line)
135        }
136    }
137
138    fn parse_segments(&mut self, line: &str) -> MyResult<bool> {
139        let mut start = 0;
140        let mut stop = 0;
141        let mut term = false;
142        while self.advance_state(line, stop) {
143            match self.state {
144                State::Initial => {
145                    self.push_segment(&line[start..], false);
146                    break;
147                }
148                State::Delimiter(offset, length) => {
149                    stop += offset;
150                    self.push_segment(&line[start..stop], true);
151                    stop += length;
152                    start = stop;
153                    term = true;
154                }
155                State::CommentLine(offset) => {
156                    stop += offset;
157                    self.push_segment(&line[start..stop], false);
158                    break;
159                }
160                State::BlockBegin(offset) => {
161                    stop += offset;
162                    self.push_segment(&line[start..stop], false);
163                    stop += 2;
164                    start = stop;
165                }
166                State::BlockMiddle => {
167                    break;
168                }
169                State::BlockEnd(offset) => {
170                    stop += offset + 2;
171                    start = stop;
172                }
173                State::QuoteBegin(offset) => {
174                    stop += offset + 1;
175                }
176                State::QuoteEnd(offset) => {
177                    stop += offset + 1;
178                }
179            }
180        }
181        if let State::QuoteBegin(_) = self.state {
182            Err(MyError::Engine(EngineError::UnmatchedQuote))
183        } else {
184            Ok(term)
185        }
186    }
187
188    fn advance_state(&mut self, line: &str, stop: usize) -> bool {
189        if stop <= line.len() {
190            let tail = &line[stop..];
191            match self.state {
192                State::BlockBegin(_) | State::BlockMiddle => {
193                    self.state = State::BlockMiddle;
194                    if let Some(offset) = tail.find("*/") {
195                        self.state = min(self.state, State::BlockEnd(offset));
196                    }
197                }
198                State::QuoteBegin(_) => {
199                    self.state = State::QuoteBegin(tail.len());
200                    if let Some(offset) = tail.find('\'') {
201                        self.state = min(self.state, State::QuoteEnd(offset));
202                    }
203                }
204                _ => {
205                    self.state = State::Initial;
206                    if let Some(offset) = tail.find("--") {
207                        self.state = min(self.state, State::CommentLine(offset));
208                    }
209                    if let Some(offset) = tail.find("/*") {
210                        self.state = min(self.state, State::BlockBegin(offset));
211                    }
212                    if let Some(offset) = tail.find('\'') {
213                        self.state = min(self.state, State::QuoteBegin(offset));
214                    }
215                    if let Some(delim) = self.batch.find(tail) {
216                        self.state = min(self.state, State::Delimiter(delim.start(), delim.len()));
217                        if self.debug == Debug::Unknown {
218                            self.debug = Debug::Enabled;
219                        }
220                    }
221                }
222            }
223            return true;
224        }
225        false
226    }
227
228    fn push_segment(&mut self, segment: &str, term: bool) {
229        let segment = segment.trim().to_string();
230        if !segment.is_empty() || term {
231            self.segments.push_back((segment, term));
232        }
233    }
234
235    fn join_terminated(&mut self) -> Option<String> {
236        while let Some((index, _)) = self.segments.iter().find_position(|(_, term)| *term) {
237            let segments = self.segments
238                .drain(0..=index)
239                .map(|(seg, _)| seg)
240                .filter(|seg| !seg.is_empty())
241                .join(" ");
242            if !segments.is_empty() {
243                return Some(segments);
244            }
245        }
246        None
247    }
248
249    fn join_remaining(&mut self) -> Option<String> {
250        let segments = self.segments
251            .drain(..)
252            .map(|(seg, _)| seg)
253            .filter(|seg| !seg.is_empty())
254            .join(" ");
255        if !segments.is_empty() {
256            return Some(segments);
257        }
258        None
259    }
260}
261
262impl<R: Read> Iterator for Lexer<R> {
263    type Item = MyResult<String>;
264
265    fn next(&mut self) -> Option<Self::Item> {
266        self.read_next().transpose()
267    }
268}
269
// Unit tests for the lexer. Each test feeds a small script through
// `split_contents` and compares the statements it yields with the expected list.
#[cfg(test)]
mod tests {
    use crate::core::lexer::Lexer;
    use crate::error::MyResult;
    use crate::str_vec;
    use pretty_assertions::assert_eq;

    // Empty input yields no statements.
    #[test]
    fn test_empty_file_is_parsed() -> MyResult<()> {
        let result = split_contents("", ";", false)?;
        assert_eq!(result.is_empty(), true);
        Ok(())
    }

    // A lone delimiter yields one empty statement (the placeholder emitted in test builds).
    #[test]
    fn test_single_delimiter_is_parsed() -> MyResult<()> {
        let expected = str_vec![""];
        let result = split_contents(";", ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // Unterminated text at end of input is still returned, with lines joined by spaces.
    #[test]
    fn test_one_statement_no_terminating_semicolon_is_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB CCC DDD EEE FFF",
        ];
        let contents = "\
AAA BBB CCC
DDD EEE FFF
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // A trailing delimiter is removed from the statement.
    #[test]
    fn test_one_statement_with_terminating_semicolon_is_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB CCC DDD EEE FFF",
        ];
        let contents = "\
AAA BBB CCC
DDD EEE FFF;
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // The delimiter separates statements; the last one may be unterminated.
    #[test]
    fn test_two_statements_with_separating_semicolon_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB CCC DDD EEE FFF",
            "GGG HHH III JJJ KKK LLL",
        ];
        let contents = "\
AAA BBB CCC
DDD EEE FFF;
GGG HHH III
JJJ KKK LLL
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // Both statements terminated: no trailing empty statement is produced.
    #[test]
    fn test_two_statements_with_terminating_semicolon_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB CCC DDD EEE FFF",
            "GGG HHH III JJJ KKK LLL",
        ];
        let contents = "\
AAA BBB CCC
DDD EEE FFF;
GGG HHH III
JJJ KKK LLL;
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // In batch mode whitespace-only lines (written as `__`) do not split statements.
    #[test]
    fn test_statements_with_separating_newline_are_parsed_in_batch_mode() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB CCC DDD EEE FFF GGG HHH III JJJ KKK LLL",
        ];
        let contents = "\
__
AAA BBB CCC
DDD EEE FFF
__
GGG HHH III
JJJ KKK LLL
__
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // In interactive mode a whitespace-only line terminates the current statement.
    #[test]
    fn test_statements_with_separating_newline_are_parsed_in_interactive_mode() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB CCC DDD EEE FFF",
            "GGG HHH III JJJ KKK LLL",
        ];
        let contents = "\
__
AAA BBB CCC
DDD EEE FFF
__
GGG HHH III
JJJ KKK LLL
__
";
        let result = split_contents(contents, ";", true)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // `--` comments are stripped, whether they take a whole line or trail the text.
    #[test]
    fn test_statements_with_unquoted_line_comments_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB CCC DDD EEE FFF GGG HHH III JJJ KKK LLL",
        ];
        let contents = "\
--
AAA BBB CCC -- WWW
DDD EEE FFF -- XXX
--
GGG HHH III -- YYY
JJJ KKK LLL -- ZZZ
--
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // `/* ... */` comments are stripped, both inline and when spanning several lines.
    #[test]
    fn test_statements_with_unquoted_block_comments_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA CCC DDD EEE KKK LLL",
        ];
        let contents = "\
AAA /* BBB */ CCC -- WWW
DDD EEE /* FFF -- XXX
--
GGG HHH III -- YYY
JJJ */ KKK LLL -- ZZZ
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // `--` inside single quotes is literal text, not a comment.
    #[test]
    fn test_statements_with_quoted_line_comments_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB 'CCC -- WWW' DDD 'EEE FFF' '--' GGG HHH III '--' YYY 'JJJ KKK LLL -- ZZZ'",
        ];
        let contents = "\
AAA BBB 'CCC -- WWW'
DDD 'EEE FFF' -- XXX
'--'
GGG HHH III '--' YYY
'JJJ KKK LLL -- ZZZ'
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // `/*` and `*/` inside single quotes are literal text, not comment markers.
    #[test]
    fn test_statements_with_quoted_block_comments_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA '/*' BBB '*/' CCC DDD EEE '/*' FFF GGG HHH III JJJ '*/' KKK LLL",
        ];
        let contents = "\
AAA '/*' BBB '*/' CCC -- WWW
DDD EEE '/*' FFF -- XXX
--
GGG HHH III -- YYY
JJJ '*/' KKK LLL -- ZZZ
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // Several delimiters on one line split it; empty statements between them are dropped.
    #[test]
    fn test_statements_with_embedded_semicolons_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB",
            "CCC DDD EEE FFF",
            "GGG HHH",
            "III JJJ",
        ];
        let contents = "\
AAA BBB ; ; ; CCC DDD
EEE FFF ; GGG HHH ; III JJJ
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // A delimiter inside single quotes does not end the statement.
    #[test]
    fn test_statements_with_quoted_semicolons_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA ';' BBB ';' CCC",
            "DDD '''; ; ;''' EEE",
        ];
        let contents = "\
AAA ';' BBB ';' CCC;
DDD '''; ; ;''' EEE;
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // A quote left open at the end of a line is reported as an error.
    #[test]
    fn test_statements_with_malformed_quotes_are_rejected() -> MyResult<()> {
        let contents = "\
AAA ';' BBB '; CCC;
DDD ''';''' EEE;
";
        let error = split_contents(contents, ";", false).unwrap_err();
        assert_eq!(error.to_string(), "Unmatched quote");
        Ok(())
    }

    // A word delimiter on its own line matches case-insensitively and only as a whole word.
    #[test]
    fn test_statements_with_standalone_word_delimiter_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB XGO CCC DDD",
            "EEE FFF GOX GGG HHH",
            "III JJJ XGOX KKK LLL",
        ];
        let contents = "\
AAA BBB
XGO
CCC DDD
GO
EEE FFF
GOX
GGG HHH
GO
III JJJ
XGOX
KKK LLL
GO
";
        let result = split_contents(contents, "Go", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // A word delimiter at the end of a line matches only as a whole word.
    #[test]
    fn test_statements_with_prefixed_word_delimiter_are_parsed() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB XGO CCC DDD",
            "EEE FFF GOX GGG HHH",
            "III JJJ XGOX KKK LLL",
        ];
        let contents = "\
AAA BBB XGO
CCC DDD GO
EEE FFF GOX
GGG HHH GO
III JJJ XGOX
KKK LLL GO
";
        let result = split_contents(contents, "Go", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // A `DELIMITER` directive switches the delimiter; the old one becomes ordinary text.
    #[test]
    fn test_delimiter_is_changed_on_directive() -> MyResult<()> {
        let expected = str_vec![
            "AAA BBB",
            "CCC DDD",
            "EEE FFF; GGG HHH; III JJJ; KKK LLL",
            "MMM NNN",
            "OOO PPP",
        ];
        let contents = "\
AAA BBB; CCC DDD;
DELIMITER //
EEE FFF; GGG HHH;
III JJJ; KKK LLL //
DELIMITER ;
MMM NNN; OOO PPP;
";
        let result = split_contents(contents, ";", false)?;
        assert_eq!(result, expected);
        Ok(())
    }

    // Runs the lexer over `contents` and collects every statement, stopping at the first error.
    // Underscores in fixtures are replaced by spaces so whitespace-only lines stay visible.
    fn split_contents(contents: &str, batch: &str, interact: bool) -> MyResult<Vec<String>> {
        let contents = contents.replace("_", " ");
        let lexer = Lexer::new(contents.as_bytes(), batch, interact)?;
        lexer.collect()
    }
}