1use crate::error::{EngineError, MyError, MyResult};
2use crate::util::regex::get_match;
3use crate::{regex, regex_insensitive};
4use itertools::Itertools;
5use regex::{Regex, RegexBuilder};
6use std::cmp::{min, Ordering};
7use std::collections::VecDeque;
8use std::io::{BufRead, BufReader, Lines, Read};
9
/// The token most recently recognised while scanning a line.
///
/// Offsets are byte offsets relative to the position at which the scan
/// resumed, not to the beginning of the line.
#[derive(PartialEq, Eq, Clone, Copy)]
enum State {
    /// No further token was found in the rest of the line.
    Initial,
    /// A statement delimiter, given as `(offset, length)`.
    Delimiter(usize, usize),
    /// A `--` line comment starting at the given offset.
    CommentLine(usize),
    /// A `/*` block comment opener at the given offset.
    BlockBegin(usize),
    /// Inside a block comment that has not been closed yet.
    BlockMiddle,
    /// A `*/` block comment closer at the given offset.
    BlockEnd(usize),
    /// An opening single quote at the given offset.
    QuoteBegin(usize),
    /// A closing single quote at the given offset.
    QuoteEnd(usize),
}
21
22impl State {
23 fn index(&self) -> (usize, usize, usize) {
24 match self {
25 Self::Initial => (usize::MAX, usize::MAX, 0),
26 Self::Delimiter(offset, length) => (*offset, *length, 1),
27 Self::CommentLine(offset) => (*offset, usize::MAX, 2),
28 Self::BlockBegin(offset) => (*offset, usize::MAX, 3),
29 Self::BlockMiddle => (usize::MAX, usize::MAX, 4),
30 Self::BlockEnd(offset) => (*offset, usize::MAX, 5),
31 Self::QuoteBegin(offset) => (*offset, 1, 6),
32 Self::QuoteEnd(offset) => (*offset, 1, 7),
33 }
34 }
35}
36
37impl Ord for State {
38 fn cmp(&self, other: &Self) -> Ordering {
39 self.index().cmp(&other.index())
40 }
41}
42
43impl PartialOrd for State {
44 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
45 Some(self.cmp(other))
46 }
47}
48
/// Tracks whether the lexer should emit one empty statement when the input
/// contained a delimiter but produced no statement at all.
#[derive(PartialEq)]
enum Debug {
    /// Starting value in test and debug builds: nothing returned and no
    /// delimiter seen so far.
    Unknown,
    /// A delimiter was matched while the mode was still `Unknown`.
    Enabled,
    /// Starting value in release builds; also set once a statement has been
    /// returned.
    Disabled,
}
55
56pub struct Lexer<R: Read> {
57 reader: Lines<BufReader<R>>,
58 segments: VecDeque<(String, bool)>,
59 batch: Regex,
60 state: State,
61 debug: Debug,
62 interact: bool,
63}
64
65impl<R: Read> Lexer<R> {
66 pub fn new(reader: R, batch: &str, interact: bool) -> MyResult<Self> {
67 let reader = BufReader::new(reader);
68 let reader = reader.lines();
69 let segments = VecDeque::new();
70 let batch = Self::build_regex(batch)?;
71 let state = State::Initial;
72 #[cfg(any(test, debug_assertions))]
73 let debug = Debug::Unknown;
74 #[cfg(not(any(test, debug_assertions)))]
75 let debug = Debug::Disabled;
76 let lexer = Self { reader, segments, batch, state, debug, interact };
77 Ok(lexer)
78 }
79
80 fn build_regex(batch: &str) -> MyResult<Regex> {
81 let starting = regex!(r"^\w");
82 let ending = regex!(r"\w$");
83 let batch = batch.trim();
84 let prefix = if starting.is_match(batch) { r"\b" } else { "" };
85 let suffix = if ending.is_match(batch) { r"\b" } else { "" };
86 let batch = format!("{}{}{}", prefix, batch, suffix);
87 let regex = RegexBuilder::new(&batch).case_insensitive(true).build()?;
88 Ok(regex)
89 }
90
91 fn read_next(&mut self) -> MyResult<Option<String>> {
92 let line = self.read_line()?;
93 if let Some(line) = line {
94 self.debug = Debug::Disabled;
95 Ok(Some(line))
96 } else if self.debug == Debug::Enabled {
97 self.debug = Debug::Disabled;
98 Ok(Some(String::new()))
99 } else {
100 Ok(None)
101 }
102 }
103
104 fn read_line(&mut self) -> MyResult<Option<String>> {
105 if let Some(joined) = self.join_terminated() {
106 return Ok(Some(joined));
107 }
108 while let Some(line) = self.reader.next() {
109 let line = line?;
110 let term = self.parse_line(line)?;
111 if term {
112 if let Some(joined) = self.join_terminated() {
113 return Ok(Some(joined));
114 }
115 }
116 }
117 if let Some(joined) = self.join_remaining() {
118 return Ok(Some(joined));
119 }
120 Ok(None)
121 }
122
123 fn parse_line(&mut self, line: String) -> MyResult<bool> {
124 let regex = regex_insensitive!(r"^DELIMITER\s+(\S+)");
125 let line = line.trim();
126 if self.interact && line.is_empty() {
127 self.push_segment("", true);
128 Ok(true)
129 } else if let Some(captures) = regex.captures(line) {
130 let batch = get_match(&captures, 1);
131 self.batch = Self::build_regex(batch)?;
132 Ok(true)
133 } else {
134 self.parse_segments(line)
135 }
136 }
137
138 fn parse_segments(&mut self, line: &str) -> MyResult<bool> {
139 let mut start = 0;
140 let mut stop = 0;
141 let mut term = false;
142 while self.advance_state(line, stop) {
143 match self.state {
144 State::Initial => {
145 self.push_segment(&line[start..], false);
146 break;
147 }
148 State::Delimiter(offset, length) => {
149 stop += offset;
150 self.push_segment(&line[start..stop], true);
151 stop += length;
152 start = stop;
153 term = true;
154 }
155 State::CommentLine(offset) => {
156 stop += offset;
157 self.push_segment(&line[start..stop], false);
158 break;
159 }
160 State::BlockBegin(offset) => {
161 stop += offset;
162 self.push_segment(&line[start..stop], false);
163 stop += 2;
164 start = stop;
165 }
166 State::BlockMiddle => {
167 break;
168 }
169 State::BlockEnd(offset) => {
170 stop += offset + 2;
171 start = stop;
172 }
173 State::QuoteBegin(offset) => {
174 stop += offset + 1;
175 }
176 State::QuoteEnd(offset) => {
177 stop += offset + 1;
178 }
179 }
180 }
181 if let State::QuoteBegin(_) = self.state {
182 Err(MyError::Engine(EngineError::UnmatchedQuote))
183 } else {
184 Ok(term)
185 }
186 }
187
188 fn advance_state(&mut self, line: &str, stop: usize) -> bool {
189 if stop <= line.len() {
190 let tail = &line[stop..];
191 match self.state {
192 State::BlockBegin(_) | State::BlockMiddle => {
193 self.state = State::BlockMiddle;
194 if let Some(offset) = tail.find("*/") {
195 self.state = min(self.state, State::BlockEnd(offset));
196 }
197 }
198 State::QuoteBegin(_) => {
199 self.state = State::QuoteBegin(tail.len());
200 if let Some(offset) = tail.find('\'') {
201 self.state = min(self.state, State::QuoteEnd(offset));
202 }
203 }
204 _ => {
205 self.state = State::Initial;
206 if let Some(offset) = tail.find("--") {
207 self.state = min(self.state, State::CommentLine(offset));
208 }
209 if let Some(offset) = tail.find("/*") {
210 self.state = min(self.state, State::BlockBegin(offset));
211 }
212 if let Some(offset) = tail.find('\'') {
213 self.state = min(self.state, State::QuoteBegin(offset));
214 }
215 if let Some(delim) = self.batch.find(tail) {
216 self.state = min(self.state, State::Delimiter(delim.start(), delim.len()));
217 if self.debug == Debug::Unknown {
218 self.debug = Debug::Enabled;
219 }
220 }
221 }
222 }
223 return true;
224 }
225 false
226 }
227
228 fn push_segment(&mut self, segment: &str, term: bool) {
229 let segment = segment.trim().to_string();
230 if !segment.is_empty() || term {
231 self.segments.push_back((segment, term));
232 }
233 }
234
235 fn join_terminated(&mut self) -> Option<String> {
236 while let Some((index, _)) = self.segments.iter().find_position(|(_, term)| *term) {
237 let segments = self.segments
238 .drain(0..=index)
239 .map(|(seg, _)| seg)
240 .filter(|seg| !seg.is_empty())
241 .join(" ");
242 if !segments.is_empty() {
243 return Some(segments);
244 }
245 }
246 None
247 }
248
249 fn join_remaining(&mut self) -> Option<String> {
250 let segments = self.segments
251 .drain(..)
252 .map(|(seg, _)| seg)
253 .filter(|seg| !seg.is_empty())
254 .join(" ");
255 if !segments.is_empty() {
256 return Some(segments);
257 }
258 None
259 }
260}
261
262impl<R: Read> Iterator for Lexer<R> {
263 type Item = MyResult<String>;
264
265 fn next(&mut self) -> Option<Self::Item> {
266 self.read_next().transpose()
267 }
268}
269
#[cfg(test)]
mod tests {
    use crate::core::lexer::Lexer;
    use crate::error::MyResult;
    use crate::str_vec;
    use pretty_assertions::assert_eq;

    /// Empty input yields no statements.
    #[test]
    fn test_empty_file_is_parsed() -> MyResult<()> {
        let got = split_contents("", ";", false)?;
        assert!(got.is_empty());
        Ok(())
    }

    /// A lone delimiter yields one empty statement.
    #[test]
    fn test_single_delimiter_is_parsed() -> MyResult<()> {
        let got = split_contents(";", ";", false)?;
        let want = str_vec![""];
        assert_eq!(got, want);
        Ok(())
    }

    /// Lines without a delimiter are joined into one statement.
    #[test]
    fn test_one_statement_no_terminating_semicolon_is_parsed() -> MyResult<()> {
        let script = "\
AAA BBB CCC
DDD EEE FFF
";
        let want = str_vec![
            "AAA BBB CCC DDD EEE FFF",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// A trailing delimiter ends the statement without adding an empty one.
    #[test]
    fn test_one_statement_with_terminating_semicolon_is_parsed() -> MyResult<()> {
        let script = "\
AAA BBB CCC
DDD EEE FFF;
";
        let want = str_vec![
            "AAA BBB CCC DDD EEE FFF",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// A delimiter between two statements splits them.
    #[test]
    fn test_two_statements_with_separating_semicolon_are_parsed() -> MyResult<()> {
        let script = "\
AAA BBB CCC
DDD EEE FFF;
GGG HHH III
JJJ KKK LLL
";
        let want = str_vec![
            "AAA BBB CCC DDD EEE FFF",
            "GGG HHH III JJJ KKK LLL",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// Two statements that are both terminated are split the same way.
    #[test]
    fn test_two_statements_with_terminating_semicolon_are_parsed() -> MyResult<()> {
        let script = "\
AAA BBB CCC
DDD EEE FFF;
GGG HHH III
JJJ KKK LLL;
";
        let want = str_vec![
            "AAA BBB CCC DDD EEE FFF",
            "GGG HHH III JJJ KKK LLL",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// In batch mode blank lines do not separate statements.
    #[test]
    fn test_statements_with_separating_newline_are_parsed_in_batch_mode() -> MyResult<()> {
        let script = "\
__
AAA BBB CCC
DDD EEE FFF
__
GGG HHH III
JJJ KKK LLL
__
";
        let want = str_vec![
            "AAA BBB CCC DDD EEE FFF GGG HHH III JJJ KKK LLL",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// In interactive mode a blank line terminates the current statement.
    #[test]
    fn test_statements_with_separating_newline_are_parsed_in_interactive_mode() -> MyResult<()> {
        let script = "\
__
AAA BBB CCC
DDD EEE FFF
__
GGG HHH III
JJJ KKK LLL
__
";
        let want = str_vec![
            "AAA BBB CCC DDD EEE FFF",
            "GGG HHH III JJJ KKK LLL",
        ];
        let got = split_contents(script, ";", true)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// Text after `--` is dropped up to the end of the line.
    #[test]
    fn test_statements_with_unquoted_line_comments_are_parsed() -> MyResult<()> {
        let script = "\
--
AAA BBB CCC -- WWW
DDD EEE FFF -- XXX
--
GGG HHH III -- YYY
JJJ KKK LLL -- ZZZ
--
";
        let want = str_vec![
            "AAA BBB CCC DDD EEE FFF GGG HHH III JJJ KKK LLL",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// Block comments are removed, including ones spanning several lines.
    #[test]
    fn test_statements_with_unquoted_block_comments_are_parsed() -> MyResult<()> {
        let script = "\
AAA /* BBB */ CCC -- WWW
DDD EEE /* FFF -- XXX
--
GGG HHH III -- YYY
JJJ */ KKK LLL -- ZZZ
";
        let want = str_vec![
            "AAA CCC DDD EEE KKK LLL",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// `--` inside single quotes is ordinary text.
    #[test]
    fn test_statements_with_quoted_line_comments_are_parsed() -> MyResult<()> {
        let script = "\
AAA BBB 'CCC -- WWW'
DDD 'EEE FFF' -- XXX
'--'
GGG HHH III '--' YYY
'JJJ KKK LLL -- ZZZ'
";
        let want = str_vec![
            "AAA BBB 'CCC -- WWW' DDD 'EEE FFF' '--' GGG HHH III '--' YYY 'JJJ KKK LLL -- ZZZ'",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// `/*` and `*/` inside single quotes do not open or close a comment.
    #[test]
    fn test_statements_with_quoted_block_comments_are_parsed() -> MyResult<()> {
        let script = "\
AAA '/*' BBB '*/' CCC -- WWW
DDD EEE '/*' FFF -- XXX
--
GGG HHH III -- YYY
JJJ '*/' KKK LLL -- ZZZ
";
        let want = str_vec![
            "AAA '/*' BBB '*/' CCC DDD EEE '/*' FFF GGG HHH III JJJ '*/' KKK LLL",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// Several delimiters on one line split it; empty statements are skipped.
    #[test]
    fn test_statements_with_embedded_semicolons_are_parsed() -> MyResult<()> {
        let script = "\
AAA BBB ; ; ; CCC DDD
EEE FFF ; GGG HHH ; III JJJ
";
        let want = str_vec![
            "AAA BBB",
            "CCC DDD EEE FFF",
            "GGG HHH",
            "III JJJ",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// Delimiters inside single quotes do not split the statement.
    #[test]
    fn test_statements_with_quoted_semicolons_are_parsed() -> MyResult<()> {
        let script = "\
AAA ';' BBB ';' CCC;
DDD '''; ; ;''' EEE;
";
        let want = str_vec![
            "AAA ';' BBB ';' CCC",
            "DDD '''; ; ;''' EEE",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// A quote left open at the end of a line is reported as an error.
    #[test]
    fn test_statements_with_malformed_quotes_are_rejected() -> MyResult<()> {
        let script = "\
AAA ';' BBB '; CCC;
DDD ''';''' EEE;
";
        let failure = split_contents(script, ";", false).unwrap_err();
        assert_eq!(failure.to_string(), "Unmatched quote");
        Ok(())
    }

    /// A word delimiter on its own line matches case-insensitively and only
    /// as a whole word.
    #[test]
    fn test_statements_with_standalone_word_delimiter_are_parsed() -> MyResult<()> {
        let script = "\
AAA BBB
XGO
CCC DDD
GO
EEE FFF
GOX
GGG HHH
GO
III JJJ
XGOX
KKK LLL
GO
";
        let want = str_vec![
            "AAA BBB XGO CCC DDD",
            "EEE FFF GOX GGG HHH",
            "III JJJ XGOX KKK LLL",
        ];
        let got = split_contents(script, "Go", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// A word delimiter at the end of a line behaves the same way.
    #[test]
    fn test_statements_with_prefixed_word_delimiter_are_parsed() -> MyResult<()> {
        let script = "\
AAA BBB XGO
CCC DDD GO
EEE FFF GOX
GGG HHH GO
III JJJ XGOX
KKK LLL GO
";
        let want = str_vec![
            "AAA BBB XGO CCC DDD",
            "EEE FFF GOX GGG HHH",
            "III JJJ XGOX KKK LLL",
        ];
        let got = split_contents(script, "Go", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// A `DELIMITER` directive switches the delimiter for the lines after it.
    #[test]
    fn test_delimiter_is_changed_on_directive() -> MyResult<()> {
        let script = "\
AAA BBB; CCC DDD;
DELIMITER //
EEE FFF; GGG HHH;
III JJJ; KKK LLL //
DELIMITER ;
MMM NNN; OOO PPP;
";
        let want = str_vec![
            "AAA BBB",
            "CCC DDD",
            "EEE FFF; GGG HHH; III JJJ; KKK LLL",
            "MMM NNN",
            "OOO PPP",
        ];
        let got = split_contents(script, ";", false)?;
        assert_eq!(got, want);
        Ok(())
    }

    /// Runs the lexer over `contents` and collects every statement.
    ///
    /// Underscores stand in for spaces so that whitespace-only lines stay
    /// visible in the test sources.
    fn split_contents(contents: &str, batch: &str, interact: bool) -> MyResult<Vec<String>> {
        let script = contents.replace('_', " ");
        let statements = Lexer::new(script.as_bytes(), batch, interact)?;
        statements.collect()
    }
}