Skip to main content

osp_cli/dsl/parse/
lexer.rs

1use std::{error::Error, fmt};
2
/// Half-open byte range `[start, end)` into the original pipeline string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Inclusive start byte offset.
    pub start: usize,
    /// Exclusive end byte offset.
    pub end: usize,
}
8
/// One `|`-separated stage of a pipeline, trimmed of surrounding whitespace.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StageSegment {
    /// The stage text exactly as written (quotes and escapes still present).
    pub raw: String,
    /// Location of `raw` within the full pipeline input.
    pub span: Span,
}
14
/// Comparison operators recognized inside a stage.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Op {
    /// `=`
    Eq,
    /// `==`
    EqEq,
    /// `!=`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
}
25
/// Classification of a lexed token.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// A plain word (quotes and escapes already resolved in `Token::text`).
    Word,
    /// A standalone comparison operator.
    Op(Op),
}
31
/// A single lexed token with its absolute position in the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// Word or operator.
    pub kind: TokenKind,
    /// Byte range in the original pipeline string covering this token's raw
    /// spelling (quote and escape characters included in the range).
    pub span: Span,
    /// Token text with quotes/escapes resolved (words), or the operator's
    /// literal spelling (operators).
    pub text: String,
}
38
/// Errors produced while splitting or tokenizing a pipeline string.
///
/// All byte offsets are absolute positions in the original, untrimmed input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerError {
    /// A `'` opened at byte `start` was never closed.
    UnterminatedSingleQuote { start: usize },
    /// A `"` opened at byte `start` was never closed.
    UnterminatedDoubleQuote { start: usize },
    /// The input ended right after a `\`; `index` is the offset just past
    /// the end of the scanned text.
    TrailingEscape { index: usize },
}
45
46impl fmt::Display for LexerError {
47    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
48        match self {
49            Self::UnterminatedSingleQuote { start } => {
50                write!(f, "unterminated single quote starting at byte {start}")
51            }
52            Self::UnterminatedDoubleQuote { start } => {
53                write!(f, "unterminated double quote starting at byte {start}")
54            }
55            Self::TrailingEscape { index } => {
56                write!(f, "trailing escape at byte {index}")
57            }
58        }
59    }
60}
61
// Marker impl so `LexerError` can be used as a `dyn std::error::Error`.
impl Error for LexerError {}
63
/// Quote/escape state of the scanner as it walks text left to right.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Outside any quote or pending escape.
    Normal,
    /// Inside a `'...'` literal (no escapes are recognized here).
    SingleQuote,
    /// Inside a `"..."` literal (backslash escapes are recognized).
    DoubleQuote,
    /// Immediately after a `\` seen in `Normal` state.
    EscapeNormal,
    /// Immediately after a `\` seen inside a double-quoted literal.
    EscapeDouble,
}
72
/// Result of feeding one character to `QuoteScanner::advance`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScanTransition {
    /// The character was syntax (a quote or backslash) and yields no text.
    Structural,
    /// An ordinary character seen outside quotes.
    NormalChar(char),
    /// A literal character seen inside single or double quotes.
    QuotedChar(char),
    /// The character following a backslash escape.
    EscapedChar(char),
}
80
/// Incremental quote/escape state machine shared by the pipeline splitter
/// and the word tokenizer.
#[derive(Debug, Clone, Copy)]
struct QuoteScanner {
    // Current quote/escape state.
    state: State,
    // Added to relative indices so errors report absolute byte offsets.
    base_offset: usize,
    // Absolute offset of the most recent opening single quote.
    single_quote_start: usize,
    // Absolute offset of the most recent opening double quote.
    double_quote_start: usize,
}
88
89impl QuoteScanner {
90    fn new(base_offset: usize) -> Self {
91        Self {
92            state: State::Normal,
93            base_offset,
94            single_quote_start: 0,
95            double_quote_start: 0,
96        }
97    }
98
99    fn is_normal(&self) -> bool {
100        matches!(self.state, State::Normal)
101    }
102
103    fn advance(&mut self, index: usize, ch: char) -> ScanTransition {
104        match self.state {
105            State::Normal => match ch {
106                '\\' => {
107                    self.state = State::EscapeNormal;
108                    ScanTransition::Structural
109                }
110                '\'' => {
111                    self.single_quote_start = self.base_offset + index;
112                    self.state = State::SingleQuote;
113                    ScanTransition::Structural
114                }
115                '"' => {
116                    self.double_quote_start = self.base_offset + index;
117                    self.state = State::DoubleQuote;
118                    ScanTransition::Structural
119                }
120                _ => ScanTransition::NormalChar(ch),
121            },
122            State::SingleQuote => {
123                if ch == '\'' {
124                    self.state = State::Normal;
125                    ScanTransition::Structural
126                } else {
127                    ScanTransition::QuotedChar(ch)
128                }
129            }
130            State::DoubleQuote => {
131                if ch == '"' {
132                    self.state = State::Normal;
133                    ScanTransition::Structural
134                } else if ch == '\\' {
135                    self.state = State::EscapeDouble;
136                    ScanTransition::Structural
137                } else {
138                    ScanTransition::QuotedChar(ch)
139                }
140            }
141            State::EscapeNormal => {
142                self.state = State::Normal;
143                ScanTransition::EscapedChar(ch)
144            }
145            State::EscapeDouble => {
146                self.state = State::DoubleQuote;
147                ScanTransition::EscapedChar(ch)
148            }
149        }
150    }
151
152    fn finish(&self, input_len: usize) -> Result<(), LexerError> {
153        match self.state {
154            State::Normal => Ok(()),
155            State::SingleQuote => Err(LexerError::UnterminatedSingleQuote {
156                start: self.single_quote_start,
157            }),
158            State::DoubleQuote => Err(LexerError::UnterminatedDoubleQuote {
159                start: self.double_quote_start,
160            }),
161            State::EscapeNormal | State::EscapeDouble => Err(LexerError::TrailingEscape {
162                index: self.base_offset + input_len,
163            }),
164        }
165    }
166}
167
168/// Split a full `command | stage | stage` string while respecting quotes.
169pub fn split_pipeline(input: &str) -> Result<Vec<StageSegment>, LexerError> {
170    let mut out = Vec::new();
171    let mut scanner = QuoteScanner::new(0);
172    let mut segment_start = 0usize;
173
174    for (index, ch) in input.char_indices() {
175        if matches!(scanner.advance(index, ch), ScanTransition::NormalChar('|')) {
176            push_segment(input, segment_start, index, &mut out);
177            segment_start = index + ch.len_utf8();
178        }
179    }
180
181    scanner.finish(input.len())?;
182    push_segment(input, segment_start, input.len(), &mut out);
183    Ok(out)
184}
185
186/// Tokenize one stage into words/operators while preserving token spans.
187pub fn tokenize_stage(segment: &StageSegment) -> Result<Vec<Token>, LexerError> {
188    let mut words = tokenize_words(&segment.raw, segment.span.start)?;
189    let mut out = Vec::new();
190    for word in words.drain(..) {
191        split_word_token(word, segment, &mut out);
192    }
193    Ok(out)
194}
195
196fn tokenize_words(input: &str, base_offset: usize) -> Result<Vec<Token>, LexerError> {
197    let mut scanner = QuoteScanner::new(base_offset);
198    let mut words = Vec::new();
199    let mut current = String::new();
200    let mut token_start: Option<usize> = None;
201
202    for (index, ch) in input.char_indices() {
203        if scanner.is_normal() && ch.is_whitespace() {
204            finish_word(
205                &mut words,
206                &mut current,
207                &mut token_start,
208                index,
209                base_offset,
210            );
211            continue;
212        }
213
214        if scanner.is_normal() && token_start.is_none() {
215            token_start = Some(index);
216        }
217
218        match scanner.advance(index, ch) {
219            ScanTransition::NormalChar(ch)
220            | ScanTransition::QuotedChar(ch)
221            | ScanTransition::EscapedChar(ch) => {
222                current.push(ch);
223            }
224            ScanTransition::Structural => {}
225        }
226    }
227
228    scanner.finish(input.len())?;
229    finish_word(
230        &mut words,
231        &mut current,
232        &mut token_start,
233        input.len(),
234        base_offset,
235    );
236
237    Ok(words)
238}
239
240fn finish_word(
241    out: &mut Vec<Token>,
242    current: &mut String,
243    token_start: &mut Option<usize>,
244    end_index: usize,
245    base_offset: usize,
246) {
247    if let Some(start_index) = token_start.take() {
248        out.push(Token {
249            kind: TokenKind::Word,
250            span: Span {
251                start: base_offset + start_index,
252                end: base_offset + end_index,
253            },
254            text: std::mem::take(current),
255        });
256    }
257}
258
/// Re-scan one `Word` token for embedded comparison operators and split it
/// into alternating word/operator tokens; non-word tokens pass through.
fn split_word_token(token: Token, segment: &StageSegment, out: &mut Vec<Token>) {
    if token.kind != TokenKind::Word {
        out.push(token);
        return;
    }

    // Recover the raw (still-quoted, still-escaped) spelling of this word
    // from the stage text so quoting can suppress operator splitting.
    let relative_start = token.span.start.saturating_sub(segment.span.start);
    let relative_end = token.span.end.saturating_sub(segment.span.start);
    let raw = &segment.raw[relative_start..relative_end];

    // A word that is exactly one operator becomes a single operator token.
    if let Some(op) = parse_full_operator(raw) {
        out.push(Token {
            kind: TokenKind::Op(op),
            ..token
        });
        return;
    }

    let mut state = State::Normal;
    let mut split_happened = false;
    // Unquoted/unescaped text of the word piece currently being built.
    let mut current_text = String::new();
    // Byte offset into `raw` where the current word piece started, if any.
    let mut current_raw_start: Option<usize> = None;
    let mut cursor = 0usize;

    while cursor < raw.len() {
        let tail = &raw[cursor..];
        let ch = tail
            .chars()
            .next()
            .expect("cursor should always point at a valid character boundary");
        let width = ch.len_utf8();

        match state {
            State::Normal => {
                // At the very start of the word, absorb a protected DSL
                // prefix sigil (e.g. `==`, `!?`) so it is not split off as
                // a standalone operator below.
                if current_raw_start.is_none()
                    && current_text.is_empty()
                    && cursor == 0
                    && !raw.is_empty()
                {
                    let protected_prefix_len = protected_prefix_len(raw);
                    if protected_prefix_len > 0 && protected_prefix_len < raw.len() {
                        current_raw_start = Some(0);
                        current_text.push_str(&raw[..protected_prefix_len]);
                        cursor += protected_prefix_len;
                        continue;
                    }
                }

                match ch {
                    '\\' => {
                        current_raw_start.get_or_insert(cursor);
                        state = State::EscapeNormal;
                    }
                    '\'' => {
                        current_raw_start.get_or_insert(cursor);
                        state = State::SingleQuote;
                    }
                    '"' => {
                        current_raw_start.get_or_insert(cursor);
                        state = State::DoubleQuote;
                    }
                    _ => {
                        // An operator outside quotes ends the current word
                        // piece and is emitted as its own token.
                        if let Some((op, op_width)) = parse_operator_at(raw, cursor) {
                            push_split_word(
                                out,
                                token.span.start,
                                current_raw_start.take(),
                                cursor,
                                &mut current_text,
                            );
                            out.push(Token {
                                kind: TokenKind::Op(op),
                                span: Span {
                                    start: token.span.start + cursor,
                                    end: token.span.start + cursor + op_width,
                                },
                                text: raw[cursor..cursor + op_width].to_string(),
                            });
                            split_happened = true;
                            cursor += op_width;
                            continue;
                        }

                        current_raw_start.get_or_insert(cursor);
                        current_text.push(ch);
                    }
                }
            }
            // Inside single quotes everything except the closing quote is
            // literal text (no escapes, no operators).
            State::SingleQuote => {
                if ch == '\'' {
                    state = State::Normal;
                } else {
                    current_text.push(ch);
                }
            }
            // Inside double quotes: literal text, but a backslash escapes
            // the next character.
            State::DoubleQuote => {
                if ch == '"' {
                    state = State::Normal;
                } else if ch == '\\' {
                    state = State::EscapeDouble;
                } else {
                    current_text.push(ch);
                }
            }
            State::EscapeNormal => {
                current_text.push(ch);
                state = State::Normal;
            }
            State::EscapeDouble => {
                current_text.push(ch);
                state = State::DoubleQuote;
            }
        }

        cursor += width;
    }

    // No operator found: keep the original token (its text was already
    // unquoted by `tokenize_words`).
    if !split_happened {
        out.push(token);
        return;
    }

    // Flush whatever word piece follows the last operator.
    push_split_word(
        out,
        token.span.start,
        current_raw_start,
        raw.len(),
        &mut current_text,
    );
}
389
390fn push_split_word(
391    out: &mut Vec<Token>,
392    base_start: usize,
393    raw_start: Option<usize>,
394    raw_end: usize,
395    text: &mut String,
396) {
397    let Some(raw_start) = raw_start else {
398        return;
399    };
400
401    out.push(Token {
402        kind: TokenKind::Word,
403        span: Span {
404            start: base_start + raw_start,
405            end: base_start + raw_end,
406        },
407        text: std::mem::take(text),
408    });
409}
410
411fn parse_full_operator(text: &str) -> Option<Op> {
412    match text {
413        "=" => Some(Op::Eq),
414        "==" => Some(Op::EqEq),
415        "!=" => Some(Op::Ne),
416        "<" => Some(Op::Lt),
417        "<=" => Some(Op::Le),
418        ">" => Some(Op::Gt),
419        ">=" => Some(Op::Ge),
420        _ => None,
421    }
422}
423
/// Byte length of a leading DSL sigil that must stay attached to its search
/// token instead of being split off as a standalone operator.
///
/// Two-byte sigils: `!?`, `==`, `!=`. One-byte sigils: `!`, `?`, `=`.
/// Returns `0` when `text` does not start with a sigil.
fn protected_prefix_len(text: &str) -> usize {
    if text.starts_with("!?") || text.starts_with("==") || text.starts_with("!=") {
        2
    } else if text.starts_with('!') || text.starts_with('?') || text.starts_with('=') {
        1
    } else {
        0
    }
}
435
436fn parse_operator_at(text: &str, offset: usize) -> Option<(Op, usize)> {
437    let tail = text.get(offset..)?;
438    if tail.starts_with("<=") {
439        return Some((Op::Le, 2));
440    }
441    if tail.starts_with(">=") {
442        return Some((Op::Ge, 2));
443    }
444    if tail.starts_with("==") {
445        return Some((Op::EqEq, 2));
446    }
447    if tail.starts_with("!=") {
448        return Some((Op::Ne, 2));
449    }
450    if tail.starts_with('<') {
451        return Some((Op::Lt, 1));
452    }
453    if tail.starts_with('>') {
454        return Some((Op::Gt, 1));
455    }
456    if tail.starts_with('=') {
457        return Some((Op::Eq, 1));
458    }
459    None
460}
461
462fn push_segment(input: &str, start: usize, end: usize, out: &mut Vec<StageSegment>) {
463    let (trimmed_start, trimmed_end) = trim_span(input, start, end);
464    if trimmed_start >= trimmed_end {
465        return;
466    }
467
468    out.push(StageSegment {
469        raw: input[trimmed_start..trimmed_end].to_string(),
470        span: Span {
471            start: trimmed_start,
472            end: trimmed_end,
473        },
474    });
475}
476
/// Shrink the byte range `[start, end)` of `input` so it excludes leading
/// and trailing whitespace; an empty or inverted range collapses to
/// `(start, start)`.
fn trim_span(input: &str, start: usize, end: usize) -> (usize, usize) {
    if start >= end {
        return (start, start);
    }

    // `str::trim_start`/`trim_end` use the same Unicode whitespace
    // definition as `char::is_whitespace`, so offset arithmetic on the
    // trimmed lengths reproduces a manual char-by-char scan.
    let slice = &input[start..end];
    let without_leading = slice.trim_start();
    let trimmed_start = start + (slice.len() - without_leading.len());
    let trimmed_end = trimmed_start + without_leading.trim_end().len();
    (trimmed_start, trimmed_end)
}
508
#[cfg(test)]
mod tests {
    use super::{LexerError, Op, Span, StageSegment, TokenKind, split_pipeline, tokenize_stage};

    #[test]
    fn split_pipeline_respects_quoted_pipes() {
        let segments = split_pipeline("ldap user 'foo|bar' | P uid | F uid=oistes")
            .expect("pipeline should parse");
        assert_eq!(segments.len(), 3);
        assert_eq!(segments[0].raw, "ldap user 'foo|bar'");
        assert_eq!(segments[1].raw, "P uid");
        assert_eq!(segments[2].raw, "F uid=oistes");
    }

    #[test]
    fn split_pipeline_reports_unterminated_quote() {
        let error = split_pipeline("ldap user 'foo|bar | P uid").expect_err("should fail");
        assert_eq!(error, LexerError::UnterminatedSingleQuote { start: 10 });
    }

    #[test]
    fn split_pipeline_reports_trailing_escape() {
        let input = "ldap user foo\\";
        let error = split_pipeline(input).expect_err("trailing escape should fail");
        assert_eq!(error, LexerError::TrailingEscape { index: input.len() });
    }

    #[test]
    fn tokenize_stage_splits_inline_operators() {
        let stage = StageSegment {
            raw: "F uid>=5".to_string(),
            span: Span { start: 0, end: 8 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "F");
        assert_eq!(tokens[1].text, "uid");
        assert_eq!(tokens[2].kind, TokenKind::Op(Op::Ge));
        assert_eq!(tokens[3].text, "5");
    }

    #[test]
    fn tokenize_stage_keeps_prefix_operators_in_single_token() {
        let stage = StageSegment {
            raw: "Q ==online !?interfaces".to_string(),
            // The raw text is 23 bytes long, so the span must end at 23
            // (the previous value of 22 was off by one).
            span: Span { start: 0, end: 23 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens[1].text, "==online");
        assert_eq!(tokens[2].text, "!?interfaces");
    }

    #[test]
    fn tokenize_stage_handles_quotes_and_escapes() {
        let stage = StageSegment {
            raw: "F cn=\"foo bar\"".to_string(),
            span: Span { start: 0, end: 14 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens[0].text, "F");
        assert_eq!(tokens[1].text, "cn");
        assert_eq!(tokens[2].kind, TokenKind::Op(Op::Eq));
        assert_eq!(tokens[3].text, "foo bar");
    }

    #[test]
    fn tokenize_stage_keeps_operator_chars_inside_quoted_value() {
        let stage = StageSegment {
            raw: "F note=\"a=b>=c\"".to_string(),
            span: Span { start: 0, end: 15 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "F");
        assert_eq!(tokens[1].text, "note");
        assert_eq!(tokens[2].kind, TokenKind::Op(Op::Eq));
        assert_eq!(tokens[3].text, "a=b>=c");
    }

    #[test]
    fn tokenize_stage_reports_trailing_escape() {
        let stage = StageSegment {
            raw: "F path=C:\\Temp\\".to_string(),
            span: Span { start: 7, end: 22 },
        };

        let error = tokenize_stage(&stage).expect_err("trailing escape should fail");
        assert_eq!(error, LexerError::TrailingEscape { index: 22 });
    }
}