1use std::{error::Error, fmt};
2
/// Half-open byte range `[start, end)` into the original input string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}
8
/// One pipeline stage: the raw (trimmed, still quoted/escaped) stage text
/// plus the byte span it occupies in the full pipeline input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StageSegment {
    pub raw: String,
    pub span: Span,
}
14
/// Comparison operators recognized inside a stage.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Op {
    /// `=`
    Eq,
    /// `==`
    EqEq,
    /// `!=`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
}
25
/// Classification of a lexed token: plain word or comparison operator.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    Word,
    Op(Op),
}
31
/// A lexed token: its kind, the byte span of its *raw* source (quotes and
/// escapes included), and its unquoted/unescaped text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
    pub text: String,
}
38
/// Errors produced while scanning quotes and escapes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LexerError {
    /// A `'` was opened at byte `start` and never closed.
    UnterminatedSingleQuote { start: usize },
    /// A `"` was opened at byte `start` and never closed.
    UnterminatedDoubleQuote { start: usize },
    /// The input ended directly after a `\` escape; `index` is the end of input.
    TrailingEscape { index: usize },
}
45
impl fmt::Display for LexerError {
    // Human-readable message including the absolute byte offset of the
    // offending quote or escape, for error reporting to the user.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::UnterminatedSingleQuote { start } => {
                write!(f, "unterminated single quote starting at byte {start}")
            }
            Self::UnterminatedDoubleQuote { start } => {
                write!(f, "unterminated double quote starting at byte {start}")
            }
            Self::TrailingEscape { index } => {
                write!(f, "trailing escape at byte {index}")
            }
        }
    }
}
61
// Marker impl: `Display` + `Debug` above satisfy `std::error::Error`; there
// is no underlying source error to expose.
impl Error for LexerError {}
63
/// Internal scanner state for quote/escape tracking.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Outside any quote or escape.
    Normal,
    /// Inside a `'...'` section.
    SingleQuote,
    /// Inside a `"..."` section.
    DoubleQuote,
    /// Directly after a `\` seen in the normal state.
    EscapeNormal,
    /// Directly after a `\` seen inside double quotes.
    EscapeDouble,
}
72
/// How the caller should treat a single input character after the scanner
/// has consumed it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ScanTransition {
    /// A quote or backslash itself: part of the syntax, not the token text.
    Structural,
    /// Plain character seen outside quotes/escapes.
    NormalChar(char),
    /// Character seen inside single or double quotes.
    QuotedChar(char),
    /// Character following a `\` escape.
    EscapedChar(char),
}
80
/// Streaming quote/escape state machine driven one character at a time.
#[derive(Debug, Clone, Copy)]
struct QuoteScanner {
    // Current state of the quote/escape machine.
    state: State,
    // Absolute byte offset of the scanned text; added to error positions.
    base_offset: usize,
    // Absolute byte offset of the most recent opening `'` (for errors).
    single_quote_start: usize,
    // Absolute byte offset of the most recent opening `"` (for errors).
    double_quote_start: usize,
}
88
89impl QuoteScanner {
90 fn new(base_offset: usize) -> Self {
91 Self {
92 state: State::Normal,
93 base_offset,
94 single_quote_start: 0,
95 double_quote_start: 0,
96 }
97 }
98
99 fn is_normal(&self) -> bool {
100 matches!(self.state, State::Normal)
101 }
102
103 fn advance(&mut self, index: usize, ch: char) -> ScanTransition {
104 match self.state {
105 State::Normal => match ch {
106 '\\' => {
107 self.state = State::EscapeNormal;
108 ScanTransition::Structural
109 }
110 '\'' => {
111 self.single_quote_start = self.base_offset + index;
112 self.state = State::SingleQuote;
113 ScanTransition::Structural
114 }
115 '"' => {
116 self.double_quote_start = self.base_offset + index;
117 self.state = State::DoubleQuote;
118 ScanTransition::Structural
119 }
120 _ => ScanTransition::NormalChar(ch),
121 },
122 State::SingleQuote => {
123 if ch == '\'' {
124 self.state = State::Normal;
125 ScanTransition::Structural
126 } else {
127 ScanTransition::QuotedChar(ch)
128 }
129 }
130 State::DoubleQuote => {
131 if ch == '"' {
132 self.state = State::Normal;
133 ScanTransition::Structural
134 } else if ch == '\\' {
135 self.state = State::EscapeDouble;
136 ScanTransition::Structural
137 } else {
138 ScanTransition::QuotedChar(ch)
139 }
140 }
141 State::EscapeNormal => {
142 self.state = State::Normal;
143 ScanTransition::EscapedChar(ch)
144 }
145 State::EscapeDouble => {
146 self.state = State::DoubleQuote;
147 ScanTransition::EscapedChar(ch)
148 }
149 }
150 }
151
152 fn finish(&self, input_len: usize) -> Result<(), LexerError> {
153 match self.state {
154 State::Normal => Ok(()),
155 State::SingleQuote => Err(LexerError::UnterminatedSingleQuote {
156 start: self.single_quote_start,
157 }),
158 State::DoubleQuote => Err(LexerError::UnterminatedDoubleQuote {
159 start: self.double_quote_start,
160 }),
161 State::EscapeNormal | State::EscapeDouble => Err(LexerError::TrailingEscape {
162 index: self.base_offset + input_len,
163 }),
164 }
165 }
166}
167
168pub fn split_pipeline(input: &str) -> Result<Vec<StageSegment>, LexerError> {
170 let mut out = Vec::new();
171 let mut scanner = QuoteScanner::new(0);
172 let mut segment_start = 0usize;
173
174 for (index, ch) in input.char_indices() {
175 if matches!(scanner.advance(index, ch), ScanTransition::NormalChar('|')) {
176 push_segment(input, segment_start, index, &mut out);
177 segment_start = index + ch.len_utf8();
178 }
179 }
180
181 scanner.finish(input.len())?;
182 push_segment(input, segment_start, input.len(), &mut out);
183 Ok(out)
184}
185
186pub fn tokenize_stage(segment: &StageSegment) -> Result<Vec<Token>, LexerError> {
188 let mut words = tokenize_words(&segment.raw, segment.span.start)?;
189 let mut out = Vec::new();
190 for word in words.drain(..) {
191 split_word_token(word, segment, &mut out);
192 }
193 Ok(out)
194}
195
196fn tokenize_words(input: &str, base_offset: usize) -> Result<Vec<Token>, LexerError> {
197 let mut scanner = QuoteScanner::new(base_offset);
198 let mut words = Vec::new();
199 let mut current = String::new();
200 let mut token_start: Option<usize> = None;
201
202 for (index, ch) in input.char_indices() {
203 if scanner.is_normal() && ch.is_whitespace() {
204 finish_word(
205 &mut words,
206 &mut current,
207 &mut token_start,
208 index,
209 base_offset,
210 );
211 continue;
212 }
213
214 if scanner.is_normal() && token_start.is_none() {
215 token_start = Some(index);
216 }
217
218 match scanner.advance(index, ch) {
219 ScanTransition::NormalChar(ch)
220 | ScanTransition::QuotedChar(ch)
221 | ScanTransition::EscapedChar(ch) => {
222 current.push(ch);
223 }
224 ScanTransition::Structural => {}
225 }
226 }
227
228 scanner.finish(input.len())?;
229 finish_word(
230 &mut words,
231 &mut current,
232 &mut token_start,
233 input.len(),
234 base_offset,
235 );
236
237 Ok(words)
238}
239
240fn finish_word(
241 out: &mut Vec<Token>,
242 current: &mut String,
243 token_start: &mut Option<usize>,
244 end_index: usize,
245 base_offset: usize,
246) {
247 if let Some(start_index) = token_start.take() {
248 out.push(Token {
249 kind: TokenKind::Word,
250 span: Span {
251 start: base_offset + start_index,
252 end: base_offset + end_index,
253 },
254 text: std::mem::take(current),
255 });
256 }
257}
258
/// Re-examines a whitespace-delimited `Word` token and splits it around
/// inline comparison operators (e.g. `uid>=5` -> `uid`, `>=`, `5`).
///
/// Operator characters inside quotes or after an escape stay word content. A
/// word that *is* exactly an operator spelling becomes a single `Op` token.
/// A leading marker prefix (`!?`, `==`, `!=`, `!`, `?`, `=`) is kept glued to
/// its word instead of being split off — presumably prefix operators of the
/// query DSL (see the `keeps_prefix_operators` test); confirm against the
/// parser that consumes these tokens.
fn split_word_token(token: Token, segment: &StageSegment, out: &mut Vec<Token>) {
    // Non-words pass through untouched.
    if token.kind != TokenKind::Word {
        out.push(token);
        return;
    }

    // Recover the raw (still quoted/escaped) source of this word from the
    // stage text via the token's absolute span.
    let relative_start = token.span.start.saturating_sub(segment.span.start);
    let relative_end = token.span.end.saturating_sub(segment.span.start);
    let raw = &segment.raw[relative_start..relative_end];

    // Whole-word operator: reuse the token's span/text, swap the kind.
    if let Some(op) = parse_full_operator(raw) {
        out.push(Token {
            kind: TokenKind::Op(op),
            ..token
        });
        return;
    }

    // Local re-scan of the raw word with its own quote/escape state machine.
    let mut state = State::Normal;
    let mut split_happened = false;
    let mut current_text = String::new();
    // Byte offset (within `raw`) where the current sub-word started.
    let mut current_raw_start: Option<usize> = None;
    let mut cursor = 0usize;

    while cursor < raw.len() {
        let tail = &raw[cursor..];
        let ch = tail
            .chars()
            .next()
            .expect("cursor should always point at a valid character boundary");
        let width = ch.len_utf8();

        match state {
            State::Normal => {
                // At the very start of the word, swallow a protected marker
                // prefix so it is not misread as an inline operator. The
                // `< raw.len()` guard leaves bare operators (`==`, `!=`, ...)
                // to the whole-word check above / the `_` arm below.
                if current_raw_start.is_none()
                    && current_text.is_empty()
                    && cursor == 0
                    && !raw.is_empty()
                {
                    let protected_prefix_len = protected_prefix_len(raw);
                    if protected_prefix_len > 0 && protected_prefix_len < raw.len() {
                        current_raw_start = Some(0);
                        current_text.push_str(&raw[..protected_prefix_len]);
                        cursor += protected_prefix_len;
                        continue;
                    }
                }

                match ch {
                    '\\' => {
                        current_raw_start.get_or_insert(cursor);
                        state = State::EscapeNormal;
                    }
                    '\'' => {
                        current_raw_start.get_or_insert(cursor);
                        state = State::SingleQuote;
                    }
                    '"' => {
                        current_raw_start.get_or_insert(cursor);
                        state = State::DoubleQuote;
                    }
                    _ => {
                        // Unquoted operator: flush the word built so far,
                        // emit the operator, and continue after it.
                        if let Some((op, op_width)) = parse_operator_at(raw, cursor) {
                            push_split_word(
                                out,
                                token.span.start,
                                current_raw_start.take(),
                                cursor,
                                &mut current_text,
                            );
                            out.push(Token {
                                kind: TokenKind::Op(op),
                                span: Span {
                                    start: token.span.start + cursor,
                                    end: token.span.start + cursor + op_width,
                                },
                                text: raw[cursor..cursor + op_width].to_string(),
                            });
                            split_happened = true;
                            cursor += op_width;
                            continue;
                        }

                        current_raw_start.get_or_insert(cursor);
                        current_text.push(ch);
                    }
                }
            }
            // Inside quotes/escapes, operator characters are plain text.
            State::SingleQuote => {
                if ch == '\'' {
                    state = State::Normal;
                } else {
                    current_text.push(ch);
                }
            }
            State::DoubleQuote => {
                if ch == '"' {
                    state = State::Normal;
                } else if ch == '\\' {
                    state = State::EscapeDouble;
                } else {
                    current_text.push(ch);
                }
            }
            State::EscapeNormal => {
                current_text.push(ch);
                state = State::Normal;
            }
            State::EscapeDouble => {
                current_text.push(ch);
                state = State::DoubleQuote;
            }
        }

        cursor += width;
    }

    // No operator found: keep the original token (span and text intact).
    if !split_happened {
        out.push(token);
        return;
    }

    // Flush the trailing sub-word after the last operator, if any.
    push_split_word(
        out,
        token.span.start,
        current_raw_start,
        raw.len(),
        &mut current_text,
    );
}
389
390fn push_split_word(
391 out: &mut Vec<Token>,
392 base_start: usize,
393 raw_start: Option<usize>,
394 raw_end: usize,
395 text: &mut String,
396) {
397 let Some(raw_start) = raw_start else {
398 return;
399 };
400
401 out.push(Token {
402 kind: TokenKind::Word,
403 span: Span {
404 start: base_start + raw_start,
405 end: base_start + raw_end,
406 },
407 text: std::mem::take(text),
408 });
409}
410
411fn parse_full_operator(text: &str) -> Option<Op> {
412 match text {
413 "=" => Some(Op::Eq),
414 "==" => Some(Op::EqEq),
415 "!=" => Some(Op::Ne),
416 "<" => Some(Op::Lt),
417 "<=" => Some(Op::Le),
418 ">" => Some(Op::Gt),
419 ">=" => Some(Op::Ge),
420 _ => None,
421 }
422}
423
/// Length in bytes of a leading marker prefix that must stay glued to its
/// word instead of being split off as an operator (e.g. `==online`,
/// `!?interfaces`). Returns 0 when the word has no such prefix.
fn protected_prefix_len(text: &str) -> usize {
    if ["!?", "==", "!="].iter().any(|p| text.starts_with(p)) {
        2
    } else if matches!(text.as_bytes().first(), Some(b'!' | b'?' | b'=')) {
        // Single-byte markers; byte comparison is safe because these are
        // ASCII and a multi-byte UTF-8 char never starts with an ASCII byte.
        1
    } else {
        0
    }
}
435
436fn parse_operator_at(text: &str, offset: usize) -> Option<(Op, usize)> {
437 let tail = text.get(offset..)?;
438 if tail.starts_with("<=") {
439 return Some((Op::Le, 2));
440 }
441 if tail.starts_with(">=") {
442 return Some((Op::Ge, 2));
443 }
444 if tail.starts_with("==") {
445 return Some((Op::EqEq, 2));
446 }
447 if tail.starts_with("!=") {
448 return Some((Op::Ne, 2));
449 }
450 if tail.starts_with('<') {
451 return Some((Op::Lt, 1));
452 }
453 if tail.starts_with('>') {
454 return Some((Op::Gt, 1));
455 }
456 if tail.starts_with('=') {
457 return Some((Op::Eq, 1));
458 }
459 None
460}
461
462fn push_segment(input: &str, start: usize, end: usize, out: &mut Vec<StageSegment>) {
463 let (trimmed_start, trimmed_end) = trim_span(input, start, end);
464 if trimmed_start >= trimmed_end {
465 return;
466 }
467
468 out.push(StageSegment {
469 raw: input[trimmed_start..trimmed_end].to_string(),
470 span: Span {
471 start: trimmed_start,
472 end: trimmed_end,
473 },
474 });
475}
476
/// Shrinks the `[start, end)` byte range of `input` so it excludes leading
/// and trailing whitespace. An empty or inverted range collapses to
/// `(start, start)`; an all-whitespace range collapses to `(end, end)`.
fn trim_span(input: &str, start: usize, end: usize) -> (usize, usize) {
    if start >= end {
        return (start, start);
    }

    let window = &input[start..end];
    let without_leading = window.trim_start();
    // Leading whitespace width = bytes removed by trim_start.
    let trimmed_start = start + (window.len() - without_leading.len());
    let kept = without_leading.trim_end();
    (trimmed_start, trimmed_start + kept.len())
}
508
#[cfg(test)]
mod tests {
    use super::{LexerError, Op, Span, StageSegment, TokenKind, split_pipeline, tokenize_stage};

    // A `|` inside single quotes must not terminate a stage.
    #[test]
    fn split_pipeline_respects_quoted_pipes() {
        let segments = split_pipeline("ldap user 'foo|bar' | P uid | F uid=oistes")
            .expect("pipeline should parse");
        assert_eq!(segments.len(), 3);
        assert_eq!(segments[0].raw, "ldap user 'foo|bar'");
        assert_eq!(segments[1].raw, "P uid");
        assert_eq!(segments[2].raw, "F uid=oistes");
    }

    // The error carries the byte offset of the opening quote.
    #[test]
    fn split_pipeline_reports_unterminated_quote() {
        let error = split_pipeline("ldap user 'foo|bar | P uid").expect_err("should fail");
        assert_eq!(error, LexerError::UnterminatedSingleQuote { start: 10 });
    }

    // A `\` at end of input is a dangling escape; index points past the end.
    #[test]
    fn split_pipeline_reports_trailing_escape() {
        let input = "ldap user foo\\";
        let error = split_pipeline(input).expect_err("trailing escape should fail");
        assert_eq!(error, LexerError::TrailingEscape { index: input.len() });
    }

    // `uid>=5` with no spaces splits into word / operator / word tokens.
    #[test]
    fn tokenize_stage_splits_inline_operators() {
        let stage = StageSegment {
            raw: "F uid>=5".to_string(),
            span: Span { start: 0, end: 8 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "F");
        assert_eq!(tokens[1].text, "uid");
        assert_eq!(tokens[2].kind, TokenKind::Op(Op::Ge));
        assert_eq!(tokens[3].text, "5");
    }

    // Leading marker prefixes (`==`, `!?`) stay attached to their word.
    #[test]
    fn tokenize_stage_keeps_prefix_operators_in_single_token() {
        let stage = StageSegment {
            raw: "Q ==online !?interfaces".to_string(),
            span: Span { start: 0, end: 22 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens[1].text, "==online");
        assert_eq!(tokens[2].text, "!?interfaces");
    }

    // Double quotes group a value containing a space into one word, and the
    // quotes themselves are stripped from the token text.
    #[test]
    fn tokenize_stage_handles_quotes_and_escapes() {
        let stage = StageSegment {
            raw: "F cn=\"foo bar\"".to_string(),
            span: Span { start: 0, end: 14 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens[0].text, "F");
        assert_eq!(tokens[1].text, "cn");
        assert_eq!(tokens[2].kind, TokenKind::Op(Op::Eq));
        assert_eq!(tokens[3].text, "foo bar");
    }

    // Operator characters inside a quoted value are plain text, not splits.
    #[test]
    fn tokenize_stage_keeps_operator_chars_inside_quoted_value() {
        let stage = StageSegment {
            raw: "F note=\"a=b>=c\"".to_string(),
            span: Span { start: 0, end: 15 },
        };

        let tokens = tokenize_stage(&stage).expect("tokenization should work");
        assert_eq!(tokens.len(), 4);
        assert_eq!(tokens[0].text, "F");
        assert_eq!(tokens[1].text, "note");
        assert_eq!(tokens[2].kind, TokenKind::Op(Op::Eq));
        assert_eq!(tokens[3].text, "a=b>=c");
    }

    // With a nonzero stage span, the reported escape index is absolute
    // (segment start 7 + raw length 15 = 22).
    #[test]
    fn tokenize_stage_reports_trailing_escape() {
        let stage = StageSegment {
            raw: "F path=C:\\Temp\\".to_string(),
            span: Span { start: 7, end: 22 },
        };

        let error = tokenize_stage(&stage).expect_err("trailing escape should fail");
        assert_eq!(error, LexerError::TrailingEscape { index: 22 });
    }
}