1use std::fmt;
4
5#[derive(Debug, Clone, PartialEq)]
6pub enum ParseError {
7 UnexpectedChar(char),
8 UnexpectedToken(String),
9 UnterminatedString,
10 UnsupportedSyntax(String),
11 EmptyExpression,
12}
13
14impl fmt::Display for ParseError {
15 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
16 match self {
17 ParseError::UnexpectedChar(c) => write!(f, "unexpected character '{c}'"),
18 ParseError::UnexpectedToken(s) => write!(f, "unexpected token '{s}'"),
19 ParseError::UnterminatedString => write!(f, "unterminated string literal"),
20 ParseError::UnsupportedSyntax(s) => write!(f, "unsupported syntax: {s}"),
21 ParseError::EmptyExpression => write!(f, "empty expression"),
22 }
23 }
24}
25
26impl std::error::Error for ParseError {}
27
28#[derive(Debug, Clone, PartialEq)]
32pub enum FieldRef {
33 Fmt(String),
35 Info(String),
37 Qual,
39 Filter,
41 Gt,
43}
44
45#[derive(Debug, Clone, PartialEq)]
48pub enum Value {
49 Num(f64),
51 Str(String),
53}
54
55#[derive(Debug, Clone, PartialEq)]
58pub enum CmpOp {
59 Lt,
60 Le,
61 Gt,
62 Ge,
63 Eq,
64 Ne,
65}
66
67#[derive(Debug, Clone, PartialEq)]
68pub enum LogOp {
69 And,
71 AndVec,
74 Or,
76 OrVec,
78}
79
80#[derive(Debug, Clone, PartialEq)]
81pub enum Expr {
82 Cmp {
84 field: FieldRef,
85 op: CmpOp,
86 val: Value,
87 },
88 Logic {
90 op: LogOp,
91 lhs: Box<Expr>,
92 rhs: Box<Expr>,
93 },
94 Paren(Box<Expr>),
96}
97
98#[derive(Debug, Clone, PartialEq)]
101enum Tok {
102 Ident(String),
103 NumLit(f64),
104 StrLit(String),
105 Lt,
106 Le,
107 Gt,
108 Ge,
109 Eq,
110 Ne,
111 And,
113 AndVec,
115 Or,
117 OrVec,
119 LParen,
120 RParen,
121 Eof,
122}
123
124struct Lexer<'a> {
127 src: &'a [u8],
128 pos: usize,
129}
130
131impl<'a> Lexer<'a> {
132 fn new(src: &'a str) -> Self {
133 Self {
134 src: src.as_bytes(),
135 pos: 0,
136 }
137 }
138
139 fn peek(&self) -> Option<u8> {
140 self.src.get(self.pos).copied()
141 }
142
143 fn advance(&mut self) -> Option<u8> {
144 let b = self.src.get(self.pos).copied();
145 self.pos += 1;
146 b
147 }
148
149 fn skip_ws(&mut self) {
150 while matches!(self.peek(), Some(b' ' | b'\t' | b'\r' | b'\n')) {
151 self.pos += 1;
152 }
153 }
154
155 fn read_ident(&mut self) -> String {
156 let start = self.pos - 1; while matches!(
158 self.peek(),
159 Some(b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.')
160 ) {
161 self.pos += 1;
162 }
163 if self.peek() == Some(b'/') {
165 self.pos += 1;
166 while matches!(
167 self.peek(),
168 Some(b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-')
169 ) {
170 self.pos += 1;
171 }
172 }
173 String::from_utf8_lossy(&self.src[start..self.pos]).into_owned()
174 }
175
176 fn read_num(&mut self, first: u8) -> Result<f64, ParseError> {
177 let start = self.pos - 1;
178 if first == b'-' || first == b'+' {
179 }
181 while matches!(
182 self.peek(),
183 Some(b'0'..=b'9' | b'.' | b'e' | b'E' | b'+' | b'-')
184 ) {
185 self.pos += 1;
186 }
187 let s = std::str::from_utf8(&self.src[start..self.pos]).unwrap_or("0");
188 s.parse::<f64>()
189 .map_err(|_| ParseError::UnexpectedToken(s.to_owned()))
190 }
191
192 fn read_string(&mut self, quote: u8) -> Result<String, ParseError> {
193 let mut s = String::new();
194 loop {
195 match self.advance() {
196 None => return Err(ParseError::UnterminatedString),
197 Some(b) if b == quote => break,
198 Some(b'\\') => match self.advance() {
199 None => return Err(ParseError::UnterminatedString),
200 Some(c) => s.push(c as char),
201 },
202 Some(b) => s.push(b as char),
203 }
204 }
205 Ok(s)
206 }
207
208 fn next_tok(&mut self) -> Result<Tok, ParseError> {
209 self.skip_ws();
210 let Some(b) = self.advance() else {
211 return Ok(Tok::Eof);
212 };
213 match b {
214 b'(' => Ok(Tok::LParen),
215 b')' => Ok(Tok::RParen),
216 b'<' => {
217 if self.peek() == Some(b'=') {
218 self.pos += 1;
219 Ok(Tok::Le)
220 } else {
221 Ok(Tok::Lt)
222 }
223 }
224 b'>' => {
225 if self.peek() == Some(b'=') {
226 self.pos += 1;
227 Ok(Tok::Ge)
228 } else {
229 Ok(Tok::Gt)
230 }
231 }
232 b'=' => {
233 if self.peek() == Some(b'=') {
234 self.pos += 1;
235 Ok(Tok::Eq)
236 } else {
237 Ok(Tok::Eq)
239 }
240 }
241 b'!' => {
242 if self.peek() == Some(b'=') {
243 self.pos += 1;
244 Ok(Tok::Ne)
245 } else {
246 Err(ParseError::UnexpectedChar('!'))
247 }
248 }
249 b'&' => {
250 if self.peek() == Some(b'&') {
251 self.pos += 1;
252 Ok(Tok::AndVec) } else {
254 Ok(Tok::And) }
256 }
257 b'|' => {
258 if self.peek() == Some(b'|') {
259 self.pos += 1;
260 Ok(Tok::OrVec) } else {
262 Ok(Tok::Or) }
264 }
265 b'"' | b'\'' => {
266 let s = self.read_string(b)?;
267 Ok(Tok::StrLit(s))
268 }
269 b'~' | b'@' => Err(ParseError::UnsupportedSyntax(format!(
270 "operator '{}' (regex/file-match) is not supported",
271 b as char
272 ))),
273 b if b.is_ascii_alphabetic() || b == b'_' => {
274 let ident = self.read_ident();
275 if self.peek() == Some(b'[') {
277 return Err(ParseError::UnsupportedSyntax(format!(
278 "array-index syntax '{ident}[...]' is not supported"
279 )));
280 }
281 Ok(Tok::Ident(ident))
282 }
283 b if b.is_ascii_digit() || b == b'-' => self.read_num(b).map(Tok::NumLit),
284 other => Err(ParseError::UnexpectedChar(other as char)),
285 }
286 }
287}
288
289struct Parser<'a> {
292 lex: Lexer<'a>,
293 current: Tok,
294}
295
296impl<'a> Parser<'a> {
297 fn new(src: &'a str) -> Result<Self, ParseError> {
298 let mut lex = Lexer::new(src);
299 let current = lex.next_tok()?;
300 Ok(Self { lex, current })
301 }
302
303 fn advance(&mut self) -> Result<Tok, ParseError> {
304 let old = std::mem::replace(&mut self.current, self.lex.next_tok()?);
305 Ok(old)
306 }
307
308 fn expect_eof(&self) -> Result<(), ParseError> {
309 if self.current == Tok::Eof {
310 Ok(())
311 } else {
312 Err(ParseError::UnexpectedToken(format!("{:?}", self.current)))
313 }
314 }
315
316 fn parse_field(ident: &str) -> Result<FieldRef, ParseError> {
318 let up = ident.to_ascii_uppercase();
319 if up == "QUAL" {
320 return Ok(FieldRef::Qual);
321 }
322 if up == "FILTER" {
323 return Ok(FieldRef::Filter);
324 }
325 if up == "GT" {
326 return Ok(FieldRef::Gt);
327 }
328 for bad in &[
330 "N_MISSING",
331 "F_MISSING",
332 "N_ALT",
333 "N_PASS",
334 "F_PASS",
335 "SMPL_MAX",
336 "SMPL_MIN",
337 "SMPL_AVG",
338 "AC",
339 "AN",
340 "AF",
341 "MAC",
342 "MAF",
343 "ILEN",
344 "CHROM",
345 "POS",
346 "REF",
347 "ALT",
348 ] {
349 if up == *bad {
350 return Err(ParseError::UnsupportedSyntax(format!(
351 "field '{ident}' is not supported in rsomics-vcf-expr"
352 )));
353 }
354 }
355 if let Some(tag) = up
356 .strip_prefix("FMT/")
357 .or_else(|| up.strip_prefix("FORMAT/"))
358 {
359 return Ok(FieldRef::Fmt(tag.to_owned()));
360 }
361 if let Some(tag) = up.strip_prefix("INFO/") {
362 return Ok(FieldRef::Info(tag.to_owned()));
363 }
364 Ok(FieldRef::Info(up))
368 }
369
370 fn parse_cmp_op(tok: &Tok) -> Option<CmpOp> {
371 match tok {
372 Tok::Lt => Some(CmpOp::Lt),
373 Tok::Le => Some(CmpOp::Le),
374 Tok::Gt => Some(CmpOp::Gt),
375 Tok::Ge => Some(CmpOp::Ge),
376 Tok::Eq => Some(CmpOp::Eq),
377 Tok::Ne => Some(CmpOp::Ne),
378 _ => None,
379 }
380 }
381
382 fn parse_primary(&mut self) -> Result<Expr, ParseError> {
384 match &self.current {
385 Tok::LParen => {
386 self.advance()?;
387 let inner = self.parse_or()?;
388 if self.current != Tok::RParen {
389 return Err(ParseError::UnexpectedToken(format!("{:?}", self.current)));
390 }
391 self.advance()?;
392 Ok(Expr::Paren(Box::new(inner)))
393 }
394 Tok::Ident(_) => {
395 let Tok::Ident(ident) = self.advance()? else {
396 unreachable!()
397 };
398 let field = Self::parse_field(&ident)?;
399 let op = Self::parse_cmp_op(&self.current)
401 .ok_or_else(|| ParseError::UnexpectedToken(format!("{:?}", self.current)))?;
402 self.advance()?;
403 let val = match &self.current {
405 Tok::NumLit(n) => {
406 let v = Value::Num(*n);
407 self.advance()?;
408 v
409 }
410 Tok::StrLit(_) => {
411 if let Tok::StrLit(s) = self.advance()? {
412 Value::Str(s)
413 } else {
414 unreachable!()
415 }
416 }
417 other => {
418 return Err(ParseError::UnexpectedToken(format!("{other:?}")));
419 }
420 };
421 Ok(Expr::Cmp { field, op, val })
422 }
423 other => Err(ParseError::UnexpectedToken(format!("{other:?}"))),
424 }
425 }
426
427 fn tok_to_logop(tok: &Tok) -> Option<LogOp> {
428 match tok {
429 Tok::And => Some(LogOp::And),
430 Tok::AndVec => Some(LogOp::AndVec),
431 Tok::Or => Some(LogOp::Or),
432 Tok::OrVec => Some(LogOp::OrVec),
433 _ => None,
434 }
435 }
436
437 fn parse_and(&mut self) -> Result<Expr, ParseError> {
438 let mut lhs = self.parse_primary()?;
439 while matches!(self.current, Tok::And | Tok::AndVec) {
440 let op = Self::tok_to_logop(&self.current).unwrap();
441 self.advance()?;
442 let rhs = self.parse_primary()?;
443 lhs = Expr::Logic {
444 op,
445 lhs: Box::new(lhs),
446 rhs: Box::new(rhs),
447 };
448 }
449 Ok(lhs)
450 }
451
452 fn parse_or(&mut self) -> Result<Expr, ParseError> {
453 let mut lhs = self.parse_and()?;
454 while matches!(self.current, Tok::Or | Tok::OrVec) {
455 let op = Self::tok_to_logop(&self.current).unwrap();
456 self.advance()?;
457 let rhs = self.parse_and()?;
458 lhs = Expr::Logic {
459 op,
460 lhs: Box::new(lhs),
461 rhs: Box::new(rhs),
462 };
463 }
464 Ok(lhs)
465 }
466
467 fn parse(mut self) -> Result<Expr, ParseError> {
468 if self.current == Tok::Eof {
469 return Err(ParseError::EmptyExpression);
470 }
471 let expr = self.parse_or()?;
472 self.expect_eof()?;
473 Ok(expr)
474 }
475}
476
477pub fn parse_expr(src: &str) -> Result<Expr, ParseError> {
479 Parser::new(src)?.parse()
480}
481
482#[cfg(test)]
485mod tests {
486 use super::*;
487
488 #[test]
489 fn parse_fmt_dp_lt() {
490 let expr = parse_expr("FMT/DP<5").unwrap();
491 assert_eq!(
492 expr,
493 Expr::Cmp {
494 field: FieldRef::Fmt("DP".into()),
495 op: CmpOp::Lt,
496 val: Value::Num(5.0),
497 }
498 );
499 }
500
501 #[test]
502 fn parse_qual_ge() {
503 let expr = parse_expr("QUAL>=30").unwrap();
504 assert_eq!(
505 expr,
506 Expr::Cmp {
507 field: FieldRef::Qual,
508 op: CmpOp::Ge,
509 val: Value::Num(30.0)
510 }
511 );
512 }
513
514 #[test]
515 fn parse_gt_eq_missing() {
516 let expr = parse_expr("GT=\".\"").unwrap();
517 assert_eq!(
518 expr,
519 Expr::Cmp {
520 field: FieldRef::Gt,
521 op: CmpOp::Eq,
522 val: Value::Str(".".into())
523 }
524 );
525 }
526
527 #[test]
528 fn parse_and_expr() {
529 let expr = parse_expr("FMT/DP>10 & FMT/GQ>=20").unwrap();
531 matches!(expr, Expr::Logic { op: LogOp::And, .. });
532 }
533
534 #[test]
535 fn parse_andvec_expr() {
536 let expr = parse_expr("FMT/DP>10 && FMT/GQ>=20").unwrap();
538 matches!(
539 expr,
540 Expr::Logic {
541 op: LogOp::AndVec,
542 ..
543 }
544 );
545 }
546
547 #[test]
548 fn parse_or_expr() {
549 let expr = parse_expr("FMT/DP<5 || FMT/GQ<10").unwrap();
550 matches!(
551 expr,
552 Expr::Logic {
553 op: LogOp::OrVec,
554 ..
555 }
556 );
557 }
558
559 #[test]
560 fn parse_paren() {
561 let expr = parse_expr("(FMT/DP<5)").unwrap();
562 matches!(expr, Expr::Paren(_));
563 }
564
565 #[test]
566 fn unsupported_regex_rejected() {
567 assert!(parse_expr("FILTER~PASS").is_err());
568 }
569
570 #[test]
571 fn unsupported_array_index_rejected() {
572 assert!(parse_expr("FMT/AD[0]>5").is_err());
573 }
574
575 #[test]
576 fn empty_expression_rejected() {
577 assert!(parse_expr("").is_err());
578 }
579
580 #[test]
581 fn parse_format_prefix() {
582 let expr = parse_expr("FORMAT/DP<5").unwrap();
583 assert_eq!(
584 expr,
585 Expr::Cmp {
586 field: FieldRef::Fmt("DP".into()),
587 op: CmpOp::Lt,
588 val: Value::Num(5.0),
589 }
590 );
591 }
592
593 #[test]
594 fn parse_info_field() {
595 let expr = parse_expr("INFO/AF<0.05").unwrap();
596 assert_eq!(
597 expr,
598 Expr::Cmp {
599 field: FieldRef::Info("AF".into()),
600 op: CmpOp::Lt,
601 val: Value::Num(0.05),
602 }
603 );
604 }
605}