Skip to main content

rsomics_vcf_expr/
parse.rs

1//! Tokenizer and recursive-descent parser for the filter-expression grammar.
2
3use std::fmt;
4
5#[derive(Debug, Clone, PartialEq)]
6pub enum ParseError {
7    UnexpectedChar(char),
8    UnexpectedToken(String),
9    UnterminatedString,
10    UnsupportedSyntax(String),
11    EmptyExpression,
12}
13
14impl fmt::Display for ParseError {
15    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
16        match self {
17            ParseError::UnexpectedChar(c) => write!(f, "unexpected character '{c}'"),
18            ParseError::UnexpectedToken(s) => write!(f, "unexpected token '{s}'"),
19            ParseError::UnterminatedString => write!(f, "unterminated string literal"),
20            ParseError::UnsupportedSyntax(s) => write!(f, "unsupported syntax: {s}"),
21            ParseError::EmptyExpression => write!(f, "empty expression"),
22        }
23    }
24}
25
26impl std::error::Error for ParseError {}
27
28// ── Field reference ───────────────────────────────────────────────────────────
29
30/// A field reference parsed from the expression.
31#[derive(Debug, Clone, PartialEq)]
32pub enum FieldRef {
33    /// `FMT/<tag>` or `FORMAT/<tag>` — per-sample FORMAT field.
34    Fmt(String),
35    /// `INFO/<tag>` — site-level INFO field.
36    Info(String),
37    /// `QUAL` — site-level QUAL column.
38    Qual,
39    /// `FILTER` — site-level FILTER string.
40    Filter,
41    /// `GT` — per-sample genotype with bcftools GT string semantics.
42    Gt,
43}
44
45// ── Value ─────────────────────────────────────────────────────────────────────
46
47#[derive(Debug, Clone, PartialEq)]
48pub enum Value {
49    /// Numeric literal.
50    Num(f64),
51    /// String literal (stripped of quotes).
52    Str(String),
53}
54
55// ── Expression AST ────────────────────────────────────────────────────────────
56
57#[derive(Debug, Clone, PartialEq)]
58pub enum CmpOp {
59    Lt,
60    Le,
61    Gt,
62    Ge,
63    Eq,
64    Ne,
65}
66
67#[derive(Debug, Clone, PartialEq)]
68pub enum LogOp {
69    /// `&`  — per-sample AND: sample passes if it satisfies BOTH sub-expressions.
70    And,
71    /// `&&` — bcftools "sample-wise AND": per-sample OR (sample passes if it satisfies EITHER).
72    /// Named "vec-and" in bcftools filter.c; confusingly acts as per-sample OR.
73    AndVec,
74    /// `|`  — site-level OR (alias: per-sample OR, same as `AndVec` for setGT purposes).
75    Or,
76    /// `||` — site-level OR (same as `|` in practice).
77    OrVec,
78}
79
80#[derive(Debug, Clone, PartialEq)]
81pub enum Expr {
82    /// `<field> <op> <value>`
83    Cmp {
84        field: FieldRef,
85        op: CmpOp,
86        val: Value,
87    },
88    /// Logical combination of two sub-expressions.
89    Logic {
90        op: LogOp,
91        lhs: Box<Expr>,
92        rhs: Box<Expr>,
93    },
94    /// `(<expr>)`
95    Paren(Box<Expr>),
96}
97
98// ── Token ─────────────────────────────────────────────────────────────────────
99
100#[derive(Debug, Clone, PartialEq)]
101enum Tok {
102    Ident(String),
103    NumLit(f64),
104    StrLit(String),
105    Lt,
106    Le,
107    Gt,
108    Ge,
109    Eq,
110    Ne,
111    /// Single `&` — per-sample AND (true intersection).
112    And,
113    /// Double `&&` — per-sample OR in bcftools semantics (vec-and).
114    AndVec,
115    /// Single `|` — OR.
116    Or,
117    /// Double `||` — OR (same as `|`).
118    OrVec,
119    LParen,
120    RParen,
121    Eof,
122}
123
124// ── Tokenizer ─────────────────────────────────────────────────────────────────
125
126struct Lexer<'a> {
127    src: &'a [u8],
128    pos: usize,
129}
130
131impl<'a> Lexer<'a> {
132    fn new(src: &'a str) -> Self {
133        Self {
134            src: src.as_bytes(),
135            pos: 0,
136        }
137    }
138
139    fn peek(&self) -> Option<u8> {
140        self.src.get(self.pos).copied()
141    }
142
143    fn advance(&mut self) -> Option<u8> {
144        let b = self.src.get(self.pos).copied();
145        self.pos += 1;
146        b
147    }
148
149    fn skip_ws(&mut self) {
150        while matches!(self.peek(), Some(b' ' | b'\t' | b'\r' | b'\n')) {
151            self.pos += 1;
152        }
153    }
154
155    fn read_ident(&mut self) -> String {
156        let start = self.pos - 1; // first char already consumed
157        while matches!(
158            self.peek(),
159            Some(b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' | b'.')
160        ) {
161            self.pos += 1;
162        }
163        // Include a trailing `/` to allow `FMT/DP` as a single ident
164        if self.peek() == Some(b'/') {
165            self.pos += 1;
166            while matches!(
167                self.peek(),
168                Some(b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-')
169            ) {
170                self.pos += 1;
171            }
172        }
173        String::from_utf8_lossy(&self.src[start..self.pos]).into_owned()
174    }
175
176    fn read_num(&mut self, first: u8) -> Result<f64, ParseError> {
177        let start = self.pos - 1;
178        if first == b'-' || first == b'+' {
179            // sign only — digits must follow
180        }
181        while matches!(
182            self.peek(),
183            Some(b'0'..=b'9' | b'.' | b'e' | b'E' | b'+' | b'-')
184        ) {
185            self.pos += 1;
186        }
187        let s = std::str::from_utf8(&self.src[start..self.pos]).unwrap_or("0");
188        s.parse::<f64>()
189            .map_err(|_| ParseError::UnexpectedToken(s.to_owned()))
190    }
191
192    fn read_string(&mut self, quote: u8) -> Result<String, ParseError> {
193        let mut s = String::new();
194        loop {
195            match self.advance() {
196                None => return Err(ParseError::UnterminatedString),
197                Some(b) if b == quote => break,
198                Some(b'\\') => match self.advance() {
199                    None => return Err(ParseError::UnterminatedString),
200                    Some(c) => s.push(c as char),
201                },
202                Some(b) => s.push(b as char),
203            }
204        }
205        Ok(s)
206    }
207
208    fn next_tok(&mut self) -> Result<Tok, ParseError> {
209        self.skip_ws();
210        let Some(b) = self.advance() else {
211            return Ok(Tok::Eof);
212        };
213        match b {
214            b'(' => Ok(Tok::LParen),
215            b')' => Ok(Tok::RParen),
216            b'<' => {
217                if self.peek() == Some(b'=') {
218                    self.pos += 1;
219                    Ok(Tok::Le)
220                } else {
221                    Ok(Tok::Lt)
222                }
223            }
224            b'>' => {
225                if self.peek() == Some(b'=') {
226                    self.pos += 1;
227                    Ok(Tok::Ge)
228                } else {
229                    Ok(Tok::Gt)
230                }
231            }
232            b'=' => {
233                if self.peek() == Some(b'=') {
234                    self.pos += 1;
235                    Ok(Tok::Eq)
236                } else {
237                    // bare `=` also means equality in bcftools expressions (GT=".")
238                    Ok(Tok::Eq)
239                }
240            }
241            b'!' => {
242                if self.peek() == Some(b'=') {
243                    self.pos += 1;
244                    Ok(Tok::Ne)
245                } else {
246                    Err(ParseError::UnexpectedChar('!'))
247                }
248            }
249            b'&' => {
250                if self.peek() == Some(b'&') {
251                    self.pos += 1;
252                    Ok(Tok::AndVec) // `&&` — bcftools vec-and (per-sample OR)
253                } else {
254                    Ok(Tok::And) // `&` — per-sample true AND
255                }
256            }
257            b'|' => {
258                if self.peek() == Some(b'|') {
259                    self.pos += 1;
260                    Ok(Tok::OrVec) // `||`
261                } else {
262                    Ok(Tok::Or) // `|`
263                }
264            }
265            b'"' | b'\'' => {
266                let s = self.read_string(b)?;
267                Ok(Tok::StrLit(s))
268            }
269            b'~' | b'@' => Err(ParseError::UnsupportedSyntax(format!(
270                "operator '{}' (regex/file-match) is not supported",
271                b as char
272            ))),
273            b if b.is_ascii_alphabetic() || b == b'_' => {
274                let ident = self.read_ident();
275                // Check for unsupported bracket syntax TAG[0]
276                if self.peek() == Some(b'[') {
277                    return Err(ParseError::UnsupportedSyntax(format!(
278                        "array-index syntax '{ident}[...]' is not supported"
279                    )));
280                }
281                Ok(Tok::Ident(ident))
282            }
283            b if b.is_ascii_digit() || b == b'-' => self.read_num(b).map(Tok::NumLit),
284            other => Err(ParseError::UnexpectedChar(other as char)),
285        }
286    }
287}
288
289// ── Parser ────────────────────────────────────────────────────────────────────
290
291struct Parser<'a> {
292    lex: Lexer<'a>,
293    current: Tok,
294}
295
296impl<'a> Parser<'a> {
297    fn new(src: &'a str) -> Result<Self, ParseError> {
298        let mut lex = Lexer::new(src);
299        let current = lex.next_tok()?;
300        Ok(Self { lex, current })
301    }
302
303    fn advance(&mut self) -> Result<Tok, ParseError> {
304        let old = std::mem::replace(&mut self.current, self.lex.next_tok()?);
305        Ok(old)
306    }
307
308    fn expect_eof(&self) -> Result<(), ParseError> {
309        if self.current == Tok::Eof {
310            Ok(())
311        } else {
312            Err(ParseError::UnexpectedToken(format!("{:?}", self.current)))
313        }
314    }
315
316    /// Parse a field reference from an identifier token.
317    fn parse_field(ident: &str) -> Result<FieldRef, ParseError> {
318        let up = ident.to_ascii_uppercase();
319        if up == "QUAL" {
320            return Ok(FieldRef::Qual);
321        }
322        if up == "FILTER" {
323            return Ok(FieldRef::Filter);
324        }
325        if up == "GT" {
326            return Ok(FieldRef::Gt);
327        }
328        // Unsupported aggregation / special fields
329        for bad in &[
330            "N_MISSING",
331            "F_MISSING",
332            "N_ALT",
333            "N_PASS",
334            "F_PASS",
335            "SMPL_MAX",
336            "SMPL_MIN",
337            "SMPL_AVG",
338            "AC",
339            "AN",
340            "AF",
341            "MAC",
342            "MAF",
343            "ILEN",
344            "CHROM",
345            "POS",
346            "REF",
347            "ALT",
348        ] {
349            if up == *bad {
350                return Err(ParseError::UnsupportedSyntax(format!(
351                    "field '{ident}' is not supported in rsomics-vcf-expr"
352                )));
353            }
354        }
355        if let Some(tag) = up
356            .strip_prefix("FMT/")
357            .or_else(|| up.strip_prefix("FORMAT/"))
358        {
359            return Ok(FieldRef::Fmt(tag.to_owned()));
360        }
361        if let Some(tag) = up.strip_prefix("INFO/") {
362            return Ok(FieldRef::Info(tag.to_owned()));
363        }
364        // Bare tag — treat as FORMAT if no prefix (bcftools falls back to INFO for bare names,
365        // but FORMAT fields are more common in -t q usage). We try FMT first at eval time.
366        // Represent as INFO here; eval layer will check FORMAT first.
367        Ok(FieldRef::Info(up))
368    }
369
370    fn parse_cmp_op(tok: &Tok) -> Option<CmpOp> {
371        match tok {
372            Tok::Lt => Some(CmpOp::Lt),
373            Tok::Le => Some(CmpOp::Le),
374            Tok::Gt => Some(CmpOp::Gt),
375            Tok::Ge => Some(CmpOp::Ge),
376            Tok::Eq => Some(CmpOp::Eq),
377            Tok::Ne => Some(CmpOp::Ne),
378            _ => None,
379        }
380    }
381
382    /// Parse a primary expression: `(expr)` or `field op value`.
383    fn parse_primary(&mut self) -> Result<Expr, ParseError> {
384        match &self.current {
385            Tok::LParen => {
386                self.advance()?;
387                let inner = self.parse_or()?;
388                if self.current != Tok::RParen {
389                    return Err(ParseError::UnexpectedToken(format!("{:?}", self.current)));
390                }
391                self.advance()?;
392                Ok(Expr::Paren(Box::new(inner)))
393            }
394            Tok::Ident(_) => {
395                let Tok::Ident(ident) = self.advance()? else {
396                    unreachable!()
397                };
398                let field = Self::parse_field(&ident)?;
399                // Expect comparison operator next
400                let op = Self::parse_cmp_op(&self.current)
401                    .ok_or_else(|| ParseError::UnexpectedToken(format!("{:?}", self.current)))?;
402                self.advance()?;
403                // Expect value next
404                let val = match &self.current {
405                    Tok::NumLit(n) => {
406                        let v = Value::Num(*n);
407                        self.advance()?;
408                        v
409                    }
410                    Tok::StrLit(_) => {
411                        if let Tok::StrLit(s) = self.advance()? {
412                            Value::Str(s)
413                        } else {
414                            unreachable!()
415                        }
416                    }
417                    other => {
418                        return Err(ParseError::UnexpectedToken(format!("{other:?}")));
419                    }
420                };
421                Ok(Expr::Cmp { field, op, val })
422            }
423            other => Err(ParseError::UnexpectedToken(format!("{other:?}"))),
424        }
425    }
426
427    fn tok_to_logop(tok: &Tok) -> Option<LogOp> {
428        match tok {
429            Tok::And => Some(LogOp::And),
430            Tok::AndVec => Some(LogOp::AndVec),
431            Tok::Or => Some(LogOp::Or),
432            Tok::OrVec => Some(LogOp::OrVec),
433            _ => None,
434        }
435    }
436
437    fn parse_and(&mut self) -> Result<Expr, ParseError> {
438        let mut lhs = self.parse_primary()?;
439        while matches!(self.current, Tok::And | Tok::AndVec) {
440            let op = Self::tok_to_logop(&self.current).unwrap();
441            self.advance()?;
442            let rhs = self.parse_primary()?;
443            lhs = Expr::Logic {
444                op,
445                lhs: Box::new(lhs),
446                rhs: Box::new(rhs),
447            };
448        }
449        Ok(lhs)
450    }
451
452    fn parse_or(&mut self) -> Result<Expr, ParseError> {
453        let mut lhs = self.parse_and()?;
454        while matches!(self.current, Tok::Or | Tok::OrVec) {
455            let op = Self::tok_to_logop(&self.current).unwrap();
456            self.advance()?;
457            let rhs = self.parse_and()?;
458            lhs = Expr::Logic {
459                op,
460                lhs: Box::new(lhs),
461                rhs: Box::new(rhs),
462            };
463        }
464        Ok(lhs)
465    }
466
467    fn parse(mut self) -> Result<Expr, ParseError> {
468        if self.current == Tok::Eof {
469            return Err(ParseError::EmptyExpression);
470        }
471        let expr = self.parse_or()?;
472        self.expect_eof()?;
473        Ok(expr)
474    }
475}
476
477/// Parse a bcftools-style filter expression into an AST.
478pub fn parse_expr(src: &str) -> Result<Expr, ParseError> {
479    Parser::new(src)?.parse()
480}
481
482// ── Unit tests ────────────────────────────────────────────────────────────────
483
484#[cfg(test)]
485mod tests {
486    use super::*;
487
488    #[test]
489    fn parse_fmt_dp_lt() {
490        let expr = parse_expr("FMT/DP<5").unwrap();
491        assert_eq!(
492            expr,
493            Expr::Cmp {
494                field: FieldRef::Fmt("DP".into()),
495                op: CmpOp::Lt,
496                val: Value::Num(5.0),
497            }
498        );
499    }
500
501    #[test]
502    fn parse_qual_ge() {
503        let expr = parse_expr("QUAL>=30").unwrap();
504        assert_eq!(
505            expr,
506            Expr::Cmp {
507                field: FieldRef::Qual,
508                op: CmpOp::Ge,
509                val: Value::Num(30.0)
510            }
511        );
512    }
513
514    #[test]
515    fn parse_gt_eq_missing() {
516        let expr = parse_expr("GT=\".\"").unwrap();
517        assert_eq!(
518            expr,
519            Expr::Cmp {
520                field: FieldRef::Gt,
521                op: CmpOp::Eq,
522                val: Value::Str(".".into())
523            }
524        );
525    }
526
527    #[test]
528    fn parse_and_expr() {
529        // `&` is per-sample true AND
530        let expr = parse_expr("FMT/DP>10 & FMT/GQ>=20").unwrap();
531        matches!(expr, Expr::Logic { op: LogOp::And, .. });
532    }
533
534    #[test]
535    fn parse_andvec_expr() {
536        // `&&` is bcftools "vec-and" (per-sample OR)
537        let expr = parse_expr("FMT/DP>10 && FMT/GQ>=20").unwrap();
538        matches!(
539            expr,
540            Expr::Logic {
541                op: LogOp::AndVec,
542                ..
543            }
544        );
545    }
546
547    #[test]
548    fn parse_or_expr() {
549        let expr = parse_expr("FMT/DP<5 || FMT/GQ<10").unwrap();
550        matches!(
551            expr,
552            Expr::Logic {
553                op: LogOp::OrVec,
554                ..
555            }
556        );
557    }
558
559    #[test]
560    fn parse_paren() {
561        let expr = parse_expr("(FMT/DP<5)").unwrap();
562        matches!(expr, Expr::Paren(_));
563    }
564
565    #[test]
566    fn unsupported_regex_rejected() {
567        assert!(parse_expr("FILTER~PASS").is_err());
568    }
569
570    #[test]
571    fn unsupported_array_index_rejected() {
572        assert!(parse_expr("FMT/AD[0]>5").is_err());
573    }
574
575    #[test]
576    fn empty_expression_rejected() {
577        assert!(parse_expr("").is_err());
578    }
579
580    #[test]
581    fn parse_format_prefix() {
582        let expr = parse_expr("FORMAT/DP<5").unwrap();
583        assert_eq!(
584            expr,
585            Expr::Cmp {
586                field: FieldRef::Fmt("DP".into()),
587                op: CmpOp::Lt,
588                val: Value::Num(5.0),
589            }
590        );
591    }
592
593    #[test]
594    fn parse_info_field() {
595        let expr = parse_expr("INFO/AF<0.05").unwrap();
596        assert_eq!(
597            expr,
598            Expr::Cmp {
599                field: FieldRef::Info("AF".into()),
600                op: CmpOp::Lt,
601                val: Value::Num(0.05),
602            }
603        );
604    }
605}