use std::fmt;
/// Errors produced while tokenizing or parsing an expression.
#[derive(Debug, Clone, PartialEq)]
pub enum ParseError {
/// The lexer met a character that cannot start any token (this includes a `!` that is not
/// followed by `=`).
UnexpectedChar(char),
/// A token appeared where the grammar does not allow it, or a numeric literal could not be
/// parsed. The payload is the number text or the `Debug` rendering of the offending token.
UnexpectedToken(String),
/// A quoted string literal (or a trailing backslash escape inside it) ran into end of input.
UnterminatedString,
/// Syntax that is recognised but deliberately rejected: the `~` / `@` operators, array
/// indexing after an identifier, or a field name on the parser's reject list. The payload is
/// a human-readable explanation.
UnsupportedSyntax(String),
/// The input contained no tokens at all (empty or whitespace only).
EmptyExpression,
}
impl fmt::Display for ParseError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ParseError::UnexpectedChar(c) => write!(f, "unexpected character '{c}'"),
ParseError::UnexpectedToken(s) => write!(f, "unexpected token '{s}'"),
ParseError::UnterminatedString => write!(f, "unterminated string literal"),
ParseError::UnsupportedSyntax(s) => write!(f, "unsupported syntax: {s}"),
ParseError::EmptyExpression => write!(f, "empty expression"),
}
}
}
// Marker impl: `ParseError` wraps no underlying error, so the trait's default methods suffice.
impl std::error::Error for ParseError {}
/// The field named on the left-hand side of a comparison.
#[derive(Debug, Clone, PartialEq)]
pub enum FieldRef {
/// Tag written after an `FMT/` or `FORMAT/` prefix; the parser stores it upper-cased.
Fmt(String),
/// Tag written after an `INFO/` prefix, or any bare identifier that is not otherwise
/// recognised or rejected; the parser stores it upper-cased.
Info(String),
/// The identifier `QUAL` (matched case-insensitively).
Qual,
/// The identifier `FILTER` (matched case-insensitively).
Filter,
/// The identifier `GT` (matched case-insensitively).
Gt,
}
/// The literal on the right-hand side of a comparison.
#[derive(Debug, Clone, PartialEq)]
pub enum Value {
/// A numeric literal, parsed as `f64`.
Num(f64),
/// The contents of a single- or double-quoted string literal, with the quotes removed.
Str(String),
}
/// Comparison operator of an [`Expr::Cmp`] node.
///
/// The enum carries no data, so it is `Copy` and can be used as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CmpOp {
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
    /// `=` or `==` (the lexer treats both spellings alike)
    Eq,
    /// `!=`
    Ne,
}
/// Logical connective of an [`Expr::Logic`] node.
///
/// The enum carries no data, so it is `Copy` and can be used as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LogOp {
    /// Written `&`.
    And,
    /// Written `&&`.
    AndVec,
    /// Written `|`.
    Or,
    /// Written `||`.
    OrVec,
}
/// Abstract syntax tree of a parsed expression.
#[derive(Debug, Clone, PartialEq)]
pub enum Expr {
/// A single comparison `field op val`.
Cmp {
// Field named on the left-hand side.
field: FieldRef,
// Comparison operator between field and literal.
op: CmpOp,
// Literal on the right-hand side.
val: Value,
},
/// Two sub-expressions joined by a logical operator.
Logic {
// Which connective joins the operands.
op: LogOp,
// Left operand.
lhs: Box<Expr>,
// Right operand.
rhs: Box<Expr>,
},
/// A parenthesised sub-expression; the parser keeps the grouping as an explicit node.
Paren(Box<Expr>),
}
/// Tokens produced by the lexer and consumed by the parser.
#[derive(Debug, Clone, PartialEq)]
enum Tok {
/// Identifier text, including an optional `/tag` suffix (e.g. `FMT/DP`).
Ident(String),
/// Numeric literal.
NumLit(f64),
/// Contents of a quoted string literal.
StrLit(String),
/// `<`
Lt,
/// `<=`
Le,
/// `>`
Gt,
/// `>=`
Ge,
/// `=` or `==`
Eq,
/// `!=`
Ne,
/// `&`
And,
/// `&&`
AndVec,
/// `|`
Or,
/// `||`
OrVec,
/// `(`
LParen,
/// `)`
RParen,
/// End of input.
Eof,
}
/// Byte-oriented tokenizer over the expression text.
struct Lexer<'a> {
/// The input text, viewed as raw bytes.
src: &'a [u8],
/// Index into `src` of the next byte to read.
pos: usize,
}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the first byte of `src`.
    fn new(src: &'a str) -> Self {
        Self {
            src: src.as_bytes(),
            pos: 0,
        }
    }

    /// Returns the next unread byte without consuming it, or `None` at end of input.
    fn peek(&self) -> Option<u8> {
        self.src.get(self.pos).copied()
    }

    /// Consumes and returns the next byte.
    ///
    /// At end of input this returns `None` and leaves `pos` untouched, so `pos` never runs
    /// past `src.len()`.
    fn advance(&mut self) -> Option<u8> {
        let b = self.peek()?;
        self.pos += 1;
        Some(b)
    }

    /// Consumes the next byte if it equals `expected`; reports whether it did.
    fn eat(&mut self, expected: u8) -> bool {
        let hit = self.peek() == Some(expected);
        if hit {
            self.pos += 1;
        }
        hit
    }

    /// Consumes bytes for as long as `keep` accepts them.
    fn eat_while<F: Fn(u8) -> bool>(&mut self, keep: F) {
        while matches!(self.peek(), Some(b) if keep(b)) {
            self.pos += 1;
        }
    }

    /// Skips spaces, tabs, carriage returns and newlines.
    fn skip_ws(&mut self) {
        self.eat_while(|b| matches!(b, b' ' | b'\t' | b'\r' | b'\n'));
    }

    /// Reads the rest of an identifier whose first byte sits at `start` (already consumed).
    ///
    /// The head may contain ASCII letters, digits, `_`, `-` and `.`. A single `/` may follow,
    /// introducing a tag made of ASCII letters, digits, `_` and `-` (no `.`).
    fn read_ident(&mut self, start: usize) -> String {
        self.eat_while(|b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.'));
        if self.eat(b'/') {
            self.eat_while(|b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-'));
        }
        // Every accepted byte is ASCII, so the lossy conversion never substitutes anything.
        String::from_utf8_lossy(&self.src[start..self.pos]).into_owned()
    }

    /// Reads the rest of a numeric literal whose first byte sits at `start` (already consumed).
    ///
    /// The scan is deliberately greedy over digits, `.`, `e`, `E`, `+` and `-`; validation is
    /// left to `f64`'s parser.
    ///
    /// # Errors
    /// Returns [`ParseError::UnexpectedToken`] carrying the scanned text when it is not a
    /// valid `f64`.
    fn read_num(&mut self, start: usize) -> Result<f64, ParseError> {
        self.eat_while(|b| b.is_ascii_digit() || matches!(b, b'.' | b'e' | b'E' | b'+' | b'-'));
        // Every accepted byte is ASCII, so the lossy conversion never substitutes anything.
        let text = String::from_utf8_lossy(&self.src[start..self.pos]);
        match text.parse::<f64>() {
            Ok(n) => Ok(n),
            Err(_) => Err(ParseError::UnexpectedToken(text.into_owned())),
        }
    }

    /// Reads a string literal body up to the closing `quote` (the opening quote is already
    /// consumed). A backslash makes the following byte literal, so `\"` and `\\` work.
    ///
    /// # Errors
    /// Returns [`ParseError::UnterminatedString`] if input ends before the closing quote.
    fn read_string(&mut self, quote: u8) -> Result<String, ParseError> {
        // Collect raw bytes and decode once at the end: converting each byte with `as char`
        // would turn every byte of a multi-byte UTF-8 sequence into its own Latin-1 character.
        let mut bytes = Vec::new();
        loop {
            match self.advance() {
                None => return Err(ParseError::UnterminatedString),
                Some(b) if b == quote => break,
                Some(b'\\') => match self.advance() {
                    None => return Err(ParseError::UnterminatedString),
                    Some(escaped) => bytes.push(escaped),
                },
                Some(b) => bytes.push(b),
            }
        }
        // Only ASCII bytes (quotes, backslashes) were dropped from text that started out as a
        // `&str`, so what is left is still valid UTF-8; the lossy path is a safety net.
        Ok(String::from_utf8_lossy(&bytes).into_owned())
    }

    /// Produces the next token, returning [`Tok::Eof`] once input is exhausted (repeatedly, if
    /// called again).
    ///
    /// # Errors
    /// * [`ParseError::UnexpectedChar`] for a character that starts no token, including a `!`
    ///   not followed by `=`.
    /// * [`ParseError::UnsupportedSyntax`] for `~`, `@`, or an identifier followed by `[`.
    /// * [`ParseError::UnterminatedString`] / [`ParseError::UnexpectedToken`] from string and
    ///   number scanning.
    fn next_tok(&mut self) -> Result<Tok, ParseError> {
        self.skip_ws();
        let start = self.pos;
        let Some(b) = self.advance() else {
            return Ok(Tok::Eof);
        };
        match b {
            b'(' => Ok(Tok::LParen),
            b')' => Ok(Tok::RParen),
            b'<' => Ok(if self.eat(b'=') { Tok::Le } else { Tok::Lt }),
            b'>' => Ok(if self.eat(b'=') { Tok::Ge } else { Tok::Gt }),
            b'=' => {
                // `=` and `==` are the same operator; swallow the optional second `=`.
                self.eat(b'=');
                Ok(Tok::Eq)
            }
            b'!' => {
                if self.eat(b'=') {
                    Ok(Tok::Ne)
                } else {
                    Err(ParseError::UnexpectedChar('!'))
                }
            }
            b'&' => Ok(if self.eat(b'&') { Tok::AndVec } else { Tok::And }),
            b'|' => Ok(if self.eat(b'|') { Tok::OrVec } else { Tok::Or }),
            b'"' | b'\'' => self.read_string(b).map(Tok::StrLit),
            b'~' | b'@' => Err(ParseError::UnsupportedSyntax(format!(
                "operator '{}' (regex/file-match) is not supported",
                b as char
            ))),
            b if b.is_ascii_alphabetic() || b == b'_' => {
                let ident = self.read_ident(start);
                if self.peek() == Some(b'[') {
                    return Err(ParseError::UnsupportedSyntax(format!(
                        "array-index syntax '{ident}[...]' is not supported"
                    )));
                }
                Ok(Tok::Ident(ident))
            }
            b if b.is_ascii_digit() || b == b'-' => self.read_num(start).map(Tok::NumLit),
            other => {
                // Decode the whole scalar value so a non-ASCII character is reported as
                // itself rather than as the Latin-1 reading of its first byte.
                let decoded = match std::str::from_utf8(&self.src[start..]) {
                    Ok(rest) => rest.chars().next(),
                    Err(_) => None,
                };
                match decoded {
                    Some(c) => {
                        self.pos = start + c.len_utf8();
                        Err(ParseError::UnexpectedChar(c))
                    }
                    None => Err(ParseError::UnexpectedChar(other as char)),
                }
            }
        }
    }
}
/// Recursive-descent parser with a single token of lookahead.
struct Parser<'a> {
/// Token source.
lex: Lexer<'a>,
/// The lookahead token: read from the lexer but not yet consumed by the grammar.
current: Tok,
}
/// Grammar implemented below (lowest precedence first):
///
/// ```text
/// or_expr  := and_expr (("|" | "||") and_expr)*
/// and_expr := primary  (("&" | "&&") primary)*
/// primary  := "(" or_expr ")" | IDENT cmp_op (NUMBER | STRING)
/// ```
impl<'a> Parser<'a> {
    /// Creates a parser over `src` and primes the lookahead with the first token.
    ///
    /// # Errors
    /// Propagates any lexer error raised while reading the first token.
    fn new(src: &'a str) -> Result<Self, ParseError> {
        let mut lex = Lexer::new(src);
        let current = lex.next_tok()?;
        Ok(Self { lex, current })
    }

    /// Consumes the lookahead token and returns it, pulling the next one from the lexer.
    ///
    /// On a lexer error the lookahead is left unchanged.
    fn advance(&mut self) -> Result<Tok, ParseError> {
        let next = self.lex.next_tok()?;
        Ok(std::mem::replace(&mut self.current, next))
    }

    /// Builds the "unexpected token" error for `tok`, using its `Debug` rendering.
    fn unexpected(tok: &Tok) -> ParseError {
        ParseError::UnexpectedToken(format!("{tok:?}"))
    }

    /// Succeeds only when every token has been consumed.
    fn expect_eof(&self) -> Result<(), ParseError> {
        if self.current == Tok::Eof {
            Ok(())
        } else {
            Err(Self::unexpected(&self.current))
        }
    }

    /// Maps identifier text to the field it denotes.
    ///
    /// Matching is case-insensitive. `QUAL`, `FILTER` and `GT` map to their dedicated
    /// variants; `FMT/<tag>` and `FORMAT/<tag>` map to [`FieldRef::Fmt`]; `INFO/<tag>` and any
    /// other bare identifier map to [`FieldRef::Info`].
    ///
    /// # Errors
    /// * [`ParseError::UnsupportedSyntax`] for the bare names on the reject list.
    /// * [`ParseError::UnexpectedToken`] when a prefix is followed by an empty tag (for
    ///   example `FMT/`), which names no field at all.
    fn parse_field(ident: &str) -> Result<FieldRef, ParseError> {
        /// Bare field names that are recognised but rejected.
        const UNSUPPORTED: &[&str] = &[
            "N_MISSING",
            "F_MISSING",
            "N_ALT",
            "N_PASS",
            "F_PASS",
            "SMPL_MAX",
            "SMPL_MIN",
            "SMPL_AVG",
            "AC",
            "AN",
            "AF",
            "MAC",
            "MAF",
            "ILEN",
            "CHROM",
            "POS",
            "REF",
            "ALT",
        ];

        // NOTE(review): the tag is upper-cased together with the prefix, so `fmt/dp` and
        // `FMT/DP` yield the same tag. Confirm this is what the evaluator expects for
        // mixed-case tag IDs.
        let up = ident.to_ascii_uppercase();
        match up.as_str() {
            "QUAL" => return Ok(FieldRef::Qual),
            "FILTER" => return Ok(FieldRef::Filter),
            "GT" => return Ok(FieldRef::Gt),
            _ => {}
        }
        if UNSUPPORTED.contains(&up.as_str()) {
            return Err(ParseError::UnsupportedSyntax(format!(
                "field '{ident}' is not supported in rsomics-vcf-expr"
            )));
        }

        let format_tag = up
            .strip_prefix("FMT/")
            .or_else(|| up.strip_prefix("FORMAT/"));
        let (tag, is_format) = match format_tag {
            Some(tag) => (tag, true),
            None => (up.strip_prefix("INFO/").unwrap_or(up.as_str()), false),
        };
        if tag.is_empty() {
            // A prefix with nothing after it would otherwise parse as a field named "".
            return Err(ParseError::UnexpectedToken(ident.to_owned()));
        }
        Ok(if is_format {
            FieldRef::Fmt(tag.to_owned())
        } else {
            FieldRef::Info(tag.to_owned())
        })
    }

    /// Returns the comparison operator `tok` denotes, or `None` for any other token.
    fn parse_cmp_op(tok: &Tok) -> Option<CmpOp> {
        match tok {
            Tok::Lt => Some(CmpOp::Lt),
            Tok::Le => Some(CmpOp::Le),
            Tok::Gt => Some(CmpOp::Gt),
            Tok::Ge => Some(CmpOp::Ge),
            Tok::Eq => Some(CmpOp::Eq),
            Tok::Ne => Some(CmpOp::Ne),
            _ => None,
        }
    }

    /// Returns the conjunction `tok` denotes (`&` or `&&`), or `None` for any other token.
    fn and_op(tok: &Tok) -> Option<LogOp> {
        match tok {
            Tok::And => Some(LogOp::And),
            Tok::AndVec => Some(LogOp::AndVec),
            _ => None,
        }
    }

    /// Returns the disjunction `tok` denotes (`|` or `||`), or `None` for any other token.
    fn or_op(tok: &Tok) -> Option<LogOp> {
        match tok {
            Tok::Or => Some(LogOp::Or),
            Tok::OrVec => Some(LogOp::OrVec),
            _ => None,
        }
    }

    /// Parses `primary`: a parenthesised expression or a single comparison.
    fn parse_primary(&mut self) -> Result<Expr, ParseError> {
        if self.current == Tok::LParen {
            self.advance()?;
            let inner = self.parse_or()?;
            if self.current != Tok::RParen {
                return Err(Self::unexpected(&self.current));
            }
            self.advance()?;
            return Ok(Expr::Paren(Box::new(inner)));
        }
        if matches!(self.current, Tok::Ident(_)) {
            return self.parse_comparison();
        }
        Err(Self::unexpected(&self.current))
    }

    /// Parses `IDENT cmp_op (NUMBER | STRING)`; the lookahead must be the identifier.
    fn parse_comparison(&mut self) -> Result<Expr, ParseError> {
        // Consume the identifier first so a lexer error in the following token is reported
        // before any complaint about the field name.
        let ident = match self.advance()? {
            Tok::Ident(name) => name,
            other => return Err(Self::unexpected(&other)),
        };
        let field = Self::parse_field(&ident)?;

        let op = Self::parse_cmp_op(&self.current)
            .ok_or_else(|| Self::unexpected(&self.current))?;
        self.advance()?;

        // Move the string out of the lookahead instead of cloning it; the token is replaced
        // by the `advance` call right after.
        let val = match &mut self.current {
            Tok::NumLit(n) => Value::Num(*n),
            Tok::StrLit(s) => Value::Str(std::mem::take(s)),
            other => return Err(Self::unexpected(other)),
        };
        self.advance()?;

        Ok(Expr::Cmp { field, op, val })
    }

    /// Parses `and_expr`: primaries joined left-associatively by `&` / `&&`.
    fn parse_and(&mut self) -> Result<Expr, ParseError> {
        let mut lhs = self.parse_primary()?;
        while let Some(op) = Self::and_op(&self.current) {
            self.advance()?;
            let rhs = self.parse_primary()?;
            lhs = Expr::Logic {
                op,
                lhs: Box::new(lhs),
                rhs: Box::new(rhs),
            };
        }
        Ok(lhs)
    }

    /// Parses `or_expr`: conjunctions joined left-associatively by `|` / `||`.
    fn parse_or(&mut self) -> Result<Expr, ParseError> {
        let mut lhs = self.parse_and()?;
        while let Some(op) = Self::or_op(&self.current) {
            self.advance()?;
            let rhs = self.parse_and()?;
            lhs = Expr::Logic {
                op,
                lhs: Box::new(lhs),
                rhs: Box::new(rhs),
            };
        }
        Ok(lhs)
    }

    /// Parses the whole input as one expression.
    ///
    /// # Errors
    /// * [`ParseError::EmptyExpression`] when there are no tokens.
    /// * [`ParseError::UnexpectedToken`] when tokens remain after a complete expression.
    /// * Any error raised by the lexer or the grammar rules.
    fn parse(mut self) -> Result<Expr, ParseError> {
        if self.current == Tok::Eof {
            return Err(ParseError::EmptyExpression);
        }
        let expr = self.parse_or()?;
        self.expect_eof()?;
        Ok(expr)
    }
}
/// Parses `src` into an [`Expr`] tree.
///
/// The whole input must form exactly one expression.
///
/// # Errors
/// Returns a [`ParseError`] when the input is empty, contains a character or token the
/// grammar does not accept, uses syntax that is explicitly unsupported, or has tokens left
/// over after a complete expression.
pub fn parse_expr(src: &str) -> Result<Expr, ParseError> {
    // Building the parser already lexes the first token, so it can fail on its own.
    let parser = Parser::new(src)?;
    parser.parse()
}
#[cfg(test)]
mod tests {
    use super::*;

    // Comparisons against each kind of field.

    #[test]
    fn parse_fmt_dp_lt() {
        let expr = parse_expr("FMT/DP<5").unwrap();
        assert_eq!(
            expr,
            Expr::Cmp {
                field: FieldRef::Fmt("DP".into()),
                op: CmpOp::Lt,
                val: Value::Num(5.0),
            }
        );
    }

    #[test]
    fn parse_qual_ge() {
        let expr = parse_expr("QUAL>=30").unwrap();
        assert_eq!(
            expr,
            Expr::Cmp {
                field: FieldRef::Qual,
                op: CmpOp::Ge,
                val: Value::Num(30.0)
            }
        );
    }

    #[test]
    fn parse_gt_eq_missing() {
        let expr = parse_expr("GT=\".\"").unwrap();
        assert_eq!(
            expr,
            Expr::Cmp {
                field: FieldRef::Gt,
                op: CmpOp::Eq,
                val: Value::Str(".".into())
            }
        );
    }

    // Logical operators and grouping. `matches!` only yields a bool, so each check is wrapped
    // in `assert!`; a bare `matches!` would let the test pass whatever the parser returned.

    #[test]
    fn parse_and_expr() {
        let expr = parse_expr("FMT/DP>10 & FMT/GQ>=20").unwrap();
        assert!(matches!(expr, Expr::Logic { op: LogOp::And, .. }));
    }

    #[test]
    fn parse_andvec_expr() {
        let expr = parse_expr("FMT/DP>10 && FMT/GQ>=20").unwrap();
        assert!(matches!(
            expr,
            Expr::Logic {
                op: LogOp::AndVec,
                ..
            }
        ));
    }

    #[test]
    fn parse_or_expr() {
        let expr = parse_expr("FMT/DP<5 || FMT/GQ<10").unwrap();
        assert!(matches!(
            expr,
            Expr::Logic {
                op: LogOp::OrVec,
                ..
            }
        ));
    }

    #[test]
    fn parse_paren() {
        let expr = parse_expr("(FMT/DP<5)").unwrap();
        assert!(matches!(expr, Expr::Paren(_)));
    }

    // Rejected input: check the error variant, not merely that parsing failed.

    #[test]
    fn unsupported_regex_rejected() {
        assert!(matches!(
            parse_expr("FILTER~PASS"),
            Err(ParseError::UnsupportedSyntax(_))
        ));
    }

    #[test]
    fn unsupported_array_index_rejected() {
        assert!(matches!(
            parse_expr("FMT/AD[0]>5"),
            Err(ParseError::UnsupportedSyntax(_))
        ));
    }

    #[test]
    fn empty_expression_rejected() {
        assert_eq!(parse_expr(""), Err(ParseError::EmptyExpression));
    }

    // Alternative prefixes.

    #[test]
    fn parse_format_prefix() {
        let expr = parse_expr("FORMAT/DP<5").unwrap();
        assert_eq!(
            expr,
            Expr::Cmp {
                field: FieldRef::Fmt("DP".into()),
                op: CmpOp::Lt,
                val: Value::Num(5.0),
            }
        );
    }

    #[test]
    fn parse_info_field() {
        let expr = parse_expr("INFO/AF<0.05").unwrap();
        assert_eq!(
            expr,
            Expr::Cmp {
                field: FieldRef::Info("AF".into()),
                op: CmpOp::Lt,
                val: Value::Num(0.05),
            }
        );
    }
}