#define HTS_BUILDING_LIBRARY
#include <config.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdint.h>
#include <float.h>
#include <regex.h>
#include <math.h>
#include "htslib/hts_expr.h"
#include "htslib/hts_log.h"
#include "textutils_internal.h"
/* Number of slots in the per-filter table of compiled regular expressions. */
#define MAX_REGEX 10
/*
 * State of one filter expression: the expression text plus a table of
 * compiled regular expressions that is reused across evaluations.
 */
struct hts_filter_t {
char *str; /* heap copy of the expression text, made by hts_filter_init() */
int parsed; /* not referenced by any code visible in this part of the file */
/* curr_regex: index of the next regex operator within the current evaluation
 *             (reset to 0 at the start of each evaluation).
 * max_regex:  number of preg[] slots that have been handed to regcomp() and
 *             that hts_filter_free() will pass to regfree(). */
int curr_regex, max_regex;
regex_t preg[MAX_REGEX]; /* compiled patterns, indexed by curr_regex */
};
/*
 * Skips leading blanks.
 *
 * Returns a pointer to the first character of str that is neither a space
 * nor a tab (possibly the terminating NUL).
 */
static char *ws(char *str) {
    for (; *str == ' ' || *str == '\t'; str++) {
        /* nothing: the loop header does the skipping */
    }
    return str;
}
/*
 * Forward declaration: expression() is the top of the recursive-descent
 * grammar and is called by func_expr() and simple_expr(), which are defined
 * before it.
 */
static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
char *str, char **end, hts_expr_val_t *res);
/*
 * length(): replaces a string value by its length in bytes.
 *
 * Returns 0 on success, -1 if res does not hold a string.
 */
static int expr_func_length(hts_expr_val_t *res) {
    if (res->is_str) {
        res->d = res->s.l;
        res->is_str = 0;
        return 0;
    }
    return -1;
}
/*
 * min(): replaces a string value by the smallest of its bytes, each byte
 * being read as an unsigned 8-bit value.  A zero-length string yields NAN.
 *
 * Returns 0 on success, -1 if res does not hold a string.
 */
static int expr_func_min(hts_expr_val_t *res) {
    if (!res->is_str)
        return -1;

    const uint8_t *bytes = (uint8_t *)res->s.s;
    const size_t count = res->s.l;
    int lowest = INT_MAX;   /* sentinel: no byte can reach this value */
    size_t i = 0;

    while (i < count) {
        const int b = bytes[i++];
        if (b < lowest)
            lowest = b;
    }

    res->is_str = 0;
    if (lowest == INT_MAX)
        res->d = NAN;
    else
        res->d = lowest;
    return 0;
}
/*
 * max(): replaces a string value by the largest of its bytes, each byte
 * being read as an unsigned 8-bit value.  A zero-length string yields NAN.
 *
 * Returns 0 on success, -1 if res does not hold a string.
 */
static int expr_func_max(hts_expr_val_t *res) {
    if (!res->is_str)
        return -1;

    const uint8_t *bytes = (uint8_t *)res->s.s;
    const size_t count = res->s.l;
    int highest = INT_MIN;  /* sentinel: no byte can reach this value */
    size_t i = 0;

    while (i < count) {
        const int b = bytes[i++];
        if (b > highest)
            highest = b;
    }

    res->is_str = 0;
    if (highest == INT_MIN)
        res->d = NAN;
    else
        res->d = highest;
    return 0;
}
/*
 * avg(): replaces a string value by the arithmetic mean of its bytes, each
 * byte being read as an unsigned 8-bit value.  A zero-length string yields 0.
 *
 * Returns 0 on success, -1 if res does not hold a string.
 */
static int expr_func_avg(hts_expr_val_t *res) {
    if (!res->is_str)
        return -1;

    const uint8_t *bytes = (uint8_t *)res->s.s;
    const size_t count = res->s.l;
    double total = 0;
    size_t i;

    for (i = 0; i < count; i++)
        total += bytes[i];

    res->is_str = 0;
    res->d = count ? total / count : total;
    return 0;
}
/*
 * Parses and evaluates a built-in function call starting at str.
 *
 * Recognised functions: avg(), default(a,b), exists(), exp(), length(),
 * log(), min(), max(), pow(a,b) and sqrt().  Arguments are full expressions.
 *
 *   filt, data, fn  passed through unchanged to expression()
 *   str             points at the first letter of the function name
 *   end             out: set to the character after the closing ')'
 *   res             out: the function result
 *
 * Returns 0 on success; -1 if the name is not recognised, an argument fails
 * to evaluate, an argument has the wrong type, or the closing ')' is missing.
 */
static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                     char *str, char **end, hts_expr_val_t *res) {
    int func_ok = -1;

    switch (*str) {
    case 'a':
        if (strncmp(str, "avg(", 4) == 0) {
            if (expression(filt, data, fn, str+4, end, res)) return -1;
            func_ok = expr_func_avg(res);
        }
        break;

    case 'd':
        if (strncmp(str, "default(", 8) == 0) {
            if (expression(filt, data, fn, str+8, end, res)) return -1;
            if (**end != ',')
                return -1;
            (*end)++;

            hts_expr_val_t val = HTS_EXPR_VAL_INIT;
            if (expression(filt, data, fn, ws(*end), end, &val)) {
                hts_expr_val_free(&val);
                return -1;
            }
            func_ok = 1;

            if (!hts_expr_val_existsT(res)) {
                /* Use the fallback.  The two string buffers are swapped so
                 * that res's old buffer is the one released below. */
                kstring_t swap = res->s;
                *res = val;
                val.s = swap;
            }
            /* Release whichever buffer val now owns, in both cases. */
            hts_expr_val_free(&val);
        }
        break;

    case 'e':
        if (strncmp(str, "exists(", 7) == 0) {
            if (expression(filt, data, fn, str+7, end, res)) return -1;
            func_ok = 1;
            res->is_true = res->d = hts_expr_val_existsT(res);
            res->is_str = 0;
        } else if (strncmp(str, "exp(", 4) == 0) {
            if (expression(filt, data, fn, str+4, end, res)) return -1;
            func_ok = 1;
            res->d = exp(res->d);
            res->is_str = 0;
            if (isnan(res->d))
                hts_expr_val_undef(res);
        }
        break;

    case 'l':
        if (strncmp(str, "length(", 7) == 0) {
            if (expression(filt, data, fn, str+7, end, res)) return -1;
            func_ok = expr_func_length(res);
        } else if (strncmp(str, "log(", 4) == 0) {
            if (expression(filt, data, fn, str+4, end, res)) return -1;
            func_ok = 1;
            res->d = log(res->d);
            res->is_str = 0;
            if (isnan(res->d))
                hts_expr_val_undef(res);
        }
        break;

    case 'm':
        if (strncmp(str, "min(", 4) == 0) {
            if (expression(filt, data, fn, str+4, end, res)) return -1;
            func_ok = expr_func_min(res);
        } else if (strncmp(str, "max(", 4) == 0) {
            if (expression(filt, data, fn, str+4, end, res)) return -1;
            func_ok = expr_func_max(res);
        }
        break;

    case 'p':
        if (strncmp(str, "pow(", 4) == 0) {
            if (expression(filt, data, fn, str+4, end, res)) return -1;
            if (**end != ',')
                return -1;
            (*end)++;

            hts_expr_val_t val = HTS_EXPR_VAL_INIT;
            if (expression(filt, data, fn, ws(*end), end, &val)) {
                hts_expr_val_free(&val);
                return -1;
            }

            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
                /* An undefined operand makes the result undefined. */
                hts_expr_val_undef(res);
            } else if (res->is_str || val.is_str) {
                /* Arithmetic on strings is an error. */
                hts_expr_val_free(&val);
                return -1;
            } else {
                res->d = pow(res->d, val.d);
                res->is_str = 0;
            }
            hts_expr_val_free(&val);
            func_ok = 1;

            if (isnan(res->d))
                hts_expr_val_undef(res);
        }
        break;

    case 's':
        if (strncmp(str, "sqrt(", 5) == 0) {
            if (expression(filt, data, fn, str+5, end, res)) return -1;
            func_ok = 1;
            res->d = sqrt(res->d);
            res->is_str = 0;
            if (isnan(res->d))
                hts_expr_val_undef(res);
        }
        break;

    default:
        break;
    }

    if (func_ok < 0)
        return -1;

    /* Every function call must be closed by ')'. */
    str = ws(*end);
    if (*str != ')') {
        fprintf(stderr, "Missing ')'\n");
        return -1;
    }
    *end = str+1;

    return 0;
}
/*
 * Parses and evaluates the innermost grammar level: a parenthesised
 * expression, a numeric literal, a double-quoted string literal, or
 * (when fn is non-NULL) a symbol resolved by the fn callback, falling back
 * to a built-in function call via func_expr().
 *
 * On success *end points just past the consumed text and *res holds the
 * value.  Returns 0 on success, -1 on a syntax or evaluation error.
 */
static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
char *str, char **end, hts_expr_val_t *res) {
str = ws(str);
/* Parenthesised sub-expression. */
if (*str == '(') {
if (expression(filt, data, fn, str+1, end, res)) return -1;
str = ws(*end);
if (*str != ')') {
fprintf(stderr, "Missing ')'\n");
return -1;
}
*end = str+1;
return 0;
}
/* Try a numeric literal first.  hts_str2dbl() is declared outside this file;
 * the code relies on it leaving *end == str when nothing was consumed.
 * The 'fail' flag it fills in is not examined here. */
int fail = 0;
double d = hts_str2dbl(str, end, &fail);
if (str != *end) {
res->is_str = 0;
res->d = d;
} else {
if (*str == '"') {
/* String literal: find the closing quote, stepping over the character
 * after each backslash so that an escaped quote does not end the scan. */
res->is_str = 1;
char *e = str+1;
int backslash = 0;
while (*e && *e != '"') {
if (*e == '\\')
backslash=1, e+=1+(e[1]!='\0');
else
e++;
}
/* Copy the raw text between the quotes into res->s. */
kputsn(str+1, e-(str+1), ks_clear(&res->s));
if (backslash) {
/* Decode escapes in place: i reads, j writes (j <= i).  The backslash
 * is copied first, then overwritten by the decoded character for
 * \" \\ \t \n \r; for any other escape both characters are kept. */
size_t i, j;
for (i = j = 0; i < res->s.l; i++) {
res->s.s[j++] = res->s.s[i];
if (res->s.s[i] == '\\') {
switch (res->s.s[++i]) {
case '"': res->s.s[j-1] = '"'; break;
case '\\':res->s.s[j-1] = '\\'; break;
case 't': res->s.s[j-1] = '\t'; break;
case 'n': res->s.s[j-1] = '\n'; break;
case 'r': res->s.s[j-1] = '\r'; break;
default: res->s.s[j++] = res->s.s[i];
}
}
}
res->s.s[j] = 0;
res->s.l = j;
}
/* Unterminated string literal. */
if (*e != '"')
return -1;
*end = e+1;
} else if (fn) {
/* Ask the caller's symbol callback first (0 means it handled the text);
 * otherwise try the built-in functions. */
if (fn(data, str, end, res) == 0)
return 0;
else
return func_expr(filt, data, fn, str, end, res);
} else {
return -1;
}
}
return 0;
}
/*
 * Parses and evaluates an optional prefix operator followed by its operand.
 *
 *   +x, -x  numeric sign; a string operand is an error
 *   !x      logical negation, also defined for strings and undefined values
 *   ~x      bitwise complement of the value truncated to int64_t; a string
 *           operand is an error
 *
 * With no prefix operator the text is handed straight to simple_expr().
 * An undefined operand of +, - or ~ yields an undefined result.
 *
 * Returns 0 on success, -1 on error.
 */
static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                      char *str, char **end, hts_expr_val_t *res) {
    int status;

    str = ws(str);
    switch (*str) {
    case '+':
    case '-':
        status = simple_expr(filt, data, fn, str+1, end, res);
        if (hts_expr_val_exists(res)) {
            status |= res->is_str;
            if (*str == '-')
                res->d = -res->d;
            res->is_true = res->d != 0;
        } else {
            hts_expr_val_undef(res);
        }
        break;

    case '!':
        status = unary_expr(filt, data, fn, str+1, end, res);
        if (res->is_true) {
            /* An explicitly true value becomes false. */
            res->is_true = 0;
            res->d = 0;
        } else if (!hts_expr_val_exists(res)) {
            /* Undefined and not flagged true: negation flags it true. */
            res->is_true = 1;
            res->d = 1;
        } else if (res->is_str) {
            /* Only a string without a buffer negates to true. */
            const int no_buffer = (res->s.s == NULL);
            res->is_true = no_buffer;
            res->d = no_buffer;
        } else {
            res->d = !(int64_t)res->d;
            res->is_true = res->d != 0;
        }
        res->is_str = 0;
        break;

    case '~':
        status = unary_expr(filt, data, fn, str+1, end, res);
        if (hts_expr_val_exists(res)) {
            status |= res->is_str;
            res->d = ~(int64_t)res->d;
            res->is_true = res->d != 0;
        } else {
            hts_expr_val_undef(res);
        }
        break;

    default:
        status = simple_expr(filt, data, fn, str, end, res);
        break;
    }

    return status ? -1 : 0;
}
/*
 * Parses and evaluates a left-associative chain of '*', '/' and '%'
 * applied to unary expressions.
 *
 *   filt, data, fn  passed through unchanged to unary_expr()
 *   str             text to parse
 *   end             out: first character not consumed
 *   res             out: the resulting value
 *
 * An undefined operand makes the result undefined.  '%' works on the
 * operands truncated to int64_t; it yields an undefined result when either
 * operand is outside the int64_t range or the truncated divisor is zero.
 *
 * Returns 0 on success, -1 on a parse error or when an operand is a string.
 */
static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                    char *str, char **end, hts_expr_val_t *res) {
    if (unary_expr(filt, data, fn, str, end, res))
        return -1;

    str = *end;
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
    while (*str) {
        str = ws(str);
        if (*str != '*' && *str != '/' && *str != '%')
            break;

        if (unary_expr(filt, data, fn, str+1, end, &val)) {
            hts_expr_val_free(&val);
            return -1;
        }

        if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
            hts_expr_val_undef(res);
        } else if (val.is_str || res->is_str) {
            /* Arithmetic on strings is an error. */
            hts_expr_val_free(&val);
            return -1;
        } else if (*str == '*') {
            res->d *= val.d;
        } else if (*str == '/') {
            res->d /= val.d;
        } else {
            /* '%': check that both operands convert to int64_t before
             * casting, and test the divisor after truncation, because a
             * value such as 0.5 is non-zero but truncates to 0. */
            const double int64_limit = 9223372036854775808.0; /* 2^63 */
            if (!(fabs(res->d) < int64_limit) ||
                !(fabs(val.d) < int64_limit)) {
                hts_expr_val_undef(res);
            } else {
                const int64_t lhs = (int64_t)res->d;
                const int64_t rhs = (int64_t)val.d;
                if (rhs == 0)
                    hts_expr_val_undef(res);
                else if (rhs == -1)
                    res->d = 0;     /* keeps clear of INT64_MIN % -1 */
                else
                    res->d = lhs % rhs;
            }
        }

        res->is_true = hts_expr_val_exists(res) && (res->d != 0);
        str = *end;
    }

    hts_expr_val_free(&val);

    return 0;
}
/*
 * Parses and evaluates a left-associative chain of binary '+' and '-'
 * applied to multiplicative expressions.
 *
 *   filt, data, fn  passed through unchanged to mul_expr()
 *   str             text to parse
 *   end             out: first character not consumed
 *   res             out: the resulting value
 *
 * An undefined operand makes the result undefined.
 *
 * Returns 0 on success, -1 on a parse error or when an operand is a string.
 */
static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                    char *str, char **end, hts_expr_val_t *res) {
    if (mul_expr(filt, data, fn, str, end, res))
        return -1;

    str = *end;
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
    while (*str) {
        str = ws(str);
        if (*str != '+' && *str != '-')
            break;

        if (mul_expr(filt, data, fn, str+1, end, &val)) {
            /* val may already own a string buffer; release it. */
            hts_expr_val_free(&val);
            return -1;
        }

        if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
            hts_expr_val_undef(res);
        } else if (val.is_str || res->is_str) {
            /* Arithmetic on strings is an error. */
            hts_expr_val_free(&val);
            return -1;
        } else {
            if (*str == '+')
                res->d += val.d;
            else
                res->d -= val.d;
            res->is_true = res->d != 0;
        }

        str = *end;
    }

    hts_expr_val_free(&val);

    return 0;
}
/*
 * Parses and evaluates a chain of bitwise '&' applied to additive
 * expressions.  A doubled "&&" is not consumed here.
 *
 *   filt, data, fn  passed through unchanged to add_expr()
 *   str             text to parse
 *   end             out: first character not consumed
 *   res             out: the resulting value
 *
 * Operands are truncated to int64_t.  If any operand in the chain is
 * undefined the whole result is undefined.
 *
 * Returns 0 on success, -1 on a parse error or when an operand is a string.
 */
static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                       char *str, char **end, hts_expr_val_t *res) {
    if (add_expr(filt, data, fn, str, end, res)) return -1;

    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
    int undef = 0;
    for (;;) {
        str = ws(*end);
        if (*str != '&' || str[1] == '&')
            break;

        if (add_expr(filt, data, fn, str+1, end, &val)) {
            hts_expr_val_free(&val);
            return -1;
        }

        if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
            /* Remember it; the operation is skipped so that an undefined
             * (NaN) value is never converted to an integer. */
            undef = 1;
        } else if (res->is_str || val.is_str) {
            hts_expr_val_free(&val);
            return -1;
        } else {
            res->is_true =
                (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0;
        }
    }

    hts_expr_val_free(&val);
    if (undef)
        hts_expr_val_undef(res);

    return 0;
}
/*
 * Parses and evaluates a chain of bitwise '^' applied to bitwise-and
 * expressions.
 *
 *   filt, data, fn  passed through unchanged to bitand_expr()
 *   str             text to parse
 *   end             out: first character not consumed
 *   res             out: the resulting value
 *
 * Operands are truncated to int64_t.  If any operand in the chain is
 * undefined the whole result is undefined.
 *
 * Returns 0 on success, -1 on a parse error or when an operand is a string.
 */
static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                       char *str, char **end, hts_expr_val_t *res) {
    if (bitand_expr(filt, data, fn, str, end, res)) return -1;

    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
    int undef = 0;
    for (;;) {
        str = ws(*end);
        if (*str != '^')
            break;

        if (bitand_expr(filt, data, fn, str+1, end, &val)) {
            hts_expr_val_free(&val);
            return -1;
        }

        if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
            /* Remember it; the operation is skipped so that an undefined
             * (NaN) value is never converted to an integer. */
            undef = 1;
        } else if (res->is_str || val.is_str) {
            hts_expr_val_free(&val);
            return -1;
        } else {
            res->is_true =
                (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0;
        }
    }

    hts_expr_val_free(&val);
    if (undef)
        hts_expr_val_undef(res);

    return 0;
}
/*
 * Parses and evaluates a chain of bitwise '|' applied to bitwise-xor
 * expressions.  A doubled "||" is not consumed here.
 *
 *   filt, data, fn  passed through unchanged to bitxor_expr()
 *   str             text to parse
 *   end             out: first character not consumed
 *   res             out: the resulting value
 *
 * Operands are truncated to int64_t.  If any operand in the chain is
 * undefined the whole result is undefined.
 *
 * Returns 0 on success, -1 on a parse error or when an operand is a string.
 */
static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                      char *str, char **end, hts_expr_val_t *res) {
    if (bitxor_expr(filt, data, fn, str, end, res)) return -1;

    hts_expr_val_t val = HTS_EXPR_VAL_INIT;
    int undef = 0;
    for (;;) {
        str = ws(*end);
        if (*str != '|' || str[1] == '|')
            break;

        if (bitxor_expr(filt, data, fn, str+1, end, &val)) {
            hts_expr_val_free(&val);
            return -1;
        }

        if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) {
            /* Remember it; the operation is skipped so that an undefined
             * (NaN) value is never converted to an integer. */
            undef = 1;
        } else if (res->is_str || val.is_str) {
            hts_expr_val_free(&val);
            return -1;
        } else {
            res->is_true =
                (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0;
        }
    }

    hts_expr_val_free(&val);
    if (undef)
        hts_expr_val_undef(res);

    return 0;
}
/*
 * Parses and evaluates an optional ordering comparison (>, >=, <, <=)
 * between a bitwise-or expression and, recursively, another comparison
 * expression.
 *
 * Two strings are compared with strcmp(), two numbers numerically, and a
 * string against a number compares as false.  If either side is undefined
 * the result is undefined.  The result of a comparison is numeric 0 or 1.
 *
 * Returns 0 on success, -1 if either operand fails to parse.
 */
static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                    char *str, char **end, hts_expr_val_t *res) {
    if (bitor_expr(filt, data, fn, str, end, res))
        return -1;

    str = ws(*end);
    if (*str != '>' && *str != '<')
        return 0;   /* no comparison operator follows */

    /* Decode the operator once, not once per branch. */
    const int is_greater = (*str == '>');
    const int or_equal = (str[1] == '=');

    hts_expr_val_t rhs = HTS_EXPR_VAL_INIT;
    const int err = cmp_expr(filt, data, fn, str + (or_equal ? 2 : 1),
                             end, &rhs);

    if (hts_expr_val_exists(res) && hts_expr_val_exists(&rhs)) {
        int outcome = 0;    /* mixed string/number operands compare false */

        if (res->is_str && res->s.s && rhs.is_str && rhs.s.s) {
            const int order = strcmp(res->s.s, rhs.s.s);
            if (is_greater)
                outcome = or_equal ? order >= 0 : order > 0;
            else
                outcome = or_equal ? order <= 0 : order < 0;
        } else if (!res->is_str && !rhs.is_str) {
            if (is_greater)
                outcome = or_equal ? res->d >= rhs.d : res->d > rhs.d;
            else
                outcome = or_equal ? res->d <= rhs.d : res->d < rhs.d;
        }

        res->is_true = res->d = outcome;
        res->is_str = 0;
    } else {
        hts_expr_val_undef(res);
    }

    if (!hts_expr_val_exists(&rhs) || !hts_expr_val_exists(res))
        hts_expr_val_undef(res);
    hts_expr_val_free(&rhs);

    return err ? -1 : 0;
}
/*
 * Parses and evaluates an optional equality or regex-match operator
 * (==, !=, =~, !~) between a comparison expression and, recursively,
 * another equality expression.
 *
 *   filt  filter state; its preg[] table caches compiled patterns between
 *         evaluations, indexed by the order in which regex operators run
 *   data, fn  passed through unchanged to the lower grammar levels
 *   str   text to parse
 *   end   out: first character not consumed
 *   res   out: numeric 0/1 result, or undefined if an operand is undefined
 *
 * Returns 0 on success; -1 on a parse error, when =~ / !~ is applied to a
 * non-string operand, or when the pattern fails to compile.
 *
 * NOTE(review): curr_regex only advances when both regex operands have a
 * non-NULL buffer, so the slot used by a later regex operator appears to
 * depend on the data being evaluated - confirm whether that is intended.
 */
static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                   char *str, char **end, hts_expr_val_t *res) {
    if (cmp_expr(filt, data, fn, str, end, res)) return -1;

    str = ws(*end);
    int err = 0, eq_done = 0;
    hts_expr_val_t val = HTS_EXPR_VAL_INIT;

    if (str[0] == '=' && str[1] == '=') {
        eq_done = 1;
        if ((err = eq_expr(filt, data, fn, str+2, end, &val))) {
            res->is_true = res->d = 0;
        } else {
            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
                hts_expr_val_undef(res);
            } else {
                res->is_true = res->d = res->is_str
                    ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s)==0 :0)
                    : !res->is_str && !val.is_str && res->d == val.d;
            }
        }
        res->is_str = 0;

    } else if (str[0] == '!' && str[1] == '=') {
        eq_done = 1;
        if ((err = eq_expr(filt, data, fn, str+2, end, &val))) {
            res->is_true = res->d = 0;
        } else {
            if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) {
                hts_expr_val_undef(res);
            } else {
                res->is_true = res->d = res->is_str
                    ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s) != 0 :1)
                    : res->is_str != val.is_str || res->d != val.d;
            }
        }
        res->is_str = 0;

    } else if ((str[0] == '=' && str[1] == '~') ||
               (str[0] == '!' && str[1] == '~')) {
        eq_done = 1;
        err = eq_expr(filt, data, fn, str+2, end, &val);
        if (!val.is_str || !res->is_str) {
            /* Both the subject and the pattern must be strings. */
            hts_expr_val_free(&val);
            return -1;
        }
        if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) {
            regex_t preg_, *preg;
            if (filt->curr_regex >= filt->max_regex) {
                /* Not compiled yet.  Use a cache slot while one is free,
                 * otherwise a temporary that is released after matching. */
                const int use_cache = filt->curr_regex < MAX_REGEX;
                preg = use_cache ? &filt->preg[filt->curr_regex] : &preg_;

                int ec = regcomp(preg, val.s.s, REG_EXTENDED | REG_NOSUB);
                if (ec != 0) {
                    char errbuf[1024];
                    regerror(ec, preg, errbuf, 1024);
                    fprintf(stderr, "Failed regex: %.1024s\n", errbuf);
                    hts_expr_val_free(&val);
                    return -1;
                }

                /* Count the slot as valid only after a successful compile,
                 * so a failed pattern is never passed to regexec() or
                 * regfree() later on. */
                if (use_cache)
                    filt->max_regex++;
            } else {
                preg = &filt->preg[filt->curr_regex];
            }

            res->is_true = res->d = regexec(preg, res->s.s, 0, NULL, 0) == 0
                ? *str == '='   /* match */
                : *str == '!';  /* no match */
            if (preg == &preg_)
                regfree(preg);

            filt->curr_regex++;
        } else {
            /* A pattern or subject without a buffer evaluates as false. */
            res->is_true = 0;
        }
        res->is_str = 0;
    }

    if (eq_done && ((!hts_expr_val_exists(&val)) || !hts_expr_val_exists(res)))
        hts_expr_val_undef(res);
    hts_expr_val_free(&val);

    return err ? -1 : 0;
}
/*
 * Parses and evaluates a left-to-right chain of logical "&&" and "||"
 * applied to equality expressions.  Both operators are handled at this one
 * level, and the right operand is always evaluated.
 *
 *   filt, data, fn  passed through unchanged to eq_expr()
 *   str             text to parse
 *   end             out: first character not consumed
 *   res             out: the resulting value
 *
 * A value counts as true when its is_true flag is set, when it is a string
 * with a buffer, or when its number is non-zero.
 *
 *   a && b  is undefined if either operand is undefined
 *   a || b  is undefined if both are undefined, or if one is undefined and
 *           the other is false
 *
 * Returns 0 on success, -1 if an operand fails to parse.
 */
static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
                    char *str, char **end, hts_expr_val_t *res) {
    if (eq_expr(filt, data, fn, str, end, res)) return -1;

    for (;;) {
        hts_expr_val_t val = HTS_EXPR_VAL_INIT;
        str = ws(*end);
        if (str[0] == '&' && str[1] == '&') {
            if (eq_expr(filt, data, fn, str+2, end, &val)) {
                /* val may already own a string buffer; release it. */
                hts_expr_val_free(&val);
                return -1;
            }
            if (!hts_expr_val_existsT(res) || !hts_expr_val_existsT(&val)) {
                hts_expr_val_undef(res);
                res->d = 0;
            } else {
                res->is_true = res->d =
                    (res->is_true || (res->is_str && res->s.s) || res->d) &&
                    (val.is_true  || (val.is_str && val.s.s) || val.d);
                res->is_str = 0;
            }
        } else if (str[0] == '|' && str[1] == '|') {
            if (eq_expr(filt, data, fn, str+2, end, &val)) {
                /* val may already own a string buffer; release it. */
                hts_expr_val_free(&val);
                return -1;
            }
            if (!hts_expr_val_existsT(res) && !hts_expr_val_existsT(&val)) {
                /* Both sides undefined. */
                hts_expr_val_undef(res);
                res->d = 0;
            } else if (!hts_expr_val_existsT(res) &&
                       !(val.is_true || (val.is_str && val.s.s ) || val.d)) {
                /* Left undefined, right false. */
                hts_expr_val_undef(res);
                res->d = 0;
            } else if (!hts_expr_val_existsT(&val) &&
                       !(res->is_true || (res->is_str && res->s.s) || res->d)){
                /* Right undefined, left false. */
                hts_expr_val_undef(res);
                res->d = 0;
            } else {
                res->is_true = res->d =
                    res->is_true || (res->is_str && res->s.s) || res->d ||
                    val.is_true  || (val.is_str && val.s.s ) || val.d;
                res->is_str = 0;
            }
        } else {
            break;
        }
        hts_expr_val_free(&val);
    }

    return 0;
}
/*
 * Entry point of the expression grammar: forwards to and_expr(), the
 * lowest-precedence level, and returns its result unchanged.
 */
static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn,
char *str, char **end, hts_expr_val_t *res) {
return and_expr(filt, data, fn, str, end, res);
}
/*
 * Creates a filter object holding a private copy of the expression text.
 *
 * The copy is allocated 100 bytes larger than the text needs; the reason
 * is not visible in this file (presumably to let callers read a fixed
 * number of bytes past the end safely - TODO confirm).
 *
 * Returns the new filter, or NULL if memory allocation fails.  Release it
 * with hts_filter_free().
 */
hts_filter_t *hts_filter_init(const char *str) {
    hts_filter_t *filt = calloc(1, sizeof(*filt));
    if (filt == NULL)
        return NULL;

    const size_t text_len = strlen(str);
    filt->str = malloc(text_len + 100);
    if (filt->str == NULL) {
        free(filt);
        return NULL;
    }

    /* Copy the text together with its terminating NUL. */
    memcpy(filt->str, str, text_len + 1);
    return filt;
}
/*
 * Releases a filter: the first max_regex entries of its regex table, its
 * copy of the expression text, and the object itself.  A NULL filter is
 * ignored.
 */
void hts_filter_free(hts_filter_t *filt) {
    if (filt != NULL) {
        int slot = 0;
        while (slot < filt->max_regex) {
            regfree(&filt->preg[slot]);
            slot++;
        }
        free(filt->str);
        free(filt);
    }
}
/*
 * Shared implementation behind hts_filter_eval() and hts_filter_eval2().
 *
 * Evaluates the filter's expression text into res, rejects trailing text
 * other than blanks, then settles the final truth value:
 *   - a string result is true when it has a buffer (or was already flagged
 *     true), and res->d is set to that truth value;
 *   - a defined numeric result is true when non-zero (or already flagged).
 *
 * Returns 0 on success, -1 on a parse or evaluation error.
 */
static int hts_filter_eval_(hts_filter_t *filt,
                            void *data, hts_expr_sym_func *fn,
                            hts_expr_val_t *res) {
    char *rest = NULL;

    /* Regex slots are assigned in evaluation order, starting from zero. */
    filt->curr_regex = 0;

    if (expression(filt, data, fn, filt->str, &rest, res) != 0)
        return -1;

    if (rest != NULL && *ws(rest) != '\0') {
        fprintf(stderr, "Unable to parse expression at %s\n", filt->str);
        return -1;
    }

    if (res->is_str) {
        res->is_true |= res->s.s != NULL;
        res->d = res->is_true;
    } else if (hts_expr_val_exists(res)) {
        res->is_true |= res->d != 0;
    }

    return 0;
}
/*
 * Evaluates a filter into res.
 *
 * The string member of res must be empty on entry (zero length, zero
 * capacity, NULL buffer); otherwise an error is logged and -1 is returned
 * without evaluating.  res is zeroed before evaluation.
 *
 * Returns 0 on success, -1 on error.
 */
int hts_filter_eval(hts_filter_t *filt,
                    void *data, hts_expr_sym_func *fn,
                    hts_expr_val_t *res) {
    const int string_is_clear =
        res->s.l == 0 && res->s.m == 0 && res->s.s == NULL;

    if (!string_is_clear) {
        hts_log_error("Results structure must be cleared before calling this function");
        return -1;
    }

    memset(res, 0, sizeof(*res));

    return hts_filter_eval_(filt, data, fn, res);
}
/*
 * Evaluates a filter into res, first releasing any string buffer res holds
 * and zeroing the structure, so a result object can be reused across calls.
 *
 * Returns 0 on success, -1 on error.
 */
int hts_filter_eval2(hts_filter_t *filt,
                     void *data, hts_expr_sym_func *fn,
                     hts_expr_val_t *res) {
    int status;

    ks_free(&res->s);
    memset(res, 0, sizeof(*res));

    status = hts_filter_eval_(filt, data, fn, res);
    return status;
}