mod kernel;
use std::fmt::Display;
use std::fmt::Formatter;
pub use kernel::*;
use prost::Message;
use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_proto::expr as pb;
use vortex_session::VortexSession;
use crate::ArrayRef;
use crate::ExecutionCtx;
use crate::arrow::Datum;
use crate::arrow::from_arrow_array_with_len;
use crate::dtype::DType;
use crate::expr::Expression;
use crate::expr::StatsCatalog;
use crate::expr::and;
use crate::expr::gt;
use crate::expr::gt_eq;
use crate::expr::lit;
use crate::expr::lt;
use crate::expr::or;
use crate::scalar::StringLike;
use crate::scalar_fn::Arity;
use crate::scalar_fn::ChildName;
use crate::scalar_fn::ExecutionArgs;
use crate::scalar_fn::ScalarFnId;
use crate::scalar_fn::ScalarFnVTable;
use crate::scalar_fn::fns::literal::Literal;
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct LikeOptions {
pub negated: bool,
pub case_insensitive: bool,
}
impl Display for LikeOptions {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
if self.negated {
write!(f, "NOT ")?;
}
if self.case_insensitive {
write!(f, "ILIKE")
} else {
write!(f, "LIKE")
}
}
}
#[derive(Clone)]
pub struct Like;
impl ScalarFnVTable for Like {
type Options = LikeOptions;
fn id(&self) -> ScalarFnId {
ScalarFnId::from("vortex.like")
}
fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
Ok(Some(
pb::LikeOpts {
negated: instance.negated,
case_insensitive: instance.case_insensitive,
}
.encode_to_vec(),
))
}
fn deserialize(
&self,
_metadata: &[u8],
_session: &VortexSession,
) -> VortexResult<Self::Options> {
let opts = pb::LikeOpts::decode(_metadata)?;
Ok(LikeOptions {
negated: opts.negated,
case_insensitive: opts.case_insensitive,
})
}
fn arity(&self, _options: &Self::Options) -> Arity {
Arity::Exact(2)
}
fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
match child_idx {
0 => ChildName::from("child"),
1 => ChildName::from("pattern"),
_ => unreachable!("Invalid child index {} for Like expression", child_idx),
}
}
fn fmt_sql(
&self,
options: &Self::Options,
expr: &Expression,
f: &mut Formatter<'_>,
) -> std::fmt::Result {
expr.child(0).fmt_sql(f)?;
if options.negated {
write!(f, " not")?;
}
if options.case_insensitive {
write!(f, " ilike ")?;
} else {
write!(f, " like ")?;
}
expr.child(1).fmt_sql(f)
}
fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
let input = &arg_dtypes[0];
let pattern = &arg_dtypes[1];
if !input.is_utf8() {
vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
}
if !pattern.is_utf8() {
vortex_bail!(
"LIKE expression requires UTF8 pattern dtype, got {}",
pattern
);
}
Ok(DType::Bool(
(input.is_nullable() || pattern.is_nullable()).into(),
))
}
fn execute(
&self,
options: &Self::Options,
args: &dyn ExecutionArgs,
_ctx: &mut ExecutionCtx,
) -> VortexResult<ArrayRef> {
let child = args.get(0)?;
let pattern = args.get(1)?;
arrow_like(&child, &pattern, *options)
}
fn validity(
&self,
_options: &Self::Options,
expression: &Expression,
) -> VortexResult<Option<Expression>> {
tracing::warn!("Computing validity for LIKE expression");
let child_validity = expression.child(0).validity()?;
let pattern_validity = expression.child(1).validity()?;
Ok(Some(and(child_validity, pattern_validity)))
}
fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
false
}
fn stat_falsification(
&self,
like_opts: &LikeOptions,
expr: &Expression,
catalog: &dyn StatsCatalog,
) -> Option<Expression> {
if like_opts.negated || like_opts.case_insensitive {
return None;
}
let pat = expr.child(1).as_::<Literal>();
let pat_str = pat.as_utf8().value()?;
let src = expr.child(0).clone();
let src_min = src.stat_min(catalog)?;
let src_max = src.stat_max(catalog)?;
match LikeVariant::from_str(pat_str)? {
LikeVariant::Exact(text) => {
Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
}
LikeVariant::Prefix(prefix) => {
let succ = prefix.to_string().increment().ok()?;
Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
}
}
}
}
pub(crate) fn arrow_like(
array: &ArrayRef,
pattern: &ArrayRef,
options: LikeOptions,
) -> VortexResult<ArrayRef> {
let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
let len = array.len();
assert_eq!(
array.len(),
pattern.len(),
"Arrow Like: length mismatch for {}",
array.encoding_id()
);
let lhs = Datum::try_new(array)?;
let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type())?;
let result = match (options.negated, options.case_insensitive) {
(false, false) => arrow_string::like::like(&lhs, &rhs)?,
(true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
(false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
(true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
};
from_arrow_array_with_len(&result, len, nullable)
}
#[derive(Debug, PartialEq)]
enum LikeVariant<'a> {
Exact(&'a str),
Prefix(&'a str),
}
impl<'a> LikeVariant<'a> {
fn from_str(string: &str) -> Option<LikeVariant<'_>> {
let Some(wildcard_pos) = string.find(['%', '_']) else {
return Some(LikeVariant::Exact(string));
};
if wildcard_pos == 0 {
return None;
}
let prefix = &string[..wildcard_pos];
Some(LikeVariant::Prefix(prefix))
}
}
#[cfg(test)]
mod tests {
use crate::IntoArray;
use crate::arrays::BoolArray;
use crate::assert_arrays_eq;
use crate::dtype::DType;
use crate::dtype::Nullability;
use crate::expr::col;
use crate::expr::get_item;
use crate::expr::ilike;
use crate::expr::like;
use crate::expr::lit;
use crate::expr::not;
use crate::expr::not_ilike;
use crate::expr::not_like;
use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
use crate::expr::root;
use crate::scalar_fn::fns::like::LikeVariant;
#[test]
fn invert_booleans() {
let not_expr = not(root());
let bools = BoolArray::from_iter([false, true, false, false, true, true]);
assert_arrays_eq!(
bools.into_array().apply(¬_expr).unwrap(),
BoolArray::from_iter([true, false, true, true, false, false])
);
}
#[test]
fn dtype() {
let dtype = DType::Utf8(Nullability::NonNullable);
let like_expr = like(root(), lit("%test%"));
assert_eq!(
like_expr.return_dtype(&dtype).unwrap(),
DType::Bool(Nullability::NonNullable)
);
}
#[test]
fn test_display() {
let expr = like(get_item("name", root()), lit("%john%"));
assert_eq!(expr.to_string(), "$.name like \"%john%\"");
let expr2 = not_ilike(root(), lit("test*"));
assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
}
#[test]
fn test_like_variant() {
assert_eq!(
LikeVariant::from_str("simple"),
Some(LikeVariant::Exact("simple"))
);
assert_eq!(
LikeVariant::from_str("prefix%"),
Some(LikeVariant::Prefix("prefix"))
);
assert_eq!(
LikeVariant::from_str("first%rest_stuff"),
Some(LikeVariant::Prefix("first"))
);
assert_eq!(LikeVariant::from_str("%suffix"), None);
assert_eq!(LikeVariant::from_str("_pattern"), None);
}
#[test]
fn test_like_pushdown() {
let catalog = TrackingStatsCatalog::default();
let pruning_expr = like(col("a"), lit("prefix%"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");
insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
let pruning_expr = like(col("a"), lit("pref%ix%"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");
insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
let pruning_expr = like(col("a"), lit("pref_ix_"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");
insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
let pruning_expr = like(col("a"), lit("exactly"))
.stat_falsification(&catalog)
.expect("LIKE stat falsification");
insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
assert_eq!(pruning_expr, None);
assert_eq!(
None,
not_like(col("a"), lit("a")).stat_falsification(&catalog)
);
assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
}
}