use crate::compiler::prelude::*;
use std::sync::LazyLock;
/// Default for the optional `replace_single` argument: an empty byte string.
static DEFAULT_REPLACE_SINGLE: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("")));
/// Default for the optional `replace_repeated` argument: an empty byte string.
static DEFAULT_REPLACE_REPEATED: LazyLock<Value> = LazyLock::new(|| Value::Bytes(Bytes::from("")));
/// Parameter list for `sieve`, built lazily on first access.
///
/// Order: `value`, `permitted_characters`, `replace_single`,
/// `replace_repeated`. The two replacement parameters are optional and fall
/// back to the `DEFAULT_REPLACE_*` statics.
static PARAMETERS: LazyLock<Vec<Parameter>> = LazyLock::new(|| {
    let value = Parameter::required("value", kind::BYTES, "The original string.");

    let permitted_characters = Parameter::required(
        "permitted_characters",
        kind::REGEX,
        "Keep all matches of this pattern.",
    );

    let replace_single = Parameter::optional(
        "replace_single",
        kind::BYTES,
        "The string to use to replace single rejected characters.",
    )
    .default(&DEFAULT_REPLACE_SINGLE);

    let replace_repeated = Parameter::optional(
        "replace_repeated",
        kind::BYTES,
        "The string to use to replace multiple sequential instances of rejected characters.",
    )
    .default(&DEFAULT_REPLACE_REPEATED);

    vec![
        value,
        permitted_characters,
        replace_single,
        replace_repeated,
    ]
});
/// Chooses the replacement text for one run of rejected characters.
///
/// Returns `None` when the run is empty, `single` when it holds exactly one
/// character and `repeated` when it holds two or more. The run is measured in
/// characters (Unicode scalar values), not bytes, so a lone multi-byte
/// character is still treated as a single rejected character.
fn rejected_run_replacement<'a>(run: &str, single: &'a str, repeated: &'a str) -> Option<&'a str> {
    // Only "zero", "one" or "more than one" matters, so look at no more than
    // the first two characters instead of counting the whole run.
    let mut chars = run.chars();
    match (chars.next(), chars.next()) {
        (None, _) => None,
        (Some(_), None) => Some(single),
        (Some(_), Some(_)) => Some(repeated),
    }
}

/// Keeps only the parts of `value` that match `permitted_characters`.
///
/// Every run of text that lies between matches, before the first match, or
/// after the last match is rejected. A rejected run of exactly one character
/// is replaced by `replace_single`; a longer run is replaced by
/// `replace_repeated`.
///
/// # Errors
///
/// Returns an error when `value`, `replace_single` or `replace_repeated` is
/// not a byte string, or when `permitted_characters` is not a regex.
fn sieve(
    value: &Value,
    permitted_characters: Value,
    replace_single: &Value,
    replace_repeated: &Value,
) -> Resolved {
    let value = value.try_bytes_utf8_lossy()?;
    let replace_single = replace_single.try_bytes_utf8_lossy()?;
    let replace_repeated = replace_repeated.try_bytes_utf8_lossy()?;

    match permitted_characters {
        Value::Regex(regex) => {
            let text: &str = &value;
            let mut result = String::with_capacity(text.len());
            let mut last_end = 0;

            for m in regex.find_iter(text) {
                // Text between the previous match and this one was rejected.
                // Match offsets are UTF-8 boundaries, so slicing is safe.
                if let Some(replacement) = rejected_run_replacement(
                    &text[last_end..m.start()],
                    &replace_single,
                    &replace_repeated,
                ) {
                    result.push_str(replacement);
                }
                last_end = m.end();
                result.push_str(m.as_str());
            }

            // Rejected text after the final match (or the entire input when
            // nothing matched) must be replaced as well, not silently dropped.
            if let Some(replacement) =
                rejected_run_replacement(&text[last_end..], &replace_single, &replace_repeated)
            {
                result.push_str(replacement);
            }

            Ok(result.into())
        }
        value => Err(ValueError::Expected {
            got: value.kind(),
            expected: Kind::regex(),
        }
        .into()),
    }
}
/// Marker type for the `sieve` function; its `Function` implementation
/// supplies the identifier, usage text, parameters, examples and compile step.
#[derive(Clone, Copy, Debug)]
pub struct Sieve;
impl Function for Sieve {
    /// Returns the function's identifier, `sieve`.
    fn identifier(&self) -> &'static str {
        "sieve"
    }

    /// Returns the usage text describing what the function does.
    fn usage(&self) -> &'static str {
        indoc! {"
            Keeps only matches of `pattern` in `value`.
            This can be used to define patterns that are allowed in the string and
            remove everything else.
        "}
    }

    /// Reports the function under the string category.
    fn category(&self) -> &'static str {
        Category::String.as_ref()
    }

    /// The function returns a byte string.
    fn return_kind(&self) -> u16 {
        kind::BYTES
    }

    /// Exposes the lazily built parameter list as a slice.
    fn parameters(&self) -> &'static [Parameter] {
        PARAMETERS.as_slice()
    }

    /// Example invocations together with their expected results.
    fn examples(&self) -> &'static [Example] {
        &[
            example! {
                title: "Keep only lowercase letters",
                source: r#"sieve("vector.dev/lowerUPPER", permitted_characters: r'[a-z]')"#,
                result: Ok("vectordevlower"),
            },
            example! {
                title: "Sieve with regex",
                source: r#"sieve("test123%456.فوائد.net.", r'[a-z0-9.]')"#,
                result: Ok("test123456..net."),
            },
            example! {
                title: "Custom replacements",
                source: r#"sieve("test123%456.فوائد.net.", r'[a-z.0-9]', replace_single: "X", replace_repeated: "<REMOVED>")"#,
                result: Ok("test123X456.<REMOVED>.net."),
            },
        ]
    }

    /// Collects the argument expressions into a `SieveFn` and returns it as
    /// an expression. The type state and compile context are not used.
    fn compile(
        &self,
        _state: &state::TypeState,
        _ctx: &mut FunctionCompileContext,
        arguments: ArgumentList,
    ) -> Compiled {
        // Field initialisers run top to bottom, which keeps the original
        // lookup order of the arguments.
        let function = SieveFn {
            value: arguments.required("value"),
            permitted_characters: arguments.required("permitted_characters"),
            replace_single: arguments.optional("replace_single"),
            replace_repeated: arguments.optional("replace_repeated"),
        };

        Ok(function.as_expr())
    }
}
/// Compiled form of a `sieve` call: the argument expressions, resolved each
/// time the function is evaluated.
#[derive(Debug, Clone)]
struct SieveFn {
/// Expression yielding the string to filter.
value: Box<dyn Expression>,
/// Expression yielding the regex whose matches are kept.
permitted_characters: Box<dyn Expression>,
/// Optional expression for the single-character replacement; when absent,
/// `DEFAULT_REPLACE_SINGLE` is used.
replace_single: Option<Box<dyn Expression>>,
/// Optional expression for the multi-character replacement; when absent,
/// `DEFAULT_REPLACE_REPEATED` is used.
replace_repeated: Option<Box<dyn Expression>>,
}
impl FunctionExpression for SieveFn {
    /// Resolves every argument and hands the results to `sieve`.
    ///
    /// Arguments are resolved in declaration order: `value`,
    /// `permitted_characters`, `replace_single`, `replace_repeated`. The two
    /// optional replacements fall back to their `DEFAULT_REPLACE_*` statics.
    /// Any error raised while resolving an argument is propagated.
    fn resolve(&self, ctx: &mut Context) -> Resolved {
        let input = self.value.resolve(ctx)?;
        let pattern = self.permitted_characters.resolve(ctx)?;

        let single = self
            .replace_single
            .map_resolve_with_default(ctx, || DEFAULT_REPLACE_SINGLE.clone())?;
        let repeated = self
            .replace_repeated
            .map_resolve_with_default(ctx, || DEFAULT_REPLACE_REPEATED.clone())?;

        sieve(&input, pattern, &single, &repeated)
    }

    /// Declares the result type: an infallible byte string.
    fn type_def(&self, _: &state::TypeState) -> TypeDef {
        TypeDef::bytes().infallible()
    }
}
// Table-driven tests for `sieve`, declared through the `test_function!` macro.
// Each case lists the call arguments (`args`), the expected result (`want`)
// and the expected type definition (`tdef`).
#[cfg(test)]
mod tests {
use super::*;
use crate::value;
test_function![
sieve => Sieve;
// Default replacements: the rejected "." is removed.
lowercase_letters_only {
args: func_args![value: value!("vector.dev"), permitted_characters: regex::Regex::new("[a-z]").unwrap()],
want: Ok(value!("vectordev")),
tdef: TypeDef::bytes().infallible(),
}
// Default replacements: the non-ASCII label is removed, leaving the two
// surrounding dots next to each other.
alphanumeric_and_dots {
args: func_args![value: value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z.فوائد.net."), permitted_characters: regex::Regex::new("[a-z.0-9]").unwrap()],
want: Ok(value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z..net.")),
tdef: TypeDef::bytes().infallible(),
}
// Both replacements set: the lone "%" becomes "X" and the multi-character
// non-ASCII label becomes "<REMOVED>".
all_options {
args: func_args![value: value!("test123%456.فوائد.net."), permitted_characters: regex::Regex::new("[a-z.0-9]").unwrap(), replace_single: "X", replace_repeated: "<REMOVED>"],
want: Ok(value!("test123X456.<REMOVED>.net.")),
tdef: TypeDef::bytes().infallible(),
}
// Only dots are permitted: each multi-character label between dots becomes
// "<REMOVED>".
replace_repeated {
args: func_args![value: value!("37ccx6a5uf52a7dv2hfxgpmltji09x6xkg0zv6yxsoi4kqs9atmjh7k50dcjb7z.فوائد.net."), permitted_characters: regex::Regex::new(r"[\.]").unwrap(), replace_repeated: "<REMOVED>"],
want: Ok(value!("<REMOVED>.<REMOVED>.<REMOVED>.")),
tdef: TypeDef::bytes().infallible(),
}
];
}