use crate::regexp::{
collect_lookarounds, expand_word_boundaries, has_top_level_lookaround, has_word_boundary,
parse_regexp, LookaroundType, RegexpBranch, RegexpRoot,
};
use crate::segments_tree::SEGMENT_SEPARATOR;
use crate::QuaminaError;
use std::collections::HashMap;
/// A pair of `i32` indices, `array` and `pos`.
///
/// NOTE(review): presumably identifies one element (`pos`) of one particular
/// array (`array`) in a flattened event. Nothing in this part of the file
/// reads or writes the type, so confirm the exact meaning against its users.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ArrayPos {
pub array: i32,
pub pos: i32,
}
/// A single path/value pair together with its array positions.
///
/// NOTE(review): presumably one leaf of a flattened event, where `path` is
/// the segment-joined field path, `value` the field's textual value,
/// `array_trail` the enclosing array positions and `is_number` a flag for
/// numeric values. The visible code never constructs or reads this type, so
/// verify these meanings against the flattener and matcher.
#[derive(Clone, Debug)]
pub struct Field {
pub path: String,
pub value: String,
pub array_trail: Vec<ArrayPos>,
pub is_number: bool,
}
/// One alternative inside a pattern field's array, as produced by
/// `value_to_matcher`.
///
/// Variants and the pattern syntax that yields them:
///
/// * `Exact` - a plain string, boolean or `null` literal. The payload is the
///   output of `value_to_string`, so string literals are stored wrapped in
///   double quotes.
/// * `NumericExact` - a number literal that parses as `f64`.
/// * `Exists` - `{"exists": <bool>}`.
/// * `Prefix` / `Suffix` - `{"prefix": "..."}` / `{"suffix": "..."}`.
/// * `Wildcard` - `{"wildcard": "..."}`, accepted only if `validate_wildcard`
///   passes.
/// * `Shellstyle` - `{"shellstyle": "..."}`, rejected if it contains `**`.
/// * `AnythingBut` - `{"anything-but": ...}` with a string or array of strings.
/// * `AnythingButNumeric` - `{"anything-but": ...}` with a number or array of
///   numbers.
/// * `EqualsIgnoreCase` - `{"equals-ignore-case": "..."}`.
/// * `Numeric` - `{"numeric": [op, number, ...]}`.
/// * `ParsedRegexp` - `{"regexp": "..."}` / `{"regex": "..."}` without a
///   top-level lookaround.
/// * `Cidr` - `{"cidr": "addr/len"}`.
/// * `MultiCondition` - a regexp with a top-level lookaround, split by
///   `transform_lookaround_pattern`.
#[derive(Debug, Clone)]
pub enum Matcher {
Exact(String),
NumericExact(f64), Exists(bool),
Prefix(String),
Suffix(String),
Wildcard(String),
Shellstyle(String), AnythingBut(Vec<String>),
AnythingButNumeric(Vec<f64>),
EqualsIgnoreCase(String),
Numeric(NumericComparison),
ParsedRegexp(RegexpRoot),
Cidr(CidrPattern),
MultiCondition(MultiConditionPattern),
}
/// A numeric range built from a `{"numeric": [...]}` operator list.
///
/// Each bound is `Some((inclusive, limit))` or `None` when that side is
/// unbounded. `parse_numeric_comparison` sets `lower` for `>` (exclusive) and
/// `>=` (inclusive), `upper` for `<` (exclusive) and `<=` (inclusive), and
/// both bounds, inclusive, for `=`.
#[derive(Debug, Clone, PartialEq)]
pub struct NumericComparison {
pub lower: Option<(bool, f64)>, pub upper: Option<(bool, f64)>, }
/// A CIDR block (`address/prefix-length`) for IPv4 or IPv6.
///
/// `network` always holds the address with every bit beyond `prefix_len`
/// cleared, in network (big-endian) byte order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CidrPattern {
    /// An IPv4 block; `prefix_len` is in `0..=32`.
    V4 { network: [u8; 4], prefix_len: u8 },
    /// An IPv6 block; `prefix_len` is in `0..=128`.
    V6 { network: [u8; 16], prefix_len: u8 },
}

impl CidrPattern {
    /// Parses `addr/len` notation, e.g. `10.0.0.0/8` or `2001:db8::/32`.
    ///
    /// Returns `None` when the separator is missing, the address is neither a
    /// dotted-quad IPv4 address nor a colon-separated IPv6 address, or the
    /// prefix length is out of range for the address family. Host bits of the
    /// address are masked off, so `10.1.2.3/8` yields the network `10.0.0.0`.
    pub fn parse(s: &str) -> Option<Self> {
        let (addr_str, prefix_str) = s.split_once('/')?;
        let prefix_len = Self::parse_decimal_u8(prefix_str)?;

        if let Some(addr) = Self::parse_ipv4(addr_str) {
            if prefix_len > 32 {
                return None;
            }
            // Shifting a u32 by 32 overflows, so /0 gets an explicit empty mask.
            let mask = if prefix_len == 0 {
                0u32
            } else {
                !0u32 << (32 - u32::from(prefix_len))
            };
            let network_bits = u32::from_be_bytes(addr) & mask;
            return Some(Self::V4 {
                network: network_bits.to_be_bytes(),
                prefix_len,
            });
        }

        if let Some(addr) = Self::parse_ipv6(addr_str) {
            if prefix_len > 128 {
                return None;
            }
            return Some(Self::V6 {
                network: Self::apply_ipv6_mask(&addr, prefix_len),
                prefix_len,
            });
        }

        None
    }

    /// Parses a non-empty run of ASCII decimal digits as a `u8`.
    ///
    /// The digits are checked up front because `u8::from_str` also accepts a
    /// leading `+`, which is not valid in an address or prefix length.
    fn parse_decimal_u8(s: &str) -> Option<u8> {
        if s.is_empty() || !s.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }
        s.parse().ok()
    }

    /// Parses one IPv6 group: 1 to 4 hexadecimal digits, no sign.
    fn parse_hex_group(s: &str) -> Option<u16> {
        if s.is_empty() || s.len() > 4 || !s.bytes().all(|b| b.is_ascii_hexdigit()) {
            return None;
        }
        u16::from_str_radix(s, 16).ok()
    }

    /// Parses a colon-separated list of IPv6 groups; the empty string is an
    /// empty list (the side of a `::` that has no explicit groups).
    fn parse_hex_groups(s: &str) -> Option<Vec<u16>> {
        if s.is_empty() {
            return Some(Vec::new());
        }
        s.split(':').map(Self::parse_hex_group).collect()
    }

    /// Parses a dotted-quad IPv4 address with exactly four decimal octets.
    fn parse_ipv4(s: &str) -> Option<[u8; 4]> {
        let mut addr = [0u8; 4];
        let mut octets = s.split('.');
        for slot in &mut addr {
            *slot = Self::parse_decimal_u8(octets.next()?)?;
        }
        if octets.next().is_some() {
            return None;
        }
        Some(addr)
    }

    /// Parses an IPv6 address in full (8 groups) or `::`-compressed form.
    ///
    /// IPv4-embedded notation (`::ffff:1.2.3.4`) is not supported.
    fn parse_ipv6(s: &str) -> Option<[u8; 16]> {
        let mut groups = [0u16; 8];
        match s.split_once("::") {
            Some((head, tail)) => {
                // At most one "::" may appear in an address.
                if tail.contains("::") {
                    return None;
                }
                let left = Self::parse_hex_groups(head)?;
                let right = Self::parse_hex_groups(tail)?;
                // "::" stands for at least one all-zero group, so at most
                // seven groups may be written out explicitly.
                if left.len() + right.len() >= 8 {
                    return None;
                }
                groups[..left.len()].copy_from_slice(&left);
                groups[8 - right.len()..].copy_from_slice(&right);
            }
            None => {
                let all = Self::parse_hex_groups(s)?;
                if all.len() != 8 {
                    return None;
                }
                groups.copy_from_slice(&all);
            }
        }

        let mut addr = [0u8; 16];
        for (bytes, group) in addr.chunks_exact_mut(2).zip(groups.iter()) {
            bytes.copy_from_slice(&group.to_be_bytes());
        }
        Some(addr)
    }

    /// Clears every bit of `addr` beyond the first `prefix_len` bits.
    ///
    /// A `prefix_len` of 128 or more leaves the address unchanged.
    fn apply_ipv6_mask(addr: &[u8; 16], prefix_len: u8) -> [u8; 16] {
        let mask = match prefix_len {
            0 => 0u128,
            1..=127 => u128::MAX << (128 - u32::from(prefix_len)),
            _ => u128::MAX,
        };
        (u128::from_be_bytes(*addr) & mask).to_be_bytes()
    }
}
/// A single lookaround assertion extracted from a regexp by
/// `transform_lookaround_pattern`.
#[derive(Debug, Clone)]
pub enum LookaroundCondition {
    /// `(?=...)`: holds the pattern built by `build_combined_pattern` from
    /// the atoms preceding the lookahead plus the lookahead body.
    PositiveLookahead(RegexpRoot),
    /// `(?!...)`: same payload as the positive form.
    NegativeLookahead(RegexpRoot),
    /// `(?<=...)`: the lookbehind body and the byte length computed for it
    /// by `compute_lookbehind_byte_length`.
    PositiveLookbehind {
        pattern: RegexpRoot,
        byte_length: usize,
    },
    /// `(?<!...)`: same payload as the positive form.
    NegativeLookbehind {
        pattern: RegexpRoot,
        byte_length: usize,
    },
}

impl LookaroundCondition {
    /// Returns `true` for the two negative variants.
    pub fn is_negative(&self) -> bool {
        match self {
            Self::NegativeLookahead(_) | Self::NegativeLookbehind { .. } => true,
            Self::PositiveLookahead(_) | Self::PositiveLookbehind { .. } => false,
        }
    }

    /// Returns `true` for the two lookbehind variants.
    pub fn is_lookbehind(&self) -> bool {
        match self {
            Self::PositiveLookbehind { .. } | Self::NegativeLookbehind { .. } => true,
            Self::PositiveLookahead(_) | Self::NegativeLookahead(_) => false,
        }
    }

    /// Relative evaluation cost used to order conditions: 10 for a positive
    /// lookahead, 20 for a negative lookahead, 30 for a positive lookbehind
    /// and 40 for a negative lookbehind.
    pub fn cost_estimate(&self) -> u32 {
        let base = if self.is_lookbehind() { 30 } else { 10 };
        if self.is_negative() {
            base + 10
        } else {
            base
        }
    }
}
#[derive(Debug, Clone)]
pub struct MultiConditionPattern {
pub primary: RegexpRoot,
pub conditions: Vec<LookaroundCondition>,
}
impl MultiConditionPattern {
pub fn new(primary: RegexpRoot, mut conditions: Vec<LookaroundCondition>) -> Self {
conditions.sort_by_key(|c| c.cost_estimate());
Self {
primary,
conditions,
}
}
}
/// Splits a single-branch regexp containing lookarounds into a
/// [`MultiConditionPattern`].
///
/// Atoms without a lookaround are collected, in order, into the primary
/// pattern. Each lookaround atom becomes a condition:
///
/// * lookaheads are combined (via `build_combined_pattern`) with the primary
///   atoms seen *before* the lookahead;
/// * lookbehinds keep their own subtree together with its byte length.
///
/// # Errors
///
/// Returns an error string when the tree has no lookarounds, has more than
/// one top-level branch, contains a lookaround atom without a subtree, or
/// contains a lookbehind whose length cannot be computed.
pub fn transform_lookaround_pattern(tree: &RegexpRoot) -> Result<MultiConditionPattern, String> {
    if collect_lookarounds(tree).is_empty() {
        return Err("no lookarounds found in pattern".into());
    }
    let [branch] = &tree[..] else {
        return Err("lookaround patterns with top-level alternation not yet supported".into());
    };

    let mut primary_atoms: RegexpBranch = Vec::new();
    let mut conditions = Vec::new();

    for atom in branch {
        let Some(kind) = atom.lookaround else {
            primary_atoms.push(atom.clone());
            continue;
        };
        let body = atom
            .subtree
            .as_ref()
            .ok_or("lookaround atom missing subtree")?;

        let condition = match kind {
            LookaroundType::PositiveLookahead => LookaroundCondition::PositiveLookahead(
                build_combined_pattern(&primary_atoms, body),
            ),
            LookaroundType::NegativeLookahead => LookaroundCondition::NegativeLookahead(
                build_combined_pattern(&primary_atoms, body),
            ),
            LookaroundType::PositiveLookbehind => LookaroundCondition::PositiveLookbehind {
                byte_length: compute_lookbehind_byte_length(body)?,
                pattern: body.clone(),
            },
            LookaroundType::NegativeLookbehind => LookaroundCondition::NegativeLookbehind {
                byte_length: compute_lookbehind_byte_length(body)?,
                pattern: body.clone(),
            },
        };
        conditions.push(condition);
    }

    // A pattern made only of lookarounds has an empty primary tree.
    let primary = if primary_atoms.is_empty() {
        vec![]
    } else {
        vec![primary_atoms]
    };
    Ok(MultiConditionPattern::new(primary, conditions))
}
/// Prepends `primary_atoms` to every branch of `lookahead`.
///
/// An empty `lookahead` yields a single branch holding just the primary
/// atoms; otherwise the result has one branch per lookahead branch.
fn build_combined_pattern(primary_atoms: &RegexpBranch, lookahead: &RegexpRoot) -> RegexpRoot {
    if lookahead.is_empty() {
        return vec![primary_atoms.clone()];
    }
    lookahead
        .iter()
        .map(|la_branch| {
            primary_atoms
                .iter()
                .chain(la_branch)
                .cloned()
                .collect::<RegexpBranch>()
        })
        .collect()
}
/// Computes the byte length of a lookbehind body.
///
/// An empty tree has length 0. Otherwise every branch must have the same
/// length as the first one.
///
/// # Errors
///
/// Propagates any error from `compute_branch_byte_length`, and fails with
/// "variable-length lookbehind not supported" when branches differ in length.
fn compute_lookbehind_byte_length(tree: &RegexpRoot) -> Result<usize, String> {
    let Some((first, rest)) = tree.split_first() else {
        return Ok(0);
    };
    let expected = compute_branch_byte_length(first)?;
    for branch in rest {
        if compute_branch_byte_length(branch)? != expected {
            return Err("variable-length lookbehind not supported".into());
        }
    }
    Ok(expected)
}
/// Sums the byte widths of the atoms in one lookbehind branch.
///
/// Per atom, the width is multiplied by its (fixed) repeat count:
///
/// * a dot counts as 4 bytes;
/// * a rune set counts as the UTF-8 length of its widest upper bound;
/// * a group counts as the length of its subtree;
/// * anything else counts as 0.
///
/// # Errors
///
/// Fails when an atom's minimum and maximum repeat counts differ, or when a
/// nested subtree's length cannot be computed.
fn compute_branch_byte_length(branch: &RegexpBranch) -> Result<usize, String> {
    branch
        .iter()
        .try_fold(0usize, |total, atom| -> Result<usize, String> {
            if atom.quant_min != atom.quant_max {
                return Err("variable quantifier in lookbehind not supported".into());
            }
            let repeats = atom.quant_min as usize;

            let width = if atom.is_dot {
                4
            } else if let Some(widest) = atom.runes.iter().map(|rp| rp.hi.len_utf8()).max() {
                widest
            } else if let Some(subtree) = &atom.subtree {
                compute_lookbehind_byte_length(subtree)?
            } else {
                0
            };

            Ok(total + width * repeats)
        })
}
/// Parses a JSON pattern into a map from field path to its matchers.
///
/// The pattern must be a single JSON object. Nested objects extend the field
/// path (segments joined with `SEGMENT_SEPARATOR`); arrays hold the matchers
/// for the path they sit at.
///
/// # Errors
///
/// * `QuaminaError::InvalidJson` when the text is not well-formed JSON, or
///   when anything other than whitespace follows the top-level value.
/// * `QuaminaError::InvalidPattern` when the top-level value is not an
///   object, or a field or matcher is malformed.
/// * `QuaminaError::PatternTooComplex` when `limits` are exceeded.
pub fn parse_pattern(
    json: &str,
    limits: &crate::PatternLimits,
) -> Result<HashMap<String, Vec<Matcher>>, QuaminaError> {
    let mut parser = Parser::new(json);
    let Value::Object(obj) = parser.parse_value()? else {
        return Err(QuaminaError::InvalidPattern(
            "pattern must be an object".into(),
        ));
    };

    // A document is exactly one value; reject trailing content instead of
    // silently ignoring it.
    parser.skip_whitespace();
    if let Some(c) = parser.peek() {
        return Err(QuaminaError::InvalidJson(format!(
            "unexpected trailing char: {}",
            c
        )));
    }

    let mut fields = HashMap::new();
    extract_pattern_fields(&obj, String::new(), &mut fields, 0, limits)?;
    Ok(fields)
}
/// Recursively walks a pattern object, recording the matchers of each field.
///
/// * `obj` - the key/value pairs of the object being walked.
/// * `prefix` - the path of `obj` itself; empty for the top-level object.
/// * `fields` - output map; an entry with the same path is overwritten.
/// * `depth` - nesting depth of `obj`, starting at 0.
/// * `limits` - maximum nesting depth and field count.
///
/// # Errors
///
/// * `PatternTooComplex` when `depth` reaches `limits.max_pattern_depth` or
///   the number of recorded fields exceeds `limits.max_fields_per_pattern`.
/// * `InvalidPattern` when a field's value is neither an array nor an object,
///   or an array element is not a valid matcher.
fn extract_pattern_fields(
    obj: &[(String, Value)],
    prefix: String,
    fields: &mut HashMap<String, Vec<Matcher>>,
    depth: usize,
    limits: &crate::PatternLimits,
) -> Result<(), QuaminaError> {
    if depth >= limits.max_pattern_depth {
        return Err(QuaminaError::PatternTooComplex(format!(
            "pattern nesting depth {} exceeds maximum of {} (at path '{}')",
            depth + 1,
            limits.max_pattern_depth,
            prefix
        )));
    }

    for (key, value) in obj {
        let path = match prefix.as_str() {
            "" => key.clone(),
            parent => format!("{}{}{}", parent, SEGMENT_SEPARATOR, key),
        };

        match value {
            Value::Object(nested) => {
                extract_pattern_fields(nested, path, fields, depth + 1, limits)?;
            }
            Value::Array(arr) => {
                let matchers = arr
                    .iter()
                    .map(value_to_matcher)
                    .collect::<Result<Vec<Matcher>, QuaminaError>>()?;
                fields.insert(path, matchers);

                let count = fields.len();
                if count > limits.max_fields_per_pattern {
                    return Err(QuaminaError::PatternTooComplex(format!(
                        "pattern has {} fields, exceeding maximum of {}",
                        count, limits.max_fields_per_pattern
                    )));
                }
            }
            _ => {
                return Err(QuaminaError::InvalidPattern(format!(
                    "pattern field '{}' must be array or object",
                    path
                )));
            }
        }
    }
    Ok(())
}
fn value_to_matcher(value: &Value) -> Result<Matcher, QuaminaError> {
match value {
Value::Object(obj) => {
if let Some((key, val)) = obj.first() {
match key.as_str() {
"exists" => {
if let Value::Bool(b) = val {
return Ok(Matcher::Exists(*b));
}
return Err(QuaminaError::InvalidPattern(
"exists value must be a boolean".into(),
));
}
"prefix" => {
if let Value::String(s) = val {
return Ok(Matcher::Prefix(s.clone()));
}
return Err(QuaminaError::InvalidPattern(
"prefix value must be a string".into(),
));
}
"suffix" => {
if let Value::String(s) = val {
return Ok(Matcher::Suffix(s.clone()));
}
return Err(QuaminaError::InvalidPattern(
"suffix value must be a string".into(),
));
}
"wildcard" => {
if let Value::String(s) = val {
if !validate_wildcard(s) {
return Err(QuaminaError::InvalidPattern(
"wildcard pattern has invalid escape sequence or adjacent '**'"
.into(),
));
}
return Ok(Matcher::Wildcard(s.clone()));
}
return Err(QuaminaError::InvalidPattern(
"wildcard value must be a string".into(),
));
}
"shellstyle" => {
if let Value::String(s) = val {
if s.contains("**") {
return Err(QuaminaError::InvalidPattern(
"shellstyle pattern cannot contain '**'".into(),
));
}
return Ok(Matcher::Shellstyle(s.clone()));
}
return Err(QuaminaError::InvalidPattern(
"shellstyle value must be a string".into(),
));
}
"anything-but" => {
if let Value::String(s) = val {
return Ok(Matcher::AnythingBut(vec![s.clone()]));
}
if let Value::Number(n) = val {
if let Ok(f) = n.parse::<f64>() {
return Ok(Matcher::AnythingButNumeric(vec![f]));
}
return Err(QuaminaError::InvalidPattern(
"anything-but numeric value is not a valid number".into(),
));
}
if let Value::Array(arr) = val {
if arr.is_empty() {
return Err(QuaminaError::InvalidPattern(
"anything-but array cannot be empty".into(),
));
}
let strings: Vec<String> = arr
.iter()
.filter_map(|v| match v {
Value::String(s) => Some(s.clone()),
_ => None,
})
.collect();
let numbers: Vec<f64> = arr
.iter()
.filter_map(|v| match v {
Value::Number(n) => n.parse::<f64>().ok(),
_ => None,
})
.collect();
if !strings.is_empty() && !numbers.is_empty() {
return Err(QuaminaError::InvalidPattern(
"anything-but array must contain only strings or only numbers"
.into(),
));
}
if !strings.is_empty() {
return Ok(Matcher::AnythingBut(strings));
}
if !numbers.is_empty() {
return Ok(Matcher::AnythingButNumeric(numbers));
}
return Err(QuaminaError::InvalidPattern(
"anything-but array must contain strings or numbers".into(),
));
}
return Err(QuaminaError::InvalidPattern(
"anything-but value must be a string, number, or array".into(),
));
}
"equals-ignore-case" => {
if let Value::String(s) = val {
return Ok(Matcher::EqualsIgnoreCase(s.clone()));
}
return Err(QuaminaError::InvalidPattern(
"equals-ignore-case value must be a string".into(),
));
}
"numeric" => {
if let Value::Array(arr) = val {
if let Some(cmp) = parse_numeric_comparison(arr) {
return Ok(Matcher::Numeric(cmp));
}
return Err(QuaminaError::InvalidPattern(
"invalid numeric comparison format".into(),
));
}
return Err(QuaminaError::InvalidPattern(
"numeric value must be an array".into(),
));
}
"regexp" | "regex" => {
if let Value::String(s) = val {
match parse_regexp(s) {
Ok(tree) => {
let tree = if has_word_boundary(&tree) {
let expanded = expand_word_boundaries(&tree);
match expanded {
Ok(t) if t.is_empty() => {
return Err(QuaminaError::InvalidPattern(
"word boundary ~b/~B is impossible in this pattern: adjacent characters are in the same word-class".into(),
));
}
Ok(t) => t,
Err(e) => {
return Err(QuaminaError::InvalidPattern(format!(
"word boundary expansion failed: {}",
e
)))
}
}
} else {
tree
};
if has_top_level_lookaround(&tree) {
match transform_lookaround_pattern(&tree) {
Ok(mc) => return Ok(Matcher::MultiCondition(mc)),
Err(e) => {
return Err(QuaminaError::InvalidPattern(format!(
"lookaround transformation failed: {}",
e
)))
}
}
}
return Ok(Matcher::ParsedRegexp(tree));
}
Err(e) => {
return Err(QuaminaError::InvalidPattern(format!(
"invalid regexp: {}",
e.message
)));
}
}
}
return Err(QuaminaError::InvalidPattern(
"regex value must be a string".into(),
));
}
"cidr" => {
if let Value::String(s) = val {
if let Some(cidr) = CidrPattern::parse(s) {
return Ok(Matcher::Cidr(cidr));
}
return Err(QuaminaError::InvalidPattern(format!(
"invalid CIDR notation: {}",
s
)));
}
return Err(QuaminaError::InvalidPattern(
"cidr value must be a string".into(),
));
}
_ => {
return Err(QuaminaError::InvalidPattern(format!(
"unknown operator '{}'",
key
)));
}
}
}
Err(QuaminaError::InvalidPattern(
"matcher object cannot be empty".into(),
))
}
Value::Number(n) => {
n.parse::<f64>().map_or_else(
|_| Ok(Matcher::Exact(value_to_string(value))),
|f| Ok(Matcher::NumericExact(f)),
)
}
_ => Ok(Matcher::Exact(value_to_string(value))),
}
}
/// Parses the operand of a `numeric` operator: a flat list of
/// `operator, number` pairs such as `[">=", 1, "<", 10]`.
///
/// Supported operators are `>`, `>=`, `<`, `<=` and `=`; a later pair
/// overrides the bound set by an earlier one. An empty list yields a
/// comparison with no bounds.
///
/// Returns `None` when an operator is not a string or is unknown, a number
/// is missing, is not a JSON number, or does not parse as `f64`.
fn parse_numeric_comparison(arr: &[Value]) -> Option<NumericComparison> {
    let mut bounds = NumericComparison {
        lower: None,
        upper: None,
    };

    for pair in arr.chunks(2) {
        // A trailing operator without a number shows up as a one-element chunk.
        let [Value::String(op), Value::Number(text)] = pair else {
            return None;
        };
        let num = text.parse::<f64>().ok()?;

        match op.as_str() {
            ">" => bounds.lower = Some((false, num)),
            ">=" => bounds.lower = Some((true, num)),
            "<" => bounds.upper = Some((false, num)),
            "<=" => bounds.upper = Some((true, num)),
            "=" => {
                bounds.lower = Some((true, num));
                bounds.upper = Some((true, num));
            }
            _ => return None,
        }
    }

    Some(bounds)
}
/// Renders a scalar JSON value as the text stored in `Matcher::Exact`.
///
/// Strings are wrapped in double quotes (their content is not re-escaped),
/// numbers keep their raw source text, booleans and `null` use their JSON
/// spelling. Objects and arrays render as the empty string.
fn value_to_string(value: &Value) -> String {
    match value {
        Value::Object(_) | Value::Array(_) => String::new(),
        Value::Null => String::from("null"),
        Value::Bool(flag) => flag.to_string(),
        Value::Number(raw) => raw.to_owned(),
        Value::String(text) => ["\"", text.as_str(), "\""].concat(),
    }
}
/// Checks that a wildcard pattern is well-formed.
///
/// A backslash must be followed by `*` or `\` (an escaped literal), and two
/// unescaped `*` may not be adjacent. Returns `true` when both rules hold.
fn validate_wildcard(pattern: &str) -> bool {
    let mut after_star = false;
    let mut rest = pattern.chars();

    while let Some(ch) = rest.next() {
        after_star = match ch {
            // An escape consumes the next character, which must be `*` or `\`.
            '\\' => match rest.next() {
                Some('*') | Some('\\') => false,
                _ => return false,
            },
            '*' if after_star => return false,
            '*' => true,
            _ => false,
        };
    }
    true
}
/// A parsed JSON value, as produced by `Parser`.
///
/// * `Object` keeps its members as `(key, value)` pairs in source order;
///   duplicate keys are not merged.
/// * `Number` holds the number's raw source text, not a parsed numeric
///   value; callers convert it with `str::parse::<f64>` where needed.
/// * `String` holds the decoded text (escape sequences already resolved).
#[derive(Debug, Clone)]
enum Value {
Object(Vec<(String, Self)>),
Array(Vec<Self>),
String(String),
Number(String),
Bool(bool),
Null,
}
/// A small recursive-descent JSON reader over a borrowed string.
struct Parser<'a> {
    /// The complete text being parsed.
    input: &'a str,
    /// Byte offset of the next unread character; always on a char boundary.
    pos: usize,
}

impl<'a> Parser<'a> {
    /// Creates a parser positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        Self { input, pos: 0 }
    }

    /// Parses one JSON value, skipping leading whitespace.
    ///
    /// Content after the value is left unread. Fails with
    /// `QuaminaError::InvalidJson` on malformed input or end of input.
    fn parse_value(&mut self) -> Result<Value, QuaminaError> {
        self.skip_whitespace();
        match self.peek() {
            Some('{') => self.parse_object(),
            Some('[') => self.parse_array(),
            Some('"') => self.parse_string().map(Value::String),
            Some('t' | 'f') => self.parse_bool(),
            Some('n') => self.parse_null(),
            Some(c) if c == '-' || c.is_ascii_digit() => self.parse_number(),
            Some(c) => Err(QuaminaError::InvalidJson(format!("unexpected char: {}", c))),
            None => Err(QuaminaError::InvalidJson("unexpected end".into())),
        }
    }

    /// Parses an object; members are kept in source order.
    fn parse_object(&mut self) -> Result<Value, QuaminaError> {
        self.expect('{')?;
        let mut pairs = Vec::new();
        self.skip_whitespace();
        if self.peek() == Some('}') {
            self.advance();
            return Ok(Value::Object(pairs));
        }
        loop {
            self.skip_whitespace();
            let key = self.parse_string()?;
            self.skip_whitespace();
            self.expect(':')?;
            let value = self.parse_value()?;
            pairs.push((key, value));
            self.skip_whitespace();
            match self.peek() {
                Some(',') => self.advance(),
                Some('}') => {
                    self.advance();
                    break;
                }
                _ => return Err(QuaminaError::InvalidJson("expected , or }".into())),
            }
        }
        Ok(Value::Object(pairs))
    }

    /// Parses an array.
    fn parse_array(&mut self) -> Result<Value, QuaminaError> {
        self.expect('[')?;
        let mut items = Vec::new();
        self.skip_whitespace();
        if self.peek() == Some(']') {
            self.advance();
            return Ok(Value::Array(items));
        }
        loop {
            items.push(self.parse_value()?);
            self.skip_whitespace();
            match self.peek() {
                Some(',') => self.advance(),
                Some(']') => {
                    self.advance();
                    break;
                }
                _ => return Err(QuaminaError::InvalidJson("expected , or ]".into())),
            }
        }
        Ok(Value::Array(items))
    }

    /// Parses a double-quoted string and returns its decoded content.
    ///
    /// The standard escapes and `\uXXXX` (including surrogate pairs) are
    /// decoded. A backslash before any other character yields that character
    /// unchanged. An unpaired surrogate decodes to U+FFFD.
    fn parse_string(&mut self) -> Result<String, QuaminaError> {
        self.expect('"')?;
        let mut result = String::new();
        while let Some(c) = self.peek() {
            if c == '"' {
                break;
            }
            self.advance();
            if c != '\\' {
                result.push(c);
                continue;
            }
            // A backslash at end of input: leave the loop and let the
            // closing-quote check below report the error.
            let Some(escaped) = self.peek() else {
                break;
            };
            self.advance();
            match escaped {
                'n' => result.push('\n'),
                'r' => result.push('\r'),
                't' => result.push('\t'),
                'b' => result.push('\x08'),
                'f' => result.push('\x0c'),
                'u' => result.push(self.parse_unicode_char()?),
                // Covers `\\`, `\"`, `\/` and, leniently, unknown escapes.
                other => result.push(other),
            }
        }
        self.expect('"')?;
        Ok(result)
    }

    /// Decodes the character of a `\u` escape whose `\u` prefix has already
    /// been consumed.
    ///
    /// A high surrogate is combined with an immediately following `\uXXXX`
    /// low surrogate. An unpaired surrogate yields U+FFFD, and whatever
    /// follows it is left unread so that it is parsed normally.
    fn parse_unicode_char(&mut self) -> Result<char, QuaminaError> {
        const REPLACEMENT: char = '\u{FFFD}';

        let unit = self.parse_unicode_escape()?;
        if !(0xD800..=0xDBFF).contains(&unit) {
            // `from_u32` is `None` only for a lone low surrogate here.
            return Ok(char::from_u32(unit).unwrap_or(REPLACEMENT));
        }

        let checkpoint = self.pos;
        if self.input[self.pos..].starts_with("\\u") {
            self.pos += 2;
            let low = self.parse_unicode_escape()?;
            if (0xDC00..=0xDFFF).contains(&low) {
                let full = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00);
                return Ok(char::from_u32(full).unwrap_or(REPLACEMENT));
            }
        }
        // Not a low surrogate: rewind so the next escape is decoded on its own.
        self.pos = checkpoint;
        Ok(REPLACEMENT)
    }

    /// Reads exactly four hexadecimal digits and returns their value.
    fn parse_unicode_escape(&mut self) -> Result<u32, QuaminaError> {
        let mut value = 0u32;
        for _ in 0..4 {
            let digit = self
                .peek()
                .and_then(|c| c.to_digit(16))
                .ok_or_else(|| QuaminaError::InvalidJson("invalid unicode escape".into()))?;
            value = value * 16 + digit;
            self.advance();
        }
        Ok(value)
    }

    /// Parses a number and returns its raw source text.
    ///
    /// Accepts an optional `-`, one or more digits, an optional fraction and
    /// an optional exponent. Each of the three digit runs must contain at
    /// least one digit, so `-`, `1.` and `1e` are rejected. Leading zeros
    /// are tolerated.
    fn parse_number(&mut self) -> Result<Value, QuaminaError> {
        let start = self.pos;
        if self.peek() == Some('-') {
            self.advance();
        }
        self.expect_digits()?;
        if self.peek() == Some('.') {
            self.advance();
            self.expect_digits()?;
        }
        if matches!(self.peek(), Some('e' | 'E')) {
            self.advance();
            if matches!(self.peek(), Some('+' | '-')) {
                self.advance();
            }
            self.expect_digits()?;
        }
        Ok(Value::Number(self.input[start..self.pos].to_string()))
    }

    /// Consumes a run of ASCII digits, failing if the run is empty.
    fn expect_digits(&mut self) -> Result<(), QuaminaError> {
        let start = self.pos;
        while self.peek().map_or(false, |c| c.is_ascii_digit()) {
            self.advance();
        }
        if self.pos == start {
            return Err(QuaminaError::InvalidJson(
                "invalid number: expected digit".into(),
            ));
        }
        Ok(())
    }

    /// Parses the literal `true` or `false`.
    fn parse_bool(&mut self) -> Result<Value, QuaminaError> {
        let rest = &self.input[self.pos..];
        if rest.starts_with("true") {
            self.pos += 4;
            Ok(Value::Bool(true))
        } else if rest.starts_with("false") {
            self.pos += 5;
            Ok(Value::Bool(false))
        } else {
            Err(QuaminaError::InvalidJson("expected bool".into()))
        }
    }

    /// Parses the literal `null`.
    fn parse_null(&mut self) -> Result<Value, QuaminaError> {
        if self.input[self.pos..].starts_with("null") {
            self.pos += 4;
            Ok(Value::Null)
        } else {
            Err(QuaminaError::InvalidJson("expected null".into()))
        }
    }

    /// Returns the next unread character without consuming it.
    fn peek(&self) -> Option<char> {
        self.input[self.pos..].chars().next()
    }

    /// Consumes the next character, if any.
    fn advance(&mut self) {
        if let Some(c) = self.peek() {
            self.pos += c.len_utf8();
        }
    }

    /// Consumes characters for which `char::is_whitespace` is true.
    fn skip_whitespace(&mut self) {
        while self.peek().map_or(false, |c| c.is_whitespace()) {
            self.advance();
        }
    }

    /// Consumes `c`, or fails with `InvalidJson` if the next character differs.
    fn expect(&mut self, c: char) -> Result<(), QuaminaError> {
        if self.peek() == Some(c) {
            self.advance();
            Ok(())
        } else {
            Err(QuaminaError::InvalidJson(format!("expected '{}'", c)))
        }
    }
}