use crate::ast::Literal;
use crate::lexer::{lex, LexError};
use crate::token::Token;
const FNV_OFFSET: u64 = 0xcbf29ce484222325;
const FNV_PRIME: u64 = 0x100000001b3;
#[inline]
fn hash_byte(mut h: u64, b: u8) -> u64 {
h ^= b as u64;
h = h.wrapping_mul(FNV_PRIME);
h
}
#[inline]
fn hash_bytes(mut h: u64, bytes: &[u8]) -> u64 {
for b in bytes {
h = hash_byte(h, *b);
}
h
}
pub fn canonicalize(input: &str) -> Result<(u64, Vec<Literal>), LexError> {
let tokens = lex(input)?;
let tokens = normalize_limit_offset_order(tokens);
let mut hash = FNV_OFFSET;
let mut literals: Vec<Literal> = Vec::with_capacity(4);
for tok in &tokens {
hash = hash_token(hash, tok, &mut literals);
}
Ok((hash, literals))
}
fn normalize_limit_offset_order(mut tokens: Vec<Token>) -> Vec<Token> {
let mut limit_pos: Option<usize> = None;
let mut offset_pos: Option<usize> = None;
for (i, tok) in tokens.iter().enumerate() {
match tok {
Token::Limit if limit_pos.is_none() => limit_pos = Some(i),
Token::Offset if offset_pos.is_none() => offset_pos = Some(i),
_ => {}
}
}
let (Some(off_at), Some(lim_at)) = (offset_pos, limit_pos) else {
return tokens;
};
if off_at >= lim_at {
return tokens;
}
fn clause_end(tokens: &[Token], start: usize) -> usize {
let mut i = start;
while i < tokens.len() {
match &tokens[i] {
Token::Limit
| Token::Offset
| Token::Filter
| Token::Order
| Token::Group
| Token::LBrace
| Token::Update
| Token::Delete
| Token::Eof => return i,
_ => i += 1,
}
}
i
}
let off_expr_end = clause_end(&tokens, off_at + 1);
if off_expr_end != lim_at {
return tokens;
}
let lim_expr_end = clause_end(&tokens, lim_at + 1);
let tail: Vec<Token> = tokens.drain(lim_expr_end..).collect();
let limit_chunk: Vec<Token> = tokens.drain(lim_at..lim_expr_end).collect();
let offset_chunk: Vec<Token> = tokens.drain(off_at..lim_at).collect();
tokens.extend(limit_chunk);
tokens.extend(offset_chunk);
tokens.extend(tail);
tokens
}
fn hash_token(h: u64, tok: &Token, literals: &mut Vec<Literal>) -> u64 {
match tok {
Token::Ident(s) => {
let h = hash_byte(h, 0x01);
let h = hash_byte(h, s.len() as u8);
hash_bytes(h, s.as_bytes())
}
Token::DotIdent(s) => {
let h = hash_byte(h, 0x02);
let h = hash_byte(h, s.len() as u8);
hash_bytes(h, s.as_bytes())
}
Token::Param(s) => {
let h = hash_byte(h, 0x03);
let h = hash_byte(h, s.len() as u8);
hash_bytes(h, s.as_bytes())
}
Token::IntLit(v) => {
literals.push(Literal::Int(*v));
hash_byte(h, 0xF0)
}
Token::FloatLit(v) => {
literals.push(Literal::Float(*v));
hash_byte(h, 0xF1)
}
Token::StringLit(s) => {
literals.push(Literal::String(s.clone()));
hash_byte(h, 0xF2)
}
Token::BoolLit(v) => {
literals.push(Literal::Bool(*v));
hash_byte(h, 0xF3)
}
Token::Type => hash_byte(h, 0x10),
Token::Filter => hash_byte(h, 0x11),
Token::Order => hash_byte(h, 0x12),
Token::Limit => hash_byte(h, 0x13),
Token::Offset => hash_byte(h, 0x14),
Token::Insert => hash_byte(h, 0x15),
Token::Update => hash_byte(h, 0x16),
Token::Delete => hash_byte(h, 0x17),
Token::Upsert => hash_byte(h, 0x18),
Token::Select => hash_byte(h, 0x19),
Token::Required => hash_byte(h, 0x1A),
Token::Multi => hash_byte(h, 0x1B),
Token::Link => hash_byte(h, 0x1C),
Token::Index => hash_byte(h, 0x1D),
Token::On => hash_byte(h, 0x1E),
Token::Asc => hash_byte(h, 0x1F),
Token::Desc => hash_byte(h, 0x20),
Token::And => hash_byte(h, 0x21),
Token::Or => hash_byte(h, 0x22),
Token::Not => hash_byte(h, 0x23),
Token::Exists => hash_byte(h, 0x24),
Token::Let => hash_byte(h, 0x25),
Token::As => hash_byte(h, 0x26),
Token::Match => hash_byte(h, 0x27),
Token::Group => hash_byte(h, 0x28),
Token::Transaction => hash_byte(h, 0x29),
Token::View => hash_byte(h, 0x2A),
Token::Materialized => hash_byte(h, 0x2B),
Token::Count => hash_byte(h, 0x2C),
Token::Avg => hash_byte(h, 0x2D),
Token::Sum => hash_byte(h, 0x2E),
Token::Min => hash_byte(h, 0x2F),
Token::Max => hash_byte(h, 0x30),
Token::Join => hash_byte(h, 0x31),
Token::Inner => hash_byte(h, 0x32),
Token::LeftKw => hash_byte(h, 0x33),
Token::RightKw => hash_byte(h, 0x34),
Token::Outer => hash_byte(h, 0x35),
Token::Cross => hash_byte(h, 0x36),
Token::Distinct => hash_byte(h, 0x37),
Token::In => hash_byte(h, 0x38),
Token::Between => hash_byte(h, 0x39),
Token::Like => hash_byte(h, 0x3A),
Token::Having => hash_byte(h, 0x3B),
Token::Is => hash_byte(h, 0x3C),
Token::Null => hash_byte(h, 0x3D),
Token::Upper => hash_byte(h, 0x50),
Token::Lower => hash_byte(h, 0x51),
Token::Length => hash_byte(h, 0x52),
Token::Trim => hash_byte(h, 0x53),
Token::Substring => hash_byte(h, 0x54),
Token::Concat => hash_byte(h, 0x55),
Token::Case => hash_byte(h, 0x56),
Token::When => hash_byte(h, 0x57),
Token::Then => hash_byte(h, 0x58),
Token::Else => hash_byte(h, 0x59),
Token::End => hash_byte(h, 0x5A),
Token::Alter => hash_byte(h, 0x5B),
Token::Drop => hash_byte(h, 0x5C),
Token::Add => hash_byte(h, 0x5D),
Token::Column => hash_byte(h, 0x5E),
Token::Refresh => hash_byte(h, 0x5F),
Token::Union => hash_byte(h, 0x67),
Token::Over => hash_byte(h, 0x68),
Token::Partition => hash_byte(h, 0x69),
Token::RowNumber => hash_byte(h, 0x6A),
Token::Rank => hash_byte(h, 0x6B),
Token::DenseRank => hash_byte(h, 0x6C),
Token::Explain => hash_byte(h, 0x6D),
Token::Conflict => hash_byte(h, 0x6E),
Token::Abs => hash_byte(h, 0x70),
Token::Round => hash_byte(h, 0x71),
Token::Ceil => hash_byte(h, 0x72),
Token::Floor => hash_byte(h, 0x73),
Token::Sqrt => hash_byte(h, 0x74),
Token::Pow => hash_byte(h, 0x75),
Token::Now => hash_byte(h, 0x76),
Token::Extract => hash_byte(h, 0x77),
Token::DateAdd => hash_byte(h, 0x78),
Token::DateDiff => hash_byte(h, 0x79),
Token::Cast => hash_byte(h, 0x7A),
Token::Begin => hash_byte(h, 0x7B),
Token::Commit => hash_byte(h, 0x7C),
Token::Rollback => hash_byte(h, 0x7D),
Token::Eq => hash_byte(h, 0x40),
Token::Neq => hash_byte(h, 0x41),
Token::Lt => hash_byte(h, 0x42),
Token::Gt => hash_byte(h, 0x43),
Token::Lte => hash_byte(h, 0x44),
Token::Gte => hash_byte(h, 0x45),
Token::Assign => hash_byte(h, 0x46),
Token::Arrow => hash_byte(h, 0x47),
Token::Pipe => hash_byte(h, 0x48),
Token::Coalesce => hash_byte(h, 0x49),
Token::Plus => hash_byte(h, 0x4A),
Token::Minus => hash_byte(h, 0x4B),
Token::Star => hash_byte(h, 0x4C),
Token::Slash => hash_byte(h, 0x4D),
Token::LBrace => hash_byte(h, 0x60),
Token::RBrace => hash_byte(h, 0x61),
Token::LParen => hash_byte(h, 0x62),
Token::RParen => hash_byte(h, 0x63),
Token::Comma => hash_byte(h, 0x64),
Token::Colon => hash_byte(h, 0x65),
Token::Dot => hash_byte(h, 0x66),
Token::Eof => hash_byte(h, 0x7F),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_same_query_same_hash() {
let (h1, lits1) = canonicalize("User filter .age > 30").unwrap();
let (h2, lits2) = canonicalize("User filter .age > 30").unwrap();
assert_eq!(h1, h2);
assert_eq!(lits1, lits2);
assert_eq!(lits1, vec![Literal::Int(30)]);
}
#[test]
fn test_different_literal_same_hash() {
let (h1, lits1) = canonicalize("User filter .age > 30").unwrap();
let (h2, lits2) = canonicalize("User filter .age > 99").unwrap();
assert_eq!(h1, h2);
assert_eq!(lits1, vec![Literal::Int(30)]);
assert_eq!(lits2, vec![Literal::Int(99)]);
}
#[test]
fn test_whitespace_and_comments_normalised() {
let (h1, _) = canonicalize("User filter .age > 30").unwrap();
let (h2, _) = canonicalize("User filter\n.age > 30 # foo\n").unwrap();
assert_eq!(h1, h2);
}
#[test]
fn test_different_field_different_hash() {
let (h1, _) = canonicalize("User filter .age > 30").unwrap();
let (h2, _) = canonicalize("User filter .id > 30").unwrap();
assert_ne!(h1, h2);
}
#[test]
fn test_different_table_different_hash() {
let (h1, _) = canonicalize("User filter .age > 30").unwrap();
let (h2, _) = canonicalize("Order filter .age > 30").unwrap();
assert_ne!(h1, h2);
}
#[test]
fn test_different_operator_different_hash() {
let (h1, _) = canonicalize("User filter .age > 30").unwrap();
let (h2, _) = canonicalize("User filter .age < 30").unwrap();
assert_ne!(h1, h2);
}
#[test]
fn test_string_literal_canonicalised() {
let (h1, lits1) = canonicalize(r#"User filter .status = "active""#).unwrap();
let (h2, lits2) = canonicalize(r#"User filter .status = "pending""#).unwrap();
assert_eq!(h1, h2);
assert_eq!(lits1, vec![Literal::String("active".into())]);
assert_eq!(lits2, vec![Literal::String("pending".into())]);
}
#[test]
fn test_multi_literal_collected_in_source_order() {
let (_, lits) =
canonicalize(r#"User filter .age > 30 and .status = "active" { .name }"#).unwrap();
assert_eq!(
lits,
vec![Literal::Int(30), Literal::String("active".into()),]
);
}
#[test]
fn test_insert_literals_in_assignment_order() {
let (_, lits) =
canonicalize(r#"insert User { id := 42, name := "Alice", age := 30 }"#).unwrap();
assert_eq!(
lits,
vec![
Literal::Int(42),
Literal::String("Alice".into()),
Literal::Int(30),
]
);
}
#[test]
fn test_update_by_pk_literals_in_source_order() {
let (_, lits) = canonicalize("User filter .id = 42 update { age := 31 }").unwrap();
assert_eq!(lits, vec![Literal::Int(42), Literal::Int(31)]);
}
}