Skip to main content

heliosdb_proxy/cache/
normalizer.rs

1//! Query Normalizer
2//!
3//! Normalizes SQL queries for cache key generation by:
4//! - Replacing literal values with placeholders
5//! - Normalizing whitespace
6//! - Extracting table names
7//! - Computing stable hashes
8
9use std::collections::hash_map::DefaultHasher;
10use std::hash::{Hash, Hasher};
11use regex::Regex;
12use once_cell::sync::Lazy;
13
14/// Normalized query representation
15#[derive(Debug, Clone)]
16pub struct NormalizedQuery {
17    /// Normalized query fingerprint (literals replaced with ?)
18    pub fingerprint: String,
19
20    /// Hash of the fingerprint for fast comparison
21    pub hash: u64,
22
23    /// Tables referenced in the query
24    pub tables: Vec<String>,
25
26    /// Extracted parameter values
27    pub parameters: Vec<String>,
28}
29
30impl NormalizedQuery {
31    /// Get the fingerprint for display
32    pub fn fingerprint(&self) -> &str {
33        &self.fingerprint
34    }
35
36    /// Get the hash value
37    pub fn hash(&self) -> u64 {
38        self.hash
39    }
40
41    /// Get referenced tables
42    pub fn tables(&self) -> &[String] {
43        &self.tables
44    }
45}
46
47/// Query normalizer for cache key generation
48#[derive(Debug, Clone)]
49pub struct QueryNormalizer {
50    /// Whether to preserve parameter order
51    preserve_order: bool,
52}
53
54// Regex patterns for normalization
55static STRING_LITERAL: Lazy<Regex> = Lazy::new(|| {
56    Regex::new(r#"'(?:[^'\\]|\\.)*'"#).unwrap()
57});
58
59static DOUBLE_QUOTED: Lazy<Regex> = Lazy::new(|| {
60    Regex::new(r#""(?:[^"\\]|\\.)*""#).unwrap()
61});
62
63static NUMBER_LITERAL: Lazy<Regex> = Lazy::new(|| {
64    Regex::new(r"\b\d+(?:\.\d+)?(?:e[+-]?\d+)?\b").unwrap()
65});
66
67static WHITESPACE: Lazy<Regex> = Lazy::new(|| {
68    Regex::new(r"\s+").unwrap()
69});
70
71static TABLE_PATTERN: Lazy<Regex> = Lazy::new(|| {
72    Regex::new(r"(?i)(?:FROM|JOIN|INTO|UPDATE|TABLE)\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)?)").unwrap()
73});
74
75static HINT_PATTERN: Lazy<Regex> = Lazy::new(|| {
76    Regex::new(r"/\*[^*]*\*/").unwrap()
77});
78
79static COMMENT_PATTERN: Lazy<Regex> = Lazy::new(|| {
80    Regex::new(r"--[^\n]*").unwrap()
81});
82
83impl QueryNormalizer {
84    /// Create a new query normalizer
85    pub fn new() -> Self {
86        Self {
87            preserve_order: true,
88        }
89    }
90
91    /// Create a normalizer that doesn't preserve parameter order
92    pub fn unordered() -> Self {
93        Self {
94            preserve_order: false,
95        }
96    }
97
98    /// Normalize a SQL query
99    pub fn normalize(&self, sql: &str) -> NormalizedQuery {
100        let mut parameters = Vec::new();
101
102        // Strip comments and hints first
103        let sql = HINT_PATTERN.replace_all(sql, "");
104        let sql = COMMENT_PATTERN.replace_all(&sql, "");
105
106        // Extract tables before normalization
107        let tables = self.extract_tables(&sql);
108
109        // Replace string literals with placeholders
110        let sql = STRING_LITERAL.replace_all(&sql, |caps: &regex::Captures| {
111            let value = caps.get(0).unwrap().as_str();
112            // Remove quotes and store the value
113            let inner = &value[1..value.len()-1];
114            parameters.push(inner.to_string());
115            "?"
116        });
117
118        // Replace number literals with placeholders
119        let sql = NUMBER_LITERAL.replace_all(&sql, |caps: &regex::Captures| {
120            let value = caps.get(0).unwrap().as_str();
121            parameters.push(value.to_string());
122            "?"
123        });
124
125        // Normalize whitespace
126        let sql = WHITESPACE.replace_all(&sql, " ");
127
128        // Trim and convert to uppercase for consistency
129        let fingerprint = sql.trim().to_uppercase();
130
131        // Compute hash
132        let mut hasher = DefaultHasher::new();
133        fingerprint.hash(&mut hasher);
134        if self.preserve_order {
135            // Include parameters in hash if order matters
136            for param in &parameters {
137                param.hash(&mut hasher);
138            }
139        }
140        let hash = hasher.finish();
141
142        NormalizedQuery {
143            fingerprint,
144            hash,
145            tables,
146            parameters,
147        }
148    }
149
150    /// Extract table names from a SQL query
151    fn extract_tables(&self, sql: &str) -> Vec<String> {
152        let mut tables = Vec::new();
153
154        for cap in TABLE_PATTERN.captures_iter(sql) {
155            if let Some(table_match) = cap.get(1) {
156                let table = table_match.as_str().to_lowercase();
157                // Remove schema prefix if present
158                let table_name = table.split('.').last().unwrap_or(&table);
159                if !tables.contains(&table_name.to_string()) {
160                    tables.push(table_name.to_string());
161                }
162            }
163        }
164
165        tables
166    }
167
168    /// Normalize for comparison only (no parameter extraction)
169    pub fn fingerprint(&self, sql: &str) -> String {
170        // Strip comments
171        let sql = HINT_PATTERN.replace_all(sql, "");
172        let sql = COMMENT_PATTERN.replace_all(&sql, "");
173
174        // Replace literals
175        let sql = STRING_LITERAL.replace_all(&sql, "?");
176        let sql = NUMBER_LITERAL.replace_all(&sql, "?");
177
178        // Normalize whitespace
179        let sql = WHITESPACE.replace_all(&sql, " ");
180
181        sql.trim().to_uppercase()
182    }
183
184    /// Check if two queries are equivalent (same fingerprint)
185    pub fn are_equivalent(&self, sql1: &str, sql2: &str) -> bool {
186        self.fingerprint(sql1) == self.fingerprint(sql2)
187    }
188}
189
190impl Default for QueryNormalizer {
191    fn default() -> Self {
192        Self::new()
193    }
194}
195
196/// Quick fingerprint generation without full normalization
197pub fn quick_fingerprint(sql: &str) -> u64 {
198    let normalized = QueryNormalizer::new().fingerprint(sql);
199    let mut hasher = DefaultHasher::new();
200    normalized.hash(&mut hasher);
201    hasher.finish()
202}
203
204/// Extract tables from SQL without full normalization
205pub fn extract_tables(sql: &str) -> Vec<String> {
206    let normalizer = QueryNormalizer::new();
207    normalizer.extract_tables(sql)
208}
209
210#[cfg(test)]
211mod tests {
212    use super::*;
213
214    #[test]
215    fn test_normalize_simple_query() {
216        let normalizer = QueryNormalizer::new();
217        let query = "SELECT * FROM users WHERE id = 123";
218        let normalized = normalizer.normalize(query);
219
220        assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE ID = ?");
221        assert_eq!(normalized.parameters, vec!["123"]);
222        assert_eq!(normalized.tables, vec!["users"]);
223    }
224
225    #[test]
226    fn test_normalize_string_literals() {
227        let normalizer = QueryNormalizer::new();
228        let query = "SELECT * FROM users WHERE name = 'John Doe'";
229        let normalized = normalizer.normalize(query);
230
231        assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE NAME = ?");
232        assert_eq!(normalized.parameters, vec!["John Doe"]);
233    }
234
235    #[test]
236    fn test_normalize_multiple_parameters() {
237        let normalizer = QueryNormalizer::new();
238        let query = "SELECT * FROM users WHERE age > 18 AND status = 'active' AND score < 100";
239        let normalized = normalizer.normalize(query);
240
241        assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE AGE > ? AND STATUS = ? AND SCORE < ?");
242        assert_eq!(normalized.parameters.len(), 3);
243    }
244
245    #[test]
246    fn test_extract_tables_join() {
247        let normalizer = QueryNormalizer::new();
248        let query = "SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id";
249        let normalized = normalizer.normalize(query);
250
251        assert!(normalized.tables.contains(&"users".to_string()));
252        assert!(normalized.tables.contains(&"orders".to_string()));
253    }
254
255    #[test]
256    fn test_normalize_removes_comments() {
257        let normalizer = QueryNormalizer::new();
258        let query = "/* helios:cache_ttl=60 */ SELECT * FROM users -- inline comment\nWHERE id = 1";
259        let normalized = normalizer.normalize(query);
260
261        assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE ID = ?");
262    }
263
264    #[test]
265    fn test_normalize_whitespace() {
266        let normalizer = QueryNormalizer::new();
267        let query1 = "SELECT  *  FROM   users   WHERE   id=1";
268        let query2 = "SELECT * FROM users WHERE id=1";
269
270        assert_eq!(
271            normalizer.fingerprint(query1),
272            normalizer.fingerprint(query2)
273        );
274    }
275
276    #[test]
277    fn test_equivalent_queries() {
278        let normalizer = QueryNormalizer::new();
279
280        // Same query with different literal values
281        let query1 = "SELECT * FROM users WHERE id = 123";
282        let query2 = "SELECT * FROM users WHERE id = 456";
283
284        assert!(normalizer.are_equivalent(query1, query2));
285
286        // Different query structure
287        let query3 = "SELECT * FROM users WHERE name = 'test'";
288        assert!(!normalizer.are_equivalent(query1, query3));
289    }
290
291    #[test]
292    fn test_hash_consistency() {
293        let normalizer = QueryNormalizer::new();
294
295        let query1 = "SELECT * FROM users WHERE id = 1";
296        let query2 = "SELECT * FROM users WHERE id = 1";
297
298        let norm1 = normalizer.normalize(query1);
299        let norm2 = normalizer.normalize(query2);
300
301        assert_eq!(norm1.hash, norm2.hash);
302    }
303
304    #[test]
305    fn test_hash_different_params() {
306        let normalizer = QueryNormalizer::new();
307
308        // With preserve_order=true, different params should have different hashes
309        let query1 = "SELECT * FROM users WHERE id = 1";
310        let query2 = "SELECT * FROM users WHERE id = 2";
311
312        let norm1 = normalizer.normalize(query1);
313        let norm2 = normalizer.normalize(query2);
314
315        assert_ne!(norm1.hash, norm2.hash);
316    }
317
318    #[test]
319    fn test_unordered_normalizer() {
320        let normalizer = QueryNormalizer::unordered();
321
322        // With preserve_order=false, different params should have same hash
323        let query1 = "SELECT * FROM users WHERE id = 1";
324        let query2 = "SELECT * FROM users WHERE id = 2";
325
326        let norm1 = normalizer.normalize(query1);
327        let norm2 = normalizer.normalize(query2);
328
329        // Fingerprints are the same
330        assert_eq!(norm1.fingerprint, norm2.fingerprint);
331    }
332
333    #[test]
334    fn test_extract_tables_various() {
335        let normalizer = QueryNormalizer::new();
336
337        let queries = vec![
338            ("INSERT INTO users VALUES (1)", vec!["users"]),
339            ("UPDATE products SET price = 10", vec!["products"]),
340            ("DELETE FROM orders WHERE id = 1", vec!["orders"]),
341            ("SELECT * FROM schema.table", vec!["table"]),
342            ("TABLE users", vec!["users"]),
343        ];
344
345        for (sql, expected_tables) in queries {
346            let normalized = normalizer.normalize(sql);
347            for table in expected_tables {
348                assert!(
349                    normalized.tables.contains(&table.to_string()),
350                    "Query '{}' should contain table '{}'",
351                    sql,
352                    table
353                );
354            }
355        }
356    }
357
358    #[test]
359    fn test_decimal_numbers() {
360        let normalizer = QueryNormalizer::new();
361        let query = "SELECT * FROM products WHERE price < 99.99 AND rating > 4.5";
362        let normalized = normalizer.normalize(query);
363
364        assert!(normalized.parameters.contains(&"99.99".to_string()));
365        assert!(normalized.parameters.contains(&"4.5".to_string()));
366    }
367
368    #[test]
369    fn test_scientific_notation() {
370        let normalizer = QueryNormalizer::new();
371        let query = "SELECT * FROM data WHERE value = 1e10";
372        let normalized = normalizer.normalize(query);
373
374        assert!(normalized.fingerprint.contains("VALUE = ?"));
375    }
376
377    #[test]
378    fn test_quick_fingerprint() {
379        let hash1 = quick_fingerprint("SELECT * FROM users WHERE id = 1");
380        let hash2 = quick_fingerprint("SELECT * FROM users WHERE id = 2");
381
382        // Quick fingerprint ignores parameter values
383        assert_eq!(hash1, hash2);
384    }
385}