Skip to main content

heliosdb_proxy/cache/
normalizer.rs

1//! Query Normalizer
2//!
3//! Normalizes SQL queries for cache key generation by:
4//! - Replacing literal values with placeholders
5//! - Normalizing whitespace
6//! - Extracting table names
7//! - Computing stable hashes
8
9use once_cell::sync::Lazy;
10use regex::Regex;
11use std::collections::hash_map::DefaultHasher;
12use std::hash::{Hash, Hasher};
13
14/// Normalized query representation
15#[derive(Debug, Clone)]
16pub struct NormalizedQuery {
17    /// Normalized query fingerprint (literals replaced with ?)
18    pub fingerprint: String,
19
20    /// Hash of the fingerprint for fast comparison
21    pub hash: u64,
22
23    /// Tables referenced in the query
24    pub tables: Vec<String>,
25
26    /// Extracted parameter values
27    pub parameters: Vec<String>,
28}
29
30impl NormalizedQuery {
31    /// Get the fingerprint for display
32    pub fn fingerprint(&self) -> &str {
33        &self.fingerprint
34    }
35
36    /// Get the hash value
37    pub fn hash(&self) -> u64 {
38        self.hash
39    }
40
41    /// Get referenced tables
42    pub fn tables(&self) -> &[String] {
43        &self.tables
44    }
45}
46
47/// Query normalizer for cache key generation
48#[derive(Debug, Clone)]
49pub struct QueryNormalizer {
50    /// Whether to preserve parameter order
51    preserve_order: bool,
52}
53
54// Regex patterns for normalization
55static STRING_LITERAL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"'(?:[^'\\]|\\.)*'"#).unwrap());
56
57#[allow(dead_code)]
58static DOUBLE_QUOTED: Lazy<Regex> = Lazy::new(|| Regex::new(r#""(?:[^"\\]|\\.)*""#).unwrap());
59
60static NUMBER_LITERAL: Lazy<Regex> =
61    Lazy::new(|| Regex::new(r"\b\d+(?:\.\d+)?(?:e[+-]?\d+)?\b").unwrap());
62
63static WHITESPACE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
64
65static TABLE_PATTERN: Lazy<Regex> = Lazy::new(|| {
66    Regex::new(r"(?i)(?:FROM|JOIN|INTO|UPDATE|TABLE)\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)?)").unwrap()
67});
68
69static HINT_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"/\*[^*]*\*/").unwrap());
70
71static COMMENT_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"--[^\n]*").unwrap());
72
73impl QueryNormalizer {
74    /// Create a new query normalizer
75    pub fn new() -> Self {
76        Self {
77            preserve_order: true,
78        }
79    }
80
81    /// Create a normalizer that doesn't preserve parameter order
82    pub fn unordered() -> Self {
83        Self {
84            preserve_order: false,
85        }
86    }
87
88    /// Normalize a SQL query
89    pub fn normalize(&self, sql: &str) -> NormalizedQuery {
90        let mut parameters = Vec::new();
91
92        // Strip comments and hints first
93        let sql = HINT_PATTERN.replace_all(sql, "");
94        let sql = COMMENT_PATTERN.replace_all(&sql, "");
95
96        // Extract tables before normalization
97        let tables = self.extract_tables(&sql);
98
99        // Replace string literals with placeholders
100        let sql = STRING_LITERAL.replace_all(&sql, |caps: &regex::Captures| {
101            let value = caps.get(0).unwrap().as_str();
102            // Remove quotes and store the value
103            let inner = &value[1..value.len() - 1];
104            parameters.push(inner.to_string());
105            "?"
106        });
107
108        // Replace number literals with placeholders
109        let sql = NUMBER_LITERAL.replace_all(&sql, |caps: &regex::Captures| {
110            let value = caps.get(0).unwrap().as_str();
111            parameters.push(value.to_string());
112            "?"
113        });
114
115        // Normalize whitespace
116        let sql = WHITESPACE.replace_all(&sql, " ");
117
118        // Trim and convert to uppercase for consistency
119        let fingerprint = sql.trim().to_uppercase();
120
121        // Compute hash
122        let mut hasher = DefaultHasher::new();
123        fingerprint.hash(&mut hasher);
124        if self.preserve_order {
125            // Include parameters in hash if order matters
126            for param in &parameters {
127                param.hash(&mut hasher);
128            }
129        }
130        let hash = hasher.finish();
131
132        NormalizedQuery {
133            fingerprint,
134            hash,
135            tables,
136            parameters,
137        }
138    }
139
140    /// Extract table names from a SQL query
141    fn extract_tables(&self, sql: &str) -> Vec<String> {
142        let mut tables = Vec::new();
143
144        for cap in TABLE_PATTERN.captures_iter(sql) {
145            if let Some(table_match) = cap.get(1) {
146                let table = table_match.as_str().to_lowercase();
147                // Remove schema prefix if present
148                let table_name = table.split('.').next_back().unwrap_or(&table);
149                if !tables.contains(&table_name.to_string()) {
150                    tables.push(table_name.to_string());
151                }
152            }
153        }
154
155        tables
156    }
157
158    /// Normalize for comparison only (no parameter extraction)
159    pub fn fingerprint(&self, sql: &str) -> String {
160        // Strip comments
161        let sql = HINT_PATTERN.replace_all(sql, "");
162        let sql = COMMENT_PATTERN.replace_all(&sql, "");
163
164        // Replace literals
165        let sql = STRING_LITERAL.replace_all(&sql, "?");
166        let sql = NUMBER_LITERAL.replace_all(&sql, "?");
167
168        // Normalize whitespace
169        let sql = WHITESPACE.replace_all(&sql, " ");
170
171        sql.trim().to_uppercase()
172    }
173
174    /// Check if two queries are equivalent (same fingerprint)
175    pub fn are_equivalent(&self, sql1: &str, sql2: &str) -> bool {
176        self.fingerprint(sql1) == self.fingerprint(sql2)
177    }
178}
179
180impl Default for QueryNormalizer {
181    fn default() -> Self {
182        Self::new()
183    }
184}
185
186/// Quick fingerprint generation without full normalization
187pub fn quick_fingerprint(sql: &str) -> u64 {
188    let normalized = QueryNormalizer::new().fingerprint(sql);
189    let mut hasher = DefaultHasher::new();
190    normalized.hash(&mut hasher);
191    hasher.finish()
192}
193
194/// Extract tables from SQL without full normalization
195pub fn extract_tables(sql: &str) -> Vec<String> {
196    let normalizer = QueryNormalizer::new();
197    normalizer.extract_tables(sql)
198}
199
200#[cfg(test)]
201mod tests {
202    use super::*;
203
204    #[test]
205    fn test_normalize_simple_query() {
206        let normalizer = QueryNormalizer::new();
207        let query = "SELECT * FROM users WHERE id = 123";
208        let normalized = normalizer.normalize(query);
209
210        assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE ID = ?");
211        assert_eq!(normalized.parameters, vec!["123"]);
212        assert_eq!(normalized.tables, vec!["users"]);
213    }
214
215    #[test]
216    fn test_normalize_string_literals() {
217        let normalizer = QueryNormalizer::new();
218        let query = "SELECT * FROM users WHERE name = 'John Doe'";
219        let normalized = normalizer.normalize(query);
220
221        assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE NAME = ?");
222        assert_eq!(normalized.parameters, vec!["John Doe"]);
223    }
224
225    #[test]
226    fn test_normalize_multiple_parameters() {
227        let normalizer = QueryNormalizer::new();
228        let query = "SELECT * FROM users WHERE age > 18 AND status = 'active' AND score < 100";
229        let normalized = normalizer.normalize(query);
230
231        assert_eq!(
232            normalized.fingerprint,
233            "SELECT * FROM USERS WHERE AGE > ? AND STATUS = ? AND SCORE < ?"
234        );
235        assert_eq!(normalized.parameters.len(), 3);
236    }
237
238    #[test]
239    fn test_extract_tables_join() {
240        let normalizer = QueryNormalizer::new();
241        let query = "SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id";
242        let normalized = normalizer.normalize(query);
243
244        assert!(normalized.tables.contains(&"users".to_string()));
245        assert!(normalized.tables.contains(&"orders".to_string()));
246    }
247
248    #[test]
249    fn test_normalize_removes_comments() {
250        let normalizer = QueryNormalizer::new();
251        let query = "/* helios:cache_ttl=60 */ SELECT * FROM users -- inline comment\nWHERE id = 1";
252        let normalized = normalizer.normalize(query);
253
254        assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE ID = ?");
255    }
256
257    #[test]
258    fn test_normalize_whitespace() {
259        let normalizer = QueryNormalizer::new();
260        let query1 = "SELECT  *  FROM   users   WHERE   id=1";
261        let query2 = "SELECT * FROM users WHERE id=1";
262
263        assert_eq!(
264            normalizer.fingerprint(query1),
265            normalizer.fingerprint(query2)
266        );
267    }
268
269    #[test]
270    fn test_equivalent_queries() {
271        let normalizer = QueryNormalizer::new();
272
273        // Same query with different literal values
274        let query1 = "SELECT * FROM users WHERE id = 123";
275        let query2 = "SELECT * FROM users WHERE id = 456";
276
277        assert!(normalizer.are_equivalent(query1, query2));
278
279        // Different query structure
280        let query3 = "SELECT * FROM users WHERE name = 'test'";
281        assert!(!normalizer.are_equivalent(query1, query3));
282    }
283
284    #[test]
285    fn test_hash_consistency() {
286        let normalizer = QueryNormalizer::new();
287
288        let query1 = "SELECT * FROM users WHERE id = 1";
289        let query2 = "SELECT * FROM users WHERE id = 1";
290
291        let norm1 = normalizer.normalize(query1);
292        let norm2 = normalizer.normalize(query2);
293
294        assert_eq!(norm1.hash, norm2.hash);
295    }
296
297    #[test]
298    fn test_hash_different_params() {
299        let normalizer = QueryNormalizer::new();
300
301        // With preserve_order=true, different params should have different hashes
302        let query1 = "SELECT * FROM users WHERE id = 1";
303        let query2 = "SELECT * FROM users WHERE id = 2";
304
305        let norm1 = normalizer.normalize(query1);
306        let norm2 = normalizer.normalize(query2);
307
308        assert_ne!(norm1.hash, norm2.hash);
309    }
310
311    #[test]
312    fn test_unordered_normalizer() {
313        let normalizer = QueryNormalizer::unordered();
314
315        // With preserve_order=false, different params should have same hash
316        let query1 = "SELECT * FROM users WHERE id = 1";
317        let query2 = "SELECT * FROM users WHERE id = 2";
318
319        let norm1 = normalizer.normalize(query1);
320        let norm2 = normalizer.normalize(query2);
321
322        // Fingerprints are the same
323        assert_eq!(norm1.fingerprint, norm2.fingerprint);
324    }
325
326    #[test]
327    fn test_extract_tables_various() {
328        let normalizer = QueryNormalizer::new();
329
330        let queries = vec![
331            ("INSERT INTO users VALUES (1)", vec!["users"]),
332            ("UPDATE products SET price = 10", vec!["products"]),
333            ("DELETE FROM orders WHERE id = 1", vec!["orders"]),
334            ("SELECT * FROM schema.table", vec!["table"]),
335            ("TABLE users", vec!["users"]),
336        ];
337
338        for (sql, expected_tables) in queries {
339            let normalized = normalizer.normalize(sql);
340            for table in expected_tables {
341                assert!(
342                    normalized.tables.contains(&table.to_string()),
343                    "Query '{}' should contain table '{}'",
344                    sql,
345                    table
346                );
347            }
348        }
349    }
350
351    #[test]
352    fn test_decimal_numbers() {
353        let normalizer = QueryNormalizer::new();
354        let query = "SELECT * FROM products WHERE price < 99.99 AND rating > 4.5";
355        let normalized = normalizer.normalize(query);
356
357        assert!(normalized.parameters.contains(&"99.99".to_string()));
358        assert!(normalized.parameters.contains(&"4.5".to_string()));
359    }
360
361    #[test]
362    fn test_scientific_notation() {
363        let normalizer = QueryNormalizer::new();
364        let query = "SELECT * FROM data WHERE value = 1e10";
365        let normalized = normalizer.normalize(query);
366
367        assert!(normalized.fingerprint.contains("VALUE = ?"));
368    }
369
370    #[test]
371    fn test_quick_fingerprint() {
372        let hash1 = quick_fingerprint("SELECT * FROM users WHERE id = 1");
373        let hash2 = quick_fingerprint("SELECT * FROM users WHERE id = 2");
374
375        // Quick fingerprint ignores parameter values
376        assert_eq!(hash1, hash2);
377    }
378}