heliosdb_proxy/cache/
normalizer.rs1use std::collections::hash_map::DefaultHasher;
10use std::hash::{Hash, Hasher};
11use regex::Regex;
12use once_cell::sync::Lazy;
13
14#[derive(Debug, Clone)]
16pub struct NormalizedQuery {
17 pub fingerprint: String,
19
20 pub hash: u64,
22
23 pub tables: Vec<String>,
25
26 pub parameters: Vec<String>,
28}
29
30impl NormalizedQuery {
31 pub fn fingerprint(&self) -> &str {
33 &self.fingerprint
34 }
35
36 pub fn hash(&self) -> u64 {
38 self.hash
39 }
40
41 pub fn tables(&self) -> &[String] {
43 &self.tables
44 }
45}
46
47#[derive(Debug, Clone)]
49pub struct QueryNormalizer {
50 preserve_order: bool,
52}
53
54static STRING_LITERAL: Lazy<Regex> = Lazy::new(|| {
56 Regex::new(r#"'(?:[^'\\]|\\.)*'"#).unwrap()
57});
58
59static DOUBLE_QUOTED: Lazy<Regex> = Lazy::new(|| {
60 Regex::new(r#""(?:[^"\\]|\\.)*""#).unwrap()
61});
62
63static NUMBER_LITERAL: Lazy<Regex> = Lazy::new(|| {
64 Regex::new(r"\b\d+(?:\.\d+)?(?:e[+-]?\d+)?\b").unwrap()
65});
66
67static WHITESPACE: Lazy<Regex> = Lazy::new(|| {
68 Regex::new(r"\s+").unwrap()
69});
70
71static TABLE_PATTERN: Lazy<Regex> = Lazy::new(|| {
72 Regex::new(r"(?i)(?:FROM|JOIN|INTO|UPDATE|TABLE)\s+([a-zA-Z_][a-zA-Z0-9_]*(?:\.[a-zA-Z_][a-zA-Z0-9_]*)?)").unwrap()
73});
74
75static HINT_PATTERN: Lazy<Regex> = Lazy::new(|| {
76 Regex::new(r"/\*[^*]*\*/").unwrap()
77});
78
79static COMMENT_PATTERN: Lazy<Regex> = Lazy::new(|| {
80 Regex::new(r"--[^\n]*").unwrap()
81});
82
83impl QueryNormalizer {
84 pub fn new() -> Self {
86 Self {
87 preserve_order: true,
88 }
89 }
90
91 pub fn unordered() -> Self {
93 Self {
94 preserve_order: false,
95 }
96 }
97
98 pub fn normalize(&self, sql: &str) -> NormalizedQuery {
100 let mut parameters = Vec::new();
101
102 let sql = HINT_PATTERN.replace_all(sql, "");
104 let sql = COMMENT_PATTERN.replace_all(&sql, "");
105
106 let tables = self.extract_tables(&sql);
108
109 let sql = STRING_LITERAL.replace_all(&sql, |caps: ®ex::Captures| {
111 let value = caps.get(0).unwrap().as_str();
112 let inner = &value[1..value.len()-1];
114 parameters.push(inner.to_string());
115 "?"
116 });
117
118 let sql = NUMBER_LITERAL.replace_all(&sql, |caps: ®ex::Captures| {
120 let value = caps.get(0).unwrap().as_str();
121 parameters.push(value.to_string());
122 "?"
123 });
124
125 let sql = WHITESPACE.replace_all(&sql, " ");
127
128 let fingerprint = sql.trim().to_uppercase();
130
131 let mut hasher = DefaultHasher::new();
133 fingerprint.hash(&mut hasher);
134 if self.preserve_order {
135 for param in ¶meters {
137 param.hash(&mut hasher);
138 }
139 }
140 let hash = hasher.finish();
141
142 NormalizedQuery {
143 fingerprint,
144 hash,
145 tables,
146 parameters,
147 }
148 }
149
150 fn extract_tables(&self, sql: &str) -> Vec<String> {
152 let mut tables = Vec::new();
153
154 for cap in TABLE_PATTERN.captures_iter(sql) {
155 if let Some(table_match) = cap.get(1) {
156 let table = table_match.as_str().to_lowercase();
157 let table_name = table.split('.').last().unwrap_or(&table);
159 if !tables.contains(&table_name.to_string()) {
160 tables.push(table_name.to_string());
161 }
162 }
163 }
164
165 tables
166 }
167
168 pub fn fingerprint(&self, sql: &str) -> String {
170 let sql = HINT_PATTERN.replace_all(sql, "");
172 let sql = COMMENT_PATTERN.replace_all(&sql, "");
173
174 let sql = STRING_LITERAL.replace_all(&sql, "?");
176 let sql = NUMBER_LITERAL.replace_all(&sql, "?");
177
178 let sql = WHITESPACE.replace_all(&sql, " ");
180
181 sql.trim().to_uppercase()
182 }
183
184 pub fn are_equivalent(&self, sql1: &str, sql2: &str) -> bool {
186 self.fingerprint(sql1) == self.fingerprint(sql2)
187 }
188}
189
190impl Default for QueryNormalizer {
191 fn default() -> Self {
192 Self::new()
193 }
194}
195
196pub fn quick_fingerprint(sql: &str) -> u64 {
198 let normalized = QueryNormalizer::new().fingerprint(sql);
199 let mut hasher = DefaultHasher::new();
200 normalized.hash(&mut hasher);
201 hasher.finish()
202}
203
204pub fn extract_tables(sql: &str) -> Vec<String> {
206 let normalizer = QueryNormalizer::new();
207 normalizer.extract_tables(sql)
208}
209
210#[cfg(test)]
211mod tests {
212 use super::*;
213
214 #[test]
215 fn test_normalize_simple_query() {
216 let normalizer = QueryNormalizer::new();
217 let query = "SELECT * FROM users WHERE id = 123";
218 let normalized = normalizer.normalize(query);
219
220 assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE ID = ?");
221 assert_eq!(normalized.parameters, vec!["123"]);
222 assert_eq!(normalized.tables, vec!["users"]);
223 }
224
225 #[test]
226 fn test_normalize_string_literals() {
227 let normalizer = QueryNormalizer::new();
228 let query = "SELECT * FROM users WHERE name = 'John Doe'";
229 let normalized = normalizer.normalize(query);
230
231 assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE NAME = ?");
232 assert_eq!(normalized.parameters, vec!["John Doe"]);
233 }
234
235 #[test]
236 fn test_normalize_multiple_parameters() {
237 let normalizer = QueryNormalizer::new();
238 let query = "SELECT * FROM users WHERE age > 18 AND status = 'active' AND score < 100";
239 let normalized = normalizer.normalize(query);
240
241 assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE AGE > ? AND STATUS = ? AND SCORE < ?");
242 assert_eq!(normalized.parameters.len(), 3);
243 }
244
245 #[test]
246 fn test_extract_tables_join() {
247 let normalizer = QueryNormalizer::new();
248 let query = "SELECT u.*, o.* FROM users u JOIN orders o ON u.id = o.user_id";
249 let normalized = normalizer.normalize(query);
250
251 assert!(normalized.tables.contains(&"users".to_string()));
252 assert!(normalized.tables.contains(&"orders".to_string()));
253 }
254
255 #[test]
256 fn test_normalize_removes_comments() {
257 let normalizer = QueryNormalizer::new();
258 let query = "/* helios:cache_ttl=60 */ SELECT * FROM users -- inline comment\nWHERE id = 1";
259 let normalized = normalizer.normalize(query);
260
261 assert_eq!(normalized.fingerprint, "SELECT * FROM USERS WHERE ID = ?");
262 }
263
264 #[test]
265 fn test_normalize_whitespace() {
266 let normalizer = QueryNormalizer::new();
267 let query1 = "SELECT * FROM users WHERE id=1";
268 let query2 = "SELECT * FROM users WHERE id=1";
269
270 assert_eq!(
271 normalizer.fingerprint(query1),
272 normalizer.fingerprint(query2)
273 );
274 }
275
276 #[test]
277 fn test_equivalent_queries() {
278 let normalizer = QueryNormalizer::new();
279
280 let query1 = "SELECT * FROM users WHERE id = 123";
282 let query2 = "SELECT * FROM users WHERE id = 456";
283
284 assert!(normalizer.are_equivalent(query1, query2));
285
286 let query3 = "SELECT * FROM users WHERE name = 'test'";
288 assert!(!normalizer.are_equivalent(query1, query3));
289 }
290
291 #[test]
292 fn test_hash_consistency() {
293 let normalizer = QueryNormalizer::new();
294
295 let query1 = "SELECT * FROM users WHERE id = 1";
296 let query2 = "SELECT * FROM users WHERE id = 1";
297
298 let norm1 = normalizer.normalize(query1);
299 let norm2 = normalizer.normalize(query2);
300
301 assert_eq!(norm1.hash, norm2.hash);
302 }
303
304 #[test]
305 fn test_hash_different_params() {
306 let normalizer = QueryNormalizer::new();
307
308 let query1 = "SELECT * FROM users WHERE id = 1";
310 let query2 = "SELECT * FROM users WHERE id = 2";
311
312 let norm1 = normalizer.normalize(query1);
313 let norm2 = normalizer.normalize(query2);
314
315 assert_ne!(norm1.hash, norm2.hash);
316 }
317
318 #[test]
319 fn test_unordered_normalizer() {
320 let normalizer = QueryNormalizer::unordered();
321
322 let query1 = "SELECT * FROM users WHERE id = 1";
324 let query2 = "SELECT * FROM users WHERE id = 2";
325
326 let norm1 = normalizer.normalize(query1);
327 let norm2 = normalizer.normalize(query2);
328
329 assert_eq!(norm1.fingerprint, norm2.fingerprint);
331 }
332
333 #[test]
334 fn test_extract_tables_various() {
335 let normalizer = QueryNormalizer::new();
336
337 let queries = vec![
338 ("INSERT INTO users VALUES (1)", vec!["users"]),
339 ("UPDATE products SET price = 10", vec!["products"]),
340 ("DELETE FROM orders WHERE id = 1", vec!["orders"]),
341 ("SELECT * FROM schema.table", vec!["table"]),
342 ("TABLE users", vec!["users"]),
343 ];
344
345 for (sql, expected_tables) in queries {
346 let normalized = normalizer.normalize(sql);
347 for table in expected_tables {
348 assert!(
349 normalized.tables.contains(&table.to_string()),
350 "Query '{}' should contain table '{}'",
351 sql,
352 table
353 );
354 }
355 }
356 }
357
358 #[test]
359 fn test_decimal_numbers() {
360 let normalizer = QueryNormalizer::new();
361 let query = "SELECT * FROM products WHERE price < 99.99 AND rating > 4.5";
362 let normalized = normalizer.normalize(query);
363
364 assert!(normalized.parameters.contains(&"99.99".to_string()));
365 assert!(normalized.parameters.contains(&"4.5".to_string()));
366 }
367
368 #[test]
369 fn test_scientific_notation() {
370 let normalizer = QueryNormalizer::new();
371 let query = "SELECT * FROM data WHERE value = 1e10";
372 let normalized = normalizer.normalize(query);
373
374 assert!(normalized.fingerprint.contains("VALUE = ?"));
375 }
376
377 #[test]
378 fn test_quick_fingerprint() {
379 let hash1 = quick_fingerprint("SELECT * FROM users WHERE id = 1");
380 let hash2 = quick_fingerprint("SELECT * FROM users WHERE id = 2");
381
382 assert_eq!(hash1, hash2);
384 }
385}