Skip to main content

heliosdb_proxy/analytics/
fingerprinter.rs

1//! Query Fingerprinting
2//!
3//! Normalize queries and generate fingerprints for grouping similar queries.
4
5use std::collections::HashSet;
6use std::hash::{Hash, Hasher};
7
8use regex::Regex;
9
10/// Query fingerprinter
11#[derive(Debug)]
12pub struct QueryFingerprinter {
13    /// Regex for string literals
14    string_literal_re: Regex,
15    /// Regex for numeric literals
16    numeric_literal_re: Regex,
17    /// Regex for IN lists
18    in_list_re: Regex,
19    /// Regex for whitespace
20    whitespace_re: Regex,
21    /// Regex for UUID
22    uuid_re: Regex,
23    /// Regex for hex values
24    hex_re: Regex,
25}
26
27impl QueryFingerprinter {
28    /// Create a new fingerprinter
29    pub fn new() -> Self {
30        Self {
31            string_literal_re: Regex::new(r"'[^']*'").expect("Invalid regex"),
32            numeric_literal_re: Regex::new(r"\b\d+(\.\d+)?\b").expect("Invalid regex"),
33            in_list_re: Regex::new(r"(?i)IN\s*\([^)]+\)").expect("Invalid regex"),
34            whitespace_re: Regex::new(r"\s+").expect("Invalid regex"),
35            uuid_re: Regex::new(
36                r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
37            )
38            .expect("Invalid regex"),
39            hex_re: Regex::new(r"0x[0-9a-fA-F]+").expect("Invalid regex"),
40        }
41    }
42
43    /// Generate fingerprint from query
44    pub fn fingerprint(&self, query: &str) -> QueryFingerprint {
45        let normalized = self.normalize(query);
46        let hash = self.compute_hash(&normalized);
47
48        QueryFingerprint {
49            hash,
50            normalized: normalized.clone(),
51            tables: self.extract_tables(query),
52            operation: self.detect_operation(query),
53            original_length: query.len(),
54        }
55    }
56
57    /// Normalize query (remove literals, standardize whitespace)
58    pub fn normalize(&self, query: &str) -> String {
59        let mut normalized = query.to_string();
60
61        // Replace UUIDs first
62        normalized = self.uuid_re.replace_all(&normalized, "?").to_string();
63
64        // Replace hex values
65        normalized = self.hex_re.replace_all(&normalized, "?").to_string();
66
67        // Replace string literals with ?
68        normalized = self
69            .string_literal_re
70            .replace_all(&normalized, "?")
71            .to_string();
72
73        // Replace numeric literals with ?
74        normalized = self
75            .numeric_literal_re
76            .replace_all(&normalized, "?")
77            .to_string();
78
79        // Replace IN lists with (?)
80        normalized = self
81            .in_list_re
82            .replace_all(&normalized, "IN (?)")
83            .to_string();
84
85        // Normalize whitespace
86        normalized = self.whitespace_re.replace_all(&normalized, " ").to_string();
87
88        normalized.trim().to_lowercase()
89    }
90
91    /// Compute hash of normalized query
92    fn compute_hash(&self, normalized: &str) -> u64 {
93        use std::collections::hash_map::DefaultHasher;
94        let mut hasher = DefaultHasher::new();
95        normalized.hash(&mut hasher);
96        hasher.finish()
97    }
98
99    /// Extract table names from query
100    fn extract_tables(&self, query: &str) -> Vec<String> {
101        let query_upper = query.to_uppercase();
102        let mut tables = HashSet::new();
103
104        // FROM clause
105        if let Some(from_pos) = query_upper.find("FROM") {
106            let after_from = &query[from_pos + 4..];
107            if let Some(table) = self.extract_first_identifier(after_from) {
108                tables.insert(table);
109            }
110        }
111
112        // JOIN clauses
113        for keyword in [
114            "JOIN",
115            "INNER JOIN",
116            "LEFT JOIN",
117            "RIGHT JOIN",
118            "OUTER JOIN",
119        ] {
120            let mut search_pos = 0;
121            while let Some(pos) = query_upper[search_pos..].find(keyword) {
122                let absolute_pos = search_pos + pos + keyword.len();
123                if absolute_pos < query.len() {
124                    let after_join = &query[absolute_pos..];
125                    if let Some(table) = self.extract_first_identifier(after_join) {
126                        tables.insert(table);
127                    }
128                }
129                search_pos = absolute_pos;
130            }
131        }
132
133        // INSERT INTO
134        if let Some(pos) = query_upper.find("INSERT INTO") {
135            let after_insert = &query[pos + 11..];
136            if let Some(table) = self.extract_first_identifier(after_insert) {
137                tables.insert(table);
138            }
139        }
140
141        // UPDATE
142        if let Some(pos) = query_upper.find("UPDATE") {
143            let after_update = &query[pos + 6..];
144            if let Some(table) = self.extract_first_identifier(after_update) {
145                tables.insert(table);
146            }
147        }
148
149        // DELETE FROM
150        if let Some(pos) = query_upper.find("DELETE FROM") {
151            let after_delete = &query[pos + 11..];
152            if let Some(table) = self.extract_first_identifier(after_delete) {
153                tables.insert(table);
154            }
155        }
156
157        tables.into_iter().collect()
158    }
159
160    /// Extract first identifier from string
161    fn extract_first_identifier(&self, s: &str) -> Option<String> {
162        let trimmed = s.trim();
163        let mut chars = trimmed.chars().peekable();
164
165        // Skip leading whitespace
166        while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
167            chars.next();
168        }
169
170        // Collect identifier characters
171        let mut ident = String::new();
172        while let Some(&c) = chars.peek() {
173            if c.is_alphanumeric() || c == '_' || c == '.' || c == '"' {
174                ident.push(c);
175                chars.next();
176            } else {
177                break;
178            }
179        }
180
181        if ident.is_empty() {
182            None
183        } else {
184            // Remove quotes and return lowercase
185            let cleaned = ident.replace('"', "").to_lowercase();
186            Some(cleaned)
187        }
188    }
189
190    /// Detect operation type
191    #[allow(clippy::if_same_then_else)]
192    fn detect_operation(&self, query: &str) -> OperationType {
193        let trimmed = query.trim().to_uppercase();
194
195        if trimmed.starts_with("SELECT") {
196            OperationType::Select
197        } else if trimmed.starts_with("INSERT") {
198            OperationType::Insert
199        } else if trimmed.starts_with("UPDATE") {
200            OperationType::Update
201        } else if trimmed.starts_with("DELETE") {
202            OperationType::Delete
203        } else if trimmed.starts_with("CREATE") {
204            OperationType::Ddl
205        } else if trimmed.starts_with("ALTER") {
206            OperationType::Ddl
207        } else if trimmed.starts_with("DROP") {
208            OperationType::Ddl
209        } else if trimmed.starts_with("BEGIN") || trimmed.starts_with("START TRANSACTION") {
210            OperationType::Transaction
211        } else if trimmed.starts_with("COMMIT") || trimmed.starts_with("ROLLBACK") {
212            OperationType::Transaction
213        } else if trimmed.starts_with("SET") {
214            OperationType::Utility
215        } else if trimmed.starts_with("EXPLAIN") {
216            OperationType::Utility
217        } else if trimmed.starts_with("ANALYZE") {
218            OperationType::Utility
219        } else {
220            OperationType::Other
221        }
222    }
223}
224
225impl Default for QueryFingerprinter {
226    fn default() -> Self {
227        Self::new()
228    }
229}
230
231/// Query fingerprint
232#[derive(Debug, Clone)]
233pub struct QueryFingerprint {
234    /// 64-bit hash of normalized query
235    pub hash: u64,
236
237    /// Normalized query text
238    pub normalized: String,
239
240    /// Tables involved
241    pub tables: Vec<String>,
242
243    /// Operation type
244    pub operation: OperationType,
245
246    /// Original query length
247    pub original_length: usize,
248}
249
250impl QueryFingerprint {
251    /// Get a short identifier for this fingerprint
252    pub fn short_id(&self) -> String {
253        format!("{:016x}", self.hash)
254    }
255}
256
257/// Operation type
258#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
259pub enum OperationType {
260    Select,
261    Insert,
262    Update,
263    Delete,
264    Ddl,
265    Transaction,
266    Utility,
267    Other,
268}
269
270impl std::fmt::Display for OperationType {
271    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
272        match self {
273            OperationType::Select => write!(f, "SELECT"),
274            OperationType::Insert => write!(f, "INSERT"),
275            OperationType::Update => write!(f, "UPDATE"),
276            OperationType::Delete => write!(f, "DELETE"),
277            OperationType::Ddl => write!(f, "DDL"),
278            OperationType::Transaction => write!(f, "TRANSACTION"),
279            OperationType::Utility => write!(f, "UTILITY"),
280            OperationType::Other => write!(f, "OTHER"),
281        }
282    }
283}
284
285#[cfg(test)]
286mod tests {
287    use super::*;
288
289    #[test]
290    fn test_fingerprinter_new() {
291        let fp = QueryFingerprinter::new();
292        assert!(fp.string_literal_re.is_match("'hello'"));
293    }
294
295    #[test]
296    fn test_normalize_string_literals() {
297        let fp = QueryFingerprinter::new();
298
299        let normalized = fp.normalize("SELECT * FROM users WHERE name = 'Alice'");
300        assert_eq!(normalized, "select * from users where name = ?");
301    }
302
303    #[test]
304    fn test_normalize_numeric_literals() {
305        let fp = QueryFingerprinter::new();
306
307        let normalized = fp.normalize("SELECT * FROM users WHERE id = 123 AND age > 25");
308        assert_eq!(normalized, "select * from users where id = ? and age > ?");
309    }
310
311    #[test]
312    fn test_normalize_in_list() {
313        let fp = QueryFingerprinter::new();
314
315        let normalized = fp.normalize("SELECT * FROM users WHERE id IN (1, 2, 3, 4, 5)");
316        assert_eq!(normalized, "select * from users where id in (?)");
317    }
318
319    #[test]
320    fn test_normalize_uuid() {
321        let fp = QueryFingerprinter::new();
322
323        let normalized =
324            fp.normalize("SELECT * FROM users WHERE id = 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'");
325        assert!(normalized.contains("?"));
326    }
327
328    #[test]
329    fn test_same_fingerprint_different_values() {
330        let fp = QueryFingerprinter::new();
331
332        let fp1 = fp.fingerprint("SELECT * FROM users WHERE id = 1");
333        let fp2 = fp.fingerprint("SELECT * FROM users WHERE id = 2");
334
335        assert_eq!(fp1.hash, fp2.hash);
336        assert_eq!(fp1.normalized, fp2.normalized);
337    }
338
339    #[test]
340    fn test_different_fingerprint_different_queries() {
341        let fp = QueryFingerprinter::new();
342
343        let fp1 = fp.fingerprint("SELECT * FROM users WHERE id = 1");
344        let fp2 = fp.fingerprint("SELECT * FROM orders WHERE id = 1");
345
346        assert_ne!(fp1.hash, fp2.hash);
347    }
348
349    #[test]
350    fn test_extract_tables() {
351        let fp = QueryFingerprinter::new();
352
353        let result = fp.fingerprint("SELECT * FROM users WHERE id = 1");
354        assert!(result.tables.contains(&"users".to_string()));
355
356        let result = fp.fingerprint("SELECT * FROM users u JOIN orders o ON u.id = o.user_id");
357        assert!(result.tables.contains(&"users".to_string()));
358        assert!(result.tables.contains(&"orders".to_string()));
359    }
360
361    #[test]
362    fn test_detect_operation() {
363        let fp = QueryFingerprinter::new();
364
365        assert_eq!(
366            fp.detect_operation("SELECT * FROM users"),
367            OperationType::Select
368        );
369        assert_eq!(
370            fp.detect_operation("INSERT INTO users VALUES (1)"),
371            OperationType::Insert
372        );
373        assert_eq!(
374            fp.detect_operation("UPDATE users SET name = 'Bob'"),
375            OperationType::Update
376        );
377        assert_eq!(
378            fp.detect_operation("DELETE FROM users WHERE id = 1"),
379            OperationType::Delete
380        );
381        assert_eq!(
382            fp.detect_operation("CREATE TABLE foo (id INT)"),
383            OperationType::Ddl
384        );
385        assert_eq!(fp.detect_operation("BEGIN"), OperationType::Transaction);
386    }
387
388    #[test]
389    fn test_operation_display() {
390        assert_eq!(OperationType::Select.to_string(), "SELECT");
391        assert_eq!(OperationType::Insert.to_string(), "INSERT");
392    }
393}