Skip to main content

heliosdb_proxy/analytics/
fingerprinter.rs

1//! Query Fingerprinting
2//!
3//! Normalize queries and generate fingerprints for grouping similar queries.
4
5use std::collections::HashSet;
6use std::hash::{Hash, Hasher};
7
8use regex::Regex;
9
10/// Query fingerprinter
11#[derive(Debug)]
12pub struct QueryFingerprinter {
13    /// Regex for string literals
14    string_literal_re: Regex,
15    /// Regex for numeric literals
16    numeric_literal_re: Regex,
17    /// Regex for IN lists
18    in_list_re: Regex,
19    /// Regex for whitespace
20    whitespace_re: Regex,
21    /// Regex for UUID
22    uuid_re: Regex,
23    /// Regex for hex values
24    hex_re: Regex,
25}
26
27impl QueryFingerprinter {
28    /// Create a new fingerprinter
29    pub fn new() -> Self {
30        Self {
31            string_literal_re: Regex::new(r"'[^']*'").expect("Invalid regex"),
32            numeric_literal_re: Regex::new(r"\b\d+(\.\d+)?\b").expect("Invalid regex"),
33            in_list_re: Regex::new(r"(?i)IN\s*\([^)]+\)").expect("Invalid regex"),
34            whitespace_re: Regex::new(r"\s+").expect("Invalid regex"),
35            uuid_re: Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}").expect("Invalid regex"),
36            hex_re: Regex::new(r"0x[0-9a-fA-F]+").expect("Invalid regex"),
37        }
38    }
39
40    /// Generate fingerprint from query
41    pub fn fingerprint(&self, query: &str) -> QueryFingerprint {
42        let normalized = self.normalize(query);
43        let hash = self.compute_hash(&normalized);
44
45        QueryFingerprint {
46            hash,
47            normalized: normalized.clone(),
48            tables: self.extract_tables(query),
49            operation: self.detect_operation(query),
50            original_length: query.len(),
51        }
52    }
53
54    /// Normalize query (remove literals, standardize whitespace)
55    pub fn normalize(&self, query: &str) -> String {
56        let mut normalized = query.to_string();
57
58        // Replace UUIDs first
59        normalized = self.uuid_re.replace_all(&normalized, "?").to_string();
60
61        // Replace hex values
62        normalized = self.hex_re.replace_all(&normalized, "?").to_string();
63
64        // Replace string literals with ?
65        normalized = self.string_literal_re.replace_all(&normalized, "?").to_string();
66
67        // Replace numeric literals with ?
68        normalized = self.numeric_literal_re.replace_all(&normalized, "?").to_string();
69
70        // Replace IN lists with (?)
71        normalized = self.in_list_re.replace_all(&normalized, "IN (?)").to_string();
72
73        // Normalize whitespace
74        normalized = self.whitespace_re.replace_all(&normalized, " ").to_string();
75
76        normalized.trim().to_lowercase()
77    }
78
79    /// Compute hash of normalized query
80    fn compute_hash(&self, normalized: &str) -> u64 {
81        use std::collections::hash_map::DefaultHasher;
82        let mut hasher = DefaultHasher::new();
83        normalized.hash(&mut hasher);
84        hasher.finish()
85    }
86
87    /// Extract table names from query
88    fn extract_tables(&self, query: &str) -> Vec<String> {
89        let query_upper = query.to_uppercase();
90        let mut tables = HashSet::new();
91
92        // FROM clause
93        if let Some(from_pos) = query_upper.find("FROM") {
94            let after_from = &query[from_pos + 4..];
95            if let Some(table) = self.extract_first_identifier(after_from) {
96                tables.insert(table);
97            }
98        }
99
100        // JOIN clauses
101        for keyword in ["JOIN", "INNER JOIN", "LEFT JOIN", "RIGHT JOIN", "OUTER JOIN"] {
102            let mut search_pos = 0;
103            while let Some(pos) = query_upper[search_pos..].find(keyword) {
104                let absolute_pos = search_pos + pos + keyword.len();
105                if absolute_pos < query.len() {
106                    let after_join = &query[absolute_pos..];
107                    if let Some(table) = self.extract_first_identifier(after_join) {
108                        tables.insert(table);
109                    }
110                }
111                search_pos = absolute_pos;
112            }
113        }
114
115        // INSERT INTO
116        if let Some(pos) = query_upper.find("INSERT INTO") {
117            let after_insert = &query[pos + 11..];
118            if let Some(table) = self.extract_first_identifier(after_insert) {
119                tables.insert(table);
120            }
121        }
122
123        // UPDATE
124        if let Some(pos) = query_upper.find("UPDATE") {
125            let after_update = &query[pos + 6..];
126            if let Some(table) = self.extract_first_identifier(after_update) {
127                tables.insert(table);
128            }
129        }
130
131        // DELETE FROM
132        if let Some(pos) = query_upper.find("DELETE FROM") {
133            let after_delete = &query[pos + 11..];
134            if let Some(table) = self.extract_first_identifier(after_delete) {
135                tables.insert(table);
136            }
137        }
138
139        tables.into_iter().collect()
140    }
141
142    /// Extract first identifier from string
143    fn extract_first_identifier(&self, s: &str) -> Option<String> {
144        let trimmed = s.trim();
145        let mut chars = trimmed.chars().peekable();
146
147        // Skip leading whitespace
148        while chars.peek().map(|c| c.is_whitespace()).unwrap_or(false) {
149            chars.next();
150        }
151
152        // Collect identifier characters
153        let mut ident = String::new();
154        while let Some(&c) = chars.peek() {
155            if c.is_alphanumeric() || c == '_' || c == '.' || c == '"' {
156                ident.push(c);
157                chars.next();
158            } else {
159                break;
160            }
161        }
162
163        if ident.is_empty() {
164            None
165        } else {
166            // Remove quotes and return lowercase
167            let cleaned = ident.replace('"', "").to_lowercase();
168            Some(cleaned)
169        }
170    }
171
172    /// Detect operation type
173    fn detect_operation(&self, query: &str) -> OperationType {
174        let trimmed = query.trim().to_uppercase();
175
176        if trimmed.starts_with("SELECT") {
177            OperationType::Select
178        } else if trimmed.starts_with("INSERT") {
179            OperationType::Insert
180        } else if trimmed.starts_with("UPDATE") {
181            OperationType::Update
182        } else if trimmed.starts_with("DELETE") {
183            OperationType::Delete
184        } else if trimmed.starts_with("CREATE") {
185            OperationType::Ddl
186        } else if trimmed.starts_with("ALTER") {
187            OperationType::Ddl
188        } else if trimmed.starts_with("DROP") {
189            OperationType::Ddl
190        } else if trimmed.starts_with("BEGIN") || trimmed.starts_with("START TRANSACTION") {
191            OperationType::Transaction
192        } else if trimmed.starts_with("COMMIT") || trimmed.starts_with("ROLLBACK") {
193            OperationType::Transaction
194        } else if trimmed.starts_with("SET") {
195            OperationType::Utility
196        } else if trimmed.starts_with("EXPLAIN") {
197            OperationType::Utility
198        } else if trimmed.starts_with("ANALYZE") {
199            OperationType::Utility
200        } else {
201            OperationType::Other
202        }
203    }
204}
205
206impl Default for QueryFingerprinter {
207    fn default() -> Self {
208        Self::new()
209    }
210}
211
212/// Query fingerprint
213#[derive(Debug, Clone)]
214pub struct QueryFingerprint {
215    /// 64-bit hash of normalized query
216    pub hash: u64,
217
218    /// Normalized query text
219    pub normalized: String,
220
221    /// Tables involved
222    pub tables: Vec<String>,
223
224    /// Operation type
225    pub operation: OperationType,
226
227    /// Original query length
228    pub original_length: usize,
229}
230
231impl QueryFingerprint {
232    /// Get a short identifier for this fingerprint
233    pub fn short_id(&self) -> String {
234        format!("{:016x}", self.hash)
235    }
236}
237
238/// Operation type
239#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
240pub enum OperationType {
241    Select,
242    Insert,
243    Update,
244    Delete,
245    Ddl,
246    Transaction,
247    Utility,
248    Other,
249}
250
251impl std::fmt::Display for OperationType {
252    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
253        match self {
254            OperationType::Select => write!(f, "SELECT"),
255            OperationType::Insert => write!(f, "INSERT"),
256            OperationType::Update => write!(f, "UPDATE"),
257            OperationType::Delete => write!(f, "DELETE"),
258            OperationType::Ddl => write!(f, "DDL"),
259            OperationType::Transaction => write!(f, "TRANSACTION"),
260            OperationType::Utility => write!(f, "UTILITY"),
261            OperationType::Other => write!(f, "OTHER"),
262        }
263    }
264}
265
266#[cfg(test)]
267mod tests {
268    use super::*;
269
270    #[test]
271    fn test_fingerprinter_new() {
272        let fp = QueryFingerprinter::new();
273        assert!(fp.string_literal_re.is_match("'hello'"));
274    }
275
276    #[test]
277    fn test_normalize_string_literals() {
278        let fp = QueryFingerprinter::new();
279
280        let normalized = fp.normalize("SELECT * FROM users WHERE name = 'Alice'");
281        assert_eq!(normalized, "select * from users where name = ?");
282    }
283
284    #[test]
285    fn test_normalize_numeric_literals() {
286        let fp = QueryFingerprinter::new();
287
288        let normalized = fp.normalize("SELECT * FROM users WHERE id = 123 AND age > 25");
289        assert_eq!(normalized, "select * from users where id = ? and age > ?");
290    }
291
292    #[test]
293    fn test_normalize_in_list() {
294        let fp = QueryFingerprinter::new();
295
296        let normalized = fp.normalize("SELECT * FROM users WHERE id IN (1, 2, 3, 4, 5)");
297        assert_eq!(normalized, "select * from users where id in (?)");
298    }
299
300    #[test]
301    fn test_normalize_uuid() {
302        let fp = QueryFingerprinter::new();
303
304        let normalized = fp.normalize("SELECT * FROM users WHERE id = 'a1b2c3d4-e5f6-7890-abcd-ef1234567890'");
305        assert!(normalized.contains("?"));
306    }
307
308    #[test]
309    fn test_same_fingerprint_different_values() {
310        let fp = QueryFingerprinter::new();
311
312        let fp1 = fp.fingerprint("SELECT * FROM users WHERE id = 1");
313        let fp2 = fp.fingerprint("SELECT * FROM users WHERE id = 2");
314
315        assert_eq!(fp1.hash, fp2.hash);
316        assert_eq!(fp1.normalized, fp2.normalized);
317    }
318
319    #[test]
320    fn test_different_fingerprint_different_queries() {
321        let fp = QueryFingerprinter::new();
322
323        let fp1 = fp.fingerprint("SELECT * FROM users WHERE id = 1");
324        let fp2 = fp.fingerprint("SELECT * FROM orders WHERE id = 1");
325
326        assert_ne!(fp1.hash, fp2.hash);
327    }
328
329    #[test]
330    fn test_extract_tables() {
331        let fp = QueryFingerprinter::new();
332
333        let result = fp.fingerprint("SELECT * FROM users WHERE id = 1");
334        assert!(result.tables.contains(&"users".to_string()));
335
336        let result = fp.fingerprint("SELECT * FROM users u JOIN orders o ON u.id = o.user_id");
337        assert!(result.tables.contains(&"users".to_string()));
338        assert!(result.tables.contains(&"orders".to_string()));
339    }
340
341    #[test]
342    fn test_detect_operation() {
343        let fp = QueryFingerprinter::new();
344
345        assert_eq!(fp.detect_operation("SELECT * FROM users"), OperationType::Select);
346        assert_eq!(fp.detect_operation("INSERT INTO users VALUES (1)"), OperationType::Insert);
347        assert_eq!(fp.detect_operation("UPDATE users SET name = 'Bob'"), OperationType::Update);
348        assert_eq!(fp.detect_operation("DELETE FROM users WHERE id = 1"), OperationType::Delete);
349        assert_eq!(fp.detect_operation("CREATE TABLE foo (id INT)"), OperationType::Ddl);
350        assert_eq!(fp.detect_operation("BEGIN"), OperationType::Transaction);
351    }
352
353    #[test]
354    fn test_operation_display() {
355        assert_eq!(OperationType::Select.to_string(), "SELECT");
356        assert_eq!(OperationType::Insert.to_string(), "INSERT");
357    }
358}