Skip to main content

oxirs_arq/
query_fingerprinting.rs

1//! Advanced Query Fingerprinting for SPARQL Query Analysis
2//!
3//! This module provides sophisticated query fingerprinting for:
4//! - Query deduplication and caching
5//! - Query pattern recognition
6//! - Workload analysis
7//! - Query similarity detection
8//!
9//! # Features
10//!
11//! - **Structural Fingerprinting**: Captures query structure independent of literal values
12//! - **Semantic Fingerprinting**: Incorporates predicate and type information
13//! - **Normalized Fingerprints**: Canonicalized representation for comparison
14//! - **Parameterized Templates**: Extracts query templates with parameter slots
15//! - **Similarity Metrics**: Computes query similarity scores
16//!
17//! # Example
18//!
19//! ```rust
20//! use oxirs_arq::query_fingerprinting::{QueryFingerprinter, FingerprintConfig};
21//!
22//! let config = FingerprintConfig::default();
23//! let fingerprinter = QueryFingerprinter::new(config);
24//!
25//! let query1 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }";
26//! let query2 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Bob\" }";
27//!
28//! let fp1 = fingerprinter.fingerprint(query1).unwrap();
29//! let fp2 = fingerprinter.fingerprint(query2).unwrap();
30//!
31//! // Same structure, different literals -> similar fingerprints
32//! let similarity = fingerprinter.similarity(&fp1, &fp2);
33//! println!("Similarity: {:.2}", similarity);
34//! ```
35
36use anyhow::Result;
37use regex::Regex;
38use serde::{Deserialize, Serialize};
39use sha2::{Digest, Sha256};
40use std::collections::{HashMap, HashSet};
41use std::hash::{Hash, Hasher};
42use std::sync::atomic::{AtomicU64, Ordering};
43use std::sync::{Arc, OnceLock, RwLock};
44
45/// Configuration for query fingerprinting
46#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct FingerprintConfig {
48    /// Include variable names in fingerprint
49    pub include_variables: bool,
50    /// Normalize literal values (replace with placeholders)
51    pub normalize_literals: bool,
52    /// Normalize numeric values
53    pub normalize_numbers: bool,
54    /// Normalize IRI local names
55    pub normalize_iri_locals: bool,
56    /// Preserve predicate IRIs
57    pub preserve_predicates: bool,
58    /// Preserve type assertions (rdf:type)
59    pub preserve_types: bool,
60    /// Hash algorithm to use
61    pub hash_algorithm: HashAlgorithm,
62    /// Maximum fingerprint cache size
63    pub cache_size: usize,
64}
65
66impl Default for FingerprintConfig {
67    fn default() -> Self {
68        Self {
69            include_variables: false,
70            normalize_literals: true,
71            normalize_numbers: true,
72            normalize_iri_locals: true,
73            preserve_predicates: true,
74            preserve_types: true,
75            hash_algorithm: HashAlgorithm::Sha256,
76            cache_size: 10000,
77        }
78    }
79}
80
81/// Hash algorithm options
82#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
83pub enum HashAlgorithm {
84    /// MD5 (fast, 128-bit)
85    Md5,
86    /// SHA-256 (secure, 256-bit)
87    Sha256,
88    /// FNV-1a (very fast, 64-bit)
89    Fnv1a,
90}
91
92/// Query fingerprint
93#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
94pub struct QueryFingerprint {
95    /// Primary hash of normalized query structure
96    pub hash: String,
97    /// Short hash for quick comparison
98    pub short_hash: String,
99    /// Query template with parameter placeholders
100    pub template: String,
101    /// Extracted parameters
102    pub parameters: Vec<ParameterSlot>,
103    /// Query form (SELECT, ASK, CONSTRUCT, DESCRIBE)
104    pub query_form: QueryForm,
105    /// Structural features
106    pub features: QueryFeatures,
107    /// Original query length
108    pub original_length: usize,
109}
110
111impl QueryFingerprint {
112    /// Get the template ID (short hash)
113    pub fn template_id(&self) -> &str {
114        &self.short_hash
115    }
116
117    /// Check if this fingerprint matches another (same template)
118    pub fn matches_template(&self, other: &Self) -> bool {
119        self.short_hash == other.short_hash
120    }
121
122    /// Get parameter count
123    pub fn parameter_count(&self) -> usize {
124        self.parameters.len()
125    }
126
127    /// Check if query has parameters
128    pub fn is_parameterized(&self) -> bool {
129        !self.parameters.is_empty()
130    }
131}
132
133/// Parameter slot in a query template
134#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
135pub struct ParameterSlot {
136    /// Slot name ($1, $2, etc.)
137    pub name: String,
138    /// Parameter type
139    pub param_type: ParameterType,
140    /// Position in original query
141    pub position: usize,
142    /// Original value (if extracted)
143    pub original_value: Option<String>,
144}
145
146/// Types of parameters
147#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
148pub enum ParameterType {
149    /// String literal
150    StringLiteral,
151    /// Numeric literal
152    NumericLiteral,
153    /// Date/time literal
154    DateTimeLiteral,
155    /// IRI reference
156    Iri,
157    /// Boolean literal
158    Boolean,
159    /// Language-tagged literal
160    LangLiteral,
161    /// Unknown type
162    Unknown,
163}
164
165/// Query form types
166#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
167pub enum QueryForm {
168    /// SELECT query
169    Select,
170    /// ASK query
171    Ask,
172    /// CONSTRUCT query
173    Construct,
174    /// DESCRIBE query
175    Describe,
176    /// INSERT/DELETE (update)
177    Update,
178    /// Unknown
179    Unknown,
180}
181
182impl QueryForm {
183    fn from_str(s: &str) -> Self {
184        match s.to_uppercase().as_str() {
185            "SELECT" => QueryForm::Select,
186            "ASK" => QueryForm::Ask,
187            "CONSTRUCT" => QueryForm::Construct,
188            "DESCRIBE" => QueryForm::Describe,
189            "INSERT" | "DELETE" => QueryForm::Update,
190            _ => QueryForm::Unknown,
191        }
192    }
193}
194
195/// Structural features of a query
196#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
197pub struct QueryFeatures {
198    /// Number of triple patterns
199    pub triple_pattern_count: usize,
200    /// Number of variables
201    pub variable_count: usize,
202    /// Number of filters
203    pub filter_count: usize,
204    /// Number of OPTIONAL blocks
205    pub optional_count: usize,
206    /// Number of UNION blocks
207    pub union_count: usize,
208    /// Number of subqueries
209    pub subquery_count: usize,
210    /// Has GROUP BY
211    pub has_group_by: bool,
212    /// Has ORDER BY
213    pub has_order_by: bool,
214    /// Has LIMIT
215    pub has_limit: bool,
216    /// Has OFFSET
217    pub has_offset: bool,
218    /// Has DISTINCT
219    pub has_distinct: bool,
220    /// Has SERVICE (federated)
221    pub has_service: bool,
222    /// Has VALUES clause
223    pub has_values: bool,
224    /// Has property paths
225    pub has_property_paths: bool,
226    /// Has aggregates
227    pub has_aggregates: bool,
228    /// Has BIND
229    pub has_bind: bool,
230    /// Has MINUS
231    pub has_minus: bool,
232    /// Estimated complexity score
233    pub complexity_score: u32,
234}
235
236/// Query fingerprinter
237pub struct QueryFingerprinter {
238    /// Configuration
239    config: FingerprintConfig,
240    /// Fingerprint cache
241    cache: Arc<RwLock<HashMap<String, QueryFingerprint>>>,
242    /// Statistics
243    stats: FingerprintStats,
244}
245
246/// Fingerprinting statistics
247#[derive(Debug, Default)]
248struct FingerprintStats {
249    /// Total fingerprints computed
250    computed: AtomicU64,
251    /// Cache hits
252    cache_hits: AtomicU64,
253    /// Cache misses
254    cache_misses: AtomicU64,
255}
256
257impl QueryFingerprinter {
258    /// Create new fingerprinter
259    pub fn new(config: FingerprintConfig) -> Self {
260        Self {
261            config,
262            cache: Arc::new(RwLock::new(HashMap::new())),
263            stats: FingerprintStats::default(),
264        }
265    }
266
267    /// Compute fingerprint for a query
268    pub fn fingerprint(&self, query: &str) -> Result<QueryFingerprint> {
269        // Check cache first
270        let query_hash = self.quick_hash(query);
271        {
272            let cache = self.cache.read().expect("read lock should not be poisoned");
273            if let Some(fp) = cache.get(&query_hash) {
274                self.stats.cache_hits.fetch_add(1, Ordering::Relaxed);
275                return Ok(fp.clone());
276            }
277        }
278
279        self.stats.cache_misses.fetch_add(1, Ordering::Relaxed);
280
281        // Compute fingerprint
282        let fingerprint = self.compute_fingerprint(query)?;
283
284        // Update cache
285        {
286            let mut cache = self
287                .cache
288                .write()
289                .expect("write lock should not be poisoned");
290            if cache.len() >= self.config.cache_size {
291                // Simple eviction: clear half the cache
292                let keys_to_remove: Vec<_> = cache.keys().take(cache.len() / 2).cloned().collect();
293                for key in keys_to_remove {
294                    cache.remove(&key);
295                }
296            }
297            cache.insert(query_hash, fingerprint.clone());
298        }
299
300        self.stats.computed.fetch_add(1, Ordering::Relaxed);
301        Ok(fingerprint)
302    }
303
304    /// Compute fingerprint without caching
305    fn compute_fingerprint(&self, query: &str) -> Result<QueryFingerprint> {
306        // Extract query form
307        let query_form = self.extract_query_form(query);
308
309        // Normalize the query
310        let (normalized, parameters) = self.normalize_query(query)?;
311
312        // Extract structural features
313        let features = self.extract_features(query);
314
315        // Compute hash
316        let hash = self.compute_hash(&normalized);
317        let short_hash = hash[..16].to_string();
318
319        Ok(QueryFingerprint {
320            hash,
321            short_hash,
322            template: normalized,
323            parameters,
324            query_form,
325            features,
326            original_length: query.len(),
327        })
328    }
329
330    /// Extract query form
331    fn extract_query_form(&self, query: &str) -> QueryForm {
332        let trimmed = query.trim_start();
333        let first_word = trimmed.split_whitespace().next().unwrap_or("");
334        QueryForm::from_str(first_word)
335    }
336
337    /// Normalize query and extract parameters
338    fn normalize_query(&self, query: &str) -> Result<(String, Vec<ParameterSlot>)> {
339        let mut normalized = query.to_string();
340        let mut parameters = Vec::new();
341        let mut param_index = 0;
342
343        // Remove comments
344        normalized = remove_comments(&normalized);
345
346        // Normalize whitespace
347        normalized = normalize_whitespace(&normalized);
348
349        // Normalize literals
350        if self.config.normalize_literals {
351            let (new_query, new_params) =
352                self.normalize_string_literals(&normalized, param_index)?;
353            normalized = new_query;
354            param_index += new_params.len();
355            parameters.extend(new_params);
356        }
357
358        // Normalize numbers
359        if self.config.normalize_numbers {
360            let (new_query, new_params) =
361                self.normalize_numeric_literals(&normalized, param_index)?;
362            normalized = new_query;
363            param_index += new_params.len();
364            parameters.extend(new_params);
365        }
366
367        // Normalize IRI locals (unless predicate or type)
368        if self.config.normalize_iri_locals {
369            let (new_query, new_params) = self.normalize_iri_locals(&normalized, param_index)?;
370            normalized = new_query;
371            parameters.extend(new_params);
372        }
373
374        // Normalize variable names if configured
375        if !self.config.include_variables {
376            normalized = self.normalize_variables(&normalized);
377        }
378
379        // Final cleanup
380        normalized = normalize_whitespace(&normalized);
381
382        Ok((normalized, parameters))
383    }
384
385    /// Normalize string literals
386    fn normalize_string_literals(
387        &self,
388        query: &str,
389        start_index: usize,
390    ) -> Result<(String, Vec<ParameterSlot>)> {
391        let string_pattern = string_literal_regex();
392        let mut result = query.to_string();
393        let mut params = Vec::new();
394        let mut index = start_index;
395
396        // Find all string literals
397        let matches: Vec<_> = string_pattern.find_iter(query).collect();
398
399        // Replace from end to preserve positions
400        for m in matches.into_iter().rev() {
401            let original = m.as_str().to_string();
402            let param_name = format!("${}", index);
403
404            // Determine type
405            let param_type = if original.contains("@") {
406                ParameterType::LangLiteral
407            } else if original.contains("^^") {
408                if original.contains("dateTime") || original.contains("date") {
409                    ParameterType::DateTimeLiteral
410                } else {
411                    ParameterType::StringLiteral
412                }
413            } else {
414                ParameterType::StringLiteral
415            };
416
417            params.push(ParameterSlot {
418                name: param_name.clone(),
419                param_type,
420                position: m.start(),
421                original_value: Some(original),
422            });
423
424            result = format!(
425                "{}{}{}",
426                &result[..m.start()],
427                param_name,
428                &result[m.end()..]
429            );
430            index += 1;
431        }
432
433        params.reverse(); // Correct order
434        Ok((result, params))
435    }
436
437    /// Normalize numeric literals
438    fn normalize_numeric_literals(
439        &self,
440        query: &str,
441        start_index: usize,
442    ) -> Result<(String, Vec<ParameterSlot>)> {
443        let number_pattern = numeric_literal_regex();
444        let mut result = query.to_string();
445        let mut params = Vec::new();
446        let mut index = start_index;
447
448        let matches: Vec<_> = number_pattern.find_iter(query).collect();
449
450        for m in matches.into_iter().rev() {
451            let original = m.as_str().to_string();
452            let param_name = format!("${}", index);
453
454            params.push(ParameterSlot {
455                name: param_name.clone(),
456                param_type: ParameterType::NumericLiteral,
457                position: m.start(),
458                original_value: Some(original),
459            });
460
461            result = format!(
462                "{}{}{}",
463                &result[..m.start()],
464                param_name,
465                &result[m.end()..]
466            );
467            index += 1;
468        }
469
470        params.reverse();
471        Ok((result, params))
472    }
473
474    /// Normalize IRI local names
475    fn normalize_iri_locals(
476        &self,
477        query: &str,
478        start_index: usize,
479    ) -> Result<(String, Vec<ParameterSlot>)> {
480        // Skip predicates and type IRIs if configured
481        let iri_pattern = iri_local_regex();
482        let mut result = query.to_string();
483        let mut params = Vec::new();
484        let mut index = start_index;
485
486        let matches: Vec<_> = iri_pattern.find_iter(query).collect();
487
488        for m in matches.into_iter().rev() {
489            let iri = m.as_str();
490
491            // Check if this should be preserved
492            if self.config.preserve_predicates && is_likely_predicate(query, m.start()) {
493                continue;
494            }
495            if self.config.preserve_types && iri.contains("rdf:type") || iri.contains("#type") {
496                continue;
497            }
498
499            let param_name = format!("${}", index);
500
501            params.push(ParameterSlot {
502                name: param_name.clone(),
503                param_type: ParameterType::Iri,
504                position: m.start(),
505                original_value: Some(iri.to_string()),
506            });
507
508            // Replace only the local name part
509            if let Some(local_start) = iri.rfind('#').or_else(|| iri.rfind('/')) {
510                let prefix = &iri[..=local_start];
511                result = format!(
512                    "{}{}{}{}",
513                    &result[..m.start()],
514                    prefix,
515                    param_name,
516                    &result[m.end()..]
517                );
518            }
519            index += 1;
520        }
521
522        params.reverse();
523        Ok((result, params))
524    }
525
526    /// Normalize variable names
527    fn normalize_variables(&self, query: &str) -> String {
528        let var_pattern = variable_regex();
529        let mut result = query.to_string();
530        let mut var_mapping: HashMap<String, String> = HashMap::new();
531        let mut var_index = 0;
532
533        // Find all variables and create mapping
534        for m in var_pattern.find_iter(query) {
535            let var_name = m.as_str().to_string();
536            if !var_mapping.contains_key(&var_name) {
537                var_mapping.insert(var_name.clone(), format!("?v{}", var_index));
538                var_index += 1;
539            }
540        }
541
542        // Replace variables (sorted by length descending to avoid partial replacements)
543        let mut sorted_vars: Vec<_> = var_mapping.iter().collect();
544        sorted_vars.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
545
546        for (original, normalized) in sorted_vars {
547            result = result.replace(original, normalized);
548        }
549
550        result
551    }
552
553    /// Extract structural features
554    fn extract_features(&self, query: &str) -> QueryFeatures {
555        let upper_query = query.to_uppercase();
556
557        let triple_pattern_count = count_triple_patterns(query);
558        let variable_count = count_variables(query);
559        let filter_count = upper_query.matches("FILTER").count();
560        let optional_count = upper_query.matches("OPTIONAL").count();
561        let union_count = upper_query.matches("UNION").count();
562        let subquery_count = upper_query.matches("SELECT").count().saturating_sub(1);
563
564        let has_group_by = upper_query.contains("GROUP BY");
565        let has_order_by = upper_query.contains("ORDER BY");
566        let has_limit = upper_query.contains("LIMIT");
567        let has_offset = upper_query.contains("OFFSET");
568        let has_distinct = upper_query.contains("DISTINCT");
569        let has_service = upper_query.contains("SERVICE");
570        let has_values = upper_query.contains("VALUES");
571        let has_property_paths = query.contains("/")
572            || query.contains("*")
573            || query.contains("+")
574            || query.contains("|");
575        let has_aggregates = upper_query.contains("COUNT")
576            || upper_query.contains("SUM")
577            || upper_query.contains("AVG")
578            || upper_query.contains("MIN")
579            || upper_query.contains("MAX");
580        let has_bind = upper_query.contains("BIND");
581        let has_minus = upper_query.contains("MINUS");
582
583        // Calculate complexity score
584        let complexity_score = (triple_pattern_count * 10
585            + filter_count * 5
586            + optional_count * 8
587            + union_count * 6
588            + subquery_count * 15
589            + if has_group_by { 5 } else { 0 }
590            + if has_order_by { 3 } else { 0 }
591            + if has_service { 20 } else { 0 }
592            + if has_property_paths { 10 } else { 0 }
593            + if has_aggregates { 7 } else { 0 }
594            + if has_minus { 5 } else { 0 }) as u32;
595
596        QueryFeatures {
597            triple_pattern_count,
598            variable_count,
599            filter_count,
600            optional_count,
601            union_count,
602            subquery_count,
603            has_group_by,
604            has_order_by,
605            has_limit,
606            has_offset,
607            has_distinct,
608            has_service,
609            has_values,
610            has_property_paths,
611            has_aggregates,
612            has_bind,
613            has_minus,
614            complexity_score,
615        }
616    }
617
618    /// Compute hash of normalized query
619    fn compute_hash(&self, normalized: &str) -> String {
620        match self.config.hash_algorithm {
621            HashAlgorithm::Md5 => {
622                let digest = md5::compute(normalized.as_bytes());
623                format!("{:x}", digest)
624            }
625            HashAlgorithm::Sha256 => {
626                let mut hasher = Sha256::new();
627                hasher.update(normalized.as_bytes());
628                format!("{:x}", hasher.finalize())
629            }
630            HashAlgorithm::Fnv1a => {
631                let mut hasher = std::collections::hash_map::DefaultHasher::new();
632                normalized.hash(&mut hasher);
633                format!("{:016x}", hasher.finish())
634            }
635        }
636    }
637
638    /// Quick hash for cache lookup
639    fn quick_hash(&self, query: &str) -> String {
640        let mut hasher = std::collections::hash_map::DefaultHasher::new();
641        query.hash(&mut hasher);
642        format!("{:016x}", hasher.finish())
643    }
644
645    /// Calculate similarity between two fingerprints
646    pub fn similarity(&self, fp1: &QueryFingerprint, fp2: &QueryFingerprint) -> f64 {
647        // If same hash, they're identical
648        if fp1.hash == fp2.hash {
649            return 1.0;
650        }
651
652        // Different query forms = very different
653        if fp1.query_form != fp2.query_form {
654            return 0.0;
655        }
656
657        // Calculate structural similarity
658        let structure_sim = self.structural_similarity(&fp1.features, &fp2.features);
659
660        // Calculate template similarity (string similarity)
661        let template_sim = string_similarity(&fp1.template, &fp2.template);
662
663        // Weighted combination
664        0.6 * structure_sim + 0.4 * template_sim
665    }
666
667    /// Calculate structural similarity between features
668    fn structural_similarity(&self, f1: &QueryFeatures, f2: &QueryFeatures) -> f64 {
669        let mut matches = 0.0;
670        let mut total = 0.0;
671
672        // Compare numeric features
673        let num_features = [
674            (f1.triple_pattern_count, f2.triple_pattern_count, 2.0),
675            (f1.variable_count, f2.variable_count, 1.0),
676            (f1.filter_count, f2.filter_count, 1.5),
677            (f1.optional_count, f2.optional_count, 1.5),
678            (f1.union_count, f2.union_count, 1.5),
679            (f1.subquery_count, f2.subquery_count, 2.0),
680        ];
681
682        for (v1, v2, weight) in num_features {
683            total += weight;
684            if v1 == v2 {
685                matches += weight;
686            } else {
687                let max_val = v1.max(v2) as f64;
688                let min_val = v1.min(v2) as f64;
689                if max_val > 0.0 {
690                    matches += weight * (min_val / max_val);
691                }
692            }
693        }
694
695        // Compare boolean features
696        let bool_features = [
697            (f1.has_group_by, f2.has_group_by),
698            (f1.has_order_by, f2.has_order_by),
699            (f1.has_limit, f2.has_limit),
700            (f1.has_distinct, f2.has_distinct),
701            (f1.has_service, f2.has_service),
702            (f1.has_values, f2.has_values),
703            (f1.has_property_paths, f2.has_property_paths),
704            (f1.has_aggregates, f2.has_aggregates),
705            (f1.has_bind, f2.has_bind),
706            (f1.has_minus, f2.has_minus),
707        ];
708
709        for (b1, b2) in bool_features {
710            total += 1.0;
711            if b1 == b2 {
712                matches += 1.0;
713            }
714        }
715
716        matches / total
717    }
718
719    /// Find similar fingerprints in a collection
720    pub fn find_similar(
721        &self,
722        target: &QueryFingerprint,
723        fingerprints: &[QueryFingerprint],
724        threshold: f64,
725    ) -> Vec<(usize, f64)> {
726        let mut results: Vec<(usize, f64)> = fingerprints
727            .iter()
728            .enumerate()
729            .filter_map(|(idx, fp)| {
730                let sim = self.similarity(target, fp);
731                if sim >= threshold {
732                    Some((idx, sim))
733                } else {
734                    None
735                }
736            })
737            .collect();
738
739        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
740        results
741    }
742
743    /// Group fingerprints by similarity
744    pub fn cluster_fingerprints(
745        &self,
746        fingerprints: &[QueryFingerprint],
747        threshold: f64,
748    ) -> Vec<Vec<usize>> {
749        let n = fingerprints.len();
750        let mut visited = vec![false; n];
751        let mut clusters = Vec::new();
752
753        for i in 0..n {
754            if visited[i] {
755                continue;
756            }
757
758            let mut cluster = vec![i];
759            visited[i] = true;
760
761            for j in (i + 1)..n {
762                if visited[j] {
763                    continue;
764                }
765
766                let sim = self.similarity(&fingerprints[i], &fingerprints[j]);
767                if sim >= threshold {
768                    cluster.push(j);
769                    visited[j] = true;
770                }
771            }
772
773            clusters.push(cluster);
774        }
775
776        clusters
777    }
778
779    /// Get fingerprinting statistics
780    pub fn statistics(&self) -> FingerprintingStatistics {
781        FingerprintingStatistics {
782            fingerprints_computed: self.stats.computed.load(Ordering::Relaxed),
783            cache_hits: self.stats.cache_hits.load(Ordering::Relaxed),
784            cache_misses: self.stats.cache_misses.load(Ordering::Relaxed),
785            cache_size: self
786                .cache
787                .read()
788                .expect("read lock should not be poisoned")
789                .len(),
790        }
791    }
792
793    /// Clear the fingerprint cache
794    pub fn clear_cache(&self) {
795        self.cache
796            .write()
797            .expect("write lock should not be poisoned")
798            .clear();
799    }
800}
801
802/// Fingerprinting statistics
803#[derive(Debug, Clone, Serialize, Deserialize)]
804pub struct FingerprintingStatistics {
805    /// Total fingerprints computed
806    pub fingerprints_computed: u64,
807    /// Cache hits
808    pub cache_hits: u64,
809    /// Cache misses
810    pub cache_misses: u64,
811    /// Current cache size
812    pub cache_size: usize,
813}
814
815impl FingerprintingStatistics {
816    /// Calculate cache hit rate
817    pub fn cache_hit_rate(&self) -> f64 {
818        let total = self.cache_hits + self.cache_misses;
819        if total == 0 {
820            0.0
821        } else {
822            self.cache_hits as f64 / total as f64
823        }
824    }
825}
826
827// Helper functions
828
829fn remove_comments(query: &str) -> String {
830    let mut result = String::new();
831    let mut in_string = false;
832    let mut escape = false;
833    let mut chars = query.chars().peekable();
834
835    while let Some(c) = chars.next() {
836        if escape {
837            result.push(c);
838            escape = false;
839            continue;
840        }
841
842        if c == '\\' && in_string {
843            result.push(c);
844            escape = true;
845            continue;
846        }
847
848        if c == '"' && !in_string {
849            in_string = true;
850            result.push(c);
851            continue;
852        }
853
854        if c == '"' && in_string {
855            in_string = false;
856            result.push(c);
857            continue;
858        }
859
860        if !in_string && c == '#' {
861            // Skip to end of line
862            while let Some(&nc) = chars.peek() {
863                if nc == '\n' {
864                    break;
865                }
866                chars.next();
867            }
868            result.push(' ');
869            continue;
870        }
871
872        result.push(c);
873    }
874
875    result
876}
877
878fn normalize_whitespace(query: &str) -> String {
879    let whitespace_pattern = whitespace_regex();
880    whitespace_pattern
881        .replace_all(query, " ")
882        .trim()
883        .to_string()
884}
885
886fn count_triple_patterns(query: &str) -> usize {
887    // Count occurrences of patterns like "?var predicate ?var" or ". ?var predicate"
888    let triple_pattern = triple_pattern_regex();
889    triple_pattern.find_iter(query).count()
890}
891
892fn count_variables(query: &str) -> usize {
893    let var_pattern = variable_regex();
894    let vars: HashSet<_> = var_pattern.find_iter(query).map(|m| m.as_str()).collect();
895    vars.len()
896}
897
898fn is_likely_predicate(query: &str, position: usize) -> bool {
899    // Check if this IRI is in predicate position (middle of triple pattern)
900    // This is a heuristic - look for variable before and after
901    let before = &query[..position];
902    let after = &query[position..];
903
904    // Check if there's a variable or IRI before (subject)
905    let has_subject = before
906        .trim_end()
907        .ends_with(|c: char| c == '>' || c.is_alphabetic() || c == '_');
908    // Check if there's a variable or literal after (object)
909    let has_object = after.trim_start().starts_with(['?', '<', '"']);
910
911    has_subject && has_object
912}
913
914fn string_similarity(s1: &str, s2: &str) -> f64 {
915    // Simple Jaccard similarity on tokens
916    let tokens1: HashSet<_> = s1.split_whitespace().collect();
917    let tokens2: HashSet<_> = s2.split_whitespace().collect();
918
919    let intersection = tokens1.intersection(&tokens2).count();
920    let union = tokens1.union(&tokens2).count();
921
922    if union == 0 {
923        1.0
924    } else {
925        intersection as f64 / union as f64
926    }
927}
928
929// Lazy static regex patterns
930
931fn string_literal_regex() -> &'static Regex {
932    static REGEX: OnceLock<Regex> = OnceLock::new();
933    REGEX.get_or_init(|| {
934        Regex::new(r#""(?:[^"\\]|\\.)*"(?:@[a-zA-Z-]+)?(?:\^\^<[^>]+>)?"#).expect("Invalid regex")
935    })
936}
937
938fn numeric_literal_regex() -> &'static Regex {
939    static REGEX: OnceLock<Regex> = OnceLock::new();
940    REGEX.get_or_init(|| {
941        // Simple numeric pattern without look-around (which regex crate doesn't support)
942        // Match numbers that appear as standalone tokens
943        Regex::new(r"\b[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\b").expect("Invalid regex")
944    })
945}
946
947fn iri_local_regex() -> &'static Regex {
948    static REGEX: OnceLock<Regex> = OnceLock::new();
949    REGEX.get_or_init(|| Regex::new(r"<[^>]+>").expect("Invalid regex"))
950}
951
952fn variable_regex() -> &'static Regex {
953    static REGEX: OnceLock<Regex> = OnceLock::new();
954    REGEX.get_or_init(|| Regex::new(r"\?[a-zA-Z_][a-zA-Z0-9_]*").expect("Invalid regex"))
955}
956
957fn whitespace_regex() -> &'static Regex {
958    static REGEX: OnceLock<Regex> = OnceLock::new();
959    REGEX.get_or_init(|| Regex::new(r"\s+").expect("Invalid regex"))
960}
961
962fn triple_pattern_regex() -> &'static Regex {
963    static REGEX: OnceLock<Regex> = OnceLock::new();
964    REGEX.get_or_init(|| {
965        Regex::new(r#"(?:\?[a-zA-Z_]\w*|<[^>]+>)\s+(?:\?[a-zA-Z_]\w*|<[^>]+>|[a-zA-Z]+:[a-zA-Z_]\w*)\s+(?:\?[a-zA-Z_]\w*|<[^>]+>|"[^"]*"|[0-9]+)"#).expect("Invalid regex")
966    })
967}
968
969#[cfg(test)]
970mod tests {
971    use super::*;
972
973    #[test]
974    fn test_basic_fingerprint() {
975        let config = FingerprintConfig::default();
976        let fingerprinter = QueryFingerprinter::new(config);
977
978        let query = "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }";
979        let fp = fingerprinter.fingerprint(query).unwrap();
980
981        assert_eq!(fp.query_form, QueryForm::Select);
982        assert!(!fp.hash.is_empty());
983        assert!(!fp.template.is_empty());
984    }
985
986    #[test]
987    fn test_similar_queries_same_template() {
988        let config = FingerprintConfig::default();
989        let fingerprinter = QueryFingerprinter::new(config);
990
991        let query1 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }";
992        let query2 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Bob\" }";
993
994        let fp1 = fingerprinter.fingerprint(query1).unwrap();
995        let fp2 = fingerprinter.fingerprint(query2).unwrap();
996
997        // Same structure, different literals
998        let similarity = fingerprinter.similarity(&fp1, &fp2);
999        assert!(
1000            similarity > 0.7,
1001            "Similarity should be high for same structure"
1002        );
1003    }
1004
1005    #[test]
1006    fn test_different_queries_different_template() {
1007        let config = FingerprintConfig::default();
1008        let fingerprinter = QueryFingerprinter::new(config);
1009
1010        let query1 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }";
1011        let query2 = "ASK { ?x <http://example.org/age> ?y . ?y <http://example.org/value> ?z }";
1012
1013        let fp1 = fingerprinter.fingerprint(query1).unwrap();
1014        let fp2 = fingerprinter.fingerprint(query2).unwrap();
1015
1016        let similarity = fingerprinter.similarity(&fp1, &fp2);
1017        assert!(
1018            similarity < 0.5,
1019            "Similarity should be low for different structures"
1020        );
1021    }
1022
1023    #[test]
1024    fn test_feature_extraction() {
1025        let config = FingerprintConfig::default();
1026        let fingerprinter = QueryFingerprinter::new(config);
1027
1028        let query = r#"
1029            SELECT ?s ?name (COUNT(?o) AS ?count)
1030            WHERE {
1031                ?s <http://example.org/type> <http://example.org/Person> .
1032                ?s <http://example.org/name> ?name .
1033                OPTIONAL { ?s <http://example.org/friend> ?o }
1034                FILTER(?name != "")
1035            }
1036            GROUP BY ?s ?name
1037            ORDER BY DESC(?count)
1038            LIMIT 10
1039        "#;
1040
1041        let fp = fingerprinter.fingerprint(query).unwrap();
1042
1043        assert!(fp.features.has_group_by);
1044        assert!(fp.features.has_order_by);
1045        assert!(fp.features.has_limit);
1046        assert!(fp.features.has_aggregates);
1047        assert!(fp.features.optional_count > 0);
1048        assert!(fp.features.filter_count > 0);
1049    }
1050
1051    #[test]
1052    fn test_cache_behavior() {
1053        let config = FingerprintConfig::default();
1054        let fingerprinter = QueryFingerprinter::new(config);
1055
1056        let query = "SELECT ?s WHERE { ?s ?p ?o }";
1057
1058        // First call - cache miss
1059        let _fp1 = fingerprinter.fingerprint(query).unwrap();
1060        let stats1 = fingerprinter.statistics();
1061        assert_eq!(stats1.cache_misses, 1);
1062
1063        // Second call - cache hit
1064        let _fp2 = fingerprinter.fingerprint(query).unwrap();
1065        let stats2 = fingerprinter.statistics();
1066        assert_eq!(stats2.cache_hits, 1);
1067    }
1068
1069    #[test]
1070    fn test_query_forms() {
1071        let config = FingerprintConfig::default();
1072        let fingerprinter = QueryFingerprinter::new(config);
1073
1074        let select = "SELECT ?s WHERE { ?s ?p ?o }";
1075        let ask = "ASK { ?s ?p ?o }";
1076        let construct = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }";
1077        let describe = "DESCRIBE <http://example.org/thing>";
1078
1079        assert_eq!(
1080            fingerprinter.fingerprint(select).unwrap().query_form,
1081            QueryForm::Select
1082        );
1083        assert_eq!(
1084            fingerprinter.fingerprint(ask).unwrap().query_form,
1085            QueryForm::Ask
1086        );
1087        assert_eq!(
1088            fingerprinter.fingerprint(construct).unwrap().query_form,
1089            QueryForm::Construct
1090        );
1091        assert_eq!(
1092            fingerprinter.fingerprint(describe).unwrap().query_form,
1093            QueryForm::Describe
1094        );
1095    }
1096
1097    #[test]
1098    fn test_parameter_extraction() {
1099        let config = FingerprintConfig::default();
1100        let fingerprinter = QueryFingerprinter::new(config);
1101
1102        let query = r#"SELECT ?s WHERE { ?s <http://example.org/age> 25 . ?s <http://example.org/name> "Alice" }"#;
1103        let fp = fingerprinter.fingerprint(query).unwrap();
1104
1105        assert!(!fp.parameters.is_empty());
1106    }
1107
1108    #[test]
1109    fn test_find_similar() {
1110        let config = FingerprintConfig::default();
1111        let fingerprinter = QueryFingerprinter::new(config);
1112
1113        let queries = [
1114            "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }",
1115            "SELECT ?s WHERE { ?s <http://example.org/name> \"Bob\" }",
1116            "SELECT ?s WHERE { ?s <http://example.org/name> \"Charlie\" }",
1117            "ASK { ?x <http://example.org/age> ?y }",
1118        ];
1119
1120        let fingerprints: Vec<_> = queries
1121            .iter()
1122            .map(|q| fingerprinter.fingerprint(q).unwrap())
1123            .collect();
1124
1125        let target = &fingerprints[0];
1126        let similar = fingerprinter.find_similar(target, &fingerprints, 0.7);
1127
1128        // Should find at least the first 3 as similar
1129        assert!(similar.len() >= 2);
1130    }
1131
1132    #[test]
1133    fn test_cluster_fingerprints() {
1134        let config = FingerprintConfig::default();
1135        let fingerprinter = QueryFingerprinter::new(config);
1136
1137        let queries = [
1138            "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }",
1139            "SELECT ?s WHERE { ?s <http://example.org/name> \"Bob\" }",
1140            "ASK { ?x <http://example.org/age> ?y }",
1141            "ASK { ?x <http://example.org/age> ?z }",
1142        ];
1143
1144        let fingerprints: Vec<_> = queries
1145            .iter()
1146            .map(|q| fingerprinter.fingerprint(q).unwrap())
1147            .collect();
1148
1149        let clusters = fingerprinter.cluster_fingerprints(&fingerprints, 0.7);
1150
1151        // Should have 2 clusters: one for SELECT queries, one for ASK queries
1152        assert!(clusters.len() >= 2);
1153    }
1154
1155    #[test]
1156    fn test_complexity_score() {
1157        let config = FingerprintConfig::default();
1158        let fingerprinter = QueryFingerprinter::new(config);
1159
1160        let simple = "SELECT ?s WHERE { ?s ?p ?o }";
1161        let complex = r#"
1162            SELECT ?s ?name (COUNT(?friend) AS ?friendCount)
1163            WHERE {
1164                ?s <http://example.org/type> <http://example.org/Person> .
1165                ?s <http://example.org/name> ?name .
1166                OPTIONAL { ?s <http://example.org/friend> ?friend }
1167                FILTER(?name != "")
1168                UNION {
1169                    ?s <http://example.org/nickname> ?name
1170                }
1171            }
1172            GROUP BY ?s ?name
1173            ORDER BY DESC(?friendCount)
1174            LIMIT 100
1175        "#;
1176
1177        let fp_simple = fingerprinter.fingerprint(simple).unwrap();
1178        let fp_complex = fingerprinter.fingerprint(complex).unwrap();
1179
1180        assert!(fp_complex.features.complexity_score > fp_simple.features.complexity_score);
1181    }
1182
1183    #[test]
1184    fn test_hash_algorithms() {
1185        let queries = "SELECT ?s WHERE { ?s ?p ?o }";
1186
1187        // MD5
1188        let config_md5 = FingerprintConfig {
1189            hash_algorithm: HashAlgorithm::Md5,
1190            ..Default::default()
1191        };
1192        let fp_md5 = QueryFingerprinter::new(config_md5)
1193            .fingerprint(queries)
1194            .unwrap();
1195        assert_eq!(fp_md5.hash.len(), 32); // MD5 = 128 bits = 32 hex chars
1196
1197        // SHA-256
1198        let config_sha = FingerprintConfig {
1199            hash_algorithm: HashAlgorithm::Sha256,
1200            ..Default::default()
1201        };
1202        let fp_sha = QueryFingerprinter::new(config_sha)
1203            .fingerprint(queries)
1204            .unwrap();
1205        assert_eq!(fp_sha.hash.len(), 64); // SHA256 = 256 bits = 64 hex chars
1206
1207        // FNV-1a
1208        let config_fnv = FingerprintConfig {
1209            hash_algorithm: HashAlgorithm::Fnv1a,
1210            ..Default::default()
1211        };
1212        let fp_fnv = QueryFingerprinter::new(config_fnv)
1213            .fingerprint(queries)
1214            .unwrap();
1215        assert_eq!(fp_fnv.hash.len(), 16); // FNV-1a 64-bit = 16 hex chars
1216    }
1217
1218    #[test]
1219    fn test_statistics() {
1220        let config = FingerprintConfig::default();
1221        let fingerprinter = QueryFingerprinter::new(config);
1222
1223        let _ = fingerprinter.fingerprint("SELECT ?s WHERE { ?s ?p ?o }");
1224        let _ = fingerprinter.fingerprint("SELECT ?s WHERE { ?s ?p ?o }"); // Cache hit
1225
1226        let stats = fingerprinter.statistics();
1227        assert_eq!(stats.fingerprints_computed, 1);
1228        assert_eq!(stats.cache_hits, 1);
1229        assert_eq!(stats.cache_misses, 1);
1230        assert!(stats.cache_hit_rate() > 0.0);
1231    }
1232
1233    #[test]
1234    fn test_clear_cache() {
1235        let config = FingerprintConfig::default();
1236        let fingerprinter = QueryFingerprinter::new(config);
1237
1238        let _ = fingerprinter.fingerprint("SELECT ?s WHERE { ?s ?p ?o }");
1239        assert_eq!(fingerprinter.statistics().cache_size, 1);
1240
1241        fingerprinter.clear_cache();
1242        assert_eq!(fingerprinter.statistics().cache_size, 0);
1243    }
1244}