1use anyhow::Result;
37use regex::Regex;
38use serde::{Deserialize, Serialize};
39use sha2::{Digest, Sha256};
40use std::collections::{HashMap, HashSet};
41use std::hash::{Hash, Hasher};
42use std::sync::atomic::{AtomicU64, Ordering};
43use std::sync::{Arc, OnceLock, RwLock};
44
45#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct FingerprintConfig {
48 pub include_variables: bool,
50 pub normalize_literals: bool,
52 pub normalize_numbers: bool,
54 pub normalize_iri_locals: bool,
56 pub preserve_predicates: bool,
58 pub preserve_types: bool,
60 pub hash_algorithm: HashAlgorithm,
62 pub cache_size: usize,
64}
65
66impl Default for FingerprintConfig {
67 fn default() -> Self {
68 Self {
69 include_variables: false,
70 normalize_literals: true,
71 normalize_numbers: true,
72 normalize_iri_locals: true,
73 preserve_predicates: true,
74 preserve_types: true,
75 hash_algorithm: HashAlgorithm::Sha256,
76 cache_size: 10000,
77 }
78 }
79}
80
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
83pub enum HashAlgorithm {
84 Md5,
86 Sha256,
88 Fnv1a,
90}
91
92#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
94pub struct QueryFingerprint {
95 pub hash: String,
97 pub short_hash: String,
99 pub template: String,
101 pub parameters: Vec<ParameterSlot>,
103 pub query_form: QueryForm,
105 pub features: QueryFeatures,
107 pub original_length: usize,
109}
110
111impl QueryFingerprint {
112 pub fn template_id(&self) -> &str {
114 &self.short_hash
115 }
116
117 pub fn matches_template(&self, other: &Self) -> bool {
119 self.short_hash == other.short_hash
120 }
121
122 pub fn parameter_count(&self) -> usize {
124 self.parameters.len()
125 }
126
127 pub fn is_parameterized(&self) -> bool {
129 !self.parameters.is_empty()
130 }
131}
132
133#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
135pub struct ParameterSlot {
136 pub name: String,
138 pub param_type: ParameterType,
140 pub position: usize,
142 pub original_value: Option<String>,
144}
145
146#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
148pub enum ParameterType {
149 StringLiteral,
151 NumericLiteral,
153 DateTimeLiteral,
155 Iri,
157 Boolean,
159 LangLiteral,
161 Unknown,
163}
164
165#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
167pub enum QueryForm {
168 Select,
170 Ask,
172 Construct,
174 Describe,
176 Update,
178 Unknown,
180}
181
182impl QueryForm {
183 fn from_str(s: &str) -> Self {
184 match s.to_uppercase().as_str() {
185 "SELECT" => QueryForm::Select,
186 "ASK" => QueryForm::Ask,
187 "CONSTRUCT" => QueryForm::Construct,
188 "DESCRIBE" => QueryForm::Describe,
189 "INSERT" | "DELETE" => QueryForm::Update,
190 _ => QueryForm::Unknown,
191 }
192 }
193}
194
195#[derive(Debug, Clone, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
197pub struct QueryFeatures {
198 pub triple_pattern_count: usize,
200 pub variable_count: usize,
202 pub filter_count: usize,
204 pub optional_count: usize,
206 pub union_count: usize,
208 pub subquery_count: usize,
210 pub has_group_by: bool,
212 pub has_order_by: bool,
214 pub has_limit: bool,
216 pub has_offset: bool,
218 pub has_distinct: bool,
220 pub has_service: bool,
222 pub has_values: bool,
224 pub has_property_paths: bool,
226 pub has_aggregates: bool,
228 pub has_bind: bool,
230 pub has_minus: bool,
232 pub complexity_score: u32,
234}
235
236pub struct QueryFingerprinter {
238 config: FingerprintConfig,
240 cache: Arc<RwLock<HashMap<String, QueryFingerprint>>>,
242 stats: FingerprintStats,
244}
245
246#[derive(Debug, Default)]
248struct FingerprintStats {
249 computed: AtomicU64,
251 cache_hits: AtomicU64,
253 cache_misses: AtomicU64,
255}
256
257impl QueryFingerprinter {
258 pub fn new(config: FingerprintConfig) -> Self {
260 Self {
261 config,
262 cache: Arc::new(RwLock::new(HashMap::new())),
263 stats: FingerprintStats::default(),
264 }
265 }
266
267 pub fn fingerprint(&self, query: &str) -> Result<QueryFingerprint> {
269 let query_hash = self.quick_hash(query);
271 {
272 let cache = self.cache.read().expect("read lock should not be poisoned");
273 if let Some(fp) = cache.get(&query_hash) {
274 self.stats.cache_hits.fetch_add(1, Ordering::Relaxed);
275 return Ok(fp.clone());
276 }
277 }
278
279 self.stats.cache_misses.fetch_add(1, Ordering::Relaxed);
280
281 let fingerprint = self.compute_fingerprint(query)?;
283
284 {
286 let mut cache = self
287 .cache
288 .write()
289 .expect("write lock should not be poisoned");
290 if cache.len() >= self.config.cache_size {
291 let keys_to_remove: Vec<_> = cache.keys().take(cache.len() / 2).cloned().collect();
293 for key in keys_to_remove {
294 cache.remove(&key);
295 }
296 }
297 cache.insert(query_hash, fingerprint.clone());
298 }
299
300 self.stats.computed.fetch_add(1, Ordering::Relaxed);
301 Ok(fingerprint)
302 }
303
304 fn compute_fingerprint(&self, query: &str) -> Result<QueryFingerprint> {
306 let query_form = self.extract_query_form(query);
308
309 let (normalized, parameters) = self.normalize_query(query)?;
311
312 let features = self.extract_features(query);
314
315 let hash = self.compute_hash(&normalized);
317 let short_hash = hash[..16].to_string();
318
319 Ok(QueryFingerprint {
320 hash,
321 short_hash,
322 template: normalized,
323 parameters,
324 query_form,
325 features,
326 original_length: query.len(),
327 })
328 }
329
330 fn extract_query_form(&self, query: &str) -> QueryForm {
332 let trimmed = query.trim_start();
333 let first_word = trimmed.split_whitespace().next().unwrap_or("");
334 QueryForm::from_str(first_word)
335 }
336
337 fn normalize_query(&self, query: &str) -> Result<(String, Vec<ParameterSlot>)> {
339 let mut normalized = query.to_string();
340 let mut parameters = Vec::new();
341 let mut param_index = 0;
342
343 normalized = remove_comments(&normalized);
345
346 normalized = normalize_whitespace(&normalized);
348
349 if self.config.normalize_literals {
351 let (new_query, new_params) =
352 self.normalize_string_literals(&normalized, param_index)?;
353 normalized = new_query;
354 param_index += new_params.len();
355 parameters.extend(new_params);
356 }
357
358 if self.config.normalize_numbers {
360 let (new_query, new_params) =
361 self.normalize_numeric_literals(&normalized, param_index)?;
362 normalized = new_query;
363 param_index += new_params.len();
364 parameters.extend(new_params);
365 }
366
367 if self.config.normalize_iri_locals {
369 let (new_query, new_params) = self.normalize_iri_locals(&normalized, param_index)?;
370 normalized = new_query;
371 parameters.extend(new_params);
372 }
373
374 if !self.config.include_variables {
376 normalized = self.normalize_variables(&normalized);
377 }
378
379 normalized = normalize_whitespace(&normalized);
381
382 Ok((normalized, parameters))
383 }
384
385 fn normalize_string_literals(
387 &self,
388 query: &str,
389 start_index: usize,
390 ) -> Result<(String, Vec<ParameterSlot>)> {
391 let string_pattern = string_literal_regex();
392 let mut result = query.to_string();
393 let mut params = Vec::new();
394 let mut index = start_index;
395
396 let matches: Vec<_> = string_pattern.find_iter(query).collect();
398
399 for m in matches.into_iter().rev() {
401 let original = m.as_str().to_string();
402 let param_name = format!("${}", index);
403
404 let param_type = if original.contains("@") {
406 ParameterType::LangLiteral
407 } else if original.contains("^^") {
408 if original.contains("dateTime") || original.contains("date") {
409 ParameterType::DateTimeLiteral
410 } else {
411 ParameterType::StringLiteral
412 }
413 } else {
414 ParameterType::StringLiteral
415 };
416
417 params.push(ParameterSlot {
418 name: param_name.clone(),
419 param_type,
420 position: m.start(),
421 original_value: Some(original),
422 });
423
424 result = format!(
425 "{}{}{}",
426 &result[..m.start()],
427 param_name,
428 &result[m.end()..]
429 );
430 index += 1;
431 }
432
433 params.reverse(); Ok((result, params))
435 }
436
437 fn normalize_numeric_literals(
439 &self,
440 query: &str,
441 start_index: usize,
442 ) -> Result<(String, Vec<ParameterSlot>)> {
443 let number_pattern = numeric_literal_regex();
444 let mut result = query.to_string();
445 let mut params = Vec::new();
446 let mut index = start_index;
447
448 let matches: Vec<_> = number_pattern.find_iter(query).collect();
449
450 for m in matches.into_iter().rev() {
451 let original = m.as_str().to_string();
452 let param_name = format!("${}", index);
453
454 params.push(ParameterSlot {
455 name: param_name.clone(),
456 param_type: ParameterType::NumericLiteral,
457 position: m.start(),
458 original_value: Some(original),
459 });
460
461 result = format!(
462 "{}{}{}",
463 &result[..m.start()],
464 param_name,
465 &result[m.end()..]
466 );
467 index += 1;
468 }
469
470 params.reverse();
471 Ok((result, params))
472 }
473
474 fn normalize_iri_locals(
476 &self,
477 query: &str,
478 start_index: usize,
479 ) -> Result<(String, Vec<ParameterSlot>)> {
480 let iri_pattern = iri_local_regex();
482 let mut result = query.to_string();
483 let mut params = Vec::new();
484 let mut index = start_index;
485
486 let matches: Vec<_> = iri_pattern.find_iter(query).collect();
487
488 for m in matches.into_iter().rev() {
489 let iri = m.as_str();
490
491 if self.config.preserve_predicates && is_likely_predicate(query, m.start()) {
493 continue;
494 }
495 if self.config.preserve_types && iri.contains("rdf:type") || iri.contains("#type") {
496 continue;
497 }
498
499 let param_name = format!("${}", index);
500
501 params.push(ParameterSlot {
502 name: param_name.clone(),
503 param_type: ParameterType::Iri,
504 position: m.start(),
505 original_value: Some(iri.to_string()),
506 });
507
508 if let Some(local_start) = iri.rfind('#').or_else(|| iri.rfind('/')) {
510 let prefix = &iri[..=local_start];
511 result = format!(
512 "{}{}{}{}",
513 &result[..m.start()],
514 prefix,
515 param_name,
516 &result[m.end()..]
517 );
518 }
519 index += 1;
520 }
521
522 params.reverse();
523 Ok((result, params))
524 }
525
526 fn normalize_variables(&self, query: &str) -> String {
528 let var_pattern = variable_regex();
529 let mut result = query.to_string();
530 let mut var_mapping: HashMap<String, String> = HashMap::new();
531 let mut var_index = 0;
532
533 for m in var_pattern.find_iter(query) {
535 let var_name = m.as_str().to_string();
536 if !var_mapping.contains_key(&var_name) {
537 var_mapping.insert(var_name.clone(), format!("?v{}", var_index));
538 var_index += 1;
539 }
540 }
541
542 let mut sorted_vars: Vec<_> = var_mapping.iter().collect();
544 sorted_vars.sort_by(|a, b| b.0.len().cmp(&a.0.len()));
545
546 for (original, normalized) in sorted_vars {
547 result = result.replace(original, normalized);
548 }
549
550 result
551 }
552
553 fn extract_features(&self, query: &str) -> QueryFeatures {
555 let upper_query = query.to_uppercase();
556
557 let triple_pattern_count = count_triple_patterns(query);
558 let variable_count = count_variables(query);
559 let filter_count = upper_query.matches("FILTER").count();
560 let optional_count = upper_query.matches("OPTIONAL").count();
561 let union_count = upper_query.matches("UNION").count();
562 let subquery_count = upper_query.matches("SELECT").count().saturating_sub(1);
563
564 let has_group_by = upper_query.contains("GROUP BY");
565 let has_order_by = upper_query.contains("ORDER BY");
566 let has_limit = upper_query.contains("LIMIT");
567 let has_offset = upper_query.contains("OFFSET");
568 let has_distinct = upper_query.contains("DISTINCT");
569 let has_service = upper_query.contains("SERVICE");
570 let has_values = upper_query.contains("VALUES");
571 let has_property_paths = query.contains("/")
572 || query.contains("*")
573 || query.contains("+")
574 || query.contains("|");
575 let has_aggregates = upper_query.contains("COUNT")
576 || upper_query.contains("SUM")
577 || upper_query.contains("AVG")
578 || upper_query.contains("MIN")
579 || upper_query.contains("MAX");
580 let has_bind = upper_query.contains("BIND");
581 let has_minus = upper_query.contains("MINUS");
582
583 let complexity_score = (triple_pattern_count * 10
585 + filter_count * 5
586 + optional_count * 8
587 + union_count * 6
588 + subquery_count * 15
589 + if has_group_by { 5 } else { 0 }
590 + if has_order_by { 3 } else { 0 }
591 + if has_service { 20 } else { 0 }
592 + if has_property_paths { 10 } else { 0 }
593 + if has_aggregates { 7 } else { 0 }
594 + if has_minus { 5 } else { 0 }) as u32;
595
596 QueryFeatures {
597 triple_pattern_count,
598 variable_count,
599 filter_count,
600 optional_count,
601 union_count,
602 subquery_count,
603 has_group_by,
604 has_order_by,
605 has_limit,
606 has_offset,
607 has_distinct,
608 has_service,
609 has_values,
610 has_property_paths,
611 has_aggregates,
612 has_bind,
613 has_minus,
614 complexity_score,
615 }
616 }
617
618 fn compute_hash(&self, normalized: &str) -> String {
620 match self.config.hash_algorithm {
621 HashAlgorithm::Md5 => {
622 let digest = md5::compute(normalized.as_bytes());
623 format!("{:x}", digest)
624 }
625 HashAlgorithm::Sha256 => {
626 let mut hasher = Sha256::new();
627 hasher.update(normalized.as_bytes());
628 format!("{:x}", hasher.finalize())
629 }
630 HashAlgorithm::Fnv1a => {
631 let mut hasher = std::collections::hash_map::DefaultHasher::new();
632 normalized.hash(&mut hasher);
633 format!("{:016x}", hasher.finish())
634 }
635 }
636 }
637
638 fn quick_hash(&self, query: &str) -> String {
640 let mut hasher = std::collections::hash_map::DefaultHasher::new();
641 query.hash(&mut hasher);
642 format!("{:016x}", hasher.finish())
643 }
644
645 pub fn similarity(&self, fp1: &QueryFingerprint, fp2: &QueryFingerprint) -> f64 {
647 if fp1.hash == fp2.hash {
649 return 1.0;
650 }
651
652 if fp1.query_form != fp2.query_form {
654 return 0.0;
655 }
656
657 let structure_sim = self.structural_similarity(&fp1.features, &fp2.features);
659
660 let template_sim = string_similarity(&fp1.template, &fp2.template);
662
663 0.6 * structure_sim + 0.4 * template_sim
665 }
666
667 fn structural_similarity(&self, f1: &QueryFeatures, f2: &QueryFeatures) -> f64 {
669 let mut matches = 0.0;
670 let mut total = 0.0;
671
672 let num_features = [
674 (f1.triple_pattern_count, f2.triple_pattern_count, 2.0),
675 (f1.variable_count, f2.variable_count, 1.0),
676 (f1.filter_count, f2.filter_count, 1.5),
677 (f1.optional_count, f2.optional_count, 1.5),
678 (f1.union_count, f2.union_count, 1.5),
679 (f1.subquery_count, f2.subquery_count, 2.0),
680 ];
681
682 for (v1, v2, weight) in num_features {
683 total += weight;
684 if v1 == v2 {
685 matches += weight;
686 } else {
687 let max_val = v1.max(v2) as f64;
688 let min_val = v1.min(v2) as f64;
689 if max_val > 0.0 {
690 matches += weight * (min_val / max_val);
691 }
692 }
693 }
694
695 let bool_features = [
697 (f1.has_group_by, f2.has_group_by),
698 (f1.has_order_by, f2.has_order_by),
699 (f1.has_limit, f2.has_limit),
700 (f1.has_distinct, f2.has_distinct),
701 (f1.has_service, f2.has_service),
702 (f1.has_values, f2.has_values),
703 (f1.has_property_paths, f2.has_property_paths),
704 (f1.has_aggregates, f2.has_aggregates),
705 (f1.has_bind, f2.has_bind),
706 (f1.has_minus, f2.has_minus),
707 ];
708
709 for (b1, b2) in bool_features {
710 total += 1.0;
711 if b1 == b2 {
712 matches += 1.0;
713 }
714 }
715
716 matches / total
717 }
718
719 pub fn find_similar(
721 &self,
722 target: &QueryFingerprint,
723 fingerprints: &[QueryFingerprint],
724 threshold: f64,
725 ) -> Vec<(usize, f64)> {
726 let mut results: Vec<(usize, f64)> = fingerprints
727 .iter()
728 .enumerate()
729 .filter_map(|(idx, fp)| {
730 let sim = self.similarity(target, fp);
731 if sim >= threshold {
732 Some((idx, sim))
733 } else {
734 None
735 }
736 })
737 .collect();
738
739 results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
740 results
741 }
742
743 pub fn cluster_fingerprints(
745 &self,
746 fingerprints: &[QueryFingerprint],
747 threshold: f64,
748 ) -> Vec<Vec<usize>> {
749 let n = fingerprints.len();
750 let mut visited = vec![false; n];
751 let mut clusters = Vec::new();
752
753 for i in 0..n {
754 if visited[i] {
755 continue;
756 }
757
758 let mut cluster = vec![i];
759 visited[i] = true;
760
761 for j in (i + 1)..n {
762 if visited[j] {
763 continue;
764 }
765
766 let sim = self.similarity(&fingerprints[i], &fingerprints[j]);
767 if sim >= threshold {
768 cluster.push(j);
769 visited[j] = true;
770 }
771 }
772
773 clusters.push(cluster);
774 }
775
776 clusters
777 }
778
779 pub fn statistics(&self) -> FingerprintingStatistics {
781 FingerprintingStatistics {
782 fingerprints_computed: self.stats.computed.load(Ordering::Relaxed),
783 cache_hits: self.stats.cache_hits.load(Ordering::Relaxed),
784 cache_misses: self.stats.cache_misses.load(Ordering::Relaxed),
785 cache_size: self
786 .cache
787 .read()
788 .expect("read lock should not be poisoned")
789 .len(),
790 }
791 }
792
793 pub fn clear_cache(&self) {
795 self.cache
796 .write()
797 .expect("write lock should not be poisoned")
798 .clear();
799 }
800}
801
802#[derive(Debug, Clone, Serialize, Deserialize)]
804pub struct FingerprintingStatistics {
805 pub fingerprints_computed: u64,
807 pub cache_hits: u64,
809 pub cache_misses: u64,
811 pub cache_size: usize,
813}
814
815impl FingerprintingStatistics {
816 pub fn cache_hit_rate(&self) -> f64 {
818 let total = self.cache_hits + self.cache_misses;
819 if total == 0 {
820 0.0
821 } else {
822 self.cache_hits as f64 / total as f64
823 }
824 }
825}
826
827fn remove_comments(query: &str) -> String {
830 let mut result = String::new();
831 let mut in_string = false;
832 let mut escape = false;
833 let mut chars = query.chars().peekable();
834
835 while let Some(c) = chars.next() {
836 if escape {
837 result.push(c);
838 escape = false;
839 continue;
840 }
841
842 if c == '\\' && in_string {
843 result.push(c);
844 escape = true;
845 continue;
846 }
847
848 if c == '"' && !in_string {
849 in_string = true;
850 result.push(c);
851 continue;
852 }
853
854 if c == '"' && in_string {
855 in_string = false;
856 result.push(c);
857 continue;
858 }
859
860 if !in_string && c == '#' {
861 while let Some(&nc) = chars.peek() {
863 if nc == '\n' {
864 break;
865 }
866 chars.next();
867 }
868 result.push(' ');
869 continue;
870 }
871
872 result.push(c);
873 }
874
875 result
876}
877
878fn normalize_whitespace(query: &str) -> String {
879 let whitespace_pattern = whitespace_regex();
880 whitespace_pattern
881 .replace_all(query, " ")
882 .trim()
883 .to_string()
884}
885
886fn count_triple_patterns(query: &str) -> usize {
887 let triple_pattern = triple_pattern_regex();
889 triple_pattern.find_iter(query).count()
890}
891
892fn count_variables(query: &str) -> usize {
893 let var_pattern = variable_regex();
894 let vars: HashSet<_> = var_pattern.find_iter(query).map(|m| m.as_str()).collect();
895 vars.len()
896}
897
898fn is_likely_predicate(query: &str, position: usize) -> bool {
899 let before = &query[..position];
902 let after = &query[position..];
903
904 let has_subject = before
906 .trim_end()
907 .ends_with(|c: char| c == '>' || c.is_alphabetic() || c == '_');
908 let has_object = after.trim_start().starts_with(['?', '<', '"']);
910
911 has_subject && has_object
912}
913
914fn string_similarity(s1: &str, s2: &str) -> f64 {
915 let tokens1: HashSet<_> = s1.split_whitespace().collect();
917 let tokens2: HashSet<_> = s2.split_whitespace().collect();
918
919 let intersection = tokens1.intersection(&tokens2).count();
920 let union = tokens1.union(&tokens2).count();
921
922 if union == 0 {
923 1.0
924 } else {
925 intersection as f64 / union as f64
926 }
927}
928
929fn string_literal_regex() -> &'static Regex {
932 static REGEX: OnceLock<Regex> = OnceLock::new();
933 REGEX.get_or_init(|| {
934 Regex::new(r#""(?:[^"\\]|\\.)*"(?:@[a-zA-Z-]+)?(?:\^\^<[^>]+>)?"#).expect("Invalid regex")
935 })
936}
937
938fn numeric_literal_regex() -> &'static Regex {
939 static REGEX: OnceLock<Regex> = OnceLock::new();
940 REGEX.get_or_init(|| {
941 Regex::new(r"\b[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?\b").expect("Invalid regex")
944 })
945}
946
947fn iri_local_regex() -> &'static Regex {
948 static REGEX: OnceLock<Regex> = OnceLock::new();
949 REGEX.get_or_init(|| Regex::new(r"<[^>]+>").expect("Invalid regex"))
950}
951
952fn variable_regex() -> &'static Regex {
953 static REGEX: OnceLock<Regex> = OnceLock::new();
954 REGEX.get_or_init(|| Regex::new(r"\?[a-zA-Z_][a-zA-Z0-9_]*").expect("Invalid regex"))
955}
956
957fn whitespace_regex() -> &'static Regex {
958 static REGEX: OnceLock<Regex> = OnceLock::new();
959 REGEX.get_or_init(|| Regex::new(r"\s+").expect("Invalid regex"))
960}
961
962fn triple_pattern_regex() -> &'static Regex {
963 static REGEX: OnceLock<Regex> = OnceLock::new();
964 REGEX.get_or_init(|| {
965 Regex::new(r#"(?:\?[a-zA-Z_]\w*|<[^>]+>)\s+(?:\?[a-zA-Z_]\w*|<[^>]+>|[a-zA-Z]+:[a-zA-Z_]\w*)\s+(?:\?[a-zA-Z_]\w*|<[^>]+>|"[^"]*"|[0-9]+)"#).expect("Invalid regex")
966 })
967}
968
969#[cfg(test)]
970mod tests {
971 use super::*;
972
973 #[test]
974 fn test_basic_fingerprint() {
975 let config = FingerprintConfig::default();
976 let fingerprinter = QueryFingerprinter::new(config);
977
978 let query = "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }";
979 let fp = fingerprinter.fingerprint(query).unwrap();
980
981 assert_eq!(fp.query_form, QueryForm::Select);
982 assert!(!fp.hash.is_empty());
983 assert!(!fp.template.is_empty());
984 }
985
986 #[test]
987 fn test_similar_queries_same_template() {
988 let config = FingerprintConfig::default();
989 let fingerprinter = QueryFingerprinter::new(config);
990
991 let query1 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }";
992 let query2 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Bob\" }";
993
994 let fp1 = fingerprinter.fingerprint(query1).unwrap();
995 let fp2 = fingerprinter.fingerprint(query2).unwrap();
996
997 let similarity = fingerprinter.similarity(&fp1, &fp2);
999 assert!(
1000 similarity > 0.7,
1001 "Similarity should be high for same structure"
1002 );
1003 }
1004
1005 #[test]
1006 fn test_different_queries_different_template() {
1007 let config = FingerprintConfig::default();
1008 let fingerprinter = QueryFingerprinter::new(config);
1009
1010 let query1 = "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }";
1011 let query2 = "ASK { ?x <http://example.org/age> ?y . ?y <http://example.org/value> ?z }";
1012
1013 let fp1 = fingerprinter.fingerprint(query1).unwrap();
1014 let fp2 = fingerprinter.fingerprint(query2).unwrap();
1015
1016 let similarity = fingerprinter.similarity(&fp1, &fp2);
1017 assert!(
1018 similarity < 0.5,
1019 "Similarity should be low for different structures"
1020 );
1021 }
1022
1023 #[test]
1024 fn test_feature_extraction() {
1025 let config = FingerprintConfig::default();
1026 let fingerprinter = QueryFingerprinter::new(config);
1027
1028 let query = r#"
1029 SELECT ?s ?name (COUNT(?o) AS ?count)
1030 WHERE {
1031 ?s <http://example.org/type> <http://example.org/Person> .
1032 ?s <http://example.org/name> ?name .
1033 OPTIONAL { ?s <http://example.org/friend> ?o }
1034 FILTER(?name != "")
1035 }
1036 GROUP BY ?s ?name
1037 ORDER BY DESC(?count)
1038 LIMIT 10
1039 "#;
1040
1041 let fp = fingerprinter.fingerprint(query).unwrap();
1042
1043 assert!(fp.features.has_group_by);
1044 assert!(fp.features.has_order_by);
1045 assert!(fp.features.has_limit);
1046 assert!(fp.features.has_aggregates);
1047 assert!(fp.features.optional_count > 0);
1048 assert!(fp.features.filter_count > 0);
1049 }
1050
1051 #[test]
1052 fn test_cache_behavior() {
1053 let config = FingerprintConfig::default();
1054 let fingerprinter = QueryFingerprinter::new(config);
1055
1056 let query = "SELECT ?s WHERE { ?s ?p ?o }";
1057
1058 let _fp1 = fingerprinter.fingerprint(query).unwrap();
1060 let stats1 = fingerprinter.statistics();
1061 assert_eq!(stats1.cache_misses, 1);
1062
1063 let _fp2 = fingerprinter.fingerprint(query).unwrap();
1065 let stats2 = fingerprinter.statistics();
1066 assert_eq!(stats2.cache_hits, 1);
1067 }
1068
1069 #[test]
1070 fn test_query_forms() {
1071 let config = FingerprintConfig::default();
1072 let fingerprinter = QueryFingerprinter::new(config);
1073
1074 let select = "SELECT ?s WHERE { ?s ?p ?o }";
1075 let ask = "ASK { ?s ?p ?o }";
1076 let construct = "CONSTRUCT { ?s ?p ?o } WHERE { ?s ?p ?o }";
1077 let describe = "DESCRIBE <http://example.org/thing>";
1078
1079 assert_eq!(
1080 fingerprinter.fingerprint(select).unwrap().query_form,
1081 QueryForm::Select
1082 );
1083 assert_eq!(
1084 fingerprinter.fingerprint(ask).unwrap().query_form,
1085 QueryForm::Ask
1086 );
1087 assert_eq!(
1088 fingerprinter.fingerprint(construct).unwrap().query_form,
1089 QueryForm::Construct
1090 );
1091 assert_eq!(
1092 fingerprinter.fingerprint(describe).unwrap().query_form,
1093 QueryForm::Describe
1094 );
1095 }
1096
1097 #[test]
1098 fn test_parameter_extraction() {
1099 let config = FingerprintConfig::default();
1100 let fingerprinter = QueryFingerprinter::new(config);
1101
1102 let query = r#"SELECT ?s WHERE { ?s <http://example.org/age> 25 . ?s <http://example.org/name> "Alice" }"#;
1103 let fp = fingerprinter.fingerprint(query).unwrap();
1104
1105 assert!(!fp.parameters.is_empty());
1106 }
1107
1108 #[test]
1109 fn test_find_similar() {
1110 let config = FingerprintConfig::default();
1111 let fingerprinter = QueryFingerprinter::new(config);
1112
1113 let queries = [
1114 "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }",
1115 "SELECT ?s WHERE { ?s <http://example.org/name> \"Bob\" }",
1116 "SELECT ?s WHERE { ?s <http://example.org/name> \"Charlie\" }",
1117 "ASK { ?x <http://example.org/age> ?y }",
1118 ];
1119
1120 let fingerprints: Vec<_> = queries
1121 .iter()
1122 .map(|q| fingerprinter.fingerprint(q).unwrap())
1123 .collect();
1124
1125 let target = &fingerprints[0];
1126 let similar = fingerprinter.find_similar(target, &fingerprints, 0.7);
1127
1128 assert!(similar.len() >= 2);
1130 }
1131
1132 #[test]
1133 fn test_cluster_fingerprints() {
1134 let config = FingerprintConfig::default();
1135 let fingerprinter = QueryFingerprinter::new(config);
1136
1137 let queries = [
1138 "SELECT ?s WHERE { ?s <http://example.org/name> \"Alice\" }",
1139 "SELECT ?s WHERE { ?s <http://example.org/name> \"Bob\" }",
1140 "ASK { ?x <http://example.org/age> ?y }",
1141 "ASK { ?x <http://example.org/age> ?z }",
1142 ];
1143
1144 let fingerprints: Vec<_> = queries
1145 .iter()
1146 .map(|q| fingerprinter.fingerprint(q).unwrap())
1147 .collect();
1148
1149 let clusters = fingerprinter.cluster_fingerprints(&fingerprints, 0.7);
1150
1151 assert!(clusters.len() >= 2);
1153 }
1154
1155 #[test]
1156 fn test_complexity_score() {
1157 let config = FingerprintConfig::default();
1158 let fingerprinter = QueryFingerprinter::new(config);
1159
1160 let simple = "SELECT ?s WHERE { ?s ?p ?o }";
1161 let complex = r#"
1162 SELECT ?s ?name (COUNT(?friend) AS ?friendCount)
1163 WHERE {
1164 ?s <http://example.org/type> <http://example.org/Person> .
1165 ?s <http://example.org/name> ?name .
1166 OPTIONAL { ?s <http://example.org/friend> ?friend }
1167 FILTER(?name != "")
1168 UNION {
1169 ?s <http://example.org/nickname> ?name
1170 }
1171 }
1172 GROUP BY ?s ?name
1173 ORDER BY DESC(?friendCount)
1174 LIMIT 100
1175 "#;
1176
1177 let fp_simple = fingerprinter.fingerprint(simple).unwrap();
1178 let fp_complex = fingerprinter.fingerprint(complex).unwrap();
1179
1180 assert!(fp_complex.features.complexity_score > fp_simple.features.complexity_score);
1181 }
1182
1183 #[test]
1184 fn test_hash_algorithms() {
1185 let queries = "SELECT ?s WHERE { ?s ?p ?o }";
1186
1187 let config_md5 = FingerprintConfig {
1189 hash_algorithm: HashAlgorithm::Md5,
1190 ..Default::default()
1191 };
1192 let fp_md5 = QueryFingerprinter::new(config_md5)
1193 .fingerprint(queries)
1194 .unwrap();
1195 assert_eq!(fp_md5.hash.len(), 32); let config_sha = FingerprintConfig {
1199 hash_algorithm: HashAlgorithm::Sha256,
1200 ..Default::default()
1201 };
1202 let fp_sha = QueryFingerprinter::new(config_sha)
1203 .fingerprint(queries)
1204 .unwrap();
1205 assert_eq!(fp_sha.hash.len(), 64); let config_fnv = FingerprintConfig {
1209 hash_algorithm: HashAlgorithm::Fnv1a,
1210 ..Default::default()
1211 };
1212 let fp_fnv = QueryFingerprinter::new(config_fnv)
1213 .fingerprint(queries)
1214 .unwrap();
1215 assert_eq!(fp_fnv.hash.len(), 16); }
1217
1218 #[test]
1219 fn test_statistics() {
1220 let config = FingerprintConfig::default();
1221 let fingerprinter = QueryFingerprinter::new(config);
1222
1223 let _ = fingerprinter.fingerprint("SELECT ?s WHERE { ?s ?p ?o }");
1224 let _ = fingerprinter.fingerprint("SELECT ?s WHERE { ?s ?p ?o }"); let stats = fingerprinter.statistics();
1227 assert_eq!(stats.fingerprints_computed, 1);
1228 assert_eq!(stats.cache_hits, 1);
1229 assert_eq!(stats.cache_misses, 1);
1230 assert!(stats.cache_hit_rate() > 0.0);
1231 }
1232
1233 #[test]
1234 fn test_clear_cache() {
1235 let config = FingerprintConfig::default();
1236 let fingerprinter = QueryFingerprinter::new(config);
1237
1238 let _ = fingerprinter.fingerprint("SELECT ?s WHERE { ?s ?p ?o }");
1239 assert_eq!(fingerprinter.statistics().cache_size, 1);
1240
1241 fingerprinter.clear_cache();
1242 assert_eq!(fingerprinter.statistics().cache_size, 0);
1243 }
1244}