magnetise/lib.rs
//! A Rust library to assess the similarity between SQL queries.
2
3use regex::Regex;
4use std::collections::HashSet;
5
6/// Itemizes a query string based on a given pattern using regular expressions.
7///
8/// # Arguments
9///
10/// * `query` - The input query string to be itemized.
11/// * `pattern` - The regular expression pattern used for itemization.
12///
13/// # Returns
14///
15/// A `HashSet` containing the itemized tokens from the query, converted to uppercase.
16///
17/// # Examples
18///
19/// ```rust
20/// use std::collections::HashSet;
21/// use magnetise::itemize_query;
22///
23/// let query = "SELECT * FROM users WHERE age > 30";
24/// let pattern = r"\b(SELECT|WHERE|FROM|AND|OR)\B|\w+";
25/// let tokens: HashSet<_> = itemize_query(query, pattern).into_iter().collect();
26///
27/// let expected_tokens: HashSet<String> = vec![
28/// "SELECT", "FROM", "USERS", "WHERE", "AGE", "30"].iter().map(|&s| s.to_string()).collect();
29/// assert_eq!(tokens, expected_tokens);
30/// ```
31
32pub fn itemize_query(query: &str, pattern: &str) -> HashSet<String> {
33 let re = Regex::new(pattern).unwrap();
34 re.find_iter(query)
35 .map(|matched| matched.as_str().to_uppercase())
36 .collect()
37}
38
39/// Calculates the Jaccard similarity between two queries.
40///
41/// The Jaccard similarity is a measure of similarity between two sets, defined as the size of the intersection divided by the size of the union of the sets.
42///
43/// # Arguments
44///
45/// * `query1` - The first query string.
46/// * `query2` - The second query string.
47/// * `itemizer_pattern` - The regular expression pattern used for itemizing the queries.
48///
49/// # Returns
50///
51/// The Jaccard similarity between the two queries, as a floating-point value between 0.0 and 1.0.
52///
53/// # Examples
54///
55/// ```rust
56/// use magnetise::jaccard_similarity;
57///
58/// let query1 = "SELECT * FROM users WHERE age > 30";
59/// let query2 = "SELECT * FROM users WHERE age > 31";
60/// let pattern = r"\b(SELECT|WHERE|FROM|AND|OR)\B|\w+";
61/// let similarity = jaccard_similarity(query1, query2, pattern);
62/// assert!(similarity > 0.7 && similarity < 0.8);
63/// ```
64
65pub fn jaccard_similarity(query1: &str, query2: &str, itemizer_pattern: &str) -> f64 {
66 let tokens1: HashSet<_> = itemize_query(query1, itemizer_pattern);
67 let tokens2: HashSet<_> = itemize_query(query2, itemizer_pattern);
68 let intersection_size = tokens1.intersection(&tokens2).count();
69 let union_size = tokens1.union(&tokens2).count();
70
71 if union_size == 0 {
72 return 0.0;
73 }
74
75 let jaccard = intersection_size as f64 / union_size as f64;
76
77 println!("The tokens from 1st query: {:?}", tokens1);
78 println!("The tokens from 2nd query: {:?}", tokens2);
79 println!(
80 "Jaccard = intersection / union => {:?} / {:?} = {:?}",
81 intersection_size, union_size, jaccard
82 );
83
84 jaccard
85}