Skip to main content

elid/
lib.rs

1//! # ELID - Embedding Locality IDentifier
2//!
3//! ELID enables vector search without a vector store by encoding high-dimensional embeddings
4//! into sortable string IDs that preserve locality. Similar vectors produce similar IDs,
5//! allowing you to use standard database indexes for similarity search.
6//!
7//! ELID also includes a complete suite of fast, zero-dependency string similarity algorithms.
8//!
9//! ## Feature Sets
10//!
11//! ### Embedding Encoding (`embeddings` feature)
12//!
13//! Convert embeddings from any ML model into compact, sortable identifiers:
14//!
15//! - **Mini128**: 128-bit SimHash using signed random projections (fast, Hamming distance)
16//! - **Morton10x10**: Z-order curve encoding (database range queries)
17//! - **Hilbert10x10**: Hilbert curve encoding (maximum locality preservation)
18//!
19//! ### String Similarity (`strings` feature, default)
20//!
21//! - **Levenshtein Distance**: Classic edit distance algorithm
22//! - **Normalized Levenshtein**: Returns similarity as a value between 0.0 and 1.0
23//! - **Jaro-Winkler Similarity**: Better for short strings like names
24//! - **Hamming Distance**: For equal-length strings
25//! - **Optimal String Alignment (OSA)**: Levenshtein with transpositions
26//! - **SimHash**: Locality-sensitive hashing for string similarity queries
27//!
28//! ## Feature Flags
29//!
30//! - `strings` (default): Zero-dependency string similarity algorithms
31//! - `embeddings` (default): Vector encoding with Mini128, Morton, and Hilbert profiles
32//! - `models`: Base ONNX model support using tract-onnx (WASM compatible)
33//! - `models-text`: Text embedding models (Model2Vec potion-base-8M)
34//! - `models-image`: Image embedding models (MobileNetV3-Small)
35//! - `wasm`: WebAssembly bindings (includes embeddings)
36//! - `python`: Python bindings via PyO3 (includes embeddings + numpy)
37//! - `ffi`: C FFI bindings
38//!
39//! ## Embedding Encoding Example
40//!
41//! ```rust,ignore
42//! use elid::embeddings::{encode, Profile, hamming_distance};
43//!
44//! // Get embeddings from your ML model
45//! let embedding1 = model.embed("Hello, world!")?;
46//! let embedding2 = model.embed("Hello, universe!")?;
47//!
48//! // Encode to sortable ELIDs
49//! let profile = Profile::default(); // Mini128
50//! let elid1 = encode(&embedding1, &profile)?;
51//! let elid2 = encode(&embedding2, &profile)?;
52//!
53//! // Compare via Hamming distance (lower = more similar)
54//! let distance = hamming_distance(&elid1, &elid2)?;
55//! ```
56//!
57//! ## String Similarity Example
58//!
59//! ```rust
60//! use elid::{levenshtein, normalized_levenshtein, jaro_winkler, simhash, simhash_similarity};
61//!
62//! let distance = levenshtein("kitten", "sitting");
63//! assert_eq!(distance, 3);
64//!
65//! let similarity = normalized_levenshtein("kitten", "sitting");
66//! assert!(similarity > 0.5 && similarity < 0.7);
67//!
68//! let jw_similarity = jaro_winkler("martha", "marhta");
69//! assert!(jw_similarity > 0.9);
70//!
71//! // SimHash for numeric database queries
72//! let hash1 = simhash("iPhone 14");
73//! let hash2 = simhash("iPhone 15");
74//! let sim = simhash_similarity("iPhone 14", "iPhone 15");
75//! assert!(sim > 0.8);
76//! ```
77
78#![deny(missing_docs)]
79#![cfg_attr(not(feature = "ffi"), deny(unsafe_code))]
80
81mod strings;
82
83#[cfg(feature = "embeddings")]
84pub mod embeddings;
85
86#[cfg(feature = "wasm")]
87pub mod wasm;
88
89#[cfg(feature = "python")]
90pub mod python;
91
92#[cfg(feature = "ffi")]
93pub mod ffi;
94
95#[cfg(feature = "models")]
96pub mod models;
97
98// Re-export everything from strings for backwards compatibility
99pub use strings::{
100    find_similar_hashes, hamming, jaro, jaro_winkler, jaro_winkler_with_prefix, levenshtein,
101    levenshtein_with_opts, normalized_hamming, normalized_levenshtein, normalized_osa,
102    osa_distance, simhash, simhash_distance, simhash_similarity, SimilarityOpts,
103};
104
105/// Compute the best matching similarity between two strings using multiple algorithms
106/// and return the highest score.
107///
108/// This function runs multiple algorithms and returns the best result, useful when
109/// you're not sure which algorithm will work best for your data.
110///
111/// # Example
112///
113/// ```rust
114/// use elid::best_match;
115///
116/// let score = best_match("hello", "hallo");
117/// assert!(score > 0.7);
118/// ```
119pub fn best_match(a: &str, b: &str) -> f64 {
120    let lev = normalized_levenshtein(a, b);
121    let jw = jaro_winkler(a, b);
122    lev.max(jw)
123}
124
125/// Find the best match for a query string in a list of candidates.
126///
127/// Returns the index and similarity score of the best match.
128///
129/// # Example
130///
131/// ```rust
132/// use elid::find_best_match;
133///
134/// let candidates = vec!["apple", "application", "apply"];
135/// let (idx, score) = find_best_match("app", &candidates);
136/// assert!(score > 0.5);
137/// ```
138pub fn find_best_match(query: &str, candidates: &[&str]) -> (usize, f64) {
139    candidates
140        .iter()
141        .enumerate()
142        .map(|(i, candidate)| (i, best_match(query, candidate)))
143        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
144        .unwrap_or((0, 0.0))
145}
146
147/// Find all matches above a threshold score.
148///
149/// Returns a vector of (index, score) tuples for all candidates above the threshold.
150///
151/// # Example
152///
153/// ```rust
154/// use elid::find_matches_above_threshold;
155///
156/// let candidates = vec!["apple", "application", "apply", "banana"];
157/// let matches = find_matches_above_threshold("app", &candidates, 0.5);
158/// assert!(matches.len() >= 2); // Should match at least "apple" and "apply"
159/// ```
160pub fn find_matches_above_threshold(
161    query: &str,
162    candidates: &[&str],
163    threshold: f64,
164) -> Vec<(usize, f64)> {
165    candidates
166        .iter()
167        .enumerate()
168        .filter_map(|(i, candidate)| {
169            let score = best_match(query, candidate);
170            if score >= threshold {
171                Some((i, score))
172            } else {
173                None
174            }
175        })
176        .collect()
177}
178
179// Python module is defined in python.rs and exported via #[pymodule]
180
#[cfg(test)]
mod tests {
    use super::*;

    // A single substituted character should still yield a high combined score.
    #[test]
    fn test_best_match() {
        assert!(best_match("hello", "hallo") > 0.7);
    }

    // The selected candidate must score above 0.5 and begin with the query text.
    #[test]
    fn test_find_best_match() {
        let pool = ["apple", "application", "apply"];
        let (best_idx, best_score) = find_best_match("app", &pool);
        assert!(best_score > 0.5);
        assert!(pool[best_idx].starts_with("app"));
    }

    // At least two of the candidates are expected to reach the 0.5 threshold.
    #[test]
    fn test_find_matches_above_threshold() {
        let pool = ["apple", "application", "apply", "banana"];
        let hits = find_matches_above_threshold("app", &pool, 0.5);
        assert!(hits.len() >= 2);
    }
}
205}