cyanea_seq/
minhash.rs

1//! MinHash and FracMinHash sketching for rapid genome comparison.
2//!
3//! Implements bottom-k MinHash and scaled FracMinHash for estimating Jaccard
4//! similarity, containment, and average nucleotide identity (ANI) between
5//! DNA sequences without full alignment.
6//!
7//! # Algorithms
8//!
9//! - **MinHash** — bottom-k sketch: keeps the `sketch_size` smallest hash values
10//!   from the set of canonical k-mers. Jaccard similarity is estimated as the
11//!   fraction of shared hashes in the union of the two sketches.
12//!
13//! - **FracMinHash** — scaled sketch: keeps all hash values below
14//!   `u64::MAX / scale`. This allows unbiased containment estimation and works
15//!   well for genomes of very different sizes.
16//!
17//! Both use canonical k-mers (min of forward and reverse complement hash) so
18//! that a sequence and its reverse complement produce identical sketches.
19//!
20//! # Example
21//!
22//! ```
23//! use cyanea_seq::minhash::{MinHash, FracMinHash};
24//!
25//! let seq_a = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
26//! let seq_b = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
27//!
28//! let sketch_a = MinHash::from_sequence(seq_a, 7, 100).unwrap();
29//! let sketch_b = MinHash::from_sequence(seq_b, 7, 100).unwrap();
30//!
31//! let jaccard = sketch_a.jaccard(&sketch_b).unwrap();
32//! assert!((jaccard - 1.0).abs() < 1e-10);
33//! ```
34
35use cyanea_core::{CyaneaError, Result};
36
37// ---------------------------------------------------------------------------
38// MurmurHash3-style hash function (inline, no external deps)
39// ---------------------------------------------------------------------------
40
41/// MurmurHash3 finalizer-based 64-bit hash.
42fn murmurhash3_64(key: &[u8], seed: u64) -> u64 {
43    let mut h = seed;
44    for chunk in key.chunks(8) {
45        let mut k = 0u64;
46        for (i, &b) in chunk.iter().enumerate() {
47            k |= (b as u64) << (i * 8);
48        }
49        k = k.wrapping_mul(0xff51afd7ed558ccd);
50        k ^= k >> 33;
51        k = k.wrapping_mul(0xc4ceb9fe1a85ec53);
52        h ^= k;
53        h = h.wrapping_mul(5).wrapping_add(0xe6546b64);
54    }
55    h ^= h >> 33;
56    h = h.wrapping_mul(0xff51afd7ed558ccd);
57    h ^= h >> 33;
58    h = h.wrapping_mul(0xc4ceb9fe1a85ec53);
59    h ^= h >> 33;
60    h
61}
62
63// ---------------------------------------------------------------------------
64// DNA complement and canonical k-mer helpers
65// ---------------------------------------------------------------------------
66
67/// DNA complement for canonical k-mer computation (ACGT only).
68#[inline]
69fn dna_complement(b: u8) -> u8 {
70    match b {
71        b'A' | b'a' => b'T',
72        b'T' | b't' => b'A',
73        b'C' | b'c' => b'G',
74        b'G' | b'g' => b'C',
75        _ => b'N',
76    }
77}
78
79/// Compute the reverse complement of a k-mer in place into a buffer.
80fn reverse_complement(kmer: &[u8], buf: &mut Vec<u8>) {
81    buf.clear();
82    buf.extend(kmer.iter().rev().map(|&b| dna_complement(b)));
83}
84
85/// Normalize a byte to uppercase.
86#[inline]
87fn to_upper(b: u8) -> u8 {
88    b.to_ascii_uppercase()
89}
90
91/// Check if a byte is a valid DNA base (ACGT, case-insensitive).
92#[inline]
93fn is_dna(b: u8) -> bool {
94    matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't')
95}
96
97/// Hash a k-mer canonically: hash both forward and reverse complement, return
98/// the minimum. This ensures a sequence and its reverse complement produce the
99/// same sketch.
100fn canonical_kmer_hash(kmer: &[u8], rc_buf: &mut Vec<u8>, seed: u64) -> u64 {
101    // Uppercase the k-mer for consistent hashing
102    let upper: Vec<u8> = kmer.iter().map(|&b| to_upper(b)).collect();
103    let h_fwd = murmurhash3_64(&upper, seed);
104    reverse_complement(&upper, rc_buf);
105    let h_rc = murmurhash3_64(rc_buf, seed);
106    h_fwd.min(h_rc)
107}
108
109// ---------------------------------------------------------------------------
110// MinHash — bottom-k sketch
111// ---------------------------------------------------------------------------
112
113/// Bottom-k MinHash sketch for rapid genome comparison.
114///
115/// Keeps the `sketch_size` smallest canonical k-mer hash values from a DNA
116/// sequence. Jaccard similarity between two sketches is estimated from the
117/// overlap of their bottom-k hash sets.
118#[derive(Debug, Clone)]
119pub struct MinHash {
120    /// K-mer size.
121    k: usize,
122    /// Number of minimum hashes to keep.
123    sketch_size: usize,
124    /// Sorted bottom-k hash values (ascending).
125    hashes: Vec<u64>,
126}
127
128const HASH_SEED: u64 = 42;
129
130impl MinHash {
131    /// Create a new empty MinHash sketch.
132    ///
133    /// # Errors
134    ///
135    /// Returns an error if `k == 0` or `sketch_size == 0`.
136    pub fn new(k: usize, sketch_size: usize) -> Result<Self> {
137        if k == 0 {
138            return Err(CyaneaError::InvalidInput(
139                "k-mer size must be at least 1".into(),
140            ));
141        }
142        if sketch_size == 0 {
143            return Err(CyaneaError::InvalidInput(
144                "sketch size must be at least 1".into(),
145            ));
146        }
147        Ok(Self {
148            k,
149            sketch_size,
150            hashes: Vec::new(),
151        })
152    }
153
154    /// Build a MinHash sketch from a DNA sequence.
155    ///
156    /// Non-ACGT bases act as k-mer break points (k-mers spanning them are
157    /// skipped). Input is case-insensitive.
158    ///
159    /// # Errors
160    ///
161    /// Returns an error if `k == 0` or `sketch_size == 0`.
162    pub fn from_sequence(seq: &[u8], k: usize, sketch_size: usize) -> Result<Self> {
163        let mut mh = Self::new(k, sketch_size)?;
164        mh.add_sequence(seq);
165        Ok(mh)
166    }
167
168    /// Add k-mers from a DNA sequence to this sketch.
169    ///
170    /// Non-ACGT bases act as k-mer break points. The sketch is maintained as a
171    /// sorted bottom-k set.
172    pub fn add_sequence(&mut self, seq: &[u8]) {
173        if seq.len() < self.k {
174            return;
175        }
176
177        let mut rc_buf = Vec::with_capacity(self.k);
178
179        for window in seq.windows(self.k) {
180            // Skip windows containing non-DNA bases
181            if !window.iter().all(|&b| is_dna(b)) {
182                continue;
183            }
184
185            let h = canonical_kmer_hash(window, &mut rc_buf, HASH_SEED);
186            self.insert_hash(h);
187        }
188    }
189
190    /// Insert a hash value into the bottom-k set.
191    fn insert_hash(&mut self, h: u64) {
192        // If we haven't filled the sketch yet, insert and maintain sort
193        if self.hashes.len() < self.sketch_size {
194            let pos = self.hashes.binary_search(&h).unwrap_or_else(|p| p);
195            // Skip duplicates
196            if pos < self.hashes.len() && self.hashes[pos] == h {
197                return;
198            }
199            self.hashes.insert(pos, h);
200        } else if h < self.hashes[self.hashes.len() - 1] {
201            // h is smaller than the current max in our bottom-k set
202            let pos = self.hashes.binary_search(&h).unwrap_or_else(|p| p);
203            // Skip duplicates
204            if pos < self.hashes.len() && self.hashes[pos] == h {
205                return;
206            }
207            self.hashes.insert(pos, h);
208            self.hashes.truncate(self.sketch_size);
209        }
210        // else: h >= current max and sketch is full, discard
211    }
212
213    /// Estimate Jaccard similarity between this sketch and another.
214    ///
215    /// Uses the merge-based estimator: merge both sorted hash arrays, count
216    /// how many of the bottom-k values from the union appear in both sketches.
217    ///
218    /// # Errors
219    ///
220    /// Returns an error if the sketches have different `k` values.
221    pub fn jaccard(&self, other: &MinHash) -> Result<f64> {
222        if self.k != other.k {
223            return Err(CyaneaError::InvalidInput(format!(
224                "incompatible k-mer sizes: {} vs {}",
225                self.k, other.k
226            )));
227        }
228        if self.hashes.is_empty() && other.hashes.is_empty() {
229            return Ok(1.0);
230        }
231        if self.hashes.is_empty() || other.hashes.is_empty() {
232            return Ok(0.0);
233        }
234
235        // Merge the two sorted hash lists, take the smallest `sketch_size`
236        // from the union, count how many appear in both.
237        let max_size = self.sketch_size.max(other.sketch_size);
238        let mut i = 0;
239        let mut j = 0;
240        let mut union_count = 0;
241        let mut intersection_count = 0;
242
243        while union_count < max_size && i < self.hashes.len() && j < other.hashes.len() {
244            let a = self.hashes[i];
245            let b = other.hashes[j];
246            if a < b {
247                i += 1;
248            } else if a > b {
249                j += 1;
250            } else {
251                // Equal
252                intersection_count += 1;
253                i += 1;
254                j += 1;
255            }
256            union_count += 1;
257        }
258
259        // Fill remaining union slots from whichever list still has elements
260        while union_count < max_size && i < self.hashes.len() {
261            i += 1;
262            union_count += 1;
263        }
264        while union_count < max_size && j < other.hashes.len() {
265            j += 1;
266            union_count += 1;
267        }
268
269        if union_count == 0 {
270            return Ok(0.0);
271        }
272
273        Ok(intersection_count as f64 / union_count as f64)
274    }
275
276    /// Estimate containment of `self` in `other`.
277    ///
278    /// Containment C(A, B) = |A intersect B| / |A|.
279    ///
280    /// # Errors
281    ///
282    /// Returns an error if the sketches have different `k` values.
283    pub fn containment(&self, other: &MinHash) -> Result<f64> {
284        if self.k != other.k {
285            return Err(CyaneaError::InvalidInput(format!(
286                "incompatible k-mer sizes: {} vs {}",
287                self.k, other.k
288            )));
289        }
290        if self.hashes.is_empty() {
291            return Ok(1.0);
292        }
293
294        let mut shared = 0usize;
295        let mut j = 0;
296        for &h in &self.hashes {
297            while j < other.hashes.len() && other.hashes[j] < h {
298                j += 1;
299            }
300            if j < other.hashes.len() && other.hashes[j] == h {
301                shared += 1;
302            }
303        }
304
305        Ok(shared as f64 / self.hashes.len() as f64)
306    }
307
308    /// Estimate average nucleotide identity (ANI) from Jaccard similarity.
309    ///
310    /// Uses the Mash formula: `ANI = 1 + (2/k) * ln(2J / (1 + J))`
311    ///
312    /// # Errors
313    ///
314    /// Returns an error if the sketches have different `k` values, or if the
315    /// Jaccard similarity is zero (ANI undefined).
316    pub fn ani(&self, other: &MinHash) -> Result<f64> {
317        let j = self.jaccard(other)?;
318        ani_from_jaccard(j, self.k)
319    }
320
321    /// Number of hash values currently in the sketch.
322    pub fn len(&self) -> usize {
323        self.hashes.len()
324    }
325
326    /// Whether the sketch is empty (no k-mers hashed).
327    pub fn is_empty(&self) -> bool {
328        self.hashes.is_empty()
329    }
330
331    /// The k-mer size used by this sketch.
332    pub fn k(&self) -> usize {
333        self.k
334    }
335
336    /// The target sketch size (bottom-k parameter).
337    pub fn sketch_size(&self) -> usize {
338        self.sketch_size
339    }
340
341    /// Return a reference to the sorted hash values.
342    pub fn hashes(&self) -> &[u64] {
343        &self.hashes
344    }
345}
346
347// ---------------------------------------------------------------------------
348// FracMinHash — scaled sketch
349// ---------------------------------------------------------------------------
350
351/// Scaled (FracMinHash) sketch for rapid genome comparison.
352///
353/// Keeps all canonical k-mer hash values below `u64::MAX / scale`. This gives
354/// a predictable sampling fraction regardless of genome size, making containment
355/// estimation more accurate for genomes of different sizes.
356#[derive(Debug, Clone)]
357pub struct FracMinHash {
358    /// K-mer size.
359    k: usize,
360    /// Scale factor: keep hashes below `u64::MAX / scale`.
361    scale: u64,
362    /// Maximum hash threshold: `u64::MAX / scale`.
363    max_hash: u64,
364    /// Sorted hash values below threshold (ascending).
365    hashes: Vec<u64>,
366}
367
368impl FracMinHash {
369    /// Create a new empty FracMinHash sketch.
370    ///
371    /// # Errors
372    ///
373    /// Returns an error if `k == 0` or `scale == 0`.
374    pub fn new(k: usize, scale: u64) -> Result<Self> {
375        if k == 0 {
376            return Err(CyaneaError::InvalidInput(
377                "k-mer size must be at least 1".into(),
378            ));
379        }
380        if scale == 0 {
381            return Err(CyaneaError::InvalidInput(
382                "scale must be at least 1".into(),
383            ));
384        }
385        let max_hash = u64::MAX / scale;
386        Ok(Self {
387            k,
388            scale,
389            max_hash,
390            hashes: Vec::new(),
391        })
392    }
393
394    /// Build a FracMinHash sketch from a DNA sequence.
395    ///
396    /// # Errors
397    ///
398    /// Returns an error if `k == 0` or `scale == 0`.
399    pub fn from_sequence(seq: &[u8], k: usize, scale: u64) -> Result<Self> {
400        let mut fmh = Self::new(k, scale)?;
401        fmh.add_sequence(seq);
402        Ok(fmh)
403    }
404
405    /// Add k-mers from a DNA sequence to this sketch.
406    ///
407    /// Non-ACGT bases act as k-mer break points.
408    pub fn add_sequence(&mut self, seq: &[u8]) {
409        if seq.len() < self.k {
410            return;
411        }
412
413        let mut rc_buf = Vec::with_capacity(self.k);
414
415        for window in seq.windows(self.k) {
416            if !window.iter().all(|&b| is_dna(b)) {
417                continue;
418            }
419
420            let h = canonical_kmer_hash(window, &mut rc_buf, HASH_SEED);
421            if h <= self.max_hash {
422                // Insert maintaining sorted order, skip duplicates
423                let pos = self.hashes.binary_search(&h).unwrap_or_else(|p| p);
424                if pos >= self.hashes.len() || self.hashes[pos] != h {
425                    self.hashes.insert(pos, h);
426                }
427            }
428        }
429    }
430
431    /// Estimate Jaccard similarity between this sketch and another.
432    ///
433    /// Jaccard = |A intersect B| / |A union B|, computed exactly over the
434    /// hashes below the shared threshold.
435    ///
436    /// # Errors
437    ///
438    /// Returns an error if the sketches have different `k` or `scale` values.
439    pub fn jaccard(&self, other: &FracMinHash) -> Result<f64> {
440        self.check_compatible(other)?;
441
442        if self.hashes.is_empty() && other.hashes.is_empty() {
443            return Ok(1.0);
444        }
445        if self.hashes.is_empty() || other.hashes.is_empty() {
446            return Ok(0.0);
447        }
448
449        let (intersection, union) = self.intersection_union(other);
450
451        if union == 0 {
452            return Ok(0.0);
453        }
454
455        Ok(intersection as f64 / union as f64)
456    }
457
458    /// Estimate containment of `self` in `other`.
459    ///
460    /// Containment C(A, B) = |A intersect B| / |A|.
461    ///
462    /// # Errors
463    ///
464    /// Returns an error if the sketches have different `k` or `scale` values.
465    pub fn containment(&self, other: &FracMinHash) -> Result<f64> {
466        self.check_compatible(other)?;
467
468        if self.hashes.is_empty() {
469            return Ok(1.0);
470        }
471
472        let mut shared = 0usize;
473        let mut j = 0;
474        for &h in &self.hashes {
475            while j < other.hashes.len() && other.hashes[j] < h {
476                j += 1;
477            }
478            if j < other.hashes.len() && other.hashes[j] == h {
479                shared += 1;
480            }
481        }
482
483        Ok(shared as f64 / self.hashes.len() as f64)
484    }
485
486    /// Estimate average nucleotide identity (ANI) from Jaccard similarity.
487    ///
488    /// # Errors
489    ///
490    /// Returns an error if the sketches are incompatible or Jaccard is zero.
491    pub fn ani(&self, other: &FracMinHash) -> Result<f64> {
492        let j = self.jaccard(other)?;
493        ani_from_jaccard(j, self.k)
494    }
495
496    /// Number of hash values in the sketch.
497    pub fn len(&self) -> usize {
498        self.hashes.len()
499    }
500
501    /// Whether the sketch is empty.
502    pub fn is_empty(&self) -> bool {
503        self.hashes.is_empty()
504    }
505
506    /// The k-mer size used by this sketch.
507    pub fn k(&self) -> usize {
508        self.k
509    }
510
511    /// The scale factor.
512    pub fn scale(&self) -> u64 {
513        self.scale
514    }
515
516    /// Return a reference to the sorted hash values.
517    pub fn hashes(&self) -> &[u64] {
518        &self.hashes
519    }
520
521    /// Check that two sketches have compatible parameters.
522    fn check_compatible(&self, other: &FracMinHash) -> Result<()> {
523        if self.k != other.k {
524            return Err(CyaneaError::InvalidInput(format!(
525                "incompatible k-mer sizes: {} vs {}",
526                self.k, other.k
527            )));
528        }
529        if self.scale != other.scale {
530            return Err(CyaneaError::InvalidInput(format!(
531                "incompatible scale values: {} vs {}",
532                self.scale, other.scale
533            )));
534        }
535        Ok(())
536    }
537
538    /// Count intersection and union sizes via merge of sorted hash lists.
539    fn intersection_union(&self, other: &FracMinHash) -> (usize, usize) {
540        let mut i = 0;
541        let mut j = 0;
542        let mut intersection = 0;
543        let mut union = 0;
544
545        while i < self.hashes.len() && j < other.hashes.len() {
546            let a = self.hashes[i];
547            let b = other.hashes[j];
548            if a < b {
549                i += 1;
550            } else if a > b {
551                j += 1;
552            } else {
553                intersection += 1;
554                i += 1;
555                j += 1;
556            }
557            union += 1;
558        }
559
560        // Count remaining elements
561        union += (self.hashes.len() - i) + (other.hashes.len() - j);
562
563        (intersection, union)
564    }
565}
566
567// ---------------------------------------------------------------------------
568// Shared ANI estimation
569// ---------------------------------------------------------------------------
570
571/// Estimate ANI from Jaccard similarity and k-mer size.
572///
573/// Uses the Mash formula: `ANI = 1 + (2/k) * ln(2J / (1 + J))`
574///
575/// Returns an error if Jaccard is zero (ANI undefined due to log(0)).
576fn ani_from_jaccard(j: f64, k: usize) -> Result<f64> {
577    if j <= 0.0 {
578        return Err(CyaneaError::InvalidInput(
579            "cannot estimate ANI when Jaccard similarity is zero".into(),
580        ));
581    }
582    if (j - 1.0).abs() < f64::EPSILON {
583        return Ok(1.0);
584    }
585
586    let mash_distance = -((2.0 * j) / (1.0 + j)).ln() / (k as f64);
587    let ani = 1.0 - mash_distance;
588
589    // Clamp to [0, 1] — numerical edge cases can push slightly outside
590    Ok(ani.clamp(0.0, 1.0))
591}
592
593// ---------------------------------------------------------------------------
594// Tests
595// ---------------------------------------------------------------------------
596
597#[cfg(test)]
598mod tests {
599    use super::*;
600
601    // --- Parameter validation ---
602
603    #[test]
604    fn minhash_k_zero_error() {
605        assert!(MinHash::new(0, 100).is_err());
606    }
607
608    #[test]
609    fn minhash_sketch_size_zero_error() {
610        assert!(MinHash::new(7, 0).is_err());
611    }
612
613    #[test]
614    fn fracminhash_k_zero_error() {
615        assert!(FracMinHash::new(0, 10).is_err());
616    }
617
618    #[test]
619    fn fracminhash_scale_zero_error() {
620        assert!(FracMinHash::new(7, 0).is_err());
621    }
622
623    // --- MinHash: identical sequences ---
624
625    #[test]
626    fn minhash_identical_jaccard_one() {
627        let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
628        let a = MinHash::from_sequence(seq, 7, 100).unwrap();
629        let b = MinHash::from_sequence(seq, 7, 100).unwrap();
630        let j = a.jaccard(&b).unwrap();
631        assert!(
632            (j - 1.0).abs() < 1e-10,
633            "expected Jaccard ~1.0, got {}",
634            j
635        );
636    }
637
638    // --- MinHash: disjoint sequences ---
639
640    #[test]
641    fn minhash_disjoint_jaccard_near_zero() {
642        // Two sequences with no shared k-mers (different base composition)
643        let seq_a = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
644        let seq_b = b"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
645        let a = MinHash::from_sequence(seq_a, 7, 100).unwrap();
646        let b = MinHash::from_sequence(seq_b, 7, 100).unwrap();
647        let j = a.jaccard(&b).unwrap();
648        assert!(
649            j < 0.01,
650            "expected Jaccard ~0.0 for disjoint sequences, got {}",
651            j
652        );
653    }
654
655    // --- MinHash: sketch size correct ---
656
657    #[test]
658    fn minhash_sketch_size_capped() {
659        // A long enough sequence to produce many distinct k-mers
660        let seq = b"ACGTACGTAAACCCGGGTTTACGTACGTAAACCCGGGTTTACGTACGT";
661        let mh = MinHash::from_sequence(seq, 5, 10).unwrap();
662        assert!(
663            mh.len() <= 10,
664            "sketch should have at most 10 hashes, got {}",
665            mh.len()
666        );
667        // With 43 5-mers from this sequence, we should fill the sketch
668        assert!(
669            mh.len() > 0,
670            "sketch should not be empty for a non-trivial sequence"
671        );
672    }
673
674    // --- MinHash: containment of subset ---
675
676    #[test]
677    fn minhash_containment_subset() {
678        // seq_a is a prefix of seq_b — all k-mers of seq_a appear in seq_b
679        let seq_a = b"ACGTACGTACGTACGT";
680        let seq_b = b"ACGTACGTACGTACGTTTTTCCCCAAAAGGGGG";
681        let a = MinHash::from_sequence(seq_a, 5, 1000).unwrap();
682        let b = MinHash::from_sequence(seq_b, 5, 1000).unwrap();
683        let c = a.containment(&b).unwrap();
684        assert!(
685            c > 0.9,
686            "expected containment ~1.0 for subset, got {}",
687            c
688        );
689    }
690
691    // --- MinHash: ANI of identical sequences ---
692
693    #[test]
694    fn minhash_ani_identical_is_one() {
695        let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
696        let a = MinHash::from_sequence(seq, 7, 100).unwrap();
697        let b = MinHash::from_sequence(seq, 7, 100).unwrap();
698        let ani = a.ani(&b).unwrap();
699        assert!(
700            (ani - 1.0).abs() < 1e-10,
701            "expected ANI ~1.0, got {}",
702            ani
703        );
704    }
705
706    // --- MinHash: incompatible k error ---
707
708    #[test]
709    fn minhash_incompatible_k_error() {
710        let a = MinHash::from_sequence(b"ACGTACGT", 3, 10).unwrap();
711        let b = MinHash::from_sequence(b"ACGTACGT", 5, 10).unwrap();
712        assert!(a.jaccard(&b).is_err());
713    }
714
715    // --- Canonical k-mers: sequence and reverse complement produce same sketch ---
716
717    #[test]
718    fn canonical_kmers_same_sketch() {
719        let fwd = b"ACGTACGTACGTACGTACGTACGT";
720        // Manually compute reverse complement
721        let rc: Vec<u8> = fwd.iter().rev().map(|&b| dna_complement(b)).collect();
722
723        let a = MinHash::from_sequence(fwd, 7, 100).unwrap();
724        let b = MinHash::from_sequence(&rc, 7, 100).unwrap();
725
726        assert_eq!(
727            a.hashes(),
728            b.hashes(),
729            "forward and reverse complement should produce identical sketches"
730        );
731    }
732
733    // --- FracMinHash: scaling behavior ---
734
735    #[test]
736    fn fracminhash_scales_properly() {
737        let seq = b"ACGTACGTAAACCCGGGTTTACGTACGTAAACCCGGGTTTACGTACGT";
738        // With scale=1, keep all hashes (max_hash = u64::MAX)
739        let all = FracMinHash::from_sequence(seq, 5, 1).unwrap();
740        // With scale=10, keep ~1/10 of hashes
741        let tenth = FracMinHash::from_sequence(seq, 5, 10).unwrap();
742
743        assert!(
744            tenth.len() <= all.len(),
745            "scale=10 should keep fewer hashes than scale=1: {} vs {}",
746            tenth.len(),
747            all.len()
748        );
749    }
750
751    // --- FracMinHash: identical sequences ---
752
753    #[test]
754    fn fracminhash_identical_jaccard_one() {
755        let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
756        let a = FracMinHash::from_sequence(seq, 7, 2).unwrap();
757        let b = FracMinHash::from_sequence(seq, 7, 2).unwrap();
758        let j = a.jaccard(&b).unwrap();
759        assert!(
760            (j - 1.0).abs() < 1e-10,
761            "expected Jaccard ~1.0, got {}",
762            j
763        );
764    }
765
766    // --- FracMinHash: containment ---
767
768    #[test]
769    fn fracminhash_containment_subset() {
770        let seq_a = b"ACGTACGTACGTACGT";
771        let seq_b = b"ACGTACGTACGTACGTTTTTCCCCAAAAGGGGG";
772        let a = FracMinHash::from_sequence(seq_a, 5, 1).unwrap();
773        let b = FracMinHash::from_sequence(seq_b, 5, 1).unwrap();
774        let c = a.containment(&b).unwrap();
775        assert!(
776            c > 0.9,
777            "expected containment ~1.0 for subset, got {}",
778            c
779        );
780    }
781
782    // --- FracMinHash: incompatible parameters error ---
783
784    #[test]
785    fn fracminhash_incompatible_scale_error() {
786        let a = FracMinHash::from_sequence(b"ACGTACGT", 5, 10).unwrap();
787        let b = FracMinHash::from_sequence(b"ACGTACGT", 5, 20).unwrap();
788        assert!(a.jaccard(&b).is_err());
789    }
790
791    // --- Round-trip: Jaccard in reasonable range ---
792
793    #[test]
794    fn jaccard_in_valid_range() {
795        let seq_a = b"ACGTACGTACGTACGTAAAA";
796        let seq_b = b"ACGTACGTACGTACGTCCCC";
797        let a = MinHash::from_sequence(seq_a, 5, 100).unwrap();
798        let b = MinHash::from_sequence(seq_b, 5, 100).unwrap();
799        let j = a.jaccard(&b).unwrap();
800        assert!(
801            (0.0..=1.0).contains(&j),
802            "Jaccard should be in [0,1], got {}",
803            j
804        );
805        // These sequences share a prefix, so Jaccard should be non-zero
806        assert!(j > 0.0, "sequences share k-mers, Jaccard should be > 0");
807        assert!(j < 1.0, "sequences differ, Jaccard should be < 1");
808    }
809
810    // --- Empty sketch behavior ---
811
812    #[test]
813    fn minhash_empty_sketch() {
814        let mh = MinHash::new(7, 100).unwrap();
815        assert!(mh.is_empty());
816        assert_eq!(mh.len(), 0);
817    }
818
819    // --- Hash function determinism ---
820
821    #[test]
822    fn murmurhash3_deterministic() {
823        let key = b"ACGTACGT";
824        let h1 = murmurhash3_64(key, 42);
825        let h2 = murmurhash3_64(key, 42);
826        assert_eq!(h1, h2);
827
828        // Different seed gives different hash
829        let h3 = murmurhash3_64(key, 99);
830        assert_ne!(h1, h3);
831    }
832}
cyanea_seq/minhash.rs

cyanea_seq/
minhash.rs