1use cyanea_core::{CyaneaError, Result};
36
37fn murmurhash3_64(key: &[u8], seed: u64) -> u64 {
43 let mut h = seed;
44 for chunk in key.chunks(8) {
45 let mut k = 0u64;
46 for (i, &b) in chunk.iter().enumerate() {
47 k |= (b as u64) << (i * 8);
48 }
49 k = k.wrapping_mul(0xff51afd7ed558ccd);
50 k ^= k >> 33;
51 k = k.wrapping_mul(0xc4ceb9fe1a85ec53);
52 h ^= k;
53 h = h.wrapping_mul(5).wrapping_add(0xe6546b64);
54 }
55 h ^= h >> 33;
56 h = h.wrapping_mul(0xff51afd7ed558ccd);
57 h ^= h >> 33;
58 h = h.wrapping_mul(0xc4ceb9fe1a85ec53);
59 h ^= h >> 33;
60 h
61}
62
63#[inline]
69fn dna_complement(b: u8) -> u8 {
70 match b {
71 b'A' | b'a' => b'T',
72 b'T' | b't' => b'A',
73 b'C' | b'c' => b'G',
74 b'G' | b'g' => b'C',
75 _ => b'N',
76 }
77}
78
79fn reverse_complement(kmer: &[u8], buf: &mut Vec<u8>) {
81 buf.clear();
82 buf.extend(kmer.iter().rev().map(|&b| dna_complement(b)));
83}
84
85#[inline]
87fn to_upper(b: u8) -> u8 {
88 b.to_ascii_uppercase()
89}
90
91#[inline]
93fn is_dna(b: u8) -> bool {
94 matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't')
95}
96
97fn canonical_kmer_hash(kmer: &[u8], rc_buf: &mut Vec<u8>, seed: u64) -> u64 {
101 let upper: Vec<u8> = kmer.iter().map(|&b| to_upper(b)).collect();
103 let h_fwd = murmurhash3_64(&upper, seed);
104 reverse_complement(&upper, rc_buf);
105 let h_rc = murmurhash3_64(rc_buf, seed);
106 h_fwd.min(h_rc)
107}
108
109#[derive(Debug, Clone)]
119pub struct MinHash {
120 k: usize,
122 sketch_size: usize,
124 hashes: Vec<u64>,
126}
127
128const HASH_SEED: u64 = 42;
129
130impl MinHash {
131 pub fn new(k: usize, sketch_size: usize) -> Result<Self> {
137 if k == 0 {
138 return Err(CyaneaError::InvalidInput(
139 "k-mer size must be at least 1".into(),
140 ));
141 }
142 if sketch_size == 0 {
143 return Err(CyaneaError::InvalidInput(
144 "sketch size must be at least 1".into(),
145 ));
146 }
147 Ok(Self {
148 k,
149 sketch_size,
150 hashes: Vec::new(),
151 })
152 }
153
154 pub fn from_sequence(seq: &[u8], k: usize, sketch_size: usize) -> Result<Self> {
163 let mut mh = Self::new(k, sketch_size)?;
164 mh.add_sequence(seq);
165 Ok(mh)
166 }
167
168 pub fn add_sequence(&mut self, seq: &[u8]) {
173 if seq.len() < self.k {
174 return;
175 }
176
177 let mut rc_buf = Vec::with_capacity(self.k);
178
179 for window in seq.windows(self.k) {
180 if !window.iter().all(|&b| is_dna(b)) {
182 continue;
183 }
184
185 let h = canonical_kmer_hash(window, &mut rc_buf, HASH_SEED);
186 self.insert_hash(h);
187 }
188 }
189
190 fn insert_hash(&mut self, h: u64) {
192 if self.hashes.len() < self.sketch_size {
194 let pos = self.hashes.binary_search(&h).unwrap_or_else(|p| p);
195 if pos < self.hashes.len() && self.hashes[pos] == h {
197 return;
198 }
199 self.hashes.insert(pos, h);
200 } else if h < self.hashes[self.hashes.len() - 1] {
201 let pos = self.hashes.binary_search(&h).unwrap_or_else(|p| p);
203 if pos < self.hashes.len() && self.hashes[pos] == h {
205 return;
206 }
207 self.hashes.insert(pos, h);
208 self.hashes.truncate(self.sketch_size);
209 }
210 }
212
213 pub fn jaccard(&self, other: &MinHash) -> Result<f64> {
222 if self.k != other.k {
223 return Err(CyaneaError::InvalidInput(format!(
224 "incompatible k-mer sizes: {} vs {}",
225 self.k, other.k
226 )));
227 }
228 if self.hashes.is_empty() && other.hashes.is_empty() {
229 return Ok(1.0);
230 }
231 if self.hashes.is_empty() || other.hashes.is_empty() {
232 return Ok(0.0);
233 }
234
235 let max_size = self.sketch_size.max(other.sketch_size);
238 let mut i = 0;
239 let mut j = 0;
240 let mut union_count = 0;
241 let mut intersection_count = 0;
242
243 while union_count < max_size && i < self.hashes.len() && j < other.hashes.len() {
244 let a = self.hashes[i];
245 let b = other.hashes[j];
246 if a < b {
247 i += 1;
248 } else if a > b {
249 j += 1;
250 } else {
251 intersection_count += 1;
253 i += 1;
254 j += 1;
255 }
256 union_count += 1;
257 }
258
259 while union_count < max_size && i < self.hashes.len() {
261 i += 1;
262 union_count += 1;
263 }
264 while union_count < max_size && j < other.hashes.len() {
265 j += 1;
266 union_count += 1;
267 }
268
269 if union_count == 0 {
270 return Ok(0.0);
271 }
272
273 Ok(intersection_count as f64 / union_count as f64)
274 }
275
276 pub fn containment(&self, other: &MinHash) -> Result<f64> {
284 if self.k != other.k {
285 return Err(CyaneaError::InvalidInput(format!(
286 "incompatible k-mer sizes: {} vs {}",
287 self.k, other.k
288 )));
289 }
290 if self.hashes.is_empty() {
291 return Ok(1.0);
292 }
293
294 let mut shared = 0usize;
295 let mut j = 0;
296 for &h in &self.hashes {
297 while j < other.hashes.len() && other.hashes[j] < h {
298 j += 1;
299 }
300 if j < other.hashes.len() && other.hashes[j] == h {
301 shared += 1;
302 }
303 }
304
305 Ok(shared as f64 / self.hashes.len() as f64)
306 }
307
308 pub fn ani(&self, other: &MinHash) -> Result<f64> {
317 let j = self.jaccard(other)?;
318 ani_from_jaccard(j, self.k)
319 }
320
321 pub fn len(&self) -> usize {
323 self.hashes.len()
324 }
325
326 pub fn is_empty(&self) -> bool {
328 self.hashes.is_empty()
329 }
330
331 pub fn k(&self) -> usize {
333 self.k
334 }
335
336 pub fn sketch_size(&self) -> usize {
338 self.sketch_size
339 }
340
341 pub fn hashes(&self) -> &[u64] {
343 &self.hashes
344 }
345}
346
347#[derive(Debug, Clone)]
357pub struct FracMinHash {
358 k: usize,
360 scale: u64,
362 max_hash: u64,
364 hashes: Vec<u64>,
366}
367
368impl FracMinHash {
369 pub fn new(k: usize, scale: u64) -> Result<Self> {
375 if k == 0 {
376 return Err(CyaneaError::InvalidInput(
377 "k-mer size must be at least 1".into(),
378 ));
379 }
380 if scale == 0 {
381 return Err(CyaneaError::InvalidInput(
382 "scale must be at least 1".into(),
383 ));
384 }
385 let max_hash = u64::MAX / scale;
386 Ok(Self {
387 k,
388 scale,
389 max_hash,
390 hashes: Vec::new(),
391 })
392 }
393
394 pub fn from_sequence(seq: &[u8], k: usize, scale: u64) -> Result<Self> {
400 let mut fmh = Self::new(k, scale)?;
401 fmh.add_sequence(seq);
402 Ok(fmh)
403 }
404
405 pub fn add_sequence(&mut self, seq: &[u8]) {
409 if seq.len() < self.k {
410 return;
411 }
412
413 let mut rc_buf = Vec::with_capacity(self.k);
414
415 for window in seq.windows(self.k) {
416 if !window.iter().all(|&b| is_dna(b)) {
417 continue;
418 }
419
420 let h = canonical_kmer_hash(window, &mut rc_buf, HASH_SEED);
421 if h <= self.max_hash {
422 let pos = self.hashes.binary_search(&h).unwrap_or_else(|p| p);
424 if pos >= self.hashes.len() || self.hashes[pos] != h {
425 self.hashes.insert(pos, h);
426 }
427 }
428 }
429 }
430
431 pub fn jaccard(&self, other: &FracMinHash) -> Result<f64> {
440 self.check_compatible(other)?;
441
442 if self.hashes.is_empty() && other.hashes.is_empty() {
443 return Ok(1.0);
444 }
445 if self.hashes.is_empty() || other.hashes.is_empty() {
446 return Ok(0.0);
447 }
448
449 let (intersection, union) = self.intersection_union(other);
450
451 if union == 0 {
452 return Ok(0.0);
453 }
454
455 Ok(intersection as f64 / union as f64)
456 }
457
458 pub fn containment(&self, other: &FracMinHash) -> Result<f64> {
466 self.check_compatible(other)?;
467
468 if self.hashes.is_empty() {
469 return Ok(1.0);
470 }
471
472 let mut shared = 0usize;
473 let mut j = 0;
474 for &h in &self.hashes {
475 while j < other.hashes.len() && other.hashes[j] < h {
476 j += 1;
477 }
478 if j < other.hashes.len() && other.hashes[j] == h {
479 shared += 1;
480 }
481 }
482
483 Ok(shared as f64 / self.hashes.len() as f64)
484 }
485
486 pub fn ani(&self, other: &FracMinHash) -> Result<f64> {
492 let j = self.jaccard(other)?;
493 ani_from_jaccard(j, self.k)
494 }
495
496 pub fn len(&self) -> usize {
498 self.hashes.len()
499 }
500
501 pub fn is_empty(&self) -> bool {
503 self.hashes.is_empty()
504 }
505
506 pub fn k(&self) -> usize {
508 self.k
509 }
510
511 pub fn scale(&self) -> u64 {
513 self.scale
514 }
515
516 pub fn hashes(&self) -> &[u64] {
518 &self.hashes
519 }
520
521 fn check_compatible(&self, other: &FracMinHash) -> Result<()> {
523 if self.k != other.k {
524 return Err(CyaneaError::InvalidInput(format!(
525 "incompatible k-mer sizes: {} vs {}",
526 self.k, other.k
527 )));
528 }
529 if self.scale != other.scale {
530 return Err(CyaneaError::InvalidInput(format!(
531 "incompatible scale values: {} vs {}",
532 self.scale, other.scale
533 )));
534 }
535 Ok(())
536 }
537
538 fn intersection_union(&self, other: &FracMinHash) -> (usize, usize) {
540 let mut i = 0;
541 let mut j = 0;
542 let mut intersection = 0;
543 let mut union = 0;
544
545 while i < self.hashes.len() && j < other.hashes.len() {
546 let a = self.hashes[i];
547 let b = other.hashes[j];
548 if a < b {
549 i += 1;
550 } else if a > b {
551 j += 1;
552 } else {
553 intersection += 1;
554 i += 1;
555 j += 1;
556 }
557 union += 1;
558 }
559
560 union += (self.hashes.len() - i) + (other.hashes.len() - j);
562
563 (intersection, union)
564 }
565}
566
567fn ani_from_jaccard(j: f64, k: usize) -> Result<f64> {
577 if j <= 0.0 {
578 return Err(CyaneaError::InvalidInput(
579 "cannot estimate ANI when Jaccard similarity is zero".into(),
580 ));
581 }
582 if (j - 1.0).abs() < f64::EPSILON {
583 return Ok(1.0);
584 }
585
586 let mash_distance = -((2.0 * j) / (1.0 + j)).ln() / (k as f64);
587 let ani = 1.0 - mash_distance;
588
589 Ok(ani.clamp(0.0, 1.0))
591}
592
593#[cfg(test)]
598mod tests {
599 use super::*;
600
601 #[test]
604 fn minhash_k_zero_error() {
605 assert!(MinHash::new(0, 100).is_err());
606 }
607
608 #[test]
609 fn minhash_sketch_size_zero_error() {
610 assert!(MinHash::new(7, 0).is_err());
611 }
612
613 #[test]
614 fn fracminhash_k_zero_error() {
615 assert!(FracMinHash::new(0, 10).is_err());
616 }
617
618 #[test]
619 fn fracminhash_scale_zero_error() {
620 assert!(FracMinHash::new(7, 0).is_err());
621 }
622
623 #[test]
626 fn minhash_identical_jaccard_one() {
627 let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
628 let a = MinHash::from_sequence(seq, 7, 100).unwrap();
629 let b = MinHash::from_sequence(seq, 7, 100).unwrap();
630 let j = a.jaccard(&b).unwrap();
631 assert!(
632 (j - 1.0).abs() < 1e-10,
633 "expected Jaccard ~1.0, got {}",
634 j
635 );
636 }
637
638 #[test]
641 fn minhash_disjoint_jaccard_near_zero() {
642 let seq_a = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
644 let seq_b = b"CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC";
645 let a = MinHash::from_sequence(seq_a, 7, 100).unwrap();
646 let b = MinHash::from_sequence(seq_b, 7, 100).unwrap();
647 let j = a.jaccard(&b).unwrap();
648 assert!(
649 j < 0.01,
650 "expected Jaccard ~0.0 for disjoint sequences, got {}",
651 j
652 );
653 }
654
655 #[test]
658 fn minhash_sketch_size_capped() {
659 let seq = b"ACGTACGTAAACCCGGGTTTACGTACGTAAACCCGGGTTTACGTACGT";
661 let mh = MinHash::from_sequence(seq, 5, 10).unwrap();
662 assert!(
663 mh.len() <= 10,
664 "sketch should have at most 10 hashes, got {}",
665 mh.len()
666 );
667 assert!(
669 mh.len() > 0,
670 "sketch should not be empty for a non-trivial sequence"
671 );
672 }
673
674 #[test]
677 fn minhash_containment_subset() {
678 let seq_a = b"ACGTACGTACGTACGT";
680 let seq_b = b"ACGTACGTACGTACGTTTTTCCCCAAAAGGGGG";
681 let a = MinHash::from_sequence(seq_a, 5, 1000).unwrap();
682 let b = MinHash::from_sequence(seq_b, 5, 1000).unwrap();
683 let c = a.containment(&b).unwrap();
684 assert!(
685 c > 0.9,
686 "expected containment ~1.0 for subset, got {}",
687 c
688 );
689 }
690
691 #[test]
694 fn minhash_ani_identical_is_one() {
695 let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
696 let a = MinHash::from_sequence(seq, 7, 100).unwrap();
697 let b = MinHash::from_sequence(seq, 7, 100).unwrap();
698 let ani = a.ani(&b).unwrap();
699 assert!(
700 (ani - 1.0).abs() < 1e-10,
701 "expected ANI ~1.0, got {}",
702 ani
703 );
704 }
705
706 #[test]
709 fn minhash_incompatible_k_error() {
710 let a = MinHash::from_sequence(b"ACGTACGT", 3, 10).unwrap();
711 let b = MinHash::from_sequence(b"ACGTACGT", 5, 10).unwrap();
712 assert!(a.jaccard(&b).is_err());
713 }
714
715 #[test]
718 fn canonical_kmers_same_sketch() {
719 let fwd = b"ACGTACGTACGTACGTACGTACGT";
720 let rc: Vec<u8> = fwd.iter().rev().map(|&b| dna_complement(b)).collect();
722
723 let a = MinHash::from_sequence(fwd, 7, 100).unwrap();
724 let b = MinHash::from_sequence(&rc, 7, 100).unwrap();
725
726 assert_eq!(
727 a.hashes(),
728 b.hashes(),
729 "forward and reverse complement should produce identical sketches"
730 );
731 }
732
733 #[test]
736 fn fracminhash_scales_properly() {
737 let seq = b"ACGTACGTAAACCCGGGTTTACGTACGTAAACCCGGGTTTACGTACGT";
738 let all = FracMinHash::from_sequence(seq, 5, 1).unwrap();
740 let tenth = FracMinHash::from_sequence(seq, 5, 10).unwrap();
742
743 assert!(
744 tenth.len() <= all.len(),
745 "scale=10 should keep fewer hashes than scale=1: {} vs {}",
746 tenth.len(),
747 all.len()
748 );
749 }
750
751 #[test]
754 fn fracminhash_identical_jaccard_one() {
755 let seq = b"ACGTACGTACGTACGTACGTACGTACGTACGT";
756 let a = FracMinHash::from_sequence(seq, 7, 2).unwrap();
757 let b = FracMinHash::from_sequence(seq, 7, 2).unwrap();
758 let j = a.jaccard(&b).unwrap();
759 assert!(
760 (j - 1.0).abs() < 1e-10,
761 "expected Jaccard ~1.0, got {}",
762 j
763 );
764 }
765
766 #[test]
769 fn fracminhash_containment_subset() {
770 let seq_a = b"ACGTACGTACGTACGT";
771 let seq_b = b"ACGTACGTACGTACGTTTTTCCCCAAAAGGGGG";
772 let a = FracMinHash::from_sequence(seq_a, 5, 1).unwrap();
773 let b = FracMinHash::from_sequence(seq_b, 5, 1).unwrap();
774 let c = a.containment(&b).unwrap();
775 assert!(
776 c > 0.9,
777 "expected containment ~1.0 for subset, got {}",
778 c
779 );
780 }
781
782 #[test]
785 fn fracminhash_incompatible_scale_error() {
786 let a = FracMinHash::from_sequence(b"ACGTACGT", 5, 10).unwrap();
787 let b = FracMinHash::from_sequence(b"ACGTACGT", 5, 20).unwrap();
788 assert!(a.jaccard(&b).is_err());
789 }
790
791 #[test]
794 fn jaccard_in_valid_range() {
795 let seq_a = b"ACGTACGTACGTACGTAAAA";
796 let seq_b = b"ACGTACGTACGTACGTCCCC";
797 let a = MinHash::from_sequence(seq_a, 5, 100).unwrap();
798 let b = MinHash::from_sequence(seq_b, 5, 100).unwrap();
799 let j = a.jaccard(&b).unwrap();
800 assert!(
801 (0.0..=1.0).contains(&j),
802 "Jaccard should be in [0,1], got {}",
803 j
804 );
805 assert!(j > 0.0, "sequences share k-mers, Jaccard should be > 0");
807 assert!(j < 1.0, "sequences differ, Jaccard should be < 1");
808 }
809
810 #[test]
813 fn minhash_empty_sketch() {
814 let mh = MinHash::new(7, 100).unwrap();
815 assert!(mh.is_empty());
816 assert_eq!(mh.len(), 0);
817 }
818
819 #[test]
822 fn murmurhash3_deterministic() {
823 let key = b"ACGTACGT";
824 let h1 = murmurhash3_64(key, 42);
825 let h2 = murmurhash3_64(key, 42);
826 assert_eq!(h1, h2);
827
828 let h3 = murmurhash3_64(key, 99);
830 assert_ne!(h1, h3);
831 }
832}