1use hashbrown::HashMap;
120
121mod multilen;
122mod split;
123pub use multilen::{MultiLenMatch, MultiLenSeqHash, MultiLenSeqHashBuilder};
124pub use split::{Half, SplitMatch, SplitSeqHash};
125
/// Longest parent sequence accepted by the index (16383 = `POSITION_MASK`,
/// i.e. the largest value representable in the 14-bit position field).
pub const MAX_SEQ_LEN: usize = 16383;

/// Substitution alphabet used when `N` is not allowed.
const VALID_BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];

/// Substitution alphabet used when `N` is allowed.
const VALID_BASES_WITH_N: [u8; 5] = [b'A', b'C', b'G', b'T', b'N'];

// Bit layout of a packed `Entry` (one `u64`), as read/written by the `Entry` methods:
//   bit  63      ambiguous flag
//   bit  62      "is parent" flag
//   bits 48..=61 mismatch position (14 bits)
//   bits 40..=47 original base
//   bits 32..=39 mutated base
//   bits  0..=31 parent index

/// Flag bit marking an entry as ambiguous.
const AMBIGUOUS_BIT: u64 = 1 << 63;
/// Flag bit marking an entry as an exact parent entry.
const IS_PARENT_BIT: u64 = 1 << 62;
/// Shift of the mismatch-position field.
const POSITION_SHIFT: u64 = 48;
/// Mask (after shifting) of the mismatch-position field.
const POSITION_MASK: u64 = 0x3FFF;
/// Shift of the original-base field.
const ORIGINAL_BASE_SHIFT: u64 = 40;
/// Shift of the mutated-base field.
const MUTATED_BASE_SHIFT: u64 = 32;
/// Mask (after shifting) of either base field.
const BASE_MASK: u64 = 0xFF;
/// Mask of the parent-index field (low 32 bits).
const PARENT_IDX_MASK: u64 = 0xFFFFFFFF;
151
/// Result of a successful lookup: which parent was hit and whether the hit
/// was exact or carried a single substitution.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum Match {
    /// The query is identical to the parent at `parent_idx`.
    Exact { parent_idx: usize },
    /// The query differs from the parent at `parent_idx` at position `pos`.
    Mismatch { parent_idx: usize, pos: usize },
}

impl Match {
    /// Index of the matched parent, regardless of match kind.
    #[inline]
    #[must_use]
    pub fn parent_idx(&self) -> usize {
        match *self {
            Match::Exact { parent_idx } => parent_idx,
            Match::Mismatch { parent_idx, .. } => parent_idx,
        }
    }

    /// `true` for [`Match::Exact`], `false` for [`Match::Mismatch`].
    #[inline]
    #[must_use]
    pub fn is_exact(&self) -> bool {
        self.mismatch_pos().is_none()
    }

    /// Position of the substitution, or `None` for an exact match.
    #[inline]
    #[must_use]
    pub fn mismatch_pos(&self) -> Option<usize> {
        if let Match::Mismatch { pos, .. } = *self {
            Some(pos)
        } else {
            None
        }
    }

    /// Hamming distance of the match: `0` when exact, `1` otherwise.
    #[inline]
    #[must_use]
    pub fn hdist(&self) -> usize {
        usize::from(!self.is_exact())
    }
}
201
202#[derive(Debug, Clone, PartialEq, Eq)]
204#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
205pub enum SeqHashError {
206 EmptyParents,
208 InconsistentLength {
210 expected: usize,
211 found: usize,
212 index: usize,
213 },
214 SequenceTooLong { len: usize },
216 DuplicateParent { index: usize, original: usize },
218 InvalidBase { index: usize, pos: usize, base: u8 },
220}
221
222impl std::fmt::Display for SeqHashError {
223 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
224 match self {
225 SeqHashError::EmptyParents => write!(f, "no parent sequences provided"),
226 SeqHashError::InconsistentLength {
227 expected,
228 found,
229 index,
230 } => write!(
231 f,
232 "parent at index {index} has length {found} (expected {expected})"
233 ),
234 SeqHashError::SequenceTooLong { len } => {
235 write!(f, "sequence length {len} exceeds maximum {MAX_SEQ_LEN}")
236 }
237 SeqHashError::DuplicateParent { index, original } => {
238 write!(
239 f,
240 "parent at index {index} is duplicate of parent at index {original}"
241 )
242 }
243 SeqHashError::InvalidBase { index, pos, base } => {
244 write!(
245 f,
246 "invalid base '{}' at position {} in parent {}",
247 *base as char, pos, index
248 )
249 }
250 }
251 }
252}
253
254impl std::error::Error for SeqHashError {}
255
256#[derive(Debug, Clone, Copy, PartialEq, Eq)]
258#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
259struct Entry(u64);
260
261impl Entry {
262 #[inline]
264 fn new_parent(parent_idx: u32) -> Self {
265 Entry(IS_PARENT_BIT | u64::from(parent_idx))
266 }
267
268 #[inline]
270 fn new_mismatch(parent_idx: u32, pos: u16, original_base: u8, mutated_base: u8) -> Self {
271 Entry(
272 (u64::from(pos) << POSITION_SHIFT)
273 | (u64::from(original_base) << ORIGINAL_BASE_SHIFT)
274 | (u64::from(mutated_base) << MUTATED_BASE_SHIFT)
275 | u64::from(parent_idx),
276 )
277 }
278
279 #[inline]
281 fn ambiguous() -> Self {
282 Entry(AMBIGUOUS_BIT)
283 }
284
285 #[inline]
287 fn is_ambiguous(self) -> bool {
288 (self.0 & AMBIGUOUS_BIT) != 0
289 }
290
291 #[inline]
293 fn is_parent(self) -> bool {
294 (self.0 & IS_PARENT_BIT) != 0
295 }
296
297 #[inline]
299 fn parent_idx(self) -> usize {
300 (self.0 & PARENT_IDX_MASK) as usize
301 }
302
303 #[inline]
305 fn position(self) -> usize {
306 ((self.0 >> POSITION_SHIFT) & POSITION_MASK) as usize
307 }
308
309 #[inline]
311 fn original_base(self) -> u8 {
312 ((self.0 >> ORIGINAL_BASE_SHIFT) & BASE_MASK) as u8
313 }
314
315 #[inline]
317 fn mutated_base(self) -> u8 {
318 ((self.0 >> MUTATED_BASE_SHIFT) & BASE_MASK) as u8
319 }
320}
321
/// Hashes a byte sequence to the 64-bit key used by the lookup table
/// (delegates to `fxhash::hash64`).
#[inline]
fn hash_sequence(seq: &[u8]) -> u64 {
    fxhash::hash64(seq)
}
327
/// Returns `true` when `b` is one of `A`, `C`, `G`, `T` in either case.
#[inline]
fn is_valid_base(b: u8) -> bool {
    // Folding to upper case maps a/c/g/t onto A/C/G/T and leaves every other byte
    // outside that set.
    matches!(b.to_ascii_uppercase(), b'A' | b'C' | b'G' | b'T')
}
333
334#[inline]
336fn is_valid_base_with_n(b: u8, allow_n: bool) -> bool {
337 is_valid_base(b) || (allow_n && (b == b'N' || b == b'n'))
338}
339
/// Returns `true` when both sequences have the same length and differ in at
/// most `max_hdist` positions. Stops scanning at the first excess mismatch.
#[inline]
fn within_hamming_distance(seq1: &[u8], seq2: &[u8], max_hdist: usize) -> bool {
    if seq1.len() != seq2.len() {
        return false;
    }
    // The mismatch with zero-based ordinal `max_hdist` is the first one over
    // budget; the pair is within distance exactly when it does not exist.
    seq1.iter()
        .zip(seq2)
        .filter(|(left, right)| left != right)
        .nth(max_hdist)
        .is_none()
}
357
/// Hash-based index over a set of equal-length parent sequences, answering
/// exact and single-substitution lookups.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SeqHash {
    /// All parent sequences concatenated back to back; parent `i` occupies
    /// bytes `i * seq_len .. (i + 1) * seq_len`.
    parents: Vec<u8>,
    /// Number of parent sequences.
    num_parents: usize,
    /// Length shared by every parent sequence.
    seq_len: usize,
    /// Sequence hash -> packed entry (parent, mismatch, or ambiguous marker).
    lookup: HashMap<u64, Entry>,
    /// Ambiguity counter accumulated while the index was built.
    num_ambiguous: usize,
    /// Whether the index was built without single-substitution entries.
    exact_only: bool,
    /// Whether `N` was accepted in parents and used as a substitution base.
    allow_n: bool,
    /// Whether parents were upper-cased before being stored.
    normalize_case: bool,
}
379
380#[derive(Debug, Clone, Copy)]
411pub struct SeqHashBuilder {
412 exact_only: bool,
414 allow_n: bool,
416 normalize_case: bool,
418 threads: usize,
420}
421
422impl Default for SeqHashBuilder {
423 fn default() -> Self {
424 SeqHashBuilder {
425 exact_only: false,
426 allow_n: true,
427 normalize_case: true,
428 threads: 1,
429 }
430 }
431}
432
433impl SeqHashBuilder {
434 #[must_use]
439 pub fn exact(mut self) -> Self {
440 self.exact_only = true;
441 self
442 }
443
444 #[must_use]
450 pub fn exclude_n(mut self) -> Self {
451 self.allow_n = false;
452 self
453 }
454
455 #[must_use]
461 pub fn keep_case(mut self) -> Self {
462 self.normalize_case = false;
463 self
464 }
465
466 #[must_use]
494 #[cfg(feature = "parallel")]
495 pub fn threads(mut self, num_threads: usize) -> Self {
496 self.threads = match num_threads {
497 0 => num_cpus::get(),
498 x => num_cpus::get().min(x),
499 };
500 self
501 }
502
503 pub fn build<S: AsRef<[u8]>>(self, parents: &[S]) -> Result<SeqHash, SeqHashError> {
514 SeqHash::build_internal(
515 parents,
516 self.exact_only,
517 self.allow_n,
518 self.normalize_case,
519 self.threads,
520 )
521 }
522}
523
524impl SeqHash {
525 pub fn new<S: AsRef<[u8]>>(parents: &[S]) -> Result<Self, SeqHashError> {
541 Self::build_internal(parents, false, true, true, 1)
542 }
543
544 fn build_internal<S: AsRef<[u8]>>(
546 parents: &[S],
547 exact_only: bool,
548 allow_n: bool,
549 normalize_case: bool,
550 #[allow(unused_variables)] threads: usize,
551 ) -> Result<Self, SeqHashError> {
552 if parents.is_empty() {
553 return Err(SeqHashError::EmptyParents);
554 }
555
556 let seq_len = parents[0].as_ref().len();
557 if seq_len > MAX_SEQ_LEN {
558 return Err(SeqHashError::SequenceTooLong { len: seq_len });
559 }
560
561 let num_parents = parents.len();
562
563 let mut parent_data = Vec::with_capacity(num_parents * seq_len);
565
566 let estimated_entries = if exact_only {
568 num_parents
569 } else {
570 num_parents * (1 + 3 * seq_len)
571 };
572 let mut lookup: HashMap<u64, Entry> = HashMap::with_capacity(estimated_entries);
573 let mut num_ambiguous = 0;
574
575 Self::initialize_parents(
576 &mut lookup,
577 &mut parent_data,
578 &mut num_ambiguous,
579 parents,
580 seq_len,
581 normalize_case,
582 allow_n,
583 )?;
584
585 if !exact_only {
587 #[cfg(feature = "parallel")]
588 if threads > 1 {
589 Self::initialize_mutations_parallel(
590 &mut lookup,
591 &mut num_ambiguous,
592 &parent_data,
593 seq_len,
594 num_parents,
595 allow_n,
596 threads,
597 );
598 } else {
599 Self::initialize_mutations(
600 &mut lookup,
601 &mut num_ambiguous,
602 &parent_data,
603 seq_len,
604 num_parents,
605 allow_n,
606 );
607 }
608
609 #[cfg(not(feature = "parallel"))]
610 {
611 Self::initialize_mutations(
612 &mut lookup,
613 &mut num_ambiguous,
614 &parent_data,
615 seq_len,
616 num_parents,
617 allow_n,
618 );
619 }
620 }
621
622 Ok(SeqHash {
623 parents: parent_data,
624 num_parents,
625 seq_len,
626 lookup,
627 num_ambiguous,
628 exact_only,
629 allow_n,
630 normalize_case,
631 })
632 }
633
634 fn initialize_parents<S: AsRef<[u8]>>(
636 lookup: &mut HashMap<u64, Entry>,
637 parent_data: &mut Vec<u8>,
638 num_ambiguous: &mut usize,
639 parents: &[S],
640 seq_len: usize,
641 normalize_case: bool,
642 allow_n: bool,
643 ) -> Result<(), SeqHashError> {
644 for (idx, parent) in parents.iter().enumerate() {
646 let seq = parent.as_ref();
647
648 if seq.len() != seq_len {
650 return Err(SeqHashError::InconsistentLength {
651 expected: seq_len,
652 found: seq.len(),
653 index: idx,
654 });
655 }
656
657 let normalized_seq: Vec<u8>;
659 let seq_to_use = if normalize_case {
660 normalized_seq = seq.to_ascii_uppercase();
661 &normalized_seq
662 } else {
663 seq
664 };
665
666 for (pos, &base) in seq_to_use.iter().enumerate() {
668 if !is_valid_base_with_n(base, allow_n) {
669 return Err(SeqHashError::InvalidBase {
670 index: idx,
671 pos,
672 base,
673 });
674 }
675 }
676
677 parent_data.extend_from_slice(seq_to_use);
679
680 let hash = hash_sequence(seq_to_use);
682 if let Some(existing) = lookup.get(&hash) {
683 if existing.is_parent() {
684 let existing_idx = existing.parent_idx();
686 let existing_seq =
687 &parent_data[existing_idx * seq_len..(existing_idx + 1) * seq_len];
688 if existing_seq == seq {
689 return Err(SeqHashError::DuplicateParent {
690 index: idx,
691 original: existing_idx,
692 });
693 }
694 }
695 lookup.insert(hash, Entry::ambiguous());
697 *num_ambiguous += 1;
698 } else {
699 lookup.insert(hash, Entry::new_parent(idx as u32));
700 }
701 }
702
703 Ok(())
704 }
705
706 fn initialize_mutations(
708 lookup: &mut HashMap<u64, Entry>,
709 num_ambiguous: &mut usize,
710 parent_data: &[u8],
711 seq_len: usize,
712 num_parents: usize,
713 allow_n: bool,
714 ) {
715 let mut mutant_seq = vec![0u8; seq_len];
716
717 let mutation_bases: &[u8] = if allow_n {
719 &VALID_BASES_WITH_N
720 } else {
721 &VALID_BASES
722 };
723
724 for parent_idx in 0..num_parents {
725 let parent_start = parent_idx * seq_len;
726 let parent_seq = &parent_data[parent_start..parent_start + seq_len];
727
728 for pos in 0..seq_len {
729 let original_base = parent_seq[pos];
730
731 for &new_base in mutation_bases {
732 if new_base == original_base {
733 continue;
734 }
735
736 mutant_seq.copy_from_slice(parent_seq);
738 mutant_seq[pos] = new_base;
739
740 let hash = hash_sequence(&mutant_seq);
741
742 match lookup.get(&hash) {
743 None => {
744 lookup.insert(
746 hash,
747 Entry::new_mismatch(
748 parent_idx as u32,
749 pos as u16,
750 original_base,
751 new_base,
752 ),
753 );
754 }
755 Some(existing) => {
756 if !existing.is_ambiguous() && !existing.is_parent() {
760 lookup.insert(hash, Entry::ambiguous());
761 *num_ambiguous += 1;
762 }
763 }
764 }
765 }
766 }
767 }
768 }
769
770 #[inline]
774 #[must_use]
775 pub fn query(&self, seq: &[u8]) -> Option<Match> {
776 if seq.len() != self.seq_len {
777 return None;
778 }
779
780 let hash = hash_sequence(seq);
781 let entry = *self.lookup.get(&hash)?;
782
783 if entry.is_ambiguous() {
784 return None;
785 }
786
787 if entry.is_parent() {
788 let parent_idx = entry.parent_idx();
790 let parent_seq = self.get_parent(parent_idx)?;
791 if seq == parent_seq {
792 Some(Match::Exact { parent_idx })
793 } else {
794 None }
796 } else {
797 let parent_idx = entry.parent_idx();
799 let pos = entry.position();
800 let original_base = entry.original_base();
801 let mutated_base = entry.mutated_base();
802
803 let parent_seq = self.get_parent(parent_idx)?;
804
805 if seq[pos] != mutated_base || parent_seq[pos] != original_base {
807 return None;
808 }
809
810 if seq[..pos] != parent_seq[..pos] || seq[pos + 1..] != parent_seq[pos + 1..] {
812 return None;
813 }
814
815 Some(Match::Mismatch { parent_idx, pos })
816 }
817 }
818
819 #[inline]
821 #[must_use]
822 pub fn is_ambiguous(&self, seq: &[u8]) -> bool {
823 if seq.len() != self.seq_len {
824 return false;
825 }
826
827 let hash = hash_sequence(seq);
828 self.lookup.get(&hash).is_some_and(|e| e.is_ambiguous())
829 }
830
831 #[inline]
849 #[must_use]
850 pub fn query_at(&self, seq: &[u8], pos: usize) -> Option<Match> {
851 let end = pos.checked_add(self.seq_len)?;
852 if end > seq.len() {
853 return None;
854 }
855 self.query(&seq[pos..end])
856 }
857
858 #[inline]
878 #[must_use]
879 pub fn query_at_with_remap(&self, seq: &[u8], pos: usize, window: usize) -> Option<Match> {
880 self.query_at_with_remap_offset(seq, pos, window)
881 .map(|(m, _)| m)
882 }
883
884 #[must_use]
905 pub fn query_at_with_remap_offset(
906 &self,
907 seq: &[u8],
908 pos: usize,
909 window: usize,
910 ) -> Option<(Match, isize)> {
911 if let Some(m) = self.query_at(seq, pos) {
913 return Some((m, 0));
914 }
915
916 for offset in 1..=window {
918 if let Some(m) = self.query_at(seq, pos + offset) {
919 return Some((m, offset as isize));
920 }
921 if offset <= pos {
922 if let Some(m) = self.query_at(seq, pos - offset) {
923 return Some((m, -(offset as isize)));
924 }
925 }
926 }
927
928 None
929 }
930
931 #[must_use]
951 pub fn query_sliding(&self, seq: &[u8]) -> Option<(Match, usize)> {
952 self.query_sliding_iter(seq).next()
953 }
954
955 pub fn query_sliding_iter<'a>(
975 &'a self,
976 seq: &'a [u8],
977 ) -> impl Iterator<Item = (Match, usize)> + 'a {
978 let num_positions = if seq.len() >= self.seq_len {
979 seq.len() - self.seq_len + 1
980 } else {
981 0
982 };
983 (0..num_positions)
984 .filter_map(move |pos| self.query(&seq[pos..pos + self.seq_len]).map(|m| (m, pos)))
985 }
986
987 #[inline]
989 #[must_use]
990 pub fn get_parent(&self, idx: usize) -> Option<&[u8]> {
991 if idx >= self.num_parents {
992 return None;
993 }
994 let start = idx * self.seq_len;
995 let end = start + self.seq_len;
996 Some(&self.parents[start..end])
997 }
998
999 #[inline]
1001 pub fn iter_parents(&self) -> impl Iterator<Item = &[u8]> {
1002 self.parents.chunks_exact(self.seq_len)
1003 }
1004
    /// Returns the number of parent sequences held by the index.
    #[inline]
    #[must_use]
    pub fn num_parents(&self) -> usize {
        self.num_parents
    }
1011
    /// Returns the length shared by every parent sequence.
    #[inline]
    #[must_use]
    pub fn seq_len(&self) -> usize {
        self.seq_len
    }
1018
    /// Returns the number of entries in the hash lookup table.
    #[inline]
    #[must_use]
    pub fn num_entries(&self) -> usize {
        self.lookup.len()
    }
1025
    /// Returns the ambiguity counter accumulated while the index was built.
    #[inline]
    #[must_use]
    pub fn num_ambiguous(&self) -> usize {
        self.num_ambiguous
    }
1032
    /// Returns `true` when the index was built without mismatch entries.
    #[inline]
    #[must_use]
    pub fn is_exact_only(&self) -> bool {
        self.exact_only
    }
1039
    /// Returns `true` when the index was built with `N` allowed.
    #[inline]
    #[must_use]
    pub fn allows_n(&self) -> bool {
        self.allow_n
    }
1046
    /// Returns `true` when parents were upper-cased before being stored.
    #[inline]
    #[must_use]
    pub fn normalizes_case(&self) -> bool {
        self.normalize_case
    }
1053
1054 #[inline]
1080 #[must_use]
1081 pub fn is_within_hdist(&self, query: &[u8], parent_idx: usize, hdist: usize) -> bool {
1082 if query.len() != self.seq_len {
1083 return false;
1084 }
1085
1086 let parent_seq = match self.get_parent(parent_idx) {
1087 Some(seq) => seq,
1088 None => return false, };
1090
1091 within_hamming_distance(query, parent_seq, hdist)
1093 }
1094
1095 #[cfg(feature = "serde")]
1109 pub fn save<P: AsRef<std::path::Path>>(&self, path: P) -> std::io::Result<()> {
1110 let bytes = bincode::serialize(self)
1111 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
1112 std::fs::write(path, bytes)
1113 }
1114
1115 #[cfg(feature = "serde")]
1127 pub fn load<P: AsRef<std::path::Path>>(path: P) -> std::io::Result<Self> {
1128 let bytes = std::fs::read(path)?;
1129 bincode::deserialize(&bytes)
1130 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
1131 }
1132}
1133
#[cfg(feature = "parallel")]
impl SeqHash {
    /// Multi-threaded counterpart of `initialize_mutations`.
    ///
    /// The parents are split into contiguous ranges, one per worker. Each
    /// worker builds a private table of substitution entries for its range
    /// (skipping hashes owned by a parent), and the private tables are then
    /// merged into `lookup` on the calling thread.
    ///
    /// The merge follows the single-threaded rules: a hash claimed by more
    /// than one substitution - inside one worker or across workers - ends up
    /// ambiguous, and `num_ambiguous` is bumped once per hash that becomes
    /// ambiguous here.
    ///
    /// # Panics
    ///
    /// Panics if a worker thread panicked.
    fn initialize_mutations_parallel(
        lookup: &mut HashMap<u64, Entry>,
        num_ambiguous: &mut usize,
        parent_data: &[u8],
        seq_len: usize,
        num_parents: usize,
        allow_n: bool,
        threads: usize,
    ) {
        let mutation_bases: &[u8] = if allow_n {
            &VALID_BASES_WITH_N
        } else {
            &VALID_BASES
        };

        // Never more workers than parents, and never zero (division below).
        let threads = threads.min(num_parents).max(1);
        let parents_per_thread = (num_parents / threads).max(1);

        let thread_results: Vec<HashMap<u64, Entry>> = {
            // Workers only read the parent entries, so a shared borrow for the
            // lifetime of the scope is enough; no copy of the table is needed.
            let parent_lookup: &HashMap<u64, Entry> = &*lookup;

            std::thread::scope(|s| {
                let handles: Vec<_> = (0..threads)
                    .map(|tid| {
                        let parent_idx_start = tid * parents_per_thread;
                        // The last worker also takes the remainder.
                        let parent_idx_end = if tid == threads - 1 {
                            num_parents
                        } else {
                            parent_idx_start + parents_per_thread
                        };

                        s.spawn(move || {
                            let estimated_capacity = (parent_idx_end - parent_idx_start)
                                * seq_len
                                * mutation_bases.len();
                            let mut local_lookup: HashMap<u64, Entry> =
                                HashMap::with_capacity(estimated_capacity);
                            let mut mutant_seq = vec![0u8; seq_len];

                            for parent_idx in parent_idx_start..parent_idx_end {
                                let parent_start = parent_idx * seq_len;
                                let parent_seq =
                                    &parent_data[parent_start..parent_start + seq_len];

                                for pos in 0..seq_len {
                                    let original_base = parent_seq[pos];

                                    for &new_base in mutation_bases {
                                        if new_base == original_base {
                                            continue;
                                        }

                                        mutant_seq.copy_from_slice(parent_seq);
                                        mutant_seq[pos] = new_base;
                                        let hash = hash_sequence(&mutant_seq);

                                        // An exact parent always wins over a substitution.
                                        if parent_lookup
                                            .get(&hash)
                                            .is_some_and(|entry| entry.is_parent())
                                        {
                                            continue;
                                        }

                                        match local_lookup.get(&hash).copied() {
                                            None => {
                                                local_lookup.insert(
                                                    hash,
                                                    Entry::new_mismatch(
                                                        parent_idx as u32,
                                                        pos as u16,
                                                        original_base,
                                                        new_base,
                                                    ),
                                                );
                                            }
                                            Some(existing) if !existing.is_ambiguous() => {
                                                // Second claimant inside this worker.
                                                local_lookup.insert(hash, Entry::ambiguous());
                                            }
                                            Some(_) => {}
                                        }
                                    }
                                }
                            }

                            local_lookup
                        })
                    })
                    .collect();

                handles
                    .into_iter()
                    .map(|handle| handle.join().expect("mutation worker thread panicked"))
                    .collect()
            })
        };

        for local_lookup in thread_results {
            for (hash, entry) in local_lookup {
                match lookup.get(&hash).copied() {
                    None => {
                        // First worker to bring this hash; count it if that
                        // worker already found it ambiguous.
                        if entry.is_ambiguous() {
                            *num_ambiguous += 1;
                        }
                        lookup.insert(hash, entry);
                    }
                    Some(existing) => {
                        // Parents win, and an ambiguous hash stays ambiguous
                        // (it has already been counted).
                        if existing.is_parent() || existing.is_ambiguous() {
                            continue;
                        }
                        // `existing` is a unique substitution from an earlier
                        // worker. Any further claimant - unique or already
                        // ambiguous - makes the hash ambiguous.
                        lookup.insert(hash, Entry::ambiguous());
                        *num_ambiguous += 1;
                    }
                }
            }
        }
    }
}
1263
1264#[cfg(test)]
1265mod tests {
1266
1267 use super::*;
1268
1269 #[test]
1270 fn test_empty_parents() {
1271 let parents: Vec<&[u8]> = vec![];
1272 let result = SeqHash::new(&parents);
1273 assert_eq!(result.unwrap_err(), SeqHashError::EmptyParents);
1274 }
1275
1276 #[test]
1277 fn test_inconsistent_length() {
1278 let parents: Vec<&[u8]> = vec![b"ACGT", b"ACGTACGT"];
1279 let result = SeqHash::new(&parents);
1280 assert_eq!(
1281 result.unwrap_err(),
1282 SeqHashError::InconsistentLength {
1283 expected: 4,
1284 found: 8,
1285 index: 1
1286 }
1287 );
1288 }
1289
1290 #[test]
1291 fn test_invalid_base() {
1292 let parents: Vec<&[u8]> = vec![b"ACGX"];
1294 let result = SeqHash::new(&parents);
1295 assert_eq!(
1296 result.unwrap_err(),
1297 SeqHashError::InvalidBase {
1298 index: 0,
1299 pos: 3,
1300 base: b'X'
1301 }
1302 );
1303
1304 let parents_with_n: Vec<&[u8]> = vec![b"ACGN"];
1306 assert!(SeqHash::new(&parents_with_n).is_ok());
1307 }
1308
1309 #[test]
1310 fn test_duplicate_parent() {
1311 let parents: Vec<&[u8]> = vec![b"ACGT", b"GGGG", b"ACGT"];
1312 let result = SeqHash::new(&parents);
1313 assert_eq!(
1314 result.unwrap_err(),
1315 SeqHashError::DuplicateParent {
1316 index: 2,
1317 original: 0
1318 }
1319 );
1320 }
1321
1322 #[test]
1323 fn test_exact_match() {
1324 let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT", b"GGGGCCCCAAAA", b"TTTTAAAACCCC"];
1325
1326 let index = SeqHash::new(&parents).unwrap();
1327
1328 assert_eq!(index.num_parents(), 3);
1329 assert_eq!(index.seq_len(), 12);
1330
1331 assert_eq!(
1333 index.query(b"ACGTACGTACGT"),
1334 Some(Match::Exact { parent_idx: 0 })
1335 );
1336 assert_eq!(
1337 index.query(b"GGGGCCCCAAAA"),
1338 Some(Match::Exact { parent_idx: 1 })
1339 );
1340 assert_eq!(
1341 index.query(b"TTTTAAAACCCC"),
1342 Some(Match::Exact { parent_idx: 2 })
1343 );
1344 }
1345
1346 #[test]
1347 fn test_mismatch_match() {
1348 let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT"];
1349
1350 let index = SeqHash::new(&parents).unwrap();
1351
1352 let result = index.query(b"ACGTACGTACGA");
1354 assert_eq!(
1355 result,
1356 Some(Match::Mismatch {
1357 parent_idx: 0,
1358 pos: 11
1359 })
1360 );
1361
1362 let result = index.query(b"GCGTACGTACGT");
1364 assert_eq!(
1365 result,
1366 Some(Match::Mismatch {
1367 parent_idx: 0,
1368 pos: 0
1369 })
1370 );
1371
1372 let result = index.query(b"ATGTACGTACGT");
1374 assert_eq!(
1375 result,
1376 Some(Match::Mismatch {
1377 parent_idx: 0,
1378 pos: 1
1379 })
1380 );
1381 }
1382
1383 #[test]
1384 fn test_no_match() {
1385 let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT"];
1386
1387 let index = SeqHash::new(&parents).unwrap();
1388
1389 assert_eq!(index.query(b"GCGTACGTACGA"), None);
1391
1392 assert_eq!(index.query(b"TTTTTTTTTTTT"), None);
1394 }
1395
1396 #[test]
1397 fn test_wrong_length_query() {
1398 let parents: Vec<&[u8]> = vec![b"ACGTACGT"];
1399
1400 let index = SeqHash::new(&parents).unwrap();
1401
1402 assert_eq!(index.query(b"ACGT"), None);
1404
1405 assert_eq!(index.query(b"ACGTACGTACGT"), None);
1407 }
1408
1409 #[test]
1410 fn test_ambiguous_detection() {
1411 let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"TCGTACGT"]; let index = SeqHash::new(&parents).unwrap();
1416
1417 assert_eq!(
1419 index.query(b"ACGTACGT"),
1420 Some(Match::Exact { parent_idx: 0 })
1421 );
1422 assert_eq!(
1423 index.query(b"TCGTACGT"),
1424 Some(Match::Exact { parent_idx: 1 })
1425 );
1426
1427 assert_eq!(
1430 index.query(b"ACGTGCGT"),
1431 Some(Match::Mismatch {
1432 parent_idx: 0,
1433 pos: 4
1434 })
1435 );
1436
1437 assert!(index.is_ambiguous(b"CCGTACGT"));
1440 assert_eq!(index.query(b"CCGTACGT"), None);
1441 }
1442
1443 #[test]
1444 fn test_match_methods() {
1445 let exact = Match::Exact { parent_idx: 5 };
1446 assert_eq!(exact.parent_idx(), 5);
1447 assert!(exact.is_exact());
1448 assert_eq!(exact.mismatch_pos(), None);
1449
1450 let mismatch = Match::Mismatch {
1451 parent_idx: 3,
1452 pos: 7,
1453 };
1454 assert_eq!(mismatch.parent_idx(), 3);
1455 assert!(!mismatch.is_exact());
1456 assert_eq!(mismatch.mismatch_pos(), Some(7));
1457 }
1458
1459 #[test]
1460 fn test_get_parent() {
1461 let parents: Vec<&[u8]> = vec![b"ACGT", b"GGGG", b"TTTT"];
1462
1463 let index = SeqHash::new(&parents).unwrap();
1464
1465 assert_eq!(index.get_parent(0), Some(b"ACGT".as_slice()));
1466 assert_eq!(index.get_parent(1), Some(b"GGGG".as_slice()));
1467 assert_eq!(index.get_parent(2), Some(b"TTTT".as_slice()));
1468 assert_eq!(index.get_parent(3), None);
1469 }
1470
1471 #[test]
1472 fn test_entry_encoding() {
1473 let entry = Entry::new_parent(12345);
1475 assert!(entry.is_parent());
1476 assert!(!entry.is_ambiguous());
1477 assert_eq!(entry.parent_idx(), 12345);
1478
1479 let entry = Entry::new_mismatch(999, 100, b'A', b'T');
1481 assert!(!entry.is_parent());
1482 assert!(!entry.is_ambiguous());
1483 assert_eq!(entry.parent_idx(), 999);
1484 assert_eq!(entry.position(), 100);
1485 assert_eq!(entry.original_base(), b'A');
1486 assert_eq!(entry.mutated_base(), b'T');
1487
1488 let entry = Entry::ambiguous();
1490 assert!(entry.is_ambiguous());
1491 }
1492
1493 #[test]
1494 fn test_hash_function() {
1495 assert_eq!(hash_sequence(b"ACGT"), hash_sequence(b"ACGT"));
1497
1498 assert_ne!(hash_sequence(b"ACGT"), hash_sequence(b"ACGA"));
1500 }
1501
1502 #[test]
1503 fn test_num_entries() {
1504 let parents: Vec<&[u8]> = vec![b"ACGT"];
1505
1506 let index = SeqHash::new(&parents).unwrap();
1507
1508 assert!(index.num_entries() >= 1);
1511 assert!(index.num_entries() <= 17); }
1513
1514 #[test]
1515 fn test_all_single_mutations() {
1516 let parents: Vec<&[u8]> = vec![b"AAAA"];
1517 let index = SeqHash::new(&parents).unwrap();
1518
1519 let mutations = [
1521 (b"CAAA", 0),
1522 (b"GAAA", 0),
1523 (b"TAAA", 0),
1524 (b"ACAA", 1),
1525 (b"AGAA", 1),
1526 (b"ATAA", 1),
1527 (b"AACA", 2),
1528 (b"AAGA", 2),
1529 (b"AATA", 2),
1530 (b"AAAC", 3),
1531 (b"AAAG", 3),
1532 (b"AAAT", 3),
1533 ];
1534
1535 for (query, expected_pos) in mutations {
1536 let result = index.query(query);
1537 assert_eq!(
1538 result,
1539 Some(Match::Mismatch {
1540 parent_idx: 0,
1541 pos: expected_pos
1542 }),
1543 "Failed for query {:?}",
1544 std::str::from_utf8(query)
1545 );
1546 }
1547 }
1548
1549 #[test]
1550 fn test_error_display() {
1551 assert_eq!(
1552 SeqHashError::EmptyParents.to_string(),
1553 "no parent sequences provided"
1554 );
1555
1556 assert_eq!(
1557 SeqHashError::InconsistentLength {
1558 expected: 10,
1559 found: 5,
1560 index: 3
1561 }
1562 .to_string(),
1563 "parent at index 3 has length 5 (expected 10)"
1564 );
1565
1566 assert_eq!(
1567 SeqHashError::SequenceTooLong { len: 20000 }.to_string(),
1568 "sequence length 20000 exceeds maximum 16383"
1569 );
1570
1571 assert_eq!(
1572 SeqHashError::DuplicateParent {
1573 index: 5,
1574 original: 2
1575 }
1576 .to_string(),
1577 "parent at index 5 is duplicate of parent at index 2"
1578 );
1579
1580 assert_eq!(
1581 SeqHashError::InvalidBase {
1582 index: 1,
1583 pos: 4,
1584 base: b'N'
1585 }
1586 .to_string(),
1587 "invalid base 'N' at position 4 in parent 1"
1588 );
1589 }
1590
1591 #[test]
1592 fn test_multiple_parents_different_mutations() {
1593 let parents: Vec<&[u8]> = vec![
1595 b"AAAAAAAA", b"CCCCCCCC", b"GGGGGGGG", ];
1599
1600 let index = SeqHash::new(&parents).unwrap();
1601
1602 assert_eq!(
1607 index.query(b"CAAAAAAA"),
1608 Some(Match::Mismatch {
1609 parent_idx: 0,
1610 pos: 0
1611 })
1612 );
1613
1614 assert_eq!(
1616 index.query(b"ACCCCCCC"),
1617 Some(Match::Mismatch {
1618 parent_idx: 1,
1619 pos: 0
1620 })
1621 );
1622
1623 assert_eq!(
1625 index.query(b"AGGGGGGG"),
1626 Some(Match::Mismatch {
1627 parent_idx: 2,
1628 pos: 0
1629 })
1630 );
1631 }
1632
1633 #[test]
1634 fn test_is_ambiguous_wrong_length() {
1635 let parents: Vec<&[u8]> = vec![b"ACGT"];
1636 let index = SeqHash::new(&parents).unwrap();
1637
1638 assert!(!index.is_ambiguous(b"AC"));
1640 assert!(!index.is_ambiguous(b"ACGTACGT"));
1641 }
1642
1643 #[test]
1644 fn test_query_at() {
1645 let parents: Vec<&[u8]> = vec![b"ACGT", b"GGGG"];
1646 let index = SeqHash::new(&parents).unwrap();
1647
1648 let read = b"NNACGTNN";
1650 assert_eq!(
1651 index.query_at(read, 2),
1652 Some(Match::Exact { parent_idx: 0 })
1653 );
1654
1655 let read = b"ACGTNNNN";
1657 assert_eq!(
1658 index.query_at(read, 0),
1659 Some(Match::Exact { parent_idx: 0 })
1660 );
1661
1662 let read = b"NNNNACGT";
1664 assert_eq!(
1665 index.query_at(read, 4),
1666 Some(Match::Exact { parent_idx: 0 })
1667 );
1668
1669 let read = b"NNACGANN"; assert_eq!(
1672 index.query_at(read, 2),
1673 Some(Match::Mismatch {
1674 parent_idx: 0,
1675 pos: 3
1676 })
1677 );
1678
1679 let read = b"NNTTTTNN";
1681 assert_eq!(index.query_at(read, 2), None);
1682
1683 let read = b"NNGGGGNN";
1685 assert_eq!(
1686 index.query_at(read, 2),
1687 Some(Match::Exact { parent_idx: 1 })
1688 );
1689 }
1690
1691 #[test]
1692 fn test_query_at_bounds() {
1693 let parents: Vec<&[u8]> = vec![b"ACGT"];
1694 let index = SeqHash::new(&parents).unwrap();
1695
1696 let read = b"NNNNACGT"; assert_eq!(index.query_at(read, 5), None);
1700 assert_eq!(index.query_at(read, 6), None);
1701 assert_eq!(index.query_at(read, 100), None);
1702
1703 assert_eq!(
1705 index.query_at(read, 4),
1706 Some(Match::Exact { parent_idx: 0 })
1707 );
1708
1709 assert_eq!(index.query_at(b"", 0), None);
1711
1712 assert_eq!(index.query_at(b"AC", 0), None);
1714 }
1715
1716 #[test]
1717 fn test_query_at_with_remap() {
1718 let parents: Vec<&[u8]> = vec![b"ACGT"];
1719 let index = SeqHash::new(&parents).unwrap();
1720
1721 let read = b"NNACGTNNNN";
1723 assert_eq!(
1724 index.query_at_with_remap(read, 2, 3),
1725 Some(Match::Exact { parent_idx: 0 })
1726 );
1727
1728 let read = b"NNNACGTNNN";
1730 assert_eq!(
1731 index.query_at_with_remap(read, 2, 3),
1732 Some(Match::Exact { parent_idx: 0 })
1733 );
1734
1735 let read = b"NACGTNNNNN";
1737 assert_eq!(
1738 index.query_at_with_remap(read, 2, 3),
1739 Some(Match::Exact { parent_idx: 0 })
1740 );
1741
1742 let read = b"NNNNACGTNN";
1744 assert_eq!(
1745 index.query_at_with_remap(read, 2, 3),
1746 Some(Match::Exact { parent_idx: 0 })
1747 );
1748
1749 let read = b"ACGTNNNNNN";
1751 assert_eq!(
1752 index.query_at_with_remap(read, 2, 3),
1753 Some(Match::Exact { parent_idx: 0 })
1754 );
1755
1756 let read = b"NNNNNNACGT";
1758 assert_eq!(index.query_at_with_remap(read, 2, 2), None);
1759
1760 let read = b"NNNNNACGTN";
1762 assert_eq!(
1763 index.query_at_with_remap(read, 2, 3),
1764 Some(Match::Exact { parent_idx: 0 })
1765 );
1766 }
1767
1768 #[test]
1769 fn test_query_at_with_remap_offset() {
1770 let parents: Vec<&[u8]> = vec![b"ACGT"];
1771 let index = SeqHash::new(&parents).unwrap();
1772
1773 let read = b"NNACGTNNNN";
1775 assert_eq!(
1776 index.query_at_with_remap_offset(read, 2, 3),
1777 Some((Match::Exact { parent_idx: 0 }, 0))
1778 );
1779
1780 let read = b"NNNACGTNNN";
1782 assert_eq!(
1783 index.query_at_with_remap_offset(read, 2, 3),
1784 Some((Match::Exact { parent_idx: 0 }, 1))
1785 );
1786
1787 let read = b"NACGTNNNNN";
1789 assert_eq!(
1790 index.query_at_with_remap_offset(read, 2, 3),
1791 Some((Match::Exact { parent_idx: 0 }, -1))
1792 );
1793
1794 let read = b"NNNNACGTNN";
1796 assert_eq!(
1797 index.query_at_with_remap_offset(read, 2, 3),
1798 Some((Match::Exact { parent_idx: 0 }, 2))
1799 );
1800
1801 let read = b"ACGTNNNNNN";
1803 assert_eq!(
1804 index.query_at_with_remap_offset(read, 2, 3),
1805 Some((Match::Exact { parent_idx: 0 }, -2))
1806 );
1807
1808 let read = b"NNNACGANN"; let result = index.query_at_with_remap_offset(read, 2, 3);
1811 assert_eq!(
1812 result,
1813 Some((
1814 Match::Mismatch {
1815 parent_idx: 0,
1816 pos: 3
1817 },
1818 1
1819 ))
1820 );
1821
1822 let read = b"NNNNNNACGT";
1824 assert_eq!(index.query_at_with_remap_offset(read, 2, 2), None);
1825 }
1826
1827 #[test]
1828 fn test_query_at_with_remap_prefers_direct_hit() {
1829 let parents: Vec<&[u8]> = vec![b"AAAA"];
1832 let index = SeqHash::new(&parents).unwrap();
1833
1834 let read = b"AAAAAAAAAA";
1836
1837 let result = index.query_at_with_remap_offset(read, 3, 3);
1839 assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 0)));
1840 }
1841
1842 #[test]
1843 fn test_query_at_with_remap_edge_cases() {
1844 let parents: Vec<&[u8]> = vec![b"ACGT"];
1845 let index = SeqHash::new(&parents).unwrap();
1846
1847 let read = b"NACGTNNNN";
1849 let result = index.query_at_with_remap_offset(read, 0, 3);
1850 assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 1)));
1851
1852 let read = b"ACGTNNNN";
1854 let result = index.query_at_with_remap_offset(read, 0, 3);
1855 assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 0)));
1856
1857 let read = b"NACGTNNNN";
1859 let result = index.query_at_with_remap_offset(read, 0, 0);
1860 assert_eq!(result, None);
1861
1862 let read = b"ACGTNNNN";
1863 let result = index.query_at_with_remap_offset(read, 0, 0);
1864 assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 0)));
1865 }
1866
/// `query_sliding` scans a read and returns the match together with the
/// position at which it was found, or `None` when nothing matches.
#[test]
fn test_query_sliding() {
    let library: Vec<&[u8]> = vec![b"ACGT", b"GGGG"];
    let hasher = SeqHash::new(&library).unwrap();

    // Shorthand for the expected value of an exact hit.
    let exact = |parent_idx, pos| Some((Match::Exact { parent_idx }, pos));

    // First parent at the start, in the middle and at the end of the read.
    assert_eq!(hasher.query_sliding(b"ACGTNNNN"), exact(0, 0));
    assert_eq!(hasher.query_sliding(b"NNNACGTNNN"), exact(0, 3));
    assert_eq!(hasher.query_sliding(b"NNNNACGT"), exact(0, 4));

    // Second parent at the start of the read.
    assert_eq!(hasher.query_sliding(b"GGGGTTTT"), exact(1, 0));

    // A single substitution (last base of the window) is still located.
    assert_eq!(
        hasher.query_sliding(b"NNACGANN"),
        Some((
            Match::Mismatch {
                parent_idx: 0,
                pos: 3
            },
            2
        ))
    );

    // Reads without any hit, including one shorter than the parents.
    assert_eq!(hasher.query_sliding(b"TTTTTTTT"), None);
    assert_eq!(hasher.query_sliding(b"AC"), None);

    // A read that is exactly one window long.
    assert_eq!(hasher.query_sliding(b"ACGT"), exact(0, 0));
}
1928
/// When several windows of the read match, `query_sliding` reports the one at
/// position 0.
#[test]
fn test_query_sliding_returns_first_match() {
    let library: Vec<&[u8]> = vec![b"AAAA"];
    let hasher = SeqHash::new(&library).unwrap();

    // Nine A's: every 4-base window is an exact copy of the parent.
    let all_a = b"AAAAAAAAA";
    let expected = Some((Match::Exact { parent_idx: 0 }, 0));

    assert_eq!(hasher.query_sliding(all_a), expected);
}
1939
/// An empty read yields no match from `query_sliding`.
#[test]
fn test_query_sliding_empty() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHash::new(&library).unwrap();

    let outcome = hasher.query_sliding(b"");
    assert_eq!(outcome, None);
}
1947
/// `query_sliding_iter` yields every occurrence of the parent in the read,
/// in order of position, and then ends.
#[test]
fn test_query_sliding_iter_multiple_matches() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHash::new(&library).unwrap();
    let exact = Match::Exact { parent_idx: 0 };

    // Two copies of the parent, at positions 0 and 6.
    let mut hits = hasher.query_sliding_iter(b"ACGTNNACGT");

    assert_eq!(hits.next(), Some((exact, 0)));
    assert_eq!(hits.next(), Some((exact, 6)));
    // Exactly two hits: the iterator is exhausted afterwards.
    assert_eq!(hits.next(), None);
}
1960
/// The sliding iterator reports exact and single-mismatch hits side by side.
#[test]
fn test_query_sliding_iter_mixed_matches() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHash::new(&library).unwrap();

    // Exact copy at 0, copy with a substituted last base at 6.
    let hits: Vec<_> = hasher.query_sliding_iter(b"ACGTNNACGA").collect();
    assert_eq!(hits.len(), 2);

    let (first, second) = (&hits[0], &hits[1]);
    assert_eq!(*first, (Match::Exact { parent_idx: 0 }, 0));
    assert_eq!(
        *second,
        (
            Match::Mismatch {
                parent_idx: 0,
                pos: 3
            },
            6
        )
    );
}
1982
/// A read with no window matching the parent produces an empty iterator.
#[test]
fn test_query_sliding_iter_no_matches() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHash::new(&library).unwrap();

    let all_t = b"TTTTTTTTTT";
    let mut hits = hasher.query_sliding_iter(all_t);

    assert!(hits.next().is_none());
}
1992
/// Iterating over an empty read yields nothing.
#[test]
fn test_query_sliding_iter_empty_seq() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHash::new(&library).unwrap();

    let hits: Vec<_> = hasher.query_sliding_iter(b"").collect();
    assert_eq!(hits.len(), 0);
}
2001
/// A read shorter than the parent length yields nothing.
#[test]
fn test_query_sliding_iter_short_seq() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHash::new(&library).unwrap();

    // Two bases against a four-base parent.
    let mut hits = hasher.query_sliding_iter(b"AC");
    assert!(hits.next().is_none());
}
2010
/// Taking only the first two items of the sliding iterator returns the hits
/// at positions 0 and 4.
#[test]
fn test_query_sliding_iter_lazy() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHash::new(&library).unwrap();

    // Four back-to-back copies of the parent.
    let tandem = b"ACGTACGTACGTACGT";
    let positions: Vec<_> = hasher
        .query_sliding_iter(tandem)
        .take(2)
        .map(|(_, pos)| pos)
        .collect();

    assert_eq!(positions, vec![0, 4]);
}
2023
/// With an exact-only index, the sliding iterator reports hits against
/// different parents within the same read.
#[test]
fn test_query_sliding_iter_multiple_parents() {
    let library: Vec<&[u8]> = vec![b"AAAA", b"GGGG"];
    let hasher = SeqHashBuilder::default().exact().build(&library).unwrap();

    // Parent 0 at position 0, parent 1 at position 6.
    let mut hits = hasher.query_sliding_iter(b"AAAACCGGGG");

    assert_eq!(hits.next(), Some((Match::Exact { parent_idx: 0 }, 0)));
    assert_eq!(hits.next(), Some((Match::Exact { parent_idx: 1 }, 6)));
    // Nothing beyond those two.
    assert_eq!(hits.next(), None);
}
2036
/// `SeqHash::new` accepts parents supplied as owned `String`s.
#[test]
fn test_string_input() {
    let library: Vec<String> = vec![String::from("ACGTACGT"), String::from("GGGGCCCC")];
    let hasher = SeqHash::new(&library).unwrap();

    assert_eq!(hasher.num_parents(), 2);

    let hit = hasher.query(b"ACGTACGT");
    assert_eq!(hit, Some(Match::Exact { parent_idx: 0 }));
}
2049
/// `SeqHash::new` accepts parents supplied as owned byte vectors.
#[test]
fn test_vec_u8_input() {
    let library: Vec<Vec<u8>> = vec![
        Vec::from(&b"ACGTACGT"[..]),
        Vec::from(&b"GGGGCCCC"[..]),
    ];
    let hasher = SeqHash::new(&library).unwrap();

    assert_eq!(hasher.num_parents(), 2);

    let hit = hasher.query(b"ACGTACGT");
    assert_eq!(hit, Some(Match::Exact { parent_idx: 0 }));
}
2061
/// A default-configured builder produces an index that answers both exact
/// and single-mismatch queries.
#[test]
fn test_builder_default() {
    let library: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
    let hasher = SeqHashBuilder::default().build(&library).unwrap();

    assert_eq!(hasher.num_parents(), 2);
    assert!(!hasher.is_exact_only());

    // Unmodified parent.
    let exact_hit = hasher.query(b"ACGTACGT");
    assert_eq!(exact_hit, Some(Match::Exact { parent_idx: 0 }));

    // Last base substituted (T -> A).
    let one_off = hasher.query(b"ACGTACGA");
    assert_eq!(
        one_off,
        Some(Match::Mismatch {
            parent_idx: 0,
            pos: 7
        })
    );
}
2086
/// An index built with `.exact()` matches only unmodified parents and holds
/// one entry per parent.
#[test]
fn test_builder_exact_only() {
    let library: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
    let hasher = SeqHashBuilder::default().exact().build(&library).unwrap();

    assert_eq!(hasher.num_parents(), 2);
    assert!(hasher.is_exact_only());

    // The parent itself is found ...
    let exact_hit = hasher.query(b"ACGTACGT");
    assert_eq!(exact_hit, Some(Match::Exact { parent_idx: 0 }));

    // ... a one-base variant is not.
    let one_off = hasher.query(b"ACGTACGA");
    assert_eq!(one_off, None);

    // Two parents, two entries.
    assert_eq!(hasher.num_entries(), 2);
}
2108
/// `exclude_n()` rejects a parent containing `N` with `InvalidBase`, whereas
/// the default builder accepts it and indexes it normally.
#[test]
fn test_builder_exclude_n() {
    let parents_with_n: Vec<&[u8]> = vec![b"ACGTNCGT"];

    // Strict builder: the N at position 4 of parent 0 is reported.
    let rejection = SeqHashBuilder::default()
        .exclude_n()
        .build(&parents_with_n)
        .unwrap_err();
    let expected_err = SeqHashError::InvalidBase {
        index: 0,
        pos: 4,
        base: b'N',
    };
    assert_eq!(rejection, expected_err);

    // Default builder: the same parent is accepted.
    let hasher = SeqHashBuilder::default().build(&parents_with_n).unwrap();
    assert_eq!(hasher.num_parents(), 1);

    let exact_hit = hasher.query(b"ACGTNCGT");
    assert_eq!(exact_hit, Some(Match::Exact { parent_idx: 0 }));

    // First base substituted (A -> G).
    let one_off = hasher.query(b"GCGTNCGT");
    assert_eq!(
        one_off,
        Some(Match::Mismatch {
            parent_idx: 0,
            pos: 0
        })
    );
}
2143
/// With the default builder, replacing any single base of the parent by `N`
/// is reported as a mismatch at that position.
#[test]
fn test_builder_generates_n_mutations() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHashBuilder::default().build(&library).unwrap();

    // (query, position of the substituted N)
    let n_substitutions = [(b"NCGT", 0), (b"ANGT", 1), (b"ACNT", 2), (b"ACGN", 3)];

    for (query, pos) in n_substitutions {
        assert_eq!(
            hasher.query(query),
            Some(Match::Mismatch { parent_idx: 0, pos })
        );
    }
}
2180
/// With `exclude_n()`, queries containing an `N` find nothing, while a regular
/// base substitution is still reported as a mismatch.
#[test]
fn test_builder_exclude_n_no_n_mutations() {
    let library: Vec<&[u8]> = vec![b"ACGT"];
    let hasher = SeqHashBuilder::default()
        .exclude_n()
        .build(&library)
        .unwrap();

    // An N at any of the four positions is not matched.
    for masked in [b"NCGT", b"ANGT", b"ACNT", b"ACGN"] {
        assert_eq!(hasher.query(masked), None);
    }

    // First base substituted (A -> G).
    let one_off = hasher.query(b"GCGT");
    assert_eq!(
        one_off,
        Some(Match::Mismatch {
            parent_idx: 0,
            pos: 0
        })
    );
}
2205
/// An exact-only index accepts a parent containing `N` and stores a single
/// entry for it.
#[test]
fn test_builder_exact_with_n() {
    let library: Vec<&[u8]> = vec![b"ACNTNC"];
    let hasher = SeqHashBuilder::default().exact().build(&library).unwrap();

    assert!(hasher.is_exact_only());
    assert_eq!(hasher.num_entries(), 1);

    // Identical sequence is found; a one-base variant is not.
    let exact_hit = hasher.query(b"ACNTNC");
    assert_eq!(exact_hit, Some(Match::Exact { parent_idx: 0 }));

    let one_off = hasher.query(b"GCNTNC");
    assert_eq!(one_off, None);
}
2219
/// `SeqHash::new` accepts a parent that contains `N`.
#[test]
fn test_new_allows_n_by_default() {
    let library: Vec<&[u8]> = vec![b"ACNGT"];
    let hasher = SeqHash::new(&library).unwrap();

    assert_eq!(hasher.num_parents(), 1);

    let exact_hit = hasher.query(b"ACNGT");
    assert_eq!(exact_hit, Some(Match::Exact { parent_idx: 0 }));
}
2229
/// By default, lowercase parents are stored in uppercase and answer
/// uppercase queries.
#[test]
fn test_case_normalization_default() {
    let library: Vec<&[u8]> = vec![b"acgtacgt", b"ggggcccc"];
    let hasher = SeqHash::new(&library).unwrap();

    assert!(hasher.normalizes_case());
    assert_eq!(hasher.num_parents(), 2);

    // Stored parents come back uppercased.
    assert_eq!(hasher.get_parent(0), Some(&b"ACGTACGT"[..]));
    assert_eq!(hasher.get_parent(1), Some(&b"GGGGCCCC"[..]));

    // Uppercase exact query.
    let exact_hit = hasher.query(b"ACGTACGT");
    assert_eq!(exact_hit, Some(Match::Exact { parent_idx: 0 }));

    // Uppercase query with the last base substituted.
    let one_off = hasher.query(b"ACGTACGA");
    assert_eq!(
        one_off,
        Some(Match::Mismatch {
            parent_idx: 0,
            pos: 7
        })
    );
}
2258
/// With `keep_case()`, parents are stored verbatim and queries are matched
/// case-sensitively.
#[test]
fn test_keep_case() {
    let library: Vec<&[u8]> = vec![b"acgtACGT", b"GGGGcccc"];
    let hasher = SeqHashBuilder::default()
        .keep_case()
        .build(&library)
        .unwrap();

    assert!(!hasher.normalizes_case());
    assert_eq!(hasher.num_parents(), 2);

    // Stored parents keep their original casing.
    assert_eq!(hasher.get_parent(0), Some(&b"acgtACGT"[..]));
    assert_eq!(hasher.get_parent(1), Some(&b"GGGGcccc"[..]));

    // Same casing as the parent: exact hit.
    let same_case = hasher.query(b"acgtACGT");
    assert_eq!(same_case, Some(Match::Exact { parent_idx: 0 }));

    // All-uppercase version of the parent: no hit.
    let upper_case = hasher.query(b"ACGTACGT");
    assert_eq!(upper_case, None);

    // Same casing, last base substituted.
    let one_off = hasher.query(b"acgtACGA");
    assert_eq!(
        one_off,
        Some(Match::Mismatch {
            parent_idx: 0,
            pos: 7
        })
    );
}
2293
/// The default builder normalizes case just like `SeqHash::new`.
#[test]
fn test_case_normalization_with_builder() {
    let library: Vec<&[u8]> = vec![b"acgt", b"gggg"];
    let hasher = SeqHashBuilder::default().build(&library).unwrap();

    assert!(hasher.normalizes_case());

    // Lowercase input is stored uppercased.
    assert_eq!(hasher.get_parent(0), Some(&b"ACGT"[..]));
    assert_eq!(hasher.get_parent(1), Some(&b"GGGG"[..]));
}
2304
/// Mixed-case parents are uppercased on construction and match uppercase
/// queries exactly.
#[test]
fn test_case_normalization_mixed_case_parents() {
    let library: Vec<&[u8]> = vec![b"AcGt", b"gGcC"];
    let hasher = SeqHash::new(&library).unwrap();

    // Stored form is fully uppercase.
    assert_eq!(hasher.get_parent(0), Some(&b"ACGT"[..]));
    assert_eq!(hasher.get_parent(1), Some(&b"GGCC"[..]));

    // Each uppercase query resolves to its own parent.
    let first = hasher.query(b"ACGT");
    assert_eq!(first, Some(Match::Exact { parent_idx: 0 }));

    let second = hasher.query(b"GGCC");
    assert_eq!(second, Some(Match::Exact { parent_idx: 1 }));
}
2318
/// `keep_case()` combined with `exact()`: only the byte-identical parent
/// matches.
#[test]
fn test_keep_case_exact_only() {
    let library: Vec<&[u8]> = vec![b"acgtACGT"];
    let hasher = SeqHashBuilder::default()
        .keep_case()
        .exact()
        .build(&library)
        .unwrap();

    assert!(!hasher.normalizes_case());
    assert!(hasher.is_exact_only());

    // Identical bytes: exact hit.
    let same_case = hasher.query(b"acgtACGT");
    assert_eq!(same_case, Some(Match::Exact { parent_idx: 0 }));

    // Different casing: no hit.
    let upper_case = hasher.query(b"ACGTACGT");
    assert_eq!(upper_case, None);

    // One substituted base: no hit.
    let one_off = hasher.query(b"acgtACGA");
    assert_eq!(one_off, None);
}
2344
/// A lowercase `n` in a parent is uppercased along with the other bases.
#[test]
fn test_case_normalization_with_n() {
    let library: Vec<&[u8]> = vec![b"acgtn"];
    let hasher = SeqHash::new(&library).unwrap();

    assert_eq!(hasher.get_parent(0), Some(&b"ACGTN"[..]));

    let exact_hit = hasher.query(b"ACGTN");
    assert_eq!(exact_hit, Some(Match::Exact { parent_idx: 0 }));
}
2354
/// With `keep_case()`, an all-lowercase parent passes validation and is
/// stored unchanged.
#[test]
fn test_keep_case_with_lowercase_validation() {
    let library: Vec<&[u8]> = vec![b"acgt"];
    let hasher = SeqHashBuilder::default()
        .keep_case()
        .build(&library)
        .unwrap();

    assert_eq!(hasher.num_parents(), 1);

    let stored = hasher.get_parent(0);
    assert_eq!(stored, Some(&b"acgt"[..]));
}
2368
/// `is_within_hdist(seq, parent_idx, max)` compares a sequence against one
/// specific parent under a Hamming-distance threshold.
#[test]
fn test_is_within_hdist() {
    let library: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC", b"TTTTAAAA"];
    let hasher = SeqHash::new(&library).unwrap();

    // Identical to parent 0.
    assert!(hasher.is_within_hdist(b"ACGTACGT", 0, 0));
    assert!(hasher.is_within_hdist(b"ACGTACGT", 0, 1));

    // One substitution relative to parent 0.
    assert!(!hasher.is_within_hdist(b"ACGTACGA", 0, 0));
    assert!(hasher.is_within_hdist(b"ACGTACGA", 0, 1));
    assert!(hasher.is_within_hdist(b"ACGTACGA", 0, 2));

    // Two substitutions relative to parent 0.
    assert!(!hasher.is_within_hdist(b"ACGTACAA", 0, 1));
    assert!(hasher.is_within_hdist(b"ACGTACAA", 0, 2));

    // Six positions differ from parent 0.
    assert!(!hasher.is_within_hdist(b"GGGGGGGG", 0, 3));
    assert!(hasher.is_within_hdist(b"GGGGGGGG", 0, 8));

    // Parent 1 compared with itself, then with parent 0 (six differences).
    assert!(hasher.is_within_hdist(b"GGGGCCCC", 1, 0));
    assert!(!hasher.is_within_hdist(b"GGGGCCCC", 0, 5));
    assert!(hasher.is_within_hdist(b"GGGGCCCC", 0, 6));

    // Parent index that does not exist.
    assert!(!hasher.is_within_hdist(b"ACGTACGT", 99, 0));

    // Sequences shorter or longer than the parents.
    assert!(!hasher.is_within_hdist(b"ACGT", 0, 10));
    assert!(!hasher.is_within_hdist(b"ACGTACGTACGT", 0, 10));
}
2407}
2408
/// Serialization tests; compiled only for test builds with the `serde`
/// feature enabled.
#[cfg(all(test, feature = "serde"))]
mod serde_tests {
    use super::*;

    /// Scratch file in the system temp directory that is deleted on drop.
    ///
    /// The temp directory is shared between processes, so the file name
    /// embeds the process id to keep concurrent test runs from overwriting
    /// each other's files. Cleaning up in `Drop` also removes the file when
    /// an assertion panics before the end of a test.
    struct ScratchFile(std::path::PathBuf);

    impl ScratchFile {
        /// Builds a path unique to this process and `tag`; the file itself is
        /// not created here.
        fn new(tag: &str) -> Self {
            let file_name = format!("seqhash_{}_{}.seqhash", tag, std::process::id());
            Self(std::env::temp_dir().join(file_name))
        }

        /// Location of the scratch file.
        fn path(&self) -> &std::path::Path {
            &self.0
        }
    }

    impl Drop for ScratchFile {
        fn drop(&mut self) {
            // Best effort: the file may never have been written.
            std::fs::remove_file(&self.0).ok();
        }
    }

    /// An index survives a JSON round trip with its statistics, query
    /// behaviour and stored parents intact.
    #[test]
    fn test_seqhash_roundtrip_json() {
        let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC", b"TTTTAAAA"];
        let index = SeqHash::new(&parents).unwrap();

        let json = serde_json::to_string(&index).unwrap();
        let restored: SeqHash = serde_json::from_str(&json).unwrap();

        // Summary accessors agree with the original index.
        assert_eq!(restored.num_parents(), index.num_parents());
        assert_eq!(restored.seq_len(), index.seq_len());
        assert_eq!(restored.num_entries(), index.num_entries());
        assert_eq!(restored.num_ambiguous(), index.num_ambiguous());
        assert_eq!(restored.is_exact_only(), index.is_exact_only());
        assert_eq!(restored.allows_n(), index.allows_n());

        // Exact and single-mismatch queries still resolve.
        assert_eq!(
            restored.query(b"ACGTACGT"),
            Some(Match::Exact { parent_idx: 0 })
        );
        assert_eq!(
            restored.query(b"GCGTACGT"),
            Some(Match::Mismatch {
                parent_idx: 0,
                pos: 0
            })
        );

        // Every stored parent is preserved.
        for i in 0..index.num_parents() {
            assert_eq!(restored.get_parent(i), index.get_parent(i));
        }
    }

    /// An index survives a bincode round trip and still answers queries.
    #[test]
    fn test_seqhash_roundtrip_bincode() {
        let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT", b"GGGGCCCCAAAA"];
        let index = SeqHash::new(&parents).unwrap();

        let bytes = bincode::serialize(&index).unwrap();
        let restored: SeqHash = bincode::deserialize(&bytes).unwrap();

        assert_eq!(
            restored.query(b"ACGTACGTACGT"),
            Some(Match::Exact { parent_idx: 0 })
        );
        assert_eq!(
            restored.query(b"ACGTACGTACGA"),
            Some(Match::Mismatch {
                parent_idx: 0,
                pos: 11
            })
        );
    }

    /// The exact-only setting is preserved across a bincode round trip.
    #[test]
    fn test_seqhash_exact_only_roundtrip() {
        let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
        let index = SeqHashBuilder::default().exact().build(&parents).unwrap();

        let bytes = bincode::serialize(&index).unwrap();
        let restored: SeqHash = bincode::deserialize(&bytes).unwrap();

        assert!(restored.is_exact_only());
        assert_eq!(
            restored.query(b"ACGTACGT"),
            Some(Match::Exact { parent_idx: 0 })
        );
        // A one-base variant stays unmatched after restoring.
        assert_eq!(restored.query(b"GCGTACGT"), None);
    }

    /// Both `Match` variants round-trip through JSON.
    #[test]
    fn test_match_serde() {
        let exact = Match::Exact { parent_idx: 42 };
        let json = serde_json::to_string(&exact).unwrap();
        let restored: Match = serde_json::from_str(&json).unwrap();
        assert_eq!(restored, exact);

        let mismatch = Match::Mismatch {
            parent_idx: 7,
            pos: 13,
        };
        let json = serde_json::to_string(&mismatch).unwrap();
        let restored: Match = serde_json::from_str(&json).unwrap();
        assert_eq!(restored, mismatch);
    }

    /// The listed `SeqHashError` variants round-trip through JSON.
    #[test]
    fn test_error_serde() {
        let errors = vec![
            SeqHashError::EmptyParents,
            SeqHashError::InconsistentLength {
                expected: 10,
                found: 5,
                index: 2,
            },
            SeqHashError::SequenceTooLong { len: 20000 },
            SeqHashError::DuplicateParent {
                index: 3,
                original: 1,
            },
            SeqHashError::InvalidBase {
                index: 0,
                pos: 5,
                base: b'X',
            },
        ];

        for error in errors {
            let json = serde_json::to_string(&error).unwrap();
            let restored: SeqHashError = serde_json::from_str(&json).unwrap();
            assert_eq!(restored, error);
        }
    }

    /// `save` followed by `load` reproduces the index.
    #[test]
    fn test_save_and_load() {
        let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT", b"GGGGCCCCAAAA", b"TTTTAAAACCCC"];
        let index = SeqHash::new(&parents).unwrap();

        // Removed automatically when `scratch` goes out of scope.
        let scratch = ScratchFile::new("save_and_load");

        index.save(scratch.path()).unwrap();
        let loaded = SeqHash::load(scratch.path()).unwrap();

        // Summary accessors agree with the original index.
        assert_eq!(loaded.num_parents(), index.num_parents());
        assert_eq!(loaded.seq_len(), index.seq_len());
        assert_eq!(loaded.num_entries(), index.num_entries());
        assert_eq!(loaded.num_ambiguous(), index.num_ambiguous());
        assert_eq!(loaded.is_exact_only(), index.is_exact_only());
        assert_eq!(loaded.allows_n(), index.allows_n());

        // Exact and single-mismatch queries still resolve.
        assert_eq!(
            loaded.query(b"ACGTACGTACGT"),
            Some(Match::Exact { parent_idx: 0 })
        );
        assert_eq!(
            loaded.query(b"ACGTACGTACGA"),
            Some(Match::Mismatch {
                parent_idx: 0,
                pos: 11
            })
        );

        // Every stored parent is preserved.
        for i in 0..index.num_parents() {
            assert_eq!(loaded.get_parent(i), index.get_parent(i));
        }
    }

    /// Loading a path that does not exist reports `NotFound`.
    #[test]
    fn test_load_nonexistent_file() {
        let result = SeqHash::load("/nonexistent/path/to/file.seqhash");
        assert!(result.is_err());
        assert_eq!(result.unwrap_err().kind(), std::io::ErrorKind::NotFound);
    }

    /// Loading a file whose contents are not a serialized index reports
    /// `InvalidData`.
    #[test]
    fn test_load_invalid_file() {
        // Removed automatically when `scratch` goes out of scope.
        let scratch = ScratchFile::new("invalid");

        std::fs::write(scratch.path(), b"not valid bincode data").unwrap();

        let result = SeqHash::load(scratch.path());
        assert!(result.is_err());
        assert_eq!(result.unwrap_err().kind(), std::io::ErrorKind::InvalidData);
    }
}
2605
/// Tests for multi-threaded index construction; compiled only for test
/// builds with the `parallel` feature enabled.
#[cfg(all(test, feature = "parallel"))]
mod parallel_tests {

    use super::*;
    use rand::Rng;

    /// Draws `n_parents` distinct random sequences of length `seq_len` over
    /// `VALID_BASES`, in the order they were first generated.
    ///
    /// # Panics
    ///
    /// Panics if `n_parents` exceeds the number of distinct sequences of that
    /// length; without this check the rejection loop below would never end.
    fn generate_random_parents(n_parents: usize, seq_len: usize) -> Vec<Vec<u8>> {
        // Number of distinct sequences, or `None` if it overflows `usize`
        // (in which case any `n_parents` fits).
        let distinct = (0..seq_len).try_fold(1usize, |acc, _| acc.checked_mul(VALID_BASES.len()));
        assert!(
            distinct.map_or(true, |max| n_parents <= max),
            "cannot draw {} distinct sequences of length {}",
            n_parents,
            seq_len
        );

        let mut parents = Vec::with_capacity(n_parents);
        let mut rng = rand::rng();

        while parents.len() < n_parents {
            let parent = (0..seq_len)
                .map(|_| VALID_BASES[rng.random_range(0..VALID_BASES.len())])
                .collect::<Vec<u8>>();
            // Reject duplicates so the builder never sees the same parent twice.
            if !parents.contains(&parent) {
                parents.push(parent);
            }
        }

        parents
    }

    /// Building with an explicit thread count yields the same parents and the
    /// same lookup table as the default build.
    #[test]
    fn test_construction_parallel() {
        let n_parents = 100;
        let seq_len = 10;

        let parents = generate_random_parents(n_parents, seq_len);

        let index_sequential = SeqHashBuilder::default().build(&parents).unwrap();

        for threads in [0, 1, 4, 8] {
            let index_parallel = SeqHashBuilder::default()
                .threads(threads)
                .build(&parents)
                .unwrap();

            assert_eq!(index_sequential.parents, index_parallel.parents);

            // Equal sizes plus every reference entry present with the same
            // value means the two tables are identical.
            assert_eq!(index_sequential.lookup.len(), index_parallel.lookup.len());

            for (key, expected) in index_sequential.lookup.iter() {
                // `get` turns a missing key into a readable assertion failure
                // instead of an indexing panic.
                assert_eq!(
                    index_parallel.lookup.get(key),
                    Some(expected),
                    "lookup entry differs with {} threads",
                    threads
                );
            }
        }
    }

    /// An index built with four threads reports every single-base
    /// substitution of the parent as a mismatch at the right position.
    #[test]
    fn test_all_single_mutations_parallel() {
        let parents: Vec<&[u8]> = vec![b"AAAA"];
        let index = SeqHashBuilder::default()
            .threads(4)
            .build(&parents)
            .unwrap();

        // (query, substituted position): C/G/T at each of the four positions.
        let mutations = [
            (b"CAAA", 0),
            (b"GAAA", 0),
            (b"TAAA", 0),
            (b"ACAA", 1),
            (b"AGAA", 1),
            (b"ATAA", 1),
            (b"AACA", 2),
            (b"AAGA", 2),
            (b"AATA", 2),
            (b"AAAC", 3),
            (b"AAAG", 3),
            (b"AAAT", 3),
        ];

        for (query, expected_pos) in mutations {
            let result = index.query(query);
            assert_eq!(
                result,
                Some(Match::Mismatch {
                    parent_idx: 0,
                    pos: expected_pos
                }),
                "Failed for query {:?}",
                std::str::from_utf8(query)
            );
        }
    }
}