Skip to main content

seqhash/
lib.rs

1//! Fast mismatch-tolerant sequence lookup with disambiguation.
2//!
3//! `seqhash` is a high-performance Rust library for building mismatch-tolerant
4//! sequence lookup indices. Given a set of parent sequences, it constructs an
5//! index that can query whether an input sequence matches any parent exactly
6//! OR is exactly one substitution away—while detecting and rejecting ambiguous
7//! cases where a sequence could map to multiple parents.
8//!
9//! # Example
10//!
11//! ```
12//! use seqhash::{SeqHash, Match};
13//!
14//! let parents: Vec<&[u8]> = vec![
15//!     b"ACGTACGTACGT",
16//!     b"GGGGCCCCAAAA",
17//!     b"TTTTAAAACCCC",
18//! ];
19//!
20//! let index = SeqHash::new(&parents).unwrap();
21//!
22//! // Exact match
23//! assert!(matches!(
24//!     index.query(b"ACGTACGTACGT"),
25//!     Some(Match::Exact { parent_idx: 0 })
26//! ));
27//!
28//! // Mismatch match (one base different)
29//! let query_with_error = b"ACGTACGTACGA"; // T->A at position 11
30//! assert!(matches!(
31//!     index.query(query_with_error),
32//!     Some(Match::Mismatch { parent_idx: 0, pos: 11 })
33//! ));
34//! ```
35//!
36//! # Case Normalization
37//!
38//! By default, parent sequences are normalized to uppercase during index construction.
39//! This ensures consistent matching regardless of input case:
40//!
41//! ```
42//! use seqhash::SeqHash;
43//!
44//! // Lowercase input is automatically converted to uppercase
45//! let parents: Vec<&[u8]> = vec![b"acgtacgt", b"ggggcccc"];
46//! let index = SeqHash::new(&parents).unwrap();
47//!
48//! // Queries must match the normalized (uppercase) sequences
49//! assert!(index.query(b"ACGTACGT").is_some());
50//! ```
51//!
52//! For cases where lowercase bases have special meaning (e.g., soft-masked regions),
53//! use [`SeqHashBuilder::keep_case()`] to preserve the original case:
54//!
55//! ```
56//! use seqhash::SeqHashBuilder;
57//!
58//! let parents: Vec<&[u8]> = vec![b"ACGTacgt"]; // Mixed case preserved
59//! let index = SeqHashBuilder::default()
60//!     .keep_case()
61//!     .build(&parents)
62//!     .unwrap();
63//!
64//! // Only exact case matches will work
65//! assert!(index.query(b"ACGTacgt").is_some());
66//! assert!(index.query(b"ACGTACGT").is_none());
67//! ```
68//!
69//! > **Note**: Querying always matches *exact* sequences, so if you choose to store lowercase bases, they will be treated as distinct from their uppercase counterparts.
70//!
71//! # Parallel Construction
72//!
73//! The `parallel` feature (enabled by default) enables multi-threaded index construction
74//! for improved performance on large parent sets:
75//!
76//! ```
77//! use seqhash::SeqHashBuilder;
78//!
79//! let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
80//!
81//! // Use 4 threads for construction
82//! # #[cfg(feature = "parallel")]
83//! let index = SeqHashBuilder::default()
84//!     .threads(4)
85//!     .build(&parents)
86//!     .unwrap();
87//!
88//! // Use all available CPU cores
89//! # #[cfg(feature = "parallel")]
90//! let index = SeqHashBuilder::default()
91//!     .threads(0)
92//!     .build(&parents)
93//!     .unwrap();
94//! ```
95//!
96//! # Serialization
97//!
98//! The `serde` feature enables saving and loading pre-built indices to disk.
99//! This is useful when you want to build an index once and reuse it across
100//! multiple runs without rebuilding.
101//!
102//! ```toml
103//! [dependencies]
104//! seqhash = { version = "0.1", features = ["serde"] }
105//! ```
106//!
107//! ```ignore
108//! // Save an index to disk
109//! index.save("my_index.seqhash")?;
110//!
111//! // Load an index from disk
112//! let index = SeqHash::load("my_index.seqhash")?;
113//! ```
114//!
115//! The recommended file extension is `.seqhash`. The index is stored in
116//! bincode format. With the `serde` feature enabled, you can also serialize
117//! to any serde-compatible format (JSON, MessagePack, etc.) directly.
118
119use hashbrown::HashMap;
120
121mod multilen;
122mod split;
123pub use multilen::{MultiLenMatch, MultiLenSeqHash, MultiLenSeqHashBuilder};
124pub use split::{Half, SplitMatch, SplitSeqHash};
125
126/// Maximum sequence length (14 bits for position encoding).
127pub const MAX_SEQ_LEN: usize = 16383;
128
129/// Valid DNA bases.
130const VALID_BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
131
132/// Valid DNA bases including N.
133const VALID_BASES_WITH_N: [u8; 5] = [b'A', b'C', b'G', b'T', b'N'];
134
135// Entry bit layout:
136// Bit 63:    ambiguous flag
137// Bit 62:    is_parent flag
138// Bits 48-61: mutation position (14 bits)
139// Bits 40-47: original base (8 bits)
140// Bits 32-39: mutated base (8 bits)
141// Bits 0-31:  parent index (32 bits)
142
143const AMBIGUOUS_BIT: u64 = 1 << 63;
144const IS_PARENT_BIT: u64 = 1 << 62;
145const POSITION_SHIFT: u64 = 48;
146const POSITION_MASK: u64 = 0x3FFF; // 14 bits
147const ORIGINAL_BASE_SHIFT: u64 = 40;
148const MUTATED_BASE_SHIFT: u64 = 32;
149const BASE_MASK: u64 = 0xFF;
150const PARENT_IDX_MASK: u64 = 0xFFFFFFFF;
151
152/// A successful match result.
153#[derive(Debug, Clone, Copy, PartialEq, Eq)]
154#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
155pub enum Match {
156    /// Query exactly matches parent.
157    Exact { parent_idx: usize },
158    /// Query has single-base mismatch from parent.
159    Mismatch { parent_idx: usize, pos: usize },
160}
161
162impl Match {
163    /// Returns the parent index regardless of match type.
164    #[inline]
165    #[must_use]
166    pub fn parent_idx(&self) -> usize {
167        match self {
168            Match::Exact { parent_idx } | Match::Mismatch { parent_idx, .. } => *parent_idx,
169        }
170    }
171
172    /// Returns true if this was an exact match.
173    #[inline]
174    #[must_use]
175    pub fn is_exact(&self) -> bool {
176        matches!(self, Match::Exact { .. })
177    }
178
179    /// Returns the mismatch position, if any.
180    #[inline]
181    #[must_use]
182    pub fn mismatch_pos(&self) -> Option<usize> {
183        match self {
184            Match::Exact { .. } => None,
185            Match::Mismatch { pos, .. } => Some(*pos),
186        }
187    }
188
189    /// Returns the hamming distance contribution of this match.
190    ///
191    /// Returns 0 for exact matches, 1 for mismatch matches.
192    #[inline]
193    #[must_use]
194    pub fn hdist(&self) -> usize {
195        match self {
196            Match::Exact { .. } => 0,
197            Match::Mismatch { .. } => 1,
198        }
199    }
200}
201
202/// Errors during index construction.
203#[derive(Debug, Clone, PartialEq, Eq)]
204#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
205pub enum SeqHashError {
206    /// No parent sequences provided.
207    EmptyParents,
208    /// Parent sequences have different lengths.
209    InconsistentLength {
210        expected: usize,
211        found: usize,
212        index: usize,
213    },
214    /// Sequence length exceeds maximum (16383).
215    SequenceTooLong { len: usize },
216    /// Duplicate parent sequence found.
217    DuplicateParent { index: usize, original: usize },
218    /// Sequence contains invalid bases (not A, C, G, T).
219    InvalidBase { index: usize, pos: usize, base: u8 },
220}
221
222impl std::fmt::Display for SeqHashError {
223    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
224        match self {
225            SeqHashError::EmptyParents => write!(f, "no parent sequences provided"),
226            SeqHashError::InconsistentLength {
227                expected,
228                found,
229                index,
230            } => write!(
231                f,
232                "parent at index {index} has length {found} (expected {expected})"
233            ),
234            SeqHashError::SequenceTooLong { len } => {
235                write!(f, "sequence length {len} exceeds maximum {MAX_SEQ_LEN}")
236            }
237            SeqHashError::DuplicateParent { index, original } => {
238                write!(
239                    f,
240                    "parent at index {index} is duplicate of parent at index {original}"
241                )
242            }
243            SeqHashError::InvalidBase { index, pos, base } => {
244                write!(
245                    f,
246                    "invalid base '{}' at position {} in parent {}",
247                    *base as char, pos, index
248                )
249            }
250        }
251    }
252}
253
254impl std::error::Error for SeqHashError {}
255
256/// Encoded entry in the lookup table.
257#[derive(Debug, Clone, Copy, PartialEq, Eq)]
258#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
259struct Entry(u64);
260
261impl Entry {
262    /// Create a new entry for a parent sequence.
263    #[inline]
264    fn new_parent(parent_idx: u32) -> Self {
265        Entry(IS_PARENT_BIT | u64::from(parent_idx))
266    }
267
268    /// Create a new entry for a mismatch.
269    #[inline]
270    fn new_mismatch(parent_idx: u32, pos: u16, original_base: u8, mutated_base: u8) -> Self {
271        Entry(
272            (u64::from(pos) << POSITION_SHIFT)
273                | (u64::from(original_base) << ORIGINAL_BASE_SHIFT)
274                | (u64::from(mutated_base) << MUTATED_BASE_SHIFT)
275                | u64::from(parent_idx),
276        )
277    }
278
279    /// Create an ambiguous entry marker.
280    #[inline]
281    fn ambiguous() -> Self {
282        Entry(AMBIGUOUS_BIT)
283    }
284
285    /// Check if this entry is marked ambiguous.
286    #[inline]
287    fn is_ambiguous(self) -> bool {
288        (self.0 & AMBIGUOUS_BIT) != 0
289    }
290
291    /// Check if this entry represents a parent (exact match).
292    #[inline]
293    fn is_parent(self) -> bool {
294        (self.0 & IS_PARENT_BIT) != 0
295    }
296
297    /// Get the parent index.
298    #[inline]
299    fn parent_idx(self) -> usize {
300        (self.0 & PARENT_IDX_MASK) as usize
301    }
302
303    /// Get the mutation position.
304    #[inline]
305    fn position(self) -> usize {
306        ((self.0 >> POSITION_SHIFT) & POSITION_MASK) as usize
307    }
308
309    /// Get the original base at the mutation position.
310    #[inline]
311    fn original_base(self) -> u8 {
312        ((self.0 >> ORIGINAL_BASE_SHIFT) & BASE_MASK) as u8
313    }
314
315    /// Get the mutated base (what the query should have).
316    #[inline]
317    fn mutated_base(self) -> u8 {
318        ((self.0 >> MUTATED_BASE_SHIFT) & BASE_MASK) as u8
319    }
320}
321
322/// Hash a sequence using fxhash.
323#[inline]
324fn hash_sequence(seq: &[u8]) -> u64 {
325    fxhash::hash64(seq)
326}
327
328/// Check if a base is valid (A, C, G, T, case-insensitive).
329#[inline]
330fn is_valid_base(b: u8) -> bool {
331    matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't')
332}
333
334/// Check if a base is valid, optionally allowing N.
335#[inline]
336fn is_valid_base_with_n(b: u8, allow_n: bool) -> bool {
337    is_valid_base(b) || (allow_n && (b == b'N' || b == b'n'))
338}
339
340/// Calculate whether two sequences are within a given hamming distance.
341#[inline]
342fn within_hamming_distance(seq1: &[u8], seq2: &[u8], max_hdist: usize) -> bool {
343    if seq1.len() != seq2.len() {
344        return false;
345    }
346    let mut hdist = 0;
347    for (a, b) in seq1.iter().zip(seq2.iter()) {
348        if a != b {
349            hdist += 1;
350            if hdist > max_hdist {
351                return false;
352            }
353        }
354    }
355    true
356}
357
358/// Fast mismatch-tolerant sequence lookup index.
359#[derive(Debug, Clone)]
360#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
361pub struct SeqHash {
362    /// Contiguous storage of all parent sequences.
363    parents: Vec<u8>,
364    /// Number of parent sequences.
365    num_parents: usize,
366    /// Length of each sequence.
367    seq_len: usize,
368    /// Hash -> Entry lookup table.
369    lookup: HashMap<u64, Entry>,
370    /// Count of ambiguous sequences detected.
371    num_ambiguous: usize,
372    /// If true, only exact matches are supported (no mismatch entries).
373    exact_only: bool,
374    /// If true, N bases are allowed in sequences.
375    allow_n: bool,
376    /// If true, sequences are normalized to uppercase.
377    normalize_case: bool,
378}
379
380/// Builder for constructing a [`SeqHash`] index with custom configuration.
381///
382/// # Example
383///
384/// ```
385/// use seqhash::SeqHashBuilder;
386///
387/// let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
388///
389/// // Build with default settings (allows 1 mismatch, allows N bases, normalizes case)
390/// let index = SeqHashBuilder::default().build(&parents).unwrap();
391///
392/// // Build with exact match only (no mismatch tolerance)
393/// let exact_only = SeqHashBuilder::default()
394///     .exact()
395///     .build(&parents)
396///     .unwrap();
397///
398/// // Build rejecting N bases in sequences
399/// let no_n = SeqHashBuilder::default()
400///     .exclude_n()
401///     .build(&parents)
402///     .unwrap();
403///
404/// // Build preserving lowercase bases (useful when case has special meaning)
405/// let keep_case = SeqHashBuilder::default()
406///     .keep_case()
407///     .build(&parents)
408///     .unwrap();
409/// ```
410#[derive(Debug, Clone, Copy)]
411pub struct SeqHashBuilder {
412    /// If true, only index exact matches (no mismatch entries).
413    exact_only: bool,
414    /// If true, allow N bases in sequences (skip N positions for mutations).
415    allow_n: bool,
416    /// If true, convert sequences to uppercase before indexing (default: true).
417    normalize_case: bool,
418    /// Number of threads to use for parallel processing (default: 1) [0: all available].
419    threads: usize,
420}
421
422impl Default for SeqHashBuilder {
423    fn default() -> Self {
424        SeqHashBuilder {
425            exact_only: false,
426            allow_n: true,
427            normalize_case: true,
428            threads: 1,
429        }
430    }
431}
432
433impl SeqHashBuilder {
434    /// Configure for exact matching only (no mismatch tolerance).
435    ///
436    /// When set, the index will only match sequences that exactly match a parent.
437    /// This reduces memory usage since no mutation entries are generated.
438    #[must_use]
439    pub fn exact(mut self) -> Self {
440        self.exact_only = true;
441        self
442    }
443
444    /// Reject N bases in sequences.
445    ///
446    /// By default, sequences containing N are accepted (N positions are skipped
447    /// when generating mismatch entries). When this is set, sequences containing
448    /// N will be rejected with an `InvalidBase` error.
449    #[must_use]
450    pub fn exclude_n(mut self) -> Self {
451        self.allow_n = false;
452        self
453    }
454
455    /// Preserve the case of input sequences.
456    ///
457    /// By default, sequences are converted to uppercase before indexing.
458    /// When this is set, sequences are kept as-is, preserving lowercase bases.
459    /// This is useful when lowercase bases have special meaning in your data.
460    #[must_use]
461    pub fn keep_case(mut self) -> Self {
462        self.normalize_case = false;
463        self
464    }
465
466    /// Number of threads to use when building the index.
467    ///
468    /// By default, uses a single thread. Setting this to 0 will use all available threads.
469    ///
470    /// Parallel construction is most beneficial for large parent sets (>100,000 sequences)
471    /// or long sequences. The number of threads is automatically capped by the number
472    /// of available CPU cores and the number of parent sequences.
473    ///
474    /// # Example
475    ///
476    /// ```
477    /// use seqhash::SeqHashBuilder;
478    ///
479    /// let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
480    ///
481    /// // Use 4 threads
482    /// let index = SeqHashBuilder::default()
483    ///     .threads(4)
484    ///     .build(&parents)
485    ///     .unwrap();
486    ///
487    /// // Use all available CPU cores
488    /// let index = SeqHashBuilder::default()
489    ///     .threads(0)
490    ///     .build(&parents)
491    ///     .unwrap();
492    /// ```
493    #[must_use]
494    #[cfg(feature = "parallel")]
495    pub fn threads(mut self, num_threads: usize) -> Self {
496        self.threads = match num_threads {
497            0 => num_cpus::get(),
498            x => num_cpus::get().min(x),
499        };
500        self
501    }
502
503    /// Build the [`SeqHash`] index from the given parent sequences.
504    ///
505    /// # Errors
506    ///
507    /// Returns an error if:
508    /// - No parent sequences are provided
509    /// - Sequences have inconsistent lengths
510    /// - Sequence length exceeds 16383
511    /// - Duplicate parent sequences exist
512    /// - Sequences contain invalid bases (unless `allow_n()` is set for N)
513    pub fn build<S: AsRef<[u8]>>(self, parents: &[S]) -> Result<SeqHash, SeqHashError> {
514        SeqHash::build_internal(
515            parents,
516            self.exact_only,
517            self.allow_n,
518            self.normalize_case,
519            self.threads,
520        )
521    }
522}
523
524impl SeqHash {
525    /// Construct a new index from parent sequences.
526    ///
527    /// All sequences must be the same length and contain only A, C, G, T, or N.
528    /// This uses default settings (allows 1 mismatch, allows N bases, normalizes case).
529    ///
530    /// For more control, use [`SeqHashBuilder`].
531    ///
532    /// # Errors
533    ///
534    /// Returns an error if:
535    /// - No parent sequences are provided
536    /// - Sequences have inconsistent lengths
537    /// - Sequence length exceeds 16383
538    /// - Duplicate parent sequences exist
539    /// - Sequences contain invalid bases
540    pub fn new<S: AsRef<[u8]>>(parents: &[S]) -> Result<Self, SeqHashError> {
541        Self::build_internal(parents, false, true, true, 1)
542    }
543
544    /// Internal build function used by both `new` and `SeqHashBuilder`.
545    fn build_internal<S: AsRef<[u8]>>(
546        parents: &[S],
547        exact_only: bool,
548        allow_n: bool,
549        normalize_case: bool,
550        #[allow(unused_variables)] threads: usize,
551    ) -> Result<Self, SeqHashError> {
552        if parents.is_empty() {
553            return Err(SeqHashError::EmptyParents);
554        }
555
556        let seq_len = parents[0].as_ref().len();
557        if seq_len > MAX_SEQ_LEN {
558            return Err(SeqHashError::SequenceTooLong { len: seq_len });
559        }
560
561        let num_parents = parents.len();
562
563        // Pre-allocate contiguous parent storage
564        let mut parent_data = Vec::with_capacity(num_parents * seq_len);
565
566        // Estimated capacity: parents + ~3*len mutations per parent (if not exact_only)
567        let estimated_entries = if exact_only {
568            num_parents
569        } else {
570            num_parents * (1 + 3 * seq_len)
571        };
572        let mut lookup: HashMap<u64, Entry> = HashMap::with_capacity(estimated_entries);
573        let mut num_ambiguous = 0;
574
575        Self::initialize_parents(
576            &mut lookup,
577            &mut parent_data,
578            &mut num_ambiguous,
579            parents,
580            seq_len,
581            normalize_case,
582            allow_n,
583        )?;
584
585        // Second pass: generate all single-base mutations (unless exact_only)
586        if !exact_only {
587            #[cfg(feature = "parallel")]
588            if threads > 1 {
589                Self::initialize_mutations_parallel(
590                    &mut lookup,
591                    &mut num_ambiguous,
592                    &parent_data,
593                    seq_len,
594                    num_parents,
595                    allow_n,
596                    threads,
597                );
598            } else {
599                Self::initialize_mutations(
600                    &mut lookup,
601                    &mut num_ambiguous,
602                    &parent_data,
603                    seq_len,
604                    num_parents,
605                    allow_n,
606                );
607            }
608
609            #[cfg(not(feature = "parallel"))]
610            {
611                Self::initialize_mutations(
612                    &mut lookup,
613                    &mut num_ambiguous,
614                    &parent_data,
615                    seq_len,
616                    num_parents,
617                    allow_n,
618                );
619            }
620        }
621
622        Ok(SeqHash {
623            parents: parent_data,
624            num_parents,
625            seq_len,
626            lookup,
627            num_ambiguous,
628            exact_only,
629            allow_n,
630            normalize_case,
631        })
632    }
633
634    /// Internal function used to initialize the parent data in the lookup table
635    fn initialize_parents<S: AsRef<[u8]>>(
636        lookup: &mut HashMap<u64, Entry>,
637        parent_data: &mut Vec<u8>,
638        num_ambiguous: &mut usize,
639        parents: &[S],
640        seq_len: usize,
641        normalize_case: bool,
642        allow_n: bool,
643    ) -> Result<(), SeqHashError> {
644        // First pass: validate and store parents, insert parent entries
645        for (idx, parent) in parents.iter().enumerate() {
646            let seq = parent.as_ref();
647
648            // Check length consistency
649            if seq.len() != seq_len {
650                return Err(SeqHashError::InconsistentLength {
651                    expected: seq_len,
652                    found: seq.len(),
653                    index: idx,
654                });
655            }
656
657            // Normalize case if requested
658            let normalized_seq: Vec<u8>;
659            let seq_to_use = if normalize_case {
660                normalized_seq = seq.to_ascii_uppercase();
661                &normalized_seq
662            } else {
663                seq
664            };
665
666            // Validate bases (using normalized sequence)
667            for (pos, &base) in seq_to_use.iter().enumerate() {
668                if !is_valid_base_with_n(base, allow_n) {
669                    return Err(SeqHashError::InvalidBase {
670                        index: idx,
671                        pos,
672                        base,
673                    });
674                }
675            }
676
677            // Store parent sequence (normalized if requested)
678            parent_data.extend_from_slice(seq_to_use);
679
680            // Insert parent into lookup (using normalized sequence)
681            let hash = hash_sequence(seq_to_use);
682            if let Some(existing) = lookup.get(&hash) {
683                if existing.is_parent() {
684                    // Check if it's actually a duplicate sequence
685                    let existing_idx = existing.parent_idx();
686                    let existing_seq =
687                        &parent_data[existing_idx * seq_len..(existing_idx + 1) * seq_len];
688                    if existing_seq == seq {
689                        return Err(SeqHashError::DuplicateParent {
690                            index: idx,
691                            original: existing_idx,
692                        });
693                    }
694                }
695                // Hash collision - mark as ambiguous
696                lookup.insert(hash, Entry::ambiguous());
697                *num_ambiguous += 1;
698            } else {
699                lookup.insert(hash, Entry::new_parent(idx as u32));
700            }
701        }
702
703        Ok(())
704    }
705
706    /// Internal function used to generate all mutational sequences
707    fn initialize_mutations(
708        lookup: &mut HashMap<u64, Entry>,
709        num_ambiguous: &mut usize,
710        parent_data: &[u8],
711        seq_len: usize,
712        num_parents: usize,
713        allow_n: bool,
714    ) {
715        let mut mutant_seq = vec![0u8; seq_len];
716
717        // Choose mutation alphabet based on allow_n setting
718        let mutation_bases: &[u8] = if allow_n {
719            &VALID_BASES_WITH_N
720        } else {
721            &VALID_BASES
722        };
723
724        for parent_idx in 0..num_parents {
725            let parent_start = parent_idx * seq_len;
726            let parent_seq = &parent_data[parent_start..parent_start + seq_len];
727
728            for pos in 0..seq_len {
729                let original_base = parent_seq[pos];
730
731                for &new_base in mutation_bases {
732                    if new_base == original_base {
733                        continue;
734                    }
735
736                    // Create mutant sequence
737                    mutant_seq.copy_from_slice(parent_seq);
738                    mutant_seq[pos] = new_base;
739
740                    let hash = hash_sequence(&mutant_seq);
741
742                    match lookup.get(&hash) {
743                        None => {
744                            // New entry
745                            lookup.insert(
746                                hash,
747                                Entry::new_mismatch(
748                                    parent_idx as u32,
749                                    pos as u16,
750                                    original_base,
751                                    new_base,
752                                ),
753                            );
754                        }
755                        Some(existing) => {
756                            // If collision is with a parent entry, keep the parent
757                            // (exact matches always take precedence)
758                            // If collision is with another mismatch entry, mark ambiguous
759                            if !existing.is_ambiguous() && !existing.is_parent() {
760                                lookup.insert(hash, Entry::ambiguous());
761                                *num_ambiguous += 1;
762                            }
763                        }
764                    }
765                }
766            }
767        }
768    }
769
770    /// Query a sequence.
771    ///
772    /// Returns the match if unambiguous, None if ambiguous or not found.
773    #[inline]
774    #[must_use]
775    pub fn query(&self, seq: &[u8]) -> Option<Match> {
776        if seq.len() != self.seq_len {
777            return None;
778        }
779
780        let hash = hash_sequence(seq);
781        let entry = *self.lookup.get(&hash)?;
782
783        if entry.is_ambiguous() {
784            return None;
785        }
786
787        if entry.is_parent() {
788            // Verify exact match
789            let parent_idx = entry.parent_idx();
790            let parent_seq = self.get_parent(parent_idx)?;
791            if seq == parent_seq {
792                Some(Match::Exact { parent_idx })
793            } else {
794                None // Hash collision, not actual match
795            }
796        } else {
797            // Verify mismatch match
798            let parent_idx = entry.parent_idx();
799            let pos = entry.position();
800            let original_base = entry.original_base();
801            let mutated_base = entry.mutated_base();
802
803            let parent_seq = self.get_parent(parent_idx)?;
804
805            // The query should have mutated_base at pos, parent has original_base
806            if seq[pos] != mutated_base || parent_seq[pos] != original_base {
807                return None;
808            }
809
810            // All other positions should match - use slice comparisons for vectorization
811            if seq[..pos] != parent_seq[..pos] || seq[pos + 1..] != parent_seq[pos + 1..] {
812                return None;
813            }
814
815            Some(Match::Mismatch { parent_idx, pos })
816        }
817    }
818
819    /// Check if a sequence would be ambiguous (maps to multiple parents).
820    #[inline]
821    #[must_use]
822    pub fn is_ambiguous(&self, seq: &[u8]) -> bool {
823        if seq.len() != self.seq_len {
824            return false;
825        }
826
827        let hash = hash_sequence(seq);
828        self.lookup.get(&hash).is_some_and(|e| e.is_ambiguous())
829    }
830
831    /// Query at a specific position within a longer sequence.
832    ///
833    /// Extracts a subsequence of length `seq_len` starting at `pos` and queries it.
834    /// Returns `None` if the position is out of bounds or no match is found.
835    ///
836    /// # Example
837    ///
838    /// ```
839    /// use seqhash::SeqHash;
840    ///
841    /// let parents: Vec<&[u8]> = vec![b"ACGT"];
842    /// let index = SeqHash::new(&parents).unwrap();
843    ///
844    /// // Target sequence is embedded at position 2
845    /// let read = b"NNACGTNN";
846    /// assert!(index.query_at(read, 2).is_some());
847    /// ```
848    #[inline]
849    #[must_use]
850    pub fn query_at(&self, seq: &[u8], pos: usize) -> Option<Match> {
851        let end = pos.checked_add(self.seq_len)?;
852        if end > seq.len() {
853            return None;
854        }
855        self.query(&seq[pos..end])
856    }
857
858    /// Query at a position with remapping window.
859    ///
860    /// Tries `pos` first, then alternates `+1, -1, +2, -2, ...` up to `window`.
861    /// Returns the first match found, or `None` if no match within the window.
862    ///
863    /// This is useful when the target position may drift slightly due to small indels.
864    ///
865    /// # Example
866    ///
867    /// ```
868    /// use seqhash::SeqHash;
869    ///
870    /// let parents: Vec<&[u8]> = vec![b"ACGT"];
871    /// let index = SeqHash::new(&parents).unwrap();
872    ///
873    /// // Target is at position 3, but we think it's at position 2
874    /// let read = b"NNNACGTNN";
875    /// assert!(index.query_at_with_remap(read, 2, 2).is_some());
876    /// ```
877    #[inline]
878    #[must_use]
879    pub fn query_at_with_remap(&self, seq: &[u8], pos: usize, window: usize) -> Option<Match> {
880        self.query_at_with_remap_offset(seq, pos, window)
881            .map(|(m, _)| m)
882    }
883
884    /// Query at a position with remapping, also returning the offset where match was found.
885    ///
886    /// Tries `pos` first, then alternates `+1, -1, +2, -2, ...` up to `window`.
887    /// Returns the match and offset (0 for direct hit, positive for downstream, negative for upstream).
888    ///
889    /// This is useful when you want to track position drift statistics.
890    ///
891    /// # Example
892    ///
893    /// ```
894    /// use seqhash::SeqHash;
895    ///
896    /// let parents: Vec<&[u8]> = vec![b"ACGT"];
897    /// let index = SeqHash::new(&parents).unwrap();
898    ///
899    /// // Target is at position 3, but we think it's at position 2
900    /// let read = b"NNNACGTNN";
901    /// let result = index.query_at_with_remap_offset(read, 2, 2);
902    /// assert!(matches!(result, Some((_, 1)))); // Found at offset +1
903    /// ```
904    #[must_use]
905    pub fn query_at_with_remap_offset(
906        &self,
907        seq: &[u8],
908        pos: usize,
909        window: usize,
910    ) -> Option<(Match, isize)> {
911        // Try exact position first
912        if let Some(m) = self.query_at(seq, pos) {
913            return Some((m, 0));
914        }
915
916        // Alternate +1, -1, +2, -2, etc.
917        for offset in 1..=window {
918            if let Some(m) = self.query_at(seq, pos + offset) {
919                return Some((m, offset as isize));
920            }
921            if offset <= pos {
922                if let Some(m) = self.query_at(seq, pos - offset) {
923                    return Some((m, -(offset as isize)));
924                }
925            }
926        }
927
928        None
929    }
930
931    /// Sliding window search from start of sequence.
932    ///
933    /// Scans through the sequence looking for the first match.
934    /// Returns the match and its position in the input sequence.
935    ///
936    /// This is useful when the target position is unknown.
937    ///
938    /// # Example
939    ///
940    /// ```
941    /// use seqhash::SeqHash;
942    ///
943    /// let parents: Vec<&[u8]> = vec![b"ACGT"];
944    /// let index = SeqHash::new(&parents).unwrap();
945    ///
946    /// let read = b"NNNACGTNNN";
947    /// let result = index.query_sliding(read);
948    /// assert!(matches!(result, Some((_, 3)))); // Found at position 3
949    /// ```
950    #[must_use]
951    pub fn query_sliding(&self, seq: &[u8]) -> Option<(Match, usize)> {
952        self.query_sliding_iter(seq).next()
953    }
954
955    /// Sliding window search returning an iterator over all matches.
956    ///
957    /// Scans through the sequence and yields all matches found.
958    /// This is useful when a sequence may contain multiple target regions.
959    ///
960    /// # Example
961    ///
962    /// ```
963    /// use seqhash::SeqHash;
964    ///
965    /// let parents: Vec<&[u8]> = vec![b"ACGT"];
966    /// let index = SeqHash::new(&parents).unwrap();
967    ///
968    /// let read = b"ACGTNNACGT"; // Two occurrences
969    /// let matches: Vec<_> = index.query_sliding_iter(read).collect();
970    /// assert_eq!(matches.len(), 2);
971    /// assert_eq!(matches[0].1, 0); // First at position 0
972    /// assert_eq!(matches[1].1, 6); // Second at position 6
973    /// ```
974    pub fn query_sliding_iter<'a>(
975        &'a self,
976        seq: &'a [u8],
977    ) -> impl Iterator<Item = (Match, usize)> + 'a {
978        let num_positions = if seq.len() >= self.seq_len {
979            seq.len() - self.seq_len + 1
980        } else {
981            0
982        };
983        (0..num_positions)
984            .filter_map(move |pos| self.query(&seq[pos..pos + self.seq_len]).map(|m| (m, pos)))
985    }
986
987    /// Get a parent sequence by index.
988    #[inline]
989    #[must_use]
990    pub fn get_parent(&self, idx: usize) -> Option<&[u8]> {
991        if idx >= self.num_parents {
992            return None;
993        }
994        let start = idx * self.seq_len;
995        let end = start + self.seq_len;
996        Some(&self.parents[start..end])
997    }
998
999    /// Iterate over all parent sequences.
1000    #[inline]
1001    pub fn iter_parents(&self) -> impl Iterator<Item = &[u8]> {
1002        self.parents.chunks_exact(self.seq_len)
1003    }
1004
1005    /// Number of parent sequences.
1006    #[inline]
1007    #[must_use]
1008    pub fn num_parents(&self) -> usize {
1009        self.num_parents
1010    }
1011
1012    /// Length of each sequence.
1013    #[inline]
1014    #[must_use]
1015    pub fn seq_len(&self) -> usize {
1016        self.seq_len
1017    }
1018
1019    /// Number of entries in the lookup table (parents + unambiguous mutations).
1020    #[inline]
1021    #[must_use]
1022    pub fn num_entries(&self) -> usize {
1023        self.lookup.len()
1024    }
1025
1026    /// Number of ambiguous sequences detected.
1027    #[inline]
1028    #[must_use]
1029    pub fn num_ambiguous(&self) -> usize {
1030        self.num_ambiguous
1031    }
1032
1033    /// Returns true if this index only supports exact matches.
1034    #[inline]
1035    #[must_use]
1036    pub fn is_exact_only(&self) -> bool {
1037        self.exact_only
1038    }
1039
1040    /// Returns true if this index allows N bases.
1041    #[inline]
1042    #[must_use]
1043    pub fn allows_n(&self) -> bool {
1044        self.allow_n
1045    }
1046
1047    /// Returns true if this index normalizes sequences to uppercase.
1048    #[inline]
1049    #[must_use]
1050    pub fn normalizes_case(&self) -> bool {
1051        self.normalize_case
1052    }
1053
1054    /// Check if a specific parent is within the specified hamming distance of the query sequence.
1055    ///
1056    /// This method calculates the hamming distance between the query and the specified parent.
1057    /// It returns true if the parent is within the specified distance.
1058    ///
1059    /// # Example
1060    ///
1061    /// ```
1062    /// use seqhash::SeqHash;
1063    ///
1064    /// let parents: Vec<&[u8]> = vec![
1065    ///     b"ACGTACGT",
1066    ///     b"GGGGCCCC",
1067    /// ];
1068    /// let index = SeqHash::new(&parents).unwrap();
1069    ///
1070    /// // Query differs by 1 base from first parent
1071    /// assert!(index.is_within_hdist(b"ACGTACGA", 0, 1));
1072    ///
1073    /// // Query differs by more than 1 base from first parent
1074    /// assert!(!index.is_within_hdist(b"TTTTTTTT", 0, 1));
1075    ///
1076    /// // But it's within 8 bases
1077    /// assert!(index.is_within_hdist(b"TTTTTTTT", 0, 8));
1078    /// ```
1079    #[inline]
1080    #[must_use]
1081    pub fn is_within_hdist(&self, query: &[u8], parent_idx: usize, hdist: usize) -> bool {
1082        if query.len() != self.seq_len {
1083            return false;
1084        }
1085
1086        let parent_seq = match self.get_parent(parent_idx) {
1087            Some(seq) => seq,
1088            None => return false, // Invalid parent index
1089        };
1090
1091        // check hamming distance
1092        within_hamming_distance(query, parent_seq, hdist)
1093    }
1094
1095    /// Save the index to a file.
1096    ///
1097    /// The file will be saved in bincode format. The recommended extension is `.seqhash`.
1098    ///
1099    /// # Example
1100    ///
1101    /// ```no_run
1102    /// use seqhash::SeqHash;
1103    ///
1104    /// let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
1105    /// let index = SeqHash::new(&parents).unwrap();
1106    /// index.save("my_index.seqhash").unwrap();
1107    /// ```
1108    #[cfg(feature = "serde")]
1109    pub fn save<P: AsRef<std::path::Path>>(&self, path: P) -> std::io::Result<()> {
1110        let bytes = bincode::serialize(self)
1111            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
1112        std::fs::write(path, bytes)
1113    }
1114
1115    /// Load an index from a file.
1116    ///
1117    /// The file should be in bincode format, as created by [`SeqHash::save`].
1118    ///
1119    /// # Example
1120    ///
1121    /// ```no_run
1122    /// use seqhash::SeqHash;
1123    ///
1124    /// let index = SeqHash::load("my_index.seqhash").unwrap();
1125    /// ```
1126    #[cfg(feature = "serde")]
1127    pub fn load<P: AsRef<std::path::Path>>(path: P) -> std::io::Result<Self> {
1128        let bytes = std::fs::read(path)?;
1129        bincode::deserialize(&bytes)
1130            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
1131    }
1132}
1133
1134#[cfg(feature = "parallel")]
1135impl SeqHash {
1136    fn initialize_mutations_parallel(
1137        lookup: &mut HashMap<u64, Entry>,
1138        num_ambiguous: &mut usize,
1139        parent_data: &[u8],
1140        seq_len: usize,
1141        num_parents: usize,
1142        allow_n: bool,
1143        threads: usize,
1144    ) {
1145        use std::sync::Arc;
1146
1147        let mutation_bases: &[u8] = if allow_n {
1148            &VALID_BASES_WITH_N
1149        } else {
1150            &VALID_BASES
1151        };
1152
1153        let threads = threads.min(num_parents);
1154        let parents_per_thread = (num_parents / threads).max(1);
1155
1156        // Clone existing lookup to preserve parent entries and use for collision detection
1157        let parent_lookup = Arc::new(lookup.clone());
1158
1159        // Collect thread-local results
1160        let thread_results: Vec<_> = std::thread::scope(|s| {
1161            (0..threads)
1162                .map(|tid| {
1163                    let parent_idx_start = tid * parents_per_thread;
1164                    let parent_idx_end = if tid == threads - 1 {
1165                        num_parents
1166                    } else {
1167                        parent_idx_start + parents_per_thread
1168                    };
1169                    let parent_lookup = Arc::clone(&parent_lookup);
1170
1171                    s.spawn(move || {
1172                        // Estimate capacity: ~3-4 mutations per position per parent in this thread
1173                        let estimated_capacity =
1174                            (parent_idx_end - parent_idx_start) * seq_len * mutation_bases.len();
1175                        let mut local_lookup: HashMap<u64, Entry> =
1176                            HashMap::with_capacity(estimated_capacity);
1177                        let mut local_ambiguous = 0;
1178                        let mut mutant_seq = vec![0u8; seq_len];
1179
1180                        for parent_idx in parent_idx_start..parent_idx_end {
1181                            let parent_start = parent_idx * seq_len;
1182                            let parent_seq = &parent_data[parent_start..parent_start + seq_len];
1183
1184                            for pos in 0..seq_len {
1185                                let original_base = parent_seq[pos];
1186
1187                                for &new_base in mutation_bases {
1188                                    if new_base == original_base {
1189                                        continue;
1190                                    }
1191
1192                                    mutant_seq.copy_from_slice(parent_seq);
1193                                    mutant_seq[pos] = new_base;
1194                                    let hash = hash_sequence(&mutant_seq);
1195
1196                                    // Check against parent entries first (read-only, no contention)
1197                                    if let Some(parent_entry) = parent_lookup.get(&hash) {
1198                                        if parent_entry.is_parent() {
1199                                            // Skip - parent entries take precedence
1200                                            continue;
1201                                        }
1202                                    }
1203
1204                                    // Insert into local map
1205                                    match local_lookup.get(&hash) {
1206                                        None => {
1207                                            local_lookup.insert(
1208                                                hash,
1209                                                Entry::new_mismatch(
1210                                                    parent_idx as u32,
1211                                                    pos as u16,
1212                                                    original_base,
1213                                                    new_base,
1214                                                ),
1215                                            );
1216                                        }
1217                                        Some(existing) => {
1218                                            if !existing.is_ambiguous() && !existing.is_parent() {
1219                                                local_lookup.insert(hash, Entry::ambiguous());
1220                                                local_ambiguous += 1;
1221                                            }
1222                                        }
1223                                    }
1224                                }
1225                            }
1226                        }
1227
1228                        (local_lookup, local_ambiguous)
1229                    })
1230                })
1231                .collect::<Vec<_>>()
1232                .into_iter()
1233                .map(|handle| handle.join().unwrap())
1234                .collect()
1235        });
1236
1237        // Merge results into main lookup (which already has parent entries)
1238        for (local_lookup, local_ambiguous) in thread_results {
1239            for (hash, entry) in local_lookup {
1240                match lookup.get(&hash) {
1241                    None => {
1242                        lookup.insert(hash, entry);
1243                    }
1244                    Some(existing) => {
1245                        // Parent entries are already in lookup, skip those
1246                        if existing.is_parent() {
1247                            continue;
1248                        }
1249
1250                        // Handle collision between thread results
1251                        if !existing.is_ambiguous() && !entry.is_ambiguous() {
1252                            // Both are mismatch entries from different threads - mark as ambiguous
1253                            lookup.insert(hash, Entry::ambiguous());
1254                            *num_ambiguous += 1;
1255                        }
1256                    }
1257                }
1258            }
1259            *num_ambiguous += local_ambiguous;
1260        }
1261    }
1262}
1263
1264#[cfg(test)]
1265mod tests {
1266
1267    use super::*;
1268
1269    #[test]
1270    fn test_empty_parents() {
1271        let parents: Vec<&[u8]> = vec![];
1272        let result = SeqHash::new(&parents);
1273        assert_eq!(result.unwrap_err(), SeqHashError::EmptyParents);
1274    }
1275
1276    #[test]
1277    fn test_inconsistent_length() {
1278        let parents: Vec<&[u8]> = vec![b"ACGT", b"ACGTACGT"];
1279        let result = SeqHash::new(&parents);
1280        assert_eq!(
1281            result.unwrap_err(),
1282            SeqHashError::InconsistentLength {
1283                expected: 4,
1284                found: 8,
1285                index: 1
1286            }
1287        );
1288    }
1289
1290    #[test]
1291    fn test_invalid_base() {
1292        // X is always invalid
1293        let parents: Vec<&[u8]> = vec![b"ACGX"];
1294        let result = SeqHash::new(&parents);
1295        assert_eq!(
1296            result.unwrap_err(),
1297            SeqHashError::InvalidBase {
1298                index: 0,
1299                pos: 3,
1300                base: b'X'
1301            }
1302        );
1303
1304        // N is now valid by default
1305        let parents_with_n: Vec<&[u8]> = vec![b"ACGN"];
1306        assert!(SeqHash::new(&parents_with_n).is_ok());
1307    }
1308
1309    #[test]
1310    fn test_duplicate_parent() {
1311        let parents: Vec<&[u8]> = vec![b"ACGT", b"GGGG", b"ACGT"];
1312        let result = SeqHash::new(&parents);
1313        assert_eq!(
1314            result.unwrap_err(),
1315            SeqHashError::DuplicateParent {
1316                index: 2,
1317                original: 0
1318            }
1319        );
1320    }
1321
1322    #[test]
1323    fn test_exact_match() {
1324        let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT", b"GGGGCCCCAAAA", b"TTTTAAAACCCC"];
1325
1326        let index = SeqHash::new(&parents).unwrap();
1327
1328        assert_eq!(index.num_parents(), 3);
1329        assert_eq!(index.seq_len(), 12);
1330
1331        // Test exact matches
1332        assert_eq!(
1333            index.query(b"ACGTACGTACGT"),
1334            Some(Match::Exact { parent_idx: 0 })
1335        );
1336        assert_eq!(
1337            index.query(b"GGGGCCCCAAAA"),
1338            Some(Match::Exact { parent_idx: 1 })
1339        );
1340        assert_eq!(
1341            index.query(b"TTTTAAAACCCC"),
1342            Some(Match::Exact { parent_idx: 2 })
1343        );
1344    }
1345
1346    #[test]
1347    fn test_mismatch_match() {
1348        let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT"];
1349
1350        let index = SeqHash::new(&parents).unwrap();
1351
1352        // T->A at position 11
1353        let result = index.query(b"ACGTACGTACGA");
1354        assert_eq!(
1355            result,
1356            Some(Match::Mismatch {
1357                parent_idx: 0,
1358                pos: 11
1359            })
1360        );
1361
1362        // A->G at position 0
1363        let result = index.query(b"GCGTACGTACGT");
1364        assert_eq!(
1365            result,
1366            Some(Match::Mismatch {
1367                parent_idx: 0,
1368                pos: 0
1369            })
1370        );
1371
1372        // C->T at position 1
1373        let result = index.query(b"ATGTACGTACGT");
1374        assert_eq!(
1375            result,
1376            Some(Match::Mismatch {
1377                parent_idx: 0,
1378                pos: 1
1379            })
1380        );
1381    }
1382
1383    #[test]
1384    fn test_no_match() {
1385        let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT"];
1386
1387        let index = SeqHash::new(&parents).unwrap();
1388
1389        // Two mismatches - should not match
1390        assert_eq!(index.query(b"GCGTACGTACGA"), None);
1391
1392        // Completely different
1393        assert_eq!(index.query(b"TTTTTTTTTTTT"), None);
1394    }
1395
1396    #[test]
1397    fn test_wrong_length_query() {
1398        let parents: Vec<&[u8]> = vec![b"ACGTACGT"];
1399
1400        let index = SeqHash::new(&parents).unwrap();
1401
1402        // Too short
1403        assert_eq!(index.query(b"ACGT"), None);
1404
1405        // Too long
1406        assert_eq!(index.query(b"ACGTACGTACGT"), None);
1407    }
1408
1409    #[test]
1410    fn test_ambiguous_detection() {
1411        // Create two parents that are one mismatch apart
1412        // Their shared mutant will be ambiguous
1413        let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"TCGTACGT"]; // Differ only at position 0
1414
1415        let index = SeqHash::new(&parents).unwrap();
1416
1417        // Both parents should still be findable
1418        assert_eq!(
1419            index.query(b"ACGTACGT"),
1420            Some(Match::Exact { parent_idx: 0 })
1421        );
1422        assert_eq!(
1423            index.query(b"TCGTACGT"),
1424            Some(Match::Exact { parent_idx: 1 })
1425        );
1426
1427        // Mutations that don't overlap should work
1428        // ACGTACGT with A->G at position 4 = ACGTGCGT
1429        assert_eq!(
1430            index.query(b"ACGTGCGT"),
1431            Some(Match::Mismatch {
1432                parent_idx: 0,
1433                pos: 4
1434            })
1435        );
1436
1437        // Some sequences should be ambiguous (one mutation from both parents)
1438        // CCGTACGT is one mutation from both ACGTACGT (A->C at 0) and TCGTACGT (T->C at 0)
1439        assert!(index.is_ambiguous(b"CCGTACGT"));
1440        assert_eq!(index.query(b"CCGTACGT"), None);
1441    }
1442
1443    #[test]
1444    fn test_match_methods() {
1445        let exact = Match::Exact { parent_idx: 5 };
1446        assert_eq!(exact.parent_idx(), 5);
1447        assert!(exact.is_exact());
1448        assert_eq!(exact.mismatch_pos(), None);
1449
1450        let mismatch = Match::Mismatch {
1451            parent_idx: 3,
1452            pos: 7,
1453        };
1454        assert_eq!(mismatch.parent_idx(), 3);
1455        assert!(!mismatch.is_exact());
1456        assert_eq!(mismatch.mismatch_pos(), Some(7));
1457    }
1458
1459    #[test]
1460    fn test_get_parent() {
1461        let parents: Vec<&[u8]> = vec![b"ACGT", b"GGGG", b"TTTT"];
1462
1463        let index = SeqHash::new(&parents).unwrap();
1464
1465        assert_eq!(index.get_parent(0), Some(b"ACGT".as_slice()));
1466        assert_eq!(index.get_parent(1), Some(b"GGGG".as_slice()));
1467        assert_eq!(index.get_parent(2), Some(b"TTTT".as_slice()));
1468        assert_eq!(index.get_parent(3), None);
1469    }
1470
1471    #[test]
1472    fn test_entry_encoding() {
1473        // Test parent entry
1474        let entry = Entry::new_parent(12345);
1475        assert!(entry.is_parent());
1476        assert!(!entry.is_ambiguous());
1477        assert_eq!(entry.parent_idx(), 12345);
1478
1479        // Test mismatch entry
1480        let entry = Entry::new_mismatch(999, 100, b'A', b'T');
1481        assert!(!entry.is_parent());
1482        assert!(!entry.is_ambiguous());
1483        assert_eq!(entry.parent_idx(), 999);
1484        assert_eq!(entry.position(), 100);
1485        assert_eq!(entry.original_base(), b'A');
1486        assert_eq!(entry.mutated_base(), b'T');
1487
1488        // Test ambiguous entry
1489        let entry = Entry::ambiguous();
1490        assert!(entry.is_ambiguous());
1491    }
1492
1493    #[test]
1494    fn test_hash_function() {
1495        // Same sequence should produce same hash
1496        assert_eq!(hash_sequence(b"ACGT"), hash_sequence(b"ACGT"));
1497
1498        // Different sequences should (usually) produce different hashes
1499        assert_ne!(hash_sequence(b"ACGT"), hash_sequence(b"ACGA"));
1500    }
1501
1502    #[test]
1503    fn test_num_entries() {
1504        let parents: Vec<&[u8]> = vec![b"ACGT"];
1505
1506        let index = SeqHash::new(&parents).unwrap();
1507
1508        // 1 parent + up to 16 mutations (4 positions * 4 alternatives each, including N)
1509        // Some might collide if hash collisions occur, but shouldn't for this simple case
1510        assert!(index.num_entries() >= 1);
1511        assert!(index.num_entries() <= 17); // 1 + 16
1512    }
1513
1514    #[test]
1515    fn test_all_single_mutations() {
1516        let parents: Vec<&[u8]> = vec![b"AAAA"];
1517        let index = SeqHash::new(&parents).unwrap();
1518
1519        // Test all single mutations from AAAA
1520        let mutations = [
1521            (b"CAAA", 0),
1522            (b"GAAA", 0),
1523            (b"TAAA", 0),
1524            (b"ACAA", 1),
1525            (b"AGAA", 1),
1526            (b"ATAA", 1),
1527            (b"AACA", 2),
1528            (b"AAGA", 2),
1529            (b"AATA", 2),
1530            (b"AAAC", 3),
1531            (b"AAAG", 3),
1532            (b"AAAT", 3),
1533        ];
1534
1535        for (query, expected_pos) in mutations {
1536            let result = index.query(query);
1537            assert_eq!(
1538                result,
1539                Some(Match::Mismatch {
1540                    parent_idx: 0,
1541                    pos: expected_pos
1542                }),
1543                "Failed for query {:?}",
1544                std::str::from_utf8(query)
1545            );
1546        }
1547    }
1548
1549    #[test]
1550    fn test_error_display() {
1551        assert_eq!(
1552            SeqHashError::EmptyParents.to_string(),
1553            "no parent sequences provided"
1554        );
1555
1556        assert_eq!(
1557            SeqHashError::InconsistentLength {
1558                expected: 10,
1559                found: 5,
1560                index: 3
1561            }
1562            .to_string(),
1563            "parent at index 3 has length 5 (expected 10)"
1564        );
1565
1566        assert_eq!(
1567            SeqHashError::SequenceTooLong { len: 20000 }.to_string(),
1568            "sequence length 20000 exceeds maximum 16383"
1569        );
1570
1571        assert_eq!(
1572            SeqHashError::DuplicateParent {
1573                index: 5,
1574                original: 2
1575            }
1576            .to_string(),
1577            "parent at index 5 is duplicate of parent at index 2"
1578        );
1579
1580        assert_eq!(
1581            SeqHashError::InvalidBase {
1582                index: 1,
1583                pos: 4,
1584                base: b'N'
1585            }
1586            .to_string(),
1587            "invalid base 'N' at position 4 in parent 1"
1588        );
1589    }
1590
1591    #[test]
1592    fn test_multiple_parents_different_mutations() {
1593        // Three parents that are well-separated
1594        let parents: Vec<&[u8]> = vec![
1595            b"AAAAAAAA", // All A
1596            b"CCCCCCCC", // All C
1597            b"GGGGGGGG", // All G
1598        ];
1599
1600        let index = SeqHash::new(&parents).unwrap();
1601
1602        // Each parent's mutations should be unambiguous
1603        // since they're so different from each other
1604
1605        // Mutation of first parent
1606        assert_eq!(
1607            index.query(b"CAAAAAAA"),
1608            Some(Match::Mismatch {
1609                parent_idx: 0,
1610                pos: 0
1611            })
1612        );
1613
1614        // Mutation of second parent
1615        assert_eq!(
1616            index.query(b"ACCCCCCC"),
1617            Some(Match::Mismatch {
1618                parent_idx: 1,
1619                pos: 0
1620            })
1621        );
1622
1623        // Mutation of third parent
1624        assert_eq!(
1625            index.query(b"AGGGGGGG"),
1626            Some(Match::Mismatch {
1627                parent_idx: 2,
1628                pos: 0
1629            })
1630        );
1631    }
1632
1633    #[test]
1634    fn test_is_ambiguous_wrong_length() {
1635        let parents: Vec<&[u8]> = vec![b"ACGT"];
1636        let index = SeqHash::new(&parents).unwrap();
1637
1638        // Wrong length should return false, not panic
1639        assert!(!index.is_ambiguous(b"AC"));
1640        assert!(!index.is_ambiguous(b"ACGTACGT"));
1641    }
1642
1643    #[test]
1644    fn test_query_at() {
1645        let parents: Vec<&[u8]> = vec![b"ACGT", b"GGGG"];
1646        let index = SeqHash::new(&parents).unwrap();
1647
1648        // Exact match at position 2
1649        let read = b"NNACGTNN";
1650        assert_eq!(
1651            index.query_at(read, 2),
1652            Some(Match::Exact { parent_idx: 0 })
1653        );
1654
1655        // Exact match at position 0
1656        let read = b"ACGTNNNN";
1657        assert_eq!(
1658            index.query_at(read, 0),
1659            Some(Match::Exact { parent_idx: 0 })
1660        );
1661
1662        // Exact match at end
1663        let read = b"NNNNACGT";
1664        assert_eq!(
1665            index.query_at(read, 4),
1666            Some(Match::Exact { parent_idx: 0 })
1667        );
1668
1669        // Mismatch match
1670        let read = b"NNACGANN"; // T->A mismatch
1671        assert_eq!(
1672            index.query_at(read, 2),
1673            Some(Match::Mismatch {
1674                parent_idx: 0,
1675                pos: 3
1676            })
1677        );
1678
1679        // No match
1680        let read = b"NNTTTTNN";
1681        assert_eq!(index.query_at(read, 2), None);
1682
1683        // Second parent match
1684        let read = b"NNGGGGNN";
1685        assert_eq!(
1686            index.query_at(read, 2),
1687            Some(Match::Exact { parent_idx: 1 })
1688        );
1689    }
1690
1691    #[test]
1692    fn test_query_at_bounds() {
1693        let parents: Vec<&[u8]> = vec![b"ACGT"];
1694        let index = SeqHash::new(&parents).unwrap();
1695
1696        let read = b"NNNNACGT"; // 8 bytes, ACGT at position 4
1697
1698        // Position out of bounds (would read past end)
1699        assert_eq!(index.query_at(read, 5), None);
1700        assert_eq!(index.query_at(read, 6), None);
1701        assert_eq!(index.query_at(read, 100), None);
1702
1703        // Exactly at the boundary (last valid position)
1704        assert_eq!(
1705            index.query_at(read, 4),
1706            Some(Match::Exact { parent_idx: 0 })
1707        );
1708
1709        // Empty sequence
1710        assert_eq!(index.query_at(b"", 0), None);
1711
1712        // Sequence shorter than seq_len
1713        assert_eq!(index.query_at(b"AC", 0), None);
1714    }
1715
1716    #[test]
1717    fn test_query_at_with_remap() {
1718        let parents: Vec<&[u8]> = vec![b"ACGT"];
1719        let index = SeqHash::new(&parents).unwrap();
1720
1721        // Direct hit at expected position
1722        let read = b"NNACGTNNNN";
1723        assert_eq!(
1724            index.query_at_with_remap(read, 2, 3),
1725            Some(Match::Exact { parent_idx: 0 })
1726        );
1727
1728        // Hit at offset +1
1729        let read = b"NNNACGTNNN";
1730        assert_eq!(
1731            index.query_at_with_remap(read, 2, 3),
1732            Some(Match::Exact { parent_idx: 0 })
1733        );
1734
1735        // Hit at offset -1
1736        let read = b"NACGTNNNNN";
1737        assert_eq!(
1738            index.query_at_with_remap(read, 2, 3),
1739            Some(Match::Exact { parent_idx: 0 })
1740        );
1741
1742        // Hit at offset +2
1743        let read = b"NNNNACGTNN";
1744        assert_eq!(
1745            index.query_at_with_remap(read, 2, 3),
1746            Some(Match::Exact { parent_idx: 0 })
1747        );
1748
1749        // Hit at offset -2
1750        let read = b"ACGTNNNNNN";
1751        assert_eq!(
1752            index.query_at_with_remap(read, 2, 3),
1753            Some(Match::Exact { parent_idx: 0 })
1754        );
1755
1756        // No hit within window
1757        let read = b"NNNNNNACGT";
1758        assert_eq!(index.query_at_with_remap(read, 2, 2), None);
1759
1760        // Hit just at edge of window
1761        let read = b"NNNNNACGTN";
1762        assert_eq!(
1763            index.query_at_with_remap(read, 2, 3),
1764            Some(Match::Exact { parent_idx: 0 })
1765        );
1766    }
1767
1768    #[test]
1769    fn test_query_at_with_remap_offset() {
1770        let parents: Vec<&[u8]> = vec![b"ACGT"];
1771        let index = SeqHash::new(&parents).unwrap();
1772
1773        // Direct hit - offset should be 0
1774        let read = b"NNACGTNNNN";
1775        assert_eq!(
1776            index.query_at_with_remap_offset(read, 2, 3),
1777            Some((Match::Exact { parent_idx: 0 }, 0))
1778        );
1779
1780        // Hit at offset +1
1781        let read = b"NNNACGTNNN";
1782        assert_eq!(
1783            index.query_at_with_remap_offset(read, 2, 3),
1784            Some((Match::Exact { parent_idx: 0 }, 1))
1785        );
1786
1787        // Hit at offset -1
1788        let read = b"NACGTNNNNN";
1789        assert_eq!(
1790            index.query_at_with_remap_offset(read, 2, 3),
1791            Some((Match::Exact { parent_idx: 0 }, -1))
1792        );
1793
1794        // Hit at offset +2
1795        let read = b"NNNNACGTNN";
1796        assert_eq!(
1797            index.query_at_with_remap_offset(read, 2, 3),
1798            Some((Match::Exact { parent_idx: 0 }, 2))
1799        );
1800
1801        // Hit at offset -2
1802        let read = b"ACGTNNNNNN";
1803        assert_eq!(
1804            index.query_at_with_remap_offset(read, 2, 3),
1805            Some((Match::Exact { parent_idx: 0 }, -2))
1806        );
1807
1808        // Mismatch with offset tracking
1809        let read = b"NNNACGANN"; // T->A mismatch at offset +1
1810        let result = index.query_at_with_remap_offset(read, 2, 3);
1811        assert_eq!(
1812            result,
1813            Some((
1814                Match::Mismatch {
1815                    parent_idx: 0,
1816                    pos: 3
1817                },
1818                1
1819            ))
1820        );
1821
1822        // No hit within window
1823        let read = b"NNNNNNACGT";
1824        assert_eq!(index.query_at_with_remap_offset(read, 2, 2), None);
1825    }
1826
1827    #[test]
1828    fn test_query_at_with_remap_prefers_direct_hit() {
1829        // When there could be multiple matches at different offsets,
1830        // direct hit (offset 0) should be returned
1831        let parents: Vec<&[u8]> = vec![b"AAAA"];
1832        let index = SeqHash::new(&parents).unwrap();
1833
1834        // All A's - matches at multiple positions
1835        let read = b"AAAAAAAAAA";
1836
1837        // Should return offset 0 (direct hit)
1838        let result = index.query_at_with_remap_offset(read, 3, 3);
1839        assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 0)));
1840    }
1841
1842    #[test]
1843    fn test_query_at_with_remap_edge_cases() {
1844        let parents: Vec<&[u8]> = vec![b"ACGT"];
1845        let index = SeqHash::new(&parents).unwrap();
1846
1847        // pos=0, can't go negative
1848        let read = b"NACGTNNNN";
1849        let result = index.query_at_with_remap_offset(read, 0, 3);
1850        assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 1)));
1851
1852        // pos=0, direct hit
1853        let read = b"ACGTNNNN";
1854        let result = index.query_at_with_remap_offset(read, 0, 3);
1855        assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 0)));
1856
1857        // Window of 0 means only direct hit
1858        let read = b"NACGTNNNN";
1859        let result = index.query_at_with_remap_offset(read, 0, 0);
1860        assert_eq!(result, None);
1861
1862        let read = b"ACGTNNNN";
1863        let result = index.query_at_with_remap_offset(read, 0, 0);
1864        assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 0)));
1865    }
1866
1867    #[test]
1868    fn test_query_sliding() {
1869        let parents: Vec<&[u8]> = vec![b"ACGT", b"GGGG"];
1870        let index = SeqHash::new(&parents).unwrap();
1871
1872        // Match at beginning
1873        let read = b"ACGTNNNN";
1874        assert_eq!(
1875            index.query_sliding(read),
1876            Some((Match::Exact { parent_idx: 0 }, 0))
1877        );
1878
1879        // Match in middle
1880        let read = b"NNNACGTNNN";
1881        assert_eq!(
1882            index.query_sliding(read),
1883            Some((Match::Exact { parent_idx: 0 }, 3))
1884        );
1885
1886        // Match at end
1887        let read = b"NNNNACGT";
1888        assert_eq!(
1889            index.query_sliding(read),
1890            Some((Match::Exact { parent_idx: 0 }, 4))
1891        );
1892
1893        // Second parent exact match at position 0
1894        let read = b"GGGGTTTT";
1895        assert_eq!(
1896            index.query_sliding(read),
1897            Some((Match::Exact { parent_idx: 1 }, 0))
1898        );
1899
1900        // Mismatch match
1901        let read = b"NNACGANN"; // T->A
1902        assert_eq!(
1903            index.query_sliding(read),
1904            Some((
1905                Match::Mismatch {
1906                    parent_idx: 0,
1907                    pos: 3
1908                },
1909                2
1910            ))
1911        );
1912
1913        // No match
1914        let read = b"TTTTTTTT";
1915        assert_eq!(index.query_sliding(read), None);
1916
1917        // Sequence too short
1918        let read = b"AC";
1919        assert_eq!(index.query_sliding(read), None);
1920
1921        // Exact length match
1922        let read = b"ACGT";
1923        assert_eq!(
1924            index.query_sliding(read),
1925            Some((Match::Exact { parent_idx: 0 }, 0))
1926        );
1927    }
1928
1929    #[test]
1930    fn test_query_sliding_returns_first_match() {
1931        let parents: Vec<&[u8]> = vec![b"AAAA"];
1932        let index = SeqHash::new(&parents).unwrap();
1933
1934        // Multiple possible matches, should return first one
1935        let read = b"AAAAAAAAA";
1936        let result = index.query_sliding(read);
1937        assert_eq!(result, Some((Match::Exact { parent_idx: 0 }, 0)));
1938    }
1939
1940    #[test]
1941    fn test_query_sliding_empty() {
1942        let parents: Vec<&[u8]> = vec![b"ACGT"];
1943        let index = SeqHash::new(&parents).unwrap();
1944
1945        assert_eq!(index.query_sliding(b""), None);
1946    }
1947
1948    #[test]
1949    fn test_query_sliding_iter_multiple_matches() {
1950        let parents: Vec<&[u8]> = vec![b"ACGT"];
1951        let index = SeqHash::new(&parents).unwrap();
1952
1953        // Two exact matches
1954        let read = b"ACGTNNACGT";
1955        let matches: Vec<_> = index.query_sliding_iter(read).collect();
1956        assert_eq!(matches.len(), 2);
1957        assert_eq!(matches[0], (Match::Exact { parent_idx: 0 }, 0));
1958        assert_eq!(matches[1], (Match::Exact { parent_idx: 0 }, 6));
1959    }
1960
1961    #[test]
1962    fn test_query_sliding_iter_mixed_matches() {
1963        let parents: Vec<&[u8]> = vec![b"ACGT"];
1964        let index = SeqHash::new(&parents).unwrap();
1965
1966        // One exact, one mismatch
1967        let read = b"ACGTNNACGA"; // ACGT at 0, ACGA (T->A mismatch) at 6
1968        let matches: Vec<_> = index.query_sliding_iter(read).collect();
1969        assert_eq!(matches.len(), 2);
1970        assert_eq!(matches[0], (Match::Exact { parent_idx: 0 }, 0));
1971        assert_eq!(
1972            matches[1],
1973            (
1974                Match::Mismatch {
1975                    parent_idx: 0,
1976                    pos: 3
1977                },
1978                6
1979            )
1980        );
1981    }
1982
1983    #[test]
1984    fn test_query_sliding_iter_no_matches() {
1985        let parents: Vec<&[u8]> = vec![b"ACGT"];
1986        let index = SeqHash::new(&parents).unwrap();
1987
1988        let read = b"TTTTTTTTTT";
1989        let matches: Vec<_> = index.query_sliding_iter(read).collect();
1990        assert!(matches.is_empty());
1991    }
1992
1993    #[test]
1994    fn test_query_sliding_iter_empty_seq() {
1995        let parents: Vec<&[u8]> = vec![b"ACGT"];
1996        let index = SeqHash::new(&parents).unwrap();
1997
1998        let matches: Vec<_> = index.query_sliding_iter(b"").collect();
1999        assert!(matches.is_empty());
2000    }
2001
2002    #[test]
2003    fn test_query_sliding_iter_short_seq() {
2004        let parents: Vec<&[u8]> = vec![b"ACGT"];
2005        let index = SeqHash::new(&parents).unwrap();
2006
2007        let matches: Vec<_> = index.query_sliding_iter(b"AC").collect();
2008        assert!(matches.is_empty());
2009    }
2010
2011    #[test]
2012    fn test_query_sliding_iter_lazy() {
2013        let parents: Vec<&[u8]> = vec![b"ACGT"];
2014        let index = SeqHash::new(&parents).unwrap();
2015
2016        // Many matches, but only take first 2
2017        let read = b"ACGTACGTACGTACGT";
2018        let matches: Vec<_> = index.query_sliding_iter(read).take(2).collect();
2019        assert_eq!(matches.len(), 2);
2020        assert_eq!(matches[0].1, 0);
2021        assert_eq!(matches[1].1, 4);
2022    }
2023
2024    #[test]
2025    fn test_query_sliding_iter_multiple_parents() {
2026        let parents: Vec<&[u8]> = vec![b"AAAA", b"GGGG"];
2027        let index = SeqHashBuilder::default().exact().build(&parents).unwrap();
2028
2029        // With exact-only mode, only exact matches are found
2030        let read = b"AAAACCGGGG";
2031        let matches: Vec<_> = index.query_sliding_iter(read).collect();
2032        assert_eq!(matches.len(), 2);
2033        assert_eq!(matches[0], (Match::Exact { parent_idx: 0 }, 0));
2034        assert_eq!(matches[1], (Match::Exact { parent_idx: 1 }, 6));
2035    }
2036
2037    #[test]
2038    fn test_string_input() {
2039        // Test that we can use String/&str via AsRef<[u8]>
2040        let parents: Vec<String> = vec!["ACGTACGT".to_string(), "GGGGCCCC".to_string()];
2041
2042        let index = SeqHash::new(&parents).unwrap();
2043        assert_eq!(index.num_parents(), 2);
2044        assert_eq!(
2045            index.query(b"ACGTACGT"),
2046            Some(Match::Exact { parent_idx: 0 })
2047        );
2048    }
2049
2050    #[test]
2051    fn test_vec_u8_input() {
2052        let parents: Vec<Vec<u8>> = vec![b"ACGTACGT".to_vec(), b"GGGGCCCC".to_vec()];
2053
2054        let index = SeqHash::new(&parents).unwrap();
2055        assert_eq!(index.num_parents(), 2);
2056        assert_eq!(
2057            index.query(b"ACGTACGT"),
2058            Some(Match::Exact { parent_idx: 0 })
2059        );
2060    }
2061
2062    #[test]
2063    fn test_builder_default() {
2064        let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
2065
2066        let index = SeqHashBuilder::default().build(&parents).unwrap();
2067
2068        assert_eq!(index.num_parents(), 2);
2069        assert!(!index.is_exact_only());
2070
2071        // Should support exact matches
2072        assert_eq!(
2073            index.query(b"ACGTACGT"),
2074            Some(Match::Exact { parent_idx: 0 })
2075        );
2076
2077        // Should support mismatch matches
2078        assert_eq!(
2079            index.query(b"ACGTACGA"), // T->A at pos 7
2080            Some(Match::Mismatch {
2081                parent_idx: 0,
2082                pos: 7
2083            })
2084        );
2085    }
2086
2087    #[test]
2088    fn test_builder_exact_only() {
2089        let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
2090
2091        let index = SeqHashBuilder::default().exact().build(&parents).unwrap();
2092
2093        assert_eq!(index.num_parents(), 2);
2094        assert!(index.is_exact_only());
2095
2096        // Should support exact matches
2097        assert_eq!(
2098            index.query(b"ACGTACGT"),
2099            Some(Match::Exact { parent_idx: 0 })
2100        );
2101
2102        // Should NOT support mismatch matches
2103        assert_eq!(index.query(b"ACGTACGA"), None);
2104
2105        // Exact-only index should have fewer entries (just parents)
2106        assert_eq!(index.num_entries(), 2);
2107    }
2108
2109    #[test]
2110    fn test_builder_exclude_n() {
2111        // With exclude_n, N should be rejected
2112        let parents_with_n: Vec<&[u8]> = vec![b"ACGTNCGT"];
2113        let result = SeqHashBuilder::default().exclude_n().build(&parents_with_n);
2114        assert_eq!(
2115            result.unwrap_err(),
2116            SeqHashError::InvalidBase {
2117                index: 0,
2118                pos: 4,
2119                base: b'N'
2120            }
2121        );
2122
2123        // By default, N should be accepted
2124        let index = SeqHashBuilder::default().build(&parents_with_n).unwrap();
2125
2126        assert_eq!(index.num_parents(), 1);
2127
2128        // Exact match should work
2129        assert_eq!(
2130            index.query(b"ACGTNCGT"),
2131            Some(Match::Exact { parent_idx: 0 })
2132        );
2133
2134        // Mismatch at non-N position should work
2135        assert_eq!(
2136            index.query(b"GCGTNCGT"), // A->G at pos 0
2137            Some(Match::Mismatch {
2138                parent_idx: 0,
2139                pos: 0
2140            })
2141        );
2142    }
2143
2144    #[test]
2145    fn test_builder_generates_n_mutations() {
2146        let parents: Vec<&[u8]> = vec![b"ACGT"];
2147
2148        let index = SeqHashBuilder::default().build(&parents).unwrap();
2149
2150        // Should generate N mutations at each position
2151        assert_eq!(
2152            index.query(b"NCGT"), // A->N at pos 0
2153            Some(Match::Mismatch {
2154                parent_idx: 0,
2155                pos: 0
2156            })
2157        );
2158        assert_eq!(
2159            index.query(b"ANGT"), // C->N at pos 1
2160            Some(Match::Mismatch {
2161                parent_idx: 0,
2162                pos: 1
2163            })
2164        );
2165        assert_eq!(
2166            index.query(b"ACNT"), // G->N at pos 2
2167            Some(Match::Mismatch {
2168                parent_idx: 0,
2169                pos: 2
2170            })
2171        );
2172        assert_eq!(
2173            index.query(b"ACGN"), // T->N at pos 3
2174            Some(Match::Mismatch {
2175                parent_idx: 0,
2176                pos: 3
2177            })
2178        );
2179    }
2180
2181    #[test]
2182    fn test_builder_exclude_n_no_n_mutations() {
2183        let parents: Vec<&[u8]> = vec![b"ACGT"];
2184
2185        let index = SeqHashBuilder::default()
2186            .exclude_n()
2187            .build(&parents)
2188            .unwrap();
2189
2190        // Should NOT generate N mutations
2191        assert_eq!(index.query(b"NCGT"), None);
2192        assert_eq!(index.query(b"ANGT"), None);
2193        assert_eq!(index.query(b"ACNT"), None);
2194        assert_eq!(index.query(b"ACGN"), None);
2195
2196        // But regular mutations should still work
2197        assert_eq!(
2198            index.query(b"GCGT"), // A->G at pos 0
2199            Some(Match::Mismatch {
2200                parent_idx: 0,
2201                pos: 0
2202            })
2203        );
2204    }
2205
2206    #[test]
2207    fn test_builder_exact_with_n() {
2208        let parents: Vec<&[u8]> = vec![b"ACNTNC"];
2209
2210        let index = SeqHashBuilder::default().exact().build(&parents).unwrap();
2211
2212        assert!(index.is_exact_only());
2213        assert_eq!(index.num_entries(), 1);
2214
2215        // Only exact match should work
2216        assert_eq!(index.query(b"ACNTNC"), Some(Match::Exact { parent_idx: 0 }));
2217        assert_eq!(index.query(b"GCNTNC"), None);
2218    }
2219
2220    #[test]
2221    fn test_new_allows_n_by_default() {
2222        // SeqHash::new should allow N bases by default
2223        let parents: Vec<&[u8]> = vec![b"ACNGT"];
2224        let index = SeqHash::new(&parents).unwrap();
2225
2226        assert_eq!(index.num_parents(), 1);
2227        assert_eq!(index.query(b"ACNGT"), Some(Match::Exact { parent_idx: 0 }));
2228    }
2229
2230    #[test]
2231    fn test_case_normalization_default() {
2232        // By default, sequences should be normalized to uppercase
2233        let parents: Vec<&[u8]> = vec![b"acgtacgt", b"ggggcccc"];
2234        let index = SeqHash::new(&parents).unwrap();
2235
2236        assert!(index.normalizes_case());
2237        assert_eq!(index.num_parents(), 2);
2238
2239        // Stored sequences should be uppercase
2240        assert_eq!(index.get_parent(0), Some(b"ACGTACGT".as_slice()));
2241        assert_eq!(index.get_parent(1), Some(b"GGGGCCCC".as_slice()));
2242
2243        // Query with uppercase should work
2244        assert_eq!(
2245            index.query(b"ACGTACGT"),
2246            Some(Match::Exact { parent_idx: 0 })
2247        );
2248
2249        // Query with uppercase mismatch should work
2250        assert_eq!(
2251            index.query(b"ACGTACGA"), // T->A at pos 7
2252            Some(Match::Mismatch {
2253                parent_idx: 0,
2254                pos: 7
2255            })
2256        );
2257    }
2258
2259    #[test]
2260    fn test_keep_case() {
2261        // With keep_case, sequences should preserve lowercase
2262        let parents: Vec<&[u8]> = vec![b"acgtACGT", b"GGGGcccc"];
2263        let index = SeqHashBuilder::default()
2264            .keep_case()
2265            .build(&parents)
2266            .unwrap();
2267
2268        assert!(!index.normalizes_case());
2269        assert_eq!(index.num_parents(), 2);
2270
2271        // Stored sequences should preserve case
2272        assert_eq!(index.get_parent(0), Some(b"acgtACGT".as_slice()));
2273        assert_eq!(index.get_parent(1), Some(b"GGGGcccc".as_slice()));
2274
2275        // Query with exact case should work
2276        assert_eq!(
2277            index.query(b"acgtACGT"),
2278            Some(Match::Exact { parent_idx: 0 })
2279        );
2280
2281        // Query with different case should NOT work
2282        assert_eq!(index.query(b"ACGTACGT"), None);
2283
2284        // Mismatch with correct case should work
2285        assert_eq!(
2286            index.query(b"acgtACGA"), // T->A at pos 7
2287            Some(Match::Mismatch {
2288                parent_idx: 0,
2289                pos: 7
2290            })
2291        );
2292    }
2293
2294    #[test]
2295    fn test_case_normalization_with_builder() {
2296        // Using builder default should normalize case
2297        let parents: Vec<&[u8]> = vec![b"acgt", b"gggg"];
2298        let index = SeqHashBuilder::default().build(&parents).unwrap();
2299
2300        assert!(index.normalizes_case());
2301        assert_eq!(index.get_parent(0), Some(b"ACGT".as_slice()));
2302        assert_eq!(index.get_parent(1), Some(b"GGGG".as_slice()));
2303    }
2304
2305    #[test]
2306    fn test_case_normalization_mixed_case_parents() {
2307        // Mixed case input should be normalized to uppercase
2308        let parents: Vec<&[u8]> = vec![b"AcGt", b"gGcC"];
2309        let index = SeqHash::new(&parents).unwrap();
2310
2311        assert_eq!(index.get_parent(0), Some(b"ACGT".as_slice()));
2312        assert_eq!(index.get_parent(1), Some(b"GGCC".as_slice()));
2313
2314        // Exact match with uppercase
2315        assert_eq!(index.query(b"ACGT"), Some(Match::Exact { parent_idx: 0 }));
2316        assert_eq!(index.query(b"GGCC"), Some(Match::Exact { parent_idx: 1 }));
2317    }
2318
2319    #[test]
2320    fn test_keep_case_exact_only() {
2321        // Combining keep_case with exact_only
2322        let parents: Vec<&[u8]> = vec![b"acgtACGT"];
2323        let index = SeqHashBuilder::default()
2324            .keep_case()
2325            .exact()
2326            .build(&parents)
2327            .unwrap();
2328
2329        assert!(!index.normalizes_case());
2330        assert!(index.is_exact_only());
2331
2332        // Should match exact case
2333        assert_eq!(
2334            index.query(b"acgtACGT"),
2335            Some(Match::Exact { parent_idx: 0 })
2336        );
2337
2338        // Should not match different case
2339        assert_eq!(index.query(b"ACGTACGT"), None);
2340
2341        // Should not support mismatches
2342        assert_eq!(index.query(b"acgtACGA"), None);
2343    }
2344
2345    #[test]
2346    fn test_case_normalization_with_n() {
2347        // Normalization should work with N bases
2348        let parents: Vec<&[u8]> = vec![b"acgtn"];
2349        let index = SeqHash::new(&parents).unwrap();
2350
2351        assert_eq!(index.get_parent(0), Some(b"ACGTN".as_slice()));
2352        assert_eq!(index.query(b"ACGTN"), Some(Match::Exact { parent_idx: 0 }));
2353    }
2354
2355    #[test]
2356    fn test_keep_case_with_lowercase_validation() {
2357        // When keeping case, lowercase letters should still be valid DNA bases
2358        let parents: Vec<&[u8]> = vec![b"acgt"];
2359        let index = SeqHashBuilder::default()
2360            .keep_case()
2361            .build(&parents)
2362            .unwrap();
2363
2364        // Should validate lowercase as valid bases
2365        assert_eq!(index.num_parents(), 1);
2366        assert_eq!(index.get_parent(0), Some(b"acgt".as_slice()));
2367    }
2368
2369    #[test]
2370    fn test_is_within_hdist() {
2371        let parents: Vec<&[u8]> = vec![
2372            b"ACGTACGT", // parent 0
2373            b"GGGGCCCC", // parent 1
2374            b"TTTTAAAA", // parent 2
2375        ];
2376        let index = SeqHash::new(&parents).unwrap();
2377
2378        // Test exact match (distance 0)
2379        assert!(index.is_within_hdist(b"ACGTACGT", 0, 0));
2380        assert!(index.is_within_hdist(b"ACGTACGT", 0, 1));
2381
2382        // Test single mismatch
2383        assert!(!index.is_within_hdist(b"ACGTACGA", 0, 0)); // 1 mismatch, hdist 0
2384        assert!(index.is_within_hdist(b"ACGTACGA", 0, 1)); // 1 mismatch, hdist 1
2385        assert!(index.is_within_hdist(b"ACGTACGA", 0, 2)); // 1 mismatch, hdist 2
2386
2387        // Test multiple mismatches
2388        assert!(!index.is_within_hdist(b"ACGTACAA", 0, 1)); // 2 mismatches, hdist 1
2389        assert!(index.is_within_hdist(b"ACGTACAA", 0, 2)); // 2 mismatches, hdist 2
2390
2391        // Test completely different sequence
2392        assert!(!index.is_within_hdist(b"GGGGGGGG", 0, 3)); // 8 mismatches, hdist 3
2393        assert!(index.is_within_hdist(b"GGGGGGGG", 0, 8)); // 8 mismatches, hdist 8
2394
2395        // Test different parents
2396        assert!(index.is_within_hdist(b"GGGGCCCC", 1, 0)); // exact match to parent 1
2397        assert!(!index.is_within_hdist(b"GGGGCCCC", 0, 5)); // 6 mismatches from parent 0, hdist 5
2398        assert!(index.is_within_hdist(b"GGGGCCCC", 0, 6)); // 6 mismatches from parent 0, hdist 6
2399
2400        // Test invalid parent index
2401        assert!(!index.is_within_hdist(b"ACGTACGT", 99, 0));
2402
2403        // Test wrong query length
2404        assert!(!index.is_within_hdist(b"ACGT", 0, 10));
2405        assert!(!index.is_within_hdist(b"ACGTACGTACGT", 0, 10));
2406    }
2407}
2408
2409#[cfg(all(test, feature = "serde"))]
2410mod serde_tests {
2411    use super::*;
2412
2413    #[test]
2414    fn test_seqhash_roundtrip_json() {
2415        let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC", b"TTTTAAAA"];
2416        let index = SeqHash::new(&parents).unwrap();
2417
2418        // Serialize to JSON
2419        let json = serde_json::to_string(&index).unwrap();
2420
2421        // Deserialize back
2422        let restored: SeqHash = serde_json::from_str(&json).unwrap();
2423
2424        // Verify all properties match
2425        assert_eq!(restored.num_parents(), index.num_parents());
2426        assert_eq!(restored.seq_len(), index.seq_len());
2427        assert_eq!(restored.num_entries(), index.num_entries());
2428        assert_eq!(restored.num_ambiguous(), index.num_ambiguous());
2429        assert_eq!(restored.is_exact_only(), index.is_exact_only());
2430        assert_eq!(restored.allows_n(), index.allows_n());
2431
2432        // Verify queries work correctly
2433        assert_eq!(
2434            restored.query(b"ACGTACGT"),
2435            Some(Match::Exact { parent_idx: 0 })
2436        );
2437        assert_eq!(
2438            restored.query(b"GCGTACGT"), // A->G at pos 0
2439            Some(Match::Mismatch {
2440                parent_idx: 0,
2441                pos: 0
2442            })
2443        );
2444
2445        // Verify parent retrieval
2446        for i in 0..index.num_parents() {
2447            assert_eq!(restored.get_parent(i), index.get_parent(i));
2448        }
2449    }
2450
2451    #[test]
2452    fn test_seqhash_roundtrip_bincode() {
2453        let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT", b"GGGGCCCCAAAA"];
2454        let index = SeqHash::new(&parents).unwrap();
2455
2456        // Serialize to bincode
2457        let bytes = bincode::serialize(&index).unwrap();
2458
2459        // Deserialize back
2460        let restored: SeqHash = bincode::deserialize(&bytes).unwrap();
2461
2462        // Verify queries work
2463        assert_eq!(
2464            restored.query(b"ACGTACGTACGT"),
2465            Some(Match::Exact { parent_idx: 0 })
2466        );
2467        assert_eq!(
2468            restored.query(b"ACGTACGTACGA"), // T->A at pos 11
2469            Some(Match::Mismatch {
2470                parent_idx: 0,
2471                pos: 11
2472            })
2473        );
2474    }
2475
2476    #[test]
2477    fn test_seqhash_exact_only_roundtrip() {
2478        let parents: Vec<&[u8]> = vec![b"ACGTACGT", b"GGGGCCCC"];
2479        let index = SeqHashBuilder::default().exact().build(&parents).unwrap();
2480
2481        let bytes = bincode::serialize(&index).unwrap();
2482        let restored: SeqHash = bincode::deserialize(&bytes).unwrap();
2483
2484        assert!(restored.is_exact_only());
2485        assert_eq!(
2486            restored.query(b"ACGTACGT"),
2487            Some(Match::Exact { parent_idx: 0 })
2488        );
2489        // Mismatch should not work in exact-only mode
2490        assert_eq!(restored.query(b"GCGTACGT"), None);
2491    }
2492
2493    #[test]
2494    fn test_match_serde() {
2495        let exact = Match::Exact { parent_idx: 42 };
2496        let json = serde_json::to_string(&exact).unwrap();
2497        let restored: Match = serde_json::from_str(&json).unwrap();
2498        assert_eq!(restored, exact);
2499
2500        let mismatch = Match::Mismatch {
2501            parent_idx: 7,
2502            pos: 13,
2503        };
2504        let json = serde_json::to_string(&mismatch).unwrap();
2505        let restored: Match = serde_json::from_str(&json).unwrap();
2506        assert_eq!(restored, mismatch);
2507    }
2508
2509    #[test]
2510    fn test_error_serde() {
2511        let errors = vec![
2512            SeqHashError::EmptyParents,
2513            SeqHashError::InconsistentLength {
2514                expected: 10,
2515                found: 5,
2516                index: 2,
2517            },
2518            SeqHashError::SequenceTooLong { len: 20000 },
2519            SeqHashError::DuplicateParent {
2520                index: 3,
2521                original: 1,
2522            },
2523            SeqHashError::InvalidBase {
2524                index: 0,
2525                pos: 5,
2526                base: b'X',
2527            },
2528        ];
2529
2530        for error in errors {
2531            let json = serde_json::to_string(&error).unwrap();
2532            let restored: SeqHashError = serde_json::from_str(&json).unwrap();
2533            assert_eq!(restored, error);
2534        }
2535    }
2536
2537    #[test]
2538    fn test_save_and_load() {
2539        let parents: Vec<&[u8]> = vec![b"ACGTACGTACGT", b"GGGGCCCCAAAA", b"TTTTAAAACCCC"];
2540        let index = SeqHash::new(&parents).unwrap();
2541
2542        // Create a temporary file path
2543        let temp_dir = std::env::temp_dir();
2544        let file_path = temp_dir.join("test_index.seqhash");
2545
2546        // Save the index
2547        index.save(&file_path).unwrap();
2548
2549        // Load the index
2550        let loaded = SeqHash::load(&file_path).unwrap();
2551
2552        // Verify all properties match
2553        assert_eq!(loaded.num_parents(), index.num_parents());
2554        assert_eq!(loaded.seq_len(), index.seq_len());
2555        assert_eq!(loaded.num_entries(), index.num_entries());
2556        assert_eq!(loaded.num_ambiguous(), index.num_ambiguous());
2557        assert_eq!(loaded.is_exact_only(), index.is_exact_only());
2558        assert_eq!(loaded.allows_n(), index.allows_n());
2559
2560        // Verify queries work
2561        assert_eq!(
2562            loaded.query(b"ACGTACGTACGT"),
2563            Some(Match::Exact { parent_idx: 0 })
2564        );
2565        assert_eq!(
2566            loaded.query(b"ACGTACGTACGA"), // T->A at pos 11
2567            Some(Match::Mismatch {
2568                parent_idx: 0,
2569                pos: 11
2570            })
2571        );
2572
2573        // Verify parent data
2574        for i in 0..index.num_parents() {
2575            assert_eq!(loaded.get_parent(i), index.get_parent(i));
2576        }
2577
2578        // Clean up
2579        std::fs::remove_file(&file_path).ok();
2580    }
2581
2582    #[test]
2583    fn test_load_nonexistent_file() {
2584        let result = SeqHash::load("/nonexistent/path/to/file.seqhash");
2585        assert!(result.is_err());
2586        assert_eq!(result.unwrap_err().kind(), std::io::ErrorKind::NotFound);
2587    }
2588
2589    #[test]
2590    fn test_load_invalid_file() {
2591        let temp_dir = std::env::temp_dir();
2592        let file_path = temp_dir.join("invalid.seqhash");
2593
2594        // Write invalid data
2595        std::fs::write(&file_path, b"not valid bincode data").unwrap();
2596
2597        let result = SeqHash::load(&file_path);
2598        assert!(result.is_err());
2599        assert_eq!(result.unwrap_err().kind(), std::io::ErrorKind::InvalidData);
2600
2601        // Clean up
2602        std::fs::remove_file(&file_path).ok();
2603    }
2604}
2605
2606#[cfg(all(test, feature = "parallel"))]
2607mod parallel_tests {
2608
2609    use super::*;
2610    use rand::Rng;
2611
2612    fn generate_random_parents(n_parents: usize, seq_len: usize) -> Vec<Vec<u8>> {
2613        let mut parents = Vec::new();
2614        let mut rng = rand::rng();
2615
2616        while parents.len() < n_parents {
2617            let parent = (0..seq_len)
2618                .map(|_| rng.random_range(0..4))
2619                .map(|base_idx| VALID_BASES[base_idx])
2620                .collect::<Vec<u8>>();
2621            if !parents.contains(&parent) {
2622                parents.push(parent);
2623            }
2624        }
2625
2626        parents
2627    }
2628
2629    #[test]
2630    fn test_construction_parallel() {
2631        let n_parents = 100;
2632        let seq_len = 10;
2633
2634        let parents = generate_random_parents(n_parents, seq_len);
2635
2636        let index_sequental = SeqHashBuilder::default().build(&parents).unwrap();
2637
2638        for threads in [0, 1, 4, 8] {
2639            let index_parallel = SeqHashBuilder::default()
2640                .threads(threads)
2641                .build(&parents)
2642                .unwrap();
2643
2644            // all parents are identical (plus order)
2645            assert_eq!(index_sequental.parents, index_parallel.parents);
2646
2647            // all mutations made it into the lookup table
2648            assert_eq!(index_sequental.lookup.len(), index_parallel.lookup.len());
2649
2650            // all keys (mutations) have identical metadata
2651            for key in index_sequental.lookup.keys() {
2652                assert_eq!(index_sequental.lookup[key], index_parallel.lookup[key]);
2653            }
2654        }
2655    }
2656
2657    #[test]
2658    fn test_all_single_mutations_parallel() {
2659        let parents: Vec<&[u8]> = vec![b"AAAA"];
2660        let index = SeqHashBuilder::default()
2661            .threads(4)
2662            .build(&parents)
2663            .unwrap();
2664
2665        // Test all single mutations from AAAA
2666        let mutations = [
2667            (b"CAAA", 0),
2668            (b"GAAA", 0),
2669            (b"TAAA", 0),
2670            (b"ACAA", 1),
2671            (b"AGAA", 1),
2672            (b"ATAA", 1),
2673            (b"AACA", 2),
2674            (b"AAGA", 2),
2675            (b"AATA", 2),
2676            (b"AAAC", 3),
2677            (b"AAAG", 3),
2678            (b"AAAT", 3),
2679        ];
2680
2681        for (query, expected_pos) in mutations {
2682            let result = index.query(query);
2683            assert_eq!(
2684                result,
2685                Some(Match::Mismatch {
2686                    parent_idx: 0,
2687                    pos: expected_pos
2688                }),
2689                "Failed for query {:?}",
2690                std::str::from_utf8(query)
2691            );
2692        }
2693    }
2694}