nthash_rs/
lib.rs

//! # ntHash‑rs
//!
//! An idiomatic, pure‑Rust port of the classic *ntHash* rolling‑hash suite,
//! focused on contiguous k‑mer hashing for DNA sequences.
//!
//! This crate currently provides:
//! - [`kmer::NtHash`]: the canonical contiguous‑k‑mer hasher that skips over
//!   non‑ACGT bases (`N` or other characters).
//! - [`blind::BlindNtHash`] and [`seed::SeedNtHash`]: further hashers, each
//!   re‑exported at the crate root together with its builder type.
//!
//! All heavy bit‑twiddling is delegated to the low‑level modules `tables` and
//! `constants`, which mirror the original C++ reference implementation.
//! Helper functionality for canonicalization and hash extension lives in `util`.
13//!
//! ## Example
//!
//! ```rust
//! use nthash_rs::{NtHash, Result};
//!
//! fn main() -> Result<()> {
//!     // Create a new NtHash over "ACGTNACGT" with k = 4, emitting 2 hashes
//!     // per k‑mer and starting at pos = 0.
//!     let mut hasher = NtHash::new(b"ACGTNACGT", 4, 2, 0)?;
//!
//!     // The first call to roll() initializes the hasher and returns true
//!     // if a valid k‑mer was found.
//!     assert!(hasher.roll());
//!     // Retrieve the two hash values for the first valid 4‑mer.
//!     let hashes = hasher.hashes();
//!     println!("First k‑mer hashes: {:#x}, {:#x}", hashes[0], hashes[1]);
//!
//!     // Advance through the rest of the sequence.
//!     while hasher.roll() {
//!         let h = hasher.hashes()[0];
//!         println!("Next k‑mer hash: {:#x}", h);
//!     }
//!     Ok(())
//! }
//! ```
37
38// Uncomment to build with `no_std` support
39// #![cfg_attr(not(feature = "std"), no_std)]
40
41/// Low‑level random seeds, split‑rotate tables, and numeric constants.
42// Not re‑exported directly.
43mod constants;
44mod tables;
45
46pub mod util;
47/// High‑level contiguous k‑mer rolling hasher.
48/// Skips over non‑ACGT bases exactly as the original reference.
49pub mod kmer;
50pub mod blind;
51pub mod seed;
52
53// ──────────────────────────────────────────────────────────────
54// Re‑exports: public API surface
55// --------------------------------------------------------------------------
56
57/// One‑bit split‑rotate left (33 + 31 halves).
58pub use tables::srol;
59/// Arbitrary split‑rotate via lookup tables.
60pub use tables::srol_table;
61/// One‑bit split‑rotate right (33 + 31 halves).
62pub use tables::sror;
63
64/// Combine forward and reverse hashes into a strand‑independent value.
65pub use util::canonical;
66/// Derive multiple hash values from a single canonical hash.
67pub use util::extend_hashes;
68
69/// Primary rolling k‑mer hasher.
70///
71/// See [`kmer::NtHash`] for full documentation.
72pub use kmer::NtHash;
73pub use kmer::NtHashBuilder;
74
75pub use blind::BlindNtHash;
76pub use blind::BlindNtHashBuilder;
77
78pub use seed::SeedNtHash;
79pub use seed::SeedNtHashBuilder;
80
81// ──────────────────────────────────────────────────────────────
82// Crate‑wide result and error types
83// --------------------------------------------------------------------------
84
85/// Shorthand `Result` alias for this crate’s operations.
86pub type Result<T, E = NtHashError> = std::result::Result<T, E>;
87
88/// Errors common to all ntHash k‑mer hashers.
89#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
90pub enum NtHashError {
91    /// `k` was zero.
92    #[error("k must be > 0")]
93    InvalidK,
94
95    /// Provided sequence length is shorter than `k`.
96    #[error("sequence length ({seq_len}) < k ({k})")]
97    SequenceTooShort { seq_len: usize, k: u16 },
98
99    /// Starting `pos` is beyond the last valid window (`seq.len() - k`).
100    #[error("position ({pos}) exceeds sequence length ({seq_len})")]
101    PositionOutOfRange { pos: usize, seq_len: usize },
102
103    #[error("invalid sequence")]
104    InvalidSequence,
105
106    #[error("invalid window offsets")]
107    InvalidWindowOffsets,
108}
109
110// ──────────────────────────────────────────────────────────────
111// Basic smoke tests
112// --------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a hasher over "ACGTACGT" (k = 4, one hash per k‑mer,
    /// starting at position 0) yields a first k‑mer with exactly one hash.
    #[test]
    fn sanity_kmer() {
        let mut hasher = NtHash::new(b"ACGTACGT", 4, 1, 0).unwrap();

        // The very first roll must land on a valid k‑mer.
        let found_first = hasher.roll();
        assert!(found_first);

        // One hash per k‑mer was requested, so exactly one must be reported.
        let produced = hasher.hashes();
        assert_eq!(produced.len(), 1);
    }
}
125}