Skip to main content

hibp_verifier/
lib.rs

1//! High-performance library for checking passwords against the Have I Been Pwned
2//! breach database using binary search on a compact 6-byte (sha1t48) format.
3//!
4//! This library provides sub-microsecond password breach checking by reading
5//! pre-processed HIBP dataset files and performing binary search on sorted records.
6//! The hot path is zero-allocation for maximum performance.
7//!
8//! # Quick Start
9//!
10//! ```rust,ignore
11//! use hibp_verifier::BreachChecker;
12//! use std::path::Path;
13//!
14//! let checker = BreachChecker::new(Path::new("/path/to/hibp-data"));
15//!
16//! match checker.is_breached("password123") {
17//!     Ok(true) => println!("Password found in breach database"),
18//!     Ok(false) => println!("Password not found"),
19//!     Err(e) => eprintln!("Error: {}", e),
20//! }
21//! ```
22//!
23//! # Dataset Setup
24//!
25//! This library requires a pre-downloaded dataset in sha1t48 binary format.
26//! Use [hibp-bin-fetch](https://crates.io/crates/hibp-bin-fetch) to download and
27//! convert the data:
28//!
29//! ```sh
30//! cargo install hibp-bin-fetch
31//! hibp-bin-fetch --output /path/to/hibp-data
32//! ```
33//!
34//! # Binary Format
35//!
36//! The library expects a directory containing 1,048,576 files named `00000.bin`
37//! through `FFFFF.bin`. Each file contains sorted 6-byte records (bytes 2-7 of
38//! SHA1 hashes) for the corresponding prefix.
39//!
40//! This format reduces storage from 77 GB (original text) to 13 GB while enabling
41//! O(log n) binary search with direct indexing—no parsing overhead.
42//!
43//! # Performance
44//!
45//! High concurrency benchmark (10k concurrent lookups, 24 worker threads):
46//!
47//! | API                             | Per check |
48//! |---------------------------------|-----------|
49//! | `is_breached_async` (tokio)     | ~3.1 us   |
50//! | `is_breached_compio` (io-uring) | ~4.6 us   |
51//! | `is_breached` (sync threads)    | ~19.8 us  |
52//!
53//! The sync API is fastest for isolated serial lookups (~1.4 us) but performs
54//! poorly under concurrency due to OS thread creation overhead. For concurrent
55//! workloads, use `is_breached_async` which leverages tokio's blocking thread
56//! pool with work-stealing for optimal throughput.
57//!
58//! # Async Support
59//!
60//! Enable the `tokio` feature for async support:
61//!
62//! ```toml
63//! [dependencies]
64//! hibp-verifier = { version = "0.1", features = ["tokio"] }
65//! ```
66//!
67//! ```rust,ignore
68//! use hibp_verifier::BreachChecker;
69//! use std::path::Path;
70//!
71//! #[tokio::main]
72//! async fn main() -> std::io::Result<()> {
73//!     let checker = BreachChecker::new(Path::new("/path/to/hibp-data"));
74//!
75//!     if checker.is_breached_async("password123").await? {
76//!         println!("Password found in breach database!");
77//!     }
78//!
79//!     Ok(())
80//! }
81//! ```
82//!
83//! The async API performs SHA1 hashing and path construction on the async thread,
84//! then uses `spawn_blocking` only for file I/O. This is faster than `tokio::fs::File`
85//! because it uses a single blocking call instead of multiple calls per I/O operation.
86//!
87//! # Compio Support (io-uring)
88//!
89//! Enable the `compio` feature for native io-uring async support:
90//!
91//! ```toml
92//! [dependencies]
93//! hibp-verifier = { version = "0.1", features = ["compio"] }
94//! ```
95//!
//! This uses compio's native io-uring file I/O. Note that benchmarks show this is
//! ~1.5x slower than the tokio `spawn_blocking` approach: io-uring requires
//! thread-local buffers, so compio's runtime cannot use a work-stealing model.
99
100use std::fs::File;
101use std::io::{self, Read};
102use std::path::{Path, PathBuf};
103
104use sha1::{Digest, Sha1};
105
106/// Environment variable name for specifying the HIBP dataset directory.
107pub const HIBP_DATA_DIR_ENV: &str = "HIBP_DATA_DIR";
108
109/// Returns the dataset path from the HIBP_DATA_DIR environment variable,
110/// or falls back to the default location (pwnedpasswords-bin sibling directory).
111pub fn dataset_path_from_env() -> PathBuf {
112    std::env::var(HIBP_DATA_DIR_ENV).map(PathBuf::from).unwrap_or_else(|_| {
113        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
114            .parent()
115            .unwrap()
116            .join("pwndpasswords-bin")
117    })
118}
119
/// The length in bytes of one sha1t48 record (a SHA-1 hash truncated to 48 bits).
pub const RECORD_SIZE: usize = 6;

/// The length of a SHA1 hash prefix used for file naming (5 hex characters).
pub const PREFIX_LEN: usize = 5;

/// Uppercase hex digits, indexed by nibble value, used for prefix conversion.
pub const HEX_CHARS: &[u8; 16] = b"0123456789ABCDEF";
128
129/// Checks if a password has been found in known data breaches.
130///
131/// This struct holds a reference to the directory containing the HIBP binary dataset files.
132pub struct BreachChecker<'a> {
133    dataset_path: &'a Path,
134}
135
136impl<'a> BreachChecker<'a> {
137    /// Creates a new BreachChecker with the given dataset directory path.
138    ///
139    /// The directory should contain binary files named `{PREFIX}.bin` where PREFIX
140    /// is a 5-character uppercase hex string (00000-FFFFF).
141    pub fn new(dataset_path: &'a Path) -> Self {
142        Self { dataset_path }
143    }
144
145    /// Checks if the given password has been found in a data breach.
146    ///
147    /// Returns `Ok(true)` if the password was found in the breach database,
148    /// `Ok(false)` if it was not found, or an error if the lookup failed.
149    pub fn is_breached(&self, password: &str) -> io::Result<bool> {
150        // Compute SHA1 hash as raw bytes
151        let mut hasher = Sha1::new();
152        hasher.update(password.as_bytes());
153        let hash: [u8; 20] = hasher.finalize().into();
154
155        let prefix_hex = Self::prefix_hex(&hash);
156        let mut file = self.open_file(prefix_hex)?;
157
158        // largest file size currently is 14.6KB for 6-byte records (2495 records in that prefix
159        // file) Use a 16KB stack buffer to avoid allocation. This should provide room for
160        // growth over time.
161        let mut buf = [0u8; 16384];
162
163        // read() is not guaranteed to return the full file in a single call.
164        // This loop logic handles ensuring we always read to the end.
165        //
166        // I've benchmarked this against getting the metadata for the file
167        // upfront and reading until total bytes read == size from metadata, and
168        // that approach was slower. Likely because fstat() has to copy the full
169        // stat structure(144 bytes on x86_64) from kernel to userspace.
170        let mut total = 0usize;
171        loop {
172            match file.read(&mut buf[total..]) {
173                Ok(0) => break,
174                Ok(n) => {
175                    total += n;
176                }
177                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
178                Err(e) => return Err(e),
179            }
180        }
181
182        let search_key: [u8; 6] = unsafe { hash[2..8].try_into().unwrap_unchecked() };
183
184        Ok(buf[..total].as_chunks::<RECORD_SIZE>().0.binary_search(&search_key).is_ok())
185    }
186
187    /// Returns the prefix for the hash as hex (first 5 hex chars == first 2.5 bytes)
188    /// that matches the file name on disk where the hash might be found.
189    #[doc(hidden)]
190    #[inline(always)]
191    pub fn prefix_hex(hash: &[u8; 20]) -> [u8; PREFIX_LEN] {
192        let mut prefix_hex = [0u8; PREFIX_LEN];
193
194        prefix_hex[0] = HEX_CHARS[(hash[0] >> 4) as usize];
195        prefix_hex[1] = HEX_CHARS[(hash[0] & 0x0f) as usize];
196        prefix_hex[2] = HEX_CHARS[(hash[1] >> 4) as usize];
197        prefix_hex[3] = HEX_CHARS[(hash[1] & 0x0f) as usize];
198        prefix_hex[4] = HEX_CHARS[(hash[2] >> 4) as usize];
199
200        prefix_hex
201    }
202
203    // Build file path without allocation: base_path + '/' + prefix + ".bin"
204    #[inline(always)]
205    fn build_path(&self, prefix_hex: [u8; PREFIX_LEN]) -> ([u8; 512], usize) {
206        let base = self.dataset_path.as_os_str().as_encoded_bytes();
207        let mut path_buf = [0u8; 512];
208        let path_len = base.len() + 1 + PREFIX_LEN + 4; // +4 for ".bin"
209        path_buf[..base.len()].copy_from_slice(base);
210        path_buf[base.len()] = b'/';
211        path_buf[base.len() + 1..base.len() + 1 + PREFIX_LEN].copy_from_slice(&prefix_hex);
212        path_buf[base.len() + 1 + PREFIX_LEN..path_len].copy_from_slice(b".bin");
213
214        (path_buf, path_len)
215    }
216
217    /// Build file path without allocation: base_path + '/' + prefix + ".bin"
218    #[doc(hidden)]
219    #[inline(always)]
220    pub fn open_file(&self, prefix_hex: [u8; PREFIX_LEN]) -> io::Result<File> {
221        let (path_buf, path_len) = self.build_path(prefix_hex);
222
223        // SAFETY: path_buf contains valid UTF-8 (base path + '/' + hex prefix + ".bin")
224        let file_path = unsafe { std::str::from_utf8_unchecked(&path_buf[..path_len]) };
225
226        File::open(file_path)
227    }
228
229    /// Async version of `is_breached` using tokio.
230    ///
231    /// Performs SHA1 hashing and path construction on the async thread,
232    /// then uses `spawn_blocking` only for file I/O.
233    ///
234    /// # Example
235    ///
236    /// ```rust,ignore
237    /// use hibp_verifier::BreachChecker;
238    /// use std::path::Path;
239    ///
240    /// #[tokio::main]
241    /// async fn main() -> std::io::Result<()> {
242    ///     let checker = BreachChecker::new(Path::new("/path/to/hibp-data"));
243    ///
244    ///     if checker.is_breached_async("password123").await? {
245    ///         println!("Password found in breach database!");
246    ///     }
247    ///
248    ///     Ok(())
249    /// }
250    /// ```
251    #[cfg(feature = "tokio")]
252    pub async fn is_breached_async(&self, password: &str) -> io::Result<bool> {
253        let mut hasher = Sha1::new();
254        hasher.update(password.as_bytes());
255        let hash: [u8; 20] = hasher.finalize().into();
256
257        let search_key: [u8; 6] = unsafe { hash[2..8].try_into().unwrap_unchecked() };
258
259        let prefix_hex = Self::prefix_hex(&hash);
260        let (path_buf, path_len) = self.build_path(prefix_hex);
261
262        // Only file I/O goes into spawn_blocking
263        tokio::task::spawn_blocking(move || {
264            let file_path = unsafe { std::str::from_utf8_unchecked(&path_buf[..path_len]) };
265            let mut file = File::open(file_path)?;
266
267            let mut buf = [0u8; 16384];
268            let mut total = 0usize;
269            loop {
270                match file.read(&mut buf[total..]) {
271                    Ok(0) => break,
272                    Ok(n) => total += n,
273                    Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
274                    Err(e) => return Err(e),
275                }
276            }
277
278            Ok(buf[..total].as_chunks::<RECORD_SIZE>().0.binary_search(&search_key).is_ok())
279        })
280        .await
281        .expect("spawn_blocking task panicked")
282    }
283
284    /// Async version of `is_breached` using compio's native io-uring file I/O.
285    ///
286    /// This method uses compio-fs which provides true async file operations
287    /// via io-uring on Linux.
288    ///
289    /// compio is compatible with ntex's compio runtime feature, making this
290    /// suitable for use within ntex web applications that want to use compio.
291    #[cfg(feature = "compio")]
292    pub async fn is_breached_compio(&self, password: &str) -> io::Result<bool> {
293        use compio::fs::File;
294        use compio::io::AsyncReadAt;
295
296        let mut hasher = Sha1::new();
297        hasher.update(password.as_bytes());
298        let hash: [u8; 20] = hasher.finalize().into();
299
300        let search_key: [u8; 6] = unsafe { hash[2..8].try_into().unwrap_unchecked() };
301
302        let prefix_hex = Self::prefix_hex(&hash);
303        let (path_buf, path_len) = self.build_path(prefix_hex);
304        let file_path = unsafe { std::str::from_utf8_unchecked(&path_buf[..path_len]) };
305
306        let file = File::open(file_path).await?;
307
308        // compio returns the buffer back to us after each operation
309        let mut buf = [0u8; 16384];
310        let mut total = 0usize;
311
312        loop {
313            let buf_result = file.read_at(buf, total as u64).await;
314            buf = buf_result.1;
315            match buf_result.0 {
316                Ok(0) => break,
317                Ok(n) => total += n,
318                Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
319                Err(e) => return Err(e),
320            }
321        }
322
323        Ok(buf[..total].as_chunks::<RECORD_SIZE>().0.binary_search(&search_key).is_ok())
324    }
325}
326
#[cfg(test)]
mod tests {
    use super::*;

    /// Binary-searches `data`, viewed as consecutive `RECORD_SIZE`-byte records,
    /// for `key` — the same lookup expression the checker uses.
    fn contains(data: &[u8], key: [u8; RECORD_SIZE]) -> bool {
        data.as_chunks::<RECORD_SIZE>().0.binary_search(&key).is_ok()
    }

    #[test]
    fn test_sha1t64_conversion() {
        // password123 -> SHA1: CBFDAC6008F9CAB4083784CBD1874F76618D2A97
        // First 8 bytes: CB FD AC 60 08 F9 CA B4
        let mut hasher = Sha1::new();
        hasher.update(b"password123");
        let hash: [u8; 20] = hasher.finalize().into();

        assert_eq!(
            hash[..8],
            [0xCB, 0xFD, 0xAC, 0x60, 0x08, 0xF9, 0xCA, 0xB4]
        );
    }

    #[test]
    #[ignore = "requires HIBP dataset"]
    fn test_breached_password() {
        // "password123" is a commonly breached password
        // SHA1: CBFDAC6008F9CAB4083784CBD1874F76618D2A97
        // Prefix: CBFDA
        let path = dataset_path_from_env();
        let checker = BreachChecker::new(&path);
        let result = checker.is_breached("password123").unwrap();
        assert!(result, "password123 should be found in the breach database");
    }

    #[test]
    #[ignore = "requires HIBP dataset"]
    fn test_non_breached_password() {
        let path = dataset_path_from_env();
        let checker = BreachChecker::new(&path);
        // "hAwT?}cuC:r#kW5" is a complex random password that shouldn't be in breaches
        let result = checker.is_breached("hAwT?}cuC:r#kW5").unwrap();
        assert!(
            !result,
            "random complex password should not be in the breach database"
        );
    }

    #[test]
    fn test_binary_search_sha1t48() {
        // A small sorted dataset of four records.
        let data: Vec<u8> = vec![
            0x00, 0x00, 0x00, 0x00, 0x00, 0x01, // record 0
            0x00, 0x00, 0x00, 0x00, 0x00, 0x05, // record 1
            0x00, 0x00, 0x00, 0x00, 0x00, 0x10, // record 2
            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, // record 3
        ];

        // Every stored record is found.
        assert!(contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0x01]));
        assert!(contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0x05]));
        assert!(contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0x10]));
        assert!(contains(&data, [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF]));

        // Keys below the first record, between records 0 and 1, and between
        // records 2 and 3 are not found.
        assert!(!contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0x00]));
        assert!(!contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0x02]));
        assert!(!contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0xFF]));
    }

    #[test]
    fn test_empty_data() {
        let data: Vec<u8> = vec![];
        assert!(!contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0x01]));
    }

    #[test]
    fn test_single_record() {
        let data: Vec<u8> = vec![0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0];

        assert!(contains(&data, [0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0]));
        assert!(!contains(&data, [0x00, 0x00, 0x00, 0x00, 0x00, 0x00]));
        assert!(!contains(&data, [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF]));
    }

    #[test]
    fn test_trailing_partial_record_ignored() {
        // One full record followed by three stray bytes: the incomplete tail is
        // not treated as a record and does not disturb the lookup.
        let data: Vec<u8> = vec![
            0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, // record 0
            0xAA, 0xBB, 0xCC, // partial record
        ];

        assert!(contains(&data, [0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0]));
        assert!(!contains(&data, [0xAA, 0xBB, 0xCC, 0x00, 0x00, 0x00]));
    }
}
472
#[cfg(all(test, feature = "tokio"))]
mod tokio_tests {
    use super::*;

    /// Passwords used to compare the sync and tokio code paths.
    const PARITY_PASSWORDS: [&str; 6] = [
        "password123",
        "123456",
        "qwerty",
        "hAwT?}cuC:r#kW5",
        "letmein",
        "xK9#mP2$vL7@nQ4",
    ];

    /// A well-known breached password must be reported as breached.
    #[tokio::test]
    #[ignore = "requires HIBP dataset"]
    async fn test_async_breached_password() {
        let dataset = dataset_path_from_env();
        let found = BreachChecker::new(&dataset)
            .is_breached_async("password123")
            .await
            .unwrap();

        assert!(found, "password123 should be found in breach database");
    }

    /// A random complex password must be reported as not breached.
    #[tokio::test]
    #[ignore = "requires HIBP dataset"]
    async fn test_async_non_breached_password() {
        let dataset = dataset_path_from_env();
        let found = BreachChecker::new(&dataset)
            .is_breached_async("hAwT?}cuC:r#kW5")
            .await
            .unwrap();

        assert!(!found, "random password should not be in breach database");
    }

    /// The tokio path must agree with the sync path for every sample password.
    #[tokio::test]
    #[ignore = "requires HIBP dataset"]
    async fn test_async_matches_sync() {
        let dataset = dataset_path_from_env();
        let checker = BreachChecker::new(&dataset);

        for password in PARITY_PASSWORDS {
            let expected = checker.is_breached(password).unwrap();
            let actual = checker.is_breached_async(password).await.unwrap();
            assert_eq!(
                expected, actual,
                "sync and async results should match for '{}'",
                password
            );
        }
    }
}
523
#[cfg(all(test, feature = "compio"))]
mod compio_tests {
    use compio::runtime as compio_runtime;

    use super::*;

    /// Passwords used to compare the sync and compio code paths.
    const PARITY_PASSWORDS: [&str; 6] = [
        "password123",
        "123456",
        "qwerty",
        "hAwT?}cuC:r#kW5",
        "letmein",
        "xK9#mP2$vL7@nQ4",
    ];

    /// A well-known breached password must be reported as breached.
    #[test]
    #[ignore = "requires HIBP dataset"]
    fn test_compio_breached_password() {
        let dataset = dataset_path_from_env();
        let runtime = compio_runtime::Runtime::new().unwrap();

        runtime.block_on(async {
            let found = BreachChecker::new(&dataset)
                .is_breached_compio("password123")
                .await
                .unwrap();
            assert!(found, "password123 should be found in breach database");
        });
    }

    /// A random complex password must be reported as not breached.
    #[test]
    #[ignore = "requires HIBP dataset"]
    fn test_compio_non_breached_password() {
        let dataset = dataset_path_from_env();
        let runtime = compio_runtime::Runtime::new().unwrap();

        runtime.block_on(async {
            let found = BreachChecker::new(&dataset)
                .is_breached_compio("hAwT?}cuC:r#kW5")
                .await
                .unwrap();
            assert!(!found, "random password should not be in breach database");
        });
    }

    /// The compio path must agree with the sync path for every sample password.
    #[test]
    #[ignore = "requires HIBP dataset"]
    fn test_compio_matches_sync() {
        let dataset = dataset_path_from_env();
        let runtime = compio_runtime::Runtime::new().unwrap();

        runtime.block_on(async {
            let checker = BreachChecker::new(&dataset);

            for password in PARITY_PASSWORDS {
                let expected = checker.is_breached(password).unwrap();
                let actual = checker.is_breached_compio(password).await.unwrap();
                assert_eq!(
                    expected, actual,
                    "sync and compio results should match for '{}'",
                    password
                );
            }
        });
    }
}