Skip to main content

tirith_core/
threatdb.rs

1//! Threat intelligence database — binary format with sorted sections for O(log n) lookup.
2//!
3//! The DB file (`tirith-threatdb.dat`) is compiled daily by CI from open threat feeds,
4//! signed with Ed25519, and distributed via GitHub Releases.
5//!
6//! ## Binary layout
7//!
8//! | Offset | Field | Size |
9//! |--------|-------|------|
10//! | 0 | Magic `TIRITHDB` | 8 |
11//! | 8 | Format version (u32 LE) | 4 |
12//! | 12 | Build timestamp (u64 LE, Unix epoch secs) | 8 |
13//! | 20 | Build sequence (u64 LE, monotonic) | 8 |
14//! | 28 | Section 1 (packages) offset (u32 LE) | 4 |
15//! | 32 | Section 1 count (u32 LE) | 4 |
16//! | 36 | Section 2 (hostnames) offset (u32 LE) | 4 |
17//! | 40 | Section 2 count (u32 LE) | 4 |
18//! | 44 | Section 3 (IPs) offset (u32 LE) | 4 |
19//! | 48 | Section 3 count (u32 LE) | 4 |
20//! | 52 | Section 4 (typosquats) offset (u32 LE) | 4 |
21//! | 56 | Section 4 count (u32 LE) | 4 |
22//! | 60 | Section 5 (popular pkgs) offset (u32 LE) | 4 |
23//! | 64 | Section 5 count (u32 LE) | 4 |
24//! | 68 | Section 6 (string table) offset (u32 LE) | 4 |
25//! | 72 | Section 6 size (u32 LE, bytes) | 4 |
26//! | 76 | Signer pubkey fingerprint (SHA-256, 32 bytes) | 32 |
27//! | 108 | Ed25519 signature (64 bytes) | 64 |
28//! | 172 | (sections data follows) | ... |
29//!
30//! Signature covers bytes `[0..108)` (header before sig) ++ bytes `[172..)` (all section data).
31
32use std::net::Ipv4Addr;
33use std::path::{Path, PathBuf};
34use std::sync::atomic::{AtomicU64, Ordering};
35use std::sync::{Arc, OnceLock, RwLock};
36
37use ed25519_dalek::{Signature, VerifyingKey, PUBLIC_KEY_LENGTH, SIGNATURE_LENGTH};
38use sha2::{Digest, Sha256};
39use thiserror::Error;
40
41use crate::policy;
42use crate::util::levenshtein;
43
/// Magic bytes that open every threat DB file.
const MAGIC: &[u8; 8] = b"TIRITHDB";
/// The only on-disk format version this build accepts.
const FORMAT_VERSION: u32 = 1;
/// Size of the fixed header in bytes; section data follows it.
const HEADER_SIZE: usize = 172;
/// Byte offset of the Ed25519 signature inside the header.
const SIG_OFFSET: usize = 108;
/// Byte offset of the signer public-key fingerprint inside the header.
const FINGERPRINT_OFFSET: usize = 76;
/// Length in bytes of the signer fingerprint (a SHA-256 digest).
const FINGERPRINT_LEN: usize = 32;
/// File name of the primary threat DB inside the data directory.
const DB_FILENAME: &str = "tirith-threatdb.dat";
/// File name of the optional supplemental threat DB inside the data directory.
const SUPPLEMENTAL_DB_FILENAME: &str = "tirith-threatdb-supplemental.dat";
/// Re-check file mtime at most every 60 seconds.
const MTIME_CHECK_INTERVAL_SECS: u64 = 60;
57
58/// Ed25519 verification key for threat DB signatures, compiled into the binary.
59/// The corresponding private key is stored as a GitHub Actions secret (THREATDB_SIGNING_KEY).
60static VERIFY_KEY_BYTES: &[u8; PUBLIC_KEY_LENGTH] =
61    include_bytes!("../assets/keys/threatdb-verify.pub");
62
/// Package ecosystem identifier, stored in the DB as a single byte.
///
/// The discriminant of each variant is the on-disk tag value.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum Ecosystem {
    Npm = 0,
    PyPI = 1,
    RubyGems = 2,
    Crates = 3,
    Go = 4,
    Maven = 5,
    NuGet = 6,
    Packagist = 7,
}

impl std::fmt::Display for Ecosystem {
    /// Writes the canonical lowercase name of the ecosystem.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let canonical = match self {
            Ecosystem::Npm => "npm",
            Ecosystem::PyPI => "pypi",
            Ecosystem::RubyGems => "rubygems",
            Ecosystem::Crates => "crates.io",
            Ecosystem::Go => "go",
            Ecosystem::Maven => "maven",
            Ecosystem::NuGet => "nuget",
            Ecosystem::Packagist => "packagist",
        };
        f.write_str(canonical)
    }
}

impl Ecosystem {
    /// Decode the on-disk tag byte; unknown tags yield `None`.
    fn from_u8(v: u8) -> Option<Self> {
        // Indexed by tag value, so the order must mirror the enum discriminants.
        const BY_TAG: [Ecosystem; 8] = [
            Ecosystem::Npm,
            Ecosystem::PyPI,
            Ecosystem::RubyGems,
            Ecosystem::Crates,
            Ecosystem::Go,
            Ecosystem::Maven,
            Ecosystem::NuGet,
            Ecosystem::Packagist,
        ];
        BY_TAG.get(usize::from(v)).copied()
    }

    /// Parse an ecosystem from its string name (case-insensitive).
    ///
    /// Besides the canonical names, `"crates"` and `"cargo"` are accepted as
    /// aliases for [`Ecosystem::Crates`]. Unrecognised names yield `None`.
    pub fn from_name(s: &str) -> Option<Self> {
        let lowered = s.to_lowercase();
        let eco = match lowered.as_str() {
            "npm" => Self::Npm,
            "pypi" => Self::PyPI,
            "rubygems" => Self::RubyGems,
            "crates.io" | "crates" | "cargo" => Self::Crates,
            "go" => Self::Go,
            "maven" => Self::Maven,
            "nuget" => Self::NuGet,
            "packagist" => Self::Packagist,
            _ => return None,
        };
        Some(eco)
    }
}
123
124/// Origin of the threat intelligence signal.
125#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
126#[repr(u8)]
127pub enum ThreatSource {
128    OssfMalicious = 0,
129    DatadogMalicious = 1,
130    FeodoTracker = 2,
131    EcosystemsTyposquat = 3,
132    CisaKev = 4,
133    Urlhaus = 5,
134    PhishingArmy = 6,
135    PhishTank = 7,
136    ThreatFoxIoc = 8,
137    FireholIp = 9,
138    TorExit = 10,
139}
140
141impl ThreatSource {
142    fn from_u8(v: u8) -> Option<Self> {
143        match v {
144            0 => Some(Self::OssfMalicious),
145            1 => Some(Self::DatadogMalicious),
146            2 => Some(Self::FeodoTracker),
147            3 => Some(Self::EcosystemsTyposquat),
148            4 => Some(Self::CisaKev),
149            5 => Some(Self::Urlhaus),
150            6 => Some(Self::PhishingArmy),
151            7 => Some(Self::PhishTank),
152            8 => Some(Self::ThreatFoxIoc),
153            9 => Some(Self::FireholIp),
154            10 => Some(Self::TorExit),
155            _ => None,
156        }
157    }
158
159    /// Human-readable label for display.
160    pub fn label(&self) -> &'static str {
161        match self {
162            Self::OssfMalicious => "OSSF Malicious Packages",
163            Self::DatadogMalicious => "Datadog Malicious Packages",
164            Self::FeodoTracker => "Feodo Tracker",
165            Self::EcosystemsTyposquat => "ecosyste.ms Typosquats",
166            Self::CisaKev => "CISA KEV",
167            Self::Urlhaus => "URLhaus",
168            Self::PhishingArmy => "Phishing Army",
169            Self::PhishTank => "PhishTank",
170            Self::ThreatFoxIoc => "ThreatFox IOC",
171            Self::FireholIp => "FireHOL IP",
172            Self::TorExit => "Tor Exit Node",
173        }
174    }
175
176    /// Default confidence level for network-indicator sources (hostnames, IPs).
177    /// Package-level matches carry their own per-record confidence from the DB.
178    pub fn default_confidence(self) -> Confidence {
179        match self {
180            Self::TorExit => Confidence::Medium,
181            Self::OssfMalicious
182            | Self::DatadogMalicious
183            | Self::FeodoTracker
184            | Self::EcosystemsTyposquat
185            | Self::CisaKev
186            | Self::Urlhaus
187            | Self::PhishingArmy
188            | Self::PhishTank
189            | Self::ThreatFoxIoc
190            | Self::FireholIp => Confidence::Confirmed,
191        }
192    }
193}
194
195/// Confidence level for a threat match.
196#[derive(
197    Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, serde::Serialize, serde::Deserialize,
198)]
199#[serde(rename_all = "lowercase")]
200#[repr(u8)]
201pub enum Confidence {
202    Low = 0,
203    Medium = 1,
204    Confirmed = 2,
205}
206
207impl Confidence {
208    fn from_u8(v: u8) -> Option<Self> {
209        match v {
210            0 => Some(Self::Low),
211            1 => Some(Self::Medium),
212            2 => Some(Self::Confirmed),
213            _ => None,
214        }
215    }
216}
217
218/// Result of a package, hostname, or IP lookup in the threat DB.
219///
220/// `ecosystem` is `Some` for package matches and `None` for hostname/IP matches
221/// (where the concept of ecosystem does not apply).
222/// `all_versions_malicious` is only meaningful for package matches.
223#[derive(Debug, Clone)]
224pub struct ThreatMatch {
225    pub ecosystem: Option<Ecosystem>,
226    pub name: String,
227    pub source: ThreatSource,
228    pub confidence: Confidence,
229    pub reference_url: Option<String>,
230    pub all_versions_malicious: bool,
231}
232
233/// Result of a typosquat lookup.
234#[derive(Debug, Clone)]
235pub struct TyposquatMatch {
236    pub ecosystem: Ecosystem,
237    pub malicious_name: String,
238    pub target_name: String,
239}
240
/// Summary numbers describing a loaded DB.
///
/// `Default` yields an all-zero value.
#[derive(Debug, Clone, Default)]
pub struct ThreatDbStats {
    /// On-disk format version from the header.
    pub format_version: u32,
    /// Build time from the header (Unix epoch seconds).
    pub build_timestamp: u64,
    /// Monotonic build sequence number from the header.
    pub build_sequence: u64,
    /// Number of package index entries.
    pub package_count: u32,
    /// Number of hostname index entries.
    pub hostname_count: u32,
    /// Number of IP records.
    pub ip_count: u32,
    /// Number of typosquat index entries.
    pub typosquat_count: u32,
    /// Number of popular-package index entries.
    pub popular_count: u32,
    /// Size of the string table in bytes.
    pub string_table_bytes: u32,
}
254
255#[derive(Debug, Error)]
256pub enum ThreatDbError {
257    #[error("invalid magic: expected TIRITHDB")]
258    InvalidMagic,
259    #[error("unsupported format version {0}")]
260    UnsupportedVersion(u32),
261    #[error("file too small: {0} bytes, need at least {HEADER_SIZE}")]
262    FileTooSmall(usize),
263    #[error("section offset/count out of bounds")]
264    SectionOutOfBounds,
265    #[error("invalid signature")]
266    InvalidSignature,
267    #[error("signer fingerprint mismatch")]
268    FingerprintMismatch,
269    #[error("rollback detected: sequence {got} <= current {current}")]
270    RollbackDetected { got: u64, current: u64 },
271    #[error("I/O error: {0}")]
272    Io(#[from] std::io::Error),
273    #[error("invalid record at offset {0}")]
274    InvalidRecord(usize),
275    #[error("string table offset out of bounds: {0}")]
276    StringOutOfBounds(u32),
277}
278
/// Size in bytes of one package index entry.
///
/// Package records are NOT fixed-size (the name is variable-length, capped at
/// 256 bytes), so they are stored length-prefixed in a contiguous buffer and
/// located through a separate fixed-size index that is binary-searched.
///
/// Index entry (8 bytes each):
///   offset_into_data(u32 LE)
///   key_hash(u32 LE) — FNV-1a of (ecosystem, name) for fast comparison
///
/// Data record (variable):
///   ecosystem(u8)
///   name_len(u16 LE)
///   name(name_len bytes)
///   source(u8)
///   confidence(u8)
///   flags(u8) — bit 0 = all_versions_malicious
///   version_count(u16 LE)
///   [version strings: len(u16 LE) + bytes] * version_count
///   reference_offset(u32 LE) — into string table (0xFFFFFFFF = none)
const PKG_INDEX_ENTRY_SIZE: usize = 8;

/// Size in bytes of one IP record: IPv4 address (u32 LE) + source (u8).
const IP_RECORD_SIZE: usize = 5;

/// Size in bytes of one typosquat index entry: offset (u32 LE) + key_hash (u32 LE).
const TYPOSQUAT_INDEX_ENTRY_SIZE: usize = 8;

/// Size in bytes of one popular-package index entry: offset (u32 LE) + key_hash (u32 LE).
const POPULAR_INDEX_ENTRY_SIZE: usize = 8;

/// Size in bytes of one hostname index entry: offset (u32 LE) + key_hash (u32 LE).
const HOSTNAME_INDEX_ENTRY_SIZE: usize = 8;
311
/// FNV-1a 32-bit hash used for both string-table dedup and index key hashes.
///
/// Starts from the offset basis `0x811c9dc5`; each input byte is XORed in and
/// the state is then multiplied by the prime `0x01000193` with wrapping
/// arithmetic. Empty input returns the offset basis.
fn fnv1a_hash(data: &[u8]) -> u32 {
    data.iter().fold(0x811c_9dc5_u32, |state, &byte| {
        (state ^ u32::from(byte)).wrapping_mul(0x0100_0193)
    })
}
321
322fn pkg_key_hash(eco: Ecosystem, name: &[u8]) -> u32 {
323    let mut buf = Vec::with_capacity(1 + name.len());
324    buf.push(eco as u8);
325    buf.extend_from_slice(name);
326    fnv1a_hash(&buf)
327}
328
/// Read a little-endian `u16` from `buf` at byte offset `off`.
///
/// Returns `None` when fewer than two bytes are available at `off`, including
/// when `off + 2` would overflow `usize` (previously a panic in builds with
/// overflow checks).
fn read_u16_le(buf: &[u8], off: usize) -> Option<u16> {
    let end = off.checked_add(2)?;
    buf.get(off..end)
        .map(|b| u16::from_le_bytes([b[0], b[1]]))
}
333
/// Read a little-endian `u32` from `buf` at byte offset `off`.
///
/// Returns `None` when fewer than four bytes are available at `off`,
/// including when `off + 4` would overflow `usize` (previously a panic in
/// builds with overflow checks).
fn read_u32_le(buf: &[u8], off: usize) -> Option<u32> {
    let end = off.checked_add(4)?;
    buf.get(off..end)
        .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]]))
}
338
/// Read a little-endian `u64` from `buf` at byte offset `off`.
///
/// Returns `None` when fewer than eight bytes are available at `off`,
/// including when `off + 8` would overflow `usize` (previously a panic in
/// builds with overflow checks).
fn read_u64_le(buf: &[u8], off: usize) -> Option<u64> {
    let end = off.checked_add(8)?;
    buf.get(off..end)
        .map(|b| u64::from_le_bytes([b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7]]))
}
343
/// In-memory threat intelligence database loaded from the signed binary file.
///
/// The raw file bytes are kept as-is; lookups read records straight out of
/// `data` using the section offsets cached from the header.
#[derive(Debug)]
pub struct ThreatDb {
    /// Complete file contents, header included; section offsets index into it.
    data: Vec<u8>,
    /// Optional overlay DB that lookups fall back to when this one has no match.
    supplemental: Option<Box<ThreatDb>>,

    // --- Header fields cached at load time ---
    /// Format version (header offset 8).
    format_version: u32,
    /// Build time in Unix epoch seconds (header offset 12).
    build_timestamp: u64,
    /// Monotonic build sequence (header offset 20).
    build_sequence: u64,

    // --- Section locations and sizes, as recorded in the header ---
    /// Offset of the package index.
    pkg_index_offset: u32,
    /// Number of package index entries.
    pkg_index_count: u32,
    /// Offset of the hostname index.
    hostname_index_offset: u32,
    /// Number of hostname index entries.
    hostname_index_count: u32,
    /// Offset of the fixed-size IP records.
    ip_offset: u32,
    /// Number of IP records.
    ip_count: u32,
    /// Offset of the typosquat index.
    typosquat_index_offset: u32,
    /// Number of typosquat index entries.
    typosquat_index_count: u32,
    /// Offset of the popular-package index.
    popular_index_offset: u32,
    /// Number of popular-package index entries.
    popular_index_count: u32,
    /// Offset of the string table.
    string_table_offset: u32,
    /// Size of the string table in bytes.
    string_table_size: u32,
}
367
368impl ThreatDb {
369    /// Load and verify a threat DB from raw bytes.
370    ///
371    /// `min_sequence` enforces rollback protection — the DB is rejected if its
372    /// build sequence is <= this value.  Pass 0 to skip.
373    pub fn from_bytes(data: Vec<u8>, min_sequence: u64) -> Result<Self, ThreatDbError> {
374        if data.len() < HEADER_SIZE {
375            return Err(ThreatDbError::FileTooSmall(data.len()));
376        }
377
378        // Magic
379        if &data[0..8] != MAGIC {
380            return Err(ThreatDbError::InvalidMagic);
381        }
382
383        // Format version
384        let err = || ThreatDbError::InvalidRecord(0);
385        let version = read_u32_le(&data, 8).ok_or_else(err)?;
386        if version != FORMAT_VERSION {
387            return Err(ThreatDbError::UnsupportedVersion(version));
388        }
389
390        let build_timestamp = read_u64_le(&data, 12).ok_or_else(err)?;
391        let build_sequence = read_u64_le(&data, 20).ok_or_else(err)?;
392
393        // Rollback protection
394        if min_sequence > 0 && build_sequence <= min_sequence {
395            return Err(ThreatDbError::RollbackDetected {
396                got: build_sequence,
397                current: min_sequence,
398            });
399        }
400
401        // Section offsets/counts (all within bounds-checked HEADER_SIZE)
402        let pkg_index_offset = read_u32_le(&data, 28).ok_or_else(err)?;
403        let pkg_index_count = read_u32_le(&data, 32).ok_or_else(err)?;
404        let hostname_index_offset = read_u32_le(&data, 36).ok_or_else(err)?;
405        let hostname_index_count = read_u32_le(&data, 40).ok_or_else(err)?;
406        let ip_offset = read_u32_le(&data, 44).ok_or_else(err)?;
407        let ip_count = read_u32_le(&data, 48).ok_or_else(err)?;
408        let typosquat_index_offset = read_u32_le(&data, 52).ok_or_else(err)?;
409        let typosquat_index_count = read_u32_le(&data, 56).ok_or_else(err)?;
410        let popular_index_offset = read_u32_le(&data, 60).ok_or_else(err)?;
411        let popular_index_count = read_u32_le(&data, 64).ok_or_else(err)?;
412        let string_table_offset = read_u32_le(&data, 68).ok_or_else(err)?;
413        let string_table_size = read_u32_le(&data, 72).ok_or_else(err)?;
414
415        // Bounds checks on sections
416        let len = data.len() as u64;
417        let check_section = |off: u32, count: u32, entry_size: usize| -> bool {
418            let end = off as u64 + count as u64 * entry_size as u64;
419            end <= len
420        };
421
422        // IP section is fixed-size records
423        if !check_section(ip_offset, ip_count, IP_RECORD_SIZE) {
424            return Err(ThreatDbError::SectionOutOfBounds);
425        }
426
427        // Index sections — their entries are fixed size, but they point into
428        // variable-size data regions that live between sections.  We just
429        // validate the index extent here; individual records are validated
430        // lazily on access.
431        if !check_section(pkg_index_offset, pkg_index_count, PKG_INDEX_ENTRY_SIZE) {
432            return Err(ThreatDbError::SectionOutOfBounds);
433        }
434        if !check_section(
435            hostname_index_offset,
436            hostname_index_count,
437            HOSTNAME_INDEX_ENTRY_SIZE,
438        ) {
439            return Err(ThreatDbError::SectionOutOfBounds);
440        }
441        if !check_section(
442            typosquat_index_offset,
443            typosquat_index_count,
444            TYPOSQUAT_INDEX_ENTRY_SIZE,
445        ) {
446            return Err(ThreatDbError::SectionOutOfBounds);
447        }
448        if !check_section(
449            popular_index_offset,
450            popular_index_count,
451            POPULAR_INDEX_ENTRY_SIZE,
452        ) {
453            return Err(ThreatDbError::SectionOutOfBounds);
454        }
455
456        // String table
457        if (string_table_offset as u64 + string_table_size as u64) > len {
458            return Err(ThreatDbError::SectionOutOfBounds);
459        }
460
461        Ok(Self {
462            data,
463            supplemental: None,
464            format_version: version,
465            build_timestamp,
466            build_sequence,
467            pkg_index_offset,
468            pkg_index_count,
469            hostname_index_offset,
470            hostname_index_count,
471            ip_offset,
472            ip_count,
473            typosquat_index_offset,
474            typosquat_index_count,
475            popular_index_offset,
476            popular_index_count,
477            string_table_offset,
478            string_table_size,
479        })
480    }
481
482    /// Load from the default data directory (`~/.local/share/tirith/tirith-threatdb.dat`).
483    pub fn load_from_data_dir() -> Result<Self, ThreatDbError> {
484        let path = Self::default_path().ok_or_else(|| {
485            ThreatDbError::Io(std::io::Error::new(
486                std::io::ErrorKind::NotFound,
487                "cannot determine data directory",
488            ))
489        })?;
490        Self::load_from_path(&path, 0)
491    }
492
493    /// Load from a specific path with rollback protection.
494    pub fn load_from_path(path: &Path, min_sequence: u64) -> Result<Self, ThreatDbError> {
495        let data = std::fs::read(path)?;
496        Self::from_bytes(data, min_sequence)
497    }
498
499    /// Default filesystem path for the threat DB file.
500    ///
501    /// Checks `TIRITH_THREATDB_PATH` env var first (useful for testing with
502    /// fixture DBs), then falls back to `~/.local/share/tirith/tirith-threatdb.dat`.
503    pub fn default_path() -> Option<PathBuf> {
504        if let Ok(p) = std::env::var("TIRITH_THREATDB_PATH") {
505            if !p.is_empty() {
506                return Some(PathBuf::from(p));
507            }
508        }
509        policy::data_dir().map(|d| d.join(DB_FILENAME))
510    }
511
512    /// Optional supplemental DB path for user-local keyed feeds compiled on the
513    /// user's machine during `tirith threat-db update`.
514    pub fn supplemental_path() -> Option<PathBuf> {
515        if let Ok(p) = std::env::var("TIRITH_THREATDB_SUPPLEMENTAL_PATH") {
516            if !p.is_empty() {
517                return Some(PathBuf::from(p));
518            }
519        }
520        policy::data_dir().map(|d| d.join(SUPPLEMENTAL_DB_FILENAME))
521    }
522
523    fn with_supplemental(mut self, supplemental: Option<ThreatDb>) -> Self {
524        self.supplemental = supplemental.map(Box::new);
525        self
526    }
527
528    /// Verify the Ed25519 signature and signer fingerprint.
529    ///
530    /// Returns `Ok(())` if the signature is valid and the signer fingerprint
531    /// matches the embedded public key.  Returns `Err(reason)` on any error
532    /// (invalid key, corrupt data, wrong signer).
533    pub fn verify_signature(&self) -> Result<(), String> {
534        // Check signer fingerprint
535        let key_fingerprint = Sha256::digest(VERIFY_KEY_BYTES);
536        let stored_fp = &self.data[FINGERPRINT_OFFSET..FINGERPRINT_OFFSET + FINGERPRINT_LEN];
537        if key_fingerprint.as_slice() != stored_fp {
538            return Err("signer fingerprint does not match embedded public key".to_string());
539        }
540
541        // Parse the verification key
542        let verify_key = VerifyingKey::from_bytes(VERIFY_KEY_BYTES)
543            .map_err(|e| format!("invalid embedded public key: {e}"))?;
544
545        // Parse signature from header
546        let sig_bytes = &self.data[SIG_OFFSET..SIG_OFFSET + SIGNATURE_LENGTH];
547        let signature = Signature::from_slice(sig_bytes)
548            .map_err(|e| format!("invalid signature in header: {e}"))?;
549
550        // Signed message = header before sig ++ all data after header
551        let mut signed_data = Vec::with_capacity(SIG_OFFSET + (self.data.len() - HEADER_SIZE));
552        signed_data.extend_from_slice(&self.data[..SIG_OFFSET]);
553        signed_data.extend_from_slice(&self.data[HEADER_SIZE..]);
554
555        use ed25519_dalek::Verifier;
556        verify_key
557            .verify(&signed_data, &signature)
558            .map_err(|_| "Ed25519 signature verification failed".to_string())
559    }
560
561    pub fn build_time(&self) -> u64 {
562        self.build_timestamp
563    }
564
565    pub fn build_sequence(&self) -> u64 {
566        self.build_sequence
567    }
568
569    pub fn stats(&self) -> ThreatDbStats {
570        let overlay = self
571            .supplemental
572            .as_deref()
573            .map(|db| db.stats())
574            .unwrap_or_default();
575        ThreatDbStats {
576            format_version: self.format_version,
577            build_timestamp: self.build_timestamp,
578            build_sequence: self.build_sequence,
579            package_count: self.pkg_index_count + overlay.package_count,
580            hostname_count: self.hostname_index_count + overlay.hostname_count,
581            ip_count: self.ip_count + overlay.ip_count,
582            typosquat_count: self.typosquat_index_count + overlay.typosquat_count,
583            popular_count: self.popular_index_count + overlay.popular_count,
584            string_table_bytes: self.string_table_size + overlay.string_table_bytes,
585        }
586    }
587
588    fn read_string_table_entry(&self, offset: u32) -> Option<&str> {
589        if offset == 0xFFFF_FFFF {
590            return None;
591        }
592        let abs = self.string_table_offset as usize + offset as usize;
593        let len = read_u16_le(&self.data, abs)? as usize;
594        let start = abs + 2;
595        let end = start + len;
596        if end > self.data.len() {
597            return None;
598        }
599        std::str::from_utf8(&self.data[start..end]).ok()
600    }
601
602    /// Look up index entry: returns (data_offset, key_hash).
603    fn pkg_index_entry(&self, idx: u32) -> Option<(u32, u32)> {
604        let base = self.pkg_index_offset as usize + idx as usize * PKG_INDEX_ENTRY_SIZE;
605        let data_off = read_u32_le(&self.data, base)?;
606        let hash = read_u32_le(&self.data, base + 4)?;
607        Some((data_off, hash))
608    }
609
610    /// Parse a package record at an absolute offset.
611    fn parse_pkg_record(&self, off: usize) -> Option<PkgRecord<'_>> {
612        let eco = Ecosystem::from_u8(*self.data.get(off)?)?;
613        let name_len = read_u16_le(&self.data, off + 1)? as usize;
614        let name_start = off + 3;
615        let name_end = name_start + name_len;
616        if name_end + 4 > self.data.len() {
617            return None;
618        }
619        let name = std::str::from_utf8(&self.data[name_start..name_end]).ok()?;
620        let mut cursor = name_end;
621
622        let source = ThreatSource::from_u8(*self.data.get(cursor)?)?;
623        cursor += 1;
624        let confidence = Confidence::from_u8(*self.data.get(cursor)?)?;
625        cursor += 1;
626        let flags = *self.data.get(cursor)?;
627        cursor += 1;
628        let all_versions_malicious = (flags & 1) != 0;
629
630        let version_count = read_u16_le(&self.data, cursor)? as usize;
631        cursor += 2;
632
633        let mut versions = Vec::with_capacity(version_count);
634        for _ in 0..version_count {
635            let vlen = read_u16_le(&self.data, cursor)? as usize;
636            cursor += 2;
637            let vend = cursor + vlen;
638            if vend > self.data.len() {
639                return None;
640            }
641            let v = std::str::from_utf8(&self.data[cursor..vend]).ok()?;
642            versions.push(v);
643            cursor = vend;
644        }
645
646        let ref_offset = read_u32_le(&self.data, cursor)?;
647
648        Some(PkgRecord {
649            ecosystem: eco,
650            name,
651            source,
652            confidence,
653            all_versions_malicious,
654            versions,
655            reference_offset: ref_offset,
656        })
657    }
658
659    /// Check a package against the threat DB.
660    ///
661    /// - If `version` is `Some`, match if `all_versions_malicious` is set OR
662    ///   the version appears in the record's affected versions list.
663    /// - If `version` is `None`, match only if `all_versions_malicious` is set.
664    pub fn check_package(
665        &self,
666        eco: Ecosystem,
667        name: &str,
668        version: Option<&str>,
669    ) -> Option<ThreatMatch> {
670        let target_hash = pkg_key_hash(eco, name.as_bytes());
671
672        if let Some(idx) = self.binary_search_pkg_index(eco, name, target_hash) {
673            let (data_off, _) = self.pkg_index_entry(idx)?;
674            let rec = self.parse_pkg_record(data_off as usize)?;
675
676            // Version-aware matching
677            match version {
678                Some(v) => {
679                    if !rec.all_versions_malicious && !rec.versions.iter().any(|rv| rv == &v) {
680                        return self
681                            .supplemental
682                            .as_deref()
683                            .and_then(|db| db.check_package(eco, name, version));
684                    }
685                }
686                None => {
687                    if !rec.all_versions_malicious {
688                        return self
689                            .supplemental
690                            .as_deref()
691                            .and_then(|db| db.check_package(eco, name, version));
692                    }
693                }
694            }
695
696            let reference_url = self
697                .read_string_table_entry(rec.reference_offset)
698                .map(String::from);
699
700            return Some(ThreatMatch {
701                ecosystem: Some(rec.ecosystem),
702                name: rec.name.to_string(),
703                source: rec.source,
704                confidence: rec.confidence,
705                reference_url,
706                all_versions_malicious: rec.all_versions_malicious,
707            });
708        }
709
710        self.supplemental
711            .as_deref()
712            .and_then(|db| db.check_package(eco, name, version))
713    }
714
715    fn binary_search_pkg_index(&self, eco: Ecosystem, name: &str, target_hash: u32) -> Option<u32> {
716        if self.pkg_index_count == 0 {
717            return None;
718        }
719        let mut lo: u32 = 0;
720        let mut hi: u32 = self.pkg_index_count;
721        while lo < hi {
722            let mid = lo + (hi - lo) / 2;
723            let (data_off, hash) = self.pkg_index_entry(mid)?;
724
725            // Compare by hash first (fast path), then verify by parsing
726            match hash.cmp(&target_hash) {
727                std::cmp::Ordering::Less => lo = mid + 1,
728                std::cmp::Ordering::Greater => hi = mid,
729                std::cmp::Ordering::Equal => {
730                    // Hash match — verify actual key
731                    let rec = self.parse_pkg_record(data_off as usize)?;
732                    match (rec.ecosystem as u8, rec.name).cmp(&(eco as u8, name)) {
733                        std::cmp::Ordering::Equal => return Some(mid),
734                        std::cmp::Ordering::Less => lo = mid + 1,
735                        std::cmp::Ordering::Greater => hi = mid,
736                    }
737                }
738            }
739        }
740        None
741    }
742
743    /// Check a hostname against the threat DB.
744    pub fn check_hostname(&self, host: &str) -> Option<ThreatMatch> {
745        if self.hostname_index_count == 0 {
746            return self
747                .supplemental
748                .as_deref()
749                .and_then(|db| db.check_hostname(host));
750        }
751        let normalized = host.to_ascii_lowercase();
752        let target_hash = fnv1a_hash(normalized.as_bytes());
753
754        let Some(idx) = self.binary_search_hostname_index(&normalized, target_hash) else {
755            return self
756                .supplemental
757                .as_deref()
758                .and_then(|db| db.check_hostname(host));
759        };
760        let base = self.hostname_index_offset as usize + idx as usize * HOSTNAME_INDEX_ENTRY_SIZE;
761        let data_off = read_u32_le(&self.data, base)? as usize;
762
763        // Hostname data record: source(u8) + name_len(u16 LE) + name(bytes)
764        let source = ThreatSource::from_u8(*self.data.get(data_off)?)?;
765        let name_len = read_u16_le(&self.data, data_off + 1)? as usize;
766        let name_start = data_off + 3;
767        let name_end = name_start + name_len;
768        if name_end > self.data.len() {
769            return None;
770        }
771
772        Some(ThreatMatch {
773            ecosystem: None,
774            name: normalized,
775            confidence: source.default_confidence(),
776            source,
777            reference_url: None,
778            all_versions_malicious: false,
779        })
780    }
781
782    fn binary_search_hostname_index(&self, normalized: &str, target_hash: u32) -> Option<u32> {
783        if self.hostname_index_count == 0 {
784            return None;
785        }
786        let mut lo: u32 = 0;
787        let mut hi: u32 = self.hostname_index_count;
788        while lo < hi {
789            let mid = lo + (hi - lo) / 2;
790            let base =
791                self.hostname_index_offset as usize + mid as usize * HOSTNAME_INDEX_ENTRY_SIZE;
792            let _data_off = read_u32_le(&self.data, base)?;
793            let hash = read_u32_le(&self.data, base + 4)?;
794            match hash.cmp(&target_hash) {
795                std::cmp::Ordering::Less => lo = mid + 1,
796                std::cmp::Ordering::Greater => hi = mid,
797                std::cmp::Ordering::Equal => {
798                    // Verify: parse the actual hostname
799                    let data_off = _data_off as usize;
800                    let name_len = read_u16_le(&self.data, data_off + 1)? as usize;
801                    let name_start = data_off + 3;
802                    let name_end = name_start + name_len;
803                    if name_end > self.data.len() {
804                        return None;
805                    }
806                    let stored = std::str::from_utf8(&self.data[name_start..name_end]).ok()?;
807                    match stored.cmp(normalized) {
808                        std::cmp::Ordering::Equal => return Some(mid),
809                        std::cmp::Ordering::Less => lo = mid + 1,
810                        std::cmp::Ordering::Greater => hi = mid,
811                    }
812                }
813            }
814        }
815        None
816    }
817
818    /// Check an IPv4 address against the threat DB.
819    pub fn check_ip(&self, ip: Ipv4Addr) -> Option<ThreatMatch> {
820        if self.ip_count == 0 {
821            return self.supplemental.as_deref().and_then(|db| db.check_ip(ip));
822        }
823        let target = u32::from(ip);
824        let Some(idx) = self.binary_search_ip(target) else {
825            return self.supplemental.as_deref().and_then(|db| db.check_ip(ip));
826        };
827        let base = self.ip_offset as usize + idx as usize * IP_RECORD_SIZE;
828        let source = ThreatSource::from_u8(*self.data.get(base + 4)?)?;
829
830        Some(ThreatMatch {
831            ecosystem: None,
832            name: ip.to_string(),
833            confidence: source.default_confidence(),
834            source,
835            reference_url: None,
836            all_versions_malicious: false,
837        })
838    }
839
840    fn binary_search_ip(&self, target: u32) -> Option<u32> {
841        let mut lo: u32 = 0;
842        let mut hi: u32 = self.ip_count;
843        while lo < hi {
844            let mid = lo + (hi - lo) / 2;
845            let base = self.ip_offset as usize + mid as usize * IP_RECORD_SIZE;
846            let val = read_u32_le(&self.data, base)?;
847            match val.cmp(&target) {
848                std::cmp::Ordering::Less => lo = mid + 1,
849                std::cmp::Ordering::Greater => hi = mid,
850                std::cmp::Ordering::Equal => return Some(mid),
851            }
852        }
853        None
854    }
855
856    /// Check a package name against known typosquats.
857    pub fn check_typosquat(&self, eco: Ecosystem, name: &str) -> Option<TyposquatMatch> {
858        if self.typosquat_index_count == 0 {
859            return self
860                .supplemental
861                .as_deref()
862                .and_then(|db| db.check_typosquat(eco, name));
863        }
864        let target_hash = pkg_key_hash(eco, name.as_bytes());
865        let Some(idx) = self.binary_search_typosquat_index(eco, name, target_hash) else {
866            return self
867                .supplemental
868                .as_deref()
869                .and_then(|db| db.check_typosquat(eco, name));
870        };
871        let base = self.typosquat_index_offset as usize + idx as usize * TYPOSQUAT_INDEX_ENTRY_SIZE;
872        let data_off = read_u32_le(&self.data, base)? as usize;
873
874        // Typosquat data record:
875        //   ecosystem(u8) + mal_len(u16 LE) + mal(bytes) + tgt_len(u16 LE) + tgt(bytes)
876        let _eco = Ecosystem::from_u8(*self.data.get(data_off)?)?;
877        let mut cursor = data_off + 1;
878        let mal_len = read_u16_le(&self.data, cursor)? as usize;
879        cursor += 2;
880        let mal_end = cursor + mal_len;
881        if mal_end > self.data.len() {
882            return None;
883        }
884        let malicious_name = std::str::from_utf8(&self.data[cursor..mal_end]).ok()?;
885        cursor = mal_end;
886
887        let tgt_len = read_u16_le(&self.data, cursor)? as usize;
888        cursor += 2;
889        let tgt_end = cursor + tgt_len;
890        if tgt_end > self.data.len() {
891            return None;
892        }
893        let target_name = std::str::from_utf8(&self.data[cursor..tgt_end]).ok()?;
894
895        Some(TyposquatMatch {
896            ecosystem: eco,
897            malicious_name: malicious_name.to_string(),
898            target_name: target_name.to_string(),
899        })
900    }
901
902    fn binary_search_typosquat_index(
903        &self,
904        eco: Ecosystem,
905        name: &str,
906        target_hash: u32,
907    ) -> Option<u32> {
908        let mut lo: u32 = 0;
909        let mut hi: u32 = self.typosquat_index_count;
910        while lo < hi {
911            let mid = lo + (hi - lo) / 2;
912            let base =
913                self.typosquat_index_offset as usize + mid as usize * TYPOSQUAT_INDEX_ENTRY_SIZE;
914            let _data_off = read_u32_le(&self.data, base)?;
915            let hash = read_u32_le(&self.data, base + 4)?;
916            match hash.cmp(&target_hash) {
917                std::cmp::Ordering::Less => lo = mid + 1,
918                std::cmp::Ordering::Greater => hi = mid,
919                std::cmp::Ordering::Equal => {
920                    // Verify actual key
921                    let data_off = _data_off as usize;
922                    let rec_eco = Ecosystem::from_u8(*self.data.get(data_off)?)?;
923                    let mal_len = read_u16_le(&self.data, data_off + 1)? as usize;
924                    let mal_start = data_off + 3;
925                    let mal_end = mal_start + mal_len;
926                    if mal_end > self.data.len() {
927                        return None;
928                    }
929                    let stored = std::str::from_utf8(&self.data[mal_start..mal_end]).ok()?;
930                    match (rec_eco as u8, stored).cmp(&(eco as u8, name)) {
931                        std::cmp::Ordering::Equal => return Some(mid),
932                        std::cmp::Ordering::Less => lo = mid + 1,
933                        std::cmp::Ordering::Greater => hi = mid,
934                    }
935                }
936            }
937        }
938        None
939    }
940
941    /// Find the closest popular package name within Levenshtein distance.
942    /// Returns `(popular_name, distance)` if distance <= 1.
943    pub fn check_popular_distance(&self, eco: Ecosystem, name: &str) -> Option<(String, usize)> {
944        // Linear scan is fine for ~5k short names.
945        let mut best: Option<(String, usize)> = None;
946        let max_distance = 1;
947
948        for i in 0..self.popular_index_count {
949            let base = self.popular_index_offset as usize + i as usize * POPULAR_INDEX_ENTRY_SIZE;
950            let data_off = match read_u32_le(&self.data, base) {
951                Some(v) => v as usize,
952                None => continue,
953            };
954
955            // Popular data record: ecosystem(u8) + name_len(u16 LE) + name(bytes)
956            let rec_eco = match self.data.get(data_off).and_then(|&b| Ecosystem::from_u8(b)) {
957                Some(e) => e,
958                None => continue,
959            };
960            if rec_eco != eco {
961                continue;
962            }
963
964            let name_len = match read_u16_le(&self.data, data_off + 1) {
965                Some(l) => l as usize,
966                None => continue,
967            };
968            let name_start = data_off + 3;
969            let name_end = name_start + name_len;
970            if name_end > self.data.len() {
971                continue;
972            }
973            let popular_name = match std::str::from_utf8(&self.data[name_start..name_end]) {
974                Ok(s) => s,
975                Err(_) => continue,
976            };
977
978            // Skip exact matches (the package itself is popular — not suspicious)
979            if popular_name == name {
980                continue;
981            }
982
983            let dist = levenshtein(name, popular_name);
984            if dist <= max_distance {
985                match &best {
986                    Some((_, d)) if dist < *d => {
987                        best = Some((popular_name.to_string(), dist));
988                    }
989                    None => {
990                        best = Some((popular_name.to_string(), dist));
991                    }
992                    _ => {}
993                }
994            }
995        }
996
997        let overlay = self
998            .supplemental
999            .as_deref()
1000            .and_then(|db| db.check_popular_distance(eco, name));
1001
1002        // Return whichever result has the smaller edit distance; prefer primary on tie.
1003        match (best, overlay) {
1004            (Some(a), Some(b)) if b.1 < a.1 => Some(b),
1005            (Some(a), _) => Some(a),
1006            (None, b) => b,
1007        }
1008    }
1009
1010    /// Get the cached threat DB instance, loading/reloading as needed.
1011    ///
1012    /// Re-checks file mtime every 60 seconds.  If the file changed, reloads
1013    /// into a new `Arc`; readers holding the old `Arc` finish safely.
1014    ///
1015    /// Returns `None` if no DB file exists or loading fails (fail-open).
1016    pub fn cached() -> Option<Arc<ThreatDb>> {
1017        let cache = CACHE.get_or_init(ThreatDbCache::new);
1018        cache.get()
1019    }
1020
1021    /// Force-refresh the cached DB (useful after download).
1022    pub fn refresh_cache() {
1023        if let Some(cache) = CACHE.get() {
1024            cache.force_reload();
1025        }
1026    }
1027}
1028
// Internal parsed record (borrows from DB data).
//
// NOTE(review): the code that fills this struct in is outside this excerpt.
// The field notes below are inferred from the package record layout that
// `ThreatDbWriter::build` emits — confirm against the parser.
struct PkgRecord<'a> {
    ecosystem: Ecosystem,
    // Package name, borrowed for lifetime `'a` instead of copied.
    name: &'a str,
    source: ThreatSource,
    confidence: Confidence,
    all_versions_malicious: bool,
    // Version strings attached to the record, also borrowed.
    versions: Vec<&'a str>,
    // Presumably an offset into the string table; the writer stores
    // 0xFFFF_FFFF when a package has no reference — verify how it is decoded.
    reference_offset: u32,
}
1039
/// Process-wide threat DB cache, created on first use by `ThreatDb::cached()`.
static CACHE: OnceLock<ThreatDbCache> = OnceLock::new();
1041
/// Holds the currently loaded threat DB together with the bookkeeping used to
/// decide when the on-disk files should be looked at again.
struct ThreatDbCache {
    /// The loaded DB, or `None` when nothing has been loaded successfully.
    db: RwLock<Option<Arc<ThreatDb>>>,
    /// Unix time (seconds) of the most recent mtime check performed by `get`.
    last_mtime_check: AtomicU64,
    /// Value of `combined_mtime_epoch()` recorded when the current DB was
    /// installed; 0 until a load succeeds.
    loaded_mtime: AtomicU64,
}
1047
1048impl ThreatDbCache {
1049    fn new() -> Self {
1050        let cache = Self {
1051            db: RwLock::new(None),
1052            last_mtime_check: AtomicU64::new(0),
1053            loaded_mtime: AtomicU64::new(0),
1054        };
1055        // Attempt initial load
1056        cache.force_reload();
1057        cache
1058    }
1059
1060    fn get(&self) -> Option<Arc<ThreatDb>> {
1061        let now = unix_now();
1062        let last_check = self.last_mtime_check.load(Ordering::Relaxed);
1063        if now.saturating_sub(last_check) >= MTIME_CHECK_INTERVAL_SECS {
1064            self.last_mtime_check.store(now, Ordering::Relaxed);
1065            if let Some(file_mtime) = combined_mtime_epoch() {
1066                if file_mtime != self.loaded_mtime.load(Ordering::Relaxed) {
1067                    self.reload(file_mtime);
1068                }
1069            }
1070        }
1071        self.db.read().ok()?.clone()
1072    }
1073
1074    fn force_reload(&self) {
1075        if let Some(file_mtime) = combined_mtime_epoch() {
1076            self.reload(file_mtime);
1077        }
1078    }
1079
1080    fn reload(&self, file_mtime: u64) {
1081        let min_seq = self
1082            .db
1083            .read()
1084            .ok()
1085            .and_then(|guard| guard.as_ref().map(|db| db.build_sequence))
1086            .unwrap_or(0);
1087
1088        match ThreatDb::load_from_data_dir() {
1089            Ok(primary_db) => {
1090                if let Err(e) = primary_db.verify_signature() {
1091                    eprintln!(
1092                        "tirith: warning: threat DB failed signature verification, ignoring update: {e}"
1093                    );
1094                    return;
1095                }
1096                // Supplemental DBs are user-local overlays built from optional keyed feeds.
1097                // They are intentionally not verified against the pinned release signing key:
1098                // `load_from_path()` still validates the binary structure/header/version, but
1099                // authenticity is anchored to the local machine policy and filesystem, not CI.
1100                let supplemental_db = ThreatDb::supplemental_path()
1101                    .filter(|path| path.exists())
1102                    .and_then(|path| match ThreatDb::load_from_path(&path, 0) {
1103                        Ok(db) => Some(db),
1104                        Err(e) => {
1105                            eprintln!(
1106                                "tirith: warning: failed to load supplemental threat DB {}: {e}",
1107                                path.display()
1108                            );
1109                            None
1110                        }
1111                    });
1112                let new_db = primary_db.with_supplemental(supplemental_db);
1113                // Skip rollback check on reload — the from_bytes already checks min_sequence=0.
1114                // We accept any newer sequence.
1115                if new_db.build_sequence > min_seq || min_seq == 0 {
1116                    if let Ok(mut guard) = self.db.write() {
1117                        *guard = Some(Arc::new(new_db));
1118                        self.loaded_mtime.store(file_mtime, Ordering::Relaxed);
1119                    }
1120                }
1121            }
1122            Err(e) => {
1123                eprintln!("tirith: warning: failed to reload threat DB: {e}");
1124            }
1125        }
1126    }
1127}
1128
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Falls back to 0 when the system clock reports a time before the epoch.
fn unix_now() -> u64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    }
}
1135
1136fn file_mtime_epoch() -> Option<u64> {
1137    let path = ThreatDb::default_path()?;
1138    let meta = std::fs::metadata(&path).ok()?;
1139    meta.modified()
1140        .ok()?
1141        .duration_since(std::time::UNIX_EPOCH)
1142        .ok()
1143        .map(|d| d.as_secs())
1144}
1145
1146fn combined_mtime_epoch() -> Option<u64> {
1147    let primary = file_mtime_epoch();
1148    let supplemental = ThreatDb::supplemental_path()
1149        .and_then(|path| std::fs::metadata(path).ok())
1150        .and_then(|meta| meta.modified().ok())
1151        .and_then(|mtime| mtime.duration_since(std::time::UNIX_EPOCH).ok())
1152        .map(|d| d.as_secs())
1153        .unwrap_or(0);
1154
1155    // A supplemental DB is never authoritative on its own, so we only expose a
1156    // combined mtime when the primary signed DB exists.
1157    primary
1158        .map(|mtime| mtime.rotate_left(13) ^ supplemental.rotate_left(29) ^ 0x5448_5245_4154_4442)
1159}
1160
/// Builder for creating threat DB files (used by the compiler binary to
/// produce `.dat` files).
///
/// Entries are collected with the `add_*` methods in any order; `build`
/// sorts and de-duplicates them before serializing and signing.
///
/// Usage:
/// ```ignore
/// let mut writer = ThreatDbWriter::new(build_timestamp, build_sequence);
/// writer.add_package(Ecosystem::Npm, "evil-pkg", &["1.0.0"], ThreatSource::OssfMalicious,
///                    Confidence::Confirmed, true, Some("https://ref"));
/// writer.add_ip(Ipv4Addr::new(1,2,3,4), ThreatSource::FeodoTracker);
/// writer.write_to(Path::new("threatdb.dat"), &signing_key)?;
/// ```
pub struct ThreatDbWriter {
    /// Written to the header as the build timestamp field.
    build_timestamp: u64,
    /// Written to the header as the build sequence field.
    build_sequence: u64,
    /// Malicious package entries collected so far.
    packages: Vec<WriterPkg>,
    /// Hostname entries collected so far.
    hostnames: Vec<WriterHostname>,
    /// IPv4 entries collected so far.
    ips: Vec<WriterIp>,
    /// Typosquat entries collected so far.
    typosquats: Vec<WriterTyposquat>,
    /// Popular package names collected so far.
    popular: Vec<WriterPopular>,
    /// Interned reference strings; package entries point into it by offset.
    string_table: StringTable,
}
1182
/// A package entry queued by `ThreatDbWriter::add_package`.
struct WriterPkg {
    ecosystem: Ecosystem,
    name: String,
    /// Version strings supplied by the caller; may be empty.
    versions: Vec<String>,
    source: ThreatSource,
    confidence: Confidence,
    all_versions_malicious: bool,
    /// Offset of the reference string in the string table, or `0xFFFF_FFFF`
    /// when the entry was added without a reference.
    reference_offset: u32,
}
1192
/// A hostname entry queued by `ThreatDbWriter::add_hostname`.
struct WriterHostname {
    /// Hostname, stored ASCII-lowercased by `add_hostname`.
    name: String,
    source: ThreatSource,
}
1197
/// An IPv4 entry queued by `ThreatDbWriter::add_ip`.
struct WriterIp {
    /// The address converted with `u32::from(Ipv4Addr)`.
    addr: u32,
    source: ThreatSource,
}
1202
/// A typosquat entry queued by `ThreatDbWriter::add_typosquat`.
struct WriterTyposquat {
    ecosystem: Ecosystem,
    /// Name of the typosquatting package; together with `ecosystem` it is the
    /// key the entry is sorted and de-duplicated by.
    malicious_name: String,
    /// Name stored alongside it as the package being imitated.
    target_name: String,
}
1208
/// A popular-package entry queued by `ThreatDbWriter::add_popular`.
struct WriterPopular {
    ecosystem: Ecosystem,
    name: String,
}
1213
/// Deduplicated string table builder.
///
/// Strings are stored back to back, each as a `u16` little-endian byte length
/// followed by the UTF-8 bytes. Interning the same string twice returns the
/// offset of the first copy.
struct StringTable {
    /// Serialized table contents.
    data: Vec<u8>,
    /// Maps every interned string to the offset of its entry in `data`.
    index: std::collections::HashMap<String, u32>,
}

impl StringTable {
    /// Create an empty table.
    fn new() -> Self {
        Self {
            data: Vec::new(),
            index: std::collections::HashMap::new(),
        }
    }

    /// Intern a string, returning its offset within the table.
    ///
    /// The length prefix is a `u16`, so at most 65535 bytes can be stored per
    /// entry. Longer strings are cut at the last character boundary that fits,
    /// which keeps the prefix and the stored bytes in agreement and the stored
    /// bytes valid UTF-8. Deduplication is keyed on the full input string.
    fn intern(&mut self, s: &str) -> u32 {
        if let Some(&off) = self.index.get(s) {
            return off;
        }

        // Largest prefix of `s` that fits the u16 length field without
        // splitting a multi-byte character. Offset 0 is always a boundary, so
        // the loop terminates.
        let mut stored_len = s.len().min(usize::from(u16::MAX));
        while !s.is_char_boundary(stored_len) {
            stored_len -= 1;
        }

        let off = self.data.len() as u32;
        self.data
            .extend_from_slice(&(stored_len as u16).to_le_bytes());
        self.data.extend_from_slice(&s.as_bytes()[..stored_len]);
        self.index.insert(s.to_string(), off);
        off
    }

    /// The serialized table.
    fn bytes(&self) -> &[u8] {
        &self.data
    }

    /// Size of the serialized table in bytes.
    fn len(&self) -> u32 {
        self.data.len() as u32
    }
}
1250
1251impl ThreatDbWriter {
1252    pub fn new(build_timestamp: u64, build_sequence: u64) -> Self {
1253        Self {
1254            build_timestamp,
1255            build_sequence,
1256            packages: Vec::new(),
1257            hostnames: Vec::new(),
1258            ips: Vec::new(),
1259            typosquats: Vec::new(),
1260            popular: Vec::new(),
1261            string_table: StringTable::new(),
1262        }
1263    }
1264
1265    #[allow(clippy::too_many_arguments)]
1266    pub fn add_package(
1267        &mut self,
1268        eco: Ecosystem,
1269        name: &str,
1270        versions: &[&str],
1271        source: ThreatSource,
1272        confidence: Confidence,
1273        all_versions_malicious: bool,
1274        reference: Option<&str>,
1275    ) {
1276        let ref_offset = match reference {
1277            Some(r) => self.string_table.intern(r),
1278            None => 0xFFFF_FFFF,
1279        };
1280        self.packages.push(WriterPkg {
1281            ecosystem: eco,
1282            name: name.to_string(),
1283            versions: versions.iter().map(|v| v.to_string()).collect(),
1284            source,
1285            confidence,
1286            all_versions_malicious,
1287            reference_offset: ref_offset,
1288        });
1289    }
1290
1291    pub fn add_hostname(&mut self, name: &str, source: ThreatSource) {
1292        self.hostnames.push(WriterHostname {
1293            name: name.to_ascii_lowercase(),
1294            source,
1295        });
1296    }
1297
1298    pub fn add_ip(&mut self, addr: Ipv4Addr, source: ThreatSource) {
1299        self.ips.push(WriterIp {
1300            addr: u32::from(addr),
1301            source,
1302        });
1303    }
1304
1305    pub fn add_typosquat(&mut self, eco: Ecosystem, malicious_name: &str, target_name: &str) {
1306        self.typosquats.push(WriterTyposquat {
1307            ecosystem: eco,
1308            malicious_name: malicious_name.to_string(),
1309            target_name: target_name.to_string(),
1310        });
1311    }
1312
1313    pub fn add_popular(&mut self, eco: Ecosystem, name: &str) {
1314        self.popular.push(WriterPopular {
1315            ecosystem: eco,
1316            name: name.to_string(),
1317        });
1318    }
1319
1320    /// Build and write the database to a file. Signs with the provided keypair.
1321    pub fn write_to(
1322        mut self,
1323        path: &Path,
1324        signing_key: &ed25519_dalek::SigningKey,
1325    ) -> Result<(), ThreatDbError> {
1326        let bytes = self.build(signing_key)?;
1327        std::fs::write(path, bytes)?;
1328        Ok(())
1329    }
1330
1331    /// Build the database into bytes (for testing or in-memory use).
1332    pub fn build(
1333        &mut self,
1334        signing_key: &ed25519_dalek::SigningKey,
1335    ) -> Result<Vec<u8>, ThreatDbError> {
1336        // Sort and deduplicate
1337        self.packages
1338            .sort_by(|a, b| (a.ecosystem as u8, &a.name).cmp(&(b.ecosystem as u8, &b.name)));
1339        self.packages
1340            .dedup_by(|a, b| a.ecosystem == b.ecosystem && a.name == b.name);
1341
1342        self.hostnames.sort_by(|a, b| a.name.cmp(&b.name));
1343        self.hostnames.dedup_by(|a, b| a.name == b.name);
1344
1345        self.ips.sort_by_key(|ip| ip.addr);
1346        self.ips.dedup_by_key(|ip| ip.addr);
1347
1348        self.typosquats.sort_by(|a, b| {
1349            (a.ecosystem as u8, &a.malicious_name).cmp(&(b.ecosystem as u8, &b.malicious_name))
1350        });
1351        self.typosquats
1352            .dedup_by(|a, b| a.ecosystem == b.ecosystem && a.malicious_name == b.malicious_name);
1353
1354        self.popular
1355            .sort_by(|a, b| (a.ecosystem as u8, &a.name).cmp(&(b.ecosystem as u8, &b.name)));
1356        self.popular
1357            .dedup_by(|a, b| a.ecosystem == b.ecosystem && a.name == b.name);
1358
1359        let mut pkg_data: Vec<u8> = Vec::new();
1360        let mut pkg_index: Vec<(u32, u32)> = Vec::new(); // (data_offset, key_hash)
1361
1362        for pkg in &self.packages {
1363            let data_offset = (HEADER_SIZE + pkg_data.len()) as u32; // absolute offset placeholder
1364            let key_hash = pkg_key_hash(pkg.ecosystem, pkg.name.as_bytes());
1365
1366            pkg_data.push(pkg.ecosystem as u8);
1367            let name_bytes = pkg.name.as_bytes();
1368            pkg_data.extend_from_slice(&(name_bytes.len() as u16).to_le_bytes());
1369            pkg_data.extend_from_slice(name_bytes);
1370            pkg_data.push(pkg.source as u8);
1371            pkg_data.push(pkg.confidence as u8);
1372            let flags: u8 = if pkg.all_versions_malicious { 1 } else { 0 };
1373            pkg_data.push(flags);
1374            pkg_data.extend_from_slice(&(pkg.versions.len() as u16).to_le_bytes());
1375            for v in &pkg.versions {
1376                let vbytes = v.as_bytes();
1377                pkg_data.extend_from_slice(&(vbytes.len() as u16).to_le_bytes());
1378                pkg_data.extend_from_slice(vbytes);
1379            }
1380            pkg_data.extend_from_slice(&pkg.reference_offset.to_le_bytes());
1381
1382            pkg_index.push((data_offset, key_hash));
1383        }
1384
1385        // Hostname data region
1386        let mut hostname_data: Vec<u8> = Vec::new();
1387        let mut hostname_index: Vec<(u32, u32)> = Vec::new();
1388
1389        for hn in &self.hostnames {
1390            let key_hash = fnv1a_hash(hn.name.as_bytes());
1391            let local_off = hostname_data.len();
1392
1393            hostname_data.push(hn.source as u8);
1394            let name_bytes = hn.name.as_bytes();
1395            hostname_data.extend_from_slice(&(name_bytes.len() as u16).to_le_bytes());
1396            hostname_data.extend_from_slice(name_bytes);
1397
1398            hostname_index.push((local_off as u32, key_hash));
1399        }
1400
1401        // Typosquat data region
1402        let mut typo_data: Vec<u8> = Vec::new();
1403        let mut typo_index: Vec<(u32, u32)> = Vec::new();
1404
1405        for ts in &self.typosquats {
1406            let local_off = typo_data.len();
1407            let key_hash = pkg_key_hash(ts.ecosystem, ts.malicious_name.as_bytes());
1408
1409            typo_data.push(ts.ecosystem as u8);
1410            let mal_bytes = ts.malicious_name.as_bytes();
1411            typo_data.extend_from_slice(&(mal_bytes.len() as u16).to_le_bytes());
1412            typo_data.extend_from_slice(mal_bytes);
1413            let tgt_bytes = ts.target_name.as_bytes();
1414            typo_data.extend_from_slice(&(tgt_bytes.len() as u16).to_le_bytes());
1415            typo_data.extend_from_slice(tgt_bytes);
1416
1417            typo_index.push((local_off as u32, key_hash));
1418        }
1419
1420        // Popular data region
1421        let mut popular_data: Vec<u8> = Vec::new();
1422        let mut popular_index: Vec<(u32, u32)> = Vec::new();
1423
1424        for pop in &self.popular {
1425            let local_off = popular_data.len();
1426            let key_hash = pkg_key_hash(pop.ecosystem, pop.name.as_bytes());
1427
1428            popular_data.push(pop.ecosystem as u8);
1429            let name_bytes = pop.name.as_bytes();
1430            popular_data.extend_from_slice(&(name_bytes.len() as u16).to_le_bytes());
1431            popular_data.extend_from_slice(name_bytes);
1432
1433            popular_index.push((local_off as u32, key_hash));
1434        }
1435
1436        // IP records
1437        let mut ip_data: Vec<u8> = Vec::with_capacity(self.ips.len() * IP_RECORD_SIZE);
1438        for ip in &self.ips {
1439            ip_data.extend_from_slice(&ip.addr.to_le_bytes());
1440            ip_data.push(ip.source as u8);
1441        }
1442
1443        // File layout after header:
1444        //   pkg_index | pkg_data | hostname_index | hostname_data |
1445        //   ip_data | typo_index | typo_data | popular_index | popular_data | string_table
1446
1447        let pkg_index_size = pkg_index.len() * PKG_INDEX_ENTRY_SIZE;
1448        let hostname_index_size = hostname_index.len() * HOSTNAME_INDEX_ENTRY_SIZE;
1449        let typo_index_size = typo_index.len() * TYPOSQUAT_INDEX_ENTRY_SIZE;
1450        let popular_index_size = popular_index.len() * POPULAR_INDEX_ENTRY_SIZE;
1451
1452        let mut offset = HEADER_SIZE;
1453
1454        let pkg_index_offset = offset as u32;
1455        offset += pkg_index_size;
1456        let pkg_data_offset = offset;
1457        offset += pkg_data.len();
1458
1459        let hostname_index_offset = offset as u32;
1460        offset += hostname_index_size;
1461        let hostname_data_offset = offset;
1462        offset += hostname_data.len();
1463
1464        let ip_data_offset = offset as u32;
1465        offset += ip_data.len();
1466
1467        let typo_index_offset = offset as u32;
1468        offset += typo_index_size;
1469        let typo_data_offset = offset;
1470        offset += typo_data.len();
1471
1472        let popular_index_offset = offset as u32;
1473        offset += popular_index_size;
1474        let popular_data_offset = offset;
1475        offset += popular_data.len();
1476
1477        let string_table_offset = offset as u32;
1478        // offset += self.string_table.len() as usize; // not needed further
1479
1480        // Fix up data offsets to be absolute
1481        for (data_off, _) in &mut pkg_index {
1482            // pkg_index was built with absolute offsets assuming data starts right after header.
1483            // Now data starts at pkg_data_offset. Adjust.
1484            // The original data_offset was HEADER_SIZE + local_offset.
1485            // We need pkg_data_offset + local_offset.
1486            let local_off = *data_off as usize - HEADER_SIZE;
1487            *data_off = (pkg_data_offset + local_off) as u32;
1488        }
1489
1490        for (data_off, _) in &mut hostname_index {
1491            *data_off = (hostname_data_offset + *data_off as usize) as u32;
1492        }
1493
1494        for (data_off, _) in &mut typo_index {
1495            *data_off = (typo_data_offset + *data_off as usize) as u32;
1496        }
1497
1498        for (data_off, _) in &mut popular_index {
1499            *data_off = (popular_data_offset + *data_off as usize) as u32;
1500        }
1501
1502        // Sort index vectors by hash so binary search works correctly.
1503        // (The input data was sorted by (ecosystem, name), but lookups
1504        // use FNV hash ordering.)  popular_index uses linear scan, but
1505        // sort it too for consistency.
1506        pkg_index.sort_by_key(|&(_, hash)| hash);
1507        hostname_index.sort_by_key(|&(_, hash)| hash);
1508        typo_index.sort_by_key(|&(_, hash)| hash);
1509
1510        let total_size = HEADER_SIZE
1511            + pkg_index_size
1512            + pkg_data.len()
1513            + hostname_index_size
1514            + hostname_data.len()
1515            + ip_data.len()
1516            + typo_index_size
1517            + typo_data.len()
1518            + popular_index_size
1519            + popular_data.len()
1520            + self.string_table.len() as usize;
1521
1522        let mut buf = vec![0u8; total_size];
1523
1524        // Header (will fill signature + fingerprint after writing data)
1525        buf[0..8].copy_from_slice(MAGIC);
1526        buf[8..12].copy_from_slice(&FORMAT_VERSION.to_le_bytes());
1527        buf[12..20].copy_from_slice(&self.build_timestamp.to_le_bytes());
1528        buf[20..28].copy_from_slice(&self.build_sequence.to_le_bytes());
1529        buf[28..32].copy_from_slice(&pkg_index_offset.to_le_bytes());
1530        buf[32..36].copy_from_slice(&(self.packages.len() as u32).to_le_bytes());
1531        buf[36..40].copy_from_slice(&hostname_index_offset.to_le_bytes());
1532        buf[40..44].copy_from_slice(&(self.hostnames.len() as u32).to_le_bytes());
1533        buf[44..48].copy_from_slice(&ip_data_offset.to_le_bytes());
1534        buf[48..52].copy_from_slice(&(self.ips.len() as u32).to_le_bytes());
1535        buf[52..56].copy_from_slice(&typo_index_offset.to_le_bytes());
1536        buf[56..60].copy_from_slice(&(self.typosquats.len() as u32).to_le_bytes());
1537        buf[60..64].copy_from_slice(&popular_index_offset.to_le_bytes());
1538        buf[64..68].copy_from_slice(&(self.popular.len() as u32).to_le_bytes());
1539        buf[68..72].copy_from_slice(&string_table_offset.to_le_bytes());
1540        buf[72..76].copy_from_slice(&self.string_table.len().to_le_bytes());
1541
1542        // Signer fingerprint
1543        let fingerprint = Sha256::digest(signing_key.verifying_key().as_bytes());
1544        buf[FINGERPRINT_OFFSET..FINGERPRINT_OFFSET + FINGERPRINT_LEN].copy_from_slice(&fingerprint);
1545
1546        // Write sections
1547        let mut pos = HEADER_SIZE;
1548
1549        // Package index
1550        for (data_off, hash) in &pkg_index {
1551            buf[pos..pos + 4].copy_from_slice(&data_off.to_le_bytes());
1552            buf[pos + 4..pos + 8].copy_from_slice(&hash.to_le_bytes());
1553            pos += PKG_INDEX_ENTRY_SIZE;
1554        }
1555        // Package data
1556        buf[pos..pos + pkg_data.len()].copy_from_slice(&pkg_data);
1557        pos += pkg_data.len();
1558
1559        // Hostname index
1560        for (data_off, hash) in &hostname_index {
1561            buf[pos..pos + 4].copy_from_slice(&data_off.to_le_bytes());
1562            buf[pos + 4..pos + 8].copy_from_slice(&hash.to_le_bytes());
1563            pos += HOSTNAME_INDEX_ENTRY_SIZE;
1564        }
1565        // Hostname data
1566        buf[pos..pos + hostname_data.len()].copy_from_slice(&hostname_data);
1567        pos += hostname_data.len();
1568
1569        // IP data
1570        buf[pos..pos + ip_data.len()].copy_from_slice(&ip_data);
1571        pos += ip_data.len();
1572
1573        // Typosquat index
1574        for (data_off, hash) in &typo_index {
1575            buf[pos..pos + 4].copy_from_slice(&data_off.to_le_bytes());
1576            buf[pos + 4..pos + 8].copy_from_slice(&hash.to_le_bytes());
1577            pos += TYPOSQUAT_INDEX_ENTRY_SIZE;
1578        }
1579        // Typosquat data
1580        buf[pos..pos + typo_data.len()].copy_from_slice(&typo_data);
1581        pos += typo_data.len();
1582
1583        // Popular index
1584        for (data_off, hash) in &popular_index {
1585            buf[pos..pos + 4].copy_from_slice(&data_off.to_le_bytes());
1586            buf[pos + 4..pos + 8].copy_from_slice(&hash.to_le_bytes());
1587            pos += POPULAR_INDEX_ENTRY_SIZE;
1588        }
1589        // Popular data
1590        buf[pos..pos + popular_data.len()].copy_from_slice(&popular_data);
1591        pos += popular_data.len();
1592
1593        // String table
1594        let st = self.string_table.bytes();
1595        buf[pos..pos + st.len()].copy_from_slice(st);
1596
1597        // Sign: header before sig ++ all data after header
1598        let mut signed_data = Vec::with_capacity(SIG_OFFSET + (buf.len() - HEADER_SIZE));
1599        signed_data.extend_from_slice(&buf[..SIG_OFFSET]);
1600        signed_data.extend_from_slice(&buf[HEADER_SIZE..]);
1601
1602        use ed25519_dalek::Signer;
1603        let signature = signing_key.sign(&signed_data);
1604        buf[SIG_OFFSET..SIG_OFFSET + SIGNATURE_LENGTH].copy_from_slice(&signature.to_bytes());
1605
1606        Ok(buf)
1607    }
1608}
1609
1610#[cfg(test)]
1611mod tests {
1612    use super::*;
1613    use ed25519_dalek::SigningKey;
1614    use rand_core::OsRng;
1615    use std::sync::Mutex;
1616
    // NOTE(review): its users are outside this excerpt; judging by the name it
    // presumably serializes tests that touch process-wide environment state —
    // confirm against the tests that lock it.
    static ENV_LOCK: Mutex<()> = Mutex::new(());
1618
1619    /// Helper: create a writer, add test data, build, and return a ThreatDb.
1620    fn build_test_db(signing_key: &SigningKey) -> ThreatDb {
1621        let mut writer = ThreatDbWriter::new(1700000000, 42);
1622
1623        // Packages
1624        writer.add_package(
1625            Ecosystem::Npm,
1626            "evil-package",
1627            &["1.0.0", "1.0.1"],
1628            ThreatSource::OssfMalicious,
1629            Confidence::Confirmed,
1630            false,
1631            Some("https://example.com/advisory/1"),
1632        );
1633        writer.add_package(
1634            Ecosystem::PyPI,
1635            "malware-pkg",
1636            &[],
1637            ThreatSource::DatadogMalicious,
1638            Confidence::Confirmed,
1639            true,
1640            None,
1641        );
1642        writer.add_package(
1643            Ecosystem::Npm,
1644            "borderline-pkg",
1645            &["2.0.0"],
1646            ThreatSource::OssfMalicious,
1647            Confidence::Medium,
1648            false,
1649            Some("https://example.com/advisory/2"),
1650        );
1651
1652        // IPs
1653        writer.add_ip(Ipv4Addr::new(192, 168, 1, 100), ThreatSource::FeodoTracker);
1654        writer.add_ip(Ipv4Addr::new(10, 0, 0, 1), ThreatSource::FeodoTracker);
1655        writer.add_ip(Ipv4Addr::new(203, 0, 113, 50), ThreatSource::FeodoTracker);
1656
1657        // Typosquats
1658        writer.add_typosquat(Ecosystem::Npm, "reacct", "react");
1659        writer.add_typosquat(Ecosystem::PyPI, "reqeusts", "requests");
1660
1661        // Popular packages
1662        writer.add_popular(Ecosystem::Npm, "react");
1663        writer.add_popular(Ecosystem::Npm, "express");
1664        writer.add_popular(Ecosystem::PyPI, "requests");
1665        writer.add_popular(Ecosystem::PyPI, "flask");
1666
1667        let bytes = writer.build(signing_key).expect("build failed");
1668        ThreatDb::from_bytes(bytes, 0).expect("load failed")
1669    }
1670
1671    #[test]
1672    fn test_round_trip_all_sections() {
1673        let key = SigningKey::generate(&mut OsRng);
1674        let db = build_test_db(&key);
1675
1676        let stats = db.stats();
1677        assert_eq!(stats.format_version, 1);
1678        assert_eq!(stats.build_timestamp, 1700000000);
1679        assert_eq!(stats.build_sequence, 42);
1680        assert_eq!(stats.package_count, 3);
1681        assert_eq!(stats.ip_count, 3);
1682        assert_eq!(stats.typosquat_count, 2);
1683        assert_eq!(stats.popular_count, 4);
1684        assert_eq!(stats.hostname_count, 0);
1685    }
1686
1687    #[test]
1688    fn test_package_version_in_list() {
1689        let key = SigningKey::generate(&mut OsRng);
1690        let db = build_test_db(&key);
1691
1692        let m = db
1693            .check_package(Ecosystem::Npm, "evil-package", Some("1.0.0"))
1694            .expect("should match");
1695        assert_eq!(m.source, ThreatSource::OssfMalicious);
1696        assert_eq!(m.confidence, Confidence::Confirmed);
1697        assert!(!m.all_versions_malicious);
1698        assert!(m.reference_url.is_some());
1699    }
1700
1701    #[test]
1702    fn test_package_version_not_in_list() {
1703        let key = SigningKey::generate(&mut OsRng);
1704        let db = build_test_db(&key);
1705
1706        assert!(db
1707            .check_package(Ecosystem::Npm, "evil-package", Some("2.0.0"))
1708            .is_none());
1709    }
1710
1711    #[test]
1712    fn test_package_no_version_all_malicious() {
1713        let key = SigningKey::generate(&mut OsRng);
1714        let db = build_test_db(&key);
1715
1716        let m = db
1717            .check_package(Ecosystem::PyPI, "malware-pkg", None)
1718            .expect("should match all-versions-malicious without version");
1719        assert!(m.all_versions_malicious);
1720        assert_eq!(m.source, ThreatSource::DatadogMalicious);
1721    }
1722
1723    #[test]
1724    fn test_package_no_version_not_all_malicious() {
1725        let key = SigningKey::generate(&mut OsRng);
1726        let db = build_test_db(&key);
1727
1728        assert!(
1729            db.check_package(Ecosystem::Npm, "evil-package", None)
1730                .is_none(),
1731            "should NOT match when no version provided and all_versions_malicious=false"
1732        );
1733    }
1734
1735    #[test]
1736    fn test_package_all_malicious_with_version() {
1737        let key = SigningKey::generate(&mut OsRng);
1738        let db = build_test_db(&key);
1739
1740        let m = db
1741            .check_package(Ecosystem::PyPI, "malware-pkg", Some("99.99.99"))
1742            .expect("all_versions_malicious should match any version");
1743        assert!(m.all_versions_malicious);
1744    }
1745
1746    #[test]
1747    fn test_package_missing() {
1748        let key = SigningKey::generate(&mut OsRng);
1749        let db = build_test_db(&key);
1750
1751        assert!(db
1752            .check_package(Ecosystem::Npm, "safe-package", Some("1.0.0"))
1753            .is_none());
1754    }
1755
1756    #[test]
1757    fn test_package_wrong_ecosystem() {
1758        let key = SigningKey::generate(&mut OsRng);
1759        let db = build_test_db(&key);
1760
1761        assert!(db
1762            .check_package(Ecosystem::PyPI, "evil-package", Some("1.0.0"))
1763            .is_none());
1764    }
1765
1766    #[test]
1767    fn test_package_medium_confidence() {
1768        let key = SigningKey::generate(&mut OsRng);
1769        let db = build_test_db(&key);
1770
1771        let m = db
1772            .check_package(Ecosystem::Npm, "borderline-pkg", Some("2.0.0"))
1773            .expect("should match");
1774        assert_eq!(m.confidence, Confidence::Medium);
1775    }
1776
1777    #[test]
1778    fn test_ip_found() {
1779        let key = SigningKey::generate(&mut OsRng);
1780        let db = build_test_db(&key);
1781
1782        let m = db
1783            .check_ip(Ipv4Addr::new(192, 168, 1, 100))
1784            .expect("should find IP");
1785        assert_eq!(m.source, ThreatSource::FeodoTracker);
1786    }
1787
1788    #[test]
1789    fn test_ip_not_found() {
1790        let key = SigningKey::generate(&mut OsRng);
1791        let db = build_test_db(&key);
1792
1793        assert!(db.check_ip(Ipv4Addr::new(8, 8, 8, 8)).is_none());
1794    }
1795
1796    #[test]
1797    fn test_ip_first_element() {
1798        let key = SigningKey::generate(&mut OsRng);
1799        let db = build_test_db(&key);
1800
1801        assert!(db.check_ip(Ipv4Addr::new(10, 0, 0, 1)).is_some());
1802    }
1803
1804    #[test]
1805    fn test_ip_last_element() {
1806        let key = SigningKey::generate(&mut OsRng);
1807        let db = build_test_db(&key);
1808
1809        assert!(db.check_ip(Ipv4Addr::new(203, 0, 113, 50)).is_some());
1810    }
1811
1812    #[test]
1813    fn test_typosquat_found() {
1814        let key = SigningKey::generate(&mut OsRng);
1815        let db = build_test_db(&key);
1816
1817        let m = db
1818            .check_typosquat(Ecosystem::Npm, "reacct")
1819            .expect("should find typosquat");
1820        assert_eq!(m.target_name, "react");
1821    }
1822
1823    #[test]
1824    fn test_typosquat_not_found() {
1825        let key = SigningKey::generate(&mut OsRng);
1826        let db = build_test_db(&key);
1827
1828        assert!(db.check_typosquat(Ecosystem::Npm, "react").is_none());
1829    }
1830
1831    #[test]
1832    fn test_typosquat_wrong_ecosystem() {
1833        let key = SigningKey::generate(&mut OsRng);
1834        let db = build_test_db(&key);
1835
1836        assert!(db.check_typosquat(Ecosystem::PyPI, "reacct").is_none());
1837    }
1838
1839    #[test]
1840    fn test_popular_distance_1() {
1841        let key = SigningKey::generate(&mut OsRng);
1842        let db = build_test_db(&key);
1843
1844        let result = db.check_popular_distance(Ecosystem::PyPI, "reqests");
1845        assert!(result.is_some(), "should find close match");
1846        let (name, dist) = result.unwrap();
1847        assert_eq!(name, "requests");
1848        assert_eq!(dist, 1);
1849    }
1850
1851    #[test]
1852    fn test_popular_exact_match_skipped() {
1853        let key = SigningKey::generate(&mut OsRng);
1854        let db = build_test_db(&key);
1855
1856        assert!(db.check_popular_distance(Ecosystem::Npm, "react").is_none());
1857    }
1858
1859    #[test]
1860    fn test_popular_distance_too_far() {
1861        let key = SigningKey::generate(&mut OsRng);
1862        let db = build_test_db(&key);
1863
1864        assert!(db.check_popular_distance(Ecosystem::Npm, "xyz").is_none());
1865    }
1866
1867    #[test]
1868    fn test_hostname_empty_section() {
1869        let key = SigningKey::generate(&mut OsRng);
1870        let db = build_test_db(&key);
1871
1872        assert!(db.check_hostname("evil.example.com").is_none());
1873    }
1874
1875    #[test]
1876    fn test_signature_valid() {
1877        let key = SigningKey::generate(&mut OsRng);
1878        let mut writer = ThreatDbWriter::new(1700000000, 1);
1879        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
1880
1881        // Override the embedded key for this test by checking against
1882        // the key that actually signed the data. Since VERIFY_KEY_BYTES
1883        // is the placeholder all-zeros key, we verify manually.
1884        let bytes = writer.build(&key).expect("build");
1885        let db = ThreatDb::from_bytes(bytes, 0).expect("load");
1886
1887        // verify_signature() will fail because the embedded key doesn't match.
1888        // This tests the negative path for the placeholder key.
1889        assert!(
1890            db.verify_signature().is_err(),
1891            "placeholder key should not verify real signature"
1892        );
1893    }
1894
1895    #[test]
1896    fn test_signature_corrupt_byte() {
1897        let key = SigningKey::generate(&mut OsRng);
1898        let mut writer = ThreatDbWriter::new(1700000000, 1);
1899        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
1900
1901        let mut bytes = writer.build(&key).expect("build");
1902
1903        // Corrupt a data byte
1904        if bytes.len() > HEADER_SIZE + 1 {
1905            bytes[HEADER_SIZE + 1] ^= 0xFF;
1906        }
1907
1908        let db = ThreatDb::from_bytes(bytes, 0).expect("load");
1909        assert!(
1910            db.verify_signature().is_err(),
1911            "corrupt data should fail verification"
1912        );
1913    }
1914
1915    #[test]
1916    fn test_signature_with_matching_key() {
1917        // Test that verification works when we construct data signed with a
1918        // known key and then check against that same key (simulating what
1919        // happens when the real key replaces the placeholder).
1920        let key = SigningKey::generate(&mut OsRng);
1921        let mut writer = ThreatDbWriter::new(1700000000, 1);
1922        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
1923
1924        let bytes = writer.build(&key).expect("build");
1925
1926        // Manually verify: reconstruct signed data and check
1927        let sig_bytes = &bytes[SIG_OFFSET..SIG_OFFSET + SIGNATURE_LENGTH];
1928        let signature = Signature::from_slice(sig_bytes).expect("parse sig");
1929
1930        let mut signed_data = Vec::new();
1931        signed_data.extend_from_slice(&bytes[..SIG_OFFSET]);
1932        signed_data.extend_from_slice(&bytes[HEADER_SIZE..]);
1933
1934        use ed25519_dalek::Verifier;
1935        assert!(
1936            key.verifying_key().verify(&signed_data, &signature).is_ok(),
1937            "signature should verify against signing key"
1938        );
1939    }
1940
1941    #[test]
1942    fn test_rollback_rejected() {
1943        let key = SigningKey::generate(&mut OsRng);
1944        let mut writer = ThreatDbWriter::new(1700000000, 5);
1945        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
1946        let bytes = writer.build(&key).expect("build");
1947
1948        let err = ThreatDb::from_bytes(bytes, 10).expect_err("should reject rollback");
1949        match err {
1950            ThreatDbError::RollbackDetected {
1951                got: 5,
1952                current: 10,
1953            } => {}
1954            other => panic!("expected RollbackDetected, got: {other}"),
1955        }
1956    }
1957
1958    #[test]
1959    fn test_rollback_equal_rejected() {
1960        let key = SigningKey::generate(&mut OsRng);
1961        let mut writer = ThreatDbWriter::new(1700000000, 10);
1962        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
1963        let bytes = writer.build(&key).expect("build");
1964
1965        let err = ThreatDb::from_bytes(bytes, 10).expect_err("equal sequence should be rejected");
1966        assert!(matches!(err, ThreatDbError::RollbackDetected { .. }));
1967    }
1968
1969    #[test]
1970    fn test_rollback_newer_accepted() {
1971        let key = SigningKey::generate(&mut OsRng);
1972        let mut writer = ThreatDbWriter::new(1700000000, 20);
1973        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
1974        let bytes = writer.build(&key).expect("build");
1975
1976        assert!(ThreatDb::from_bytes(bytes, 10).is_ok());
1977    }
1978
1979    #[test]
1980    fn test_invalid_magic() {
1981        let mut data = vec![0u8; HEADER_SIZE + 10];
1982        data[0..8].copy_from_slice(b"BADMAGIC");
1983        assert!(matches!(
1984            ThreatDb::from_bytes(data, 0),
1985            Err(ThreatDbError::InvalidMagic)
1986        ));
1987    }
1988
1989    #[test]
1990    fn test_file_too_small() {
1991        let data = vec![0u8; 10];
1992        assert!(matches!(
1993            ThreatDb::from_bytes(data, 0),
1994            Err(ThreatDbError::FileTooSmall(_))
1995        ));
1996    }
1997
1998    #[test]
1999    fn test_unsupported_version() {
2000        let mut data = vec![0u8; HEADER_SIZE + 10];
2001        data[0..8].copy_from_slice(MAGIC);
2002        data[8..12].copy_from_slice(&99u32.to_le_bytes()); // bad version
2003        assert!(matches!(
2004            ThreatDb::from_bytes(data, 0),
2005            Err(ThreatDbError::UnsupportedVersion(99))
2006        ));
2007    }
2008
2009    #[test]
2010    fn test_single_entry_db() {
2011        let key = SigningKey::generate(&mut OsRng);
2012        let mut writer = ThreatDbWriter::new(1700000000, 1);
2013        writer.add_package(
2014            Ecosystem::Crates,
2015            "only-pkg",
2016            &["0.1.0"],
2017            ThreatSource::OssfMalicious,
2018            Confidence::Confirmed,
2019            false,
2020            None,
2021        );
2022        let bytes = writer.build(&key).expect("build");
2023        let db = ThreatDb::from_bytes(bytes, 0).expect("load");
2024
2025        assert!(db
2026            .check_package(Ecosystem::Crates, "only-pkg", Some("0.1.0"))
2027            .is_some());
2028        assert!(db
2029            .check_package(Ecosystem::Crates, "other", Some("0.1.0"))
2030            .is_none());
2031    }
2032
2033    #[test]
2034    fn test_empty_db() {
2035        let key = SigningKey::generate(&mut OsRng);
2036        let mut writer = ThreatDbWriter::new(1700000000, 1);
2037        let bytes = writer.build(&key).expect("build");
2038        let db = ThreatDb::from_bytes(bytes, 0).expect("load");
2039
2040        assert!(db.check_package(Ecosystem::Npm, "anything", None).is_none());
2041        assert!(db.check_ip(Ipv4Addr::new(1, 2, 3, 4)).is_none());
2042        assert!(db.check_typosquat(Ecosystem::Npm, "anything").is_none());
2043        assert!(db.check_hostname("anything.com").is_none());
2044        assert!(db
2045            .check_popular_distance(Ecosystem::Npm, "anything")
2046            .is_none());
2047
2048        let stats = db.stats();
2049        assert_eq!(stats.package_count, 0);
2050        assert_eq!(stats.ip_count, 0);
2051    }
2052
2053    #[test]
2054    fn test_cache_returns_none_when_no_file() {
2055        // ThreatDb::cached() should return None when no DB file exists.
2056        // This is the normal case on first install.
2057        // We can't easily test the full cache lifecycle without filesystem
2058        // setup, but we verify the fail-open behavior.
2059        // Note: The OnceLock means this test might interact with other tests
2060        // that use cached(). In practice, the default_path() won't exist
2061        // in test environments.
2062        let result = ThreatDb::cached();
2063        // May be None or Some depending on test environment; just ensure no panic.
2064        let _ = result;
2065    }
2066
2067    #[test]
2068    fn test_writer_deduplicates() {
2069        let key = SigningKey::generate(&mut OsRng);
2070        let mut writer = ThreatDbWriter::new(1700000000, 1);
2071
2072        // Add same package twice
2073        writer.add_package(
2074            Ecosystem::Npm,
2075            "dupe-pkg",
2076            &["1.0.0"],
2077            ThreatSource::OssfMalicious,
2078            Confidence::Confirmed,
2079            false,
2080            None,
2081        );
2082        writer.add_package(
2083            Ecosystem::Npm,
2084            "dupe-pkg",
2085            &["2.0.0"],
2086            ThreatSource::DatadogMalicious,
2087            Confidence::Confirmed,
2088            false,
2089            None,
2090        );
2091
2092        // Add same IP twice
2093        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
2094        writer.add_ip(Ipv4Addr::new(1, 2, 3, 4), ThreatSource::FeodoTracker);
2095
2096        let bytes = writer.build(&key).expect("build");
2097        let db = ThreatDb::from_bytes(bytes, 0).expect("load");
2098
2099        assert_eq!(
2100            db.stats().package_count,
2101            1,
2102            "duplicate packages should be deduped"
2103        );
2104        assert_eq!(db.stats().ip_count, 1, "duplicate IPs should be deduped");
2105    }
2106
2107    #[test]
2108    fn test_supplemental_overlay_lookup_and_stats() {
2109        let key = SigningKey::generate(&mut OsRng);
2110
2111        let mut primary_writer = ThreatDbWriter::new(1700000000, 1);
2112        primary_writer.add_package(
2113            Ecosystem::Npm,
2114            "primary-pkg",
2115            &["1.0.0"],
2116            ThreatSource::OssfMalicious,
2117            Confidence::Confirmed,
2118            false,
2119            None,
2120        );
2121        let primary = ThreatDb::from_bytes(primary_writer.build(&key).expect("primary build"), 0)
2122            .expect("primary load");
2123
2124        let mut supplemental_writer = ThreatDbWriter::new(1700000001, 1);
2125        supplemental_writer.add_package(
2126            Ecosystem::PyPI,
2127            "overlay-pkg",
2128            &["2.0.0"],
2129            ThreatSource::DatadogMalicious,
2130            Confidence::Confirmed,
2131            false,
2132            None,
2133        );
2134        supplemental_writer.add_hostname("overlay.example", ThreatSource::Urlhaus);
2135        supplemental_writer.add_ip(Ipv4Addr::new(203, 0, 113, 77), ThreatSource::ThreatFoxIoc);
2136        supplemental_writer.add_typosquat(Ecosystem::Npm, "reacct", "react");
2137        supplemental_writer.add_popular(Ecosystem::Npm, "react");
2138
2139        let supplemental = ThreatDb::from_bytes(
2140            supplemental_writer.build(&key).expect("supplemental build"),
2141            0,
2142        )
2143        .expect("supplemental load");
2144
2145        let db = primary.with_supplemental(Some(supplemental));
2146
2147        assert!(db
2148            .check_package(Ecosystem::Npm, "primary-pkg", Some("1.0.0"))
2149            .is_some());
2150        assert!(db
2151            .check_package(Ecosystem::PyPI, "overlay-pkg", Some("2.0.0"))
2152            .is_some());
2153        assert!(db.check_hostname("overlay.example").is_some());
2154        assert!(db.check_ip(Ipv4Addr::new(203, 0, 113, 77)).is_some());
2155        assert!(db.check_typosquat(Ecosystem::Npm, "reacct").is_some());
2156        assert_eq!(
2157            db.check_popular_distance(Ecosystem::Npm, "reac"),
2158            Some(("react".to_string(), 1))
2159        );
2160
2161        let stats = db.stats();
2162        assert_eq!(stats.package_count, 2);
2163        assert_eq!(stats.hostname_count, 1);
2164        assert_eq!(stats.ip_count, 1);
2165        assert_eq!(stats.typosquat_count, 1);
2166        assert_eq!(stats.popular_count, 1);
2167    }
2168
2169    #[test]
2170    fn test_supplemental_overlay_falls_through_on_primary_version_mismatch() {
2171        let key = SigningKey::generate(&mut OsRng);
2172
2173        let mut primary_writer = ThreatDbWriter::new(1700000000, 1);
2174        primary_writer.add_package(
2175            Ecosystem::Npm,
2176            "shared-pkg",
2177            &["1.0.0"],
2178            ThreatSource::OssfMalicious,
2179            Confidence::Confirmed,
2180            false,
2181            None,
2182        );
2183        let primary = ThreatDb::from_bytes(primary_writer.build(&key).expect("primary build"), 0)
2184            .expect("primary load");
2185
2186        let mut supplemental_writer = ThreatDbWriter::new(1700000001, 1);
2187        supplemental_writer.add_package(
2188            Ecosystem::Npm,
2189            "shared-pkg",
2190            &["2.0.0"],
2191            ThreatSource::DatadogMalicious,
2192            Confidence::Confirmed,
2193            false,
2194            None,
2195        );
2196        let supplemental = ThreatDb::from_bytes(
2197            supplemental_writer.build(&key).expect("supplemental build"),
2198            0,
2199        )
2200        .expect("supplemental load");
2201
2202        let db = primary.with_supplemental(Some(supplemental));
2203        let threat = db
2204            .check_package(Ecosystem::Npm, "shared-pkg", Some("2.0.0"))
2205            .expect("supplemental version should match");
2206        assert_eq!(threat.source, ThreatSource::DatadogMalicious);
2207    }
2208
2209    #[test]
2210    fn test_combined_mtime_requires_primary_db() {
2211        let _guard = ENV_LOCK.lock().unwrap_or_else(|e| e.into_inner());
2212        let tmp = tempfile::tempdir().unwrap();
2213        let primary = tmp.path().join("primary.dat");
2214        let supplemental = tmp.path().join("supplemental.dat");
2215
2216        unsafe {
2217            std::env::set_var("TIRITH_THREATDB_PATH", &primary);
2218            std::env::set_var("TIRITH_THREATDB_SUPPLEMENTAL_PATH", &supplemental);
2219        }
2220
2221        assert_eq!(combined_mtime_epoch(), None);
2222
2223        std::fs::write(&supplemental, b"overlay").unwrap();
2224        assert_eq!(combined_mtime_epoch(), None);
2225
2226        std::fs::remove_file(&supplemental).unwrap();
2227        std::fs::write(&primary, b"primary").unwrap();
2228        let primary_only = combined_mtime_epoch().expect("primary mtime");
2229
2230        std::fs::write(&supplemental, b"overlay-updated").unwrap();
2231        let combined = combined_mtime_epoch().expect("combined mtime");
2232        assert_ne!(primary_only, combined);
2233
2234        unsafe {
2235            std::env::remove_var("TIRITH_THREATDB_PATH");
2236            std::env::remove_var("TIRITH_THREATDB_SUPPLEMENTAL_PATH");
2237        }
2238    }
2239
2240    #[test]
2241    fn test_string_table_deduplication() {
2242        let mut st = StringTable::new();
2243        let off1 = st.intern("https://example.com");
2244        let off2 = st.intern("https://example.com");
2245        let off3 = st.intern("https://other.com");
2246
2247        assert_eq!(off1, off2, "same string should return same offset");
2248        assert_ne!(
2249            off1, off3,
2250            "different strings should have different offsets"
2251        );
2252    }
2253
2254    #[test]
2255    fn test_reference_url_round_trip() {
2256        let key = SigningKey::generate(&mut OsRng);
2257        let mut writer = ThreatDbWriter::new(1700000000, 1);
2258        writer.add_package(
2259            Ecosystem::Npm,
2260            "ref-pkg",
2261            &["1.0.0"],
2262            ThreatSource::OssfMalicious,
2263            Confidence::Confirmed,
2264            false,
2265            Some("https://example.com/advisory/123"),
2266        );
2267        let bytes = writer.build(&key).expect("build");
2268        let db = ThreatDb::from_bytes(bytes, 0).expect("load");
2269
2270        let m = db
2271            .check_package(Ecosystem::Npm, "ref-pkg", Some("1.0.0"))
2272            .expect("should match");
2273        assert_eq!(
2274            m.reference_url.as_deref(),
2275            Some("https://example.com/advisory/123")
2276        );
2277    }
2278}