Skip to main content

spam_db/
format.rs

/// Magic string that every spam database header line must begin with.
pub(crate) const DB_MAGIC: &str = "# spam-db-v1";

/// How many buckets the index holds.
pub(crate) const INDEX_BUCKETS: usize = 256;

/// Size of one index entry in bytes: a 4-byte little-endian offset followed
/// by a 4-byte little-endian length.
pub(crate) const INDEX_ENTRY_SIZE: usize = 2 * std::mem::size_of::<u32>();

/// Size of the whole index in bytes (one entry per bucket).
pub(crate) const INDEX_SIZE: usize = INDEX_ENTRY_SIZE * INDEX_BUCKETS;
12
13use crate::{Error, Result};
14
/// Distinguishes the two flavours of spam database: NixOS module options or
/// package file paths.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DbKind {
  /// An options database, produced from `nixosOptionsDoc`.
  Options,
  /// A package-file database, mapping file paths to package names.
  Packages,
}
23
24/// The parsed contents of a spam database file.
25///
26/// The entire file is loaded into memory on construction. Bucket lookups
27/// decompress only the relevant slice on demand.
28///
29/// Layout:
30/// ```text
31/// [header line]\n
32/// [256 x 8-byte index entries]
33/// [concatenated zstd-compressed bucket blobs]
34/// ```
35#[derive(Debug)]
36pub(crate) struct DbFile {
37  pub(crate) kind: DbKind,
38  index: [u8; INDEX_SIZE],
39  data: Vec<u8>,
40}
41
42impl DbFile {
43  /// Load a spam database from `path`.
44  pub(crate) fn open(path: impl AsRef<std::path::Path>) -> Result<Self> {
45    let bytes = std::fs::read(path)?;
46
47    let nl = bytes
48      .iter()
49      .position(|&b| b == b'\n')
50      .ok_or_else(|| Error::InvalidDatabase("missing header newline".into()))?;
51
52    let header = std::str::from_utf8(&bytes[..nl])
53      .map_err(|_| Error::InvalidDatabase("non-UTF-8 header".into()))?;
54
55    let kind = parse_kind(header)?;
56
57    let index_start = nl + 1;
58    let data_start = index_start + INDEX_SIZE;
59
60    if bytes.len() < data_start {
61      return Err(Error::InvalidDatabase(
62        "file is too short to contain index".into(),
63      ));
64    }
65
66    let mut index = [0u8; INDEX_SIZE];
67    index.copy_from_slice(&bytes[index_start..index_start + INDEX_SIZE]);
68
69    let data = bytes[data_start..].to_vec();
70
71    Ok(Self { kind, index, data })
72  }
73
74  /// Decompress and return all non-empty lines in `bucket`.
75  pub(crate) fn bucket_lines(&self, bucket: usize) -> Result<Vec<String>> {
76    let entry = bucket * INDEX_ENTRY_SIZE;
77    let offset = read_u32le(&self.index, entry) as usize;
78    let length = read_u32le(&self.index, entry + 4) as usize;
79
80    if length == 0 {
81      return Ok(Vec::new());
82    }
83
84    let end = offset
85      .checked_add(length)
86      .filter(|&e| e <= self.data.len())
87      .ok_or_else(|| {
88        Error::InvalidDatabase("bucket slice out of bounds".into())
89      })?;
90
91    let compressed = &self.data[offset..end];
92    let decompressed = zstd::decode_all(compressed)
93      .map_err(|e| Error::InvalidDatabase(format!("zstd error: {e}")))?;
94
95    let text = String::from_utf8(decompressed).map_err(|_| {
96      Error::InvalidDatabase("non-UTF-8 database content".into())
97    })?;
98
99    Ok(
100      text
101        .lines()
102        .filter(|l| !l.is_empty())
103        .map(String::from)
104        .collect(),
105    )
106  }
107
108  /// The bucket index for `query`: the first byte value, or 0 for empty input.
109  pub(crate) fn query_bucket(query: &str) -> usize {
110    query.bytes().next().map(|b| b as usize).unwrap_or(0)
111  }
112}
113
114/// Parse the DB kind from the header line, e.g. `"# spam-db-v1\toptions"`.
115fn parse_kind(header: &str) -> Result<DbKind> {
116  let rest = header.strip_prefix(DB_MAGIC).ok_or_else(|| {
117    Error::InvalidDatabase("missing spam-db magic header".into())
118  })?;
119
120  let kind_str = rest.strip_prefix('\t').unwrap_or(rest);
121
122  match kind_str {
123    "options" => Ok(DbKind::Options),
124    "packages" => Ok(DbKind::Packages),
125    other => Err(Error::InvalidDatabase(format!(
126      "unknown database kind: {other}"
127    ))),
128  }
129}
130
/// Decode the four bytes of `data` starting at `offset` as a little-endian
/// `u32`.
///
/// Panics if `data` does not contain four bytes starting at `offset`.
fn read_u32le(data: &[u8], offset: usize) -> u32 {
  let mut word = [0u8; 4];
  word.copy_from_slice(&data[offset..offset + 4]);
  u32::from_le_bytes(word)
}