Skip to main content

spam_db/
format.rs

/// Magic prefix found at the start of every spam database header line.
pub(crate) const DB_MAGIC: &str = "# spam-db-v2";

/// How many buckets the index is divided into.
pub(crate) const INDEX_BUCKETS: usize = 256;

/// Size in bytes of one index entry: two little-endian `u64` fields, the
/// bucket's offset followed by its length.
pub(crate) const INDEX_ENTRY_SIZE: usize = 2 * std::mem::size_of::<u64>();

/// Size in bytes of the complete index (all buckets).
pub(crate) const INDEX_SIZE: usize = INDEX_ENTRY_SIZE * INDEX_BUCKETS;
12
13use std::{
14  io::{Read, Seek, SeekFrom},
15  path::{Path, PathBuf},
16};
17
18use crate::{Error, Result};
19
/// The kind of records a spam database holds: NixOS module options or
/// package file paths.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DbKind {
  /// An options database, generated from `nixosOptionsDoc`.
  Options,
  /// A package-file database, which maps file paths to package names.
  Packages,
}
28
29/// An open spam database file.
30///
31/// Only the fixed-size bucket index is loaded into memory. Bucket lookups read
32/// and decompress the relevant payload slice on demand.
33///
34/// Layout:
35/// ```text
36/// [header line]\n
37/// [256 x 16-byte index entries]
38/// [concatenated zstd-compressed bucket blobs]
39/// ```
40#[derive(Debug)]
41pub(crate) struct DbFile {
42  pub(crate) kind: DbKind,
43  index: [u8; INDEX_SIZE],
44  path: PathBuf,
45  data_start: u64,
46}
47
48impl DbFile {
49  /// Load a spam database from `path`.
50  pub(crate) fn open(path: impl AsRef<Path>) -> Result<Self> {
51    let path = path.as_ref().to_owned();
52    let mut file = std::fs::File::open(&path)?;
53
54    let mut header_bytes = Vec::new();
55    loop {
56      let mut byte = [0u8; 1];
57      if file.read(&mut byte)? == 0 {
58        return Err(Error::InvalidDatabase("missing header newline".into()));
59      }
60      if byte[0] == b'\n' {
61        break;
62      }
63      header_bytes.push(byte[0]);
64    }
65
66    let header = std::str::from_utf8(&header_bytes)
67      .map_err(|_| Error::InvalidDatabase("non-UTF-8 header".into()))?;
68
69    let kind = parse_kind(header)?;
70
71    let data_start = u64::try_from(header_bytes.len() + 1 + INDEX_SIZE)
72      .map_err(|_| Error::InvalidDatabase("database header is too large".into()))?;
73
74    if file.metadata()?.len() < data_start {
75      return Err(Error::InvalidDatabase(
76        "file is too short to contain index".into(),
77      ));
78    }
79
80    let mut index = [0u8; INDEX_SIZE];
81    file.read_exact(&mut index)?;
82
83    Ok(Self {
84      kind,
85      index,
86      path,
87      data_start,
88    })
89  }
90
91  /// Decompress and return all non-empty lines in `bucket`.
92  pub(crate) fn bucket_lines(&self, bucket: usize) -> Result<Vec<String>> {
93    let entry = bucket * INDEX_ENTRY_SIZE;
94    let offset = read_u64le(&self.index, entry).try_into().map_err(|_| {
95      Error::InvalidDatabase("bucket offset is too large for this platform".into())
96    })?;
97    let length = read_u64le(&self.index, entry + 8).try_into().map_err(|_| {
98      Error::InvalidDatabase("bucket length is too large for this platform".into())
99    })?;
100
101    if length == 0 {
102      return Ok(Vec::new());
103    }
104
105    let start = self
106      .data_start
107      .checked_add(offset)
108      .ok_or_else(|| Error::InvalidDatabase("bucket offset overflow".into()))?;
109    let end = start
110      .checked_add(length)
111      .ok_or_else(|| Error::InvalidDatabase("bucket length overflow".into()))?;
112
113    let mut file = std::fs::File::open(&self.path)?;
114    let file_len = file.metadata()?.len();
115    if end > file_len {
116      return Err(Error::InvalidDatabase(
117        "bucket slice out of bounds".into(),
118      ));
119    }
120
121    let length_usize = length.try_into().map_err(|_| {
122      Error::InvalidDatabase("bucket length is too large for this platform".into())
123    })?;
124    let mut compressed = vec![0u8; length_usize];
125    file.seek(SeekFrom::Start(start))?;
126    file.read_exact(&mut compressed)?;
127
128    let decompressed = zstd::decode_all(compressed.as_slice())
129      .map_err(|e| Error::InvalidDatabase(format!("zstd error: {e}")))?;
130
131    let text = String::from_utf8(decompressed).map_err(|_| {
132      Error::InvalidDatabase("non-UTF-8 database content".into())
133    })?;
134
135    Ok(
136      text
137        .lines()
138        .filter(|l| !l.is_empty())
139        .map(String::from)
140        .collect(),
141    )
142  }
143
144  /// The bucket index for `query`: the first byte value, or 0 for empty input.
145  pub(crate) fn query_bucket(query: &str) -> usize {
146    query.bytes().next().map(|b| b as usize).unwrap_or(0)
147  }
148}
149
150/// Parse the DB kind from the header line, e.g. `"# spam-db-v2\toptions"`.
151fn parse_kind(header: &str) -> Result<DbKind> {
152  let rest = header.strip_prefix(DB_MAGIC).ok_or_else(|| {
153    Error::InvalidDatabase("missing spam-db magic header".into())
154  })?;
155
156  let kind_str = rest.strip_prefix('\t').unwrap_or(rest);
157
158  match kind_str {
159    "options" => Ok(DbKind::Options),
160    "packages" => Ok(DbKind::Packages),
161    other => Err(Error::InvalidDatabase(format!(
162      "unknown database kind: {other}"
163    ))),
164  }
165}
166
/// Decode the little-endian `u64` stored in `data[offset..offset + 8]`.
///
/// # Panics
///
/// Panics if `data` does not contain eight bytes starting at `offset`; callers
/// are expected to guarantee that range.
fn read_u64le(data: &[u8], offset: usize) -> u64 {
  let mut raw = [0u8; 8];
  raw.copy_from_slice(&data[offset..offset + 8]);
  u64::from_le_bytes(raw)
}