Skip to main content

spam_db/
format.rs

/// The magic prefix that identifies all spam databases.
///
/// Every database header line must start with this string; `parse_kind`
/// rejects any header that does not.
pub(crate) const DB_MAGIC: &str = "# spam-db-v2";

/// Number of index buckets.
///
/// `DbFile::query_bucket` maps a query to the value of its first byte, so
/// there is one bucket for each possible byte value.
pub(crate) const INDEX_BUCKETS: usize = 256;

/// Bytes per index entry: 8-byte little-endian offset, 8-byte little-endian length.
pub(crate) const INDEX_ENTRY_SIZE: usize = 16;

/// Total index size in bytes (`INDEX_BUCKETS` entries of `INDEX_ENTRY_SIZE`
/// bytes each, i.e. 4096).
pub(crate) const INDEX_SIZE: usize = INDEX_BUCKETS * INDEX_ENTRY_SIZE;
12
13use std::{
14  io::{Read, Seek, SeekFrom},
15  path::{Path, PathBuf},
16};
17
18use crate::{Error, Result};
19
/// What a spam database stores, as declared by the kind name in its header
/// line.
///
/// `parse_kind` maps the header names `options`, `packages`, and `index` to
/// the variants below.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DbKind {
  /// Options database produced from `nixosOptionsDoc`.
  ///
  /// Spelled `options` in the header line.
  Options,
  /// Package-file database built from a local manifest via `spam db build`.
  ///
  /// Spelled `packages` in the header line.
  Packages,
  /// Autonomous package index produced by `spam index`.
  ///
  /// Spelled `index` in the header line.
  Index,
}
30
/// An open spam database file.
///
/// Only the fixed-size bucket index is loaded into memory. Bucket lookups read
/// and decompress the relevant payload slice on demand.
///
/// Layout:
/// ```text
/// [header line]\n
/// [256 x 16-byte index entries]
/// [concatenated zstd-compressed bucket blobs]
/// ```
///
/// Each index entry is an 8-byte little-endian offset followed by an 8-byte
/// little-endian length. Offsets are relative to the first byte after the
/// index, not to the start of the file.
#[derive(Debug)]
pub(crate) struct DbFile {
  /// Database kind parsed from the header line.
  pub(crate) kind: DbKind,
  /// Raw bytes of the bucket index, exactly as stored in the file.
  index: [u8; INDEX_SIZE],
  /// Location of the database file; it is reopened for every bucket read.
  path: PathBuf,
  /// Absolute file offset of the first payload byte: header length, plus one
  /// for the newline, plus `INDEX_SIZE`.
  data_start: u64,
}
49
50impl DbFile {
51  /// Load a spam database from `path`.
52  pub(crate) fn open(path: impl AsRef<Path>) -> Result<Self> {
53    let path = path.as_ref().to_owned();
54    let mut file = std::fs::File::open(&path)?;
55
56    let mut header_bytes = Vec::new();
57    loop {
58      let mut byte = [0u8; 1];
59      if file.read(&mut byte)? == 0 {
60        return Err(Error::InvalidDatabase("missing header newline".into()));
61      }
62      if byte[0] == b'\n' {
63        break;
64      }
65      header_bytes.push(byte[0]);
66    }
67
68    let header = std::str::from_utf8(&header_bytes)
69      .map_err(|_| Error::InvalidDatabase("non-UTF-8 header".into()))?;
70
71    let kind = parse_kind(header)?;
72
73    let data_start = u64::try_from(header_bytes.len() + 1 + INDEX_SIZE)
74      .map_err(|_| Error::InvalidDatabase("database header is too large".into()))?;
75
76    if file.metadata()?.len() < data_start {
77      return Err(Error::InvalidDatabase(
78        "file is too short to contain index".into(),
79      ));
80    }
81
82    let mut index = [0u8; INDEX_SIZE];
83    file.read_exact(&mut index)?;
84
85    Ok(Self {
86      kind,
87      index,
88      path,
89      data_start,
90    })
91  }
92
93  /// Decompress and return all non-empty lines in `bucket`.
94  pub(crate) fn bucket_lines(&self, bucket: usize) -> Result<Vec<String>> {
95    let entry = bucket * INDEX_ENTRY_SIZE;
96    let offset = read_u64le(&self.index, entry).try_into().map_err(|_| {
97      Error::InvalidDatabase("bucket offset is too large for this platform".into())
98    })?;
99    let length = read_u64le(&self.index, entry + 8).try_into().map_err(|_| {
100      Error::InvalidDatabase("bucket length is too large for this platform".into())
101    })?;
102
103    if length == 0 {
104      return Ok(Vec::new());
105    }
106
107    let start = self
108      .data_start
109      .checked_add(offset)
110      .ok_or_else(|| Error::InvalidDatabase("bucket offset overflow".into()))?;
111    let end = start
112      .checked_add(length)
113      .ok_or_else(|| Error::InvalidDatabase("bucket length overflow".into()))?;
114
115    let mut file = std::fs::File::open(&self.path)?;
116    let file_len = file.metadata()?.len();
117    if end > file_len {
118      return Err(Error::InvalidDatabase(
119        "bucket slice out of bounds".into(),
120      ));
121    }
122
123    let length_usize = length.try_into().map_err(|_| {
124      Error::InvalidDatabase("bucket length is too large for this platform".into())
125    })?;
126    let mut compressed = vec![0u8; length_usize];
127    file.seek(SeekFrom::Start(start))?;
128    file.read_exact(&mut compressed)?;
129
130    let decompressed = zstd::decode_all(compressed.as_slice())
131      .map_err(|e| Error::InvalidDatabase(format!("zstd error: {e}")))?;
132
133    let text = String::from_utf8(decompressed).map_err(|_| {
134      Error::InvalidDatabase("non-UTF-8 database content".into())
135    })?;
136
137    Ok(
138      text
139        .lines()
140        .filter(|l| !l.is_empty())
141        .map(String::from)
142        .collect(),
143    )
144  }
145
146  /// The bucket index for `query`: the first byte value, or 0 for empty input.
147  pub(crate) fn query_bucket(query: &str) -> usize {
148    query.bytes().next().map(|b| b as usize).unwrap_or(0)
149  }
150}
151
152/// Parse the DB kind from the header line, e.g. `"# spam-db-v2\toptions"`.
153fn parse_kind(header: &str) -> Result<DbKind> {
154  let rest = header.strip_prefix(DB_MAGIC).ok_or_else(|| {
155    Error::InvalidDatabase("missing spam-db magic header".into())
156  })?;
157
158  let kind_str = rest.strip_prefix('\t').unwrap_or(rest);
159
160  match kind_str {
161    "options" => Ok(DbKind::Options),
162    "packages" => Ok(DbKind::Packages),
163    "index" => Ok(DbKind::Index),
164    other => Err(Error::InvalidDatabase(format!(
165      "unknown database kind: {other}"
166    ))),
167  }
168}
169
/// Decode the eight bytes of `data` beginning at `offset` as a little-endian
/// `u64`.
///
/// # Panics
///
/// Panics if `data` holds fewer than `offset + 8` bytes; callers are expected
/// to pass an offset that leaves a full eight-byte window.
fn read_u64le(data: &[u8], offset: usize) -> u64 {
  let mut raw = [0u8; 8];
  // `copy_from_slice` needs equal lengths, which the 8-byte range guarantees.
  raw.copy_from_slice(&data[offset..offset + 8]);
  u64::from_le_bytes(raw)
}