Skip to main content

spam_db/
format.rs

/// Magic string that every spam database header line starts with.
pub(crate) const DB_MAGIC: &str = "# spam-db-v3";

/// How many buckets the on-disk index is divided into.
pub(crate) const INDEX_BUCKETS: usize = 256;

/// Size in bytes of a single index entry.
///
/// An entry is two little-endian `u64` fields: the bucket's offset followed
/// by its length.
pub(crate) const INDEX_ENTRY_SIZE: usize = 2 * std::mem::size_of::<u64>();

/// Size in bytes of the whole index (one entry per bucket).
pub(crate) const INDEX_SIZE: usize = INDEX_ENTRY_SIZE * INDEX_BUCKETS;
12
13use std::{
14    io::{Read, Seek, SeekFrom},
15    path::{Path, PathBuf},
16};
17
18use crate::{Error, Result};
19
/// The flavour of content held by a spam database.
///
/// The kind is recorded in the database header line and decides whether the
/// file stores NixOS module options or package file paths.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DbKind {
    /// NixOS module options, produced from `nixosOptionsDoc`.
    Options,
    /// Package file paths, built from a local manifest via `spam db build`.
    Packages,
    /// Autonomous package index, produced by `spam index`.
    Index,
}
30
31/// An open spam database file.
32///
33/// Only the fixed-size bucket index is loaded into memory. Bucket lookups read
34/// and decompress the relevant payload slice on demand.
35///
36/// Layout:
37/// ```text
38/// [header line]\n
39/// [256 x 16-byte index entries]
40/// [concatenated zstd-compressed bucket blobs]
41/// ```
42#[derive(Debug)]
43pub(crate) struct DbFile {
44    pub(crate) kind: DbKind,
45    index: Option<[u8; INDEX_SIZE]>,
46    path: PathBuf,
47    data_start: u64,
48}
49
50impl DbFile {
51    /// Load a spam database from `path`.
52    pub(crate) fn open(path: impl AsRef<Path>) -> Result<Self> {
53        let path = path.as_ref().to_owned();
54        let mut file = std::fs::File::open(&path)?;
55
56        let mut header_bytes = Vec::new();
57        loop {
58            let mut byte = [0u8; 1];
59            if file.read(&mut byte)? == 0 {
60                return Err(Error::InvalidDatabase("missing header newline".into()));
61            }
62            if byte[0] == b'\n' {
63                break;
64            }
65            header_bytes.push(byte[0]);
66        }
67
68        let header = std::str::from_utf8(&header_bytes)
69            .map_err(|_| Error::InvalidDatabase("non-UTF-8 header".into()))?;
70
71        let kind = parse_kind(header)?;
72
73        let has_bucket_index = kind != DbKind::Index;
74        let index_size = if has_bucket_index { INDEX_SIZE } else { 0 };
75
76        let data_start = u64::try_from(header_bytes.len() + 1 + index_size)
77            .map_err(|_| Error::InvalidDatabase("database header is too large".into()))?;
78
79        if file.metadata()?.len() < data_start {
80            return Err(Error::InvalidDatabase(
81                "file is too short to contain index".into(),
82            ));
83        }
84
85        let index = if has_bucket_index {
86            let mut index = [0u8; INDEX_SIZE];
87            file.read_exact(&mut index)?;
88            Some(index)
89        } else {
90            None
91        };
92
93        Ok(Self {
94            kind,
95            index,
96            path,
97            data_start,
98        })
99    }
100
101    /// Decompress and return all non-empty lines in `bucket`.
102    pub(crate) fn bucket_lines(&self, bucket: usize) -> Result<Vec<String>> {
103        let index = self.index.as_ref().ok_or_else(|| {
104            Error::InvalidDatabase("database does not contain a bucket index".into())
105        })?;
106        let entry = bucket * INDEX_ENTRY_SIZE;
107        let offset = read_u64le(index, entry).try_into().map_err(|_| {
108            Error::InvalidDatabase("bucket offset is too large for this platform".into())
109        })?;
110        let length = read_u64le(index, entry + 8).try_into().map_err(|_| {
111            Error::InvalidDatabase("bucket length is too large for this platform".into())
112        })?;
113
114        if length == 0 {
115            return Ok(Vec::new());
116        }
117
118        let start = self
119            .data_start
120            .checked_add(offset)
121            .ok_or_else(|| Error::InvalidDatabase("bucket offset overflow".into()))?;
122        let end = start
123            .checked_add(length)
124            .ok_or_else(|| Error::InvalidDatabase("bucket length overflow".into()))?;
125
126        let mut file = std::fs::File::open(&self.path)?;
127        let file_len = file.metadata()?.len();
128        if end > file_len {
129            return Err(Error::InvalidDatabase("bucket slice out of bounds".into()));
130        }
131
132        let length_usize = length.try_into().map_err(|_| {
133            Error::InvalidDatabase("bucket length is too large for this platform".into())
134        })?;
135        let mut compressed = vec![0u8; length_usize];
136        file.seek(SeekFrom::Start(start))?;
137        file.read_exact(&mut compressed)?;
138
139        let decompressed = zstd::decode_all(compressed.as_slice())
140            .map_err(|e| Error::InvalidDatabase(format!("zstd error: {e}")))?;
141
142        let text = String::from_utf8(decompressed)
143            .map_err(|_| Error::InvalidDatabase("non-UTF-8 database content".into()))?;
144
145        Ok(text
146            .lines()
147            .filter(|l| !l.is_empty())
148            .map(String::from)
149            .collect())
150    }
151
152    /// The bucket index for `query`: the first byte value, or 0 for empty input.
153    pub(crate) fn query_bucket(query: &str) -> usize {
154        query.bytes().next().map(|b| b as usize).unwrap_or(0)
155    }
156
157    /// Decompress and return all non-empty lines from a stream database.
158    pub(crate) fn stream_lines(&self) -> Result<Vec<String>> {
159        let mut file = std::fs::File::open(&self.path)?;
160        let file_len = file.metadata()?.len();
161        if self.data_start > file_len {
162            return Err(Error::InvalidDatabase(
163                "stream payload starts past end of file".into(),
164            ));
165        }
166
167        let length = usize::try_from(file_len - self.data_start).map_err(|_| {
168            Error::InvalidDatabase("stream payload is too large for this platform".into())
169        })?;
170        let mut compressed = vec![0u8; length];
171        file.seek(SeekFrom::Start(self.data_start))?;
172        file.read_exact(&mut compressed)?;
173
174        let decompressed = zstd::decode_all(compressed.as_slice())
175            .map_err(|e| Error::InvalidDatabase(format!("zstd error: {e}")))?;
176        let text = String::from_utf8(decompressed)
177            .map_err(|_| Error::InvalidDatabase("non-UTF-8 database content".into()))?;
178
179        Ok(text
180            .lines()
181            .filter(|l| !l.is_empty())
182            .map(String::from)
183            .collect())
184    }
185}
186
187/// Parse the DB kind from the header line, e.g. `"# spam-db-v3\toptions"`.
188fn parse_kind(header: &str) -> Result<DbKind> {
189    let rest = header
190        .strip_prefix(DB_MAGIC)
191        .ok_or_else(|| Error::InvalidDatabase("missing spam-db magic header".into()))?;
192
193    let kind_str = rest.strip_prefix('\t').unwrap_or(rest);
194
195    match kind_str {
196        "options" => Ok(DbKind::Options),
197        "packages" => Ok(DbKind::Packages),
198        "index" => Ok(DbKind::Index),
199        other => Err(Error::InvalidDatabase(format!(
200            "unknown database kind: {other}"
201        ))),
202    }
203}
204
/// Decode the eight bytes of `data` starting at `offset` as a little-endian `u64`.
///
/// # Panics
///
/// Panics if `data` does not contain eight bytes starting at `offset`; callers
/// are expected to guarantee that it does.
fn read_u64le(data: &[u8], offset: usize) -> u64 {
    let mut raw = [0u8; 8];
    // Slicing panics on an out-of-range window, before any bytes are copied.
    raw.copy_from_slice(&data[offset..offset + 8]);
    u64::from_le_bytes(raw)
}