// sift_core/storage/lexicon.rs
1//! Sorted trigram → postings slice descriptor.
2
3use std::fs::File;
4use std::io::{BufWriter, Write};
5use std::path::Path;
6
7use memmap2::Mmap;
8
9use crate::storage::format::{write_magic, LEXICON_MAGIC};
10use crate::storage::mmap::open_mmap;
11
/// One lexicon record: a 3-byte trigram key plus the descriptor of its
/// postings slice. Entries are stored sorted by `trigram` so the mapped
/// table can be binary-searched (see `MappedLexicon::get`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexiconEntry {
    /// The trigram key; sort key of the on-disk table.
    pub trigram: [u8; 3],
    /// Start of this trigram's postings slice.
    /// NOTE(review): presumably a byte offset into the postings file — confirm against the postings writer.
    pub offset: u64,
    /// Extent of the postings slice, paired with `offset`.
    pub len: u32,
}
18
19/// Write sorted `entries` to `out_path`.
20///
21/// # Errors
22///
23/// Propagates IO errors from writing `out_path`.
24pub fn write_lexicon(out_path: &Path, entries: &[LexiconEntry]) -> std::io::Result<()> {
25    let f = File::create(out_path)?;
26    let mut w = BufWriter::new(f);
27    write_magic(&mut w, LEXICON_MAGIC)?;
28    let n: u32 = entries
29        .len()
30        .try_into()
31        .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidInput, "lexicon too large"))?;
32    w.write_all(&n.to_le_bytes())?;
33    for e in entries {
34        w.write_all(&e.trigram)?;
35        w.write_all(&e.offset.to_le_bytes())?;
36        w.write_all(&e.len.to_le_bytes())?;
37    }
38    w.flush()?;
39    Ok(())
40}
41
42/// Memory-mapped lexicon view with owning storage.
43///
44/// Binary search over on-disk sorted trigram table.
/// Memory-mapped lexicon view with owning storage.
///
/// Binary search over on-disk sorted trigram table.
#[derive(Debug)]
pub struct MappedLexicon {
    // Raw bytes in the serialized lexicon layout (magic + count + records),
    // either mmapped from disk or owned in memory.
    backing: Backing,
    // Number of entries, cached from the header at construction time.
    count: usize,
}
50
/// Storage behind a `MappedLexicon`: a live file mapping, or an in-memory
/// buffer with the same byte layout (built by `from_entries`).
#[derive(Debug)]
enum Backing {
    /// Read-only memory map of an on-disk lexicon file.
    Mmap(Mmap),
    /// Heap-owned serialized lexicon bytes.
    Owned(Vec<u8>),
}
56
57impl MappedLexicon {
58    const ENTRY_SIZE: usize = 15;
59
60    fn bytes(&self) -> &[u8] {
61        match &self.backing {
62            Backing::Mmap(m) => m.as_ref(),
63            Backing::Owned(v) => v.as_slice(),
64        }
65    }
66
67    /// Construct from a list of lexicon entries (used by the builder for in-memory index).
68    ///
69    /// # Panics
70    ///
71    /// Panics if `entries.len()` exceeds `u32::MAX`.
72    #[must_use]
73    pub fn from_entries(entries: &[LexiconEntry]) -> Self {
74        let count = entries.len();
75        let size = LEXICON_MAGIC.len() + 4 + count * Self::ENTRY_SIZE;
76        let mut data = vec![0u8; size];
77        let mut cursor = 0;
78
79        data[cursor..cursor + LEXICON_MAGIC.len()].copy_from_slice(&LEXICON_MAGIC);
80        cursor += LEXICON_MAGIC.len();
81
82        data[cursor..cursor + 4].copy_from_slice(&u32::try_from(count).unwrap().to_le_bytes());
83        cursor += 4;
84
85        for e in entries {
86            data[cursor..cursor + 3].copy_from_slice(&e.trigram);
87            cursor += 3;
88            data[cursor..cursor + 8].copy_from_slice(&e.offset.to_le_bytes());
89            cursor += 8;
90            data[cursor..cursor + 4].copy_from_slice(&e.len.to_le_bytes());
91            cursor += 4;
92        }
93
94        Self {
95            backing: Backing::Owned(data),
96            count,
97        }
98    }
99
100    /// Open a lexicon from a memory-mapped file.
101    ///
102    /// # Errors
103    ///
104    /// Returns an error if the file is malformed.
105    pub fn open(path: &Path) -> std::io::Result<Self> {
106        let mmap = open_mmap(path)?;
107        let bytes = mmap.as_ref();
108        let count = Self::validate(bytes)?;
109        Ok(Self {
110            backing: Backing::Mmap(mmap),
111            count,
112        })
113    }
114
115    fn validate(bytes: &[u8]) -> std::io::Result<usize> {
116        let magic_len = LEXICON_MAGIC.len();
117        if bytes.len() < magic_len + 4 {
118            return Err(std::io::Error::new(
119                std::io::ErrorKind::InvalidData,
120                "lexicon too short for magic+count",
121            ));
122        }
123        if bytes[..magic_len] != LEXICON_MAGIC {
124            return Err(std::io::Error::new(
125                std::io::ErrorKind::InvalidData,
126                "unexpected lexicon magic",
127            ));
128        }
129        let n = u32::from_le_bytes(bytes[magic_len..magic_len + 4].try_into().unwrap()) as usize;
130        let expected_bytes = n * Self::ENTRY_SIZE;
131        if bytes.len() < magic_len + 4 + expected_bytes {
132            return Err(std::io::Error::new(
133                std::io::ErrorKind::InvalidData,
134                "lexicon truncated",
135            ));
136        }
137        Ok(n)
138    }
139
140    #[must_use]
141    pub const fn len(&self) -> usize {
142        self.count
143    }
144
145    #[must_use]
146    pub const fn is_empty(&self) -> bool {
147        self.count == 0
148    }
149
150    /// Binary search for `tri` in the mapped lexicon. Returns the entry if found.
151    ///
152    /// # Panics
153    ///
154    /// Panics if the internal data layout is corrupted (out of bounds read).
155    #[must_use]
156    pub fn get(&self, tri: [u8; 3]) -> Option<LexiconEntry> {
157        if self.count == 0 {
158            return None;
159        }
160        let bytes = self.bytes();
161        let magic_len = LEXICON_MAGIC.len();
162        let data_start = magic_len + 4;
163
164        let mut lo = 0;
165        let mut hi = self.count;
166
167        while lo < hi {
168            let mid = lo + (hi - lo) / 2;
169            let offset = data_start + mid * Self::ENTRY_SIZE;
170            let entry_tri: [u8; 3] = bytes[offset..offset + 3].try_into().unwrap();
171
172            match entry_tri.cmp(&tri) {
173                std::cmp::Ordering::Less => lo = mid + 1,
174                std::cmp::Ordering::Greater => hi = mid,
175                std::cmp::Ordering::Equal => {
176                    let off =
177                        u64::from_le_bytes(bytes[offset + 3..offset + 11].try_into().unwrap());
178                    let len =
179                        u32::from_le_bytes(bytes[offset + 11..offset + 15].try_into().unwrap());
180                    return Some(LexiconEntry {
181                        trigram: entry_tri,
182                        offset: off,
183                        len,
184                    });
185                }
186            }
187        }
188        None
189    }
190
191    #[must_use]
192    pub const fn iter(&self) -> MappedLexiconIter<'_> {
193        MappedLexiconIter {
194            lexicon: self,
195            pos: 0,
196        }
197    }
198
199    #[must_use]
200    pub fn backing_slice(&self) -> &[u8] {
201        self.bytes()
202    }
203}
204
205impl<'a> IntoIterator for &'a MappedLexicon {
206    type Item = LexiconEntry;
207    type IntoIter = MappedLexiconIter<'a>;
208
209    fn into_iter(self) -> Self::IntoIter {
210        MappedLexiconIter {
211            lexicon: self,
212            pos: 0,
213        }
214    }
215}
216
217pub struct MappedLexiconIter<'_a> {
218    lexicon: &'_a MappedLexicon,
219    pos: usize,
220}
221
222impl Iterator for MappedLexiconIter<'_> {
223    type Item = LexiconEntry;
224
225    fn next(&mut self) -> Option<Self::Item> {
226        if self.pos >= self.lexicon.count {
227            return None;
228        }
229        let bytes = self.lexicon.bytes();
230        let magic_len = LEXICON_MAGIC.len();
231        let offset = magic_len + 4 + self.pos * MappedLexicon::ENTRY_SIZE;
232        let tri: [u8; 3] = bytes[offset..offset + 3].try_into().unwrap();
233        let off = u64::from_le_bytes(bytes[offset + 3..offset + 11].try_into().unwrap());
234        let len = u32::from_le_bytes(bytes[offset + 11..offset + 15].try_into().unwrap());
235        self.pos += 1;
236        Some(LexiconEntry {
237            trigram: tri,
238            offset: off,
239            len,
240        })
241    }
242}