Skip to main content

ix/
posting.rs

1//! Posting list encode/decode (delta + varint + ZSTD compression).
2//!
3//! Compact representation of (`file_id`, `offsets`) for a single trigram.
4//! ZSTD compression provides ~60-70% size reduction on posting data.
5
6use crate::error::{Error, Result};
7use crate::varint;
8
9/// A collection of per-file posting entries for a single trigram.
10#[derive(Debug, Clone, PartialEq, Eq)]
11pub struct PostingList {
12    /// Ordered list of files that contain this trigram, with their hit offsets.
13    pub entries: Vec<PostingEntry>,
14}
15
16/// One file's hit data within a posting list.
17#[derive(Debug, Clone, PartialEq, Eq)]
18pub struct PostingEntry {
19    /// Unique file identifier in the index.
20    pub file_id: u32,
21    /// Byte offsets (absolute within the file) where the trigram occurs.
22    pub offsets: Vec<u32>,
23}
24
25impl PostingList {
26    /// ZSTD compression level used for posting list and CDX block encoding.
27    pub const ZSTD_COMPRESSION_LEVEL: i32 = 3;
28
29    /// Encode the posting list into a compressed byte buffer.
30    /// Format: ZSTD(varint-encoded posting data)
31    /// ZSTD's built-in `XXHash64` checksum provides integrity verification.
32    ///
33    /// # Errors
34    ///
35    /// Returns an error if ZSTD compression fails.
36    pub fn encode(&self) -> Result<Vec<u8>> {
37        let mut buf = Vec::new();
38        varint::encode(u64::try_from(self.entries.len()).unwrap_or(0), &mut buf);
39
40        let mut last_file_id = 0u32;
41        for entry in &self.entries {
42            let file_id_delta = entry.file_id - last_file_id;
43            varint::encode(u64::from(file_id_delta), &mut buf);
44            last_file_id = entry.file_id;
45
46            varint::encode(u64::try_from(entry.offsets.len()).unwrap_or(0), &mut buf);
47            let mut last_offset = 0u32;
48            for &offset in &entry.offsets {
49                let offset_delta = offset - last_offset;
50                varint::encode(u64::from(offset_delta), &mut buf);
51                last_offset = offset;
52            }
53        }
54
55        zstd::encode_all(&buf[..], Self::ZSTD_COMPRESSION_LEVEL)
56            .map_err(|e| Error::Config(format!("posting zstd encode: {e}")))
57    }
58
59    /// Decode the posting list from a compressed byte slice.
60    /// ZSTD decompression verifies the built-in checksum automatically.
61    ///
62    /// # Errors
63    ///
64    /// Returns an error if the ZSTD data is corrupted (checksum mismatch) or if
65    /// the varint-decoded structure is invalid.
66    pub fn decode(data: &[u8]) -> Result<Self> {
67        let payload = zstd::decode_all(data).map_err(|_| Error::PostingCorrupted)?;
68
69        let mut pos = 0;
70        let num_files = usize::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
71        let mut entries = Vec::with_capacity(num_files);
72
73        let mut last_file_id = 0u32;
74        for _ in 0..num_files {
75            let file_id_delta = u32::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
76            let file_id = last_file_id + file_id_delta;
77            last_file_id = file_id;
78
79            let num_offsets = usize::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
80            let mut offsets = Vec::with_capacity(num_offsets);
81            let mut last_offset = 0u32;
82            for _ in 0..num_offsets {
83                let offset_delta = u32::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
84                let offset = last_offset + offset_delta;
85                last_offset = offset;
86                offsets.push(offset);
87            }
88            entries.push(PostingEntry { file_id, offsets });
89        }
90
91        Ok(Self { entries })
92    }
93}
94
95#[cfg(test)]
96#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
97mod tests {
98    use super::*;
99
100    #[test]
101    fn roundtrip() {
102        let list = PostingList {
103            entries: vec![
104                PostingEntry {
105                    file_id: 5,
106                    offsets: vec![100, 340, 342],
107                },
108                PostingEntry {
109                    file_id: 12,
110                    offsets: vec![44],
111                },
112                PostingEntry {
113                    file_id: 15,
114                    offsets: vec![200, 880],
115                },
116            ],
117        };
118
119    let encoded = list.encode().unwrap();
120    let decoded = PostingList::decode(&encoded).unwrap();
121    assert_eq!(list, decoded);
122}
123
124#[test]
125fn test_corruption_detection() {
126        let list = PostingList {
127            entries: vec![PostingEntry {
128                file_id: 1,
129                offsets: vec![10, 20],
130            }],
131        };
132        let mut encoded = list.encode().unwrap();
133
134        // ZSTD's built-in checksum should detect corruption
135        encoded[0] ^= 0xFF;
136
137        let result = PostingList::decode(&encoded);
138        assert!(result.is_err(), "Decoding corrupted ZSTD data should fail");
139    }
140
141    #[test]
142    fn empty() {
143        let list = PostingList { entries: vec![] };
144    let encoded = list.encode().unwrap();
145    let decoded = PostingList::decode(&encoded).unwrap();
146    assert_eq!(list, decoded);
147}
148
149#[test]
150fn compression_ratio() {
151        let mut entries = Vec::new();
152        for i in 0..1000 {
153            entries.push(PostingEntry {
154                file_id: i,
155                offsets: (0..100).map(|j| i * 100 + j).collect(),
156            });
157        }
158    let list = PostingList { entries };
159    let encoded = list.encode().unwrap();
160
161    assert!(
162        encoded.len() < 50000,
163        "Expected compression, got {} bytes",
164        encoded.len()
165    );
166    }
167}