// ix/posting.rs
//! Posting list encode/decode (delta + varint + ZSTD compression).
//!
//! Compact representation of (`file_id`, `offsets`) for a single trigram.
//! ZSTD compression provides ~60-70% size reduction on posting data.

use crate::error::{Error, Result};
use crate::varint;

/// A collection of per-file posting entries for a single trigram.
///
/// Entries must be ordered by ascending `file_id`: `encode` stores the
/// difference between consecutive ids, so out-of-order entries would make
/// the delta subtraction underflow.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PostingList {
    /// Ordered list of files that contain this trigram, with their hit offsets.
    pub entries: Vec<PostingEntry>,
}
15
/// One file's hit data within a posting list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PostingEntry {
    /// Unique file identifier in the index.
    pub file_id: u32,
    /// Byte offsets (absolute within the file) where the trigram occurs.
    /// Must be ascending: `encode` stores deltas between consecutive offsets.
    pub offsets: Vec<u32>,
}
24
25impl PostingList {
26    /// ZSTD compression level used for posting list and CDX block encoding.
27    pub const ZSTD_COMPRESSION_LEVEL: i32 = 3;
28
29    /// Encode the posting list into a compressed byte buffer.
30    /// Format: ZSTD(varint-encoded posting data)
31    /// ZSTD's built-in `XXHash64` checksum provides integrity verification.
32    #[must_use]
33    pub fn encode(&self) -> Vec<u8> {
34        let mut buf = Vec::new();
35        varint::encode(u64::try_from(self.entries.len()).unwrap_or(0), &mut buf);
36
37        let mut last_file_id = 0u32;
38        for entry in &self.entries {
39            let file_id_delta = entry.file_id - last_file_id;
40            varint::encode(u64::from(file_id_delta), &mut buf);
41            last_file_id = entry.file_id;
42
43            varint::encode(u64::try_from(entry.offsets.len()).unwrap_or(0), &mut buf);
44            let mut last_offset = 0u32;
45            for &offset in &entry.offsets {
46                let offset_delta = offset - last_offset;
47                varint::encode(u64::from(offset_delta), &mut buf);
48                last_offset = offset;
49            }
50        }
51
52        zstd::encode_all(&buf[..], Self::ZSTD_COMPRESSION_LEVEL).unwrap_or(buf)
53    }
54
55    /// Decode the posting list from a compressed byte slice.
56    /// ZSTD decompression verifies the built-in checksum automatically.
57    ///
58    /// # Errors
59    ///
60    /// Returns an error if the ZSTD data is corrupted (checksum mismatch) or if
61    /// the varint-decoded structure is invalid.
62    pub fn decode(data: &[u8]) -> Result<Self> {
63        let payload = zstd::decode_all(data).map_err(|_| Error::PostingCorrupted)?;
64
65        let mut pos = 0;
66        let num_files = usize::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
67        let mut entries = Vec::with_capacity(num_files);
68
69        let mut last_file_id = 0u32;
70        for _ in 0..num_files {
71            let file_id_delta = u32::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
72            let file_id = last_file_id + file_id_delta;
73            last_file_id = file_id;
74
75            let num_offsets = usize::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
76            let mut offsets = Vec::with_capacity(num_offsets);
77            let mut last_offset = 0u32;
78            for _ in 0..num_offsets {
79                let offset_delta = u32::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
80                let offset = last_offset + offset_delta;
81                last_offset = offset;
82                offsets.push(offset);
83            }
84            entries.push(PostingEntry { file_id, offsets });
85        }
86
87        Ok(Self { entries })
88    }
89}
90
#[cfg(test)]
#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
mod tests {
    use super::*;

    /// Shorthand for building a posting entry from a slice of offsets.
    fn entry(file_id: u32, offsets: &[u32]) -> PostingEntry {
        PostingEntry {
            file_id,
            offsets: offsets.to_vec(),
        }
    }

    #[test]
    fn roundtrip() {
        let original = PostingList {
            entries: vec![
                entry(5, &[100, 340, 342]),
                entry(12, &[44]),
                entry(15, &[200, 880]),
            ],
        };

        let restored = PostingList::decode(&original.encode()).unwrap();
        assert_eq!(original, restored);
    }

    #[test]
    fn test_corruption_detection() {
        let list = PostingList {
            entries: vec![entry(1, &[10, 20])],
        };
        let mut bytes = list.encode();

        // ZSTD's built-in checksum should detect corruption
        bytes[0] ^= 0xFF;

        assert!(
            PostingList::decode(&bytes).is_err(),
            "Decoding corrupted ZSTD data should fail"
        );
    }

    #[test]
    fn empty() {
        let list = PostingList { entries: Vec::new() };
        let decoded = PostingList::decode(&list.encode()).unwrap();
        assert_eq!(list, decoded);
    }

    #[test]
    fn compression_ratio() {
        let entries: Vec<PostingEntry> = (0..1000)
            .map(|i| PostingEntry {
                file_id: i,
                offsets: (0..100).map(|j| i * 100 + j).collect(),
            })
            .collect();
        let encoded = PostingList { entries }.encode();

        // ZSTD should compress significantly
        assert!(
            encoded.len() < 50000,
            "Expected compression, got {} bytes",
            encoded.len()
        );
    }
}
164}