Skip to main content

ix/
posting.rs

1//! Posting list encode/decode (delta + varint + ZSTD compression).
2//!
3//! Compact representation of (`file_id`, `offsets`) for a single trigram.
4//! ZSTD compression provides ~60-70% size reduction on posting data.
5
6use crate::error::{Error, Result};
7use crate::varint;
8
9/// A collection of per-file posting entries for a single trigram.
10#[derive(Debug, Clone, PartialEq, Eq)]
11pub struct PostingList {
12    /// Ordered list of files that contain this trigram, with their hit offsets.
13    pub entries: Vec<PostingEntry>,
14}
15
16/// One file's hit data within a posting list.
17#[derive(Debug, Clone, PartialEq, Eq)]
18pub struct PostingEntry {
19    /// Unique file identifier in the index.
20    pub file_id: u32,
21    /// Byte offsets (absolute within the file) where the trigram occurs.
22    pub offsets: Vec<u32>,
23}
24
25impl PostingList {
26    const ZSTD_COMPRESSION_LEVEL: i32 = 3;
27
28    /// Encode the posting list into a compressed byte buffer.
29    /// Format: ZSTD(varint-encoded posting data)
30    /// ZSTD's built-in `XXHash64` checksum provides integrity verification.
31    #[must_use] 
32    pub fn encode(&self) -> Vec<u8> {
33        let mut buf = Vec::new();
34        varint::encode(u64::try_from(self.entries.len()).unwrap_or(0), &mut buf);
35
36        let mut last_file_id = 0u32;
37        for entry in &self.entries {
38            let file_id_delta = entry.file_id - last_file_id;
39            varint::encode(u64::from(file_id_delta), &mut buf);
40            last_file_id = entry.file_id;
41
42            varint::encode(u64::try_from(entry.offsets.len()).unwrap_or(0), &mut buf);
43            let mut last_offset = 0u32;
44            for &offset in &entry.offsets {
45                let offset_delta = offset - last_offset;
46                varint::encode(u64::from(offset_delta), &mut buf);
47                last_offset = offset;
48            }
49        }
50
51        zstd::encode_all(&buf[..], Self::ZSTD_COMPRESSION_LEVEL).unwrap_or(buf)
52    }
53
54    /// Decode the posting list from a compressed byte slice.
55    /// ZSTD decompression verifies the built-in checksum automatically.
56    ///
57    /// # Errors
58    ///
59    /// Returns an error if the ZSTD data is corrupted (checksum mismatch) or if
60    /// the varint-decoded structure is invalid.
61    pub fn decode(data: &[u8]) -> Result<Self> {
62        let payload = zstd::decode_all(data).map_err(|_| Error::PostingCorrupted)?;
63
64        let mut pos = 0;
65        let num_files = usize::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
66        let mut entries = Vec::with_capacity(num_files);
67
68        let mut last_file_id = 0u32;
69        for _ in 0..num_files {
70            let file_id_delta = u32::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
71            let file_id = last_file_id + file_id_delta;
72            last_file_id = file_id;
73
74            let num_offsets = usize::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
75            let mut offsets = Vec::with_capacity(num_offsets);
76            let mut last_offset = 0u32;
77            for _ in 0..num_offsets {
78                let offset_delta = u32::try_from(varint::decode(&payload, &mut pos)?).unwrap_or(0);
79                let offset = last_offset + offset_delta;
80                last_offset = offset;
81                offsets.push(offset);
82            }
83            entries.push(PostingEntry { file_id, offsets });
84        }
85
86        Ok(Self { entries })
87    }
88}
89
90#[cfg(test)]
91#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
92mod tests {
93    use super::*;
94
95    #[test]
96    fn roundtrip() {
97        let list = PostingList {
98            entries: vec![
99                PostingEntry {
100                    file_id: 5,
101                    offsets: vec![100, 340, 342],
102                },
103                PostingEntry {
104                    file_id: 12,
105                    offsets: vec![44],
106                },
107                PostingEntry {
108                    file_id: 15,
109                    offsets: vec![200, 880],
110                },
111            ],
112        };
113
114        let encoded = list.encode();
115        let decoded = PostingList::decode(&encoded).unwrap();
116        assert_eq!(list, decoded);
117    }
118
119    #[test]
120    fn test_corruption_detection() {
121        let list = PostingList {
122            entries: vec![PostingEntry {
123                file_id: 1,
124                offsets: vec![10, 20],
125            }],
126        };
127        let mut encoded = list.encode();
128
129        // ZSTD's built-in checksum should detect corruption
130        encoded[0] ^= 0xFF;
131
132        let result = PostingList::decode(&encoded);
133        assert!(result.is_err(), "Decoding corrupted ZSTD data should fail");
134    }
135
136    #[test]
137    fn empty() {
138        let list = PostingList { entries: vec![] };
139        let encoded = list.encode();
140        let decoded = PostingList::decode(&encoded).unwrap();
141        assert_eq!(list, decoded);
142    }
143
144    #[test]
145    fn compression_ratio() {
146        let mut entries = Vec::new();
147        for i in 0..1000 {
148            entries.push(PostingEntry {
149                file_id: i,
150                offsets: (0..100).map(|j| i * 100 + j).collect(),
151            });
152        }
153        let list = PostingList { entries };
154        let encoded = list.encode();
155
156        // ZSTD should compress significantly
157        assert!(
158            encoded.len() < 50000,
159            "Expected compression, got {} bytes",
160            encoded.len()
161        );
162    }
163}