Skip to main content

ix/
posting.rs

1//! Posting list encode/decode (delta + varint + ZSTD compression).
2//!
3//! Compact representation of (file_id, [offsets]) for a single trigram.
4//! ZSTD compression provides ~60-70% size reduction on posting data.
5
6use crate::error::{Error, Result};
7use crate::varint;
8
9#[derive(Debug, Clone, PartialEq, Eq)]
10pub struct PostingList {
11    pub entries: Vec<PostingEntry>,
12}
13
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct PostingEntry {
16    pub file_id: u32,
17    pub offsets: Vec<u32>,
18}
19
20impl PostingList {
21    const ZSTD_COMPRESSION_LEVEL: i32 = 3;
22
23    /// Encode the posting list into a compressed byte buffer.
24    /// Format: ZSTD(varint-encoded posting data)
25    /// ZSTD's built-in XXHash64 checksum provides integrity verification.
26    pub fn encode(&self) -> Vec<u8> {
27        let mut buf = Vec::new();
28        varint::encode(self.entries.len() as u64, &mut buf);
29
30        let mut last_file_id = 0u32;
31        for entry in &self.entries {
32            let file_id_delta = entry.file_id - last_file_id;
33            varint::encode(file_id_delta as u64, &mut buf);
34            last_file_id = entry.file_id;
35
36            varint::encode(entry.offsets.len() as u64, &mut buf);
37            let mut last_offset = 0u32;
38            for &offset in &entry.offsets {
39                let offset_delta = offset - last_offset;
40                varint::encode(offset_delta as u64, &mut buf);
41                last_offset = offset;
42            }
43        }
44
45        zstd::encode_all(&buf[..], Self::ZSTD_COMPRESSION_LEVEL).unwrap_or(buf)
46    }
47
48    /// Decode the posting list from a compressed byte slice.
49    /// ZSTD decompression verifies the built-in checksum automatically.
50    pub fn decode(data: &[u8]) -> Result<Self> {
51        let payload = zstd::decode_all(data).map_err(|_| Error::PostingCorrupted)?;
52
53        let mut pos = 0;
54        let num_files = varint::decode(&payload, &mut pos)? as usize;
55        let mut entries = Vec::with_capacity(num_files);
56
57        let mut last_file_id = 0u32;
58        for _ in 0..num_files {
59            let file_id_delta = varint::decode(&payload, &mut pos)? as u32;
60            let file_id = last_file_id + file_id_delta;
61            last_file_id = file_id;
62
63            let num_offsets = varint::decode(&payload, &mut pos)? as usize;
64            let mut offsets = Vec::with_capacity(num_offsets);
65            let mut last_offset = 0u32;
66            for _ in 0..num_offsets {
67                let offset_delta = varint::decode(&payload, &mut pos)? as u32;
68                let offset = last_offset + offset_delta;
69                last_offset = offset;
70                offsets.push(offset);
71            }
72            entries.push(PostingEntry { file_id, offsets });
73        }
74
75        Ok(PostingList { entries })
76    }
77}
78
79#[cfg(test)]
80mod tests {
81    use super::*;
82
83    #[test]
84    fn roundtrip() {
85        let list = PostingList {
86            entries: vec![
87                PostingEntry {
88                    file_id: 5,
89                    offsets: vec![100, 340, 342],
90                },
91                PostingEntry {
92                    file_id: 12,
93                    offsets: vec![44],
94                },
95                PostingEntry {
96                    file_id: 15,
97                    offsets: vec![200, 880],
98                },
99            ],
100        };
101
102        let encoded = list.encode();
103        let decoded = PostingList::decode(&encoded).unwrap();
104        assert_eq!(list, decoded);
105    }
106
107    #[test]
108    fn test_corruption_detection() {
109        let list = PostingList {
110            entries: vec![PostingEntry {
111                file_id: 1,
112                offsets: vec![10, 20],
113            }],
114        };
115        let mut encoded = list.encode();
116
117        // ZSTD's built-in checksum should detect corruption
118        encoded[0] ^= 0xFF;
119
120        let result = PostingList::decode(&encoded);
121        assert!(result.is_err(), "Decoding corrupted ZSTD data should fail");
122    }
123
124    #[test]
125    fn empty() {
126        let list = PostingList { entries: vec![] };
127        let encoded = list.encode();
128        let decoded = PostingList::decode(&encoded).unwrap();
129        assert_eq!(list, decoded);
130    }
131
132    #[test]
133    fn compression_ratio() {
134        let mut entries = Vec::new();
135        for i in 0..1000 {
136            entries.push(PostingEntry {
137                file_id: i,
138                offsets: (0..100).map(|j| i * 100 + j).collect(),
139            });
140        }
141        let list = PostingList { entries };
142        let encoded = list.encode();
143
144        // ZSTD should compress significantly
145        assert!(
146            encoded.len() < 50000,
147            "Expected compression, got {} bytes",
148            encoded.len()
149        );
150    }
151}