Skip to main content

cqlite_core/storage/serialization/
vint.rs

1//! VInt (Variable-length integer) encoder for Cassandra compatibility
2//!
3//! Implements byte-identical encoding to Cassandra's VIntCoding.java.
4//! All SSTable component writers depend on this producing correct output.
5//!
6//! Encoding format:
7//! - Single byte: values 0-127 (unsigned) or -64 to 63 (signed)
8//! - Multi-byte: first byte encodes length via leading 1-bits, remaining bytes are big-endian
9//! - Signed values use ZigZag encoding: 0→0, -1→1, 1→2, -2→3, etc.
10//!
11//! References:
12//! - Cassandra VIntCoding.java: org.apache.cassandra.utils.vint.VIntCoding
13//! - Appendix B: docs/sstables-definitive-guide/chapters/appendix-b-encodings-cheat-sheet.md
14
15/// ZigZag encode a signed integer to unsigned
16///
17/// Maps signed integers to unsigned so that small absolute values
18/// have small encodings: 0→0, -1→1, 1→2, -2→3, 2→4, -3→5, ...
19///
20/// Formula: (n << 1) ^ (n >> 63)
21/// - For positive n: left shift gives 2n, right shift gives 0, so result is 2n
22/// - For negative n: left shift gives 2n, right shift gives -1 (all 1s), so XOR inverts
23#[inline]
24fn zigzag_encode(n: i64) -> u64 {
25    ((n << 1) ^ (n >> 63)) as u64
26}
27
28/// Encode a signed i64 to VInt bytes (Cassandra-compatible)
29///
30/// Uses ZigZag encoding to efficiently encode small negative numbers.
31///
32/// # Arguments
33///
34/// * `value` - Signed 64-bit integer to encode
35/// * `buf` - Target buffer to write VInt bytes
36///
37/// # Examples
38///
39/// ```
40/// # use cqlite_core::storage::serialization::vint::encode_signed;
41/// let mut buf = Vec::new();
42/// encode_signed(0, &mut buf);
43/// assert_eq!(buf, vec![0x00]); // Single-byte zero
44///
45/// buf.clear();
46/// encode_signed(-1, &mut buf);
47/// assert_eq!(buf, vec![0x01]); // ZigZag encoded -1
48///
49/// buf.clear();
50/// encode_signed(64, &mut buf);
51/// assert_eq!(buf, vec![0x80, 0x80]); // Two-byte format
52/// ```
53pub fn encode_signed(value: i64, buf: &mut Vec<u8>) {
54    let unsigned_value = zigzag_encode(value);
55    encode_unsigned(unsigned_value, buf);
56}
57
58/// Encode an unsigned u64 to VInt bytes
59///
60/// Encoding format (standard Cassandra unsigned VInt):
61/// - 1 byte: 0xxxxxxx (values 0-127)
62/// - 2 bytes: 10xxxxxx xxxxxxxx (values 128-16383, 14 bits total: 6+8)
63/// - 3 bytes: 110xxxxx xxxxxxxx xxxxxxxx (values 16384-2097151, 21 bits: 5+8+8)
64/// - ... up to 9 bytes total
65///
66/// The first byte encodes the number of extra bytes via leading 1-bits,
67/// followed by a 0 separator bit, then data bits.
68///
69/// # Arguments
70///
71/// * `value` - Unsigned 64-bit integer to encode
72/// * `buf` - Target buffer to write VInt bytes
73///
74/// # Examples
75///
76/// ```
77/// # use cqlite_core::storage::serialization::vint::encode_unsigned;
78/// let mut buf = Vec::new();
79/// encode_unsigned(0, &mut buf);
80/// assert_eq!(buf, vec![0x00]);
81///
82/// buf.clear();
83/// encode_unsigned(127, &mut buf);
84/// assert_eq!(buf, vec![0x7F]);
85///
86/// buf.clear();
87/// encode_unsigned(128, &mut buf);
88/// assert_eq!(buf, vec![0x80, 0x80]); // 10xxxxxx xxxxxxxx
89/// ```
90pub fn encode_unsigned(value: u64, buf: &mut Vec<u8>) {
91    let size = unsigned_len_value(value);
92
93    if size == 1 {
94        // Single byte: 0xxxxxxx (values 0-127)
95        buf.push(value as u8);
96    } else if size == 9 {
97        // 9 bytes: 0xFF followed by full 8-byte long
98        buf.push(0xFF);
99        buf.extend_from_slice(&value.to_be_bytes());
100    } else {
101        // Multi-byte (2-8 bytes): [leading 1s][0][data bits][remaining bytes]
102        let extra_bytes = size - 1;
103
104        // Create first byte with leading 1-bits pattern
105        // For 2 bytes (1 extra): 0x80 | data_bits (10xxxxxx)
106        // For 3 bytes (2 extra): 0xC0 | data_bits (110xxxxx)
107        let mask = encode_extra_bytes_to_read(extra_bytes);
108
109        // Calculate how many data bits fit in the first byte
110        let first_byte_data_bits = 8 - extra_bytes - 1;
111
112        // Extract data bits for first byte
113        let shift = extra_bytes * 8;
114        let first_byte_data = ((value >> shift) & ((1 << first_byte_data_bits) - 1)) as u8;
115        buf.push(mask | first_byte_data);
116
117        // Add remaining bytes in big-endian order
118        for i in (0..extra_bytes).rev() {
119            buf.push(((value >> (i * 8)) & 0xFF) as u8);
120        }
121    }
122}
123
124/// Calculate the encoded length of a signed i64
125///
126/// Returns the number of bytes that would be needed to encode this value.
127///
128/// # Examples
129///
130/// ```
131/// # use cqlite_core::storage::serialization::vint::signed_len;
132/// assert_eq!(signed_len(0), 1);
133/// assert_eq!(signed_len(63), 1);
134/// assert_eq!(signed_len(-64), 1);
135/// assert_eq!(signed_len(64), 2);
136/// assert_eq!(signed_len(-65), 2);
137/// ```
138#[inline]
139pub fn signed_len(value: i64) -> usize {
140    let unsigned_value = zigzag_encode(value);
141    unsigned_len_value(unsigned_value)
142}
143
144/// Calculate the encoded length of an unsigned u64
145///
146/// Returns the number of bytes that would be needed to encode this value.
147///
148/// # Examples
149///
150/// ```
151/// # use cqlite_core::storage::serialization::vint::unsigned_len;
152/// assert_eq!(unsigned_len(0), 1);
153/// assert_eq!(unsigned_len(127), 1);
154/// assert_eq!(unsigned_len(128), 2);
155/// assert_eq!(unsigned_len(16384), 3);
156/// ```
157#[inline]
158pub fn unsigned_len(value: u64) -> usize {
159    unsigned_len_value(value)
160}
161
162/// Compute the number of bytes needed to encode an unsigned VInt
163///
164/// Matches Cassandra's computeUnsignedVIntSize algorithm:
165/// Uses bit manipulation to calculate size based on the number of leading zeros.
166///
167/// Formula: (639 - magnitude * 9) >> 6
168/// where magnitude = numberOfLeadingZeros(value | 1)
169#[inline]
170fn unsigned_len_value(value: u64) -> usize {
171    // | with 1 ensures magnitude <= 63, so (63 - 1) / 7 <= 8
172    let magnitude = (value | 1).leading_zeros();
173    // Hand-picked formula from Cassandra that matches: 9 - ((magnitude - 1) / 7)
174    ((639 - (magnitude * 9)) >> 6) as usize
175}
176
177/// Encode the number of extra bytes to read in the first byte
178///
179/// For a VInt with N extra bytes, this returns the bit pattern
180/// with N leading 1-bits followed by a 0 separator.
181///
182/// Examples:
183/// - 0 extra bytes: 0x00 (00000000)
184/// - 1 extra byte:  0x80 (10000000)
185/// - 2 extra bytes: 0xC0 (11000000)
186/// - 3 extra bytes: 0xE0 (11100000)
187#[inline]
188fn encode_extra_bytes_to_read(extra_bytes: usize) -> u8 {
189    if extra_bytes == 0 {
190        0x00
191    } else if extra_bytes >= 8 {
192        0xFF
193    } else {
194        // Generate mask with N leading 1-bits followed by 0s
195        // For 1 extra byte: 0x80 (10000000)
196        // For 2 extra bytes: 0xC0 (11000000)
197        // Formula: ~((1 << (8 - extra_bytes)) - 1)
198        let shift = 8 - extra_bytes;
199        0xFF_u8 << shift
200    }
201}
202
203#[cfg(test)]
204mod tests {
205    use super::*;
206
207    #[test]
208    fn test_zigzag_encode() {
209        assert_eq!(zigzag_encode(0), 0);
210        assert_eq!(zigzag_encode(-1), 1);
211        assert_eq!(zigzag_encode(1), 2);
212        assert_eq!(zigzag_encode(-2), 3);
213        assert_eq!(zigzag_encode(2), 4);
214        assert_eq!(zigzag_encode(-3), 5);
215        assert_eq!(zigzag_encode(63), 126);
216        assert_eq!(zigzag_encode(-64), 127);
217        assert_eq!(zigzag_encode(64), 128);
218    }
219
220    #[test]
221    fn test_encode_signed_test_vectors() {
222        // Test vectors from Issue #363
223        let test_cases = vec![
224            (0i64, vec![0x00]),
225            (1i64, vec![0x02]),
226            (-1i64, vec![0x01]),
227            (63i64, vec![0x7E]),
228            (-64i64, vec![0x7F]),
229            (64i64, vec![0x80, 0x80]),
230        ];
231
232        for (value, expected) in test_cases {
233            let mut buf = Vec::new();
234            encode_signed(value, &mut buf);
235            assert_eq!(
236                buf, expected,
237                "encode_signed({}) failed: expected {:?}, got {:?}",
238                value, expected, buf
239            );
240        }
241    }
242
243    #[test]
244    fn test_encode_unsigned_boundaries() {
245        let test_cases = vec![
246            (0u64, vec![0x00]),
247            (127u64, vec![0x7F]),               // Max single byte
248            (128u64, vec![0x80, 0x80]),         // Min two bytes
249            (16383u64, vec![0xBF, 0xFF]),       // Max two bytes (14 bits: 6+8)
250            (16384u64, vec![0xC0, 0x40, 0x00]), // Min three bytes
251        ];
252
253        for (value, expected) in test_cases {
254            let mut buf = Vec::new();
255            encode_unsigned(value, &mut buf);
256            assert_eq!(
257                buf, expected,
258                "encode_unsigned({}) failed: expected {:?}, got {:?}",
259                value, expected, buf
260            );
261        }
262    }
263
264    #[test]
265    fn test_signed_len() {
266        assert_eq!(signed_len(0), 1);
267        assert_eq!(signed_len(1), 1);
268        assert_eq!(signed_len(-1), 1);
269        assert_eq!(signed_len(63), 1);
270        assert_eq!(signed_len(-64), 1);
271        assert_eq!(signed_len(64), 2);
272        assert_eq!(signed_len(-65), 2);
273        assert_eq!(signed_len(127), 2);
274        assert_eq!(signed_len(-128), 2);
275    }
276
277    #[test]
278    fn test_unsigned_len() {
279        assert_eq!(unsigned_len(0), 1);
280        assert_eq!(unsigned_len(127), 1);
281        assert_eq!(unsigned_len(128), 2);
282        assert_eq!(unsigned_len(16383), 2);
283        assert_eq!(unsigned_len(16384), 3);
284        assert_eq!(unsigned_len(2097151), 3);
285        assert_eq!(unsigned_len(2097152), 4);
286    }
287
288    #[test]
289    fn test_encode_extra_bytes() {
290        assert_eq!(encode_extra_bytes_to_read(0), 0x00);
291        assert_eq!(encode_extra_bytes_to_read(1), 0x80);
292        assert_eq!(encode_extra_bytes_to_read(2), 0xC0);
293        assert_eq!(encode_extra_bytes_to_read(3), 0xE0);
294        assert_eq!(encode_extra_bytes_to_read(4), 0xF0);
295        assert_eq!(encode_extra_bytes_to_read(5), 0xF8);
296        assert_eq!(encode_extra_bytes_to_read(6), 0xFC);
297        assert_eq!(encode_extra_bytes_to_read(7), 0xFE);
298        assert_eq!(encode_extra_bytes_to_read(8), 0xFF);
299    }
300
301    #[test]
302    fn test_roundtrip_with_decoder() {
303        // Test roundtrip encode → decode using standard Cassandra VInt format
304        use crate::parser::vint::parse_vint;
305
306        let test_values = vec![
307            0,
308            1,
309            -1,
310            63,
311            -64,
312            64,
313            -65,
314            127,
315            -128,
316            255,
317            -255,
318            1000,
319            -1000,
320            32767,
321            -32768,
322            1048576,
323            -1048576,
324            i32::MAX as i64,
325            i32::MIN as i64,
326        ];
327
328        for value in test_values {
329            // Test our encoder
330            let mut buf = Vec::new();
331            encode_signed(value, &mut buf);
332
333            // Test decoder can parse it
334            let (remaining, decoded) = parse_vint(&buf).unwrap();
335            assert!(
336                remaining.is_empty(),
337                "Decoder should consume all bytes for value {}",
338                value
339            );
340            assert_eq!(
341                decoded, value,
342                "Roundtrip failed for value {}: encoded as {:?}",
343                value, buf
344            );
345        }
346    }
347
348    #[test]
349    fn test_large_values() {
350        // Test values requiring different byte sizes
351        let test_cases = vec![
352            (1u64 << 7, 2),  // 128 - 2 bytes
353            (1u64 << 14, 3), // 16384 - 3 bytes
354            (1u64 << 21, 4), // 2097152 - 4 bytes
355            (1u64 << 28, 5), // 268435456 - 5 bytes
356            (1u64 << 35, 6), // 6 bytes
357            (1u64 << 42, 7), // 7 bytes
358            (1u64 << 49, 8), // 8 bytes
359            (1u64 << 56, 9), // 9 bytes
360        ];
361
362        for (value, expected_size) in test_cases {
363            assert_eq!(
364                unsigned_len(value),
365                expected_size,
366                "Value {} should encode to {} bytes",
367                value,
368                expected_size
369            );
370
371            let mut buf = Vec::new();
372            encode_unsigned(value, &mut buf);
373            assert_eq!(
374                buf.len(),
375                expected_size,
376                "Encoded {} to {:?}, expected {} bytes",
377                value,
378                buf,
379                expected_size
380            );
381        }
382    }
383
384    #[test]
385    fn test_performance_target() {
386        // Performance target: <100ns per encode
387        // This is a rough check that we're not doing anything obviously slow
388        use std::time::Instant;
389
390        let iterations = 10_000;
391        let mut buf = Vec::with_capacity(9);
392
393        let start = Instant::now();
394        for i in 0..iterations {
395            buf.clear();
396            encode_signed(i, &mut buf);
397        }
398        let elapsed = start.elapsed();
399
400        let avg_ns = elapsed.as_nanos() / iterations as u128;
401        println!("Average encode time: {} ns", avg_ns);
402
403        // Sanity check: should be much faster than 1µs per encode
404        assert!(
405            avg_ns < 1000,
406            "Encoding is too slow: {} ns per operation",
407            avg_ns
408        );
409    }
410
411    #[test]
412    fn test_cassandra_compatibility() {
413        // Test specific patterns from standard Cassandra VInt encoding
414        // These use ZigZag encoding: signed → unsigned → VInt bytes
415        let test_cases = vec![
416            (0i64, vec![0x00], "Zero value"), // zigzag(0)=0, vint(0)=[0x00]
417            (1i64, vec![0x02], "Single byte positive"), // zigzag(1)=2, vint(2)=[0x02]
418            (63i64, vec![0x7E], "Maximum single byte positive"), // zigzag(63)=126, vint(126)=[0x7E]
419            (64i64, vec![0x80, 0x80], "Two byte encoding start"), // zigzag(64)=128, vint(128)=[0x80,0x80]
420            (127i64, vec![0x80, 0xFE], "Two byte positive"), // zigzag(127)=254, vint(254)=[0x80,0xFE]
421            (-1i64, vec![0x01], "Single byte negative"),     // zigzag(-1)=1, vint(1)=[0x01]
422            (-64i64, vec![0x7F], "Single byte negative boundary"), // zigzag(-64)=127, vint(127)=[0x7F]
423            (-65i64, vec![0x80, 0x81], "Two byte negative"), // zigzag(-65)=129, vint(129)=[0x80,0x81]
424        ];
425
426        for (value, expected_bytes, description) in test_cases {
427            let mut buf = Vec::new();
428            encode_signed(value, &mut buf);
429            assert_eq!(
430                buf, expected_bytes,
431                "{}: encode_signed({}) failed: expected {:?}, got {:?}",
432                description, value, expected_bytes, buf
433            );
434        }
435    }
436}