cqlite_core/storage/serialization/vint.rs
1//! VInt (Variable-length integer) encoder for Cassandra compatibility
2//!
3//! Implements byte-identical encoding to Cassandra's VIntCoding.java.
4//! All SSTable component writers depend on this producing correct output.
5//!
6//! Encoding format:
7//! - Single byte: values 0-127 (unsigned) or -64 to 63 (signed)
8//! - Multi-byte: first byte encodes length via leading 1-bits, remaining bytes are big-endian
9//! - Signed values use ZigZag encoding: 0→0, -1→1, 1→2, -2→3, etc.
10//!
11//! References:
12//! - Cassandra VIntCoding.java: org.apache.cassandra.utils.vint.VIntCoding
13//! - Appendix B: docs/sstables-definitive-guide/chapters/appendix-b-encodings-cheat-sheet.md
14
15/// ZigZag encode a signed integer to unsigned
16///
17/// Maps signed integers to unsigned so that small absolute values
18/// have small encodings: 0→0, -1→1, 1→2, -2→3, 2→4, -3→5, ...
19///
20/// Formula: (n << 1) ^ (n >> 63)
21/// - For positive n: left shift gives 2n, right shift gives 0, so result is 2n
22/// - For negative n: left shift gives 2n, right shift gives -1 (all 1s), so XOR inverts
23#[inline]
24fn zigzag_encode(n: i64) -> u64 {
25 ((n << 1) ^ (n >> 63)) as u64
26}
27
28/// Encode a signed i64 to VInt bytes (Cassandra-compatible)
29///
30/// Uses ZigZag encoding to efficiently encode small negative numbers.
31///
32/// # Arguments
33///
34/// * `value` - Signed 64-bit integer to encode
35/// * `buf` - Target buffer to write VInt bytes
36///
37/// # Examples
38///
39/// ```
40/// # use cqlite_core::storage::serialization::vint::encode_signed;
41/// let mut buf = Vec::new();
42/// encode_signed(0, &mut buf);
43/// assert_eq!(buf, vec![0x00]); // Single-byte zero
44///
45/// buf.clear();
46/// encode_signed(-1, &mut buf);
47/// assert_eq!(buf, vec![0x01]); // ZigZag encoded -1
48///
49/// buf.clear();
50/// encode_signed(64, &mut buf);
51/// assert_eq!(buf, vec![0x80, 0x80]); // Two-byte format
52/// ```
53pub fn encode_signed(value: i64, buf: &mut Vec<u8>) {
54 let unsigned_value = zigzag_encode(value);
55 encode_unsigned(unsigned_value, buf);
56}
57
58/// Encode an unsigned u64 to VInt bytes
59///
60/// Encoding format (standard Cassandra unsigned VInt):
61/// - 1 byte: 0xxxxxxx (values 0-127)
62/// - 2 bytes: 10xxxxxx xxxxxxxx (values 128-16383, 14 bits total: 6+8)
63/// - 3 bytes: 110xxxxx xxxxxxxx xxxxxxxx (values 16384-2097151, 21 bits: 5+8+8)
64/// - ... up to 9 bytes total
65///
66/// The first byte encodes the number of extra bytes via leading 1-bits,
67/// followed by a 0 separator bit, then data bits.
68///
69/// # Arguments
70///
71/// * `value` - Unsigned 64-bit integer to encode
72/// * `buf` - Target buffer to write VInt bytes
73///
74/// # Examples
75///
76/// ```
77/// # use cqlite_core::storage::serialization::vint::encode_unsigned;
78/// let mut buf = Vec::new();
79/// encode_unsigned(0, &mut buf);
80/// assert_eq!(buf, vec![0x00]);
81///
82/// buf.clear();
83/// encode_unsigned(127, &mut buf);
84/// assert_eq!(buf, vec![0x7F]);
85///
86/// buf.clear();
87/// encode_unsigned(128, &mut buf);
88/// assert_eq!(buf, vec![0x80, 0x80]); // 10xxxxxx xxxxxxxx
89/// ```
90pub fn encode_unsigned(value: u64, buf: &mut Vec<u8>) {
91 let size = unsigned_len_value(value);
92
93 if size == 1 {
94 // Single byte: 0xxxxxxx (values 0-127)
95 buf.push(value as u8);
96 } else if size == 9 {
97 // 9 bytes: 0xFF followed by full 8-byte long
98 buf.push(0xFF);
99 buf.extend_from_slice(&value.to_be_bytes());
100 } else {
101 // Multi-byte (2-8 bytes): [leading 1s][0][data bits][remaining bytes]
102 let extra_bytes = size - 1;
103
104 // Create first byte with leading 1-bits pattern
105 // For 2 bytes (1 extra): 0x80 | data_bits (10xxxxxx)
106 // For 3 bytes (2 extra): 0xC0 | data_bits (110xxxxx)
107 let mask = encode_extra_bytes_to_read(extra_bytes);
108
109 // Calculate how many data bits fit in the first byte
110 let first_byte_data_bits = 8 - extra_bytes - 1;
111
112 // Extract data bits for first byte
113 let shift = extra_bytes * 8;
114 let first_byte_data = ((value >> shift) & ((1 << first_byte_data_bits) - 1)) as u8;
115 buf.push(mask | first_byte_data);
116
117 // Add remaining bytes in big-endian order
118 for i in (0..extra_bytes).rev() {
119 buf.push(((value >> (i * 8)) & 0xFF) as u8);
120 }
121 }
122}
123
124/// Calculate the encoded length of a signed i64
125///
126/// Returns the number of bytes that would be needed to encode this value.
127///
128/// # Examples
129///
130/// ```
131/// # use cqlite_core::storage::serialization::vint::signed_len;
132/// assert_eq!(signed_len(0), 1);
133/// assert_eq!(signed_len(63), 1);
134/// assert_eq!(signed_len(-64), 1);
135/// assert_eq!(signed_len(64), 2);
136/// assert_eq!(signed_len(-65), 2);
137/// ```
138#[inline]
139pub fn signed_len(value: i64) -> usize {
140 let unsigned_value = zigzag_encode(value);
141 unsigned_len_value(unsigned_value)
142}
143
144/// Calculate the encoded length of an unsigned u64
145///
146/// Returns the number of bytes that would be needed to encode this value.
147///
148/// # Examples
149///
150/// ```
151/// # use cqlite_core::storage::serialization::vint::unsigned_len;
152/// assert_eq!(unsigned_len(0), 1);
153/// assert_eq!(unsigned_len(127), 1);
154/// assert_eq!(unsigned_len(128), 2);
155/// assert_eq!(unsigned_len(16384), 3);
156/// ```
157#[inline]
158pub fn unsigned_len(value: u64) -> usize {
159 unsigned_len_value(value)
160}
161
162/// Compute the number of bytes needed to encode an unsigned VInt
163///
164/// Matches Cassandra's computeUnsignedVIntSize algorithm:
165/// Uses bit manipulation to calculate size based on the number of leading zeros.
166///
167/// Formula: (639 - magnitude * 9) >> 6
168/// where magnitude = numberOfLeadingZeros(value | 1)
169#[inline]
170fn unsigned_len_value(value: u64) -> usize {
171 // | with 1 ensures magnitude <= 63, so (63 - 1) / 7 <= 8
172 let magnitude = (value | 1).leading_zeros();
173 // Hand-picked formula from Cassandra that matches: 9 - ((magnitude - 1) / 7)
174 ((639 - (magnitude * 9)) >> 6) as usize
175}
176
177/// Encode the number of extra bytes to read in the first byte
178///
179/// For a VInt with N extra bytes, this returns the bit pattern
180/// with N leading 1-bits followed by a 0 separator.
181///
182/// Examples:
183/// - 0 extra bytes: 0x00 (00000000)
184/// - 1 extra byte: 0x80 (10000000)
185/// - 2 extra bytes: 0xC0 (11000000)
186/// - 3 extra bytes: 0xE0 (11100000)
187#[inline]
188fn encode_extra_bytes_to_read(extra_bytes: usize) -> u8 {
189 if extra_bytes == 0 {
190 0x00
191 } else if extra_bytes >= 8 {
192 0xFF
193 } else {
194 // Generate mask with N leading 1-bits followed by 0s
195 // For 1 extra byte: 0x80 (10000000)
196 // For 2 extra bytes: 0xC0 (11000000)
197 // Formula: ~((1 << (8 - extra_bytes)) - 1)
198 let shift = 8 - extra_bytes;
199 0xFF_u8 << shift
200 }
201}
202
203#[cfg(test)]
204mod tests {
205 use super::*;
206
207 #[test]
208 fn test_zigzag_encode() {
209 assert_eq!(zigzag_encode(0), 0);
210 assert_eq!(zigzag_encode(-1), 1);
211 assert_eq!(zigzag_encode(1), 2);
212 assert_eq!(zigzag_encode(-2), 3);
213 assert_eq!(zigzag_encode(2), 4);
214 assert_eq!(zigzag_encode(-3), 5);
215 assert_eq!(zigzag_encode(63), 126);
216 assert_eq!(zigzag_encode(-64), 127);
217 assert_eq!(zigzag_encode(64), 128);
218 }
219
220 #[test]
221 fn test_encode_signed_test_vectors() {
222 // Test vectors from Issue #363
223 let test_cases = vec![
224 (0i64, vec![0x00]),
225 (1i64, vec![0x02]),
226 (-1i64, vec![0x01]),
227 (63i64, vec![0x7E]),
228 (-64i64, vec![0x7F]),
229 (64i64, vec![0x80, 0x80]),
230 ];
231
232 for (value, expected) in test_cases {
233 let mut buf = Vec::new();
234 encode_signed(value, &mut buf);
235 assert_eq!(
236 buf, expected,
237 "encode_signed({}) failed: expected {:?}, got {:?}",
238 value, expected, buf
239 );
240 }
241 }
242
243 #[test]
244 fn test_encode_unsigned_boundaries() {
245 let test_cases = vec![
246 (0u64, vec![0x00]),
247 (127u64, vec![0x7F]), // Max single byte
248 (128u64, vec![0x80, 0x80]), // Min two bytes
249 (16383u64, vec![0xBF, 0xFF]), // Max two bytes (14 bits: 6+8)
250 (16384u64, vec![0xC0, 0x40, 0x00]), // Min three bytes
251 ];
252
253 for (value, expected) in test_cases {
254 let mut buf = Vec::new();
255 encode_unsigned(value, &mut buf);
256 assert_eq!(
257 buf, expected,
258 "encode_unsigned({}) failed: expected {:?}, got {:?}",
259 value, expected, buf
260 );
261 }
262 }
263
264 #[test]
265 fn test_signed_len() {
266 assert_eq!(signed_len(0), 1);
267 assert_eq!(signed_len(1), 1);
268 assert_eq!(signed_len(-1), 1);
269 assert_eq!(signed_len(63), 1);
270 assert_eq!(signed_len(-64), 1);
271 assert_eq!(signed_len(64), 2);
272 assert_eq!(signed_len(-65), 2);
273 assert_eq!(signed_len(127), 2);
274 assert_eq!(signed_len(-128), 2);
275 }
276
277 #[test]
278 fn test_unsigned_len() {
279 assert_eq!(unsigned_len(0), 1);
280 assert_eq!(unsigned_len(127), 1);
281 assert_eq!(unsigned_len(128), 2);
282 assert_eq!(unsigned_len(16383), 2);
283 assert_eq!(unsigned_len(16384), 3);
284 assert_eq!(unsigned_len(2097151), 3);
285 assert_eq!(unsigned_len(2097152), 4);
286 }
287
288 #[test]
289 fn test_encode_extra_bytes() {
290 assert_eq!(encode_extra_bytes_to_read(0), 0x00);
291 assert_eq!(encode_extra_bytes_to_read(1), 0x80);
292 assert_eq!(encode_extra_bytes_to_read(2), 0xC0);
293 assert_eq!(encode_extra_bytes_to_read(3), 0xE0);
294 assert_eq!(encode_extra_bytes_to_read(4), 0xF0);
295 assert_eq!(encode_extra_bytes_to_read(5), 0xF8);
296 assert_eq!(encode_extra_bytes_to_read(6), 0xFC);
297 assert_eq!(encode_extra_bytes_to_read(7), 0xFE);
298 assert_eq!(encode_extra_bytes_to_read(8), 0xFF);
299 }
300
301 #[test]
302 fn test_roundtrip_with_decoder() {
303 // Test roundtrip encode → decode using standard Cassandra VInt format
304 use crate::parser::vint::parse_vint;
305
306 let test_values = vec![
307 0,
308 1,
309 -1,
310 63,
311 -64,
312 64,
313 -65,
314 127,
315 -128,
316 255,
317 -255,
318 1000,
319 -1000,
320 32767,
321 -32768,
322 1048576,
323 -1048576,
324 i32::MAX as i64,
325 i32::MIN as i64,
326 ];
327
328 for value in test_values {
329 // Test our encoder
330 let mut buf = Vec::new();
331 encode_signed(value, &mut buf);
332
333 // Test decoder can parse it
334 let (remaining, decoded) = parse_vint(&buf).unwrap();
335 assert!(
336 remaining.is_empty(),
337 "Decoder should consume all bytes for value {}",
338 value
339 );
340 assert_eq!(
341 decoded, value,
342 "Roundtrip failed for value {}: encoded as {:?}",
343 value, buf
344 );
345 }
346 }
347
348 #[test]
349 fn test_large_values() {
350 // Test values requiring different byte sizes
351 let test_cases = vec![
352 (1u64 << 7, 2), // 128 - 2 bytes
353 (1u64 << 14, 3), // 16384 - 3 bytes
354 (1u64 << 21, 4), // 2097152 - 4 bytes
355 (1u64 << 28, 5), // 268435456 - 5 bytes
356 (1u64 << 35, 6), // 6 bytes
357 (1u64 << 42, 7), // 7 bytes
358 (1u64 << 49, 8), // 8 bytes
359 (1u64 << 56, 9), // 9 bytes
360 ];
361
362 for (value, expected_size) in test_cases {
363 assert_eq!(
364 unsigned_len(value),
365 expected_size,
366 "Value {} should encode to {} bytes",
367 value,
368 expected_size
369 );
370
371 let mut buf = Vec::new();
372 encode_unsigned(value, &mut buf);
373 assert_eq!(
374 buf.len(),
375 expected_size,
376 "Encoded {} to {:?}, expected {} bytes",
377 value,
378 buf,
379 expected_size
380 );
381 }
382 }
383
384 #[test]
385 fn test_performance_target() {
386 // Performance target: <100ns per encode
387 // This is a rough check that we're not doing anything obviously slow
388 use std::time::Instant;
389
390 let iterations = 10_000;
391 let mut buf = Vec::with_capacity(9);
392
393 let start = Instant::now();
394 for i in 0..iterations {
395 buf.clear();
396 encode_signed(i, &mut buf);
397 }
398 let elapsed = start.elapsed();
399
400 let avg_ns = elapsed.as_nanos() / iterations as u128;
401 println!("Average encode time: {} ns", avg_ns);
402
403 // Sanity check: should be much faster than 1µs per encode
404 assert!(
405 avg_ns < 1000,
406 "Encoding is too slow: {} ns per operation",
407 avg_ns
408 );
409 }
410
411 #[test]
412 fn test_cassandra_compatibility() {
413 // Test specific patterns from standard Cassandra VInt encoding
414 // These use ZigZag encoding: signed → unsigned → VInt bytes
415 let test_cases = vec![
416 (0i64, vec![0x00], "Zero value"), // zigzag(0)=0, vint(0)=[0x00]
417 (1i64, vec![0x02], "Single byte positive"), // zigzag(1)=2, vint(2)=[0x02]
418 (63i64, vec![0x7E], "Maximum single byte positive"), // zigzag(63)=126, vint(126)=[0x7E]
419 (64i64, vec![0x80, 0x80], "Two byte encoding start"), // zigzag(64)=128, vint(128)=[0x80,0x80]
420 (127i64, vec![0x80, 0xFE], "Two byte positive"), // zigzag(127)=254, vint(254)=[0x80,0xFE]
421 (-1i64, vec![0x01], "Single byte negative"), // zigzag(-1)=1, vint(1)=[0x01]
422 (-64i64, vec![0x7F], "Single byte negative boundary"), // zigzag(-64)=127, vint(127)=[0x7F]
423 (-65i64, vec![0x80, 0x81], "Two byte negative"), // zigzag(-65)=129, vint(129)=[0x80,0x81]
424 ];
425
426 for (value, expected_bytes, description) in test_cases {
427 let mut buf = Vec::new();
428 encode_signed(value, &mut buf);
429 assert_eq!(
430 buf, expected_bytes,
431 "{}: encode_signed({}) failed: expected {:?}, got {:?}",
432 description, value, expected_bytes, buf
433 );
434 }
435 }
436}