Skip to main content

nodedb_fts/codec/
smallfloat.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! SmallFloat: 1-byte quantized document lengths for BM25 fieldnorms.
4//!
5//! Maps u32 doc lengths to a monotonic 0..255 byte scale.
6//! - Byte 0 → length 0
7//! - Byte 255 → length ~16M+
8//! - Monotonic: larger length → larger (or equal) byte
9//! - Max relative error dampened by BM25's `b=0.75` length normalization
10//! - 4x space reduction vs raw u32
11
12/// Encode a u32 value into a single byte.
13///
14/// Exact for 0. For values 1+, uses `floor(log2(value)) * 8 + top_3_fractional_bits`.
15/// Clamped to 255.
16pub fn encode(value: u32) -> u8 {
17    if value == 0 {
18        return 0;
19    }
20    // Position of the highest set bit (0-indexed): 0 for value=1, 1 for value=2..3, etc.
21    let msb = 31 - value.leading_zeros(); // 0..=31
22    // Extract 3 fractional bits below the leading 1-bit.
23    let frac = if msb >= 3 {
24        (value >> (msb - 3)) & 0x07
25    } else {
26        // For very small values (1..=7), shift up to fill 3 bits.
27        (value << (3 - msb)) & 0x07
28    };
29    let byte = msb * 8 + frac + 1; // +1 so byte 0 is reserved for value 0
30    byte.min(255) as u8
31}
32
33/// Precomputed decode table: byte → approximate u32 value.
34static DECODE: [u32; 256] = build_decode_table();
35
36const fn build_decode_table() -> [u32; 256] {
37    let mut table = [0u32; 256];
38    // byte 0 → 0
39    let mut b = 1u32;
40    while b < 256 {
41        let adjusted = b - 1; // Remove the +1 offset from encode.
42        let msb = adjusted / 8;
43        let frac = adjusted % 8;
44        // Reconstruct: leading 1-bit at position `msb`, then frac bits below it.
45        if msb >= 3 {
46            table[b as usize] = (1 << msb) | (frac << (msb - 3));
47        } else {
48            // Small msb: value = frac >> (3 - msb) with the leading 1-bit.
49            table[b as usize] = (1 << msb) | (frac >> (3 - msb));
50        }
51        b += 1;
52    }
53    table
54}
55
56/// Decode a byte back to an approximate u32 value.
57///
58/// Direct table lookup — O(1).
59pub fn decode(byte: u8) -> u32 {
60    DECODE[byte as usize]
61}
62
63#[cfg(test)]
64mod tests {
65    use super::*;
66
67    #[test]
68    fn zero_roundtrips() {
69        assert_eq!(encode(0), 0);
70        assert_eq!(decode(0), 0);
71    }
72
73    #[test]
74    fn small_values_exact() {
75        // Values 1..=8 should be exact or very close.
76        for v in 1..=8u32 {
77            let e = encode(v);
78            let d = decode(e);
79            assert_eq!(d, v, "value {v}: encoded={e}, decoded={d}");
80        }
81    }
82
83    #[test]
84    fn decode_table_monotonic() {
85        for i in 1..256 {
86            assert!(
87                DECODE[i] >= DECODE[i - 1],
88                "DECODE[{i}]={} < DECODE[{}]={}",
89                DECODE[i],
90                i - 1,
91                DECODE[i - 1]
92            );
93        }
94    }
95
96    #[test]
97    fn encode_monotonic() {
98        let mut prev = 0u8;
99        for length in 0..200_000u32 {
100            let encoded = encode(length);
101            assert!(
102                encoded >= prev,
103                "encode({length})={encoded} < prev encode({})={prev}",
104                length - 1
105            );
106            prev = encoded;
107        }
108    }
109
110    #[test]
111    fn roundtrip_within_error() {
112        for length in [
113            10, 20, 50, 100, 200, 500, 1000, 5000, 10_000, 50_000, 100_000,
114        ] {
115            let encoded = encode(length);
116            let decoded = decode(encoded);
117            assert!(decoded <= length, "decoded {decoded} > original {length}");
118            let error = (length - decoded) as f64 / length as f64;
119            assert!(
120                error < 0.25,
121                "length={length}, decoded={decoded}, error={error:.4}"
122            );
123        }
124    }
125
126    #[test]
127    fn max_value() {
128        let encoded = encode(u32::MAX);
129        assert_eq!(encoded, 255);
130        assert!(decode(255) > 1_000_000);
131    }
132
133    #[test]
134    fn full_range_encode_decode_monotonic() {
135        // Verify encode→decode is reasonable for sampled values across the range.
136        let mut prev_byte = 0u8;
137        let mut prev_decoded = 0u32;
138        for &v in &[
139            0, 1, 2, 5, 10, 50, 100, 500, 1000, 10_000, 100_000, 1_000_000,
140        ] {
141            let b = encode(v);
142            let d = decode(b);
143            assert!(b >= prev_byte);
144            assert!(d >= prev_decoded);
145            prev_byte = b;
146            prev_decoded = d;
147        }
148    }
149}