lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Field length norms for BM25 length normalization.
//!
//! Encodes field lengths (number of tokens) to single bytes via lossy
//! compression, similar to Lucene's `SmallFloat.intToByte4`. This reduces
//! storage to 1 byte per document per field while preserving enough precision
//! for BM25 scoring.
//!
//! On-disk format per field:
//! ```text
//! [field_id: u16] [doc_count: u32] [norm_bytes: u8 * doc_count]
//! ```
//!
//! See [[best-matching-25]] and [[architecture-overview#Step 3]].

use crate::core::{DocId, FieldId};

/// Encode a field length to a single byte.
///
/// Uses a float16-like scheme: lengths 0-24 are exact, larger values are
/// approximated. Monotonic: longer fields produce smaller decoded norms
/// (1/sqrt(length) characteristic).
pub fn encode_norm(field_length: u32) -> u8 {
    match field_length {
        0 => 0,
        l if l <= 24 => l as u8,
        l => {
            // Logarithmic compression for larger values.
            // Find the highest bit position and encode lucisa + exponent.
            let shift = 32 - l.leading_zeros() - 5; // bits to shift to get 5 significant bits
            let lucisa = (l >> shift) as u8 & 0x0F; // 4 lucisa bits
            let exponent = shift as u8; // exponent
            24 + exponent * 16 + lucisa
        }
    }
}

/// Decode a norm byte back to an approximate field length.
pub fn decode_norm_to_length(byte: u8) -> u32 {
    match byte {
        0 => 0,
        b if b <= 24 => b as u32,
        b => {
            let adjusted = b - 24;
            let exponent = adjusted / 16;
            let lucisa = adjusted % 16;
            let base = (16 + lucisa as u32) << exponent;
            base
        }
    }
}

/// Decode a norm byte to a float suitable for BM25's `dl` (document length).
///
/// Returns the approximate field length as f32.
pub fn decode_norm(byte: u8) -> f32 {
    decode_norm_to_length(byte) as f32
}

// --- FieldNormsWriter ---

/// Collects field lengths and produces encoded norm bytes.
pub struct FieldNormsWriter {
    field_id: FieldId,
    norms: Vec<u8>,
}

impl FieldNormsWriter {
    pub fn new(field_id: FieldId) -> Self {
        Self {
            field_id,
            norms: Vec::new(),
        }
    }

    /// Record the field length for the next document.
    /// Documents must be added in doc_id order (0, 1, 2, ...).
    pub fn add(&mut self, field_length: u32) {
        self.norms.push(encode_norm(field_length));
    }

    /// Number of documents recorded.
    pub fn doc_count(&self) -> u32 {
        self.norms.len() as u32
    }

    /// Finalize and return the encoded norms.
    pub fn finish(self) -> Vec<u8> {
        let mut result = Vec::with_capacity(6 + self.norms.len());
        result.extend_from_slice(&self.field_id.as_u16().to_le_bytes());
        result.extend_from_slice(&(self.norms.len() as u32).to_le_bytes());
        result.extend_from_slice(&self.norms);
        result
    }
}

// --- FieldNormsReader ---

/// Reads field norms from encoded bytes.
pub struct FieldNormsReader<'a> {
    field_id: FieldId,
    doc_count: u32,
    norms: &'a [u8],
}

impl<'a> FieldNormsReader<'a> {
    /// Open a norms reader from encoded bytes.
    pub fn open(data: &'a [u8]) -> Self {
        let field_id = FieldId::new(u16::from_le_bytes([data[0], data[1]]));
        let doc_count = u32::from_le_bytes([data[2], data[3], data[4], data[5]]);
        let norms = &data[6..6 + doc_count as usize];
        Self {
            field_id,
            doc_count,
            norms,
        }
    }

    /// The field this norms data is for.
    pub fn field_id(&self) -> FieldId {
        self.field_id
    }

    /// Number of documents.
    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    /// Get the decoded norm (approximate field length) for a document.
    pub fn norm(&self, doc_id: DocId) -> f32 {
        let idx = doc_id.as_u32() as usize;
        if idx < self.norms.len() {
            decode_norm(self.norms[idx])
        } else {
            0.0
        }
    }

    /// Get the raw norm byte for a document (for precomputed lookup tables).
    #[inline(always)]
    pub fn raw_byte(&self, doc_id: DocId) -> u8 {
        let idx = doc_id.as_u32() as usize;
        if idx < self.norms.len() {
            self.norms[idx]
        } else {
            0
        }
    }

    /// Check if all scored documents have the same field length (e.g., keyword
    /// fields where every document has field_length=1). Returns the decoded
    /// norm if uniform, None otherwise.
    ///
    /// Ignores zero-norm entries (nested doc slots without this field).
    pub fn uniform_norm(&self) -> Option<f32> {
        let mut common: Option<u8> = None;
        for &b in self.norms {
            if b == 0 {
                continue;
            }
            match common {
                None => common = Some(b),
                Some(c) if c != b => return None,
                _ => {}
            }
        }
        common.map(decode_norm)
    }

    /// Get the raw norm byte for a document.
    pub fn raw_norm(&self, doc_id: DocId) -> u8 {
        let idx = doc_id.as_u32() as usize;
        if idx < self.norms.len() {
            self.norms[idx]
        } else {
            0
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn encode_decode_zero() {
        assert_eq!(encode_norm(0), 0);
        assert_eq!(decode_norm(0), 0.0);
    }

    #[test]
    fn encode_decode_one() {
        assert_eq!(encode_norm(1), 1);
        assert_eq!(decode_norm(1), 1.0);
    }

    #[test]
    fn exact_for_small_lengths() {
        for len in 0..=24 {
            let encoded = encode_norm(len);
            let decoded = decode_norm(encoded);
            assert_eq!(decoded, len as f32, "exact for length {len}");
        }
    }

    #[test]
    fn monotonic_encoding() {
        // Encoded bytes should be monotonically increasing with field length.
        let mut prev_byte = 0u8;
        for len in 1..1000 {
            let byte = encode_norm(len);
            assert!(
                byte >= prev_byte,
                "norm byte should be monotonic: length {len} encoded to {byte}, previous was {prev_byte}"
            );
            prev_byte = byte;
        }
    }

    #[test]
    fn longer_docs_decode_larger() {
        // Decoded norms should increase with field length (approximately).
        let short = decode_norm(encode_norm(5));
        let medium = decode_norm(encode_norm(50));
        let long = decode_norm(encode_norm(500));
        assert!(short < medium);
        assert!(medium < long);
    }

    #[test]
    fn lossy_but_close_for_moderate_lengths() {
        // For moderate field lengths, the decoded value should be reasonably
        // close to the original (within 2x).
        for &len in &[25, 50, 100, 200, 500, 1000] {
            let decoded = decode_norm(encode_norm(len)) as u32;
            let ratio = decoded as f64 / len as f64;
            assert!(
                (0.5..=2.0).contains(&ratio),
                "length {len} decoded to {decoded}, ratio {ratio}"
            );
        }
    }

    #[test]
    fn writer_reader_round_trip() {
        let field_id = FieldId::new(3);
        let mut writer = FieldNormsWriter::new(field_id);
        writer.add(5);
        writer.add(10);
        writer.add(100);
        assert_eq!(writer.doc_count(), 3);

        let data = writer.finish();
        let reader = FieldNormsReader::open(&data);

        assert_eq!(reader.field_id(), field_id);
        assert_eq!(reader.doc_count(), 3);
        assert_eq!(reader.norm(DocId(0)), 5.0);
        assert_eq!(reader.norm(DocId(1)), 10.0);
        // Length 100 may be approximate
        assert!(reader.norm(DocId(2)) > 50.0);
    }

    #[test]
    fn reader_out_of_range() {
        let mut writer = FieldNormsWriter::new(FieldId::new(0));
        writer.add(10);
        let data = writer.finish();
        let reader = FieldNormsReader::open(&data);

        assert_eq!(reader.norm(DocId(99)), 0.0);
    }

    #[test]
    fn very_long_field() {
        let len = 100_000;
        let encoded = encode_norm(len);
        let decoded = decode_norm(encoded);
        // Should still produce a positive value
        assert!(decoded > 0.0);
        // And should be in the right order of magnitude (within 4x)
        let ratio = decoded / len as f32;
        assert!(
            (0.25..=4.0).contains(&ratio),
            "length {len} decoded to {decoded}, ratio {ratio}"
        );
    }
}