use crate::core::{DocId, FieldId};
/// Compresses a field length into a single norm byte.
///
/// Byte layout:
/// * `0..=24` — the length itself, stored exactly.
/// * above that — `24 + exponent * 16 + mantissa`, a small floating-point code that keeps
///   the five most significant bits of the length: the leading one is implied and the next
///   four bits are the `mantissa`; `exponent` is how far the length was shifted right.
///
/// The encoding is monotonic: a longer field never yields a smaller byte. Lengths whose
/// code would not fit in a byte (393,216 and above) saturate at `u8::MAX` instead of
/// overflowing, so very long fields can no longer panic in debug builds or wrap around to
/// a tiny norm in release builds.
///
/// [`decode_norm_to_length`] performs the inverse (lossy) mapping.
pub fn encode_norm(field_length: u32) -> u8 {
    /// Largest length that is stored verbatim.
    const EXACT_MAX: u32 = 24;
    /// Number of distinct mantissa values per exponent step.
    const MANTISSA_STEPS: u32 = 16;

    if field_length <= EXACT_MAX {
        return field_length as u8;
    }

    // `field_length` is at least 25 here, i.e. it has at least five significant bits,
    // so `leading_zeros() <= 27` and the subtraction cannot underflow.
    let shift = 32 - field_length.leading_zeros() - 5;
    // After shifting, the value lies in 16..=31; the low four bits are the mantissa.
    let mantissa = (field_length >> shift) & 0x0F;

    // Computed in u32 (maximum 24 + 27 * 16 + 15 = 471) and then clamped to the byte range.
    let code = EXACT_MAX + shift * MANTISSA_STEPS + mantissa;
    code.min(u32::from(u8::MAX)) as u8
}
/// Expands a norm byte back into a field length.
///
/// Bytes `0..=24` are the length itself. Any larger byte is read as
/// `24 + exponent * 16 + mantissa` and expands to `(16 + mantissa) << exponent`,
/// which is the smallest length that [`encode_norm`] maps to that byte.
pub fn decode_norm_to_length(byte: u8) -> u32 {
    // Exact range: the byte is the length.
    if byte <= 24 {
        return u32::from(byte);
    }

    // Strip the exact-range offset, then split into the 4-bit mantissa and the exponent.
    let code = byte - 24;
    let exponent = code >> 4;
    let mantissa = u32::from(code & 0x0F);

    // Restore the implied leading bit and scale back up.
    (16 + mantissa) << exponent
}
/// Decodes a norm byte into a field length expressed as `f32`.
///
/// This is [`decode_norm_to_length`] followed by a conversion to floating point.
pub fn decode_norm(byte: u8) -> f32 {
    let length: u32 = decode_norm_to_length(byte);
    length as f32
}
/// Collects encoded norm bytes for a single field and serializes them.
///
/// Each call to `add` appends one byte; `finish` emits the field id, the byte count and
/// the bytes themselves.
pub struct FieldNormsWriter {
/// Identifier of the field these norms belong to; written into the serialized header.
field_id: FieldId,
/// Encoded norm bytes, one per `add` call, in insertion order.
norms: Vec<u8>,
}
impl FieldNormsWriter {
    /// Creates a writer for `field_id` with no norms recorded yet.
    pub fn new(field_id: FieldId) -> Self {
        let norms = Vec::new();
        Self { field_id, norms }
    }

    /// Encodes `field_length` with [`encode_norm`] and appends the resulting byte.
    pub fn add(&mut self, field_length: u32) {
        let encoded = encode_norm(field_length);
        self.norms.push(encoded);
    }

    /// Number of norms added so far, as a `u32`.
    pub fn doc_count(&self) -> u32 {
        let recorded = self.norms.len();
        recorded as u32
    }

    /// Consumes the writer and returns the serialized block.
    ///
    /// Layout: field id as little-endian `u16`, norm count as little-endian `u32`,
    /// followed by one norm byte per added entry.
    pub fn finish(self) -> Vec<u8> {
        let Self { field_id, norms } = self;

        // 2 bytes of field id + 4 bytes of count = 6-byte header.
        let mut out = Vec::with_capacity(6 + norms.len());
        let id_bytes = field_id.as_u16().to_le_bytes();
        let count_bytes = (norms.len() as u32).to_le_bytes();

        out.extend_from_slice(&id_bytes);
        out.extend_from_slice(&count_bytes);
        out.extend_from_slice(&norms);
        out
    }
}
/// Read-only view over a serialized norms block.
///
/// Borrows the input buffer for `'a`; the norm bytes are not copied.
pub struct FieldNormsReader<'a> {
/// Field id parsed from the first two header bytes.
field_id: FieldId,
/// Entry count parsed from header bytes 2..6.
doc_count: u32,
/// The `doc_count` norm bytes that follow the 6-byte header.
norms: &'a [u8],
}
impl<'a> FieldNormsReader<'a> {
    /// Parses a serialized norms block without copying the norm bytes.
    ///
    /// Expected layout: field id as little-endian `u16`, entry count as little-endian
    /// `u32`, then that many norm bytes.
    ///
    /// # Panics
    ///
    /// Panics if `data` is shorter than the 6-byte header, or shorter than the header
    /// plus the number of norm bytes the header announces.
    pub fn open(data: &'a [u8]) -> Self {
        let field_id = FieldId::new(u16::from_le_bytes([data[0], data[1]]));
        let doc_count = u32::from_le_bytes([data[2], data[3], data[4], data[5]]);
        let payload_end = 6 + doc_count as usize;
        Self {
            field_id,
            doc_count,
            norms: &data[6..payload_end],
        }
    }

    /// Field id read from the header.
    pub fn field_id(&self) -> FieldId {
        self.field_id
    }

    /// Entry count read from the header.
    pub fn doc_count(&self) -> u32 {
        self.doc_count
    }

    /// Decoded norm for `doc_id`, or `0.0` when `doc_id` is past the stored entries.
    pub fn norm(&self, doc_id: DocId) -> f32 {
        let slot = doc_id.as_u32() as usize;
        self.norms.get(slot).map_or(0.0, |&encoded| decode_norm(encoded))
    }

    /// Stored norm byte for `doc_id`, or `0` when `doc_id` is past the stored entries.
    #[inline(always)]
    pub fn raw_byte(&self, doc_id: DocId) -> u8 {
        let slot = doc_id.as_u32() as usize;
        self.norms.get(slot).copied().unwrap_or(0)
    }

    /// Returns the decoded norm shared by every entry with a non-zero byte.
    ///
    /// Zero bytes are ignored. Yields `None` when two non-zero bytes differ, or when
    /// there is no non-zero byte at all.
    pub fn uniform_norm(&self) -> Option<f32> {
        let mut non_zero = self.norms.iter().copied().filter(|&encoded| encoded != 0);
        let first = non_zero.next()?;
        if non_zero.all(|encoded| encoded == first) {
            Some(decode_norm(first))
        } else {
            None
        }
    }

    /// Stored norm byte for `doc_id`, or `0` when out of range; same result as
    /// [`raw_byte`](Self::raw_byte).
    pub fn raw_norm(&self, doc_id: DocId) -> u8 {
        self.raw_byte(doc_id)
    }
}
// Unit tests for the norm byte encoding and for the writer/reader serialization.
#[cfg(test)]
mod tests {
use super::*;
// A zero length encodes to byte 0 and decodes back to 0.0.
#[test]
fn encode_decode_zero() {
assert_eq!(encode_norm(0), 0);
assert_eq!(decode_norm(0), 0.0);
}
// A length of one encodes to byte 1 and decodes back to 1.0.
#[test]
fn encode_decode_one() {
assert_eq!(encode_norm(1), 1);
assert_eq!(decode_norm(1), 1.0);
}
// Every length in 0..=24 must survive an encode/decode round trip unchanged.
#[test]
fn exact_for_small_lengths() {
for len in 0..=24 {
let encoded = encode_norm(len);
let decoded = decode_norm(encoded);
assert_eq!(decoded, len as f32, "exact for length {len}");
}
}
// The encoded byte must never decrease as the length grows (checked for 1..1000).
#[test]
fn monotonic_encoding() {
let mut prev_byte = 0u8;
for len in 1..1000 {
let byte = encode_norm(len);
assert!(
byte >= prev_byte,
"norm byte should be monotonic: length {len} encoded to {byte}, previous was {prev_byte}"
);
prev_byte = byte;
}
}
// Ordering of lengths is preserved after a round trip through the lossy encoding.
#[test]
fn longer_docs_decode_larger() {
let short = decode_norm(encode_norm(5));
let medium = decode_norm(encode_norm(50));
let long = decode_norm(encode_norm(500));
assert!(short < medium);
assert!(medium < long);
}
// Beyond the exact range the round trip is lossy but must stay within a factor of two.
#[test]
fn lossy_but_close_for_moderate_lengths() {
for &len in &[25, 50, 100, 200, 500, 1000] {
let decoded = decode_norm(encode_norm(len)) as u32;
let ratio = decoded as f64 / len as f64;
assert!(
(0.5..=2.0).contains(&ratio),
"length {len} decoded to {decoded}, ratio {ratio}"
);
}
}
// Serializing with the writer and reopening with the reader preserves id, count and norms.
#[test]
fn writer_reader_round_trip() {
let field_id = FieldId::new(3);
let mut writer = FieldNormsWriter::new(field_id);
writer.add(5);
writer.add(10);
writer.add(100);
assert_eq!(writer.doc_count(), 3);
let data = writer.finish();
let reader = FieldNormsReader::open(&data);
assert_eq!(reader.field_id(), field_id);
assert_eq!(reader.doc_count(), 3);
assert_eq!(reader.norm(DocId(0)), 5.0);
assert_eq!(reader.norm(DocId(1)), 10.0);
// 100 is outside the exact range, so only a lower bound is asserted.
assert!(reader.norm(DocId(2)) > 50.0);
}
// Asking for a document id past the stored entries yields 0.0 rather than panicking.
#[test]
fn reader_out_of_range() {
let mut writer = FieldNormsWriter::new(FieldId::new(0));
writer.add(10);
let data = writer.finish();
let reader = FieldNormsReader::open(&data);
assert_eq!(reader.norm(DocId(99)), 0.0);
}
// A length of 100_000 must decode to a positive value within a factor of four.
#[test]
fn very_long_field() {
let len = 100_000;
let encoded = encode_norm(len);
let decoded = decode_norm(encoded);
assert!(decoded > 0.0);
let ratio = decoded / len as f32;
assert!(
(0.25..=4.0).contains(&ratio),
"length {len} decoded to {decoded}, ratio {ratio}"
);
}
}