pub mod codec;
use std::collections::BTreeMap;
use std::io::{self, Read, Write};
use std::sync::OnceLock;
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
pub const FAST_FIELD_MAGIC: u32 = 0x32545346;
pub const FAST_FIELD_FOOTER_SIZE: u64 = 16;
pub const FAST_FIELD_MISSING: u64 = u64::MAX;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum FastFieldColumnType {
U64 = 0,
I64 = 1,
F64 = 2,
TextOrdinal = 3,
}
impl FastFieldColumnType {
pub fn from_u8(v: u8) -> Option<Self> {
match v {
0 => Some(Self::U64),
1 => Some(Self::I64),
2 => Some(Self::F64),
3 => Some(Self::TextOrdinal),
_ => None,
}
}
}
#[inline]
pub fn zigzag_encode(v: i64) -> u64 {
((v << 1) ^ (v >> 63)) as u64
}
#[inline]
pub fn zigzag_decode(v: u64) -> i64 {
((v >> 1) as i64) ^ -((v & 1) as i64)
}
#[inline]
pub fn f64_to_sortable_u64(f: f64) -> u64 {
let bits = f.to_bits();
if (bits >> 63) == 0 {
bits ^ (1u64 << 63) } else {
!bits }
}
#[inline]
pub fn sortable_u64_to_f64(v: u64) -> f64 {
let bits = if (v >> 63) != 0 {
v ^ (1u64 << 63) } else {
!v };
f64::from_bits(bits)
}
#[inline]
pub fn bits_needed_u64(val: u64) -> u8 {
if val == 0 {
0
} else {
64 - val.leading_zeros() as u8
}
}
pub fn bitpack_write(values: &[u64], bits_per_value: u8, out: &mut Vec<u8>) {
if bits_per_value == 0 {
return; }
let bpv = bits_per_value as usize;
let total_bits = values.len() * bpv;
let total_bytes = total_bits.div_ceil(8);
out.reserve(total_bytes);
let start = out.len();
out.resize(start + total_bytes, 0);
let buf = &mut out[start..];
for (i, &val) in values.iter().enumerate() {
let bit_offset = i * bpv;
let byte_offset = bit_offset / 8;
let bit_shift = bit_offset % 8;
let mut remaining_bits = bpv;
let mut v = val;
let mut bo = byte_offset;
let mut bs = bit_shift;
while remaining_bits > 0 {
let can_write = (8 - bs).min(remaining_bits);
let mask = (1u64 << can_write) - 1;
buf[bo] |= ((v & mask) << bs) as u8;
v >>= can_write;
remaining_bits -= can_write;
bo += 1;
bs = 0;
}
}
}
#[inline]
pub fn bitpack_read(data: &[u8], bits_per_value: u8, index: usize) -> u64 {
if bits_per_value == 0 {
return 0;
}
let bpv = bits_per_value as usize;
let bit_offset = index * bpv;
let byte_offset = bit_offset / 8;
let bit_shift = bit_offset % 8;
if bit_shift + bpv <= 64 && byte_offset + 8 <= data.len() {
let raw = u64::from_le_bytes(data[byte_offset..byte_offset + 8].try_into().unwrap());
let mask = if bpv >= 64 {
u64::MAX
} else {
(1u64 << bpv) - 1
};
return (raw >> bit_shift) & mask;
}
let mut result: u64 = 0;
let mut remaining_bits = bpv;
let mut bo = byte_offset;
let mut bs = bit_shift;
let mut out_shift = 0;
while remaining_bits > 0 {
let can_read = (8 - bs).min(remaining_bits);
let mask = ((1u64 << can_read) - 1) as u8;
let byte_val = if bo < data.len() { data[bo] } else { 0 };
result |= (((byte_val >> bs) & mask) as u64) << out_shift;
remaining_bits -= can_read;
out_shift += can_read;
bo += 1;
bs = 0;
}
result
}
#[derive(Debug, Clone)]
pub struct FastFieldTocEntry {
pub field_id: u32,
pub column_type: FastFieldColumnType,
pub multi: bool,
pub data_offset: u64,
pub data_len: u64,
pub num_docs: u32,
pub dict_offset: u64,
pub dict_count: u32,
}
pub const FAST_FIELD_TOC_ENTRY_SIZE: usize = 4 + 1 + 1 + 8 + 8 + 4 + 8 + 4;
#[derive(Debug, Clone)]
pub struct BlockIndexEntry {
pub num_docs: u32,
pub data_len: u32,
pub dict_count: u32,
pub dict_len: u32,
}
pub const BLOCK_INDEX_ENTRY_SIZE: usize = 16;
impl BlockIndexEntry {
pub fn write_to(&self, w: &mut dyn Write) -> io::Result<()> {
w.write_u32::<LittleEndian>(self.num_docs)?;
w.write_u32::<LittleEndian>(self.data_len)?;
w.write_u32::<LittleEndian>(self.dict_count)?;
w.write_u32::<LittleEndian>(self.dict_len)?;
Ok(())
}
pub fn read_from(r: &mut dyn Read) -> io::Result<Self> {
let num_docs = r.read_u32::<LittleEndian>()?;
let data_len = r.read_u32::<LittleEndian>()?;
let dict_count = r.read_u32::<LittleEndian>()?;
let dict_len = r.read_u32::<LittleEndian>()?;
Ok(Self {
num_docs,
data_len,
dict_count,
dict_len,
})
}
}
impl FastFieldTocEntry {
pub fn write_to(&self, w: &mut dyn Write) -> io::Result<()> {
w.write_u32::<LittleEndian>(self.field_id)?;
w.write_u8(self.column_type as u8)?;
let flags: u8 = if self.multi { 1 } else { 0 };
w.write_u8(flags)?;
w.write_u64::<LittleEndian>(self.data_offset)?;
w.write_u64::<LittleEndian>(self.data_len)?;
w.write_u32::<LittleEndian>(self.num_docs)?;
w.write_u64::<LittleEndian>(self.dict_offset)?;
w.write_u32::<LittleEndian>(self.dict_count)?;
Ok(())
}
pub fn read_from(r: &mut dyn Read) -> io::Result<Self> {
let field_id = r.read_u32::<LittleEndian>()?;
let ct = r.read_u8()?;
let column_type = FastFieldColumnType::from_u8(ct)
.ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "bad column type"))?;
let flags = r.read_u8()?;
let multi = (flags & 1) != 0;
let data_offset = r.read_u64::<LittleEndian>()?;
let data_len = r.read_u64::<LittleEndian>()?;
let num_docs = r.read_u32::<LittleEndian>()?;
let dict_offset = r.read_u64::<LittleEndian>()?;
let dict_count = r.read_u32::<LittleEndian>()?;
Ok(Self {
field_id,
column_type,
multi,
data_offset,
data_len,
num_docs,
dict_offset,
dict_count,
})
}
}
pub struct FastFieldWriter {
pub column_type: FastFieldColumnType,
pub multi: bool,
values: Vec<u64>,
multi_values: Vec<u64>,
multi_offsets: Vec<u32>,
multi_current_doc: u32,
text_values: Option<BTreeMap<String, u32>>,
text_per_doc: Option<Vec<Option<String>>>,
text_multi_values: Option<Vec<String>>,
}
impl FastFieldWriter {
pub fn new_numeric(column_type: FastFieldColumnType) -> Self {
debug_assert!(matches!(
column_type,
FastFieldColumnType::U64 | FastFieldColumnType::I64 | FastFieldColumnType::F64
));
Self {
column_type,
multi: false,
values: Vec::new(),
multi_values: Vec::new(),
multi_offsets: vec![0],
multi_current_doc: 0,
text_values: None,
text_per_doc: None,
text_multi_values: None,
}
}
pub fn new_numeric_multi(column_type: FastFieldColumnType) -> Self {
debug_assert!(matches!(
column_type,
FastFieldColumnType::U64 | FastFieldColumnType::I64 | FastFieldColumnType::F64
));
Self {
column_type,
multi: true,
values: Vec::new(),
multi_values: Vec::new(),
multi_offsets: vec![0],
multi_current_doc: 0,
text_values: None,
text_per_doc: None,
text_multi_values: None,
}
}
pub fn new_text() -> Self {
Self {
column_type: FastFieldColumnType::TextOrdinal,
multi: false,
values: Vec::new(),
multi_values: Vec::new(),
multi_offsets: vec![0],
multi_current_doc: 0,
text_values: Some(BTreeMap::new()),
text_per_doc: Some(Vec::new()),
text_multi_values: None,
}
}
pub fn new_text_multi() -> Self {
Self {
column_type: FastFieldColumnType::TextOrdinal,
multi: true,
values: Vec::new(),
multi_values: Vec::new(),
multi_offsets: vec![0],
multi_current_doc: 0,
text_values: Some(BTreeMap::new()),
text_per_doc: None,
text_multi_values: Some(Vec::new()),
}
}
pub fn add_u64(&mut self, doc_id: u32, value: u64) {
if self.multi {
self.add_multi_u64(doc_id, value);
return;
}
let idx = doc_id as usize;
if idx >= self.values.len() {
self.values.resize(idx + 1, FAST_FIELD_MISSING);
if let Some(ref mut tpd) = self.text_per_doc {
tpd.resize(idx + 1, None);
}
}
self.values[idx] = value;
}
fn add_multi_u64(&mut self, doc_id: u32, value: u64) {
while self.multi_current_doc < doc_id {
self.multi_current_doc += 1;
self.multi_offsets.push(self.multi_values.len() as u32);
}
if self.multi_current_doc == doc_id && self.multi_offsets.len() == doc_id as usize + 1 {
}
self.multi_values.push(value);
}
pub fn add_i64(&mut self, doc_id: u32, value: i64) {
self.add_u64(doc_id, zigzag_encode(value));
}
pub fn add_f64(&mut self, doc_id: u32, value: f64) {
self.add_u64(doc_id, f64_to_sortable_u64(value));
}
pub fn add_text(&mut self, doc_id: u32, value: &str) {
if let Some(ref mut dict) = self.text_values {
let next_id = dict.len() as u32;
dict.entry(value.to_string()).or_insert(next_id);
}
if self.multi {
if let Some(ref mut tmv) = self.text_multi_values {
while self.multi_current_doc < doc_id {
self.multi_current_doc += 1;
self.multi_offsets.push(self.multi_values.len() as u32);
}
if self.multi_current_doc == doc_id
&& self.multi_offsets.len() == doc_id as usize + 1
{
}
self.multi_values.push(0); tmv.push(value.to_string());
}
} else {
let idx = doc_id as usize;
if idx >= self.values.len() {
self.values.resize(idx + 1, FAST_FIELD_MISSING);
}
if let Some(ref mut tpd) = self.text_per_doc {
if idx >= tpd.len() {
tpd.resize(idx + 1, None);
}
tpd[idx] = Some(value.to_string());
}
}
}
pub fn pad_to(&mut self, num_docs: u32) {
let n = num_docs as usize;
if self.multi {
while (self.multi_offsets.len() as u32) <= num_docs {
self.multi_offsets.push(self.multi_values.len() as u32);
}
self.multi_current_doc = num_docs;
} else {
if self.values.len() < n {
self.values.resize(n, FAST_FIELD_MISSING);
if let Some(ref mut tpd) = self.text_per_doc {
tpd.resize(n, None);
}
}
}
}
pub fn num_docs(&self) -> u32 {
if self.multi {
(self.multi_offsets.len() as u32).saturating_sub(1)
} else {
self.values.len() as u32
}
}
pub fn serialize(
&mut self,
writer: &mut dyn Write,
data_offset: u64,
) -> io::Result<(FastFieldTocEntry, u64)> {
if self.column_type == FastFieldColumnType::TextOrdinal {
self.resolve_text_ordinals();
}
let num_docs = self.num_docs();
let mut block_data = Vec::new();
if self.multi {
let offsets_u64: Vec<u64> = self.multi_offsets.iter().map(|&v| v as u64).collect();
let mut offset_buf = Vec::new();
codec::serialize_auto(&offsets_u64, &mut offset_buf)?;
block_data.write_u32::<LittleEndian>(offset_buf.len() as u32)?;
block_data.write_all(&offset_buf)?;
codec::serialize_auto(&self.multi_values, &mut block_data)?;
} else {
codec::serialize_auto(&self.values, &mut block_data)?;
}
let mut dict_buf = Vec::new();
let dict_count = if self.column_type == FastFieldColumnType::TextOrdinal {
let (count, _) = self.write_text_dictionary(&mut dict_buf)?;
count
} else {
0u32
};
let block_entry = BlockIndexEntry {
num_docs,
data_len: block_data.len() as u32,
dict_count,
dict_len: dict_buf.len() as u32,
};
let mut total_bytes = 0u64;
writer.write_u32::<LittleEndian>(1u32)?; total_bytes += 4;
block_entry.write_to(writer)?;
total_bytes += BLOCK_INDEX_ENTRY_SIZE as u64;
writer.write_all(&block_data)?;
total_bytes += block_data.len() as u64;
writer.write_all(&dict_buf)?;
total_bytes += dict_buf.len() as u64;
let toc = FastFieldTocEntry {
field_id: 0, column_type: self.column_type,
multi: self.multi,
data_offset,
data_len: total_bytes,
num_docs,
dict_offset: 0, dict_count: 0,
};
Ok((toc, total_bytes))
}
fn resolve_text_ordinals(&mut self) {
let dict = self.text_values.as_ref().expect("text_values required");
let sorted_ordinals: BTreeMap<&str, u64> = dict
.keys()
.enumerate()
.map(|(ord, key)| (key.as_str(), ord as u64))
.collect();
if self.multi {
if let Some(ref tmv) = self.text_multi_values {
for (i, text) in tmv.iter().enumerate() {
self.multi_values[i] = sorted_ordinals[text.as_str()];
}
}
} else {
let tpd = self.text_per_doc.as_ref().expect("text_per_doc required");
for (i, doc_text) in tpd.iter().enumerate() {
match doc_text {
Some(text) => {
self.values[i] = sorted_ordinals[text.as_str()];
}
None => {
self.values[i] = FAST_FIELD_MISSING;
}
}
}
}
}
fn write_text_dictionary(&self, writer: &mut dyn Write) -> io::Result<(u32, u64)> {
let dict = self.text_values.as_ref().expect("text_values required");
let mut bytes_written = 0u64;
let count = dict.len() as u32;
for key in dict.keys() {
let key_bytes = key.as_bytes();
writer.write_u32::<LittleEndian>(key_bytes.len() as u32)?;
writer.write_all(key_bytes)?;
bytes_written += 4 + key_bytes.len() as u64;
}
Ok((count, bytes_written))
}
}
use crate::directories::OwnedBytes;
pub struct ColumnBlock {
pub cumulative_docs: u32,
pub num_docs: u32,
pub data: OwnedBytes,
pub offset_data: OwnedBytes,
pub value_data: OwnedBytes,
pub dict: Option<TextDictReader>,
pub raw_dict: OwnedBytes,
}
pub struct FastFieldReader {
pub column_type: FastFieldColumnType,
pub num_docs: u32,
pub multi: bool,
blocks: Vec<ColumnBlock>,
text_state: OnceLock<TextState>,
}
struct TextState {
global_dict: TextDictReader,
ordinal_maps: Vec<Vec<u32>>,
}
impl FastFieldReader {
pub fn open(file_data: &OwnedBytes, toc: &FastFieldTocEntry) -> io::Result<Self> {
let region_start = toc.data_offset as usize;
let region_end = region_start + toc.data_len as usize;
if region_end > file_data.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"fast field data out of bounds",
));
}
let raw = file_data.as_slice();
let mut pos = region_start;
if pos + 4 > region_end {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"fast field: missing num_blocks",
));
}
let num_blocks = u32::from_le_bytes(raw[pos..pos + 4].try_into().unwrap());
pos += 4;
let idx_size = num_blocks as usize * BLOCK_INDEX_ENTRY_SIZE;
if pos + idx_size > region_end {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"fast field: block index truncated",
));
}
let mut block_entries = Vec::with_capacity(num_blocks as usize);
{
let mut cursor = std::io::Cursor::new(&raw[pos..pos + idx_size]);
for _ in 0..num_blocks {
block_entries.push(BlockIndexEntry::read_from(&mut cursor)?);
}
}
pos += idx_size;
let empty = OwnedBytes::new(Vec::new());
let mut blocks = Vec::with_capacity(num_blocks as usize);
let mut cumulative = 0u32;
for entry in &block_entries {
let data_start = pos;
let data_end = data_start + entry.data_len as usize;
let dict_start = data_end;
let dict_end = dict_start + entry.dict_len as usize;
if dict_end > file_data.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"fast field: block data/dict truncated",
));
}
let (block_data, offset_data, value_data) = if toc.multi {
let block_raw = &raw[data_start..data_end];
if block_raw.len() < 4 {
(empty.clone(), empty.clone(), empty.clone())
} else {
let offset_col_len =
u32::from_le_bytes(block_raw[0..4].try_into().unwrap()) as usize;
let o_start = data_start + 4;
let o_end = o_start + offset_col_len;
let v_start = o_end;
let v_end = data_end;
(
file_data.slice(data_start..data_end),
file_data.slice(o_start..o_end),
file_data.slice(v_start..v_end),
)
}
} else {
(
file_data.slice(data_start..data_end),
empty.clone(),
empty.clone(),
)
};
let dict = if entry.dict_count > 0 {
Some(TextDictReader::new_lazy(
file_data.slice(dict_start..dict_end),
entry.dict_count,
))
} else {
None
};
let raw_dict = if entry.dict_len > 0 {
file_data.slice(dict_start..dict_end)
} else {
empty.clone()
};
blocks.push(ColumnBlock {
cumulative_docs: cumulative,
num_docs: entry.num_docs,
data: block_data,
offset_data,
value_data,
dict,
raw_dict,
});
cumulative += entry.num_docs;
pos = dict_end;
}
Ok(Self {
column_type: toc.column_type,
num_docs: toc.num_docs,
multi: toc.multi,
blocks,
text_state: OnceLock::new(),
})
}
fn ensure_text_state(&self) -> &TextState {
self.text_state
.get_or_init(|| Self::build_text_state(&self.blocks))
}
fn build_text_state(blocks: &[ColumnBlock]) -> TextState {
let blocks_with_dict = blocks.iter().filter(|b| b.dict.is_some()).count();
if blocks_with_dict <= 1 {
for block in blocks.iter() {
if let Some(ref dict) = block.dict {
return TextState {
global_dict: TextDictReader::new_lazy(block.raw_dict.clone(), dict.len()),
ordinal_maps: vec![Vec::new(); blocks.len()],
};
}
}
return TextState {
global_dict: TextDictReader::new_lazy(OwnedBytes::new(Vec::new()), 0),
ordinal_maps: vec![Vec::new(); blocks.len()],
};
}
let mut unique_map: BTreeMap<String, u32> = BTreeMap::new();
for block in blocks.iter() {
if let Some(ref dict) = block.dict {
for ord in 0..dict.len() {
if let Some(text) = dict.get(ord) {
unique_map.entry(text.to_string()).or_insert(0);
}
}
}
}
for (i, value) in unique_map.values_mut().enumerate() {
*value = i as u32;
}
let mut ordinal_maps = Vec::with_capacity(blocks.len());
for block in blocks.iter() {
if let Some(ref dict) = block.dict {
let mut map = Vec::with_capacity(dict.len() as usize);
for local_ord in 0..dict.len() {
let text = dict
.get(local_ord)
.expect("block dict ordinal out of range");
let global_ord = *unique_map
.get(text)
.expect("block dict entry not found in merged global dict");
map.push(global_ord);
}
ordinal_maps.push(map);
} else {
ordinal_maps.push(Vec::new());
}
}
let mut dict_buf = Vec::new();
let count = unique_map.len() as u32;
for s in unique_map.keys() {
let bytes = s.as_bytes();
dict_buf.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
dict_buf.extend_from_slice(bytes);
}
TextState {
global_dict: TextDictReader::new_lazy(OwnedBytes::new(dict_buf), count),
ordinal_maps,
}
}
#[inline]
fn remap_ordinal(&self, block_idx: usize, raw: u64) -> u64 {
if self.column_type == FastFieldColumnType::TextOrdinal
&& raw != FAST_FIELD_MISSING
&& self.blocks.len() > 1
{
let state = self.ensure_text_state();
let map = &state.ordinal_maps[block_idx];
if !map.is_empty() {
let idx = raw as usize;
if idx < map.len() {
map[idx] as u64
} else {
FAST_FIELD_MISSING
}
} else {
raw
}
} else {
raw
}
}
#[inline]
fn find_block(&self, doc_id: u32) -> (usize, u32) {
debug_assert!(!self.blocks.is_empty());
if self.blocks.len() == 1 {
return (0, doc_id);
}
let bi = self
.blocks
.partition_point(|b| b.cumulative_docs <= doc_id)
.saturating_sub(1);
(bi, doc_id - self.blocks[bi].cumulative_docs)
}
#[inline]
pub fn get_u64(&self, doc_id: u32) -> u64 {
if doc_id >= self.num_docs {
return FAST_FIELD_MISSING;
}
let (bi, local) = self.find_block(doc_id);
let block = &self.blocks[bi];
if self.multi {
let start = codec::auto_read(block.offset_data.as_slice(), local as usize) as u32;
let end = codec::auto_read(block.offset_data.as_slice(), local as usize + 1) as u32;
if start >= end {
return FAST_FIELD_MISSING;
}
let raw = codec::auto_read(block.value_data.as_slice(), start as usize);
return self.remap_ordinal(bi, raw);
}
let raw = codec::auto_read(block.data.as_slice(), local as usize);
self.remap_ordinal(bi, raw)
}
#[inline]
fn block_value_range(&self, doc_id: u32) -> (usize, u32, u32) {
if !self.multi || doc_id >= self.num_docs {
return (0, 0, 0);
}
let (bi, local) = self.find_block(doc_id);
let block = &self.blocks[bi];
let start = codec::auto_read(block.offset_data.as_slice(), local as usize) as u32;
let end = codec::auto_read(block.offset_data.as_slice(), local as usize + 1) as u32;
(bi, start, end)
}
#[inline]
pub fn value_range(&self, doc_id: u32) -> (u32, u32) {
let (_, start, end) = self.block_value_range(doc_id);
(start, end)
}
#[inline]
pub fn get_value_at(&self, index: u32) -> u64 {
if self.blocks.len() == 1 {
let raw = codec::auto_read(self.blocks[0].value_data.as_slice(), index as usize);
return self.remap_ordinal(0, raw);
}
0
}
pub fn get_multi_values(&self, doc_id: u32) -> Vec<u64> {
let (bi, start, end) = self.block_value_range(doc_id);
if start >= end {
return Vec::new();
}
let block = &self.blocks[bi];
(start..end)
.map(|idx| {
let raw = codec::auto_read(block.value_data.as_slice(), idx as usize);
self.remap_ordinal(bi, raw)
})
.collect()
}
#[inline]
pub fn for_each_multi_value(&self, doc_id: u32, mut f: impl FnMut(u64) -> bool) -> bool {
let (bi, start, end) = self.block_value_range(doc_id);
if start >= end {
return false;
}
let block = &self.blocks[bi];
for idx in start..end {
let raw = codec::auto_read(block.value_data.as_slice(), idx as usize);
if f(self.remap_ordinal(bi, raw)) {
return true;
}
}
false
}
pub fn scan_single_values(&self, mut f: impl FnMut(u32, u64)) {
if self.multi {
return;
}
const BATCH: usize = 256;
let mut buf = [0u64; BATCH];
let needs_remap =
self.column_type == FastFieldColumnType::TextOrdinal && self.blocks.len() > 1;
let ordinal_maps = if needs_remap {
Some(&self.ensure_text_state().ordinal_maps)
} else {
None
};
for (block_idx, block) in self.blocks.iter().enumerate() {
let n = block.num_docs as usize;
let mut pos = 0;
let map = ordinal_maps.map(|maps| &maps[block_idx]);
let has_map = map.is_some_and(|m| !m.is_empty());
while pos < n {
let chunk = (n - pos).min(BATCH);
codec::auto_read_batch(block.data.as_slice(), pos, &mut buf[..chunk]);
if has_map {
let map = map.unwrap();
for (i, &raw) in buf[..chunk].iter().enumerate() {
let val = if raw != FAST_FIELD_MISSING {
let idx = raw as usize;
if idx < map.len() {
map[idx] as u64
} else {
FAST_FIELD_MISSING
}
} else {
raw
};
f(block.cumulative_docs + pos as u32 + i as u32, val);
}
} else {
for (i, &val) in buf[..chunk].iter().enumerate() {
f(block.cumulative_docs + pos as u32 + i as u32, val);
}
}
pos += chunk;
}
}
}
#[inline]
pub fn has_value(&self, doc_id: u32) -> bool {
if !self.multi {
return doc_id < self.num_docs && self.get_u64(doc_id) != FAST_FIELD_MISSING;
}
let (_, start, end) = self.block_value_range(doc_id);
start < end
}
#[inline]
pub fn get_i64(&self, doc_id: u32) -> i64 {
zigzag_decode(self.get_u64(doc_id))
}
#[inline]
pub fn get_f64(&self, doc_id: u32) -> f64 {
sortable_u64_to_f64(self.get_u64(doc_id))
}
#[inline]
pub fn get_ordinal(&self, doc_id: u32) -> u64 {
self.get_u64(doc_id)
}
pub fn get_text(&self, doc_id: u32) -> Option<&str> {
if doc_id >= self.num_docs {
return None;
}
let (bi, local) = self.find_block(doc_id);
let block = &self.blocks[bi];
let raw_ordinal = if self.multi {
let start = codec::auto_read(block.offset_data.as_slice(), local as usize) as u32;
let end = codec::auto_read(block.offset_data.as_slice(), local as usize + 1) as u32;
if start >= end {
return None;
}
codec::auto_read(block.value_data.as_slice(), start as usize)
} else {
codec::auto_read(block.data.as_slice(), local as usize)
};
if raw_ordinal == FAST_FIELD_MISSING {
return None;
}
block.dict.as_ref().and_then(|d| d.get(raw_ordinal as u32))
}
pub fn text_ordinal(&self, text: &str) -> Option<u64> {
if self.column_type != FastFieldColumnType::TextOrdinal {
return None;
}
self.ensure_text_state().global_dict.ordinal(text)
}
pub fn text_dict(&self) -> Option<&TextDictReader> {
if self.column_type != FastFieldColumnType::TextOrdinal {
return None;
}
Some(&self.ensure_text_state().global_dict)
}
pub fn num_blocks(&self) -> usize {
self.blocks.len()
}
pub fn blocks(&self) -> &[ColumnBlock] {
&self.blocks
}
}
pub struct TextDictReader {
data: OwnedBytes,
count: u32,
offsets: OnceLock<Vec<(u32, u32)>>,
}
impl TextDictReader {
pub fn new_lazy(data: OwnedBytes, count: u32) -> Self {
Self {
data,
count,
offsets: OnceLock::new(),
}
}
pub fn open(file_data: &OwnedBytes, dict_start: usize, count: u32) -> io::Result<Self> {
if count == 0 {
return Ok(Self::new_lazy(OwnedBytes::new(Vec::new()), 0));
}
let dict_slice = file_data.as_slice();
let mut pos = dict_start;
for _ in 0..count {
if pos + 4 > dict_slice.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"text dict truncated",
));
}
let len = u32::from_le_bytes(dict_slice[pos..pos + 4].try_into().unwrap()) as usize;
pos += 4;
if pos + len > dict_slice.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"text dict entry truncated",
));
}
pos += len;
}
let data = file_data.slice(dict_start..pos);
Ok(Self::new_lazy(data, count))
}
pub fn open_from_raw(raw_dict: &OwnedBytes, count: u32) -> io::Result<Self> {
Ok(Self::new_lazy(raw_dict.clone(), count))
}
#[inline]
fn ensure_offsets(&self) -> &[(u32, u32)] {
self.offsets.get_or_init(|| {
let dict_slice = self.data.as_slice();
let mut pos = 0usize;
let mut offsets = Vec::with_capacity(self.count as usize);
for _ in 0..self.count {
debug_assert!(
pos + 4 <= dict_slice.len(),
"text dict truncated during lazy init"
);
let len = u32::from_le_bytes(dict_slice[pos..pos + 4].try_into().unwrap()) as usize;
pos += 4;
debug_assert!(
pos + len <= dict_slice.len(),
"text dict entry truncated during lazy init"
);
offsets.push((pos as u32, len as u32));
pos += len;
}
offsets
})
}
pub fn get(&self, ordinal: u32) -> Option<&str> {
let offsets = self.ensure_offsets();
let &(off, len) = offsets.get(ordinal as usize)?;
let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
Some(unsafe { std::str::from_utf8_unchecked(slice) })
}
pub fn ordinal(&self, text: &str) -> Option<u64> {
let offsets = self.ensure_offsets();
offsets
.binary_search_by(|&(off, len)| {
let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
let entry = unsafe { std::str::from_utf8_unchecked(slice) };
entry.cmp(text)
})
.ok()
.map(|i| i as u64)
}
pub fn len(&self) -> u32 {
self.count
}
pub fn is_empty(&self) -> bool {
self.count == 0
}
pub fn iter(&self) -> impl Iterator<Item = &str> {
let offsets = self.ensure_offsets();
offsets.iter().map(|&(off, len)| {
let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
unsafe { std::str::from_utf8_unchecked(slice) }
})
}
}
pub fn write_fast_field_toc_and_footer(
writer: &mut dyn Write,
toc_offset: u64,
entries: &[FastFieldTocEntry],
) -> io::Result<()> {
for e in entries {
e.write_to(writer)?;
}
writer.write_u64::<LittleEndian>(toc_offset)?;
writer.write_u32::<LittleEndian>(entries.len() as u32)?;
writer.write_u32::<LittleEndian>(FAST_FIELD_MAGIC)?;
Ok(())
}
pub fn read_fast_field_footer(file_data: &[u8]) -> io::Result<(u64, u32)> {
let len = file_data.len();
if len < FAST_FIELD_FOOTER_SIZE as usize {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"fast field file too small for footer",
));
}
let footer = &file_data[len - FAST_FIELD_FOOTER_SIZE as usize..];
let mut cursor = std::io::Cursor::new(footer);
let toc_offset = cursor.read_u64::<LittleEndian>()?;
let num_columns = cursor.read_u32::<LittleEndian>()?;
let magic = cursor.read_u32::<LittleEndian>()?;
if magic != FAST_FIELD_MAGIC {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("bad fast field magic: 0x{:08x}", magic),
));
}
Ok((toc_offset, num_columns))
}
pub fn read_fast_field_toc(
file_data: &[u8],
toc_offset: u64,
num_columns: u32,
) -> io::Result<Vec<FastFieldTocEntry>> {
let start = toc_offset as usize;
let expected = num_columns as usize * FAST_FIELD_TOC_ENTRY_SIZE;
if start + expected > file_data.len() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"fast field TOC out of bounds",
));
}
let mut cursor = std::io::Cursor::new(&file_data[start..start + expected]);
let mut entries = Vec::with_capacity(num_columns as usize);
for _ in 0..num_columns {
entries.push(FastFieldTocEntry::read_from(&mut cursor)?);
}
Ok(entries)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_zigzag_roundtrip() {
for v in [0i64, 1, -1, 42, -42, i64::MAX, i64::MIN] {
assert_eq!(zigzag_decode(zigzag_encode(v)), v);
}
}
#[test]
fn test_f64_sortable_roundtrip() {
for v in [0.0f64, 1.0, -1.0, f64::MAX, f64::MIN, f64::MIN_POSITIVE] {
assert_eq!(sortable_u64_to_f64(f64_to_sortable_u64(v)), v);
}
}
#[test]
fn test_f64_sortable_order() {
let values = [-100.0f64, -1.0, -0.0, 0.0, 0.5, 1.0, 100.0];
let encoded: Vec<u64> = values.iter().map(|&v| f64_to_sortable_u64(v)).collect();
for i in 1..encoded.len() {
assert!(
encoded[i] >= encoded[i - 1],
"{} >= {} failed for {} vs {}",
encoded[i],
encoded[i - 1],
values[i],
values[i - 1]
);
}
}
#[test]
fn test_bitpack_roundtrip() {
let values: Vec<u64> = vec![0, 3, 7, 15, 0, 1, 6, 12];
let bpv = 4u8;
let mut packed = Vec::new();
bitpack_write(&values, bpv, &mut packed);
for (i, &expected) in values.iter().enumerate() {
let got = bitpack_read(&packed, bpv, i);
assert_eq!(got, expected, "index {}", i);
}
}
#[test]
fn test_bitpack_high_bpv_regression() {
for bpv in [57u8, 58, 59, 60, 63, 64] {
let max_val = if bpv == 64 {
u64::MAX
} else {
(1u64 << bpv) - 1
};
let values: Vec<u64> = (0..32)
.map(|i: u64| {
if max_val == u64::MAX {
i * 7
} else {
(i * 7) % (max_val + 1)
}
})
.collect();
let mut packed = Vec::new();
bitpack_write(&values, bpv, &mut packed);
for (i, &expected) in values.iter().enumerate() {
let got = bitpack_read(&packed, bpv, i);
assert_eq!(got, expected, "high bpv={} index={}", bpv, i);
}
}
}
#[test]
fn test_bitpack_various_widths() {
for bpv in [1u8, 2, 3, 5, 7, 8, 13, 16, 32, 64] {
let max_val = if bpv == 64 {
u64::MAX
} else {
(1u64 << bpv) - 1
};
let values: Vec<u64> = (0..100)
.map(|i: u64| {
if max_val == u64::MAX {
i
} else {
i % (max_val + 1)
}
})
.collect();
let mut packed = Vec::new();
bitpack_write(&values, bpv, &mut packed);
for (i, &expected) in values.iter().enumerate() {
let got = bitpack_read(&packed, bpv, i);
assert_eq!(got, expected, "bpv={} index={}", bpv, i);
}
}
}
fn owned(buf: Vec<u8>) -> OwnedBytes {
OwnedBytes::new(buf)
}
#[test]
fn test_writer_reader_u64_roundtrip() {
let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
writer.add_u64(0, 100);
writer.add_u64(1, 200);
writer.add_u64(2, 150);
writer.add_u64(4, 300); writer.pad_to(5);
let mut buf = Vec::new();
let (mut toc, _bytes) = writer.serialize(&mut buf, 0).unwrap();
toc.field_id = 42;
let toc_offset = buf.len() as u64;
write_fast_field_toc_and_footer(&mut buf, toc_offset, &[toc]).unwrap();
let ob = owned(buf);
let (toc_off, num_cols) = read_fast_field_footer(&ob).unwrap();
assert_eq!(num_cols, 1);
let tocs = read_fast_field_toc(&ob, toc_off, num_cols).unwrap();
assert_eq!(tocs.len(), 1);
assert_eq!(tocs[0].field_id, 42);
let reader = FastFieldReader::open(&ob, &tocs[0]).unwrap();
assert_eq!(reader.get_u64(0), 100);
assert_eq!(reader.get_u64(1), 200);
assert_eq!(reader.get_u64(2), 150);
assert_eq!(reader.get_u64(3), FAST_FIELD_MISSING); assert_eq!(reader.get_u64(4), 300);
}
#[test]
fn test_writer_reader_i64_roundtrip() {
let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::I64);
writer.add_i64(0, -100);
writer.add_i64(1, 50);
writer.add_i64(2, 0);
writer.pad_to(3);
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert_eq!(reader.get_i64(0), -100);
assert_eq!(reader.get_i64(1), 50);
assert_eq!(reader.get_i64(2), 0);
}
#[test]
fn test_writer_reader_f64_roundtrip() {
let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::F64);
writer.add_f64(0, -1.5);
writer.add_f64(1, 3.15);
writer.add_f64(2, 0.0);
writer.pad_to(3);
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert_eq!(reader.get_f64(0), -1.5);
assert_eq!(reader.get_f64(1), 3.15);
assert_eq!(reader.get_f64(2), 0.0);
}
#[test]
fn test_writer_reader_text_roundtrip() {
let mut writer = FastFieldWriter::new_text();
writer.add_text(0, "banana");
writer.add_text(1, "apple");
writer.add_text(2, "cherry");
writer.add_text(3, "apple"); writer.pad_to(5);
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert_eq!(reader.get_text(0), Some("banana"));
assert_eq!(reader.get_text(1), Some("apple"));
assert_eq!(reader.get_text(2), Some("cherry"));
assert_eq!(reader.get_text(3), Some("apple"));
assert_eq!(reader.get_text(4), None);
assert_eq!(reader.text_ordinal("apple"), Some(0));
assert_eq!(reader.text_ordinal("banana"), Some(1));
assert_eq!(reader.text_ordinal("cherry"), Some(2));
assert_eq!(reader.text_ordinal("durian"), None);
}
#[test]
fn test_constant_column() {
let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
for i in 0..100 {
writer.add_u64(i, 42);
}
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
for i in 0..100 {
assert_eq!(reader.get_u64(i), 42);
}
}
#[test]
fn test_multi_value_u64_roundtrip() {
let mut writer = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
writer.add_u64(0, 10);
writer.add_u64(0, 20);
writer.add_u64(0, 30);
writer.add_u64(2, 100);
writer.add_u64(3, 5);
writer.add_u64(3, 15);
writer.pad_to(4);
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
assert!(toc.multi);
assert_eq!(toc.num_docs, 4);
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert!(reader.multi);
assert_eq!(reader.get_u64(0), 10);
let (s, e) = reader.value_range(0);
assert_eq!(e - s, 3);
assert_eq!(reader.get_value_at(s), 10);
assert_eq!(reader.get_value_at(s + 1), 20);
assert_eq!(reader.get_value_at(s + 2), 30);
assert_eq!(reader.get_u64(1), FAST_FIELD_MISSING);
let (s, e) = reader.value_range(1);
assert_eq!(s, e);
assert!(!reader.has_value(1));
assert_eq!(reader.get_u64(2), 100);
assert!(reader.has_value(2));
assert_eq!(reader.get_u64(3), 5);
let (s, e) = reader.value_range(3);
assert_eq!(e - s, 2);
assert_eq!(reader.get_value_at(s), 5);
assert_eq!(reader.get_value_at(s + 1), 15);
}
#[test]
fn test_multi_value_text_roundtrip() {
let mut writer = FastFieldWriter::new_text_multi();
writer.add_text(0, "banana");
writer.add_text(0, "apple");
writer.add_text(1, "cherry");
writer.pad_to(3);
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
assert!(toc.multi);
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
let (s, e) = reader.value_range(0);
assert_eq!(e - s, 2);
let ord0 = reader.get_value_at(s);
let ord1 = reader.get_value_at(s + 1);
assert_eq!(reader.text_dict().unwrap().get(ord0 as u32), Some("banana"));
assert_eq!(reader.text_dict().unwrap().get(ord1 as u32), Some("apple"));
let (s, e) = reader.value_range(1);
assert_eq!(e - s, 1);
let ord = reader.get_value_at(s);
assert_eq!(reader.text_dict().unwrap().get(ord as u32), Some("cherry"));
assert!(!reader.has_value(2));
}
#[test]
fn test_multi_value_full_toc_roundtrip() {
let mut writer = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
writer.add_u64(0, 1);
writer.add_u64(0, 2);
writer.add_u64(1, 3);
writer.pad_to(2);
let mut buf = Vec::new();
let (mut toc, _) = writer.serialize(&mut buf, 0).unwrap();
toc.field_id = 7;
let toc_offset = buf.len() as u64;
write_fast_field_toc_and_footer(&mut buf, toc_offset, &[toc]).unwrap();
let ob = owned(buf);
let (toc_off, num_cols) = read_fast_field_footer(&ob).unwrap();
let tocs = read_fast_field_toc(&ob, toc_off, num_cols).unwrap();
assert_eq!(tocs[0].field_id, 7);
assert!(tocs[0].multi);
let reader = FastFieldReader::open(&ob, &tocs[0]).unwrap();
assert_eq!(reader.get_u64(0), 1);
assert_eq!(reader.get_u64(1), 3);
}
fn serialize_single_block(writer: &mut FastFieldWriter) -> (Vec<u8>, Vec<u8>, BlockIndexEntry) {
let mut buf = Vec::new();
let (_toc, _) = writer.serialize(&mut buf, 0).unwrap();
let mut cursor = std::io::Cursor::new(&buf[4..4 + BLOCK_INDEX_ENTRY_SIZE]);
let entry = BlockIndexEntry::read_from(&mut cursor).unwrap();
let data_start = 4 + BLOCK_INDEX_ENTRY_SIZE;
let data_end = data_start + entry.data_len as usize;
let dict_end = data_end + entry.dict_len as usize;
let data = buf[data_start..data_end].to_vec();
let dict = if dict_end > data_end {
buf[data_end..dict_end].to_vec()
} else {
Vec::new()
};
(data, dict, entry)
}
fn assemble_blocked_column(
field_id: u32,
column_type: FastFieldColumnType,
multi: bool,
blocks: &[(u32, &[u8], u32, &[u8])], ) -> (Vec<u8>, FastFieldTocEntry) {
use byteorder::{LittleEndian, WriteBytesExt};
let mut buf = Vec::new();
let num_blocks = blocks.len() as u32;
buf.write_u32::<LittleEndian>(num_blocks).unwrap();
for &(num_docs, data, dict_count, dict) in blocks {
let entry = BlockIndexEntry {
num_docs,
data_len: data.len() as u32,
dict_count,
dict_len: dict.len() as u32,
};
entry.write_to(&mut buf).unwrap();
}
let mut total_docs = 0u32;
for &(num_docs, data, _, dict) in blocks {
buf.extend_from_slice(data);
buf.extend_from_slice(dict);
total_docs += num_docs;
}
let data_len = buf.len() as u64;
let toc = FastFieldTocEntry {
field_id,
column_type,
multi,
data_offset: 0,
data_len,
num_docs: total_docs,
dict_offset: 0,
dict_count: 0,
};
let toc_offset = buf.len() as u64;
write_fast_field_toc_and_footer(&mut buf, toc_offset, std::slice::from_ref(&toc)).unwrap();
(buf, toc)
}
#[test]
fn test_multi_block_numeric_roundtrip() {
let mut wa = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
wa.add_u64(0, 10);
wa.add_u64(1, 20);
wa.add_u64(2, 30);
let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
let mut wb = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
wb.add_u64(0, 40);
wb.add_u64(1, 50);
let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
let (buf, toc) = assemble_blocked_column(
1,
FastFieldColumnType::U64,
false,
&[
(entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
(entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
],
);
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert_eq!(reader.num_docs, 5);
assert_eq!(reader.num_blocks(), 2);
assert_eq!(reader.get_u64(0), 10);
assert_eq!(reader.get_u64(1), 20);
assert_eq!(reader.get_u64(2), 30);
assert_eq!(reader.get_u64(3), 40);
assert_eq!(reader.get_u64(4), 50);
}
#[test]
fn test_multi_block_text_roundtrip() {
let mut wa = FastFieldWriter::new_text();
wa.add_text(0, "alpha");
wa.add_text(1, "beta");
let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
let mut wb = FastFieldWriter::new_text();
wb.add_text(0, "gamma");
wb.add_text(1, "alpha");
let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
let (buf, toc) = assemble_blocked_column(
2,
FastFieldColumnType::TextOrdinal,
false,
&[
(entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
(entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
],
);
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert_eq!(reader.num_docs, 4);
assert_eq!(reader.num_blocks(), 2);
assert_eq!(reader.text_dict().unwrap().len(), 3);
assert_eq!(reader.get_text(0), Some("alpha"));
assert_eq!(reader.get_text(1), Some("beta"));
assert_eq!(reader.get_text(2), Some("gamma"));
assert_eq!(reader.get_text(3), Some("alpha"));
assert_eq!(reader.text_ordinal("alpha"), Some(0));
assert_eq!(reader.text_ordinal("beta"), Some(1));
assert_eq!(reader.text_ordinal("gamma"), Some(2));
assert_eq!(reader.get_u64(0), 0); assert_eq!(reader.get_u64(1), 1); assert_eq!(reader.get_u64(2), 2); assert_eq!(reader.get_u64(3), 0); }
#[test]
fn test_multi_block_text_ordinal_mismatch_regression() {
let mut wa = FastFieldWriter::new_text();
wa.add_text(0, "book");
wa.add_text(1, "wiki");
let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
let mut wb = FastFieldWriter::new_text();
wb.add_text(0, "apple");
wb.add_text(1, "wiki");
let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
let (buf, toc) = assemble_blocked_column(
2,
FastFieldColumnType::TextOrdinal,
false,
&[
(entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
(entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
],
);
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert_eq!(reader.text_dict().unwrap().len(), 3);
assert_eq!(reader.text_ordinal("apple"), Some(0));
assert_eq!(reader.text_ordinal("book"), Some(1));
assert_eq!(reader.text_ordinal("wiki"), Some(2));
assert_eq!(reader.get_u64(0), 1); assert_eq!(reader.get_u64(1), 2); assert_eq!(reader.get_u64(2), 0); assert_eq!(reader.get_u64(3), 2);
let wiki_ord = reader.text_ordinal("wiki").unwrap();
assert_eq!(reader.get_u64(1), wiki_ord, "wiki doc should match");
assert_eq!(reader.get_u64(3), wiki_ord, "wiki doc should match");
assert_ne!(reader.get_u64(0), wiki_ord, "book doc must NOT match wiki");
assert_ne!(reader.get_u64(2), wiki_ord, "apple doc must NOT match wiki");
}
#[test]
fn test_i64_timestamps_with_missing_roundtrip() {
let base_ts = 1724630400i64; let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::I64);
let mut expected_values: Vec<Option<i64>> = Vec::new();
for i in 0..100u32 {
if i % 5 == 0 {
expected_values.push(None); } else {
let ts = base_ts - (i as i64 * 86400);
writer.add_i64(i, ts);
expected_values.push(Some(ts));
}
}
writer.pad_to(100);
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
for (i, expected) in expected_values.iter().enumerate() {
let raw = reader.get_u64(i as u32);
match expected {
None => {
assert_eq!(
raw, FAST_FIELD_MISSING,
"doc {}: expected MISSING, got raw {}",
i, raw
);
}
Some(ts) => {
assert_ne!(
raw, FAST_FIELD_MISSING,
"doc {}: expected timestamp {}, got MISSING",
i, ts
);
let decoded = zigzag_decode(raw);
assert_eq!(
decoded,
*ts,
"doc {}: expected i64 {}, got i64 {} (raw zigzag: {}, expected zigzag: {})",
i,
ts,
decoded,
raw,
zigzag_encode(*ts)
);
}
}
}
}
#[test]
fn test_issued_at_1724630400_various_sizes() {
let target_ts = 1724630400i64;
let target_zigzag = zigzag_encode(target_ts);
for num_docs in [2, 5, 10, 50, 100, 500, 1000, 2000] {
let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::I64);
let target_doc = num_docs / 3;
for i in 0..num_docs as u32 {
if i == target_doc as u32 {
writer.add_i64(i, target_ts);
} else if i % 3 == 0 {
} else {
let ts = 1700000000i64 + (i as i64 * 86400);
writer.add_i64(i, ts);
}
}
writer.pad_to(num_docs as u32);
let mut buf = Vec::new();
let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
let raw = reader.get_u64(target_doc as u32);
assert_eq!(
raw,
target_zigzag,
"num_docs={}: doc {} expected zigzag {} (ts {}), got {} (decoded i64: {})",
num_docs,
target_doc,
target_zigzag,
target_ts,
raw,
zigzag_decode(raw)
);
}
}
#[test]
fn test_multi_block_multi_value_numeric() {
let mut wa = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
wa.add_u64(0, 1);
wa.add_u64(0, 2);
wa.add_u64(1, 3);
wa.pad_to(2);
let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
let mut wb = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
wb.add_u64(0, 4);
wb.add_u64(0, 5);
wb.add_u64(0, 6);
wb.pad_to(2);
let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
let (buf, toc) = assemble_blocked_column(
3,
FastFieldColumnType::U64,
true,
&[
(entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
(entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
],
);
let ob = owned(buf);
let reader = FastFieldReader::open(&ob, &toc).unwrap();
assert_eq!(reader.num_docs, 4);
assert_eq!(reader.num_blocks(), 2);
assert_eq!(reader.get_multi_values(0), vec![1, 2]);
assert_eq!(reader.get_multi_values(1), vec![3]);
assert_eq!(reader.get_multi_values(2), vec![4, 5, 6]);
assert_eq!(reader.get_multi_values(3), Vec::<u64>::new());
}
}