use crate::core::{FieldId, LuciError, Result, SegmentId};
use crate::mapping::FieldType;
/// Four-byte magic tag that opens every serialized segment header.
///
/// `SegmentHeader::to_bytes` writes it as the first four bytes and
/// `SegmentHeader::from_bytes` rejects input that does not start with it.
pub const SEGMENT_MAGIC: &[u8; 4] = b"MSEG";
/// Kind of component referenced by a `ComponentOffset` entry.
///
/// The enum is `#[repr(u8)]` and the explicit discriminants are the tag bytes
/// written by `ComponentOffset::to_bytes` and decoded by
/// `ComponentType::from_u8`, so changing a value changes the serialized format.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
pub enum ComponentType {
// Tag 1.
InvertedIndex = 1,
// Tag 2.
Columnar = 2,
// Tag 3.
DocStore = 3,
// Tag 4.
Vector = 4,
// Tag 5.
Spatial = 5,
}
impl ComponentType {
    /// Decodes a component-type tag byte into its enum variant.
    ///
    /// # Errors
    ///
    /// Returns `LuciError::IndexCorrupted` when `v` is not one of the known
    /// tags (1 through 5).
    pub fn from_u8(v: u8) -> Result<Self> {
        let decoded = match v {
            1 => Self::InvertedIndex,
            2 => Self::Columnar,
            3 => Self::DocStore,
            4 => Self::Vector,
            5 => Self::Spatial,
            _ => {
                return Err(LuciError::IndexCorrupted(format!(
                    "unknown component type: {v}"
                )));
            }
        };
        Ok(decoded)
    }
}
/// One entry of the component table stored in a segment header.
///
/// Serialized as a fixed-size record: one tag byte followed by three
/// little-endian `u64` values (see `ComponentOffset::to_bytes`).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ComponentOffset {
/// Which kind of component this entry describes.
pub component_type: ComponentType,
/// Offset of the component.
/// NOTE(review): presumably a byte position within the segment data — confirm against the writer.
pub offset: u64,
/// Length of the component.
/// NOTE(review): presumably in bytes — confirm against the writer.
pub length: u64,
/// Checksum recorded for the component; the algorithm is not visible in this file.
pub checksum: u64,
}
impl ComponentOffset {
    /// Size in bytes of one serialized entry: one tag byte plus three `u64`s.
    pub const SERIALIZED_SIZE: usize = 25;

    /// Encodes the entry as `[tag][offset][length][checksum]`, with the three
    /// integers in little-endian byte order.
    pub fn to_bytes(&self) -> [u8; Self::SERIALIZED_SIZE] {
        let mut out = [0u8; Self::SERIALIZED_SIZE];
        let (tag, words) = out.split_at_mut(1);
        tag[0] = self.component_type as u8;
        // The remaining 24 bytes are exactly three 8-byte slots, in field order.
        let values = [self.offset, self.length, self.checksum];
        for (slot, value) in words.chunks_exact_mut(8).zip(values) {
            slot.copy_from_slice(&value.to_le_bytes());
        }
        out
    }

    /// Decodes an entry from the first `SERIALIZED_SIZE` bytes of `data`.
    /// Any bytes beyond that are ignored.
    ///
    /// # Errors
    ///
    /// Returns `LuciError::IndexCorrupted` when `data` is shorter than
    /// `SERIALIZED_SIZE` or the tag byte is not a known component type.
    pub fn from_bytes(data: &[u8]) -> Result<Self> {
        if data.len() < Self::SERIALIZED_SIZE {
            return Err(LuciError::IndexCorrupted(
                "component offset too short".into(),
            ));
        }
        // The length check above guarantees every 8-byte window read below exists.
        let read_u64 = |start: usize| -> u64 {
            let mut raw = [0u8; 8];
            raw.copy_from_slice(&data[start..start + 8]);
            u64::from_le_bytes(raw)
        };
        let component_type = ComponentType::from_u8(data[0])?;
        Ok(Self {
            component_type,
            offset: read_u64(1),
            length: read_u64(9),
            checksum: read_u64(17),
        })
    }
}
/// Maps a `FieldType` to the one-byte tag stored in serialized field metadata.
///
/// Only the variant is encoded; whatever data `DenseVector` carries is not
/// part of the tag.
fn field_type_to_u8(ft: &FieldType) -> u8 {
    let tag: u8 = match *ft {
        FieldType::Text => 0,
        FieldType::Keyword => 1,
        FieldType::Integer => 2,
        FieldType::Long => 3,
        FieldType::Float => 4,
        FieldType::Double => 5,
        FieldType::Boolean => 6,
        FieldType::Date => 7,
        FieldType::DenseVector { .. } => 8,
        FieldType::GeoPoint => 9,
        FieldType::Nested => 10,
        FieldType::GeoShape => 11,
        FieldType::TokenCount => 12,
        FieldType::Ip => 13,
    };
    tag
}
/// Decodes a field-type tag byte back into a `FieldType`.
///
/// Tag 8 is rebuilt with `FieldType::dense_vector(0)` because the tag byte
/// carries no further information about the vector.
///
/// # Errors
///
/// Returns `LuciError::IndexCorrupted` for any tag outside `0..=13`.
fn field_type_from_u8(v: u8) -> Result<FieldType> {
    let field_type = match v {
        0 => FieldType::Text,
        1 => FieldType::Keyword,
        2 => FieldType::Integer,
        3 => FieldType::Long,
        4 => FieldType::Float,
        5 => FieldType::Double,
        6 => FieldType::Boolean,
        7 => FieldType::Date,
        8 => FieldType::dense_vector(0),
        9 => FieldType::GeoPoint,
        10 => FieldType::Nested,
        11 => FieldType::GeoShape,
        12 => FieldType::TokenCount,
        13 => FieldType::Ip,
        _ => {
            return Err(LuciError::IndexCorrupted(format!(
                "unknown field type byte: {v}"
            )));
        }
    };
    Ok(field_type)
}
// Bit masks combined into `FieldMeta::flags`; each occupies a distinct bit.

/// Set when the field was created with `stored = true`.
pub const FLAG_STORED: u8 = 0x01;
/// Set when the field was created with `indexed = true`.
pub const FLAG_INDEXED: u8 = 0x02;
/// Set when the field was created with `doc_values = true`.
pub const FLAG_DOC_VALUES: u8 = 0x04;
/// Set when the field was created with `norms = true`.
pub const FLAG_NORMS: u8 = 0x08;
/// Per-field metadata entry stored in a segment header.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct FieldMeta {
/// Identifier of the field; serialized as a `u16`.
pub field_id: FieldId,
/// Field name; serialized as UTF-8 bytes behind a `u16` length prefix.
pub field_name: String,
/// Type of the field; serialized as a single tag byte.
pub field_type: FieldType,
/// Bitwise OR of the `FLAG_*` constants.
pub flags: u8,
}
impl FieldMeta {
    /// Builds a field entry, packing the four boolean options into `flags`.
    pub fn new(
        field_id: FieldId,
        field_name: String,
        field_type: FieldType,
        stored: bool,
        indexed: bool,
        doc_values: bool,
        norms: bool,
    ) -> Self {
        let flags = [
            (stored, FLAG_STORED),
            (indexed, FLAG_INDEXED),
            (doc_values, FLAG_DOC_VALUES),
            (norms, FLAG_NORMS),
        ]
        .into_iter()
        .filter(|&(enabled, _)| enabled)
        .fold(0u8, |acc, (_, bit)| acc | bit);

        Self {
            field_id,
            field_name,
            field_type,
            flags,
        }
    }

    /// Whether `FLAG_STORED` is set.
    pub fn is_stored(&self) -> bool {
        self.flags & FLAG_STORED == FLAG_STORED
    }

    /// Whether `FLAG_INDEXED` is set.
    pub fn is_indexed(&self) -> bool {
        self.flags & FLAG_INDEXED == FLAG_INDEXED
    }

    /// Whether `FLAG_DOC_VALUES` is set.
    pub fn has_doc_values(&self) -> bool {
        self.flags & FLAG_DOC_VALUES == FLAG_DOC_VALUES
    }

    /// Whether `FLAG_NORMS` is set.
    pub fn has_norms(&self) -> bool {
        self.flags & FLAG_NORMS == FLAG_NORMS
    }

    /// Serializes the entry as
    /// `[field_id: u16 LE][name_len: u16 LE][name bytes][type tag: u8][flags: u8]`.
    pub fn to_bytes(&self) -> Vec<u8> {
        let name = self.field_name.as_bytes();
        // NOTE(review): the length prefix is a u16, so a name longer than
        // 65535 bytes would be written with a wrapped length — confirm names
        // are bounded upstream.
        let name_len = name.len() as u16;

        let mut out = Vec::with_capacity(6 + name.len());
        out.extend_from_slice(&self.field_id.as_u16().to_le_bytes());
        out.extend_from_slice(&name_len.to_le_bytes());
        out.extend_from_slice(name);
        out.extend_from_slice(&[field_type_to_u8(&self.field_type), self.flags]);
        out
    }

    /// Decodes one entry from the front of `data`.
    ///
    /// Returns the entry together with the number of bytes it occupied, so the
    /// caller can advance to the next record.
    ///
    /// # Errors
    ///
    /// Returns `LuciError::IndexCorrupted` when the input is too short, the
    /// name is not valid UTF-8, or the type tag is unknown.
    pub fn from_bytes(data: &[u8]) -> Result<(Self, usize)> {
        // id (2) + name length (2) + type tag (1) + flags (1).
        const FIXED_LEN: usize = 6;
        const NAME_START: usize = 4;

        if data.len() < FIXED_LEN {
            return Err(LuciError::IndexCorrupted("field meta too short".into()));
        }
        let field_id = FieldId::new(u16::from_le_bytes([data[0], data[1]]));
        let name_len = usize::from(u16::from_le_bytes([data[2], data[3]]));

        let consumed = FIXED_LEN + name_len;
        if data.len() < consumed {
            return Err(LuciError::IndexCorrupted(
                "field meta name truncated".into(),
            ));
        }

        let name_end = NAME_START + name_len;
        let field_name = match std::str::from_utf8(&data[NAME_START..name_end]) {
            Ok(name) => name.to_string(),
            Err(e) => {
                return Err(LuciError::IndexCorrupted(format!(
                    "invalid field name UTF-8: {e}"
                )));
            }
        };
        // The two trailing bytes sit directly after the name.
        let field_type = field_type_from_u8(data[name_end])?;
        let flags = data[name_end + 1];

        let meta = Self {
            field_id,
            field_name,
            field_type,
            flags,
        };
        Ok((meta, consumed))
    }
}
/// Decoded form of the header at the start of a serialized segment.
///
/// See `SegmentHeader::to_bytes` / `SegmentHeader::from_bytes` for the binary
/// encoding.
#[derive(Clone, Debug)]
pub struct SegmentHeader {
/// Identifier of the segment; serialized as a `u64`.
pub segment_id: SegmentId,
/// Document count recorded for the segment.
/// NOTE(review): relationship to `max_doc` (e.g. live vs. total documents) is not visible here — confirm.
pub doc_count: u32,
/// Maximum-document value recorded for the segment; not validated against `doc_count` in this file.
pub max_doc: u32,
/// Component table; the count is serialized as a single byte.
pub components: Vec<ComponentOffset>,
/// Field table; the count is serialized as a `u16`.
pub fields: Vec<FieldMeta>,
/// Optional bitset serialized as packed bits (least-significant bit first).
/// NOTE(review): presumably one entry per document marking parent documents — confirm with the nested-document code.
pub parent_bitset: Option<Vec<bool>>,
}
impl SegmentHeader {
pub fn to_bytes(&self) -> Vec<u8> {
let mut buf = Vec::new();
buf.extend_from_slice(SEGMENT_MAGIC);
buf.extend_from_slice(&self.segment_id.as_u64().to_le_bytes());
buf.extend_from_slice(&self.doc_count.to_le_bytes());
buf.extend_from_slice(&self.max_doc.to_le_bytes());
let checksum_pos = buf.len();
buf.extend_from_slice(&0u64.to_le_bytes());
buf.push(self.components.len() as u8);
for comp in &self.components {
buf.extend_from_slice(&comp.to_bytes());
}
buf.extend_from_slice(&(self.fields.len() as u16).to_le_bytes());
for field in &self.fields {
buf.extend_from_slice(&field.to_bytes());
}
match &self.parent_bitset {
Some(bitset) => {
buf.push(1u8); let num_bytes = (bitset.len() + 7) / 8;
buf.extend_from_slice(&(bitset.len() as u32).to_le_bytes());
let mut packed = vec![0u8; num_bytes];
for (i, &is_parent) in bitset.iter().enumerate() {
if is_parent {
packed[i / 8] |= 1 << (i % 8);
}
}
buf.extend_from_slice(&packed);
}
None => {
buf.push(0u8); }
}
let mut checksum_data = Vec::new();
checksum_data.extend_from_slice(&buf[..checksum_pos]);
checksum_data.extend_from_slice(&buf[checksum_pos + 8..]);
let checksum = xxhash_rust::xxh3::xxh3_64(&checksum_data);
buf[checksum_pos..checksum_pos + 8].copy_from_slice(&checksum.to_le_bytes());
buf
}
pub fn from_bytes(data: &[u8]) -> Result<(Self, usize)> {
if data.len() < 28 {
return Err(LuciError::IndexCorrupted("segment header too short".into()));
}
if &data[0..4] != SEGMENT_MAGIC {
return Err(LuciError::IndexCorrupted(format!(
"invalid segment magic: expected {:?}, got {:?}",
SEGMENT_MAGIC,
&data[0..4]
)));
}
let segment_id = SegmentId::new(u64::from_le_bytes(data[4..12].try_into().unwrap()));
let doc_count = u32::from_le_bytes(data[12..16].try_into().unwrap());
let max_doc = u32::from_le_bytes(data[16..20].try_into().unwrap());
let stored_checksum = u64::from_le_bytes(data[20..28].try_into().unwrap());
let mut pos = 28;
if pos >= data.len() {
return Err(LuciError::IndexCorrupted(
"segment header truncated at components".into(),
));
}
let num_components = data[pos] as usize;
pos += 1;
let mut components = Vec::with_capacity(num_components);
for _ in 0..num_components {
let comp = ComponentOffset::from_bytes(&data[pos..])?;
pos += ComponentOffset::SERIALIZED_SIZE;
components.push(comp);
}
if pos + 2 > data.len() {
return Err(LuciError::IndexCorrupted(
"segment header truncated at fields".into(),
));
}
let num_fields = u16::from_le_bytes(data[pos..pos + 2].try_into().unwrap()) as usize;
pos += 2;
let mut fields = Vec::with_capacity(num_fields);
for _ in 0..num_fields {
let (field, consumed) = FieldMeta::from_bytes(&data[pos..])?;
pos += consumed;
fields.push(field);
}
let parent_bitset = if pos < data.len() && data[pos] == 1 {
pos += 1;
let num_docs = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize;
pos += 4;
let num_bytes = (num_docs + 7) / 8;
let packed = &data[pos..pos + num_bytes];
pos += num_bytes;
let mut bitset = Vec::with_capacity(num_docs);
for i in 0..num_docs {
bitset.push((packed[i / 8] >> (i % 8)) & 1 == 1);
}
Some(bitset)
} else {
if pos < data.len() {
pos += 1;
} None
};
let mut checksum_data = Vec::new();
checksum_data.extend_from_slice(&data[..20]); checksum_data.extend_from_slice(&data[28..pos]); let computed_checksum = xxhash_rust::xxh3::xxh3_64(&checksum_data);
if computed_checksum != stored_checksum {
return Err(LuciError::IndexCorrupted(format!(
"segment header checksum mismatch: stored={stored_checksum:#x}, computed={computed_checksum:#x}"
)));
}
Ok((
Self {
segment_id,
doc_count,
max_doc,
components,
fields,
parent_bitset,
},
pos,
))
}
pub fn component(&self, ct: ComponentType) -> Option<&ComponentOffset> {
self.components.iter().find(|c| c.component_type == ct)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Header with no components, no fields and no parent bitset.
    fn bare_header(segment_id: u64, docs: u32) -> SegmentHeader {
        SegmentHeader {
            segment_id: SegmentId::new(segment_id),
            doc_count: docs,
            max_doc: docs,
            components: vec![],
            fields: vec![],
            parent_bitset: None,
        }
    }

    #[test]
    fn component_offset_round_trip() {
        let co = ComponentOffset {
            component_type: ComponentType::InvertedIndex,
            offset: 1234,
            length: 5678,
            checksum: 0xDEADBEEF,
        };
        let bytes = co.to_bytes();
        let decoded = ComponentOffset::from_bytes(&bytes).unwrap();
        assert_eq!(decoded, co);
    }

    #[test]
    fn field_meta_round_trip() {
        let fm = FieldMeta::new(
            FieldId::new(3),
            "title".to_string(),
            FieldType::Text,
            true,
            true,
            false,
            true,
        );
        let bytes = fm.to_bytes();
        let (decoded, consumed) = FieldMeta::from_bytes(&bytes).unwrap();
        assert_eq!(consumed, bytes.len());
        assert_eq!(decoded, fm);
        assert!(decoded.is_stored());
        assert!(decoded.is_indexed());
        assert!(!decoded.has_doc_values());
        assert!(decoded.has_norms());
    }

    #[test]
    fn field_meta_flags() {
        let fm = FieldMeta::new(
            FieldId::new(0),
            "status".to_string(),
            FieldType::Keyword,
            true,
            true,
            true,
            false,
        );
        assert!(fm.is_stored());
        assert!(fm.is_indexed());
        assert!(fm.has_doc_values());
        assert!(!fm.has_norms());
    }

    #[test]
    fn header_round_trip() {
        let header = SegmentHeader {
            segment_id: SegmentId::new(42),
            doc_count: 100,
            max_doc: 100,
            components: vec![
                ComponentOffset {
                    component_type: ComponentType::InvertedIndex,
                    offset: 256,
                    length: 1024,
                    checksum: 111,
                },
                ComponentOffset {
                    component_type: ComponentType::DocStore,
                    offset: 1280,
                    length: 2048,
                    checksum: 222,
                },
            ],
            fields: vec![
                FieldMeta::new(
                    FieldId::new(0),
                    "title".to_string(),
                    FieldType::Text,
                    true,
                    true,
                    false,
                    true,
                ),
                FieldMeta::new(
                    FieldId::new(1),
                    "status".to_string(),
                    FieldType::Keyword,
                    true,
                    true,
                    true,
                    false,
                ),
            ],
            parent_bitset: None,
        };
        let bytes = header.to_bytes();
        let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.segment_id, header.segment_id);
        assert_eq!(decoded.doc_count, header.doc_count);
        assert_eq!(decoded.max_doc, header.max_doc);
        assert_eq!(decoded.components.len(), 2);
        assert_eq!(decoded.components[0], header.components[0]);
        assert_eq!(decoded.components[1], header.components[1]);
        assert_eq!(decoded.fields.len(), 2);
        assert_eq!(decoded.fields[0], header.fields[0]);
        assert_eq!(decoded.fields[1], header.fields[1]);
        assert!(decoded.parent_bitset.is_none());
        assert_eq!(consumed, bytes.len());
    }

    #[test]
    fn header_round_trip_with_parent_bitset() {
        // Ten entries so the packed form spills into a second, partial byte.
        let bitset = vec![
            true, false, false, true, true, false, true, false, false, true,
        ];
        let header = SegmentHeader {
            parent_bitset: Some(bitset.clone()),
            ..bare_header(7, 10)
        };
        let bytes = header.to_bytes();
        let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.parent_bitset, Some(bitset));
        assert_eq!(consumed, bytes.len());
    }

    #[test]
    fn header_magic_validation() {
        let mut bytes = bare_header(1, 0).to_bytes();
        bytes[0] = b'X';
        let err = SegmentHeader::from_bytes(&bytes);
        assert!(err.is_err());
    }

    #[test]
    fn header_checksum_validation() {
        let mut bytes = bare_header(1, 10).to_bytes();
        let last = bytes.len() - 1;
        bytes[last] ^= 0xFF;
        let err = SegmentHeader::from_bytes(&bytes);
        assert!(err.is_err());
    }

    #[test]
    fn component_lookup() {
        let header = SegmentHeader {
            components: vec![
                ComponentOffset {
                    component_type: ComponentType::InvertedIndex,
                    offset: 100,
                    length: 200,
                    checksum: 0,
                },
                ComponentOffset {
                    component_type: ComponentType::DocStore,
                    offset: 300,
                    length: 400,
                    checksum: 0,
                },
            ],
            ..bare_header(1, 0)
        };
        assert!(header.component(ComponentType::InvertedIndex).is_some());
        assert!(header.component(ComponentType::DocStore).is_some());
        assert!(header.component(ComponentType::Columnar).is_none());
        assert!(header.component(ComponentType::Vector).is_none());
    }

    #[test]
    fn empty_header() {
        let bytes = bare_header(0, 0).to_bytes();
        let (decoded, _) = SegmentHeader::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.doc_count, 0);
        assert!(decoded.components.is_empty());
        assert!(decoded.fields.is_empty());
    }

    #[test]
    fn unicode_field_name() {
        let fm = FieldMeta::new(
            FieldId::new(0),
            "beschreibung_über".to_string(),
            FieldType::Text,
            true,
            true,
            false,
            true,
        );
        let bytes = fm.to_bytes();
        let (decoded, _) = FieldMeta::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.field_name, "beschreibung_über");
    }

    #[test]
    fn all_component_types_round_trip() {
        for &ct in &[
            ComponentType::InvertedIndex,
            ComponentType::Columnar,
            ComponentType::DocStore,
            ComponentType::Vector,
            ComponentType::Spatial,
        ] {
            let co = ComponentOffset {
                component_type: ct,
                offset: 0,
                length: 0,
                checksum: 0,
            };
            let bytes = co.to_bytes();
            let decoded = ComponentOffset::from_bytes(&bytes).unwrap();
            assert_eq!(decoded.component_type, ct);
        }
    }

    #[test]
    fn all_field_types_round_trip() {
        // Every payload-free variant. `DenseVector` is left out because its
        // payload is not part of the serialized tag.
        let types = [
            FieldType::Text,
            FieldType::Keyword,
            FieldType::Integer,
            FieldType::Long,
            FieldType::Float,
            FieldType::Double,
            FieldType::Boolean,
            FieldType::Date,
            FieldType::GeoPoint,
            FieldType::Nested,
            FieldType::GeoShape,
            FieldType::TokenCount,
            FieldType::Ip,
        ];
        for ft in &types {
            let fm = FieldMeta::new(
                FieldId::new(0),
                "f".to_string(),
                ft.clone(),
                true,
                true,
                true,
                true,
            );
            let bytes = fm.to_bytes();
            let (decoded, _) = FieldMeta::from_bytes(&bytes).unwrap();
            assert_eq!(decoded.field_type, *ft);
        }
    }
}