Skip to main content

luci/segment/
format.rs

1//! Segment binary format: header, component offsets, field metadata.
2//!
3//! A segment is a self-contained unit holding all index structures for a set
4//! of documents. The format starts with a fixed header followed by component
5//! data:
6//!
7//! ```text
8//! [magic: 4 bytes "MSEG"]
9//! [segment_id: u64]
10//! [doc_count: u32]
11//! [max_doc: u32]
12//! [header_checksum: u64]
13//! [num_components: u8]
14//! [ComponentOffset * num_components]
15//! [num_fields: u16]
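//! [FieldMeta * num_fields]
//! [has_parent_bitset: u8]
//! [parent_doc_count: u32][packed parent bits: ceil(parent_doc_count / 8) bytes]  (only if has_parent_bitset == 1)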
17//! ... component data ...
18//! ```
19//!
20//! See [[architecture-segment-layout]] and [[architecture-overview#Step 5]].
21
22use crate::core::{FieldId, LuciError, Result, SegmentId};
23use crate::mapping::FieldType;
24
/// Four-byte signature (ASCII `MSEG`) that opens every serialized segment.
pub const SEGMENT_MAGIC: &[u8; 4] = &[b'M', b'S', b'E', b'G'];
27
28/// Component types stored in a segment.
29#[derive(Clone, Copy, Debug, PartialEq, Eq)]
30#[repr(u8)]
31pub enum ComponentType {
32    InvertedIndex = 1,
33    Columnar = 2,
34    DocStore = 3,
35    Vector = 4,
36    Spatial = 5,
37}
38
39impl ComponentType {
40    pub fn from_u8(v: u8) -> Result<Self> {
41        match v {
42            1 => Ok(Self::InvertedIndex),
43            2 => Ok(Self::Columnar),
44            3 => Ok(Self::DocStore),
45            4 => Ok(Self::Vector),
46            5 => Ok(Self::Spatial),
47            _ => Err(LuciError::IndexCorrupted(format!(
48                "unknown component type: {v}"
49            ))),
50        }
51    }
52}
53
54/// Location and checksum of a component within the segment.
55#[derive(Clone, Debug, PartialEq, Eq)]
56pub struct ComponentOffset {
57    pub component_type: ComponentType,
58    pub offset: u64,
59    pub length: u64,
60    pub checksum: u64,
61}
62
63impl ComponentOffset {
64    /// Serialized size: 1 + 8 + 8 + 8 = 25 bytes.
65    pub const SERIALIZED_SIZE: usize = 25;
66
67    pub fn to_bytes(&self) -> [u8; Self::SERIALIZED_SIZE] {
68        let mut buf = [0u8; Self::SERIALIZED_SIZE];
69        buf[0] = self.component_type as u8;
70        buf[1..9].copy_from_slice(&self.offset.to_le_bytes());
71        buf[9..17].copy_from_slice(&self.length.to_le_bytes());
72        buf[17..25].copy_from_slice(&self.checksum.to_le_bytes());
73        buf
74    }
75
76    pub fn from_bytes(data: &[u8]) -> Result<Self> {
77        if data.len() < Self::SERIALIZED_SIZE {
78            return Err(LuciError::IndexCorrupted(
79                "component offset too short".into(),
80            ));
81        }
82        Ok(Self {
83            component_type: ComponentType::from_u8(data[0])?,
84            offset: u64::from_le_bytes(data[1..9].try_into().unwrap()),
85            length: u64::from_le_bytes(data[9..17].try_into().unwrap()),
86            checksum: u64::from_le_bytes(data[17..25].try_into().unwrap()),
87        })
88    }
89}
90
91/// Field type encoded as a single byte for segment storage.
92fn field_type_to_u8(ft: &FieldType) -> u8 {
93    match ft {
94        FieldType::Text => 0,
95        FieldType::Keyword => 1,
96        FieldType::Integer => 2,
97        FieldType::Long => 3,
98        FieldType::Float => 4,
99        FieldType::Double => 5,
100        FieldType::Boolean => 6,
101        FieldType::Date => 7,
102        FieldType::DenseVector { .. } => 8,
103        FieldType::GeoPoint => 9,
104        FieldType::Nested => 10,
105        FieldType::GeoShape => 11,
106        FieldType::TokenCount => 12,
107        FieldType::Ip => 13,
108    }
109}
110
111fn field_type_from_u8(v: u8) -> Result<FieldType> {
112    match v {
113        0 => Ok(FieldType::Text),
114        1 => Ok(FieldType::Keyword),
115        2 => Ok(FieldType::Integer),
116        3 => Ok(FieldType::Long),
117        4 => Ok(FieldType::Float),
118        5 => Ok(FieldType::Double),
119        6 => Ok(FieldType::Boolean),
120        7 => Ok(FieldType::Date),
121        // The single-byte field type tag does not carry dims or
122        // quantization. Both are restored from the mapping JSON in
123        // `Mapping::from_json` before the schema is used. The placeholder
124        // values here are overwritten and never reach query/index code.
125        8 => Ok(FieldType::dense_vector(0)),
126        9 => Ok(FieldType::GeoPoint),
127        10 => Ok(FieldType::Nested),
128        11 => Ok(FieldType::GeoShape),
129        12 => Ok(FieldType::TokenCount),
130        13 => Ok(FieldType::Ip),
131        _ => Err(LuciError::IndexCorrupted(format!(
132            "unknown field type byte: {v}"
133        ))),
134    }
135}
136
// Field flags. Each constant is a distinct bit, so any combination fits in
// the single `flags` byte of a field's metadata.

/// Bit marking a field as stored.
pub const FLAG_STORED: u8 = 0b0000_0001;
/// Bit marking a field as indexed.
pub const FLAG_INDEXED: u8 = 0b0000_0010;
/// Bit marking a field as having doc values.
pub const FLAG_DOC_VALUES: u8 = 0b0000_0100;
/// Bit marking a field as having norms.
pub const FLAG_NORMS: u8 = 0b0000_1000;
142
143/// Per-field metadata stored in the segment header.
144#[derive(Clone, Debug, PartialEq, Eq)]
145pub struct FieldMeta {
146    pub field_id: FieldId,
147    pub field_name: String,
148    pub field_type: FieldType,
149    pub flags: u8,
150}
151
152impl FieldMeta {
153    pub fn new(
154        field_id: FieldId,
155        field_name: String,
156        field_type: FieldType,
157        stored: bool,
158        indexed: bool,
159        doc_values: bool,
160        norms: bool,
161    ) -> Self {
162        let mut flags = 0u8;
163        if stored {
164            flags |= FLAG_STORED;
165        }
166        if indexed {
167            flags |= FLAG_INDEXED;
168        }
169        if doc_values {
170            flags |= FLAG_DOC_VALUES;
171        }
172        if norms {
173            flags |= FLAG_NORMS;
174        }
175        Self {
176            field_id,
177            field_name,
178            field_type,
179            flags,
180        }
181    }
182
183    pub fn is_stored(&self) -> bool {
184        self.flags & FLAG_STORED != 0
185    }
186    pub fn is_indexed(&self) -> bool {
187        self.flags & FLAG_INDEXED != 0
188    }
189    pub fn has_doc_values(&self) -> bool {
190        self.flags & FLAG_DOC_VALUES != 0
191    }
192    pub fn has_norms(&self) -> bool {
193        self.flags & FLAG_NORMS != 0
194    }
195
196    /// Serialize to bytes: [field_id: u16][name_len: u16][name_bytes][type: u8][flags: u8]
197    pub fn to_bytes(&self) -> Vec<u8> {
198        let name_bytes = self.field_name.as_bytes();
199        let mut buf = Vec::with_capacity(6 + name_bytes.len());
200        buf.extend_from_slice(&self.field_id.as_u16().to_le_bytes());
201        buf.extend_from_slice(&(name_bytes.len() as u16).to_le_bytes());
202        buf.extend_from_slice(name_bytes);
203        buf.push(field_type_to_u8(&self.field_type));
204        buf.push(self.flags);
205        buf
206    }
207
208    /// Deserialize from bytes. Returns (FieldMeta, bytes_consumed).
209    pub fn from_bytes(data: &[u8]) -> Result<(Self, usize)> {
210        if data.len() < 6 {
211            return Err(LuciError::IndexCorrupted("field meta too short".into()));
212        }
213        let field_id = FieldId::new(u16::from_le_bytes([data[0], data[1]]));
214        let name_len = u16::from_le_bytes([data[2], data[3]]) as usize;
215        if data.len() < 6 + name_len {
216            return Err(LuciError::IndexCorrupted(
217                "field meta name truncated".into(),
218            ));
219        }
220        let field_name = std::str::from_utf8(&data[4..4 + name_len])
221            .map_err(|e| LuciError::IndexCorrupted(format!("invalid field name UTF-8: {e}")))?
222            .to_string();
223        let field_type = field_type_from_u8(data[4 + name_len])?;
224        let flags = data[5 + name_len];
225        let consumed = 6 + name_len;
226
227        Ok((
228            Self {
229                field_id,
230                field_name,
231                field_type,
232                flags,
233            },
234            consumed,
235        ))
236    }
237}
238
239/// The segment header: fixed fields + component offsets + field metadata.
240#[derive(Clone, Debug)]
241pub struct SegmentHeader {
242    pub segment_id: SegmentId,
243    pub doc_count: u32,
244    pub max_doc: u32,
245    pub components: Vec<ComponentOffset>,
246    pub fields: Vec<FieldMeta>,
247    /// Parent bitset: if present, indicates which doc IDs are parent docs
248    /// (vs nested hidden docs). One byte per doc (0=nested, 1=parent).
249    pub parent_bitset: Option<Vec<bool>>,
250}
251
252impl SegmentHeader {
253    /// Serialize the header to bytes (including magic and checksum).
254    pub fn to_bytes(&self) -> Vec<u8> {
255        let mut buf = Vec::new();
256
257        // Magic
258        buf.extend_from_slice(SEGMENT_MAGIC);
259        // segment_id
260        buf.extend_from_slice(&self.segment_id.as_u64().to_le_bytes());
261        // doc_count
262        buf.extend_from_slice(&self.doc_count.to_le_bytes());
263        // max_doc
264        buf.extend_from_slice(&self.max_doc.to_le_bytes());
265
266        // Placeholder for header checksum (filled in at the end)
267        let checksum_pos = buf.len();
268        buf.extend_from_slice(&0u64.to_le_bytes());
269
270        // num_components
271        buf.push(self.components.len() as u8);
272        for comp in &self.components {
273            buf.extend_from_slice(&comp.to_bytes());
274        }
275
276        // num_fields
277        buf.extend_from_slice(&(self.fields.len() as u16).to_le_bytes());
278        for field in &self.fields {
279            buf.extend_from_slice(&field.to_bytes());
280        }
281
282        // Parent bitset (optional)
283        match &self.parent_bitset {
284            Some(bitset) => {
285                buf.push(1u8); // has parent bitset
286                let num_bytes = (bitset.len() + 7) / 8;
287                buf.extend_from_slice(&(bitset.len() as u32).to_le_bytes());
288                let mut packed = vec![0u8; num_bytes];
289                for (i, &is_parent) in bitset.iter().enumerate() {
290                    if is_parent {
291                        packed[i / 8] |= 1 << (i % 8);
292                    }
293                }
294                buf.extend_from_slice(&packed);
295            }
296            None => {
297                buf.push(0u8); // no parent bitset
298            }
299        }
300
301        // Compute and insert checksum over everything except the checksum field
302        let mut checksum_data = Vec::new();
303        checksum_data.extend_from_slice(&buf[..checksum_pos]);
304        checksum_data.extend_from_slice(&buf[checksum_pos + 8..]);
305        let checksum = xxhash_rust::xxh3::xxh3_64(&checksum_data);
306        buf[checksum_pos..checksum_pos + 8].copy_from_slice(&checksum.to_le_bytes());
307
308        buf
309    }
310
311    /// Deserialize a header from the beginning of a segment byte slice.
312    /// Returns (header, bytes_consumed).
313    pub fn from_bytes(data: &[u8]) -> Result<(Self, usize)> {
314        if data.len() < 28 {
315            return Err(LuciError::IndexCorrupted("segment header too short".into()));
316        }
317
318        // Validate magic
319        if &data[0..4] != SEGMENT_MAGIC {
320            return Err(LuciError::IndexCorrupted(format!(
321                "invalid segment magic: expected {:?}, got {:?}",
322                SEGMENT_MAGIC,
323                &data[0..4]
324            )));
325        }
326
327        let segment_id = SegmentId::new(u64::from_le_bytes(data[4..12].try_into().unwrap()));
328        let doc_count = u32::from_le_bytes(data[12..16].try_into().unwrap());
329        let max_doc = u32::from_le_bytes(data[16..20].try_into().unwrap());
330        let stored_checksum = u64::from_le_bytes(data[20..28].try_into().unwrap());
331
332        let mut pos = 28;
333
334        // Components
335        if pos >= data.len() {
336            return Err(LuciError::IndexCorrupted(
337                "segment header truncated at components".into(),
338            ));
339        }
340        let num_components = data[pos] as usize;
341        pos += 1;
342
343        let mut components = Vec::with_capacity(num_components);
344        for _ in 0..num_components {
345            let comp = ComponentOffset::from_bytes(&data[pos..])?;
346            pos += ComponentOffset::SERIALIZED_SIZE;
347            components.push(comp);
348        }
349
350        // Fields
351        if pos + 2 > data.len() {
352            return Err(LuciError::IndexCorrupted(
353                "segment header truncated at fields".into(),
354            ));
355        }
356        let num_fields = u16::from_le_bytes(data[pos..pos + 2].try_into().unwrap()) as usize;
357        pos += 2;
358
359        let mut fields = Vec::with_capacity(num_fields);
360        for _ in 0..num_fields {
361            let (field, consumed) = FieldMeta::from_bytes(&data[pos..])?;
362            pos += consumed;
363            fields.push(field);
364        }
365
366        // Parent bitset (optional)
367        let parent_bitset = if pos < data.len() && data[pos] == 1 {
368            pos += 1;
369            let num_docs = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize;
370            pos += 4;
371            let num_bytes = (num_docs + 7) / 8;
372            let packed = &data[pos..pos + num_bytes];
373            pos += num_bytes;
374            let mut bitset = Vec::with_capacity(num_docs);
375            for i in 0..num_docs {
376                bitset.push((packed[i / 8] >> (i % 8)) & 1 == 1);
377            }
378            Some(bitset)
379        } else {
380            if pos < data.len() {
381                pos += 1;
382            } // skip the 0 byte
383            None
384        };
385
386        // Validate checksum
387        let mut checksum_data = Vec::new();
388        checksum_data.extend_from_slice(&data[..20]); // everything before checksum
389        checksum_data.extend_from_slice(&data[28..pos]); // everything after checksum
390        let computed_checksum = xxhash_rust::xxh3::xxh3_64(&checksum_data);
391        if computed_checksum != stored_checksum {
392            return Err(LuciError::IndexCorrupted(format!(
393                "segment header checksum mismatch: stored={stored_checksum:#x}, computed={computed_checksum:#x}"
394            )));
395        }
396
397        Ok((
398            Self {
399                segment_id,
400                doc_count,
401                max_doc,
402                components,
403                fields,
404                parent_bitset,
405            },
406            pos,
407        ))
408    }
409
410    /// Find a component by type.
411    pub fn component(&self, ct: ComponentType) -> Option<&ComponentOffset> {
412        self.components.iter().find(|c| c.component_type == ct)
413    }
414}
415
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a component table entry.
    fn component(
        component_type: ComponentType,
        offset: u64,
        length: u64,
        checksum: u64,
    ) -> ComponentOffset {
        ComponentOffset {
            component_type,
            offset,
            length,
            checksum,
        }
    }

    /// Builds a header with `max_doc == doc_count` and no parent bitset.
    fn header(
        segment_id: u64,
        doc_count: u32,
        components: Vec<ComponentOffset>,
        fields: Vec<FieldMeta>,
    ) -> SegmentHeader {
        SegmentHeader {
            segment_id: SegmentId::new(segment_id),
            doc_count,
            max_doc: doc_count,
            components,
            fields,
            parent_bitset: None,
        }
    }

    /// A text field that is stored and indexed, with norms, without doc values.
    fn title_field(id: u16) -> FieldMeta {
        FieldMeta::new(
            FieldId::new(id),
            "title".to_string(),
            FieldType::Text,
            true,
            true,
            false,
            true,
        )
    }

    /// A keyword field that is stored and indexed, with doc values, without norms.
    fn status_field(id: u16) -> FieldMeta {
        FieldMeta::new(
            FieldId::new(id),
            "status".to_string(),
            FieldType::Keyword,
            true,
            true,
            true,
            false,
        )
    }

    /// The two-component, two-field header used by several tests.
    fn sample_header() -> SegmentHeader {
        header(
            42,
            100,
            vec![
                component(ComponentType::InvertedIndex, 256, 1024, 111),
                component(ComponentType::DocStore, 1280, 2048, 222),
            ],
            vec![title_field(0), status_field(1)],
        )
    }

    #[test]
    fn component_offset_round_trip() {
        let co = component(ComponentType::InvertedIndex, 1234, 5678, 0xDEADBEEF);
        let bytes = co.to_bytes();
        let decoded = ComponentOffset::from_bytes(&bytes).unwrap();
        assert_eq!(decoded, co);
    }

    #[test]
    fn component_offset_rejects_short_input() {
        let bytes = component(ComponentType::Vector, 1, 2, 3).to_bytes();
        for cut in 0..bytes.len() {
            assert!(ComponentOffset::from_bytes(&bytes[..cut]).is_err());
        }
    }

    #[test]
    fn field_meta_round_trip() {
        let fm = title_field(3);
        let bytes = fm.to_bytes();
        let (decoded, consumed) = FieldMeta::from_bytes(&bytes).unwrap();
        assert_eq!(consumed, bytes.len());
        assert_eq!(decoded, fm);
        assert!(decoded.is_stored());
        assert!(decoded.is_indexed());
        assert!(!decoded.has_doc_values());
        assert!(decoded.has_norms());
    }

    #[test]
    fn field_meta_flags() {
        let fm = status_field(0);
        assert!(fm.is_stored());
        assert!(fm.is_indexed());
        assert!(fm.has_doc_values());
        assert!(!fm.has_norms());
    }

    #[test]
    fn field_meta_rejects_truncated_input() {
        let bytes = title_field(0).to_bytes();
        for cut in 0..bytes.len() {
            assert!(FieldMeta::from_bytes(&bytes[..cut]).is_err());
        }
    }

    #[test]
    fn header_round_trip() {
        let header = sample_header();

        let bytes = header.to_bytes();
        let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();

        assert_eq!(decoded.segment_id, header.segment_id);
        assert_eq!(decoded.doc_count, header.doc_count);
        assert_eq!(decoded.max_doc, header.max_doc);
        assert_eq!(decoded.components, header.components);
        assert_eq!(decoded.fields, header.fields);
        assert_eq!(decoded.parent_bitset, None);
        assert_eq!(consumed, bytes.len());
    }

    #[test]
    fn header_parent_bitset_round_trip() {
        // Cover empty, sub-byte, exact-byte and multi-byte bitsets.
        for len in [0u32, 1, 7, 8, 9, 17] {
            let bitset: Vec<bool> = (0..len).map(|i| i % 3 == 0).collect();
            let mut header = header(7, len, vec![], vec![title_field(0)]);
            header.parent_bitset = Some(bitset.clone());

            let bytes = header.to_bytes();
            let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();

            assert_eq!(decoded.parent_bitset, Some(bitset));
            assert_eq!(consumed, bytes.len());
        }
    }

    #[test]
    fn header_ignores_trailing_component_data() {
        let mut bytes = sample_header().to_bytes();
        let header_len = bytes.len();
        bytes.extend_from_slice(&[0xAA; 16]);

        let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();
        assert_eq!(consumed, header_len);
        assert_eq!(decoded.components.len(), 2);
        assert_eq!(decoded.fields.len(), 2);
    }

    #[test]
    fn header_rejects_every_truncated_prefix() {
        let bytes = sample_header().to_bytes();
        for cut in 0..bytes.len() {
            assert!(
                SegmentHeader::from_bytes(&bytes[..cut]).is_err(),
                "prefix of {cut} bytes was accepted"
            );
        }
    }

    #[test]
    fn header_magic_validation() {
        let mut bytes = header(1, 0, vec![], vec![]).to_bytes();
        bytes[0] = b'X'; // corrupt magic
        assert!(SegmentHeader::from_bytes(&bytes).is_err());
    }

    #[test]
    fn header_checksum_validation() {
        let mut bytes = header(1, 10, vec![], vec![]).to_bytes();
        // Corrupt a byte after the checksum to trigger mismatch
        let last = bytes.len() - 1;
        bytes[last] ^= 0xFF;
        assert!(SegmentHeader::from_bytes(&bytes).is_err());
    }

    #[test]
    fn component_lookup() {
        let header = header(
            1,
            0,
            vec![
                component(ComponentType::InvertedIndex, 100, 200, 0),
                component(ComponentType::DocStore, 300, 400, 0),
            ],
            vec![],
        );

        assert!(header.component(ComponentType::InvertedIndex).is_some());
        assert!(header.component(ComponentType::DocStore).is_some());
        assert!(header.component(ComponentType::Columnar).is_none());
        assert!(header.component(ComponentType::Vector).is_none());
    }

    #[test]
    fn empty_header() {
        let bytes = header(0, 0, vec![], vec![]).to_bytes();
        let (decoded, _) = SegmentHeader::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.doc_count, 0);
        assert!(decoded.components.is_empty());
        assert!(decoded.fields.is_empty());
    }

    #[test]
    fn unicode_field_name() {
        let fm = FieldMeta::new(
            FieldId::new(0),
            "beschreibung_über".to_string(),
            FieldType::Text,
            true,
            true,
            false,
            true,
        );
        let bytes = fm.to_bytes();
        let (decoded, consumed) = FieldMeta::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.field_name, "beschreibung_über");
        assert_eq!(consumed, bytes.len());
    }

    #[test]
    fn all_component_types_round_trip() {
        for &ct in &[
            ComponentType::InvertedIndex,
            ComponentType::Columnar,
            ComponentType::DocStore,
            ComponentType::Vector,
            ComponentType::Spatial,
        ] {
            let bytes = component(ct, 0, 0, 0).to_bytes();
            let decoded = ComponentOffset::from_bytes(&bytes).unwrap();
            assert_eq!(decoded.component_type, ct);
        }
    }

    #[test]
    fn unknown_tags_are_rejected() {
        assert!(ComponentType::from_u8(0).is_err());
        assert!(ComponentType::from_u8(6).is_err());
        assert!(field_type_from_u8(14).is_err());
        assert!(field_type_from_u8(u8::MAX).is_err());
    }

    #[test]
    fn all_field_types_round_trip() {
        let types = [
            FieldType::Text,
            FieldType::Keyword,
            FieldType::Integer,
            FieldType::Long,
            FieldType::Float,
            FieldType::Double,
            FieldType::Boolean,
            FieldType::Date,
            FieldType::GeoPoint,
            FieldType::Nested,
            FieldType::GeoShape,
            FieldType::TokenCount,
            FieldType::Ip,
        ];
        for ft in &types {
            let fm = FieldMeta::new(
                FieldId::new(0),
                "f".to_string(),
                ft.clone(),
                true,
                true,
                true,
                true,
            );
            let bytes = fm.to_bytes();
            let (decoded, _) = FieldMeta::from_bytes(&bytes).unwrap();
            assert_eq!(decoded.field_type, *ft);
        }
    }

    #[test]
    fn dense_vector_keeps_its_tag() {
        // The tag byte carries only the variant, so only the variant is checked.
        let fm = FieldMeta::new(
            FieldId::new(0),
            "embedding".to_string(),
            FieldType::dense_vector(0),
            false,
            true,
            false,
            false,
        );
        let bytes = fm.to_bytes();
        let (decoded, _) = FieldMeta::from_bytes(&bytes).unwrap();
        assert!(matches!(decoded.field_type, FieldType::DenseVector { .. }));
        assert_eq!(field_type_to_u8(&decoded.field_type), 8);
    }
}