1use crate::core::{FieldId, LuciError, Result, SegmentId};
23use crate::mapping::FieldType;
24
/// Magic bytes written at the very start of a serialized segment header
/// (ASCII `MSEG`). `SegmentHeader::from_bytes` rejects input that does not
/// begin with these four bytes.
pub const SEGMENT_MAGIC: &[u8; 4] = &[b'M', b'S', b'E', b'G'];
27
/// Kind of component referenced by a segment's component table.
///
/// The explicit discriminants are the single-byte tags used on disk
/// (`ComponentOffset::to_bytes` writes `component_type as u8`), so the
/// numeric values must not be changed.
#[repr(u8)]
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ComponentType {
    /// Tag `1`.
    InvertedIndex = 1,
    /// Tag `2`.
    Columnar = 2,
    /// Tag `3`.
    DocStore = 3,
    /// Tag `4`.
    Vector = 4,
    /// Tag `5`.
    Spatial = 5,
}
38
39impl ComponentType {
40 pub fn from_u8(v: u8) -> Result<Self> {
41 match v {
42 1 => Ok(Self::InvertedIndex),
43 2 => Ok(Self::Columnar),
44 3 => Ok(Self::DocStore),
45 4 => Ok(Self::Vector),
46 5 => Ok(Self::Spatial),
47 _ => Err(LuciError::IndexCorrupted(format!(
48 "unknown component type: {v}"
49 ))),
50 }
51 }
52}
53
54#[derive(Clone, Debug, PartialEq, Eq)]
56pub struct ComponentOffset {
57 pub component_type: ComponentType,
58 pub offset: u64,
59 pub length: u64,
60 pub checksum: u64,
61}
62
63impl ComponentOffset {
64 pub const SERIALIZED_SIZE: usize = 25;
66
67 pub fn to_bytes(&self) -> [u8; Self::SERIALIZED_SIZE] {
68 let mut buf = [0u8; Self::SERIALIZED_SIZE];
69 buf[0] = self.component_type as u8;
70 buf[1..9].copy_from_slice(&self.offset.to_le_bytes());
71 buf[9..17].copy_from_slice(&self.length.to_le_bytes());
72 buf[17..25].copy_from_slice(&self.checksum.to_le_bytes());
73 buf
74 }
75
76 pub fn from_bytes(data: &[u8]) -> Result<Self> {
77 if data.len() < Self::SERIALIZED_SIZE {
78 return Err(LuciError::IndexCorrupted(
79 "component offset too short".into(),
80 ));
81 }
82 Ok(Self {
83 component_type: ComponentType::from_u8(data[0])?,
84 offset: u64::from_le_bytes(data[1..9].try_into().unwrap()),
85 length: u64::from_le_bytes(data[9..17].try_into().unwrap()),
86 checksum: u64::from_le_bytes(data[17..25].try_into().unwrap()),
87 })
88 }
89}
90
91fn field_type_to_u8(ft: &FieldType) -> u8 {
93 match ft {
94 FieldType::Text => 0,
95 FieldType::Keyword => 1,
96 FieldType::Integer => 2,
97 FieldType::Long => 3,
98 FieldType::Float => 4,
99 FieldType::Double => 5,
100 FieldType::Boolean => 6,
101 FieldType::Date => 7,
102 FieldType::DenseVector { .. } => 8,
103 FieldType::GeoPoint => 9,
104 FieldType::Nested => 10,
105 FieldType::GeoShape => 11,
106 FieldType::TokenCount => 12,
107 FieldType::Ip => 13,
108 }
109}
110
111fn field_type_from_u8(v: u8) -> Result<FieldType> {
112 match v {
113 0 => Ok(FieldType::Text),
114 1 => Ok(FieldType::Keyword),
115 2 => Ok(FieldType::Integer),
116 3 => Ok(FieldType::Long),
117 4 => Ok(FieldType::Float),
118 5 => Ok(FieldType::Double),
119 6 => Ok(FieldType::Boolean),
120 7 => Ok(FieldType::Date),
121 8 => Ok(FieldType::dense_vector(0)),
126 9 => Ok(FieldType::GeoPoint),
127 10 => Ok(FieldType::Nested),
128 11 => Ok(FieldType::GeoShape),
129 12 => Ok(FieldType::TokenCount),
130 13 => Ok(FieldType::Ip),
131 _ => Err(LuciError::IndexCorrupted(format!(
132 "unknown field type byte: {v}"
133 ))),
134 }
135}
136
/// Bit of `FieldMeta::flags` reported by `FieldMeta::is_stored`.
pub const FLAG_STORED: u8 = 0b0000_0001;
/// Bit of `FieldMeta::flags` reported by `FieldMeta::is_indexed`.
pub const FLAG_INDEXED: u8 = 0b0000_0010;
/// Bit of `FieldMeta::flags` reported by `FieldMeta::has_doc_values`.
pub const FLAG_DOC_VALUES: u8 = 0b0000_0100;
/// Bit of `FieldMeta::flags` reported by `FieldMeta::has_norms`.
pub const FLAG_NORMS: u8 = 0b0000_1000;
142
143#[derive(Clone, Debug, PartialEq, Eq)]
145pub struct FieldMeta {
146 pub field_id: FieldId,
147 pub field_name: String,
148 pub field_type: FieldType,
149 pub flags: u8,
150}
151
152impl FieldMeta {
153 pub fn new(
154 field_id: FieldId,
155 field_name: String,
156 field_type: FieldType,
157 stored: bool,
158 indexed: bool,
159 doc_values: bool,
160 norms: bool,
161 ) -> Self {
162 let mut flags = 0u8;
163 if stored {
164 flags |= FLAG_STORED;
165 }
166 if indexed {
167 flags |= FLAG_INDEXED;
168 }
169 if doc_values {
170 flags |= FLAG_DOC_VALUES;
171 }
172 if norms {
173 flags |= FLAG_NORMS;
174 }
175 Self {
176 field_id,
177 field_name,
178 field_type,
179 flags,
180 }
181 }
182
183 pub fn is_stored(&self) -> bool {
184 self.flags & FLAG_STORED != 0
185 }
186 pub fn is_indexed(&self) -> bool {
187 self.flags & FLAG_INDEXED != 0
188 }
189 pub fn has_doc_values(&self) -> bool {
190 self.flags & FLAG_DOC_VALUES != 0
191 }
192 pub fn has_norms(&self) -> bool {
193 self.flags & FLAG_NORMS != 0
194 }
195
196 pub fn to_bytes(&self) -> Vec<u8> {
198 let name_bytes = self.field_name.as_bytes();
199 let mut buf = Vec::with_capacity(6 + name_bytes.len());
200 buf.extend_from_slice(&self.field_id.as_u16().to_le_bytes());
201 buf.extend_from_slice(&(name_bytes.len() as u16).to_le_bytes());
202 buf.extend_from_slice(name_bytes);
203 buf.push(field_type_to_u8(&self.field_type));
204 buf.push(self.flags);
205 buf
206 }
207
208 pub fn from_bytes(data: &[u8]) -> Result<(Self, usize)> {
210 if data.len() < 6 {
211 return Err(LuciError::IndexCorrupted("field meta too short".into()));
212 }
213 let field_id = FieldId::new(u16::from_le_bytes([data[0], data[1]]));
214 let name_len = u16::from_le_bytes([data[2], data[3]]) as usize;
215 if data.len() < 6 + name_len {
216 return Err(LuciError::IndexCorrupted(
217 "field meta name truncated".into(),
218 ));
219 }
220 let field_name = std::str::from_utf8(&data[4..4 + name_len])
221 .map_err(|e| LuciError::IndexCorrupted(format!("invalid field name UTF-8: {e}")))?
222 .to_string();
223 let field_type = field_type_from_u8(data[4 + name_len])?;
224 let flags = data[5 + name_len];
225 let consumed = 6 + name_len;
226
227 Ok((
228 Self {
229 field_id,
230 field_name,
231 field_type,
232 flags,
233 },
234 consumed,
235 ))
236 }
237}
238
239#[derive(Clone, Debug)]
241pub struct SegmentHeader {
242 pub segment_id: SegmentId,
243 pub doc_count: u32,
244 pub max_doc: u32,
245 pub components: Vec<ComponentOffset>,
246 pub fields: Vec<FieldMeta>,
247 pub parent_bitset: Option<Vec<bool>>,
250}
251
252impl SegmentHeader {
253 pub fn to_bytes(&self) -> Vec<u8> {
255 let mut buf = Vec::new();
256
257 buf.extend_from_slice(SEGMENT_MAGIC);
259 buf.extend_from_slice(&self.segment_id.as_u64().to_le_bytes());
261 buf.extend_from_slice(&self.doc_count.to_le_bytes());
263 buf.extend_from_slice(&self.max_doc.to_le_bytes());
265
266 let checksum_pos = buf.len();
268 buf.extend_from_slice(&0u64.to_le_bytes());
269
270 buf.push(self.components.len() as u8);
272 for comp in &self.components {
273 buf.extend_from_slice(&comp.to_bytes());
274 }
275
276 buf.extend_from_slice(&(self.fields.len() as u16).to_le_bytes());
278 for field in &self.fields {
279 buf.extend_from_slice(&field.to_bytes());
280 }
281
282 match &self.parent_bitset {
284 Some(bitset) => {
285 buf.push(1u8); let num_bytes = (bitset.len() + 7) / 8;
287 buf.extend_from_slice(&(bitset.len() as u32).to_le_bytes());
288 let mut packed = vec![0u8; num_bytes];
289 for (i, &is_parent) in bitset.iter().enumerate() {
290 if is_parent {
291 packed[i / 8] |= 1 << (i % 8);
292 }
293 }
294 buf.extend_from_slice(&packed);
295 }
296 None => {
297 buf.push(0u8); }
299 }
300
301 let mut checksum_data = Vec::new();
303 checksum_data.extend_from_slice(&buf[..checksum_pos]);
304 checksum_data.extend_from_slice(&buf[checksum_pos + 8..]);
305 let checksum = xxhash_rust::xxh3::xxh3_64(&checksum_data);
306 buf[checksum_pos..checksum_pos + 8].copy_from_slice(&checksum.to_le_bytes());
307
308 buf
309 }
310
311 pub fn from_bytes(data: &[u8]) -> Result<(Self, usize)> {
314 if data.len() < 28 {
315 return Err(LuciError::IndexCorrupted("segment header too short".into()));
316 }
317
318 if &data[0..4] != SEGMENT_MAGIC {
320 return Err(LuciError::IndexCorrupted(format!(
321 "invalid segment magic: expected {:?}, got {:?}",
322 SEGMENT_MAGIC,
323 &data[0..4]
324 )));
325 }
326
327 let segment_id = SegmentId::new(u64::from_le_bytes(data[4..12].try_into().unwrap()));
328 let doc_count = u32::from_le_bytes(data[12..16].try_into().unwrap());
329 let max_doc = u32::from_le_bytes(data[16..20].try_into().unwrap());
330 let stored_checksum = u64::from_le_bytes(data[20..28].try_into().unwrap());
331
332 let mut pos = 28;
333
334 if pos >= data.len() {
336 return Err(LuciError::IndexCorrupted(
337 "segment header truncated at components".into(),
338 ));
339 }
340 let num_components = data[pos] as usize;
341 pos += 1;
342
343 let mut components = Vec::with_capacity(num_components);
344 for _ in 0..num_components {
345 let comp = ComponentOffset::from_bytes(&data[pos..])?;
346 pos += ComponentOffset::SERIALIZED_SIZE;
347 components.push(comp);
348 }
349
350 if pos + 2 > data.len() {
352 return Err(LuciError::IndexCorrupted(
353 "segment header truncated at fields".into(),
354 ));
355 }
356 let num_fields = u16::from_le_bytes(data[pos..pos + 2].try_into().unwrap()) as usize;
357 pos += 2;
358
359 let mut fields = Vec::with_capacity(num_fields);
360 for _ in 0..num_fields {
361 let (field, consumed) = FieldMeta::from_bytes(&data[pos..])?;
362 pos += consumed;
363 fields.push(field);
364 }
365
366 let parent_bitset = if pos < data.len() && data[pos] == 1 {
368 pos += 1;
369 let num_docs = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize;
370 pos += 4;
371 let num_bytes = (num_docs + 7) / 8;
372 let packed = &data[pos..pos + num_bytes];
373 pos += num_bytes;
374 let mut bitset = Vec::with_capacity(num_docs);
375 for i in 0..num_docs {
376 bitset.push((packed[i / 8] >> (i % 8)) & 1 == 1);
377 }
378 Some(bitset)
379 } else {
380 if pos < data.len() {
381 pos += 1;
382 } None
384 };
385
386 let mut checksum_data = Vec::new();
388 checksum_data.extend_from_slice(&data[..20]); checksum_data.extend_from_slice(&data[28..pos]); let computed_checksum = xxhash_rust::xxh3::xxh3_64(&checksum_data);
391 if computed_checksum != stored_checksum {
392 return Err(LuciError::IndexCorrupted(format!(
393 "segment header checksum mismatch: stored={stored_checksum:#x}, computed={computed_checksum:#x}"
394 )));
395 }
396
397 Ok((
398 Self {
399 segment_id,
400 doc_count,
401 max_doc,
402 components,
403 fields,
404 parent_bitset,
405 },
406 pos,
407 ))
408 }
409
410 pub fn component(&self, ct: ComponentType) -> Option<&ComponentOffset> {
412 self.components.iter().find(|c| c.component_type == ct)
413 }
414}
415
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a header with the given tables and no parent bitset;
    /// `max_doc` mirrors `doc_count`.
    fn header_with(
        segment_id: u64,
        doc_count: u32,
        components: Vec<ComponentOffset>,
        fields: Vec<FieldMeta>,
    ) -> SegmentHeader {
        SegmentHeader {
            segment_id: SegmentId::new(segment_id),
            doc_count,
            max_doc: doc_count,
            components,
            fields,
            parent_bitset: None,
        }
    }

    /// Shorthand for a component table entry.
    fn entry(
        component_type: ComponentType,
        offset: u64,
        length: u64,
        checksum: u64,
    ) -> ComponentOffset {
        ComponentOffset {
            component_type,
            offset,
            length,
            checksum,
        }
    }

    /// Text field: stored, indexed, no doc values, with norms.
    fn text_field(id: u16, name: &str) -> FieldMeta {
        FieldMeta::new(
            FieldId::new(id),
            name.to_string(),
            FieldType::Text,
            true,
            true,
            false,
            true,
        )
    }

    /// Keyword field: stored, indexed, with doc values, no norms.
    fn keyword_field(id: u16, name: &str) -> FieldMeta {
        FieldMeta::new(
            FieldId::new(id),
            name.to_string(),
            FieldType::Keyword,
            true,
            true,
            true,
            false,
        )
    }

    #[test]
    fn component_offset_round_trip() {
        let co = entry(ComponentType::InvertedIndex, 1234, 5678, 0xDEADBEEF);
        let bytes = co.to_bytes();
        let decoded = ComponentOffset::from_bytes(&bytes).unwrap();
        assert_eq!(decoded, co);
    }

    #[test]
    fn field_meta_round_trip() {
        let fm = text_field(3, "title");
        let bytes = fm.to_bytes();
        let (decoded, consumed) = FieldMeta::from_bytes(&bytes).unwrap();
        assert_eq!(consumed, bytes.len());
        assert_eq!(decoded, fm);
        assert!(decoded.is_stored());
        assert!(decoded.is_indexed());
        assert!(!decoded.has_doc_values());
        assert!(decoded.has_norms());
    }

    #[test]
    fn field_meta_flags() {
        let fm = keyword_field(0, "status");
        assert!(fm.is_stored());
        assert!(fm.is_indexed());
        assert!(fm.has_doc_values());
        assert!(!fm.has_norms());
    }

    #[test]
    fn field_meta_truncated_input_is_rejected() {
        let bytes = text_field(0, "title").to_bytes();
        assert!(FieldMeta::from_bytes(&bytes[..bytes.len() - 1]).is_err());
        assert!(FieldMeta::from_bytes(&bytes[..3]).is_err());
        assert!(FieldMeta::from_bytes(&[]).is_err());
    }

    #[test]
    fn header_round_trip() {
        let header = header_with(
            42,
            100,
            vec![
                entry(ComponentType::InvertedIndex, 256, 1024, 111),
                entry(ComponentType::DocStore, 1280, 2048, 222),
            ],
            vec![text_field(0, "title"), keyword_field(1, "status")],
        );

        let bytes = header.to_bytes();
        let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();

        assert_eq!(decoded.segment_id, header.segment_id);
        assert_eq!(decoded.doc_count, header.doc_count);
        assert_eq!(decoded.max_doc, header.max_doc);
        assert_eq!(decoded.components.len(), 2);
        assert_eq!(decoded.components[0], header.components[0]);
        assert_eq!(decoded.components[1], header.components[1]);
        assert_eq!(decoded.fields.len(), 2);
        assert_eq!(decoded.fields[0], header.fields[0]);
        assert_eq!(decoded.fields[1], header.fields[1]);
        assert_eq!(decoded.parent_bitset, None);
        assert_eq!(consumed, bytes.len());
    }

    #[test]
    fn header_parent_bitset_round_trip() {
        // Ten entries: exercises a partially filled trailing byte.
        let bits = vec![
            true, false, false, true, true, false, false, false, true, false,
        ];
        let mut header = header_with(7, 10, vec![], vec![text_field(0, "title")]);
        header.parent_bitset = Some(bits.clone());

        let bytes = header.to_bytes();
        let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();

        assert_eq!(decoded.parent_bitset, Some(bits));
        assert_eq!(consumed, bytes.len());
    }

    #[test]
    fn header_empty_parent_bitset_round_trip() {
        let mut header = header_with(8, 0, vec![], vec![]);
        header.parent_bitset = Some(Vec::new());

        let bytes = header.to_bytes();
        let (decoded, consumed) = SegmentHeader::from_bytes(&bytes).unwrap();

        assert_eq!(decoded.parent_bitset, Some(Vec::new()));
        assert_eq!(consumed, bytes.len());
    }

    #[test]
    fn header_magic_validation() {
        let mut bytes = header_with(1, 0, vec![], vec![]).to_bytes();
        bytes[0] = b'X';
        let err = SegmentHeader::from_bytes(&bytes);
        assert!(err.is_err());
    }

    #[test]
    fn header_checksum_validation() {
        let mut bytes = header_with(1, 10, vec![], vec![]).to_bytes();
        // Corrupt the final byte, which is covered by the checksum.
        let last = bytes.len() - 1;
        bytes[last] ^= 0xFF;
        let err = SegmentHeader::from_bytes(&bytes);
        assert!(err.is_err());
    }

    #[test]
    fn component_lookup() {
        let header = header_with(
            1,
            0,
            vec![
                entry(ComponentType::InvertedIndex, 100, 200, 0),
                entry(ComponentType::DocStore, 300, 400, 0),
            ],
            vec![],
        );

        assert!(header.component(ComponentType::InvertedIndex).is_some());
        assert!(header.component(ComponentType::DocStore).is_some());
        assert!(header.component(ComponentType::Columnar).is_none());
        assert!(header.component(ComponentType::Vector).is_none());
    }

    #[test]
    fn empty_header() {
        let bytes = header_with(0, 0, vec![], vec![]).to_bytes();
        let (decoded, _) = SegmentHeader::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.doc_count, 0);
        assert!(decoded.components.is_empty());
        assert!(decoded.fields.is_empty());
    }

    #[test]
    fn unicode_field_name() {
        let fm = text_field(0, "beschreibung_über");
        let bytes = fm.to_bytes();
        let (decoded, _) = FieldMeta::from_bytes(&bytes).unwrap();
        assert_eq!(decoded.field_name, "beschreibung_über");
    }

    #[test]
    fn all_component_types_round_trip() {
        for &ct in &[
            ComponentType::InvertedIndex,
            ComponentType::Columnar,
            ComponentType::DocStore,
            ComponentType::Vector,
            ComponentType::Spatial,
        ] {
            let bytes = entry(ct, 0, 0, 0).to_bytes();
            let decoded = ComponentOffset::from_bytes(&bytes).unwrap();
            assert_eq!(decoded.component_type, ct);
        }
    }

    #[test]
    fn all_field_types_round_trip() {
        // `DenseVector` is left out on purpose: its tag always decodes via
        // `FieldType::dense_vector(0)`, so it is not a faithful round trip.
        let types = [
            FieldType::Text,
            FieldType::Keyword,
            FieldType::Integer,
            FieldType::Long,
            FieldType::Float,
            FieldType::Double,
            FieldType::Boolean,
            FieldType::Date,
            FieldType::GeoPoint,
            FieldType::Nested,
            FieldType::GeoShape,
            FieldType::TokenCount,
            FieldType::Ip,
        ];
        for ft in &types {
            let fm = FieldMeta::new(
                FieldId::new(0),
                "f".to_string(),
                ft.clone(),
                true,
                true,
                true,
                true,
            );
            let bytes = fm.to_bytes();
            let (decoded, _) = FieldMeta::from_bytes(&bytes).unwrap();
            assert_eq!(decoded.field_type, *ft);
        }
    }

    #[test]
    fn unknown_type_tags_are_rejected() {
        assert!(ComponentType::from_u8(0).is_err());
        assert!(ComponentType::from_u8(6).is_err());
        assert!(field_type_from_u8(14).is_err());
        assert!(field_type_from_u8(u8::MAX).is_err());
    }
}