1pub mod codec;
40
41use std::collections::BTreeMap;
42use std::io::{self, Read, Write};
43use std::sync::OnceLock;
44
45use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
46
/// Magic number at the end of a fast-field file; the little-endian byte
/// sequence spells "FST2".
pub const FAST_FIELD_MAGIC: u32 = u32::from_le_bytes(*b"FST2");

/// Byte size of the fixed footer: u64 TOC offset + u32 column count + u32 magic.
pub const FAST_FIELD_FOOTER_SIZE: u64 = 16;

/// Sentinel stored for documents that carry no value in a column.
pub const FAST_FIELD_MISSING: u64 = u64::MAX;
/// Physical value type of a fast-field column, as stored on disk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum FastFieldColumnType {
    /// Plain unsigned 64-bit integers.
    U64 = 0,
    /// Signed integers, zigzag-encoded into u64.
    I64 = 1,
    /// Floats, mapped to order-preserving u64 bit patterns.
    F64 = 2,
    /// Ordinals into a sorted text dictionary.
    TextOrdinal = 3,
}

impl FastFieldColumnType {
    /// Decodes an on-disk type tag; returns `None` for unknown tags.
    pub fn from_u8(v: u8) -> Option<Self> {
        // The discriminants are dense starting at 0, so a table lookup
        // covers exactly the valid range.
        const TABLE: [FastFieldColumnType; 4] = [
            FastFieldColumnType::U64,
            FastFieldColumnType::I64,
            FastFieldColumnType::F64,
            FastFieldColumnType::TextOrdinal,
        ];
        TABLE.get(v as usize).copied()
    }
}
/// Maps a signed value to an unsigned one so that values with small
/// magnitude get small codes: 0 → 0, -1 → 1, 1 → 2, -2 → 3, …
#[inline]
pub fn zigzag_encode(v: i64) -> u64 {
    // Two's-complement 2*v (high bit discarded) …
    let doubled = (v as u64) << 1;
    // … xor'd with an all-ones mask for negatives (arithmetic shift
    // replicates the sign bit across the word).
    let sign_mask = (v >> 63) as u64;
    doubled ^ sign_mask
}
/// Inverse of `zigzag_encode`: 0 → 0, 1 → -1, 2 → 1, 3 → -2, …
#[inline]
pub fn zigzag_decode(v: u64) -> i64 {
    let magnitude = (v >> 1) as i64;
    // All ones when the code is odd (i.e. the original value was negative).
    let sign = -((v & 1) as i64);
    magnitude ^ sign
}
/// Maps a float to a u64 whose unsigned ordering matches the float's
/// numeric ordering (negatives sort below positives, -0.0 below +0.0).
#[inline]
pub fn f64_to_sortable_u64(f: f64) -> u64 {
    let bits = f.to_bits();
    if bits >> 63 == 0 {
        // Non-negative: set the sign bit so positives land above negatives.
        bits | (1u64 << 63)
    } else {
        // Negative: flip every bit to reverse the order of negatives.
        !bits
    }
}
/// Inverse of `f64_to_sortable_u64`.
#[inline]
pub fn sortable_u64_to_f64(v: u64) -> f64 {
    const SIGN: u64 = 1u64 << 63;
    // Sign bit set ⇒ the original float was non-negative: clear the bit.
    // Sign bit clear ⇒ the original was negative: undo the full complement.
    let bits = if v & SIGN != 0 { v & !SIGN } else { !v };
    f64::from_bits(bits)
}
/// Number of bits required to represent `val` (0 needs 0 bits).
#[inline]
pub fn bits_needed_u64(val: u64) -> u8 {
    // leading_zeros(0) == 64, so the zero case falls out naturally.
    (u64::BITS - val.leading_zeros()) as u8
}
/// Appends `values` to `out`, each packed into `bits_per_value` bits,
/// little-endian within and across bytes. Writes nothing when
/// `bits_per_value == 0`. The caller must ensure every value fits in
/// `bits_per_value` bits.
pub fn bitpack_write(values: &[u64], bits_per_value: u8, out: &mut Vec<u8>) {
    if bits_per_value == 0 {
        return;
    }
    let width = bits_per_value as usize;
    let total_bytes = (values.len() * width).div_ceil(8);

    let base = out.len();
    out.resize(base + total_bytes, 0);
    let packed = &mut out[base..];

    for (idx, &value) in values.iter().enumerate() {
        let first_bit = idx * width;
        let mut byte_pos = first_bit / 8;
        let mut shift = first_bit % 8;
        let mut pending = value;
        let mut bits_left = width;

        // Emit the value low-bits-first, at most one byte's worth per step.
        while bits_left > 0 {
            let take = bits_left.min(8 - shift);
            let low = pending & ((1u64 << take) - 1);
            packed[byte_pos] |= (low << shift) as u8;
            pending >>= take;
            bits_left -= take;
            byte_pos += 1;
            shift = 0;
        }
    }
}
/// Reads the `index`-th value from a buffer produced by `bitpack_write`
/// with the same `bits_per_value`. Returns 0 when `bits_per_value == 0`;
/// bytes past the end of `data` read as zero.
#[inline]
pub fn bitpack_read(data: &[u8], bits_per_value: u8, index: usize) -> u64 {
    if bits_per_value == 0 {
        return 0;
    }
    let width = bits_per_value as usize;
    let first_bit = index * width;
    let byte_pos = first_bit / 8;
    let shift = first_bit % 8;

    // Fast path: the whole value fits in one unaligned 8-byte load.
    if shift + width <= 64 && byte_pos + 8 <= data.len() {
        let word = u64::from_le_bytes(data[byte_pos..byte_pos + 8].try_into().unwrap());
        let mask = if width >= 64 {
            u64::MAX
        } else {
            (1u64 << width) - 1
        };
        return (word >> shift) & mask;
    }

    // Slow path: gather byte by byte (handles buffer tails and values that
    // straddle the 64-bit load window).
    let mut acc = 0u64;
    let mut out_shift = 0;
    let mut bits_left = width;
    let mut byte_pos = byte_pos;
    let mut shift = shift;
    while bits_left > 0 {
        let take = bits_left.min(8 - shift);
        let byte = data.get(byte_pos).copied().unwrap_or(0);
        acc |= (((byte >> shift) as u64) & ((1u64 << take) - 1)) << out_shift;
        bits_left -= take;
        out_shift += take;
        byte_pos += 1;
        shift = 0;
    }
    acc
}
/// Table-of-contents record describing one fast-field column within a file.
#[derive(Debug, Clone)]
pub struct FastFieldTocEntry {
    // Field identifier; `FastFieldWriter::serialize` leaves this at 0 for the
    // caller to assign.
    pub field_id: u32,
    // Physical value type of the column.
    pub column_type: FastFieldColumnType,
    // True when a document may hold several values (flag bit 0 on disk).
    pub multi: bool,
    // Absolute byte offset of the column's block region in the file.
    pub data_offset: u64,
    // Byte length of the column's block region.
    pub data_len: u64,
    // Number of documents covered by the column.
    pub num_docs: u32,
    // Written as 0 by `FastFieldWriter::serialize`; dictionaries are stored
    // per block inside the data region — TODO confirm whether any other
    // writer populates this.
    pub dict_offset: u64,
    // Written as 0 by `FastFieldWriter::serialize` (see `dict_offset`).
    pub dict_count: u32,
}
/// On-disk size of one `FastFieldTocEntry`: field_id(4) + type(1) + flags(1)
/// + data_offset(8) + data_len(8) + num_docs(4) + dict_offset(8)
/// + dict_count(4) = 38 bytes.
pub const FAST_FIELD_TOC_ENTRY_SIZE: usize = 4 + 1 + 1 + 8 + 8 + 4 + 8 + 4;

/// Per-block record in a column's block index: how many documents the block
/// covers and how many bytes its data / dictionary sections occupy.
#[derive(Debug, Clone)]
pub struct BlockIndexEntry {
    pub num_docs: u32,
    pub data_len: u32,
    pub dict_count: u32,
    pub dict_len: u32,
}

/// On-disk size of one `BlockIndexEntry` (four little-endian u32s).
pub const BLOCK_INDEX_ENTRY_SIZE: usize = 16;

impl BlockIndexEntry {
    /// Writes the entry as four little-endian u32s (`BLOCK_INDEX_ENTRY_SIZE`
    /// bytes). Uses std `to_le_bytes` instead of the `byteorder` crate; the
    /// byte layout is unchanged.
    pub fn write_to(&self, w: &mut dyn Write) -> io::Result<()> {
        w.write_all(&self.num_docs.to_le_bytes())?;
        w.write_all(&self.data_len.to_le_bytes())?;
        w.write_all(&self.dict_count.to_le_bytes())?;
        w.write_all(&self.dict_len.to_le_bytes())?;
        Ok(())
    }

    /// Reads an entry previously produced by `write_to`.
    pub fn read_from(r: &mut dyn Read) -> io::Result<Self> {
        // Local helper: one little-endian u32.
        fn read_u32_le(r: &mut dyn Read) -> io::Result<u32> {
            let mut buf = [0u8; 4];
            r.read_exact(&mut buf)?;
            Ok(u32::from_le_bytes(buf))
        }
        // Struct-literal fields are evaluated in order, matching write_to.
        Ok(Self {
            num_docs: read_u32_le(r)?,
            data_len: read_u32_le(r)?,
            dict_count: read_u32_le(r)?,
            dict_len: read_u32_le(r)?,
        })
    }
}
293impl FastFieldTocEntry {
294 pub fn write_to(&self, w: &mut dyn Write) -> io::Result<()> {
295 w.write_u32::<LittleEndian>(self.field_id)?;
296 w.write_u8(self.column_type as u8)?;
297 let flags: u8 = if self.multi { 1 } else { 0 };
298 w.write_u8(flags)?;
299 w.write_u64::<LittleEndian>(self.data_offset)?;
300 w.write_u64::<LittleEndian>(self.data_len)?;
301 w.write_u32::<LittleEndian>(self.num_docs)?;
302 w.write_u64::<LittleEndian>(self.dict_offset)?;
303 w.write_u32::<LittleEndian>(self.dict_count)?;
304 Ok(())
305 }
306
307 pub fn read_from(r: &mut dyn Read) -> io::Result<Self> {
308 let field_id = r.read_u32::<LittleEndian>()?;
309 let ct = r.read_u8()?;
310 let column_type = FastFieldColumnType::from_u8(ct)
311 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "bad column type"))?;
312 let flags = r.read_u8()?;
313 let multi = (flags & 1) != 0;
314 let data_offset = r.read_u64::<LittleEndian>()?;
315 let data_len = r.read_u64::<LittleEndian>()?;
316 let num_docs = r.read_u32::<LittleEndian>()?;
317 let dict_offset = r.read_u64::<LittleEndian>()?;
318 let dict_count = r.read_u32::<LittleEndian>()?;
319 Ok(Self {
320 field_id,
321 column_type,
322 multi,
323 data_offset,
324 data_len,
325 num_docs,
326 dict_offset,
327 dict_count,
328 })
329 }
330}
331
/// Accumulates a single column's values while a segment is being built.
///
/// Numeric columns store already-encoded `u64`s (i64 via zigzag, f64 via the
/// sortable-u64 mapping). Text columns buffer raw strings and resolve them
/// to sorted-dictionary ordinals at serialization time.
pub struct FastFieldWriter {
    // Physical type of the column.
    pub column_type: FastFieldColumnType,
    // True when each document may hold several values.
    pub multi: bool,

    // Single-valued storage: one encoded value per doc id;
    // gaps are filled with FAST_FIELD_MISSING.
    values: Vec<u64>,

    // Multi-valued storage: all values flattened in doc order.
    multi_values: Vec<u64>,
    // CSR-style offsets into `multi_values` (one leading 0 entry, then one
    // entry per closed document).
    multi_offsets: Vec<u32>,
    // Doc id whose value run is currently open.
    multi_current_doc: u32,

    // Distinct text values; BTreeMap keeps them sorted. The mapped id is a
    // first-seen placeholder, superseded by the sorted ordinal at serialize
    // time.
    text_values: Option<BTreeMap<String, u32>>,
    // Single-valued text: raw string per doc, resolved to ordinals later.
    text_per_doc: Option<Vec<Option<String>>>,
    // Multi-valued text: raw strings parallel to the placeholder entries in
    // `multi_values`.
    text_multi_values: Option<Vec<String>>,
}
366impl FastFieldWriter {
367 pub fn new_numeric(column_type: FastFieldColumnType) -> Self {
369 debug_assert!(matches!(
370 column_type,
371 FastFieldColumnType::U64 | FastFieldColumnType::I64 | FastFieldColumnType::F64
372 ));
373 Self {
374 column_type,
375 multi: false,
376 values: Vec::new(),
377 multi_values: Vec::new(),
378 multi_offsets: vec![0],
379 multi_current_doc: 0,
380 text_values: None,
381 text_per_doc: None,
382 text_multi_values: None,
383 }
384 }
385
386 pub fn new_numeric_multi(column_type: FastFieldColumnType) -> Self {
388 debug_assert!(matches!(
389 column_type,
390 FastFieldColumnType::U64 | FastFieldColumnType::I64 | FastFieldColumnType::F64
391 ));
392 Self {
393 column_type,
394 multi: true,
395 values: Vec::new(),
396 multi_values: Vec::new(),
397 multi_offsets: vec![0],
398 multi_current_doc: 0,
399 text_values: None,
400 text_per_doc: None,
401 text_multi_values: None,
402 }
403 }
404
405 pub fn new_text() -> Self {
407 Self {
408 column_type: FastFieldColumnType::TextOrdinal,
409 multi: false,
410 values: Vec::new(),
411 multi_values: Vec::new(),
412 multi_offsets: vec![0],
413 multi_current_doc: 0,
414 text_values: Some(BTreeMap::new()),
415 text_per_doc: Some(Vec::new()),
416 text_multi_values: None,
417 }
418 }
419
420 pub fn new_text_multi() -> Self {
422 Self {
423 column_type: FastFieldColumnType::TextOrdinal,
424 multi: true,
425 values: Vec::new(),
426 multi_values: Vec::new(),
427 multi_offsets: vec![0],
428 multi_current_doc: 0,
429 text_values: Some(BTreeMap::new()),
430 text_per_doc: None,
431 text_multi_values: Some(Vec::new()),
432 }
433 }
434
435 pub fn add_u64(&mut self, doc_id: u32, value: u64) {
438 if self.multi {
439 self.add_multi_u64(doc_id, value);
440 return;
441 }
442 let idx = doc_id as usize;
443 if idx >= self.values.len() {
444 self.values.resize(idx + 1, FAST_FIELD_MISSING);
445 if let Some(ref mut tpd) = self.text_per_doc {
446 tpd.resize(idx + 1, None);
447 }
448 }
449 self.values[idx] = value;
450 }
451
452 fn add_multi_u64(&mut self, doc_id: u32, value: u64) {
454 while self.multi_current_doc < doc_id {
456 self.multi_current_doc += 1;
457 self.multi_offsets.push(self.multi_values.len() as u32);
458 }
459 if self.multi_current_doc == doc_id && self.multi_offsets.len() == doc_id as usize + 1 {
461 }
463 self.multi_values.push(value);
464 }
465
466 pub fn add_i64(&mut self, doc_id: u32, value: i64) {
468 self.add_u64(doc_id, zigzag_encode(value));
469 }
470
471 pub fn add_f64(&mut self, doc_id: u32, value: f64) {
473 self.add_u64(doc_id, f64_to_sortable_u64(value));
474 }
475
476 pub fn add_text(&mut self, doc_id: u32, value: &str) {
478 if let Some(ref mut dict) = self.text_values {
479 let next_id = dict.len() as u32;
480 dict.entry(value.to_string()).or_insert(next_id);
481 }
482
483 if self.multi {
484 if let Some(ref mut tmv) = self.text_multi_values {
485 while self.multi_current_doc < doc_id {
487 self.multi_current_doc += 1;
488 self.multi_offsets.push(self.multi_values.len() as u32);
489 }
490 if self.multi_current_doc == doc_id
491 && self.multi_offsets.len() == doc_id as usize + 1
492 {
493 }
495 self.multi_values.push(0); tmv.push(value.to_string());
497 }
498 } else {
499 let idx = doc_id as usize;
500 if idx >= self.values.len() {
501 self.values.resize(idx + 1, FAST_FIELD_MISSING);
502 }
503 if let Some(ref mut tpd) = self.text_per_doc {
504 if idx >= tpd.len() {
505 tpd.resize(idx + 1, None);
506 }
507 tpd[idx] = Some(value.to_string());
508 }
509 }
510 }
511
512 pub fn pad_to(&mut self, num_docs: u32) {
517 let n = num_docs as usize;
518 if self.multi {
519 while (self.multi_offsets.len() as u32) <= num_docs {
520 self.multi_offsets.push(self.multi_values.len() as u32);
521 }
522 self.multi_current_doc = num_docs;
523 } else {
524 if self.values.len() < n {
525 self.values.resize(n, FAST_FIELD_MISSING);
526 if let Some(ref mut tpd) = self.text_per_doc {
527 tpd.resize(n, None);
528 }
529 }
530 }
531 }
532
533 pub fn num_docs(&self) -> u32 {
535 if self.multi {
536 (self.multi_offsets.len() as u32).saturating_sub(1)
538 } else {
539 self.values.len() as u32
540 }
541 }
542
543 pub fn serialize(
548 &mut self,
549 writer: &mut dyn Write,
550 data_offset: u64,
551 ) -> io::Result<(FastFieldTocEntry, u64)> {
552 if self.column_type == FastFieldColumnType::TextOrdinal {
554 self.resolve_text_ordinals();
555 }
556
557 let num_docs = self.num_docs();
558
559 let mut block_data = Vec::new();
561 if self.multi {
562 let offsets_u64: Vec<u64> = self.multi_offsets.iter().map(|&v| v as u64).collect();
564 let mut offset_buf = Vec::new();
565 codec::serialize_auto(&offsets_u64, &mut offset_buf)?;
566
567 block_data.write_u32::<LittleEndian>(offset_buf.len() as u32)?;
568 block_data.write_all(&offset_buf)?;
569
570 codec::serialize_auto(&self.multi_values, &mut block_data)?;
571 } else {
572 codec::serialize_auto(&self.values, &mut block_data)?;
573 }
574
575 let mut dict_buf = Vec::new();
577 let dict_count = if self.column_type == FastFieldColumnType::TextOrdinal {
578 let (count, _) = self.write_text_dictionary(&mut dict_buf)?;
579 count
580 } else {
581 0u32
582 };
583
584 let block_entry = BlockIndexEntry {
586 num_docs,
587 data_len: block_data.len() as u32,
588 dict_count,
589 dict_len: dict_buf.len() as u32,
590 };
591
592 let mut total_bytes = 0u64;
594
595 writer.write_u32::<LittleEndian>(1u32)?; total_bytes += 4;
597
598 block_entry.write_to(writer)?;
599 total_bytes += BLOCK_INDEX_ENTRY_SIZE as u64;
600
601 writer.write_all(&block_data)?;
602 total_bytes += block_data.len() as u64;
603
604 writer.write_all(&dict_buf)?;
605 total_bytes += dict_buf.len() as u64;
606
607 let toc = FastFieldTocEntry {
608 field_id: 0, column_type: self.column_type,
610 multi: self.multi,
611 data_offset,
612 data_len: total_bytes,
613 num_docs,
614 dict_offset: 0, dict_count: 0,
616 };
617
618 Ok((toc, total_bytes))
619 }
620
621 fn resolve_text_ordinals(&mut self) {
623 let dict = self.text_values.as_ref().expect("text_values required");
624
625 let sorted_ordinals: BTreeMap<&str, u64> = dict
627 .keys()
628 .enumerate()
629 .map(|(ord, key)| (key.as_str(), ord as u64))
630 .collect();
631
632 if self.multi {
633 if let Some(ref tmv) = self.text_multi_values {
635 for (i, text) in tmv.iter().enumerate() {
636 self.multi_values[i] = sorted_ordinals[text.as_str()];
637 }
638 }
639 } else {
640 let tpd = self.text_per_doc.as_ref().expect("text_per_doc required");
642 for (i, doc_text) in tpd.iter().enumerate() {
643 match doc_text {
644 Some(text) => {
645 self.values[i] = sorted_ordinals[text.as_str()];
646 }
647 None => {
648 self.values[i] = FAST_FIELD_MISSING;
649 }
650 }
651 }
652 }
653 }
654
655 fn write_text_dictionary(&self, writer: &mut dyn Write) -> io::Result<(u32, u64)> {
657 let dict = self.text_values.as_ref().expect("text_values required");
658 let mut bytes_written = 0u64;
659
660 let count = dict.len() as u32;
662 for key in dict.keys() {
663 let key_bytes = key.as_bytes();
664 writer.write_u32::<LittleEndian>(key_bytes.len() as u32)?;
665 writer.write_all(key_bytes)?;
666 bytes_written += 4 + key_bytes.len() as u64;
667 }
668
669 Ok((count, bytes_written))
670 }
671}
672
673use crate::directories::OwnedBytes;
676
/// One contiguous block of a column, sliced out of the mapped file.
pub struct ColumnBlock {
    // Number of documents in all preceding blocks, i.e. this block's first
    // global doc id.
    pub cumulative_docs: u32,
    // Number of documents stored in this block.
    pub num_docs: u32,
    // Entire block data region. For multi-valued columns this includes the
    // u32 length prefix, the offset column, and the value column.
    pub data: OwnedBytes,
    // Multi-valued columns: the encoded offset column; empty otherwise.
    pub offset_data: OwnedBytes,
    // Multi-valued columns: the encoded value column; empty otherwise.
    pub value_data: OwnedBytes,
    // Per-block text dictionary; present when the block has dictionary
    // entries.
    pub dict: Option<TextDictReader>,
    // Raw bytes backing `dict`; reused when building the merged global
    // dictionary.
    pub raw_dict: OwnedBytes,
}
/// Read-side accessor for one fast-field column, possibly spanning several
/// blocks.
pub struct FastFieldReader {
    // Physical value type of the column.
    pub column_type: FastFieldColumnType,
    // Total documents across all blocks (from the TOC entry).
    pub num_docs: u32,
    // True when documents may hold several values.
    pub multi: bool,

    // Blocks in ascending doc-id order.
    blocks: Vec<ColumnBlock>,

    // Lazily-built merged text dictionary plus per-block ordinal remap
    // tables; only populated for text columns.
    text_state: OnceLock<TextState>,
}

// Merged view over the per-block text dictionaries.
struct TextState {
    // Global dictionary with entries in sorted order.
    global_dict: TextDictReader,
    // Per block: local ordinal -> global ordinal. An empty vec means the
    // identity mapping (single dictionary / no dictionary).
    ordinal_maps: Vec<Vec<u32>>,
}
impl FastFieldReader {
    /// Opens a column reader over the byte region described by `toc`.
    ///
    /// Region layout: `u32` block count, then `num_blocks` fixed-size
    /// `BlockIndexEntry` records, then each block's data bytes immediately
    /// followed by its dictionary bytes.
    pub fn open(file_data: &OwnedBytes, toc: &FastFieldTocEntry) -> io::Result<Self> {
        let region_start = toc.data_offset as usize;
        let region_end = region_start + toc.data_len as usize;

        if region_end > file_data.len() {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "fast field data out of bounds",
            ));
        }

        let raw = file_data.as_slice();

        // Cursor into the raw file bytes, starting at this column's region.
        let mut pos = region_start;
        if pos + 4 > region_end {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "fast field: missing num_blocks",
            ));
        }
        let num_blocks = u32::from_le_bytes(raw[pos..pos + 4].try_into().unwrap());
        pos += 4;

        let idx_size = num_blocks as usize * BLOCK_INDEX_ENTRY_SIZE;
        if pos + idx_size > region_end {
            return Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "fast field: block index truncated",
            ));
        }
        let mut block_entries = Vec::with_capacity(num_blocks as usize);
        {
            let mut cursor = std::io::Cursor::new(&raw[pos..pos + idx_size]);
            for _ in 0..num_blocks {
                block_entries.push(BlockIndexEntry::read_from(&mut cursor)?);
            }
        }
        pos += idx_size;

        let empty = OwnedBytes::new(Vec::new());

        let mut blocks = Vec::with_capacity(num_blocks as usize);
        // Running doc count so each block knows its first global doc id.
        let mut cumulative = 0u32;

        for entry in &block_entries {
            let data_start = pos;
            let data_end = data_start + entry.data_len as usize;
            let dict_start = data_end;
            let dict_end = dict_start + entry.dict_len as usize;

            // NOTE(review): bounds are validated against the whole file, not
            // `region_end`; inconsistent block lengths could read past the
            // declared region — confirm this is intended.
            if dict_end > file_data.len() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "fast field: block data/dict truncated",
                ));
            }

            // Multi-valued block data = u32 offset-column length, the offset
            // column bytes, then the value column bytes.
            let (block_data, offset_data, value_data) = if toc.multi {
                let block_raw = &raw[data_start..data_end];
                if block_raw.len() < 4 {
                    // Degenerate/empty block: expose empty slices.
                    (empty.clone(), empty.clone(), empty.clone())
                } else {
                    let offset_col_len =
                        u32::from_le_bytes(block_raw[0..4].try_into().unwrap()) as usize;
                    let o_start = data_start + 4;
                    let o_end = o_start + offset_col_len;
                    let v_start = o_end;
                    let v_end = data_end;
                    (
                        file_data.slice(data_start..data_end),
                        file_data.slice(o_start..o_end),
                        file_data.slice(v_start..v_end),
                    )
                }
            } else {
                // Single-valued blocks keep everything in `data`.
                (
                    file_data.slice(data_start..data_end),
                    empty.clone(),
                    empty.clone(),
                )
            };

            let dict = if entry.dict_count > 0 {
                Some(TextDictReader::new_lazy(
                    file_data.slice(dict_start..dict_end),
                    entry.dict_count,
                ))
            } else {
                None
            };

            let raw_dict = if entry.dict_len > 0 {
                file_data.slice(dict_start..dict_end)
            } else {
                empty.clone()
            };

            blocks.push(ColumnBlock {
                cumulative_docs: cumulative,
                num_docs: entry.num_docs,
                data: block_data,
                offset_data,
                value_data,
                dict,
                raw_dict,
            });

            cumulative += entry.num_docs;
            pos = dict_end;
        }

        Ok(Self {
            column_type: toc.column_type,
            num_docs: toc.num_docs,
            multi: toc.multi,
            blocks,
            text_state: OnceLock::new(),
        })
    }

    // Lazily builds (at most once) the merged text dictionary and the
    // per-block ordinal remap tables.
    fn ensure_text_state(&self) -> &TextState {
        self.text_state
            .get_or_init(|| Self::build_text_state(&self.blocks))
    }

    // Merges all block dictionaries into one sorted global dictionary and
    // records, per block, the local-ordinal -> global-ordinal mapping.
    fn build_text_state(blocks: &[ColumnBlock]) -> TextState {
        let blocks_with_dict = blocks.iter().filter(|b| b.dict.is_some()).count();
        // Fast path: with at most one dictionary, local ordinals are already
        // global. Empty remap tables signal the identity mapping.
        if blocks_with_dict <= 1 {
            for block in blocks.iter() {
                if let Some(ref dict) = block.dict {
                    return TextState {
                        global_dict: TextDictReader::new_lazy(block.raw_dict.clone(), dict.len()),
                        ordinal_maps: vec![Vec::new(); blocks.len()],
                    };
                }
            }
            return TextState {
                global_dict: TextDictReader::new_lazy(OwnedBytes::new(Vec::new()), 0),
                ordinal_maps: vec![Vec::new(); blocks.len()],
            };
        }

        // Collect every distinct term; the BTreeMap keeps them sorted.
        let mut unique_map: BTreeMap<String, u32> = BTreeMap::new();
        for block in blocks.iter() {
            if let Some(ref dict) = block.dict {
                for ord in 0..dict.len() {
                    if let Some(text) = dict.get(ord) {
                        unique_map.entry(text.to_string()).or_insert(0);
                    }
                }
            }
        }
        // Assign global ordinals in sorted-key order.
        for (i, value) in unique_map.values_mut().enumerate() {
            *value = i as u32;
        }

        // Per-block table: local ordinal -> global ordinal.
        let mut ordinal_maps = Vec::with_capacity(blocks.len());
        for block in blocks.iter() {
            if let Some(ref dict) = block.dict {
                let mut map = Vec::with_capacity(dict.len() as usize);
                for local_ord in 0..dict.len() {
                    let text = dict
                        .get(local_ord)
                        .expect("block dict ordinal out of range");
                    let global_ord = *unique_map
                        .get(text)
                        .expect("block dict entry not found in merged global dict");
                    map.push(global_ord);
                }
                ordinal_maps.push(map);
            } else {
                ordinal_maps.push(Vec::new());
            }
        }

        // Re-serialize the merged dictionary in the on-disk record format
        // ((u32 length, bytes) per entry) so TextDictReader can consume it.
        let mut dict_buf = Vec::new();
        let count = unique_map.len() as u32;
        for s in unique_map.keys() {
            let bytes = s.as_bytes();
            dict_buf.extend_from_slice(&(bytes.len() as u32).to_le_bytes());
            dict_buf.extend_from_slice(bytes);
        }

        TextState {
            global_dict: TextDictReader::new_lazy(OwnedBytes::new(dict_buf), count),
            ordinal_maps,
        }
    }

    // Converts a block-local text ordinal into a global ordinal. Identity
    // for non-text columns, missing values, and single-block readers (whose
    // remap tables are empty).
    #[inline]
    fn remap_ordinal(&self, block_idx: usize, raw: u64) -> u64 {
        if self.column_type == FastFieldColumnType::TextOrdinal
            && raw != FAST_FIELD_MISSING
            && self.blocks.len() > 1
        {
            let state = self.ensure_text_state();
            let map = &state.ordinal_maps[block_idx];
            if !map.is_empty() {
                let idx = raw as usize;
                if idx < map.len() {
                    map[idx] as u64
                } else {
                    FAST_FIELD_MISSING
                }
            } else {
                raw
            }
        } else {
            raw
        }
    }

    // Maps a global doc id to (block index, doc id local to that block).
    #[inline]
    fn find_block(&self, doc_id: u32) -> (usize, u32) {
        debug_assert!(!self.blocks.is_empty());
        if self.blocks.len() == 1 {
            return (0, doc_id);
        }
        // partition_point finds the first block that starts after doc_id;
        // step back one to get the containing block.
        let bi = self
            .blocks
            .partition_point(|b| b.cumulative_docs <= doc_id)
            .saturating_sub(1);
        (bi, doc_id - self.blocks[bi].cumulative_docs)
    }

    /// Returns the encoded value for `doc_id`, or `FAST_FIELD_MISSING` when
    /// the doc is out of range or has no value. For multi-valued columns
    /// this returns the first value of the doc's run.
    #[inline]
    pub fn get_u64(&self, doc_id: u32) -> u64 {
        if doc_id >= self.num_docs {
            return FAST_FIELD_MISSING;
        }
        let (bi, local) = self.find_block(doc_id);
        let block = &self.blocks[bi];

        if self.multi {
            let start = codec::auto_read(block.offset_data.as_slice(), local as usize) as u32;
            let end = codec::auto_read(block.offset_data.as_slice(), local as usize + 1) as u32;
            if start >= end {
                return FAST_FIELD_MISSING;
            }
            let raw = codec::auto_read(block.value_data.as_slice(), start as usize);
            return self.remap_ordinal(bi, raw);
        }

        let raw = codec::auto_read(block.data.as_slice(), local as usize);
        self.remap_ordinal(bi, raw)
    }

    // Returns (block index, start, end) of the doc's value run inside its
    // block's value column; (0, 0, 0) for single-valued columns or
    // out-of-range doc ids.
    #[inline]
    fn block_value_range(&self, doc_id: u32) -> (usize, u32, u32) {
        if !self.multi || doc_id >= self.num_docs {
            return (0, 0, 0);
        }
        let (bi, local) = self.find_block(doc_id);
        let block = &self.blocks[bi];
        let start = codec::auto_read(block.offset_data.as_slice(), local as usize) as u32;
        let end = codec::auto_read(block.offset_data.as_slice(), local as usize + 1) as u32;
        (bi, start, end)
    }

    /// Returns the `[start, end)` range of `doc_id`'s values within its
    /// block's value column; `(0, 0)` when the doc has no values.
    ///
    /// NOTE(review): the block index is dropped here, so these indices are
    /// only unambiguous on single-block readers — confirm callers pair this
    /// with `get_value_at` on single-block segments only.
    #[inline]
    pub fn value_range(&self, doc_id: u32) -> (u32, u32) {
        let (_, start, end) = self.block_value_range(doc_id);
        (start, end)
    }

    /// Reads the value at `index` in the value column of a single-block
    /// reader. Returns 0 for multi-block readers — this accessor does not
    /// support merged columns.
    #[inline]
    pub fn get_value_at(&self, index: u32) -> u64 {
        if self.blocks.len() == 1 {
            let raw = codec::auto_read(self.blocks[0].value_data.as_slice(), index as usize);
            return self.remap_ordinal(0, raw);
        }
        0
    }

    /// Collects all values of `doc_id` (globally remapped for text columns);
    /// empty when the doc has none.
    pub fn get_multi_values(&self, doc_id: u32) -> Vec<u64> {
        let (bi, start, end) = self.block_value_range(doc_id);
        if start >= end {
            return Vec::new();
        }
        let block = &self.blocks[bi];
        (start..end)
            .map(|idx| {
                let raw = codec::auto_read(block.value_data.as_slice(), idx as usize);
                self.remap_ordinal(bi, raw)
            })
            .collect()
    }

    /// Calls `f` for each value of `doc_id`, stopping early when `f` returns
    /// true. Returns true iff `f` short-circuited.
    #[inline]
    pub fn for_each_multi_value(&self, doc_id: u32, mut f: impl FnMut(u64) -> bool) -> bool {
        let (bi, start, end) = self.block_value_range(doc_id);
        if start >= end {
            return false;
        }
        let block = &self.blocks[bi];
        for idx in start..end {
            let raw = codec::auto_read(block.value_data.as_slice(), idx as usize);
            if f(self.remap_ordinal(bi, raw)) {
                return true;
            }
        }
        false
    }

    /// Visits every document of a single-valued column in doc-id order,
    /// calling `f(doc_id, encoded_value)`. Values are decoded in batches of
    /// 256. No-op for multi-valued columns.
    pub fn scan_single_values(&self, mut f: impl FnMut(u32, u64)) {
        if self.multi {
            return;
        }
        const BATCH: usize = 256;
        let mut buf = [0u64; BATCH];
        // Ordinal remapping is only needed for merged multi-block text
        // columns.
        let needs_remap =
            self.column_type == FastFieldColumnType::TextOrdinal && self.blocks.len() > 1;

        let ordinal_maps = if needs_remap {
            Some(&self.ensure_text_state().ordinal_maps)
        } else {
            None
        };

        for (block_idx, block) in self.blocks.iter().enumerate() {
            let n = block.num_docs as usize;
            let mut pos = 0;

            let map = ordinal_maps.map(|maps| &maps[block_idx]);
            let has_map = map.is_some_and(|m| !m.is_empty());

            while pos < n {
                let chunk = (n - pos).min(BATCH);
                codec::auto_read_batch(block.data.as_slice(), pos, &mut buf[..chunk]);

                if has_map {
                    let map = map.unwrap();
                    for (i, &raw) in buf[..chunk].iter().enumerate() {
                        // Same remap rule as remap_ordinal(), inlined for the
                        // batch loop.
                        let val = if raw != FAST_FIELD_MISSING {
                            let idx = raw as usize;
                            if idx < map.len() {
                                map[idx] as u64
                            } else {
                                FAST_FIELD_MISSING
                            }
                        } else {
                            raw
                        };
                        f(block.cumulative_docs + pos as u32 + i as u32, val);
                    }
                } else {
                    for (i, &val) in buf[..chunk].iter().enumerate() {
                        f(block.cumulative_docs + pos as u32 + i as u32, val);
                    }
                }
                pos += chunk;
            }
        }
    }

    /// True when `doc_id` holds at least one non-missing value.
    #[inline]
    pub fn has_value(&self, doc_id: u32) -> bool {
        if !self.multi {
            return doc_id < self.num_docs && self.get_u64(doc_id) != FAST_FIELD_MISSING;
        }
        let (_, start, end) = self.block_value_range(doc_id);
        start < end
    }

    /// Decoded signed value. Note: a missing value (u64::MAX) zigzag-decodes
    /// to i64::MIN; callers that care should check `has_value` first.
    #[inline]
    pub fn get_i64(&self, doc_id: u32) -> i64 {
        zigzag_decode(self.get_u64(doc_id))
    }

    /// Decoded float value. Note: a missing value (u64::MAX) decodes to a
    /// NaN bit pattern; callers that care should check `has_value` first.
    #[inline]
    pub fn get_f64(&self, doc_id: u32) -> f64 {
        sortable_u64_to_f64(self.get_u64(doc_id))
    }

    /// Text ordinal of `doc_id` (alias of `get_u64` for text columns).
    #[inline]
    pub fn get_ordinal(&self, doc_id: u32) -> u64 {
        self.get_u64(doc_id)
    }

    /// Resolves `doc_id`'s text through the block-local dictionary. Returns
    /// `None` for out-of-range docs, missing values, or blocks without a
    /// dictionary. For multi-valued docs, returns the first value.
    pub fn get_text(&self, doc_id: u32) -> Option<&str> {
        if doc_id >= self.num_docs {
            return None;
        }
        let (bi, local) = self.find_block(doc_id);
        let block = &self.blocks[bi];
        let raw_ordinal = if self.multi {
            let start = codec::auto_read(block.offset_data.as_slice(), local as usize) as u32;
            let end = codec::auto_read(block.offset_data.as_slice(), local as usize + 1) as u32;
            if start >= end {
                return None;
            }
            codec::auto_read(block.value_data.as_slice(), start as usize)
        } else {
            codec::auto_read(block.data.as_slice(), local as usize)
        };
        if raw_ordinal == FAST_FIELD_MISSING {
            return None;
        }
        // Lookup uses the block-local ordinal against the block-local dict.
        block.dict.as_ref().and_then(|d| d.get(raw_ordinal as u32))
    }

    /// Global ordinal of `text`, or `None` if absent / not a text column.
    pub fn text_ordinal(&self, text: &str) -> Option<u64> {
        if self.column_type != FastFieldColumnType::TextOrdinal {
            return None;
        }
        self.ensure_text_state().global_dict.ordinal(text)
    }

    /// Merged global dictionary; `None` for non-text columns.
    pub fn text_dict(&self) -> Option<&TextDictReader> {
        if self.column_type != FastFieldColumnType::TextOrdinal {
            return None;
        }
        Some(&self.ensure_text_state().global_dict)
    }

    /// Number of blocks backing this column.
    pub fn num_blocks(&self) -> usize {
        self.blocks.len()
    }

    /// Raw access to the underlying blocks.
    pub fn blocks(&self) -> &[ColumnBlock] {
        &self.blocks
    }
}
/// Lazy reader over a text dictionary stored as consecutive
/// (u32 little-endian length, UTF-8 bytes) records in sorted order.
pub struct TextDictReader {
    // Raw dictionary bytes (exactly `count` records).
    data: OwnedBytes,
    // Number of entries.
    count: u32,
    // Lazily-computed (offset, len) of each entry within `data`.
    offsets: OnceLock<Vec<(u32, u32)>>,
}
impl TextDictReader {
    /// Wraps raw dictionary bytes without validating them; entry offsets are
    /// computed lazily on first access.
    pub fn new_lazy(data: OwnedBytes, count: u32) -> Self {
        Self {
            data,
            count,
            offsets: OnceLock::new(),
        }
    }

    /// Validates a dictionary of `count` records starting at `dict_start`
    /// and returns a lazy reader over exactly those bytes.
    pub fn open(file_data: &OwnedBytes, dict_start: usize, count: u32) -> io::Result<Self> {
        if count == 0 {
            return Ok(Self::new_lazy(OwnedBytes::new(Vec::new()), 0));
        }
        let dict_slice = file_data.as_slice();
        // Walk the records once to find the end and verify nothing is
        // truncated before handing the bytes to the lazy reader.
        let mut pos = dict_start;
        for _ in 0..count {
            if pos + 4 > dict_slice.len() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "text dict truncated",
                ));
            }
            let len = u32::from_le_bytes(dict_slice[pos..pos + 4].try_into().unwrap()) as usize;
            pos += 4;
            if pos + len > dict_slice.len() {
                return Err(io::Error::new(
                    io::ErrorKind::UnexpectedEof,
                    "text dict entry truncated",
                ));
            }
            pos += len;
        }
        let data = file_data.slice(dict_start..pos);
        Ok(Self::new_lazy(data, count))
    }

    /// Wraps an already-sliced dictionary without re-validating it.
    pub fn open_from_raw(raw_dict: &OwnedBytes, count: u32) -> io::Result<Self> {
        Ok(Self::new_lazy(raw_dict.clone(), count))
    }

    // Computes (offset, len) of every entry on first use; later calls return
    // the cached table. Truncation is only debug-asserted here — callers are
    // expected to have validated the bytes (see `open`); in release builds a
    // truncated buffer would panic on the slice below.
    #[inline]
    fn ensure_offsets(&self) -> &[(u32, u32)] {
        self.offsets.get_or_init(|| {
            let dict_slice = self.data.as_slice();
            let mut pos = 0usize;
            let mut offsets = Vec::with_capacity(self.count as usize);
            for _ in 0..self.count {
                debug_assert!(
                    pos + 4 <= dict_slice.len(),
                    "text dict truncated during lazy init"
                );
                let len = u32::from_le_bytes(dict_slice[pos..pos + 4].try_into().unwrap()) as usize;
                pos += 4;
                debug_assert!(
                    pos + len <= dict_slice.len(),
                    "text dict entry truncated during lazy init"
                );
                offsets.push((pos as u32, len as u32));
                pos += len;
            }
            offsets
        })
    }

    /// Returns the entry at `ordinal`, or `None` when out of range.
    pub fn get(&self, ordinal: u32) -> Option<&str> {
        let offsets = self.ensure_offsets();
        let &(off, len) = offsets.get(ordinal as usize)?;
        let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
        // SAFETY: dictionary entries are written from Rust strings (see
        // `write_text_dictionary`), which are always valid UTF-8. A corrupt
        // file would break this invariant — NOTE(review): consider a checked
        // `str::from_utf8` if untrusted input is possible.
        Some(unsafe { std::str::from_utf8_unchecked(slice) })
    }

    /// Binary-searches for `text`; relies on entries being stored in sorted
    /// order (they are written from BTreeMap keys).
    pub fn ordinal(&self, text: &str) -> Option<u64> {
        let offsets = self.ensure_offsets();
        offsets
            .binary_search_by(|&(off, len)| {
                let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
                // SAFETY: same invariant as `get` — entries originate from
                // valid UTF-8 strings.
                let entry = unsafe { std::str::from_utf8_unchecked(slice) };
                entry.cmp(text)
            })
            .ok()
            .map(|i| i as u64)
    }

    /// Number of entries in the dictionary.
    pub fn len(&self) -> u32 {
        self.count
    }

    /// True when the dictionary holds no entries.
    pub fn is_empty(&self) -> bool {
        self.count == 0
    }

    /// Iterates entries in stored (sorted) order.
    pub fn iter(&self) -> impl Iterator<Item = &str> {
        let offsets = self.ensure_offsets();
        offsets.iter().map(|&(off, len)| {
            let slice = &self.data.as_slice()[off as usize..off as usize + len as usize];
            // SAFETY: entries originate from valid UTF-8 strings.
            unsafe { std::str::from_utf8_unchecked(slice) }
        })
    }
}
1362pub fn write_fast_field_toc_and_footer(
1366 writer: &mut dyn Write,
1367 toc_offset: u64,
1368 entries: &[FastFieldTocEntry],
1369) -> io::Result<()> {
1370 for e in entries {
1371 e.write_to(writer)?;
1372 }
1373 writer.write_u64::<LittleEndian>(toc_offset)?;
1374 writer.write_u32::<LittleEndian>(entries.len() as u32)?;
1375 writer.write_u32::<LittleEndian>(FAST_FIELD_MAGIC)?;
1376 Ok(())
1377}
1378
1379pub fn read_fast_field_footer(file_data: &[u8]) -> io::Result<(u64, u32)> {
1382 let len = file_data.len();
1383 if len < FAST_FIELD_FOOTER_SIZE as usize {
1384 return Err(io::Error::new(
1385 io::ErrorKind::UnexpectedEof,
1386 "fast field file too small for footer",
1387 ));
1388 }
1389 let footer = &file_data[len - FAST_FIELD_FOOTER_SIZE as usize..];
1390 let mut cursor = std::io::Cursor::new(footer);
1391 let toc_offset = cursor.read_u64::<LittleEndian>()?;
1392 let num_columns = cursor.read_u32::<LittleEndian>()?;
1393 let magic = cursor.read_u32::<LittleEndian>()?;
1394 if magic != FAST_FIELD_MAGIC {
1395 return Err(io::Error::new(
1396 io::ErrorKind::InvalidData,
1397 format!("bad fast field magic: 0x{:08x}", magic),
1398 ));
1399 }
1400 Ok((toc_offset, num_columns))
1401}
1402
1403pub fn read_fast_field_toc(
1405 file_data: &[u8],
1406 toc_offset: u64,
1407 num_columns: u32,
1408) -> io::Result<Vec<FastFieldTocEntry>> {
1409 let start = toc_offset as usize;
1410 let expected = num_columns as usize * FAST_FIELD_TOC_ENTRY_SIZE;
1411 if start + expected > file_data.len() {
1412 return Err(io::Error::new(
1413 io::ErrorKind::UnexpectedEof,
1414 "fast field TOC out of bounds",
1415 ));
1416 }
1417 let mut cursor = std::io::Cursor::new(&file_data[start..start + expected]);
1418 let mut entries = Vec::with_capacity(num_columns as usize);
1419 for _ in 0..num_columns {
1420 entries.push(FastFieldTocEntry::read_from(&mut cursor)?);
1421 }
1422 Ok(entries)
1423}
1424
1425#[cfg(test)]
1428mod tests {
1429 use super::*;
1430
1431 #[test]
1432 fn test_zigzag_roundtrip() {
1433 for v in [0i64, 1, -1, 42, -42, i64::MAX, i64::MIN] {
1434 assert_eq!(zigzag_decode(zigzag_encode(v)), v);
1435 }
1436 }
1437
1438 #[test]
1439 fn test_f64_sortable_roundtrip() {
1440 for v in [0.0f64, 1.0, -1.0, f64::MAX, f64::MIN, f64::MIN_POSITIVE] {
1441 assert_eq!(sortable_u64_to_f64(f64_to_sortable_u64(v)), v);
1442 }
1443 }
1444
1445 #[test]
1446 fn test_f64_sortable_order() {
1447 let values = [-100.0f64, -1.0, -0.0, 0.0, 0.5, 1.0, 100.0];
1448 let encoded: Vec<u64> = values.iter().map(|&v| f64_to_sortable_u64(v)).collect();
1449 for i in 1..encoded.len() {
1450 assert!(
1451 encoded[i] >= encoded[i - 1],
1452 "{} >= {} failed for {} vs {}",
1453 encoded[i],
1454 encoded[i - 1],
1455 values[i],
1456 values[i - 1]
1457 );
1458 }
1459 }
1460
1461 #[test]
1462 fn test_bitpack_roundtrip() {
1463 let values: Vec<u64> = vec![0, 3, 7, 15, 0, 1, 6, 12];
1464 let bpv = 4u8;
1465 let mut packed = Vec::new();
1466 bitpack_write(&values, bpv, &mut packed);
1467
1468 for (i, &expected) in values.iter().enumerate() {
1469 let got = bitpack_read(&packed, bpv, i);
1470 assert_eq!(got, expected, "index {}", i);
1471 }
1472 }
1473
1474 #[test]
1475 fn test_bitpack_high_bpv_regression() {
1476 for bpv in [57u8, 58, 59, 60, 63, 64] {
1479 let max_val = if bpv == 64 {
1480 u64::MAX
1481 } else {
1482 (1u64 << bpv) - 1
1483 };
1484 let values: Vec<u64> = (0..32)
1485 .map(|i: u64| {
1486 if max_val == u64::MAX {
1487 i * 7
1488 } else {
1489 (i * 7) % (max_val + 1)
1490 }
1491 })
1492 .collect();
1493 let mut packed = Vec::new();
1494 bitpack_write(&values, bpv, &mut packed);
1495 for (i, &expected) in values.iter().enumerate() {
1496 let got = bitpack_read(&packed, bpv, i);
1497 assert_eq!(got, expected, "high bpv={} index={}", bpv, i);
1498 }
1499 }
1500 }
1501
1502 #[test]
1503 fn test_bitpack_various_widths() {
1504 for bpv in [1u8, 2, 3, 5, 7, 8, 13, 16, 32, 64] {
1505 let max_val = if bpv == 64 {
1506 u64::MAX
1507 } else {
1508 (1u64 << bpv) - 1
1509 };
1510 let values: Vec<u64> = (0..100)
1511 .map(|i: u64| {
1512 if max_val == u64::MAX {
1513 i
1514 } else {
1515 i % (max_val + 1)
1516 }
1517 })
1518 .collect();
1519 let mut packed = Vec::new();
1520 bitpack_write(&values, bpv, &mut packed);
1521
1522 for (i, &expected) in values.iter().enumerate() {
1523 let got = bitpack_read(&packed, bpv, i);
1524 assert_eq!(got, expected, "bpv={} index={}", bpv, i);
1525 }
1526 }
1527 }
1528
    // Test helper: wraps a raw byte buffer in `OwnedBytes` so the reader
    // APIs below can be fed in-memory data.
    fn owned(buf: Vec<u8>) -> OwnedBytes {
        OwnedBytes::new(buf)
    }
1533
1534 #[test]
1535 fn test_writer_reader_u64_roundtrip() {
1536 let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
1537 writer.add_u64(0, 100);
1538 writer.add_u64(1, 200);
1539 writer.add_u64(2, 150);
1540 writer.add_u64(4, 300); writer.pad_to(5);
1542
1543 let mut buf = Vec::new();
1544 let (mut toc, _bytes) = writer.serialize(&mut buf, 0).unwrap();
1545 toc.field_id = 42;
1546
1547 let toc_offset = buf.len() as u64;
1549 write_fast_field_toc_and_footer(&mut buf, toc_offset, &[toc]).unwrap();
1550
1551 let ob = owned(buf);
1553 let (toc_off, num_cols) = read_fast_field_footer(&ob).unwrap();
1554 assert_eq!(num_cols, 1);
1555 let tocs = read_fast_field_toc(&ob, toc_off, num_cols).unwrap();
1556 assert_eq!(tocs.len(), 1);
1557 assert_eq!(tocs[0].field_id, 42);
1558
1559 let reader = FastFieldReader::open(&ob, &tocs[0]).unwrap();
1560 assert_eq!(reader.get_u64(0), 100);
1561 assert_eq!(reader.get_u64(1), 200);
1562 assert_eq!(reader.get_u64(2), 150);
1563 assert_eq!(reader.get_u64(3), FAST_FIELD_MISSING); assert_eq!(reader.get_u64(4), 300);
1565 }
1566
1567 #[test]
1568 fn test_writer_reader_i64_roundtrip() {
1569 let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::I64);
1570 writer.add_i64(0, -100);
1571 writer.add_i64(1, 50);
1572 writer.add_i64(2, 0);
1573 writer.pad_to(3);
1574
1575 let mut buf = Vec::new();
1576 let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
1577 let ob = owned(buf);
1578 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1579 assert_eq!(reader.get_i64(0), -100);
1580 assert_eq!(reader.get_i64(1), 50);
1581 assert_eq!(reader.get_i64(2), 0);
1582 }
1583
1584 #[test]
1585 fn test_writer_reader_f64_roundtrip() {
1586 let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::F64);
1587 writer.add_f64(0, -1.5);
1588 writer.add_f64(1, 3.15);
1589 writer.add_f64(2, 0.0);
1590 writer.pad_to(3);
1591
1592 let mut buf = Vec::new();
1593 let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
1594 let ob = owned(buf);
1595 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1596 assert_eq!(reader.get_f64(0), -1.5);
1597 assert_eq!(reader.get_f64(1), 3.15);
1598 assert_eq!(reader.get_f64(2), 0.0);
1599 }
1600
1601 #[test]
1602 fn test_writer_reader_text_roundtrip() {
1603 let mut writer = FastFieldWriter::new_text();
1604 writer.add_text(0, "banana");
1605 writer.add_text(1, "apple");
1606 writer.add_text(2, "cherry");
1607 writer.add_text(3, "apple"); writer.pad_to(5);
1610
1611 let mut buf = Vec::new();
1612 let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
1613 let ob = owned(buf);
1614 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1615
1616 assert_eq!(reader.get_text(0), Some("banana"));
1618 assert_eq!(reader.get_text(1), Some("apple"));
1619 assert_eq!(reader.get_text(2), Some("cherry"));
1620 assert_eq!(reader.get_text(3), Some("apple"));
1621 assert_eq!(reader.get_text(4), None); assert_eq!(reader.text_ordinal("apple"), Some(0));
1625 assert_eq!(reader.text_ordinal("banana"), Some(1));
1626 assert_eq!(reader.text_ordinal("cherry"), Some(2));
1627 assert_eq!(reader.text_ordinal("durian"), None);
1628 }
1629
1630 #[test]
1631 fn test_constant_column() {
1632 let mut writer = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
1633 for i in 0..100 {
1634 writer.add_u64(i, 42);
1635 }
1636
1637 let mut buf = Vec::new();
1638 let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
1639
1640 let ob = owned(buf);
1641 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1642 for i in 0..100 {
1643 assert_eq!(reader.get_u64(i), 42);
1644 }
1645 }
1646
1647 #[test]
1650 fn test_multi_value_u64_roundtrip() {
1651 let mut writer = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
1652 writer.add_u64(0, 10);
1654 writer.add_u64(0, 20);
1655 writer.add_u64(0, 30);
1656 writer.add_u64(2, 100);
1659 writer.add_u64(3, 5);
1661 writer.add_u64(3, 15);
1662 writer.pad_to(4);
1663
1664 let mut buf = Vec::new();
1665 let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
1666 assert!(toc.multi);
1667 assert_eq!(toc.num_docs, 4);
1668
1669 let ob = owned(buf);
1670 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1671 assert!(reader.multi);
1672
1673 assert_eq!(reader.get_u64(0), 10);
1675 let (s, e) = reader.value_range(0);
1676 assert_eq!(e - s, 3);
1677 assert_eq!(reader.get_value_at(s), 10);
1678 assert_eq!(reader.get_value_at(s + 1), 20);
1679 assert_eq!(reader.get_value_at(s + 2), 30);
1680
1681 assert_eq!(reader.get_u64(1), FAST_FIELD_MISSING);
1683 let (s, e) = reader.value_range(1);
1684 assert_eq!(s, e);
1685 assert!(!reader.has_value(1));
1686
1687 assert_eq!(reader.get_u64(2), 100);
1689 assert!(reader.has_value(2));
1690
1691 assert_eq!(reader.get_u64(3), 5);
1693 let (s, e) = reader.value_range(3);
1694 assert_eq!(e - s, 2);
1695 assert_eq!(reader.get_value_at(s), 5);
1696 assert_eq!(reader.get_value_at(s + 1), 15);
1697 }
1698
1699 #[test]
1700 fn test_multi_value_text_roundtrip() {
1701 let mut writer = FastFieldWriter::new_text_multi();
1702 writer.add_text(0, "banana");
1704 writer.add_text(0, "apple");
1705 writer.add_text(1, "cherry");
1707 writer.pad_to(3);
1709
1710 let mut buf = Vec::new();
1711 let (toc, _) = writer.serialize(&mut buf, 0).unwrap();
1712 assert!(toc.multi);
1713
1714 let ob = owned(buf);
1715 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1716
1717 let (s, e) = reader.value_range(0);
1719 assert_eq!(e - s, 2);
1720 let ord0 = reader.get_value_at(s);
1721 let ord1 = reader.get_value_at(s + 1);
1722 assert_eq!(reader.text_dict().unwrap().get(ord0 as u32), Some("banana"));
1723 assert_eq!(reader.text_dict().unwrap().get(ord1 as u32), Some("apple"));
1724
1725 let (s, e) = reader.value_range(1);
1727 assert_eq!(e - s, 1);
1728 let ord = reader.get_value_at(s);
1729 assert_eq!(reader.text_dict().unwrap().get(ord as u32), Some("cherry"));
1730
1731 assert!(!reader.has_value(2));
1733 }
1734
1735 #[test]
1736 fn test_multi_value_full_toc_roundtrip() {
1737 let mut writer = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
1738 writer.add_u64(0, 1);
1739 writer.add_u64(0, 2);
1740 writer.add_u64(1, 3);
1741 writer.pad_to(2);
1742
1743 let mut buf = Vec::new();
1744 let (mut toc, _) = writer.serialize(&mut buf, 0).unwrap();
1745 toc.field_id = 7;
1746
1747 let toc_offset = buf.len() as u64;
1748 write_fast_field_toc_and_footer(&mut buf, toc_offset, &[toc]).unwrap();
1749
1750 let ob = owned(buf);
1751 let (toc_off, num_cols) = read_fast_field_footer(&ob).unwrap();
1752 let tocs = read_fast_field_toc(&ob, toc_off, num_cols).unwrap();
1753 assert_eq!(tocs[0].field_id, 7);
1754 assert!(tocs[0].multi);
1755
1756 let reader = FastFieldReader::open(&ob, &tocs[0]).unwrap();
1757 assert_eq!(reader.get_u64(0), 1);
1758 assert_eq!(reader.get_u64(1), 3);
1759 }
1760
    // Serializes `writer` as a standalone single-block column and splits the
    // result back into (data bytes, dict bytes, index entry) so tests can
    // stitch several blocks into one synthetic multi-block column.
    //
    // NOTE(review): this assumes the writer's on-disk layout is
    // [u32 block count][BlockIndexEntry][data][dict] — confirm against
    // `FastFieldWriter::serialize` if that format ever changes.
    fn serialize_single_block(writer: &mut FastFieldWriter) -> (Vec<u8>, Vec<u8>, BlockIndexEntry) {
        let mut buf = Vec::new();
        let (_toc, _) = writer.serialize(&mut buf, 0).unwrap();
        // Skip the leading 4-byte block count to reach the index entry.
        let mut cursor = std::io::Cursor::new(&buf[4..4 + BLOCK_INDEX_ENTRY_SIZE]);
        let entry = BlockIndexEntry::read_from(&mut cursor).unwrap();
        let data_start = 4 + BLOCK_INDEX_ENTRY_SIZE;
        let data_end = data_start + entry.data_len as usize;
        let dict_end = data_end + entry.dict_len as usize;
        let data = buf[data_start..data_end].to_vec();
        // Numeric columns carry no dictionary, so the dict slice may be empty.
        let dict = if dict_end > data_end {
            buf[data_end..dict_end].to_vec()
        } else {
            Vec::new()
        };
        (data, dict, entry)
    }
1780
    // Assembles a complete multi-block column file from pre-serialized
    // blocks, in the layout the reader expects:
    // [u32 block count][block index entries][per-block data+dict][TOC][footer].
    //
    // Each `blocks` tuple is (num_docs, data bytes, dict_count, dict bytes),
    // matching what `serialize_single_block` returns.
    fn assemble_blocked_column(
        field_id: u32,
        column_type: FastFieldColumnType,
        multi: bool,
        blocks: &[(u32, &[u8], u32, &[u8])],
    ) -> (Vec<u8>, FastFieldTocEntry) {
        use byteorder::{LittleEndian, WriteBytesExt};

        let mut buf = Vec::new();
        let num_blocks = blocks.len() as u32;

        buf.write_u32::<LittleEndian>(num_blocks).unwrap();

        // Block index first: one fixed-size entry per block.
        for &(num_docs, data, dict_count, dict) in blocks {
            let entry = BlockIndexEntry {
                num_docs,
                data_len: data.len() as u32,
                dict_count,
                dict_len: dict.len() as u32,
            };
            entry.write_to(&mut buf).unwrap();
        }

        // Then the raw payloads: data followed by dict for each block,
        // in the same order as the index entries.
        let mut total_docs = 0u32;
        for &(num_docs, data, _, dict) in blocks {
            buf.extend_from_slice(data);
            buf.extend_from_slice(dict);
            total_docs += num_docs;
        }

        let data_len = buf.len() as u64;

        // Blocked columns keep per-block dictionaries, so the top-level
        // dict_offset/dict_count remain zero here.
        let toc = FastFieldTocEntry {
            field_id,
            column_type,
            multi,
            data_offset: 0,
            data_len,
            num_docs: total_docs,
            dict_offset: 0,
            dict_count: 0,
        };

        let toc_offset = buf.len() as u64;
        write_fast_field_toc_and_footer(&mut buf, toc_offset, std::slice::from_ref(&toc)).unwrap();

        (buf, toc)
    }
1834
1835 #[test]
1836 fn test_multi_block_numeric_roundtrip() {
1837 let mut wa = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
1839 wa.add_u64(0, 10);
1840 wa.add_u64(1, 20);
1841 wa.add_u64(2, 30);
1842 let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
1843
1844 let mut wb = FastFieldWriter::new_numeric(FastFieldColumnType::U64);
1846 wb.add_u64(0, 40);
1847 wb.add_u64(1, 50);
1848 let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
1849
1850 let (buf, toc) = assemble_blocked_column(
1851 1,
1852 FastFieldColumnType::U64,
1853 false,
1854 &[
1855 (entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
1856 (entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
1857 ],
1858 );
1859
1860 let ob = owned(buf);
1861 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1862
1863 assert_eq!(reader.num_docs, 5);
1864 assert_eq!(reader.num_blocks(), 2);
1865 assert_eq!(reader.get_u64(0), 10);
1866 assert_eq!(reader.get_u64(1), 20);
1867 assert_eq!(reader.get_u64(2), 30);
1868 assert_eq!(reader.get_u64(3), 40);
1869 assert_eq!(reader.get_u64(4), 50);
1870 }
1871
1872 #[test]
1873 fn test_multi_block_text_roundtrip() {
1874 let mut wa = FastFieldWriter::new_text();
1876 wa.add_text(0, "alpha");
1877 wa.add_text(1, "beta");
1878 let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
1879
1880 let mut wb = FastFieldWriter::new_text();
1882 wb.add_text(0, "gamma");
1883 wb.add_text(1, "alpha");
1884 let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
1885
1886 let (buf, toc) = assemble_blocked_column(
1887 2,
1888 FastFieldColumnType::TextOrdinal,
1889 false,
1890 &[
1891 (entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
1892 (entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
1893 ],
1894 );
1895
1896 let ob = owned(buf);
1897 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1898
1899 assert_eq!(reader.num_docs, 4);
1900 assert_eq!(reader.num_blocks(), 2);
1901
1902 assert_eq!(reader.text_dict().unwrap().len(), 3);
1904
1905 assert_eq!(reader.get_text(0), Some("alpha"));
1907 assert_eq!(reader.get_text(1), Some("beta"));
1908
1909 assert_eq!(reader.get_text(2), Some("gamma"));
1911 assert_eq!(reader.get_text(3), Some("alpha"));
1912
1913 assert_eq!(reader.text_ordinal("alpha"), Some(0));
1915 assert_eq!(reader.text_ordinal("beta"), Some(1));
1916 assert_eq!(reader.text_ordinal("gamma"), Some(2));
1917
1918 assert_eq!(reader.get_u64(0), 0); assert_eq!(reader.get_u64(1), 1); assert_eq!(reader.get_u64(2), 2); assert_eq!(reader.get_u64(3), 0); }
1924
1925 #[test]
1934 fn test_multi_block_text_ordinal_mismatch_regression() {
1935 let mut wa = FastFieldWriter::new_text();
1937 wa.add_text(0, "book");
1938 wa.add_text(1, "wiki");
1939 let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
1940
1941 let mut wb = FastFieldWriter::new_text();
1943 wb.add_text(0, "apple");
1944 wb.add_text(1, "wiki");
1945 let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
1946
1947 let (buf, toc) = assemble_blocked_column(
1948 2,
1949 FastFieldColumnType::TextOrdinal,
1950 false,
1951 &[
1952 (entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
1953 (entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
1954 ],
1955 );
1956
1957 let ob = owned(buf);
1958 let reader = FastFieldReader::open(&ob, &toc).unwrap();
1959
1960 assert_eq!(reader.text_dict().unwrap().len(), 3);
1962 assert_eq!(reader.text_ordinal("apple"), Some(0));
1963 assert_eq!(reader.text_ordinal("book"), Some(1));
1964 assert_eq!(reader.text_ordinal("wiki"), Some(2));
1965
1966 assert_eq!(reader.get_u64(0), 1); assert_eq!(reader.get_u64(1), 2); assert_eq!(reader.get_u64(2), 0); assert_eq!(reader.get_u64(3), 2); let wiki_ord = reader.text_ordinal("wiki").unwrap();
1974 assert_eq!(reader.get_u64(1), wiki_ord, "wiki doc should match");
1975 assert_eq!(reader.get_u64(3), wiki_ord, "wiki doc should match");
1976 assert_ne!(reader.get_u64(0), wiki_ord, "book doc must NOT match wiki");
1977 assert_ne!(reader.get_u64(2), wiki_ord, "apple doc must NOT match wiki");
1978 }
1979
1980 #[test]
1981 fn test_multi_block_multi_value_numeric() {
1982 let mut wa = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
1984 wa.add_u64(0, 1);
1985 wa.add_u64(0, 2);
1986 wa.add_u64(1, 3);
1987 wa.pad_to(2);
1988 let (data_a, dict_a, entry_a) = serialize_single_block(&mut wa);
1989
1990 let mut wb = FastFieldWriter::new_numeric_multi(FastFieldColumnType::U64);
1992 wb.add_u64(0, 4);
1993 wb.add_u64(0, 5);
1994 wb.add_u64(0, 6);
1995 wb.pad_to(2);
1996 let (data_b, dict_b, entry_b) = serialize_single_block(&mut wb);
1997
1998 let (buf, toc) = assemble_blocked_column(
1999 3,
2000 FastFieldColumnType::U64,
2001 true,
2002 &[
2003 (entry_a.num_docs, &data_a, entry_a.dict_count, &dict_a),
2004 (entry_b.num_docs, &data_b, entry_b.dict_count, &dict_b),
2005 ],
2006 );
2007
2008 let ob = owned(buf);
2009 let reader = FastFieldReader::open(&ob, &toc).unwrap();
2010
2011 assert_eq!(reader.num_docs, 4);
2012 assert_eq!(reader.num_blocks(), 2);
2013
2014 assert_eq!(reader.get_multi_values(0), vec![1, 2]);
2016 assert_eq!(reader.get_multi_values(1), vec![3]);
2018 assert_eq!(reader.get_multi_values(2), vec![4, 5, 6]);
2020 assert_eq!(reader.get_multi_values(3), Vec::<u64>::new());
2022 }
2023}