hermes-core 1.8.33

Core async search engine library with WASM support
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
//! Index loading functions for segment reader

use std::sync::Arc;

use byteorder::{LittleEndian, ReadBytesExt};
use rustc_hash::FxHashMap;
use std::io::Cursor;

use super::super::types::SegmentFiles;
use super::super::vector_data::LazyFlatVectorData;
use super::bmp::BmpIndex;
use super::{SparseIndex, VectorIndex};
use crate::Result;
use crate::directories::{Directory, FileHandle};
use crate::dsl::Schema;

/// Result of loading the `.sparse` file — may contain MaxScore and/or BMP indexes.
///
/// A single segment can hold both kinds at once: each sparse-vector field is
/// stored either as a MaxScore index or a BMP index, and lands in the
/// corresponding map below, keyed by its field id.
pub struct SparseFileData {
    /// MaxScore-format sparse indexes, keyed by field id.
    pub maxscore_indexes: FxHashMap<u32, SparseIndex>,
    /// BMP-format sparse indexes, keyed by field id.
    pub bmp_indexes: FxHashMap<u32, BmpIndex>,
}

/// Vectors file loading result.
///
/// A single field may appear in both maps: flat vectors (for reranking/merge)
/// alongside an ANN index (for search).
pub struct VectorsFileData {
    /// ANN indexes per field (IVF, ScaNN, RaBitQ) — loaded into memory for search
    pub indexes: FxHashMap<u32, VectorIndex>,
    /// Lazy flat vectors per field — doc_ids in memory, vectors via mmap for reranking/merge
    pub flat_vectors: FxHashMap<u32, LazyFlatVectorData>,
}

use crate::segment::format::{
    DENSE_TOC_ENTRY_SIZE, DenseVectorTocEntry, FOOTER_SIZE, VECTORS_FOOTER_MAGIC, read_dense_toc,
};

/// Load dense vector indexes from unified .vectors file
///
/// File format (data-first, TOC at end):
/// - [field data...]  — starts at offset 0 (mmap page-aligned)
/// - [TOC entries]    — field_id(4) + index_type(1) + offset(8) + size(8) per field
/// - [footer 16B]     — toc_offset(8) + num_fields(4) + magic(4)
///
/// Also supports legacy header-first format (no magic) for backwards compatibility.
///
/// Returns an empty [`VectorsFileData`] — never an error — when the schema has
/// no dense/binary-dense vector fields, the file is missing, or the file is too
/// small to hold a footer. I/O errors while reading the TOC or ANN entry data
/// are propagated; a failure to open one field's flat vectors is only logged.
pub async fn load_vectors_file<D: Directory>(
    dir: &D,
    files: &SegmentFiles,
    schema: &Schema,
) -> Result<VectorsFileData> {
    let mut indexes = FxHashMap::default();
    let mut flat_vectors = FxHashMap::default();
    // Shared "nothing loaded" result for all early-exit paths.
    let empty = || VectorsFileData {
        indexes: FxHashMap::default(),
        flat_vectors: FxHashMap::default(),
    };

    // Skip loading vectors file if schema has no dense/binary dense vector fields
    let has_dense_vectors = schema.fields().any(|(_, entry)| {
        entry.dense_vector_config.is_some() || entry.binary_dense_vector_config.is_some()
    });
    if !has_dense_vectors {
        return Ok(empty());
    }

    // Try to open vectors file (may not exist if no vectors were indexed)
    let handle = match dir.open_lazy(&files.vectors).await {
        Ok(h) => h,
        Err(_) => return Ok(empty()),
    };

    let file_size = handle.len();
    if file_size < FOOTER_SIZE {
        return Ok(empty());
    }

    // Try new format: read footer (last 16 bytes)
    let footer_bytes = handle
        .read_bytes_range(file_size - FOOTER_SIZE..file_size)
        .await?;
    let mut cursor = Cursor::new(footer_bytes.as_slice());
    let toc_offset = cursor.read_u64::<LittleEndian>()?;
    let num_fields = cursor.read_u32::<LittleEndian>()?;
    let magic = cursor.read_u32::<LittleEndian>()?;

    // Format detection: a valid magic AND a TOC strictly inside the data region
    // means the new footer-based layout; anything else is treated as legacy.
    // NOTE(review): a new-format file with zero fields would have
    // toc_offset == file_size - FOOTER_SIZE and fall into the legacy branch —
    // confirm writers never emit a zero-field footer file.
    let entries: Vec<DenseVectorTocEntry> =
        if magic == VECTORS_FOOTER_MAGIC && toc_offset < file_size - FOOTER_SIZE {
            // New format: TOC at end
            let toc_size = num_fields as u64 * DENSE_TOC_ENTRY_SIZE;
            // NOTE(review): toc_offset + toc_size is not bounds-checked against
            // file_size here; a corrupt footer surfaces as a read error below.
            let toc_bytes = handle
                .read_bytes_range(toc_offset..toc_offset + toc_size)
                .await?;
            read_dense_toc(toc_bytes.as_slice(), num_fields)?
        } else {
            // Legacy format: header at start (num_fields(4) + entries)
            let header_bytes = handle.read_bytes_range(0..4).await?;
            let mut cursor = Cursor::new(header_bytes.as_slice());
            let num_fields = cursor.read_u32::<LittleEndian>()?;
            if num_fields == 0 {
                return Ok(empty());
            }
            let entries_size = num_fields as u64 * DENSE_TOC_ENTRY_SIZE;
            let entries_bytes = handle.read_bytes_range(4..4 + entries_size).await?;
            read_dense_toc(entries_bytes.as_slice(), num_fields)?
        };

    if entries.is_empty() {
        return Ok(empty());
    }

    // Load each entry — a field can have both Flat (lazy) and ANN (in-memory)
    use crate::segment::ann_build;
    for DenseVectorTocEntry {
        field_id,
        index_type,
        offset,
        size: length,
    } in entries
    {
        match index_type {
            ann_build::FLAT_TYPE => {
                // Flat binary — load lazily (only doc_ids in memory, vectors via mmap).
                // A per-field failure here is non-fatal: log and keep loading the rest.
                let slice = handle.slice(offset..offset + length);
                match LazyFlatVectorData::open(slice).await {
                    Ok(lazy_flat) => {
                        flat_vectors.insert(field_id, lazy_flat);
                    }
                    Err(e) => {
                        log::warn!(
                            "Failed to load lazy flat vectors for field {}: {}",
                            field_id,
                            e
                        );
                    }
                }
            }
            ann_build::SCANN_TYPE => {
                // ScaNN (IVF-PQ) — lazy: OwnedBytes stored (zero-copy mmap ref),
                // deserialized on first search. No heap copy during segment load.
                let data = handle.read_bytes_range(offset..offset + length).await?;
                indexes.insert(
                    field_id,
                    VectorIndex::ScaNN(Arc::new(super::types::LazyScaNN::new(data))),
                );
            }
            ann_build::IVF_RABITQ_TYPE => {
                // IVF-RaBitQ — lazy: OwnedBytes stored (zero-copy mmap ref),
                // deserialized on first search. No heap copy during segment load.
                let data = handle.read_bytes_range(offset..offset + length).await?;
                indexes.insert(
                    field_id,
                    VectorIndex::IVF(Arc::new(super::types::LazyIVF::new(data))),
                );
            }
            ann_build::RABITQ_TYPE => {
                // RaBitQ — lazy: OwnedBytes stored (zero-copy mmap ref),
                // deserialized on first search. No heap copy during segment load.
                let data = handle.read_bytes_range(offset..offset + length).await?;
                indexes.insert(
                    field_id,
                    VectorIndex::RaBitQ(Arc::new(super::types::LazyRaBitQ::new(data))),
                );
            }
            _ => {
                // Unknown type byte — skip the entry rather than failing the
                // whole segment load (forward compatibility with newer writers).
                log::warn!(
                    "Unknown vector index type {} for field {}",
                    index_type,
                    field_id
                );
            }
        }
    }

    Ok(VectorsFileData {
        indexes,
        flat_vectors,
    })
}

/// Load sparse vector indexes from .sparse file (lazy loading)
///
/// Footer-based format (data-first):
/// ```text
/// [posting data for all dims across all fields]
/// [TOC: per-field header + per-dim entries]
/// [footer: toc_offset(u64) + num_fields(u32) + magic(u32)]
/// ```
///
/// Memory optimization: Only loads the offset table + skip lists, not the block data.
/// Block data is loaded on-demand during queries via mmap range reads.
///
/// Returns an empty [`SparseFileData`] when the schema has no sparse-vector
/// fields, the file is missing/too small, or the footer cannot be read.
/// A bad footer magic is reported as a `Corruption` error.
pub async fn load_sparse_file<D: Directory>(
    dir: &D,
    files: &SegmentFiles,
    total_docs: u32,
    schema: &Schema,
) -> Result<SparseFileData> {
    use crate::segment::format::{SPARSE_FOOTER_MAGIC, SPARSE_FOOTER_SIZE};
    use crate::structures::SparseVectorConfig;

    // Shared "nothing loaded" result for all early-exit paths.
    let empty = || SparseFileData {
        maxscore_indexes: FxHashMap::default(),
        bmp_indexes: FxHashMap::default(),
    };

    let mut maxscore_indexes = FxHashMap::default();
    let mut bmp_indexes = FxHashMap::default();

    // Skip loading sparse file if schema has no sparse vector fields
    let has_sparse_vectors = schema
        .fields()
        .any(|(_, entry)| entry.sparse_vector_config.is_some());
    if !has_sparse_vectors {
        return Ok(empty());
    }

    // Try to open sparse file lazily (may not exist if no sparse vectors were indexed)
    let handle = match dir.open_lazy(&files.sparse).await {
        Ok(h) => h,
        Err(e) => {
            log::debug!("No sparse file found ({}): {:?}", files.sparse.display(), e);
            return Ok(empty());
        }
    };

    let file_size = handle.len();
    if file_size < SPARSE_FOOTER_SIZE {
        return Ok(empty());
    }

    // Read footer (24 bytes): skip_offset(8) + toc_offset(8) + num_fields(4) + magic(4)
    let footer_bytes = match handle
        .read_bytes_range(file_size - SPARSE_FOOTER_SIZE..file_size)
        .await
    {
        Ok(d) => d,
        Err(_) => return Ok(empty()),
    };
    let fb = footer_bytes.as_slice();

    // Slicing is safe: fb is exactly SPARSE_FOOTER_SIZE (24) bytes by construction.
    let skip_offset = u64::from_le_bytes(fb[0..8].try_into().unwrap());
    let toc_offset = u64::from_le_bytes(fb[8..16].try_into().unwrap());
    let num_fields = u32::from_le_bytes(fb[16..20].try_into().unwrap());
    let magic = u32::from_le_bytes(fb[20..24].try_into().unwrap());

    if magic != SPARSE_FOOTER_MAGIC {
        return Err(crate::Error::Corruption(format!(
            "Invalid sparse footer magic: {:#x} (expected {:#x})",
            magic, SPARSE_FOOTER_MAGIC
        )));
    }

    log::debug!(
        "Loading sparse: size={} bytes, num_fields={}, skip_offset={}, toc_offset={}",
        file_size,
        num_fields,
        skip_offset,
        toc_offset,
    );

    if num_fields == 0 {
        return Ok(empty());
    }

    // Single tail read: skip section + TOC (skip_offset .. footer_start)
    // For mmap this is zero-copy. Block data at the front is never touched.
    // NOTE(review): assumes skip_offset <= toc_offset <= file_size - footer;
    // corrupt offsets would panic on the slicing below — confirm writer invariants.
    let tail_bytes = handle
        .read_bytes_range(skip_offset..file_size - SPARSE_FOOTER_SIZE)
        .await?;
    let tail = tail_bytes.as_slice();

    // skip section is at tail[0 .. toc_offset - skip_offset]
    let skip_section_len = (toc_offset - skip_offset) as usize;
    let skip_section = tail_bytes.slice(0..skip_section_len);
    let toc_data = &tail[skip_section_len..];

    // Parse TOC: per-field header(13B) + per-dim entries(28B each)
    let mut pos = 0usize;

    for _ in 0..num_fields {
        // Field header: field_id(4) + quant(1) + num_dims(4) + total_vectors(4) = 13 bytes
        let field_id = u32::from_le_bytes(toc_data[pos..pos + 4].try_into().unwrap());
        let quantization = toc_data[pos + 4];
        let ndims = u32::from_le_bytes(toc_data[pos + 5..pos + 9].try_into().unwrap()) as usize;
        let total_vectors = u32::from_le_bytes(toc_data[pos + 9..pos + 13].try_into().unwrap());
        pos += 13;

        // Detect BMP format from the quant byte (bit 3 = format flag)
        let is_bmp = SparseVectorConfig::from_byte(quantization)
            .is_some_and(|c| c.format == crate::structures::SparseFormat::Bmp);

        if is_bmp && ndims >= 1 {
            // BMP field: single sentinel entry with blob location.
            // Entry layout: dim_id(4, sentinel 0xFFFFFFFF) + blob_offset(8)
            // + blob_len_low(4) + blob_len_high(4) + 8 unused = 28 bytes.
            let d = &toc_data[pos..pos + 28];
            let dim_id = u32::from_le_bytes(d[0..4].try_into().unwrap());
            let blob_offset = u64::from_le_bytes(d[4..12].try_into().unwrap());
            let blob_len_low = u32::from_le_bytes(d[12..16].try_into().unwrap());
            let blob_len_high = u32::from_le_bytes(d[16..20].try_into().unwrap());
            pos += 28;

            // Skip any additional dim entries (shouldn't have more, but be safe)
            for _ in 1..ndims {
                pos += 28;
            }

            if dim_id != 0xFFFFFFFF {
                log::warn!(
                    "BMP field {} has unexpected dim_id {:#x} (expected sentinel)",
                    field_id,
                    dim_id
                );
            }

            // Reassemble the 64-bit blob length from its two 32-bit halves.
            let blob_len = (blob_len_high as u64) << 32 | blob_len_low as u64;

            match BmpIndex::parse(
                handle.clone(),
                blob_offset,
                blob_len,
                total_docs,
                total_vectors,
            ) {
                Ok(idx) => {
                    log::debug!(
                        "Loaded BMP index for field {}: dims={}, num_blocks={}, total_vectors={}",
                        field_id,
                        idx.dims(),
                        idx.num_blocks,
                        total_vectors,
                    );
                    bmp_indexes.insert(field_id, idx);
                }
                Err(e) => {
                    // A malformed BMP blob fails the whole segment load.
                    return Err(e);
                }
            }
        } else {
            // MaxScore field: standard per-dimension entries.
            // Entry layout: dim_id(4) + block_data_offset(8) + skip_start(4)
            // + num_blocks(4) + doc_count(4) + max_weight(f32, 4) = 28 bytes.
            let mut dims = super::types::DimensionTable::with_capacity(ndims);
            for _ in 0..ndims {
                let d = &toc_data[pos..pos + 28];
                let dim_id = u32::from_le_bytes(d[0..4].try_into().unwrap());
                let block_data_offset = u64::from_le_bytes(d[4..12].try_into().unwrap());
                let skip_start = u32::from_le_bytes(d[12..16].try_into().unwrap());
                let num_blocks = u32::from_le_bytes(d[16..20].try_into().unwrap());
                let doc_count = u32::from_le_bytes(d[20..24].try_into().unwrap());
                let max_weight = f32::from_le_bytes(d[24..28].try_into().unwrap());
                dims.push(
                    dim_id,
                    block_data_offset,
                    skip_start,
                    num_blocks,
                    doc_count,
                    max_weight,
                );
                pos += 28;
            }
            // Sort so dimension lookups can binary-search by dim_id.
            dims.sort_by_dim_id();

            log::debug!(
                "Loaded sparse index for field {}: num_dims={}, total_vectors={}, skip_bytes={}",
                field_id,
                dims.len(),
                total_vectors,
                skip_section.len(),
            );

            maxscore_indexes.insert(
                field_id,
                SparseIndex::new(
                    handle.clone(),
                    dims,
                    skip_section.clone(),
                    total_docs,
                    total_vectors,
                ),
            );
        }
    }

    log::debug!(
        "Sparse file loaded: maxscore_fields={:?}, bmp_fields={:?}",
        maxscore_indexes.keys().collect::<Vec<_>>(),
        bmp_indexes.keys().collect::<Vec<_>>()
    );

    Ok(SparseFileData {
        maxscore_indexes,
        bmp_indexes,
    })
}

/// Open a lazy handle to the positions file, if any field tracks positions.
///
/// No header parsing happens here — per-term offsets into this file live in
/// `TermInfo`. Returns `Ok(None)` when the schema has no position-tracked
/// fields or when the file does not exist (nothing was indexed with positions).
pub async fn open_positions_file<D: Directory>(
    dir: &D,
    files: &SegmentFiles,
    schema: &Schema,
) -> Result<Option<FileHandle>> {
    // Nothing to open when no field in the schema tracks positions.
    if schema.fields().all(|(_, entry)| entry.positions.is_none()) {
        return Ok(None);
    }

    // A missing file is not an error — simply report "no positions".
    Ok(dir.open_lazy(&files.positions).await.ok())
}

/// Load fast-field columns from `.fast` file.
/// Returns a map of field_id → FastFieldReader.
pub async fn load_fast_fields_file<D: Directory>(
    dir: &D,
    files: &SegmentFiles,
    schema: &Schema,
) -> Result<FxHashMap<u32, crate::structures::fast_field::FastFieldReader>> {
    use crate::structures::fast_field::{
        FastFieldReader, read_fast_field_footer, read_fast_field_toc,
    };

    // Skip if no fast fields in schema
    let has_fast = schema.fields().any(|(_, entry)| entry.fast);
    if !has_fast {
        return Ok(FxHashMap::default());
    }

    // Try to open the .fast file (may not exist for old segments)
    let handle = match dir.open_read(&files.fast).await {
        Ok(h) => h,
        Err(e) => {
            log::debug!("[fast-fields] .fast file not found ({}), skipping", e);
            return Ok(FxHashMap::default());
        }
    };

    let file_data = handle.read_bytes().await?;
    if file_data.is_empty() {
        return Ok(FxHashMap::default());
    }

    let (toc_offset, num_columns) = read_fast_field_footer(&file_data).map_err(crate::Error::Io)?;

    let mut readers = FxHashMap::default();

    let toc_entries =
        read_fast_field_toc(&file_data, toc_offset, num_columns).map_err(crate::Error::Io)?;
    for toc in &toc_entries {
        let reader = FastFieldReader::open(&file_data, toc).map_err(crate::Error::Io)?;
        readers.insert(toc.field_id, reader);
    }

    log::debug!(
        "[fast-fields] loaded {} columns from .fast file",
        readers.len(),
    );

    Ok(readers)
}