lucisearch 0.8.0

Embeddable, in-process search engine — the SQLite/DuckDB of Elasticsearch
Documentation
//! Reader: content retrieval from committed segments.
//!
//! Handles source retrieval, field retrieval, and source bytes access.
//! Operates on `SegmentStore` but is separate from `Searcher` (scoring).
//!
//! See [[architecture-scoring-materialization-separation]].

use crate::core::{DocId, SegmentId};

use crate::columnar::writer::ColumnType;
use crate::search::results::FieldReaderCache;
use crate::search::segment_store::SegmentStore;

/// Content retrieval from committed segments.
///
/// A `Reader` is a thin, borrowing view over a [`SegmentStore`]: it holds no
/// state of its own and every lookup goes back to the store's segments.
///
/// `pub(crate)` — not exposed to consumers. `Hit` delegates to Reader
/// for lazy content access.
///
/// NOTE(review): the item itself is declared `pub`, not `pub(crate)`;
/// presumably the enclosing module keeps it crate-private — confirm.
pub struct Reader<'a> {
    // Borrowed segment store that every retrieval method searches.
    store: &'a SegmentStore,
}

impl<'a> Reader<'a> {
    pub fn new(store: &'a SegmentStore) -> Self {
        Self { store }
    }

    /// Load and parse the stored JSON source of a single document.
    ///
    /// This is the call path on which LZ4 decompression and JSON parsing
    /// are paid for.
    ///
    /// Returns `None` when no segment in the store has `segment_id`, when
    /// that segment's document store has no entry for `doc_id`, or when the
    /// stored bytes do not parse as JSON.
    pub fn get_source(&self, segment_id: SegmentId, doc_id: DocId) -> Option<serde_json::Value> {
        self.store
            .segments()
            .iter()
            .find(|candidate| candidate.segment_id() == segment_id)
            .and_then(|found| found.doc_store().get(doc_id.as_u32()))
            .and_then(|raw| serde_json::from_slice::<serde_json::Value>(&raw).ok())
    }

    /// Fetch a document's source as raw bytes (LZ4-decompressed, not parsed).
    ///
    /// Returns `None` when no segment in the store has `segment_id` or when
    /// that segment's document store has no entry for `doc_id`.
    pub fn get_source_bytes(&self, segment_id: SegmentId, doc_id: DocId) -> Option<Vec<u8>> {
        let owner = self
            .store
            .segments()
            .iter()
            .find(|candidate| candidate.segment_id() == segment_id);
        owner.and_then(|found| found.doc_store().get(doc_id.as_u32()))
    }

    /// Read typed values for the requested fields out of the columnar store.
    ///
    /// Every value that is found is emitted as a one-element JSON array
    /// (ES `fields` API compat), keyed by the requested field name.
    ///
    /// A field is silently left out of the result when:
    /// - the segment header has no field with that name,
    /// - the segment has no column for the field id,
    /// - the column type is not keyword, f64, i64 or bool (in any of the
    ///   encodings matched below), or
    /// - the column holds no value for `doc_id`.
    ///
    /// An unknown `segment_id` yields an empty map.
    pub fn retrieve_fields(
        &self,
        segment_id: SegmentId,
        doc_id: DocId,
        field_names: &[String],
    ) -> serde_json::Map<String, serde_json::Value> {
        let mut fields = serde_json::Map::new();
        let located = self
            .store
            .segments()
            .iter()
            .find(|candidate| candidate.segment_id() == segment_id);

        if let Some(seg) = located {
            let doc = doc_id.as_u32();
            for name in field_names {
                // Resolve name -> field id first, then open the column, as
                // two separate steps.
                let field_id = seg
                    .header()
                    .fields
                    .iter()
                    .find(|f| f.field_name == *name)
                    .map(|f| f.field_id);
                let column = field_id.and_then(|id| seg.column(id));

                // `None` from any arm means "nothing to report for this field".
                let wrapped = column.and_then(|col| match col.col_type() {
                    ColumnType::Keyword | ColumnType::KeywordBlocked => {
                        col.keyword_value(doc).map(|s| serde_json::json!([s]))
                    }
                    ColumnType::F64 | ColumnType::ConstantF64 => {
                        col.f64_value(doc).map(|n| serde_json::json!([n]))
                    }
                    ColumnType::I64 | ColumnType::BitpackedI64 | ColumnType::ConstantI64 => {
                        col.i64_value(doc).map(|n| serde_json::json!([n]))
                    }
                    ColumnType::Bool => col.bool_value(doc).map(|b| serde_json::json!([b])),
                    _ => None,
                });

                if let Some(value) = wrapped {
                    fields.insert(name.clone(), value);
                }
            }
        }
        fields
    }

    /// Cached variant of [`retrieve_fields`][Self::retrieve_fields].
    ///
    /// Columns are obtained through a per-`SearchResults`
    /// [`FieldReaderCache`], so the `ColumnReader::open` cost is paid once
    /// per `(segment, field)` rather than once per hit. The produced map is
    /// the same as the uncached method's: each found value is a one-element
    /// JSON array keyed by field name, and fields that cannot be resolved or
    /// have no value for the document are omitted.
    ///
    /// See [[optimize-hit-id-column-reader-cache]].
    pub(crate) fn retrieve_fields_cached(
        &self,
        cache: &FieldReaderCache,
        segment_id: SegmentId,
        doc_id: DocId,
        field_names: &[String],
    ) -> serde_json::Map<String, serde_json::Value> {
        let mut out = serde_json::Map::new();
        let seg = match self
            .store
            .segments()
            .iter()
            .find(|candidate| candidate.segment_id() == segment_id)
        {
            None => return out,
            Some(found) => found,
        };

        let doc = doc_id.as_u32();
        for name in field_names {
            let lookup = seg
                .header()
                .fields
                .iter()
                .find(|f| f.field_name == *name)
                .map(|f| f.field_id);

            if let Some(field_id) = lookup {
                // The outer `Option` comes from the cache lookup, the inner
                // one from the per-document value; both must be present.
                let rendered = cache
                    .with_column(seg, field_id, |col| {
                        let kind = col.col_type();
                        match kind {
                            ColumnType::Keyword | ColumnType::KeywordBlocked => {
                                let text = col.keyword_value(doc);
                                text.map(|s| serde_json::json!([s.to_owned()]))
                            }
                            ColumnType::F64 | ColumnType::ConstantF64 => {
                                let number = col.f64_value(doc);
                                number.map(|n| serde_json::json!([n]))
                            }
                            ColumnType::I64
                            | ColumnType::BitpackedI64
                            | ColumnType::ConstantI64 => {
                                let number = col.i64_value(doc);
                                number.map(|n| serde_json::json!([n]))
                            }
                            ColumnType::Bool => {
                                let flag = col.bool_value(doc);
                                flag.map(|b| serde_json::json!([b]))
                            }
                            _ => None,
                        }
                    })
                    .flatten();

                if let Some(v) = rendered {
                    out.insert(name.clone(), v);
                }
            }
        }
        out
    }
}