bearing 0.1.0-alpha.2

A Rust port of Apache Lucene
Documentation
// SPDX-License-Identifier: Apache-2.0
//! Postings format constants, per-term state, and the postings write entry point for the Lucene 10.3 codec.

use std::collections::HashMap;
use std::io;

use crate::codecs::competitive_impact::NormsLookup;
use crate::document::IndexOptions;
use crate::index::indexing_chain::PerFieldData;
use crate::index::{FieldInfos, SegmentInfo};
use crate::store::SharedDirectory;

use super::blocktree_writer::BlockTreeTermsWriter;

// --- Postings format constants ---

/// Block size used by the postings format (128).
/// NOTE(review): presumably the number of entries packed into one encoded block —
/// confirm against the postings encoder.
pub const BLOCK_SIZE: usize = 128;
/// Multiplier applied to [`BLOCK_SIZE`] to obtain [`LEVEL1_NUM_DOCS`].
pub const LEVEL1_FACTOR: usize = 32;
/// `LEVEL1_FACTOR * BLOCK_SIZE`, i.e. 4096 with the values above.
pub const LEVEL1_NUM_DOCS: usize = LEVEL1_FACTOR * BLOCK_SIZE;

/// First version number of the postings format.
pub const VERSION_START: i32 = 0;
/// Current version number of the postings format; currently equal to [`VERSION_START`].
pub const VERSION_CURRENT: i32 = VERSION_START;

// Postings file extensions
/// Extension string `"psm"`. NOTE(review): presumably the postings metadata file, given
/// the pairing with [`META_CODEC`] — confirm against the writer.
pub const META_EXTENSION: &str = "psm";
/// Extension string `"doc"`. NOTE(review): presumably paired with [`DOC_CODEC`] — confirm.
pub const DOC_EXTENSION: &str = "doc";
/// Extension string `"pos"`. NOTE(review): presumably paired with [`POS_CODEC`] — confirm.
pub const POS_EXTENSION: &str = "pos";
// Postings codec names
/// Codec name string for the postings "terms" header.
pub const TERMS_CODEC: &str = "Lucene103PostingsWriterTerms";
/// Codec name string for the postings "meta" header.
pub const META_CODEC: &str = "Lucene103PostingsWriterMeta";
/// Codec name string for the postings "doc" header.
pub const DOC_CODEC: &str = "Lucene103PostingsWriterDoc";
/// Codec name string for the postings "pos" header.
pub const POS_CODEC: &str = "Lucene103PostingsWriterPos";

// BlockTree terms dict constants
/// Extension string `"tim"` for the BlockTree terms dictionary.
pub const TERMS_EXTENSION: &str = "tim";
/// Extension string `"tip"` for the BlockTree terms index.
pub const TERMS_INDEX_EXTENSION: &str = "tip";
/// Extension string `"tmd"` for the BlockTree terms metadata.
pub const TERMS_META_EXTENSION: &str = "tmd";

/// Codec name string for the BlockTree terms dictionary.
pub const TERMS_CODEC_NAME: &str = "BlockTreeTermsDict";
/// Codec name string for the BlockTree terms index.
pub const TERMS_INDEX_CODEC_NAME: &str = "BlockTreeTermsIndex";
/// Codec name string for the BlockTree terms metadata.
pub const TERMS_META_CODEC_NAME: &str = "BlockTreeTermsMeta";

/// First version number of the BlockTree terms format.
pub const BLOCKTREE_VERSION_START: i32 = 0;
/// Current BlockTree version; currently equal to [`BLOCKTREE_VERSION_START`].
pub const BLOCKTREE_VERSION_CURRENT: i32 = BLOCKTREE_VERSION_START;

/// Default minimum block size (25).
/// NOTE(review): presumably the minimum number of entries per BlockTree block — confirm
/// against `BlockTreeTermsWriter`.
pub const DEFAULT_MIN_BLOCK_SIZE: usize = 25;
/// Default maximum block size (48).
/// NOTE(review): presumably the maximum number of entries per BlockTree block — confirm
/// against `BlockTreeTermsWriter`.
pub const DEFAULT_MAX_BLOCK_SIZE: usize = 48;

/// Metadata kept for a single term and stored in `.tim` blocks.
///
/// A freshly created value (via [`IntBlockTermState::new`] or `Default`) has every
/// counter and start pointer set to `0`, while `last_pos_block_offset` and
/// `singleton_doc_id` are set to `-1`.
/// NOTE(review): `-1` presumably acts as a "not set" marker, mirroring upstream
/// Lucene's `IntBlockTermState` — confirm against the writer/reader.
#[derive(Clone, Copy, Debug)]
pub struct IntBlockTermState {
    // Inherited from the BlockTermState base class
    /// NOTE(review): presumably the number of documents containing the term — confirm.
    pub doc_freq: i32,
    /// NOTE(review): presumably the total number of occurrences of the term — confirm.
    pub total_term_freq: i64,
    // Fields specific to Lucene103
    /// NOTE(review): presumably a file pointer into the `.doc` data — confirm.
    pub doc_start_fp: i64,
    /// NOTE(review): presumably a file pointer into the `.pos` data — confirm.
    pub pos_start_fp: i64,
    /// Initialized to `-1`.
    pub last_pos_block_offset: i64,
    /// Initialized to `-1`.
    pub singleton_doc_id: i32,
}

impl IntBlockTermState {
    /// Creates a term state holding the initial values; identical to `Default::default()`.
    pub fn new() -> Self {
        <Self as Default>::default()
    }
}

impl Default for IntBlockTermState {
    /// Builds the initial state: zeroed counters and start pointers, `-1` for the
    /// last-position-block offset and the singleton doc id.
    fn default() -> Self {
        let unset_offset: i64 = -1;
        let unset_doc: i32 = -1;
        IntBlockTermState {
            doc_freq: 0,
            total_term_freq: 0,
            doc_start_fp: 0,
            pos_start_fp: 0,
            last_pos_block_offset: unset_offset,
            singleton_doc_id: unset_doc,
        }
    }
}

/// Write all postings files for the given fields to the directory.
///
/// A [`BlockTreeTermsWriter`] is created for the segment, then every field whose index
/// options are not [`IndexOptions::None`] is visited in ascending field-name order.
/// A field is written only if `per_field` has an entry for its name and that entry
/// reports postings; all other fields are skipped.
///
/// Returns the file names written.
///
/// # Errors
///
/// Propagates any [`io::Error`] raised while creating the terms writer, writing a
/// field, or finishing the writer.
pub fn write(
    directory: &SharedDirectory,
    segment_info: &SegmentInfo,
    segment_suffix: &str,
    field_infos: &FieldInfos,
    per_field: &HashMap<String, PerFieldData>,
) -> io::Result<Vec<String>> {
    let mut btw = BlockTreeTermsWriter::new(
        directory,
        &segment_info.name,
        segment_suffix,
        &segment_info.id,
        field_infos,
    )?;

    // Process fields in lexicographic name order (matching Java's FieldsConsumer)
    let mut indexed_fields: Vec<_> = field_infos
        .iter()
        .filter(|fi| fi.index_options() != IndexOptions::None)
        .collect();
    // Compare the borrowed names directly: `sort_by_key` with an owned `String` key
    // would allocate a new string on every key evaluation. `sort_by` is stable, so the
    // resulting order is the same.
    indexed_fields.sort_by(|a, b| a.name().cmp(b.name()));

    for fi in indexed_fields {
        if let Some(pfd) = per_field.get(fi.name())
            && pfd.has_postings()
        {
            let sorted = pfd.sorted_postings();
            // Fields that omit norms get the no-norms lookup; otherwise the lookup is
            // built from the field's collected norms data.
            let norms = if fi.omit_norms() {
                NormsLookup::no_norms()
            } else {
                NormsLookup::new(&pfd.norms, &pfd.norms_docs)
            };
            btw.write_field(fi, &sorted, &pfd.postings, &norms)?;
        }
    }

    btw.finish()
}