kbolt-core 0.1.7

Core engine for kbolt local-first retrieval
Documentation
/// Identity tag naming the dense-document rendering scheme (`dense-context-v1`).
///
/// NOTE(review): not referenced within this file; presumably consumers record it
/// next to stored embeddings so that a change to the dense rendering format can
/// be detected — confirm against the callers.
pub(crate) const DENSE_DOCUMENT_RENDER_IDENTITY: &str = "render=dense-context-v1";

/// Borrowed view of a chunk plus the optional context the `render_*` functions
/// in this module combine into retrieval text.
///
/// The struct is `Copy`, so it is passed by value between the renderers.
#[derive(Debug, Clone, Copy)]
pub(crate) struct ChunkRetrievalContext<'a> {
    /// Chunk text; rendered as-is (it is never trimmed by the renderers).
    pub body: &'a str,
    /// Optional text placed on its own line ahead of `body`. It is trimmed
    /// first and ignored when nothing remains after trimming.
    pub retrieval_prefix: Option<&'a str>,
    /// Optional title; when non-blank it is rendered as a `title: …` line.
    pub title: Option<&'a str>,
    /// Optional heading; when non-blank it is rendered as a `heading: …` line.
    pub heading: Option<&'a str>,
}

/// Document-side input for dense embedding, as produced by
/// `render_dense_document`: the title is carried as a separate field rather
/// than being rendered into the text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct EmbeddingDocumentInput<'a> {
    /// Title borrowed unchanged from the source context, if any.
    pub title: Option<&'a str>,
    /// Rendered text to embed (heading line, retrieval prefix and body).
    pub text: String,
}

/// Renders the chunk body together with its retrieval prefix.
///
/// The prefix is trimmed before use. Outcomes:
/// - no prefix, or a prefix that is blank after trimming: the body unchanged;
/// - a usable prefix and an empty body: the trimmed prefix alone;
/// - otherwise: the trimmed prefix, a newline, then the body.
///
/// Title and heading are not consulted here.
pub(crate) fn render_structural_body(ctx: ChunkRetrievalContext<'_>) -> String {
    // A missing prefix is treated exactly like a blank one.
    let lead = ctx.retrieval_prefix.map(str::trim).unwrap_or("");

    match (lead.is_empty(), ctx.body.is_empty()) {
        (true, _) => ctx.body.to_owned(),
        (false, true) => lead.to_owned(),
        (false, false) => [lead, ctx.body].join("\n"),
    }
}

/// Renders the text handed to the BM25 index for one chunk.
///
/// `contextual_prefix` decides whether the `title:` / `heading:` lines are
/// placed ahead of the text. The retrieval prefix and body are rendered
/// either way.
pub(crate) fn render_bm25_document(
    ctx: ChunkRetrievalContext<'_>,
    contextual_prefix: bool,
) -> String {
    // The caller-facing flag maps directly onto the shared renderer's switch.
    let include_title_heading = contextual_prefix;
    render_with_title_heading(ctx, include_title_heading)
}

/// Renders the text handed to the reranker for one chunk.
///
/// The `title:` / `heading:` lines are always requested, so any non-blank
/// title or heading precedes the retrieval prefix and body.
pub(crate) fn render_rerank_document(ctx: ChunkRetrievalContext<'_>) -> String {
    const INCLUDE_TITLE_HEADING: bool = true;
    render_with_title_heading(ctx, INCLUDE_TITLE_HEADING)
}

/// Builds the dense-embedding input for one chunk.
///
/// The title is returned in its own field, borrowed unchanged from `ctx`, and
/// is deliberately left out of the rendered text. The text still carries the
/// `heading:` line (when non-blank), the retrieval prefix and the body.
pub(crate) fn render_dense_document(ctx: ChunkRetrievalContext<'_>) -> EmbeddingDocumentInput<'_> {
    // The context is `Copy`; clear only the title so it is not rendered twice.
    let untitled = ChunkRetrievalContext { title: None, ..ctx };
    let text = render_with_title_heading(untitled, true);

    EmbeddingDocumentInput {
        title: ctx.title,
        text,
    }
}

/// Renders the structural body, optionally preceded by `title:` and
/// `heading:` lines.
///
/// With `include_title_heading` set to `false` the result is exactly
/// `render_structural_body(ctx)`. Otherwise each of title and heading that is
/// present and non-blank after trimming contributes one `label: value` line
/// (title first). When at least one such line exists, the lines are separated
/// from the structural body by a blank line; when none exist the structural
/// body is returned unchanged.
pub(crate) fn render_with_title_heading(
    ctx: ChunkRetrievalContext<'_>,
    include_title_heading: bool,
) -> String {
    let body = render_structural_body(ctx);
    if !include_title_heading {
        return body;
    }

    // Title and heading share identical handling; only the label differs.
    let header = [("title", ctx.title), ("heading", ctx.heading)]
        .iter()
        .filter_map(|&(label, value)| {
            let value = value?.trim();
            (!value.is_empty()).then(|| format!("{label}: {value}"))
        })
        .collect::<Vec<_>>()
        .join("\n");

    // Every emitted line is non-empty, so an empty header means no lines.
    if header.is_empty() {
        body
    } else {
        format!("{header}\n\n{body}")
    }
}

// Unit tests pinning the exact rendered text for each retrieval surface,
// including the blank/whitespace edge cases of the optional context fields.
#[cfg(test)]
mod tests {
    use super::{
        render_bm25_document, render_dense_document, render_rerank_document,
        render_structural_body, ChunkRetrievalContext,
    };

    #[test]
    fn structural_body_prepends_retrieval_prefix() {
        let text = render_structural_body(ChunkRetrievalContext {
            body: "tail value",
            retrieval_prefix: Some("table header"),
            title: None,
            heading: None,
        });

        assert_eq!(text, "table header\ntail value");
    }

    // Surrounding whitespace on the prefix must not leak into the output.
    #[test]
    fn structural_body_trims_retrieval_prefix() {
        let text = render_structural_body(ChunkRetrievalContext {
            body: "tail value",
            retrieval_prefix: Some("  table header \n"),
            title: None,
            heading: None,
        });

        assert_eq!(text, "table header\ntail value");
    }

    // A whitespace-only prefix behaves like no prefix at all.
    #[test]
    fn structural_body_ignores_blank_retrieval_prefix() {
        let text = render_structural_body(ChunkRetrievalContext {
            body: "tail value",
            retrieval_prefix: Some("   "),
            title: None,
            heading: None,
        });

        assert_eq!(text, "tail value");
    }

    // With an empty body there is no trailing newline after the prefix.
    #[test]
    fn structural_body_with_empty_body_is_prefix_only() {
        let text = render_structural_body(ChunkRetrievalContext {
            body: "",
            retrieval_prefix: Some("table header"),
            title: None,
            heading: None,
        });

        assert_eq!(text, "table header");
    }

    #[test]
    fn bm25_document_respects_contextual_prefix_flag() {
        let ctx = ChunkRetrievalContext {
            body: "body text",
            retrieval_prefix: Some("table header"),
            title: Some("Guide"),
            heading: Some("Setup"),
        };

        assert_eq!(
            render_bm25_document(ctx, true),
            "title: Guide\nheading: Setup\n\ntable header\nbody text"
        );
        assert_eq!(render_bm25_document(ctx, false), "table header\nbody text");
    }

    #[test]
    fn rerank_document_always_includes_title_and_heading() {
        let text = render_rerank_document(ChunkRetrievalContext {
            body: "body text",
            retrieval_prefix: None,
            title: Some("Guide"),
            heading: Some("Setup"),
        });

        assert_eq!(text, "title: Guide\nheading: Setup\n\nbody text");
    }

    // Title and heading values are trimmed before being rendered.
    #[test]
    fn rerank_document_trims_title_and_heading() {
        let text = render_rerank_document(ChunkRetrievalContext {
            body: "body text",
            retrieval_prefix: None,
            title: Some("  Guide "),
            heading: Some("\tSetup\n"),
        });

        assert_eq!(text, "title: Guide\nheading: Setup\n\nbody text");
    }

    // Blank title/heading produce no header lines and no separating blank line.
    #[test]
    fn rerank_document_skips_blank_title_and_heading() {
        let text = render_rerank_document(ChunkRetrievalContext {
            body: "body text",
            retrieval_prefix: None,
            title: Some("   "),
            heading: Some(""),
        });

        assert_eq!(text, "body text");
    }

    #[test]
    fn dense_document_keeps_title_structural_and_heading_in_text() {
        let input = render_dense_document(ChunkRetrievalContext {
            body: "body text",
            retrieval_prefix: Some("table header"),
            title: Some("Guide"),
            heading: Some("Setup"),
        });

        assert_eq!(input.title, Some("Guide"));
        assert_eq!(input.text, "heading: Setup\n\ntable header\nbody text");
    }

    // Without a heading the dense text is just the structural body; the title
    // still travels in its own field and never appears in the text.
    #[test]
    fn dense_document_without_heading_is_structural_body() {
        let input = render_dense_document(ChunkRetrievalContext {
            body: "body text",
            retrieval_prefix: None,
            title: Some("Guide"),
            heading: None,
        });

        assert_eq!(input.title, Some("Guide"));
        assert_eq!(input.text, "body text");
    }
}