/// Version tag string `"render=dense-context-v1"`.
///
/// NOTE(review): nothing in the visible code reads this constant. By its name it presumably
/// identifies the output format of `render_dense_document` (e.g. so stored embeddings can be
/// invalidated when that format changes) — confirm against the callers before relying on it.
pub(crate) const DENSE_DOCUMENT_RENDER_IDENTITY: &str = "render=dense-context-v1";
/// Borrowed, cheaply copyable bundle of the text pieces the renderers in this file combine
/// into a single retrieval document.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ChunkRetrievalContext<'a> {
    /// Chunk text; the renderers emit it verbatim (it is never trimmed).
    pub body: &'a str,
    /// Optional text placed on its own line ahead of `body`. It is trimmed first and
    /// ignored entirely when blank.
    pub retrieval_prefix: Option<&'a str>,
    /// Optional document title, rendered as a `title: …` header line when requested.
    pub title: Option<&'a str>,
    /// Optional section heading, rendered as a `heading: …` header line when requested.
    pub heading: Option<&'a str>,
}
/// Result of rendering a chunk for dense embedding: the title is kept as a separate field
/// instead of being written into the text.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct EmbeddingDocumentInput<'a> {
    /// Title borrowed from the source context, if any.
    pub title: Option<&'a str>,
    /// Owned rendered text of the chunk.
    pub text: String,
}
/// Renders the chunk body, preceded by its retrieval prefix when one is present.
///
/// The prefix is trimmed; a missing or blank prefix yields the body unchanged. With a usable
/// prefix the result is `"{prefix}\n{body}"`, or just the prefix when the body is empty.
/// `title` and `heading` are not consulted here.
pub(crate) fn render_structural_body(ctx: ChunkRetrievalContext<'_>) -> String {
    let body = ctx.body;
    // A missing prefix and a whitespace-only prefix are treated the same way.
    let header = ctx.retrieval_prefix.map(str::trim).unwrap_or("");

    match (header.is_empty(), body.is_empty()) {
        (true, _) => body.to_owned(),
        (false, true) => header.to_owned(),
        (false, false) => [header, body].join("\n"),
    }
}
/// Renders the text of a chunk for BM25 use.
///
/// When `contextual_prefix` is `true`, the non-blank `title:` / `heading:` lines are placed
/// ahead of the structural body; when `false`, only the structural body (retrieval prefix
/// plus body) is returned.
pub(crate) fn render_bm25_document(
    ctx: ChunkRetrievalContext<'_>,
    contextual_prefix: bool,
) -> String {
    // The flag maps directly onto the shared renderer's title/heading switch.
    let include_title_heading = contextual_prefix;
    render_with_title_heading(ctx, include_title_heading)
}
/// Renders the text of a chunk for reranking: the title/heading header lines are always
/// requested, followed by the structural body.
pub(crate) fn render_rerank_document(ctx: ChunkRetrievalContext<'_>) -> String {
    const INCLUDE_TITLE_HEADING: bool = true;
    render_with_title_heading(ctx, INCLUDE_TITLE_HEADING)
}
/// Renders a chunk for dense embedding.
///
/// The title is handed back in its own field, so it is removed from the context before the
/// text is rendered; the heading line and the structural body still appear in `text`.
///
/// Note: the returned `title` is passed through exactly as given. It is neither trimmed nor
/// dropped when blank, unlike the title handling in `render_with_title_heading`.
pub(crate) fn render_dense_document(ctx: ChunkRetrievalContext<'_>) -> EmbeddingDocumentInput<'_> {
    let title = ctx.title;
    // Same context minus the title, so no `title:` line is written into the text.
    let untitled = ChunkRetrievalContext { title: None, ..ctx };

    EmbeddingDocumentInput {
        title,
        text: render_with_title_heading(untitled, true),
    }
}
/// Renders the structural body, optionally preceded by `title:` / `heading:` header lines.
///
/// With `include_title_heading == false` the structural body is returned as-is. Otherwise
/// each of `title` and `heading` that is present and non-blank after trimming contributes one
/// line (`title: …`, then `heading: …`), and the header is separated from the body by a blank
/// line. If neither contributes a line, the structural body is returned unchanged.
pub(crate) fn render_with_title_heading(
    ctx: ChunkRetrievalContext<'_>,
    include_title_heading: bool,
) -> String {
    let rendered_body = render_structural_body(ctx);
    if !include_title_heading {
        return rendered_body;
    }

    // One `label: value` line per field that still has content after trimming.
    let labelled = |label: &str, value: Option<&str>| {
        value
            .map(str::trim)
            .filter(|text| !text.is_empty())
            .map(|text| format!("{label}: {text}"))
    };
    let header = labelled("title", ctx.title)
        .into_iter()
        .chain(labelled("heading", ctx.heading))
        .collect::<Vec<_>>()
        .join("\n");

    // Every header line is non-empty, so an empty join means there were no lines at all.
    if header.is_empty() {
        rendered_body
    } else {
        format!("{header}\n\n{rendered_body}")
    }
}
#[cfg(test)]
mod tests {
    use super::{
        render_bm25_document, render_dense_document, render_rerank_document,
        render_structural_body, ChunkRetrievalContext,
    };

    /// Shared fixture: "body text" titled "Guide" under heading "Setup", with the given
    /// retrieval prefix.
    fn guide_chunk(retrieval_prefix: Option<&'static str>) -> ChunkRetrievalContext<'static> {
        ChunkRetrievalContext {
            body: "body text",
            retrieval_prefix,
            title: Some("Guide"),
            heading: Some("Setup"),
        }
    }

    #[test]
    fn structural_body_prepends_retrieval_prefix() {
        let ctx = ChunkRetrievalContext {
            body: "tail value",
            retrieval_prefix: Some("table header"),
            title: None,
            heading: None,
        };

        assert_eq!(render_structural_body(ctx), "table header\ntail value");
    }

    #[test]
    fn bm25_document_respects_contextual_prefix_flag() {
        let ctx = guide_chunk(Some("table header"));

        let with_header = render_bm25_document(ctx, true);
        let without_header = render_bm25_document(ctx, false);

        assert_eq!(
            with_header,
            "title: Guide\nheading: Setup\n\ntable header\nbody text"
        );
        assert_eq!(without_header, "table header\nbody text");
    }

    #[test]
    fn rerank_document_always_includes_title_and_heading() {
        let rendered = render_rerank_document(guide_chunk(None));

        assert_eq!(rendered, "title: Guide\nheading: Setup\n\nbody text");
    }

    #[test]
    fn dense_document_keeps_title_structural_and_heading_in_text() {
        let rendered = render_dense_document(guide_chunk(Some("table header")));

        // The title travels in its own field; only the heading appears in the text.
        assert_eq!(rendered.title, Some("Guide"));
        assert_eq!(rendered.text, "heading: Setup\n\ntable header\nbody text");
    }
}