use std::collections::HashSet;
use crate::pipeline::chunk_metadata::ChunkMetadata;
use crate::pipeline::element::Element;
use crate::pipeline::hybrid_chunking::HybridChunk;
use crate::pipeline::{DocumentSource, ElementBBox};
#[cfg(feature = "semantic")]
use serde::{Deserialize, Serialize};
/// A flattened, owned view of a [`HybridChunk`] carrying its text together with
/// per-element provenance (pages, bounding boxes, element type names).
///
/// Field descriptions below reflect how the constructors in this file populate
/// the struct. `Serialize`/`Deserialize` are derived only when the `semantic`
/// feature is enabled.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
#[non_exhaustive]
pub struct RagChunk {
/// Index supplied by the caller when the chunk is constructed.
pub chunk_index: usize,
/// Value returned by `HybridChunk::text()` for the source chunk.
pub text: String,
/// Value returned by `HybridChunk::full_text()` for the source chunk.
pub full_text: String,
/// Distinct page numbers of the chunk's elements, in ascending order.
pub page_numbers: Vec<u32>,
/// One bounding box per element, in element order.
pub bounding_boxes: Vec<ElementBBox>,
/// One type name per element (`Element::type_name()`), in element order.
pub element_types: Vec<String>,
/// Clone of the source chunk's `heading_context`, if any.
pub heading_context: Option<String>,
/// Value returned by `HybridChunk::token_estimate()`.
pub token_estimate: usize,
/// Value returned by `HybridChunk::is_oversized()`.
pub is_oversized: bool,
/// Metadata built by `ChunkMetadata::from_elements`; its `source` field is
/// set when the chunk is built via `from_hybrid_chunk_with_source`.
pub metadata: ChunkMetadata,
}
impl RagChunk {
    /// Builds a [`RagChunk`] from `chunk` without any document source.
    ///
    /// No document hash is passed to the metadata builder in this case.
    pub fn from_hybrid_chunk(chunk_index: usize, chunk: &HybridChunk) -> Self {
        Self::from_hybrid_chunk_inner(chunk_index, chunk, None)
    }

    /// Builds a [`RagChunk`] from `chunk` and records `source` on its metadata.
    ///
    /// The source's `doc_hash` (if present) is forwarded to the metadata
    /// builder, and a clone of `source` is stored in `metadata.source`.
    pub fn from_hybrid_chunk_with_source(
        chunk_index: usize,
        chunk: &HybridChunk,
        source: &DocumentSource,
    ) -> Self {
        let mut rag_chunk = Self::from_hybrid_chunk_inner(chunk_index, chunk, Some(source));
        rag_chunk.metadata.source = Some(source.clone());
        rag_chunk
    }

    /// Shared constructor: derives every field from `chunk`, optionally using
    /// the document hash carried by `source`.
    fn from_hybrid_chunk_inner(
        chunk_index: usize,
        chunk: &HybridChunk,
        source: Option<&DocumentSource>,
    ) -> Self {
        let elements = chunk.elements();
        let text = chunk.text();
        let full_text = chunk.full_text();

        // Borrow the hash string out of the source, if both are present.
        let doc_hash = match source {
            Some(src) => src.doc_hash.as_deref(),
            None => None,
        };

        // Metadata only borrows the texts, so they can be moved into `Self` below.
        let metadata =
            ChunkMetadata::from_elements(elements, &text, &full_text, chunk_index, doc_hash);

        Self {
            chunk_index,
            text,
            full_text,
            page_numbers: collect_pages(elements),
            bounding_boxes: elements.iter().map(|element| *element.bbox()).collect(),
            element_types: elements
                .iter()
                .map(|element| element.type_name().to_string())
                .collect(),
            heading_context: chunk.heading_context.clone(),
            token_estimate: chunk.token_estimate(),
            is_oversized: chunk.is_oversized(),
            metadata,
        }
    }

    /// Serializes this chunk to a JSON string via `serde_json`.
    ///
    /// # Errors
    ///
    /// Returns the `serde_json::Error` produced by `serde_json::to_string`.
    #[cfg(feature = "semantic")]
    pub fn to_json(&self) -> Result<String, serde_json::Error> {
        serde_json::to_string(self)
    }
}
/// Returns the distinct page numbers of `elements` in ascending order.
///
/// An empty slice yields an empty vector. When every element sits on the same
/// page, that single page is returned without building a set.
fn collect_pages(elements: &[Element]) -> Vec<u32> {
    let (head, tail) = match elements.split_first() {
        Some(parts) => parts,
        None => return Vec::new(),
    };

    // Fast path: the whole chunk lives on one page.
    let first_page = head.page();
    if tail.iter().all(|element| element.page() == first_page) {
        return vec![first_page];
    }

    // General case: deduplicate through a set, then order the survivors.
    let distinct: HashSet<u32> = elements.iter().map(|element| element.page()).collect();
    let mut pages: Vec<u32> = distinct.into_iter().collect();
    pages.sort_unstable();
    pages
}