use std::collections::HashSet;
use crate::pipeline::element::Element;
use crate::pipeline::hybrid_chunking::HybridChunk;
use crate::pipeline::ElementBBox;
#[cfg(feature = "semantic")]
use serde::{Deserialize, Serialize};
/// Owned, flattened view of a chunk: its text plus per-element metadata.
///
/// Instances are normally produced by [`RagChunk::from_hybrid_chunk`], and the field
/// descriptions below state what that constructor stores. `Serialize` and `Deserialize`
/// are derived only when the `semantic` feature is enabled.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "semantic", derive(Serialize, Deserialize))]
pub struct RagChunk {
    /// Index of this chunk, exactly as passed to `from_hybrid_chunk` by the caller.
    pub chunk_index: usize,
    /// Value returned by the source chunk's `text()`.
    pub text: String,
    /// Value returned by the source chunk's `full_text()`.
    // NOTE(review): presumably `text` enriched with heading context — confirm in HybridChunk.
    pub full_text: String,
    /// Distinct page numbers of the chunk's elements, in ascending order.
    pub page_numbers: Vec<u32>,
    /// One bounding box per element, in element order.
    pub bounding_boxes: Vec<ElementBBox>,
    /// One type name per element, in element order (parallel to `bounding_boxes`).
    pub element_types: Vec<String>,
    /// Copy of the source chunk's `heading_context`, if it had one.
    pub heading_context: Option<String>,
    /// Value returned by the source chunk's `token_estimate()`.
    pub token_estimate: usize,
    /// Value returned by the source chunk's `is_oversized()`.
    pub is_oversized: bool,
}
impl RagChunk {
    /// Builds a `RagChunk` from `chunk`, tagging it with the caller-supplied `chunk_index`.
    ///
    /// Every element of the chunk contributes one entry to `bounding_boxes` and one to
    /// `element_types`, in element order. `page_numbers` holds the distinct pages of those
    /// elements in ascending order. The remaining fields are copied from the chunk's own
    /// accessors.
    pub fn from_hybrid_chunk(chunk_index: usize, chunk: &HybridChunk) -> Self {
        let members = chunk.elements();
        let page_numbers = collect_pages(members);

        // Both per-element vectors end up with exactly one entry per element.
        let mut bounding_boxes = Vec::with_capacity(members.len());
        let mut element_types = Vec::with_capacity(members.len());
        for member in members.iter() {
            bounding_boxes.push(*member.bbox());
            element_types.push(member.type_name().to_string());
        }

        Self {
            chunk_index,
            text: chunk.text(),
            full_text: chunk.full_text(),
            page_numbers,
            bounding_boxes,
            element_types,
            heading_context: chunk.heading_context.clone(),
            token_estimate: chunk.token_estimate(),
            is_oversized: chunk.is_oversized(),
        }
    }

    /// Serializes this chunk to a JSON string with `serde_json::to_string`.
    ///
    /// Only compiled when the `semantic` feature is enabled.
    ///
    /// # Errors
    ///
    /// Returns the `serde_json::Error` reported by the serializer, if any.
    #[cfg(feature = "semantic")]
    pub fn to_json(&self) -> Result<String, serde_json::Error> {
        serde_json::to_string(self)
    }
}
/// Returns the distinct page numbers of `elements`, sorted in ascending order.
///
/// An empty slice yields an empty vector.
fn collect_pages(elements: &[Element]) -> Vec<u32> {
    let anchor = match elements.first() {
        Some(head) => head.page(),
        None => return Vec::new(),
    };

    // Fast path: when every element is on the same page, skip the set entirely.
    if elements.iter().all(|item| item.page() == anchor) {
        return vec![anchor];
    }

    // `HashSet::insert` returns `true` only on first sight, so the filter keeps one
    // occurrence of each page.
    let mut visited = HashSet::new();
    let mut distinct: Vec<u32> = elements
        .iter()
        .map(|item| item.page())
        .filter(|page| visited.insert(*page))
        .collect();
    distinct.sort_unstable();
    distinct
}