use crate::pipeline::element::Element;
/// A batch of [`Element`]s that belong together, as produced by
/// [`ChunkingStrategy::chunk`].
///
/// The type is `#[non_exhaustive]`, so code outside this crate cannot build
/// it with a struct literal and should go through [`ChunkGroup::new`].
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct ChunkGroup {
    /// The elements that make up this group.
    pub elements: Vec<Element>,
    /// Heading text attached to the group, when there is one.
    // NOTE(review): presumably the heading the elements appear under —
    // confirm against the chunking strategies that fill this in.
    pub heading_context: Option<String>,
}
impl ChunkGroup {
pub fn new(elements: Vec<Element>, heading_context: Option<String>) -> Self {
Self {
elements,
heading_context,
}
}
}
/// A pluggable way of splitting a sequence of elements into [`ChunkGroup`]s.
///
/// Implementors must be `Send + Sync` so a boxed strategy can be shared
/// across threads.
pub trait ChunkingStrategy: Sync + Send {
    /// Partitions `elements` into zero or more groups.
    ///
    /// The input is only borrowed; the returned groups own their elements.
    fn chunk(&self, elements: &[Element]) -> Vec<ChunkGroup>;
}
/// A textual class label backed by a `Cow<'static, str>`.
///
/// Anything convertible into `Cow<'static, str>` can be used to build one,
/// e.g. a `&'static str` (kept borrowed) or a `String` (kept owned).
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct ClassLabel(pub std::borrow::Cow<'static, str>);

impl ClassLabel {
    /// Wraps `label` after converting it into a `Cow<'static, str>`.
    pub fn new(label: impl Into<std::borrow::Cow<'static, str>>) -> Self {
        let text: std::borrow::Cow<'static, str> = label.into();
        ClassLabel(text)
    }

    /// Borrows the label text as a string slice.
    pub fn as_str(&self) -> &str {
        self.0.as_ref()
    }
}

/// Lets a `ClassLabel` be passed wherever `AsRef<str>` is accepted.
impl AsRef<str> for ClassLabel {
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}

/// Consumes the label and yields its text as an owned `String`.
impl From<ClassLabel> for String {
    fn from(label: ClassLabel) -> String {
        let ClassLabel(text) = label;
        text.into_owned()
    }
}

/// Compares the label text against an unsized `str`.
impl PartialEq<str> for ClassLabel {
    fn eq(&self, other: &str) -> bool {
        self.as_str() == other
    }
}

/// Compares the label text against a `&str`, so `label == "literal"` works.
impl PartialEq<&str> for ClassLabel {
    fn eq(&self, other: &&str) -> bool {
        self.as_str() == *other
    }
}
/// Borrowed context passed to [`ElementClassifier::classify`] alongside the
/// element being classified.
pub struct ClassifyContext<'elems> {
    /// A slice of elements made available to the classifier.
    pub elements: &'elems [Element],
    /// A position value supplied with the slice.
    // NOTE(review): presumably the index of the current element within
    // `elements` — confirm against the call site that builds this context.
    pub index: usize,
}
/// A pluggable classifier that can assign a [`ClassLabel`] to an element.
///
/// Implementors must be `Send + Sync` so a boxed classifier can be shared
/// across threads.
pub trait ElementClassifier: Sync + Send {
    /// Returns a label for `element`, or `None`.
    ///
    /// `ctx` carries the borrowed element slice and index that accompany the
    /// call.
    // NOTE(review): `None` presumably means "no label applies" — confirm
    // how callers treat it.
    fn classify(&self, element: &Element, ctx: &ClassifyContext<'_>) -> Option<ClassLabel>;
}
/// Borrowed inputs handed to [`MetadataEnricher::enrich`].
///
/// Only compiled when the `semantic` feature is enabled.
#[cfg(feature = "semantic")]
pub struct EnrichContext<'ctx> {
    /// Text made available to the enricher.
    // NOTE(review): presumably the text of the chunk being enriched — confirm.
    pub text: &'ctx str,
    /// Elements made available to the enricher.
    pub elements: &'ctx [Element],
    /// A sequence of heading strings.
    // NOTE(review): presumably outermost-to-innermost headings — confirm.
    pub heading_path: &'ctx [String],
}
/// A pluggable hook that can modify chunk metadata.
///
/// Only compiled when the `semantic` feature is enabled. Implementors must be
/// `Send + Sync`.
#[cfg(feature = "semantic")]
pub trait MetadataEnricher: Sync + Send {
    /// Inspects `ctx` and updates `meta` in place.
    ///
    /// Nothing is returned; any result is communicated through `meta`.
    fn enrich(&self, ctx: &EnrichContext<'_>, meta: &mut crate::pipeline::ChunkMetadata);
}
use crate::pipeline::hybrid_chunking::{HybridChunkConfig, HybridChunker};
use crate::pipeline::{DocumentSource, PartitionConfig};
/// Holds the configurable parts of an analysis run.
///
/// Values are created with [`AnalysisPipeline::new`] (or `Default`) and then
/// customised through the consuming `with_*` builder methods.
pub struct AnalysisPipeline {
    /// Strategy used to split elements into chunk groups.
    pub(crate) chunking: Box<dyn ChunkingStrategy>,
    /// Token limit stored on the pipeline itself.
    pub(crate) max_tokens: usize,
    /// Optional document source.
    pub(crate) source: Option<DocumentSource>,
    /// Optional element classifier.
    pub(crate) classifier: Option<Box<dyn ElementClassifier>>,
    /// Partitioning configuration.
    pub(crate) partition_config: PartitionConfig,
    /// Registered metadata enrichers, in insertion order (`semantic` feature only).
    #[cfg(feature = "semantic")]
    pub(crate) enrichers: Vec<Box<dyn MetadataEnricher>>,
}
impl Default for AnalysisPipeline {
fn default() -> Self {
Self::new()
}
}
impl AnalysisPipeline {
    /// Creates a pipeline with default settings.
    ///
    /// The chunking strategy is a [`HybridChunker`] built from
    /// [`HybridChunkConfig::default`], and `max_tokens` is taken from that
    /// same config. No source or classifier is set, the partition config is
    /// [`PartitionConfig::default`], and (with the `semantic` feature) the
    /// enricher list starts out empty.
    pub fn new() -> Self {
        let defaults = HybridChunkConfig::default();
        // Read the token limit before `defaults` is handed to the chunker.
        let max_tokens = defaults.max_tokens;
        let chunking: Box<dyn ChunkingStrategy> = Box::new(HybridChunker::new(defaults));
        Self {
            chunking,
            max_tokens,
            source: None,
            classifier: None,
            partition_config: PartitionConfig::default(),
            #[cfg(feature = "semantic")]
            enrichers: Vec::new(),
        }
    }

    /// Replaces the chunking strategy and returns the updated pipeline.
    #[must_use]
    pub fn with_chunking(self, strategy: Box<dyn ChunkingStrategy>) -> Self {
        Self {
            chunking: strategy,
            ..self
        }
    }

    /// Sets the pipeline's `max_tokens` field and returns the updated pipeline.
    ///
    /// Only the field on the pipeline changes; the chunking strategy is left
    /// untouched.
    // NOTE(review): the default chunker created in `new` keeps the config it
    // was built with — confirm whether the two limits are meant to stay in sync.
    #[must_use]
    pub fn with_max_tokens(self, max_tokens: usize) -> Self {
        Self { max_tokens, ..self }
    }

    /// Replaces the partition configuration and returns the updated pipeline.
    #[must_use]
    pub fn with_partition_config(self, config: PartitionConfig) -> Self {
        Self {
            partition_config: config,
            ..self
        }
    }

    /// Sets the document source and returns the updated pipeline.
    #[must_use]
    pub fn with_source(self, source: DocumentSource) -> Self {
        Self {
            source: Some(source),
            ..self
        }
    }

    /// Sets the element classifier and returns the updated pipeline.
    #[must_use]
    pub fn with_classifier(self, classifier: Box<dyn ElementClassifier>) -> Self {
        Self {
            classifier: Some(classifier),
            ..self
        }
    }

    /// Appends `enricher` after any previously registered enrichers and
    /// returns the updated pipeline (`semantic` feature only).
    #[cfg(feature = "semantic")]
    #[must_use]
    pub fn with_enricher(self, enricher: Box<dyn MetadataEnricher>) -> Self {
        let mut enrichers = self.enrichers;
        enrichers.push(enricher);
        Self { enrichers, ..self }
    }
}