use anyhow::Result;
use async_trait::async_trait;
use futures_util::{stream, StreamExt};
use crate::{
ingestion::{IngestionNode, IngestionStream},
integrations::treesitter::{ChunkSize, CodeSplitter, SupportedLanguages},
ChunkerTransformer,
};
/// Transformer that splits the text of an ingestion node into smaller chunks
/// by delegating to a [`CodeSplitter`].
///
/// Build it with [`ChunkCode::try_for_language`] or
/// [`ChunkCode::try_for_language_and_chunk_size`], and optionally tune it with
/// [`ChunkCode::with_concurrency`].
#[derive(Debug)]
pub struct ChunkCode {
/// Splitter used to break a node's `chunk` into pieces.
chunker: CodeSplitter,
/// Optional concurrency setting, reported as-is through
/// `ChunkerTransformer::concurrency`; `None` unless set via `with_concurrency`.
// NOTE(review): presumably the pipeline uses this to bound how many nodes
// are chunked in parallel — confirm against the caller.
concurrency: Option<usize>,
}
impl ChunkCode {
    /// Creates a chunker for `lang`, leaving every other splitter option at
    /// the builder's defaults.
    ///
    /// # Errors
    ///
    /// Returns an error if the builder's `try_language` rejects `lang`, or if
    /// building the [`CodeSplitter`] fails.
    pub fn try_for_language(lang: impl TryInto<SupportedLanguages>) -> Result<Self> {
        let chunker = CodeSplitter::builder().try_language(lang)?.build()?;

        Ok(Self {
            chunker,
            concurrency: None,
        })
    }

    /// Creates a chunker for `lang` with an explicit `chunk_size` passed to
    /// the splitter builder.
    ///
    /// # Errors
    ///
    /// Returns an error if the builder's `try_language` rejects `lang`, or if
    /// building the [`CodeSplitter`] fails.
    pub fn try_for_language_and_chunk_size(
        lang: impl TryInto<SupportedLanguages>,
        chunk_size: impl Into<ChunkSize>,
    ) -> Result<Self> {
        // Propagate build failures instead of panicking: this constructor is
        // already fallible, and `try_for_language` reports the same failure
        // through its `Result`.
        let chunker = CodeSplitter::builder()
            .try_language(lang)?
            .chunk_size(chunk_size)
            .build()?;

        Ok(Self {
            chunker,
            concurrency: None,
        })
    }

    /// Sets the concurrency value reported by this transformer and returns
    /// the updated instance, builder-style.
    pub fn with_concurrency(mut self, concurrency: usize) -> Self {
        self.concurrency = Some(concurrency);
        self
    }
}
#[async_trait]
impl ChunkerTransformer for ChunkCode {
    /// Splits `node.chunk` with the configured splitter and returns the
    /// result as a stream.
    ///
    /// On success the stream yields one `Ok(IngestionNode)` per chunk produced
    /// by the splitter; each is a clone of the input node with only `chunk`
    /// replaced. If splitting fails, the stream yields a single `Err` carrying
    /// the splitter's error so it can be handled downstream.
    #[tracing::instrument(skip_all, name = "transformers.chunk_code")]
    async fn transform_node(&self, node: IngestionNode) -> IngestionStream {
        // Bind the result first so the borrow of `node.chunk` has ended before
        // `node` is moved into the closure below.
        let split_result = self.chunker.split(&node.chunk);

        match split_result {
            Ok(chunks) => stream::iter(chunks.into_iter().map(move |chunk| {
                Ok(IngestionNode {
                    chunk,
                    ..node.clone()
                })
            }))
            .boxed(),
            // Forward the failure as a one-item stream rather than dropping it.
            Err(err) => stream::iter(vec![Err(err)]).boxed(),
        }
    }

    /// Returns the concurrency configured via `ChunkCode::with_concurrency`,
    /// or `None` if it was never set.
    fn concurrency(&self) -> Option<usize> {
        self.concurrency
    }
}