swiftide 0.3.1

Blazing fast document and code indexation
Documentation
use crate::{ingestion::IngestionNode, ingestion::IngestionStream, ChunkerTransformer};
use async_trait::async_trait;
use derive_builder::Builder;
use futures_util::{stream, StreamExt};
use text_splitter::{Characters, MarkdownSplitter};

/// Chunking transformer that splits the text of a node with a
/// [`MarkdownSplitter`] sized by character count.
///
/// Instances can be created through the `with_*` constructors or through the
/// derived owned-pattern builder (`ChunkMarkdownBuilder`).
#[derive(Debug, Builder)]
#[builder(pattern = "owned")]
pub struct ChunkMarkdown {
    /// Splitter used to cut a node's `chunk` text into smaller pieces.
    chunker: MarkdownSplitter<Characters>,
    /// Optional concurrency value handed back by
    /// `ChunkerTransformer::concurrency`. When not set on the builder it
    /// falls back to the type's default, i.e. `None`.
    #[builder(default)]
    concurrency: Option<usize>,
}

impl ChunkMarkdown {
    pub fn with_max_characters(max_characters: usize) -> Self {
        Self {
            chunker: MarkdownSplitter::new(max_characters),
            concurrency: None,
        }
    }

    pub fn with_chunk_range(range: std::ops::Range<usize>) -> Self {
        Self {
            chunker: MarkdownSplitter::new(range),
            concurrency: None,
        }
    }

    pub fn builder() -> ChunkMarkdownBuilder {
        ChunkMarkdownBuilder::default()
    }
}

#[async_trait]
impl ChunkerTransformer for ChunkMarkdown {
    /// Splits the `chunk` text of `node` with the configured markdown splitter
    /// and returns a boxed stream that yields one `Ok` node per piece.
    ///
    /// Each yielded node is a clone of the incoming node in which only the
    /// `chunk` field is replaced by the corresponding piece of text; all other
    /// fields are carried over unchanged.
    #[tracing::instrument(skip_all, name = "transformers.chunk_markdown")]
    async fn transform_node(&self, node: IngestionNode) -> IngestionStream {
        let mut template = node;

        // Move the source text out of the node before splitting. Every emitted
        // node overwrites `chunk` anyway, so cloning the template with the full
        // original text still inside would copy the whole document once per
        // piece only to throw that copy away immediately.
        let text = std::mem::take(&mut template.chunk);

        // The pieces borrow from `text`, so they are turned into owned strings
        // up front; the returned stream must not borrow from this function.
        let chunks = self
            .chunker
            .chunks(&text)
            .map(ToString::to_string)
            .collect::<Vec<String>>();

        stream::iter(chunks.into_iter().map(move |chunk| {
            Ok(IngestionNode {
                chunk,
                // `template.chunk` is empty at this point, so this clone only
                // pays for the remaining fields.
                ..template.clone()
            })
        }))
        .boxed()
    }

    /// Returns the concurrency value stored on this transformer, if any.
    fn concurrency(&self) -> Option<usize> {
        self.concurrency
    }
}