//! niblits 0.3.10
//!
//! Token-aware, multi-format text chunking library with language-aware semantic splitting.
use std::path::Path;

use super::{ChunkStream, Chunker, ConcreteSizer, MarkdownChunker};
use crate::{Tokenizer, languages::PeekableReader, types::*};
use async_trait::async_trait;
use htmd::HtmlToMarkdown;
use tokio::io::{AsyncRead, AsyncReadExt};

/// Chunker for HTML input.
///
/// Converts HTML to Markdown (via `htmd`) and then delegates all actual
/// splitting to the wrapped [`MarkdownChunker`], so HTML documents get the
/// same heading-aware semantic chunking as native Markdown files.
#[derive(Clone)]
pub struct HtmlChunker {
  // Performs the real chunking once the HTML has been converted to Markdown.
  markdown_chunker: MarkdownChunker,
}

impl HtmlChunker {
  /// Creates an `HtmlChunker` from a tokenizer selector.
  ///
  /// The tokenizer is first converted into a concrete sizer; conversion
  /// failures surface as a [`ChunkError`].
  pub fn new(max_chunk_size: usize, tokenizer_type: Tokenizer, chunk_overlap: usize) -> Result<Self, ChunkError> {
    let sizer: ConcreteSizer = tokenizer_type.try_into()?;
    Ok(Self::new_with_sizer(max_chunk_size, chunk_overlap, sizer))
  }

  /// Creates an `HtmlChunker` around an already-constructed chunk sizer.
  ///
  /// Infallible variant of [`Self::new`]: the inner Markdown chunker is built
  /// directly from the supplied sizer.
  pub fn new_with_sizer(max_chunk_size: usize, chunk_overlap: usize, chunk_sizer: ConcreteSizer) -> Self {
    let markdown_chunker = MarkdownChunker::new_with_sizer(max_chunk_size, chunk_overlap, chunk_sizer);
    Self { markdown_chunker }
  }
}

#[async_trait]
impl Chunker for HtmlChunker {
  /// Decides whether this chunker should handle the stream.
  ///
  /// Sniffs up to the first 8 KiB of content and accepts the reader only when
  /// `infer` classifies it as `text/html`; otherwise (including peek errors)
  /// the reader is handed back untouched via `Err` so another chunker can try.
  async fn applies(
    &self,
    _file_path: &Path,
    mut reader: PeekableReader<Box<dyn AsyncRead + Unpin + Send>>,
  ) -> Result<PeekableReader<Box<dyn AsyncRead + Unpin + Send>>, PeekableReader<Box<dyn AsyncRead + Unpin + Send>>> {
    let is_html = match reader.peek_content(8192).await {
      Ok(bytes) => matches!(infer::get(&bytes), Some(kind) if kind.mime_type() == "text/html"),
      Err(_) => false,
    };

    if is_html { Ok(reader) } else { Err(reader) }
  }

  /// Streams semantic chunks for an HTML document.
  ///
  /// Reads the whole input into memory, converts it to Markdown, and delegates
  /// splitting to the inner Markdown chunker. An empty input yields nothing;
  /// otherwise every text chunk is emitted followed by one `EndOfFile` marker
  /// carrying the total chunk count.
  async fn chunk(&self, file_path: &Path, mut reader: Box<dyn AsyncRead + Unpin + Send>) -> ChunkStream {
    let chunker = self.clone();
    let owned_path = file_path.to_path_buf();
    let eof_file_path = owned_path.to_string_lossy().to_string();
    Box::pin(async_stream::try_stream! {
      let mut raw = Vec::new();
      reader.read_to_end(&mut raw).await?;
      if raw.is_empty() {
        return;
      }

      // Invalid UTF-8 sequences are replaced rather than rejected.
      let html_text = String::from_utf8_lossy(&raw).into_owned();
      let markdown = HtmlToMarkdown::new()
        .convert(&html_text)
        .map_err(|err| ChunkError::ParseError(format!("Failed to convert HTML to Markdown: {err}")))?;

      let semantic_chunks = chunker
        .markdown_chunker
        .chunk_markdown_string(markdown, Some(owned_path.as_path()))?;

      // Track how many text chunks were emitted so the EOF marker can report it.
      let mut emitted = 0usize;
      for (index, semantic_chunk) in semantic_chunks.into_iter().enumerate() {
        emitted = index + 1;
        yield Chunk::Text(semantic_chunk);
      }

      if emitted > 0 {
        yield Chunk::EndOfFile {
          file_path: eof_file_path,
          content: None,
          content_hash: None,
          file_metadata: None,
          file_symbols: None,
          expected_chunks: emitted,
        };
      }
    })
  }
}

#[cfg(test)]
mod tests {
  use super::*;
  use crate::{Tokenizer, chunker::memory_async_reader, types::Chunk};
  use futures::StreamExt;

  /// End-to-end check: HTML input is converted to Markdown and the heading
  /// text survives the conversion + chunking pipeline.
  #[tokio::test]
  async fn chunk_html_converts_to_markdown() {
    let chunker = HtmlChunker::new(64, Tokenizer::Characters, 0).unwrap();
    let html = "<html><body><h1>Heading</h1><p>Paragraph</p></body></html>";
    let reader = memory_async_reader(html.as_bytes().to_vec());
    let mut stream = chunker.chunk(Path::new("page.html"), reader).await;

    // Collect only the text chunks; EndOfFile and friends are ignored here.
    let mut texts: Vec<String> = Vec::new();
    while let Some(item) = stream.next().await {
      match item.expect("chunking HTML should succeed") {
        Chunk::Text(semantic) => texts.push(semantic.text),
        _ => {}
      }
    }

    assert!(
      texts.iter().any(|t| t.contains("Heading")),
      "expected markdown content to include heading"
    );
  }
}