text-splitter 0.30.0

Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python.
//! Test for `MarkdownSplitter` behavior.
use std::fs;

use fake::{Fake, Faker};
use itertools::Itertools;
use more_asserts::assert_le;
#[cfg(feature = "markdown")]
use text_splitter::{ChunkConfig, MarkdownSplitter};

#[cfg(feature = "markdown")]
#[test]
fn random_chunk_size() {
    let text = fs::read_to_string("tests/inputs/markdown/github_flavored.md").unwrap();

    for _ in 0..10 {
        let max_characters = Faker.fake();
        let splitter = MarkdownSplitter::new(ChunkConfig::new(max_characters).with_trim(false));
        let chunks = splitter.chunks(&text).collect::<Vec<_>>();

        assert_eq!(chunks.join(""), text);
        for chunk in chunks {
            assert_le!(chunk.chars().count(), max_characters);
        }
    }
}

#[cfg(feature = "markdown")]
#[test]
fn random_chunk_indices_increase() {
    let text = fs::read_to_string("tests/inputs/markdown/github_flavored.md").unwrap();

    for _ in 0..10 {
        let max_characters = Faker.fake::<usize>();
        let splitter = MarkdownSplitter::new(ChunkConfig::new(max_characters).with_trim(false));
        let indices = splitter.chunk_indices(&text).map(|(i, _)| i);

        assert!(indices.tuple_windows().all(|(a, b)| a < b));
    }
}

#[cfg(feature = "markdown")]
#[test]
fn fallsback_to_normal_text_split_if_no_markdown_content() {
    let chunk_config = ChunkConfig::new(10).with_trim(false);
    let splitter = MarkdownSplitter::new(chunk_config);
    let text = "Some text\n\nfrom a\ndocument";
    let chunks = splitter.chunks(text).collect::<Vec<_>>();

    assert_eq!(["Some text\n", "\nfrom a\n", "document"].to_vec(), chunks);
}

#[cfg(feature = "markdown")]
#[test]
fn split_by_rule() {
    let splitter = MarkdownSplitter::new(ChunkConfig::new(12).with_trim(false));
    let text = "Some text\n\n---\n\nwith a rule";
    let chunks = splitter.chunks(text).collect::<Vec<_>>();

    assert_eq!(["Some text\n\n", "---\n", "\nwith a rule"].to_vec(), chunks);
}

#[cfg(feature = "markdown")]
#[test]
fn split_by_rule_trim() {
    let splitter = MarkdownSplitter::new(12);
    let text = "Some text\n\n---\n\nwith a rule";
    let chunks = splitter.chunks(text).collect::<Vec<_>>();

    assert_eq!(["Some text", "---", "with a rule"].to_vec(), chunks);
}

#[cfg(feature = "markdown")]
#[test]
fn split_by_headers() {
    let splitter = MarkdownSplitter::new(ChunkConfig::new(30).with_trim(false));
    let text = "# Header 1\n\nSome text\n\n## Header 2\n\nwith headings\n";
    let chunks = splitter.chunks(text).collect::<Vec<_>>();

    assert_eq!(
        [
            "# Header 1\n\nSome text\n\n",
            "## Header 2\n\nwith headings\n"
        ]
        .to_vec(),
        chunks
    );
}

#[cfg(feature = "markdown")]
#[test]
fn subheadings_grouped_with_top_header() {
    let splitter = MarkdownSplitter::new(ChunkConfig::new(60).with_trim(false));
    let text = "# Header 1\n\nSome text\n\n## Header 2\n\nwith headings\n\n### Subheading\n\nand more text\n";
    let chunks = splitter.chunks(text).collect::<Vec<_>>();

    assert_eq!(
        [
            "# Header 1\n\nSome text\n\n",
            "## Header 2\n\nwith headings\n\n### Subheading\n\nand more text\n"
        ]
        .to_vec(),
        chunks
    );
}

#[cfg(feature = "markdown")]
#[test]
fn trimming_doesnt_trim_block_level_indentation_if_multiple_items() {
    let splitter = MarkdownSplitter::new(48);
    let text = "* Really long list item that is too big to fit\n\n  * Some Indented Text\n\n  * More Indented Text\n\n";
    let chunks = splitter.chunks(text).collect::<Vec<_>>();

    assert_eq!(
        [
            "* Really long list item that is too big to fit",
            "  * Some Indented Text\n\n  * More Indented Text"
        ]
        .to_vec(),
        chunks
    );
}

#[cfg(feature = "markdown")]
#[test]
fn trimming_does_trim_block_level_indentation_if_only_one_item() {
    let splitter = MarkdownSplitter::new(30);
    let text = "1. Really long list item\n\n  1. Some Indented Text\n\n  2. More Indented Text\n\n";
    let chunks = splitter.chunks(text).collect::<Vec<_>>();

    assert_eq!(
        [
            "1. Really long list item",
            "1. Some Indented Text",
            "2. More Indented Text"
        ]
        .to_vec(),
        chunks
    );
}