// text-splitter 0.30.1
//
// Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python.
//! Tests for `CodeSplitter` behavior.
use std::fs;

use fake::{Fake, Faker};
use itertools::Itertools;
use more_asserts::assert_le;
#[cfg(feature = "code")]
use text_splitter::{ChunkConfig, CodeSplitter};

/// Round-trip check over ten randomly generated character limits with
/// trimming disabled: the chunks must concatenate back to the exact input,
/// and every chunk must hold at most the generated number of characters.
#[cfg(feature = "code")]
#[test]
fn random_chunk_size() {
    let source = fs::read_to_string("tests/inputs/code/hashbrown_set_rs.txt").unwrap();

    for _ in 0..10 {
        let limit: usize = Faker.fake();
        let config = ChunkConfig::new(limit).with_trim(false);
        let splitter = CodeSplitter::new(tree_sitter_rust::LANGUAGE, config).unwrap();
        let pieces: Vec<_> = splitter.chunks(&source).collect();

        // With trimming off, gluing the chunks together must reproduce the input.
        assert_eq!(pieces.concat(), source);

        // Lengths are measured in characters, not bytes.
        for length in pieces.iter().map(|piece| piece.chars().count()) {
            assert_le!(length, limit);
        }
    }
}

/// For ten randomly generated character limits, the first element of every
/// pair yielded by `chunk_indices` must be strictly greater than the one
/// before it.
#[cfg(feature = "code")]
#[test]
fn random_chunk_indices_increase() {
    let source = fs::read_to_string("tests/inputs/code/hashbrown_set_rs.txt").unwrap();

    for _ in 0..10 {
        let limit: usize = Faker.fake();
        let config = ChunkConfig::new(limit);
        let splitter = CodeSplitter::new(tree_sitter_rust::LANGUAGE, config).unwrap();

        // Compare each index with its successor; equal neighbours also fail.
        let strictly_increasing = splitter
            .chunk_indices(&source)
            .map(|(offset, _)| offset)
            .tuple_windows()
            .all(|(earlier, later)| earlier < later);

        assert!(strictly_increasing);
    }
}

/// Input that is plain prose rather than Rust source must still be split
/// without panicking, and with trimming disabled the chunks must concatenate
/// back to the original text.
#[cfg(feature = "code")]
#[test]
fn can_handle_invalid_code() {
    let text = "No code here";

    let config = ChunkConfig::new(5).with_trim(false);
    let splitter = CodeSplitter::new(tree_sitter_rust::LANGUAGE, config).unwrap();

    // Concatenate the chunks directly while collecting.
    let rejoined: String = splitter.chunks(text).collect();

    assert_eq!(rejoined, text);
}

/// With a capacity of 24, four one-line functions are expected to come out
/// as two chunks holding two functions each. The input's leading newline is
/// absent from the expected chunks.
#[cfg(feature = "code")]
#[test]
fn groups_functions() {
    let text = "
fn fn1() {}
fn fn2() {}
fn fn3() {}
fn fn4() {}";

    let config = ChunkConfig::new(24);
    let splitter = CodeSplitter::new(tree_sitter_rust::LANGUAGE, config).unwrap();
    let chunks: Vec<_> = splitter.chunks(text).collect();

    let expected = ["fn fn1() {}\nfn fn2() {}", "fn fn3() {}\nfn fn4() {}"];
    assert_eq!(chunks, expected);
}

/// With a capacity of 30, a function that has a multi-line body is expected
/// to form its own chunk: `fn1` stands alone, `fn2` is emitted whole with its
/// body, and the trailing `fn3`/`fn4` pair shares the last chunk.
#[cfg(feature = "code")]
#[test]
fn groups_functions_with_children() {
    let text = "
fn fn1() {}
fn fn2() {
    let x = 4;
}
fn fn3() {}
fn fn4() {}";

    let config = ChunkConfig::new(30);
    let splitter = CodeSplitter::new(tree_sitter_rust::LANGUAGE, config).unwrap();
    let chunks: Vec<_> = splitter.chunks(text).collect();

    let expected = [
        "fn fn1() {}",
        "fn fn2() {\n    let x = 4;\n}",
        "fn fn3() {}\nfn fn4() {}",
    ];
    assert_eq!(chunks, expected);
}

/// With a capacity of 24 and an overlap of 12, each expected chunk starts
/// with the function that ended the previous chunk, so four one-line
/// functions produce three two-function chunks.
#[cfg(feature = "code")]
#[test]
fn functions_overlap() {
    let text = "
fn fn1() {}
fn fn2() {}
fn fn3() {}
fn fn4() {}";

    // `with_overlap` is fallible, hence the first unwrap.
    let config = ChunkConfig::new(24).with_overlap(12).unwrap();
    let splitter = CodeSplitter::new(tree_sitter_rust::LANGUAGE, config).unwrap();
    let chunks: Vec<_> = splitter.chunks(text).collect();

    let expected = [
        "fn fn1() {}\nfn fn2() {}",
        "fn fn2() {}\nfn fn3() {}",
        "fn fn3() {}\nfn fn4() {}",
    ];
    assert_eq!(chunks, expected);
}