// chonkier 0.0.2
//
// 🦛 Chonkie, now in Rust 🦀: No-nonsense, ultra-fast, ultra-light chunking library
// Documentation
/*
    A simple example of recursive chunking with Chonkier's RecursiveChunker.
*/
use chonkier::chunker::recursive::RecursiveChunker;
use chonkier::types::RecursiveRules;
#[cfg(feature = "tokenizers")]
use chonkier::tokenizer::hftokenizers::HFTokenizer;
use std::fs::{self, File};
use std::io::{BufWriter, Write};

use serde::Serialize;

/// Chunks `examples/data/rag.txt` with a `RecursiveChunker` and writes the
/// resulting chunks to `chunks.jsonl`, one JSON document per line.
///
/// Both paths are relative, so they resolve against the current working
/// directory (the project root when launched through Cargo from there).
///
/// # Errors
///
/// Returns an error if the input file cannot be read, if the output file
/// cannot be created or written, or if a chunk fails to serialize.
#[cfg(feature = "tokenizers")]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Read the contents of rag.txt
    let text = fs::read_to_string("examples/data/rag.txt")?; // Use relative path to the project root

    // Create a RecursiveChunker instance backed by the pretrained "gpt2" tokenizer.
    // NOTE(review): 128 is presumably the maximum chunk size in tokens — confirm
    // against the RecursiveChunker::new signature.
    let chunker = RecursiveChunker::new(
        HFTokenizer::from_pretrained("gpt2"),
        128,
        RecursiveRules::default(),
    );

    // Chunk the text
    let chunks = chunker.chunk(&text);

    // Open a file to write the chunks to in JSON Lines format.
    // Buffering avoids issuing one small write per chunk.
    let file = File::create("chunks.jsonl")?;
    let mut writer = BufWriter::new(file);

    // Serialize each chunk to JSON and emit it as its own line
    for chunk in chunks {
        let json_string = serde_json::to_string(&chunk)?;
        writeln!(writer, "{}", json_string)?;
    }

    // Flush explicitly: dropping a BufWriter also flushes, but it discards any
    // error, so a failed final write would go unnoticed.
    writer.flush()?;

    println!("Chunks written to chunks.jsonl");

    Ok(())
}

/// Fallback entry point for builds without the `tokenizers` feature.
///
/// The real example depends on `HFTokenizer`, which is only imported when that
/// feature is enabled. Without this stub the example would have no `main` at
/// all in a feature-less build and would fail to compile.
#[cfg(not(feature = "tokenizers"))]
fn main() {
    eprintln!(
        "This example requires the `tokenizers` feature; \
         re-run with `--features tokenizers`."
    );
}