// semchunk_rs/lib.rs

// MIT License
//
// Copyright (c) 2024 Dominic Tarro
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
22
//! # High-performance text chunking in Rust
//!
//! A port of [umarbutler/semchunk](https://github.com/umarbutler/semchunk) into Rust for splitting text into semantically meaningful chunks.
//!
//! # Example
//!
//! ```
//! use semchunk_rs::Chunker;
//!
//! let chunker = Chunker::new(4, Box::new(|s: &str| s.len() - s.replace(" ", "").len() + 1));
//! let text = "The quick brown fox jumps over the lazy dog.";
//! let chunks = chunker.chunk(text);
//! assert_eq!(chunks, vec!["The quick brown fox", "jumps over the lazy", "dog."]);
//! ```
//!
//! With `rust_tokenizers`:
//!
//! ```
//! use rust_tokenizers::tokenizer::{RobertaTokenizer, Tokenizer};
//! use semchunk_rs::Chunker;
//!
//! let tokenizer = RobertaTokenizer::from_file(
//!    "data/roberta-base-vocab.json",
//!    "data/roberta-base-merges.txt",
//!    false,
//!    false,
//! ).expect("Error loading tokenizer");
//! let token_counter = Box::new(move |s: &str| tokenizer.tokenize(s).len());
//! let chunker = Chunker::new(4, token_counter);
//! let text = "The quick brown fox jumps over the lazy dog.";
//! let chunks = chunker.chunk(text);
//! assert_eq!(chunks, vec!["The quick brown fox", "jumps over the", "lazy dog."]);
//! ```

57pub mod chunker;
58pub mod splitter;
59
60pub use chunker::Chunker;
61pub use splitter::Splitter;