Skip to main content

cognee_chunking/
lib.rs

1//! Text chunking for Cognee, ported from the Python chunking hierarchy.
2//!
3//! Splits text through a word → sentence → paragraph hierarchy into
4//! token-bounded chunks. Zero-copy where possible (chunks borrow `&str` slices
5//! via byte-offset tracking).
6//!
7//! - [`text_chunker`] / [`cognify_pipeline`] — the chunking entry points
8//! - [`token_counter`] — the [`token_counter::TokenCounter`] trait and its `WordCounter`,
9//!   `HuggingFaceTokenCounter` (feature `hf-tokenizer`) and `TikTokenCounter` (feature
10//!   `tiktoken`) impls, selected by [`config`] (`TokenCounterKind::from_env`)
11
12pub mod chunk_by_paragraph;
13pub mod chunk_by_row;
14pub mod chunk_by_sentence;
15pub mod chunk_by_word;
16pub mod cognify_pipeline;
17pub mod config;
18pub mod cut_type;
19pub mod error;
20pub mod text_chunker;
21pub mod token_counter;
22
23#[cfg(test)]
24pub(crate) mod test_inputs;
25
26pub use chunk_by_row::chunk_by_row;
27pub use cognify_pipeline::ExtractTextChunksPipeline;
28pub use config::TokenCounterKind;
29pub use cut_type::CutType;
30pub use error::ChunkingError;
31pub use text_chunker::{NAMESPACE_OID, chunk_text};
32#[cfg(feature = "hf-tokenizer")]
33pub use token_counter::HuggingFaceTokenCounter;
34#[cfg(feature = "tiktoken")]
35pub use token_counter::TikTokenCounter;
36pub use token_counter::{TokenCounter, WordCounter};