Skip to main content

cognee_chunking/
lib.rs

1//! Text chunking for Cognee, ported from the Python chunking hierarchy.
2//!
3//! Splits text through a word → sentence → paragraph hierarchy into
4//! token-bounded chunks. Zero-copy where possible (chunks borrow `&str` slices
5//! via byte-offset tracking).
6//!
7//! - [`text_chunker`] / `cognify_pipeline` — the chunking entry points (the
8//!   latter is a plain code span, not an intra-doc link: it is gated off wasm32,
9//!   where the link would be unresolved on a `--target wasm32` doc build)
10//! - [`token_counter`] — the [`token_counter::TokenCounter`] trait and its
11//!   `WordCounter` / `HuggingFaceTokenCounter` / `TikTokenCounter` impls,
12//!   selected by [`config`] (`TokenCounterKind::from_env`)
13
// Public modules of the crate. The four `chunk_by_*` modules below carry no
// `cfg` gate, so they are declared on every target.
14pub mod chunk_by_paragraph;
15pub mod chunk_by_row;
16pub mod chunk_by_sentence;
17pub mod chunk_by_word;
18// cognify_pipeline pulls in cognee-storage (filesystem-coupled) + tokio; excluded
19// on wasm32, where only the pure chunking primitives are available.
20#[cfg(not(target_arch = "wasm32"))]
21pub mod cognify_pipeline;
// Ungated supporting modules. Going by the re-exports at the crate root:
// `config` provides `TokenCounterKind`, `cut_type` provides `CutType`,
// `error` provides `ChunkingError`, `text_chunker` provides `chunk_text` and
// `NAMESPACE_OID`, and `token_counter` provides the `TokenCounter` trait and
// its implementations.
22pub mod config;
23pub mod cut_type;
24pub mod error;
25pub mod text_chunker;
26pub mod token_counter;
27
// Compiled only in test builds (`cfg(test)`) and visible only inside this
// crate (`pub(crate)`), so it is never part of the public API.
// NOTE(review): presumably shared test fixtures — confirm against the module.
28#[cfg(test)]
29pub(crate) mod test_inputs;
30
// Crate-root re-exports: each name below is reachable directly from the crate
// root as well as through its defining module.
31pub use chunk_by_row::chunk_by_row;
// Gated with the same `cfg` as the `cognify_pipeline` module declaration, so
// this re-export is absent on wasm32 just like the module it comes from.
32#[cfg(not(target_arch = "wasm32"))]
33pub use cognify_pipeline::ExtractTextChunksPipeline;
34pub use config::TokenCounterKind;
35pub use cut_type::CutType;
36pub use error::ChunkingError;
37pub use text_chunker::{NAMESPACE_OID, chunk_text};
// Optional token counters: each is re-exported only when the matching Cargo
// feature (`hf-tokenizer` / `tiktoken`) is enabled.
38#[cfg(feature = "hf-tokenizer")]
39pub use token_counter::HuggingFaceTokenCounter;
40#[cfg(feature = "tiktoken")]
41pub use token_counter::TikTokenCounter;
// Re-exported unconditionally, independent of any feature flag.
42pub use token_counter::{TokenCounter, WordCounter};