Skip to main content

chunkshop/
lib.rs

1//! chunkshop-rs — Rust port of chunkshop.
2//!
3//! Implements sources (files / HTTP / S3 / DB tables), chunkers, a fastembed
4//! embedder, and a modular sink/backend layer (PG / MariaDB / SQLite /
5//! ClickHouse). The YAML config schema and target table shape match the
6//! Python reference so vectors are interchangeable across implementations.
7//!
8//! ## Cargo features
9//!
10//! `default = ["full"]` — preserves backward compatibility with `chunkshop = "0.3"`.
11//!
12//! Library consumers who want only the chunker structs (e.g. an embedded
13//! Postgres extension) can opt into the slim build:
14//!
15//! ```toml
16//! chunkshop = { version = "0.4", default-features = false, features = ["chunkers"] }
17//! ```
18//!
19//! Available features:
20//! - `chunkers` — chunker structs + their config types (no fastembed/ort/sqlx).
21//! - `embedder-core` — fastembed (BYO `try_new_from_user_defined`) + ORT.
22//!   No `hf-hub`, no auto-download. Caller supplies model bytes directly via
23//!   [`embedder::FastembedEmbedder::from_user_defined_files`].
24//! - `embedder-hub` — adds `hf-hub` for runtime auto-download. Enables
25//!   [`embedder::FastembedEmbedder::new`] (stock variants + Xenova int8 BGE
26//!   bit-near-exact) and the [`chunker::SemanticChunker::new`] convenience.
27//! - `embedder` — historical alias = `embedder-core` + `embedder-hub`.
28//!   Existing consumers see no change.
29//! - `extractor` — language detection + entity extractor.
30//! - `source` — files / HTTP / S3 / JSON-corpus source loaders (DB-table sources also need `sink`).
31//! - `sink` — the full modular sink/backend layer (PG/MariaDB/SQLite/ClickHouse).
32//! - `pipeline` — high-level Pipeline + run_cell glue.
33//! - `bakeoff` — chunker × embedder matrix evaluator.
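34//! - `full` — all of the above (default).
//!
//! Two more features appear in the `cfg` gates below but are not listed above:
//! `code-aware` (with `chunkers`, enables the `chunkers` module) and `memory`
//! (enables the `memory` staging module). See `Cargo.toml` for whether `full`
//! includes them.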
35
36// The entire modular sink/backend layer is folded under the `sink` feature
37// (deliberate v4 design decision — no per-backend features). DB-table sources
38// reuse this backend layer, so their fetchers are additionally gated.
39#[cfg(feature = "sink")]
40pub mod backends;
41#[cfg(feature = "bakeoff")]
42pub mod bakeoff;
43#[cfg(feature = "chunkers")]
44pub mod chunker;
// Note the plural: `chunkers` is a separate module from `chunker` above and is
// only compiled when both the `code-aware` and `chunkers` features are enabled.
45#[cfg(all(feature = "code-aware", feature = "chunkers"))]
46pub mod chunkers;
// `codeparse` and `config` have no feature gate, so they are part of every build.
47pub mod codeparse;
48pub mod config;
49#[cfg(feature = "embedder-core")]
50pub mod embedder;
51#[cfg(feature = "extractor")]
52pub mod extractor;
// `framer` shares the `pipeline` gate with the `pipeline` and `runner` modules below.
53#[cfg(feature = "pipeline")]
54pub mod framer;
55// `hf_cache` is the network-fetch path (HuggingFace download via hf-hub).
56// Slim consumers on `embedder-core` alone never compile this module.
// It is crate-private (`pub(crate)`), unlike the public modules around it.
57#[cfg(feature = "embedder-hub")]
58pub(crate) mod hf_cache;
59#[cfg(feature = "pipeline")]
60pub mod pipeline;
61#[cfg(feature = "pipeline")]
62pub mod runner;
// `sentence_split` is compiled only when the `chunkers` feature is on.
63#[cfg(feature = "chunkers")]
64pub mod sentence_split;
65#[cfg(feature = "sink")]
66pub mod sinks;
67// `sources` is always declared so the `Document` struct is always available
68// (chunkers consume `&Document`). The heavy fetcher impls inside this module
69// are themselves cfg-gated behind the `source` (and, for DB-table sources,
70// `sink`) features.
71pub mod sources;
72// RM-B Task 5: pluggable raw-bytes storage (filesystem + S3). Always-on for
73// LocalRawStore; the S3 backend is `source`-feature-gated for the
74// object_store dep.
75pub mod raw_store;
76// RM-A: zero-network Rust consolidator default + trait. Always-on (only the
77// staging/source/sink layer is `memory`-feature-gated).
78pub mod consolidators;
79/// RM-A: agent-memory staging API — chunkshop-owned append-only session
80/// staging table with deterministic event_id derivation (byte-identical
81/// to Python `chunkshop.memory.staging`).
82#[cfg(feature = "memory")]
83pub mod memory;
// `summarizer` sits behind the `chunkers` feature, same as `chunker` and
// `sentence_split`.
84#[cfg(feature = "chunkers")]
85pub mod summarizer;
86
// Crate-root re-exports. Each `pub use` below is gated at least as strictly as
// the module it names, so a re-export never refers to a module that was
// compiled out by the feature selection.
87#[cfg(feature = "sink")]
88pub use backends::{
89    AnyBackend, Backend, BackendConn, BackendDialect, ClickhouseBackend, ColSpec, MariadbBackend,
90    PostgresBackend, SQLiteBackend,
91};
92#[cfg(feature = "bakeoff")]
93pub use bakeoff::{run_bakeoff, run_bakeoff_with_base, BakeoffConfig, BakeoffResults};
94#[cfg(feature = "chunkers")]
95pub use chunker::{Chunk, SentenceAwareChunker};
// `config` is ungated, so these two items are exported in every build.
96pub use config::{load_config, CellConfig};
97#[cfg(feature = "embedder-core")]
98pub use embedder::FastembedEmbedder;
99#[cfg(feature = "pipeline")]
100pub use pipeline::Pipeline;
101#[cfg(feature = "pipeline")]
102pub use runner::{run_cell, CellResult};
103#[cfg(feature = "sink")]
104pub use sinks::{AnySink, ClickhouseSink, MariadbSink, PgSink, Sink, SqliteSink};
105// `Document` is always available; the fetcher sources are gated.
106pub use sources::Document;
// `AnySource` and the DB-table sources require both `source` and `sink`.
107#[cfg(all(feature = "source", feature = "sink"))]
108pub use sources::{
109    AnySource, ClickhouseTableSource, MariadbTableSource, PgTableSource, SqliteTableSource,
110};
// The remaining sources need only the `source` feature.
111#[cfg(feature = "source")]
112pub use sources::{FilesSource, HttpSource, JsonCorpusSource, S3Source};