entelix_rag/document.rs
1//! `Document` — the unit RAG pipelines move around.
2//!
3//! Distinct from [`entelix_memory::Document`]: the latter is a
4//! *retrieval result* (carries a similarity score from a vector
5//! store); this is the *ingestion / processing* shape (carries
6//! provenance under [`Source`] and split-history under
7//! [`Lineage`]). Splitters, chunkers, and ingestion pipelines move
8//! `entelix_rag::Document` values; the final write-to-vector-store
9//! step converts to `entelix_memory::Document`.
10//!
11//! The two shapes are deliberately uncoupled — retrieval has no
12//! need for `Source` / `Lineage` (that information lives in
13//! `metadata` on persistent storage), and ingestion has no
14//! similarity score until retrieval happens.
15
16use std::sync::Arc;
17
18use chrono::{DateTime, Utc};
19use entelix_memory::Namespace;
20use serde::{Deserialize, Serialize};
21
22/// Stable identifier for a `Document` within its [`Namespace`].
23/// Loaders mint these from the source's natural id (S3 object key,
24/// Notion page id, file path); splitters derive child ids by
25/// suffixing the parent id with `:<chunk_index>`.
26///
27/// Held as `Arc<str>` so cloning a `Document` (and the chunk tree
28/// a splitter produces) is an atomic refcount bump rather than a
29/// fresh string allocation per chunk. Mirrors the
30/// [`entelix_core::TenantId`] interning pattern.
31#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
32#[serde(transparent)]
33pub struct DocumentId(Arc<str>);
34
35impl DocumentId {
36 /// Build an id from any string-like value. Empty ids are
37 /// rejected at construction time — silent mismatch with stored
38 /// records on retrieval is a class of bug not worth admitting.
39 ///
40 /// # Panics
41 ///
42 /// Panics when `id` is empty after `Into::into`. Empty
43 /// document ids are a programmer error, not a runtime
44 /// condition the pipeline should silently paper over.
45 #[must_use]
46 pub fn new(id: impl Into<String>) -> Self {
47 let s: String = id.into();
48 assert!(!s.is_empty(), "DocumentId must not be empty");
49 Self(Arc::from(s))
50 }
51
52 /// Borrow the id as a string slice.
53 #[must_use]
54 pub fn as_str(&self) -> &str {
55 &self.0
56 }
57}
58
59impl std::fmt::Display for DocumentId {
60 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
61 f.write_str(&self.0)
62 }
63}
64
65impl From<String> for DocumentId {
66 fn from(s: String) -> Self {
67 Self::new(s)
68 }
69}
70
71impl From<&str> for DocumentId {
72 fn from(s: &str) -> Self {
73 Self::new(s)
74 }
75}
76
77/// Where a `Document` originated. Survives every split and chunker
78/// pass — the leaf chunk knows the source URI of the parent
79/// document and which loader produced it.
80///
81/// `etag` enables idempotent re-ingestion: pipelines compare the
82/// loader-reported etag against the stored value and skip
83/// reprocessing when unchanged. Loaders that lack a natural etag
84/// (in-memory, ephemeral) leave it `None` and the pipeline falls
85/// back to time-based deduplication.
86#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
87#[non_exhaustive]
88pub struct Source {
89 /// URI of the source — `file:///path`, `s3://bucket/key`,
90 /// `notion://workspace/page-id`, etc. Pipelines treat this as
91 /// opaque; loaders define the scheme.
92 pub uri: String,
93 /// Stable identifier of the loader implementation. `"web"`,
94 /// `"s3"`, `"notion"`, etc. — used for dashboards and replay
95 /// routing, not for behaviour gating. Carried as `String`
96 /// (rather than `&'static str`) so persisted documents
97 /// reconstruct via serde without forcing every loader name to
98 /// be a string literal.
99 pub loader: String,
100 /// When the loader fetched this document. Pipelines stamp this
101 /// at fetch time so re-ingestion ages-out properly.
102 pub fetched_at: DateTime<Utc>,
103 /// Optional content-version tag the source surfaces (HTTP
104 /// `ETag`, S3 object `etag`, Notion `last_edited_time` hash).
105 /// `None` when the source has no natural content-versioning
106 /// signal.
107 pub etag: Option<String>,
108}
109
110impl Source {
111 /// Build a source descriptor stamped at the current wall
112 /// clock. Loaders whose natural fetch time differs from "now"
113 /// (replay, batch import) construct via the struct literal.
114 #[must_use]
115 pub fn now(uri: impl Into<String>, loader: impl Into<String>) -> Self {
116 Self {
117 uri: uri.into(),
118 loader: loader.into(),
119 fetched_at: Utc::now(),
120 etag: None,
121 }
122 }
123
124 /// Builder-style etag attachment.
125 #[must_use]
126 pub fn with_etag(mut self, etag: impl Into<String>) -> Self {
127 self.etag = Some(etag.into());
128 self
129 }
130}
131
132/// Split-history — survives every transformation. A leaf chunk's
133/// `Lineage` describes which parent it came from, which split
134/// produced it, and which chunkers ran over it. Audit / debug
135/// flows reconstruct the path from a retrieval hit back to the
136/// ingestion source by walking the lineage chain (parent_id →
137/// loader's source URI).
138///
139/// `None` on the original `Document` produced by a `DocumentLoader`
140/// — only chunked descendants carry lineage.
141#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
142#[non_exhaustive]
143pub struct Lineage {
144 /// Document id of the parent this chunk descends from. Walks
145 /// up through every split layer to the original loaded
146 /// document.
147 pub parent_id: DocumentId,
148 /// Position of this chunk within the immediate parent's split
149 /// output. Zero-based.
150 pub chunk_index: u32,
151 /// Total number of chunks the parent produced. Lets retrieval
152 /// surfaces show "chunk 3 of 12" provenance.
153 pub total_chunks: u32,
154 /// Stable identifier of the splitter that produced this
155 /// chunk. `"recursive-character"`, `"markdown-structure"`,
156 /// etc. — surfaces in audit dashboards.
157 pub splitter: String,
158 /// Stable identifiers of every chunker that processed this
159 /// chunk after the split, in order. `"contextual"`,
160 /// `"hyde"`, … — empty when no chunker ran.
161 pub chunker_chain: Vec<String>,
162}
163
164impl Lineage {
165 /// Build the lineage entry a splitter stamps onto each child
166 /// chunk. The chunker chain starts empty; downstream chunkers
167 /// append themselves via [`Self::push_chunker`].
168 #[must_use]
169 pub fn from_split(
170 parent_id: DocumentId,
171 chunk_index: u32,
172 total_chunks: u32,
173 splitter: impl Into<String>,
174 ) -> Self {
175 Self {
176 parent_id,
177 chunk_index,
178 total_chunks,
179 splitter: splitter.into(),
180 chunker_chain: Vec::new(),
181 }
182 }
183
184 /// Append a chunker identifier to this chunk's chain — called
185 /// by `Chunker` impls when they transform a chunk.
186 pub fn push_chunker(&mut self, chunker: impl Into<String>) {
187 self.chunker_chain.push(chunker.into());
188 }
189}
190
191/// The unit a RAG pipeline moves around — content plus everything
192/// downstream needs to know about where it came from.
193///
194/// Splitters consume one `Document` and emit several. Chunkers
195/// consume a sequence and emit a transformed sequence (typically
196/// the same length, with mutated `content` or `metadata`).
197/// Loaders produce them; ingestion pipelines consume them.
198///
199/// `metadata` is operator-defined free-form JSON for filtering at
200/// the vector-store layer — explicit fields above (`source`,
201/// `lineage`, `namespace`) are the SDK-stamped boundary that every
202/// pipeline must preserve.
203#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
204#[non_exhaustive]
205pub struct Document {
206 /// Stable identifier for this document within its namespace.
207 pub id: DocumentId,
208 /// The textual content. Splitters slice this; chunkers may
209 /// rewrite it (Contextual Retrieval prepends a generated
210 /// context prefix).
211 pub content: String,
212 /// Operator-supplied free-form metadata. Vector stores
213 /// typically expose this as a filterable JSON column.
214 #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
215 pub metadata: serde_json::Value,
216 /// Origin of this document or its top-level ancestor.
217 pub source: Source,
218 /// Split / chunk ancestry. `None` on the loader-produced root;
219 /// `Some` on every chunked descendant.
220 #[serde(default, skip_serializing_if = "Option::is_none")]
221 pub lineage: Option<Lineage>,
222 /// Multi-tenant boundary (invariant 11). Every persistent
223 /// boundary the pipeline crosses respects this — silent
224 /// cross-tenant leakage is structurally impossible because
225 /// every loader, splitter, and pipeline takes a `Namespace`
226 /// at construction time.
227 pub namespace: Namespace,
228}
229
230impl Document {
231 /// Construct a fresh root document — the shape a
232 /// [`crate::DocumentLoader`] emits before any splitter has
233 /// run. `lineage` is `None`; chunked descendants populate it.
234 #[must_use]
235 pub fn root(
236 id: impl Into<DocumentId>,
237 content: impl Into<String>,
238 source: Source,
239 namespace: Namespace,
240 ) -> Self {
241 Self {
242 id: id.into(),
243 content: content.into(),
244 metadata: serde_json::Value::Null,
245 source,
246 lineage: None,
247 namespace,
248 }
249 }
250
251 /// Builder-style metadata setter.
252 #[must_use]
253 pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
254 self.metadata = metadata;
255 self
256 }
257
258 /// Derive a child document for a chunk. The splitter supplies
259 /// the per-chunk content + lineage; the new id is the parent's
260 /// id with a `:{chunk_index}` suffix so leaves carry
261 /// hierarchical identifiers stable across re-runs.
262 #[must_use]
263 pub fn child(&self, content: impl Into<String>, lineage: Lineage) -> Self {
264 let child_id = format!("{}:{}", self.id, lineage.chunk_index);
265 Self {
266 id: DocumentId::new(child_id),
267 content: content.into(),
268 metadata: self.metadata.clone(),
269 source: self.source.clone(),
270 lineage: Some(lineage),
271 namespace: self.namespace.clone(),
272 }
273 }
274}
275
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Namespace fixture bound to a fixed test tenant.
    fn ns() -> Namespace {
        let tenant = entelix_core::TenantId::new("acme");
        Namespace::new(tenant)
    }

    /// Source fixture stamped at the current time.
    fn src() -> Source {
        Source::now("file:///tmp/doc.md", "test")
    }

    #[test]
    fn document_id_rejects_empty() {
        let outcome = std::panic::catch_unwind(|| DocumentId::new(""));
        assert!(outcome.is_err(), "empty DocumentId must panic");
    }

    #[test]
    fn document_id_clone_shares_arc() {
        let original = DocumentId::new("doc-1");
        let copy = original.clone();
        // Both handles must point at the same `Arc<str>` allocation.
        assert!(Arc::ptr_eq(&original.0, &copy.0));
    }

    #[test]
    fn child_id_suffixes_with_chunk_index() {
        let root = Document::root("paper", "full text", src(), ns());
        let child = root.child(
            "slice",
            Lineage::from_split(root.id.clone(), 3, 10, "recursive"),
        );

        assert_eq!(child.id.as_str(), "paper:3");

        let stamped = child.lineage.as_ref().unwrap();
        assert_eq!(stamped.chunk_index, 3);
        assert_eq!(stamped.total_chunks, 10);

        // Provenance and tenancy are inherited from the parent.
        assert_eq!(child.source.uri, root.source.uri);
        assert_eq!(child.namespace, root.namespace);
    }

    #[test]
    fn lineage_push_chunker_records_chain_order() {
        let mut lineage = Lineage::from_split(DocumentId::new("d"), 0, 1, "recursive");
        for name in ["contextual", "hyde"] {
            lineage.push_chunker(name);
        }
        assert_eq!(lineage.chunker_chain, ["contextual", "hyde"]);
    }

    #[test]
    fn source_with_etag_preserves_other_fields() {
        let tagged = Source::now("https://example.com/p", "web").with_etag("W/\"abc\"");
        assert_eq!(tagged.loader, "web");
        assert_eq!(tagged.etag.as_deref(), Some("W/\"abc\""));
    }

    #[test]
    fn document_round_trips_through_serde() {
        let metadata = serde_json::json!({"locale": "en"});
        let doc = Document::root("d", "hello", src(), ns()).with_metadata(metadata);

        let encoded = serde_json::to_string(&doc).unwrap();
        let decoded: Document = serde_json::from_str(&encoded).unwrap();

        assert_eq!(doc, decoded);
    }
}
338}