Skip to main content

entelix_rag/
document.rs

//! `Document` — the unit RAG pipelines move around.
//!
//! Distinct from [`entelix_memory::Document`]: the latter is a
//! *retrieval result* (carries a similarity score from a vector
//! store); this is the *ingestion / processing* shape (carries
//! provenance under [`Source`] and split-history under
//! [`Lineage`]). Splitters, chunkers, and ingestion pipelines move
//! `entelix_rag::Document` values; the final write-to-vector-store
//! step converts to `entelix_memory::Document`.
//!
//! The two shapes are deliberately uncoupled — retrieval has no
//! need for `Source` / `Lineage` (that information lives in
//! `metadata` on persistent storage), and ingestion has no
//! similarity score until retrieval happens.
15
16use std::sync::Arc;
17
18use chrono::{DateTime, Utc};
19use entelix_memory::Namespace;
20use serde::{Deserialize, Serialize};
21
22/// Stable identifier for a `Document` within its [`Namespace`].
23/// Loaders mint these from the source's natural id (S3 object key,
24/// Notion page id, file path); splitters derive child ids by
25/// suffixing the parent id with `:<chunk_index>`.
26///
27/// Held as `Arc<str>` so cloning a `Document` (and the chunk tree
28/// a splitter produces) is an atomic refcount bump rather than a
29/// fresh string allocation per chunk. Mirrors the
30/// [`entelix_core::TenantId`] interning pattern.
31#[derive(Clone, Debug, Eq, Hash, PartialEq, Serialize, Deserialize)]
32#[serde(transparent)]
33pub struct DocumentId(Arc<str>);
34
35impl DocumentId {
36    /// Build an id from any string-like value. Empty ids are
37    /// rejected at construction time — silent mismatch with stored
38    /// records on retrieval is a class of bug not worth admitting.
39    ///
40    /// # Panics
41    ///
42    /// Panics when `id` is empty after `Into::into`. Empty
43    /// document ids are a programmer error, not a runtime
44    /// condition the pipeline should silently paper over.
45    #[must_use]
46    pub fn new(id: impl Into<String>) -> Self {
47        let s: String = id.into();
48        assert!(!s.is_empty(), "DocumentId must not be empty");
49        Self(Arc::from(s))
50    }
51
52    /// Borrow the id as a string slice.
53    #[must_use]
54    pub fn as_str(&self) -> &str {
55        &self.0
56    }
57}
58
59impl std::fmt::Display for DocumentId {
60    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
61        f.write_str(&self.0)
62    }
63}
64
65impl From<String> for DocumentId {
66    fn from(s: String) -> Self {
67        Self::new(s)
68    }
69}
70
71impl From<&str> for DocumentId {
72    fn from(s: &str) -> Self {
73        Self::new(s)
74    }
75}
76
77/// Where a `Document` originated. Survives every split and chunker
78/// pass — the leaf chunk knows the source URI of the parent
79/// document and which loader produced it.
80///
81/// `etag` enables idempotent re-ingestion: pipelines compare the
82/// loader-reported etag against the stored value and skip
83/// reprocessing when unchanged. Loaders that lack a natural etag
84/// (in-memory, ephemeral) leave it `None` and the pipeline falls
85/// back to time-based deduplication.
86#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
87#[non_exhaustive]
88pub struct Source {
89    /// URI of the source — `file:///path`, `s3://bucket/key`,
90    /// `notion://workspace/page-id`, etc. Pipelines treat this as
91    /// opaque; loaders define the scheme.
92    pub uri: String,
93    /// Stable identifier of the loader implementation. `"web"`,
94    /// `"s3"`, `"notion"`, etc. — used for dashboards and replay
95    /// routing, not for behaviour gating. Carried as `String`
96    /// (rather than `&'static str`) so persisted documents
97    /// reconstruct via serde without forcing every loader name to
98    /// be a string literal.
99    pub loader: String,
100    /// When the loader fetched this document. Pipelines stamp this
101    /// at fetch time so re-ingestion ages-out properly.
102    pub fetched_at: DateTime<Utc>,
103    /// Optional content-version tag the source surfaces (HTTP
104    /// `ETag`, S3 object `etag`, Notion `last_edited_time` hash).
105    /// `None` when the source has no natural content-versioning
106    /// signal.
107    pub etag: Option<String>,
108}
109
110impl Source {
111    /// Build a source descriptor stamped at the current wall
112    /// clock. Loaders whose natural fetch time differs from "now"
113    /// (replay, batch import) construct via the struct literal.
114    #[must_use]
115    pub fn now(uri: impl Into<String>, loader: impl Into<String>) -> Self {
116        Self {
117            uri: uri.into(),
118            loader: loader.into(),
119            fetched_at: Utc::now(),
120            etag: None,
121        }
122    }
123
124    /// Builder-style etag attachment.
125    #[must_use]
126    pub fn with_etag(mut self, etag: impl Into<String>) -> Self {
127        self.etag = Some(etag.into());
128        self
129    }
130}
131
132/// Split-history — survives every transformation. A leaf chunk's
133/// `Lineage` describes which parent it came from, which split
134/// produced it, and which chunkers ran over it. Audit / debug
135/// flows reconstruct the path from a retrieval hit back to the
136/// ingestion source by walking the lineage chain (parent_id →
137/// loader's source URI).
138///
139/// `None` on the original `Document` produced by a `DocumentLoader`
140/// — only chunked descendants carry lineage.
141#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
142#[non_exhaustive]
143pub struct Lineage {
144    /// Document id of the parent this chunk descends from. Walks
145    /// up through every split layer to the original loaded
146    /// document.
147    pub parent_id: DocumentId,
148    /// Position of this chunk within the immediate parent's split
149    /// output. Zero-based.
150    pub chunk_index: u32,
151    /// Total number of chunks the parent produced. Lets retrieval
152    /// surfaces show "chunk 3 of 12" provenance.
153    pub total_chunks: u32,
154    /// Stable identifier of the splitter that produced this
155    /// chunk. `"recursive-character"`, `"markdown-structure"`,
156    /// etc. — surfaces in audit dashboards.
157    pub splitter: String,
158    /// Stable identifiers of every chunker that processed this
159    /// chunk after the split, in order. `"contextual"`,
160    /// `"hyde"`, … — empty when no chunker ran.
161    pub chunker_chain: Vec<String>,
162}
163
164impl Lineage {
165    /// Build the lineage entry a splitter stamps onto each child
166    /// chunk. The chunker chain starts empty; downstream chunkers
167    /// append themselves via [`Self::push_chunker`].
168    #[must_use]
169    pub fn from_split(
170        parent_id: DocumentId,
171        chunk_index: u32,
172        total_chunks: u32,
173        splitter: impl Into<String>,
174    ) -> Self {
175        Self {
176            parent_id,
177            chunk_index,
178            total_chunks,
179            splitter: splitter.into(),
180            chunker_chain: Vec::new(),
181        }
182    }
183
184    /// Append a chunker identifier to this chunk's chain — called
185    /// by `Chunker` impls when they transform a chunk.
186    pub fn push_chunker(&mut self, chunker: impl Into<String>) {
187        self.chunker_chain.push(chunker.into());
188    }
189}
190
191/// The unit a RAG pipeline moves around — content plus everything
192/// downstream needs to know about where it came from.
193///
194/// Splitters consume one `Document` and emit several. Chunkers
195/// consume a sequence and emit a transformed sequence (typically
196/// the same length, with mutated `content` or `metadata`).
197/// Loaders produce them; ingestion pipelines consume them.
198///
199/// `metadata` is operator-defined free-form JSON for filtering at
200/// the vector-store layer — explicit fields above (`source`,
201/// `lineage`, `namespace`) are the SDK-stamped boundary that every
202/// pipeline must preserve.
203#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
204#[non_exhaustive]
205pub struct Document {
206    /// Stable identifier for this document within its namespace.
207    pub id: DocumentId,
208    /// The textual content. Splitters slice this; chunkers may
209    /// rewrite it (Contextual Retrieval prepends a generated
210    /// context prefix).
211    pub content: String,
212    /// Operator-supplied free-form metadata. Vector stores
213    /// typically expose this as a filterable JSON column.
214    #[serde(default, skip_serializing_if = "serde_json::Value::is_null")]
215    pub metadata: serde_json::Value,
216    /// Origin of this document or its top-level ancestor.
217    pub source: Source,
218    /// Split / chunk ancestry. `None` on the loader-produced root;
219    /// `Some` on every chunked descendant.
220    #[serde(default, skip_serializing_if = "Option::is_none")]
221    pub lineage: Option<Lineage>,
222    /// Multi-tenant boundary (invariant 11). Every persistent
223    /// boundary the pipeline crosses respects this — silent
224    /// cross-tenant leakage is structurally impossible because
225    /// every loader, splitter, and pipeline takes a `Namespace`
226    /// at construction time.
227    pub namespace: Namespace,
228}
229
230impl Document {
231    /// Construct a fresh root document — the shape a
232    /// [`crate::DocumentLoader`] emits before any splitter has
233    /// run. `lineage` is `None`; chunked descendants populate it.
234    #[must_use]
235    pub fn root(
236        id: impl Into<DocumentId>,
237        content: impl Into<String>,
238        source: Source,
239        namespace: Namespace,
240    ) -> Self {
241        Self {
242            id: id.into(),
243            content: content.into(),
244            metadata: serde_json::Value::Null,
245            source,
246            lineage: None,
247            namespace,
248        }
249    }
250
251    /// Builder-style metadata setter.
252    #[must_use]
253    pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
254        self.metadata = metadata;
255        self
256    }
257
258    /// Derive a child document for a chunk. The splitter supplies
259    /// the per-chunk content + lineage; the new id is the parent's
260    /// id with a `:{chunk_index}` suffix so leaves carry
261    /// hierarchical identifiers stable across re-runs.
262    #[must_use]
263    pub fn child(&self, content: impl Into<String>, lineage: Lineage) -> Self {
264        let child_id = format!("{}:{}", self.id, lineage.chunk_index);
265        Self {
266            id: DocumentId::new(child_id),
267            content: content.into(),
268            metadata: self.metadata.clone(),
269            source: self.source.clone(),
270            lineage: Some(lineage),
271            namespace: self.namespace.clone(),
272        }
273    }
274}
275
#[cfg(test)]
// Tests unwrap freely: a failed unwrap is simply a failed test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Namespace fixture for a fixed test tenant.
    fn ns() -> Namespace {
        Namespace::new(entelix_core::TenantId::new("acme"))
    }

    /// Source fixture stamped at the current wall clock.
    fn src() -> Source {
        Source::now("file:///tmp/doc.md", "test")
    }

    // `should_panic(expected = …)` pins the panic to the intended
    // assertion message; a bare `catch_unwind` would accept any panic.
    #[test]
    #[should_panic(expected = "DocumentId must not be empty")]
    fn document_id_rejects_empty() {
        let _ = DocumentId::new("");
    }

    #[test]
    fn document_id_clone_shares_arc() {
        let id = DocumentId::new("doc-1");
        let cloned = id.clone();
        // Same `Arc<str>` allocation under both handles.
        assert_eq!(Arc::as_ptr(&id.0), Arc::as_ptr(&cloned.0));
    }

    #[test]
    fn document_id_conversions_and_display_agree() {
        let from_str = DocumentId::from("doc-1");
        let from_string = DocumentId::from(String::from("doc-1"));
        assert_eq!(from_str, from_string);
        assert_eq!(from_str.as_str(), "doc-1");
        assert_eq!(from_str.to_string(), "doc-1");
    }

    #[test]
    fn child_id_suffixes_with_chunk_index() {
        let root = Document::root("paper", "full text", src(), ns());
        let lineage = Lineage::from_split(root.id.clone(), 3, 10, "recursive");
        let child = root.child("slice", lineage);
        assert_eq!(child.id.as_str(), "paper:3");
        assert_eq!(child.lineage.as_ref().unwrap().chunk_index, 3);
        assert_eq!(child.lineage.as_ref().unwrap().total_chunks, 10);
        assert_eq!(child.source.uri, root.source.uri);
        assert_eq!(child.namespace, root.namespace);
    }

    #[test]
    fn child_inherits_parent_metadata() {
        let root = Document::root("paper", "full text", src(), ns())
            .with_metadata(serde_json::json!({"locale": "en"}));
        let lineage = Lineage::from_split(root.id.clone(), 0, 1, "recursive");
        let child = root.child("slice", lineage);
        assert_eq!(child.content, "slice");
        assert_eq!(child.metadata, root.metadata);
        assert_eq!(child.source, root.source);
    }

    #[test]
    fn lineage_push_chunker_records_chain_order() {
        let mut lineage = Lineage::from_split(DocumentId::new("d"), 0, 1, "recursive");
        lineage.push_chunker("contextual");
        lineage.push_chunker("hyde");
        assert_eq!(lineage.chunker_chain, vec!["contextual", "hyde"]);
    }

    #[test]
    fn source_with_etag_preserves_other_fields() {
        let s = Source::now("https://example.com/p", "web").with_etag("W/\"abc\"");
        assert_eq!(s.etag.as_deref(), Some("W/\"abc\""));
        assert_eq!(s.uri, "https://example.com/p");
        assert_eq!(s.loader, "web");
    }

    #[test]
    fn document_round_trips_through_serde() {
        let doc = Document::root("d", "hello", src(), ns())
            .with_metadata(serde_json::json!({"locale": "en"}));
        let json = serde_json::to_string(&doc).unwrap();
        let back: Document = serde_json::from_str(&json).unwrap();
        assert_eq!(doc, back);
    }

    #[test]
    fn chunked_document_round_trips_through_serde() {
        let root = Document::root("d", "hello world", src(), ns());
        let mut lineage = Lineage::from_split(root.id.clone(), 1, 2, "recursive");
        lineage.push_chunker("contextual");
        let chunk = root.child("world", lineage);
        let json = serde_json::to_string(&chunk).unwrap();
        let back: Document = serde_json::from_str(&json).unwrap();
        assert_eq!(chunk, back);
    }
}