Skip to main content

cognee_models/
document_chunk.rs

1use serde::{Deserialize, Serialize};
2use serde_json::json;
3use uuid::Uuid;
4
5use crate::DataPoint;
6use crate::has_datapoint::HasDataPoint;
7
8/// A chunk of text extracted from a document during the cognify pipeline.
9///
10/// Extends `DataPoint` (via `#[serde(flatten)]`) following the same pattern
11/// used by `Entity`, `EntityType`, and `EdgeType`.
12///
13/// Python equivalent: `cognee.infrastructure.engine.models.DataPoint` subclass
14/// `DocumentChunk` with `metadata = {"index_fields": ["text"]}`.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct DocumentChunk {
17    /// Base data point fields (id, timestamps, metadata, type, etc.)
18    #[serde(flatten)]
19    pub base: DataPoint,
20    /// The chunk text content.
21    pub text: String,
22    /// Token count (word count by default).
23    pub chunk_size: usize,
24    /// Sequential index within the parent document, starting at 0.
25    pub chunk_index: usize,
26    /// How the chunk boundary was determined (e.g. "paragraph_end", "sentence_end").
27    pub cut_type: String,
28    /// ID of the parent document this chunk belongs to (convenience field).
29    pub document_id: Uuid,
30    /// Document ID for graph edge (mirrors Python's `is_part_of` relationship).
31    pub is_part_of: Option<Uuid>,
32    /// Entity refs populated during graph extraction (mirrors Python's `contains` list).
33    #[serde(default)]
34    pub contains: Vec<serde_json::Value>,
35}
36
37impl DocumentChunk {
38    /// Create a new DocumentChunk with a deterministic ID.
39    ///
40    /// Sets:
41    /// - `base.data_type` = `"DocumentChunk"`
42    /// - `base.metadata["index_fields"]` = `["text"]`
43    /// - `base.id` = the provided deterministic UUID
44    /// - `is_part_of` = `Some(document_id)`
45    /// - `contains` = empty
46    pub fn new(
47        id: Uuid,
48        text: String,
49        chunk_size: usize,
50        chunk_index: usize,
51        cut_type: String,
52        document_id: Uuid,
53    ) -> Self {
54        let mut base = DataPoint::new("DocumentChunk", None);
55        base.id = id;
56        base.set_metadata("index_fields", json!(["text"]));
57        Self {
58            base,
59            text,
60            chunk_size,
61            chunk_index,
62            cut_type,
63            document_id,
64            is_part_of: Some(document_id),
65            contains: vec![],
66        }
67    }
68}
69
70impl HasDataPoint for DocumentChunk {
71    fn data_point(&self) -> &DataPoint {
72        &self.base
73    }
74    fn data_point_mut(&mut self) -> &mut DataPoint {
75        &mut self.base
76    }
77    // for_each_child_mut: default no-op — DocumentChunk references its
78    // parent `Document` by `document_id: Uuid` (and `is_part_of: Option<Uuid>`),
79    // not via an owned child.
80}
81
#[cfg(test)]
mod tests {
    use super::*;

    /// Both `HasDataPoint` accessors must expose the chunk's own `base`.
    #[test]
    fn document_chunk_implements_has_datapoint() {
        let parent_id = Uuid::new_v4();
        let chunk_id = Uuid::new_v4();
        let mut chunk = DocumentChunk::new(
            chunk_id,
            String::from("hello"),
            1,
            0,
            String::from("paragraph_end"),
            parent_id,
        );

        // Capture the ID straight from the field, then compare it with what
        // each accessor reports.
        let expected_id = chunk.base.id;
        assert_eq!(chunk.data_point().id, expected_id);
        assert_eq!(chunk.data_point_mut().id, expected_id);
    }
}
102}