cognee_models/document_chunk.rs
1use serde::{Deserialize, Serialize};
2use serde_json::json;
3use uuid::Uuid;
4
5use crate::DataPoint;
6use crate::has_datapoint::HasDataPoint;
7
8/// A chunk of text extracted from a document during the cognify pipeline.
9///
10/// Extends `DataPoint` (via `#[serde(flatten)]`) following the same pattern
11/// used by `Entity`, `EntityType`, and `EdgeType`.
12///
13/// Python equivalent: `cognee.infrastructure.engine.models.DataPoint` subclass
14/// `DocumentChunk` with `metadata = {"index_fields": ["text"]}`.
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct DocumentChunk {
17 /// Base data point fields (id, timestamps, metadata, type, etc.)
18 #[serde(flatten)]
19 pub base: DataPoint,
20 /// The chunk text content.
21 pub text: String,
22 /// Token count (word count by default).
23 pub chunk_size: usize,
24 /// Sequential index within the parent document, starting at 0.
25 pub chunk_index: usize,
26 /// How the chunk boundary was determined (e.g. "paragraph_end", "sentence_end").
27 pub cut_type: String,
28 /// ID of the parent document this chunk belongs to (convenience field).
29 pub document_id: Uuid,
30 /// Document ID for graph edge (mirrors Python's `is_part_of` relationship).
31 pub is_part_of: Option<Uuid>,
32 /// Entity refs populated during graph extraction (mirrors Python's `contains` list).
33 #[serde(default)]
34 pub contains: Vec<serde_json::Value>,
35}
36
37impl DocumentChunk {
38 /// Create a new DocumentChunk with a deterministic ID.
39 ///
40 /// Sets:
41 /// - `base.data_type` = `"DocumentChunk"`
42 /// - `base.metadata["index_fields"]` = `["text"]`
43 /// - `base.id` = the provided deterministic UUID
44 /// - `is_part_of` = `Some(document_id)`
45 /// - `contains` = empty
46 pub fn new(
47 id: Uuid,
48 text: String,
49 chunk_size: usize,
50 chunk_index: usize,
51 cut_type: String,
52 document_id: Uuid,
53 ) -> Self {
54 let mut base = DataPoint::new("DocumentChunk", None);
55 base.id = id;
56 base.set_metadata("index_fields", json!(["text"]));
57 Self {
58 base,
59 text,
60 chunk_size,
61 chunk_index,
62 cut_type,
63 document_id,
64 is_part_of: Some(document_id),
65 contains: vec![],
66 }
67 }
68}
69
70impl HasDataPoint for DocumentChunk {
71 fn data_point(&self) -> &DataPoint {
72 &self.base
73 }
74 fn data_point_mut(&mut self) -> &mut DataPoint {
75 &mut self.base
76 }
77 // for_each_child_mut: default no-op — DocumentChunk references its
78 // parent `Document` by `document_id: Uuid` (and `is_part_of: Option<Uuid>`),
79 // not via an owned child.
80}
81
#[cfg(test)]
mod tests {
    use super::*;

    /// Both `HasDataPoint` accessors must expose the chunk's own base
    /// data point, identified here by its ID.
    #[test]
    fn document_chunk_implements_has_datapoint() {
        let parent_document = Uuid::new_v4();
        let mut chunk = DocumentChunk::new(
            Uuid::new_v4(),
            "hello".into(),
            1,
            0,
            "paragraph_end".into(),
            parent_document,
        );

        // `Uuid` is `Copy`, so this snapshot does not borrow from `chunk`.
        let expected_id = chunk.base.id;

        assert_eq!(chunk.data_point().id, expected_id);
        assert_eq!(chunk.data_point_mut().id, expected_id);
    }
}