Skip to main content

cognee_cognify/summarization/
models.rs

1//! Summarization data models.
2//!
3//! Port of Python's:
4//! - cognee/tasks/summarization/models.py (TextSummary)
5//! - cognee/shared/data_models.py (SummarizedContent)
6
7use cognee_models::DataPoint;
8use cognee_models::HasDataPoint;
9use schemars::JsonSchema;
10use serde::{Deserialize, Serialize};
11use serde_json::json;
12use uuid::Uuid;
13
14/// LLM output model for summarized content.
15///
16/// This is the structured output format expected from the LLM.
17/// Used as the response_model in extract_summary calls.
18#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
19pub struct SummarizedContent {
20    /// Brief summary of the content (1-2 sentences)
21    pub summary: String,
22
23    /// Detailed description with key information preserved
24    pub description: String,
25}
26
27/// Text summary derived from a document chunk.
28///
29/// Represents a hierarchical summary that can be stored and retrieved.
30/// Extends DataPoint (matching Python's `TextSummary(DataPoint)`).
31/// Links back to the original chunk via `made_from`.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct TextSummary {
34    /// Base DataPoint fields (id, timestamps, metadata, etc.)
35    #[serde(flatten)]
36    pub base: DataPoint,
37
38    /// The chunk this summary was generated from (matches Python's `made_from: DocumentChunk`)
39    pub made_from: Option<Uuid>,
40
41    /// The summary text
42    pub text: String,
43
44    /// Optional description (from SummarizedContent.description)
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub description: Option<String>,
47
48    /// The model used to generate this summary (e.g., "gpt-4", "llama3.2")
49    pub model: String,
50}
51
52impl TextSummary {
53    /// Create a new TextSummary with deterministic UUID v5 ID.
54    ///
55    /// # Arguments
56    /// * `chunk_id` - UUID of the source DocumentChunk
57    /// * `text` - Summary text
58    /// * `description` - Optional detailed description
59    /// * `model` - Model name used for generation
60    ///
61    /// # Returns
62    /// A new TextSummary with uuid5(chunk_id, "TextSummary") as id
63    pub fn new(chunk_id: Uuid, text: String, description: Option<String>, model: String) -> Self {
64        // Deterministic ID: uuid5(chunk_id, "TextSummary")
65        let id = Uuid::new_v5(&chunk_id, b"TextSummary");
66
67        let mut base = DataPoint::new("TextSummary", None);
68        base.id = id;
69        base.metadata
70            .insert("index_fields".to_string(), json!(["text"]));
71
72        Self {
73            base,
74            made_from: Some(chunk_id),
75            text,
76            description,
77            model,
78        }
79    }
80
81    /// Create from a chunk ID and SummarizedContent (LLM output).
82    ///
83    /// # Arguments
84    /// * `chunk_id` - UUID of the source chunk
85    /// * `summarized` - LLM-generated SummarizedContent
86    /// * `model` - Model name
87    pub fn from_summarized_content(
88        chunk_id: Uuid,
89        summarized: SummarizedContent,
90        model: String,
91    ) -> Self {
92        Self::new(
93            chunk_id,
94            summarized.summary,
95            Some(summarized.description),
96            model,
97        )
98    }
99}
100
101impl HasDataPoint for TextSummary {
102    fn data_point(&self) -> &DataPoint {
103        &self.base
104    }
105    fn data_point_mut(&mut self) -> &mut DataPoint {
106        &mut self.base
107    }
108    // for_each_child_mut: default no-op — TextSummary's `made_from`
109    // reference is a `Option<Uuid>`, not an owned child.
110}
111
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "test code — panics are acceptable failures"
)]
mod tests {
    use super::*;

    /// Well-known chunk id used where a test needs a stable input.
    const FIXED_CHUNK_ID: &str = "550e8400-e29b-41d4-a716-446655440000";

    /// Shorthand for a summary without a description.
    fn plain_summary(chunk_id: Uuid, text: &str, model: &str) -> TextSummary {
        TextSummary::new(chunk_id, text.to_owned(), None, model.to_owned())
    }

    #[test]
    fn test_text_summary_deterministic_id() {
        let chunk_id = Uuid::parse_str(FIXED_CHUNK_ID).unwrap();

        let first = plain_summary(chunk_id, "Test summary", "gpt-4");
        let second = plain_summary(chunk_id, "Different text", "gpt-3.5-turbo");

        // The id depends only on the chunk, not on the text or the model.
        assert_eq!(first.base.id, second.base.id);

        // A different chunk must map to a different summary id.
        let other = plain_summary(Uuid::new_v4(), "Test summary", "gpt-4");
        assert_ne!(first.base.id, other.base.id);
    }

    #[test]
    fn test_from_summarized_content() {
        let chunk_id = Uuid::new_v4();
        let summary_text = "Brief summary";
        let description_text = "Detailed description with key points.";

        let text_summary = TextSummary::from_summarized_content(
            chunk_id,
            SummarizedContent {
                summary: summary_text.to_owned(),
                description: description_text.to_owned(),
            },
            "llama3".to_owned(),
        );

        assert_eq!(text_summary.made_from, Some(chunk_id));
        assert_eq!(text_summary.text, summary_text);
        assert_eq!(text_summary.description.as_deref(), Some(description_text));
        assert_eq!(text_summary.model, "llama3");
        assert_eq!(
            text_summary.base.id,
            Uuid::new_v5(&chunk_id, b"TextSummary")
        );
    }

    #[test]
    fn test_serialization() {
        let original = TextSummary::new(
            Uuid::new_v4(),
            "Summary text".to_owned(),
            Some("Description".to_owned()),
            "gpt-4".to_owned(),
        );

        // Round-trip through JSON and compare field by field.
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: TextSummary = serde_json::from_str(&encoded).unwrap();

        assert_eq!(original.base.id, decoded.base.id);
        assert_eq!(original.made_from, decoded.made_from);
        assert_eq!(original.text, decoded.text);
        assert_eq!(original.description, decoded.description);
        assert_eq!(original.model, decoded.model);
    }

    #[test]
    fn test_data_point_base_fields() {
        let summary = plain_summary(Uuid::new_v4(), "Test summary", "gpt-4");
        let base = &summary.base;

        // The constructor must populate the shared DataPoint fields.
        assert_eq!(base.data_type, "TextSummary");
        assert_eq!(base.metadata.get("index_fields"), Some(&json!(["text"])));
        assert!(base.created_at > 0);
        assert!(base.updated_at > 0);
        assert_eq!(base.version, 1);
    }

    #[test]
    fn text_summary_implements_has_datapoint() {
        let mut summary = plain_summary(Uuid::new_v4(), "Summary text", "gpt-4");
        let expected_id = summary.base.id;

        // Both trait accessors must expose the embedded base DataPoint.
        assert_eq!(summary.data_point().id, expected_id);
        assert_eq!(summary.data_point_mut().id, expected_id);
    }
}