Skip to main content

cognee_cognify/
pipeline.rs

//! Cognify pipeline result types.
//!
//! The actual pipeline orchestration lives in [`crate::tasks`]. This module
//! defines the output types shared across the pipeline.
5
6use std::collections::HashMap;
7
8use cognee_models::{Document, DocumentChunk, EdgeType, Embedding};
9use uuid::Uuid;
10
11use crate::graph_integration::{GraphEdgePair, GraphNodePair};
12use crate::summarization::TextSummary;
13
14/// Result of the cognify pipeline.
15#[derive(Debug, Clone)]
16pub struct CognifyResult {
17    /// Text chunks extracted from documents
18    pub chunks: Vec<DocumentChunk>,
19
20    /// Entities (nodes) with their types, deduplicated
21    pub entities: Vec<GraphNodePair>,
22
23    /// Edges (relationships) between entities, deduplicated
24    pub edges: Vec<GraphEdgePair>,
25
26    /// Text summaries generated from chunks
27    pub summaries: Vec<TextSummary>,
28
29    /// Edge types aggregated from relationship names
30    pub edge_types: Vec<EdgeType>,
31
32    /// Embeddings for chunks, entities, and summaries
33    pub embeddings: Vec<Embedding>,
34
35    /// Statistics about indexed fields
36    pub indexed_fields: IndexedFieldsStats,
37
38    /// Documents needed by the post-pipeline
39    /// [`crate::tasks::extract_dlt_fk_edges`] teardown step. Populated by the
40    /// final task in [`crate::tasks::build_cognify_pipeline`]; empty in the
41    /// temporal branch (which does not run DLT FK extraction). The matching
42    /// chunk list reuses the existing [`Self::chunks`] field.
43    ///
44    /// Not serialised — internal teardown carrier, not part of the public
45    /// result shape.
46    pub documents_for_dlt: Vec<Document>,
47
48    /// `true` when this result was synthesised by the
49    /// `check_pipeline_run_qualification` short-circuit (latest
50    /// `pipeline_runs` row was `COMPLETED`). All other fields are empty.
51    ///
52    /// CLI prints "already complete" when set; HTTP-server returns
53    /// `200 OK` with `status = "PipelineRunAlreadyCompleted"`. See doc 08-08
54    /// §4.3 and locked decision 13.
55    pub already_completed: bool,
56
57    /// The `pipeline_run_id` of the prior completed run that triggered the
58    /// short-circuit. `None` on normal (non-short-circuit) results.
59    pub prior_pipeline_run_id: Option<Uuid>,
60}
61
62impl CognifyResult {
63    /// Create an empty result (no data to process).
64    pub fn empty() -> Self {
65        Self {
66            chunks: vec![],
67            entities: vec![],
68            edges: vec![],
69            summaries: vec![],
70            edge_types: vec![],
71            embeddings: vec![],
72            indexed_fields: IndexedFieldsStats::default(),
73            documents_for_dlt: vec![],
74            already_completed: false,
75            prior_pipeline_run_id: None,
76        }
77    }
78
79    /// Create a short-circuit "already completed" result tagged with the
80    /// prior `pipeline_run_id`. All payload vectors are empty — callers that
81    /// need the prior run's outputs should query the graph / vector store
82    /// directly (matches Python parity). See doc 08-08 §4.3.
83    pub fn already_completed(pipeline_run_id: Uuid) -> Self {
84        Self {
85            already_completed: true,
86            prior_pipeline_run_id: Some(pipeline_run_id),
87            ..Self::empty()
88        }
89    }
90}
91
/// Statistics about indexed fields.
///
/// Tracks how many data points were indexed for each field type.
/// Uses dynamic `{TypeName}_{field_name}` keys matching the Python SDK's
/// `metadata["index_fields"]`-driven approach, plus legacy convenience accessors.
#[derive(Debug, Clone, Default)]
pub struct IndexedFieldsStats {
    /// Dynamic per-collection counts keyed by `"{TypeName}_{field_name}"`.
    ///
    /// E.g. `"DocumentChunk_text" -> 42`, `"Entity_name" -> 7`.
    pub field_counts: HashMap<String, usize>,

    /// Number of triplets indexed (triplets are not standard DataPoints,
    /// so they are tracked separately).
    pub triplet_count: usize,
}

impl IndexedFieldsStats {
    /// Build the `"{data_type}_{field_name}"` key under which a collection's
    /// count is stored in [`Self::field_counts`].
    ///
    /// Shared by [`Self::record`] and [`Self::get`] so the two always agree
    /// on the key format.
    fn collection_key(data_type: &str, field_name: &str) -> String {
        format!("{data_type}_{field_name}")
    }

    /// Record that `count` items were indexed for the collection identified
    /// by `data_type` and `field_name`.
    ///
    /// Counts accumulate: repeated calls for the same pair add to the
    /// existing total. The entry is created on first use, even when `count`
    /// is 0.
    pub fn record(&mut self, data_type: &str, field_name: &str, count: usize) {
        let key = Self::collection_key(data_type, field_name);
        *self.field_counts.entry(key).or_default() += count;
    }

    /// Get count for a specific `{type}_{field}` collection, or 0 if absent.
    pub fn get(&self, data_type: &str, field_name: &str) -> usize {
        let key = Self::collection_key(data_type, field_name);
        self.field_counts.get(&key).copied().unwrap_or(0)
    }

    // -- Convenience accessors (backward-compatible with old named fields) --

    /// Number of DocumentChunk.text fields indexed.
    pub fn chunk_text_count(&self) -> usize {
        self.get("DocumentChunk", "text")
    }

    /// Number of Entity.name fields indexed.
    pub fn entity_name_count(&self) -> usize {
        self.get("Entity", "name")
    }

    /// Number of EntityType.name fields indexed.
    pub fn entity_type_name_count(&self) -> usize {
        self.get("EntityType", "name")
    }

    /// Number of TextSummary.text fields indexed.
    pub fn summary_text_count(&self) -> usize {
        self.get("TextSummary", "text")
    }

    /// Number of EdgeType.relationship_name fields indexed.
    pub fn edge_type_count(&self) -> usize {
        self.get("EdgeType", "relationship_name")
    }
}