cognee_cognify/pipeline.rs
1//! Cognify pipeline result types.
2//!
3//! The actual pipeline orchestration lives in [`crate::tasks`]. This module
4//! defines the output types shared across the pipeline.
5
6use std::collections::HashMap;
7
8use cognee_models::{Document, DocumentChunk, EdgeType, Embedding};
9use uuid::Uuid;
10
11use crate::graph_integration::{GraphEdgePair, GraphNodePair};
12use crate::summarization::TextSummary;
13
/// Result of the cognify pipeline.
///
/// Bundles every artefact the pipeline hands back to its caller. Build one
/// with [`CognifyResult::empty`] (all collections empty, no short-circuit
/// marker) or [`CognifyResult::already_completed`] (empty payload tagged with
/// the id of a prior run).
#[derive(Debug, Clone)]
pub struct CognifyResult {
    /// Text chunks extracted from documents.
    pub chunks: Vec<DocumentChunk>,

    /// Entities (nodes) with their types, deduplicated.
    pub entities: Vec<GraphNodePair>,

    /// Edges (relationships) between entities, deduplicated.
    pub edges: Vec<GraphEdgePair>,

    /// Text summaries generated from chunks.
    pub summaries: Vec<TextSummary>,

    /// Edge types aggregated from relationship names.
    pub edge_types: Vec<EdgeType>,

    /// Embeddings for chunks, entities, and summaries.
    pub embeddings: Vec<Embedding>,

    /// Statistics about indexed fields (per-collection counts plus the
    /// separately tracked triplet count).
    pub indexed_fields: IndexedFieldsStats,

    /// Documents needed by the post-pipeline
    /// [`crate::tasks::extract_dlt_fk_edges`] teardown step. Populated by the
    /// final task in [`crate::tasks::build_cognify_pipeline`]; empty in the
    /// temporal branch (which does not run DLT FK extraction). The matching
    /// chunk list reuses the existing [`Self::chunks`] field.
    ///
    /// Not serialised — internal teardown carrier, not part of the public
    /// result shape.
    pub documents_for_dlt: Vec<Document>,

    /// `true` when this result was synthesised by the
    /// `check_pipeline_run_qualification` short-circuit (latest
    /// `pipeline_runs` row was `COMPLETED`). All other fields are empty.
    ///
    /// Within this module only [`CognifyResult::already_completed`] sets the
    /// flag; [`CognifyResult::empty`] leaves it `false`.
    ///
    /// CLI prints "already complete" when set; HTTP-server returns
    /// `200 OK` with `status = "PipelineRunAlreadyCompleted"`. See doc 08-08
    /// §4.3 and locked decision 13.
    pub already_completed: bool,

    /// The `pipeline_run_id` of the prior completed run that triggered the
    /// short-circuit. `None` on normal (non-short-circuit) results, including
    /// those built with [`CognifyResult::empty`].
    pub prior_pipeline_run_id: Option<Uuid>,
}
61
62impl CognifyResult {
63 /// Create an empty result (no data to process).
64 pub fn empty() -> Self {
65 Self {
66 chunks: vec![],
67 entities: vec![],
68 edges: vec![],
69 summaries: vec![],
70 edge_types: vec![],
71 embeddings: vec![],
72 indexed_fields: IndexedFieldsStats::default(),
73 documents_for_dlt: vec![],
74 already_completed: false,
75 prior_pipeline_run_id: None,
76 }
77 }
78
79 /// Create a short-circuit "already completed" result tagged with the
80 /// prior `pipeline_run_id`. All payload vectors are empty — callers that
81 /// need the prior run's outputs should query the graph / vector store
82 /// directly (matches Python parity). See doc 08-08 §4.3.
83 pub fn already_completed(pipeline_run_id: Uuid) -> Self {
84 Self {
85 already_completed: true,
86 prior_pipeline_run_id: Some(pipeline_run_id),
87 ..Self::empty()
88 }
89 }
90}
91
/// Counters describing how many data points were indexed per field.
///
/// Counts live in a map keyed by `"{TypeName}_{field_name}"`, following the
/// Python SDK's `metadata["index_fields"]`-driven approach. A handful of
/// named accessors cover the collections that used to be dedicated fields.
#[derive(Debug, Clone, Default)]
pub struct IndexedFieldsStats {
    /// Per-collection totals keyed by `"{TypeName}_{field_name}"`.
    ///
    /// E.g. `"DocumentChunk_text" -> 42`, `"Entity_name" -> 7`.
    pub field_counts: HashMap<String, usize>,

    /// Number of triplets indexed. Triplets are not standard DataPoints, so
    /// they are counted outside of `field_counts`.
    pub triplet_count: usize,
}

impl IndexedFieldsStats {
    /// Compose the `"{data_type}_{field_name}"` key used by `field_counts`.
    fn collection_key(data_type: &str, field_name: &str) -> String {
        format!("{data_type}_{field_name}")
    }

    /// Add `count` to the running total for the `data_type` / `field_name`
    /// collection.
    ///
    /// The entry is created on first use, so recording a `count` of zero
    /// still leaves a key (with value 0) in `field_counts`.
    pub fn record(&mut self, data_type: &str, field_name: &str, count: usize) {
        let total = self
            .field_counts
            .entry(Self::collection_key(data_type, field_name))
            .or_default();
        *total += count;
    }

    /// Look up the total for a `{type}_{field}` collection.
    ///
    /// Returns 0 when nothing has been recorded under that key.
    pub fn get(&self, data_type: &str, field_name: &str) -> usize {
        let key = Self::collection_key(data_type, field_name);
        self.field_counts.get(&key).map_or(0, |&total| total)
    }

    // -- Named shortcuts (backward-compatible with the old dedicated fields) --

    /// Total recorded for `DocumentChunk.text`.
    pub fn chunk_text_count(&self) -> usize {
        self.get("DocumentChunk", "text")
    }

    /// Total recorded for `Entity.name`.
    pub fn entity_name_count(&self) -> usize {
        self.get("Entity", "name")
    }

    /// Total recorded for `EntityType.name`.
    pub fn entity_type_name_count(&self) -> usize {
        self.get("EntityType", "name")
    }

    /// Total recorded for `TextSummary.text`.
    pub fn summary_text_count(&self) -> usize {
        self.get("TextSummary", "text")
    }

    /// Total recorded for `EdgeType.relationship_name`.
    pub fn edge_type_count(&self) -> usize {
        self.get("EdgeType", "relationship_name")
    }
}
148}