Skip to main content

cognee_models/
data.rs

1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use uuid::Uuid;
4
5/// Represents a piece of data in the system, such as a file or a text.
6/// Fields match the Python cognee `data` table schema for cross-SDK compatibility.
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct Data {
9    /// Unique identifier for this data record (UUID v5, deterministic from content hash)
10    pub id: Uuid,
11    /// Display name derived from the source (filename, URL, or `text_<md5>.txt` for inline text)
12    pub name: String,
13    /// `file://` URI pointing to the stored raw content in the file storage backend
14    pub raw_data_location: String,
15    /// Original source location before any processing (file path, URL, or same as `raw_data_location` for inline text)
16    pub original_data_location: String,
17    /// File extension of the stored content (e.g. "txt", "pdf", "html")
18    pub extension: String,
19    /// MIME type of the stored content (e.g. "text/plain", "application/pdf")
20    pub mime_type: String,
21    /// MD5 hex digest of the raw content bytes (content-only, no owner mixing)
22    pub content_hash: String,
23    /// ID of the user or agent that owns this data record
24    pub owner_id: Uuid,
25    /// Timestamp when this record was first created
26    pub created_at: DateTime<Utc>,
27    /// Timestamp of the last update to this record, if any
28    pub updated_at: Option<DateTime<Utc>>,
29    /// Human-readable label for the data item (from DataItem wrapper or user-provided)
30    pub label: Option<String>,
31    /// Original file extension before any conversion
32    pub original_extension: Option<String>,
33    /// Original MIME type before any conversion
34    pub original_mime_type: Option<String>,
35    /// Python loader engine name (e.g. "text_loader", "pypdf_loader")
36    pub loader_engine: Option<String>,
37    /// MD5 hash of the **extracted-text** file stored by the loader at ADD time
38    /// (Python parity, `ingest_data.py:195`). Equals [`content_hash`](Self::content_hash)
39    /// only when the extracted text is byte-identical to the raw input (plain
40    /// text); for inputs the loader transforms (PDF, CSV, HTML, image, audio)
41    /// the two hashes differ.
42    pub raw_content_hash: Option<String>,
43    /// Tenant/organisation ID for multi-tenant isolation
44    pub tenant_id: Option<Uuid>,
45    /// Arbitrary JSON metadata blob
46    pub external_metadata: Option<String>,
47    /// JSON list of node IDs associated with this data item
48    pub node_set: Option<String>,
49    /// Pipeline processing status
50    pub pipeline_status: Option<String>,
51    /// Token count of the data (-1 = not yet computed)
52    pub token_count: i64,
53    /// Size of the data in bytes (-1 = not yet computed)
54    pub data_size: i64,
55    /// Last access timestamp
56    pub last_accessed: Option<DateTime<Utc>>,
57    /// Importance weight for ranking (0.0 to 1.0). Influences relevance scoring.
58    pub importance_weight: Option<f64>,
59}
60
61impl Data {
62    /// Start building a new `Data` record with the required fields.
63    /// All optional fields default to `None`; `data_size` defaults to `-1`.
64    #[allow(clippy::too_many_arguments)]
65    pub fn builder(
66        id: Uuid,
67        name: impl Into<String>,
68        raw_data_location: impl Into<String>,
69        original_data_location: impl Into<String>,
70        extension: impl Into<String>,
71        mime_type: impl Into<String>,
72        content_hash: impl Into<String>,
73        owner_id: Uuid,
74    ) -> DataBuilder {
75        DataBuilder {
76            id,
77            name: name.into(),
78            raw_data_location: raw_data_location.into(),
79            original_data_location: original_data_location.into(),
80            extension: extension.into(),
81            mime_type: mime_type.into(),
82            content_hash: content_hash.into(),
83            owner_id,
84            tenant_id: None,
85            label: None,
86            original_extension: None,
87            original_mime_type: None,
88            loader_engine: None,
89            raw_content_hash: None,
90            external_metadata: None,
91            node_set: None,
92            importance_weight: None,
93            data_size: -1,
94        }
95    }
96}
97
98/// Builder for [`Data`]. Obtain via [`Data::builder`].
99pub struct DataBuilder {
100    id: Uuid,
101    name: String,
102    raw_data_location: String,
103    original_data_location: String,
104    extension: String,
105    mime_type: String,
106    content_hash: String,
107    owner_id: Uuid,
108    tenant_id: Option<Uuid>,
109    label: Option<String>,
110    original_extension: Option<String>,
111    original_mime_type: Option<String>,
112    loader_engine: Option<String>,
113    raw_content_hash: Option<String>,
114    external_metadata: Option<String>,
115    node_set: Option<String>,
116    importance_weight: Option<f64>,
117    data_size: i64,
118}
119
120impl DataBuilder {
121    pub fn tenant_id(mut self, v: Uuid) -> Self {
122        self.tenant_id = Some(v);
123        self
124    }
125    pub fn label(mut self, v: impl Into<String>) -> Self {
126        self.label = Some(v.into());
127        self
128    }
129    pub fn original_extension(mut self, v: impl Into<String>) -> Self {
130        self.original_extension = Some(v.into());
131        self
132    }
133    pub fn original_mime_type(mut self, v: impl Into<String>) -> Self {
134        self.original_mime_type = Some(v.into());
135        self
136    }
137    pub fn loader_engine(mut self, v: impl Into<String>) -> Self {
138        self.loader_engine = Some(v.into());
139        self
140    }
141    pub fn raw_content_hash(mut self, v: impl Into<String>) -> Self {
142        self.raw_content_hash = Some(v.into());
143        self
144    }
145    pub fn external_metadata(mut self, v: impl Into<String>) -> Self {
146        self.external_metadata = Some(v.into());
147        self
148    }
149    pub fn data_size(mut self, v: i64) -> Self {
150        self.data_size = v;
151        self
152    }
153    pub fn node_set(mut self, v: impl Into<String>) -> Self {
154        self.node_set = Some(v.into());
155        self
156    }
157    pub fn importance_weight(mut self, w: f64) -> Self {
158        self.importance_weight = Some(w);
159        self
160    }
161
162    pub fn build(self) -> Data {
163        Data {
164            id: self.id,
165            name: self.name,
166            raw_data_location: self.raw_data_location,
167            original_data_location: self.original_data_location,
168            extension: self.extension,
169            mime_type: self.mime_type,
170            content_hash: self.content_hash,
171            owner_id: self.owner_id,
172            created_at: Utc::now(),
173            updated_at: None,
174            tenant_id: self.tenant_id,
175            label: self.label,
176            original_extension: self.original_extension,
177            original_mime_type: self.original_mime_type,
178            loader_engine: self.loader_engine,
179            raw_content_hash: self.raw_content_hash,
180            external_metadata: self.external_metadata,
181            node_set: self.node_set,
182            pipeline_status: None,
183            // TODO(COG-4456): compute token_count at ingestion time using TokenCounterKind::from_env()
184            // so the field is populated on add rather than remaining -1 until cognify runs.
185            token_count: -1,
186            data_size: self.data_size,
187            last_accessed: None,
188            importance_weight: self.importance_weight,
189        }
190    }
191}