ceres_core/models.rs
1use chrono::{DateTime, Utc};
2use pgvector::Vector;
3use serde::{Deserialize, Serialize};
4use sha2::{Digest, Sha256};
5use sqlx::prelude::FromRow;
6use sqlx::types::Json;
7use uuid::Uuid;
8
9/// Complete representation of a row from the 'datasets' table.
10///
11/// This structure represents a persisted dataset with all database fields,
12/// including system-generated identifiers and timestamps. It maps directly
13/// to the PostgreSQL schema and is used for reading data from the database.
14///
15/// # Fields
16///
17/// * `id` - Unique identifier (UUID) generated by the database
18/// * `original_id` - Original identifier from the source portal
19/// * `source_portal` - Base URL of the originating CKAN portal
20/// * `url` - Public landing page URL for the dataset
21/// * `title` - Human-readable dataset title
22/// * `description` - Optional detailed description
23/// * `embedding` - Optional 1536-dimensional vector for semantic search
24/// * `metadata` - Additional metadata stored as JSONB
25/// * `first_seen_at` - Timestamp when the dataset was first indexed
26/// * `last_updated_at` - Timestamp of the most recent update
27#[derive(Debug, FromRow, Serialize, Clone)]
28pub struct Dataset {
29 /// Unique identifier (UUID) generated by the database
30 pub id: Uuid,
31 /// Original identifier from the source portal
32 pub original_id: String,
33 /// Base URL of the originating CKAN portal
34 pub source_portal: String,
35 /// Public landing page URL for the dataset
36 pub url: String,
37 /// Human-readable dataset title
38 pub title: String,
39 /// Optional detailed description
40 pub description: Option<String>,
41
42 /// Optional 1536-dimensional vector for semantic search (pgvector type)
43 pub embedding: Option<Vector>,
44
45 /// Additional metadata stored as JSONB
46 pub metadata: Json<serde_json::Value>,
47
48 /// Timestamp when the dataset was first indexed
49 pub first_seen_at: DateTime<Utc>,
50 /// Timestamp of the most recent update
51 pub last_updated_at: DateTime<Utc>,
52 /// SHA-256 hash of title + description for delta detection
53 pub content_hash: Option<String>,
54}
55
56/// Data Transfer Object for inserting or updating datasets.
57///
58/// This structure is used when creating new datasets or updating existing ones.
59/// Unlike `Dataset`, it doesn't include database-generated fields like `id` or
60/// timestamps. The embedding field uses pgvector's `Vector` for database storage.
61///
62/// # Examples
63///
64/// ```
65/// use ceres_core::NewDataset;
66/// use serde_json::json;
67///
68/// let title = "My Dataset";
69/// let description = Some("Description here".to_string());
70/// let content_hash = NewDataset::compute_content_hash(title, description.as_deref());
71///
72/// let dataset = NewDataset {
73/// original_id: "dataset-123".to_string(),
74/// source_portal: "https://dati.gov.it".to_string(),
75/// url: "https://dati.gov.it/dataset/my-data".to_string(),
76/// title: title.to_string(),
77/// description,
78/// embedding: None,
79/// metadata: json!({"tags": ["open-data", "italy"]}),
80/// content_hash,
81/// };
82///
83/// assert_eq!(dataset.title, "My Dataset");
84/// assert!(dataset.embedding.is_none());
85/// assert_eq!(dataset.content_hash.len(), 64); // SHA-256 = 64 hex chars
86/// ```
87///
88/// # Fields
89///
90/// * `original_id` - Original identifier from the source portal
91/// * `source_portal` - Base URL of the originating CKAN portal
92/// * `url` - Public landing page URL for the dataset
93/// * `title` - Human-readable dataset title
94/// * `description` - Optional detailed description
95/// * `embedding` - Optional vector of 768 floats (pgvector)
96/// * `metadata` - Additional metadata as JSON
97/// * `content_hash` - SHA-256 hash of title + description for delta detection
98#[derive(Debug, Serialize, Clone)]
99pub struct NewDataset {
100 /// Original identifier from the source portal
101 pub original_id: String,
102 /// Base URL of the originating CKAN portal
103 pub source_portal: String,
104 /// Public landing page URL for the dataset
105 pub url: String,
106 /// Human-readable dataset title
107 pub title: String,
108 /// Optional detailed description
109 pub description: Option<String>,
110 /// Optional vector of 768 floats (converted to pgvector on storage)
111 pub embedding: Option<Vector>,
112 /// Additional metadata as JSON
113 pub metadata: serde_json::Value,
114 /// SHA-256 hash of title + description for delta detection
115 pub content_hash: String,
116}
117
118impl NewDataset {
119 /// Computes a SHA-256 hash of the content (title + description) for delta detection.
120 ///
121 /// This hash is used to determine if the dataset content has changed since
122 /// the last harvest, avoiding unnecessary embedding regeneration.
123 ///
124 /// # Arguments
125 ///
126 /// * `title` - The dataset title
127 /// * `description` - Optional dataset description
128 ///
129 /// # Returns
130 ///
131 /// A 64-character lowercase hexadecimal string representing the SHA-256 hash.
132 pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
133 let mut hasher = Sha256::new();
134 // Use newline separator to prevent collisions (e.g., "AB" + "C" != "A" + "BC")
135 let content = format!("{}\n{}", title, description.unwrap_or(""));
136 hasher.update(content.as_bytes());
137 format!("{:x}", hasher.finalize())
138 }
139}
140
141/// Result of a semantic search with similarity score.
142///
143/// This structure combines a dataset with its similarity score relative to
144/// the search query. The score represents the cosine similarity between the
145/// dataset embedding and the query embedding, with values between 0.0 (no similarity)
146/// and 1.0 (identical).
147///
148/// # Examples
149///
150/// ```
151/// use ceres_core::SearchResult;
152///
153/// // SearchResult is created by the repository during searches
154/// // The similarity_score indicates how relevant the dataset is to the query
155/// // Typical values:
156/// // - 0.9+ : Highly relevant match
157/// // - 0.7-0.9 : Good match
158/// // - 0.5-0.7 : Partial match
159/// // - <0.5 : Weak match
160/// ```
161#[derive(Debug, Serialize, Clone)]
162pub struct SearchResult {
163 /// The matched dataset
164 pub dataset: Dataset,
165 /// Similarity score (0.0-1.0), where 1.0 is a perfect match
166 pub similarity_score: f32,
167}
168
169/// Database statistics for dashboard and monitoring.
170///
171/// Provides an overview of the database state, useful for dashboards
172/// and monitoring systems.
173#[derive(Debug, Serialize, Clone)]
174pub struct DatabaseStats {
175 /// Total number of datasets in the database
176 pub total_datasets: i64,
177 /// Number of datasets with generated embeddings
178 pub datasets_with_embeddings: i64,
179 /// Number of unique indexed portals
180 pub total_portals: i64,
181 /// Timestamp of the last update
182 pub last_update: Option<DateTime<Utc>>,
183}
184
185/// Portal configured in portals.toml.
186///
187/// Represents an open data portal configured for harvesting.
188/// Supports different portal types (CKAN, Socrata, DCAT).
189///
190/// # Examples
191///
192/// ```
193/// use ceres_core::Portal;
194///
195/// let json = r#"{
196/// "name": "Dati.gov.it",
197/// "url": "https://dati.gov.it",
198/// "type": "ckan",
199/// "description": "Italian national open data portal"
200/// }"#;
201///
202/// let portal: Portal = serde_json::from_str(json).unwrap();
203/// assert_eq!(portal.name, "Dati.gov.it");
204/// assert_eq!(portal.portal_type, "ckan");
205/// assert!(portal.enabled); // Default is true
206/// ```
207#[derive(Debug, Serialize, Deserialize, Clone)]
208pub struct Portal {
209 /// Portal name (human-readable)
210 pub name: String,
211 /// Base URL of the portal
212 pub url: String,
213 /// Portal type ("ckan", "socrata", "dcat")
214 #[serde(rename = "type")]
215 pub portal_type: String,
216 /// Whether the portal is enabled for harvesting
217 #[serde(default = "default_enabled")]
218 pub enabled: bool,
219 /// Optional portal description
220 pub description: Option<String>,
221}
222
/// Serde fallback for `Portal::enabled`.
///
/// Referenced by name from `#[serde(default = "default_enabled")]`, so a
/// portal entry that omits the `enabled` key is treated as enabled.
fn default_enabled() -> bool {
    true
}
227
#[cfg(test)]
mod tests {
    use super::*;

    /// A SHA-256 digest is 32 bytes, i.e. 64 hexadecimal characters.
    const SHA256_HEX_LEN: usize = 64;

    /// A portal entry without an `enabled` key must deserialize as enabled.
    #[test]
    fn test_portal_default_enabled() {
        let raw = r#"{
            "name": "Test Portal",
            "url": "https://example.com",
            "type": "ckan"
        }"#;

        let parsed: Portal = serde_json::from_str(raw).unwrap();
        assert!(parsed.enabled);
    }

    /// Building a `NewDataset` by hand keeps the supplied values and a full-length hash.
    #[test]
    fn test_new_dataset_creation() {
        let title = "Test Dataset";
        let description = Some("A test dataset".to_string());
        let content_hash = NewDataset::compute_content_hash(title, description.as_deref());

        let record = NewDataset {
            original_id: "test-123".to_string(),
            source_portal: "https://example.com".to_string(),
            url: "https://example.com/dataset/test".to_string(),
            title: title.to_string(),
            description,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
            content_hash,
        };

        assert_eq!(record.original_id, "test-123");
        assert!(record.embedding.is_none());
        assert_eq!(record.content_hash.len(), SHA256_HEX_LEN);
    }

    /// Hashing the same input twice yields the same 64-character digest.
    #[test]
    fn test_compute_content_hash_consistency() {
        let first = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        let second = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        assert_eq!(first, second);
        assert_eq!(first.len(), SHA256_HEX_LEN);
    }

    /// Changing the title changes the digest.
    #[test]
    fn test_compute_content_hash_different_content() {
        let with_a = NewDataset::compute_content_hash("Title A", Some("Description"));
        let with_b = NewDataset::compute_content_hash("Title B", Some("Description"));
        assert_ne!(with_a, with_b);
    }

    /// A missing description and an empty description are treated as the same content.
    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        let missing = NewDataset::compute_content_hash("Title", None);
        let empty = NewDataset::compute_content_hash("Title", Some(""));
        assert_eq!(missing, empty);
    }

    /// The title/description separator keeps "AB" + "C" distinct from "A" + "BC".
    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        let left = NewDataset::compute_content_hash("AB", Some("C"));
        let right = NewDataset::compute_content_hash("A", Some("BC"));
        assert_ne!(left, right);
    }
}