ceres_core/models.rs
1use chrono::{DateTime, Utc};
2use serde::Serialize;
3use sha2::{Digest, Sha256};
4use uuid::Uuid;
5
6/// Complete representation of a row from the 'datasets' table.
7///
8/// This structure represents a persisted dataset with all database fields,
9/// including system-generated identifiers and timestamps. It maps directly
10/// to the PostgreSQL schema and is used for reading data from the database.
11///
12/// # Fields
13///
14/// * `id` - Unique identifier (UUID) generated by the database
15/// * `original_id` - Original identifier from the source portal
16/// * `source_portal` - Base URL of the originating CKAN portal
17/// * `url` - Public landing page URL for the dataset
18/// * `title` - Human-readable dataset title
19/// * `description` - Optional detailed description
20/// * `embedding` - Optional vector of floats for semantic search
21/// * `metadata` - Additional metadata as JSON
22/// * `first_seen_at` - Timestamp when the dataset was first indexed
23/// * `last_updated_at` - Timestamp of the most recent update
24#[derive(Debug, Serialize, Clone)]
25pub struct Dataset {
26 /// Unique identifier (UUID) generated by the database
27 pub id: Uuid,
28 /// Original identifier from the source portal
29 pub original_id: String,
30 /// Base URL of the originating CKAN portal
31 pub source_portal: String,
32 /// Public landing page URL for the dataset
33 pub url: String,
34 /// Human-readable dataset title
35 pub title: String,
36 /// Optional detailed description
37 pub description: Option<String>,
38
39 /// Optional embedding vector for semantic search
40 pub embedding: Option<Vec<f32>>,
41
42 /// Additional metadata as JSON
43 pub metadata: serde_json::Value,
44
45 /// Timestamp when the dataset was first indexed
46 pub first_seen_at: DateTime<Utc>,
47 /// Timestamp of the most recent update
48 pub last_updated_at: DateTime<Utc>,
49 /// SHA-256 hash of title + description for delta detection
50 pub content_hash: Option<String>,
51 /// Whether this dataset has been removed from its source portal
52 pub is_stale: bool,
53}
54
55/// Data Transfer Object for inserting or updating datasets.
56///
57/// This structure is used when creating new datasets or updating existing ones.
58/// Unlike `Dataset`, it doesn't include database-generated fields like `id` or
59/// timestamps. The embedding field stores a vector of floats for semantic search.
60///
61/// # Examples
62///
63/// ```
64/// use ceres_core::NewDataset;
65/// use serde_json::json;
66///
67/// let title = "My Dataset";
68/// let description = Some("Description here".to_string());
69/// let content_hash = NewDataset::compute_content_hash(title, description.as_deref());
70///
71/// let dataset = NewDataset {
72/// original_id: "dataset-123".to_string(),
73/// source_portal: "https://dati.gov.it".to_string(),
74/// url: "https://dati.gov.it/dataset/my-data".to_string(),
75/// title: title.to_string(),
76/// description,
77/// embedding: None,
78/// metadata: json!({"tags": ["open-data", "italy"]}),
79/// content_hash,
80/// };
81///
82/// assert_eq!(dataset.title, "My Dataset");
83/// assert!(dataset.embedding.is_none());
84/// assert_eq!(dataset.content_hash.len(), 64); // SHA-256 = 64 hex chars
85/// ```
86///
87/// # Fields
88///
89/// * `original_id` - Original identifier from the source portal
90/// * `source_portal` - Base URL of the originating CKAN portal
91/// * `url` - Public landing page URL for the dataset
92/// * `title` - Human-readable dataset title
93/// * `description` - Optional detailed description
94/// * `embedding` - Optional vector of floats for semantic search
95/// * `metadata` - Additional metadata as JSON
96/// * `content_hash` - SHA-256 hash of title + description for delta detection
97#[derive(Debug, Serialize, Clone)]
98pub struct NewDataset {
99 /// Original identifier from the source portal
100 pub original_id: String,
101 /// Base URL of the originating CKAN portal
102 pub source_portal: String,
103 /// Public landing page URL for the dataset
104 pub url: String,
105 /// Human-readable dataset title
106 pub title: String,
107 /// Optional detailed description
108 pub description: Option<String>,
109 /// Optional embedding vector for semantic search
110 pub embedding: Option<Vec<f32>>,
111 /// Additional metadata as JSON
112 pub metadata: serde_json::Value,
113 /// SHA-256 hash of title + description for delta detection
114 pub content_hash: String,
115}
116
117impl NewDataset {
118 /// Computes a SHA-256 hash of the content (title + description) for delta detection.
119 ///
120 /// This hash is used to determine if the dataset content has changed since
121 /// the last harvest, avoiding unnecessary embedding regeneration.
122 ///
123 /// # Arguments
124 ///
125 /// * `title` - The dataset title
126 /// * `description` - Optional dataset description
127 ///
128 /// # Returns
129 ///
130 /// A 64-character lowercase hexadecimal string representing the SHA-256 hash.
131 pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
132 let mut hasher = Sha256::new();
133 // Use newline separator to prevent collisions (e.g., "AB" + "C" != "A" + "BC")
134 let content = format!("{}\n{}", title, description.unwrap_or(""));
135 hasher.update(content.as_bytes());
136 format!("{:x}", hasher.finalize())
137 }
138
139 /// Computes a content hash that includes the language preference.
140 ///
141 /// The language is included so that changing the preferred language
142 /// for a portal triggers re-embedding (since the resolved text changes).
143 pub fn compute_content_hash_with_language(
144 title: &str,
145 description: Option<&str>,
146 language: &str,
147 ) -> String {
148 let mut hasher = Sha256::new();
149 let content = format!("{}\n{}\n{}", language, title, description.unwrap_or(""));
150 hasher.update(content.as_bytes());
151 format!("{:x}", hasher.finalize())
152 }
153}
154
155/// Result of a semantic search with similarity score.
156///
157/// This structure combines a dataset with its similarity score relative to
158/// the search query. The score represents the cosine similarity between the
159/// dataset embedding and the query embedding, with values between 0.0 (no similarity)
160/// and 1.0 (identical).
161///
162/// # Examples
163///
164/// ```
165/// use ceres_core::SearchResult;
166///
167/// // SearchResult is created by the repository during searches
168/// // The similarity_score indicates how relevant the dataset is to the query
169/// // Typical values:
170/// // - 0.9+ : Highly relevant match
171/// // - 0.7-0.9 : Good match
172/// // - 0.5-0.7 : Partial match
173/// // - <0.5 : Weak match
174/// ```
175#[derive(Debug, Serialize, Clone)]
176pub struct SearchResult {
177 /// The matched dataset
178 pub dataset: Dataset,
179 /// Similarity score (0.0-1.0), where 1.0 is a perfect match
180 pub similarity_score: f32,
181}
182
183/// Database statistics for dashboard and monitoring.
184///
185/// Provides an overview of the database state, useful for dashboards
186/// and monitoring systems.
187#[derive(Debug, Serialize, Clone)]
188pub struct DatabaseStats {
189 /// Total number of datasets in the database
190 pub total_datasets: i64,
191 /// Number of datasets with generated embeddings
192 pub datasets_with_embeddings: i64,
193 /// Number of unique indexed portals
194 pub total_portals: i64,
195 /// Timestamp of the last update
196 pub last_update: Option<DateTime<Utc>>,
197 /// Number of datasets marked as stale (removed from source portal)
198 pub stale_datasets: i64,
199}
200
/// Unit tests for `NewDataset` construction and both content-hash helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// A `NewDataset` can be built by hand and carries a 64-character hash.
    #[test]
    fn test_new_dataset_creation() {
        let title = "Test Dataset";
        let description = Some("A test dataset".to_string());
        let content_hash = NewDataset::compute_content_hash(title, description.as_deref());

        let dataset = NewDataset {
            original_id: "test-123".to_string(),
            source_portal: "https://example.com".to_string(),
            url: "https://example.com/dataset/test".to_string(),
            title: title.to_string(),
            description,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
            content_hash,
        };

        assert_eq!(dataset.original_id, "test-123");
        assert!(dataset.embedding.is_none());
        assert_eq!(dataset.content_hash.len(), 64);
    }

    /// Same input must always produce the same hash.
    #[test]
    fn test_compute_content_hash_consistency() {
        let hash1 = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        let hash2 = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        assert_eq!(hash1, hash2);
        assert_eq!(hash1.len(), 64); // SHA-256 = 64 hex chars
    }

    /// The documented output format is lowercase hexadecimal.
    #[test]
    fn test_compute_content_hash_is_lowercase_hex() {
        let hash = NewDataset::compute_content_hash("Title", Some("Description"));
        assert!(hash.chars().all(|c| matches!(c, '0'..='9' | 'a'..='f')));
    }

    /// A different title changes the hash.
    #[test]
    fn test_compute_content_hash_different_content() {
        let hash1 = NewDataset::compute_content_hash("Title A", Some("Description"));
        let hash2 = NewDataset::compute_content_hash("Title B", Some("Description"));
        assert_ne!(hash1, hash2);
    }

    /// A different description changes the hash as well.
    #[test]
    fn test_compute_content_hash_different_description() {
        let hash1 = NewDataset::compute_content_hash("Title", Some("Description A"));
        let hash2 = NewDataset::compute_content_hash("Title", Some("Description B"));
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        // None description and empty description should produce same hash
        let hash1 = NewDataset::compute_content_hash("Title", None);
        let hash2 = NewDataset::compute_content_hash("Title", Some(""));
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        // "AB" + "C" should differ from "A" + "BC"
        let hash1 = NewDataset::compute_content_hash("AB", Some("C"));
        let hash2 = NewDataset::compute_content_hash("A", Some("BC"));
        assert_ne!(hash1, hash2);
    }

    /// The language-aware hash is deterministic and has the same format.
    #[test]
    fn test_compute_content_hash_with_language_consistency() {
        let hash1 =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        let hash2 =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        assert_eq!(hash1, hash2);
        assert_eq!(hash1.len(), 64);
        assert!(hash1.chars().all(|c| matches!(c, '0'..='9' | 'a'..='f')));
    }

    /// Changing only the language must change the hash (this is what triggers
    /// re-embedding when a portal's preferred language changes).
    #[test]
    fn test_compute_content_hash_with_language_depends_on_language() {
        let english =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        let italian =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "it");
        assert_ne!(english, italian);
    }

    /// The language-aware hash differs from the plain hash of the same content.
    #[test]
    fn test_compute_content_hash_with_language_differs_from_plain_hash() {
        let plain = NewDataset::compute_content_hash("Title", Some("Description"));
        let with_language =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        assert_ne!(plain, with_language);
    }

    /// As with the plain hash, a missing description equals an empty one.
    #[test]
    fn test_compute_content_hash_with_language_none_vs_empty() {
        let hash1 = NewDataset::compute_content_hash_with_language("Title", None, "en");
        let hash2 = NewDataset::compute_content_hash_with_language("Title", Some(""), "en");
        assert_eq!(hash1, hash2);
    }
}
257}