ceres_core/models.rs
1use chrono::{DateTime, Utc};
2use serde::Serialize;
3use sha2::{Digest, Sha256};
4use uuid::Uuid;
5
6/// Complete representation of a row from the 'datasets' table.
7///
8/// This structure represents a persisted dataset with all database fields,
9/// including system-generated identifiers and timestamps. It maps directly
10/// to the PostgreSQL schema and is used for reading data from the database.
11///
12/// # Fields
13///
14/// * `id` - Unique identifier (UUID) generated by the database
15/// * `original_id` - Original identifier from the source portal
16/// * `source_portal` - Base URL of the originating CKAN portal
17/// * `url` - Public landing page URL for the dataset
18/// * `title` - Human-readable dataset title
19/// * `description` - Optional detailed description
20/// * `embedding` - Optional vector of floats for semantic search
21/// * `metadata` - Additional metadata as JSON
22/// * `first_seen_at` - Timestamp when the dataset was first indexed
23/// * `last_updated_at` - Timestamp of the most recent update
24#[derive(Debug, Serialize, Clone)]
25pub struct Dataset {
26 /// Unique identifier (UUID) generated by the database
27 pub id: Uuid,
28 /// Original identifier from the source portal
29 pub original_id: String,
30 /// Base URL of the originating CKAN portal
31 pub source_portal: String,
32 /// Public landing page URL for the dataset
33 pub url: String,
34 /// Human-readable dataset title
35 pub title: String,
36 /// Optional detailed description
37 pub description: Option<String>,
38
39 /// Optional embedding vector for semantic search
40 pub embedding: Option<Vec<f32>>,
41
42 /// Additional metadata as JSON
43 pub metadata: serde_json::Value,
44
45 /// Timestamp when the dataset was first indexed
46 pub first_seen_at: DateTime<Utc>,
47 /// Timestamp of the most recent update
48 pub last_updated_at: DateTime<Utc>,
49 /// SHA-256 hash of title + description for delta detection
50 pub content_hash: Option<String>,
51}
52
53/// Data Transfer Object for inserting or updating datasets.
54///
55/// This structure is used when creating new datasets or updating existing ones.
56/// Unlike `Dataset`, it doesn't include database-generated fields like `id` or
57/// timestamps. The embedding field stores a vector of floats for semantic search.
58///
59/// # Examples
60///
61/// ```
62/// use ceres_core::NewDataset;
63/// use serde_json::json;
64///
65/// let title = "My Dataset";
66/// let description = Some("Description here".to_string());
67/// let content_hash = NewDataset::compute_content_hash(title, description.as_deref());
68///
69/// let dataset = NewDataset {
70/// original_id: "dataset-123".to_string(),
71/// source_portal: "https://dati.gov.it".to_string(),
72/// url: "https://dati.gov.it/dataset/my-data".to_string(),
73/// title: title.to_string(),
74/// description,
75/// embedding: None,
76/// metadata: json!({"tags": ["open-data", "italy"]}),
77/// content_hash,
78/// };
79///
80/// assert_eq!(dataset.title, "My Dataset");
81/// assert!(dataset.embedding.is_none());
82/// assert_eq!(dataset.content_hash.len(), 64); // SHA-256 = 64 hex chars
83/// ```
84///
85/// # Fields
86///
87/// * `original_id` - Original identifier from the source portal
88/// * `source_portal` - Base URL of the originating CKAN portal
89/// * `url` - Public landing page URL for the dataset
90/// * `title` - Human-readable dataset title
91/// * `description` - Optional detailed description
92/// * `embedding` - Optional vector of floats for semantic search
93/// * `metadata` - Additional metadata as JSON
94/// * `content_hash` - SHA-256 hash of title + description for delta detection
95#[derive(Debug, Serialize, Clone)]
96pub struct NewDataset {
97 /// Original identifier from the source portal
98 pub original_id: String,
99 /// Base URL of the originating CKAN portal
100 pub source_portal: String,
101 /// Public landing page URL for the dataset
102 pub url: String,
103 /// Human-readable dataset title
104 pub title: String,
105 /// Optional detailed description
106 pub description: Option<String>,
107 /// Optional embedding vector for semantic search
108 pub embedding: Option<Vec<f32>>,
109 /// Additional metadata as JSON
110 pub metadata: serde_json::Value,
111 /// SHA-256 hash of title + description for delta detection
112 pub content_hash: String,
113}
114
115impl NewDataset {
116 /// Computes a SHA-256 hash of the content (title + description) for delta detection.
117 ///
118 /// This hash is used to determine if the dataset content has changed since
119 /// the last harvest, avoiding unnecessary embedding regeneration.
120 ///
121 /// # Arguments
122 ///
123 /// * `title` - The dataset title
124 /// * `description` - Optional dataset description
125 ///
126 /// # Returns
127 ///
128 /// A 64-character lowercase hexadecimal string representing the SHA-256 hash.
129 pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
130 let mut hasher = Sha256::new();
131 // Use newline separator to prevent collisions (e.g., "AB" + "C" != "A" + "BC")
132 let content = format!("{}\n{}", title, description.unwrap_or(""));
133 hasher.update(content.as_bytes());
134 format!("{:x}", hasher.finalize())
135 }
136
137 /// Computes a content hash that includes the language preference.
138 ///
139 /// The language is included so that changing the preferred language
140 /// for a portal triggers re-embedding (since the resolved text changes).
141 pub fn compute_content_hash_with_language(
142 title: &str,
143 description: Option<&str>,
144 language: &str,
145 ) -> String {
146 let mut hasher = Sha256::new();
147 let content = format!("{}\n{}\n{}", language, title, description.unwrap_or(""));
148 hasher.update(content.as_bytes());
149 format!("{:x}", hasher.finalize())
150 }
151}
152
153/// Result of a semantic search with similarity score.
154///
155/// This structure combines a dataset with its similarity score relative to
156/// the search query. The score represents the cosine similarity between the
157/// dataset embedding and the query embedding, with values between 0.0 (no similarity)
158/// and 1.0 (identical).
159///
160/// # Examples
161///
162/// ```
163/// use ceres_core::SearchResult;
164///
165/// // SearchResult is created by the repository during searches
166/// // The similarity_score indicates how relevant the dataset is to the query
167/// // Typical values:
168/// // - 0.9+ : Highly relevant match
169/// // - 0.7-0.9 : Good match
170/// // - 0.5-0.7 : Partial match
171/// // - <0.5 : Weak match
172/// ```
173#[derive(Debug, Serialize, Clone)]
174pub struct SearchResult {
175 /// The matched dataset
176 pub dataset: Dataset,
177 /// Similarity score (0.0-1.0), where 1.0 is a perfect match
178 pub similarity_score: f32,
179}
180
181/// Database statistics for dashboard and monitoring.
182///
183/// Provides an overview of the database state, useful for dashboards
184/// and monitoring systems.
185#[derive(Debug, Serialize, Clone)]
186pub struct DatabaseStats {
187 /// Total number of datasets in the database
188 pub total_datasets: i64,
189 /// Number of datasets with generated embeddings
190 pub datasets_with_embeddings: i64,
191 /// Number of unique indexed portals
192 pub total_portals: i64,
193 /// Timestamp of the last update
194 pub last_update: Option<DateTime<Utc>>,
195}
196
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when `hash` has the shape of a lowercase hex SHA-256
    /// digest: exactly 64 characters drawn from `0-9` and `a-f`.
    fn is_lowercase_sha256_hex(hash: &str) -> bool {
        hash.len() == 64
            && hash
                .chars()
                .all(|c| c.is_ascii_digit() || ('a'..='f').contains(&c))
    }

    #[test]
    fn test_new_dataset_creation() {
        let title = "Test Dataset";
        let description = Some("A test dataset".to_string());
        let content_hash = NewDataset::compute_content_hash(title, description.as_deref());

        let dataset = NewDataset {
            original_id: "test-123".to_string(),
            source_portal: "https://example.com".to_string(),
            url: "https://example.com/dataset/test".to_string(),
            title: title.to_string(),
            description,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
            content_hash,
        };

        assert_eq!(dataset.original_id, "test-123");
        assert!(dataset.embedding.is_none());
        assert_eq!(dataset.content_hash.len(), 64);
    }

    #[test]
    fn test_compute_content_hash_consistency() {
        let hash1 = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        let hash2 = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        assert_eq!(hash1, hash2);
        assert_eq!(hash1.len(), 64); // SHA-256 = 64 hex chars
    }

    #[test]
    fn test_compute_content_hash_is_lowercase_hex() {
        // The documented contract is a 64-character lowercase hex string.
        let hash = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        assert!(is_lowercase_sha256_hex(&hash));
    }

    #[test]
    fn test_compute_content_hash_different_content() {
        let hash1 = NewDataset::compute_content_hash("Title A", Some("Description"));
        let hash2 = NewDataset::compute_content_hash("Title B", Some("Description"));
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_different_description() {
        // A change in the description alone must also change the hash
        let hash1 = NewDataset::compute_content_hash("Title", Some("Description A"));
        let hash2 = NewDataset::compute_content_hash("Title", Some("Description B"));
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        // None description and empty description should produce same hash
        let hash1 = NewDataset::compute_content_hash("Title", None);
        let hash2 = NewDataset::compute_content_hash("Title", Some(""));
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        // "AB" + "C" should differ from "A" + "BC"
        let hash1 = NewDataset::compute_content_hash("AB", Some("C"));
        let hash2 = NewDataset::compute_content_hash("A", Some("BC"));
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_with_language_consistency() {
        let hash1 =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        let hash2 =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        assert_eq!(hash1, hash2);
        assert!(is_lowercase_sha256_hex(&hash1));
    }

    #[test]
    fn test_compute_content_hash_with_language_depends_on_language() {
        // Switching the preferred language must change the hash so that
        // re-embedding is triggered.
        let english =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        let italian =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "it");
        assert_ne!(english, italian);
    }

    #[test]
    fn test_compute_content_hash_with_language_differs_from_plain_hash() {
        // The language-aware hash covers more input than the plain one
        let plain = NewDataset::compute_content_hash("Title", Some("Description"));
        let with_language =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        assert_ne!(plain, with_language);
    }

    #[test]
    fn test_compute_content_hash_with_language_none_vs_empty() {
        // None description and empty description should produce same hash
        let hash1 = NewDataset::compute_content_hash_with_language("Title", None, "en");
        let hash2 = NewDataset::compute_content_hash_with_language("Title", Some(""), "en");
        assert_eq!(hash1, hash2);
    }
}
253}