//! Core data models for the `ceres_core` crate (`models.rs`).
//!
//! Defines the persisted [`Dataset`] row type, the [`NewDataset`] insert/update
//! DTO with content-hash helpers, and auxiliary result/statistics structs.
1use chrono::{DateTime, Utc};
2use serde::Serialize;
3use sha2::{Digest, Sha256};
4use uuid::Uuid;
5
6/// Complete representation of a row from the 'datasets' table.
7///
8/// This structure represents a persisted dataset with all database fields,
9/// including system-generated identifiers and timestamps. It maps directly
10/// to the PostgreSQL schema and is used for reading data from the database.
11///
12/// # Fields
13///
14/// * `id` - Unique identifier (UUID) generated by the database
15/// * `original_id` - Original identifier from the source portal
16/// * `source_portal` - Base URL of the originating CKAN portal
17/// * `url` - Public landing page URL for the dataset
18/// * `title` - Human-readable dataset title
19/// * `description` - Optional detailed description
20/// * `embedding` - Optional vector of floats for semantic search
21/// * `metadata` - Additional metadata as JSON
22/// * `first_seen_at` - Timestamp when the dataset was first indexed
23/// * `last_updated_at` - Timestamp of the most recent update
24#[derive(Debug, Serialize, Clone)]
25pub struct Dataset {
26    /// Unique identifier (UUID) generated by the database
27    pub id: Uuid,
28    /// Original identifier from the source portal
29    pub original_id: String,
30    /// Base URL of the originating CKAN portal
31    pub source_portal: String,
32    /// Public landing page URL for the dataset
33    pub url: String,
34    /// Human-readable dataset title
35    pub title: String,
36    /// Optional detailed description
37    pub description: Option<String>,
38
39    /// Optional embedding vector for semantic search
40    pub embedding: Option<Vec<f32>>,
41
42    /// Additional metadata as JSON
43    pub metadata: serde_json::Value,
44
45    /// Timestamp when the dataset was first indexed
46    pub first_seen_at: DateTime<Utc>,
47    /// Timestamp of the most recent update
48    pub last_updated_at: DateTime<Utc>,
49    /// SHA-256 hash of title + description for delta detection
50    pub content_hash: Option<String>,
51    /// Whether this dataset has been removed from its source portal
52    pub is_stale: bool,
53}
54
/// Data Transfer Object for inserting or updating datasets.
///
/// This structure is used when creating new datasets or updating existing ones.
/// Unlike `Dataset`, it doesn't include database-generated fields like `id` or
/// timestamps (those are filled in by the database on insert). The embedding
/// field stores a vector of floats for semantic search.
///
/// Note that `content_hash` is required here (a `String`), while on the
/// persisted `Dataset` it is `Option<String>`: every new write must carry a
/// hash, computed via [`NewDataset::compute_content_hash`].
///
/// # Examples
///
/// ```
/// use ceres_core::NewDataset;
/// use serde_json::json;
///
/// let title = "My Dataset";
/// let description = Some("Description here".to_string());
/// let content_hash = NewDataset::compute_content_hash(title, description.as_deref());
///
/// let dataset = NewDataset {
///     original_id: "dataset-123".to_string(),
///     source_portal: "https://dati.gov.it".to_string(),
///     url: "https://dati.gov.it/dataset/my-data".to_string(),
///     title: title.to_string(),
///     description,
///     embedding: None,
///     metadata: json!({"tags": ["open-data", "italy"]}),
///     content_hash,
/// };
///
/// assert_eq!(dataset.title, "My Dataset");
/// assert!(dataset.embedding.is_none());
/// assert_eq!(dataset.content_hash.len(), 64); // SHA-256 = 64 hex chars
/// ```
///
/// # Fields
///
/// * `original_id` - Original identifier from the source portal
/// * `source_portal` - Base URL of the originating CKAN portal
/// * `url` - Public landing page URL for the dataset
/// * `title` - Human-readable dataset title
/// * `description` - Optional detailed description
/// * `embedding` - Optional vector of floats for semantic search
/// * `metadata` - Additional metadata as JSON
/// * `content_hash` - SHA-256 hash of title + description for delta detection
#[derive(Debug, Serialize, Clone)]
pub struct NewDataset {
    /// Original identifier from the source portal
    pub original_id: String,
    /// Base URL of the originating CKAN portal
    pub source_portal: String,
    /// Public landing page URL for the dataset
    pub url: String,
    /// Human-readable dataset title
    pub title: String,
    /// Optional detailed description
    pub description: Option<String>,
    /// Optional embedding vector for semantic search
    pub embedding: Option<Vec<f32>>,
    /// Additional metadata as JSON
    pub metadata: serde_json::Value,
    /// SHA-256 hash of title + description for delta detection
    pub content_hash: String,
}
116
117impl NewDataset {
118    /// Computes a SHA-256 hash of the content (title + description) for delta detection.
119    ///
120    /// This hash is used to determine if the dataset content has changed since
121    /// the last harvest, avoiding unnecessary embedding regeneration.
122    ///
123    /// # Arguments
124    ///
125    /// * `title` - The dataset title
126    /// * `description` - Optional dataset description
127    ///
128    /// # Returns
129    ///
130    /// A 64-character lowercase hexadecimal string representing the SHA-256 hash.
131    pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
132        let mut hasher = Sha256::new();
133        // Use newline separator to prevent collisions (e.g., "AB" + "C" != "A" + "BC")
134        let content = format!("{}\n{}", title, description.unwrap_or(""));
135        hasher.update(content.as_bytes());
136        format!("{:x}", hasher.finalize())
137    }
138
139    /// Computes a content hash that includes the language preference.
140    ///
141    /// The language is included so that changing the preferred language
142    /// for a portal triggers re-embedding (since the resolved text changes).
143    pub fn compute_content_hash_with_language(
144        title: &str,
145        description: Option<&str>,
146        language: &str,
147    ) -> String {
148        let mut hasher = Sha256::new();
149        let content = format!("{}\n{}\n{}", language, title, description.unwrap_or(""));
150        hasher.update(content.as_bytes());
151        format!("{:x}", hasher.finalize())
152    }
153}
154
/// Result of a semantic search with similarity score.
///
/// This structure combines a dataset with its similarity score relative to
/// the search query. The score represents the cosine similarity between the
/// dataset embedding and the query embedding, with values between 0.0 (no similarity)
/// and 1.0 (identical).
///
/// NOTE(review): raw cosine similarity can be negative; the 0.0–1.0 range here
/// presumably reflects normalization done by the repository layer — confirm
/// against the search query implementation.
///
/// # Examples
///
/// ```
/// use ceres_core::SearchResult;
///
/// // SearchResult is created by the repository during searches
/// // The similarity_score indicates how relevant the dataset is to the query
/// // Typical values:
/// // - 0.9+ : Highly relevant match
/// // - 0.7-0.9 : Good match
/// // - 0.5-0.7 : Partial match
/// // - <0.5 : Weak match
/// ```
#[derive(Debug, Serialize, Clone)]
pub struct SearchResult {
    /// The matched dataset
    pub dataset: Dataset,
    /// Similarity score (0.0-1.0), where 1.0 is a perfect match
    pub similarity_score: f32,
}
182
/// Database statistics for dashboard and monitoring.
///
/// Provides an overview of the database state, useful for dashboards
/// and monitoring systems. Counts are `i64` to match PostgreSQL's
/// `BIGINT` aggregate results.
#[derive(Debug, Serialize, Clone)]
pub struct DatabaseStats {
    /// Total number of datasets in the database
    pub total_datasets: i64,
    /// Number of datasets with generated embeddings
    pub datasets_with_embeddings: i64,
    /// Number of unique indexed portals
    pub total_portals: i64,
    /// Timestamp of the last update (`None` when the database is empty)
    pub last_update: Option<DateTime<Utc>>,
    /// Number of datasets marked as stale (removed from source portal)
    pub stale_datasets: i64,
}
200
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new_dataset_creation() {
        // Build a NewDataset the same way harvesters do: hash first, then fill.
        let description = Some("A test dataset".to_string());
        let hash = NewDataset::compute_content_hash("Test Dataset", description.as_deref());

        let dataset = NewDataset {
            original_id: "test-123".to_string(),
            source_portal: "https://example.com".to_string(),
            url: "https://example.com/dataset/test".to_string(),
            title: "Test Dataset".to_string(),
            description,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
            content_hash: hash,
        };

        assert_eq!(dataset.original_id, "test-123");
        assert!(dataset.embedding.is_none());
        // SHA-256 renders as 64 hex characters.
        assert_eq!(dataset.content_hash.len(), 64);
    }

    #[test]
    fn test_compute_content_hash_consistency() {
        // Identical input must always yield the identical digest.
        let first = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        let second = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        assert_eq!(first, second);
        assert_eq!(first.len(), 64); // SHA-256 = 64 hex chars
    }

    #[test]
    fn test_compute_content_hash_different_content() {
        // A change in the title alone must change the digest.
        let a = NewDataset::compute_content_hash("Title A", Some("Description"));
        let b = NewDataset::compute_content_hash("Title B", Some("Description"));
        assert_ne!(a, b);
    }

    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        // A missing description is hashed exactly like an empty one.
        let none_desc = NewDataset::compute_content_hash("Title", None);
        let empty_desc = NewDataset::compute_content_hash("Title", Some(""));
        assert_eq!(none_desc, empty_desc);
    }

    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        // The newline separator keeps field boundaries distinct:
        // "AB" + "C" must not hash the same as "A" + "BC".
        let left = NewDataset::compute_content_hash("AB", Some("C"));
        let right = NewDataset::compute_content_hash("A", Some("BC"));
        assert_ne!(left, right);
    }
}