Skip to main content

ceres_core/
models.rs

1use chrono::{DateTime, Utc};
2use serde::Serialize;
3use sha2::{Digest, Sha256};
4use uuid::Uuid;
5
6/// Complete representation of a row from the 'datasets' table.
7///
8/// This structure represents a persisted dataset with all database fields,
9/// including system-generated identifiers and timestamps. It maps directly
10/// to the PostgreSQL schema and is used for reading data from the database.
11///
12/// # Fields
13///
14/// * `id` - Unique identifier (UUID) generated by the database
15/// * `original_id` - Original identifier from the source portal
16/// * `source_portal` - Base URL of the originating CKAN portal
17/// * `url` - Public landing page URL for the dataset
18/// * `title` - Human-readable dataset title
19/// * `description` - Optional detailed description
20/// * `embedding` - Optional vector of floats for semantic search
21/// * `metadata` - Additional metadata as JSON
22/// * `first_seen_at` - Timestamp when the dataset was first indexed
23/// * `last_updated_at` - Timestamp of the most recent update
24#[derive(Debug, Serialize, Clone)]
25pub struct Dataset {
26    /// Unique identifier (UUID) generated by the database
27    pub id: Uuid,
28    /// Original identifier from the source portal
29    pub original_id: String,
30    /// Base URL of the originating CKAN portal
31    pub source_portal: String,
32    /// Public landing page URL for the dataset
33    pub url: String,
34    /// Human-readable dataset title
35    pub title: String,
36    /// Optional detailed description
37    pub description: Option<String>,
38
39    /// Optional embedding vector for semantic search
40    pub embedding: Option<Vec<f32>>,
41
42    /// Additional metadata as JSON
43    pub metadata: serde_json::Value,
44
45    /// Timestamp when the dataset was first indexed
46    pub first_seen_at: DateTime<Utc>,
47    /// Timestamp of the most recent update
48    pub last_updated_at: DateTime<Utc>,
49    /// SHA-256 hash of title + description for delta detection
50    pub content_hash: Option<String>,
51}
52
53/// Data Transfer Object for inserting or updating datasets.
54///
55/// This structure is used when creating new datasets or updating existing ones.
56/// Unlike `Dataset`, it doesn't include database-generated fields like `id` or
57/// timestamps. The embedding field stores a vector of floats for semantic search.
58///
59/// # Examples
60///
61/// ```
62/// use ceres_core::NewDataset;
63/// use serde_json::json;
64///
65/// let title = "My Dataset";
66/// let description = Some("Description here".to_string());
67/// let content_hash = NewDataset::compute_content_hash(title, description.as_deref());
68///
69/// let dataset = NewDataset {
70///     original_id: "dataset-123".to_string(),
71///     source_portal: "https://dati.gov.it".to_string(),
72///     url: "https://dati.gov.it/dataset/my-data".to_string(),
73///     title: title.to_string(),
74///     description,
75///     embedding: None,
76///     metadata: json!({"tags": ["open-data", "italy"]}),
77///     content_hash,
78/// };
79///
80/// assert_eq!(dataset.title, "My Dataset");
81/// assert!(dataset.embedding.is_none());
82/// assert_eq!(dataset.content_hash.len(), 64); // SHA-256 = 64 hex chars
83/// ```
84///
85/// # Fields
86///
87/// * `original_id` - Original identifier from the source portal
88/// * `source_portal` - Base URL of the originating CKAN portal
89/// * `url` - Public landing page URL for the dataset
90/// * `title` - Human-readable dataset title
91/// * `description` - Optional detailed description
92/// * `embedding` - Optional vector of floats for semantic search
93/// * `metadata` - Additional metadata as JSON
94/// * `content_hash` - SHA-256 hash of title + description for delta detection
95#[derive(Debug, Serialize, Clone)]
96pub struct NewDataset {
97    /// Original identifier from the source portal
98    pub original_id: String,
99    /// Base URL of the originating CKAN portal
100    pub source_portal: String,
101    /// Public landing page URL for the dataset
102    pub url: String,
103    /// Human-readable dataset title
104    pub title: String,
105    /// Optional detailed description
106    pub description: Option<String>,
107    /// Optional embedding vector for semantic search
108    pub embedding: Option<Vec<f32>>,
109    /// Additional metadata as JSON
110    pub metadata: serde_json::Value,
111    /// SHA-256 hash of title + description for delta detection
112    pub content_hash: String,
113}
114
115impl NewDataset {
116    /// Computes a SHA-256 hash of the content (title + description) for delta detection.
117    ///
118    /// This hash is used to determine if the dataset content has changed since
119    /// the last harvest, avoiding unnecessary embedding regeneration.
120    ///
121    /// # Arguments
122    ///
123    /// * `title` - The dataset title
124    /// * `description` - Optional dataset description
125    ///
126    /// # Returns
127    ///
128    /// A 64-character lowercase hexadecimal string representing the SHA-256 hash.
129    pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
130        let mut hasher = Sha256::new();
131        // Use newline separator to prevent collisions (e.g., "AB" + "C" != "A" + "BC")
132        let content = format!("{}\n{}", title, description.unwrap_or(""));
133        hasher.update(content.as_bytes());
134        format!("{:x}", hasher.finalize())
135    }
136
137    /// Computes a content hash that includes the language preference.
138    ///
139    /// The language is included so that changing the preferred language
140    /// for a portal triggers re-embedding (since the resolved text changes).
141    pub fn compute_content_hash_with_language(
142        title: &str,
143        description: Option<&str>,
144        language: &str,
145    ) -> String {
146        let mut hasher = Sha256::new();
147        let content = format!("{}\n{}\n{}", language, title, description.unwrap_or(""));
148        hasher.update(content.as_bytes());
149        format!("{:x}", hasher.finalize())
150    }
151}
152
153/// Result of a semantic search with similarity score.
154///
155/// This structure combines a dataset with its similarity score relative to
156/// the search query. The score represents the cosine similarity between the
157/// dataset embedding and the query embedding, with values between 0.0 (no similarity)
158/// and 1.0 (identical).
159///
160/// # Examples
161///
162/// ```
163/// use ceres_core::SearchResult;
164///
165/// // SearchResult is created by the repository during searches
166/// // The similarity_score indicates how relevant the dataset is to the query
167/// // Typical values:
168/// // - 0.9+ : Highly relevant match
169/// // - 0.7-0.9 : Good match
170/// // - 0.5-0.7 : Partial match
171/// // - <0.5 : Weak match
172/// ```
173#[derive(Debug, Serialize, Clone)]
174pub struct SearchResult {
175    /// The matched dataset
176    pub dataset: Dataset,
177    /// Similarity score (0.0-1.0), where 1.0 is a perfect match
178    pub similarity_score: f32,
179}
180
181/// Database statistics for dashboard and monitoring.
182///
183/// Provides an overview of the database state, useful for dashboards
184/// and monitoring systems.
185#[derive(Debug, Serialize, Clone)]
186pub struct DatabaseStats {
187    /// Total number of datasets in the database
188    pub total_datasets: i64,
189    /// Number of datasets with generated embeddings
190    pub datasets_with_embeddings: i64,
191    /// Number of unique indexed portals
192    pub total_portals: i64,
193    /// Timestamp of the last update
194    pub last_update: Option<DateTime<Utc>>,
195}
196
#[cfg(test)]
mod tests {
    //! Unit tests for `NewDataset` construction and both content-hash helpers.

    use super::*;

    /// A `NewDataset` built from hashed content keeps its fields and carries a
    /// full-length hex digest.
    #[test]
    fn test_new_dataset_creation() {
        let title = "Test Dataset";
        let description = Some("A test dataset".to_string());
        let content_hash = NewDataset::compute_content_hash(title, description.as_deref());

        let dataset = NewDataset {
            original_id: "test-123".to_string(),
            source_portal: "https://example.com".to_string(),
            url: "https://example.com/dataset/test".to_string(),
            title: title.to_string(),
            description,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
            content_hash,
        };

        assert_eq!(dataset.original_id, "test-123");
        assert!(dataset.embedding.is_none());
        assert_eq!(dataset.content_hash.len(), 64);
    }

    /// Hashing the same input twice yields the same digest.
    #[test]
    fn test_compute_content_hash_consistency() {
        let hash1 = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        let hash2 = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        assert_eq!(hash1, hash2);
        assert_eq!(hash1.len(), 64); // SHA-256 = 64 hex chars
    }

    /// Hashes are persisted for delta detection, so the exact byte layout
    /// (title, `\n`, description) and the lowercase hex rendering must not
    /// drift. An empty title with no description hashes the single byte `\n`.
    #[test]
    fn test_compute_content_hash_known_answer() {
        let hash = NewDataset::compute_content_hash("", None);
        assert_eq!(
            hash,
            "01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b"
        );
    }

    /// A different title must change the digest.
    #[test]
    fn test_compute_content_hash_different_content() {
        let hash1 = NewDataset::compute_content_hash("Title A", Some("Description"));
        let hash2 = NewDataset::compute_content_hash("Title B", Some("Description"));
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        // None description and empty description should produce same hash
        let hash1 = NewDataset::compute_content_hash("Title", None);
        let hash2 = NewDataset::compute_content_hash("Title", Some(""));
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        // "AB" + "C" should differ from "A" + "BC"
        let hash1 = NewDataset::compute_content_hash("AB", Some("C"));
        let hash2 = NewDataset::compute_content_hash("A", Some("BC"));
        assert_ne!(hash1, hash2);
    }

    /// The language-aware hash is deterministic and a full-length digest.
    #[test]
    fn test_compute_content_hash_with_language_consistency() {
        let hash1 =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        let hash2 =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        assert_eq!(hash1, hash2);
        assert_eq!(hash1.len(), 64);
    }

    /// Switching the language must change the digest so that re-embedding is
    /// triggered.
    #[test]
    fn test_compute_content_hash_with_language_depends_on_language() {
        let english =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        let italian =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "it");
        assert_ne!(english, italian);
    }

    /// The language-aware hash is a distinct scheme from the plain one.
    #[test]
    fn test_compute_content_hash_with_language_differs_from_plain() {
        let plain = NewDataset::compute_content_hash("Title", Some("Description"));
        let with_language =
            NewDataset::compute_content_hash_with_language("Title", Some("Description"), "en");
        assert_ne!(plain, with_language);
    }

    #[test]
    fn test_compute_content_hash_with_language_none_vs_empty() {
        // As with the plain hash, a missing description is hashed as "".
        let hash1 = NewDataset::compute_content_hash_with_language("Title", None, "en");
        let hash2 = NewDataset::compute_content_hash_with_language("Title", Some(""), "en");
        assert_eq!(hash1, hash2);
    }
}
253}