ceres_core/
models.rs

1use chrono::{DateTime, Utc};
2use pgvector::Vector;
3use serde::{Deserialize, Serialize};
4use sha2::{Digest, Sha256};
5use sqlx::prelude::FromRow;
6use sqlx::types::Json;
7use uuid::Uuid;
8
9/// Complete representation of a row from the 'datasets' table.
10///
11/// This structure represents a persisted dataset with all database fields,
12/// including system-generated identifiers and timestamps. It maps directly
13/// to the PostgreSQL schema and is used for reading data from the database.
14///
15/// # Fields
16///
17/// * `id` - Unique identifier (UUID) generated by the database
18/// * `original_id` - Original identifier from the source portal
19/// * `source_portal` - Base URL of the originating CKAN portal
20/// * `url` - Public landing page URL for the dataset
21/// * `title` - Human-readable dataset title
22/// * `description` - Optional detailed description
23/// * `embedding` - Optional 1536-dimensional vector for semantic search
24/// * `metadata` - Additional metadata stored as JSONB
25/// * `first_seen_at` - Timestamp when the dataset was first indexed
26/// * `last_updated_at` - Timestamp of the most recent update
27#[derive(Debug, FromRow, Serialize, Clone)]
28pub struct Dataset {
29    /// Unique identifier (UUID) generated by the database
30    pub id: Uuid,
31    /// Original identifier from the source portal
32    pub original_id: String,
33    /// Base URL of the originating CKAN portal
34    pub source_portal: String,
35    /// Public landing page URL for the dataset
36    pub url: String,
37    /// Human-readable dataset title
38    pub title: String,
39    /// Optional detailed description
40    pub description: Option<String>,
41
42    /// Optional 1536-dimensional vector for semantic search (pgvector type)
43    pub embedding: Option<Vector>,
44
45    /// Additional metadata stored as JSONB
46    pub metadata: Json<serde_json::Value>,
47
48    /// Timestamp when the dataset was first indexed
49    pub first_seen_at: DateTime<Utc>,
50    /// Timestamp of the most recent update
51    pub last_updated_at: DateTime<Utc>,
52    /// SHA-256 hash of title + description for delta detection
53    pub content_hash: Option<String>,
54}
55
56/// Data Transfer Object for inserting or updating datasets.
57///
58/// This structure is used when creating new datasets or updating existing ones.
59/// Unlike `Dataset`, it doesn't include database-generated fields like `id` or
60/// timestamps. The embedding field uses pgvector's `Vector` for database storage.
61///
62/// # Examples
63///
64/// ```
65/// use ceres_core::NewDataset;
66/// use serde_json::json;
67///
68/// let title = "My Dataset";
69/// let description = Some("Description here".to_string());
70/// let content_hash = NewDataset::compute_content_hash(title, description.as_deref());
71///
72/// let dataset = NewDataset {
73///     original_id: "dataset-123".to_string(),
74///     source_portal: "https://dati.gov.it".to_string(),
75///     url: "https://dati.gov.it/dataset/my-data".to_string(),
76///     title: title.to_string(),
77///     description,
78///     embedding: None,
79///     metadata: json!({"tags": ["open-data", "italy"]}),
80///     content_hash,
81/// };
82///
83/// assert_eq!(dataset.title, "My Dataset");
84/// assert!(dataset.embedding.is_none());
85/// assert_eq!(dataset.content_hash.len(), 64); // SHA-256 = 64 hex chars
86/// ```
87///
88/// # Fields
89///
90/// * `original_id` - Original identifier from the source portal
91/// * `source_portal` - Base URL of the originating CKAN portal
92/// * `url` - Public landing page URL for the dataset
93/// * `title` - Human-readable dataset title
94/// * `description` - Optional detailed description
95/// * `embedding` - Optional vector of 768 floats (pgvector)
96/// * `metadata` - Additional metadata as JSON
97/// * `content_hash` - SHA-256 hash of title + description for delta detection
98#[derive(Debug, Serialize, Clone)]
99pub struct NewDataset {
100    /// Original identifier from the source portal
101    pub original_id: String,
102    /// Base URL of the originating CKAN portal
103    pub source_portal: String,
104    /// Public landing page URL for the dataset
105    pub url: String,
106    /// Human-readable dataset title
107    pub title: String,
108    /// Optional detailed description
109    pub description: Option<String>,
110    /// Optional vector of 768 floats (converted to pgvector on storage)
111    pub embedding: Option<Vector>,
112    /// Additional metadata as JSON
113    pub metadata: serde_json::Value,
114    /// SHA-256 hash of title + description for delta detection
115    pub content_hash: String,
116}
117
118impl NewDataset {
119    /// Computes a SHA-256 hash of the content (title + description) for delta detection.
120    ///
121    /// This hash is used to determine if the dataset content has changed since
122    /// the last harvest, avoiding unnecessary embedding regeneration.
123    ///
124    /// # Arguments
125    ///
126    /// * `title` - The dataset title
127    /// * `description` - Optional dataset description
128    ///
129    /// # Returns
130    ///
131    /// A 64-character lowercase hexadecimal string representing the SHA-256 hash.
132    pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
133        let mut hasher = Sha256::new();
134        // Use newline separator to prevent collisions (e.g., "AB" + "C" != "A" + "BC")
135        let content = format!("{}\n{}", title, description.unwrap_or(""));
136        hasher.update(content.as_bytes());
137        format!("{:x}", hasher.finalize())
138    }
139}
140
141/// Result of a semantic search with similarity score.
142///
143/// This structure combines a dataset with its similarity score relative to
144/// the search query. The score represents the cosine similarity between the
145/// dataset embedding and the query embedding, with values between 0.0 (no similarity)
146/// and 1.0 (identical).
147///
148/// # Examples
149///
150/// ```
151/// use ceres_core::SearchResult;
152///
153/// // SearchResult is created by the repository during searches
154/// // The similarity_score indicates how relevant the dataset is to the query
155/// // Typical values:
156/// // - 0.9+ : Highly relevant match
157/// // - 0.7-0.9 : Good match
158/// // - 0.5-0.7 : Partial match
159/// // - <0.5 : Weak match
160/// ```
161#[derive(Debug, Serialize, Clone)]
162pub struct SearchResult {
163    /// The matched dataset
164    pub dataset: Dataset,
165    /// Similarity score (0.0-1.0), where 1.0 is a perfect match
166    pub similarity_score: f32,
167}
168
169/// Database statistics for dashboard and monitoring.
170///
171/// Provides an overview of the database state, useful for dashboards
172/// and monitoring systems.
173#[derive(Debug, Serialize, Clone)]
174pub struct DatabaseStats {
175    /// Total number of datasets in the database
176    pub total_datasets: i64,
177    /// Number of datasets with generated embeddings
178    pub datasets_with_embeddings: i64,
179    /// Number of unique indexed portals
180    pub total_portals: i64,
181    /// Timestamp of the last update
182    pub last_update: Option<DateTime<Utc>>,
183}
184
185/// Portal configured in portals.toml.
186///
187/// Represents an open data portal configured for harvesting.
188/// Supports different portal types (CKAN, Socrata, DCAT).
189///
190/// # Examples
191///
192/// ```
193/// use ceres_core::Portal;
194///
195/// let json = r#"{
196///     "name": "Dati.gov.it",
197///     "url": "https://dati.gov.it",
198///     "type": "ckan",
199///     "description": "Italian national open data portal"
200/// }"#;
201///
202/// let portal: Portal = serde_json::from_str(json).unwrap();
203/// assert_eq!(portal.name, "Dati.gov.it");
204/// assert_eq!(portal.portal_type, "ckan");
205/// assert!(portal.enabled); // Default is true
206/// ```
207#[derive(Debug, Serialize, Deserialize, Clone)]
208pub struct Portal {
209    /// Portal name (human-readable)
210    pub name: String,
211    /// Base URL of the portal
212    pub url: String,
213    /// Portal type ("ckan", "socrata", "dcat")
214    #[serde(rename = "type")]
215    pub portal_type: String,
216    /// Whether the portal is enabled for harvesting
217    #[serde(default = "default_enabled")]
218    pub enabled: bool,
219    /// Optional portal description
220    pub description: Option<String>,
221}
222
/// Serde default for the `enabled` field of `Portal`.
///
/// Always returns `true`, so a portal entry that does not mention `enabled`
/// is treated as enabled.
fn default_enabled() -> bool {
    true
}
227
#[cfg(test)]
mod tests {
    use super::*;

    /// A portal definition without an `enabled` key deserializes as enabled.
    #[test]
    fn test_portal_default_enabled() {
        let raw = r#"{
            "name": "Test Portal",
            "url": "https://example.com",
            "type": "ckan"
        }"#;

        let parsed = serde_json::from_str::<Portal>(raw).unwrap();
        assert!(parsed.enabled);
    }

    /// A `NewDataset` can be assembled by hand and carries a 64-char hash.
    #[test]
    fn test_new_dataset_creation() {
        let title = "Test Dataset";
        let description = Some("A test dataset".to_string());

        let record = NewDataset {
            original_id: "test-123".to_string(),
            source_portal: "https://example.com".to_string(),
            url: "https://example.com/dataset/test".to_string(),
            title: title.to_string(),
            content_hash: NewDataset::compute_content_hash(title, description.as_deref()),
            description,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
        };

        assert_eq!(record.original_id, "test-123");
        assert!(record.embedding.is_none());
        assert_eq!(record.content_hash.len(), 64);
    }

    /// Hashing the same input twice gives the same 64-char digest.
    #[test]
    fn test_compute_content_hash_consistency() {
        let hash_once =
            || NewDataset::compute_content_hash("Test Title", Some("Test Description"));

        let first = hash_once();
        let second = hash_once();

        assert_eq!(first, second);
        assert_eq!(first.len(), 64); // SHA-256 = 64 hex chars
    }

    /// Changing the title changes the digest.
    #[test]
    fn test_compute_content_hash_different_content() {
        let with_title_a = NewDataset::compute_content_hash("Title A", Some("Description"));
        let with_title_b = NewDataset::compute_content_hash("Title B", Some("Description"));
        assert_ne!(with_title_a, with_title_b);
    }

    /// A missing description and an empty description hash identically.
    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        let absent = NewDataset::compute_content_hash("Title", None);
        let empty = NewDataset::compute_content_hash("Title", Some(""));
        assert_eq!(absent, empty);
    }

    /// The separator keeps "AB" + "C" distinct from "A" + "BC".
    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        let split_late = NewDataset::compute_content_hash("AB", Some("C"));
        let split_early = NewDataset::compute_content_hash("A", Some("BC"));
        assert_ne!(split_late, split_early);
    }
}