// ceres-core 0.4.0
//
// Core types, harvesting logic, and services for Ceres.
// Documentation
use chrono::{DateTime, Utc};
use serde::Serialize;
use sha2::{Digest, Sha256};
use uuid::Uuid;

/// Complete representation of a row from the 'datasets' table.
///
/// This structure represents a persisted dataset with all database fields,
/// including system-generated identifiers and timestamps. It maps directly
/// to the PostgreSQL schema and is used for reading data from the database.
///
/// For inserting or updating rows see [`NewDataset`], which carries the same
/// content fields but omits `id`, the two timestamps and `is_stale`.
///
/// Only `Serialize` is derived (no `Deserialize`), and the derived output
/// follows the field declaration order below.
///
/// # Fields
///
/// * `id` - Unique identifier (UUID) generated by the database
/// * `original_id` - Original identifier from the source portal
/// * `source_portal` - Base URL of the originating CKAN portal
/// * `url` - Public landing page URL for the dataset
/// * `title` - Human-readable dataset title
/// * `description` - Optional detailed description
/// * `embedding` - Optional vector of floats for semantic search
/// * `metadata` - Additional metadata as JSON
/// * `first_seen_at` - Timestamp when the dataset was first indexed
/// * `last_updated_at` - Timestamp of the most recent update
/// * `content_hash` - Optional SHA-256 hash of title + description for delta detection
/// * `is_stale` - Whether the dataset has been removed from its source portal
#[derive(Debug, Serialize, Clone)]
pub struct Dataset {
    /// Unique identifier (UUID) generated by the database
    pub id: Uuid,
    /// Original identifier from the source portal
    pub original_id: String,
    /// Base URL of the originating CKAN portal
    pub source_portal: String,
    /// Public landing page URL for the dataset
    pub url: String,
    /// Human-readable dataset title
    pub title: String,
    /// Optional detailed description
    pub description: Option<String>,

    /// Optional embedding vector for semantic search
    pub embedding: Option<Vec<f32>>,

    /// Additional metadata as JSON
    pub metadata: serde_json::Value,

    /// Timestamp when the dataset was first indexed
    pub first_seen_at: DateTime<Utc>,
    /// Timestamp of the most recent update
    pub last_updated_at: DateTime<Utc>,
    /// SHA-256 hash of title + description for delta detection.
    ///
    /// Optional here, whereas [`NewDataset::content_hash`] is always present.
    // NOTE(review): presumably `None` for rows stored before hashing existed — confirm.
    pub content_hash: Option<String>,
    /// Whether this dataset has been removed from its source portal
    pub is_stale: bool,
}

/// Data Transfer Object for inserting or updating datasets.
///
/// This structure is used when creating new datasets or updating existing ones.
/// Unlike `Dataset`, it doesn't include database-generated fields like `id` or
/// timestamps. The embedding field stores a vector of floats for semantic search.
///
/// `content_hash` is a required `String` on this type; build it with
/// [`NewDataset::compute_content_hash`] or
/// [`NewDataset::compute_content_hash_with_language`].
///
/// # Examples
///
/// ```
/// use ceres_core::NewDataset;
/// use serde_json::json;
///
/// let title = "My Dataset";
/// let description = Some("Description here".to_string());
/// let content_hash = NewDataset::compute_content_hash(title, description.as_deref());
///
/// let dataset = NewDataset {
///     original_id: "dataset-123".to_string(),
///     source_portal: "https://dati.gov.it".to_string(),
///     url: "https://dati.gov.it/dataset/my-data".to_string(),
///     title: title.to_string(),
///     description,
///     embedding: None,
///     metadata: json!({"tags": ["open-data", "italy"]}),
///     content_hash,
/// };
///
/// assert_eq!(dataset.title, "My Dataset");
/// assert!(dataset.embedding.is_none());
/// assert_eq!(dataset.content_hash.len(), 64); // SHA-256 = 64 hex chars
/// ```
///
/// # Fields
///
/// * `original_id` - Original identifier from the source portal
/// * `source_portal` - Base URL of the originating CKAN portal
/// * `url` - Public landing page URL for the dataset
/// * `title` - Human-readable dataset title
/// * `description` - Optional detailed description
/// * `embedding` - Optional vector of floats for semantic search
/// * `metadata` - Additional metadata as JSON
/// * `content_hash` - SHA-256 hash of title + description for delta detection
#[derive(Debug, Serialize, Clone)]
pub struct NewDataset {
    /// Original identifier from the source portal
    pub original_id: String,
    /// Base URL of the originating CKAN portal
    pub source_portal: String,
    /// Public landing page URL for the dataset
    pub url: String,
    /// Human-readable dataset title
    pub title: String,
    /// Optional detailed description
    pub description: Option<String>,
    /// Optional embedding vector for semantic search; `None` when no
    /// embedding is supplied with this record
    pub embedding: Option<Vec<f32>>,
    /// Additional metadata as JSON
    pub metadata: serde_json::Value,
    /// SHA-256 hash of title + description for delta detection.
    ///
    /// The struct does not compute or validate this value itself; the caller
    /// supplies it when building the record.
    pub content_hash: String,
}

impl NewDataset {
    /// Hashes `parts` as if they had been joined with a single `\n` between
    /// consecutive entries, returning the SHA-256 digest as lowercase hex.
    ///
    /// The parts are streamed into the hasher one by one, so no intermediate
    /// joined string is allocated; the hashed byte sequence is the same as
    /// hashing the joined text.
    fn hex_sha256_of_lines(parts: &[&str]) -> String {
        let mut sha = Sha256::new();
        for (position, part) in parts.iter().enumerate() {
            // Separator goes *between* parts only: none before the first,
            // none after the last.
            if position > 0 {
                sha.update(b"\n");
            }
            sha.update(part.as_bytes());
        }
        let digest = sha.finalize();
        format!("{:x}", digest)
    }

    /// Derives the delta-detection hash for a dataset from its title and
    /// description.
    ///
    /// Comparing this value against the one stored from the previous harvest
    /// tells whether the content changed, so embeddings are only regenerated
    /// when needed.
    ///
    /// The hashed text is `title`, a newline, then the description. The
    /// newline keeps `("AB", "C")` distinct from `("A", "BC")`. Inputs are not
    /// escaped, so a title that itself contains a newline can still produce
    /// the same text as a different title/description split.
    ///
    /// # Arguments
    ///
    /// * `title` - The dataset title
    /// * `description` - Optional dataset description; `None` is hashed
    ///   exactly like an empty string
    ///
    /// # Returns
    ///
    /// A 64-character lowercase hexadecimal string representing the SHA-256 hash.
    pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
        let body = description.unwrap_or_default();
        Self::hex_sha256_of_lines(&[title, body])
    }

    /// Derives a content hash that also covers the language preference.
    ///
    /// Because `language` is part of the hashed text, switching the preferred
    /// language for a portal yields a different hash and therefore triggers
    /// re-embedding (the resolved text changes with the language).
    ///
    /// The hashed text is `language`, a newline, `title`, a newline, then the
    /// description (`None` is hashed as an empty string). The result is a
    /// lowercase hexadecimal SHA-256 digest.
    pub fn compute_content_hash_with_language(
        title: &str,
        description: Option<&str>,
        language: &str,
    ) -> String {
        let body = description.unwrap_or_default();
        Self::hex_sha256_of_lines(&[language, title, body])
    }
}

/// Result of a semantic search with similarity score.
///
/// This structure combines a dataset with its similarity score relative to
/// the search query. The score represents the cosine similarity between the
/// dataset embedding and the query embedding, with values between 0.0 (no similarity)
/// and 1.0 (identical).
///
/// The type itself does not enforce that range: `similarity_score` is a plain
/// `f32` and no validation happens here.
///
/// # Examples
///
/// ```
/// use ceres_core::SearchResult;
///
/// // SearchResult is created by the repository during searches
/// // The similarity_score indicates how relevant the dataset is to the query
/// // Typical values:
/// // - 0.9+ : Highly relevant match
/// // - 0.7-0.9 : Good match
/// // - 0.5-0.7 : Partial match
/// // - <0.5 : Weak match
/// ```
#[derive(Debug, Serialize, Clone)]
pub struct SearchResult {
    /// The matched dataset, including all of its persisted fields
    pub dataset: Dataset,
    /// Similarity score (0.0-1.0), where 1.0 is a perfect match
    // NOTE(review): range is documented, not checked — confirm the producer clamps it.
    pub similarity_score: f32,
}

/// Database statistics for dashboard and monitoring.
///
/// Provides an overview of the database state, useful for dashboards
/// and monitoring systems.
///
/// This is a plain data carrier: every field is public, nothing is computed
/// here, and only `Serialize` is derived.
#[derive(Debug, Serialize, Clone)]
pub struct DatabaseStats {
    /// Total number of datasets in the database
    pub total_datasets: i64,
    /// Number of datasets with generated embeddings
    pub datasets_with_embeddings: i64,
    /// Number of unique indexed portals
    pub total_portals: i64,
    /// Timestamp of the last update, if there is one
    // NOTE(review): presumably `None` when the table is empty — confirm against the query.
    pub last_update: Option<DateTime<Utc>>,
    /// Number of datasets marked as stale (removed from source portal)
    pub stale_datasets: i64,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// A SHA-256 digest is 32 bytes, i.e. 64 hexadecimal characters.
    const SHA256_HEX_LEN: usize = 64;

    /// A `NewDataset` can be assembled by hand and keeps what it was given.
    #[test]
    fn test_new_dataset_creation() {
        let name = "Test Dataset";
        let summary = Some(String::from("A test dataset"));
        let digest = NewDataset::compute_content_hash(name, summary.as_deref());

        let record = NewDataset {
            original_id: String::from("test-123"),
            source_portal: String::from("https://example.com"),
            url: String::from("https://example.com/dataset/test"),
            title: String::from(name),
            description: summary,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
            content_hash: digest,
        };

        assert_eq!(record.original_id, "test-123");
        assert!(record.embedding.is_none());
        assert_eq!(record.content_hash.len(), SHA256_HEX_LEN);
    }

    /// Hashing the same input twice gives the same digest.
    #[test]
    fn test_compute_content_hash_consistency() {
        let first = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        let second = NewDataset::compute_content_hash("Test Title", Some("Test Description"));

        assert_eq!(first, second);
        assert_eq!(first.len(), SHA256_HEX_LEN);
    }

    /// A different title yields a different digest.
    #[test]
    fn test_compute_content_hash_different_content() {
        let with_a = NewDataset::compute_content_hash("Title A", Some("Description"));
        let with_b = NewDataset::compute_content_hash("Title B", Some("Description"));

        assert_ne!(with_a, with_b);
    }

    /// A missing description is hashed exactly like an empty one.
    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        let absent = NewDataset::compute_content_hash("Title", None);
        let empty = NewDataset::compute_content_hash("Title", Some(""));

        assert_eq!(absent, empty);
    }

    /// The separator keeps "AB" + "C" apart from "A" + "BC".
    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        let split_late = NewDataset::compute_content_hash("AB", Some("C"));
        let split_early = NewDataset::compute_content_hash("A", Some("BC"));

        assert_ne!(split_late, split_early);
    }
}