// ceres-core 0.4.0

//! Trait definitions for external dependencies.
//!
//! This module defines traits that abstract over external dependencies
//! (embedding providers, portal clients, data stores), enabling:
//!
//! - **Testability**: Mock implementations for unit testing
//! - **Flexibility**: Different backend implementations (e.g., different embedding APIs)
//! - **Decoupling**: Core business logic doesn't depend on specific implementations
//!
//! # Example
//!
//! ```
//! use ceres_core::traits::{EmbeddingProvider, DatasetStore};
//!
//! // Business logic uses traits, not concrete types
//! async fn search_datasets<E, S>(
//!     embedding: &E,
//!     store: &S,
//!     query: &str,
//! ) -> Result<Vec<ceres_core::SearchResult>, ceres_core::AppError>
//! where
//!     E: EmbeddingProvider,
//!     S: DatasetStore,
//! {
//!     let vector: Vec<f32> = embedding.generate(query).await?;
//!     store.search(vector, 10).await
//! }
//! ```

use std::collections::HashMap;
use std::future::Future;

use chrono::{DateTime, Utc};
use futures::stream::{self, BoxStream};
use uuid::Uuid;

use crate::config::PortalType;
use crate::{AppError, Dataset, NewDataset, SearchResult};

/// Provider for generating text embeddings.
///
/// Implementations convert text into vector representations for semantic search.
/// Different providers may produce vectors of different dimensions:
/// - Gemini text-embedding-004: 768 dimensions
/// - OpenAI text-embedding-3-small: 1536 dimensions
/// - OpenAI text-embedding-3-large: 3072 dimensions
///
/// Only [`generate`](Self::generate), [`name`](Self::name) and
/// [`dimension`](Self::dimension) are required; the batching methods have
/// conservative defaults built on top of `generate`.
pub trait EmbeddingProvider: Send + Sync + Clone {
    /// Returns the provider identifier for logging and configuration.
    ///
    /// # Examples
    ///
    /// - `"gemini"` for Google Gemini
    /// - `"openai"` for OpenAI
    fn name(&self) -> &'static str;

    /// Returns the embedding dimension this provider generates.
    ///
    /// This value must match the database column dimension for vector storage.
    /// Mismatched dimensions will cause insertion failures.
    fn dimension(&self) -> usize;

    /// Generates an embedding vector for the given text.
    ///
    /// # Arguments
    ///
    /// * `text` - The text to embed
    ///
    /// # Returns
    ///
    /// A vector of floating-point values representing the text embedding.
    /// The vector length must equal `self.dimension()`.
    fn generate(&self, text: &str) -> impl Future<Output = Result<Vec<f32>, AppError>> + Send;

    /// Maximum number of texts supported per batch API call.
    ///
    /// The harvest pipeline uses `min(config.embedding_batch_size, max_batch_size())`
    /// to ensure batches never exceed provider limits.
    ///
    /// # Defaults
    ///
    /// Returns `1` (single-item batches). Providers with native batch support
    /// should override to enable efficient batching.
    fn max_batch_size(&self) -> usize {
        1
    }

    /// Generates embeddings for multiple texts in a batch.
    ///
    /// The default implementation awaits `generate()` once per text, in input
    /// order, one call at a time. Providers with native batch API support
    /// should override for efficiency.
    ///
    /// # Arguments
    ///
    /// * `texts` - Slice of texts to embed
    ///
    /// # Returns
    ///
    /// A vector of embedding vectors, one per input text and in the same
    /// order. An empty slice yields an empty vector.
    ///
    /// # Errors
    ///
    /// The default implementation stops at the first text whose `generate()`
    /// call fails and returns that error; embeddings computed before the
    /// failure are discarded.
    fn generate_batch(
        &self,
        texts: &[String],
    ) -> impl Future<Output = Result<Vec<Vec<f32>>, AppError>> + Send {
        // The returned future already borrows `self`, so it may borrow `texts`
        // as well; there is no need to copy every input string up front.
        async move {
            let mut embeddings = Vec::with_capacity(texts.len());
            for text in texts {
                embeddings.push(self.generate(text).await?);
            }
            Ok(embeddings)
        }
    }
}

/// Client for accessing open data portals (CKAN, Socrata, etc.).
///
/// Implementations fetch dataset metadata from portal APIs.
///
/// Only the bulk helpers (`search_all_datasets`, `search_all_datasets_stream`
/// and `dataset_count`) have default bodies; every other item is required.
pub trait PortalClient: Send + Sync + Clone {
    /// Type representing raw portal data before transformation.
    type PortalData: Send;

    /// Returns the portal type identifier (e.g., "ckan", "socrata", "dcat").
    fn portal_type(&self) -> &'static str;

    /// Returns the base URL of the portal.
    fn base_url(&self) -> &str;

    /// Lists all dataset IDs available on the portal.
    fn list_dataset_ids(&self) -> impl Future<Output = Result<Vec<String>, AppError>> + Send;

    /// Fetches detailed metadata for a specific dataset.
    ///
    /// # Arguments
    ///
    /// * `id` - The dataset identifier
    fn get_dataset(
        &self,
        id: &str,
    ) -> impl Future<Output = Result<Self::PortalData, AppError>> + Send;

    /// Converts portal-specific data into a normalized NewDataset.
    ///
    /// This is an associated function (it takes no `self`), so the conversion
    /// cannot read client state; everything it needs is passed explicitly.
    ///
    /// # Arguments
    ///
    /// * `data` - The raw portal data
    /// * `portal_url` - The portal URL for source tracking
    /// * `url_template` - Optional URL template with `{id}` and `{name}` placeholders
    /// * `language` - Preferred language for resolving multilingual fields
    fn into_new_dataset(
        data: Self::PortalData,
        portal_url: &str,
        url_template: Option<&str>,
        language: &str,
    ) -> NewDataset;

    /// Searches for datasets modified since the given timestamp.
    ///
    /// Used for incremental harvesting to fetch only recently modified datasets.
    /// Returns full dataset objects, eliminating the need for separate get_dataset calls.
    ///
    /// # Arguments
    ///
    /// * `since` - Only return datasets modified after this timestamp
    ///
    /// # Returns
    ///
    /// A vector of portal-specific dataset objects modified since the given time.
    /// Returns an error if the portal doesn't support incremental search.
    fn search_modified_since(
        &self,
        since: DateTime<Utc>,
    ) -> impl Future<Output = Result<Vec<Self::PortalData>, AppError>> + Send;

    /// Fetches all datasets from the portal in bulk using paginated search.
    ///
    /// This is far more efficient than `list_dataset_ids()` + individual
    /// `get_dataset()` calls for large portals (e.g., HDX with ~40k datasets),
    /// as it avoids per-dataset HTTP requests and rate limiting.
    ///
    /// Returns full dataset objects ready for processing.
    ///
    /// # Default implementation
    ///
    /// The default does **not** fetch anything: it resolves to
    /// `AppError::Generic("search_all_datasets not supported")`. Clients for
    /// portals without bulk search can leave it as is; a caller that needs the
    /// full catalog from such a client has to use `list_dataset_ids()` +
    /// `get_dataset()` itself.
    fn search_all_datasets(
        &self,
    ) -> impl Future<Output = Result<Vec<Self::PortalData>, AppError>> + Send {
        async {
            Err(AppError::Generic(
                "search_all_datasets not supported".to_string(),
            ))
        }
    }

    /// Streams all datasets from the portal page-by-page.
    ///
    /// Each stream item is a page of datasets (e.g., 1000 for CKAN, 100 for DCAT),
    /// matching natural pagination boundaries. This bounds peak memory to one page
    /// instead of accumulating the entire catalog.
    ///
    /// The default implementation wraps `search_all_datasets()` as a single-page
    /// stream, so existing implementors work without changes. If
    /// `search_all_datasets()` is also left at its default, that single item is
    /// the "not supported" error.
    fn search_all_datasets_stream(&self) -> BoxStream<'_, Result<Vec<Self::PortalData>, AppError>> {
        // `stream::once` yields exactly one item: the output of the bulk future.
        Box::pin(stream::once(self.search_all_datasets()))
    }

    /// Returns the total number of datasets on the portal.
    ///
    /// Used for progress reporting when streaming. Portals that support a cheap
    /// count query should override this. The default returns
    /// `AppError::Generic("dataset_count not supported")`.
    fn dataset_count(&self) -> impl Future<Output = Result<usize, AppError>> + Send {
        async { Err(AppError::Generic("dataset_count not supported".to_string())) }
    }
}

/// Factory for creating portal clients.
///
/// Separate from PortalClient to avoid issues with async trait constructors.
/// Construction itself is synchronous: `create` returns a `Result`, not a future.
pub trait PortalClientFactory: Send + Sync + Clone {
    /// The type of portal client this factory creates.
    type Client: PortalClient;

    /// Creates a new portal client for the given URL, portal type, and optional profile.
    ///
    /// Takes `&self`, so one factory can be reused to build any number of clients.
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The portal API base URL
    /// * `portal_type` - The type of portal to create a client for
    /// * `language` - Preferred language for multilingual portals (e.g. "en", "fr")
    /// * `profile` - Optional profile for sub-dispatch (e.g. `"sparql"` for DCAT portals)
    /// * `sparql_endpoint` - Optional custom SPARQL endpoint URL (overrides `{url}/sparql`)
    ///
    /// # Errors
    ///
    /// Returns an `AppError` when the client cannot be constructed. The exact
    /// failure conditions are defined by each implementation (presumably an
    /// invalid URL or an unsupported portal type/profile — confirm against the
    /// concrete factory).
    fn create(
        &self,
        portal_url: &str,
        portal_type: PortalType,
        language: &str,
        profile: Option<&str>,
        sparql_endpoint: Option<&str>,
    ) -> Result<Self::Client, AppError>;
}

/// Store for dataset persistence and retrieval.
///
/// Implementations handle database operations for datasets. Every method is
/// required (the trait provides no defaults) and every fallible operation
/// reports failure as an `AppError`.
pub trait DatasetStore: Send + Sync + Clone {
    /// Retrieves a dataset by its unique ID.
    ///
    /// # Arguments
    ///
    /// * `id` - The dataset's UUID
    ///
    /// # Returns
    ///
    /// The dataset if found, or `None` if no dataset has that ID.
    fn get_by_id(&self, id: Uuid)
    -> impl Future<Output = Result<Option<Dataset>, AppError>> + Send;

    /// Retrieves content hashes for all datasets from a specific portal.
    ///
    /// Used for delta detection to determine which datasets need reprocessing.
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The source portal URL
    ///
    /// # Returns
    ///
    /// A map from `original_id` to optional `content_hash`. The hash is an
    /// `Option`, so a dataset can be present in the map without a stored hash.
    fn get_hashes_for_portal(
        &self,
        portal_url: &str,
    ) -> impl Future<Output = Result<HashMap<String, Option<String>>, AppError>> + Send;

    /// Updates only the timestamp for an unchanged dataset.
    ///
    /// Used when content hash matches but we want to track "last seen" time.
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The source portal URL
    /// * `original_id` - The dataset's original ID from the portal
    fn update_timestamp_only(
        &self,
        portal_url: &str,
        original_id: &str,
    ) -> impl Future<Output = Result<(), AppError>> + Send;

    /// Batch updates timestamps for multiple unchanged datasets.
    ///
    /// More efficient than calling `update_timestamp_only` for each dataset.
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The source portal URL
    /// * `original_ids` - Slice of dataset original IDs to update
    ///
    /// # Returns
    ///
    /// The number of rows actually updated.
    fn batch_update_timestamps(
        &self,
        portal_url: &str,
        original_ids: &[String],
    ) -> impl Future<Output = Result<u64, AppError>> + Send;

    /// Marks datasets as stale if they were not seen during the latest full sync.
    ///
    /// After a successful full sync, any dataset whose `last_updated_at` is older
    /// than `sync_start` was not present in the portal's response.
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The source portal URL
    /// * `sync_start` - Timestamp recorded at the start of the sync
    ///
    /// # Returns
    ///
    /// The number of datasets newly marked as stale.
    fn mark_stale_datasets(
        &self,
        portal_url: &str,
        sync_start: DateTime<Utc>,
    ) -> impl Future<Output = Result<u64, AppError>> + Send;

    /// Marks datasets as stale if their `original_id` is NOT in the given set.
    ///
    /// This is more efficient than the timestamp-based approach because it
    /// avoids updating every unchanged row just to compare timestamps later.
    /// Instead, we directly identify stale datasets by exclusion.
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The source portal URL
    /// * `seen_ids` - All `original_id`s seen during the current full sync
    ///
    /// # Returns
    ///
    /// The number of datasets newly marked as stale.
    fn mark_stale_by_exclusion(
        &self,
        portal_url: &str,
        seen_ids: &[String],
    ) -> impl Future<Output = Result<u64, AppError>> + Send;

    /// Inserts or updates a dataset.
    ///
    /// # Arguments
    ///
    /// * `dataset` - The dataset to upsert
    ///
    /// # Returns
    ///
    /// The UUID of the affected row.
    fn upsert(&self, dataset: &NewDataset) -> impl Future<Output = Result<Uuid, AppError>> + Send;

    /// Batch upserts multiple datasets in a single operation.
    ///
    /// Much faster than calling `upsert` in a loop because it reduces
    /// database round-trips and amortizes index update costs.
    ///
    /// # Arguments
    ///
    /// * `datasets` - Slice of datasets to upsert
    ///
    /// # Returns
    ///
    /// The UUIDs of all affected rows.
    fn batch_upsert(
        &self,
        datasets: &[NewDataset],
    ) -> impl Future<Output = Result<Vec<Uuid>, AppError>> + Send;

    /// Performs vector similarity search.
    ///
    /// # Arguments
    ///
    /// * `query_vector` - The embedding vector to search for (taken by value)
    /// * `limit` - Maximum number of results
    ///
    /// # Returns
    ///
    /// Datasets ranked by similarity score (highest first).
    fn search(
        &self,
        query_vector: Vec<f32>,
        limit: usize,
    ) -> impl Future<Output = Result<Vec<SearchResult>, AppError>> + Send;

    /// Lists datasets as a stream with optional filtering.
    ///
    /// This method returns a stream of datasets for memory-efficient
    /// processing of large result sets. Unlike batch methods, it streams
    /// results directly from the database without loading everything into memory.
    ///
    /// The returned stream borrows both `self` and `portal_filter` for `'a`,
    /// so both must outlive it. Each item is a `Result`, so an error can
    /// surface part-way through iteration.
    ///
    /// # Arguments
    ///
    /// * `portal_filter` - Optional portal URL to filter by
    /// * `limit` - Optional maximum number of records
    fn list_stream<'a>(
        &'a self,
        portal_filter: Option<&'a str>,
        limit: Option<usize>,
    ) -> BoxStream<'a, Result<Dataset, AppError>>;

    /// Retrieves the last successful sync timestamp for a portal.
    ///
    /// Used for incremental harvesting to determine which datasets
    /// have been modified since the last sync.
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The source portal URL
    ///
    /// # Returns
    ///
    /// The timestamp of the last successful sync, or `None` if never synced.
    fn get_last_sync_time(
        &self,
        portal_url: &str,
    ) -> impl Future<Output = Result<Option<DateTime<Utc>>, AppError>> + Send;

    /// Records a sync status for a portal.
    ///
    /// Called after a harvest operation to update the sync status.
    /// The `sync_status` parameter indicates the outcome: "completed" or "cancelled".
    ///
    /// # Arguments
    ///
    /// * `portal_url` - The source portal URL
    /// * `sync_time` - The timestamp of this sync
    /// * `sync_mode` - Either "full" or "incremental"
    /// * `sync_status` - The outcome: "completed" or "cancelled"
    /// * `datasets_synced` - Number of datasets processed
    ///
    /// TODO(design): sync_mode/sync_status should be typed enums, not &str
    fn record_sync_status(
        &self,
        portal_url: &str,
        sync_time: DateTime<Utc>,
        sync_mode: &str,
        sync_status: &str,
        datasets_synced: i32,
    ) -> impl Future<Output = Result<(), AppError>> + Send;

    /// Returns lowercased titles that appear across multiple portals.
    ///
    /// Used for cross-portal duplicate detection in Parquet exports.
    /// Typically returns ~21k titles (~2MB) for the full dataset.
    fn get_duplicate_titles(
        &self,
    ) -> impl Future<Output = Result<std::collections::HashSet<String>, AppError>> + Send;

    /// Lists datasets that have no embedding vector (`embedding IS NULL`).
    ///
    /// Used by [`crate::EmbeddingService`] to find datasets needing embedding generation.
    ///
    /// # Arguments
    ///
    /// * `portal_filter` - Optional portal URL to scope the query
    /// * `limit` - Optional maximum number of datasets to return
    fn list_pending_embeddings(
        &self,
        portal_filter: Option<&str>,
        limit: Option<usize>,
    ) -> impl Future<Output = Result<Vec<Dataset>, AppError>> + Send;

    /// Counts datasets with `embedding IS NULL`.
    ///
    /// Used for progress reporting in the embedding service.
    ///
    /// # Arguments
    ///
    /// * `portal_filter` - Optional portal URL to scope the count
    ///
    /// # Returns
    ///
    /// The number of matching datasets, as an `i64`.
    fn count_pending_embeddings(
        &self,
        portal_filter: Option<&str>,
    ) -> impl Future<Output = Result<i64, AppError>> + Send;

    /// Checks database connectivity.
    ///
    /// Performs a simple query to verify the database is reachable and responsive.
    /// Used by health check endpoints.
    fn health_check(&self) -> impl Future<Output = Result<(), AppError>> + Send;
}