use chrono::{DateTime, Utc};
use serde::Serialize;
use sha2::{Digest, Sha256};
use uuid::Uuid;
/// A dataset record: an internal [`Uuid`], portal-side identification,
/// descriptive text, an optional embedding vector and bookkeeping fields.
///
/// NOTE(review): the per-field notes below are inferred from names and types
/// only; confirm them against the code that populates this struct.
#[derive(Debug, Clone, Serialize)]
pub struct Dataset {
    /// Identifier of this record.
    pub id: Uuid,
    /// Presumably the identifier the dataset carries on its source portal.
    pub original_id: String,
    /// Presumably identifies the portal the dataset came from.
    pub source_portal: String,
    /// URL associated with the dataset.
    pub url: String,
    /// Dataset title.
    pub title: String,
    /// Optional free-text description.
    pub description: Option<String>,
    /// Optional embedding vector; dimensionality is not fixed by this type.
    pub embedding: Option<Vec<f32>>,
    /// Arbitrary JSON metadata.
    pub metadata: serde_json::Value,
    /// UTC timestamp; presumably when the dataset was first recorded.
    pub first_seen_at: DateTime<Utc>,
    /// UTC timestamp; presumably when the record was last modified.
    pub last_updated_at: DateTime<Utc>,
    /// Optional content hash; presumably produced by
    /// `NewDataset::compute_content_hash` — TODO confirm.
    pub content_hash: Option<String>,
    /// Staleness flag; the criteria for staleness are defined elsewhere.
    pub is_stale: bool,
}
/// Field set for a dataset that does not yet carry an internal id or
/// timestamps.
///
/// Unlike [`Dataset`], `content_hash` is mandatory here.
///
/// NOTE(review): the per-field notes below are inferred from names and types
/// only; confirm them against the code that builds this struct.
#[derive(Debug, Clone, Serialize)]
pub struct NewDataset {
    /// Presumably the identifier the dataset carries on its source portal.
    pub original_id: String,
    /// Presumably identifies the portal the dataset came from.
    pub source_portal: String,
    /// URL associated with the dataset.
    pub url: String,
    /// Dataset title.
    pub title: String,
    /// Optional free-text description.
    pub description: Option<String>,
    /// Optional embedding vector; dimensionality is not fixed by this type.
    pub embedding: Option<Vec<f32>>,
    /// Arbitrary JSON metadata.
    pub metadata: serde_json::Value,
    /// Content hash string; presumably the output of one of the
    /// `compute_content_hash*` helpers — TODO confirm.
    pub content_hash: String,
}
impl NewDataset {
    /// Returns the lowercase hexadecimal SHA-256 digest of `title` and
    /// `description` joined by a single `\n`.
    ///
    /// A missing description is hashed as the empty string, so `None` and
    /// `Some("")` yield the same digest.
    pub fn compute_content_hash(title: &str, description: Option<&str>) -> String {
        Self::hex_sha256_of_lines(&[title, description.unwrap_or_default()])
    }

    /// Returns the lowercase hexadecimal SHA-256 digest of `language`, `title`
    /// and `description`, in that order, joined by single `\n` separators.
    ///
    /// A missing description is hashed as the empty string.
    pub fn compute_content_hash_with_language(
        title: &str,
        description: Option<&str>,
        language: &str,
    ) -> String {
        Self::hex_sha256_of_lines(&[language, title, description.unwrap_or_default()])
    }

    /// Joins `lines` with `\n` and hex-encodes the SHA-256 digest of the
    /// resulting UTF-8 bytes.
    fn hex_sha256_of_lines(lines: &[&str]) -> String {
        let joined = lines.join("\n");
        format!("{:x}", Sha256::digest(joined.as_bytes()))
    }
}
/// A [`Dataset`] paired with a similarity score.
#[derive(Debug, Clone, Serialize)]
pub struct SearchResult {
    /// The dataset this result refers to.
    pub dataset: Dataset,
    /// Similarity score for this result; its scale and direction are not
    /// defined by this type — confirm with the code that produces it.
    pub similarity_score: f32,
}
/// Aggregate counters describing the dataset store.
///
/// NOTE(review): the exact definition of each counter lives in the query that
/// fills this struct; the notes below are inferred from field names.
#[derive(Debug, Clone, Serialize)]
pub struct DatabaseStats {
    /// Number of datasets.
    pub total_datasets: i64,
    /// Number of datasets that have an embedding.
    pub datasets_with_embeddings: i64,
    /// Number of portals; presumably distinct source portals — confirm.
    pub total_portals: i64,
    /// UTC timestamp of the last update, if any.
    pub last_update: Option<DateTime<Utc>>,
    /// Number of datasets flagged as stale.
    pub stale_datasets: i64,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A hand-built `NewDataset` keeps the supplied fields and carries a
    /// 64-character content hash.
    #[test]
    fn test_new_dataset_creation() {
        let name = "Test Dataset";
        let summary = Some(String::from("A test dataset"));
        let digest = NewDataset::compute_content_hash(name, summary.as_deref());

        let record = NewDataset {
            original_id: String::from("test-123"),
            source_portal: String::from("https://example.com"),
            url: String::from("https://example.com/dataset/test"),
            title: name.to_owned(),
            description: summary,
            embedding: None,
            metadata: serde_json::json!({"key": "value"}),
            content_hash: digest,
        };

        assert_eq!(record.original_id, "test-123");
        assert!(record.embedding.is_none());
        assert_eq!(record.content_hash.len(), 64);
    }

    /// Hashing the same input twice yields the same 64-character digest.
    #[test]
    fn test_compute_content_hash_consistency() {
        let first = NewDataset::compute_content_hash("Test Title", Some("Test Description"));
        let second = NewDataset::compute_content_hash("Test Title", Some("Test Description"));

        assert_eq!(first, second);
        assert_eq!(first.len(), 64);
    }

    /// Changing only the title changes the digest.
    #[test]
    fn test_compute_content_hash_different_content() {
        let with_title_a = NewDataset::compute_content_hash("Title A", Some("Description"));
        let with_title_b = NewDataset::compute_content_hash("Title B", Some("Description"));

        assert_ne!(with_title_a, with_title_b);
    }

    /// An absent description hashes the same as an empty one.
    #[test]
    fn test_compute_content_hash_none_vs_empty() {
        let absent = NewDataset::compute_content_hash("Title", None);
        let empty = NewDataset::compute_content_hash("Title", Some(""));

        assert_eq!(absent, empty);
    }

    /// Moving a character across the title/description boundary must change
    /// the digest, which the separator guarantees.
    #[test]
    fn test_compute_content_hash_separator_prevents_collision() {
        let split_after_ab = NewDataset::compute_content_hash("AB", Some("C"));
        let split_after_a = NewDataset::compute_content_hash("A", Some("BC"));

        assert_ne!(split_after_ab, split_after_a);
    }
}