use ceres_core::PortalType;
use clap::{Parser, Subcommand, ValueEnum};
use std::path::PathBuf;
use std::sync::LazyLock;
/// Multi-line version banner, built once on first access.
///
/// The first line is the crate version (`CARGO_PKG_VERSION`); the following lines carry build
/// metadata read from the `VERGEN_*` compile-time environment variables. Any variable that is
/// not set at compile time is reported as `unknown`.
static VERSION_INFO: LazyLock<String> = LazyLock::new(|| {
    // Placeholder for build metadata that was not available at compile time.
    const UNKNOWN: &str = "unknown";

    // Only the first seven characters of the commit hash are shown.
    let short_sha: String = match option_env!("VERGEN_GIT_SHA") {
        Some(full_sha) => full_sha.chars().take(7).collect(),
        None => UNKNOWN.to_string(),
    };
    let build_date = option_env!("VERGEN_BUILD_DATE").unwrap_or(UNKNOWN);
    let target_triple = option_env!("VERGEN_CARGO_TARGET_TRIPLE").unwrap_or(UNKNOWN);
    let rustc_version = option_env!("VERGEN_RUSTC_SEMVER").unwrap_or(UNKNOWN);

    format!(
        "{}\ncommit: {}\nbuilt: {}\ntarget: {}\nrustc: {}",
        env!("CARGO_PKG_VERSION"),
        short_sha,
        build_date,
        target_triple,
        rustc_version
    )
});
/// Returns the multi-line version banner held in `VERSION_INFO`.
///
/// The string is computed lazily on the first call and the same `'static` slice is returned
/// on every call afterwards.
pub fn version_info() -> &'static str {
    VERSION_INFO.as_str()
}
// Top-level command-line configuration for the `ceres` binary, parsed by clap.
//
// NOTE: the comments on this struct are deliberately plain `//` comments. clap's derive
// macros turn `///` doc comments into help text, so using them here would change the
// `--help` output.
#[derive(Parser, Debug)]
#[command(
    name = "ceres",
    author,
    version = version_info(),
    about = "Semantic search engine for open data portals",
    after_help = "Examples:
ceres harvest https://dati.comune.milano.it
ceres search \"air quality monitoring\" --limit 5
ceres export --format jsonl > datasets.jsonl
ceres stats
Embedding providers:
EMBEDDING_PROVIDER=gemini (default) - Google Gemini (768 dimensions)
EMBEDDING_PROVIDER=openai - OpenAI (1536 or 3072 dimensions)
EMBEDDING_PROVIDER=ollama - Ollama local (768 dimensions, default: nomic-embed-text)"
)]
pub struct Config {
    // Database connection string: `--database-url` or the `DATABASE_URL` environment
    // variable. It has no default and is not optional, so one of the two must be supplied.
    #[arg(long, env = "DATABASE_URL")]
    pub database_url: String,

    // Embedding provider name: `--embedding-provider` or `EMBEDDING_PROVIDER`, "gemini" when
    // neither is given. The help text above lists gemini, openai and ollama; the value is a
    // free-form string here, so any validation must happen elsewhere.
    #[arg(long, env = "EMBEDDING_PROVIDER", default_value = "gemini")]
    pub embedding_provider: String,

    // Optional API key: `--gemini-api-key` or `GEMINI_API_KEY`.
    // Presumably only needed when the gemini provider is selected; verify against the caller.
    #[arg(long, env = "GEMINI_API_KEY")]
    pub gemini_api_key: Option<String>,

    // Optional API key: `--openai-api-key` or `OPENAI_API_KEY`.
    // Presumably only needed when the openai provider is selected; verify against the caller.
    #[arg(long, env = "OPENAI_API_KEY")]
    pub openai_api_key: Option<String>,

    // Optional model name: `--embedding-model` or `EMBEDDING_MODEL`.
    // Presumably overrides the provider's default model; verify against the caller.
    #[arg(long, env = "EMBEDDING_MODEL")]
    pub embedding_model: Option<String>,

    // Optional endpoint: `--ollama-endpoint` or `OLLAMA_ENDPOINT`.
    // Presumably the address of the Ollama server; verify against the caller.
    #[arg(long, env = "OLLAMA_ENDPOINT")]
    pub ollama_endpoint: Option<String>,

    // The subcommand to execute.
    #[command(subcommand)]
    pub command: Command,
}
// Subcommands of the `ceres` CLI.
//
// NOTE: the comments on this enum are deliberately plain `//` comments. clap's derive
// macros turn `///` doc comments into help text, so using them here would change the
// `--help` output.
#[derive(Subcommand, Debug)]
pub enum Command {
    // `ceres harvest`: harvest a single portal given by URL or by name, or, per the examples
    // in the help text, all enabled portals from the config when neither is given.
    #[command(after_help = "Examples:
ceres harvest # Harvest all enabled portals from config
ceres harvest https://dati.comune.milano.it # Harvest single CKAN URL (default type)
ceres harvest https://data.public.lu --type dcat # Harvest DCAT portal by URL
ceres harvest https://data.europa.eu --type dcat --profile sparql # Harvest SPARQL DCAT endpoint
ceres harvest --portal milano # Harvest portal by name from config
ceres harvest --config ~/custom.toml # Use custom config file
ceres harvest --full-sync # Force full sync even if incremental is available")]
    Harvest {
        // Optional positional portal URL.
        #[arg(value_name = "URL")]
        portal_url: Option<String>,

        // `--type TYPE`: portal type, "ckan" unless specified. Passing it requires a URL.
        #[arg(long, value_name = "TYPE", default_value = "ckan", requires = "portal_url")]
        r#type: PortalType,

        // `--profile PROFILE`: optional profile name (the help example uses `sparql` with a
        // DCAT portal). Passing it requires a URL.
        #[arg(long, value_name = "PROFILE", requires = "portal_url")]
        profile: Option<String>,

        // `-p` / `--portal NAME`: select a portal by name from the config; mutually
        // exclusive with the positional URL.
        #[arg(short, long, value_name = "NAME", conflicts_with = "portal_url")]
        portal: Option<String>,

        // `-c` / `--config PATH`: custom config file location.
        #[arg(short, long, value_name = "PATH")]
        config: Option<PathBuf>,

        // `--full-sync`: force a full sync even if an incremental one is available.
        #[arg(long)]
        full_sync: bool,

        // `--dry-run`: boolean flag; its exact effect is defined by the harvest
        // implementation, which is not visible here.
        #[arg(long)]
        dry_run: bool,

        // `--metadata-only`: boolean flag; its exact effect is defined by the harvest
        // implementation, which is not visible here.
        #[arg(long)]
        metadata_only: bool,
    },

    // `ceres embed`: embed pending datasets, optionally restricted to one portal.
    #[command(after_help = "Examples:
ceres embed # Embed all pending datasets
ceres embed --portal https://dati.comune.milano.it # Embed pending from one portal")]
    Embed {
        // `-p` / `--portal`: optional portal filter (the help example passes a portal URL).
        #[arg(short, long)]
        portal: Option<String>,
    },

    // `ceres search QUERY`: search with a free-text query.
    #[command(after_help = "Example: ceres search \"trasporto pubblico\" --limit 10")]
    Search {
        // Required positional query text.
        query: String,

        // `-l` / `--limit`: maximum number of results, 10 unless specified.
        #[arg(short, long, default_value = "10")]
        limit: usize,
    },

    // `ceres export`: export datasets in one of the `ExportFormat` formats.
    #[command(after_help = "Examples:
ceres export --format jsonl > datasets.jsonl
ceres export --format json --portal https://dati.gov.it
ceres export --format parquet --output ./ceres-export")]
    Export {
        // `-f` / `--format`: output format, "jsonl" unless specified.
        #[arg(short, long, default_value = "jsonl")]
        format: ExportFormat,

        // `-p` / `--portal`: optional portal filter (the help example passes a portal URL).
        #[arg(short, long)]
        portal: Option<String>,

        // `-l` / `--limit`: optional cap on the number of exported datasets.
        #[arg(short, long)]
        limit: Option<usize>,

        // `-o` / `--output DIR`: optional output directory.
        #[arg(short, long, value_name = "DIR")]
        output: Option<PathBuf>,

        // `-c` / `--config PATH`: optional config file location.
        #[arg(short, long, value_name = "PATH")]
        config: Option<PathBuf>,
    },

    // `ceres stats`: takes no arguments.
    Stats,
}
// Output formats accepted by `ceres export --format`.
//
// NOTE: plain `//` comments are used on purpose. With clap's `ValueEnum` derive, `///` doc
// comments on variants become help text for the possible values. The variant order is kept
// as-is because it determines the order in which the possible values are listed.
#[derive(Debug, Clone, ValueEnum)]
pub enum ExportFormat {
    // Selected with `--format jsonl`; this is the default of the `export` subcommand.
    Jsonl,

    // Selected with `--format json`.
    Json,

    // Selected with `--format csv`.
    Csv,

    // Selected with `--format parquet`.
    Parquet,
}
#[cfg(test)]
mod tests {
    use super::version_info;

    /// The version banner must contain every build-metadata label, whatever the values are.
    #[test]
    fn test_version_info_contains_expected_fields() {
        let banner = version_info();

        for label in ["commit:", "built:", "target:", "rustc:"] {
            assert!(
                banner.contains(label),
                "version info is missing the `{label}` field"
            );
        }
    }
}