Skip to main content

ceres_search/
config.rs

1use ceres_core::PortalType;
2use clap::{Parser, Subcommand, ValueEnum};
3use std::path::PathBuf;
4use std::sync::LazyLock;
5
6static VERSION_INFO: LazyLock<String> = LazyLock::new(|| {
7    let version = env!("CARGO_PKG_VERSION");
8
9    // Use VERGEN_GIT_SHA for the commit hash (with safe slicing)
10    let commit = option_env!("VERGEN_GIT_SHA")
11        .map(|s| s.chars().take(7).collect::<String>())
12        .unwrap_or_else(|| "unknown".to_string());
13
14    let built = option_env!("VERGEN_BUILD_DATE").unwrap_or("unknown"); // YYYY-MM-DD
15    let target = option_env!("VERGEN_CARGO_TARGET_TRIPLE").unwrap_or("unknown");
16    let rustc = option_env!("VERGEN_RUSTC_SEMVER").unwrap_or("unknown");
17
18    format!("{version}\ncommit: {commit}\nbuilt: {built}\ntarget: {target}\nrustc: {rustc}")
19});
20
21pub fn version_info() -> &'static str {
22    &VERSION_INFO
23}
24
25/// CLI configuration parsed from command line arguments and environment variables
26#[derive(Parser, Debug)]
27#[command(name = "ceres")]
28#[command(
29    author,
30    version = version_info(),
31    about = "Semantic search engine for open data portals"
32)]
33#[command(after_help = "Examples:
34  ceres harvest https://dati.comune.milano.it
35  ceres search \"air quality monitoring\" --limit 5
36  ceres export --format jsonl > datasets.jsonl
37  ceres stats
38
39Embedding providers:
40  EMBEDDING_PROVIDER=gemini (default) - Google Gemini (768 dimensions)
41  EMBEDDING_PROVIDER=openai           - OpenAI (1536 or 3072 dimensions)
42  EMBEDDING_PROVIDER=ollama           - Ollama local (768 dimensions, default: nomic-embed-text)")]
43pub struct Config {
44    /// PostgreSQL database connection URL
45    #[arg(long, env = "DATABASE_URL")]
46    pub database_url: String,
47
48    /// Embedding provider to use: gemini (default), openai, or ollama
49    #[arg(long, env = "EMBEDDING_PROVIDER", default_value = "gemini")]
50    pub embedding_provider: String,
51
52    /// Google Gemini API key (required when embedding_provider=gemini)
53    #[arg(long, env = "GEMINI_API_KEY")]
54    pub gemini_api_key: Option<String>,
55
56    /// OpenAI API key (required when embedding_provider=openai)
57    #[arg(long, env = "OPENAI_API_KEY")]
58    pub openai_api_key: Option<String>,
59
60    /// Embedding model name (provider-specific, uses default if not set)
61    #[arg(long, env = "EMBEDDING_MODEL")]
62    pub embedding_model: Option<String>,
63
64    /// Ollama API endpoint (default: http://localhost:11434)
65    #[arg(long, env = "OLLAMA_ENDPOINT")]
66    pub ollama_endpoint: Option<String>,
67
68    #[command(subcommand)]
69    pub command: Command,
70}
71
72/// Available CLI commands
73#[derive(Subcommand, Debug)]
74pub enum Command {
75    /// Harvest datasets from open data portals
76    #[command(after_help = "Examples:
77  ceres harvest                                       # Harvest all enabled portals from config
78  ceres harvest https://dati.comune.milano.it         # Harvest single CKAN URL (default type)
79  ceres harvest https://data.public.lu --type dcat    # Harvest DCAT portal by URL
80  ceres harvest https://data.europa.eu --type dcat --profile sparql  # Harvest SPARQL DCAT endpoint
81  ceres harvest --portal milano                       # Harvest portal by name from config
82  ceres harvest --config ~/custom.toml                # Use custom config file
83  ceres harvest --full-sync                           # Force full sync even if incremental is available")]
84    Harvest {
85        /// URL of a portal to harvest (ad-hoc, not from config)
86        #[arg(value_name = "URL")]
87        portal_url: Option<String>,
88
89        /// Portal type when harvesting an ad-hoc URL (ignored when using --portal or batch mode)
90        #[arg(
91            long,
92            value_name = "TYPE",
93            default_value = "ckan",
94            requires = "portal_url"
95        )]
96        r#type: PortalType,
97
98        /// DCAT profile when harvesting an ad-hoc DCAT URL (e.g., "sparql")
99        #[arg(long, value_name = "PROFILE", requires = "portal_url")]
100        profile: Option<String>,
101
102        /// Harvest a specific portal by name from config file
103        #[arg(short, long, value_name = "NAME", conflicts_with = "portal_url")]
104        portal: Option<String>,
105
106        /// Custom path to portals.toml configuration file
107        #[arg(short, long, value_name = "PATH")]
108        config: Option<PathBuf>,
109
110        /// Force full sync even if incremental sync is available
111        #[arg(long)]
112        full_sync: bool,
113
114        /// Preview what would be harvested without writing to DB or calling embedding API
115        #[arg(long)]
116        dry_run: bool,
117
118        /// Only harvest metadata (no embedding). Does not require an API key.
119        #[arg(long)]
120        metadata_only: bool,
121    },
122    /// Generate embeddings for datasets that don't have them yet
123    #[command(after_help = "Examples:
124  ceres embed                                     # Embed all pending datasets
125  ceres embed --portal https://dati.comune.milano.it  # Embed pending from one portal")]
126    Embed {
127        /// Filter to a specific portal URL
128        #[arg(short, long)]
129        portal: Option<String>,
130    },
131    /// Search indexed datasets using semantic similarity
132    #[command(after_help = "Example: ceres search \"trasporto pubblico\" --limit 10")]
133    Search {
134        /// Search query text
135        query: String,
136        /// Maximum number of results to return
137        #[arg(short, long, default_value = "10")]
138        limit: usize,
139    },
140    /// Export indexed datasets to various formats
141    #[command(after_help = "Examples:
142  ceres export --format jsonl > datasets.jsonl
143  ceres export --format json --portal https://dati.gov.it
144  ceres export --format parquet --output ./ceres-export")]
145    Export {
146        /// Output format for exported data
147        #[arg(short, long, default_value = "jsonl")]
148        format: ExportFormat,
149        /// Filter by source portal URL
150        #[arg(short, long)]
151        portal: Option<String>,
152        /// Maximum number of datasets to export
153        #[arg(short, long)]
154        limit: Option<usize>,
155        /// Output directory (required for parquet format)
156        #[arg(short, long, value_name = "DIR")]
157        output: Option<std::path::PathBuf>,
158        /// Custom path to portals.toml (for portal name resolution in parquet export)
159        #[arg(short, long, value_name = "PATH")]
160        config: Option<std::path::PathBuf>,
161    },
162    /// Show database statistics
163    Stats,
164}
165
166/// Supported export formats
167#[derive(Debug, Clone, ValueEnum)]
168pub enum ExportFormat {
169    /// JSON Lines format (one JSON object per line)
170    Jsonl,
171    /// Standard JSON array format
172    Json,
173    /// CSV format (comma-separated values)
174    Csv,
175    /// Parquet format (curated, flattened for HuggingFace)
176    Parquet,
177}
178
#[cfg(test)]
mod tests {
    use super::version_info;

    /// The version banner must contain every labelled build-metadata field.
    #[test]
    fn test_version_info_contains_expected_fields() {
        let banner = version_info();

        // One label per metadata line emitted after the version number.
        for label in ["commit:", "built:", "target:", "rustc:"] {
            assert!(banner.contains(label));
        }
    }
}