1use ceres_core::PortalType;
2use clap::{Parser, Subcommand, ValueEnum};
3use std::path::PathBuf;
4use std::sync::LazyLock;
5
6static VERSION_INFO: LazyLock<String> = LazyLock::new(|| {
7 let version = env!("CARGO_PKG_VERSION");
8
9 let commit = option_env!("VERGEN_GIT_SHA")
11 .map(|s| s.chars().take(7).collect::<String>())
12 .unwrap_or_else(|| "unknown".to_string());
13
14 let built = option_env!("VERGEN_BUILD_DATE").unwrap_or("unknown"); let target = option_env!("VERGEN_CARGO_TARGET_TRIPLE").unwrap_or("unknown");
16 let rustc = option_env!("VERGEN_RUSTC_SEMVER").unwrap_or("unknown");
17
18 format!("{version}\ncommit: {commit}\nbuilt: {built}\ntarget: {target}\nrustc: {rustc}")
19});
20
21pub fn version_info() -> &'static str {
22 &VERSION_INFO
23}
24
25#[derive(Parser, Debug)]
27#[command(name = "ceres")]
28#[command(
29 author,
30 version = version_info(),
31 about = "Semantic search engine for open data portals"
32)]
33#[command(after_help = "Examples:
34 ceres harvest https://dati.comune.milano.it
35 ceres search \"air quality monitoring\" --limit 5
36 ceres export --format jsonl > datasets.jsonl
37 ceres stats
38
39Embedding providers:
40 EMBEDDING_PROVIDER=gemini (default) - Google Gemini (768 dimensions)
41 EMBEDDING_PROVIDER=openai - OpenAI (1536 or 3072 dimensions)
42 EMBEDDING_PROVIDER=ollama - Ollama local (768 dimensions, default: nomic-embed-text)")]
43pub struct Config {
44 #[arg(long, env = "DATABASE_URL")]
46 pub database_url: String,
47
48 #[arg(long, env = "EMBEDDING_PROVIDER", default_value = "gemini")]
50 pub embedding_provider: String,
51
52 #[arg(long, env = "GEMINI_API_KEY")]
54 pub gemini_api_key: Option<String>,
55
56 #[arg(long, env = "OPENAI_API_KEY")]
58 pub openai_api_key: Option<String>,
59
60 #[arg(long, env = "EMBEDDING_MODEL")]
62 pub embedding_model: Option<String>,
63
64 #[arg(long, env = "OLLAMA_ENDPOINT")]
66 pub ollama_endpoint: Option<String>,
67
68 #[command(subcommand)]
69 pub command: Command,
70}
71
72#[derive(Subcommand, Debug)]
74pub enum Command {
75 #[command(after_help = "Examples:
77 ceres harvest # Harvest all enabled portals from config
78 ceres harvest https://dati.comune.milano.it # Harvest single CKAN URL (default type)
79 ceres harvest https://data.public.lu --type dcat # Harvest DCAT portal by URL
80 ceres harvest https://data.europa.eu --type dcat --profile sparql # Harvest SPARQL DCAT endpoint
81 ceres harvest --portal milano # Harvest portal by name from config
82 ceres harvest --config ~/custom.toml # Use custom config file
83 ceres harvest --full-sync # Force full sync even if incremental is available")]
84 Harvest {
85 #[arg(value_name = "URL")]
87 portal_url: Option<String>,
88
89 #[arg(
91 long,
92 value_name = "TYPE",
93 default_value = "ckan",
94 requires = "portal_url"
95 )]
96 r#type: PortalType,
97
98 #[arg(long, value_name = "PROFILE", requires = "portal_url")]
100 profile: Option<String>,
101
102 #[arg(short, long, value_name = "NAME", conflicts_with = "portal_url")]
104 portal: Option<String>,
105
106 #[arg(short, long, value_name = "PATH")]
108 config: Option<PathBuf>,
109
110 #[arg(long)]
112 full_sync: bool,
113
114 #[arg(long)]
116 dry_run: bool,
117
118 #[arg(long)]
120 metadata_only: bool,
121 },
122 #[command(after_help = "Examples:
124 ceres embed # Embed all pending datasets
125 ceres embed --portal https://dati.comune.milano.it # Embed pending from one portal")]
126 Embed {
127 #[arg(short, long)]
129 portal: Option<String>,
130 },
131 #[command(after_help = "Example: ceres search \"trasporto pubblico\" --limit 10")]
133 Search {
134 query: String,
136 #[arg(short, long, default_value = "10")]
138 limit: usize,
139 },
140 #[command(after_help = "Examples:
142 ceres export --format jsonl > datasets.jsonl
143 ceres export --format json --portal https://dati.gov.it
144 ceres export --format parquet --output ./ceres-export")]
145 Export {
146 #[arg(short, long, default_value = "jsonl")]
148 format: ExportFormat,
149 #[arg(short, long)]
151 portal: Option<String>,
152 #[arg(short, long)]
154 limit: Option<usize>,
155 #[arg(short, long, value_name = "DIR")]
157 output: Option<std::path::PathBuf>,
158 #[arg(short, long, value_name = "PATH")]
160 config: Option<std::path::PathBuf>,
161 },
162 Stats,
164}
165
166#[derive(Debug, Clone, ValueEnum)]
168pub enum ExportFormat {
169 Jsonl,
171 Json,
173 Csv,
175 Parquet,
177}
178
#[cfg(test)]
mod tests {
    use super::version_info;

    /// The version banner must carry every labelled line, whatever values
    /// the build metadata resolved to.
    #[test]
    fn test_version_info_contains_expected_fields() {
        let banner = version_info();
        for label in ["commit:", "built:", "target:", "rustc:"] {
            assert!(banner.contains(label), "version banner lacks `{label}`");
        }
    }
}