Skip to main content

anno/
env.rs

1//! Environment variable utilities.
2//!
3//! Provides centralized handling of .env files and environment configuration.
4//!
5//! # Usage
6//!
7//! ```rust,ignore
8//! use anno::env::load_dotenv;
9//!
10//! // Load .env file if present (won't override existing env vars)
11//! load_dotenv();
12//!
13//! // Now HF_TOKEN etc. are available from std::env::var
14//! if let Ok(token) = std::env::var("HF_TOKEN") {
15//!     println!("HuggingFace token available");
16//! }
17//! ```
18//!
19//! # Environment Variables
20//!
21//! | Variable | Purpose |
22//! |----------|---------|
23//! | `HF_TOKEN` | HuggingFace API token for gated models |
24//! | `HF_API_TOKEN` | Alias for `HF_TOKEN` (HuggingFace API token) |
25//! | `OPENAI_API_KEY` | OpenAI API key for LLM backends |
26//! | `ANTHROPIC_API_KEY` | Anthropic API key for Claude LLM backends |
27//! | `OPENROUTER_API_KEY` | OpenRouter API key for LLM backends |
28//! | `GEMINI_API_KEY` | Google Gemini API key |
29//! | `ANNO_CACHE_DIR` | Custom cache directory for models/datasets |
30//! | `ANNO_CI_SEED` | Fixed seed for reproducible CI testing |
31//! | `ANNO_SAMPLE_STRATEGY` | Backend sampling strategy (random, ml-only, worst-first) |
32//! | `ANNO_MATRIX_TASK` | Optional task override for matrix sampler harness (e.g. discontinuous-ner, events, ned) |
33//! | `ANNO_MATRIX_REQUIRE_CACHED` | If true, matrix sampler harness/sweeps only use cached datasets (local default false; CI forced true) |
34//! | `ANNO_MATRIX_INCLUDE_NON_AUTOMATABLE` | If true, include non-automatable datasets in matrix candidates (default false) |
35//! | `ANNO_MATRIX_INCLUDE_SLOW_DATASETS` | If true, allow known-slow datasets even under `ANNO_MUXER_PROFILE=fast*` (default false) |
36//! | `ANNO_MATRIX_COVERAGE_REPORT` | If set, write a JSON coverage report to this path during tests |
37//! | `ANNO_MATRIX_DISTRIBUTION_REPORT` | If set, write a JSON selection-distribution report to this path during tests |
38//! | `ANNO_MATRIX_DISTRIBUTION_ITERS` | Number of simulated selections to run for distribution report (default 200) |
39//! | `ANNO_ML_IN_MATRIX` | Include ML-ish backends in CI matrix (1/true to enable) |
40//! | `ANNO_HISTORY_FILE` | Override muxer history JSON path for matrix sampler harness |
41//! | `ANNO_MUXER_WINDOW_CAP` | Muxer history window size (per arm) |
42//! | `ANNO_MUXER_PER_DATASET` | Use dataset-scoped muxer history + selection (1/true recommended) |
43//! | `ANNO_MUXER_DATASETS_PER_RUN` | Matrix sampler harness: datasets per run (default 2) |
44//! | `ANNO_MUXER_EXPLORATION_C` | Muxer UCB exploration coefficient |
45//! | `ANNO_MUXER_JUNK_WEIGHT` | Muxer soft-junk penalty weight |
46//! | `ANNO_MUXER_HARD_JUNK_WEIGHT` | Muxer hard-junk penalty weight |
47//! | `ANNO_MUXER_COST_WEIGHT` | Muxer mean-cost penalty weight |
48//! | `ANNO_MUXER_LATENCY_WEIGHT` | Muxer mean-latency penalty weight |
49//! | `ANNO_MUXER_MAX_MEAN_ELAPSED_MS` | Optional constraint for ml-only selection: exclude arms above this mean latency (ms) |
50//! | `ANNO_MUXER_LATENCY_GUARDRAIL_ALLOW_FEWER` | If true (default), ml-only may return fewer than K arms instead of falling back to slow ones |
51//! | `ANNO_MUXER_LATENCY_GUARDRAIL_REQUIRE_MEASUREMENT` | If true, untried arms (calls=0) are excluded under the latency guardrail |
52//! | `ANNO_MUXER_PROFILE` | Presets for latency guardrail (`off`, `fast`, `fast-strict`, `regress`) |
53//! | `ANNO_MUXER_JUNK_F1_NER` | Junk cutoff for NER F1 (0..1) (default 0.05) |
54//! | `ANNO_MUXER_JUNK_F1_COREF` | Junk cutoff for coref CoNLL F1 (0..1) |
55//! | `ANNO_MUXER_JUNK_F1_RELATION` | Junk cutoff for relation strict F1 (0..1) |
56//! | `ANNO_MUXER_VERBOSE` | Print chosen slice + per-result outcomes in matrix sampler harness |
57//! | `ANNO_MUXER_HISTORY_SALT` | Optional suffix to isolate muxer history files (useful when semantics change) |
58//! | `ANNO_MUXER_DECISIONS_FILE` | Optional path to write selection decisions as JSONL |
59//! | `ANNO_MUXER_DECISIONS_TOP` | Max candidate rows included per decision (JSONL; default 8) |
60//! | `ANNO_MUXER_MLONLY_POLICY` | `ml-only`: choose `exp3ix` (default) or `mab` |
61//! | `ANNO_MUXER_EXP3_HORIZON` | EXP3-IX: horizon parameter (default 1000) |
62//! | `ANNO_MUXER_EXP3_DECAY` | EXP3-IX: exponential decay per update (0.01..=1.0; default 1.0) |
63//! | `ANNO_WORST_EXPLORATION_C` | Worst-first exploration coefficient (default 0.8) |
64//! | `ANNO_WORST_HARD_WEIGHT` | Worst-first weight for hard failures (default 1.0) |
65//! | `ANNO_WORST_SOFT_WEIGHT` | Worst-first weight for soft junk (default 0.0) |
66//! | `ANNO_RELATION_TPLINKER_ORACLE_ENTITIES` | If true (default), TPLinker relation eval uses gold entity spans as candidates (keeps placeholder baseline non-degenerate) |
67//!
68//! # Muxer presets (recommended)
69//!
70//! Prefer `ANNO_MUXER_PROFILE` over individual latency-guardrail env vars:
71//!
72//! - `ANNO_MUXER_PROFILE=fast`: cap mean latency around 2s in ml-only selection
73//! - `ANNO_MUXER_PROFILE=fast-strict`: like `fast`, but excludes untried arms under the cap
74//! - `ANNO_MUXER_PROFILE=regress`: disable latency guardrail (useful for worst-first regression hunting)
75//!
76//! You can still override the preset explicitly with:
77//! - `ANNO_MUXER_MAX_MEAN_ELAPSED_MS`
78//! - `ANNO_MUXER_LATENCY_GUARDRAIL_ALLOW_FEWER`
79//! - `ANNO_MUXER_LATENCY_GUARDRAIL_REQUIRE_MEASUREMENT`
80
81use once_cell::sync::OnceCell;
82use std::path::Path;
83
84/// Global flag to track if dotenv was loaded
85static DOTENV_LOADED: OnceCell<bool> = OnceCell::new();
86
87/// Load environment variables from .env file if present.
88///
89/// This function:
90/// - Searches for .env in current directory and up to 2 parent directories
91/// - Only sets variables that aren't already set (env vars take precedence)
92/// - Is idempotent - safe to call multiple times
93/// - Returns silently if no .env file is found
94///
95/// # Example
96///
97/// ```rust,ignore
98/// anno::env::load_dotenv();
99/// ```
100pub fn load_dotenv() {
101    // Only load once
102    DOTENV_LOADED.get_or_init(|| {
103        load_dotenv_impl();
104        true
105    });
106}
107
108/// Force reload of .env file (useful for testing)
109#[doc(hidden)]
110pub fn reload_dotenv() {
111    load_dotenv_impl();
112}
113
114fn load_dotenv_impl() {
115    // Try to find .env in current directory or parents
116    let env_paths = [".env", "../.env", "../../.env", "../../../.env"];
117
118    for path_str in env_paths {
119        let path = Path::new(path_str);
120        if let Ok(contents) = std::fs::read_to_string(path) {
121            parse_dotenv(&contents);
122            return;
123        }
124    }
125
126    // Also check workspace root via Cargo manifest
127    if let Ok(manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") {
128        let workspace_env = Path::new(&manifest_dir).join("../.env");
129        if let Ok(contents) = std::fs::read_to_string(&workspace_env) {
130            parse_dotenv(&contents);
131        }
132    }
133
134    // Token alias normalization (non-overriding):
135    //
136    // Some tools (transformers/huggingface_hub) look for `HF_TOKEN` or
137    // `HUGGINGFACE_HUB_TOKEN`. If the user provided `HF_API_TOKEN` in `.env`,
138    // mirror it into those conventional vars (but never override).
139    if std::env::var("HF_TOKEN").is_err() {
140        if let Ok(v) = std::env::var("HF_API_TOKEN") {
141            std::env::set_var("HF_TOKEN", v.clone());
142            if std::env::var("HUGGINGFACE_HUB_TOKEN").is_err() {
143                std::env::set_var("HUGGINGFACE_HUB_TOKEN", v);
144            }
145        }
146    }
147}
148
/// Apply `KEY=VALUE` lines from `.env`-style text to the process environment.
///
/// Per line:
/// - surrounding whitespace is trimmed; empty lines and `#` comments are skipped
/// - a leading `export ` is accepted and ignored
/// - the line is split at the first `=`; lines without `=` are skipped
/// - one matching pair of surrounding `"` or `'` is removed from the value
/// - the variable is set only if it is not already present in the environment
///
/// Lines that cannot be represented as an environment variable (empty key, or
/// a NUL byte in the key or value) are skipped rather than passed to
/// `std::env::set_var`, which panics on such input.
fn parse_dotenv(contents: &str) {
    for raw_line in contents.lines() {
        let line = raw_line.trim();

        // Skip empty lines and comments
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        // Common .env style: `export KEY=value`
        let line = line.strip_prefix("export ").map_or(line, str::trim);

        let (key, value) = match line.split_once('=') {
            Some((key, value)) => (key.trim(), value.trim()),
            None => continue,
        };

        // `set_var` panics on these; a malformed line must not abort the process.
        if key.is_empty() || key.contains('\0') || value.contains('\0') {
            continue;
        }

        // Remove one pair of matching surrounding quotes, if present
        let value = value
            .strip_prefix('"')
            .and_then(|v| v.strip_suffix('"'))
            .or_else(|| value.strip_prefix('\'').and_then(|v| v.strip_suffix('\'')))
            .unwrap_or(value);

        // Existing variables take precedence. `var_os` is used so that a value
        // that is present but not valid Unicode still counts as "already set".
        if std::env::var_os(key).is_none() {
            std::env::set_var(key, value);
        }
    }
}
182
/// Check if a usable HuggingFace token is available.
///
/// Looks at `HF_TOKEN` and its alias `HF_API_TOKEN`. A variable that is set
/// but empty or whitespace-only (e.g. a `HF_TOKEN=` placeholder line in
/// `.env`) does not count, matching how the LLM API key helpers treat blank
/// values.
#[must_use]
pub fn has_hf_token() -> bool {
    // Support common aliases so `.env` can use a more explicit name.
    ["HF_TOKEN", "HF_API_TOKEN"].iter().any(|name| {
        std::env::var(*name)
            .ok()
            .is_some_and(|v| !v.trim().is_empty())
    })
}
189
/// Get the HuggingFace token if a usable one is available.
///
/// `HF_TOKEN` is preferred; `HF_API_TOKEN` is consulted as an alias. A
/// variable that is set but empty or whitespace-only is treated as absent, so
/// a blank `HF_TOKEN` does not shadow a valid `HF_API_TOKEN`. The value is
/// returned exactly as stored (not trimmed).
#[must_use]
pub fn hf_token() -> Option<String> {
    ["HF_TOKEN", "HF_API_TOKEN"].iter().find_map(|name| {
        std::env::var(*name)
            .ok()
            .filter(|v| !v.trim().is_empty())
    })
}
197
/// Report whether at least one supported LLM provider key is configured.
///
/// A key counts only if its variable is set to something other than
/// whitespace. Providers checked: OpenAI, Anthropic, OpenRouter, Gemini.
#[must_use]
pub fn has_llm_api_key() -> bool {
    const PROVIDER_VARS: [&str; 4] = [
        "OPENAI_API_KEY",
        "ANTHROPIC_API_KEY",
        "OPENROUTER_API_KEY",
        "GEMINI_API_KEY",
    ];

    PROVIDER_VARS
        .iter()
        .any(|var_name| match std::env::var(*var_name) {
            Ok(value) => !value.trim().is_empty(),
            Err(_) => false,
        })
}
212
/// Pick the first configured LLM API key, in provider priority order.
///
/// Priority: OpenAI, then Anthropic, then OpenRouter, then Gemini. Variables
/// that are unset or whitespace-only are passed over.
///
/// Returns `(key, provider)` where `provider` is one of `"openai"`,
/// `"anthropic"`, `"openrouter"`, `"gemini"`, or `None` if no key is set.
#[must_use]
pub fn llm_api_key() -> Option<(String, &'static str)> {
    // (environment variable, provider label), highest priority first.
    const PROVIDERS: [(&str, &str); 4] = [
        ("OPENAI_API_KEY", "openai"),
        ("ANTHROPIC_API_KEY", "anthropic"),
        ("OPENROUTER_API_KEY", "openrouter"),
        ("GEMINI_API_KEY", "gemini"),
    ];

    PROVIDERS
        .iter()
        .find_map(|&(var_name, provider)| match std::env::var(var_name) {
            Ok(key) if !key.trim().is_empty() => Some((key, provider)),
            _ => None,
        })
}
235
/// Get the cache directory for models and datasets.
///
/// Resolution order:
/// 1. `ANNO_CACHE_DIR`, when set to a non-empty value (used verbatim; the
///    value does not have to be valid Unicode)
/// 2. With the `analysis` or `eval` feature: a per-user cache location on
///    macOS, Linux and Windows, falling back to `.anno-cache` when the
///    platform directory cannot be determined
/// 3. Otherwise: the relative path `.anno-cache`
///
/// The returned path is never empty.
#[must_use]
pub fn cache_dir() -> std::path::PathBuf {
    // An empty override (e.g. `ANNO_CACHE_DIR=` in `.env`) is treated as unset;
    // returning an empty path would make callers write into surprising places.
    // `var_os` keeps non-UTF-8 paths usable instead of silently ignoring them.
    if let Some(dir) = std::env::var_os("ANNO_CACHE_DIR").filter(|dir| !dir.is_empty()) {
        return std::path::PathBuf::from(dir);
    }

    // When analysis/eval feature is enabled, use platform-specific cache directories
    // (this gate keeps minimal builds local by default)
    #[cfg(any(feature = "analysis", feature = "eval"))]
    {
        #[cfg(target_os = "macos")]
        {
            dirs::home_dir()
                .map(|h| h.join("Library/Caches/anno"))
                .unwrap_or_else(|| std::path::PathBuf::from(".anno-cache"))
        }

        #[cfg(any(target_os = "linux", target_os = "windows"))]
        {
            dirs::cache_dir()
                .map(|c| c.join("anno"))
                .unwrap_or_else(|| std::path::PathBuf::from(".anno-cache"))
        }

        #[cfg(not(any(target_os = "macos", target_os = "linux", target_os = "windows")))]
        {
            std::path::PathBuf::from(".anno-cache")
        }
    }

    // Fallback when analysis/eval feature is not enabled
    #[cfg(not(any(feature = "analysis", feature = "eval")))]
    {
        std::path::PathBuf::from(".anno-cache")
    }
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Comments are skipped; bare, double-quoted, single-quoted and
    /// whitespace-padded assignments are all applied.
    #[test]
    fn test_parse_dotenv() {
        // PID-scoped keys avoid collisions with the surrounding environment.
        let pid = std::process::id();
        let plain = format!("ANNO_TEST_PARSE_{}_PLAIN", pid);
        let double = format!("ANNO_TEST_PARSE_{}_DOUBLE", pid);
        let single = format!("ANNO_TEST_PARSE_{}_SINGLE", pid);
        let spaced = format!("ANNO_TEST_PARSE_{}_SPACED", pid);
        let keys = [&plain, &double, &single, &spaced];

        // Ensure clean slate (the parser never overrides existing values)
        for key in keys {
            std::env::remove_var(key);
        }

        let contents = format!(
            r#"
# Comment
{plain}=value1
{double}="quoted value"
{single}='single quoted'
  {spaced} = spaced_value
"#
        );
        parse_dotenv(&contents);

        assert_eq!(std::env::var(&plain).as_deref(), Ok("value1"));
        assert_eq!(std::env::var(&double).as_deref(), Ok("quoted value"));
        assert_eq!(std::env::var(&single).as_deref(), Ok("single quoted"));
        assert_eq!(std::env::var(&spaced).as_deref(), Ok("spaced_value"));

        // Clean up
        for key in keys {
            std::env::remove_var(key);
        }
    }

    /// `export KEY=value` lines are accepted and the values are actually set.
    #[test]
    fn test_parse_dotenv_supports_export_prefix_and_sets_values() {
        let pid = std::process::id();
        let k1 = format!("ANNO_TEST_EXPORT_{}_K1", pid);
        let k2 = format!("ANNO_TEST_EXPORT_{}_K2", pid);

        // Ensure clean slate
        std::env::remove_var(&k1);
        std::env::remove_var(&k2);

        let contents = format!(
            r#"
export {k1}=value1
{k2}="quoted value"
"#
        );
        parse_dotenv(&contents);

        assert_eq!(std::env::var(&k1).as_deref(), Ok("value1"));
        assert_eq!(std::env::var(&k2).as_deref(), Ok("quoted value"));

        // Clean up
        std::env::remove_var(&k1);
        std::env::remove_var(&k2);
    }

    /// A variable already present in the environment wins over the `.env` value.
    #[test]
    fn test_parse_dotenv_does_not_override_existing_env() {
        let pid = std::process::id();
        let key = format!("ANNO_TEST_NO_OVERRIDE_{}", pid);

        std::env::set_var(&key, "from_env");

        let contents = format!(r#"{key}=from_dotenv"#);
        parse_dotenv(&contents);

        assert_eq!(std::env::var(&key).as_deref(), Ok("from_env"));

        std::env::remove_var(&key);
    }

    /// Repeated calls must not panic.
    #[test]
    fn test_load_dotenv_idempotent() {
        load_dotenv();
        load_dotenv();
        load_dotenv();
    }

    /// The resolved cache directory is never the empty path.
    #[test]
    fn test_cache_dir() {
        let dir = cache_dir();
        assert!(!dir.as_os_str().is_empty());
    }
}