known_values/directory_loader.rs
1//! Directory-based loading of known values from JSON registry files.
2//!
3//! This module provides functionality to load known values from JSON files
4//! stored in configurable directories. It is only available when the
5//! `directory-loading` feature is enabled (which is the default).
6//!
7//! # Overview
8//!
9//! The module supports loading known values from:
10//! - A default directory: `~/.known-values/`
11//! - Custom directories specified at runtime
12//!
13//! Values loaded from JSON files can override hardcoded values when they
14//! share the same codepoint (numeric identifier).
15//!
16//! # JSON File Format
17//!
18//! Registry files should follow the BlockchainCommons format:
19//!
20//! ```json
21//! {
22//! "ontology": {
23//! "name": "my_registry",
24//! "source_url": "https://example.com/registry"
25//! },
26//! "entries": [
27//! {
28//! "codepoint": 1000,
29//! "name": "myValue",
30//! "type": "property",
31//! "uri": "https://example.com/vocab#myValue",
32//! "description": "A custom known value"
33//! }
34//! ]
35//! }
36//! ```
37//!
38//! Only the `entries` array with `codepoint` and `name` fields
39//! is required; other fields are optional.
40
41use std::{
42 collections::HashMap,
43 fmt, fs, io,
44 path::{Path, PathBuf},
45 sync::{
46 Mutex,
47 atomic::{AtomicBool, Ordering},
48 },
49};
50
51use serde::Deserialize;
52
53use crate::KnownValue;
54
/// A single entry in a known values JSON registry file.
///
/// Only `codepoint` and `name` are mandatory in the JSON object; the
/// remaining fields are `Option`s and may be left out.
#[derive(Debug, Deserialize)]
pub struct RegistryEntry {
    /// The unique numeric identifier for this known value.
    pub codepoint: u64,
    /// The canonical string name for this known value.
    pub name: String,
    /// The type of entry (e.g., "property", "class", "value").
    ///
    /// Read from the JSON key `type`; renamed here because `type` is a
    /// reserved word in Rust.
    #[serde(rename = "type")]
    pub entry_type: Option<String>,
    /// An optional URI reference for this known value.
    pub uri: Option<String>,
    /// An optional human-readable description.
    pub description: Option<String>,
}
70
/// Metadata about the ontology or registry source.
///
/// Every field is optional, so an empty JSON object deserializes
/// successfully.
#[derive(Debug, Deserialize)]
pub struct OntologyInfo {
    /// The name of this registry or ontology.
    pub name: Option<String>,
    /// The source URL for this registry.
    pub source_url: Option<String>,
    /// The starting codepoint for entries in this registry.
    pub start_code_point: Option<u64>,
    /// The processing strategy used to generate this registry.
    pub processing_strategy: Option<String>,
}
83
/// Root structure of a known values JSON registry file.
///
/// `entries` is the only required key; the other sections may be absent.
#[derive(Debug, Deserialize)]
pub struct RegistryFile {
    /// Metadata about this registry.
    pub ontology: Option<OntologyInfo>,
    /// Information about how this file was generated.
    pub generated: Option<GeneratedInfo>,
    /// The known value entries in this registry.
    pub entries: Vec<RegistryEntry>,
    /// Statistics about this registry.
    ///
    /// Kept as an untyped JSON value: it is deserialized but the loaders in
    /// this module never read it.
    #[serde(default)]
    pub statistics: Option<serde_json::Value>,
}
97
/// Information about how a registry file was generated.
///
/// Corresponds to the optional `generated` section of a [`RegistryFile`].
#[derive(Debug, Deserialize)]
pub struct GeneratedInfo {
    /// The tool used to generate this registry.
    pub tool: Option<String>,
}
104
/// Errors that can occur when loading known values from directories.
///
/// I/O failures carry only the underlying [`io::Error`]; JSON failures also
/// record which file could not be parsed.
#[derive(Debug)]
pub enum LoadError {
    /// An I/O error occurred while reading files.
    Io(io::Error),
    /// A JSON parsing error occurred.
    Json {
        /// The file that caused the error.
        file: PathBuf,
        /// The underlying JSON error.
        error: serde_json::Error,
    },
}
118
119impl fmt::Display for LoadError {
120 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121 match self {
122 LoadError::Io(e) => write!(f, "IO error: {}", e),
123 LoadError::Json { file, error } => {
124 write!(f, "JSON parse error in {}: {}", file.display(), error)
125 }
126 }
127 }
128}
129
130impl std::error::Error for LoadError {
131 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
132 match self {
133 LoadError::Io(e) => Some(e),
134 LoadError::Json { error, .. } => Some(error),
135 }
136 }
137}
138
139impl From<io::Error> for LoadError {
140 fn from(error: io::Error) -> Self { LoadError::Io(error) }
141}
142
/// Result of a directory loading operation.
///
/// Starts out empty (via `Default`) and is filled in by `load_from_config`.
#[derive(Debug, Default)]
pub struct LoadResult {
    /// Known values loaded, keyed by codepoint.
    pub values: HashMap<u64, KnownValue>,
    /// Files that were successfully processed.
    pub files_processed: Vec<PathBuf>,
    /// Non-fatal errors encountered during loading.
    ///
    /// Each error is paired with the path it relates to.
    pub errors: Vec<(PathBuf, LoadError)>,
}
153
154impl LoadResult {
155 /// Returns the number of unique values loaded.
156 pub fn values_count(&self) -> usize { self.values.len() }
157
158 /// Returns an iterator over the loaded known values.
159 pub fn values_iter(&self) -> impl Iterator<Item = &KnownValue> {
160 self.values.values()
161 }
162
163 /// Consumes the result and returns the loaded known values.
164 pub fn into_values(self) -> impl Iterator<Item = KnownValue> {
165 self.values.into_values()
166 }
167
168 /// Returns true if any errors occurred during loading.
169 pub fn has_errors(&self) -> bool { !self.errors.is_empty() }
170}
171
/// Result type for tolerant directory loading: successfully loaded values and
/// per-file errors.
///
/// The first element holds the known values that were loaded; the second
/// pairs each failing path with the error it produced.
type TolerantLoadResult = (Vec<KnownValue>, Vec<(PathBuf, LoadError)>);
175
/// Configuration for loading known values from directories.
///
/// This struct specifies which directories to search for JSON registry files.
/// Directories are processed in order, with values from later directories
/// overriding values from earlier directories when codepoints collide.
///
/// Note that the derived `Default` produces an *empty* configuration (no
/// search paths), exactly like [`DirectoryConfig::new`]. Use
/// [`DirectoryConfig::default_only`] to get the `~/.known-values/` directory.
///
/// # Examples
///
/// ```rust,ignore
/// use known_values::DirectoryConfig;
///
/// // Use only the default directory (~/.known-values/)
/// let config = DirectoryConfig::default_only();
///
/// // Use custom paths
/// let config = DirectoryConfig::with_paths(vec![
///     "/etc/known-values".into(),
///     "/usr/share/known-values".into(),
/// ]);
///
/// // Use custom paths with default appended
/// let config = DirectoryConfig::with_paths_and_default(vec![
///     "/etc/known-values".into(),
/// ]);
/// ```
#[derive(Debug, Clone, Default)]
pub struct DirectoryConfig {
    /// Search paths in priority order (later paths override earlier).
    paths: Vec<PathBuf>,
}
206
207impl DirectoryConfig {
208 /// Creates a new empty configuration with no search paths.
209 pub fn new() -> Self { Self { paths: Vec::new() } }
210
211 /// Creates configuration with only the default directory
212 /// (`~/.known-values/`).
213 pub fn default_only() -> Self {
214 Self { paths: vec![Self::default_directory()] }
215 }
216
217 /// Creates configuration with custom paths (processed in order).
218 ///
219 /// Later paths in the list take precedence over earlier paths when
220 /// values have the same codepoint.
221 pub fn with_paths(paths: Vec<PathBuf>) -> Self { Self { paths } }
222
223 /// Creates configuration with custom paths followed by the default
224 /// directory.
225 ///
226 /// The default directory (`~/.known-values/`) is appended to the list,
227 /// so its values will override values from the custom paths.
228 pub fn with_paths_and_default(mut paths: Vec<PathBuf>) -> Self {
229 paths.push(Self::default_directory());
230 Self { paths }
231 }
232
233 /// Returns the default directory: `~/.known-values/`
234 ///
235 /// Falls back to `./.known-values/` if the home directory cannot be
236 /// determined.
237 pub fn default_directory() -> PathBuf {
238 dirs::home_dir()
239 .unwrap_or_else(|| PathBuf::from("."))
240 .join(".known-values")
241 }
242
243 /// Returns the configured search paths.
244 pub fn paths(&self) -> &[PathBuf] { &self.paths }
245
246 /// Adds a path to the configuration.
247 ///
248 /// The new path will be processed after existing paths, so its values
249 /// will override values from earlier paths.
250 pub fn add_path(&mut self, path: PathBuf) { self.paths.push(path); }
251}
252
253/// Loads all JSON registry files from a single directory.
254///
255/// This function scans the specified directory for files with a `.json`
256/// extension and attempts to parse them as known value registries.
257///
258/// # Arguments
259///
260/// * `path` - The directory to scan for JSON registry files.
261///
262/// # Returns
263///
264/// Returns `Ok` with a vector of loaded `KnownValue` instances, or an empty
265/// vector if the directory doesn't exist. Returns `Err` only for I/O errors
266/// that prevent directory traversal.
267///
268/// # Examples
269///
270/// ```rust,ignore
271/// use known_values::load_from_directory;
272/// use std::path::Path;
273///
274/// let values = load_from_directory(Path::new("/etc/known-values"))?;
275/// for value in values {
276/// println!("{}: {}", value.value(), value.name());
277/// }
278/// ```
279pub fn load_from_directory(path: &Path) -> Result<Vec<KnownValue>, LoadError> {
280 let mut values = Vec::new();
281
282 // Return empty if directory doesn't exist or isn't a directory
283 if !path.exists() || !path.is_dir() {
284 return Ok(values);
285 }
286
287 for entry in fs::read_dir(path)? {
288 let entry = entry?;
289 let file_path = entry.path();
290
291 // Only process .json files
292 if file_path.extension().is_some_and(|ext| ext == "json") {
293 let content = fs::read_to_string(&file_path)?;
294 let registry: RegistryFile =
295 serde_json::from_str(&content).map_err(|e| {
296 LoadError::Json { file: file_path.clone(), error: e }
297 })?;
298
299 for entry in registry.entries {
300 values.push(KnownValue::new_with_name(
301 entry.codepoint,
302 entry.name,
303 ));
304 }
305 }
306 }
307
308 Ok(values)
309}
310
311/// Loads known values from all directories in the given configuration.
312///
313/// Directories are processed in order. When multiple entries have the same
314/// codepoint, values from later directories override values from earlier
315/// directories.
316///
317/// This function is fault-tolerant: it will continue processing even if
318/// some files fail to parse. Errors are collected in the returned
319/// `LoadResult`.
320///
321/// # Arguments
322///
323/// * `config` - The directory configuration specifying search paths.
324///
325/// # Returns
326///
327/// A `LoadResult` containing the loaded values, processed files, and any
328/// errors encountered.
329///
330/// # Examples
331///
332/// ```rust,ignore
333/// use known_values::{DirectoryConfig, load_from_config};
334///
335/// let config = DirectoryConfig::default_only();
336/// let result = load_from_config(&config);
337///
338/// println!("Loaded {} values from {} files",
339/// result.values_count(),
340/// result.files_processed.len());
341///
342/// if result.has_errors() {
343/// for (path, error) in &result.errors {
344/// eprintln!("Error loading {}: {}", path.display(), error);
345/// }
346/// }
347/// ```
348pub fn load_from_config(config: &DirectoryConfig) -> LoadResult {
349 let mut result = LoadResult::default();
350
351 for dir_path in config.paths() {
352 match load_from_directory_tolerant(dir_path) {
353 Ok((values, errors)) => {
354 for value in values {
355 result.values.insert(value.value(), value);
356 }
357 if !errors.is_empty() {
358 result.errors.extend(errors);
359 }
360 result.files_processed.push(dir_path.clone());
361 }
362 Err(e) => {
363 result.errors.push((dir_path.clone(), e));
364 }
365 }
366 }
367
368 result
369}
370
371/// Loads from a directory with tolerance for individual file failures.
372fn load_from_directory_tolerant(
373 path: &Path,
374) -> Result<TolerantLoadResult, LoadError> {
375 let mut values = Vec::new();
376 let mut errors = Vec::new();
377
378 if !path.exists() || !path.is_dir() {
379 return Ok((values, errors));
380 }
381
382 for entry in fs::read_dir(path)? {
383 let entry = entry?;
384 let file_path = entry.path();
385
386 if file_path.extension().is_some_and(|ext| ext == "json") {
387 match load_single_file(&file_path) {
388 Ok(file_values) => values.extend(file_values),
389 Err(e) => errors.push((file_path, e)),
390 }
391 }
392 }
393
394 Ok((values, errors))
395}
396
397/// Loads known values from a single JSON file.
398fn load_single_file(path: &Path) -> Result<Vec<KnownValue>, LoadError> {
399 let content = fs::read_to_string(path)?;
400 let registry: RegistryFile = serde_json::from_str(&content)
401 .map_err(|e| LoadError::Json { file: path.to_path_buf(), error: e })?;
402
403 Ok(registry
404 .entries
405 .into_iter()
406 .map(|entry| KnownValue::new_with_name(entry.codepoint, entry.name))
407 .collect())
408}
409
// Global configuration state.
//
// `CUSTOM_CONFIG` holds the directory configuration supplied by the caller,
// or `None` when none has been set. `CONFIG_LOCKED` becomes `true` once
// `get_and_lock_config` has run; after that the setters in this module return
// `ConfigError::AlreadyInitialized`.
static CUSTOM_CONFIG: Mutex<Option<DirectoryConfig>> = Mutex::new(None);
static CONFIG_LOCKED: AtomicBool = AtomicBool::new(false);
413
/// Error returned when the directory configuration can no longer be changed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ConfigError {
    /// Configuration was attempted after the global registry was initialized.
    AlreadyInitialized,
}

/// User-facing message for each [`ConfigError`] variant.
impl fmt::Display for ConfigError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Exhaustive match: adding a variant forces a message to be chosen.
        let message = match self {
            Self::AlreadyInitialized => {
                "Cannot modify directory configuration after KNOWN_VALUES has been accessed"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for ConfigError {}
435
436/// Sets custom directory configuration for known values loading.
437///
438/// This function must be called **before** the first access to `KNOWN_VALUES`.
439/// Once `KNOWN_VALUES` is accessed, the configuration is locked and cannot
440/// be changed.
441///
442/// # Arguments
443///
444/// * `config` - The directory configuration to use.
445///
446/// # Returns
447///
448/// Returns `Ok(())` if the configuration was set successfully, or
449/// `Err(ConfigError::AlreadyInitialized)` if `KNOWN_VALUES` has already
450/// been accessed.
451///
452/// # Examples
453///
454/// ```rust,ignore
455/// use known_values::{set_directory_config, DirectoryConfig, KNOWN_VALUES};
456///
457/// // Set configuration before accessing KNOWN_VALUES
458/// set_directory_config(DirectoryConfig::with_paths(vec![
459/// "/custom/path".into(),
460/// ])).expect("Configuration should succeed");
461///
462/// // Now access KNOWN_VALUES - it will use the custom configuration
463/// let binding = KNOWN_VALUES.get();
464/// ```
465pub fn set_directory_config(
466 config: DirectoryConfig,
467) -> Result<(), ConfigError> {
468 if CONFIG_LOCKED.load(Ordering::SeqCst) {
469 return Err(ConfigError::AlreadyInitialized);
470 }
471 *CUSTOM_CONFIG.lock().unwrap() = Some(config);
472 Ok(())
473}
474
475/// Adds additional search paths to the directory configuration.
476///
477/// This function must be called **before** the first access to `KNOWN_VALUES`.
478/// Paths are added after any existing paths, so they will take precedence.
479///
480/// If no configuration has been set, this creates a new configuration with
481/// the default directory and appends the new paths.
482///
483/// # Arguments
484///
485/// * `paths` - The paths to add to the configuration.
486///
487/// # Returns
488///
489/// Returns `Ok(())` if the paths were added successfully, or
490/// `Err(ConfigError::AlreadyInitialized)` if `KNOWN_VALUES` has already
491/// been accessed.
492///
493/// # Examples
494///
495/// ```rust,ignore
496/// use known_values::add_search_paths;
497///
498/// // Add custom paths in addition to the default
499/// add_search_paths(vec![
500/// "/etc/known-values".into(),
501/// "/usr/share/known-values".into(),
502/// ]).expect("Should succeed before KNOWN_VALUES access");
503/// ```
504pub fn add_search_paths(paths: Vec<PathBuf>) -> Result<(), ConfigError> {
505 if CONFIG_LOCKED.load(Ordering::SeqCst) {
506 return Err(ConfigError::AlreadyInitialized);
507 }
508 let mut guard = CUSTOM_CONFIG.lock().unwrap();
509 let config = guard.get_or_insert_with(DirectoryConfig::default_only);
510 for path in paths {
511 config.add_path(path);
512 }
513 Ok(())
514}
515
516/// Gets the current directory configuration, locking it for future
517/// modifications.
518///
519/// This is called internally during `KNOWN_VALUES` initialization.
520pub(crate) fn get_and_lock_config() -> DirectoryConfig {
521 CONFIG_LOCKED.store(true, Ordering::SeqCst);
522 CUSTOM_CONFIG
523 .lock()
524 .unwrap()
525 .take()
526 .unwrap_or_else(DirectoryConfig::default_only)
527}
528
#[cfg(test)]
mod tests {
    use super::*;

    /// A registry with ontology metadata and a statistics object parses.
    #[test]
    fn test_parse_registry_json() {
        let json = r#"{
            "ontology": {"name": "test"},
            "entries": [
                {"codepoint": 9999, "name": "testValue", "type": "property"}
            ],
            "statistics": {}
        }"#;

        let registry: RegistryFile = serde_json::from_str(json).unwrap();
        assert_eq!(registry.entries.len(), 1);
        assert_eq!(registry.entries[0].codepoint, 9999);
        assert_eq!(registry.entries[0].name, "testValue");
    }

    /// Only `entries` with `codepoint` and `name` is required.
    #[test]
    fn test_parse_minimal_registry() {
        let json = r#"{"entries": [{"codepoint": 1, "name": "minimal"}]}"#;

        let registry: RegistryFile = serde_json::from_str(json).unwrap();
        assert_eq!(registry.entries.len(), 1);
        assert_eq!(registry.entries[0].codepoint, 1);
    }

    /// Every optional entry field is picked up when present.
    #[test]
    fn test_parse_full_entry() {
        let json = r#"{
            "entries": [{
                "codepoint": 100,
                "name": "fullEntry",
                "type": "class",
                "uri": "https://example.com/vocab#fullEntry",
                "description": "A complete entry with all fields"
            }]
        }"#;

        let registry: RegistryFile = serde_json::from_str(json).unwrap();
        let entry = &registry.entries[0];
        assert_eq!(entry.codepoint, 100);
        assert_eq!(entry.name, "fullEntry");
        assert_eq!(entry.entry_type.as_deref(), Some("class"));
        assert_eq!(
            entry.uri.as_deref(),
            Some("https://example.com/vocab#fullEntry")
        );
        assert!(entry.description.is_some());
    }

    /// `default_only` yields exactly the `.known-values` directory.
    #[test]
    fn test_directory_config_default() {
        let config = DirectoryConfig::default_only();
        assert_eq!(config.paths().len(), 1);
        assert!(config.paths()[0].ends_with(".known-values"));
    }

    /// `with_paths` keeps the given paths in the given order.
    #[test]
    fn test_directory_config_custom_paths() {
        let config = DirectoryConfig::with_paths(vec![
            PathBuf::from("/a"),
            PathBuf::from("/b"),
        ]);
        assert_eq!(config.paths().len(), 2);
        assert_eq!(config.paths()[0], PathBuf::from("/a"));
        assert_eq!(config.paths()[1], PathBuf::from("/b"));
    }

    /// `with_paths_and_default` appends the default directory last.
    #[test]
    fn test_directory_config_with_default() {
        let config =
            DirectoryConfig::with_paths_and_default(vec![PathBuf::from(
                "/custom",
            )]);
        assert_eq!(config.paths().len(), 2);
        assert_eq!(config.paths()[0], PathBuf::from("/custom"));
        assert!(config.paths()[1].ends_with(".known-values"));
    }

    /// A missing directory is not an error; it simply yields no values.
    #[test]
    fn test_load_from_nonexistent_directory() {
        let result = load_from_directory(Path::new("/nonexistent/path/12345"));
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    /// The `LoadResult` helpers reflect the contents of its fields.
    #[test]
    fn test_load_result_methods() {
        let mut result = LoadResult::default();
        assert_eq!(result.values_count(), 0);
        assert!(!result.has_errors());

        result
            .values
            .insert(1, KnownValue::new_with_name(1u64, "test".to_string()));
        assert_eq!(result.values_count(), 1);
    }
}