known_values/
directory_loader.rs

1//! Directory-based loading of known values from JSON registry files.
2//!
3//! This module provides functionality to load known values from JSON files
4//! stored in configurable directories. It is only available when the
5//! `directory-loading` feature is enabled (which is the default).
6//!
7//! # Overview
8//!
9//! The module supports loading known values from:
10//! - A default directory: `~/.known-values/`
11//! - Custom directories specified at runtime
12//!
13//! Values loaded from JSON files can override hardcoded values when they
14//! share the same codepoint (numeric identifier).
15//!
16//! # JSON File Format
17//!
18//! Registry files should follow the BlockchainCommons format:
19//!
20//! ```json
21//! {
22//!   "ontology": {
23//!     "name": "my_registry",
24//!     "source_url": "https://example.com/registry"
25//!   },
26//!   "entries": [
27//!     {
28//!       "codepoint": 1000,
29//!       "name": "myValue",
30//!       "type": "property",
31//!       "uri": "https://example.com/vocab#myValue",
32//!       "description": "A custom known value"
33//!     }
34//!   ]
35//! }
36//! ```
37//!
38//! Only the `entries` array with `codepoint` and `name` fields
39//! is required; other fields are optional.
40
41use std::{
42    collections::HashMap,
43    fmt, fs, io,
44    path::{Path, PathBuf},
45    sync::{
46        Mutex,
47        atomic::{AtomicBool, Ordering},
48    },
49};
50
51use serde::Deserialize;
52
53use crate::KnownValue;
54
/// A single entry in a known values JSON registry file.
///
/// `codepoint` and `name` must be present in the JSON; every other field is
/// an `Option` and deserializes to `None` when the key is absent.
#[derive(Debug, Deserialize)]
pub struct RegistryEntry {
    /// The unique numeric identifier for this known value.
    pub codepoint: u64,
    /// The canonical string name for this known value.
    pub name: String,
    /// The type of entry (e.g., "property", "class", "value").
    ///
    /// Read from the JSON key `type`; the field is renamed because `type`
    /// is a reserved word in Rust.
    #[serde(rename = "type")]
    pub entry_type: Option<String>,
    /// An optional URI reference for this known value.
    pub uri: Option<String>,
    /// An optional human-readable description.
    pub description: Option<String>,
}
70
/// Metadata about the ontology or registry source.
///
/// Every field is optional, so an empty JSON object is a valid value.
#[derive(Debug, Deserialize)]
pub struct OntologyInfo {
    /// The name of this registry or ontology.
    pub name: Option<String>,
    /// The source URL for this registry.
    pub source_url: Option<String>,
    /// The starting codepoint for entries in this registry.
    pub start_code_point: Option<u64>,
    /// The processing strategy used to generate this registry.
    pub processing_strategy: Option<String>,
}
83
/// Root structure of a known values JSON registry file.
///
/// Only `entries` is required; the remaining sections may be omitted.
#[derive(Debug, Deserialize)]
pub struct RegistryFile {
    /// Metadata about this registry.
    pub ontology: Option<OntologyInfo>,
    /// Information about how this file was generated.
    pub generated: Option<GeneratedInfo>,
    /// The known value entries in this registry.
    pub entries: Vec<RegistryEntry>,
    /// Statistics about this registry.
    ///
    /// Kept as untyped JSON: the content is accepted in any shape and is
    /// not interpreted anywhere in this module.
    #[serde(default)]
    pub statistics: Option<serde_json::Value>,
}
97
/// Information about how a registry file was generated.
///
/// Corresponds to the optional `generated` section of a registry file.
#[derive(Debug, Deserialize)]
pub struct GeneratedInfo {
    /// The tool used to generate this registry, if recorded.
    pub tool: Option<String>,
}
104
/// Errors that can occur when loading known values from directories.
#[derive(Debug)]
pub enum LoadError {
    /// An I/O error occurred while reading files.
    ///
    /// This is the variant produced by the `From<io::Error>` conversion, so
    /// applying `?` to an `io::Result` inside the loaders yields it.
    Io(io::Error),
    /// A JSON parsing error occurred.
    Json {
        /// The file that caused the error.
        file: PathBuf,
        /// The underlying JSON error.
        error: serde_json::Error,
    },
}
118
119impl fmt::Display for LoadError {
120    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121        match self {
122            LoadError::Io(e) => write!(f, "IO error: {}", e),
123            LoadError::Json { file, error } => {
124                write!(f, "JSON parse error in {}: {}", file.display(), error)
125            }
126        }
127    }
128}
129
130impl std::error::Error for LoadError {
131    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
132        match self {
133            LoadError::Io(e) => Some(e),
134            LoadError::Json { error, .. } => Some(error),
135        }
136    }
137}
138
139impl From<io::Error> for LoadError {
140    fn from(error: io::Error) -> Self { LoadError::Io(error) }
141}
142
/// Result of a directory loading operation.
///
/// The derived `Default` is the empty result: no values, no processed
/// paths and no errors.
#[derive(Debug, Default)]
pub struct LoadResult {
    /// Known values loaded, keyed by codepoint.
    ///
    /// Because the map is keyed by codepoint, inserting a value whose
    /// codepoint is already present replaces the earlier one.
    pub values: HashMap<u64, KnownValue>,
    /// Paths recorded as processed by `load_from_config`.
    pub files_processed: Vec<PathBuf>,
    /// Non-fatal errors encountered during loading, each paired with the
    /// path it was reported for.
    pub errors: Vec<(PathBuf, LoadError)>,
}
153
impl LoadResult {
    /// Returns the number of unique values loaded.
    ///
    /// This is the size of the codepoint-keyed map, so overridden
    /// duplicates are counted once.
    pub fn values_count(&self) -> usize { self.values.len() }

    /// Returns an iterator over the loaded known values.
    ///
    /// Iteration follows the `HashMap`'s order, which is unspecified.
    pub fn values_iter(&self) -> impl Iterator<Item = &KnownValue> {
        self.values.values()
    }

    /// Consumes the result and returns the loaded known values.
    ///
    /// The processed-path list and the collected errors are dropped.
    pub fn into_values(self) -> impl Iterator<Item = KnownValue> {
        self.values.into_values()
    }

    /// Returns true if any errors occurred during loading.
    pub fn has_errors(&self) -> bool { !self.errors.is_empty() }
}
171
/// Result type for tolerant directory loading: successfully loaded values and
/// per-file errors.
///
/// The first element collects the values parsed from files that loaded; the
/// second pairs each failing file's path with the error it produced.
type TolerantLoadResult = (Vec<KnownValue>, Vec<(PathBuf, LoadError)>);
175
/// Configuration for loading known values from directories.
///
/// This struct specifies which directories to search for JSON registry files.
/// Directories are processed in order, with values from later directories
/// overriding values from earlier directories when codepoints collide.
///
/// Note that the derived `Default` implementation yields an **empty** path
/// list (the same as [`DirectoryConfig::new`]). To search the default
/// directory, use [`DirectoryConfig::default_only`].
///
/// # Examples
///
/// ```rust,ignore
/// use known_values::DirectoryConfig;
///
/// // Use only the default directory (~/.known-values/)
/// let config = DirectoryConfig::default_only();
///
/// // Use custom paths
/// let config = DirectoryConfig::with_paths(vec![
///     "/etc/known-values".into(),
///     "/usr/share/known-values".into(),
/// ]);
///
/// // Use custom paths with default appended
/// let config = DirectoryConfig::with_paths_and_default(vec![
///     "/etc/known-values".into(),
/// ]);
/// ```
#[derive(Debug, Clone, Default)]
pub struct DirectoryConfig {
    /// Search paths in priority order (later paths override earlier).
    paths: Vec<PathBuf>,
}
206
207impl DirectoryConfig {
208    /// Creates a new empty configuration with no search paths.
209    pub fn new() -> Self { Self { paths: Vec::new() } }
210
211    /// Creates configuration with only the default directory
212    /// (`~/.known-values/`).
213    pub fn default_only() -> Self {
214        Self { paths: vec![Self::default_directory()] }
215    }
216
217    /// Creates configuration with custom paths (processed in order).
218    ///
219    /// Later paths in the list take precedence over earlier paths when
220    /// values have the same codepoint.
221    pub fn with_paths(paths: Vec<PathBuf>) -> Self { Self { paths } }
222
223    /// Creates configuration with custom paths followed by the default
224    /// directory.
225    ///
226    /// The default directory (`~/.known-values/`) is appended to the list,
227    /// so its values will override values from the custom paths.
228    pub fn with_paths_and_default(mut paths: Vec<PathBuf>) -> Self {
229        paths.push(Self::default_directory());
230        Self { paths }
231    }
232
233    /// Returns the default directory: `~/.known-values/`
234    ///
235    /// Falls back to `./.known-values/` if the home directory cannot be
236    /// determined.
237    pub fn default_directory() -> PathBuf {
238        dirs::home_dir()
239            .unwrap_or_else(|| PathBuf::from("."))
240            .join(".known-values")
241    }
242
243    /// Returns the configured search paths.
244    pub fn paths(&self) -> &[PathBuf] { &self.paths }
245
246    /// Adds a path to the configuration.
247    ///
248    /// The new path will be processed after existing paths, so its values
249    /// will override values from earlier paths.
250    pub fn add_path(&mut self, path: PathBuf) { self.paths.push(path); }
251}
252
253/// Loads all JSON registry files from a single directory.
254///
255/// This function scans the specified directory for files with a `.json`
256/// extension and attempts to parse them as known value registries.
257///
258/// # Arguments
259///
260/// * `path` - The directory to scan for JSON registry files.
261///
262/// # Returns
263///
264/// Returns `Ok` with a vector of loaded `KnownValue` instances, or an empty
265/// vector if the directory doesn't exist. Returns `Err` only for I/O errors
266/// that prevent directory traversal.
267///
268/// # Examples
269///
270/// ```rust,ignore
271/// use known_values::load_from_directory;
272/// use std::path::Path;
273///
274/// let values = load_from_directory(Path::new("/etc/known-values"))?;
275/// for value in values {
276///     println!("{}: {}", value.value(), value.name());
277/// }
278/// ```
279pub fn load_from_directory(path: &Path) -> Result<Vec<KnownValue>, LoadError> {
280    let mut values = Vec::new();
281
282    // Return empty if directory doesn't exist or isn't a directory
283    if !path.exists() || !path.is_dir() {
284        return Ok(values);
285    }
286
287    for entry in fs::read_dir(path)? {
288        let entry = entry?;
289        let file_path = entry.path();
290
291        // Only process .json files
292        if file_path.extension().is_some_and(|ext| ext == "json") {
293            let content = fs::read_to_string(&file_path)?;
294            let registry: RegistryFile =
295                serde_json::from_str(&content).map_err(|e| {
296                    LoadError::Json { file: file_path.clone(), error: e }
297                })?;
298
299            for entry in registry.entries {
300                values.push(KnownValue::new_with_name(
301                    entry.codepoint,
302                    entry.name,
303                ));
304            }
305        }
306    }
307
308    Ok(values)
309}
310
311/// Loads known values from all directories in the given configuration.
312///
313/// Directories are processed in order. When multiple entries have the same
314/// codepoint, values from later directories override values from earlier
315/// directories.
316///
317/// This function is fault-tolerant: it will continue processing even if
318/// some files fail to parse. Errors are collected in the returned
319/// `LoadResult`.
320///
321/// # Arguments
322///
323/// * `config` - The directory configuration specifying search paths.
324///
325/// # Returns
326///
327/// A `LoadResult` containing the loaded values, processed files, and any
328/// errors encountered.
329///
330/// # Examples
331///
332/// ```rust,ignore
333/// use known_values::{DirectoryConfig, load_from_config};
334///
335/// let config = DirectoryConfig::default_only();
336/// let result = load_from_config(&config);
337///
338/// println!("Loaded {} values from {} files",
339///     result.values_count(),
340///     result.files_processed.len());
341///
342/// if result.has_errors() {
343///     for (path, error) in &result.errors {
344///         eprintln!("Error loading {}: {}", path.display(), error);
345///     }
346/// }
347/// ```
348pub fn load_from_config(config: &DirectoryConfig) -> LoadResult {
349    let mut result = LoadResult::default();
350
351    for dir_path in config.paths() {
352        match load_from_directory_tolerant(dir_path) {
353            Ok((values, errors)) => {
354                for value in values {
355                    result.values.insert(value.value(), value);
356                }
357                if !errors.is_empty() {
358                    result.errors.extend(errors);
359                }
360                result.files_processed.push(dir_path.clone());
361            }
362            Err(e) => {
363                result.errors.push((dir_path.clone(), e));
364            }
365        }
366    }
367
368    result
369}
370
371/// Loads from a directory with tolerance for individual file failures.
372fn load_from_directory_tolerant(
373    path: &Path,
374) -> Result<TolerantLoadResult, LoadError> {
375    let mut values = Vec::new();
376    let mut errors = Vec::new();
377
378    if !path.exists() || !path.is_dir() {
379        return Ok((values, errors));
380    }
381
382    for entry in fs::read_dir(path)? {
383        let entry = entry?;
384        let file_path = entry.path();
385
386        if file_path.extension().is_some_and(|ext| ext == "json") {
387            match load_single_file(&file_path) {
388                Ok(file_values) => values.extend(file_values),
389                Err(e) => errors.push((file_path, e)),
390            }
391        }
392    }
393
394    Ok((values, errors))
395}
396
397/// Loads known values from a single JSON file.
398fn load_single_file(path: &Path) -> Result<Vec<KnownValue>, LoadError> {
399    let content = fs::read_to_string(path)?;
400    let registry: RegistryFile = serde_json::from_str(&content)
401        .map_err(|e| LoadError::Json { file: path.to_path_buf(), error: e })?;
402
403    Ok(registry
404        .entries
405        .into_iter()
406        .map(|entry| KnownValue::new_with_name(entry.codepoint, entry.name))
407        .collect())
408}
409
// Global configuration state

/// Configuration staged by `set_directory_config` / `add_search_paths`.
///
/// Starts as `None`; `get_and_lock_config` takes the value out, leaving
/// `None` behind.
static CUSTOM_CONFIG: Mutex<Option<DirectoryConfig>> = Mutex::new(None);
/// Set to `true` by `get_and_lock_config`. While it is `true`, the setter
/// functions refuse changes with `ConfigError::AlreadyInitialized`.
static CONFIG_LOCKED: AtomicBool = AtomicBool::new(false);
413
/// Error reported when the directory configuration can no longer be changed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ConfigError {
    /// A change was attempted after the global registry had already been
    /// initialized.
    AlreadyInitialized,
}

impl fmt::Display for ConfigError {
    /// Writes the fixed, user-facing message for the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            Self::AlreadyInitialized => {
                "Cannot modify directory configuration after KNOWN_VALUES has been accessed"
            }
        };
        f.write_str(message)
    }
}

// No underlying cause to expose, so the default `source()` is sufficient.
impl std::error::Error for ConfigError {}
435
436/// Sets custom directory configuration for known values loading.
437///
438/// This function must be called **before** the first access to `KNOWN_VALUES`.
439/// Once `KNOWN_VALUES` is accessed, the configuration is locked and cannot
440/// be changed.
441///
442/// # Arguments
443///
444/// * `config` - The directory configuration to use.
445///
446/// # Returns
447///
448/// Returns `Ok(())` if the configuration was set successfully, or
449/// `Err(ConfigError::AlreadyInitialized)` if `KNOWN_VALUES` has already
450/// been accessed.
451///
452/// # Examples
453///
454/// ```rust,ignore
455/// use known_values::{set_directory_config, DirectoryConfig, KNOWN_VALUES};
456///
457/// // Set configuration before accessing KNOWN_VALUES
458/// set_directory_config(DirectoryConfig::with_paths(vec![
459///     "/custom/path".into(),
460/// ])).expect("Configuration should succeed");
461///
462/// // Now access KNOWN_VALUES - it will use the custom configuration
463/// let binding = KNOWN_VALUES.get();
464/// ```
465pub fn set_directory_config(
466    config: DirectoryConfig,
467) -> Result<(), ConfigError> {
468    if CONFIG_LOCKED.load(Ordering::SeqCst) {
469        return Err(ConfigError::AlreadyInitialized);
470    }
471    *CUSTOM_CONFIG.lock().unwrap() = Some(config);
472    Ok(())
473}
474
475/// Adds additional search paths to the directory configuration.
476///
477/// This function must be called **before** the first access to `KNOWN_VALUES`.
478/// Paths are added after any existing paths, so they will take precedence.
479///
480/// If no configuration has been set, this creates a new configuration with
481/// the default directory and appends the new paths.
482///
483/// # Arguments
484///
485/// * `paths` - The paths to add to the configuration.
486///
487/// # Returns
488///
489/// Returns `Ok(())` if the paths were added successfully, or
490/// `Err(ConfigError::AlreadyInitialized)` if `KNOWN_VALUES` has already
491/// been accessed.
492///
493/// # Examples
494///
495/// ```rust,ignore
496/// use known_values::add_search_paths;
497///
498/// // Add custom paths in addition to the default
499/// add_search_paths(vec![
500///     "/etc/known-values".into(),
501///     "/usr/share/known-values".into(),
502/// ]).expect("Should succeed before KNOWN_VALUES access");
503/// ```
504pub fn add_search_paths(paths: Vec<PathBuf>) -> Result<(), ConfigError> {
505    if CONFIG_LOCKED.load(Ordering::SeqCst) {
506        return Err(ConfigError::AlreadyInitialized);
507    }
508    let mut guard = CUSTOM_CONFIG.lock().unwrap();
509    let config = guard.get_or_insert_with(DirectoryConfig::default_only);
510    for path in paths {
511        config.add_path(path);
512    }
513    Ok(())
514}
515
516/// Gets the current directory configuration, locking it for future
517/// modifications.
518///
519/// This is called internally during `KNOWN_VALUES` initialization.
520pub(crate) fn get_and_lock_config() -> DirectoryConfig {
521    CONFIG_LOCKED.store(true, Ordering::SeqCst);
522    CUSTOM_CONFIG
523        .lock()
524        .unwrap()
525        .take()
526        .unwrap_or_else(DirectoryConfig::default_only)
527}
528
#[cfg(test)]
mod tests {
    use super::*;

    /// Deserializes `json` into a `RegistryFile`, panicking on invalid input.
    fn parse_registry(json: &str) -> RegistryFile {
        serde_json::from_str(json).unwrap()
    }

    /// A registry with the optional sections present parses, and the entry
    /// carries the expected codepoint and name.
    #[test]
    fn test_parse_registry_json() {
        let registry = parse_registry(
            r#"{
            "ontology": {"name": "test"},
            "entries": [
                {"codepoint": 9999, "name": "testValue", "type": "property"}
            ],
            "statistics": {}
        }"#,
        );

        assert_eq!(registry.entries.len(), 1);
        let first = &registry.entries[0];
        assert_eq!(first.codepoint, 9999);
        assert_eq!(first.name, "testValue");
    }

    /// `entries` with just `codepoint` and `name` is sufficient.
    #[test]
    fn test_parse_minimal_registry() {
        let registry =
            parse_registry(r#"{"entries": [{"codepoint": 1, "name": "minimal"}]}"#);

        assert_eq!(registry.entries.len(), 1);
        assert_eq!(registry.entries[0].codepoint, 1);
    }

    /// Every optional entry field is picked up when present.
    #[test]
    fn test_parse_full_entry() {
        let registry = parse_registry(
            r#"{
            "entries": [{
                "codepoint": 100,
                "name": "fullEntry",
                "type": "class",
                "uri": "https://example.com/vocab#fullEntry",
                "description": "A complete entry with all fields"
            }]
        }"#,
        );

        let RegistryEntry { codepoint, name, entry_type, uri, description } =
            &registry.entries[0];
        assert_eq!(*codepoint, 100);
        assert_eq!(name, "fullEntry");
        assert_eq!(entry_type.as_deref(), Some("class"));
        assert_eq!(uri.as_deref(), Some("https://example.com/vocab#fullEntry"));
        assert!(description.is_some());
    }

    /// `default_only` yields exactly the default directory.
    #[test]
    fn test_directory_config_default() {
        let config = DirectoryConfig::default_only();
        let paths = config.paths();
        assert_eq!(paths.len(), 1);
        assert!(paths[0].ends_with(".known-values"));
    }

    /// `with_paths` keeps the paths in the order given.
    #[test]
    fn test_directory_config_custom_paths() {
        let wanted = vec![PathBuf::from("/a"), PathBuf::from("/b")];
        let config = DirectoryConfig::with_paths(wanted.clone());

        assert_eq!(config.paths().len(), 2);
        assert_eq!(config.paths()[0], wanted[0]);
        assert_eq!(config.paths()[1], wanted[1]);
    }

    /// `with_paths_and_default` appends the default directory last.
    #[test]
    fn test_directory_config_with_default() {
        let custom = PathBuf::from("/custom");
        let config = DirectoryConfig::with_paths_and_default(vec![custom.clone()]);

        let paths = config.paths();
        assert_eq!(paths.len(), 2);
        assert_eq!(paths[0], custom);
        assert!(paths[1].ends_with(".known-values"));
    }

    /// A missing directory is not an error; it simply yields no values.
    #[test]
    fn test_load_from_nonexistent_directory() {
        let outcome = load_from_directory(Path::new("/nonexistent/path/12345"));
        assert!(outcome.is_ok());
        assert!(outcome.unwrap().is_empty());
    }

    /// The `LoadResult` helpers reflect the state of the public fields.
    #[test]
    fn test_load_result_methods() {
        let mut result = LoadResult::default();
        assert_eq!(result.values_count(), 0);
        assert!(!result.has_errors());

        let value = KnownValue::new_with_name(1u64, String::from("test"));
        result.values.insert(1, value);
        assert_eq!(result.values_count(), 1);
    }
}