known-values 0.15.5

Blockchain Commons Known Values.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
//! Directory-based loading of known values from JSON registry files.
//!
//! This module provides functionality to load known values from JSON files
//! stored in configurable directories. It is only available when the
//! `directory-loading` feature is enabled (which is the default).
//!
//! # Overview
//!
//! The module supports loading known values from:
//! - A default directory: `~/.known-values/`
//! - Custom directories specified at runtime
//!
//! Values loaded from JSON files can override hardcoded values when they
//! share the same codepoint (numeric identifier).
//!
//! # JSON File Format
//!
//! Registry files should follow the BlockchainCommons format:
//!
//! ```json
//! {
//!   "ontology": {
//!     "name": "my_registry",
//!     "source_url": "https://example.com/registry"
//!   },
//!   "entries": [
//!     {
//!       "codepoint": 1000,
//!       "name": "myValue",
//!       "type": "property",
//!       "uri": "https://example.com/vocab#myValue",
//!       "description": "A custom known value"
//!     }
//!   ]
//! }
//! ```
//!
//! Only the `entries` array with `codepoint` and `name` fields
//! is required; other fields are optional.

use std::{
    collections::HashMap,
    fmt, fs, io,
    path::{Path, PathBuf},
    sync::{
        Mutex,
        atomic::{AtomicBool, Ordering},
    },
};

use serde::Deserialize;

use crate::KnownValue;

/// A single entry in a known values JSON registry file.
///
/// Only `codepoint` and `name` must be present in the JSON object; the
/// remaining fields are `Option`s and deserialize to `None` when absent.
/// The loaders in this module build a `KnownValue` from `codepoint` and
/// `name` only; the other fields are parsed but not otherwise consumed here.
#[derive(Debug, Deserialize)]
pub struct RegistryEntry {
    /// The unique numeric identifier for this known value.
    pub codepoint: u64,
    /// The canonical string name for this known value.
    pub name: String,
    /// The type of entry (e.g., "property", "class", "value").
    ///
    /// Read from the JSON key `type`; the field is renamed because `type` is
    /// a reserved word in Rust.
    #[serde(rename = "type")]
    pub entry_type: Option<String>,
    /// An optional URI reference for this known value.
    pub uri: Option<String>,
    /// An optional human-readable description.
    pub description: Option<String>,
}

/// Metadata about the ontology or registry source.
///
/// Corresponds to the `ontology` object of a registry file. Every field is
/// optional, so a partially filled (or empty) object still deserializes.
#[derive(Debug, Deserialize)]
pub struct OntologyInfo {
    /// The name of this registry or ontology.
    pub name: Option<String>,
    /// The source URL for this registry.
    pub source_url: Option<String>,
    /// The starting codepoint for entries in this registry.
    pub start_code_point: Option<u64>,
    /// The processing strategy used to generate this registry.
    pub processing_strategy: Option<String>,
}

/// Root structure of a known values JSON registry file.
///
/// `entries` is the only mandatory key; a document without it fails to
/// deserialize. All other top-level keys are optional.
#[derive(Debug, Deserialize)]
pub struct RegistryFile {
    /// Metadata about this registry.
    pub ontology: Option<OntologyInfo>,
    /// Information about how this file was generated.
    pub generated: Option<GeneratedInfo>,
    /// The known value entries in this registry.
    pub entries: Vec<RegistryEntry>,
    /// Statistics about this registry.
    ///
    /// Kept as an untyped JSON value; the loaders in this module never read
    /// it.
    #[serde(default)]
    pub statistics: Option<serde_json::Value>,
}

/// Information about how a registry file was generated.
///
/// Corresponds to the optional `generated` object of a registry file.
#[derive(Debug, Deserialize)]
pub struct GeneratedInfo {
    /// The tool used to generate this registry, if recorded.
    pub tool: Option<String>,
}

/// Errors that can occur when loading known values from directories.
///
/// Implements `std::error::Error`; the wrapped lower-level error is exposed
/// through `source()`.
#[derive(Debug)]
pub enum LoadError {
    /// An I/O error occurred while reading files.
    ///
    /// Created through `From<io::Error>`, so the variant itself does not
    /// record which path was being read.
    Io(io::Error),
    /// A JSON parsing error occurred.
    Json {
        /// The file that caused the error.
        file: PathBuf,
        /// The underlying JSON error.
        error: serde_json::Error,
    },
}

impl fmt::Display for LoadError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            LoadError::Io(e) => write!(f, "IO error: {}", e),
            LoadError::Json { file, error } => {
                write!(f, "JSON parse error in {}: {}", file.display(), error)
            }
        }
    }
}

impl std::error::Error for LoadError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            LoadError::Io(e) => Some(e),
            LoadError::Json { error, .. } => Some(error),
        }
    }
}

impl From<io::Error> for LoadError {
    /// Wraps an I/O error in `LoadError::Io`, which lets the loaders in this
    /// module apply `?` directly to `io::Result` values.
    fn from(error: io::Error) -> Self { LoadError::Io(error) }
}

/// Result of a directory loading operation.
///
/// `Default` produces an empty result (no values, no files, no errors).
#[derive(Debug, Default)]
pub struct LoadResult {
    /// Known values loaded, keyed by codepoint.
    ///
    /// Because this is a map, a later insert for the same codepoint replaces
    /// the earlier value.
    pub values: HashMap<u64, KnownValue>,
    /// Files that were successfully processed.
    ///
    /// NOTE(review): `load_from_config` currently pushes the path of each
    /// scanned *directory* here rather than individual file paths, and does
    /// so even for directories that do not exist (they load as empty).
    /// Confirm whether that is intended before relying on the contents.
    pub files_processed: Vec<PathBuf>,
    /// Non-fatal errors encountered during loading.
    ///
    /// Each item pairs the path that failed with the error it produced.
    pub errors: Vec<(PathBuf, LoadError)>,
}

impl LoadResult {
    /// Returns the number of unique values loaded.
    ///
    /// This is the number of distinct codepoints in `values`.
    pub fn values_count(&self) -> usize { self.values.len() }

    /// Returns an iterator over the loaded known values.
    ///
    /// Values are borrowed; iteration order is that of the underlying
    /// `HashMap` and therefore unspecified.
    pub fn values_iter(&self) -> impl Iterator<Item = &KnownValue> {
        self.values.values()
    }

    /// Consumes the result and returns the loaded known values.
    ///
    /// The codepoint keys, `files_processed` and `errors` are dropped.
    pub fn into_values(self) -> impl Iterator<Item = KnownValue> {
        self.values.into_values()
    }

    /// Returns true if any errors occurred during loading.
    pub fn has_errors(&self) -> bool { !self.errors.is_empty() }
}

/// Result type for tolerant directory loading: successfully loaded values and
/// per-file errors.
///
/// The first element holds every value parsed from the directory's registry
/// files; the second pairs each file that failed with the error it produced.
type TolerantLoadResult = (Vec<KnownValue>, Vec<(PathBuf, LoadError)>);

/// Configuration for loading known values from directories.
///
/// This struct specifies which directories to search for JSON registry files.
/// Directories are processed in order, with values from later directories
/// overriding values from earlier directories when codepoints collide.
///
/// Note that the derived `Default` implementation yields an *empty* list of
/// search paths (the same as [`DirectoryConfig::new`]); use
/// [`DirectoryConfig::default_only`] to get the default directory.
///
/// # Examples
///
/// ```rust,ignore
/// use known_values::DirectoryConfig;
///
/// // Use only the default directory (~/.known-values/)
/// let config = DirectoryConfig::default_only();
///
/// // Start with no search paths at all
/// let config = DirectoryConfig::default();
///
/// // Use custom paths
/// let config = DirectoryConfig::with_paths(vec![
///     "/etc/known-values".into(),
///     "/usr/share/known-values".into(),
/// ]);
///
/// // Use custom paths with default appended
/// let config = DirectoryConfig::with_paths_and_default(vec![
///     "/etc/known-values".into(),
/// ]);
/// ```
#[derive(Debug, Clone, Default)]
pub struct DirectoryConfig {
    /// Search paths in priority order (later paths override earlier).
    paths: Vec<PathBuf>,
}

impl DirectoryConfig {
    /// Builds a configuration that has no search paths at all.
    pub fn new() -> Self {
        Self::default()
    }

    /// Builds a configuration whose single search path is the default
    /// directory (`~/.known-values/`).
    pub fn default_only() -> Self {
        Self::with_paths(vec![Self::default_directory()])
    }

    /// Builds a configuration from an explicit list of search paths.
    ///
    /// Paths are processed in the order given, so when two paths provide the
    /// same codepoint the one appearing later in the list wins.
    pub fn with_paths(paths: Vec<PathBuf>) -> Self {
        Self { paths }
    }

    /// Builds a configuration from the given paths with the default
    /// directory appended at the end.
    ///
    /// Since the default directory (`~/.known-values/`) comes last, its
    /// values override those from the custom paths.
    pub fn with_paths_and_default(mut paths: Vec<PathBuf>) -> Self {
        let fallback = Self::default_directory();
        paths.push(fallback);
        Self::with_paths(paths)
    }

    /// Returns the default directory: `~/.known-values/`
    ///
    /// When the home directory cannot be determined the result is
    /// `./.known-values/` instead.
    pub fn default_directory() -> PathBuf {
        let mut dir = match dirs::home_dir() {
            Some(home) => home,
            None => PathBuf::from("."),
        };
        dir.push(".known-values");
        dir
    }

    /// Returns the configured search paths, in processing order.
    pub fn paths(&self) -> &[PathBuf] {
        self.paths.as_slice()
    }

    /// Appends a search path to the configuration.
    ///
    /// The path is placed after all existing ones, so it is processed last
    /// and its values override those of earlier paths.
    pub fn add_path(&mut self, path: PathBuf) {
        self.paths.push(path);
    }
}

/// Loads all JSON registry files from a single directory.
///
/// This function scans the specified directory for files with a `.json`
/// extension and attempts to parse them as known value registries.
///
/// # Arguments
///
/// * `path` - The directory to scan for JSON registry files.
///
/// # Returns
///
/// Returns `Ok` with a vector of loaded `KnownValue` instances, or an empty
/// vector if the directory doesn't exist. Returns `Err` only for I/O errors
/// that prevent directory traversal.
///
/// # Examples
///
/// ```rust,ignore
/// use known_values::load_from_directory;
/// use std::path::Path;
///
/// let values = load_from_directory(Path::new("/etc/known-values"))?;
/// for value in values {
///     println!("{}: {}", value.value(), value.name());
/// }
/// ```
pub fn load_from_directory(path: &Path) -> Result<Vec<KnownValue>, LoadError> {
    let mut values = Vec::new();

    // Return empty if directory doesn't exist or isn't a directory
    if !path.exists() || !path.is_dir() {
        return Ok(values);
    }

    for entry in fs::read_dir(path)? {
        let entry = entry?;
        let file_path = entry.path();

        // Only process .json files
        if file_path.extension().is_some_and(|ext| ext == "json") {
            let content = fs::read_to_string(&file_path)?;
            let registry: RegistryFile =
                serde_json::from_str(&content).map_err(|e| {
                    LoadError::Json { file: file_path.clone(), error: e }
                })?;

            for entry in registry.entries {
                values.push(KnownValue::new_with_name(
                    entry.codepoint,
                    entry.name,
                ));
            }
        }
    }

    Ok(values)
}

/// Loads known values from every directory listed in the configuration.
///
/// Directories are visited in configuration order. Whenever two entries share
/// a codepoint, the one read later replaces the one read earlier, so later
/// directories take precedence over earlier ones.
///
/// Loading is fault-tolerant: a file that fails to read or parse does not
/// stop the run. Such failures are collected in the returned `LoadResult`
/// instead. A directory that cannot be traversed at all is recorded as a
/// single error against the directory path.
///
/// # Arguments
///
/// * `config` - The directory configuration specifying search paths.
///
/// # Returns
///
/// A `LoadResult` containing the loaded values, processed paths, and any
/// errors encountered.
///
/// # Examples
///
/// ```rust,ignore
/// use known_values::{DirectoryConfig, load_from_config};
///
/// let config = DirectoryConfig::default_only();
/// let result = load_from_config(&config);
///
/// println!("Loaded {} values from {} files",
///     result.values_count(),
///     result.files_processed.len());
///
/// if result.has_errors() {
///     for (path, error) in &result.errors {
///         eprintln!("Error loading {}: {}", path.display(), error);
///     }
/// }
/// ```
pub fn load_from_config(config: &DirectoryConfig) -> LoadResult {
    let mut outcome = LoadResult::default();

    for dir_path in config.paths() {
        let (loaded, file_errors) = match load_from_directory_tolerant(dir_path)
        {
            Ok(pair) => pair,
            Err(dir_error) => {
                // The directory itself could not be traversed.
                outcome.errors.push((dir_path.clone(), dir_error));
                continue;
            }
        };

        // Sequential inserts: a later value with the same codepoint wins.
        outcome
            .values
            .extend(loaded.into_iter().map(|kv| (kv.value(), kv)));
        outcome.errors.extend(file_errors);

        // NOTE(review): this records the directory path, not the individual
        // files inside it, and also runs for directories that do not exist.
        outcome.files_processed.push(dir_path.clone());
    }

    outcome
}

/// Loads from a directory with tolerance for individual file failures.
///
/// Every `.json` file directly inside `path` is loaded with
/// `load_single_file`. Values from files that load are accumulated; files
/// that fail are reported in the error list together with their path and do
/// not stop the scan.
///
/// Files are processed in sorted path order. The caller resolves codepoint
/// collisions by "last one wins", so a fixed order makes the winner within a
/// single directory reproducible instead of depending on the platform's
/// directory iteration order.
///
/// Returns empty lists when `path` does not exist or is not a directory.
///
/// # Errors
///
/// Returns [`LoadError::Io`] only when the directory itself cannot be
/// traversed (opening it or reading one of its entries fails).
fn load_from_directory_tolerant(
    path: &Path,
) -> Result<TolerantLoadResult, LoadError> {
    let mut values = Vec::new();
    let mut errors = Vec::new();

    // `is_dir` is already false for a missing path.
    if !path.is_dir() {
        return Ok((values, errors));
    }

    // Gather candidate files first so they can be ordered deterministically.
    let mut json_files = Vec::new();
    for entry in fs::read_dir(path)? {
        let file_path = entry?.path();
        if file_path.extension().is_some_and(|ext| ext == "json") {
            json_files.push(file_path);
        }
    }
    json_files.sort();

    for file_path in json_files {
        match load_single_file(&file_path) {
            Ok(file_values) => values.extend(file_values),
            Err(e) => errors.push((file_path, e)),
        }
    }

    Ok((values, errors))
}

/// Loads known values from a single JSON file.
fn load_single_file(path: &Path) -> Result<Vec<KnownValue>, LoadError> {
    let content = fs::read_to_string(path)?;
    let registry: RegistryFile = serde_json::from_str(&content)
        .map_err(|e| LoadError::Json { file: path.to_path_buf(), error: e })?;

    Ok(registry
        .entries
        .into_iter()
        .map(|entry| KnownValue::new_with_name(entry.codepoint, entry.name))
        .collect())
}

// Global configuration state.
//
// `CUSTOM_CONFIG` holds the configuration installed through
// `set_directory_config` / `add_search_paths`, or `None` when nothing has
// been set (or after `get_and_lock_config` has taken it).
static CUSTOM_CONFIG: Mutex<Option<DirectoryConfig>> = Mutex::new(None);
// Set to `true` by `get_and_lock_config`; once set, the setters above refuse
// further changes with `ConfigError::AlreadyInitialized`. Nothing in this
// module ever resets it.
static CONFIG_LOCKED: AtomicBool = AtomicBool::new(false);

/// Error returned when the directory configuration can no longer be changed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ConfigError {
    /// A change was attempted after the global registry had already been
    /// initialized.
    AlreadyInitialized,
}

impl fmt::Display for ConfigError {
    /// Writes the fixed, human-readable message for the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Exhaustive on purpose: adding a variant must force a new message.
        let message = match self {
            Self::AlreadyInitialized => {
                "Cannot modify directory configuration after KNOWN_VALUES has been accessed"
            }
        };
        f.write_str(message)
    }
}

// No underlying cause to report, so the default `source()` is kept.
impl std::error::Error for ConfigError {}

/// Sets custom directory configuration for known values loading.
///
/// This function must be called **before** the first access to `KNOWN_VALUES`.
/// Once `KNOWN_VALUES` is accessed, the configuration is locked and cannot
/// be changed.
///
/// Any configuration installed earlier (including paths added through
/// `add_search_paths`) is replaced.
///
/// # Arguments
///
/// * `config` - The directory configuration to use.
///
/// # Returns
///
/// Returns `Ok(())` if the configuration was set successfully, or
/// `Err(ConfigError::AlreadyInitialized)` if `KNOWN_VALUES` has already
/// been accessed.
///
/// # Examples
///
/// ```rust,ignore
/// use known_values::{set_directory_config, DirectoryConfig, KNOWN_VALUES};
///
/// // Set configuration before accessing KNOWN_VALUES
/// set_directory_config(DirectoryConfig::with_paths(vec![
///     "/custom/path".into(),
/// ])).expect("Configuration should succeed");
///
/// // Now access KNOWN_VALUES - it will use the custom configuration
/// let binding = KNOWN_VALUES.get();
/// ```
pub fn set_directory_config(
    config: DirectoryConfig,
) -> Result<(), ConfigError> {
    // Take the mutex *before* consulting the flag. `get_and_lock_config`
    // raises the flag and only then locks the mutex to take the stored value.
    // Checking the flag while holding the mutex therefore guarantees that an
    // `Ok(())` from here means the initializer will still observe `config`.
    // Checking first and locking afterwards leaves a window in which the
    // initializer can finish in between, silently discarding the new value.
    //
    // A poisoned mutex is recovered rather than panicking: the guarded
    // `Option` is always in a valid state.
    let mut slot = CUSTOM_CONFIG
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    if CONFIG_LOCKED.load(Ordering::SeqCst) {
        return Err(ConfigError::AlreadyInitialized);
    }

    *slot = Some(config);
    Ok(())
}

/// Adds additional search paths to the directory configuration.
///
/// This function must be called **before** the first access to `KNOWN_VALUES`.
/// Paths are added after any existing paths, so they will take precedence.
///
/// If no configuration has been set, this creates a new configuration with
/// the default directory and appends the new paths.
///
/// # Arguments
///
/// * `paths` - The paths to add to the configuration.
///
/// # Returns
///
/// Returns `Ok(())` if the paths were added successfully, or
/// `Err(ConfigError::AlreadyInitialized)` if `KNOWN_VALUES` has already
/// been accessed.
///
/// # Examples
///
/// ```rust,ignore
/// use known_values::add_search_paths;
///
/// // Add custom paths in addition to the default
/// add_search_paths(vec![
///     "/etc/known-values".into(),
///     "/usr/share/known-values".into(),
/// ]).expect("Should succeed before KNOWN_VALUES access");
/// ```
pub fn add_search_paths(paths: Vec<PathBuf>) -> Result<(), ConfigError> {
    // Take the mutex *before* consulting the flag. `get_and_lock_config`
    // raises the flag and only then locks the mutex to take the stored value.
    // Checking the flag while holding the mutex therefore guarantees that an
    // `Ok(())` from here means the initializer will still observe the added
    // paths. Checking first and locking afterwards leaves a window in which
    // the initializer can finish in between, silently dropping them.
    //
    // A poisoned mutex is recovered rather than panicking: the guarded
    // `Option` is always in a valid state.
    let mut guard = CUSTOM_CONFIG
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);

    if CONFIG_LOCKED.load(Ordering::SeqCst) {
        return Err(ConfigError::AlreadyInitialized);
    }

    // Start from the default directory when nothing was configured yet.
    let config = guard.get_or_insert_with(DirectoryConfig::default_only);
    for path in paths {
        config.add_path(path);
    }
    Ok(())
}

/// Retrieves the directory configuration and locks it against later changes.
///
/// The lock flag is raised first, then any stored custom configuration is
/// taken out of the global slot (leaving it empty). When nothing was stored,
/// the default-directory-only configuration is returned.
///
/// This is called internally during `KNOWN_VALUES` initialization.
pub(crate) fn get_and_lock_config() -> DirectoryConfig {
    // Raise the flag before touching the slot so the setters start refusing
    // changes from this point on.
    CONFIG_LOCKED.store(true, Ordering::SeqCst);

    let stored = CUSTOM_CONFIG.lock().unwrap().take();
    match stored {
        Some(custom) => custom,
        None => DirectoryConfig::default_only(),
    }
}

// Unit tests covering JSON parsing, `DirectoryConfig` construction, and the
// behaviour of the loaders on a missing directory. None of them touch the
// global configuration state.
#[cfg(test)]
mod tests {
    use super::*;

    /// A registry with ontology metadata, one entry and a statistics object
    /// parses, and the entry's required fields are read correctly.
    #[test]
    fn test_parse_registry_json() {
        let json = r#"{
            "ontology": {"name": "test"},
            "entries": [
                {"codepoint": 9999, "name": "testValue", "type": "property"}
            ],
            "statistics": {}
        }"#;

        let registry: RegistryFile = serde_json::from_str(json).unwrap();
        assert_eq!(registry.entries.len(), 1);
        assert_eq!(registry.entries[0].codepoint, 9999);
        assert_eq!(registry.entries[0].name, "testValue");
    }

    /// Only `entries` with `codepoint` and `name` is needed for a valid
    /// registry document.
    #[test]
    fn test_parse_minimal_registry() {
        let json = r#"{"entries": [{"codepoint": 1, "name": "minimal"}]}"#;

        let registry: RegistryFile = serde_json::from_str(json).unwrap();
        assert_eq!(registry.entries.len(), 1);
        assert_eq!(registry.entries[0].codepoint, 1);
    }

    /// All optional entry fields are captured, including `type`, which maps
    /// to `entry_type`.
    #[test]
    fn test_parse_full_entry() {
        let json = r#"{
            "entries": [{
                "codepoint": 100,
                "name": "fullEntry",
                "type": "class",
                "uri": "https://example.com/vocab#fullEntry",
                "description": "A complete entry with all fields"
            }]
        }"#;

        let registry: RegistryFile = serde_json::from_str(json).unwrap();
        let entry = &registry.entries[0];
        assert_eq!(entry.codepoint, 100);
        assert_eq!(entry.name, "fullEntry");
        assert_eq!(entry.entry_type.as_deref(), Some("class"));
        assert_eq!(
            entry.uri.as_deref(),
            Some("https://example.com/vocab#fullEntry")
        );
        assert!(entry.description.is_some());
    }

    /// `default_only` yields exactly one path ending in `.known-values`.
    #[test]
    fn test_directory_config_default() {
        let config = DirectoryConfig::default_only();
        assert_eq!(config.paths().len(), 1);
        assert!(config.paths()[0].ends_with(".known-values"));
    }

    /// `with_paths` keeps the given paths in their original order.
    #[test]
    fn test_directory_config_custom_paths() {
        let config = DirectoryConfig::with_paths(vec![
            PathBuf::from("/a"),
            PathBuf::from("/b"),
        ]);
        assert_eq!(config.paths().len(), 2);
        assert_eq!(config.paths()[0], PathBuf::from("/a"));
        assert_eq!(config.paths()[1], PathBuf::from("/b"));
    }

    /// `with_paths_and_default` appends the default directory after the
    /// custom paths.
    #[test]
    fn test_directory_config_with_default() {
        let config =
            DirectoryConfig::with_paths_and_default(vec![PathBuf::from(
                "/custom",
            )]);
        assert_eq!(config.paths().len(), 2);
        assert_eq!(config.paths()[0], PathBuf::from("/custom"));
        assert!(config.paths()[1].ends_with(".known-values"));
    }

    /// A missing directory is not an error; it simply loads nothing.
    #[test]
    fn test_load_from_nonexistent_directory() {
        let result = load_from_directory(Path::new("/nonexistent/path/12345"));
        assert!(result.is_ok());
        assert!(result.unwrap().is_empty());
    }

    /// A default `LoadResult` is empty and error-free, and `values_count`
    /// tracks inserts into `values`.
    #[test]
    fn test_load_result_methods() {
        let mut result = LoadResult::default();
        assert_eq!(result.values_count(), 0);
        assert!(!result.has_errors());

        result
            .values
            .insert(1, KnownValue::new_with_name(1u64, "test".to_string()));
        assert_eq!(result.values_count(), 1);
    }
}