// hedl_json/schema_cache.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Schema caching for JSON to HEDL conversion
19//!
20//! When converting large JSON arrays to matrix lists, we often encounter the same
21//! structure repeatedly. Caching the inferred schema significantly improves performance
22//! by avoiding redundant key iteration and sorting.
23//!
24//! # Performance Impact
25//!
26//! - First schema inference: ~O(n*log(n)) where n is number of keys
27//! - Cached lookup: ~O(1) hash map lookup
28//! - Expected speedup: 30-50% for documents with repeated array structures
29//!
30//! # Thread Safety
31//!
//! The cache is thread-safe using interior mutability with `RwLock`. Note that
//! `get` must update LRU metadata and therefore takes the write lock, so in
//! practice all cache operations (reads included) are serialized.
34
35use std::collections::HashMap;
36use std::hash::{Hash, Hasher};
37use std::sync::{Arc, RwLock};
38
/// Cache key for schema lookup
///
/// Represents the structure of a JSON object by its sorted field names.
/// This excludes metadata fields (starting with "__") and child arrays.
///
/// `Hash` is derived: it is consistent with the derived `PartialEq`/`Eq`
/// because [`SchemaCacheKey::new`] canonicalizes the field order by sorting.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SchemaCacheKey {
    /// Sorted field names (excluding metadata and children)
    pub fields: Vec<String>,
}

impl SchemaCacheKey {
    /// Create a cache key from a list of field names
    ///
    /// Automatically sorts the fields so that two objects with the same
    /// fields in a different order produce equal (and equally-hashed) keys.
    #[must_use]
    pub fn new(mut fields: Vec<String>) -> Self {
        // Unstable sort: faster and allocation-free. Equal `String`s are
        // indistinguishable, so sort stability is irrelevant here.
        fields.sort_unstable();
        Self { fields }
    }
}
68
/// Entry in the LRU cache
///
/// Bookkeeping wrapper around a cached schema; `access_count` and
/// `last_access` together determine eviction order (see `SchemaCache::insert`).
#[derive(Debug, Clone)]
struct CacheEntry {
    /// The cached schema (column names in order)
    schema: Vec<String>,
    /// Access counter for LRU eviction; incremented on every `get` hit
    access_count: u64,
    /// Last access timestamp (tie-breaker when access counts are equal)
    last_access: std::time::Instant,
}
79
/// Statistics for cache performance monitoring
///
/// A plain data snapshot of the cache's counters; obtain one via
/// `SchemaCache::statistics`.
#[derive(Debug, Clone, Default)]
pub struct CacheStatistics {
    /// Number of cache hits
    pub hits: u64,
    /// Number of cache misses
    pub misses: u64,
    /// Number of evictions due to capacity
    pub evictions: u64,
    /// Current cache size
    pub size: usize,
    /// Maximum cache capacity
    pub capacity: usize,
}

impl CacheStatistics {
    /// Fraction of lookups that hit the cache, in `[0.0, 1.0]`.
    ///
    /// Reports `0.0` when no lookups have been recorded yet.
    #[must_use]
    pub fn hit_rate(&self) -> f64 {
        match self.hits + self.misses {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }

    /// Fraction of lookups that missed the cache, in `[0.0, 1.0]`.
    ///
    /// Defined as the complement of [`hit_rate`](Self::hit_rate), so an
    /// empty history reports a miss rate of `1.0`.
    #[must_use]
    pub fn miss_rate(&self) -> f64 {
        1.0 - self.hit_rate()
    }

    /// Zero the hit/miss/eviction counters.
    ///
    /// `size` and `capacity` describe the cache itself rather than its
    /// history, so they are carried over unchanged.
    pub fn reset(&mut self) {
        *self = Self {
            size: self.size,
            capacity: self.capacity,
            ..Self::default()
        };
    }
}
120
/// Thread-safe LRU schema cache
///
/// Uses interior mutability with `RwLock` for thread-safe access. Note that
/// `get` updates LRU metadata and therefore acquires the write lock, so cache
/// operations — reads included — are effectively serialized.
///
/// Cloning the cache is cheap (an `Arc` bump) and both clones share the same
/// underlying storage.
///
/// # Examples
///
/// ```rust
/// use hedl_json::schema_cache::{SchemaCache, SchemaCacheKey};
///
/// let cache = SchemaCache::new(100);
///
/// // Cache a schema
/// let key = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
/// cache.insert(key.clone(), vec!["id".to_string(), "name".to_string()]);
///
/// // Retrieve from cache
/// if let Some(schema) = cache.get(&key) {
///     println!("Schema: {:?}", schema);
/// }
///
/// // Get statistics
/// let stats = cache.statistics();
/// println!("Hit rate: {:.2}%", stats.hit_rate() * 100.0);
/// ```
#[derive(Debug, Clone)]
pub struct SchemaCache {
    /// The underlying cache storage, shared between clones
    inner: Arc<RwLock<SchemaCacheInner>>,
}
151
/// State behind the `RwLock`: map, capacity, and counters move together so a
/// single lock acquisition keeps them mutually consistent.
#[derive(Debug)]
struct SchemaCacheInner {
    /// Map from cache key to schema
    cache: HashMap<SchemaCacheKey, CacheEntry>,
    /// Maximum cache size (invariant: `cache.len() <= capacity`)
    capacity: usize,
    /// Cache statistics
    stats: CacheStatistics,
}
161
162impl SchemaCache {
163    /// Create a new schema cache with the specified capacity
164    ///
165    /// # Arguments
166    ///
167    /// * `capacity` - Maximum number of schemas to cache (default: 100)
168    ///
169    /// # Examples
170    ///
171    /// ```rust
172    /// use hedl_json::schema_cache::SchemaCache;
173    ///
174    /// let cache = SchemaCache::new(100);
175    /// ```
176    #[must_use]
177    pub fn new(capacity: usize) -> Self {
178        Self {
179            inner: Arc::new(RwLock::new(SchemaCacheInner {
180                cache: HashMap::with_capacity(capacity),
181                capacity,
182                stats: CacheStatistics {
183                    capacity,
184                    ..Default::default()
185                },
186            })),
187        }
188    }
189
190    /// Get a schema from the cache
191    ///
192    /// Returns `Some(schema)` if found, `None` otherwise.
193    /// Updates access statistics for LRU tracking.
194    ///
195    /// # Arguments
196    ///
197    /// * `key` - Cache key representing the JSON structure
198    ///
199    /// # Examples
200    ///
201    /// ```rust
202    /// use hedl_json::schema_cache::{SchemaCache, SchemaCacheKey};
203    ///
204    /// let cache = SchemaCache::new(100);
205    /// let key = SchemaCacheKey::new(vec!["id".to_string()]);
206    ///
207    /// if let Some(schema) = cache.get(&key) {
208    ///     println!("Found: {:?}", schema);
209    /// }
210    /// ```
211    #[must_use]
212    pub fn get(&self, key: &SchemaCacheKey) -> Option<Vec<String>> {
213        let mut inner = self.inner.write().expect("lock not poisoned");
214
215        if let Some(entry) = inner.cache.get_mut(key) {
216            // Update LRU tracking
217            entry.access_count += 1;
218            entry.last_access = std::time::Instant::now();
219
220            // Clone schema before updating stats
221            let schema = entry.schema.clone();
222
223            // Update statistics
224            inner.stats.hits += 1;
225
226            Some(schema)
227        } else {
228            // Update statistics
229            inner.stats.misses += 1;
230            None
231        }
232    }
233
234    /// Insert a schema into the cache
235    ///
236    /// If the cache is full, evicts the least recently used entry.
237    ///
238    /// # Arguments
239    ///
240    /// * `key` - Cache key representing the JSON structure
241    /// * `schema` - Schema to cache (column names in order)
242    ///
243    /// # Examples
244    ///
245    /// ```rust
246    /// use hedl_json::schema_cache::{SchemaCache, SchemaCacheKey};
247    ///
248    /// let cache = SchemaCache::new(100);
249    /// let key = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
250    /// cache.insert(key, vec!["id".to_string(), "name".to_string()]);
251    /// ```
252    pub fn insert(&self, key: SchemaCacheKey, schema: Vec<String>) {
253        let mut inner = self.inner.write().expect("lock not poisoned");
254
255        // Check if we need to evict
256        if inner.cache.len() >= inner.capacity && !inner.cache.contains_key(&key) {
257            // Find LRU entry (lowest access_count, oldest last_access)
258            if let Some(lru_key) = inner
259                .cache
260                .iter()
261                .min_by_key(|(_, entry)| (entry.access_count, entry.last_access))
262                .map(|(k, _)| k.clone())
263            {
264                inner.cache.remove(&lru_key);
265                inner.stats.evictions += 1;
266            }
267        }
268
269        // Insert or update
270        inner.cache.insert(
271            key,
272            CacheEntry {
273                schema,
274                access_count: 1,
275                last_access: std::time::Instant::now(),
276            },
277        );
278
279        // Update size statistic
280        inner.stats.size = inner.cache.len();
281    }
282
283    /// Get current cache statistics
284    ///
285    /// Returns a snapshot of cache performance metrics.
286    ///
287    /// # Examples
288    ///
289    /// ```rust
290    /// use hedl_json::schema_cache::SchemaCache;
291    ///
292    /// let cache = SchemaCache::new(100);
293    /// let stats = cache.statistics();
294    /// println!("Hit rate: {:.2}%", stats.hit_rate() * 100.0);
295    /// println!("Size: {}/{}", stats.size, stats.capacity);
296    /// ```
297    #[must_use]
298    pub fn statistics(&self) -> CacheStatistics {
299        let inner = self.inner.read().expect("lock not poisoned");
300        inner.stats.clone()
301    }
302
303    /// Clear the cache and reset statistics
304    ///
305    /// # Examples
306    ///
307    /// ```rust
308    /// use hedl_json::schema_cache::SchemaCache;
309    ///
310    /// let cache = SchemaCache::new(100);
311    /// cache.clear();
312    /// ```
313    pub fn clear(&self) {
314        let mut inner = self.inner.write().expect("lock not poisoned");
315        inner.cache.clear();
316        inner.stats.reset();
317        inner.stats.size = 0;
318        inner.stats.capacity = inner.capacity;
319    }
320
321    /// Get current cache size
322    ///
323    /// # Examples
324    ///
325    /// ```rust
326    /// use hedl_json::schema_cache::SchemaCache;
327    ///
328    /// let cache = SchemaCache::new(100);
329    /// assert_eq!(cache.len(), 0);
330    /// ```
331    #[must_use]
332    pub fn len(&self) -> usize {
333        let inner = self.inner.read().expect("lock not poisoned");
334        inner.cache.len()
335    }
336
337    /// Check if cache is empty
338    ///
339    /// # Examples
340    ///
341    /// ```rust
342    /// use hedl_json::schema_cache::SchemaCache;
343    ///
344    /// let cache = SchemaCache::new(100);
345    /// assert!(cache.is_empty());
346    /// ```
347    #[must_use]
348    pub fn is_empty(&self) -> bool {
349        self.len() == 0
350    }
351
352    /// Get cache capacity
353    ///
354    /// # Examples
355    ///
356    /// ```rust
357    /// use hedl_json::schema_cache::SchemaCache;
358    ///
359    /// let cache = SchemaCache::new(100);
360    /// assert_eq!(cache.capacity(), 100);
361    /// ```
362    #[must_use]
363    pub fn capacity(&self) -> usize {
364        let inner = self.inner.read().expect("lock not poisoned");
365        inner.capacity
366    }
367}
368
#[cfg(test)]
mod tests {
    use super::*;

    // `new` must canonicalize field order so structurally-equal objects
    // produce equal keys.
    #[test]
    fn test_cache_key_new() {
        let key = SchemaCacheKey::new(vec!["name".to_string(), "id".to_string()]);
        assert_eq!(key.fields, vec!["id", "name"]); // Should be sorted
    }

    #[test]
    fn test_cache_key_equality() {
        let key1 = SchemaCacheKey::new(vec!["name".to_string(), "id".to_string()]);
        let key2 = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
        assert_eq!(key1, key2); // Same fields, different order
    }

    #[test]
    fn test_cache_basic_operations() {
        let cache = SchemaCache::new(10);

        let key = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
        let schema = vec!["id".to_string(), "name".to_string()];

        // Initially empty
        assert!(cache.is_empty());
        assert_eq!(cache.len(), 0);

        // Insert and retrieve
        cache.insert(key.clone(), schema.clone());
        assert_eq!(cache.len(), 1);
        assert!(!cache.is_empty());

        let retrieved = cache.get(&key);
        assert_eq!(retrieved, Some(schema));
    }

    #[test]
    fn test_cache_miss() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);

        let result = cache.get(&key);
        assert_eq!(result, None);

        let stats = cache.statistics();
        assert_eq!(stats.misses, 1);
        assert_eq!(stats.hits, 0);
    }

    #[test]
    fn test_cache_hit() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);
        let schema = vec!["id".to_string()];

        cache.insert(key.clone(), schema.clone());

        let result = cache.get(&key);
        assert_eq!(result, Some(schema));

        let stats = cache.statistics();
        assert_eq!(stats.hits, 1);
        assert_eq!(stats.misses, 0);
    }

    // NOTE: the exact hit/miss/size counts below depend on the precise call
    // sequence — keep assertions in sync if the sequence changes.
    #[test]
    fn test_cache_statistics() {
        let cache = SchemaCache::new(10);
        let key1 = SchemaCacheKey::new(vec!["id".to_string()]);
        let key2 = SchemaCacheKey::new(vec!["name".to_string()]);

        // Miss
        cache.get(&key1);

        // Insert
        cache.insert(key1.clone(), vec!["id".to_string()]);

        // Hit
        cache.get(&key1);

        // Miss
        cache.get(&key2);

        let stats = cache.statistics();
        assert_eq!(stats.hits, 1);
        assert_eq!(stats.misses, 2);
        assert_eq!(stats.size, 1);
        assert_eq!(stats.capacity, 10);
        // 1 hit of 3 lookups => hit rate ~= 1/3
        assert!((stats.hit_rate() - 0.333).abs() < 0.01);
    }

    #[test]
    fn test_cache_lru_eviction() {
        let cache = SchemaCache::new(3);

        // Insert 3 entries
        for i in 0..3 {
            let key = SchemaCacheKey::new(vec![format!("field{}", i)]);
            cache.insert(key, vec![format!("field{}", i)]);
        }

        assert_eq!(cache.len(), 3);

        // Access first entry to make it recently used
        let key0 = SchemaCacheKey::new(vec!["field0".to_string()]);
        cache.get(&key0);

        // Insert 4th entry - should evict least recently used (field1 or field2)
        let key3 = SchemaCacheKey::new(vec!["field3".to_string()]);
        cache.insert(key3.clone(), vec!["field3".to_string()]);

        assert_eq!(cache.len(), 3);

        // field0 should still be there (recently accessed)
        assert!(cache.get(&key0).is_some());

        // field3 should be there (just inserted)
        assert!(cache.get(&key3).is_some());

        let stats = cache.statistics();
        assert_eq!(stats.evictions, 1);
    }

    #[test]
    fn test_cache_clear() {
        let cache = SchemaCache::new(10);

        // Insert some entries
        for i in 0..5 {
            let key = SchemaCacheKey::new(vec![format!("field{}", i)]);
            cache.insert(key, vec![format!("field{}", i)]);
        }

        assert_eq!(cache.len(), 5);

        // Clear
        cache.clear();

        assert_eq!(cache.len(), 0);
        assert!(cache.is_empty());

        // Clearing also resets every counter.
        let stats = cache.statistics();
        assert_eq!(stats.hits, 0);
        assert_eq!(stats.misses, 0);
        assert_eq!(stats.evictions, 0);
        assert_eq!(stats.size, 0);
    }

    #[test]
    fn test_cache_capacity() {
        let cache = SchemaCache::new(42);
        assert_eq!(cache.capacity(), 42);
    }

    #[test]
    fn test_cache_update_existing() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);

        // Insert initial schema
        cache.insert(key.clone(), vec!["id".to_string()]);

        // Update with new schema
        cache.insert(key.clone(), vec!["id".to_string(), "name".to_string()]);

        let result = cache.get(&key);
        assert_eq!(result, Some(vec!["id".to_string(), "name".to_string()]));

        // Should not cause eviction when updating existing key
        assert_eq!(cache.len(), 1);
    }

    // Clones share the same Arc-backed storage, so both handles observe the
    // same entries.
    #[test]
    fn test_cache_clone() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);
        cache.insert(key.clone(), vec!["id".to_string()]);

        let cache_clone = cache.clone();

        // Both caches should have the same data
        assert_eq!(cache.len(), cache_clone.len());
        assert_eq!(cache.get(&key), cache_clone.get(&key));
    }

    #[test]
    fn test_statistics_hit_rate() {
        let stats = CacheStatistics {
            hits: 7,
            misses: 3,
            evictions: 0,
            size: 5,
            capacity: 10,
        };

        assert!((stats.hit_rate() - 0.7).abs() < 0.01);
        assert!((stats.miss_rate() - 0.3).abs() < 0.01);
    }

    // With no recorded lookups, hit rate is defined as 0.0 (and miss rate
    // as its complement, 1.0).
    #[test]
    fn test_statistics_empty() {
        let stats = CacheStatistics::default();
        assert_eq!(stats.hit_rate(), 0.0);
        assert_eq!(stats.miss_rate(), 1.0);
    }
}
575}