// hedl_json/schema_cache.rs
1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Schema caching for JSON to HEDL conversion
19//!
20//! When converting large JSON arrays to matrix lists, we often encounter the same
21//! structure repeatedly. Caching the inferred schema significantly improves performance
22//! by avoiding redundant key iteration and sorting.
23//!
24//! # Performance Impact
25//!
26//! - First schema inference: ~O(n*log(n)) where n is number of keys
27//! - Cached lookup: ~O(1) hash map lookup
28//! - Expected speedup: 30-50% for documents with repeated array structures
29//!
30//! # Thread Safety
31//!
32//! The cache is thread-safe using interior mutability with `RwLock`. Multiple threads
33//! can read from the cache concurrently, while writes are exclusive.
34
35use std::collections::HashMap;
36use std::hash::{Hash, Hasher};
37use std::sync::{Arc, RwLock};
38
/// Cache key for schema lookup
///
/// Represents the structure of a JSON object by its sorted field names.
/// This excludes metadata fields (starting with "__") and child arrays.
/// Two keys compare equal whenever they contain the same set of field names,
/// regardless of the order the names were supplied in (see `new`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SchemaCacheKey {
    /// Sorted field names (excluding metadata and children)
    ///
    /// Kept sorted by `new` so that `Eq` and `Hash` are insensitive to the
    /// original JSON field order.
    pub fields: Vec<String>,
}
48
49impl Hash for SchemaCacheKey {
50 fn hash<H: Hasher>(&self, state: &mut H) {
51 // Hash the sorted fields
52 for field in &self.fields {
53 field.hash(state);
54 }
55 }
56}
57
58impl SchemaCacheKey {
59 /// Create a cache key from a list of field names
60 ///
61 /// Automatically sorts the fields for consistent hashing.
62 #[must_use]
63 pub fn new(mut fields: Vec<String>) -> Self {
64 fields.sort();
65 Self { fields }
66 }
67}
68
/// Entry in the LRU cache
///
/// Pairs the cached schema with the bookkeeping the eviction policy uses:
/// the entry with the lowest `access_count` is evicted first, with
/// `last_access` breaking ties in favor of the oldest entry.
#[derive(Debug, Clone)]
struct CacheEntry {
    /// The cached schema (column names in order)
    schema: Vec<String>,
    /// Access counter for LRU eviction
    access_count: u64,
    /// Last access timestamp (for tie-breaking)
    last_access: std::time::Instant,
}
79
/// Statistics for cache performance monitoring
///
/// A plain data snapshot, obtained via `SchemaCache::statistics`. Counters
/// are cumulative until `reset` (or `SchemaCache::clear`) is called.
#[derive(Debug, Clone, Default)]
pub struct CacheStatistics {
    /// Number of cache hits
    pub hits: u64,
    /// Number of cache misses
    pub misses: u64,
    /// Number of evictions due to capacity
    pub evictions: u64,
    /// Current cache size
    pub size: usize,
    /// Maximum cache capacity
    pub capacity: usize,
}
94
95impl CacheStatistics {
96 /// Calculate cache hit rate (0.0 to 1.0)
97 #[must_use]
98 pub fn hit_rate(&self) -> f64 {
99 let total = self.hits + self.misses;
100 if total == 0 {
101 0.0
102 } else {
103 self.hits as f64 / total as f64
104 }
105 }
106
107 /// Calculate cache miss rate (0.0 to 1.0)
108 #[must_use]
109 pub fn miss_rate(&self) -> f64 {
110 1.0 - self.hit_rate()
111 }
112
113 /// Reset all statistics
114 pub fn reset(&mut self) {
115 self.hits = 0;
116 self.misses = 0;
117 self.evictions = 0;
118 }
119}
120
/// Thread-safe LRU schema cache
///
/// Uses interior mutability with `RwLock` for thread-safe access.
/// Multiple threads can read concurrently, while writes are exclusive.
///
/// # Examples
///
/// ```rust
/// use hedl_json::schema_cache::{SchemaCache, SchemaCacheKey};
///
/// let cache = SchemaCache::new(100);
///
/// // Cache a schema
/// let key = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
/// cache.insert(key.clone(), vec!["id".to_string(), "name".to_string()]);
///
/// // Retrieve from cache
/// if let Some(schema) = cache.get(&key) {
///     println!("Schema: {:?}", schema);
/// }
///
/// // Get statistics
/// let stats = cache.statistics();
/// println!("Hit rate: {:.2}%", stats.hit_rate() * 100.0);
/// ```
#[derive(Debug, Clone)]
pub struct SchemaCache {
    /// The underlying cache storage
    ///
    /// Because `Clone` duplicates this `Arc`, cloning a `SchemaCache` yields
    /// a cheap handle onto the *same* shared cache, not an independent copy.
    inner: Arc<RwLock<SchemaCacheInner>>,
}
151
/// Shared mutable state guarded by the `RwLock` inside `SchemaCache`.
#[derive(Debug)]
struct SchemaCacheInner {
    /// Map from cache key to schema
    cache: HashMap<SchemaCacheKey, CacheEntry>,
    /// Maximum cache size
    capacity: usize,
    /// Cache statistics
    stats: CacheStatistics,
}
161
162impl SchemaCache {
163 /// Create a new schema cache with the specified capacity
164 ///
165 /// # Arguments
166 ///
167 /// * `capacity` - Maximum number of schemas to cache (default: 100)
168 ///
169 /// # Examples
170 ///
171 /// ```rust
172 /// use hedl_json::schema_cache::SchemaCache;
173 ///
174 /// let cache = SchemaCache::new(100);
175 /// ```
176 #[must_use]
177 pub fn new(capacity: usize) -> Self {
178 Self {
179 inner: Arc::new(RwLock::new(SchemaCacheInner {
180 cache: HashMap::with_capacity(capacity),
181 capacity,
182 stats: CacheStatistics {
183 capacity,
184 ..Default::default()
185 },
186 })),
187 }
188 }
189
190 /// Get a schema from the cache
191 ///
192 /// Returns `Some(schema)` if found, `None` otherwise.
193 /// Updates access statistics for LRU tracking.
194 ///
195 /// # Arguments
196 ///
197 /// * `key` - Cache key representing the JSON structure
198 ///
199 /// # Examples
200 ///
201 /// ```rust
202 /// use hedl_json::schema_cache::{SchemaCache, SchemaCacheKey};
203 ///
204 /// let cache = SchemaCache::new(100);
205 /// let key = SchemaCacheKey::new(vec!["id".to_string()]);
206 ///
207 /// if let Some(schema) = cache.get(&key) {
208 /// println!("Found: {:?}", schema);
209 /// }
210 /// ```
211 #[must_use]
212 pub fn get(&self, key: &SchemaCacheKey) -> Option<Vec<String>> {
213 let mut inner = self.inner.write().expect("lock not poisoned");
214
215 if let Some(entry) = inner.cache.get_mut(key) {
216 // Update LRU tracking
217 entry.access_count += 1;
218 entry.last_access = std::time::Instant::now();
219
220 // Clone schema before updating stats
221 let schema = entry.schema.clone();
222
223 // Update statistics
224 inner.stats.hits += 1;
225
226 Some(schema)
227 } else {
228 // Update statistics
229 inner.stats.misses += 1;
230 None
231 }
232 }
233
234 /// Insert a schema into the cache
235 ///
236 /// If the cache is full, evicts the least recently used entry.
237 ///
238 /// # Arguments
239 ///
240 /// * `key` - Cache key representing the JSON structure
241 /// * `schema` - Schema to cache (column names in order)
242 ///
243 /// # Examples
244 ///
245 /// ```rust
246 /// use hedl_json::schema_cache::{SchemaCache, SchemaCacheKey};
247 ///
248 /// let cache = SchemaCache::new(100);
249 /// let key = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
250 /// cache.insert(key, vec!["id".to_string(), "name".to_string()]);
251 /// ```
252 pub fn insert(&self, key: SchemaCacheKey, schema: Vec<String>) {
253 let mut inner = self.inner.write().expect("lock not poisoned");
254
255 // Check if we need to evict
256 if inner.cache.len() >= inner.capacity && !inner.cache.contains_key(&key) {
257 // Find LRU entry (lowest access_count, oldest last_access)
258 if let Some(lru_key) = inner
259 .cache
260 .iter()
261 .min_by_key(|(_, entry)| (entry.access_count, entry.last_access))
262 .map(|(k, _)| k.clone())
263 {
264 inner.cache.remove(&lru_key);
265 inner.stats.evictions += 1;
266 }
267 }
268
269 // Insert or update
270 inner.cache.insert(
271 key,
272 CacheEntry {
273 schema,
274 access_count: 1,
275 last_access: std::time::Instant::now(),
276 },
277 );
278
279 // Update size statistic
280 inner.stats.size = inner.cache.len();
281 }
282
283 /// Get current cache statistics
284 ///
285 /// Returns a snapshot of cache performance metrics.
286 ///
287 /// # Examples
288 ///
289 /// ```rust
290 /// use hedl_json::schema_cache::SchemaCache;
291 ///
292 /// let cache = SchemaCache::new(100);
293 /// let stats = cache.statistics();
294 /// println!("Hit rate: {:.2}%", stats.hit_rate() * 100.0);
295 /// println!("Size: {}/{}", stats.size, stats.capacity);
296 /// ```
297 #[must_use]
298 pub fn statistics(&self) -> CacheStatistics {
299 let inner = self.inner.read().expect("lock not poisoned");
300 inner.stats.clone()
301 }
302
303 /// Clear the cache and reset statistics
304 ///
305 /// # Examples
306 ///
307 /// ```rust
308 /// use hedl_json::schema_cache::SchemaCache;
309 ///
310 /// let cache = SchemaCache::new(100);
311 /// cache.clear();
312 /// ```
313 pub fn clear(&self) {
314 let mut inner = self.inner.write().expect("lock not poisoned");
315 inner.cache.clear();
316 inner.stats.reset();
317 inner.stats.size = 0;
318 inner.stats.capacity = inner.capacity;
319 }
320
321 /// Get current cache size
322 ///
323 /// # Examples
324 ///
325 /// ```rust
326 /// use hedl_json::schema_cache::SchemaCache;
327 ///
328 /// let cache = SchemaCache::new(100);
329 /// assert_eq!(cache.len(), 0);
330 /// ```
331 #[must_use]
332 pub fn len(&self) -> usize {
333 let inner = self.inner.read().expect("lock not poisoned");
334 inner.cache.len()
335 }
336
337 /// Check if cache is empty
338 ///
339 /// # Examples
340 ///
341 /// ```rust
342 /// use hedl_json::schema_cache::SchemaCache;
343 ///
344 /// let cache = SchemaCache::new(100);
345 /// assert!(cache.is_empty());
346 /// ```
347 #[must_use]
348 pub fn is_empty(&self) -> bool {
349 self.len() == 0
350 }
351
352 /// Get cache capacity
353 ///
354 /// # Examples
355 ///
356 /// ```rust
357 /// use hedl_json::schema_cache::SchemaCache;
358 ///
359 /// let cache = SchemaCache::new(100);
360 /// assert_eq!(cache.capacity(), 100);
361 /// ```
362 #[must_use]
363 pub fn capacity(&self) -> usize {
364 let inner = self.inner.read().expect("lock not poisoned");
365 inner.capacity
366 }
367}
368
#[cfg(test)]
mod tests {
    // Unit tests for the schema cache. Several tests assert exact cumulative
    // hit/miss/eviction counts, so the order of get/insert calls matters.
    use super::*;

    // Construction must normalize field order.
    #[test]
    fn test_cache_key_new() {
        let key = SchemaCacheKey::new(vec!["name".to_string(), "id".to_string()]);
        assert_eq!(key.fields, vec!["id", "name"]); // Should be sorted
    }

    // Keys built from the same field set must compare equal.
    #[test]
    fn test_cache_key_equality() {
        let key1 = SchemaCacheKey::new(vec!["name".to_string(), "id".to_string()]);
        let key2 = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
        assert_eq!(key1, key2); // Same fields, different order
    }

    #[test]
    fn test_cache_basic_operations() {
        let cache = SchemaCache::new(10);

        let key = SchemaCacheKey::new(vec!["id".to_string(), "name".to_string()]);
        let schema = vec!["id".to_string(), "name".to_string()];

        // Initially empty
        assert!(cache.is_empty());
        assert_eq!(cache.len(), 0);

        // Insert and retrieve
        cache.insert(key.clone(), schema.clone());
        assert_eq!(cache.len(), 1);
        assert!(!cache.is_empty());

        let retrieved = cache.get(&key);
        assert_eq!(retrieved, Some(schema));
    }

    // A lookup of an absent key must return None and count as a miss.
    #[test]
    fn test_cache_miss() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);

        let result = cache.get(&key);
        assert_eq!(result, None);

        let stats = cache.statistics();
        assert_eq!(stats.misses, 1);
        assert_eq!(stats.hits, 0);
    }

    // A lookup of a present key must return the schema and count as a hit.
    #[test]
    fn test_cache_hit() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);
        let schema = vec!["id".to_string()];

        cache.insert(key.clone(), schema.clone());

        let result = cache.get(&key);
        assert_eq!(result, Some(schema));

        let stats = cache.statistics();
        assert_eq!(stats.hits, 1);
        assert_eq!(stats.misses, 0);
    }

    // Counters accumulate across a mixed sequence of misses, inserts, hits.
    #[test]
    fn test_cache_statistics() {
        let cache = SchemaCache::new(10);
        let key1 = SchemaCacheKey::new(vec!["id".to_string()]);
        let key2 = SchemaCacheKey::new(vec!["name".to_string()]);

        // Miss
        cache.get(&key1);

        // Insert
        cache.insert(key1.clone(), vec!["id".to_string()]);

        // Hit
        cache.get(&key1);

        // Miss
        cache.get(&key2);

        let stats = cache.statistics();
        assert_eq!(stats.hits, 1);
        assert_eq!(stats.misses, 2);
        assert_eq!(stats.size, 1);
        assert_eq!(stats.capacity, 10);
        // 1 hit out of 3 lookups => ~0.333
        assert!((stats.hit_rate() - 0.333).abs() < 0.01);
    }

    // Filling past capacity evicts a least-recently-used entry, and a
    // recently accessed entry survives the eviction.
    #[test]
    fn test_cache_lru_eviction() {
        let cache = SchemaCache::new(3);

        // Insert 3 entries
        for i in 0..3 {
            let key = SchemaCacheKey::new(vec![format!("field{}", i)]);
            cache.insert(key, vec![format!("field{}", i)]);
        }

        assert_eq!(cache.len(), 3);

        // Access first entry to make it recently used
        let key0 = SchemaCacheKey::new(vec!["field0".to_string()]);
        cache.get(&key0);

        // Insert 4th entry - should evict least recently used (field1 or field2)
        let key3 = SchemaCacheKey::new(vec!["field3".to_string()]);
        cache.insert(key3.clone(), vec!["field3".to_string()]);

        assert_eq!(cache.len(), 3);

        // field0 should still be there (recently accessed)
        assert!(cache.get(&key0).is_some());

        // field3 should be there (just inserted)
        assert!(cache.get(&key3).is_some());

        let stats = cache.statistics();
        assert_eq!(stats.evictions, 1);
    }

    // clear() must empty the cache and zero all statistics.
    #[test]
    fn test_cache_clear() {
        let cache = SchemaCache::new(10);

        // Insert some entries
        for i in 0..5 {
            let key = SchemaCacheKey::new(vec![format!("field{}", i)]);
            cache.insert(key, vec![format!("field{}", i)]);
        }

        assert_eq!(cache.len(), 5);

        // Clear
        cache.clear();

        assert_eq!(cache.len(), 0);
        assert!(cache.is_empty());

        let stats = cache.statistics();
        assert_eq!(stats.hits, 0);
        assert_eq!(stats.misses, 0);
        assert_eq!(stats.evictions, 0);
        assert_eq!(stats.size, 0);
    }

    #[test]
    fn test_cache_capacity() {
        let cache = SchemaCache::new(42);
        assert_eq!(cache.capacity(), 42);
    }

    // Re-inserting an existing key overwrites the schema without eviction.
    #[test]
    fn test_cache_update_existing() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);

        // Insert initial schema
        cache.insert(key.clone(), vec!["id".to_string()]);

        // Update with new schema
        cache.insert(key.clone(), vec!["id".to_string(), "name".to_string()]);

        let result = cache.get(&key);
        assert_eq!(result, Some(vec!["id".to_string(), "name".to_string()]));

        // Should not cause eviction when updating existing key
        assert_eq!(cache.len(), 1);
    }

    // Clones share the same underlying Arc'd storage.
    #[test]
    fn test_cache_clone() {
        let cache = SchemaCache::new(10);
        let key = SchemaCacheKey::new(vec!["id".to_string()]);
        cache.insert(key.clone(), vec!["id".to_string()]);

        let cache_clone = cache.clone();

        // Both caches should have the same data
        assert_eq!(cache.len(), cache_clone.len());
        assert_eq!(cache.get(&key), cache_clone.get(&key));
    }

    #[test]
    fn test_statistics_hit_rate() {
        let stats = CacheStatistics {
            hits: 7,
            misses: 3,
            evictions: 0,
            size: 5,
            capacity: 10,
        };

        assert!((stats.hit_rate() - 0.7).abs() < 0.01);
        assert!((stats.miss_rate() - 0.3).abs() < 0.01);
    }

    // With no lookups recorded, hit_rate is defined as 0.0 and miss_rate
    // (its complement) is therefore 1.0.
    #[test]
    fn test_statistics_empty() {
        let stats = CacheStatistics::default();
        assert_eq!(stats.hit_rate(), 0.0);
        assert_eq!(stats.miss_rate(), 1.0);
    }
}