Skip to main content

tensorlogic_oxirs_bridge/schema/
cache.rs

1//! Caching system for parsed RDF schemas and SymbolTables.
2//!
3//! This module provides two complementary caching mechanisms to optimize repeated RDF parsing:
4//!
5//! - [`SchemaCache`] - In-memory cache with TTL and LRU eviction
6//! - [`PersistentCache`] - File-based cache that persists across process restarts
7//!
8//! # Performance Benefits
9//!
10//! Caching can provide **10-50x speedups** for repeated operations on the same RDF schemas.
11//! Benchmarks show:
12//! - Cold parse: ~2-5ms per schema
13//! - Memory cache hit: ~0.1ms (20-50x faster)
14//! - Disk cache hit: ~0.5ms (4-10x faster)
15//!
16//! # Examples
17//!
18//! ## In-Memory Caching
19//!
20//! ```
21//! use tensorlogic_oxirs_bridge::SchemaCache;
22//! use tensorlogic_adapters::SymbolTable;
23//! use anyhow::Result;
24//!
25//! fn main() -> Result<()> {
26//!     let mut cache = SchemaCache::new();
27//!     let turtle = "@prefix ex: <http://example.org/> .";
28//!
29//!     // First access - cache miss
30//!     if let Some(table) = cache.get_symbol_table(turtle) {
31//!         println!("Cache hit!");
32//!     } else {
33//!         println!("Cache miss - parsing...");
34//!         // ... parse and analyze ...
35//!         let table = SymbolTable::new();
36//!         cache.put_symbol_table(turtle, table);
37//!     }
38//!
39//!     // Second access - cache hit
40//!     assert!(cache.get_symbol_table(turtle).is_some());
41//!
42//!     // Check statistics
43//!     let stats = cache.stats();
44//!     println!("Hit rate: {:.1}%", stats.hit_rate * 100.0);
45//!     Ok(())
46//! }
47//! ```
48//!
49//! ## File-Based Persistent Caching
50//!
51//! ```no_run
52//! use tensorlogic_oxirs_bridge::PersistentCache;
53//! use tensorlogic_adapters::SymbolTable;
54//! use anyhow::Result;
55//!
56//! fn main() -> Result<()> {
57//!     let cache_dir = std::env::temp_dir().join("my_cache");
58//!     let mut cache = PersistentCache::new(&cache_dir)?;
59//!
60//!     let turtle = "@prefix ex: <http://example.org/> .";
61//!
62//!     // Try loading from disk
63//!     if let Some(table) = cache.load_symbol_table(turtle)? {
64//!         println!("Loaded from disk cache!");
65//!     } else {
66//!         println!("Not in cache - parsing...");
67//!         // ... parse and analyze ...
68//!         let table = SymbolTable::new();
69//!         cache.save_symbol_table(turtle, &table)?;
70//!     }
71//!     Ok(())
72//! }
73//! ```
74//!
75//! # See Also
76//!
77//! - [`SchemaAnalyzer`](crate::SchemaAnalyzer) - The main schema parsing interface
78//! - [Example 08](https://github.com/cool-japan/tensorlogic/blob/main/crates/tensorlogic-oxirs-bridge/examples/08_performance_features.rs) - Performance features demonstration
79
80use anyhow::{Context, Result};
81use serde::{Deserialize, Serialize};
82use std::collections::hash_map::DefaultHasher;
83use std::collections::HashMap;
84use std::hash::{Hash, Hasher};
85use std::path::{Path, PathBuf};
86use std::time::{Duration, SystemTime};
87use tensorlogic_adapters::SymbolTable;
88
89use super::{ClassInfo, PropertyInfo};
90
91/// Type alias for parsed schema data (classes and properties)
92type ParsedSchema = (
93    indexmap::IndexMap<String, ClassInfo>,
94    indexmap::IndexMap<String, PropertyInfo>,
95);
96
97/// Cache entry with expiration tracking and access statistics.
98///
99/// Internal structure used by [`SchemaCache`] to track cached values with TTL and LRU metadata.
100#[derive(Debug, Clone, Serialize, Deserialize)]
101struct CacheEntry<T> {
102    value: T,
103    created_at: SystemTime,
104    last_accessed: SystemTime,
105    access_count: usize,
106}
107
108impl<T> CacheEntry<T> {
109    fn new(value: T) -> Self {
110        let now = SystemTime::now();
111        Self {
112            value,
113            created_at: now,
114            last_accessed: now,
115            access_count: 0,
116        }
117    }
118
119    fn access(&mut self) -> &T {
120        self.last_accessed = SystemTime::now();
121        self.access_count += 1;
122        &self.value
123    }
124
125    fn is_expired(&self, ttl: Duration) -> bool {
126        self.created_at
127            .elapsed()
128            .map(|age| age > ttl)
129            .unwrap_or(false)
130    }
131}
132
133/// Serializable schema cache data.
134///
135/// Internal structure for storing parsed RDF schemas before conversion to symbol tables.
136#[derive(Debug, Clone, Serialize, Deserialize)]
137struct SchemaCacheData {
138    classes: indexmap::IndexMap<String, ClassInfo>,
139    properties: indexmap::IndexMap<String, PropertyInfo>,
140}
141
142/// In-memory cache for parsed RDF schemas and symbol tables.
143///
144/// Provides fast caching with content-based hashing, TTL expiration, and LRU eviction.
145/// Ideal for repeated parsing of the same RDF schemas during a single session.
146///
147/// # Features
148///
149/// - **Content-based hashing**: Automatically deduplicates identical schemas
150/// - **TTL expiration**: Configurable time-to-live (default: 1 hour)
151/// - **LRU eviction**: Automatic removal of least-recently-used entries when full
152/// - **Hit/miss tracking**: Built-in statistics for cache performance monitoring
153/// - **Dual storage**: Caches both raw parsed schemas and symbol tables
154///
155/// # Performance
156///
157/// - **Lookup**: O(1) average case (HashMap-based)
158/// - **Insertion**: O(1) average case
159/// - **Space overhead**: ~2-3x original schema size (includes metadata)
160///
161/// # Examples
162///
163/// ## Basic Usage
164///
165/// ```
166/// use tensorlogic_oxirs_bridge::{SchemaCache, SchemaAnalyzer};
167/// use anyhow::Result;
168///
169/// fn main() -> Result<()> {
170///     let mut cache = SchemaCache::new();
171///     let turtle = r#"
172///         @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
173///         @prefix ex: <http://example.org/> .
174///         ex:Person a rdfs:Class .
175///     "#;
176///
177///     // First parse - cache miss
178///     let table1 = if let Some(cached) = cache.get_symbol_table(turtle) {
179///         cached
180///     } else {
181///         let mut analyzer = SchemaAnalyzer::new();
182///         analyzer.load_turtle(turtle)?;
183///         analyzer.analyze()?;
184///         let table = analyzer.to_symbol_table()?;
185///         cache.put_symbol_table(turtle, table.clone());
186///         table
187///     };
188///
189///     // Second access - cache hit (much faster)
190///     let table2 = cache.get_symbol_table(turtle).expect("should be cached");
191///
192///     // Statistics
193///     let stats = cache.stats();
194///     assert_eq!(stats.total_hits, 1);
195///     assert_eq!(stats.total_misses, 1);
196///     assert_eq!(stats.hit_rate, 0.5);
197///     Ok(())
198/// }
199/// ```
200///
201/// ## Custom TTL and Size
202///
203/// ```
204/// use tensorlogic_oxirs_bridge::SchemaCache;
205/// use std::time::Duration;
206///
207/// // Cache with 30-minute TTL and max 50 entries
208/// let cache = SchemaCache::with_settings(
209///     Duration::from_secs(30 * 60),  // TTL: 30 minutes
210///     50                              // Max size: 50 entries
211/// );
212/// ```
213///
214/// ## Cleanup
215///
216/// ```
217/// use tensorlogic_oxirs_bridge::SchemaCache;
218///
219/// let mut cache = SchemaCache::new();
220/// // ... use cache ...
221///
222/// // Remove expired entries
223/// cache.cleanup_expired();
224///
225/// // Clear everything
226/// cache.clear();
227/// ```
228///
229/// # See Also
230///
231/// - [`PersistentCache`] - File-based caching for cross-session persistence
232/// - [`CacheStats`] - Cache performance statistics
233#[derive(Debug)]
234pub struct SchemaCache {
235    /// Content hash → Parsed schema
236    schemas: HashMap<u64, CacheEntry<SchemaCacheData>>,
237
238    /// Content hash → SymbolTable
239    symbol_tables: HashMap<u64, CacheEntry<SymbolTable>>,
240
241    /// Time-to-live for cache entries
242    ttl: Duration,
243
244    /// Maximum cache size (number of entries)
245    max_size: usize,
246
247    /// Cache statistics
248    hits: usize,
249    misses: usize,
250}
251
252impl SchemaCache {
253    /// Creates a new cache with default settings.
254    ///
255    /// Default configuration:
256    /// - **TTL**: 1 hour (3600 seconds)
257    /// - **Max entries**: 100
258    ///
259    /// # Examples
260    ///
261    /// ```
262    /// use tensorlogic_oxirs_bridge::SchemaCache;
263    ///
264    /// let cache = SchemaCache::new();
265    /// ```
266    pub fn new() -> Self {
267        Self::with_settings(Duration::from_secs(3600), 100)
268    }
269
270    /// Creates a cache with custom TTL and maximum size.
271    ///
272    /// # Arguments
273    ///
274    /// * `ttl` - Time-to-live for cache entries
275    /// * `max_size` - Maximum number of entries before LRU eviction kicks in
276    ///
277    /// # Examples
278    ///
279    /// ```
280    /// use tensorlogic_oxirs_bridge::SchemaCache;
281    /// use std::time::Duration;
282    ///
283    /// // 5-minute TTL, max 25 entries
284    /// let cache = SchemaCache::with_settings(Duration::from_secs(300), 25);
285    /// ```
286    pub fn with_settings(ttl: Duration, max_size: usize) -> Self {
287        Self {
288            schemas: HashMap::new(),
289            symbol_tables: HashMap::new(),
290            ttl,
291            max_size,
292            hits: 0,
293            misses: 0,
294        }
295    }
296
297    /// Calculate hash of content
298    fn hash_content(content: &str) -> u64 {
299        let mut hasher = DefaultHasher::new();
300        content.hash(&mut hasher);
301        hasher.finish()
302    }
303
304    /// Get cached schema by content hash
305    pub fn get_schema(&mut self, content: &str) -> Option<ParsedSchema> {
306        let hash = Self::hash_content(content);
307
308        if let Some(entry) = self.schemas.get_mut(&hash) {
309            if !entry.is_expired(self.ttl) {
310                self.hits += 1;
311                let data = entry.access();
312                return Some((data.classes.clone(), data.properties.clone()));
313            } else {
314                // Remove expired entry
315                self.schemas.remove(&hash);
316            }
317        }
318
319        self.misses += 1;
320        None
321    }
322
323    /// Cache a parsed schema
324    pub fn put_schema(
325        &mut self,
326        content: &str,
327        classes: indexmap::IndexMap<String, ClassInfo>,
328        properties: indexmap::IndexMap<String, PropertyInfo>,
329    ) {
330        let hash = Self::hash_content(content);
331
332        // Evict oldest if at capacity
333        if self.schemas.len() >= self.max_size {
334            if let Some(oldest_key) = self.find_oldest_schema() {
335                self.schemas.remove(&oldest_key);
336            }
337        }
338
339        self.schemas.insert(
340            hash,
341            CacheEntry::new(SchemaCacheData {
342                classes,
343                properties,
344            }),
345        );
346    }
347
348    /// Get cached SymbolTable by content hash
349    pub fn get_symbol_table(&mut self, content: &str) -> Option<SymbolTable> {
350        let hash = Self::hash_content(content);
351
352        if let Some(entry) = self.symbol_tables.get_mut(&hash) {
353            if !entry.is_expired(self.ttl) {
354                self.hits += 1;
355                return Some(entry.access().clone());
356            } else {
357                // Remove expired entry
358                self.symbol_tables.remove(&hash);
359            }
360        }
361
362        self.misses += 1;
363        None
364    }
365
366    /// Cache a SymbolTable
367    pub fn put_symbol_table(&mut self, content: &str, table: SymbolTable) {
368        let hash = Self::hash_content(content);
369
370        // Evict oldest if at capacity
371        if self.symbol_tables.len() >= self.max_size {
372            if let Some(oldest_key) = self.find_oldest_symbol_table() {
373                self.symbol_tables.remove(&oldest_key);
374            }
375        }
376
377        self.symbol_tables.insert(hash, CacheEntry::new(table));
378    }
379
380    /// Find oldest schema entry for eviction
381    fn find_oldest_schema(&self) -> Option<u64> {
382        self.schemas
383            .iter()
384            .min_by_key(|(_, entry)| entry.last_accessed)
385            .map(|(k, _)| *k)
386    }
387
388    /// Find oldest symbol table entry for eviction
389    fn find_oldest_symbol_table(&self) -> Option<u64> {
390        self.symbol_tables
391            .iter()
392            .min_by_key(|(_, entry)| entry.last_accessed)
393            .map(|(k, _)| *k)
394    }
395
396    /// Clear all expired entries
397    pub fn cleanup_expired(&mut self) {
398        self.schemas.retain(|_, entry| !entry.is_expired(self.ttl));
399        self.symbol_tables
400            .retain(|_, entry| !entry.is_expired(self.ttl));
401    }
402
403    /// Clear all cache entries
404    pub fn clear(&mut self) {
405        self.schemas.clear();
406        self.symbol_tables.clear();
407        self.hits = 0;
408        self.misses = 0;
409    }
410
411    /// Get cache statistics
412    pub fn stats(&self) -> CacheStats {
413        CacheStats {
414            schema_entries: self.schemas.len(),
415            symbol_table_entries: self.symbol_tables.len(),
416            total_hits: self.hits,
417            total_misses: self.misses,
418            hit_rate: if self.hits + self.misses > 0 {
419                (self.hits as f64) / ((self.hits + self.misses) as f64)
420            } else {
421                0.0
422            },
423        }
424    }
425}
426
427impl Default for SchemaCache {
428    fn default() -> Self {
429        Self::new()
430    }
431}
432
433/// Cache statistics
434#[derive(Debug, Clone, Serialize, Deserialize)]
435pub struct CacheStats {
436    pub schema_entries: usize,
437    pub symbol_table_entries: usize,
438    pub total_hits: usize,
439    pub total_misses: usize,
440    pub hit_rate: f64,
441}
442
443impl std::fmt::Display for CacheStats {
444    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
445        writeln!(f, "Cache Statistics:")?;
446        writeln!(f, "  Schema entries: {}", self.schema_entries)?;
447        writeln!(f, "  Symbol table entries: {}", self.symbol_table_entries)?;
448        writeln!(f, "  Total hits: {}", self.total_hits)?;
449        writeln!(f, "  Total misses: {}", self.total_misses)?;
450        writeln!(f, "  Hit rate: {:.2}%", self.hit_rate * 100.0)?;
451        Ok(())
452    }
453}
454
455/// File-based persistent cache
456pub struct PersistentCache {
457    cache_dir: PathBuf,
458    in_memory: SchemaCache,
459}
460
461impl PersistentCache {
462    /// Create a new persistent cache with a directory
463    pub fn new(cache_dir: impl AsRef<Path>) -> Result<Self> {
464        let cache_dir = cache_dir.as_ref().to_path_buf();
465        std::fs::create_dir_all(&cache_dir).context("Failed to create cache directory")?;
466
467        Ok(Self {
468            cache_dir,
469            in_memory: SchemaCache::new(),
470        })
471    }
472
473    /// Get cache file path for content
474    fn cache_path(&self, content: &str, suffix: &str) -> PathBuf {
475        let hash = SchemaCache::hash_content(content);
476        self.cache_dir.join(format!("{:016x}.{}", hash, suffix))
477    }
478
479    /// Load SymbolTable from cache (memory or disk)
480    pub fn load_symbol_table(&mut self, content: &str) -> Result<Option<SymbolTable>> {
481        // Try memory first
482        if let Some(table) = self.in_memory.get_symbol_table(content) {
483            return Ok(Some(table));
484        }
485
486        // Try disk
487        let path = self.cache_path(content, "symboltable.json");
488        if path.exists() {
489            let json = std::fs::read_to_string(&path).context("Failed to read cache file")?;
490            let table: SymbolTable =
491                serde_json::from_str(&json).context("Failed to deserialize SymbolTable")?;
492
493            // Store in memory for future access
494            self.in_memory.put_symbol_table(content, table.clone());
495
496            return Ok(Some(table));
497        }
498
499        Ok(None)
500    }
501
502    /// Save SymbolTable to cache (memory and disk)
503    pub fn save_symbol_table(&mut self, content: &str, table: &SymbolTable) -> Result<()> {
504        // Save to memory
505        self.in_memory.put_symbol_table(content, table.clone());
506
507        // Save to disk
508        let path = self.cache_path(content, "symboltable.json");
509        let json =
510            serde_json::to_string_pretty(table).context("Failed to serialize SymbolTable")?;
511        std::fs::write(&path, json).context("Failed to write cache file")?;
512
513        Ok(())
514    }
515
516    /// Load schema from cache (memory or disk)
517    pub fn load_schema(&mut self, content: &str) -> Result<Option<ParsedSchema>> {
518        // Try memory first
519        if let Some(result) = self.in_memory.get_schema(content) {
520            return Ok(Some(result));
521        }
522
523        // Try disk
524        let path = self.cache_path(content, "schema.json");
525        if path.exists() {
526            let json = std::fs::read_to_string(&path).context("Failed to read cache file")?;
527            let data: SchemaCacheData =
528                serde_json::from_str(&json).context("Failed to deserialize schema")?;
529
530            // Store in memory for future access
531            self.in_memory
532                .put_schema(content, data.classes.clone(), data.properties.clone());
533
534            return Ok(Some((data.classes, data.properties)));
535        }
536
537        Ok(None)
538    }
539
540    /// Save schema to cache (memory and disk)
541    pub fn save_schema(
542        &mut self,
543        content: &str,
544        classes: &indexmap::IndexMap<String, ClassInfo>,
545        properties: &indexmap::IndexMap<String, PropertyInfo>,
546    ) -> Result<()> {
547        // Save to memory
548        self.in_memory
549            .put_schema(content, classes.clone(), properties.clone());
550
551        // Save to disk
552        let path = self.cache_path(content, "schema.json");
553        let data = SchemaCacheData {
554            classes: classes.clone(),
555            properties: properties.clone(),
556        };
557        let json = serde_json::to_string_pretty(&data).context("Failed to serialize schema")?;
558        std::fs::write(&path, json).context("Failed to write cache file")?;
559
560        Ok(())
561    }
562
563    /// Clear all cache files
564    pub fn clear_all(&mut self) -> Result<()> {
565        self.in_memory.clear();
566
567        for entry in std::fs::read_dir(&self.cache_dir)? {
568            let entry = entry?;
569            if entry.path().is_file() {
570                std::fs::remove_file(entry.path())?;
571            }
572        }
573
574        Ok(())
575    }
576
577    /// Get cache statistics
578    pub fn stats(&self) -> CacheStats {
579        self.in_memory.stats()
580    }
581}
582
#[cfg(test)]
mod tests {
    use super::*;
    use std::thread;
    use std::time::Duration;

    /// A schema lookup misses before the value is stored and hits afterwards.
    #[test]
    fn test_schema_cache_basic() {
        let mut cache = SchemaCache::new();

        let content = "@prefix ex: <http://example.org/> .";
        let classes = indexmap::IndexMap::new();
        let properties = indexmap::IndexMap::new();

        // First access - miss
        assert!(cache.get_schema(content).is_none());
        assert_eq!(cache.stats().total_misses, 1);

        // Store
        cache.put_schema(content, classes.clone(), properties.clone());

        // Second access - hit
        assert!(cache.get_schema(content).is_some());
        assert_eq!(cache.stats().total_hits, 1);
    }

    /// A symbol-table lookup misses before the value is stored and hits afterwards.
    #[test]
    fn test_symbol_table_cache() {
        let mut cache = SchemaCache::new();

        let content = "@prefix ex: <http://example.org/> .";
        let table = SymbolTable::new();

        // First access - miss
        assert!(cache.get_symbol_table(content).is_none());

        // Store
        cache.put_symbol_table(content, table.clone());

        // Second access - hit
        assert!(cache.get_symbol_table(content).is_some());
    }

    /// Entries stop being returned once they are older than the TTL.
    #[test]
    fn test_cache_expiration() {
        let mut cache = SchemaCache::with_settings(Duration::from_millis(100), 10);

        let content = "@prefix ex: <http://example.org/> .";
        let table = SymbolTable::new();

        cache.put_symbol_table(content, table);

        // Should hit immediately
        assert!(cache.get_symbol_table(content).is_some());

        // Wait for expiration
        thread::sleep(Duration::from_millis(150));

        // Should miss after expiration
        assert!(cache.get_symbol_table(content).is_none());
    }

    /// Inserting beyond the maximum size keeps the entry count at the maximum.
    #[test]
    fn test_cache_eviction() {
        let mut cache = SchemaCache::with_settings(Duration::from_secs(3600), 2);

        let table = SymbolTable::new();

        // Fill cache
        cache.put_symbol_table("content1", table.clone());
        cache.put_symbol_table("content2", table.clone());

        // Add third item - should evict oldest
        cache.put_symbol_table("content3", table.clone());

        // Cache should still have 2 entries
        assert_eq!(cache.stats().symbol_table_entries, 2);
    }

    /// Eviction removes the least-recently-used entry, not the oldest insert.
    #[test]
    fn test_cache_eviction_is_lru() {
        let mut cache = SchemaCache::with_settings(Duration::from_secs(3600), 2);
        let table = SymbolTable::new();
        // Short pauses keep the access timestamps strictly ordered even on
        // clocks with coarse resolution.
        let pause = Duration::from_millis(5);

        cache.put_symbol_table("content1", table.clone());
        thread::sleep(pause);
        cache.put_symbol_table("content2", table.clone());
        thread::sleep(pause);

        // Touch the first entry so the second becomes least recently used.
        assert!(cache.get_symbol_table("content1").is_some());
        thread::sleep(pause);

        cache.put_symbol_table("content3", table);

        assert!(cache.get_symbol_table("content1").is_some());
        assert!(cache.get_symbol_table("content2").is_none());
        assert!(cache.get_symbol_table("content3").is_some());
    }

    /// Hits, misses and the derived hit rate are tracked per lookup.
    #[test]
    fn test_cache_stats() {
        let mut cache = SchemaCache::new();

        let content = "@prefix ex: <http://example.org/> .";
        let table = SymbolTable::new();

        cache.get_symbol_table(content); // Miss
        cache.put_symbol_table(content, table);
        cache.get_symbol_table(content); // Hit
        cache.get_symbol_table(content); // Hit

        let stats = cache.stats();
        assert_eq!(stats.total_hits, 2);
        assert_eq!(stats.total_misses, 1);
        assert!((stats.hit_rate - 0.666).abs() < 0.01);
    }

    /// `clear` drops all entries and resets the counters.
    #[test]
    fn test_cache_clear() {
        let mut cache = SchemaCache::new();

        let content = "@prefix ex: <http://example.org/> .";
        let table = SymbolTable::new();

        cache.put_symbol_table(content, table);
        assert_eq!(cache.stats().symbol_table_entries, 1);

        cache.clear();
        assert_eq!(cache.stats().symbol_table_entries, 0);
        assert_eq!(cache.stats().total_hits, 0);
    }

    /// Saved values are served from memory, survive in a fresh instance via
    /// the disk, and disappear after `clear_all`.
    #[test]
    fn test_persistent_cache() -> Result<()> {
        // Per-process directory so concurrent test runs do not interfere.
        let temp_dir = std::env::temp_dir().join(format!(
            "tensorlogic_oxirs_test_cache_{}",
            std::process::id()
        ));

        let mut cache = PersistentCache::new(&temp_dir)?;

        let content = "@prefix ex: <http://example.org/> .";
        let table = SymbolTable::new();

        // Save
        cache.save_symbol_table(content, &table)?;

        // Load through the same instance (served from memory)
        let from_memory = cache.load_symbol_table(content)?;

        // Load through a fresh instance (must come from disk)
        let mut reopened = PersistentCache::new(&temp_dir)?;
        let from_disk = reopened.load_symbol_table(content)?;
        let reopened_stats = reopened.stats();

        // Clear, then confirm a fresh instance no longer finds the value
        cache.clear_all()?;
        let mut after_clear = PersistentCache::new(&temp_dir)?;
        let missing = after_clear.load_symbol_table(content)?;

        // Clean up before asserting so a failure does not leave files behind
        std::fs::remove_dir_all(&temp_dir)?;

        assert!(from_memory.is_some());
        assert!(from_disk.is_some());
        assert_eq!(reopened_stats.total_hits, 0);
        assert_eq!(reopened_stats.total_misses, 1);
        assert!(missing.is_none());

        Ok(())
    }
}