Skip to main content

hedl_lsp/
document_manager.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Document management with caching and LRU eviction.
19//!
20//! This module handles document storage, caching, and lifecycle management for the LSP server.
21//! It provides efficient document access with configurable cache limits and automatic LRU eviction.
22//!
23//! # Responsibilities
24//!
25//! - Document storage and retrieval
26//! - Content hash-based change detection
27//! - LRU-based cache eviction
28//! - Cache statistics tracking
29//! - Document size limits enforcement
30//!
31//! # Design
32//!
33//! The `DocumentCache` maintains a cache of analyzed documents with the following features:
34//!
35//! - **LRU Eviction**: Automatically evicts least recently used documents when cache is full
36//! - **Dirty Tracking**: Tracks which documents need re-analysis via content hashing
37//! - **Access Tracking**: Updates last access time for LRU ordering
38//! - **Size Limits**: Enforces maximum document size to prevent memory exhaustion
39//! - **Statistics**: Provides cache hit/miss/eviction metrics for monitoring
40
41use crate::analysis::AnalyzedDocument;
42use dashmap::DashMap;
43use parking_lot::Mutex;
44use ropey::Rope;
45use std::sync::Arc;
46use tower_lsp::lsp_types::Url;
47use tracing::{debug, error, warn};
48
49// Re-export constants for backwards compatibility
50pub use crate::constants::{DEFAULT_MAX_CACHE_SIZE, DEFAULT_MAX_DOCUMENT_SIZE};
51
52/// Document state with caching and dirty tracking.
53///
54/// Each document is stored with its content (as a Rope for efficient editing),
55/// analysis results, content hash for change detection, and dirty flag.
56pub struct DocumentState {
57    /// Current rope content for efficient editing operations.
58    pub rope: Rope,
59    /// Cached analysis result from last parse (Arc-wrapped to avoid expensive clones).
60    pub analysis: Arc<AnalyzedDocument>,
61    /// Content hash for change detection.
62    pub content_hash: u64,
63    /// Dirty flag: true if content changed since last analysis.
64    pub dirty: bool,
65    /// Last access timestamp for LRU eviction.
66    pub last_access: std::time::Instant,
67}
68
/// Counters describing how the document cache is being used.
///
/// A value of this type is a plain snapshot: it is cloned out of the cache and
/// does not change afterwards. Use it for monitoring and for tuning the
/// configured limits.
#[derive(Clone, Debug, Default)]
pub struct CacheStatistics {
    /// How many times a requested document was already present in the cache.
    pub hits: u64,
    /// How many times a requested document was not present in the cache.
    pub misses: u64,
    /// How many documents were evicted because the cache size limit was reached.
    pub evictions: u64,
    /// Number of documents held by the cache when the snapshot was taken.
    pub current_size: usize,
    /// Configured maximum number of cached documents.
    pub max_size: usize,
}
86
87/// Document manager with LRU caching and dirty tracking.
88///
89/// The `DocumentCache` is the single source of truth for all document state
90/// in the LSP server. It handles document lifecycle, caching, and eviction.
91///
92/// # Thread Safety
93///
94/// The `DocumentCache` uses `DashMap` for concurrent access and `parking_lot::Mutex`
95/// for fine-grained locking. It can be safely shared across threads.
96///
97/// # Example
98///
99/// ```no_run
100/// use hedl_lsp::document_manager::DocumentCache;
101///
102/// let manager = DocumentCache::new(1000, 500 * 1024 * 1024);
103///
104/// // Insert a document
105/// // manager.insert_or_update(uri, content);
106///
107/// // Get a document
108/// // let doc = manager.get(&uri);
109/// ```
110pub struct DocumentCache {
111    /// Document store: URI -> document state.
112    documents: DashMap<Url, Arc<Mutex<DocumentState>>>,
113    /// Cache statistics for monitoring.
114    cache_stats: Arc<Mutex<CacheStatistics>>,
115    /// Maximum number of documents to cache.
116    max_cache_size: Arc<parking_lot::RwLock<usize>>,
117    /// Maximum document size in bytes.
118    max_document_size: Arc<parking_lot::RwLock<usize>>,
119}
120
121impl DocumentCache {
122    /// Create a new document manager with specified limits.
123    ///
124    /// # Parameters
125    ///
126    /// - `max_cache_size`: Maximum number of documents to cache (default: 1000)
127    /// - `max_document_size`: Maximum document size in bytes (default: 500 MB)
128    ///
129    /// # Example
130    ///
131    /// ```no_run
132    /// use hedl_lsp::document_manager::DocumentCache;
133    ///
134    /// // Create with custom limits
135    /// let manager = DocumentCache::new(2000, 1024 * 1024 * 1024);
136    /// ```
137    #[must_use]
138    pub fn new(max_cache_size: usize, max_document_size: usize) -> Self {
139        Self {
140            documents: DashMap::new(),
141            cache_stats: Arc::new(Mutex::new(CacheStatistics {
142                max_size: max_cache_size,
143                ..Default::default()
144            })),
145            max_cache_size: Arc::new(parking_lot::RwLock::new(max_cache_size)),
146            max_document_size: Arc::new(parking_lot::RwLock::new(max_document_size)),
147        }
148    }
149
150    /// Get current cache statistics.
151    ///
152    /// This method provides a snapshot of cache performance metrics.
153    #[must_use]
154    pub fn statistics(&self) -> CacheStatistics {
155        let mut stats = self.cache_stats.lock();
156        stats.current_size = self.documents.len();
157        stats.clone()
158    }
159
160    /// Update maximum cache size (can be called during runtime).
161    pub fn set_max_cache_size(&self, new_max: usize) {
162        let mut max = self.max_cache_size.write();
163        *max = new_max;
164        let mut stats = self.cache_stats.lock();
165        stats.max_size = new_max;
166        debug!("Cache max size updated to: {}", new_max);
167    }
168
169    /// Get current maximum cache size.
170    #[must_use]
171    pub fn max_cache_size(&self) -> usize {
172        *self.max_cache_size.read()
173    }
174
175    /// Update maximum document size (can be called during runtime).
176    pub fn set_max_document_size(&self, new_max: usize) {
177        let mut max = self.max_document_size.write();
178        *max = new_max;
179        debug!("Max document size updated to: {} bytes", new_max);
180    }
181
182    /// Get current maximum document size.
183    #[must_use]
184    pub fn max_document_size(&self) -> usize {
185        *self.max_document_size.read()
186    }
187
188    /// Compute a simple hash for change detection.
189    fn hash_content(content: &str) -> u64 {
190        use std::collections::hash_map::DefaultHasher;
191        use std::hash::{Hash, Hasher};
192        let mut hasher = DefaultHasher::new();
193        content.hash(&mut hasher);
194        hasher.finish()
195    }
196
197    /// Insert or update a document.
198    ///
199    /// If the document already exists, updates its content and marks it as dirty
200    /// if the content changed. If it's a new document, performs initial analysis.
201    ///
202    /// # Memory Management
203    ///
204    /// This method enforces the maximum document size limit. Documents exceeding
205    /// the limit are rejected and this method returns `false`.
206    ///
207    /// # Returns
208    ///
209    /// Returns `true` if the document was successfully inserted/updated,
210    /// `false` if rejected due to size constraints.
211    ///
212    /// # Error Handling
213    ///
214    /// - Size limit violations: Logged as warnings and rejected
215    /// - Cache eviction: Logged with LRU document details
216    /// - Content hashing: Hash collisions are statistically impossible but detected
217    pub fn insert_or_update(&self, uri: &Url, content: &str) -> bool {
218        // Memory management: Enforce maximum document size
219        let max_size = self.max_document_size();
220        if content.len() > max_size {
221            warn!(
222                "Document size limit exceeded for {}: {} bytes > {} bytes maximum (rejected)",
223                uri,
224                content.len(),
225                max_size
226            );
227            return false;
228        }
229
230        let rope = Rope::from_str(content);
231        let content_hash = Self::hash_content(content);
232        let line_count = content.lines().count();
233
234        if let Some(state_ref) = self.documents.get(uri) {
235            // Cache hit - existing document
236            {
237                let mut stats = self.cache_stats.lock();
238                stats.hits += 1;
239            }
240
241            let mut state = state_ref.lock();
242            // Only update if content actually changed
243            if state.content_hash == content_hash {
244                debug!(
245                    "Document content unchanged for {} (hash: {:#x}), updating access time only",
246                    uri, content_hash
247                );
248                // Update access time even if content hasn't changed
249                state.last_access = std::time::Instant::now();
250            } else {
251                debug!(
252                    "Document content changed for {}: {} -> {} bytes, {} lines",
253                    uri,
254                    state.rope.len_bytes(),
255                    content.len(),
256                    line_count
257                );
258                state.rope = rope;
259                state.content_hash = content_hash;
260                state.dirty = true;
261                state.last_access = std::time::Instant::now();
262            }
263        } else {
264            // Cache miss - new document
265            {
266                let mut stats = self.cache_stats.lock();
267                stats.misses += 1;
268            }
269
270            debug!(
271                "New document registered: {} ({} bytes, {} lines)",
272                uri,
273                content.len(),
274                line_count
275            );
276
277            // Check if we need to evict before inserting
278            let max_cache = self.max_cache_size();
279            if self.documents.len() >= max_cache {
280                warn!(
281                    "Cache limit reached ({}/{}), triggering LRU eviction before inserting {}",
282                    self.documents.len(),
283                    max_cache,
284                    uri
285                );
286                self.evict_lru_document();
287            }
288
289            // New document - perform initial analysis synchronously
290            debug!("Starting initial analysis for new document: {}", uri);
291            let analysis = Arc::new(AnalyzedDocument::analyze(content));
292
293            if !analysis.errors.is_empty() {
294                debug!(
295                    "Initial analysis found {} parse errors for {}",
296                    analysis.errors.len(),
297                    uri
298                );
299            }
300
301            let state = DocumentState {
302                rope,
303                analysis,
304                content_hash,
305                dirty: false,
306                last_access: std::time::Instant::now(),
307            };
308            self.documents
309                .insert(uri.clone(), Arc::new(Mutex::new(state)));
310            debug!("Document cached: {} (hash: {:#x})", uri, content_hash);
311        }
312
313        true
314    }
315
316    /// Get document content and analysis.
317    ///
318    /// This method returns the document content and an Arc to the analysis.
319    /// It also updates the last access time for LRU tracking.
320    ///
321    /// # Returns
322    ///
323    /// Returns `Some((content, analysis))` if the document exists, `None` otherwise.
324    ///
325    /// # Error Handling
326    ///
327    /// - Missing document: Returns None (logged at call site)
328    /// - Access tracking: Always updates last access time for LRU
329    #[must_use]
330    pub fn get(&self, uri: &Url) -> Option<(String, Arc<AnalyzedDocument>)> {
331        self.documents.get(uri).map(|entry| {
332            let mut state = entry.lock();
333            state.last_access = std::time::Instant::now();
334            debug!(
335                "Document accessed: {} ({} bytes, dirty: {})",
336                uri,
337                state.rope.len_bytes(),
338                state.dirty
339            );
340            (state.rope.to_string(), Arc::clone(&state.analysis))
341        })
342    }
343
344    /// Get document state reference for in-place operations.
345    ///
346    /// This method returns an Arc to the document state, allowing for
347    /// more efficient operations that need to inspect or modify state
348    /// without cloning the entire content.
349    ///
350    /// # Returns
351    ///
352    /// Returns `Some(Arc<Mutex<DocumentState>>)` if the document exists, `None` otherwise.
353    #[must_use]
354    pub fn get_state(&self, uri: &Url) -> Option<Arc<Mutex<DocumentState>>> {
355        self.documents.get(uri).map(|entry| entry.clone())
356    }
357
358    /// Check if a document is dirty (needs re-analysis).
359    ///
360    /// # Returns
361    ///
362    /// Returns `true` if the document exists and is dirty, `false` otherwise.
363    #[must_use]
364    pub fn is_dirty(&self, uri: &Url) -> bool {
365        self.documents.get(uri).is_some_and(|entry| {
366            let state = entry.lock();
367            state.dirty
368        })
369    }
370
371    /// Mark a document as clean (analysis is up-to-date).
372    ///
373    /// This method should be called after successfully analyzing a document.
374    pub fn mark_clean(&self, uri: &Url) {
375        if let Some(state_ref) = self.documents.get(uri) {
376            let mut state = state_ref.lock();
377            state.dirty = false;
378        }
379    }
380
381    /// Update analysis for a document and mark it as clean.
382    ///
383    /// This is a convenience method that combines updating the analysis
384    /// and marking the document as clean.
385    ///
386    /// # Error Handling
387    ///
388    /// - Missing document: Silently ignored (document may have been closed/evicted)
389    /// - Analysis update: Atomic with dirty flag clearing
390    pub fn update_analysis(&self, uri: &Url, analysis: Arc<AnalyzedDocument>) {
391        if let Some(state_ref) = self.documents.get(uri) {
392            let mut state = state_ref.lock();
393            debug!(
394                "Updating analysis for {}: {} entities, {} errors",
395                uri,
396                analysis
397                    .entities
398                    .values()
399                    .map(std::collections::HashMap::len)
400                    .sum::<usize>(),
401                analysis.errors.len()
402            );
403            state.analysis = analysis;
404            state.dirty = false;
405        } else {
406            warn!(
407                "Attempted to update analysis for non-existent document: {} (may have been closed/evicted)",
408                uri
409            );
410        }
411    }
412
413    /// Remove a document from the cache.
414    ///
415    /// This is typically called when a document is closed in the editor.
416    ///
417    /// # Returns
418    ///
419    /// Returns `true` if the document was removed, `false` if it didn't exist.
420    #[must_use]
421    pub fn remove(&self, uri: &Url) -> bool {
422        self.documents.remove(uri).is_some()
423    }
424
425    /// Get all document URIs currently in the cache.
426    ///
427    /// This is useful for workspace-wide operations like workspace symbols.
428    #[must_use]
429    pub fn all_uris(&self) -> Vec<Url> {
430        self.documents
431            .iter()
432            .map(|entry| entry.key().clone())
433            .collect()
434    }
435
436    /// Iterate over all documents with a function.
437    ///
438    /// This provides a safe way to iterate over all documents without
439    /// exposing the internal `DashMap` structure.
440    pub fn for_each<F>(&self, mut f: F)
441    where
442        F: FnMut(&Url, &Arc<Mutex<DocumentState>>),
443    {
444        for entry in &self.documents {
445            f(entry.key(), entry.value());
446        }
447    }
448
449    /// Evict the least recently used document.
450    ///
451    /// This is called when the number of open documents exceeds the configured
452    /// maximum cache size to prevent unbounded memory growth.
453    ///
454    /// # Error Handling
455    ///
456    /// - Empty cache: Returns immediately without error
457    /// - LRU selection: Uses precise timestamp comparison
458    /// - Eviction: Logged with document details and idle time
459    fn evict_lru_document(&self) {
460        if self.documents.is_empty() {
461            warn!("LRU eviction requested but cache is empty (no-op)");
462            return;
463        }
464
465        // Find the LRU document
466        let mut lru_uri: Option<Url> = None;
467        let mut lru_time = std::time::Instant::now();
468        let mut lru_size: usize = 0;
469
470        for entry in &self.documents {
471            let state = entry.value().lock();
472            if lru_uri.is_none() || state.last_access < lru_time {
473                lru_uri = Some(entry.key().clone());
474                lru_time = state.last_access;
475                lru_size = state.rope.len_bytes();
476            }
477        }
478
479        // Evict the LRU document
480        if let Some(uri) = lru_uri {
481            let idle_duration = std::time::Instant::now().duration_since(lru_time);
482            warn!(
483                "Evicting LRU document {} ({} bytes, idle for {:?})",
484                uri, lru_size, idle_duration
485            );
486
487            if let Some((_, removed_state)) = self.documents.remove(&uri) {
488                let state = removed_state.lock();
489                debug!(
490                    "Evicted document had {} entities, {} references",
491                    state
492                        .analysis
493                        .entities
494                        .values()
495                        .map(std::collections::HashMap::len)
496                        .sum::<usize>(),
497                    state.analysis.references.len()
498                );
499            }
500
501            // Update statistics
502            {
503                let mut stats = self.cache_stats.lock();
504                stats.evictions += 1;
505                debug!(
506                    "Cache statistics after eviction: {} hits, {} misses, {} evictions, {}/{} size",
507                    stats.hits,
508                    stats.misses,
509                    stats.evictions,
510                    self.documents.len(),
511                    stats.max_size
512                );
513            }
514        } else {
515            error!("LRU eviction failed: no document found despite non-empty cache");
516        }
517    }
518
519    /// Clear all documents from the cache.
520    ///
521    /// This is primarily useful for testing or when resetting the server state.
522    pub fn clear(&self) {
523        self.documents.clear();
524        let mut stats = self.cache_stats.lock();
525        stats.hits = 0;
526        stats.misses = 0;
527        stats.evictions = 0;
528    }
529}
530
#[cfg(test)]
mod tests {
    use super::*;

    /// Header-only HEDL fixture used as the baseline document text.
    const BASE_DOC: &str = "%V:2.0\n%NULL:~\n%QUOTE:\"\n---\n";
    /// Baseline fixture with one extra schema line, i.e. text that differs from `BASE_DOC`.
    const SCHEMA_DOC: &str = "%V:2.0\n%NULL:~\n%QUOTE:\"\n%S:User:[id]\n---\n";
    /// Document size limit used by tests that do not exercise the limit itself.
    const ONE_MIB: usize = 1024 * 1024;

    /// URI shared by the single-document tests.
    fn test_uri() -> Url {
        Url::parse("file:///test.hedl").unwrap()
    }

    /// Distinct URI for the `i`-th document in multi-document tests.
    fn numbered_uri(i: usize) -> Url {
        Url::parse(&format!("file:///test{i}.hedl")).unwrap()
    }

    /// Insert `count` baseline documents under numbered URIs starting at 0.
    fn fill(manager: &DocumentCache, count: usize) {
        for i in 0..count {
            manager.insert_or_update(&numbered_uri(i), BASE_DOC);
        }
    }

    #[test]
    fn test_document_manager_new() {
        let manager = DocumentCache::new(100, ONE_MIB);
        assert_eq!(manager.max_cache_size(), 100);
        assert_eq!(manager.max_document_size(), ONE_MIB);

        // A fresh cache reports its limit and all-zero counters.
        let stats = manager.statistics();
        assert_eq!(stats.max_size, 100);
        assert_eq!(stats.current_size, 0);
        assert_eq!(stats.hits, 0);
        assert_eq!(stats.misses, 0);
        assert_eq!(stats.evictions, 0);
    }

    #[test]
    fn test_insert_and_get() {
        let manager = DocumentCache::new(10, ONE_MIB);
        let uri = test_uri();

        assert!(manager.insert_or_update(&uri, BASE_DOC));

        let (text, analysis) = manager
            .get(&uri)
            .expect("document should be present right after insertion");
        assert_eq!(text, BASE_DOC);
        // The initial insert analyzes the document.
        assert!(analysis.document.is_some());

        // The first insert counts as a miss; `get` does not affect the counters.
        let stats = manager.statistics();
        assert_eq!(stats.misses, 1);
        assert_eq!(stats.hits, 0);
        assert_eq!(stats.current_size, 1);
    }

    #[test]
    fn test_update_marks_dirty() {
        let manager = DocumentCache::new(10, ONE_MIB);
        let uri = test_uri();

        // A freshly inserted document is clean.
        manager.insert_or_update(&uri, BASE_DOC);
        assert!(!manager.is_dirty(&uri));

        // Different content makes it dirty.
        manager.insert_or_update(&uri, SCHEMA_DOC);
        assert!(manager.is_dirty(&uri));

        // Re-sending the same content does not clear the flag.
        manager.insert_or_update(&uri, SCHEMA_DOC);
        assert!(manager.is_dirty(&uri));
    }

    #[test]
    fn test_mark_clean() {
        let manager = DocumentCache::new(10, ONE_MIB);
        let uri = test_uri();

        manager.insert_or_update(&uri, BASE_DOC);
        manager.insert_or_update(&uri, SCHEMA_DOC);
        assert!(manager.is_dirty(&uri));

        manager.mark_clean(&uri);
        assert!(!manager.is_dirty(&uri));
    }

    #[test]
    fn test_document_size_limit() {
        // Only 100 bytes allowed per document.
        let manager = DocumentCache::new(10, 100);
        let uri = test_uri();

        // Within the limit: accepted.
        assert!(manager.insert_or_update(&uri, BASE_DOC));

        // One byte over the limit: rejected.
        let oversized = "x".repeat(101);
        assert!(!manager.insert_or_update(&uri, &oversized));
    }

    #[test]
    fn test_lru_eviction() {
        // Room for exactly three documents.
        let manager = DocumentCache::new(3, ONE_MIB);
        fill(&manager, 3);

        let before = manager.statistics();
        assert_eq!(before.current_size, 3);
        assert_eq!(before.evictions, 0);

        // A fourth document forces one eviction.
        manager.insert_or_update(&numbered_uri(4), BASE_DOC);

        let after = manager.statistics();
        assert_eq!(after.current_size, 3);
        assert_eq!(after.evictions, 1);
    }

    #[test]
    fn test_remove() {
        let manager = DocumentCache::new(10, ONE_MIB);
        let uri = test_uri();

        manager.insert_or_update(&uri, BASE_DOC);
        assert!(manager.get(&uri).is_some());

        assert!(manager.remove(&uri));
        assert!(manager.get(&uri).is_none());

        // A second removal finds nothing.
        assert!(!manager.remove(&uri));
    }

    #[test]
    fn test_all_uris() {
        let manager = DocumentCache::new(10, ONE_MIB);
        fill(&manager, 5);

        assert_eq!(manager.all_uris().len(), 5);
    }

    #[test]
    fn test_clear() {
        let manager = DocumentCache::new(10, ONE_MIB);
        fill(&manager, 3);
        assert_eq!(manager.statistics().current_size, 3);

        manager.clear();

        assert_eq!(manager.statistics().current_size, 0);
        assert_eq!(manager.statistics().hits, 0);
        assert_eq!(manager.statistics().misses, 0);
    }

    #[test]
    fn test_runtime_config_update() {
        let manager = DocumentCache::new(100, ONE_MIB);

        assert_eq!(manager.max_cache_size(), 100);
        manager.set_max_cache_size(200);
        assert_eq!(manager.max_cache_size(), 200);

        assert_eq!(manager.max_document_size(), ONE_MIB);
        manager.set_max_document_size(2 * ONE_MIB);
        assert_eq!(manager.max_document_size(), 2 * ONE_MIB);
    }
}