hedl_lsp/document_manager.rs
1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Document management with caching and LRU eviction.
19//!
20//! This module handles document storage, caching, and lifecycle management for the LSP server.
21//! It provides efficient document access with configurable cache limits and automatic LRU eviction.
22//!
23//! # Responsibilities
24//!
25//! - Document storage and retrieval
26//! - Content hash-based change detection
27//! - LRU-based cache eviction
28//! - Cache statistics tracking
29//! - Document size limits enforcement
30//!
31//! # Design
32//!
33//! The `DocumentCache` maintains a cache of analyzed documents with the following features:
34//!
35//! - **LRU Eviction**: Automatically evicts least recently used documents when cache is full
36//! - **Dirty Tracking**: Tracks which documents need re-analysis via content hashing
37//! - **Access Tracking**: Updates last access time for LRU ordering
38//! - **Size Limits**: Enforces maximum document size to prevent memory exhaustion
39//! - **Statistics**: Provides cache hit/miss/eviction metrics for monitoring
40
41use crate::analysis::AnalyzedDocument;
42use dashmap::DashMap;
43use parking_lot::Mutex;
44use ropey::Rope;
45use std::sync::Arc;
46use tower_lsp::lsp_types::Url;
47use tracing::{debug, error, warn};
48
49// Re-export constants for backwards compatibility
50pub use crate::constants::{DEFAULT_MAX_CACHE_SIZE, DEFAULT_MAX_DOCUMENT_SIZE};
51
52/// Document state with caching and dirty tracking.
53///
54/// Each document is stored with its content (as a Rope for efficient editing),
55/// analysis results, content hash for change detection, and dirty flag.
56pub struct DocumentState {
57 /// Current rope content for efficient editing operations.
58 pub rope: Rope,
59 /// Cached analysis result from last parse (Arc-wrapped to avoid expensive clones).
60 pub analysis: Arc<AnalyzedDocument>,
61 /// Content hash for change detection.
62 pub content_hash: u64,
63 /// Dirty flag: true if content changed since last analysis.
64 pub dirty: bool,
65 /// Last access timestamp for LRU eviction.
66 pub last_access: std::time::Instant,
67}
68
/// Counters describing how the document cache is being used.
///
/// A value of this type is a plain snapshot: it can be cloned, printed with
/// `{:?}`, and starts out with every field at zero via `Default`.
#[derive(Debug, Clone, Default)]
pub struct CacheStatistics {
    /// Lookups that found the document already cached.
    pub hits: u64,
    /// Lookups for a document that was not cached yet.
    pub misses: u64,
    /// Documents dropped because the cache size limit was reached.
    pub evictions: u64,
    /// Number of documents cached when the snapshot was taken.
    pub current_size: usize,
    /// Configured upper bound on the number of cached documents.
    pub max_size: usize,
}
86
87/// Document manager with LRU caching and dirty tracking.
88///
89/// The `DocumentCache` is the single source of truth for all document state
90/// in the LSP server. It handles document lifecycle, caching, and eviction.
91///
92/// # Thread Safety
93///
94/// The `DocumentCache` uses `DashMap` for concurrent access and `parking_lot::Mutex`
95/// for fine-grained locking. It can be safely shared across threads.
96///
97/// # Example
98///
99/// ```no_run
100/// use hedl_lsp::document_manager::DocumentCache;
101///
102/// let manager = DocumentCache::new(1000, 500 * 1024 * 1024);
103///
104/// // Insert a document
105/// // manager.insert_or_update(uri, content);
106///
107/// // Get a document
108/// // let doc = manager.get(&uri);
109/// ```
110pub struct DocumentCache {
111 /// Document store: URI -> document state.
112 documents: DashMap<Url, Arc<Mutex<DocumentState>>>,
113 /// Cache statistics for monitoring.
114 cache_stats: Arc<Mutex<CacheStatistics>>,
115 /// Maximum number of documents to cache.
116 max_cache_size: Arc<parking_lot::RwLock<usize>>,
117 /// Maximum document size in bytes.
118 max_document_size: Arc<parking_lot::RwLock<usize>>,
119}
120
121impl DocumentCache {
122 /// Create a new document manager with specified limits.
123 ///
124 /// # Parameters
125 ///
126 /// - `max_cache_size`: Maximum number of documents to cache (default: 1000)
127 /// - `max_document_size`: Maximum document size in bytes (default: 500 MB)
128 ///
129 /// # Example
130 ///
131 /// ```no_run
132 /// use hedl_lsp::document_manager::DocumentCache;
133 ///
134 /// // Create with custom limits
135 /// let manager = DocumentCache::new(2000, 1024 * 1024 * 1024);
136 /// ```
137 #[must_use]
138 pub fn new(max_cache_size: usize, max_document_size: usize) -> Self {
139 Self {
140 documents: DashMap::new(),
141 cache_stats: Arc::new(Mutex::new(CacheStatistics {
142 max_size: max_cache_size,
143 ..Default::default()
144 })),
145 max_cache_size: Arc::new(parking_lot::RwLock::new(max_cache_size)),
146 max_document_size: Arc::new(parking_lot::RwLock::new(max_document_size)),
147 }
148 }
149
150 /// Get current cache statistics.
151 ///
152 /// This method provides a snapshot of cache performance metrics.
153 #[must_use]
154 pub fn statistics(&self) -> CacheStatistics {
155 let mut stats = self.cache_stats.lock();
156 stats.current_size = self.documents.len();
157 stats.clone()
158 }
159
160 /// Update maximum cache size (can be called during runtime).
161 pub fn set_max_cache_size(&self, new_max: usize) {
162 let mut max = self.max_cache_size.write();
163 *max = new_max;
164 let mut stats = self.cache_stats.lock();
165 stats.max_size = new_max;
166 debug!("Cache max size updated to: {}", new_max);
167 }
168
169 /// Get current maximum cache size.
170 #[must_use]
171 pub fn max_cache_size(&self) -> usize {
172 *self.max_cache_size.read()
173 }
174
175 /// Update maximum document size (can be called during runtime).
176 pub fn set_max_document_size(&self, new_max: usize) {
177 let mut max = self.max_document_size.write();
178 *max = new_max;
179 debug!("Max document size updated to: {} bytes", new_max);
180 }
181
182 /// Get current maximum document size.
183 #[must_use]
184 pub fn max_document_size(&self) -> usize {
185 *self.max_document_size.read()
186 }
187
188 /// Compute a simple hash for change detection.
189 fn hash_content(content: &str) -> u64 {
190 use std::collections::hash_map::DefaultHasher;
191 use std::hash::{Hash, Hasher};
192 let mut hasher = DefaultHasher::new();
193 content.hash(&mut hasher);
194 hasher.finish()
195 }
196
197 /// Insert or update a document.
198 ///
199 /// If the document already exists, updates its content and marks it as dirty
200 /// if the content changed. If it's a new document, performs initial analysis.
201 ///
202 /// # Memory Management
203 ///
204 /// This method enforces the maximum document size limit. Documents exceeding
205 /// the limit are rejected and this method returns `false`.
206 ///
207 /// # Returns
208 ///
209 /// Returns `true` if the document was successfully inserted/updated,
210 /// `false` if rejected due to size constraints.
211 ///
212 /// # Error Handling
213 ///
214 /// - Size limit violations: Logged as warnings and rejected
215 /// - Cache eviction: Logged with LRU document details
216 /// - Content hashing: Hash collisions are statistically impossible but detected
217 pub fn insert_or_update(&self, uri: &Url, content: &str) -> bool {
218 // Memory management: Enforce maximum document size
219 let max_size = self.max_document_size();
220 if content.len() > max_size {
221 warn!(
222 "Document size limit exceeded for {}: {} bytes > {} bytes maximum (rejected)",
223 uri,
224 content.len(),
225 max_size
226 );
227 return false;
228 }
229
230 let rope = Rope::from_str(content);
231 let content_hash = Self::hash_content(content);
232 let line_count = content.lines().count();
233
234 if let Some(state_ref) = self.documents.get(uri) {
235 // Cache hit - existing document
236 {
237 let mut stats = self.cache_stats.lock();
238 stats.hits += 1;
239 }
240
241 let mut state = state_ref.lock();
242 // Only update if content actually changed
243 if state.content_hash == content_hash {
244 debug!(
245 "Document content unchanged for {} (hash: {:#x}), updating access time only",
246 uri, content_hash
247 );
248 // Update access time even if content hasn't changed
249 state.last_access = std::time::Instant::now();
250 } else {
251 debug!(
252 "Document content changed for {}: {} -> {} bytes, {} lines",
253 uri,
254 state.rope.len_bytes(),
255 content.len(),
256 line_count
257 );
258 state.rope = rope;
259 state.content_hash = content_hash;
260 state.dirty = true;
261 state.last_access = std::time::Instant::now();
262 }
263 } else {
264 // Cache miss - new document
265 {
266 let mut stats = self.cache_stats.lock();
267 stats.misses += 1;
268 }
269
270 debug!(
271 "New document registered: {} ({} bytes, {} lines)",
272 uri,
273 content.len(),
274 line_count
275 );
276
277 // Check if we need to evict before inserting
278 let max_cache = self.max_cache_size();
279 if self.documents.len() >= max_cache {
280 warn!(
281 "Cache limit reached ({}/{}), triggering LRU eviction before inserting {}",
282 self.documents.len(),
283 max_cache,
284 uri
285 );
286 self.evict_lru_document();
287 }
288
289 // New document - perform initial analysis synchronously
290 debug!("Starting initial analysis for new document: {}", uri);
291 let analysis = Arc::new(AnalyzedDocument::analyze(content));
292
293 if !analysis.errors.is_empty() {
294 debug!(
295 "Initial analysis found {} parse errors for {}",
296 analysis.errors.len(),
297 uri
298 );
299 }
300
301 let state = DocumentState {
302 rope,
303 analysis,
304 content_hash,
305 dirty: false,
306 last_access: std::time::Instant::now(),
307 };
308 self.documents
309 .insert(uri.clone(), Arc::new(Mutex::new(state)));
310 debug!("Document cached: {} (hash: {:#x})", uri, content_hash);
311 }
312
313 true
314 }
315
316 /// Get document content and analysis.
317 ///
318 /// This method returns the document content and an Arc to the analysis.
319 /// It also updates the last access time for LRU tracking.
320 ///
321 /// # Returns
322 ///
323 /// Returns `Some((content, analysis))` if the document exists, `None` otherwise.
324 ///
325 /// # Error Handling
326 ///
327 /// - Missing document: Returns None (logged at call site)
328 /// - Access tracking: Always updates last access time for LRU
329 #[must_use]
330 pub fn get(&self, uri: &Url) -> Option<(String, Arc<AnalyzedDocument>)> {
331 self.documents.get(uri).map(|entry| {
332 let mut state = entry.lock();
333 state.last_access = std::time::Instant::now();
334 debug!(
335 "Document accessed: {} ({} bytes, dirty: {})",
336 uri,
337 state.rope.len_bytes(),
338 state.dirty
339 );
340 (state.rope.to_string(), Arc::clone(&state.analysis))
341 })
342 }
343
344 /// Get document state reference for in-place operations.
345 ///
346 /// This method returns an Arc to the document state, allowing for
347 /// more efficient operations that need to inspect or modify state
348 /// without cloning the entire content.
349 ///
350 /// # Returns
351 ///
352 /// Returns `Some(Arc<Mutex<DocumentState>>)` if the document exists, `None` otherwise.
353 #[must_use]
354 pub fn get_state(&self, uri: &Url) -> Option<Arc<Mutex<DocumentState>>> {
355 self.documents.get(uri).map(|entry| entry.clone())
356 }
357
358 /// Check if a document is dirty (needs re-analysis).
359 ///
360 /// # Returns
361 ///
362 /// Returns `true` if the document exists and is dirty, `false` otherwise.
363 #[must_use]
364 pub fn is_dirty(&self, uri: &Url) -> bool {
365 self.documents.get(uri).is_some_and(|entry| {
366 let state = entry.lock();
367 state.dirty
368 })
369 }
370
371 /// Mark a document as clean (analysis is up-to-date).
372 ///
373 /// This method should be called after successfully analyzing a document.
374 pub fn mark_clean(&self, uri: &Url) {
375 if let Some(state_ref) = self.documents.get(uri) {
376 let mut state = state_ref.lock();
377 state.dirty = false;
378 }
379 }
380
381 /// Update analysis for a document and mark it as clean.
382 ///
383 /// This is a convenience method that combines updating the analysis
384 /// and marking the document as clean.
385 ///
386 /// # Error Handling
387 ///
388 /// - Missing document: Silently ignored (document may have been closed/evicted)
389 /// - Analysis update: Atomic with dirty flag clearing
390 pub fn update_analysis(&self, uri: &Url, analysis: Arc<AnalyzedDocument>) {
391 if let Some(state_ref) = self.documents.get(uri) {
392 let mut state = state_ref.lock();
393 debug!(
394 "Updating analysis for {}: {} entities, {} errors",
395 uri,
396 analysis
397 .entities
398 .values()
399 .map(std::collections::HashMap::len)
400 .sum::<usize>(),
401 analysis.errors.len()
402 );
403 state.analysis = analysis;
404 state.dirty = false;
405 } else {
406 warn!(
407 "Attempted to update analysis for non-existent document: {} (may have been closed/evicted)",
408 uri
409 );
410 }
411 }
412
413 /// Remove a document from the cache.
414 ///
415 /// This is typically called when a document is closed in the editor.
416 ///
417 /// # Returns
418 ///
419 /// Returns `true` if the document was removed, `false` if it didn't exist.
420 #[must_use]
421 pub fn remove(&self, uri: &Url) -> bool {
422 self.documents.remove(uri).is_some()
423 }
424
425 /// Get all document URIs currently in the cache.
426 ///
427 /// This is useful for workspace-wide operations like workspace symbols.
428 #[must_use]
429 pub fn all_uris(&self) -> Vec<Url> {
430 self.documents
431 .iter()
432 .map(|entry| entry.key().clone())
433 .collect()
434 }
435
436 /// Iterate over all documents with a function.
437 ///
438 /// This provides a safe way to iterate over all documents without
439 /// exposing the internal `DashMap` structure.
440 pub fn for_each<F>(&self, mut f: F)
441 where
442 F: FnMut(&Url, &Arc<Mutex<DocumentState>>),
443 {
444 for entry in &self.documents {
445 f(entry.key(), entry.value());
446 }
447 }
448
449 /// Evict the least recently used document.
450 ///
451 /// This is called when the number of open documents exceeds the configured
452 /// maximum cache size to prevent unbounded memory growth.
453 ///
454 /// # Error Handling
455 ///
456 /// - Empty cache: Returns immediately without error
457 /// - LRU selection: Uses precise timestamp comparison
458 /// - Eviction: Logged with document details and idle time
459 fn evict_lru_document(&self) {
460 if self.documents.is_empty() {
461 warn!("LRU eviction requested but cache is empty (no-op)");
462 return;
463 }
464
465 // Find the LRU document
466 let mut lru_uri: Option<Url> = None;
467 let mut lru_time = std::time::Instant::now();
468 let mut lru_size: usize = 0;
469
470 for entry in &self.documents {
471 let state = entry.value().lock();
472 if lru_uri.is_none() || state.last_access < lru_time {
473 lru_uri = Some(entry.key().clone());
474 lru_time = state.last_access;
475 lru_size = state.rope.len_bytes();
476 }
477 }
478
479 // Evict the LRU document
480 if let Some(uri) = lru_uri {
481 let idle_duration = std::time::Instant::now().duration_since(lru_time);
482 warn!(
483 "Evicting LRU document {} ({} bytes, idle for {:?})",
484 uri, lru_size, idle_duration
485 );
486
487 if let Some((_, removed_state)) = self.documents.remove(&uri) {
488 let state = removed_state.lock();
489 debug!(
490 "Evicted document had {} entities, {} references",
491 state
492 .analysis
493 .entities
494 .values()
495 .map(std::collections::HashMap::len)
496 .sum::<usize>(),
497 state.analysis.references.len()
498 );
499 }
500
501 // Update statistics
502 {
503 let mut stats = self.cache_stats.lock();
504 stats.evictions += 1;
505 debug!(
506 "Cache statistics after eviction: {} hits, {} misses, {} evictions, {}/{} size",
507 stats.hits,
508 stats.misses,
509 stats.evictions,
510 self.documents.len(),
511 stats.max_size
512 );
513 }
514 } else {
515 error!("LRU eviction failed: no document found despite non-empty cache");
516 }
517 }
518
519 /// Clear all documents from the cache.
520 ///
521 /// This is primarily useful for testing or when resetting the server state.
522 pub fn clear(&self) {
523 self.documents.clear();
524 let mut stats = self.cache_stats.lock();
525 stats.hits = 0;
526 stats.misses = 0;
527 stats.evictions = 0;
528 }
529}
530
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline document text shared by the tests.
    const BASE_DOC: &str = "%V:2.0\n%NULL:~\n%QUOTE:\"\n---\n";
    /// `BASE_DOC` with one extra `%S:` line, so its content differs.
    const EXTENDED_DOC: &str = "%V:2.0\n%NULL:~\n%QUOTE:\"\n%S:User:[id]\n---\n";

    /// One mebibyte, the document size limit most tests use.
    const ONE_MIB: usize = 1024 * 1024;

    /// Build `file:///<stem>.hedl`.
    fn doc_uri(stem: &str) -> Url {
        Url::parse(&format!("file:///{stem}.hedl")).unwrap()
    }

    /// A fresh cache starts empty and reports the limits it was given.
    #[test]
    fn test_document_manager_new() {
        let cache = DocumentCache::new(100, ONE_MIB);
        assert_eq!(cache.max_cache_size(), 100);
        assert_eq!(cache.max_document_size(), ONE_MIB);

        let snapshot = cache.statistics();
        assert_eq!(snapshot.max_size, 100);
        assert_eq!(snapshot.current_size, 0);
        assert_eq!(snapshot.hits, 0);
        assert_eq!(snapshot.misses, 0);
        assert_eq!(snapshot.evictions, 0);
    }

    /// Inserting then reading back yields the same text plus an analysis.
    #[test]
    fn test_insert_and_get() {
        let cache = DocumentCache::new(10, ONE_MIB);
        let uri = doc_uri("test");

        assert!(cache.insert_or_update(&uri, BASE_DOC));

        let fetched = cache.get(&uri);
        assert!(fetched.is_some());
        let (text, analysis) = fetched.unwrap();
        assert_eq!(text, BASE_DOC);
        // The initial insert runs the analysis.
        assert!(analysis.document.is_some());

        let snapshot = cache.statistics();
        // The first insert counts as a miss; `get` does not count as a hit.
        assert_eq!(snapshot.misses, 1);
        assert_eq!(snapshot.hits, 0);
        assert_eq!(snapshot.current_size, 1);
    }

    /// Changed content sets the dirty flag; re-sending the same content keeps it.
    #[test]
    fn test_update_marks_dirty() {
        let cache = DocumentCache::new(10, ONE_MIB);
        let uri = doc_uri("test");

        cache.insert_or_update(&uri, BASE_DOC);
        assert!(!cache.is_dirty(&uri));

        cache.insert_or_update(&uri, EXTENDED_DOC);
        assert!(cache.is_dirty(&uri));

        // Identical content: the flag stays set until explicitly cleared.
        cache.insert_or_update(&uri, EXTENDED_DOC);
        assert!(cache.is_dirty(&uri));
    }

    /// `mark_clean` clears the dirty flag.
    #[test]
    fn test_mark_clean() {
        let cache = DocumentCache::new(10, ONE_MIB);
        let uri = doc_uri("test");

        cache.insert_or_update(&uri, BASE_DOC);
        cache.insert_or_update(&uri, EXTENDED_DOC);
        assert!(cache.is_dirty(&uri));

        cache.mark_clean(&uri);
        assert!(!cache.is_dirty(&uri));
    }

    /// Documents above the byte limit are rejected.
    #[test]
    fn test_document_size_limit() {
        // Only 100 bytes allowed.
        let cache = DocumentCache::new(10, 100);
        let uri = doc_uri("test");

        assert!(cache.insert_or_update(&uri, BASE_DOC));

        let oversized = "x".repeat(101);
        assert!(!cache.insert_or_update(&uri, &oversized));
    }

    /// Inserting into a full cache evicts exactly one entry.
    #[test]
    fn test_lru_eviction() {
        // Max 3 documents.
        let cache = DocumentCache::new(3, ONE_MIB);

        for i in 0..3 {
            cache.insert_or_update(&doc_uri(&format!("test{i}")), BASE_DOC);
        }

        let before = cache.statistics();
        assert_eq!(before.current_size, 3);
        assert_eq!(before.evictions, 0);

        // A fourth document pushes one out.
        cache.insert_or_update(&doc_uri("test4"), BASE_DOC);

        let after = cache.statistics();
        assert_eq!(after.current_size, 3);
        assert_eq!(after.evictions, 1);
    }

    /// `remove` reports whether something was actually removed.
    #[test]
    fn test_remove() {
        let cache = DocumentCache::new(10, ONE_MIB);
        let uri = doc_uri("test");

        cache.insert_or_update(&uri, BASE_DOC);
        assert!(cache.get(&uri).is_some());

        assert!(cache.remove(&uri));
        assert!(cache.get(&uri).is_none());

        // A second removal finds nothing.
        assert!(!cache.remove(&uri));
    }

    /// `all_uris` lists every cached document.
    #[test]
    fn test_all_uris() {
        let cache = DocumentCache::new(10, ONE_MIB);

        for i in 0..5 {
            cache.insert_or_update(&doc_uri(&format!("test{i}")), BASE_DOC);
        }

        assert_eq!(cache.all_uris().len(), 5);
    }

    /// `clear` empties the cache and resets the counters.
    #[test]
    fn test_clear() {
        let cache = DocumentCache::new(10, ONE_MIB);

        for i in 0..3 {
            cache.insert_or_update(&doc_uri(&format!("test{i}")), BASE_DOC);
        }
        assert_eq!(cache.statistics().current_size, 3);

        cache.clear();

        let snapshot = cache.statistics();
        assert_eq!(snapshot.current_size, 0);
        assert_eq!(snapshot.hits, 0);
        assert_eq!(snapshot.misses, 0);
    }

    /// Both limits can be changed after construction.
    #[test]
    fn test_runtime_config_update() {
        let cache = DocumentCache::new(100, ONE_MIB);

        assert_eq!(cache.max_cache_size(), 100);
        cache.set_max_cache_size(200);
        assert_eq!(cache.max_cache_size(), 200);

        assert_eq!(cache.max_document_size(), ONE_MIB);
        cache.set_max_document_size(2 * ONE_MIB);
        assert_eq!(cache.max_document_size(), 2 * ONE_MIB);
    }
}
697}