prax_query/mem_optimize/
interning.rs

1//! Enhanced string interning for efficient identifier storage.
2//!
3//! This module provides both global and scoped string interning to minimize
4//! memory allocations for repeated field names, table names, and other identifiers.
5//!
6//! # Interning Strategies
7//!
8//! 1. **Static interning**: Compile-time constants for common fields (zero allocation)
9//! 2. **Global interning**: Thread-safe, lifetime of program, for repeated identifiers
10//! 3. **Scoped interning**: Per-request/query, automatically freed when scope ends
11//!
12//! # Performance
13//!
14//! - First intern: O(n) allocation + hash lookup
15//! - Subsequent lookups: O(n) hash lookup, no allocation
16//! - Cloning interned string: O(1)
17//!
18//! # Example
19//!
20//! ```rust
21//! use prax_query::mem_optimize::interning::{GlobalInterner, InternedStr};
22//!
23//! // Get the global interner
24//! let interner = GlobalInterner::get();
25//!
26//! // Intern a string
27//! let s1 = interner.intern("user_id");
28//! let s2 = interner.intern("user_id");
29//!
30//! // Same memory location
31//! assert!(InternedStr::ptr_eq(&s1, &s2));
32//! ```
33
34use parking_lot::{Mutex, RwLock};
35use smol_str::SmolStr;
36use std::borrow::Cow;
37use std::collections::{HashMap, HashSet};
38use std::hash::{Hash, Hasher};
39use std::sync::Arc;
40
41// ============================================================================
42// Interned String Types
43// ============================================================================
44
45/// An interned string that shares memory with other identical strings.
46///
47/// This is a thin wrapper around `Arc<str>` that provides cheap cloning
48/// and comparison operations.
49#[derive(Clone, Debug)]
50pub struct InternedStr(Arc<str>);
51
52impl InternedStr {
53    /// Create a new interned string from a raw Arc.
54    #[inline]
55    pub fn new(s: Arc<str>) -> Self {
56        Self(s)
57    }
58
59    /// Get the string slice.
60    #[inline]
61    pub fn as_str(&self) -> &str {
62        &self.0
63    }
64
65    /// Check if two interned strings point to the same memory.
66    #[inline]
67    pub fn ptr_eq(a: &Self, b: &Self) -> bool {
68        Arc::ptr_eq(&a.0, &b.0)
69    }
70
71    /// Get the inner Arc.
72    #[inline]
73    pub fn into_arc(self) -> Arc<str> {
74        self.0
75    }
76
77    /// Convert to a SmolStr (small string optimization).
78    #[inline]
79    pub fn to_smol(&self) -> SmolStr {
80        SmolStr::new(&*self.0)
81    }
82
83    /// Convert to Cow<'static, str>.
84    ///
85    /// Returns Cow::Owned because Arc<str> cannot be borrowed with 'static lifetime.
86    #[inline]
87    pub fn to_cow(&self) -> Cow<'static, str> {
88        Cow::Owned(self.0.to_string())
89    }
90}
91
92impl AsRef<str> for InternedStr {
93    #[inline]
94    fn as_ref(&self) -> &str {
95        &self.0
96    }
97}
98
99impl std::ops::Deref for InternedStr {
100    type Target = str;
101
102    #[inline]
103    fn deref(&self) -> &Self::Target {
104        &self.0
105    }
106}
107
108impl PartialEq for InternedStr {
109    #[inline]
110    fn eq(&self, other: &Self) -> bool {
111        // Fast path: pointer equality
112        if Arc::ptr_eq(&self.0, &other.0) {
113            return true;
114        }
115        // Slow path: string comparison
116        *self.0 == *other.0
117    }
118}
119
120impl Eq for InternedStr {}
121
122impl Hash for InternedStr {
123    #[inline]
124    fn hash<H: Hasher>(&self, state: &mut H) {
125        self.0.hash(state)
126    }
127}
128
129impl std::fmt::Display for InternedStr {
130    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
131        self.0.fmt(f)
132    }
133}
134
135impl From<&str> for InternedStr {
136    fn from(s: &str) -> Self {
137        GlobalInterner::get().intern(s)
138    }
139}
140
141impl From<String> for InternedStr {
142    fn from(s: String) -> Self {
143        GlobalInterner::get().intern(&s)
144    }
145}
146
147// ============================================================================
148// Global Interner
149// ============================================================================
150
151/// Thread-safe global string interner.
152///
153/// Strings interned here live for the lifetime of the program. Use this for
154/// identifiers that appear frequently across many queries.
155pub struct GlobalInterner {
156    strings: RwLock<HashSet<Arc<str>>>,
157    stats: Mutex<InternerStats>,
158}
159
160impl GlobalInterner {
161    /// Get the global interner instance.
162    pub fn get() -> &'static Self {
163        static INSTANCE: std::sync::OnceLock<GlobalInterner> = std::sync::OnceLock::new();
164        INSTANCE.get_or_init(|| {
165            let interner = GlobalInterner {
166                strings: RwLock::new(HashSet::with_capacity(256)),
167                stats: Mutex::new(InternerStats::default()),
168            };
169            // Pre-populate with common identifiers
170            interner.prepopulate();
171            interner
172        })
173    }
174
175    /// Pre-populate with common SQL identifiers.
176    fn prepopulate(&self) {
177        for name in COMMON_IDENTIFIERS {
178            self.intern(name);
179        }
180    }
181
182    /// Intern a string, returning an interned reference.
183    ///
184    /// If the string has been interned before, returns the existing reference.
185    /// Otherwise, creates a new interned entry.
186    #[inline]
187    pub fn intern(&self, s: &str) -> InternedStr {
188        // Fast path: check if already interned (read lock)
189        {
190            let strings = self.strings.read();
191            if let Some(existing) = strings.get(s) {
192                let mut stats = self.stats.lock();
193                stats.hits += 1;
194                return InternedStr(Arc::clone(existing));
195            }
196        }
197
198        // Slow path: need to insert (write lock)
199        let mut strings = self.strings.write();
200
201        // Double-check after acquiring write lock
202        if let Some(existing) = strings.get(s) {
203            let mut stats = self.stats.lock();
204            stats.hits += 1;
205            return InternedStr(Arc::clone(existing));
206        }
207
208        // Insert new string
209        let arc: Arc<str> = Arc::from(s);
210        strings.insert(Arc::clone(&arc));
211
212        let mut stats = self.stats.lock();
213        stats.misses += 1;
214        stats.total_bytes += s.len();
215
216        InternedStr(arc)
217    }
218
219    /// Try to get an already-interned string without creating a new entry.
220    #[inline]
221    pub fn lookup(&self, s: &str) -> Option<InternedStr> {
222        let strings = self.strings.read();
223        strings.get(s).map(|arc| InternedStr(Arc::clone(arc)))
224    }
225
226    /// Get the number of interned strings.
227    pub fn len(&self) -> usize {
228        self.strings.read().len()
229    }
230
231    /// Check if the interner is empty.
232    pub fn is_empty(&self) -> bool {
233        self.strings.read().is_empty()
234    }
235
236    /// Get interning statistics.
237    pub fn stats(&self) -> InternerStats {
238        self.stats.lock().clone()
239    }
240
241    /// Clear all interned strings (use with caution!).
242    ///
243    /// This invalidates all existing `InternedStr` references from this interner.
244    /// Only use during testing or shutdown.
245    pub fn clear(&self) {
246        self.strings.write().clear();
247        *self.stats.lock() = InternerStats::default();
248    }
249}
250
251// ============================================================================
252// Scoped Interner
253// ============================================================================
254
255/// A scoped string interner for temporary use.
256///
257/// Strings interned here are freed when the interner is dropped.
258/// Use this for request-scoped interning to avoid memory growth.
259#[derive(Default)]
260pub struct ScopedInterner {
261    strings: HashSet<Arc<str>>,
262    stats: InternerStats,
263}
264
265impl ScopedInterner {
266    /// Create a new scoped interner.
267    pub fn new() -> Self {
268        Self::default()
269    }
270
271    /// Create a scoped interner with pre-allocated capacity.
272    pub fn with_capacity(capacity: usize) -> Self {
273        Self {
274            strings: HashSet::with_capacity(capacity),
275            stats: InternerStats::default(),
276        }
277    }
278
279    /// Intern a string within this scope.
280    #[inline]
281    pub fn intern(&mut self, s: &str) -> InternedStr {
282        if let Some(existing) = self.strings.get(s) {
283            self.stats.hits += 1;
284            return InternedStr(Arc::clone(existing));
285        }
286
287        let arc: Arc<str> = Arc::from(s);
288        self.strings.insert(Arc::clone(&arc));
289        self.stats.misses += 1;
290        self.stats.total_bytes += s.len();
291
292        InternedStr(arc)
293    }
294
295    /// Try to get an already-interned string.
296    #[inline]
297    pub fn get(&self, s: &str) -> Option<InternedStr> {
298        self.strings.get(s).map(|arc| InternedStr(Arc::clone(arc)))
299    }
300
301    /// Get the number of interned strings.
302    pub fn len(&self) -> usize {
303        self.strings.len()
304    }
305
306    /// Check if empty.
307    pub fn is_empty(&self) -> bool {
308        self.strings.is_empty()
309    }
310
311    /// Get statistics.
312    pub fn stats(&self) -> &InternerStats {
313        &self.stats
314    }
315
316    /// Clear all interned strings.
317    pub fn clear(&mut self) {
318        self.strings.clear();
319        self.stats = InternerStats::default();
320    }
321}
322
323// ============================================================================
324// Identifier Cache
325// ============================================================================
326
327/// Cache for auto-interning common identifier patterns.
328///
329/// This cache recognizes common patterns like `table.column` and automatically
330/// interns both the full identifier and its components.
331pub struct IdentifierCache {
332    /// Full identifiers (e.g., "users.email")
333    full: RwLock<HashMap<String, InternedStr>>,
334    /// Components (e.g., "users", "email")
335    components: RwLock<HashSet<Arc<str>>>,
336}
337
338impl IdentifierCache {
339    /// Create a new identifier cache.
340    pub fn new() -> Self {
341        Self {
342            full: RwLock::new(HashMap::with_capacity(128)),
343            components: RwLock::new(HashSet::with_capacity(256)),
344        }
345    }
346
347    /// Get the global identifier cache.
348    pub fn global() -> &'static Self {
349        static INSTANCE: std::sync::OnceLock<IdentifierCache> = std::sync::OnceLock::new();
350        INSTANCE.get_or_init(Self::new)
351    }
352
353    /// Intern a table.column identifier.
354    ///
355    /// Also interns the individual components.
356    pub fn intern_qualified(&self, table: &str, column: &str) -> InternedStr {
357        let key = format!("{}.{}", table, column);
358
359        // Check cache
360        if let Some(cached) = self.full.read().get(&key) {
361            return cached.clone();
362        }
363
364        // Intern components
365        self.intern_component(table);
366        self.intern_component(column);
367
368        // Intern full identifier
369        let interned = GlobalInterner::get().intern(&key);
370
371        // Cache it
372        self.full.write().insert(key, interned.clone());
373
374        interned
375    }
376
377    /// Intern just a component (table name or column name).
378    pub fn intern_component(&self, name: &str) -> InternedStr {
379        // Check if already in components
380        {
381            let components = self.components.read();
382            if let Some(existing) = components.get(name) {
383                return InternedStr(Arc::clone(existing));
384            }
385        }
386
387        // Intern via global interner
388        let interned = GlobalInterner::get().intern(name);
389
390        // Add to components
391        self.components.write().insert(interned.0.clone());
392
393        interned
394    }
395
396    /// Get a cached qualified identifier.
397    pub fn get_qualified(&self, table: &str, column: &str) -> Option<InternedStr> {
398        let key = format!("{}.{}", table, column);
399        self.full.read().get(&key).cloned()
400    }
401
402    /// Get cached component count.
403    pub fn component_count(&self) -> usize {
404        self.components.read().len()
405    }
406
407    /// Get cached full identifier count.
408    pub fn qualified_count(&self) -> usize {
409        self.full.read().len()
410    }
411}
412
413impl Default for IdentifierCache {
414    fn default() -> Self {
415        Self::new()
416    }
417}
418
419// ============================================================================
420// Statistics
421// ============================================================================
422
/// Statistics for an interner.
#[derive(Debug, Clone, Default)]
pub struct InternerStats {
    /// Cache hits (requests answered from an existing entry).
    pub hits: u64,
    /// Cache misses (new strings interned).
    pub misses: u64,
    /// Total bytes interned.
    pub total_bytes: usize,
}

impl InternerStats {
    /// Get the hit ratio: `hits / (hits + misses)`.
    ///
    /// Returns `0.0` when no hits or misses have been recorded.
    pub fn hit_ratio(&self) -> f64 {
        // Both counters are public `u64`s, so their sum can exceed `u64::MAX`.
        // Adding in `u128` keeps the denominator exact instead of overflowing.
        let total = u128::from(self.hits) + u128::from(self.misses);
        if total == 0 {
            0.0
        } else {
            self.hits as f64 / total as f64
        }
    }
}
445
446// ============================================================================
447// Common Identifiers
448// ============================================================================
449
/// Identifiers that `GlobalInterner::prepopulate` interns up front.
///
/// The list is grouped by theme; the order of entries is the order in which
/// they are interned.
// Entries are packed several per line to keep the table compact; skip rustfmt
// so it does not expand them back to one per line.
#[rustfmt::skip]
const COMMON_IDENTIFIERS: &[&str] = &[
    // Common column names
    "id", "uuid", "name", "email", "username", "password", "password_hash", "title",
    "description", "content", "body", "text", "status", "state", "type", "kind", "role",
    "active", "enabled", "deleted", "archived", "verified", "confirmed", "published",
    "visible", "public", "private",
    // Numeric fields
    "count", "total", "score", "rating", "priority", "order", "position", "rank", "level",
    "index", "sequence", "age", "amount", "price", "cost", "quantity", "weight", "height",
    "width", "length", "size",
    // Foreign keys
    "user_id", "account_id", "organization_id", "tenant_id", "post_id", "comment_id",
    "article_id", "product_id", "order_id", "item_id", "category_id", "tag_id",
    "parent_id", "author_id", "owner_id", "creator_id", "assignee_id", "reviewer_id",
    // Timestamps
    "created_at", "updated_at", "deleted_at", "published_at", "expires_at", "starts_at",
    "ends_at", "last_login_at", "last_seen_at", "verified_at", "confirmed_at",
    // URL/path fields
    "slug", "url", "uri", "path", "permalink", "link", "href", "src", "source",
    "destination",
    // Auth fields
    "key", "value", "token", "secret", "code", "pin", "otp", "api_key", "access_token",
    "refresh_token",
    // Metadata
    "version", "revision", "checksum", "hash", "signature", "fingerprint", "metadata",
    "data", "payload", "config", "settings", "options", "preferences",
    // Common table names
    "users", "accounts", "organizations", "tenants", "posts", "comments", "articles",
    "products", "orders", "items", "categories", "tags", "files", "images", "documents",
    "messages", "notifications", "events", "logs", "sessions", "tokens",
    // SQL keywords used as identifiers
    "SELECT", "FROM", "WHERE", "AND", "OR", "NOT", "IN", "IS", "NULL", "TRUE", "FALSE",
    "ASC", "DESC", "LIMIT", "OFFSET", "ORDER", "BY", "GROUP", "HAVING", "JOIN", "LEFT",
    "RIGHT", "INNER", "OUTER", "ON", "AS",
];
619
620// ============================================================================
621// Convenience Functions
622// ============================================================================
623
624/// Intern a string using the global interner.
625#[inline]
626pub fn intern(s: &str) -> InternedStr {
627    GlobalInterner::get().intern(s)
628}
629
630/// Try to get an already-interned string from the global interner.
631#[inline]
632pub fn get_interned(s: &str) -> Option<InternedStr> {
633    GlobalInterner::get().lookup(s)
634}
635
636/// Intern a qualified identifier (table.column).
637#[inline]
638pub fn intern_qualified(table: &str, column: &str) -> InternedStr {
639    IdentifierCache::global().intern_qualified(table, column)
640}
641
642/// Intern just a component (table or column name).
643#[inline]
644pub fn intern_component(name: &str) -> InternedStr {
645    IdentifierCache::global().intern_component(name)
646}
647
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning the same text twice in the global interner shares one allocation.
    #[test]
    fn test_global_interner_dedup() {
        let global = GlobalInterner::get();
        let first = global.intern("test_field");
        let second = global.intern("test_field");

        assert!(InternedStr::ptr_eq(&first, &second));
    }

    /// A scoped interner deduplicates and stores each string once.
    #[test]
    fn test_scoped_interner() {
        let mut scoped = ScopedInterner::new();
        let first = scoped.intern("scoped_field");
        let second = scoped.intern("scoped_field");

        assert!(InternedStr::ptr_eq(&first, &second));
        assert_eq!(scoped.len(), 1);
    }

    /// Qualified identifiers are cached and rendered as `table.column`.
    #[test]
    fn test_identifier_cache_qualified() {
        let cache = IdentifierCache::new();
        let first = cache.intern_qualified("users", "email");
        let second = cache.intern_qualified("users", "email");

        assert!(InternedStr::ptr_eq(&first, &second));
        assert_eq!(first.as_str(), "users.email");
    }

    /// Handles compare equal exactly when their text is equal.
    #[test]
    fn test_interned_str_equality() {
        let global = GlobalInterner::get();
        let same_a = global.intern("equal_test");
        let same_b = global.intern("equal_test");
        let other = global.intern("different");

        assert_eq!(same_a, same_b);
        assert_ne!(same_a, other);
    }

    /// A handle can be found in a hash set through an equal handle.
    #[test]
    fn test_interned_str_hash() {
        use std::collections::HashSet;

        let global = GlobalInterner::get();
        let stored = global.intern("hash_test");
        let probe = global.intern("hash_test");

        let mut seen = HashSet::new();
        seen.insert(stored.clone());

        assert!(seen.contains(&probe));
    }

    /// Hits and misses are counted per intern call.
    #[test]
    fn test_interner_stats() {
        let mut scoped = ScopedInterner::new();

        // The first request for a string is a miss.
        let _ = scoped.intern("stats_test");
        assert_eq!(scoped.stats().misses, 1);
        assert_eq!(scoped.stats().hits, 0);

        // Repeating it is a hit and leaves the miss count alone.
        let _ = scoped.intern("stats_test");
        assert_eq!(scoped.stats().misses, 1);
        assert_eq!(scoped.stats().hits, 1);

        assert!(scoped.stats().hit_ratio() > 0.4);
    }

    /// Common identifiers are present in the global interner.
    #[test]
    fn test_common_identifiers_prepopulated() {
        let global = GlobalInterner::get();

        // Interning pre-populated names must work like any other name.
        for name in ["id", "created_at", "user_id"] {
            let _ = global.intern(name);
        }

        // They can be found without interning anything new.
        assert!(global.lookup("id").is_some());
        assert!(global.lookup("email").is_some());
    }

    /// `From<&str>` and `From<String>` both produce handles with the right text.
    #[test]
    fn test_interned_str_from() {
        let from_slice: InternedStr = "from_str".into();
        let from_owned: InternedStr = String::from("from_string").into();

        assert_eq!(from_slice.as_str(), "from_str");
        assert_eq!(from_owned.as_str(), "from_string");
    }
}
752