ddex_builder/
optimized_strings.rs

1//! Optimized string handling for DDEX Builder performance
2//!
3//! This module provides string interning, Cow optimization, and memory-efficient
4//! string operations to reduce allocations and improve build performance.
5
6use indexmap::IndexMap;
7use indexmap::IndexSet;
8use once_cell::sync::Lazy;
9use smartstring::{LazyCompact, SmartString};
10use std::borrow::Cow;
11use std::sync::Arc;
12use string_cache::DefaultAtom;
13
/// Compact string type used for short values.
///
/// Alias for `SmartString<LazyCompact>`. `OptimizedString::new` stores every
/// value that is not found in the static table in this type.
// NOTE(review): the 23-byte threshold used elsewhere in this file presumably
// mirrors SmartString's inline capacity on 64-bit targets — confirm against
// the smartstring documentation.
pub type FastString = SmartString<LazyCompact>;
16
17/// Static string cache for common DDEX values
18static COMMON_STRINGS: Lazy<IndexMap<&'static str, &'static str>> = Lazy::new(|| {
19    let mut map = IndexMap::new();
20
21    // Common DDEX versions
22    map.insert("4.3", "4.3");
23    map.insert("4.2", "4.2");
24    map.insert("4.1", "4.1");
25
26    // Common message types
27    map.insert("NewReleaseMessage", "NewReleaseMessage");
28    map.insert("PurgeReleaseMessage", "PurgeReleaseMessage");
29    map.insert("LiveMessage", "LiveMessage");
30
31    // Common roles
32    map.insert("MainArtist", "MainArtist");
33    map.insert("FeaturedArtist", "FeaturedArtist");
34    map.insert("Producer", "Producer");
35    map.insert("Composer", "Composer");
36    map.insert("Performer", "Performer");
37    map.insert("Engineer", "Engineer");
38    map.insert("Mixer", "Mixer");
39
40    // Common resource types
41    map.insert("SoundRecording", "SoundRecording");
42    map.insert("Video", "Video");
43    map.insert("Image", "Image");
44    map.insert("Text", "Text");
45
46    // Common release types
47    map.insert("Single", "Single");
48    map.insert("Album", "Album");
49    map.insert("EP", "EP");
50    map.insert("Compilation", "Compilation");
51
52    // Common genres
53    map.insert("Rock", "Rock");
54    map.insert("Pop", "Pop");
55    map.insert("Electronic", "Electronic");
56    map.insert("Hip-Hop", "Hip-Hop");
57    map.insert("Classical", "Classical");
58    map.insert("Jazz", "Jazz");
59    map.insert("Country", "Country");
60    map.insert("R&B", "R&B");
61    map.insert("Folk", "Folk");
62    map.insert("Alternative", "Alternative");
63
64    // Common language codes
65    map.insert("en", "en");
66    map.insert("es", "es");
67    map.insert("fr", "fr");
68    map.insert("de", "de");
69    map.insert("it", "it");
70    map.insert("pt", "pt");
71    map.insert("ja", "ja");
72    map.insert("ko", "ko");
73    map.insert("zh", "zh");
74
75    // Common territory codes
76    map.insert("US", "US");
77    map.insert("GB", "GB");
78    map.insert("CA", "CA");
79    map.insert("AU", "AU");
80    map.insert("DE", "DE");
81    map.insert("FR", "FR");
82    map.insert("JP", "JP");
83    map.insert("KR", "KR");
84
85    // Common commercial models
86    map.insert("SubscriptionModel", "SubscriptionModel");
87    map.insert("PermanentDownload", "PermanentDownload");
88    map.insert("AdSupportedModel", "AdSupportedModel");
89    map.insert("ConditionalDownload", "ConditionalDownload");
90
91    // Common prefixes for copyright
92    map.insert("℗ ", "℗ ");
93    map.insert("© ", "© ");
94
95    map
96});
97
98/// String interner for repeated values during build process
99#[derive(Debug, Default)]
100pub struct StringInterner {
101    /// Interned strings storage
102    strings: IndexSet<Arc<str>>,
103    /// Quick lookup for atoms
104    atoms: IndexMap<String, DefaultAtom>,
105}
106
107impl StringInterner {
108    /// Create a new string interner
109    pub fn new() -> Self {
110        Self {
111            strings: IndexSet::new(),
112            atoms: IndexMap::new(),
113        }
114    }
115
116    /// Intern a string, returning a reference to the interned version
117    pub fn intern(&mut self, s: &str) -> Arc<str> {
118        // Check static cache first
119        if let Some(&static_str) = COMMON_STRINGS.get(s) {
120            return Arc::from(static_str);
121        }
122
123        // Check if already interned
124        if let Some(existing) = self.strings.get(s) {
125            return existing.clone();
126        }
127
128        // Intern new string
129        let arc_str: Arc<str> = Arc::from(s);
130        self.strings.insert(arc_str.clone());
131        arc_str
132    }
133
134    /// Intern as an atom for even better performance on repeated lookups
135    pub fn intern_atom(&mut self, s: String) -> DefaultAtom {
136        if let Some(atom) = self.atoms.get(&s) {
137            return atom.clone();
138        }
139
140        let atom = DefaultAtom::from(s.as_str());
141        self.atoms.insert(s, atom.clone());
142        atom
143    }
144
145    /// Get memory usage statistics
146    pub fn memory_usage(&self) -> usize {
147        self.strings
148            .iter()
149            .map(|s| s.len() + std::mem::size_of::<Arc<str>>())
150            .sum::<usize>()
151            + self.atoms.len() * std::mem::size_of::<DefaultAtom>()
152    }
153
154    /// Clear the interner (useful for long-running processes)
155    pub fn clear(&mut self) {
156        self.strings.clear();
157        self.atoms.clear();
158    }
159}
160
161/// Optimized string for DDEX data
162#[derive(Debug, Clone, PartialEq, Eq, Hash)]
163pub enum OptimizedString {
164    /// Static string reference (zero allocation)
165    Static(&'static str),
166    /// Interned string (shared allocation)
167    Interned(Arc<str>),
168    /// Small string optimization
169    Small(FastString),
170    /// Atom for very frequent lookups
171    Atom(DefaultAtom),
172}
173
174impl OptimizedString {
175    /// Create from a string, choosing the most efficient representation
176    pub fn new(s: &str) -> Self {
177        // Check if it's a common static string
178        if let Some(&static_str) = COMMON_STRINGS.get(s) {
179            return OptimizedString::Static(static_str);
180        }
181
182        // Use small string optimization for short strings
183        if s.len() <= 23 {
184            // SmartString threshold
185            OptimizedString::Small(FastString::from(s))
186        } else {
187            // For longer strings, we'll need interning context
188            OptimizedString::Small(FastString::from(s))
189        }
190    }
191
192    /// Create from an interned string
193    pub fn interned(s: Arc<str>) -> Self {
194        OptimizedString::Interned(s)
195    }
196
197    /// Create from an atom
198    pub fn atom(atom: DefaultAtom) -> Self {
199        OptimizedString::Atom(atom)
200    }
201
202    /// Get the string value
203    pub fn as_str(&self) -> &str {
204        match self {
205            OptimizedString::Static(s) => s,
206            OptimizedString::Interned(s) => s,
207            OptimizedString::Small(s) => s,
208            OptimizedString::Atom(atom) => atom,
209        }
210    }
211
212    /// Get memory footprint in bytes
213    pub fn memory_footprint(&self) -> usize {
214        match self {
215            OptimizedString::Static(_) => 0, // No allocation
216            OptimizedString::Interned(_) => std::mem::size_of::<Arc<str>>(),
217            OptimizedString::Small(s) => s.capacity(),
218            OptimizedString::Atom(_) => std::mem::size_of::<DefaultAtom>(),
219        }
220    }
221}
222
223impl AsRef<str> for OptimizedString {
224    fn as_ref(&self) -> &str {
225        self.as_str()
226    }
227}
228
229impl std::fmt::Display for OptimizedString {
230    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
231        f.write_str(self.as_str())
232    }
233}
234
/// Cow-optimized string for contexts where we may or may not own the data.
///
/// Alias for `Cow<'static, str>`: the borrowed form holds a `&'static str`,
/// the owned form holds a `String`.
pub type CowString = Cow<'static, str>;
237
238/// Localized string with language code
239#[derive(Debug, Clone, PartialEq, Eq)]
240pub struct OptimizedLocalizedString {
241    /// The text content
242    pub text: OptimizedString,
243    /// Optional ISO language code (e.g., "en", "es")
244    pub language_code: Option<OptimizedString>,
245}
246
247impl OptimizedLocalizedString {
248    /// Create a new optimized localized string
249    pub fn new(text: &str, language_code: Option<&str>) -> Self {
250        Self {
251            text: OptimizedString::new(text),
252            language_code: language_code.map(OptimizedString::new),
253        }
254    }
255
256    /// Memory footprint of this localized string
257    pub fn memory_footprint(&self) -> usize {
258        self.text.memory_footprint()
259            + self
260                .language_code
261                .as_ref()
262                .map(|lc| lc.memory_footprint())
263                .unwrap_or(0)
264    }
265}
266
/// Buffer pool for XML generation to reduce allocations.
///
/// Buffers are plain `String`s. Callers take one with [`BufferPool::get_buffer`]
/// and hand it back with [`BufferPool::return_buffer`] when done.
#[derive(Debug, Default)]
pub struct BufferPool {
    /// Idle buffers waiting to be reused.
    buffers: Vec<String>,
    /// Capacity, in bytes, of buffers currently handed out and not yet returned.
    current_size: usize,
}

impl BufferPool {
    /// Largest capacity, in bytes, a returned buffer may have and still be kept.
    const MAX_POOLED_CAPACITY: usize = 8192;

    /// Create a new, empty buffer pool.
    pub fn new() -> Self {
        Self::default()
    }

    /// Get an empty buffer with capacity for at least `estimated_size` bytes.
    ///
    /// Reuses an idle buffer when one is available (its contents are cleared
    /// and it is grown if needed); otherwise allocates a new one.
    pub fn get_buffer(&mut self, estimated_size: usize) -> String {
        let buffer = match self.buffers.pop() {
            Some(mut reused) => {
                reused.clear();
                // `reserve` counts from the length, which is 0 after `clear`,
                // so this guarantees `capacity >= estimated_size` and does
                // nothing when the buffer is already large enough.
                reused.reserve(estimated_size);
                reused
            }
            None => String::with_capacity(estimated_size),
        };

        self.current_size = self.current_size.saturating_add(buffer.capacity());
        buffer
    }

    /// Return a buffer to the pool.
    ///
    /// Buffers whose capacity exceeds 8192 bytes are dropped instead of being
    /// kept, so one large document does not pin memory for the pool's lifetime.
    pub fn return_buffer(&mut self, buffer: String) {
        // The buffer is no longer outstanding. It may have grown while in use
        // or may never have come from this pool, hence the saturating
        // subtraction.
        self.current_size = self.current_size.saturating_sub(buffer.capacity());

        if buffer.capacity() <= Self::MAX_POOLED_CAPACITY {
            self.buffers.push(buffer);
        }
    }

    /// Estimate current memory usage in bytes.
    ///
    /// This is the capacity of the buffers currently handed out plus the
    /// capacity of the idle buffers held by the pool. Each buffer is counted
    /// once.
    pub fn memory_usage(&self) -> usize {
        let idle_bytes: usize = self.buffers.iter().map(String::capacity).sum();
        self.current_size + idle_bytes
    }

    /// Clear the pool: drop all idle buffers and reset the accounting.
    pub fn clear(&mut self) {
        self.buffers.clear();
        self.current_size = 0;
    }
}
319
320/// Build context that manages optimized strings and buffers
321#[derive(Debug, Default)]
322pub struct BuildContext {
323    /// String interner for repeated values
324    pub interner: StringInterner,
325    /// Buffer pool for XML generation
326    pub buffer_pool: BufferPool,
327    /// Statistics
328    pub stats: BuildStats,
329}
330
331impl BuildContext {
332    /// Create a new build context
333    pub fn new() -> Self {
334        Self {
335            interner: StringInterner::new(),
336            buffer_pool: BufferPool::new(),
337            stats: BuildStats::default(),
338        }
339    }
340
341    /// Optimize a string using the context's interner
342    pub fn optimize_string(&mut self, s: &str) -> OptimizedString {
343        self.stats.strings_processed += 1;
344
345        // Track if we use static cache
346        if COMMON_STRINGS.contains_key(s) {
347            self.stats.static_cache_hits += 1;
348            return OptimizedString::new(s);
349        }
350
351        // Check if worth interning (repeated strings)
352        if s.len() > 23 {
353            // Beyond small string optimization
354            let interned = self.interner.intern(s);
355            self.stats.interned_strings += 1;
356            OptimizedString::interned(interned)
357        } else {
358            OptimizedString::new(s)
359        }
360    }
361
362    /// Get a buffer for XML generation
363    pub fn get_xml_buffer(&mut self, estimated_size: usize) -> String {
364        self.stats.buffers_requested += 1;
365        self.buffer_pool.get_buffer(estimated_size)
366    }
367
368    /// Return a buffer to the pool
369    pub fn return_xml_buffer(&mut self, buffer: String) {
370        self.buffer_pool.return_buffer(buffer);
371    }
372
373    /// Get memory usage statistics
374    pub fn memory_usage(&self) -> MemoryUsage {
375        MemoryUsage {
376            interner_bytes: self.interner.memory_usage(),
377            buffer_pool_bytes: self.buffer_pool.memory_usage(),
378            total_bytes: self.interner.memory_usage() + self.buffer_pool.memory_usage(),
379        }
380    }
381
382    /// Reset context for next build (keeps caches)
383    pub fn reset_for_next_build(&mut self) {
384        // Don't clear interner - strings likely to be reused
385        self.buffer_pool.clear();
386        self.stats = BuildStats::default();
387    }
388
389    /// Full reset including caches
390    pub fn full_reset(&mut self) {
391        self.interner.clear();
392        self.buffer_pool.clear();
393        self.stats = BuildStats::default();
394    }
395}
396
/// Statistics for string optimization.
///
/// Plain counters updated by `BuildContext`; all start at zero via `Default`.
#[derive(Debug, Default, Clone)]
pub struct BuildStats {
    /// Total strings processed (one per `BuildContext::optimize_string` call)
    pub strings_processed: usize,
    /// Cache hits for static strings
    pub static_cache_hits: usize,
    /// Number of strings routed through the interner. This counts calls, so a
    /// repeated string is counted each time it is seen.
    pub interned_strings: usize,
    /// Number of buffer requests (one per `BuildContext::get_xml_buffer` call)
    pub buffers_requested: usize,
}
409
/// Memory usage statistics, as returned by `BuildContext::memory_usage`.
#[derive(Debug, Clone)]
pub struct MemoryUsage {
    /// Bytes used by string interner
    pub interner_bytes: usize,
    /// Bytes in buffer pool
    pub buffer_pool_bytes: usize,
    /// Total memory usage (`interner_bytes + buffer_pool_bytes`)
    pub total_bytes: usize,
}
420
/// Memory size constants for planning
pub mod buffer_sizes {
    /// Typical size of single-track release XML (~8KB)
    pub const SINGLE_TRACK_XML: usize = 8_192;
    /// Typical size of 12-track album XML (~64KB)
    pub const ALBUM_12_TRACKS_XML: usize = 65_536;
    /// Typical size of 100-track compilation XML (~512KB)
    pub const COMPILATION_100_TRACKS_XML: usize = 524_288;

    /// Multiplier applied to every estimate (20% headroom for safety)
    pub const BUFFER_OVERHEAD_FACTOR: f32 = 1.2;

    /// Calculate an estimated XML buffer size, in bytes, for a track count.
    ///
    /// The pre-overhead size is:
    /// * `0` or `1` track — [`SINGLE_TRACK_XML`];
    /// * `2..=20` tracks — [`ALBUM_12_TRACKS_XML`] scaled linearly by
    ///   `track_count / 12`;
    /// * more than 20 tracks — [`COMPILATION_100_TRACKS_XML`] scaled linearly
    ///   by `track_count / 100`.
    ///
    /// The result is that size multiplied by [`BUFFER_OVERHEAD_FACTOR`] and
    /// truncated to an integer.
    pub fn estimated_xml_size(track_count: usize) -> usize {
        let scaled = match track_count {
            0 | 1 => SINGLE_TRACK_XML,
            // At most 65_536 * 20, which cannot overflow.
            2..=20 => ALBUM_12_TRACKS_XML * track_count / 12,
            // Saturate instead of overflowing for very large counts (the
            // plain product overflows above 8191 tracks on 32-bit targets).
            _ => COMPILATION_100_TRACKS_XML.saturating_mul(track_count) / 100,
        };

        // The float-to-integer cast saturates, so this cannot overflow either.
        (scaled as f32 * BUFFER_OVERHEAD_FACTOR) as usize
    }
}
451
#[cfg(test)]
mod tests {
    use super::*;

    /// A value from the static table is returned as `Static` and reports no footprint.
    #[test]
    fn test_optimized_string_static_cache() {
        let role = OptimizedString::new("MainArtist");

        if let OptimizedString::Static(val) = role {
            assert_eq!(val, "MainArtist");
        } else {
            panic!("Expected static string");
        }

        // Should be zero allocation
        assert_eq!(role.memory_footprint(), 0);
    }

    /// Interning the same text twice yields handles to the same allocation.
    #[test]
    fn test_string_interner() {
        let mut interner = StringInterner::new();
        let name = "Custom Artist Name";

        let first = interner.intern(name);
        let second = interner.intern(name);

        // Should be same Arc
        assert_eq!(first.as_ptr(), second.as_ptr());
    }

    /// A returned buffer is handed out again, emptied, with its capacity intact.
    #[test]
    fn test_buffer_pool() {
        let mut pool = BufferPool::new();

        let mut original = pool.get_buffer(1024);
        original.push_str("test content");
        assert!(original.capacity() >= 1024);
        pool.return_buffer(original);

        let reused = pool.get_buffer(512);
        assert!(reused.is_empty());
        assert!(reused.capacity() >= 1024); // Reused larger buffer
    }

    /// The reference track counts map to their constants times the overhead factor.
    #[test]
    fn test_buffer_size_estimation() {
        let with_overhead =
            |bytes: usize| (bytes as f32 * buffer_sizes::BUFFER_OVERHEAD_FACTOR) as usize;

        assert_eq!(
            buffer_sizes::estimated_xml_size(1),
            with_overhead(buffer_sizes::SINGLE_TRACK_XML)
        );
        assert_eq!(
            buffer_sizes::estimated_xml_size(12),
            with_overhead(buffer_sizes::ALBUM_12_TRACKS_XML)
        );

        // Large compilation should scale
        let size_100 = buffer_sizes::estimated_xml_size(100);
        let size_200 = buffer_sizes::estimated_xml_size(200);
        assert!(size_200 > size_100);
    }
}