ddex_builder/
optimized_strings.rs

1//! Optimized string handling for DDEX Builder performance
2//! 
3//! This module provides string interning, Cow optimization, and memory-efficient
4//! string operations to reduce allocations and improve build performance.
5
6use std::borrow::Cow;
7use indexmap::IndexMap;
8use std::sync::Arc;
9use once_cell::sync::Lazy;
10use smartstring::{SmartString, LazyCompact};
11use string_cache::{DefaultAtom, Atom};
12use indexmap::IndexSet;
13
14/// High-performance string type for small strings
15pub type FastString = SmartString<LazyCompact>;
16
17/// Static string cache for common DDEX values
18static COMMON_STRINGS: Lazy<IndexMap<&'static str, &'static str>> = Lazy::new(|| {
19    let mut map = IndexMap::new();
20    
21    // Common DDEX versions
22    map.insert("4.3", "4.3");
23    map.insert("4.2", "4.2");
24    map.insert("4.1", "4.1");
25    
26    // Common message types
27    map.insert("NewReleaseMessage", "NewReleaseMessage");
28    map.insert("PurgeReleaseMessage", "PurgeReleaseMessage");
29    map.insert("LiveMessage", "LiveMessage");
30    
31    // Common roles
32    map.insert("MainArtist", "MainArtist");
33    map.insert("FeaturedArtist", "FeaturedArtist");
34    map.insert("Producer", "Producer");
35    map.insert("Composer", "Composer");
36    map.insert("Performer", "Performer");
37    map.insert("Engineer", "Engineer");
38    map.insert("Mixer", "Mixer");
39    
40    // Common resource types
41    map.insert("SoundRecording", "SoundRecording");
42    map.insert("Video", "Video");
43    map.insert("Image", "Image");
44    map.insert("Text", "Text");
45    
46    // Common release types
47    map.insert("Single", "Single");
48    map.insert("Album", "Album");
49    map.insert("EP", "EP");
50    map.insert("Compilation", "Compilation");
51    
52    // Common genres
53    map.insert("Rock", "Rock");
54    map.insert("Pop", "Pop");
55    map.insert("Electronic", "Electronic");
56    map.insert("Hip-Hop", "Hip-Hop");
57    map.insert("Classical", "Classical");
58    map.insert("Jazz", "Jazz");
59    map.insert("Country", "Country");
60    map.insert("R&B", "R&B");
61    map.insert("Folk", "Folk");
62    map.insert("Alternative", "Alternative");
63    
64    // Common language codes
65    map.insert("en", "en");
66    map.insert("es", "es");
67    map.insert("fr", "fr");
68    map.insert("de", "de");
69    map.insert("it", "it");
70    map.insert("pt", "pt");
71    map.insert("ja", "ja");
72    map.insert("ko", "ko");
73    map.insert("zh", "zh");
74    
75    // Common territory codes
76    map.insert("US", "US");
77    map.insert("GB", "GB");
78    map.insert("CA", "CA");
79    map.insert("AU", "AU");
80    map.insert("DE", "DE");
81    map.insert("FR", "FR");
82    map.insert("JP", "JP");
83    map.insert("KR", "KR");
84    
85    // Common commercial models
86    map.insert("SubscriptionModel", "SubscriptionModel");
87    map.insert("PermanentDownload", "PermanentDownload");
88    map.insert("AdSupportedModel", "AdSupportedModel");
89    map.insert("ConditionalDownload", "ConditionalDownload");
90    
91    // Common prefixes for copyright
92    map.insert("℗ ", "℗ ");
93    map.insert("© ", "© ");
94    
95    map
96});
97
98/// String interner for repeated values during build process
99#[derive(Debug, Default)]
100pub struct StringInterner {
101    /// Interned strings storage
102    strings: IndexSet<Arc<str>>,
103    /// Quick lookup for atoms
104    atoms: IndexMap<String, DefaultAtom>,
105}
106
107impl StringInterner {
108    /// Create a new string interner
109    pub fn new() -> Self {
110        Self {
111            strings: IndexSet::new(),
112            atoms: IndexMap::new(),
113        }
114    }
115    
116    /// Intern a string, returning a reference to the interned version
117    pub fn intern(&mut self, s: &str) -> Arc<str> {
118        // Check static cache first
119        if let Some(&static_str) = COMMON_STRINGS.get(s) {
120            return Arc::from(static_str);
121        }
122        
123        // Check if already interned
124        if let Some(existing) = self.strings.get(s) {
125            return existing.clone();
126        }
127        
128        // Intern new string
129        let arc_str: Arc<str> = Arc::from(s);
130        self.strings.insert(arc_str.clone());
131        arc_str
132    }
133    
134    /// Intern as an atom for even better performance on repeated lookups
135    pub fn intern_atom(&mut self, s: String) -> DefaultAtom {
136        if let Some(atom) = self.atoms.get(&s) {
137            return atom.clone();
138        }
139        
140        let atom = DefaultAtom::from(s.as_str());
141        self.atoms.insert(s, atom.clone());
142        atom
143    }
144    
145    /// Get memory usage statistics
146    pub fn memory_usage(&self) -> usize {
147        self.strings.iter()
148            .map(|s| s.len() + std::mem::size_of::<Arc<str>>())
149            .sum::<usize>()
150            + self.atoms.len() * std::mem::size_of::<DefaultAtom>()
151    }
152    
153    /// Clear the interner (useful for long-running processes)
154    pub fn clear(&mut self) {
155        self.strings.clear();
156        self.atoms.clear();
157    }
158}
159
160/// Optimized string for DDEX data
161#[derive(Debug, Clone, PartialEq, Eq, Hash)]
162pub enum OptimizedString {
163    /// Static string reference (zero allocation)
164    Static(&'static str),
165    /// Interned string (shared allocation)
166    Interned(Arc<str>),
167    /// Small string optimization
168    Small(FastString),
169    /// Atom for very frequent lookups
170    Atom(DefaultAtom),
171}
172
173impl OptimizedString {
174    /// Create from a string, choosing the most efficient representation
175    pub fn new(s: &str) -> Self {
176        // Check if it's a common static string
177        if let Some(&static_str) = COMMON_STRINGS.get(s) {
178            return OptimizedString::Static(static_str);
179        }
180        
181        // Use small string optimization for short strings
182        if s.len() <= 23 {  // SmartString threshold
183            OptimizedString::Small(FastString::from(s))
184        } else {
185            // For longer strings, we'll need interning context
186            OptimizedString::Small(FastString::from(s))
187        }
188    }
189    
190    /// Create from an interned string
191    pub fn interned(s: Arc<str>) -> Self {
192        OptimizedString::Interned(s)
193    }
194    
195    /// Create from an atom
196    pub fn atom(atom: DefaultAtom) -> Self {
197        OptimizedString::Atom(atom)
198    }
199    
200    /// Get the string value
201    pub fn as_str(&self) -> &str {
202        match self {
203            OptimizedString::Static(s) => s,
204            OptimizedString::Interned(s) => s,
205            OptimizedString::Small(s) => s,
206            OptimizedString::Atom(atom) => atom,
207        }
208    }
209    
210    /// Get memory footprint in bytes
211    pub fn memory_footprint(&self) -> usize {
212        match self {
213            OptimizedString::Static(_) => 0, // No allocation
214            OptimizedString::Interned(_) => std::mem::size_of::<Arc<str>>(),
215            OptimizedString::Small(s) => s.capacity(),
216            OptimizedString::Atom(_) => std::mem::size_of::<DefaultAtom>(),
217        }
218    }
219}
220
221impl AsRef<str> for OptimizedString {
222    fn as_ref(&self) -> &str {
223        self.as_str()
224    }
225}
226
227impl std::fmt::Display for OptimizedString {
228    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
229        f.write_str(self.as_str())
230    }
231}
232
/// String that is either borrowed from static storage or owned.
///
/// Alias for `Cow<'static, str>`: the borrowed case holds a `&'static str`,
/// the owned case holds a `String`.
pub type CowString = Cow<'static, str>;
235
236/// Optimized localized string that minimizes allocations
237#[derive(Debug, Clone, PartialEq, Eq)]
238pub struct OptimizedLocalizedString {
239    pub text: OptimizedString,
240    pub language_code: Option<OptimizedString>,
241}
242
243impl OptimizedLocalizedString {
244    /// Create a new optimized localized string
245    pub fn new(text: &str, language_code: Option<&str>) -> Self {
246        Self {
247            text: OptimizedString::new(text),
248            language_code: language_code.map(OptimizedString::new),
249        }
250    }
251    
252    /// Memory footprint of this localized string
253    pub fn memory_footprint(&self) -> usize {
254        self.text.memory_footprint() 
255            + self.language_code.as_ref()
256                .map(|lc| lc.memory_footprint())
257                .unwrap_or(0)
258    }
259}
260
/// Buffer pool for XML generation to reduce allocations.
///
/// Callers take a `String` with [`BufferPool::get_buffer`], fill it, and hand
/// it back with [`BufferPool::return_buffer`] so that its allocation can be
/// reused by a later request.
#[derive(Debug, Default)]
pub struct BufferPool {
    /// Idle buffers waiting to be reused.
    buffers: Vec<String>,
    /// Estimated bytes held by buffers that are currently checked out.
    current_size: usize,
}

impl BufferPool {
    /// Largest capacity (in bytes) a returned buffer may have and still be
    /// kept; anything bigger is dropped so one huge build does not pin memory.
    const MAX_POOLED_CAPACITY: usize = 8192;

    /// Create an empty buffer pool.
    pub fn new() -> Self {
        Self::default()
    }

    /// Get an empty buffer with room for at least `estimated_size` bytes.
    ///
    /// Reuses an idle buffer when one is available (growing it if it is too
    /// small) and allocates a new one otherwise.
    pub fn get_buffer(&mut self, estimated_size: usize) -> String {
        let buffer = match self.buffers.pop() {
            Some(mut reused) => {
                reused.clear();
                // `reserve` counts from the current length, which is 0 after
                // `clear`, so the full estimate must be requested. Asking
                // only for the difference to the old capacity would leave
                // the buffer smaller than the caller asked for.
                reused.reserve(estimated_size);
                reused
            }
            None => String::with_capacity(estimated_size),
        };

        self.current_size += buffer.capacity();
        buffer
    }

    /// Return a buffer to the pool.
    ///
    /// The buffer stops counting as checked out. It is kept for reuse unless
    /// its capacity exceeds the pooling limit, in which case it is dropped.
    pub fn return_buffer(&mut self, buffer: String) {
        // The buffer may have grown while it was checked out, so the
        // subtraction saturates instead of underflowing.
        self.current_size = self.current_size.saturating_sub(buffer.capacity());

        if buffer.capacity() <= Self::MAX_POOLED_CAPACITY {
            self.buffers.push(buffer);
        }
    }

    /// Estimate the bytes this pool is responsible for.
    ///
    /// This is the capacity of every idle buffer plus the estimate for
    /// buffers that are checked out. A buffer is counted in exactly one of
    /// the two groups, never both. The checked-out part is an estimate
    /// because buffers can grow, or be dropped, while outside the pool.
    pub fn memory_usage(&self) -> usize {
        let idle_bytes: usize = self.buffers.iter().map(|b| b.capacity()).sum();
        self.current_size + idle_bytes
    }

    /// Drop all idle buffers and reset the usage estimate.
    pub fn clear(&mut self) {
        self.buffers.clear();
        self.current_size = 0;
    }
}
314
315/// Build context that manages optimized strings and buffers
316#[derive(Debug, Default)]
317pub struct BuildContext {
318    /// String interner for repeated values
319    pub interner: StringInterner,
320    /// Buffer pool for XML generation
321    pub buffer_pool: BufferPool,
322    /// Statistics
323    pub stats: BuildStats,
324}
325
326impl BuildContext {
327    /// Create a new build context
328    pub fn new() -> Self {
329        Self {
330            interner: StringInterner::new(),
331            buffer_pool: BufferPool::new(),
332            stats: BuildStats::default(),
333        }
334    }
335    
336    /// Optimize a string using the context's interner
337    pub fn optimize_string(&mut self, s: &str) -> OptimizedString {
338        self.stats.strings_processed += 1;
339        
340        // Track if we use static cache
341        if COMMON_STRINGS.contains_key(s) {
342            self.stats.static_cache_hits += 1;
343            return OptimizedString::new(s);
344        }
345        
346        // Check if worth interning (repeated strings)
347        if s.len() > 23 {  // Beyond small string optimization
348            let interned = self.interner.intern(s);
349            self.stats.interned_strings += 1;
350            OptimizedString::interned(interned)
351        } else {
352            OptimizedString::new(s)
353        }
354    }
355    
356    /// Get a buffer for XML generation
357    pub fn get_xml_buffer(&mut self, estimated_size: usize) -> String {
358        self.stats.buffers_requested += 1;
359        self.buffer_pool.get_buffer(estimated_size)
360    }
361    
362    /// Return a buffer to the pool
363    pub fn return_xml_buffer(&mut self, buffer: String) {
364        self.buffer_pool.return_buffer(buffer);
365    }
366    
367    /// Get memory usage statistics
368    pub fn memory_usage(&self) -> MemoryUsage {
369        MemoryUsage {
370            interner_bytes: self.interner.memory_usage(),
371            buffer_pool_bytes: self.buffer_pool.memory_usage(),
372            total_bytes: self.interner.memory_usage() + self.buffer_pool.memory_usage(),
373        }
374    }
375    
376    /// Reset context for next build (keeps caches)
377    pub fn reset_for_next_build(&mut self) {
378        // Don't clear interner - strings likely to be reused
379        self.buffer_pool.clear();
380        self.stats = BuildStats::default();
381    }
382    
383    /// Full reset including caches
384    pub fn full_reset(&mut self) {
385        self.interner.clear();
386        self.buffer_pool.clear();
387        self.stats = BuildStats::default();
388    }
389}
390
/// Counters collected by a `BuildContext` for performance monitoring.
#[derive(Debug, Default, Clone)]
pub struct BuildStats {
    /// Number of strings passed to `BuildContext::optimize_string`.
    pub strings_processed: usize,
    /// How many of those strings were found in the common-strings table.
    pub static_cache_hits: usize,
    /// How many of those strings were routed through the interner.
    pub interned_strings: usize,
    /// Number of buffers requested through `BuildContext::get_xml_buffer`.
    pub buffers_requested: usize,
}
399
/// Snapshot of the memory figures reported by a `BuildContext`.
#[derive(Debug, Clone)]
pub struct MemoryUsage {
    /// Bytes reported by the string interner.
    pub interner_bytes: usize,
    /// Bytes reported by the buffer pool.
    pub buffer_pool_bytes: usize,
    /// Combined figure for the interner and the buffer pool.
    pub total_bytes: usize,
}
407
/// Pre-calculated buffer sizes for different build types.
pub mod buffer_sizes {
    /// Estimated XML output for a single-track release (~8KB).
    pub const SINGLE_TRACK_XML: usize = 8_192;
    /// Estimated XML output for a 12-track album (~64KB).
    pub const ALBUM_12_TRACKS_XML: usize = 65_536;
    /// Estimated XML output for a 100-track compilation (~512KB).
    pub const COMPILATION_100_TRACKS_XML: usize = 524_288;

    /// Multiplier applied to every estimate: 20% overhead for safety.
    pub const BUFFER_OVERHEAD_FACTOR: f32 = 1.2;

    /// Estimate the XML buffer size, in bytes, for a release with
    /// `track_count` tracks.
    ///
    /// * 0 or 1 track: the single-track size.
    /// * 2 to 20 tracks: the 12-track album size scaled linearly by
    ///   `track_count / 12`, never below the single-track size.
    /// * More than 20 tracks: the 100-track compilation size scaled linearly
    ///   by `track_count / 100`.
    ///
    /// The result includes the overhead factor and is truncated to a whole
    /// number of bytes.
    pub fn estimated_xml_size(track_count: usize) -> usize {
        let scaled = match track_count {
            0 | 1 => SINGLE_TRACK_XML,
            2..=20 => (ALBUM_12_TRACKS_XML * track_count / 12).max(SINGLE_TRACK_XML),
            _ => COMPILATION_100_TRACKS_XML * track_count / 100,
        };

        (scaled as f32 * BUFFER_OVERHEAD_FACTOR) as usize
    }
}
436
#[cfg(test)]
mod tests {
    use super::*;

    /// A value from the common-strings table is stored as `Static` and
    /// reports a zero footprint.
    #[test]
    fn test_optimized_string_static_cache() {
        let optimized = OptimizedString::new("MainArtist");

        if let OptimizedString::Static(val) = &optimized {
            assert_eq!(*val, "MainArtist");
        } else {
            panic!("Expected static string");
        }

        // Should be zero allocation
        assert_eq!(optimized.memory_footprint(), 0);
    }

    /// Interning the same text twice yields handles to one allocation.
    #[test]
    fn test_string_interner() {
        let mut interner = StringInterner::new();
        let text = "Custom Artist Name";

        let first = interner.intern(text);
        let second = interner.intern(text);

        // Should be same Arc
        assert_eq!(first.as_ptr(), second.as_ptr());
    }

    /// A returned buffer is handed out again, emptied, with its capacity intact.
    #[test]
    fn test_buffer_pool() {
        let mut pool = BufferPool::new();

        let mut first = pool.get_buffer(1024);
        first.push_str("test content");
        assert!(first.capacity() >= 1024);

        pool.return_buffer(first);

        let second = pool.get_buffer(512);
        assert!(second.is_empty());
        assert!(second.capacity() >= 1024); // Reused larger buffer
    }

    /// Size estimates match the reference constants and grow with track count.
    #[test]
    fn test_buffer_size_estimation() {
        let with_overhead =
            |bytes: usize| (bytes as f32 * buffer_sizes::BUFFER_OVERHEAD_FACTOR) as usize;

        assert_eq!(
            buffer_sizes::estimated_xml_size(1),
            with_overhead(buffer_sizes::SINGLE_TRACK_XML)
        );
        assert_eq!(
            buffer_sizes::estimated_xml_size(12),
            with_overhead(buffer_sizes::ALBUM_12_TRACKS_XML)
        );

        // Large compilation should scale
        let size_100 = buffer_sizes::estimated_xml_size(100);
        let size_200 = buffer_sizes::estimated_xml_size(200);
        assert!(size_200 > size_100);
    }
}