stratum_dsp/
config.rs

1//! Configuration parameters for audio analysis
2
3use crate::preprocessing::normalization::NormalizationMethod;
4use crate::features::key::templates::TemplateSet;
5
6/// Analysis configuration parameters
7#[derive(Debug, Clone)]
8pub struct AnalysisConfig {
9    // Preprocessing
10    /// Silence detection threshold in dB (default: -40.0)
11    /// Frames with RMS below this threshold are considered silent
12    pub min_amplitude_db: f32,
13    
14    /// Normalization method to use (default: Peak)
15    pub normalization: NormalizationMethod,
16
17    /// Enable normalization step (default: true)
18    pub enable_normalization: bool,
19
20    /// Enable silence detection + trimming step (default: true)
21    pub enable_silence_trimming: bool,
22
23    // Onset detection (used by beat tracking and legacy BPM fallback)
24    /// Enable multi-detector onset consensus (spectral flux + HFC + optional HPSS) (default: true)
25    ///
26    /// Note: Tempogram BPM does not use this onset list, but legacy BPM + beat tracking do.
27    pub enable_onset_consensus: bool,
28
29    /// Threshold percentile for STFT-based onset detectors (spectral flux / HFC / HPSS) (default: 0.80)
30    /// Range: [0.0, 1.0]
31    pub onset_threshold_percentile: f32,
32
33    /// Onset clustering tolerance window in milliseconds for consensus voting (default: 50 ms)
34    pub onset_consensus_tolerance_ms: u32,
35
36    /// Consensus method weights [energy_flux, spectral_flux, hfc, hpss] (default: equal weights)
37    pub onset_consensus_weights: [f32; 4],
38
39    /// Enable HPSS-based onset detector inside consensus (default: false; more expensive)
40    pub enable_hpss_onsets: bool,
41
42    /// HPSS median-filter margin (default: 10). Typical values: 5–20.
43    pub hpss_margin: usize,
44    
45    // BPM detection
46    /// Force legacy BPM estimation (Phase 1B autocorrelation + comb filter) and skip tempogram.
47    /// Default: false.
48    ///
49    /// Intended for A/B validation and hybrid/consensus experimentation.
50    pub force_legacy_bpm: bool,
51
52    /// Enable BPM fusion (compute tempogram + legacy in parallel, then choose using consensus logic).
53    /// Default: false (tempogram-only unless it fails, then legacy fallback).
54    pub enable_bpm_fusion: bool,
55
56    /// Enable legacy BPM guardrails (soft confidence caps by tempo range).
57    /// Default: true.
58    pub enable_legacy_bpm_guardrails: bool,
59
60    /// Enable **true** multi-resolution tempogram BPM estimation.
61    ///
62    /// When enabled, BPM estimation recomputes STFT at hop sizes {256, 512, 1024} and fuses
63    /// candidates using a cross-resolution scoring rule. This is intended to reduce
64    /// metrical-level (T vs 2T vs T/2) errors.
65    ///
66    /// Default: true (Phase 1F tuning path).
67    pub enable_tempogram_multi_resolution: bool,
68
69    /// Multi-resolution fusion: number of hop=512 candidates to consider as anchors.
70    /// Default: 10.
71    pub tempogram_multi_res_top_k: usize,
72
73    /// Multi-resolution fusion weight for hop=512 (global beat).
74    pub tempogram_multi_res_w512: f32,
75    /// Multi-resolution fusion weight for hop=256 (fine transients).
76    pub tempogram_multi_res_w256: f32,
77    /// Multi-resolution fusion weight for hop=1024 (structural/metre level).
78    pub tempogram_multi_res_w1024: f32,
79
80    /// Structural discount factor applied when hop=1024 supports 2T instead of T.
81    pub tempogram_multi_res_structural_discount: f32,
82
83    /// Factor applied to hop=512 support when evaluating the 2T / T/2 hypotheses.
84    pub tempogram_multi_res_double_time_512_factor: f32,
85
86    /// Minimum score margin (absolute) required to switch between T / 2T / T/2 hypotheses.
87    pub tempogram_multi_res_margin_threshold: f32,
88
89    /// Enable a gentle human-tempo prior as a tie-breaker (only when scores are very close).
90    /// Default: false.
91    pub tempogram_multi_res_use_human_prior: bool,
92
93    /// Enable HPSS percussive-only tempogram fallback (ambiguous-only).
94    ///
95    /// This computes an HPSS decomposition on the (already computed) STFT magnitudes and re-runs
96    /// tempogram on the percussive component. Intended to reduce low-tempo half/double-time traps
97    /// caused by sustained harmonic energy.
98    ///
99    /// Default: true (Phase 1F tuning path).
100    pub enable_tempogram_percussive_fallback: bool,
101
102    /// Enable multi-band novelty fusion inside the tempogram estimator.
103    ///
104    /// This computes novelty curves over low/mid/high frequency bands, runs the tempogram
105    /// on each, then fuses their support when scoring BPM candidates. This is primarily
106    /// intended to improve **candidate generation** (getting GT into top-N candidates),
107    /// which is currently the limiting factor after metrical selection improvements.
108    ///
109    /// Default: true (Phase 1F tuning path).
110    pub enable_tempogram_band_fusion: bool,
111
112    /// Band split cutoffs (Hz). Bands are: low=[~0..low_max], mid=[low_max..mid_max], high=[mid_max..high_max].
113    /// If `tempogram_band_high_max_hz <= 0`, high extends to Nyquist.
114    pub tempogram_band_low_max_hz: f32,
115    /// Upper cutoff for the mid band (Hz).
116    pub tempogram_band_mid_max_hz: f32,
117    /// Upper cutoff for the high band (Hz). If <= 0, uses Nyquist.
118    pub tempogram_band_high_max_hz: f32,
119
120    /// Weight for the full-band tempogram contribution when band-score fusion is enabled.
121    pub tempogram_band_w_full: f32,
122    /// Weight for the low band contribution.
123    pub tempogram_band_w_low: f32,
124    /// Weight for the mid band contribution.
125    pub tempogram_band_w_mid: f32,
126    /// Weight for the high band contribution.
127    pub tempogram_band_w_high: f32,
128
129    /// If true, multi-band tempograms contribute **only to candidate seeding** (peak proposals),
130    /// while final candidate scoring remains full-band-only.
131    ///
132    /// This is the safer default: high-frequency bands often emphasize subdivisions (hi-hats),
133    /// which can otherwise increase 2× / 3:2 metrical errors if they directly affect scoring.
134    pub tempogram_band_seed_only: bool,
135
136    /// Minimum per-band normalized support required to count as "supporting" a BPM candidate
137    /// for band-consensus scoring.
138    ///
139    /// Range: [0, 1]. Default: 0.25.
140    pub tempogram_band_support_threshold: f32,
141
142    /// Bonus multiplier applied when **multiple bands** support the same BPM candidate.
143    ///
144    /// This is a lightweight "consensus" heuristic intended to reduce metrical/subdivision errors
145    /// (e.g., a 2× tempo supported only by the high band should not win over a tempo supported by
146    /// low+mid bands).
147    ///
148    /// Score adjustment: `score *= (1 + bonus * max(0, support_bands - 1))`.
149    pub tempogram_band_consensus_bonus: f32,
150
151    /// Tempogram novelty weights for combining {spectral, energy, HFC}.
152    pub tempogram_novelty_w_spectral: f32,
153    /// Tempogram novelty weight for energy flux.
154    pub tempogram_novelty_w_energy: f32,
155    /// Tempogram novelty weight for HFC.
156    pub tempogram_novelty_w_hfc: f32,
157    /// Tempogram novelty conditioning windows.
158    pub tempogram_novelty_local_mean_window: usize,
159    /// Tempogram novelty moving-average smoothing window (frames). Use 0/1 to disable.
160    pub tempogram_novelty_smooth_window: usize,
161
162    /// Debug: if set, the `analyze_file` example will pass this track ID through to the
163    /// multi-resolution fusion so it can print detailed scoring diagnostics.
164    pub debug_track_id: Option<u32>,
165    /// Debug: optional ground-truth BPM passed alongside `debug_track_id`.
166    pub debug_gt_bpm: Option<f32>,
167    /// Debug: number of top candidates per hop to print when `debug_track_id` is set.
168    pub debug_top_n: usize,
169
170    /// Enable log-mel novelty tempogram as an additional candidate generator/support signal.
171    ///
172    /// This computes a log-mel SuperFlux-style novelty curve, then runs the tempogram on it.
173    /// The resulting candidates are used for seeding and for the consensus bonus logic.
174    pub enable_tempogram_mel_novelty: bool,
175    /// Mel band count used by log-mel novelty.
176    pub tempogram_mel_n_mels: usize,
177    /// Minimum mel frequency (Hz).
178    pub tempogram_mel_fmin_hz: f32,
179    /// Maximum mel frequency (Hz). If <= 0, uses Nyquist.
180    pub tempogram_mel_fmax_hz: f32,
181    /// Max-filter neighborhood radius in mel bins (SuperFlux-style reference).
182    pub tempogram_mel_max_filter_bins: usize,
183    /// Weight for mel variant when band scoring fusion is enabled (`seed_only=false`).
184    pub tempogram_mel_weight: f32,
185
186    /// SuperFlux max-filter neighborhood radius (bins) used by the tempogram novelty extractor.
187    pub tempogram_superflux_max_filter_bins: usize,
188
189    /// Emit tempogram BPM candidate list (top-N) into `AnalysisMetadata` for validation/tuning.
190    ///
191    /// Default: false (avoid bloating outputs in normal use).
192    pub emit_tempogram_candidates: bool,
193
194    /// Number of tempogram candidates to emit when `emit_tempogram_candidates` is enabled.
195    /// Default: 10.
196    pub tempogram_candidates_top_n: usize,
197
198    /// Legacy guardrails: preferred BPM range (default: 75–150).
199    pub legacy_bpm_preferred_min: f32,
200    /// Legacy guardrails: preferred BPM range upper bound (default: 150).
201    pub legacy_bpm_preferred_max: f32,
202
203    /// Legacy guardrails: soft BPM range (default: 60–180).
204    /// Values in [soft_min, preferred_min) or (preferred_max, soft_max] get a medium cap.
205    pub legacy_bpm_soft_min: f32,
206    /// Legacy guardrails: soft BPM range upper bound (default: 180).
207    pub legacy_bpm_soft_max: f32,
208
209    /// Legacy guardrails: confidence caps by range.
210    /// - preferred: inside [preferred_min, preferred_max]
211    /// - soft: inside [soft_min, soft_max] but outside preferred
212    /// - extreme: outside [soft_min, soft_max]
213    ///
214    /// **Multiplier semantics**: these are applied as `confidence *= multiplier` to legacy
215    /// candidates/estimates (softly biasing the selection).
216    pub legacy_bpm_conf_mul_preferred: f32,
217    /// Legacy guardrails: confidence multiplier for the soft band (default: 0.50).
218    pub legacy_bpm_conf_mul_soft: f32,
219    /// Legacy guardrails: confidence multiplier for extremes (default: 0.10).
220    pub legacy_bpm_conf_mul_extreme: f32,
221
222    /// Minimum BPM to consider (default: 60.0)
223    pub min_bpm: f32,
224    
225    /// Maximum BPM to consider (default: 180.0)
226    pub max_bpm: f32,
227    
228    /// BPM resolution for comb filterbank (default: 1.0)
229    pub bpm_resolution: f32,
230    
231    // STFT parameters
232    /// Frame size for STFT (default: 2048)
233    pub frame_size: usize,
234    
235    /// Hop size for STFT (default: 512)
236    pub hop_size: usize,
237    
238    // Key detection
239    /// Center frequency for chroma extraction (default: 440.0 Hz, A4)
240    pub center_frequency: f32,
241    
242    /// Enable soft chroma mapping (default: true)
243    /// Soft mapping spreads frequency bins to neighboring semitones for robustness
244    pub soft_chroma_mapping: bool,
245    
246    /// Soft mapping standard deviation in semitones (default: 0.5)
247    /// Lower values = sharper mapping, higher values = more spread
248    pub soft_mapping_sigma: f32,
249    
250    /// Chroma sharpening power (default: 1.0 = no sharpening, 1.5-2.0 recommended)
251    /// Power > 1.0 emphasizes prominent semitones, improving key detection
252    pub chroma_sharpening_power: f32,
253
254    /// Enable a lightweight percussive-suppression step for key detection by time-smoothing
255    /// the STFT magnitude spectrogram prior to chroma extraction.
256    ///
257    /// This is HPSS-inspired (harmonic content is sustained in time; percussive is transient),
258    /// but uses a cheap moving-average rather than full iterative HPSS.
259    ///
260    /// Default: true.
261    pub enable_key_spectrogram_time_smoothing: bool,
262
263    /// Half-window size (in frames) for the key spectrogram time-smoothing.
264    /// Effective window length is `2*margin + 1`.
265    ///
266    /// Default: 12 (≈ 12 * hop_size samples ≈ 140 ms at 44.1kHz with hop=512).
267    pub key_spectrogram_smooth_margin: usize,
268
269    /// Enable weighted key aggregation (frame weights based on tonality + energy).
270    /// Default: true.
271    pub enable_key_frame_weighting: bool,
272
273    /// Minimum per-frame "tonalness" required to include the frame in key aggregation.
274    /// Tonalness is computed from chroma entropy and mapped to [0, 1].
275    /// Default: 0.10.
276    pub key_min_tonalness: f32,
277
278    /// Exponent applied to tonalness when building frame weights (>= 0).
279    /// Default: 2.0.
280    pub key_tonalness_power: f32,
281
282    /// Exponent applied to normalized frame energy when building frame weights (>= 0).
283    /// Default: 0.50 (square-root weighting).
284    pub key_energy_power: f32,
285
286    /// Enable a harmonic-emphasized spectrogram for key detection via a time-smoothing-derived
287    /// soft mask (cheap HPSS-inspired).
288    ///
289    /// If enabled, key detection uses `harmonic_spectrogram_time_mask()` instead of raw/time-smoothed
290    /// magnitudes when extracting chroma.
291    ///
292    /// Default: true.
293    pub enable_key_harmonic_mask: bool,
294
295    /// Soft-mask exponent \(p\) for harmonic masking (>= 1.0). Higher values produce harder masks.
296    /// Default: 2.0.
297    pub key_harmonic_mask_power: f32,
298
299    /// Enable median-filter HPSS harmonic extraction for key detection (key-only).
300    ///
301    /// This is a more literature-standard HPSS step than `harmonic_spectrogram_time_mask()`.
302    /// We compute time- and frequency-median estimates on a **time-downsampled**, **band-limited**
303    /// spectrogram, build a soft mask, then apply it to the full-resolution spectrogram.
304    ///
305    /// Default: false (opt-in; more expensive).
306    pub enable_key_hpss_harmonic: bool,
307
308    /// Time-downsampling step for key HPSS (>= 1). Values like 2–6 greatly reduce cost.
309    /// Default: 4.
310    pub key_hpss_frame_step: usize,
311
312    /// Half-window size (in downsampled frames) for the HPSS harmonic (time) median filter.
313    /// Effective window length is `2*margin + 1` (in downsampled frames).
314    /// Default: 8.
315    pub key_hpss_time_margin: usize,
316
317    /// Half-window size (in frequency bins) for the HPSS percussive (frequency) median filter.
318    /// Effective window length is `2*margin + 1` bins.
319    /// Default: 8.
320    pub key_hpss_freq_margin: usize,
321
322    /// Soft-mask exponent \(p\) for HPSS masking (>= 1.0). Higher values produce harder masks.
323    /// Default: 2.0.
324    pub key_hpss_mask_power: f32,
325
326    /// Enable a key-only STFT override (compute a separate STFT for key detection).
327    ///
328    /// Rationale: key detection benefits from higher frequency resolution than BPM/onset work.
329    /// A larger FFT size improves pitch precision at low frequencies where semitone spacing is small.
330    ///
331    /// Default: false (keep single shared STFT by default).
332    pub enable_key_stft_override: bool,
333
334    /// FFT frame size used for key-only STFT when `enable_key_stft_override` is true.
335    /// Default: 8192.
336    pub key_stft_frame_size: usize,
337
338    /// Hop size used for key-only STFT when `enable_key_stft_override` is true.
339    /// Default: 512.
340    pub key_stft_hop_size: usize,
341
342    /// Enable log-frequency (semitone-aligned) spectrogram for key detection.
343    ///
344    /// This converts the linear STFT magnitude spectrogram into a log-frequency representation
345    /// where each bin corresponds to one semitone. This provides better pitch-class resolution
346    /// than mapping linear FFT bins to semitones, especially at low frequencies.
347    ///
348    /// When enabled, chroma extraction works directly on semitone bins (no frequency-to-semitone
349    /// mapping needed). HPCP is disabled when log-frequency is enabled (HPCP requires frequency
350    /// information for harmonic summation).
351    ///
352    /// Default: false (use linear STFT with frequency-to-semitone mapping).
353    pub enable_key_log_frequency: bool,
354
355    /// Enable beat-synchronous chroma extraction for key detection.
356    ///
357    /// This aligns chroma windows to beat boundaries instead of fixed-time frames, improving
358    /// harmonic coherence by aligning to musical structure. For each beat interval, chroma vectors
359    /// from all STFT frames within that interval are averaged.
360    ///
361    /// Requires a valid beat grid (falls back to frame-based chroma if beat grid is unavailable).
362    /// HPCP is disabled when beat-synchronous is enabled (HPCP requires frame-based processing).
363    ///
364    /// Default: false (use frame-based chroma extraction).
365    pub enable_key_beat_synchronous: bool,
366
367    /// Enable multi-scale key detection (ensemble voting across multiple time scales).
368    ///
369    /// This runs key detection at multiple segment lengths (short, medium, long) and aggregates
370    /// results using clarity-weighted voting. This captures both local and global key information,
371    /// improving robustness on tracks with key changes or varying harmonic stability.
372    ///
373    /// Default: false (use single-scale detection).
374    pub enable_key_multi_scale: bool,
375
376    /// Template set to use for key detection.
377    ///
378    /// - `KrumhanslKessler`: Krumhansl-Kessler (1982) templates (empirical, from listening experiments)
379    /// - `Temperley`: Temperley (1999) templates (statistical, from corpus analysis)
380    ///
381    /// Default: `KrumhanslKessler`.
382    pub key_template_set: crate::features::key::templates::TemplateSet,
383
384    /// Enable ensemble key detection (combine K-K and Temperley template scores).
385    ///
386    /// This runs key detection with both template sets and combines their scores using
387    /// weighted voting. This ensemble approach can improve robustness by leveraging
388    /// complementary strengths of different template sets.
389    ///
390    /// Default: false (use single template set).
391    pub enable_key_ensemble: bool,
392
393    /// Weight for Krumhansl-Kessler scores in ensemble detection.
394    ///
395    /// Default: 0.5 (equal weight with Temperley).
396    pub key_ensemble_kk_weight: f32,
397
398    /// Weight for Temperley scores in ensemble detection.
399    ///
400    /// Default: 0.5 (equal weight with K-K).
401    pub key_ensemble_temperley_weight: f32,
402
403    /// Enable median key detection (detect key from multiple short segments and select median).
404    ///
405    /// This divides the track into multiple short overlapping segments, detects key for each
406    /// segment, and selects the median key (most common key across segments). This helps
407    /// handle brief modulations, breakdowns, or ambiguous sections.
408    ///
409    /// Default: false (use global key detection).
410    pub enable_key_median: bool,
411
412    /// Segment length (in frames) for median key detection.
413    ///
414    /// Default: 480 (~4 seconds at typical frame rates).
415    pub key_median_segment_length_frames: usize,
416
417    /// Segment hop size (in frames) for median key detection.
418    ///
419    /// Default: 120 (~1 second).
420    pub key_median_segment_hop_frames: usize,
421
422    /// Minimum number of segments required for median key detection.
423    ///
424    /// If fewer segments are available, falls back to global detection.
425    ///
426    /// Default: 3.
427    pub key_median_min_segments: usize,
428
429    /// Segment lengths (in frames) for multi-scale key detection.
430    /// Multiple scales are processed and aggregated with clarity-weighted voting.
431    /// Default: [120, 360, 720] (approximately 2s, 6s, 12s at typical frame rates).
432    pub key_multi_scale_lengths: Vec<usize>,
433
434    /// Hop size (in frames) between segments for multi-scale detection.
435    /// Default: 60 (approximately 1s at typical frame rates).
436    pub key_multi_scale_hop: usize,
437
438    /// Minimum clarity threshold for including a segment in multi-scale aggregation.
439    /// Default: 0.20.
440    pub key_multi_scale_min_clarity: f32,
441
442    /// Optional weights for each scale in multi-scale detection (if empty, all scales weighted equally).
443    /// Length should match `key_multi_scale_lengths`. Default: empty (equal weights).
444    pub key_multi_scale_weights: Vec<f32>,
445
446    /// Enable per-track tuning compensation for key detection.
447    ///
448    /// This estimates a global detuning offset (in semitones, relative to A4=440Hz) from the
449    /// key spectrogram, then shifts semitone mapping by that offset during chroma extraction.
450    ///
451    /// Default: true.
452    pub enable_key_tuning_compensation: bool,
453
454    /// Maximum absolute tuning correction to apply (semitones).
455    /// Default: 0.25.
456    pub key_tuning_max_abs_semitones: f32,
457
458    /// Frame subsampling step used for tuning estimation (>= 1).
459    /// Default: 20.
460    pub key_tuning_frame_step: usize,
461
462    /// Relative threshold (fraction of per-frame peak) for selecting bins used in tuning estimation.
463    /// Default: 0.35.
464    pub key_tuning_peak_rel_threshold: f32,
465
466    /// Enable trimming the first/last fraction of frames for key detection.
467    ///
468    /// DJ tracks often have long beat-only intros/outros; trimming edges reduces percussive bias
469    /// without affecting tempo (tempo uses its own pipeline).
470    ///
471    /// Default: true.
472    pub enable_key_edge_trim: bool,
473
474    /// Fraction (0..0.49) to trim from the start and end (symmetric) when `enable_key_edge_trim` is true.
475    /// Default: 0.15 (use middle 70%).
476    pub key_edge_trim_fraction: f32,
477
478    /// Enable segment voting for key detection (windowed key detection + score accumulation).
479    ///
480    /// Rationale: long-form DJ tracks can modulate, have breakdowns, or contain beat-only sections.
481    /// Segment voting helps focus on harmonically stable portions without requiring full key-change tracking.
482    ///
483    /// Default: true.
484    pub enable_key_segment_voting: bool,
485
486    /// Segment length in chroma frames for key voting.
487    /// Default: 1024 (~11.9s at 44.1kHz, hop=512).
488    pub key_segment_len_frames: usize,
489
490    /// Segment hop/stride in frames for key voting.
491    /// Default: 512 (~50% overlap).
492    pub key_segment_hop_frames: usize,
493
494    /// Minimum clarity required to include a segment in voting (0..1).
495    /// Default: 0.20.
496    pub key_segment_min_clarity: f32,
497
498    /// Enable a conservative mode heuristic to reduce minor→major mistakes.
499    ///
500    /// Uses the 3rd degree (minor third vs major third) from the aggregated chroma to potentially
501    /// flip parallel mode, gated by a score-ratio threshold.
502    ///
503    /// Default: true.
504    pub enable_key_mode_heuristic: bool,
505
506    /// Required ratio margin for the 3rd-degree test (>=0). If `p(min3) > p(maj3) * (1+margin)`
507    /// we prefer minor (and vice versa for major).
508    /// Default: 0.05.
509    pub key_mode_third_ratio_margin: f32,
510
511    /// Only flip parallel mode if the alternate mode's template score is at least this ratio of
512    /// the best mode's score (0..1).
513    /// Default: 0.92.
514    pub key_mode_flip_min_score_ratio: f32,
515
516    /// Enable HPCP-style pitch-class profile extraction for key detection.
517    ///
518    /// This uses spectral peak picking + harmonic summation to form a more robust tonal profile
519    /// than raw STFT-bin chroma on real-world mixes.
520    ///
521    /// Default: false (experimental).
522    pub enable_key_hpcp: bool,
523
524    /// Number of spectral peaks per frame used for HPCP extraction.
525    /// Default: 24.
526    pub key_hpcp_peaks_per_frame: usize,
527
528    /// Number of harmonics per peak used for HPCP extraction.
529    /// Default: 4.
530    pub key_hpcp_num_harmonics: usize,
531
532    /// Harmonic decay factor applied per harmonic (0..1). Lower values emphasize fundamentals.
533    /// Default: 0.60.
534    pub key_hpcp_harmonic_decay: f32,
535
536    /// Magnitude compression exponent for peak weights (0..1].
537    /// Default: 0.50 (sqrt).
538    pub key_hpcp_mag_power: f32,
539
540    /// Enable spectral whitening (per-frame frequency-domain normalization) for HPCP peak picking.
541    ///
542    /// This suppresses timbral formants and broadband coloration, helping peaks corresponding to
543    /// harmonic partials stand out more consistently across mixes.
544    ///
545    /// Default: false.
546    pub enable_key_hpcp_whitening: bool,
547
548    /// Frequency smoothing window (in FFT bins) for HPCP whitening.
549    /// Larger values whiten more aggressively (more timbre suppression), but can also amplify noise.
550    ///
551    /// Default: 31.
552    pub key_hpcp_whitening_smooth_bins: usize,
553
554    /// Enable a bass-band HPCP blend (tonic reinforcement).
555    ///
556    /// Relative major/minor share pitch classes; bass/tonic emphasis can disambiguate mode in
557    /// dance music where the bassline strongly implies the tonic.
558    ///
559    /// Default: true.
560    pub enable_key_hpcp_bass_blend: bool,
561
562    /// Bass-band lower cutoff (Hz) for bass HPCP.
563    /// Default: 55.0.
564    pub key_hpcp_bass_fmin_hz: f32,
565
566    /// Bass-band upper cutoff (Hz) for bass HPCP.
567    /// Default: 300.0.
568    pub key_hpcp_bass_fmax_hz: f32,
569
570    /// Blend weight for bass HPCP (0..1). Final PCP = normalize((1-w)*full + w*bass).
571    /// Default: 0.35.
572    pub key_hpcp_bass_weight: f32,
573
574    /// Enable a minor-key harmonic bonus (leading-tone vs flat-7) when scoring templates.
575    ///
576    /// Many dance tracks in minor heavily use harmonic minor gestures (raised 7th). This bonus
577    /// nudges minor candidates whose pitch-class distribution supports a leading-tone.
578    ///
579    /// Default: true.
580    pub enable_key_minor_harmonic_bonus: bool,
581
582    /// Weight for the minor harmonic bonus. Internally scaled by the sum of frame weights so it
583    /// is comparable to the template-score scale.
584    ///
585    /// Default: 0.8.
586    pub key_minor_leading_tone_bonus_weight: f32,
587    
588    // ML refinement
589    /// Enable ML refinement (requires ml feature)
590    #[cfg(feature = "ml")]
591    pub enable_ml_refinement: bool,
592}
593
594impl Default for AnalysisConfig {
595    fn default() -> Self {
596        Self {
597            min_amplitude_db: -40.0,
598            normalization: NormalizationMethod::Peak,
599            enable_normalization: true,
600            enable_silence_trimming: true,
601            enable_onset_consensus: true,
602            onset_threshold_percentile: 0.80,
603            onset_consensus_tolerance_ms: 50,
604            onset_consensus_weights: [0.25, 0.25, 0.25, 0.25],
605            enable_hpss_onsets: false,
606            hpss_margin: 10,
607            force_legacy_bpm: false,
608            enable_bpm_fusion: false,
609            enable_legacy_bpm_guardrails: true,
610            enable_tempogram_multi_resolution: true,
611            tempogram_multi_res_top_k: 25,
612            tempogram_multi_res_w512: 0.45,
613            tempogram_multi_res_w256: 0.35,
614            tempogram_multi_res_w1024: 0.20,
615            tempogram_multi_res_structural_discount: 0.85,
616            tempogram_multi_res_double_time_512_factor: 0.92,
617            tempogram_multi_res_margin_threshold: 0.08,
618            tempogram_multi_res_use_human_prior: false,
619            // HPSS percussive fallback is very expensive and (so far) has not shown consistent gains.
620            // Keep it opt-in to avoid multi-second outliers during batch runs.
621            enable_tempogram_percussive_fallback: false,
622            enable_tempogram_band_fusion: true,
623            // Default cutoffs (Hz): ~kick/bass fundamentals, then body/rhythm textures, then attacks.
624            tempogram_band_low_max_hz: 200.0,
625            tempogram_band_mid_max_hz: 2000.0,
626            tempogram_band_high_max_hz: 8000.0,
627            // Default weights: keep full-band as anchor, but allow bands to pull candidates into view.
628            tempogram_band_w_full: 0.40,
629            tempogram_band_w_low: 0.25,
630            tempogram_band_w_mid: 0.20,
631            tempogram_band_w_high: 0.15,
632            tempogram_band_seed_only: true,
633            tempogram_band_support_threshold: 0.25,
634            tempogram_band_consensus_bonus: 0.08,
635            // Novelty weighting defaults (tuned on 200-track validation):
636            // shift weight toward transient-heavy signals (energy/HFC) to reduce octave/subdivision traps.
637            tempogram_novelty_w_spectral: 0.30,
638            tempogram_novelty_w_energy: 0.35,
639            tempogram_novelty_w_hfc: 0.35,
640            tempogram_novelty_local_mean_window: 16,
641            tempogram_novelty_smooth_window: 5,
642            debug_track_id: None,
643            debug_gt_bpm: None,
644            debug_top_n: 5,
645            enable_tempogram_mel_novelty: true,
646            tempogram_mel_n_mels: 40,
647            tempogram_mel_fmin_hz: 30.0,
648            tempogram_mel_fmax_hz: 8000.0,
649            tempogram_mel_max_filter_bins: 2,
650            tempogram_mel_weight: 0.15,
651            tempogram_superflux_max_filter_bins: 4,
652            emit_tempogram_candidates: false,
653            tempogram_candidates_top_n: 10,
654            // Tuned defaults (empirical, small-batch): slightly wider preferred band and
655            // slightly less aggressive down-weighting while keeping a strong extreme penalty.
656            legacy_bpm_preferred_min: 72.0,
657            legacy_bpm_preferred_max: 168.0,
658            legacy_bpm_soft_min: 60.0,
659            legacy_bpm_soft_max: 210.0,
660            legacy_bpm_conf_mul_preferred: 1.30,
661            legacy_bpm_conf_mul_soft: 0.70,
662            legacy_bpm_conf_mul_extreme: 0.01,
663            min_bpm: 40.0,  // Lowered from 60.0 to catch slower tracks (ballads, ambient, etc.)
664            max_bpm: 240.0, // Raised from 180.0 to catch high-tempo tracks (drum & bass, etc.)
665            bpm_resolution: 1.0,
666            frame_size: 2048,
667            hop_size: 512,
668            center_frequency: 440.0,
669            soft_chroma_mapping: true,
670            soft_mapping_sigma: 0.5,
671            chroma_sharpening_power: 1.0, // No sharpening by default (can be enabled with 1.5-2.0)
672            enable_key_spectrogram_time_smoothing: true,
673            key_spectrogram_smooth_margin: 12,
674            enable_key_frame_weighting: true,
675            // Default: do not hard-gate frames by tonalness; use soft weighting instead.
676            key_min_tonalness: 0.0,
677            key_tonalness_power: 2.0,
678            key_energy_power: 0.50,
679            enable_key_harmonic_mask: true,
680            key_harmonic_mask_power: 2.0,
681            // Default: off. HPSS median filtering is more expensive than the cheap harmonic mask.
682            // Enable via CLI/validation when experimenting.
683            enable_key_hpss_harmonic: false,
684            key_hpss_frame_step: 4,
685            key_hpss_time_margin: 8,
686            key_hpss_freq_margin: 8,
687            key_hpss_mask_power: 2.0,
688            enable_key_stft_override: true,
689            key_stft_frame_size: 8192,
690            key_stft_hop_size: 512,
691            enable_key_log_frequency: false,
692            enable_key_beat_synchronous: false,
693            enable_key_multi_scale: false,
694            key_multi_scale_lengths: vec![120, 360, 720], // ~2s, 6s, 12s at typical frame rates
695            key_multi_scale_hop: 60, // ~1s
696            key_multi_scale_min_clarity: 0.20,
697            key_multi_scale_weights: vec![], // Equal weights by default
698            key_template_set: TemplateSet::KrumhanslKessler,
699            enable_key_ensemble: false,
700            key_ensemble_kk_weight: 0.5,
701            key_ensemble_temperley_weight: 0.5,
702            enable_key_median: false,
703            key_median_segment_length_frames: 480, // ~4 seconds at typical frame rates
704            key_median_segment_hop_frames: 120, // ~1 second
705            key_median_min_segments: 3,
706            // Default: off. Tuning estimation can be unstable on real-world mixes without a more
707            // peak/partial-aware frontend (HPCP/CQT). Keep available for experimentation.
708            enable_key_tuning_compensation: false,
709            key_tuning_max_abs_semitones: 0.08,
710            key_tuning_frame_step: 20,
711            key_tuning_peak_rel_threshold: 0.35,
712            // Default: off. Hard edge trimming can remove useful harmonic content on some tracks.
713            // Prefer harmonic masking + frame weighting; keep edge-trim available for experimentation.
714            enable_key_edge_trim: false,
715            key_edge_trim_fraction: 0.15,
716            enable_key_segment_voting: true,
717            key_segment_len_frames: 1024,
718            key_segment_hop_frames: 512,
719            key_segment_min_clarity: 0.20,
720            enable_key_mode_heuristic: false,
721            // NOTE: Aggressive defaults for Phase 1F DJ validation: minor keys were frequently
722            // predicted as major. Keep these tunable via CLI/validation.
723            key_mode_third_ratio_margin: 0.00,
724            key_mode_flip_min_score_ratio: 0.60,
725            enable_key_hpcp: true,
726            key_hpcp_peaks_per_frame: 24,
727            key_hpcp_num_harmonics: 4,
728            key_hpcp_harmonic_decay: 0.60,
729            key_hpcp_mag_power: 0.50,
730            enable_key_hpcp_whitening: false,
731            key_hpcp_whitening_smooth_bins: 31,
732            // Experimental: tonic reinforcement can backfire if the bass is not stably pitched.
733            enable_key_hpcp_bass_blend: false,
734            key_hpcp_bass_fmin_hz: 55.0,
735            key_hpcp_bass_fmax_hz: 300.0,
736            key_hpcp_bass_weight: 0.35,
737            // Experimental: can easily over-bias the result on real-world mixes.
738            enable_key_minor_harmonic_bonus: false,
739            key_minor_leading_tone_bonus_weight: 0.2,
740            #[cfg(feature = "ml")]
741            enable_ml_refinement: false,
742        }
743    }
744}
745
stratum_dsp/config.rs

stratum_dsp/
config.rs