stratum_dsp/config.rs
1//! Configuration parameters for audio analysis
2
3use crate::preprocessing::normalization::NormalizationMethod;
4use crate::features::key::templates::TemplateSet;
5
6/// Analysis configuration parameters
7#[derive(Debug, Clone)]
8pub struct AnalysisConfig {
9 // Preprocessing
10 /// Silence detection threshold in dB (default: -40.0)
11 /// Frames with RMS below this threshold are considered silent
12 pub min_amplitude_db: f32,
13
14 /// Normalization method to use (default: Peak)
15 pub normalization: NormalizationMethod,
16
17 /// Enable normalization step (default: true)
18 pub enable_normalization: bool,
19
20 /// Enable silence detection + trimming step (default: true)
21 pub enable_silence_trimming: bool,
22
23 // Onset detection (used by beat tracking and legacy BPM fallback)
24 /// Enable multi-detector onset consensus (spectral flux + HFC + optional HPSS) (default: true)
25 ///
26 /// Note: Tempogram BPM does not use this onset list, but legacy BPM + beat tracking do.
27 pub enable_onset_consensus: bool,
28
29 /// Threshold percentile for STFT-based onset detectors (spectral flux / HFC / HPSS) (default: 0.80)
30 /// Range: [0.0, 1.0]
31 pub onset_threshold_percentile: f32,
32
33 /// Onset clustering tolerance window in milliseconds for consensus voting (default: 50 ms)
34 pub onset_consensus_tolerance_ms: u32,
35
36 /// Consensus method weights [energy_flux, spectral_flux, hfc, hpss] (default: equal weights)
37 pub onset_consensus_weights: [f32; 4],
38
39 /// Enable HPSS-based onset detector inside consensus (default: false; more expensive)
40 pub enable_hpss_onsets: bool,
41
42 /// HPSS median-filter margin (default: 10). Typical values: 5–20.
43 pub hpss_margin: usize,
44
45 // BPM detection
46 /// Force legacy BPM estimation (Phase 1B autocorrelation + comb filter) and skip tempogram.
47 /// Default: false.
48 ///
49 /// Intended for A/B validation and hybrid/consensus experimentation.
50 pub force_legacy_bpm: bool,
51
52 /// Enable BPM fusion (compute tempogram + legacy in parallel, then choose using consensus logic).
53 /// Default: false (tempogram-only unless it fails, then legacy fallback).
54 pub enable_bpm_fusion: bool,
55
56 /// Enable legacy BPM guardrails (soft confidence caps by tempo range).
57 /// Default: true.
58 pub enable_legacy_bpm_guardrails: bool,
59
60 /// Enable **true** multi-resolution tempogram BPM estimation.
61 ///
62 /// When enabled, BPM estimation recomputes STFT at hop sizes {256, 512, 1024} and fuses
63 /// candidates using a cross-resolution scoring rule. This is intended to reduce
64 /// metrical-level (T vs 2T vs T/2) errors.
65 ///
66 /// Default: true (Phase 1F tuning path).
67 pub enable_tempogram_multi_resolution: bool,
68
69 /// Multi-resolution fusion: number of hop=512 candidates to consider as anchors.
70 /// Default: 10.
71 pub tempogram_multi_res_top_k: usize,
72
73 /// Multi-resolution fusion weight for hop=512 (global beat).
74 pub tempogram_multi_res_w512: f32,
75 /// Multi-resolution fusion weight for hop=256 (fine transients).
76 pub tempogram_multi_res_w256: f32,
77 /// Multi-resolution fusion weight for hop=1024 (structural/metre level).
78 pub tempogram_multi_res_w1024: f32,
79
80 /// Structural discount factor applied when hop=1024 supports 2T instead of T.
81 pub tempogram_multi_res_structural_discount: f32,
82
83 /// Factor applied to hop=512 support when evaluating the 2T / T/2 hypotheses.
84 pub tempogram_multi_res_double_time_512_factor: f32,
85
86 /// Minimum score margin (absolute) required to switch between T / 2T / T/2 hypotheses.
87 pub tempogram_multi_res_margin_threshold: f32,
88
89 /// Enable a gentle human-tempo prior as a tie-breaker (only when scores are very close).
90 /// Default: false.
91 pub tempogram_multi_res_use_human_prior: bool,
92
93 /// Enable HPSS percussive-only tempogram fallback (ambiguous-only).
94 ///
95 /// This computes an HPSS decomposition on the (already computed) STFT magnitudes and re-runs
96 /// tempogram on the percussive component. Intended to reduce low-tempo half/double-time traps
97 /// caused by sustained harmonic energy.
98 ///
99 /// Default: true (Phase 1F tuning path).
100 pub enable_tempogram_percussive_fallback: bool,
101
102 /// Enable multi-band novelty fusion inside the tempogram estimator.
103 ///
104 /// This computes novelty curves over low/mid/high frequency bands, runs the tempogram
105 /// on each, then fuses their support when scoring BPM candidates. This is primarily
106 /// intended to improve **candidate generation** (getting GT into top-N candidates),
107 /// which is currently the limiting factor after metrical selection improvements.
108 ///
109 /// Default: true (Phase 1F tuning path).
110 pub enable_tempogram_band_fusion: bool,
111
112 /// Band split cutoffs (Hz). Bands are: low=[~0..low_max], mid=[low_max..mid_max], high=[mid_max..high_max].
113 /// If `tempogram_band_high_max_hz <= 0`, high extends to Nyquist.
114 pub tempogram_band_low_max_hz: f32,
115 /// Upper cutoff for the mid band (Hz).
116 pub tempogram_band_mid_max_hz: f32,
117 /// Upper cutoff for the high band (Hz). If <= 0, uses Nyquist.
118 pub tempogram_band_high_max_hz: f32,
119
120 /// Weight for the full-band tempogram contribution when band-score fusion is enabled.
121 pub tempogram_band_w_full: f32,
122 /// Weight for the low band contribution.
123 pub tempogram_band_w_low: f32,
124 /// Weight for the mid band contribution.
125 pub tempogram_band_w_mid: f32,
126 /// Weight for the high band contribution.
127 pub tempogram_band_w_high: f32,
128
129 /// If true, multi-band tempograms contribute **only to candidate seeding** (peak proposals),
130 /// while final candidate scoring remains full-band-only.
131 ///
132 /// This is the safer default: high-frequency bands often emphasize subdivisions (hi-hats),
133 /// which can otherwise increase 2× / 3:2 metrical errors if they directly affect scoring.
134 pub tempogram_band_seed_only: bool,
135
136 /// Minimum per-band normalized support required to count as "supporting" a BPM candidate
137 /// for band-consensus scoring.
138 ///
139 /// Range: [0, 1]. Default: 0.25.
140 pub tempogram_band_support_threshold: f32,
141
142 /// Bonus multiplier applied when **multiple bands** support the same BPM candidate.
143 ///
144 /// This is a lightweight "consensus" heuristic intended to reduce metrical/subdivision errors
145 /// (e.g., a 2× tempo supported only by the high band should not win over a tempo supported by
146 /// low+mid bands).
147 ///
148 /// Score adjustment: `score *= (1 + bonus * max(0, support_bands - 1))`.
149 pub tempogram_band_consensus_bonus: f32,
150
151 /// Tempogram novelty weights for combining {spectral, energy, HFC}.
152 pub tempogram_novelty_w_spectral: f32,
153 /// Tempogram novelty weight for energy flux.
154 pub tempogram_novelty_w_energy: f32,
155 /// Tempogram novelty weight for HFC.
156 pub tempogram_novelty_w_hfc: f32,
157 /// Tempogram novelty conditioning windows.
158 pub tempogram_novelty_local_mean_window: usize,
159 /// Tempogram novelty moving-average smoothing window (frames). Use 0/1 to disable.
160 pub tempogram_novelty_smooth_window: usize,
161
162 /// Debug: if set, the `analyze_file` example will pass this track ID through to the
163 /// multi-resolution fusion so it can print detailed scoring diagnostics.
164 pub debug_track_id: Option<u32>,
165 /// Debug: optional ground-truth BPM passed alongside `debug_track_id`.
166 pub debug_gt_bpm: Option<f32>,
167 /// Debug: number of top candidates per hop to print when `debug_track_id` is set.
168 pub debug_top_n: usize,
169
170 /// Enable log-mel novelty tempogram as an additional candidate generator/support signal.
171 ///
172 /// This computes a log-mel SuperFlux-style novelty curve, then runs the tempogram on it.
173 /// The resulting candidates are used for seeding and for the consensus bonus logic.
174 pub enable_tempogram_mel_novelty: bool,
175 /// Mel band count used by log-mel novelty.
176 pub tempogram_mel_n_mels: usize,
177 /// Minimum mel frequency (Hz).
178 pub tempogram_mel_fmin_hz: f32,
179 /// Maximum mel frequency (Hz). If <= 0, uses Nyquist.
180 pub tempogram_mel_fmax_hz: f32,
181 /// Max-filter neighborhood radius in mel bins (SuperFlux-style reference).
182 pub tempogram_mel_max_filter_bins: usize,
183 /// Weight for mel variant when band scoring fusion is enabled (`seed_only=false`).
184 pub tempogram_mel_weight: f32,
185
186 /// SuperFlux max-filter neighborhood radius (bins) used by the tempogram novelty extractor.
187 pub tempogram_superflux_max_filter_bins: usize,
188
189 /// Emit tempogram BPM candidate list (top-N) into `AnalysisMetadata` for validation/tuning.
190 ///
191 /// Default: false (avoid bloating outputs in normal use).
192 pub emit_tempogram_candidates: bool,
193
194 /// Number of tempogram candidates to emit when `emit_tempogram_candidates` is enabled.
195 /// Default: 10.
196 pub tempogram_candidates_top_n: usize,
197
198 /// Legacy guardrails: preferred BPM range (default: 75–150).
199 pub legacy_bpm_preferred_min: f32,
200 /// Legacy guardrails: preferred BPM range upper bound (default: 150).
201 pub legacy_bpm_preferred_max: f32,
202
203 /// Legacy guardrails: soft BPM range (default: 60–180).
204 /// Values in [soft_min, preferred_min) or (preferred_max, soft_max] get a medium cap.
205 pub legacy_bpm_soft_min: f32,
206 /// Legacy guardrails: soft BPM range upper bound (default: 180).
207 pub legacy_bpm_soft_max: f32,
208
209 /// Legacy guardrails: confidence caps by range.
210 /// - preferred: inside [preferred_min, preferred_max]
211 /// - soft: inside [soft_min, soft_max] but outside preferred
212 /// - extreme: outside [soft_min, soft_max]
213 ///
214 /// **Multiplier semantics**: these are applied as `confidence *= multiplier` to legacy
215 /// candidates/estimates (softly biasing the selection).
216 pub legacy_bpm_conf_mul_preferred: f32,
217 /// Legacy guardrails: confidence multiplier for the soft band (default: 0.50).
218 pub legacy_bpm_conf_mul_soft: f32,
219 /// Legacy guardrails: confidence multiplier for extremes (default: 0.10).
220 pub legacy_bpm_conf_mul_extreme: f32,
221
222 /// Minimum BPM to consider (default: 60.0)
223 pub min_bpm: f32,
224
225 /// Maximum BPM to consider (default: 180.0)
226 pub max_bpm: f32,
227
228 /// BPM resolution for comb filterbank (default: 1.0)
229 pub bpm_resolution: f32,
230
231 // STFT parameters
232 /// Frame size for STFT (default: 2048)
233 pub frame_size: usize,
234
235 /// Hop size for STFT (default: 512)
236 pub hop_size: usize,
237
238 // Key detection
239 /// Center frequency for chroma extraction (default: 440.0 Hz, A4)
240 pub center_frequency: f32,
241
242 /// Enable soft chroma mapping (default: true)
243 /// Soft mapping spreads frequency bins to neighboring semitones for robustness
244 pub soft_chroma_mapping: bool,
245
246 /// Soft mapping standard deviation in semitones (default: 0.5)
247 /// Lower values = sharper mapping, higher values = more spread
248 pub soft_mapping_sigma: f32,
249
250 /// Chroma sharpening power (default: 1.0 = no sharpening, 1.5-2.0 recommended)
251 /// Power > 1.0 emphasizes prominent semitones, improving key detection
252 pub chroma_sharpening_power: f32,
253
254 /// Enable a lightweight percussive-suppression step for key detection by time-smoothing
255 /// the STFT magnitude spectrogram prior to chroma extraction.
256 ///
257 /// This is HPSS-inspired (harmonic content is sustained in time; percussive is transient),
258 /// but uses a cheap moving-average rather than full iterative HPSS.
259 ///
260 /// Default: true.
261 pub enable_key_spectrogram_time_smoothing: bool,
262
263 /// Half-window size (in frames) for the key spectrogram time-smoothing.
264 /// Effective window length is `2*margin + 1`.
265 ///
266 /// Default: 12 (≈ 12 * hop_size samples ≈ 140 ms at 44.1kHz with hop=512).
267 pub key_spectrogram_smooth_margin: usize,
268
269 /// Enable weighted key aggregation (frame weights based on tonality + energy).
270 /// Default: true.
271 pub enable_key_frame_weighting: bool,
272
273 /// Minimum per-frame "tonalness" required to include the frame in key aggregation.
274 /// Tonalness is computed from chroma entropy and mapped to [0, 1].
275 /// Default: 0.10.
276 pub key_min_tonalness: f32,
277
278 /// Exponent applied to tonalness when building frame weights (>= 0).
279 /// Default: 2.0.
280 pub key_tonalness_power: f32,
281
282 /// Exponent applied to normalized frame energy when building frame weights (>= 0).
283 /// Default: 0.50 (square-root weighting).
284 pub key_energy_power: f32,
285
286 /// Enable a harmonic-emphasized spectrogram for key detection via a time-smoothing-derived
287 /// soft mask (cheap HPSS-inspired).
288 ///
289 /// If enabled, key detection uses `harmonic_spectrogram_time_mask()` instead of raw/time-smoothed
290 /// magnitudes when extracting chroma.
291 ///
292 /// Default: true.
293 pub enable_key_harmonic_mask: bool,
294
295 /// Soft-mask exponent \(p\) for harmonic masking (>= 1.0). Higher values produce harder masks.
296 /// Default: 2.0.
297 pub key_harmonic_mask_power: f32,
298
299 /// Enable median-filter HPSS harmonic extraction for key detection (key-only).
300 ///
301 /// This is a more literature-standard HPSS step than `harmonic_spectrogram_time_mask()`.
302 /// We compute time- and frequency-median estimates on a **time-downsampled**, **band-limited**
303 /// spectrogram, build a soft mask, then apply it to the full-resolution spectrogram.
304 ///
305 /// Default: false (opt-in; more expensive).
306 pub enable_key_hpss_harmonic: bool,
307
308 /// Time-downsampling step for key HPSS (>= 1). Values like 2–6 greatly reduce cost.
309 /// Default: 4.
310 pub key_hpss_frame_step: usize,
311
312 /// Half-window size (in downsampled frames) for the HPSS harmonic (time) median filter.
313 /// Effective window length is `2*margin + 1` (in downsampled frames).
314 /// Default: 8.
315 pub key_hpss_time_margin: usize,
316
317 /// Half-window size (in frequency bins) for the HPSS percussive (frequency) median filter.
318 /// Effective window length is `2*margin + 1` bins.
319 /// Default: 8.
320 pub key_hpss_freq_margin: usize,
321
322 /// Soft-mask exponent \(p\) for HPSS masking (>= 1.0). Higher values produce harder masks.
323 /// Default: 2.0.
324 pub key_hpss_mask_power: f32,
325
326 /// Enable a key-only STFT override (compute a separate STFT for key detection).
327 ///
328 /// Rationale: key detection benefits from higher frequency resolution than BPM/onset work.
329 /// A larger FFT size improves pitch precision at low frequencies where semitone spacing is small.
330 ///
331 /// Default: false (keep single shared STFT by default).
332 pub enable_key_stft_override: bool,
333
334 /// FFT frame size used for key-only STFT when `enable_key_stft_override` is true.
335 /// Default: 8192.
336 pub key_stft_frame_size: usize,
337
338 /// Hop size used for key-only STFT when `enable_key_stft_override` is true.
339 /// Default: 512.
340 pub key_stft_hop_size: usize,
341
342 /// Enable log-frequency (semitone-aligned) spectrogram for key detection.
343 ///
344 /// This converts the linear STFT magnitude spectrogram into a log-frequency representation
345 /// where each bin corresponds to one semitone. This provides better pitch-class resolution
346 /// than mapping linear FFT bins to semitones, especially at low frequencies.
347 ///
348 /// When enabled, chroma extraction works directly on semitone bins (no frequency-to-semitone
349 /// mapping needed). HPCP is disabled when log-frequency is enabled (HPCP requires frequency
350 /// information for harmonic summation).
351 ///
352 /// Default: false (use linear STFT with frequency-to-semitone mapping).
353 pub enable_key_log_frequency: bool,
354
355 /// Enable beat-synchronous chroma extraction for key detection.
356 ///
357 /// This aligns chroma windows to beat boundaries instead of fixed-time frames, improving
358 /// harmonic coherence by aligning to musical structure. For each beat interval, chroma vectors
359 /// from all STFT frames within that interval are averaged.
360 ///
361 /// Requires a valid beat grid (falls back to frame-based chroma if beat grid is unavailable).
362 /// HPCP is disabled when beat-synchronous is enabled (HPCP requires frame-based processing).
363 ///
364 /// Default: false (use frame-based chroma extraction).
365 pub enable_key_beat_synchronous: bool,
366
367 /// Enable multi-scale key detection (ensemble voting across multiple time scales).
368 ///
369 /// This runs key detection at multiple segment lengths (short, medium, long) and aggregates
370 /// results using clarity-weighted voting. This captures both local and global key information,
371 /// improving robustness on tracks with key changes or varying harmonic stability.
372 ///
373 /// Default: false (use single-scale detection).
374 pub enable_key_multi_scale: bool,
375
376 /// Template set to use for key detection.
377 ///
378 /// - `KrumhanslKessler`: Krumhansl-Kessler (1982) templates (empirical, from listening experiments)
379 /// - `Temperley`: Temperley (1999) templates (statistical, from corpus analysis)
380 ///
381 /// Default: `KrumhanslKessler`.
382 pub key_template_set: crate::features::key::templates::TemplateSet,
383
384 /// Enable ensemble key detection (combine K-K and Temperley template scores).
385 ///
386 /// This runs key detection with both template sets and combines their scores using
387 /// weighted voting. This ensemble approach can improve robustness by leveraging
388 /// complementary strengths of different template sets.
389 ///
390 /// Default: false (use single template set).
391 pub enable_key_ensemble: bool,
392
393 /// Weight for Krumhansl-Kessler scores in ensemble detection.
394 ///
395 /// Default: 0.5 (equal weight with Temperley).
396 pub key_ensemble_kk_weight: f32,
397
398 /// Weight for Temperley scores in ensemble detection.
399 ///
400 /// Default: 0.5 (equal weight with K-K).
401 pub key_ensemble_temperley_weight: f32,
402
403 /// Enable median key detection (detect key from multiple short segments and select median).
404 ///
405 /// This divides the track into multiple short overlapping segments, detects key for each
406 /// segment, and selects the median key (most common key across segments). This helps
407 /// handle brief modulations, breakdowns, or ambiguous sections.
408 ///
409 /// Default: false (use global key detection).
410 pub enable_key_median: bool,
411
412 /// Segment length (in frames) for median key detection.
413 ///
414 /// Default: 480 (~4 seconds at typical frame rates).
415 pub key_median_segment_length_frames: usize,
416
417 /// Segment hop size (in frames) for median key detection.
418 ///
419 /// Default: 120 (~1 second).
420 pub key_median_segment_hop_frames: usize,
421
422 /// Minimum number of segments required for median key detection.
423 ///
424 /// If fewer segments are available, falls back to global detection.
425 ///
426 /// Default: 3.
427 pub key_median_min_segments: usize,
428
429 /// Segment lengths (in frames) for multi-scale key detection.
430 /// Multiple scales are processed and aggregated with clarity-weighted voting.
431 /// Default: [120, 360, 720] (approximately 2s, 6s, 12s at typical frame rates).
432 pub key_multi_scale_lengths: Vec<usize>,
433
434 /// Hop size (in frames) between segments for multi-scale detection.
435 /// Default: 60 (approximately 1s at typical frame rates).
436 pub key_multi_scale_hop: usize,
437
438 /// Minimum clarity threshold for including a segment in multi-scale aggregation.
439 /// Default: 0.20.
440 pub key_multi_scale_min_clarity: f32,
441
442 /// Optional weights for each scale in multi-scale detection (if empty, all scales weighted equally).
443 /// Length should match `key_multi_scale_lengths`. Default: empty (equal weights).
444 pub key_multi_scale_weights: Vec<f32>,
445
446 /// Enable per-track tuning compensation for key detection.
447 ///
448 /// This estimates a global detuning offset (in semitones, relative to A4=440Hz) from the
449 /// key spectrogram, then shifts semitone mapping by that offset during chroma extraction.
450 ///
451 /// Default: true.
452 pub enable_key_tuning_compensation: bool,
453
454 /// Maximum absolute tuning correction to apply (semitones).
455 /// Default: 0.25.
456 pub key_tuning_max_abs_semitones: f32,
457
458 /// Frame subsampling step used for tuning estimation (>= 1).
459 /// Default: 20.
460 pub key_tuning_frame_step: usize,
461
462 /// Relative threshold (fraction of per-frame peak) for selecting bins used in tuning estimation.
463 /// Default: 0.35.
464 pub key_tuning_peak_rel_threshold: f32,
465
466 /// Enable trimming the first/last fraction of frames for key detection.
467 ///
468 /// DJ tracks often have long beat-only intros/outros; trimming edges reduces percussive bias
469 /// without affecting tempo (tempo uses its own pipeline).
470 ///
471 /// Default: true.
472 pub enable_key_edge_trim: bool,
473
474 /// Fraction (0..0.49) to trim from the start and end (symmetric) when `enable_key_edge_trim` is true.
475 /// Default: 0.15 (use middle 70%).
476 pub key_edge_trim_fraction: f32,
477
478 /// Enable segment voting for key detection (windowed key detection + score accumulation).
479 ///
480 /// Rationale: long-form DJ tracks can modulate, have breakdowns, or contain beat-only sections.
481 /// Segment voting helps focus on harmonically stable portions without requiring full key-change tracking.
482 ///
483 /// Default: true.
484 pub enable_key_segment_voting: bool,
485
486 /// Segment length in chroma frames for key voting.
487 /// Default: 1024 (~11.9s at 44.1kHz, hop=512).
488 pub key_segment_len_frames: usize,
489
490 /// Segment hop/stride in frames for key voting.
491 /// Default: 512 (~50% overlap).
492 pub key_segment_hop_frames: usize,
493
494 /// Minimum clarity required to include a segment in voting (0..1).
495 /// Default: 0.20.
496 pub key_segment_min_clarity: f32,
497
498 /// Enable a conservative mode heuristic to reduce minor→major mistakes.
499 ///
500 /// Uses the 3rd degree (minor third vs major third) from the aggregated chroma to potentially
501 /// flip parallel mode, gated by a score-ratio threshold.
502 ///
503 /// Default: true.
504 pub enable_key_mode_heuristic: bool,
505
506 /// Required ratio margin for the 3rd-degree test (>=0). If `p(min3) > p(maj3) * (1+margin)`
507 /// we prefer minor (and vice versa for major).
508 /// Default: 0.05.
509 pub key_mode_third_ratio_margin: f32,
510
511 /// Only flip parallel mode if the alternate mode's template score is at least this ratio of
512 /// the best mode's score (0..1).
513 /// Default: 0.92.
514 pub key_mode_flip_min_score_ratio: f32,
515
516 /// Enable HPCP-style pitch-class profile extraction for key detection.
517 ///
518 /// This uses spectral peak picking + harmonic summation to form a more robust tonal profile
519 /// than raw STFT-bin chroma on real-world mixes.
520 ///
521 /// Default: false (experimental).
522 pub enable_key_hpcp: bool,
523
524 /// Number of spectral peaks per frame used for HPCP extraction.
525 /// Default: 24.
526 pub key_hpcp_peaks_per_frame: usize,
527
528 /// Number of harmonics per peak used for HPCP extraction.
529 /// Default: 4.
530 pub key_hpcp_num_harmonics: usize,
531
532 /// Harmonic decay factor applied per harmonic (0..1). Lower values emphasize fundamentals.
533 /// Default: 0.60.
534 pub key_hpcp_harmonic_decay: f32,
535
536 /// Magnitude compression exponent for peak weights (0..1].
537 /// Default: 0.50 (sqrt).
538 pub key_hpcp_mag_power: f32,
539
540 /// Enable spectral whitening (per-frame frequency-domain normalization) for HPCP peak picking.
541 ///
542 /// This suppresses timbral formants and broadband coloration, helping peaks corresponding to
543 /// harmonic partials stand out more consistently across mixes.
544 ///
545 /// Default: false.
546 pub enable_key_hpcp_whitening: bool,
547
548 /// Frequency smoothing window (in FFT bins) for HPCP whitening.
549 /// Larger values whiten more aggressively (more timbre suppression), but can also amplify noise.
550 ///
551 /// Default: 31.
552 pub key_hpcp_whitening_smooth_bins: usize,
553
554 /// Enable a bass-band HPCP blend (tonic reinforcement).
555 ///
556 /// Relative major/minor share pitch classes; bass/tonic emphasis can disambiguate mode in
557 /// dance music where the bassline strongly implies the tonic.
558 ///
559 /// Default: true.
560 pub enable_key_hpcp_bass_blend: bool,
561
562 /// Bass-band lower cutoff (Hz) for bass HPCP.
563 /// Default: 55.0.
564 pub key_hpcp_bass_fmin_hz: f32,
565
566 /// Bass-band upper cutoff (Hz) for bass HPCP.
567 /// Default: 300.0.
568 pub key_hpcp_bass_fmax_hz: f32,
569
570 /// Blend weight for bass HPCP (0..1). Final PCP = normalize((1-w)*full + w*bass).
571 /// Default: 0.35.
572 pub key_hpcp_bass_weight: f32,
573
574 /// Enable a minor-key harmonic bonus (leading-tone vs flat-7) when scoring templates.
575 ///
576 /// Many dance tracks in minor heavily use harmonic minor gestures (raised 7th). This bonus
577 /// nudges minor candidates whose pitch-class distribution supports a leading-tone.
578 ///
579 /// Default: true.
580 pub enable_key_minor_harmonic_bonus: bool,
581
582 /// Weight for the minor harmonic bonus. Internally scaled by the sum of frame weights so it
583 /// is comparable to the template-score scale.
584 ///
585 /// Default: 0.8.
586 pub key_minor_leading_tone_bonus_weight: f32,
587
588 // ML refinement
589 /// Enable ML refinement (requires ml feature)
590 #[cfg(feature = "ml")]
591 pub enable_ml_refinement: bool,
592}
593
594impl Default for AnalysisConfig {
595 fn default() -> Self {
596 Self {
597 min_amplitude_db: -40.0,
598 normalization: NormalizationMethod::Peak,
599 enable_normalization: true,
600 enable_silence_trimming: true,
601 enable_onset_consensus: true,
602 onset_threshold_percentile: 0.80,
603 onset_consensus_tolerance_ms: 50,
604 onset_consensus_weights: [0.25, 0.25, 0.25, 0.25],
605 enable_hpss_onsets: false,
606 hpss_margin: 10,
607 force_legacy_bpm: false,
608 enable_bpm_fusion: false,
609 enable_legacy_bpm_guardrails: true,
610 enable_tempogram_multi_resolution: true,
611 tempogram_multi_res_top_k: 25,
612 tempogram_multi_res_w512: 0.45,
613 tempogram_multi_res_w256: 0.35,
614 tempogram_multi_res_w1024: 0.20,
615 tempogram_multi_res_structural_discount: 0.85,
616 tempogram_multi_res_double_time_512_factor: 0.92,
617 tempogram_multi_res_margin_threshold: 0.08,
618 tempogram_multi_res_use_human_prior: false,
619 // HPSS percussive fallback is very expensive and (so far) has not shown consistent gains.
620 // Keep it opt-in to avoid multi-second outliers during batch runs.
621 enable_tempogram_percussive_fallback: false,
622 enable_tempogram_band_fusion: true,
623 // Default cutoffs (Hz): ~kick/bass fundamentals, then body/rhythm textures, then attacks.
624 tempogram_band_low_max_hz: 200.0,
625 tempogram_band_mid_max_hz: 2000.0,
626 tempogram_band_high_max_hz: 8000.0,
627 // Default weights: keep full-band as anchor, but allow bands to pull candidates into view.
628 tempogram_band_w_full: 0.40,
629 tempogram_band_w_low: 0.25,
630 tempogram_band_w_mid: 0.20,
631 tempogram_band_w_high: 0.15,
632 tempogram_band_seed_only: true,
633 tempogram_band_support_threshold: 0.25,
634 tempogram_band_consensus_bonus: 0.08,
635 // Novelty weighting defaults (tuned on 200-track validation):
636 // shift weight toward transient-heavy signals (energy/HFC) to reduce octave/subdivision traps.
637 tempogram_novelty_w_spectral: 0.30,
638 tempogram_novelty_w_energy: 0.35,
639 tempogram_novelty_w_hfc: 0.35,
640 tempogram_novelty_local_mean_window: 16,
641 tempogram_novelty_smooth_window: 5,
642 debug_track_id: None,
643 debug_gt_bpm: None,
644 debug_top_n: 5,
645 enable_tempogram_mel_novelty: true,
646 tempogram_mel_n_mels: 40,
647 tempogram_mel_fmin_hz: 30.0,
648 tempogram_mel_fmax_hz: 8000.0,
649 tempogram_mel_max_filter_bins: 2,
650 tempogram_mel_weight: 0.15,
651 tempogram_superflux_max_filter_bins: 4,
652 emit_tempogram_candidates: false,
653 tempogram_candidates_top_n: 10,
654 // Tuned defaults (empirical, small-batch): slightly wider preferred band and
655 // slightly less aggressive down-weighting while keeping a strong extreme penalty.
656 legacy_bpm_preferred_min: 72.0,
657 legacy_bpm_preferred_max: 168.0,
658 legacy_bpm_soft_min: 60.0,
659 legacy_bpm_soft_max: 210.0,
660 legacy_bpm_conf_mul_preferred: 1.30,
661 legacy_bpm_conf_mul_soft: 0.70,
662 legacy_bpm_conf_mul_extreme: 0.01,
663 min_bpm: 40.0, // Lowered from 60.0 to catch slower tracks (ballads, ambient, etc.)
664 max_bpm: 240.0, // Raised from 180.0 to catch high-tempo tracks (drum & bass, etc.)
665 bpm_resolution: 1.0,
666 frame_size: 2048,
667 hop_size: 512,
668 center_frequency: 440.0,
669 soft_chroma_mapping: true,
670 soft_mapping_sigma: 0.5,
671 chroma_sharpening_power: 1.0, // No sharpening by default (can be enabled with 1.5-2.0)
672 enable_key_spectrogram_time_smoothing: true,
673 key_spectrogram_smooth_margin: 12,
674 enable_key_frame_weighting: true,
675 // Default: do not hard-gate frames by tonalness; use soft weighting instead.
676 key_min_tonalness: 0.0,
677 key_tonalness_power: 2.0,
678 key_energy_power: 0.50,
679 enable_key_harmonic_mask: true,
680 key_harmonic_mask_power: 2.0,
681 // Default: off. HPSS median filtering is more expensive than the cheap harmonic mask.
682 // Enable via CLI/validation when experimenting.
683 enable_key_hpss_harmonic: false,
684 key_hpss_frame_step: 4,
685 key_hpss_time_margin: 8,
686 key_hpss_freq_margin: 8,
687 key_hpss_mask_power: 2.0,
688 enable_key_stft_override: true,
689 key_stft_frame_size: 8192,
690 key_stft_hop_size: 512,
691 enable_key_log_frequency: false,
692 enable_key_beat_synchronous: false,
693 enable_key_multi_scale: false,
694 key_multi_scale_lengths: vec![120, 360, 720], // ~2s, 6s, 12s at typical frame rates
695 key_multi_scale_hop: 60, // ~1s
696 key_multi_scale_min_clarity: 0.20,
697 key_multi_scale_weights: vec![], // Equal weights by default
698 key_template_set: TemplateSet::KrumhanslKessler,
699 enable_key_ensemble: false,
700 key_ensemble_kk_weight: 0.5,
701 key_ensemble_temperley_weight: 0.5,
702 enable_key_median: false,
703 key_median_segment_length_frames: 480, // ~4 seconds at typical frame rates
704 key_median_segment_hop_frames: 120, // ~1 second
705 key_median_min_segments: 3,
706 // Default: off. Tuning estimation can be unstable on real-world mixes without a more
707 // peak/partial-aware frontend (HPCP/CQT). Keep available for experimentation.
708 enable_key_tuning_compensation: false,
709 key_tuning_max_abs_semitones: 0.08,
710 key_tuning_frame_step: 20,
711 key_tuning_peak_rel_threshold: 0.35,
712 // Default: off. Hard edge trimming can remove useful harmonic content on some tracks.
713 // Prefer harmonic masking + frame weighting; keep edge-trim available for experimentation.
714 enable_key_edge_trim: false,
715 key_edge_trim_fraction: 0.15,
716 enable_key_segment_voting: true,
717 key_segment_len_frames: 1024,
718 key_segment_hop_frames: 512,
719 key_segment_min_clarity: 0.20,
720 enable_key_mode_heuristic: false,
721 // NOTE: Aggressive defaults for Phase 1F DJ validation: minor keys were frequently
722 // predicted as major. Keep these tunable via CLI/validation.
723 key_mode_third_ratio_margin: 0.00,
724 key_mode_flip_min_score_ratio: 0.60,
725 enable_key_hpcp: true,
726 key_hpcp_peaks_per_frame: 24,
727 key_hpcp_num_harmonics: 4,
728 key_hpcp_harmonic_decay: 0.60,
729 key_hpcp_mag_power: 0.50,
730 enable_key_hpcp_whitening: false,
731 key_hpcp_whitening_smooth_bins: 31,
732 // Experimental: tonic reinforcement can backfire if the bass is not stably pitched.
733 enable_key_hpcp_bass_blend: false,
734 key_hpcp_bass_fmin_hz: 55.0,
735 key_hpcp_bass_fmax_hz: 300.0,
736 key_hpcp_bass_weight: 0.35,
737 // Experimental: can easily over-bias the result on real-world mixes.
738 enable_key_minor_harmonic_bonus: false,
739 key_minor_leading_tone_bonus_weight: 0.2,
740 #[cfg(feature = "ml")]
741 enable_ml_refinement: false,
742 }
743 }
744}
745