Skip to main content

musefs_core/
scan.rs

1use std::collections::{BTreeMap, HashMap, HashSet};
2use std::path::{Path, PathBuf};
3
4use musefs_db::convert::usize_from;
5use musefs_db::{Db, Format, NewArt, NewTrack, Tag, TrackArt};
6use musefs_format::{EmbeddedBinaryTag, EmbeddedPicture, Extent, flac, mp3, mp4, ogg, wav};
7
8use crate::byte_budget::ByteBudget;
9use crate::error::Result;
10use crate::freshness::BackingStamp;
11use std::fmt;
12use std::sync::Arc;
13use std::sync::mpsc::sync_channel;
14
/// File-count batching threshold. Not referenced in this part of the file —
/// NOTE(review): presumably the number of probed files committed per batch;
/// confirm against the writer.
const BATCH_FILES: usize = 256;
/// Default for `ScanOptions::batch_bytes`: the in-flight art-byte budget and
/// per-batch byte-flush threshold.
const BATCH_BYTES: u64 = 64 << 20; // 64 MiB

/// Initial bounded-read window. Sized to cover most files' metadata in one read;
/// larger metadata (e.g. embedded cover art) triggers a precise `NeedMore` widen.
const WINDOW: usize = 1 << 16; // 64 KiB
/// Cap on widen iterations before falling back to a full-buffer read (which is
/// itself bounded by `MAX_PROBE_BYTES`).
const MAX_WIDEN_RETRIES: usize = 8;
/// Hard ceiling on bytes read to probe one file. Real audio metadata fits far
/// below this, so a file still unparsed past the cap is treated as malformed
/// rather than read whole into RAM. Guards against a multi-GB file misnamed with
/// an audio extension, and against a corrupt header whose length field demands a
/// giant `NeedMore` widen.
pub(crate) const MAX_PROBE_BYTES: u64 = 64 << 20; // 64 MiB

/// The artwork-size ceiling. Enforced here at ingest (oversize scanned art is
/// dropped) and at resolve in `mapping::track_art_to_inputs` (oversize art from
/// any writer is rejected). Sized to clear FLAC's 24-bit block length with
/// headroom for the picture-block framing.
pub(crate) const MAX_ART_BYTES: usize = 16 * 1024 * 1024 - 64 * 1024; // 16 MiB - 64 KiB

/// Per-frame cap for opaque binary tags, mirroring `MAX_ART_BYTES`. Oversize
/// payloads (e.g. a GEOB embedding a multi-MB file) are logged-and-skipped.
const MAX_BINARY_TAG_BYTES: usize = MAX_ART_BYTES;
39
/// Outcome of probing one backing file. `Unparseable` is a supported-extension
/// file whose bytes did not parse (counted as a scan `failed`). `Raced` means
/// the file changed under us between the pre- and post-probe `fstat` — the probe
/// may be torn, so nothing is committed for it (#276).
#[derive(Debug)]
enum ProbeOutcome {
    /// The file parsed. Carries the parsed fields together with the stamp taken
    /// before the probe (which equals the post-probe stamp, or this variant
    /// would not have been produced).
    Probed(Probed, BackingStamp),
    /// The file has a supported extension but did not parse; `probe_file_caught`
    /// also folds a caught parser panic into this variant.
    Unparseable,
    /// The pre- and post-probe stamps differ, so the probe result is discarded.
    Raced,
}
50
// Test-only, per-thread hook slot. `probe_file` fires it right after taking the
// first stamp, letting a test mutate the backing file mid-probe.
#[cfg(test)]
thread_local! {
    static AFTER_S1_HOOK: std::cell::RefCell<Option<Box<dyn FnMut()>>> =
        const { std::cell::RefCell::new(None) };
}

/// Run the current thread's hook, if one is installed. The slot stays mutably
/// borrowed for the duration of the call.
#[cfg(test)]
fn fire_after_s1() {
    AFTER_S1_HOOK.with_borrow_mut(|slot| {
        if let Some(hook) = slot.as_mut() {
            hook();
        }
    });
}

/// Install `f` as the current thread's hook, replacing any previous one.
#[cfg(test)]
fn set_after_s1_hook(f: impl FnMut() + 'static) {
    AFTER_S1_HOOK.with_borrow_mut(|slot| *slot = Some(Box::new(f)));
}

/// Remove the current thread's hook; later fires become no-ops.
#[cfg(test)]
fn clear_after_s1_hook() {
    AFTER_S1_HOOK.set(None);
}
72
/// A progress event emitted during a scan or revalidate.
///
/// `Ingested` borrows the current path instead of owning it, which avoids a
/// per-file allocation in the writer. That saving is negligible next to the
/// existing per-file `to_string_lossy` + DB write, so do not contort the API
/// just to preserve the borrow.
#[derive(Debug, Clone, Copy)]
pub enum ScanProgress<'a> {
    /// A supported-audio file was found during the walk; `found` is the running
    /// count of collected files.
    Discovered { found: u64 },
    /// The walk (and, for revalidate, the skip-unchanged pass) finished;
    /// `total` files will be ingested and tracked by the determinate bar.
    Walked { total: u64 },
    /// A file was committed. `done` runs 1..=total; `path` is its absolute path.
    Ingested {
        done: u64,
        total: u64,
        path: &'a str,
    },
}
92
93/// UI-agnostic progress callback for [`ScanOptions`]. Invoked only from the
94/// caller's thread (the walk and the single writer), never from probe workers.
95/// The `Send + Sync` bound is not required by today's code; it is deliberate
96/// future-proofing and free here (`indicatif::ProgressBar` is `Send + Sync`).
97#[derive(Clone)]
98pub struct ProgressSink(Arc<dyn for<'a> Fn(ScanProgress<'a>) + Send + Sync>);
99
100impl ProgressSink {
101    pub fn new(f: impl for<'a> Fn(ScanProgress<'a>) + Send + Sync + 'static) -> Self {
102        ProgressSink(Arc::new(f))
103    }
104
105    fn emit(&self, ev: ScanProgress<'_>) {
106        (self.0)(ev);
107    }
108}
109
110impl fmt::Debug for ProgressSink {
111    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
112        f.write_str("ProgressSink")
113    }
114}
115
/// Counters summarising one scan.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ScanStats {
    /// NOTE(review): presumably the number of files probed and committed —
    /// confirm against the writer, which is outside this part of the file.
    pub scanned: u64,
    /// Files passed over during the walk. NOTE(review): presumably fed by the
    /// walk's `SkipTally` total — confirm.
    pub skipped: u64,
    /// Files skipped because a track already exists at that path.
    pub already_present: u64,
    /// Supported-extension files whose bytes did not parse
    /// (`ProbeOutcome::Unparseable`).
    pub failed: u64,
    /// NOTE(review): presumably files that changed during the probe
    /// (`ProbeOutcome::Raced`) — confirm against the worker.
    pub raced: u64,
}
125
/// Counts the files the directory walk passed over, grouped by extension.
///
/// Feeds the end-of-scan summary log line (#341), which splits the single
/// `skipped` figure by extension so an operator can tell expected sidecars
/// (cover art, `.cue`, `.log`, `.nfo`) from genuinely unexpected files. Kept
/// out of `ScanStats` on purpose: the breakdown is log-only and does not affect
/// the CLI summary.
#[derive(Debug, Default)]
struct SkipTally {
    total: u64,
    by_ext: BTreeMap<String, u64>,
}

impl SkipTally {
    /// Count one skipped file under its ASCII-lowercased extension. Files with
    /// no extension, or a non-UTF-8 one, land in the `<none>` bucket.
    fn record(&mut self, path: &Path) {
        let bucket = match path.extension().and_then(|raw| raw.to_str()) {
            Some(ext) => ext.to_ascii_lowercase(),
            None => "<none>".to_string(),
        };
        *self.by_ext.entry(bucket).or_default() += 1;
        self.total += 1;
    }

    /// Render the summary line, e.g. `skipped 42: jpg=20, cue=10, log=8,
    /// <none>=4`. Buckets are listed by descending count, with ties ordered by
    /// extension name. Returns `None` when nothing was skipped.
    fn summary(&self) -> Option<String> {
        if self.total == 0 {
            return None;
        }
        let mut ranked: Vec<(&str, u64)> = self
            .by_ext
            .iter()
            .map(|(ext, n)| (ext.as_str(), *n))
            .collect();
        ranked.sort_by_key(|&(ext, n)| (std::cmp::Reverse(n), ext));

        let mut breakdown = String::new();
        for (ext, n) in ranked {
            if !breakdown.is_empty() {
                breakdown.push_str(", ");
            }
            breakdown.push_str(&format!("{ext}={n}"));
        }
        Some(format!("skipped {}: {breakdown}", self.total))
    }
}
167
/// Counters summarising one revalidate pass.
///
/// NOTE(review): the code that fills these fields is outside this part of the
/// file; the per-field notes below are inferred from the names and from the
/// `ScanOptions` / `ProbeOutcome` docs — confirm against the revalidate
/// implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RevalidateStats {
    /// Presumably tracks whose backing file changed and were re-ingested.
    pub updated: u64,
    /// Presumably tracks passed over by the skip-unchanged pass.
    pub unchanged: u64,
    /// Presumably tracks deleted because their backing file is gone (only with
    /// `ScanOptions::prune`).
    pub pruned: u64,
    /// Presumably files that no longer parse.
    pub failed: u64,
    /// Presumably files that changed during the probe.
    pub raced: u64,
}
176
/// True when `path`'s extension equals `ext`, ignoring ASCII case. A missing
/// or non-UTF-8 extension never matches.
fn has_ext(path: &Path, ext: &str) -> bool {
    match path.extension().and_then(|raw| raw.to_str()) {
        Some(actual) => actual.eq_ignore_ascii_case(ext),
        None => false,
    }
}
182
/// True if `path` has an extension for a format the scanner can probe.
///
/// The extension is parsed once and compared (ignoring ASCII case) against the
/// supported list; a missing or non-UTF-8 extension is never supported.
fn is_supported_audio(path: &Path) -> bool {
    // Keep in sync with the extension dispatch in `probe_full` / `probe_body`.
    const SUPPORTED: [&str; 8] = ["flac", "mp3", "m4a", "m4b", "ogg", "oga", "opus", "wav"];
    path.extension()
        .and_then(|e| e.to_str())
        .is_some_and(|e| SUPPORTED.iter().any(|known| e.eq_ignore_ascii_case(known)))
}
194
/// Walk `root` and append every supported-audio file to `out`, without progress
/// reporting. Thin wrapper over [`collect_audio_with`] with `progress = None`;
/// returns the tally of files that were skipped.
fn collect_audio(
    root: &Path,
    out: &mut Vec<PathBuf>,
    follow_symlinks: bool,
) -> std::io::Result<SkipTally> {
    collect_audio_with(root, out, follow_symlinks, None)
}
202
203fn collect_audio_with(
204    root: &Path,
205    out: &mut Vec<PathBuf>,
206    follow_symlinks: bool,
207    progress: Option<&ProgressSink>,
208) -> std::io::Result<SkipTally> {
209    let mut visited = HashSet::new();
210    let mut files_visited = HashSet::new();
211    let mut tally = SkipTally::default();
212    if follow_symlinks && let Ok(meta) = std::fs::metadata(root) {
213        visited.insert(dir_key(&meta));
214    }
215    collect_audio_inner(
216        root,
217        out,
218        follow_symlinks,
219        &mut visited,
220        &mut files_visited,
221        &mut tally,
222        progress,
223    )?;
224    Ok(tally)
225}
226
/// Recursive worker for the directory walk: classify each entry of `root` and
/// either descend into it, collect it into `out`, tally it as skipped, or drop
/// it with a log line.
///
/// `visited` / `files_visited` hold `(dev, ino)` keys of directories and files
/// already seen; they are only consulted when `follow_symlinks` is set (see
/// `descend` and `push_file`). Errors on individual entries are logged and
/// skipped rather than propagated.
fn collect_audio_inner(
    root: &Path,
    out: &mut Vec<PathBuf>,
    follow_symlinks: bool,
    visited: &mut HashSet<(u64, u64)>,
    files_visited: &mut HashSet<(u64, u64)>,
    tally: &mut SkipTally,
    progress: Option<&ProgressSink>,
) -> std::io::Result<()> {
    // A single unreadable subtree or vanished entry must drop only that entry,
    // not abort the whole ingest — matching the log-and-continue resilience of
    // the symlink arm below and `probe_file` (#534). The top-level root is
    // validated upstream by `scan_directory_with`'s canonicalize, so a genuine
    // bad root is still reported there.
    let entries = match std::fs::read_dir(root) {
        Ok(entries) => entries,
        Err(e) => {
            log::warn!("skipping directory {}: {e}", root.display());
            return Ok(());
        }
    };
    for entry in entries {
        let entry = match entry {
            Ok(entry) => entry,
            Err(e) => {
                log::warn!("skipping unreadable entry in {}: {e}", root.display());
                continue;
            }
        };
        let path = entry.path();
        // `DirEntry::file_type` does not follow symlinks, so a symlink is
        // reported as a symlink here and handled by its own arm below.
        let ftype = match entry.file_type() {
            Ok(ftype) => ftype,
            Err(e) => {
                log::warn!("skipping {}: {e}", path.display());
                continue;
            }
        };
        if ftype.is_dir() {
            descend(
                &path,
                out,
                follow_symlinks,
                visited,
                files_visited,
                tally,
                progress,
            )?;
        } else if ftype.is_file() {
            if is_supported_audio(&path) {
                // No metadata in hand for a plain file; `push_file` stats it
                // itself if it needs an identity key.
                push_file(&path, out, follow_symlinks, files_visited, None, progress);
            } else {
                tally.record(&path);
            }
        } else if ftype.is_symlink() {
            if !follow_symlinks {
                // Routine and expected (symlinks are off by default), so this
                // is logged at `debug`: a library sitting next to symlinked
                // dirs would otherwise flood stderr at the default `warn`
                // floor.
                // NOTE(review): this arm does not call `tally.record`, so a
                // skipped symlink is not counted in the skip tally built by
                // this walk — confirm whether that is intended.
                log::debug!(
                    "skipping symlink {} (pass --follow-symlinks to scan it)",
                    path.display()
                );
                continue;
            }
            // `fs::metadata` follows the link, so `meta` describes the target.
            match std::fs::metadata(&path) {
                Ok(meta) if meta.is_dir() => {
                    descend(
                        &path,
                        out,
                        follow_symlinks,
                        visited,
                        files_visited,
                        tally,
                        progress,
                    )?;
                }
                Ok(meta) if meta.is_file() => {
                    // The extension test uses the link's own name, not the
                    // target's.
                    if is_supported_audio(&path) {
                        push_file(
                            &path,
                            out,
                            follow_symlinks,
                            files_visited,
                            Some(&meta),
                            progress,
                        );
                    } else {
                        tally.record(&path);
                    }
                }
                // Symlink resolving to something that is neither a directory
                // nor a regular file: dropped without being tallied, unlike a
                // direct special file below. NOTE(review): confirm intended.
                Ok(_) => {}
                Err(e) => {
                    log::warn!("skipping broken symlink {}: {e}", path.display());
                }
            }
        } else {
            // A direct special file (FIFO, char/block device, socket) — not a
            // file, dir, or symlink. The audio invariant is unaffected (it is
            // never opened), but tally it so it surfaces in the skip breakdown
            // rather than vanishing without a trace, matching unsupported
            // regular files above (#544).
            tally.record(&path);
        }
    }
    Ok(())
}
334
335fn descend(
336    path: &Path,
337    out: &mut Vec<PathBuf>,
338    follow_symlinks: bool,
339    visited: &mut HashSet<(u64, u64)>,
340    files_visited: &mut HashSet<(u64, u64)>,
341    tally: &mut SkipTally,
342    progress: Option<&ProgressSink>,
343) -> std::io::Result<()> {
344    if !follow_symlinks {
345        return collect_audio_inner(
346            path,
347            out,
348            follow_symlinks,
349            visited,
350            files_visited,
351            tally,
352            progress,
353        );
354    }
355    let meta = match std::fs::metadata(path) {
356        Ok(m) => m,
357        Err(e) => {
358            log::warn!("skipping directory {}: {e}", path.display());
359            return Ok(());
360        }
361    };
362    if !visited.insert(dir_key(&meta)) {
363        log::warn!("skipping symlink cycle at {}", path.display());
364        return Ok(());
365    }
366    collect_audio_inner(
367        path,
368        out,
369        follow_symlinks,
370        visited,
371        files_visited,
372        tally,
373        progress,
374    )
375}
376
/// Identity key for a filesystem object: its `(device, inode)` pair.
fn dir_key(meta: &std::fs::Metadata) -> (u64, u64) {
    let device = std::os::unix::fs::MetadataExt::dev(meta);
    let inode = std::os::unix::fs::MetadataExt::ino(meta);
    (device, inode)
}
381
382/// Collect one supported-extension file into `out`, deduplicating by target
383/// identity when following symlinks so a real file and a symlink to it (or a
384/// file reached via two symlink paths) are ingested once. `known_meta` is the
385/// already-resolved target metadata when the caller has it (the symlink arm),
386/// avoiding a second `stat`. Dedup is best-effort: if the target cannot be
387/// `stat`ed we push it and let the probe pipeline count it rather than dropping
388/// it silently.
389fn push_file(
390    path: &Path,
391    out: &mut Vec<PathBuf>,
392    follow_symlinks: bool,
393    files_visited: &mut HashSet<(u64, u64)>,
394    known_meta: Option<&std::fs::Metadata>,
395    progress: Option<&ProgressSink>,
396) {
397    if !follow_symlinks {
398        out.push(path.to_path_buf());
399        if let Some(p) = progress {
400            p.emit(ScanProgress::Discovered {
401                found: out.len() as u64,
402            });
403        }
404        return;
405    }
406    let key = match known_meta {
407        Some(m) => Some(dir_key(m)),
408        None => std::fs::metadata(path).ok().map(|m| dir_key(&m)),
409    };
410    match key {
411        Some(k) if !files_visited.insert(k) => {
412            log::debug!("skipping duplicate backing target {}", path.display());
413        }
414        _ => {
415            out.push(path.to_path_buf());
416            if let Some(p) = progress {
417                p.emit(ScanProgress::Discovered {
418                    found: out.len() as u64,
419                });
420            }
421        }
422    }
423}
424
/// A backing file parsed into the fields a track row needs, plus its raw
/// `(key, value)` tags to seed.
#[derive(Debug)]
pub(crate) struct Probed {
    /// Detected format of the file.
    format: Format,
    /// Byte offset of the audio payload within the backing file.
    audio_offset: u64,
    /// Length of the audio payload in bytes.
    audio_length: u64,
    /// Raw textual tags as `(key, value)` pairs; for MP3 and WAV this also
    /// includes the pairs promoted by the binary-tag reader.
    tags: Vec<(String, String)>,
    /// Embedded pictures read from the metadata.
    pictures: Vec<EmbeddedPicture>,
    /// Opaque binary tags; always empty for the Ogg family in this file.
    binary_tags: Vec<EmbeddedBinaryTag>,
    /// FLAC STREAMINFO/SEEKTABLE as (kind, body) pairs; empty for other formats.
    structural_blocks: Vec<(String, Vec<u8>)>,
}
438
439/// Assemble a WAV [`Probed`] from located audio bounds, reading tags and pictures
440/// from `prefix`. Shared by the bounded, full-buffer, and ceiling probe paths.
441fn wav_probed(prefix: &[u8], bounds: &wav::WavBounds) -> Probed {
442    let (binary_tags, promoted) = wav::read_binary_tags(prefix);
443    let mut tags = wav::read_tags(prefix);
444    tags.extend(promoted);
445    Probed {
446        format: Format::Wav,
447        audio_offset: bounds.audio_offset,
448        audio_length: bounds.audio_length,
449        tags,
450        pictures: wav::read_pictures(prefix),
451        binary_tags,
452        structural_blocks: Vec::new(),
453    }
454}
455
456/// Full-buffer probe (legacy path). Retained as the reference implementation the
457/// bounded path is checked against (see the equivalence property test).
458pub(crate) fn probe_full(path: &Path, bytes: &[u8]) -> Option<Probed> {
459    if has_ext(path, "flac") {
460        let scan = flac::locate_audio(bytes).ok()?;
461        let (structural_blocks, binary_tags) = flac::split_preserved(&scan.preserved);
462        Some(Probed {
463            format: Format::Flac,
464            audio_offset: scan.audio_offset,
465            audio_length: scan.audio_length,
466            tags: flac::read_vorbis_comments(bytes).unwrap_or_default(),
467            pictures: flac::read_pictures(bytes).unwrap_or_default(),
468            binary_tags,
469            structural_blocks,
470        })
471    } else if has_ext(path, "mp3") {
472        let bounds = mp3::locate_audio(bytes).ok()?;
473        let (binary_tags, promoted) = mp3::read_binary_tags(bytes);
474        let mut tags = mp3::read_tags(bytes);
475        tags.extend(promoted);
476        Some(Probed {
477            format: Format::Mp3,
478            audio_offset: bounds.audio_offset,
479            audio_length: bounds.audio_length,
480            tags,
481            pictures: mp3::read_pictures(bytes),
482            binary_tags,
483            structural_blocks: Vec::new(),
484        })
485    } else if has_ext(path, "m4a") || has_ext(path, "m4b") {
486        let bounds = mp4::locate_audio(bytes).ok()?;
487        let (pictures, art_drops) = mp4::read_pictures_reporting(bytes, MAX_ART_BYTES);
488        let (binary_tags, bin_drops) = mp4::read_binary_tags_reporting(bytes, MAX_BINARY_TAG_BYTES);
489        log_mp4_oversize_drops(path, &art_drops, &bin_drops);
490        Some(Probed {
491            format: Format::M4a,
492            audio_offset: bounds.audio_offset,
493            audio_length: bounds.audio_length,
494            tags: mp4::read_tags(bytes),
495            pictures,
496            binary_tags,
497            structural_blocks: Vec::new(),
498        })
499    } else if has_ext(path, "ogg") || has_ext(path, "oga") || has_ext(path, "opus") {
500        let scan = ogg::locate_audio(bytes).ok()?;
501        let format = match scan.codec {
502            ogg::Codec::Opus => Format::Opus,
503            ogg::Codec::Vorbis => Format::Vorbis,
504            ogg::Codec::OggFlac => Format::OggFlac,
505        };
506        Some(Probed {
507            format,
508            audio_offset: scan.audio_offset,
509            audio_length: scan.audio_length,
510            tags: ogg::read_tags(bytes).unwrap_or_default(),
511            pictures: ogg::read_pictures(bytes).unwrap_or_default(),
512            binary_tags: Vec::new(),
513            structural_blocks: Vec::new(),
514        })
515    } else if has_ext(path, "wav") {
516        let bounds = wav::locate_audio(bytes).ok()?;
517        Some(wav_probed(bytes, &bounds))
518    } else {
519        None
520    }
521}
522
523/// Read `[0, len)` of `path` into a buffer, counting the read. A short read at
524/// EOF is fine (`len` may exceed the file size).
525fn read_window(file: &std::fs::File, len: usize) -> std::io::Result<Vec<u8>> {
526    use std::os::unix::fs::FileExt;
527    let mut buf = vec![0u8; len];
528    let n = file.read_at(&mut buf, 0)?;
529    buf.truncate(n);
530    crate::metrics::on_scan_read(n as u64);
531    Ok(buf)
532}
533
534/// Read the file's last 128 bytes (for the MP3 ID3v1 trailer check), or `None`
535/// if the file is shorter than 128 bytes.
536fn read_tail_128(file: &std::fs::File, file_len: u64) -> std::io::Result<Option<[u8; 128]>> {
537    if file_len < 128 {
538        return Ok(None);
539    }
540    use std::os::unix::fs::FileExt;
541    let mut buf = [0u8; 128];
542    file.read_exact_at(&mut buf, file_len - 128)?;
543    crate::metrics::on_scan_read(128);
544    Ok(Some(buf))
545}
546
547/// Bounded probe of one backing file: open once, fstat before and after the
548/// probe, and report `Raced` when the file moved mid-probe — so the stored
549/// stamp and the probed bytes provably share one inode held still across the
550/// probe. Never reads the audio payload (M4A uses the seek reader;
551/// front-anchored formats read only the metadata extent).
552///
553/// Returns `ProbeOutcome::Unparseable` for a supported-extension file that does
554/// not parse (counted as `failed`) and `ProbeOutcome::Raced` if the file
555/// changed under us.
556fn probe_file(path: &Path, window: usize) -> std::io::Result<ProbeOutcome> {
557    let file = std::fs::File::open(path)?;
558    crate::metrics::on_scan_open();
559    let s1 = BackingStamp::from_metadata(&file.metadata()?);
560    #[cfg(test)]
561    fire_after_s1();
562
563    let probed = probe_body(path, &file, s1.size, window)?;
564
565    let s2 = BackingStamp::from_metadata(&file.metadata()?);
566    if s1 != s2 {
567        log::warn!("skipping {}: changed during probe", path.display());
568        return Ok(ProbeOutcome::Raced);
569    }
570    Ok(match probed {
571        Some(p) => ProbeOutcome::Probed(p, s1),
572        None => ProbeOutcome::Unparseable,
573    })
574}
575
576/// Run [`probe_file`] under a panic boundary so a residual parser panic — one
577/// the format-layer alloc guards (`id3v2_alloc_safe` and friends) don't catch —
578/// drops just that file instead of unwinding the scan worker thread. An unwound
579/// worker would skip its `failed.fetch_add`, and a crafted directory could kill
580/// every worker, closing the channel so the writer reports success while
581/// silently truncating the rest of the library (#425). A caught panic is logged
582/// and folded into `ProbeOutcome::Unparseable`, which the worker already counts
583/// as `failed`. Mirrors the read path's `read_outcome` boundary (#359).
584fn probe_file_caught(path: &Path, window: usize) -> std::io::Result<ProbeOutcome> {
585    match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| probe_file(path, window))) {
586        Ok(res) => res,
587        Err(payload) => {
588            let msg = payload
589                .downcast_ref::<&str>()
590                .copied()
591                .or_else(|| payload.downcast_ref::<String>().map(String::as_str))
592                .unwrap_or("<non-string panic>");
593            log::error!(
594                "scan worker panicked probing {}: {msg}; counting as failed",
595                path.display()
596            );
597            Ok(ProbeOutcome::Unparseable)
598        }
599    }
600}
601
/// The per-format metadata dispatch for one already-opened backing file, over
/// its first `file_len` bytes. Split out of `probe_file` so the fstat-sandwich
/// wrapper stays legible. Never reads the audio payload (M4A uses the seek
/// reader; front-anchored formats read only the metadata extent). Returns
/// `Ok(None)` for an unsupported/unparseable file.
///
/// `window` is the size of the first read for the front-anchored formats; it
/// is widened on `NeedMore` up to `MAX_WIDEN_RETRIES` times and never beyond
/// `min(file_len, MAX_PROBE_BYTES)`. I/O errors from the reads are propagated;
/// parse failures are logged and reported as `Ok(None)`.
fn probe_body(
    path: &Path,
    file: &std::fs::File,
    file_len: u64,
    window: usize,
) -> std::io::Result<Option<Probed>> {
    // M4A: seek reader, never touches mdat.
    if has_ext(path, "m4a") || has_ext(path, "m4b") {
        // Rebind so a `&mut` to the `&File` handle can be passed to the reader.
        let mut f = file;
        let scan = match mp4::read_structure_from(&mut f, file_len) {
            Ok(s) => s,
            Err(e) => {
                log::warn!("skipping {}: {e}", path.display());
                return Ok(None);
            }
        };
        // Tags, art and binary tags are all read out of the `moov` bytes.
        let (pictures, art_drops) = mp4::read_pictures_reporting(&scan.moov, MAX_ART_BYTES);
        let (binary_tags, bin_drops) =
            mp4::read_binary_tags_reporting(&scan.moov, MAX_BINARY_TAG_BYTES);
        log_mp4_oversize_drops(path, &art_drops, &bin_drops);
        return Ok(Some(Probed {
            format: Format::M4a,
            audio_offset: scan.mdat_payload_offset,
            audio_length: scan.mdat_payload_len,
            tags: mp4::read_tags(&scan.moov),
            pictures,
            binary_tags,
            structural_blocks: Vec::new(),
        }));
    }

    // Front-anchored formats: read a window, widen on NeedMore. Only the MP3
    // arm of probe_prefix consumes the ID3v1 tail, and dispatch is by
    // extension — so only .mp3 pays the tail read (#67).
    let tail = if has_ext(path, "mp3") {
        read_tail_128(file, file_len)?
    } else {
        None
    };
    // Never read past the probe ceiling, however large the file or whatever a
    // (possibly corrupt) header asks for via `NeedMore`.
    let probe_cap = file_len.min(MAX_PROBE_BYTES);
    let mut want = usize_from((window as u64).min(probe_cap));
    let mut prefix = read_window(file, want)?;
    for _ in 0..MAX_WIDEN_RETRIES {
        match probe_prefix(path, &prefix, file_len, tail.as_ref()) {
            Probe::Done(p) => return Ok(Some(p)),
            Probe::Skip => {
                log::warn!("skipping {}: no parseable audio metadata", path.display());
                return Ok(None);
            }
            Probe::NeedMore(up_to) => {
                // Read everything we're willing to probe? Widening can't help.
                if want as u64 >= probe_cap {
                    break;
                }
                // Grow to at least `up_to` (capped at `probe_cap`), always making
                // progress (`+1`), then retry.
                want = usize_from(up_to.min(probe_cap))
                    .max(want + 1)
                    .min(usize_from(probe_cap));
                // Each widen re-reads from offset 0 rather than appending.
                prefix = read_window(file, want)?;
            }
        }
    }
    // Reached when the retries ran out or the window already hit the cap while
    // the parser still asked for more.
    // Fallback: full-buffer probe over the bytes we were willing to read.
    if (prefix.len() as u64) < probe_cap {
        prefix = read_window(file, usize_from(probe_cap))?;
    }
    if let Some(p) = probe_full(path, &prefix) {
        return Ok(Some(p));
    }
    // A WAV whose `data` payload runs past the probe ceiling fails the strict
    // full-buffer parse (the payload isn't present to bound), yet its `fmt `/`data`
    // headers sit at the front: trust the declared bounds and serve the audio,
    // accepting the loss of any tag chunks trailing the payload.
    if has_ext(path, "wav")
        && file_len > MAX_PROBE_BYTES
        && let Ok(bounds) = wav::locate_audio_at_ceiling(&prefix, file_len)
    {
        return Ok(Some(wav_probed(&prefix, &bounds)));
    }
    // Nothing parsed: pick the log message that says whether the ceiling was
    // the limiting factor.
    if file_len > MAX_PROBE_BYTES {
        log::warn!(
            "skipping {}: no parseable metadata within first {MAX_PROBE_BYTES} bytes",
            path.display()
        );
    } else {
        log::warn!("skipping {}: no parseable audio metadata", path.display());
    }
    Ok(None)
}
699
/// Outcome of a single bounded dispatch attempt against the current `prefix`.
enum Probe {
    /// The metadata was fully contained in the prefix and parsed.
    Done(Probed),
    /// The prefix is too short; the payload is the parser's `up_to` value,
    /// which `probe_body` uses as the next window size (capped at the probe
    /// ceiling).
    NeedMore(u64),
    /// The parser rejected the bytes, or the extension is not one of the
    /// front-anchored formats; widening will not help.
    Skip,
}
706
707/// Dispatch the front-anchored formats against `prefix` + `file_len`.
708fn probe_prefix(path: &Path, prefix: &[u8], file_len: u64, tail: Option<&[u8; 128]>) -> Probe {
709    if has_ext(path, "flac") {
710        match flac::read_metadata_bounded(prefix) {
711            Ok(Extent::Complete(meta)) => {
712                let (structural_blocks, binary_tags) = flac::split_preserved(&meta.preserved);
713                Probe::Done(Probed {
714                    format: Format::Flac,
715                    audio_offset: meta.audio_offset,
716                    audio_length: file_len - meta.audio_offset,
717                    tags: flac::read_vorbis_comments(prefix).unwrap_or_default(),
718                    pictures: flac::read_pictures(prefix).unwrap_or_default(),
719                    binary_tags,
720                    structural_blocks,
721                })
722            }
723            Ok(Extent::NeedMore { up_to }) => Probe::NeedMore(up_to),
724            Err(_) => Probe::Skip,
725        }
726    } else if has_ext(path, "mp3") {
727        match mp3::locate_audio_bounded(prefix, file_len, tail) {
728            Ok(Extent::Complete(b)) => {
729                let (binary_tags, promoted) = mp3::read_binary_tags(prefix);
730                let mut tags = mp3::read_tags(prefix);
731                tags.extend(promoted);
732                Probe::Done(Probed {
733                    format: Format::Mp3,
734                    audio_offset: b.audio_offset,
735                    audio_length: b.audio_length,
736                    tags,
737                    pictures: mp3::read_pictures(prefix),
738                    binary_tags,
739                    structural_blocks: Vec::new(),
740                })
741            }
742            Ok(Extent::NeedMore { up_to }) => Probe::NeedMore(up_to),
743            Err(_) => Probe::Skip,
744        }
745    } else if has_ext(path, "ogg") || has_ext(path, "oga") || has_ext(path, "opus") {
746        match ogg::read_metadata_bounded(prefix, file_len) {
747            Ok(Extent::Complete(header)) => {
748                let format = match header.codec {
749                    ogg::Codec::Opus => Format::Opus,
750                    ogg::Codec::Vorbis => Format::Vorbis,
751                    ogg::Codec::OggFlac => Format::OggFlac,
752                };
753                Probe::Done(Probed {
754                    format,
755                    audio_offset: header.audio_offset,
756                    audio_length: file_len - header.audio_offset,
757                    tags: ogg::read_tags(prefix).unwrap_or_default(),
758                    pictures: ogg::read_pictures(prefix).unwrap_or_default(),
759                    binary_tags: Vec::new(),
760                    structural_blocks: Vec::new(),
761                })
762            }
763            Ok(Extent::NeedMore { up_to }) => Probe::NeedMore(up_to),
764            Err(_) => Probe::Skip,
765        }
766    } else if has_ext(path, "wav") {
767        match wav::locate_audio_bounded(prefix, file_len) {
768            Ok(Extent::Complete(b)) => Probe::Done(wav_probed(prefix, &b)),
769            Ok(Extent::NeedMore { up_to }) => Probe::NeedMore(up_to),
770            Err(_) => Probe::Skip,
771        }
772    } else {
773        Probe::Skip
774    }
775}
776
/// How much checksum work a scan does per file. Selected through
/// `ScanOptions::checksum`, whose default is `Fingerprint`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChecksumTier {
    /// No checksums (legacy behavior).
    None,
    /// Compute the cheap fingerprint only (rides the probe).
    Fingerprint,
    /// Fingerprint plus an eager full-file SHA-256.
    Full,
}
787
/// How a fingerprint match is confirmed before a retarget. Selected through
/// `ScanOptions::strictness`, whose default is `Auto`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MatchStrictness {
    /// Confirm with the full hash when the candidate has one; else trust the
    /// fingerprint.
    Auto,
    /// Fingerprint match is always sufficient; never read the full file.
    Fast,
    /// Require a full-hash match; refuse the retarget if the candidate has no
    /// stored content_hash.
    Strict,
}
800
/// Whether the writer overwrites curated metadata or only refreshes structural
/// serving facts. Private to this module; the code that chooses a policy is
/// outside this part of the file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WritePolicy {
    /// Full upsert: track row, checksums, tags, binary tags, structural blocks,
    /// and art.
    Full,
    /// Layer-A-only refresh: track row, checksums, and structural blocks.
    /// Tags, binary tags, and art are left as they are.
    StructuralOnly,
}
811
812/// Knobs for a scan. `jobs == 0` means "use available parallelism".
813#[derive(Debug, Clone)]
814pub struct ScanOptions {
815    pub jobs: usize,
816    /// Initial probe read window in bytes; widened on `NeedMore`.
817    pub window: usize,
818    /// In-flight art-byte budget and per-batch byte-flush threshold.
819    pub batch_bytes: u64,
820    /// Follow symlinks during collection. Off by default: symlinks are logged
821    /// and skipped, which keeps the walk immune to directory-symlink cycles.
822    pub follow_symlinks: bool,
823    /// Optional progress callback. `None` (the default) disables reporting.
824    pub progress: Option<ProgressSink>,
825    /// Which checksums to compute and store this scan.
826    pub checksum: ChecksumTier,
827    /// How a refind fingerprint match is confirmed before retargeting.
828    pub strictness: MatchStrictness,
829    /// Scan only: re-ingest files already present in the DB, overwriting
830    /// curated metadata. Off by default; bare scan is additive.
831    pub force: bool,
832    /// Revalidate only: delete tracks whose backing file is gone and GC
833    /// orphaned art. Off by default.
834    pub prune: bool,
835}
836
837impl Default for ScanOptions {
838    fn default() -> Self {
839        Self {
840            jobs: 0,
841            window: WINDOW,
842            batch_bytes: BATCH_BYTES,
843            follow_symlinks: false,
844            progress: None,
845            checksum: ChecksumTier::Fingerprint,
846            strictness: MatchStrictness::Auto,
847            force: false,
848            prune: false,
849        }
850    }
851}
852
/// Resolves the requested worker count: a non-zero `jobs` is used verbatim,
/// while `0` becomes the machine's available parallelism (or `1` if that
/// cannot be determined).
fn effective_jobs(jobs: usize) -> usize {
    match jobs {
        0 => std::thread::available_parallelism().map_or(1, std::num::NonZeroUsize::get),
        requested => requested,
    }
}
859
860/// One probed file ready to write, plus its art-byte weight for backpressure.
861struct Unit {
862    abs_path: String,
863    stamp: BackingStamp,
864    probed: Probed,
865    weight: u64,
866    fingerprint: Option<String>,
867    content_hash: Option<String>,
868}
869
870/// In-memory byte weight of a `Probed`, used for batch backpressure
871/// (`ScanOptions::batch_bytes`). Counts every buffered payload — pictures plus FLAC
872/// structural blocks and binary tags — so large preserved blocks can't slip the
873/// budget the way picture-only accounting did.
874fn payload_weight(p: &Probed) -> u64 {
875    let pictures: u64 = p.pictures.iter().map(|pic| pic.data.len() as u64).sum();
876    let binary: u64 = p.binary_tags.iter().map(|t| t.payload.len() as u64).sum();
877    let structural: u64 = p
878        .structural_blocks
879        .iter()
880        .map(|(_, body)| body.len() as u64)
881        .sum();
882    pictures + binary + structural
883}
884
/// The universal floor for `tags.key`, kept in step with the DB `CHECK`: the
/// key must be non-empty and must not contain any byte below 0x20 (the control
/// characters the DB rejects through its GLOB range; NUL is rejected here as
/// well, although the DB has a documented blind spot for it). DEL (0x7F) and
/// high/non-ASCII bytes pass, as they do in the DB.
///
/// This is deliberately looser than the strict Vorbis `is_valid_key` (which
/// additionally bars `=`, 0x7E, 0x7F, and non-ASCII): using that rule here
/// would wrongly discard legal MP3/M4A custom keys containing `=`/`:`/space.
fn key_passes_floor(key: &str) -> bool {
    if key.is_empty() {
        return false;
    }
    !key.bytes().any(|byte| byte < 0x20)
}
895
896/// Drops embedded pictures over [`MAX_ART_BYTES`], logging each so a cover that
897/// vanishes from the synthesized view is explained rather than silent (#284).
898/// Filtering here, before the caller enumerates, keeps stored art ordinals
899/// gap-free. Note: the mp4 `covr` path caps oversize art earlier, inside
900/// `mp4::read_pictures`, so those drops never reach this filter.
901fn accept_pictures(abs_path: &str, pictures: Vec<EmbeddedPicture>) -> Vec<EmbeddedPicture> {
902    pictures
903        .into_iter()
904        .filter(|p| {
905            if p.data.len() > MAX_ART_BYTES {
906                log::warn!(
907                    "{abs_path}: dropping embedded {} art ({} bytes), over the {MAX_ART_BYTES}-byte cap",
908                    p.mime,
909                    p.data.len(),
910                );
911                return false;
912            }
913            true
914        })
915        .collect()
916}
917
918/// Filters embedded binary tags to those worth storing, logging oversize drops
919/// (#284). Empty payloads carry nothing to serve, so they are dropped silently;
920/// payloads over [`MAX_BINARY_TAG_BYTES`] are a lossy drop and get a warning.
921fn accept_binary_tags(abs_path: &str, tags: Vec<EmbeddedBinaryTag>) -> Vec<musefs_db::BinaryTag> {
922    tags.into_iter()
923        .filter(|b| {
924            if b.payload.len() > MAX_BINARY_TAG_BYTES {
925                log::warn!(
926                    "{abs_path}: dropping binary tag {} ({} bytes), over the {MAX_BINARY_TAG_BYTES}-byte cap",
927                    b.key,
928                    b.payload.len(),
929                );
930                return false;
931            }
932            !b.payload.is_empty()
933        })
934        .enumerate()
935        .map(|(ordinal, b)| musefs_db::BinaryTag {
936            key: b.key,
937            payload: b.payload,
938            ordinal: ordinal as u64,
939        })
940        .collect()
941}
942
943/// Logs each oversized mp4 `covr` image / binary `----` value that the format
944/// layer skipped before materialization (#343). These drops happen inside
945/// `mp4::read_pictures` / `mp4::read_binary_tags` — earlier than the `accept_*`
946/// ingest filters that log the lossy drops for the other formats (#284), and
947/// deliberately so, to avoid building a large image out of a large `moov` — so
948/// they are surfaced here at probe time, mirroring the `accept_*` message shape.
949fn log_mp4_oversize_drops(path: &Path, art: &[mp4::OversizeDrop], binary: &[mp4::OversizeDrop]) {
950    for d in art {
951        log::warn!(
952            "{}: dropping embedded {} art ({} bytes), over the {MAX_ART_BYTES}-byte cap",
953            path.display(),
954            d.descriptor,
955            d.bytes,
956        );
957    }
958    for d in binary {
959        log::warn!(
960            "{}: dropping binary tag {} ({} bytes), over the {MAX_BINARY_TAG_BYTES}-byte cap",
961            path.display(),
962            d.descriptor,
963            d.bytes,
964        );
965    }
966}
967
968fn structural_blocks_from(blocks: Vec<(String, Vec<u8>)>) -> Vec<musefs_db::StructuralBlock> {
969    let mut ordinals: HashMap<String, u64> = HashMap::new();
970    blocks
971        .into_iter()
972        .map(|(kind, body)| {
973            let ord = ordinals.entry(kind.clone()).or_insert(0);
974            let block = musefs_db::StructuralBlock {
975                kind,
976                ordinal: *ord,
977                body,
978            };
979            *ord += 1;
980            block
981        })
982        .collect()
983}
984
985/// The write surface `ingest_into` drives: satisfied by both a direct `&Db`
986/// (its methods take `&self`) and a batched `&mut BulkWriter` (`&mut self`), so
987/// the upsert body lives in exactly one place. Each method delegates through the
988/// concrete type path (`Db::`/`BulkWriter::`), which names the inherent method
989/// unambiguously so the same-named trait method can't recurse into itself.
990trait TrackSink {
991    fn upsert_track(&mut self, t: &NewTrack) -> musefs_db::Result<i64>;
992    fn replace_tags(&mut self, track_id: i64, tags: &[Tag]) -> musefs_db::Result<()>;
993    fn set_binary_tags(
994        &mut self,
995        track_id: i64,
996        tags: &[musefs_db::BinaryTag],
997    ) -> musefs_db::Result<()>;
998    fn set_structural_blocks(
999        &mut self,
1000        track_id: i64,
1001        blocks: &[musefs_db::StructuralBlock],
1002    ) -> musefs_db::Result<()>;
1003    fn upsert_art(&mut self, a: &NewArt) -> musefs_db::Result<i64>;
1004    fn set_track_art(&mut self, track_id: i64, items: &[TrackArt]) -> musefs_db::Result<()>;
1005    fn set_track_checksums(
1006        &mut self,
1007        track_id: i64,
1008        fingerprint: Option<&str>,
1009        content_hash: Option<&str>,
1010    ) -> musefs_db::Result<()>;
1011    fn track_exists_at(&mut self, path: &str) -> musefs_db::Result<bool>;
1012    fn tracks_by_fingerprint(&mut self, fp: &str) -> musefs_db::Result<Vec<musefs_db::Track>>;
1013    #[allow(clippy::too_many_arguments)]
1014    fn retarget_track(
1015        &mut self,
1016        id: i64,
1017        new_backing_path: &str,
1018        stamp: BackingStamp,
1019        audio_offset: u64,
1020        audio_length: u64,
1021        fingerprint: Option<&str>,
1022        content_hash: Option<&str>,
1023    ) -> musefs_db::Result<()>;
1024}
1025
1026impl TrackSink for &Db {
1027    fn upsert_track(&mut self, t: &NewTrack) -> musefs_db::Result<i64> {
1028        Db::upsert_track(self, t)
1029    }
1030    fn replace_tags(&mut self, track_id: i64, tags: &[Tag]) -> musefs_db::Result<()> {
1031        Db::replace_tags(self, track_id, tags)
1032    }
1033    fn set_binary_tags(
1034        &mut self,
1035        track_id: i64,
1036        tags: &[musefs_db::BinaryTag],
1037    ) -> musefs_db::Result<()> {
1038        Db::set_binary_tags(self, track_id, tags)
1039    }
1040    fn set_structural_blocks(
1041        &mut self,
1042        track_id: i64,
1043        blocks: &[musefs_db::StructuralBlock],
1044    ) -> musefs_db::Result<()> {
1045        Db::set_structural_blocks(self, track_id, blocks)
1046    }
1047    fn upsert_art(&mut self, a: &NewArt) -> musefs_db::Result<i64> {
1048        Db::upsert_art(self, a)
1049    }
1050    fn set_track_art(&mut self, track_id: i64, items: &[TrackArt]) -> musefs_db::Result<()> {
1051        Db::set_track_art(self, track_id, items)
1052    }
1053    fn set_track_checksums(
1054        &mut self,
1055        track_id: i64,
1056        fingerprint: Option<&str>,
1057        content_hash: Option<&str>,
1058    ) -> musefs_db::Result<()> {
1059        Db::set_track_checksums(self, track_id, fingerprint, content_hash)
1060    }
1061    fn track_exists_at(&mut self, path: &str) -> musefs_db::Result<bool> {
1062        Ok(Db::get_track_by_path(self, path)?.is_some())
1063    }
1064    fn tracks_by_fingerprint(&mut self, fp: &str) -> musefs_db::Result<Vec<musefs_db::Track>> {
1065        Db::tracks_by_fingerprint(self, fp)
1066    }
1067    fn retarget_track(
1068        &mut self,
1069        id: i64,
1070        new_backing_path: &str,
1071        stamp: BackingStamp,
1072        audio_offset: u64,
1073        audio_length: u64,
1074        fingerprint: Option<&str>,
1075        content_hash: Option<&str>,
1076    ) -> musefs_db::Result<()> {
1077        Db::retarget_track(
1078            self,
1079            id,
1080            new_backing_path,
1081            stamp.size,
1082            stamp.mtime_ns,
1083            stamp.ctime_ns,
1084            audio_offset,
1085            audio_length,
1086            fingerprint,
1087            content_hash,
1088        )
1089    }
1090}
1091
1092impl TrackSink for &mut musefs_db::BulkWriter<'_> {
1093    fn upsert_track(&mut self, t: &NewTrack) -> musefs_db::Result<i64> {
1094        musefs_db::BulkWriter::upsert_track(self, t)
1095    }
1096    fn replace_tags(&mut self, track_id: i64, tags: &[Tag]) -> musefs_db::Result<()> {
1097        musefs_db::BulkWriter::replace_tags(self, track_id, tags)
1098    }
1099    fn set_binary_tags(
1100        &mut self,
1101        track_id: i64,
1102        tags: &[musefs_db::BinaryTag],
1103    ) -> musefs_db::Result<()> {
1104        musefs_db::BulkWriter::set_binary_tags(self, track_id, tags)
1105    }
1106    fn set_structural_blocks(
1107        &mut self,
1108        track_id: i64,
1109        blocks: &[musefs_db::StructuralBlock],
1110    ) -> musefs_db::Result<()> {
1111        musefs_db::BulkWriter::set_structural_blocks(self, track_id, blocks)
1112    }
1113    fn upsert_art(&mut self, a: &NewArt) -> musefs_db::Result<i64> {
1114        musefs_db::BulkWriter::upsert_art(self, a)
1115    }
1116    fn set_track_art(&mut self, track_id: i64, items: &[TrackArt]) -> musefs_db::Result<()> {
1117        musefs_db::BulkWriter::set_track_art(self, track_id, items)
1118    }
1119    fn set_track_checksums(
1120        &mut self,
1121        track_id: i64,
1122        fingerprint: Option<&str>,
1123        content_hash: Option<&str>,
1124    ) -> musefs_db::Result<()> {
1125        musefs_db::BulkWriter::set_track_checksums(self, track_id, fingerprint, content_hash)
1126    }
1127    fn track_exists_at(&mut self, path: &str) -> musefs_db::Result<bool> {
1128        Ok(musefs_db::BulkWriter::get_track_by_path(self, path)?.is_some())
1129    }
1130    fn tracks_by_fingerprint(&mut self, fp: &str) -> musefs_db::Result<Vec<musefs_db::Track>> {
1131        musefs_db::BulkWriter::tracks_by_fingerprint(self, fp)
1132    }
1133    fn retarget_track(
1134        &mut self,
1135        id: i64,
1136        new_backing_path: &str,
1137        stamp: BackingStamp,
1138        audio_offset: u64,
1139        audio_length: u64,
1140        fingerprint: Option<&str>,
1141        content_hash: Option<&str>,
1142    ) -> musefs_db::Result<()> {
1143        musefs_db::BulkWriter::retarget_track(
1144            self,
1145            id,
1146            new_backing_path,
1147            stamp.size,
1148            stamp.mtime_ns,
1149            stamp.ctime_ns,
1150            audio_offset,
1151            audio_length,
1152            fingerprint,
1153            content_hash,
1154        )
1155    }
1156}
1157
1158/// Upsert a track from a probed backing file into `w`: write the track row,
1159/// replace its seeded tags, and ingest its embedded art (capped, deduped,
1160/// clamped). The single source of the ingest body shared by `ingest` (direct
1161/// `&Db`), `ingest_unit` (production batch path), and `ingest_bulk` (test-only
1162/// `BulkWriter` wrapper). Takes `probed` by value so
1163/// picture/binary-tag/structural-block bytes are moved, not cloned (#68).
1164fn ingest_into(
1165    mut w: impl TrackSink,
1166    abs_path: &str,
1167    stamp: BackingStamp,
1168    probed: Probed,
1169    fingerprint: Option<&str>,
1170    content_hash: Option<&str>,
1171) -> Result<()> {
1172    let track_id = w.upsert_track(&NewTrack {
1173        backing_path: abs_path.to_string(),
1174        format: probed.format,
1175        audio_offset: probed.audio_offset,
1176        audio_length: probed.audio_length,
1177        backing_size: stamp.size,
1178        backing_mtime_ns: stamp.mtime_ns,
1179        backing_ctime_ns: stamp.ctime_ns,
1180    })?;
1181    w.set_track_checksums(track_id, fingerprint, content_hash)?;
1182
1183    let mut tags = Vec::new();
1184    let mut ordinals: HashMap<String, u64> = HashMap::new();
1185    for (key, value) in probed.tags {
1186        if !key_passes_floor(&key) {
1187            continue;
1188        }
1189        let ord = ordinals.entry(key.clone()).or_insert(0);
1190        tags.push(Tag::new(&key, &value, *ord));
1191        *ord += 1;
1192    }
1193    w.replace_tags(track_id, &tags)?;
1194
1195    let binary_tags = accept_binary_tags(abs_path, probed.binary_tags);
1196    w.set_binary_tags(track_id, &binary_tags)?;
1197
1198    let structural_blocks = structural_blocks_from(probed.structural_blocks);
1199    w.set_structural_blocks(track_id, &structural_blocks)?;
1200
1201    let mut track_arts = Vec::new();
1202    for (ordinal, pic) in accept_pictures(abs_path, probed.pictures)
1203        .into_iter()
1204        .enumerate()
1205    {
1206        let art_id = w.upsert_art(&NewArt {
1207            mime: pic.mime,
1208            width: (pic.width != 0).then_some(pic.width),
1209            height: (pic.height != 0).then_some(pic.height),
1210            data: pic.data,
1211        })?;
1212        let picture_type = pic.picture_type.get();
1213        track_arts.push(TrackArt {
1214            art_id,
1215            picture_type,
1216            description: pic.description,
1217            ordinal: ordinal as u64,
1218        });
1219    }
1220    w.set_track_art(track_id, &track_arts)?;
1221    Ok(())
1222}
1223
1224/// Refresh only the structural serving facts for an already-probed file.
1225/// Leaves curated tags, binary tags, and art untouched.
1226fn refresh_structural_into(
1227    mut w: impl TrackSink,
1228    abs_path: &str,
1229    stamp: BackingStamp,
1230    probed: Probed,
1231    fingerprint: Option<&str>,
1232    content_hash: Option<&str>,
1233) -> Result<()> {
1234    let track_id = w.upsert_track(&NewTrack {
1235        backing_path: abs_path.to_string(),
1236        format: probed.format,
1237        audio_offset: probed.audio_offset,
1238        audio_length: probed.audio_length,
1239        backing_size: stamp.size,
1240        backing_mtime_ns: stamp.mtime_ns,
1241        backing_ctime_ns: stamp.ctime_ns,
1242    })?;
1243    w.set_track_checksums(track_id, fingerprint, content_hash)?;
1244    let structural_blocks = structural_blocks_from(probed.structural_blocks);
1245    w.set_structural_blocks(track_id, &structural_blocks)?;
1246    Ok(())
1247}
1248
1249/// Decide how to ingest one probed unit: retarget a relocated row when a unique
1250/// fingerprint match exists whose backing file is gone, otherwise ingest fresh.
1251/// The strict/auto confirm hash, if computed here, is persisted on the retarget
1252/// (so a fingerprint-tier strict move doesn't re-read the file next scan).
1253fn ingest_unit(
1254    mut w: impl TrackSink,
1255    unit: Unit,
1256    strictness: MatchStrictness,
1257    policy: WritePolicy,
1258) -> Result<()> {
1259    if policy == WritePolicy::StructuralOnly {
1260        return refresh_structural_into(
1261            w,
1262            &unit.abs_path,
1263            unit.stamp,
1264            unit.probed,
1265            unit.fingerprint.as_deref(),
1266            unit.content_hash.as_deref(),
1267        );
1268    }
1269    // Known path => ordinary upsert (re-scan of an in-place file).
1270    if w.track_exists_at(&unit.abs_path)? {
1271        return ingest_into(
1272            w,
1273            &unit.abs_path,
1274            unit.stamp,
1275            unit.probed,
1276            unit.fingerprint.as_deref(),
1277            unit.content_hash.as_deref(),
1278        );
1279    }
1280    if let Some(fp) = unit.fingerprint.as_deref() {
1281        let candidates: Vec<musefs_db::Track> = w
1282            .tracks_by_fingerprint(fp)?
1283            .into_iter()
1284            .filter(|t| match std::fs::metadata(&t.backing_path) {
1285                Err(e) if e.kind() == std::io::ErrorKind::NotFound => true,
1286                Ok(_) => false,
1287                Err(e) => {
1288                    log::warn!(
1289                        "skipping retarget candidate {}: cannot stat backing path ({e})",
1290                        t.backing_path
1291                    );
1292                    false
1293                }
1294            })
1295            .collect();
1296        if candidates.len() == 1 {
1297            let cand = &candidates[0];
1298            // Does this strictness need a full-hash confirm against this candidate?
1299            let needs_full = match strictness {
1300                MatchStrictness::Fast => false,
1301                MatchStrictness::Auto | MatchStrictness::Strict => cand.content_hash.is_some(),
1302            };
1303            // The new file's full hash: worker-computed if present, else read now
1304            // (the file is present — it's the move destination). A read error here
1305            // must not abort the whole scan — log it and fall through with `None`,
1306            // which fails the confirm and inserts this unit fresh.
1307            let new_hash: Option<String> = match (&unit.content_hash, needs_full) {
1308                (Some(h), _) => Some(h.clone()),
1309                (None, true) => match full_file_hash(std::path::Path::new(&unit.abs_path)) {
1310                    Ok(h) => Some(h),
1311                    Err(e) => {
1312                        log::warn!(
1313                            "hash confirm failed for {}: {e}; inserting fresh",
1314                            unit.abs_path
1315                        );
1316                        None
1317                    }
1318                },
1319                (None, false) => None,
1320            };
1321            let confirmed = match strictness {
1322                MatchStrictness::Fast => true,
1323                MatchStrictness::Auto | MatchStrictness::Strict => match &cand.content_hash {
1324                    // Strict with no stored hash => refuse; Auto with none => fingerprint is enough.
1325                    None => matches!(strictness, MatchStrictness::Auto),
1326                    Some(stored) => new_hash.as_deref() == Some(stored.as_str()),
1327                },
1328            };
1329            if confirmed && !w.track_exists_at(&unit.abs_path)? {
1330                w.retarget_track(
1331                    cand.id,
1332                    &unit.abs_path,
1333                    unit.stamp,
1334                    unit.probed.audio_offset,
1335                    unit.probed.audio_length,
1336                    unit.fingerprint.as_deref(),
1337                    new_hash.as_deref(),
1338                )?;
1339                return Ok(());
1340            }
1341            if !confirmed {
1342                log::warn!(
1343                    "fingerprint match for {} not confirmed (strictness {:?}); inserting fresh",
1344                    unit.abs_path,
1345                    strictness,
1346                );
1347            }
1348        } else if candidates.len() > 1 {
1349            log::warn!(
1350                "ambiguous fingerprint match for {} ({} missing candidates); inserting fresh",
1351                unit.abs_path,
1352                candidates.len(),
1353            );
1354        }
1355    }
1356    ingest_into(
1357        w,
1358        &unit.abs_path,
1359        unit.stamp,
1360        unit.probed,
1361        unit.fingerprint.as_deref(),
1362        unit.content_hash.as_deref(),
1363    )
1364}
1365
1366/// Upsert a track from a probed backing file through a direct `&Db`. Thin
1367/// wrapper over [`ingest_into`]; the `oracle`/non-bulk scan path.
1368fn ingest(db: &Db, abs_path: &str, meta: &std::fs::Metadata, probed: Probed) -> Result<()> {
1369    ingest_into(
1370        db,
1371        abs_path,
1372        BackingStamp::from_metadata(meta),
1373        probed,
1374        None,
1375        None,
1376    )
1377}
1378
/// The [`ingest`] counterpart that writes through a batch `BulkWriter`, again
/// with no checksums. A thin wrapper over [`ingest_into`]; `stamp` is captured
/// once by the caller's `fstat`. The production batch path goes through
/// `ingest_unit` instead (it threads per-unit checksums), so this wrapper is
/// compiled for tests only and serves the hardening tests' bulk-writer
/// coverage.
#[cfg(test)]
fn ingest_bulk(
    bw: &mut musefs_db::BulkWriter<'_>,
    abs_path: &str,
    stamp: BackingStamp,
    probed: Probed,
) -> Result<()> {
    // Neither a fingerprint nor a content hash is supplied on this path.
    let (fingerprint, content_hash) = (None, None);
    ingest_into(bw, abs_path, stamp, probed, fingerprint, content_hash)
}
1392
1393/// Public entry: parallel-probe / single-writer scan of `root`.
1394///
1395/// Insert/update a track row for each supported audio file (FLAC, MP3, M4A,
1396/// Opus, Vorbis, FLAC-in-Ogg) under `root` (with audio bounds and validation
1397/// stamps), seeding its tags from the file's existing metadata. `root` may be
1398/// a single audio file (only that file is scanned) or a directory (walked
1399/// recursively). Files whose extension is not a supported audio format
1400/// increment `ScanStats::skipped` and are tallied by extension for the
1401/// end-of-scan summary log line (#341); supported-extension files with a
1402/// per-file I/O or parse error increment `ScanStats::failed` and do not abort
1403/// the scan.
1404pub fn scan_directory_with(db: &Db, root: &Path, opts: &ScanOptions) -> Result<ScanStats> {
1405    // Canonicalize the root once. With symlinks unfollowed (the default) every
1406    // path the walk yields is then already absolute and symlink-free — i.e.
1407    // canonical — so the workers need not canonicalize each probed file (#440).
1408    let canon = std::fs::canonicalize(root)?;
1409    let root = canon.as_path();
1410    let mut files = Vec::new();
1411    let mut tally = SkipTally::default();
1412    if root.is_file() {
1413        if is_supported_audio(root) {
1414            files.push(root.to_path_buf());
1415        } else {
1416            tally.record(root);
1417        }
1418    } else {
1419        tally = collect_audio_with(
1420            root,
1421            &mut files,
1422            opts.follow_symlinks,
1423            opts.progress.as_ref(),
1424        )?;
1425    }
1426    let mut already_present = 0u64;
1427    if !opts.force {
1428        let existing: HashSet<String> = db
1429            .list_tracks()?
1430            .into_iter()
1431            .map(|t| t.backing_path)
1432            .collect();
1433        let before = files.len();
1434        files.retain(|path| {
1435            let key = if opts.follow_symlinks {
1436                match std::fs::canonicalize(path) {
1437                    Ok(abs) => abs.to_string_lossy().into_owned(),
1438                    Err(_) => return true,
1439                }
1440            } else {
1441                path.to_string_lossy().into_owned()
1442            };
1443            !existing.contains(&key)
1444        });
1445        already_present = (before - files.len()) as u64;
1446    }
1447    if let Some(p) = &opts.progress {
1448        p.emit(ScanProgress::Walked {
1449            total: files.len() as u64,
1450        });
1451    }
1452    db.apply_bulk_pragmas_self()?; // scan-scoped tuning on the caller's connection
1453    let mut stats = run_pipeline(db, files, opts, WritePolicy::Full)?;
1454    // skipped is tallied during the walk, not the pipeline
1455    stats.skipped = tally.total;
1456    stats.already_present = already_present;
1457    // Per-extension breakdown of the skip count, so a large `skipped` is
1458    // diagnosable (#341). Log-only: never folded into `stats`/the CLI summary.
1459    if let Some(summary) = tally.summary() {
1460        log::warn!("{summary}");
1461    }
1462    Ok(stats)
1463}
1464
1465/// Back-compat shim used by the CLI and existing tests.
1466pub fn scan_directory(db: &Db, root: &Path) -> Result<ScanStats> {
1467    scan_directory_with(db, root, &ScanOptions::default())
1468}
1469
1470/// Probe `files` across `jobs` workers (no DB access) and write the results from a
1471/// single writer (this thread) in batched transactions. Per-file errors are
1472/// counted, not fatal.
1473fn run_pipeline(
1474    db: &Db,
1475    files: Vec<PathBuf>,
1476    opts: &ScanOptions,
1477    policy: WritePolicy,
1478) -> Result<ScanStats> {
1479    use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
1480
1481    let jobs = effective_jobs(opts.jobs);
1482    let total = files.len() as u64;
1483    let progress = opts.progress.as_ref();
1484    let window = opts.window;
1485    let follow_symlinks = opts.follow_symlinks;
1486    let tier = opts.checksum;
1487    let strictness = opts.strictness;
1488    let cap = opts.batch_bytes;
1489    let budget = Arc::new(ByteBudget::new(cap));
1490    let failed = Arc::new(AtomicU64::new(0));
1491    let raced = Arc::new(AtomicU64::new(0));
1492
1493    // Work queue: a shared slice with an atomic cursor — each worker claims the
1494    // next index with a single relaxed `fetch_add`, no per-file lock contention.
1495    let files = Arc::new(files);
1496    let cursor = Arc::new(AtomicUsize::new(0));
1497    let (tx, rx) = sync_channel::<Unit>(jobs * 2);
1498
1499    let mut workers = Vec::with_capacity(jobs);
1500    for _ in 0..jobs {
1501        let files = Arc::clone(&files);
1502        let cursor = Arc::clone(&cursor);
1503        let tx = tx.clone();
1504        let budget = Arc::clone(&budget);
1505        let failed = Arc::clone(&failed);
1506        let raced = Arc::clone(&raced);
1507        workers.push(std::thread::spawn(move || {
1508            loop {
1509                let i = cursor.fetch_add(1, Ordering::Relaxed);
1510                let Some(path) = files.get(i) else { break };
1511                match probe_file_caught(path, window) {
1512                    Ok(ProbeOutcome::Probed(probed, stamp)) => {
1513                        // No-follow paths are canonical by construction (the root
1514                        // was canonicalized up front); only the opt-in symlink walk
1515                        // can yield a path with a symlink component to resolve (#440).
1516                        let abs_path = if follow_symlinks {
1517                            match std::fs::canonicalize(path) {
1518                                Ok(abs) => abs.to_string_lossy().into_owned(),
1519                                Err(e) => {
1520                                    log::warn!("skipping {}: {e}", path.display());
1521                                    failed.fetch_add(1, Ordering::Relaxed);
1522                                    continue;
1523                                }
1524                            }
1525                        } else {
1526                            path.to_string_lossy().into_owned()
1527                        };
1528                        let weight = payload_weight(&probed);
1529                        budget.acquire(weight); // backpressure on in-flight art bytes
1530                        let fingerprint = match tier {
1531                            ChecksumTier::None => None,
1532                            ChecksumTier::Fingerprint | ChecksumTier::Full => {
1533                                Some(fingerprint_of(&probed))
1534                            }
1535                        };
1536                        let content_hash = match tier {
1537                            ChecksumTier::Full => {
1538                                match full_file_hash(std::path::Path::new(&abs_path)) {
1539                                    Ok(h) => Some(h),
1540                                    Err(e) => {
1541                                        log::warn!("content hash failed for {abs_path}: {e}");
1542                                        None
1543                                    }
1544                                }
1545                            }
1546                            _ => None,
1547                        };
1548                        let unit = Unit {
1549                            abs_path,
1550                            stamp,
1551                            probed,
1552                            weight,
1553                            fingerprint,
1554                            content_hash,
1555                        };
1556                        if tx.send(unit).is_err() {
1557                            budget.release(weight);
1558                            break;
1559                        }
1560                    }
1561                    Ok(ProbeOutcome::Unparseable) => {
1562                        failed.fetch_add(1, Ordering::Relaxed);
1563                    }
1564                    Err(e) => {
1565                        log::warn!("skipping {}: {e}", path.display());
1566                        failed.fetch_add(1, Ordering::Relaxed);
1567                    }
1568                    Ok(ProbeOutcome::Raced) => {
1569                        raced.fetch_add(1, Ordering::Relaxed);
1570                    }
1571                }
1572            }
1573        }));
1574    }
1575    drop(tx); // close the channel once all clones (workers) finish
1576
1577    // Writer: this thread. Batch by file count and accumulated art bytes.
1578    let mut scanned = 0u64;
1579    let mut batch: Vec<Unit> = Vec::new();
1580    let mut batch_bytes = 0u64;
1581    let flush = |batch: &mut Vec<Unit>, batch_bytes: &mut u64, scanned: &mut u64| -> Result<()> {
1582        if batch.is_empty() {
1583            return Ok(());
1584        }
1585        let mut bw = db.bulk_writer()?;
1586        // Budget weights are released only after commit, and ingest_unit consumes
1587        // the Probed — capture each unit's weight before the move (#68).
1588        let mut released = 0u64;
1589        // `Ingested` reports committed files, so buffer the paths and emit only
1590        // after `bw.commit()` succeeds — a failed commit aborts the scan without
1591        // having advanced the progress bar past unpersisted files.
1592        let mut committed: Vec<String> = Vec::new();
1593        for unit in batch.drain(..) {
1594            released += unit.weight;
1595            committed.push(unit.abs_path.clone());
1596            ingest_unit(&mut bw, unit, strictness, policy)?;
1597        }
1598        bw.commit()?;
1599        for abs_path in committed {
1600            *scanned += 1;
1601            if let Some(p) = progress {
1602                p.emit(ScanProgress::Ingested {
1603                    done: *scanned,
1604                    total,
1605                    path: &abs_path,
1606                });
1607            }
1608        }
1609        // Coalesce into one wakeup: the commit frees the whole batch, so a single
1610        // release avoids waking every blocked producer once per committed file.
1611        budget.release(released);
1612        *batch_bytes = 0;
1613        Ok(())
1614    };
1615
1616    // Drain the channel, batching by file count and accumulated art bytes. The
1617    // budget cap equals the byte-flush threshold, so a worker calling
1618    // `budget.acquire` (which it does *before* `send`) could block while the
1619    // writer's pending batch sits just below the threshold — if the writer then
1620    // parked on a blocking `recv`, neither side could make progress (the held
1621    // budget is never released, the batch never reaches the threshold). To avoid
1622    // that, whenever the channel momentarily drains we flush the pending batch —
1623    // releasing the budget so blocked producers proceed — *before* blocking on the
1624    // next item.
1625    loop {
1626        match rx.try_recv() {
1627            Ok(unit) => {
1628                batch_bytes += unit.weight;
1629                batch.push(unit);
1630                if batch.len() >= BATCH_FILES || batch_bytes >= cap {
1631                    flush(&mut batch, &mut batch_bytes, &mut scanned)?;
1632                }
1633            }
1634            Err(std::sync::mpsc::TryRecvError::Empty) => {
1635                flush(&mut batch, &mut batch_bytes, &mut scanned)?;
1636                match rx.recv() {
1637                    Ok(unit) => {
1638                        batch_bytes += unit.weight;
1639                        batch.push(unit);
1640                        if batch.len() >= BATCH_FILES || batch_bytes >= cap {
1641                            flush(&mut batch, &mut batch_bytes, &mut scanned)?;
1642                        }
1643                    }
1644                    Err(_) => break, // all workers finished; channel closed
1645                }
1646            }
1647            Err(std::sync::mpsc::TryRecvError::Disconnected) => break,
1648        }
1649    }
1650    flush(&mut batch, &mut batch_bytes, &mut scanned)?;
1651    // A fatal flush error above returns via `?` *before* this join, abandoning the
1652    // worker threads — acceptable because a DB-write failure aborts the whole scan.
1653    // On the success path every worker has already exited (the work queue drained
1654    // and `drop(tx)` closed the channel), so these joins return promptly.
1655    for w in workers {
1656        let _ = w.join();
1657    }
1658
1659    Ok(ScanStats {
1660        scanned,
1661        skipped: 0, // counted at walk time; filled in by scan_directory_with
1662        already_present: 0,
1663        failed: failed.load(Ordering::Relaxed),
1664        raced: raced.load(Ordering::Relaxed),
1665    })
1666}
1667
1668/// Test/oracle only: scan using the legacy whole-file probe (`probe_full`). The
1669/// equivalence property compares this against the bounded `scan_directory`.
1670#[doc(hidden)]
1671pub fn scan_directory_full_oracle(db: &Db, root: &Path) -> Result<ScanStats> {
1672    let mut files = Vec::new();
1673    let mut skipped = 0u64;
1674    if root.is_file() {
1675        if is_supported_audio(root) {
1676            files.push(root.to_path_buf());
1677        } else {
1678            skipped += 1;
1679        }
1680    } else {
1681        skipped += collect_audio(root, &mut files, false)?.total;
1682    }
1683    let mut stats = ScanStats {
1684        scanned: 0,
1685        skipped,
1686        already_present: 0,
1687        failed: 0,
1688        raced: 0,
1689    };
1690    for path in files {
1691        let bytes = std::fs::read(&path)?;
1692        let Some(probed) = probe_full(&path, &bytes) else {
1693            stats.failed += 1;
1694            continue;
1695        };
1696        let meta = std::fs::metadata(&path)?;
1697        let abs = std::fs::canonicalize(&path)?;
1698        ingest(db, &abs.to_string_lossy(), &meta, probed)?;
1699        stats.scanned += 1;
1700    }
1701    Ok(stats)
1702}
1703
1704/// Re-validate an already-scanned library root: re-probe only files whose
1705/// size/mtime/ctime changed since the last scan (skipping unchanged ones so external
1706/// tag edits in the DB are preserved), then delete tracks **under `root`** whose
1707/// backing file is gone (cascading tags/art links) and garbage-collect
1708/// now-unreferenced art. `root` may be a single audio file (only that file is
1709/// revalidated) or a directory (walked recursively). Pruning is scoped to
1710/// `root`, so revalidating one library root never removes tracks belonging to
1711/// another.
1712///
1713/// Uses `opts` to configure the probe pipeline (e.g. `jobs` for parallelism).
1714/// The skip-unchanged decision runs on the calling thread before workers are
1715/// dispatched, so workers remain DB-free. A `stat`/`canonicalize` failure on a
1716/// candidate during the skip pass is counted in `failed` (and the file is left
1717/// for the next revalidation) rather than re-probed or pruned.
1718pub fn revalidate_with(db: &Db, root: &Path, opts: &ScanOptions) -> Result<RevalidateStats> {
1719    // Canonicalize once; see scan_directory_with (#440). The prune pass below reuses
1720    // this canonical root for its `starts_with` scope check.
1721    let canon = std::fs::canonicalize(root)?;
1722    let root = canon.as_path();
1723    let mut files = Vec::new();
1724    if root.is_file() {
1725        if is_supported_audio(root) {
1726            files.push(root.to_path_buf());
1727        }
1728    } else {
1729        collect_audio_with(
1730            root,
1731            &mut files,
1732            opts.follow_symlinks,
1733            opts.progress.as_ref(),
1734        )?;
1735    }
1736    db.apply_bulk_pragmas_self()?;
1737
1738    // Main-thread pre-dispatch skip pass: load existing
1739    // (path -> stamp, id, format, has_fingerprint, has_content_hash) once,
1740    // stat each candidate, keep only changed files. Workers stay DB-free.
1741    let existing: HashMap<String, (crate::freshness::BackingStamp, i64, Format, bool, bool)> = db
1742        .list_tracks()?
1743        .into_iter()
1744        .map(|t| {
1745            (
1746                t.backing_path.clone(),
1747                (
1748                    crate::freshness::BackingStamp::from_track(&t),
1749                    t.id,
1750                    t.format,
1751                    t.fingerprint.is_some(),
1752                    t.content_hash.is_some(),
1753                ),
1754            )
1755        })
1756        .collect();
1757    // Legacy backfill (spec §1): FLAC tracks scanned under V1 have no structural
1758    // blocks. Re-scan them even when the backing file is unchanged so the V2
1759    // structural store + binary tags get populated by the ingest path.
1760    let have_structural = db.track_ids_with_structural_blocks()?;
1761
1762    let mut unchanged = 0u64;
1763    let mut skip_failed = 0u64;
1764    let mut changed: Vec<PathBuf> = Vec::new();
1765    for path in files {
1766        let meta = match std::fs::metadata(&path) {
1767            Ok(meta) => meta,
1768            Err(e) => {
1769                log::warn!("skipping {}: {e}", path.display());
1770                skip_failed += 1;
1771                continue;
1772            }
1773        };
1774        let key = if opts.follow_symlinks {
1775            match std::fs::canonicalize(&path) {
1776                Ok(abs) => abs.to_string_lossy().into_owned(),
1777                Err(e) => {
1778                    log::warn!("skipping {}: {e}", path.display());
1779                    skip_failed += 1;
1780                    continue;
1781                }
1782            }
1783        } else {
1784            path.to_string_lossy().into_owned()
1785        };
1786        if let Some((stamp, id, format, has_fingerprint, has_content_hash)) =
1787            existing.get(&key).copied()
1788        {
1789            let needs_backfill = format == Format::Flac && !have_structural.contains(&id);
1790            let needs_checksum = match opts.checksum {
1791                ChecksumTier::None => false,
1792                ChecksumTier::Fingerprint => !has_fingerprint,
1793                ChecksumTier::Full => !has_fingerprint || !has_content_hash,
1794            };
1795            if crate::freshness::BackingStamp::from_metadata(&meta) == stamp
1796                && !needs_backfill
1797                && !needs_checksum
1798            {
1799                unchanged += 1;
1800                continue;
1801            }
1802            changed.push(path);
1803        }
1804    }
1805
1806    if let Some(p) = &opts.progress {
1807        p.emit(ScanProgress::Walked {
1808            total: changed.len() as u64,
1809        });
1810    }
1811
1812    let mut pruned = 0u64;
1813    let scan = run_pipeline(db, changed, opts, WritePolicy::StructuralOnly)?;
1814
1815    if opts.prune {
1816        let canon_root = root;
1817        for track in db.list_tracks()? {
1818            if !Path::new(&track.backing_path).starts_with(canon_root) {
1819                continue;
1820            }
1821            if let Err(e) = std::fs::metadata(&track.backing_path)
1822                && e.kind() == std::io::ErrorKind::NotFound
1823            {
1824                db.delete_track(track.id)?;
1825                pruned += 1;
1826            }
1827        }
1828        db.gc_orphan_art()?;
1829    }
1830
1831    Ok(RevalidateStats {
1832        updated: scan.scanned,
1833        unchanged,
1834        pruned,
1835        failed: scan.failed + skip_failed,
1836        raced: scan.raced,
1837    })
1838}
1839
1840/// Back-compat shim used by the CLI and existing tests.
1841pub fn revalidate(db: &Db, root: &Path) -> Result<RevalidateStats> {
1842    revalidate_with(db, root, &ScanOptions::default())
1843}
1844
1845/// SHA-256 of the probe's parsed output, hex-encoded. This is the cheap content
1846/// fingerprint: deterministic per file (the parsed `Probed` is window- and
1847/// format-independent), and excludes every filesystem-stamp field. Length-prefix
1848/// every variable-length field so concatenation can't alias.
1849pub(crate) fn fingerprint_of(p: &Probed) -> String {
1850    use sha2::{Digest, Sha256};
1851    // Inner fn (not a closure) so it doesn't hold a borrow of `h` across the
1852    // direct `h.update(...)` calls below.
1853    fn feed(h: &mut Sha256, bytes: &[u8]) {
1854        h.update((bytes.len() as u64).to_le_bytes());
1855        h.update(bytes);
1856    }
1857    let mut h = Sha256::new();
1858    feed(&mut h, p.format.as_str().as_bytes());
1859    h.update(p.audio_offset.to_le_bytes());
1860    h.update(p.audio_length.to_le_bytes());
1861    h.update((p.tags.len() as u64).to_le_bytes());
1862    for (k, v) in &p.tags {
1863        feed(&mut h, k.as_bytes());
1864        feed(&mut h, v.as_bytes());
1865    }
1866    h.update((p.pictures.len() as u64).to_le_bytes());
1867    for pic in &p.pictures {
1868        feed(&mut h, pic.mime.as_bytes());
1869        h.update(u64::from(pic.picture_type.get()).to_le_bytes());
1870        feed(&mut h, pic.description.as_bytes());
1871        h.update(u64::from(pic.width).to_le_bytes());
1872        h.update(u64::from(pic.height).to_le_bytes());
1873        feed(&mut h, &pic.data);
1874    }
1875    h.update((p.binary_tags.len() as u64).to_le_bytes());
1876    for bt in &p.binary_tags {
1877        feed(&mut h, bt.key.as_bytes());
1878        feed(&mut h, &bt.payload);
1879    }
1880    h.update((p.structural_blocks.len() as u64).to_le_bytes());
1881    for (kind, body) in &p.structural_blocks {
1882        feed(&mut h, kind.as_bytes());
1883        feed(&mut h, body);
1884    }
1885    format!("{:x}", base16ct::HexDisplay(&h.finalize()))
1886}
1887
1888/// Streaming SHA-256 of an entire backing file, hex-encoded. The authoritative
1889/// content identity; reads the whole file, so callers gate it on the `Full` tier
1890/// or a strict-confirmation need.
1891pub(crate) fn full_file_hash(path: &std::path::Path) -> std::io::Result<String> {
1892    use sha2::{Digest, Sha256};
1893    let mut f = std::fs::File::open(path)?;
1894    let mut h = Sha256::new();
1895    let mut buf = vec![0u8; 1 << 16];
1896    loop {
1897        let n = std::io::Read::read(&mut f, &mut buf)?;
1898        if n == 0 {
1899            break;
1900        }
1901        h.update(&buf[..n]);
1902    }
1903    Ok(format!("{:x}", base16ct::HexDisplay(&h.finalize())))
1904}
1905
1906#[cfg(test)]
1907mod bounded_probe_tests;
1908#[cfg(test)]
1909mod hardening_tests;
1910#[cfg(test)]
1911mod ogg_probe_tests;
1912#[cfg(test)]
1913mod scan_unit_tests;
1914#[cfg(test)]
1915mod wav_probe_tests;