Skip to main content

oximedia_container/
probe.rs

1//! Format probing and detection.
2
3use crate::ContainerFormat;
4use oximedia_core::OxiError;
5
6/// Result of format probing.
7///
8/// Contains the detected format and a confidence score indicating
9/// how certain the detection is.
10#[derive(Clone, Debug)]
11pub struct ProbeResult {
12    /// Detected container format.
13    pub format: ContainerFormat,
14    /// Confidence score from 0.0 to 1.0.
15    ///
16    /// Higher values indicate greater confidence in the detection.
17    /// A score of 1.0 means the format is certain (e.g., unique magic bytes).
18    pub confidence: f32,
19}
20
21impl ProbeResult {
22    /// Creates a new probe result.
23    #[must_use]
24    pub const fn new(format: ContainerFormat, confidence: f32) -> Self {
25        Self { format, confidence }
26    }
27}
28
// Byte signatures that `probe_format` compares against the start of the input.

/// EBML header bytes that open a Matroska/WebM file.
const MATROSKA_MAGIC: &[u8] = b"\x1A\x45\xDF\xA3";
/// Leading bytes of an Ogg stream.
const OGG_MAGIC: &[u8] = b"OggS";
/// Leading bytes of a FLAC stream.
const FLAC_MAGIC: &[u8] = b"fLaC";
/// Leading bytes of a Y4M (YUV4MPEG2) stream.
const Y4M_MAGIC: &[u8] = b"YUV4MPEG2";
/// Leading bytes of a WebVTT file.
const WEBVTT_MAGIC: &[u8] = b"WEBVTT";
/// Tag at offset 0 of a RIFF file; WAV detection also requires `WAVE_MAGIC`.
const RIFF_MAGIC: &[u8] = b"RIFF";
/// Tag expected at bytes 8..12 of a RIFF file that holds WAV audio.
const WAVE_MAGIC: &[u8] = b"WAVE";
/// Box type expected at bytes 4..8 of an ISOBMFF/MP4 file.
const ISOBMFF_FTYP: &[u8] = b"ftyp";
/// Distance in bytes between consecutive MPEG-TS sync bytes.
const TS_PACKET_SIZE: usize = 188;
/// MPEG-TS sync byte, expected at the start of every packet.
const MPEG_TS_SYNC: u8 = 0x47;
40
41/// Probe the container format from raw bytes.
42///
43/// Analyzes the first few bytes of media data to detect the container format.
44/// Returns the detected format and a confidence score.
45///
46/// # Arguments
47///
48/// * `data` - At least the first 12 bytes of the file (more bytes improve detection)
49///
50/// # Errors
51///
52/// Returns `OxiError::UnknownFormat` if the format cannot be detected.
53///
54/// # Example
55///
56/// ```
57/// use oximedia_container::{probe_format, ContainerFormat};
58///
59/// // WebM/Matroska header
60/// let data = [0x1A, 0x45, 0xDF, 0xA3, 0x01, 0x00, 0x00, 0x00];
61/// let result = probe_format(&data).expect("valid header");
62/// assert_eq!(result.format, ContainerFormat::Matroska);
63/// ```
64pub fn probe_format(data: &[u8]) -> Result<ProbeResult, OxiError> {
65    if data.len() < 4 {
66        return Err(OxiError::UnknownFormat);
67    }
68
69    // Check Matroska/WebM (EBML header)
70    if data.starts_with(MATROSKA_MAGIC) {
71        // WebM is detected by DocType, but for initial probe we return Matroska
72        return Ok(ProbeResult {
73            format: ContainerFormat::Matroska,
74            confidence: 0.95,
75        });
76    }
77
78    // Check Ogg
79    if data.starts_with(OGG_MAGIC) {
80        return Ok(ProbeResult {
81            format: ContainerFormat::Ogg,
82            confidence: 0.99,
83        });
84    }
85
86    // Check Y4M (YUV4MPEG2)
87    if data.len() >= Y4M_MAGIC.len() && data.starts_with(Y4M_MAGIC) {
88        return Ok(ProbeResult {
89            format: ContainerFormat::Y4m,
90            confidence: 0.99,
91        });
92    }
93
94    // Check FLAC
95    if data.starts_with(FLAC_MAGIC) {
96        return Ok(ProbeResult {
97            format: ContainerFormat::Flac,
98            confidence: 0.99,
99        });
100    }
101
102    // Check WAV (RIFF + WAVE)
103    if data.len() >= 12 && data.starts_with(RIFF_MAGIC) && &data[8..12] == WAVE_MAGIC {
104        return Ok(ProbeResult {
105            format: ContainerFormat::Wav,
106            confidence: 0.99,
107        });
108    }
109
110    // Check ISOBMFF/MP4 (ftyp box)
111    if data.len() >= 8 && &data[4..8] == ISOBMFF_FTYP {
112        return Ok(ProbeResult {
113            format: ContainerFormat::Mp4,
114            confidence: 0.90,
115        });
116    }
117
118    // Check MPEG-TS (sync byte pattern every 188 bytes)
119    // We need at least 2 packets (376 bytes) for reliable detection
120    if data.len() >= TS_PACKET_SIZE * 2 {
121        let mut sync_count = 0;
122        let max_checks = (data.len() / TS_PACKET_SIZE).min(3);
123
124        for i in 0..max_checks {
125            if data[i * TS_PACKET_SIZE] == MPEG_TS_SYNC {
126                sync_count += 1;
127            } else {
128                break;
129            }
130        }
131
132        if sync_count >= 2 {
133            return Ok(ProbeResult {
134                format: ContainerFormat::MpegTs,
135                confidence: 0.95,
136            });
137        }
138    } else if data.len() >= TS_PACKET_SIZE && data[0] == MPEG_TS_SYNC {
139        // Single packet check (lower confidence)
140        return Ok(ProbeResult {
141            format: ContainerFormat::MpegTs,
142            confidence: 0.60,
143        });
144    }
145
146    // Check WebVTT
147    if data.starts_with(WEBVTT_MAGIC) {
148        return Ok(ProbeResult {
149            format: ContainerFormat::WebVtt,
150            confidence: 0.99,
151        });
152    }
153
154    // Check SRT (heuristic: starts with a number followed by newline and timestamp)
155    // SRT format: "1\n00:00:00,000 --> 00:00:02,000\n"
156    if data.len() >= 20 {
157        // Convert to string to check pattern
158        if let Ok(text) = std::str::from_utf8(&data[..data.len().min(100)]) {
159            let lines: Vec<&str> = text.lines().take(3).collect();
160            if lines.len() >= 2
161                && lines[0].trim().chars().all(|c| c.is_ascii_digit())
162                && lines[1].contains("-->")
163                && lines[1].contains(',')
164            {
165                return Ok(ProbeResult {
166                    format: ContainerFormat::Srt,
167                    confidence: 0.85,
168                });
169            }
170        }
171    }
172
173    Err(OxiError::UnknownFormat)
174}
175
#[cfg(test)]
mod tests {
    use super::*;

    /// Probes `data` and returns the detected format, failing the test if
    /// probing returns an error.
    fn detect(data: &[u8]) -> ContainerFormat {
        probe_format(data)
            .expect("operation should succeed")
            .format
    }

    #[test]
    fn test_probe_matroska() {
        let data = [0x1A, 0x45, 0xDF, 0xA3, 0x01, 0x00, 0x00, 0x00];
        let result = probe_format(&data).expect("operation should succeed");
        assert_eq!(result.format, ContainerFormat::Matroska);
        assert!(result.confidence > 0.9);
    }

    #[test]
    fn test_probe_ogg() {
        assert_eq!(
            detect(b"OggS\x00\x02\x00\x00\x00\x00\x00\x00"),
            ContainerFormat::Ogg
        );
    }

    #[test]
    fn test_probe_flac() {
        assert_eq!(detect(b"fLaC\x00\x00\x00\x22"), ContainerFormat::Flac);
    }

    #[test]
    fn test_probe_wav() {
        assert_eq!(
            detect(b"RIFF\x00\x00\x00\x00WAVEfmt "),
            ContainerFormat::Wav
        );
    }

    #[test]
    fn test_probe_riff_without_wave() {
        // A RIFF file whose form type is not "WAVE" must not be reported as WAV.
        assert!(probe_format(b"RIFF\x00\x00\x00\x00AVI LIST").is_err());
    }

    #[test]
    fn test_probe_y4m() {
        assert_eq!(
            detect(b"YUV4MPEG2 W640 H480 F30:1\n"),
            ContainerFormat::Y4m
        );
    }

    #[test]
    fn test_probe_mp4() {
        assert_eq!(detect(b"\x00\x00\x00\x18ftypisom"), ContainerFormat::Mp4);
    }

    #[test]
    fn test_probe_mpeg_ts_two_packets() {
        let mut data = vec![0_u8; 376];
        data[0] = 0x47;
        data[188] = 0x47;
        let result = probe_format(&data).expect("operation should succeed");
        assert_eq!(result.format, ContainerFormat::MpegTs);
        assert!(result.confidence > 0.9);
    }

    #[test]
    fn test_probe_mpeg_ts_single_packet() {
        let mut data = vec![0_u8; 188];
        data[0] = 0x47;
        let result = probe_format(&data).expect("operation should succeed");
        assert_eq!(result.format, ContainerFormat::MpegTs);
        // One packet only yields the low-confidence result.
        assert!(result.confidence < 0.9);
    }

    #[test]
    fn test_probe_mpeg_ts_requires_second_sync() {
        // Two packets are available, but only the first opens with a sync byte.
        let mut data = vec![0_u8; 376];
        data[0] = 0x47;
        assert!(probe_format(&data).is_err());
    }

    #[test]
    fn test_probe_webvtt() {
        assert_eq!(
            detect(b"WEBVTT\n\n00:00.000 --> 00:02.000\nHi\n"),
            ContainerFormat::WebVtt
        );
    }

    #[test]
    fn test_probe_srt() {
        assert_eq!(
            detect(b"1\n00:00:00,000 --> 00:00:02,000\nHello\n"),
            ContainerFormat::Srt
        );
    }

    #[test]
    fn test_probe_unknown() {
        let data = [0x00, 0x00, 0x00, 0x00];
        assert!(probe_format(&data).is_err());
    }

    #[test]
    fn test_probe_too_short() {
        let data = [0x1A, 0x45];
        assert!(probe_format(&data).is_err());
    }
}
220}