oximedia-io 0.1.2

I/O layer for OxiMedia
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
//! Format auto-detection by buffer content.
//!
//! [`ContentDetector`] provides three orthogonal detection methods:
//!
//! 1. **Text encoding** — inspects BOM markers and byte statistics to classify
//!    a buffer as UTF-8, UTF-16 (LE/BE), Latin-1, or plain ASCII.
//!
//! 2. **Binary vs. text** — heuristic based on null-byte density and
//!    non-printable byte ratio.
//!
//! 3. **Media type** — delegates to [`FormatDetector`] for magic-byte
//!    identification and maps the result to a broad [`MediaType`] category.

use crate::format_detector::{FormatDetector, MediaFormat};

// ─────────────────────────────────────────────────────────────────────────────
// TextEncoding
// ─────────────────────────────────────────────────────────────────────────────

/// Character encoding detected from a data buffer.
///
/// This is the result type of [`ContentDetector::detect_encoding`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextEncoding {
    /// Unicode UTF-8 (with or without a leading BOM `EF BB BF`).
    Utf8,
    /// UTF-16 Little-Endian, identified by the BOM `FF FE`.
    Utf16Le,
    /// UTF-16 Big-Endian, identified by the BOM `FE FF`.
    Utf16Be,
    /// ISO 8859-1 / Windows-1252: the buffer is not valid UTF-8 and contains
    /// bytes in the `0x80–0xFF` range.
    Latin1,
    /// Pure 7-bit ASCII (only printable characters and `\t`, `\n`, `\r`).
    Ascii,
}

impl std::fmt::Display for TextEncoding {
    /// Writes the conventional name of the encoding (for example `UTF-16LE`).
    ///
    /// Width, fill, alignment and precision given in the format string are
    /// honoured, so `format!("{:<10}", encoding)` produces a padded column.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            TextEncoding::Utf8 => "UTF-8",
            TextEncoding::Utf16Le => "UTF-16LE",
            TextEncoding::Utf16Be => "UTF-16BE",
            TextEncoding::Latin1 => "Latin-1",
            TextEncoding::Ascii => "ASCII",
        };
        // `pad` applies the caller's formatting flags; `write!` with a plain
        // literal would silently ignore them.
        f.pad(label)
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// MediaType
// ─────────────────────────────────────────────────────────────────────────────

/// Broad media type category.
///
/// This is the result type of [`ContentDetector::detect_media_type`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MediaType {
    /// Video container (MP4, MKV, WebM, MXF, etc.).
    Video,
    /// Audio file (FLAC, WAV, MP3, AAC, etc.).
    Audio,
    /// Image (JPEG, PNG, WebP, EXR, DPX, etc.).
    Image,
    /// Compressed archive or data container (ZIP, GZ, XZ, etc.).
    Archive,
    /// Plain-text content (SRT, VTT, SVG, etc.).
    Text,
    /// Binary data that did not match any known media format.
    Binary,
    /// Content type could not be determined (returned for an empty buffer).
    Unknown,
}

impl std::fmt::Display for MediaType {
    /// Writes the category name exactly as the variant is spelled.
    ///
    /// Width, fill, alignment and precision given in the format string are
    /// honoured, so `format!("{:<8}", media_type)` produces a padded column.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            MediaType::Video => "Video",
            MediaType::Audio => "Audio",
            MediaType::Image => "Image",
            MediaType::Archive => "Archive",
            MediaType::Text => "Text",
            MediaType::Binary => "Binary",
            MediaType::Unknown => "Unknown",
        };
        // `pad` applies the caller's formatting flags; `write!` with a plain
        // literal would silently ignore them.
        f.pad(label)
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// ContentDetector
// ─────────────────────────────────────────────────────────────────────────────

/// Upper bound on how many leading bytes the binary heuristic inspects.
const BINARY_PROBE_BYTES: usize = 8192;

/// Share of `0x00` bytes that must be exceeded for a buffer to count as binary.
const NULL_BYTE_THRESHOLD: f64 = 0.01; // 1 %

/// Share of non-printable control bytes that must be exceeded for a buffer to
/// count as binary.
const NON_PRINTABLE_THRESHOLD: f64 = 0.30; // 30 %

/// Content-based format detector.
///
/// The type carries no state; every detection routine is an associated
/// function that works purely on the byte slice it is given.
pub struct ContentDetector;

impl ContentDetector {
    // ── Text encoding ─────────────────────────────────────────────────────────

    /// Classify the text encoding of `data`.
    ///
    /// Checks are applied in order and the first match wins:
    ///
    /// 1. An empty slice is reported as [`TextEncoding::Ascii`].
    /// 2. A byte-order mark decides immediately: `EF BB BF` → UTF-8,
    ///    `FF FE` → UTF-16LE, `FE FF` → UTF-16BE.
    /// 3. If every byte is printable ASCII (`0x20–0x7E`) or one of `\t`,
    ///    `\n`, `\r`, the result is [`TextEncoding::Ascii`].
    /// 4. If the whole slice passes `std::str::from_utf8`, the result is
    ///    [`TextEncoding::Utf8`].
    /// 5. Otherwise a byte `>= 0x80` yields [`TextEncoding::Latin1`].
    /// 6. As a last resort [`TextEncoding::Ascii`] is returned.
    #[must_use]
    pub fn detect_encoding(data: &[u8]) -> TextEncoding {
        // Empty input and BOM prefixes are settled by the leading bytes alone.
        match data {
            [] => return TextEncoding::Ascii,
            [0xEF, 0xBB, 0xBF, ..] => return TextEncoding::Utf8,
            [0xFF, 0xFE, ..] => return TextEncoding::Utf16Le,
            [0xFE, 0xFF, ..] => return TextEncoding::Utf16Be,
            _ => {}
        }

        // Plain ASCII text: printable characters plus tab, line feed and
        // carriage return. Any other control byte disqualifies the buffer.
        let only_plain_ascii = data
            .iter()
            .all(|&byte| matches!(byte, b'\t' | b'\n' | b'\r' | 0x20..=0x7E));
        if only_plain_ascii {
            return TextEncoding::Ascii;
        }

        if std::str::from_utf8(data).is_ok() {
            TextEncoding::Utf8
        } else if data.is_ascii() {
            // Defensive default: a slice made only of bytes below 0x80 is
            // always valid UTF-8, so this arm is not expected to be taken.
            TextEncoding::Ascii
        } else {
            // Invalid UTF-8 with high bytes: most likely Latin-1 / Windows-1252.
            TextEncoding::Latin1
        }
    }

    // ── Binary vs. text ───────────────────────────────────────────────────────

    /// Report whether `data` looks like binary content.
    ///
    /// Only the first `BINARY_PROBE_BYTES` bytes are inspected. Within that
    /// window the buffer is considered binary when either
    ///
    /// - more than 1 % of the bytes are `0x00`, **or**
    /// - more than 30 % of the bytes are control characters in `0x00–0x07`
    ///   or `0x0E–0x1F` (so tab, LF and CR are never counted).
    ///
    /// An empty slice is never binary. Note that UTF-16 encoded text usually
    /// contains many `0x00` bytes and is therefore reported as binary.
    #[must_use]
    pub fn is_binary(data: &[u8]) -> bool {
        let sample_len = data.len().min(BINARY_PROBE_BYTES);
        if sample_len == 0 {
            return false;
        }
        let sample = &data[..sample_len];

        let nul_bytes = sample.iter().filter(|&&byte| byte == 0x00).count();
        // NUL is part of this range too, so it contributes to both counters.
        let control_bytes = sample
            .iter()
            .filter(|&&byte| matches!(byte, 0x00..=0x07 | 0x0E..=0x1F))
            .count();

        let share_of_sample = |count: usize| count as f64 / sample_len as f64;

        share_of_sample(nul_bytes) > NULL_BYTE_THRESHOLD
            || share_of_sample(control_bytes) > NON_PRINTABLE_THRESHOLD
    }

    // ── Media type ────────────────────────────────────────────────────────────

    /// Determine the broad media category of `data`.
    ///
    /// An empty slice yields [`MediaType::Unknown`]. Otherwise the format
    /// reported by [`FormatDetector::detect`] is translated into a
    /// [`MediaType`]; for [`MediaFormat::Unknown`] the outcome of
    /// [`Self::is_binary`] chooses between [`MediaType::Binary`] and
    /// [`MediaType::Text`].
    #[must_use]
    pub fn detect_media_type(data: &[u8]) -> MediaType {
        if data.is_empty() {
            MediaType::Unknown
        } else {
            let detected = FormatDetector::detect(data);
            Self::media_format_to_type(detected.format, data)
        }
    }

    /// Translate a [`MediaFormat`] into its [`MediaType`] category.
    ///
    /// `data` is consulted only when `format` is [`MediaFormat::Unknown`],
    /// where the binary heuristic decides the result.
    fn media_format_to_type(format: MediaFormat, data: &[u8]) -> MediaType {
        // The predicate helpers on MediaFormat are asked first, in the order
        // video, audio, image.
        if format.is_video() {
            MediaType::Video
        } else if format.is_audio() {
            MediaType::Audio
        } else if format.is_image() {
            MediaType::Image
        } else {
            match format {
                // Archive and compression containers.
                MediaFormat::Zip
                | MediaFormat::Tar
                | MediaFormat::Gz
                | MediaFormat::Bz2
                | MediaFormat::Xz
                | MediaFormat::Zstd => MediaType::Archive,

                // Text-based formats.
                MediaFormat::Srt | MediaFormat::Vtt | MediaFormat::Ass | MediaFormat::Svg => {
                    MediaType::Text
                }

                // No recognised signature: let the byte statistics decide.
                MediaFormat::Unknown if Self::is_binary(data) => MediaType::Binary,
                MediaFormat::Unknown => MediaType::Text,

                // Any remaining format not claimed by the helpers above is
                // treated as opaque binary data.
                _ => MediaType::Binary,
            }
        }
    }
}

// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    // ── detect_encoding: UTF-8 ────────────────────────────────────────────────

    #[test]
    fn test_encoding_utf8_bom() {
        // A leading EF BB BF marker is enough to report UTF-8.
        let detected = ContentDetector::detect_encoding(b"\xEF\xBB\xBFHello, world!");
        assert_eq!(detected, TextEncoding::Utf8);
    }

    #[test]
    fn test_encoding_utf8_no_bom() {
        // Multi-byte characters without a BOM still validate as UTF-8.
        let sample = "Hello, world! ✓ café";
        let detected = ContentDetector::detect_encoding(sample.as_bytes());
        assert_eq!(detected, TextEncoding::Utf8);
    }

    // ── detect_encoding: UTF-16 byte-order marks ──────────────────────────────

    #[test]
    fn test_encoding_utf16_le_bom() {
        // FF FE followed by "He" encoded as UTF-16LE.
        let detected = ContentDetector::detect_encoding(b"\xFF\xFE\x48\x00\x65\x00");
        assert_eq!(detected, TextEncoding::Utf16Le);
    }

    #[test]
    fn test_encoding_utf16_be_bom() {
        // FE FF followed by "He" encoded as UTF-16BE.
        let detected = ContentDetector::detect_encoding(b"\xFE\xFF\x00\x48\x00\x65");
        assert_eq!(detected, TextEncoding::Utf16Be);
    }

    // ── detect_encoding: ASCII ────────────────────────────────────────────────

    #[test]
    fn test_encoding_ascii_printable() {
        let detected = ContentDetector::detect_encoding(b"Hello World 123");
        assert_eq!(detected, TextEncoding::Ascii);
    }

    #[test]
    fn test_encoding_ascii_with_crlf() {
        // CR and LF are accepted as part of plain ASCII text.
        let detected = ContentDetector::detect_encoding(b"line1\r\nline2\r\n");
        assert_eq!(detected, TextEncoding::Ascii);
    }

    #[test]
    fn test_encoding_ascii_with_tab() {
        // Tabs are accepted as part of plain ASCII text.
        let detected = ContentDetector::detect_encoding(b"col1\tcol2\tcol3");
        assert_eq!(detected, TextEncoding::Ascii);
    }

    #[test]
    fn test_encoding_empty_returns_ascii() {
        let detected = ContentDetector::detect_encoding(&[]);
        assert_eq!(detected, TextEncoding::Ascii);
    }

    // ── detect_encoding: Latin-1 ──────────────────────────────────────────────

    #[test]
    fn test_encoding_latin1_extended_bytes() {
        // 0xE9 is 'é' in Latin-1; followed by a space it is not valid UTF-8.
        let detected = ContentDetector::detect_encoding(b"Caf\xe9 au lait");
        assert_eq!(detected, TextEncoding::Latin1);
    }

    #[test]
    fn test_encoding_latin1_high_bytes() {
        // Only high bytes, none of which form a valid UTF-8 sequence.
        let high_bytes: [u8; 4] = [0x80, 0x9F, 0xA0, 0xFF];
        let detected = ContentDetector::detect_encoding(&high_bytes);
        assert_eq!(detected, TextEncoding::Latin1);
    }

    // ── is_binary ─────────────────────────────────────────────────────────────

    #[test]
    fn test_is_binary_empty() {
        let verdict = ContentDetector::is_binary(&[]);
        assert!(!verdict);
    }

    #[test]
    fn test_is_binary_plain_text() {
        let verdict =
            ContentDetector::is_binary(b"This is plain ASCII text.\nNo binary bytes here.\n");
        assert!(!verdict);
    }

    #[test]
    fn test_is_binary_null_bytes() {
        // Five NULs in a 100-byte buffer is 5 %, above the 1 % threshold.
        let mut buffer = vec![b'A'; 100];
        for position in [10usize, 20, 30, 40, 50].iter() {
            buffer[*position] = 0x00;
        }
        assert!(ContentDetector::is_binary(&buffer));
    }

    #[test]
    fn test_is_binary_jpeg_magic() {
        // Start of a JPEG stream (FF D8 …); it contains a NUL byte.
        let jpeg_head = [0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46];
        assert!(ContentDetector::is_binary(&jpeg_head));
    }

    #[test]
    fn test_is_binary_utf8_text() {
        let sentence = "The quick brown fox jumps over the lazy dog. 1234567890!";
        assert!(!ContentDetector::is_binary(sentence.as_bytes()));
    }

    // ── detect_media_type ─────────────────────────────────────────────────────

    #[test]
    fn test_media_type_empty_returns_unknown() {
        let category = ContentDetector::detect_media_type(&[]);
        assert_eq!(category, MediaType::Unknown);
    }

    #[test]
    fn test_media_type_jpeg_is_image() {
        // JPEG signature FF D8 followed by a JFIF APP0 header.
        let jpeg = [0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46];
        let category = ContentDetector::detect_media_type(&jpeg);
        assert_eq!(category, MediaType::Image);
    }

    #[test]
    fn test_media_type_png_is_image() {
        // PNG signature: 89 50 4E 47 0D 0A 1A 0A.
        let png = [0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A];
        let category = ContentDetector::detect_media_type(&png);
        assert_eq!(category, MediaType::Image);
    }

    #[test]
    fn test_media_type_flac_is_audio() {
        // FLAC signature: "fLaC" (66 4C 61 43).
        let category = ContentDetector::detect_media_type(b"fLaC\x00\x00\x00\x22");
        assert_eq!(category, MediaType::Audio);
    }

    #[test]
    fn test_media_type_wav_is_audio() {
        // WAV layout: "RIFF", four size bytes, then "WAVE".
        let category = ContentDetector::detect_media_type(b"RIFF\x00\x00\x00\x00WAVE");
        assert_eq!(category, MediaType::Audio);
    }

    #[test]
    fn test_media_type_zip_is_archive() {
        // ZIP local-file-header signature: 50 4B 03 04.
        let zip = [0x50, 0x4B, 0x03, 0x04, 0x14, 0x00, 0x00, 0x00];
        let category = ContentDetector::detect_media_type(&zip);
        assert_eq!(category, MediaType::Archive);
    }

    #[test]
    fn test_media_type_gz_is_archive() {
        // GZ signature: 1F 8B.
        let gz = [0x1F, 0x8B, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00];
        let category = ContentDetector::detect_media_type(&gz);
        assert_eq!(category, MediaType::Archive);
    }

    #[test]
    fn test_media_type_unknown_binary_is_binary() {
        // NUL-heavy bytes that are not meant to resemble any known signature.
        let blob = [
            0x00u8, 0x01, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0xDE, 0xAD, 0xBE, 0xEF, 0x00, 0x00,
            0x00, 0x00,
        ];
        let category = ContentDetector::detect_media_type(&blob);
        assert_eq!(category, MediaType::Binary);
    }

    #[test]
    fn test_media_type_unknown_text_is_text() {
        // SRT-like cue text. Whether `FormatDetector` recognises it or leaves
        // it as unknown is not visible from this module, so the assertion
        // stays tolerant.
        let mt = ContentDetector::detect_media_type(
            b"1\n00:00:01,000 --> 00:00:03,000\nHello world\n\n",
        );
        assert!(
            matches!(mt, MediaType::Text | MediaType::Unknown),
            "expected Text or Unknown, got {mt:?}"
        );
    }

    // ── Display for MediaType ─────────────────────────────────────────────────

    #[test]
    fn test_media_type_display() {
        let expected = [
            (MediaType::Video, "Video"),
            (MediaType::Audio, "Audio"),
            (MediaType::Image, "Image"),
            (MediaType::Archive, "Archive"),
            (MediaType::Text, "Text"),
            (MediaType::Binary, "Binary"),
            (MediaType::Unknown, "Unknown"),
        ];
        for (variant, label) in expected.iter() {
            assert_eq!(variant.to_string(), *label);
        }
    }

    // ── Display for TextEncoding ──────────────────────────────────────────────

    #[test]
    fn test_text_encoding_display() {
        let expected = [
            (TextEncoding::Utf8, "UTF-8"),
            (TextEncoding::Utf16Le, "UTF-16LE"),
            (TextEncoding::Utf16Be, "UTF-16BE"),
            (TextEncoding::Latin1, "Latin-1"),
            (TextEncoding::Ascii, "ASCII"),
        ];
        for (variant, label) in expected.iter() {
            assert_eq!(variant.to_string(), *label);
        }
    }
}