Skip to main content

mux_media/types/
char_encoding.rs

1use crate::Extension;
2use std::{fs::File, io::Read, path::Path};
3
/// The character encoding determined for a file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum CharEncoding {
    /// The file can be read as UTF-8 (this includes plain ASCII).
    Utf8Compatible,
    /// An encoding other than ASCII/UTF-8 was detected; the payload is the
    /// encoding name as reported by the detector.
    NotUtf8Compatible(String),
    /// The encoding could not be determined.
    NotRecognized,
}
11
12impl CharEncoding {
13    pub fn new(file: impl AsRef<Path>) -> CharEncoding {
14        let f = file.as_ref();
15
16        if f.extension().map_or(false, |ext| {
17            Extension::new_and_is_matroska(ext.as_encoded_bytes())
18        }) {
19            // All text in a Matroska(tm) file is encoded in UTF-8
20            return Self::Utf8Compatible;
21        }
22
23        return match detect_chardet(f) {
24            Some(s) if is_utf8_compatible(&s) => Self::Utf8Compatible,
25            Some(s) => Self::NotUtf8Compatible(s),
26            None => Self::NotRecognized,
27        };
28
29        fn detect_chardet(f: &Path) -> Option<String> {
30            const READ_LIMIT: usize = 128 * 1024; // 128 KiB
31            const LIM_CONFIDENCE: f32 = 0.8;
32
33            let mut file = File::open(f).ok()?;
34            let mut bytes = [0u8; READ_LIMIT];
35            let bytes_read = file.read(&mut bytes).ok()?;
36
37            match chardet::detect(&bytes[..bytes_read]) {
38                det if det.1 >= LIM_CONFIDENCE => Some(det.0),
39                _ => None,
40            }
41        }
42
43        fn is_utf8_compatible(s: &str) -> bool {
44            let s = s.trim();
45            s.eq_ignore_ascii_case("ascii") || s.eq_ignore_ascii_case("utf-8")
46        }
47    }
48
49    pub(crate) fn get_ffmpeg_sub_charenc(&self) -> Option<&str> {
50        match self {
51            Self::Utf8Compatible => None,
52            Self::NotUtf8Compatible(s) => Some(&s),
53            Self::NotRecognized => None,
54        }
55    }
56}