1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
use crate::source::RecognizedEncoding;
use regex::Regex;
use std::error::Error;
use std::fmt;

lazy_static! {
    // Matches a `coding: NAME` / `encoding=NAME` magic comment.
    // The encoding name ends up in exactly one of the named groups `a`, `b` or `c`.
    static ref ENCODING_RE: Regex = Regex::new(
        r"(?x)
        [\s\#](en)?coding\s*[:=]\s*
        (
            # Special-case: there's a UTF8-MAC encoding.
            (?P<a>utf8-mac)
            |
            # Chew the suffix; it's there for emacs compat.
            (?P<b>[A-Za-z0-9_-]+?)(-unix|-dos|-mac)
            |
            (?P<c>[A-Za-z0-9_-]+)
        )
        "
    )
    .expect("ENCODING_RE regex is invalid");
}

/// A user-supplied callback that re-encodes the input.
///
/// It receives the recognized encoding together with the raw input bytes and
/// returns the decoded bytes, or an [`InputError`] on failure.
pub type CustomDecoder = Box<dyn FnOnce(RecognizedEncoding, Vec<u8>) -> Result<Vec<u8>, InputError>>;

/// Errors produced while recognizing or decoding the input's encoding.
#[derive(Debug)]
pub enum InputError {
    /// No encoding magic comment could be found in the input.
    UnableToRecognizeEncoding,
    /// A magic comment named an encoding that `RecognizedEncoding::parse`
    /// rejected; carries the encoding name as written in the input.
    ///
    /// NOTE: the variant name is misspelled ("Unsupportd"); it is kept as is
    /// because renaming it would break existing callers.
    UnsupportdEncoding(String),
    /// The input needs decoding from the given encoding, but no
    /// [`CustomDecoder`] was supplied.
    NoDecoder(RecognizedEncoding),
    /// Not constructed in this file; presumably reserved for custom decoders
    /// to report their own failures.
    DecodingError(String),
}

impl fmt::Display for InputError {
    /// Formats the error using its `Debug` representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
    }
}

impl Error for InputError {}

/// Extracts the encoding name declared at the top of `source`.
///
/// A leading UTF-8 byte order mark yields `"utf-8"` immediately. Otherwise the
/// magic comment is looked up on the first line, or on the second line when
/// the first one is a `#!` shebang.
///
/// # Errors
///
/// Returns [`InputError::UnableToRecognizeEncoding`] when `source` is empty,
/// when the candidate line is not a `#` comment, or when the comment carries
/// no recognizable `coding` declaration.
fn recognize_encoding(source: &[u8]) -> Result<String, InputError> {
    if source.is_empty() {
        return Err(InputError::UnableToRecognizeEncoding);
    }

    let mut lines = source.split(|byte| *byte == b'\n');
    let first_line = lines.next().unwrap_or(&[] as &[u8]);
    let second_line = lines.next().unwrap_or(&[] as &[u8]);

    // This must be a byte-string literal: a raw string (`r"\xef..."`) keeps the
    // backslashes verbatim and would never match the actual BOM bytes EF BB BF.
    if first_line.starts_with(b"\xef\xbb\xbf") {
        return Ok("utf-8".to_owned());
    }

    // A shebang pushes the magic comment down to the second line.
    let encoding_line = if first_line.starts_with(b"#!") {
        second_line
    } else {
        first_line
    };

    if !encoding_line.starts_with(b"#") {
        return Err(InputError::UnableToRecognizeEncoding);
    }

    // The line may be in any encoding; lossy conversion is enough because the
    // regex only looks at ASCII characters.
    let encoding_line = String::from_utf8_lossy(encoding_line);
    let captures = ENCODING_RE
        .captures(&encoding_line)
        .ok_or(InputError::UnableToRecognizeEncoding)?;

    captures
        .name("a")
        .or_else(|| captures.name("b"))
        .or_else(|| captures.name("c"))
        .map(|m| m.as_str().to_owned())
        .ok_or(InputError::UnableToRecognizeEncoding)
}

/// Decodes `input` according to the encoding declared in its magic comment.
///
/// When no encoding can be recognized, UTF-8 is assumed. Inputs declared as
/// `UTF-8`, `ASCII-8BIT` or `BINARY` (compared case-insensitively) are
/// returned unchanged. Any other encoding is parsed with
/// `RecognizedEncoding::parse` and handed to `decoder`.
///
/// # Errors
///
/// * [`InputError::UnsupportdEncoding`] if `RecognizedEncoding::parse` does
///   not know the declared encoding.
/// * [`InputError::NoDecoder`] if decoding is required but `decoder` is `None`.
/// * Whatever error `decoder` itself returns.
pub fn decode_input(input: Vec<u8>, decoder: Option<CustomDecoder>) -> Result<Vec<u8>, InputError> {
    let enc = recognize_encoding(&input).unwrap_or_else(|_| "utf-8".to_owned());

    match enc.to_uppercase().as_str() {
        "UTF-8" | "ASCII-8BIT" | "BINARY" => Ok(input),
        _ => {
            let enc = RecognizedEncoding::parse(&enc)
                .ok_or_else(|| InputError::UnsupportdEncoding(enc))?;
            match decoder {
                Some(decoder) => decoder(enc, input),
                None => Err(InputError::NoDecoder(enc)),
            }
        }
    }
}