use std::{fs, io, path::PathBuf};
/// Reads the first of `files` and derives the output conventions from it.
///
/// Returns a tuple of:
/// * the contents of the first file, or `None` when `files` is empty;
/// * a [`ContentsIter`] over the files that have not been read yet;
/// * a [`SetWriter`] carrying the byte-order mark and line terminator to use
///   for output.
///
/// The line terminator is `\r\n` when the first `\n` in the first file is
/// immediately preceded by `\r`, and `\n` otherwise. The byte-order mark is
/// the UTF-8 BOM when the first file's contents start with one, and empty
/// otherwise. With no files at all, both are empty.
///
/// # Errors
///
/// Returns the error produced while reading the first file.
pub fn prepare(
    files: Vec<PathBuf>,
) -> Result<(Option<Vec<u8>>, ContentsIter, SetWriter), failure::Error> {
    let mut rest = ContentsIter::from(files);
    let first = match rest.next() {
        // No input at all: nothing to imitate, so write neither BOM nor terminator.
        None => return Ok((None, rest, SetWriter { bom: b"", eol: b"" })),
        Some(contents) => contents?,
    };

    // Only the first line break of the first file decides the terminator.
    let first_break_is_crlf =
        memchr(b'\n', &first).map_or(false, |n| n > 0 && first[n - 1] == b'\r');
    let eol: &'static [u8] = if first_break_is_crlf { b"\r\n" } else { b"\n" };
    let bom: &'static [u8] = if has_bom(&first) { BOM_BYTES } else { b"" };

    Ok((Some(first), rest, SetWriter { bom, eol }))
}
/// Output settings: the byte-order mark and line terminator that are emitted
/// when a result is written.
#[derive(Debug)]
pub struct SetWriter {
/// Bytes written once, before the first line (may be empty).
bom: &'static [u8],
/// Bytes written after every line (may be empty).
eol: &'static [u8],
}
impl SetWriter {
    /// Writes `result` to standard output.
    ///
    /// The configured byte-order mark is written first, then every line of
    /// `result`, each followed by the configured line terminator. When
    /// standard output is not a terminal the locked handle is additionally
    /// wrapped in an `io::BufWriter`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing or flushing.
    pub fn output(&self, result: crate::LineIterator) -> Result<(), failure::Error> {
        let stdout = io::stdout();
        let handle = stdout.lock();
        if atty::is(atty::Stream::Stdout) {
            self.inner(result, handle)
        } else {
            self.inner(result, io::BufWriter::new(handle))
        }
    }

    /// Writes the byte-order mark, then each line plus terminator, to `out`,
    /// and finally flushes `out`.
    fn inner(
        &self,
        result: crate::LineIterator,
        mut out: impl io::Write,
    ) -> Result<(), failure::Error> {
        out.write_all(self.bom)?;
        result.into_iter().try_for_each(|line| -> io::Result<()> {
            out.write_all(line)?;
            out.write_all(self.eol)
        })?;
        // Flush explicitly so that a failing final write is reported to the caller.
        out.flush()?;
        Ok(())
    }
}
/// Iterator over the contents of a list of files, reading one file per step.
pub struct ContentsIter {
    /// Paths that have not been read yet, in the order they were supplied.
    files: std::vec::IntoIter<PathBuf>,
}

impl From<Vec<PathBuf>> for ContentsIter {
    /// Builds an iterator that will visit `files` in order.
    fn from(files: Vec<PathBuf>) -> Self {
        let files = files.into_iter();
        Self { files }
    }
}
impl Iterator for ContentsIter {
type Item = Result<Vec<u8>, failure::Error>;
fn next(&mut self) -> Option<Self::Item> {
let path = self.files.next()?;
Some(match fs::read(&path) {
Ok(contents) => Ok(decode_if_utf16(contents)),
Err(io_err) => {
let path = path.to_string_lossy();
Err(format_err!("Can't read file `{}`: {}", path, io_err))
}
})
}
}
use memchr::memchr;
/// Translates `candidate` to UTF-8 when it starts with a UTF-16 byte-order mark.
///
/// The encoding is detected with `encoding_rs::Encoding::for_bom`. Only the
/// little-endian and big-endian UTF-16 encodings trigger a translation; any
/// other input (including input without a recognised BOM) is returned
/// unchanged, without copying.
///
/// Decoding is done *without* BOM handling, so the byte-order mark is decoded
/// like any other character instead of being consumed. The flag reporting
/// malformed sequences is deliberately ignored.
fn decode_if_utf16(candidate: Vec<u8>) -> Vec<u8> {
    match encoding_rs::Encoding::for_bom(&candidate) {
        Some((enc, _)) if enc == encoding_rs::UTF_16LE || enc == encoding_rs::UTF_16BE => {
            let (translated, _had_malformed_sequences) =
                enc.decode_without_bom_handling(&candidate);
            translated.into_owned().into_bytes()
        }
        _ => candidate,
    }
}
/// Iterator over the lines of a byte buffer; each item is a sub-slice of the
/// buffer with its `\n` or `\r\n` terminator removed.
pub(crate) struct InputLines<'data> {
/// The part of the input that has not been yielded yet.
remaining: &'data [u8],
}
/// First byte of the UTF-8 byte-order mark.
const BOM_0: u8 = 0xEF;
/// Second byte of the UTF-8 byte-order mark.
const BOM_1: u8 = 0xBB;
/// Third byte of the UTF-8 byte-order mark.
const BOM_2: u8 = 0xBF;
/// The complete UTF-8 byte-order mark, assembled from its three bytes.
const BOM_BYTES: &[u8] = &[BOM_0, BOM_1, BOM_2];

/// Returns `true` when `contents` begins with the UTF-8 byte-order mark.
pub(crate) fn has_bom(contents: &[u8]) -> bool {
    contents.starts_with(BOM_BYTES)
}
pub(crate) fn lines_of(contents: &[u8]) -> InputLines {
if has_bom(contents) {
InputLines { remaining: &contents[3..] }
} else {
InputLines { remaining: contents }
}
}
impl<'data> Iterator for InputLines<'data> {
    type Item = &'data [u8];

    /// Yields the next line with its `\n` or `\r\n` terminator removed.
    ///
    /// A final stretch of bytes that is not followed by `\n` is yielded as
    /// is; once the input is exhausted the iterator returns `None`.
    fn next(&mut self) -> Option<Self::Item> {
        let rest = self.remaining;
        if let Some(newline) = memchr(b'\n', rest) {
            // Resume right after the `\n`, whatever precedes it.
            self.remaining = &rest[newline + 1..];
            let head = &rest[..newline];
            // Drop a `\r` that directly precedes the `\n`.
            let line = match head.split_last() {
                Some((&b'\r', body)) => body,
                _ => head,
            };
            Some(line)
        } else if rest.is_empty() {
            None
        } else {
            // Unterminated last line: hand it out and leave nothing behind.
            self.remaining = b"";
            Some(rest)
        }
    }
}
#[allow(clippy::pedantic)]
#[cfg(test)]
mod test {
    use super::*;

    /// The byte-order mark character as a `str`.
    const UTF8_BOM: &str = "\u{FEFF}";

    #[test]
    fn utf8_bom_is_correct() {
        assert_eq!([BOM_0, BOM_1, BOM_2], UTF8_BOM.as_bytes());
    }

    /// Builds a UTF-16LE byte stream (BOM first) by following every byte of
    /// `source` with a zero byte; only meaningful for ASCII input.
    fn utf_16le(source: &str) -> Vec<u8> {
        let mut encoded = Vec::with_capacity(2 + 2 * source.len());
        encoded.extend_from_slice(b"\xff\xfe");
        for byte in source.bytes() {
            encoded.extend_from_slice(&[byte, 0]);
        }
        encoded
    }

    /// Builds a UTF-16BE byte stream (BOM first) by preceding every byte of
    /// `source` with a zero byte; only meaningful for ASCII input.
    fn utf_16be(source: &str) -> Vec<u8> {
        let mut encoded = Vec::with_capacity(2 + 2 * source.len());
        encoded.extend_from_slice(b"\xfe\xff");
        for byte in source.bytes() {
            encoded.extend_from_slice(&[0, byte]);
        }
        encoded
    }

    /// Returns `expected` with the byte-order mark character prepended.
    fn abominate(expected: &str) -> String {
        [UTF8_BOM, expected].concat()
    }

    #[test]
    fn utf_16le_is_translated_to_utf8() {
        let expected = "The cute red crab\n jumps over the lazy blue gopher\n";
        let decoded = decode_if_utf16(utf_16le(expected));
        assert_eq!(decoded, abominate(expected).as_bytes());
    }

    #[test]
    fn utf_16be_is_translated_to_utf8() {
        let expected = "The cute red crab\n jumps over the lazy blue gopher\n";
        let decoded = decode_if_utf16(utf_16be(expected));
        assert_eq!(decoded, abominate(expected).as_bytes());
    }

    #[test]
    fn fn_lines_of_strips_utf8_bom_and_line_terminators() {
        let with_bom = abominate("abc\ndefg\nxyz\n");
        let expected: Vec<&[u8]> = vec![&b"abc"[..], &b"defg"[..], &b"xyz"[..]];
        let result: Vec<&[u8]> = lines_of(with_bom.as_bytes()).collect();
        assert_eq!(expected, result);
    }
}