ecformat 0.1.1

Command-line tool that keeps files consistent with your EditorConfig
Documentation
// SPDX-FileCopyrightText: Contributors to ecformat project <https://codeberg.org/BaumiCoder/ecformat>
//
// SPDX-License-Identifier: BlueOak-1.0.0

//! Provides functions to interact with files.

use std::{
    fs::{self, File},
    io::{self, Read},
    path::{Path, PathBuf},
};

use ec4rs::property::{self, Charset};
use encoding_rs_io::DecodeReaderBytesBuilder;
use ignore::WalkBuilder;
use itertools::Itertools;

use crate::cli::IgnoreArgs;

/// Provides an iterator over all files of the targets.
///
/// A directory target contributes (recursively) the files inside it, a file
/// target contributes only itself. Errors raised while walking are kept in the
/// iterator as `Err` items so that the caller can decide how to handle them.
/// A file that is reachable through several targets (e.g., a file and its
/// parent folder are both given) is yielded only once.
///
/// `ignore_args` controls which entries are skipped:
/// - `hidden`: hidden entries are no longer skipped (a `.git` directory still
///   is, as long as `git_settings` is set as well),
/// - `git_settings`: if unset, the ignore rules of git are not applied,
/// - `ignore_file`: optional name of a custom ignore file to respect.
///
/// `.ignore` files are never consulted.
///
/// # Panics
///
/// Panics if `targets` is empty.
pub fn get_target_files(
    targets: &[PathBuf],
    ignore_args: &IgnoreArgs,
) -> anyhow::Result<impl Iterator<Item = Result<PathBuf, ignore::Error>>> {
    let first_target = targets.first().expect("at least one target necessary");
    let mut builder = WalkBuilder::new(first_target);
    for t in targets.iter().skip(1) {
        builder.add(t);
    }
    if ignore_args.hidden {
        builder.hidden(false);
        if ignore_args.git_settings {
            // ignore ".git" directory as the metadata of git, if git settings are respected.
            builder.filter_entry(|entry| !(entry.path().is_dir() && entry.file_name() == ".git"));
        }
    }
    if !ignore_args.git_settings {
        builder
            .git_ignore(false)
            .git_exclude(false)
            .git_global(false);
    }
    builder.ignore(false);
    if let Some(file_name) = ignore_args.ignore_file.as_ref() {
        builder.add_custom_ignore_filename(file_name);
    }
    Ok(builder
        .build()
        // Use map on entries to leave error in iterator to allow later handling
        .map(|entry| entry.map(|f| f.into_path()))
        .filter(|path| path.as_ref().map_or(true, |p| p.is_file()))
        // Avoid duplicates from having a file and its parent folder in the list of targets
        .unique_by(|entry| -> Result<PathBuf, String> {
            match entry {
                // Key on the path itself instead of its UTF-8 string form: paths that are
                // not valid UTF-8 must stay distinct from each other. If the path cannot be
                // canonicalized, fall back to the path as found, so that unrelated files
                // failing with the same error message are not treated as duplicates.
                Ok(p) => Ok(p.canonicalize().unwrap_or_else(|_| p.clone())),
                Err(e) => Err(e.to_string()),
            }
        }))
}

/// Decodes the whole file at `file_path` from the given charset into a `String`.
///
/// # Errors
///
/// Returns the I/O error if the file cannot be opened or read.
pub fn read_file(file_path: &Path, charset: &property::Charset) -> io::Result<String> {
    let file = File::open(file_path)?;
    let source_encoding = charset_as_encoding(charset);
    let mut decoder = DecodeReaderBytesBuilder::new()
        .encoding(Some(source_encoding))
        .build(file);

    let mut text = String::new();
    decoder.read_to_string(&mut text).map(|_| text)
}

/// Overwrites a file with the given content encoded with the given charset.
///
/// The encoded bytes are passed through [`add_bom`] before they are written,
/// so charsets that carry a byte order mark get it prepended.
///
/// # Errors
///
/// Returns an error of kind [`io::ErrorKind::InvalidData`], without touching
/// the file, if `content` contains characters that cannot be represented in
/// the target charset. Errors from writing the file are returned unchanged.
pub fn overwrite_file(
    file_path: &Path,
    charset: &property::Charset,
    content: &str,
) -> io::Result<()> {
    let mut output: Vec<u8> = match charset {
        // From the documentation of the encode() function in `encoding_rs`:
        // "For the UTF-8 encode spec concept, it is slightly more efficient
        // to use string.as_bytes() instead of invoking this method on UTF_8."
        Charset::Utf8 | Charset::Utf8Bom => content.as_bytes().to_vec(),
        // Use encode() function from `encoding_rs` where possible
        Charset::Latin1 => {
            let target_encoding = charset_as_encoding(charset);
            let (encoded, actual_encoding, had_unmappable) = target_encoding.encode(content);

            debug_assert_eq!(
                target_encoding, actual_encoding,
                "If the encoding in encoding_rs cannot be produced with encode(), \
                the bytes have to be created differently."
            );
            // encode() replaces unmappable characters with HTML numeric character
            // references. Writing those would silently corrupt the file, so refuse
            // to overwrite it instead.
            if had_unmappable {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!(
                        "content for {} contains characters that cannot be encoded as {}",
                        file_path.display(),
                        target_encoding.name()
                    ),
                ));
            }
            encoded.into_owned()
        }
        // UTF-16: `encoding_rs` "does not provide encoders for those encodings"
        // (see start page of documentation of `encoding_rs`)
        Charset::Utf16Le => content
            .encode_utf16()
            .flat_map(u16::to_le_bytes)
            .collect::<Vec<u8>>(),
        Charset::Utf16Be => content
            .encode_utf16()
            .flat_map(u16::to_be_bytes)
            .collect::<Vec<u8>>(),
    };
    add_bom(charset, &mut output);
    fs::write(file_path, output)
}

/// Prepends the byte order mark belonging to `charset` to `file_content`.
///
/// Charsets without a BOM (`Utf8`, `Latin1`) leave the vector untouched.
pub fn add_bom(charset: &property::Charset, file_content: &mut Vec<u8>) {
    // https://en.wikipedia.org/wiki/Byte_order_mark#Byte-order_marks_by_encoding
    let mark: &[u8] = match charset {
        Charset::Utf8 | Charset::Latin1 => return,
        Charset::Utf8Bom => &[0xEF, 0xBB, 0xBF],
        // In EditorConfig it is unspecified if UTF-16 is with BOM or not,
        // but e.g., VSCodium and IntelliJ are using BOM for UTF-16.
        Charset::Utf16Le => &[0xFF, 0xFE],
        Charset::Utf16Be => &[0xFE, 0xFF],
    };
    // Insert the mark in front of the existing bytes.
    file_content.splice(0..0, mark.iter().copied());
}

/// Returns the encoding for the given charset. As the functions in this module
/// take the charset as parameter to have the information about UTF-8 with or without BOM.
fn charset_as_encoding(charset: &property::Charset) -> &'static encoding_rs::Encoding {
    match charset {
        Charset::Utf8 | Charset::Utf8Bom => encoding_rs::UTF_8,
        Charset::Latin1 => encoding_rs::WINDOWS_1252,
        Charset::Utf16Le => encoding_rs::UTF_16LE,
        Charset::Utf16Be => encoding_rs::UTF_16BE,
    }
}

// The unit tests of this module are kept in the separate `tests` submodule,
// which is only compiled for test builds.
#[cfg(test)]
mod tests;