ecformat 0.1.1

command line tool to keep files correct with respect to your EditorConfig
Documentation
// SPDX-FileCopyrightText: Contributors to ecformat project <https://codeberg.org/BaumiCoder/ecformat>
//
// SPDX-License-Identifier: BlueOak-1.0.0

//! Module for the `charset` property of EditorConfig.

use std::{
    fs::{self, File},
    io::{self, Read},
    path::Path,
};

use anyhow::Result;
use charset_normalizer_rs::{entity, utils};
use ec4rs::property::{self, Charset};
use encoding::EncoderTrap;
use log::warn;
use snafu::ensure;

use super::{PropertyHandler, errors};
use crate::files;

/// Looks up the `charset` property in the given properties.
///
/// When the lookup does not yield a charset, [`property::Charset::Utf8`]
/// is returned as the fallback.
pub fn get_charset(properties: &ec4rs::Properties) -> property::Charset {
    properties
        .get::<property::Charset>()
        .unwrap_or(property::Charset::Utf8)
}

/// Handles the `charset` property for a single file.
///
/// Instances are created with [`CharsetHandler::build`] from the properties
/// that apply to a file.
pub struct CharsetHandler {
    /// The charset requested by the properties; files are checked against it
    /// and, when fixing, re-encoded into it.
    charset: property::Charset,
}

impl PropertyHandler for CharsetHandler {
    /// Compares the charset detected in `file_path` with the configured one.
    ///
    /// A warning is logged (and `Ok(())` returned) when no charset could be
    /// detected for the file.
    ///
    /// # Errors
    ///
    /// Fails with the error built from `errors::CharsetSnafu` when the detected
    /// charset differs from the configured one, including the case where the
    /// detected charset is none of the EditorConfig charsets.
    /// IO errors from reading the file are passed on.
    fn check(&self, file_path: &Path) -> Result<()> {
        let Some(detection) = Self::get_charset_from_file(file_path)? else {
            warn!(
                "Charset in file '{}' could not be determinted",
                file_path.display()
            );
            return Ok(());
        };

        let expected_charset = self.charset;
        match detection {
            // The detected charset is one EditorConfig knows: it has to be the expected one.
            Ok(actual_charset) => {
                ensure!(
                    actual_charset == expected_charset,
                    errors::CharsetSnafu {
                        actual_charset: actual_charset.to_string(),
                        expected_charset,
                    }
                );
            }
            // Any other charset can never be the expected one; report it by its name.
            Err(actual_charset) => errors::CharsetSnafu {
                actual_charset,
                expected_charset,
            }
            .fail()?,
        }
        Ok(())
    }

    /// Re-encodes `file_path` into the configured charset if it uses another one.
    ///
    /// A warning is logged when the current charset of the file could not be
    /// detected; the call still returns `Ok(())` in that case.
    ///
    /// # Errors
    ///
    /// Errors from [`CharsetHandler::set_charset_of_file`] are passed on.
    fn fix(&self, file_path: &Path) -> Result<()> {
        if !self.set_charset_of_file(file_path)? {
            warn!(
                "Charset in file '{}' could not be determinted",
                file_path.display()
            );
        }
        Ok(())
    }
}

impl CharsetHandler {
    /// Creates a [`CharsetHandler`] for the given properties,
    /// if a handler is necessary for these properties.
    ///
    /// Returns `None` when no `charset` value can be obtained from the properties.
    pub fn build(properties: &ec4rs::Properties) -> Option<CharsetHandler> {
        properties
            .get::<property::Charset>()
            .ok()
            .map(|charset| CharsetHandler { charset })
    }

    /// Determines the charset used in the given file (if possible as EditorConfig Charset)
    /// and returns `None` if no charset could be determined.
    ///
    /// The inner `Result` carries the name of the detected charset in its `Err`
    /// when that charset is not one of the EditorConfig charsets.
    ///
    /// # Errors
    ///
    /// IO errors occur if the file cannot be accessed.
    fn get_charset_from_file(
        file_path: &Path,
    ) -> io::Result<Option<Result<property::Charset, String>>> {
        let charset_match = Self::get_charset_match_from_path(file_path)?;
        Ok(charset_match.map(|c| Self::get_charset_from_charset_match(&c)))
    }

    /// Sets the charset of the given file, if it is not already using this charset.
    ///
    /// The boolean return value indicates if the current charset could be determined
    /// or not. Only with a determined charset, it could be changed to the requested one.
    ///
    /// # Errors
    ///
    /// Returns an IO error if the file cannot be read or written. An error of kind
    /// [`io::ErrorKind::InvalidData`] is returned (and the file is left untouched)
    /// when the content cannot be decoded or cannot be encoded in the requested
    /// charset, e.g. because it contains characters the charset cannot represent.
    fn set_charset_of_file(&self, file_path: &Path) -> io::Result<bool> {
        let Some(charset_match) = Self::get_charset_match_from_path(file_path)? else {
            return Ok(false);
        };

        let actual_charset = Self::get_charset_from_charset_match(&charset_match);
        if actual_charset != Ok(self.charset) {
            let input = charset_match.decoded_payload().ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    format!(
                        "content of '{}' could not be decoded as {}",
                        file_path.display(),
                        charset_match.encoding()
                    ),
                )
            })?;
            // Strict encoding: fail instead of silently replacing or dropping characters.
            let target = self.charset_name();
            let mut output =
                utils::encode(input, &target, EncoderTrap::Strict).map_err(|reason| {
                    io::Error::new(
                        io::ErrorKind::InvalidData,
                        format!(
                            "content of '{}' cannot be encoded as {}: {}",
                            file_path.display(),
                            target,
                            reason
                        ),
                    )
                })?;
            // NOTE(review): `files::add_bom` presumably prepends the byte order mark
            // where the requested charset calls for one; confirm in `crate::files`.
            files::add_bom(&self.charset, &mut output);
            // Replace file content with content encoded in the requested charset.
            fs::write(file_path, output)?;
        }

        Ok(true)
    }

    /// Reads the file at `path` and returns the best charset match found for its
    /// content, or `None` if there is no match.
    ///
    /// # Errors
    ///
    /// IO errors occur if the file cannot be opened or read.
    ///
    /// # Panics
    ///
    /// Panics if `charset_normalizer_rs` rejects the hard-coded settings.
    fn get_charset_match_from_path(path: &Path) -> io::Result<Option<entity::CharsetMatch>> {
        let settings = Some(entity::NormalizerSettings {
            exclude_encodings: vec![
                String::from("ascii"),     // prefer utf-8 (over ascii)
                String::from("macintosh"), // prefer latin1 (over the similar macintosh)
            ],
            enable_fallback: false,
            ..Default::default()
        });
        // Instead of from_path(), read file content to have proper IO errors instead of a String.
        let mut file = File::open(path)?;
        let file_size = file.metadata()?.len();
        // The size is only a capacity hint; do not truncate it silently if it
        // does not fit into `usize`.
        let mut bytes = Vec::with_capacity(usize::try_from(file_size).unwrap_or(0));

        file.read_to_end(&mut bytes)?;
        Ok(charset_normalizer_rs::from_bytes(&bytes, settings)
            .expect("settings are valid")
            .get_best()
            .cloned())
    }

    /// Determines the EditorConfig Charset from a `CharsetMatch`.
    ///
    /// If the match is not for a charset allowed in EditorConfig,
    /// its name is inside the `Err` of the `Result`.
    fn get_charset_from_charset_match(
        charset_match: &entity::CharsetMatch,
    ) -> Result<property::Charset, String> {
        match charset_match.encoding() {
            // UTF-8 is split into two EditorConfig charsets, depending on the BOM.
            "utf-8" if charset_match.bom() => Ok(Charset::Utf8Bom),
            "utf-8" => Ok(Charset::Utf8),
            "iso-8859-1" => Ok(Charset::Latin1),
            "utf-16le" => Ok(Charset::Utf16Le),
            "utf-16be" => Ok(Charset::Utf16Be),
            charset => Err(String::from(charset)),
        }
    }

    /// Gives the name of the charset for use in functions of `charset_normalizer_rs` crate.
    ///
    /// `Utf8Bom` is reported under the name of `Utf8`.
    fn charset_name(&self) -> String {
        match self.charset {
            Charset::Utf8Bom => Charset::Utf8,
            c => c,
        }
        .to_string()
    }
}

// Tests of this module; declared in a separate file and only compiled for test builds.
#[cfg(test)]
mod tests;