1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
// SPDX-FileCopyrightText: Contributors to ecformat project <https://codeberg.org/BaumiCoder/ecformat>
//
// SPDX-License-Identifier: BlueOak-1.0.0
//! Module for the `charset` property of EditorConfig.
use std::{
fs::{self, File},
io::{self, Read},
path::Path,
};
use anyhow::Result;
use charset_normalizer_rs::{entity, utils};
use ec4rs::property::{self, Charset};
use encoding::EncoderTrap;
use log::warn;
use snafu::ensure;
use super::{PropertyHandler, errors};
use crate::files;
/// Looks up the `charset` property in the given EditorConfig properties.
///
/// Whenever the lookup yields an error (for example because `charset` is not set),
/// [`property::Charset::Utf8`] is returned as the fallback.
pub fn get_charset(properties: &ec4rs::Properties) -> property::Charset {
    properties
        .get::<property::Charset>()
        .unwrap_or(property::Charset::Utf8)
}
/// Handles the `charset` property for a single file.
///
/// A handler is created with [`CharsetHandler::build`] and carries the charset
/// that files are checked against and, when fixing, converted to.
pub struct CharsetHandler {
    /// The charset requested by the EditorConfig properties.
    charset: property::Charset,
}
impl PropertyHandler for CharsetHandler {
    /// Checks that the file at `file_path` uses the charset of this handler.
    ///
    /// If the charset of the file cannot be determined, a warning is logged and
    /// the check passes.
    ///
    /// # Errors
    ///
    /// Returns an error if reading the file fails, or the error built from
    /// `errors::CharsetSnafu` if the detected charset differs from the expected one.
    fn check(&self, file_path: &Path) -> Result<()> {
        match Self::get_charset_from_file(file_path)? {
            Some(determined_charset) => {
                let expected_charset = self.charset;
                match determined_charset {
                    // The detected charset is one known to EditorConfig: compare directly.
                    Ok(actual_charset) => {
                        ensure!(
                            actual_charset == expected_charset,
                            errors::CharsetSnafu {
                                actual_charset: actual_charset.to_string(),
                                expected_charset,
                            }
                        );
                    }
                    // The detected charset has no EditorConfig equivalent,
                    // so it can never be the expected one; report it by name.
                    Err(actual_charset) => errors::CharsetSnafu {
                        actual_charset,
                        expected_charset,
                    }
                    .fail()?,
                }
            }
            None => {
                warn!(
                    "Charset in file '{}' could not be determined",
                    file_path.display()
                );
            }
        };
        Ok(())
    }

    /// Converts the file at `file_path` to the charset of this handler.
    ///
    /// If the current charset of the file cannot be determined, the file is left
    /// untouched and a warning is logged.
    ///
    /// # Errors
    ///
    /// Returns an error if `set_charset_of_file` fails.
    fn fix(&self, file_path: &Path) -> Result<()> {
        let determined = self.set_charset_of_file(file_path)?;
        if !determined {
            warn!(
                "Charset in file '{}' could not be determined",
                file_path.display()
            );
        }
        Ok(())
    }
}
impl CharsetHandler {
    /// Creates a [`CharsetHandler`] for the given properties,
    /// if a handler is necessary for these properties.
    ///
    /// Returns `None` when no `charset` value can be read from `properties`.
    pub fn build(properties: &ec4rs::Properties) -> Option<CharsetHandler> {
        properties
            .get::<property::Charset>()
            .ok() // no (usable) charset property set
            .map(|charset| CharsetHandler { charset })
    }

    /// Determines the charset used in the given file.
    ///
    /// Returns `None` if no charset could be determined. Otherwise the inner
    /// `Result` holds the EditorConfig charset, or the name of the detected
    /// charset if it is not one allowed in EditorConfig.
    ///
    /// # Errors
    ///
    /// IO errors occur if the file cannot be accessed.
    fn get_charset_from_file(
        file_path: &Path,
    ) -> io::Result<Option<Result<property::Charset, String>>> {
        let charset_match = Self::get_charset_match_from_path(file_path)?;
        Ok(charset_match.map(|c| Self::get_charset_from_charset_match(&c)))
    }

    /// Sets the charset of the given file, if it is not already using this charset.
    ///
    /// The boolean return value indicates whether the current charset could be
    /// determined. Only with a determined charset can the file be converted to
    /// the requested one.
    ///
    /// # Errors
    ///
    /// Returns an IO error if the file cannot be read or written. An error of kind
    /// [`io::ErrorKind::InvalidData`] is returned (and the file is left untouched)
    /// if the content has no decoded form or cannot be encoded in the requested
    /// charset, e.g. because it contains characters that charset cannot represent.
    fn set_charset_of_file(&self, file_path: &Path) -> io::Result<bool> {
        match Self::get_charset_match_from_path(file_path)? {
            None => Ok(false),
            Some(charset_match) => {
                let actual_charset = Self::get_charset_from_charset_match(&charset_match);
                if actual_charset != Ok(self.charset) {
                    let input = charset_match.decoded_payload().ok_or_else(|| {
                        io::Error::new(
                            io::ErrorKind::InvalidData,
                            format!(
                                "content of file '{}' could not be decoded",
                                file_path.display()
                            ),
                        )
                    })?;
                    let target = self.charset_name();
                    // Strict encoding fails on unrepresentable characters; report that
                    // as an error instead of panicking on user-provided file content.
                    let mut output = utils::encode(input, &target, EncoderTrap::Strict)
                        .map_err(|error| {
                            io::Error::new(
                                io::ErrorKind::InvalidData,
                                format!(
                                    "content of file '{}' cannot be encoded as '{}': {}",
                                    file_path.display(),
                                    target,
                                    error
                                ),
                            )
                        })?;
                    files::add_bom(&self.charset, &mut output);
                    // Replace file content with content encoded in the requested charset.
                    fs::write(file_path, output)?;
                }
                Ok(true)
            }
        }
    }

    /// Reads the file at `path` and returns the best charset match found for its
    /// content, or `None` if there is no match.
    ///
    /// # Errors
    ///
    /// Returns an IO error if the file cannot be opened or read.
    ///
    /// # Panics
    ///
    /// Panics if the charset detection rejects the settings built here.
    fn get_charset_match_from_path(path: &Path) -> io::Result<Option<entity::CharsetMatch>> {
        let settings = Some(entity::NormalizerSettings {
            exclude_encodings: vec![
                String::from("ascii"),     // prefer utf-8 (over ascii)
                String::from("macintosh"), // prefer latin1 (over the similar macintosh)
            ],
            enable_fallback: false,
            ..Default::default()
        });
        // Instead of from_path(), read file content to have proper IO errors instead of a String.
        let mut file = File::open(path)?;
        let file_size = file.metadata()?.len();
        // The size is only used as a capacity hint for the read buffer.
        let mut bytes = Vec::with_capacity(file_size as usize);
        file.read_to_end(&mut bytes)?;
        let matches =
            charset_normalizer_rs::from_bytes(&bytes, settings).expect("settings are valid");
        Ok(matches.get_best().cloned())
    }

    /// Determines the EditorConfig charset from a `CharsetMatch`.
    ///
    /// If the match is for a charset that is not allowed in EditorConfig,
    /// its name is returned inside the `Err` of the result.
    fn get_charset_from_charset_match(
        charset_match: &entity::CharsetMatch,
    ) -> Result<property::Charset, String> {
        match charset_match.encoding() {
            // UTF-8 is split into two EditorConfig charsets depending on the BOM.
            "utf-8" if charset_match.bom() => Ok(Charset::Utf8Bom),
            "utf-8" => Ok(Charset::Utf8),
            "iso-8859-1" => Ok(Charset::Latin1),
            "utf-16le" => Ok(Charset::Utf16Le),
            "utf-16be" => Ok(Charset::Utf16Be),
            charset => Err(String::from(charset)),
        }
    }

    /// Gives the name of the charset for use in functions of `charset_normalizer_rs` crate.
    ///
    /// `Utf8Bom` is mapped to the name of `Utf8`; every other charset uses its own name.
    fn charset_name(&self) -> String {
        match self.charset {
            Charset::Utf8Bom => Charset::Utf8,
            c => c,
        }
        .to_string()
    }
}
// Tests for this module live in the separate `tests` submodule,
// which is only compiled for test builds.
#[cfg(test)]
mod tests;