Skip to main content

grit_lib/
commit_encoding.rs

1//! Git commit encoding labels (`encoding` header, `i18n.commitEncoding`) mapped to codecs.
2//!
3//! Git's `ISO-8859-1` is strict Latin-1; `encoding_rs` maps that label to Windows-1252, so we
4//! handle Latin-1 separately.
5
6use encoding_rs::Encoding;
7
/// Returns `true` when `label` names strict ISO-8859-1 (Latin-1).
///
/// Surrounding whitespace is ignored and the comparison is ASCII
/// case-insensitive. Besides the common spellings, the registered aliases of
/// ISO-8859-1 (`ISO_8859-1`, `l1`, `cp819`, ...) are accepted as well:
/// `encoding_rs` resolves most of them to Windows-1252, which is exactly the
/// substitution this module exists to avoid.
fn is_iso_8859_1(label: &str) -> bool {
    // Every entry is lowercase ASCII; matching is done case-insensitively.
    const LATIN1_LABELS: &[&str] = &[
        "iso-8859-1",
        "iso8859-1",
        "iso_8859-1",
        "iso_8859-1:1987",
        "iso88591",
        "iso-ir-100",
        "latin1",
        "latin-1",
        "l1",
        "ibm819",
        "cp819",
        "csisolatin1",
    ];
    let name = label.trim();
    // `eq_ignore_ascii_case` avoids allocating a lowercased copy per call.
    LATIN1_LABELS
        .iter()
        .any(|alias| name.eq_ignore_ascii_case(alias))
}
14
/// Decodes strict Latin-1: each input byte becomes the Unicode scalar value
/// with the same number (`0x00..=0xFF`), so decoding can never fail.
fn decode_latin1(bytes: &[u8]) -> String {
    // `char::from(u8)` is exactly the Latin-1 -> Unicode mapping.
    bytes.iter().map(|&byte| char::from(byte)).collect()
}
22
/// Encodes `unicode` as strict Latin-1, one output byte per input character.
///
/// Characters above U+00FF have no Latin-1 representation and are replaced
/// with an ASCII `?`.
fn encode_latin1_lossy(unicode: &str) -> Vec<u8> {
    let mut out = Vec::with_capacity(unicode.len());
    for ch in unicode.chars() {
        // The narrowing conversion succeeds exactly for code points <= 0xFF.
        out.push(u8::try_from(u32::from(ch)).unwrap_or(b'?'));
    }
    out
}
36
/// Appends a final `\n` to a non-empty commit message body that lacks one.
///
/// An empty body is returned unchanged, as is a body that already ends in a
/// newline.
#[must_use]
pub fn ensure_body_trailing_newline(mut bytes: Vec<u8>) -> Vec<u8> {
    match bytes.last() {
        // Empty, or already newline-terminated: nothing to do.
        None | Some(b'\n') => {}
        Some(_) => bytes.push(b'\n'),
    }
    bytes
}
45
46/// Resolve an encoding label the way Git uses it in config and commit objects.
47///
48/// Git accepts names like `eucJP` that [`Encoding::for_label`] does not recognize.
49/// ISO-8859-1 is handled separately (strict Latin-1).
50#[must_use]
51pub fn resolve(label: &str) -> Option<&'static Encoding> {
52    let t = label.trim();
53    if t.is_empty() || is_iso_8859_1(t) {
54        return None;
55    }
56    let normalized = t.replace('_', "-");
57    let lower = normalized.to_ascii_lowercase();
58    let mapped = match lower.as_str() {
59        "eucjp" => "euc-jp",
60        "cp932" | "mskanji" | "sjis" => "shift_jis",
61        _ => normalized.as_str(),
62    };
63    Encoding::for_label(mapped.as_bytes()).or_else(|| Encoding::for_label(t.as_bytes()))
64}
65
66/// Encode `unicode` for storage in a commit message body using Git's encoding name.
67#[must_use]
68pub fn encode_unicode(label: &str, unicode: &str) -> Option<Vec<u8>> {
69    let t = label.trim();
70    let raw = if is_iso_8859_1(t) {
71        encode_latin1_lossy(unicode)
72    } else {
73        let enc = resolve(t)?;
74        let (cow, _, _) = enc.encode(unicode);
75        cow.into_owned()
76    };
77    Some(ensure_body_trailing_newline(raw))
78}
79
80/// Encode a single header field (author/committer line) without adding a trailing newline.
81#[must_use]
82pub fn encode_header_text(label: &str, unicode: &str) -> Option<Vec<u8>> {
83    let t = label.trim();
84    if is_iso_8859_1(t) {
85        return Some(encode_latin1_lossy(unicode));
86    }
87    let enc = resolve(t)?;
88    let (cow, _, _) = enc.encode(unicode);
89    Some(cow.into_owned())
90}
91
92/// Decode `bytes` using Git's encoding name, or lossy UTF-8 if unknown.
93#[must_use]
94pub fn decode_bytes(label: Option<&str>, bytes: &[u8]) -> String {
95    if let Some(l) = label {
96        if is_iso_8859_1(l) {
97            return decode_latin1(bytes);
98        }
99        if let Some(enc) = resolve(l) {
100            let (cow, _) = enc.decode_without_bom_handling(bytes);
101            return cow.into_owned();
102        }
103    }
104    String::from_utf8_lossy(bytes).into_owned()
105}
106
107/// Re-encode `unicode` from UTF-8 into `output_label`, or `None` if unsupported.
108#[must_use]
109pub fn reencode_utf8_to_label(output_label: &str, unicode: &str) -> Option<Vec<u8>> {
110    encode_header_text(output_label, unicode)
111}