Skip to main content

grit_lib/
commit_encoding.rs

1//! Git commit encoding labels (`encoding` header, `i18n.commitEncoding`) mapped to codecs.
2//!
3//! Git's `ISO-8859-1` is strict Latin-1; `encoding_rs` maps that label to Windows-1252, so we
4//! handle Latin-1 separately.
5
6use encoding_rs::Encoding;
7
/// Reports whether `label` names ISO-8859-1 (Latin-1).
///
/// Surrounding whitespace is ignored and the comparison is ASCII case-insensitive.
fn is_iso_8859_1(label: &str) -> bool {
    const ALIASES: [&str; 4] = ["iso-8859-1", "iso8859-1", "latin1", "latin-1"];
    let name = label.trim();
    ALIASES.iter().any(|alias| name.eq_ignore_ascii_case(alias))
}
14
/// Decodes strict Latin-1: every byte becomes the Unicode scalar with the same value.
fn decode_latin1(bytes: &[u8]) -> String {
    bytes.iter().copied().map(char::from).collect()
}
22
/// Encodes `unicode` as Latin-1, one byte per character.
///
/// Characters above U+00FF have no Latin-1 representation and are replaced with `?`.
fn encode_latin1_lossy(unicode: &str) -> Vec<u8> {
    let mut out = Vec::with_capacity(unicode.len());
    for ch in unicode.chars() {
        let byte = match u32::from(ch) {
            // The range pattern guarantees the value fits in a byte.
            cp @ 0..=0xFF => cp as u8,
            _ => b'?',
        };
        out.push(byte);
    }
    out
}
36
/// Appends `\n` to a non-empty message body that does not already end with one.
///
/// Empty input is returned unchanged.
#[must_use]
pub fn ensure_body_trailing_newline(mut bytes: Vec<u8>) -> Vec<u8> {
    match bytes.last().copied() {
        // Nothing to terminate, or already terminated.
        None | Some(b'\n') => {}
        Some(_) => bytes.push(b'\n'),
    }
    bytes
}
45
46/// Resolve an encoding label the way Git uses it in config and commit objects.
47///
48/// Git accepts names like `eucJP` that [`Encoding::for_label`] does not recognize.
49/// ISO-8859-1 is handled separately (strict Latin-1).
50#[must_use]
51pub fn resolve(label: &str) -> Option<&'static Encoding> {
52    let t = label.trim();
53    if t.is_empty() || is_iso_8859_1(t) {
54        return None;
55    }
56    let normalized = t.replace('_', "-");
57    let lower = normalized.to_ascii_lowercase();
58    let mapped = match lower.as_str() {
59        "eucjp" => "euc-jp",
60        "cp932" | "mskanji" | "sjis" => "shift_jis",
61        _ => normalized.as_str(),
62    };
63    Encoding::for_label(mapped.as_bytes()).or_else(|| Encoding::for_label(t.as_bytes()))
64}
65
66/// Encode `unicode` for storage in a commit message body using Git's encoding name.
67#[must_use]
68pub fn encode_unicode(label: &str, unicode: &str) -> Option<Vec<u8>> {
69    let t = label.trim();
70    let raw = if is_iso_8859_1(t) {
71        encode_latin1_lossy(unicode)
72    } else {
73        let enc = resolve(t)?;
74        let (cow, _, _) = enc.encode(unicode);
75        cow.into_owned()
76    };
77    Some(ensure_body_trailing_newline(raw))
78}
79
80/// Encode a single header field (author/committer line) without adding a trailing newline.
81#[must_use]
82pub fn encode_header_text(label: &str, unicode: &str) -> Option<Vec<u8>> {
83    let t = label.trim();
84    if is_iso_8859_1(t) {
85        return Some(encode_latin1_lossy(unicode));
86    }
87    let enc = resolve(t)?;
88    let (cow, _, _) = enc.encode(unicode);
89    Some(cow.into_owned())
90}
91
92/// Decode `bytes` using Git's encoding name, or lossy UTF-8 if unknown.
93#[must_use]
94pub fn decode_bytes(label: Option<&str>, bytes: &[u8]) -> String {
95    if let Some(l) = label {
96        if is_iso_8859_1(l) {
97            return decode_latin1(bytes);
98        }
99        if let Some(enc) = resolve(l) {
100            let (cow, _) = enc.decode_without_bom_handling(bytes);
101            return cow.into_owned();
102        }
103    }
104    String::from_utf8_lossy(bytes).into_owned()
105}
106
107/// Re-encode `unicode` from UTF-8 into `output_label`, or `None` if unsupported.
108#[must_use]
109pub fn reencode_utf8_to_label(output_label: &str, unicode: &str) -> Option<Vec<u8>> {
110    encode_header_text(output_label, unicode)
111}
112
113/// Prepare a commit message for storage per `i18n.commitEncoding` (or equivalent).
114///
115/// When the configured encoding is not UTF-8, returns [`Some`] raw bytes for the body
116/// and sets `encoding` in the commit object; otherwise UTF-8 is stored without an
117/// `encoding` header.
118#[must_use]
119pub fn finalize_stored_commit_message(
120    message: String,
121    commit_encoding: Option<&str>,
122) -> (String, Option<String>, Option<Vec<u8>>) {
123    let is_utf8 = match commit_encoding {
124        None => true,
125        Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
126    };
127    if is_utf8 {
128        return (message, None, None);
129    }
130    let Some(label) = commit_encoding.filter(|s| !s.trim().is_empty()) else {
131        return (message, None, None);
132    };
133    let Some(raw) = encode_unicode(label, &message) else {
134        return (message, None, None);
135    };
136    (message, Some(label.to_owned()), Some(raw))
137}
138
139/// Decode `=?charset?q?...?=` encoded-words in an email display name (before `<`).
140///
141/// Used when applying patches: `git format-patch` emits RFC 2047 in `From:`; the stored
142/// commit author should be the decoded Unicode form.
143#[must_use]
144pub fn decode_rfc2047_mailbox_from_line(from: &str) -> String {
145    let from = from.trim();
146    let Some(lt) = from.find('<') else {
147        return decode_rfc2047_encoded_words(from);
148    };
149    let name = from[..lt].trim();
150    let tail = &from[lt..];
151    let decoded = decode_rfc2047_encoded_words(name);
152    if decoded.is_empty() {
153        tail.trim_start().to_string()
154    } else {
155        format!("{decoded} {tail}")
156    }
157}
158
159fn decode_rfc2047_encoded_words(s: &str) -> String {
160    let mut out = String::new();
161    let mut rest = s;
162    while let Some(start) = rest.find("=?") {
163        out.push_str(&rest[..start]);
164        rest = &rest[start + 2..];
165        let Some(d1) = rest.find('?') else {
166            out.push_str("=?");
167            out.push_str(rest);
168            return out;
169        };
170        let charset = &rest[..d1];
171        let after_cs = &rest[d1 + 1..];
172        let Some(d2) = after_cs.find('?') else {
173            out.push_str("=?");
174            out.push_str(rest);
175            return out;
176        };
177        let encoding = after_cs[..d2].to_ascii_lowercase();
178        let after_enc = &after_cs[d2 + 1..];
179        let Some(end) = after_enc.find("?=") else {
180            out.push_str("=?");
181            out.push_str(rest);
182            return out;
183        };
184        let payload = &after_enc[..end];
185        rest = &after_enc[end + 2..];
186        if encoding == "q" {
187            let bytes = decode_quoted_printable_soft(payload);
188            out.push_str(&decode_bytes(Some(charset), &bytes));
189        } else if encoding == "b" {
190            if let Some(bytes) = base64_decode_rfc2047(payload) {
191                out.push_str(&decode_bytes(Some(charset), &bytes));
192            }
193        }
194    }
195    out.push_str(rest);
196    out
197}
198
/// Decode the payload of an RFC 2047 `Q`-encoded word into raw bytes.
///
/// * `_` becomes a space.
/// * `=XY`, where `X` and `Y` are hex digits (either case), becomes the byte `0xXY`.
/// * A `=` that is not followed by two hex digits is emitted literally and the bytes after
///   it are decoded normally, so malformed input never loses data.
/// * Every other byte is copied through unchanged.
fn decode_quoted_printable_soft(payload: &str) -> Vec<u8> {
    let src = payload.as_bytes();
    let mut out = Vec::with_capacity(src.len());
    let mut i = 0;
    while i < src.len() {
        let b = src[i];
        i += 1;
        match b {
            b'_' => out.push(b' '),
            b'=' => {
                // Only consume the two following bytes when they form a valid escape.
                if let Some(&[hi, lo]) = src.get(i..i + 2) {
                    if let (Some(h), Some(l)) = (hex_nibble(hi), hex_nibble(lo)) {
                        out.push((h << 4) | l);
                        i += 2;
                        continue;
                    }
                }
                out.push(b'=');
            }
            other => out.push(other),
        }
    }
    out
}

/// Value (0..=15) of an ASCII hex digit, or `None` if `b` is not one.
fn hex_nibble(b: u8) -> Option<u8> {
    match b {
        b'0'..=b'9' => Some(b - b'0'),
        b'a'..=b'f' => Some(b - b'a' + 10),
        b'A'..=b'F' => Some(b - b'A' + 10),
        _ => None,
    }
}
230
/// Decode standard-alphabet base64 (`A-Z a-z 0-9 + /`).
///
/// ASCII whitespace is skipped, decoding stops at the first `=`, and any other byte outside
/// the alphabet makes the whole call return `None`. Leftover bits that do not fill a byte
/// are discarded.
fn base64_decode_rfc2047(input: &str) -> Option<Vec<u8>> {
    let mut decoded = Vec::new();
    let mut acc: u32 = 0;
    let mut pending: u32 = 0;
    for symbol in input.bytes() {
        if symbol == b'=' {
            // Padding marks the end of the data.
            break;
        }
        if symbol.is_ascii_whitespace() {
            continue;
        }
        let sextet = match symbol {
            b'A'..=b'Z' => symbol - b'A',
            b'a'..=b'z' => symbol - b'a' + 26,
            b'0'..=b'9' => symbol - b'0' + 52,
            b'+' => 62,
            b'/' => 63,
            _ => return None,
        };
        acc = (acc << 6) | u32::from(sextet);
        pending += 6;
        if pending >= 8 {
            pending -= 8;
            decoded.push((acc >> pending) as u8);
            // Keep only the bits that have not been emitted yet.
            acc &= (1 << pending) - 1;
        }
    }
    Some(decoded)
}
254
255/// Raw `author` / `committer` header payloads for a new commit object.
256///
257/// When `encoding` is unset or UTF-8, returns empty vectors so
258/// [`crate::objects::serialize_commit`] writes the Unicode [`String`] fields as UTF-8.
259/// When `encoding` is non-UTF-8, encodes the full identity lines (name, email, timestamp)
260/// for storage in that charset.
261#[must_use]
262pub fn identity_raw_for_serialized_commit(
263    encoding: &Option<String>,
264    author: &str,
265    committer: &str,
266) -> (Vec<u8>, Vec<u8>) {
267    let is_utf8 = match encoding.as_deref() {
268        None => true,
269        Some(e) => e.eq_ignore_ascii_case("utf-8") || e.eq_ignore_ascii_case("utf8"),
270    };
271    if is_utf8 {
272        return (Vec::new(), Vec::new());
273    }
274    let Some(label) = encoding.as_deref() else {
275        return (Vec::new(), Vec::new());
276    };
277    let author_raw = encode_header_text(label, author).unwrap_or_default();
278    let committer_raw = encode_header_text(label, committer).unwrap_or_default();
279    (author_raw, committer_raw)
280}
281
282/// Unicode commit message body for display (for example, `format-patch`).
283///
284/// Uses `raw_message` when set; otherwise returns `message`.
285#[must_use]
286pub fn commit_message_unicode_for_display(
287    encoding: Option<&str>,
288    message: &str,
289    raw_message: Option<&[u8]>,
290) -> String {
291    if let Some(raw) = raw_message {
292        decode_bytes(encoding, raw)
293    } else {
294        message.to_owned()
295    }
296}