jni/wrapper/java_vm/init_args/
char_encoding_windows.rs

1use super::{char_encoding_generic::*, JvmError};
2use std::{
3    borrow::Cow,
4    convert::TryInto,
5    ffi::{c_int, c_uint, CStr},
6    io,
7    mem::MaybeUninit,
8    ptr,
9};
10use windows_sys::Win32::Globalization as winnls;
11
// The integer type used by `WideCharToMultiByte` for string lengths.
//
// In this file it carries the UTF-16 input length handed to `WideCharToMultiByte`, and the input
// and output lengths handed to `MultiByteToWideChar` by the test-only round-trip helper.
type WSize = c_int;
14
// The type of Windows codepage numbers.
//
// Values of this type are passed as the code page argument of `WideCharToMultiByte` and
// `MultiByteToWideChar`; the value returned by `GetACP` is used as one directly.
type WCodepage = c_uint;
17
// The maximum length, in UTF-8 bytes, of strings that will be accepted for transcoding.
//
// The value is 2^20 bytes (1 MiB). The limit is inclusive: a string of exactly this many bytes is
// accepted, and only strictly longer strings are rejected.
//
// The purpose of this limit is to prevent overflow. `WideCharToMultiByte` behaves rather badly
// (see https://github.com/jni-rs/jni-rs/pull/414 for discussion) if the string is long enough to
// overflow its counters.
//
// Although it is possible to transcode a string of any length by splitting it into smaller
// substrings, the code complexity needed to do so isn't worthwhile just for transcoding JVM
// options. Also, `test_overflow` would take a very long time to run, which was deemed unacceptable
// (see https://github.com/jni-rs/jni-rs/pull/414#issuecomment-1419130483). We set this arbitrary
// limit instead.
const MAX_INPUT_LEN: usize = 1048576;
30
31/// Converts `s` into a `Cow<CStr>` encoded in the specified Windows code page.
32pub(super) fn str_to_cstr_win32<'a>(
33    s: Cow<'a, str>,
34    needed_codepage: WCodepage,
35) -> Result<Cow<'static, CStr>, JvmError> {
36    // First, check if the input string (UTF-8) is too long to transcode. Bail early if so.
37    if s.len() > MAX_INPUT_LEN {
38        return Err(JvmError::OptStringTooLong {
39            opt_string: s.into_owned(),
40        });
41    }
42
43    // This function will generate an error if `WideCharToMultiByte` fails.
44    fn convert_error(s: Cow<str>) -> JvmError {
45        JvmError::OptStringTranscodeFailure {
46            opt_string: s.into_owned(),
47            error: io::Error::last_os_error(),
48        }
49    }
50
51    // Convert the string to UTF-16 first.
52    let s_utf16: Vec<u16> = s.encode_utf16().collect();
53
54    // Determine how long the string is, in UTF-16 units, in the integer type that Win32 expects.
55    // Overflow should be impossible; panic if it happens.
56    let s_utf16_len: WSize = s_utf16
57        .len()
58        .try_into()
59        .expect("UTF-16 form of input string is too long");
60
61    // Decide which flags we're going to use.
62    let conversion_flags = match needed_codepage {
63        // No flags may be given for the following code pages.
64        // https://learn.microsoft.com/en-us/windows/win32/api/stringapiset/nf-stringapiset-widechartomultibyte
65        42
66        | 50220
67        | 50221
68        | 50222
69        | 50225
70        | 50227
71        | 50229
72        | 54936
73        | 57002..=57011
74        | 65000
75        | 65001 => 0,
76
77        _ => winnls::WC_COMPOSITECHECK | winnls::WC_NO_BEST_FIT_CHARS,
78    };
79
80    // Find out how much buffer space will be needed for the output and whether the string is
81    // fully representable.
82    let mut is_non_representable: Option<MaybeUninit<_>> = match needed_codepage {
83        // All characters are representable in UTF-7 and UTF-8, and moreover
84        // `WideCharToMultiByte` will fail if the target encoding is UTF-7 or UTF-8 and this is not
85        // `None`.
86        winnls::CP_UTF7 | winnls::CP_UTF8 => None,
87        _ => Some(MaybeUninit::uninit()),
88    };
89
90    // Safety: `s_utf16.as_ptr()` is a valid pointer to a UTF-16 string, and `s_utf16_len` is its
91    // length. `lpDefaultChar` is null. `lpUsedDefaultChar` is either null or valid. `cbMultiByte`
92    // is zero.
93    let required_buffer_space = unsafe {
94        winnls::WideCharToMultiByte(
95            needed_codepage,
96            conversion_flags,
97            s_utf16.as_ptr(),
98            s_utf16_len,
99            ptr::null_mut(),
100            0,
101            ptr::null(),
102            match &mut is_non_representable {
103                Some(x) => x.as_mut_ptr(),
104                None => ptr::null_mut(),
105            },
106        )
107    };
108
109    // Bail on error.
110    if required_buffer_space == 0 {
111        drop(s_utf16);
112
113        return Err(convert_error(s));
114    }
115
116    // Check if the string is not fully representable.
117    if let Some(is_non_representable) = is_non_representable {
118        // Safety: `is_non_representable` has been initialized by `WideCharToMultiByte`.
119        let is_non_representable = unsafe { is_non_representable.assume_init() };
120
121        if is_non_representable != 0 {
122            drop(s_utf16);
123
124            return Err(JvmError::OptStringNotRepresentable {
125                opt_string: s.into_owned(),
126            });
127        }
128    }
129
130    // Convert the required buffer space to `usize`, and increment it by one for the null
131    // terminator.
132    //
133    // This shouldn't overflow (see the comment on `MAX_INPUT_LEN` above), so we won't check for
134    // overflow here.
135    let required_buffer_space_usize: usize = required_buffer_space as _;
136    let required_buffer_space_usize_with_nul: usize = required_buffer_space_usize + 1;
137
138    // Allocate enough buffer space, including one byte for the null terminator.
139    let mut output = Vec::<u8>::with_capacity(required_buffer_space_usize_with_nul);
140
141    // Perform the actual conversion.
142    //
143    // Safety: `chunk.as_ptr()` is a valid pointer, and `chunk_len_i32` is its length.
144    // `chunk_output_ptr` is a valid pointer, and `required_buffer_space` is its length.
145    // All other raw pointers are null.
146    let used_buffer_space = unsafe {
147        winnls::WideCharToMultiByte(
148            needed_codepage,
149            conversion_flags,
150            s_utf16.as_ptr(),
151            s_utf16_len,
152            output.as_mut_ptr(),
153            required_buffer_space,
154            ptr::null(),
155            ptr::null_mut(),
156        )
157    };
158
159    drop(s_utf16);
160
161    // Bail on error.
162    if used_buffer_space == 0 {
163        drop(output);
164
165        return Err(convert_error(s));
166    }
167
168    let used_buffer_space_usize: usize = used_buffer_space as usize;
169
170    // Set the new length of the output buffer. Don't use `required_buffer_space`, just in case
171    // `WideCharToMultiByte` changes its mind about how much buffer space it's actually going to
172    // use.
173    //
174    // Safety: `used_buffer_space_usize` is the number of bytes that `WideCharToMultiByte` has
175    // just initialized.
176    unsafe {
177        output.set_len(used_buffer_space_usize);
178    }
179
180    // That's it, it's converted. Now turn it into a `CString`. This will add a null terminator if
181    // there isn't one already and check for null bytes in the middle.
182    unsafe { bytes_to_cstr(Cow::Owned(output), Some(s.into())) }
183}
184
185/// Converts `s` into the Windows default character encoding.
186pub(super) fn str_to_cstr_win32_default_codepage<'a>(
187    s: Cow<'a, str>,
188) -> Result<Cow<'a, CStr>, JvmError> {
189    // Get the code page. There is a remote possibility that it is UTF-8. If so, pass the
190    // string through unchanged (other than adding a null terminator). If not, we need to have
191    // Windows convert the string to the expected code page first.
192
193    // Safety: This function isn't actually unsafe.
194    let needed_codepage = unsafe { winnls::GetACP() };
195
196    if needed_codepage == winnls::CP_UTF8 {
197        // The code page is UTF-8! Lucky us.
198        return utf8_to_cstr(s);
199    }
200
201    // The code page is not UTF-8, so do the transcoding.
202    str_to_cstr_win32(s, needed_codepage)
203}
204
/// Transcodes text in an arbitrary Windows codepage into a Rust `String`. Used to test
/// round-tripping.
///
/// `max_expected_utf16_len` is the size, in UTF-16 units, of the scratch buffer given to
/// `MultiByteToWideChar`; the decoded text must fit in it.
///
/// # Errors
///
/// * Returns the OS error if `MultiByteToWideChar` reports failure.
/// * Returns an [`io::ErrorKind::InvalidInput`] error if the input is non-empty but
///   `max_expected_utf16_len` is zero. A zero-sized buffer would put `MultiByteToWideChar` into
///   its "report the required size" mode, in which it writes nothing.
///
/// # Panics
///
/// Panics if the input length does not fit in `WSize`, if `max_expected_utf16_len` is negative,
/// if `MultiByteToWideChar` reports an impossible length, or if it produces invalid UTF-16.
#[cfg(test)]
fn codepage_to_string_win32(
    codepage_string: impl AsRef<[u8]>,
    codepage: WCodepage,
    max_expected_utf16_len: WSize,
) -> io::Result<String> {
    let codepage_string_slice = codepage_string.as_ref();

    // `MultiByteToWideChar` fails on zero-length input, but decoding nothing is well defined.
    if codepage_string_slice.is_empty() {
        return Ok(String::new());
    }

    let codepage_string_slice_len: WSize = codepage_string_slice
        .len()
        .try_into()
        .expect("`codepage_string`'s length is too large to transcode with Win32");

    let buf_capacity: usize = max_expected_utf16_len
        .try_into()
        .expect("expected_utf16_len is negative or exceeds address space");

    // With a zero-sized output buffer, `MultiByteToWideChar` only reports the required size and
    // writes nothing. Treating that return value as "units written" would expose uninitialized
    // memory, so refuse up front.
    if buf_capacity == 0 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "`max_expected_utf16_len` must be greater than zero for non-empty input",
        ));
    }

    let mut buf = Vec::<u16>::with_capacity(buf_capacity);

    // SAFETY: The input pointer and length describe `codepage_string_slice`. The output pointer
    // refers to an allocation of at least `max_expected_utf16_len` UTF-16 units.
    let utf16_units_transcoded = unsafe {
        winnls::MultiByteToWideChar(
            codepage,
            0,
            codepage_string_slice.as_ptr() as *const _,
            codepage_string_slice_len,
            buf.as_mut_ptr(),
            max_expected_utf16_len,
        )
    };

    if utf16_units_transcoded == 0 {
        return Err(io::Error::last_os_error());
    }

    // Validate the reported length before trusting it for `set_len`.
    let utf16_units_transcoded: usize = utf16_units_transcoded
        .try_into()
        .expect("`MultiByteToWideChar` returned a negative length");
    assert!(
        utf16_units_transcoded <= buf.capacity(),
        "`MultiByteToWideChar` reported writing more units than the buffer can hold"
    );

    // SAFETY: `MultiByteToWideChar` claims to have initialized this many UTF-16 units, and the
    // count was checked above to fit within the allocation.
    unsafe {
        buf.set_len(utf16_units_transcoded);
    }

    let string =
        String::from_utf16(buf.as_slice()).expect("`MultiByteToWideChar` generated invalid UTF-16");

    Ok(string)
}
254
/// Basic transcoding checks: UTF-8 pass-through, handling of an input that already ends in a null
/// byte, rejection of characters that Windows-1252 cannot represent, and a successful
/// Windows-1252 conversion.
#[test]
fn test() {
    use assert_matches::assert_matches;

    // Transcoding to UTF-8 yields the same bytes plus a null terminator, in an owned buffer.
    {
        let result = str_to_cstr_win32("Hello, world 😎".into(), winnls::CP_UTF8).unwrap();
        assert_eq!(
            result.to_bytes_with_nul(),
            b"Hello, world \xf0\x9f\x98\x8e\0"
        );
        assert_matches!(result, Cow::Owned(_));
    }

    // An input that already ends in a null byte does not gain a second terminator.
    {
        let result = str_to_cstr_win32("Hello, world 😎\0".into(), winnls::CP_UTF8).unwrap();
        assert_eq!(
            result.to_bytes_with_nul(),
            b"Hello, world \xf0\x9f\x98\x8e\0"
        );
    }

    // The emoji has no Windows-1252 encoding, so the conversion is refused and the original
    // string is handed back inside the error.
    {
        let result = str_to_cstr_win32("Hello, world 😎".into(), 1252).unwrap_err();
        let error_string = assert_matches!(result, JvmError::OptStringNotRepresentable { opt_string } => opt_string);
        assert_eq!(error_string, "Hello, world 😎");
    }

    // U+2122 TRADE MARK SIGN is byte 0x99 in Windows-1252. It is written as an escape so that the
    // literal cannot be corrupted by an encoding mix-up in the source file.
    {
        let result = str_to_cstr_win32("Hello, world\u{2122}".into(), 1252).unwrap();
        assert_eq!(result.to_bytes_with_nul(), b"Hello, world\x99\0");
        assert_matches!(result, Cow::Owned(_));
    }
}
288
/// Exercises `str_to_cstr_win32` at and just beyond `MAX_INPUT_LEN`, for ASCII and non-ASCII
/// input and for the UTF-8, UTF-7, and Windows-1252 code pages.
#[test]
fn test_overflow() {
    use assert_matches::assert_matches;

    // The helpers below deliberately avoid the usual comparison assertions on whole strings: a
    // failure would otherwise dump millions of characters to the console. They compare quietly
    // and panic with a short message instead.

    /// If `error` carries an option string, checks that it equals `expected` and then replaces
    /// it with an empty string so that printing the error stays short.
    #[track_caller]
    fn verify_and_blank_opt_string(expected: &str, error: &mut JvmError) {
        let carried = match error.opt_string_mut() {
            Some(carried) => carried,
            None => return,
        };

        if carried != expected {
            panic!("opt_string was mangled in moving it to an error");
        }

        *carried = String::new();
    }

    /// Unwraps a successful transcoding result, panicking with a compact message otherwise.
    #[track_caller]
    fn expect_success(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) -> Cow<'static, CStr> {
        let mut error = match result {
            Ok(transcoded) => return transcoded,
            Err(error) => error,
        };

        verify_and_blank_opt_string(expected_opt_string, &mut error);
        panic!("unexpected transcoding failure: {}", error)
    }

    /// Like `expect_success`, but additionally requires the output bytes to equal the input.
    #[track_caller]
    fn expect_successful_roundtrip(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) -> Cow<'static, CStr> {
        let transcoded = expect_success(expected_opt_string, result);
        let unchanged = expected_opt_string.as_bytes() == transcoded.to_bytes();
        assert!(
            unchanged,
            "opt_string was transcoded successfully but mangled"
        );
        transcoded
    }

    /// Requires the result to be an `OptStringTooLong` error carrying the original string.
    #[track_caller]
    fn expect_opt_string_too_long(
        expected_opt_string: &str,
        result: Result<Cow<'static, CStr>, JvmError>,
    ) {
        let mut error = match result {
            Ok(transcoded) => {
                assert!(
                    expected_opt_string.as_bytes() == transcoded.to_bytes(),
                    "transcoding unexpectedly succeeded and resulted in mangled output"
                );
                panic!("transcoding unexpectedly succeeded")
            }
            Err(error) => error,
        };

        verify_and_blank_opt_string(expected_opt_string, &mut error);

        assert_matches!(error, JvmError::OptStringTooLong { .. });
    }

    /// Produces the backing storage for a string of `MAX_INPUT_LEN` UTF-8 bytes made of a single
    /// two-byte character repeated. `utf8_pair` holds the character's two UTF-8 bytes in
    /// big-endian order. Filling by pairs is much faster than the naïve character-by-character
    /// approach (at least unless some future Rust compiler performs this optimization on its
    /// own, but 1.66 doesn't).
    fn repeated_two_byte_char(utf8_pair: u16) -> Vec<u16> {
        vec![u16::from_be(utf8_pair); MAX_INPUT_LEN / 2]
    }

    // Part 1: plain ASCII.
    {
        // Build a string that is exactly one byte over the maximum allowed length.
        let mut ascii = "H".repeat(MAX_INPUT_LEN.checked_add(1).unwrap());

        // One byte too long, so transcoding must be refused.
        expect_opt_string_too_long(
            &ascii,
            str_to_cstr_win32(ascii.as_str().into(), winnls::CP_UTF8),
        );

        // Trim it down to exactly the limit…
        assert_eq!(ascii.pop(), Some('H'));

        // …and now it must transcode and come back unchanged.
        expect_successful_roundtrip(
            &ascii,
            str_to_cstr_win32(ascii.as_str().into(), winnls::CP_UTF8),
        );
    }

    // Part 2: non-ASCII, to UTF-8 and UTF-7.
    {
        // U+07FF is the highest code point that can be represented in UTF-8 with only two bytes;
        // its UTF-8 encoding is `df bf`.
        let storage = repeated_two_byte_char(0xdfbf);
        let text: &str = std::str::from_utf8(bytemuck::cast_slice(storage.as_slice())).unwrap();

        // At exactly the limit, this must transcode to UTF-8 without overflow.
        expect_successful_roundtrip(text, str_to_cstr_win32(text.into(), winnls::CP_UTF8));

        // It must work with UTF-7 as well. This is the real reason for choosing U+07FF: we need
        // to check that the highest code point that fits under the limit will not overflow even
        // with the worst-case code page.
        {
            let as_utf7 = expect_success(text, str_to_cstr_win32(text.into(), winnls::CP_UTF7));

            // *And* the UTF-7 form must decode back to the original text.
            let expected_units = (text.len() / 2).try_into().unwrap();
            let decoded: String =
                codepage_to_string_win32(as_utf7.to_bytes(), winnls::CP_UTF7, expected_units)
                    .unwrap();

            assert!(decoded == text, "didn't roundtrip via UTF-7");
        }
    }

    // Part 3: Windows-1252. This is the slowest part of the test (`WideCharToMultiByte` is very
    // slow at this, for some reason), so it's done last.
    {
        // `c2 ae` is the UTF-8 encoding repeated here; every occurrence is expected to become
        // the single byte `ae`.
        let storage = repeated_two_byte_char(0xc2ae);
        let text: &str = std::str::from_utf8(bytemuck::cast_slice(storage.as_slice())).unwrap();

        let as_cp1252 = expect_success(text, str_to_cstr_win32(text.into(), 1252));

        let all_expected_byte = as_cp1252.to_bytes().iter().all(|&byte| byte == 0xae);
        assert!(
            all_expected_byte,
            "string didn't transcode to Windows-1252 properly"
        );
    }
}
441        );
442    }
443}