sciter/
utf.rs

1//! UTF-8 <> UTF-16 conversion support.
2
// Since Rust lacked stable UTF-16 support when this was written,
// this code was ported from the Sciter SDK (aux-cvt.h).
5
6// (C) 2003-2015, Andrew Fedoniouk (andrew@terrainformatica.com)
7
8
9#![allow(dead_code)]
10
11use std::ffi::{CStr, CString};
12use capi::sctypes::{LPCSTR, LPCWSTR, LPCBYTE};
13
14
/// UTF-8 to UTF-16 converter.
///
/// Decodes the bytes of `utf` and appends the resulting UTF-16 code units to
/// `outbuf`. Conversion stops at the first NUL byte or at the end of the slice.
/// A byte order mark (U+FEFF) decoded while `outbuf` is still empty is dropped.
///
/// Malformed input never panics:
///
/// * an invalid lead byte is replaced with `'?'` and decoding continues;
/// * a 4-byte sequence that decodes to a value outside `U+10000..=U+10FFFF`
///   is replaced with `'?'` and decoding continues;
/// * a multi-byte sequence cut short by the end of the slice emits `'?'`
///   and ends the conversion.
///
/// Continuation bytes are not checked for the `10xxxxxx` bit pattern.
///
/// Returns `true` if the input was converted without any of the errors above.
fn towcs(utf: &[u8], outbuf: &mut Vec<u16>) -> bool
{
	const ERRC: u16 = 0x003F; // '?'
	let mut num_errors = 0;

	let last = utf.len();
	let mut pc = 0;
	while pc < last {
		let mut b = u32::from(utf[pc]); pc += 1;
		if b == 0 { break; }

		// `pc <= last` always holds here, so this subtraction cannot underflow
		// (unlike comparing against `last - 1` / `last - 2`).
		let remaining = last - pc;

		if (b & 0x80) == 0 {
			// 1-BYTE sequence: 000000000xxxxxxx = 0xxxxxxx

		} else if (b & 0xE0) == 0xC0 {
			// 2-BYTE sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
			if remaining < 1 {
				outbuf.push(ERRC);
				num_errors += 1;
				break;
			}

			b = (b & 0x1f) << 6;
			b |= u32::from(utf[pc]) & 0x3f; pc += 1;

		} else if (b & 0xf0) == 0xe0 {
			// 3-BYTE sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
			if remaining < 2 {
				outbuf.push(ERRC);
				num_errors += 1;
				break;
			}

			b = (b & 0x0f) << 12;
			b |= (u32::from(utf[pc]) & 0x3f) << 6; pc += 1;
			b |= u32::from(utf[pc]) & 0x3f; pc += 1;

			if b == 0xFEFF && outbuf.is_empty() { // bom at start
				continue; // skip it
			}

		} else if (b & 0xf8) == 0xf0 {
			// 4-BYTE sequence: 110110wwwwzzzzyy + 110111yyyyxxxxxx = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
			if remaining < 3 {
				outbuf.push(ERRC);
				num_errors += 1;
				break;
			}

			b = (b & 0x07) << 18;
			b |= (u32::from(utf[pc]) & 0x3f) << 12; pc += 1;
			b |= (u32::from(utf[pc]) & 0x3f) << 6; pc += 1;
			b |= u32::from(utf[pc]) & 0x3f; pc += 1;

			// Only U+10000..=U+10FFFF can be expressed as a surrogate pair;
			// anything else is an overlong or out-of-range encoding.
			if b < 0x1_0000 || b > 0x10_FFFF {
				outbuf.push(ERRC);
				num_errors += 1;
				continue;
			}

			// 0xd7c0 == 0xd800 - (0x10000 >> 10): subtracts the plane offset
			// and adds the high-surrogate base in one step.
			outbuf.push((0xd7c0 + (b >> 10)) as u16);
			outbuf.push((0xdc00 | (b & 0x3ff)) as u16);
			// The pair is complete; do not fall through to the single-unit push below.
			continue;

		} else {
			num_errors += 1;
			b = u32::from(ERRC);
		}

		// Every branch that reaches this point holds a value that fits in 16 bits.
		outbuf.push(b as u16);
	}
	num_errors == 0
}
87
88
/// UTF-16 to UTF-8 converter.
///
/// Decodes the code units in `wcs` and appends their UTF-8 encoding to `outbuf`.
/// A high/low surrogate pair is combined into one code point and written as a
/// single 4-byte sequence. The whole slice is converted; a 0 code unit is
/// encoded as a 0 byte rather than treated as a terminator.
///
/// An unpaired surrogate cannot be represented in UTF-8: it is replaced with
/// `'?'` and counted as an error.
///
/// Returns `true` if the input contained no unpaired surrogates.
fn fromwcs(wcs: &[u16], outbuf: &mut Vec<u8>) -> bool
{
	const ERRC: u8 = 0x3F; // '?'
	let mut num_errors = 0;

	// Scratch space for one encoded character (UTF-8 needs at most 4 bytes).
	let mut utf8 = [0u8; 4];
	for decoded in ::std::char::decode_utf16(wcs.iter().cloned()) {
		match decoded {
			Ok(ch) => outbuf.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes()),
			Err(_) => {
				outbuf.push(ERRC);
				num_errors += 1;
			}
		}
	}
	num_errors == 0
}
124
125
126/// UTF-16 string length like `libc::wcslen`.
127fn wcslen(sz: LPCWSTR) -> usize
128{
129	if sz.is_null() {
130		return 0;
131	}
132	let mut i: isize = 0;
133	loop {
134		let c = unsafe { *sz.offset(i) };
135		if c == 0 {
136			break;
137		}
138		i += 1;
139	}
140	return i as usize;
141}
142
143/// UTF8 to Rust string conversion. See also [`s2u!`](../macro.s2u.html).
144pub fn u2s(sz: LPCSTR) -> String
145{
146	if sz.is_null() {
147		return String::new();
148	}
149	let cs = unsafe { CStr::from_ptr(sz) };
150	let cow = cs.to_string_lossy();
151	return cow.into_owned();
152}
153
154/// UTF8 to Rust string conversion. See also [`s2u!`](../macro.s2u.html).
155pub fn u2sn(sz: LPCSTR, len: usize) -> String
156{
157	if sz.is_null() || len == 0 {
158		return String::new();
159	}
160	let chars = unsafe { ::std::slice::from_raw_parts(sz as LPCBYTE, len) };
161	let s = String::from_utf8_lossy(chars).into_owned();
162	return s;
163}
164
165/// UTF-16 to Rust string conversion. See also [`s2w!`](../macro.s2w.html).
166pub fn w2s(sz: LPCWSTR) -> String
167{
168	return w2sn(sz, wcslen(sz));
169}
170
171/// UTF-16 to Rust string conversion. See also [`s2w!`](../macro.s2w.html).
172pub fn w2sn(sz: LPCWSTR, len: usize) -> String
173{
174	if sz.is_null() || len == 0 {
175		return String::new();
176	}
177	let chars = unsafe { ::std::slice::from_raw_parts(sz, len) };
178	let s = String::from_utf16_lossy(chars);
179	return s;
180}
181
/// Rust string to UTF-8 conversion.
///
/// Returns a NUL-terminated copy of `s` together with its length in bytes
/// (terminator excluded).
///
/// A C string cannot carry an interior NUL. If `s` contains one, the result
/// holds only the text before the first NUL — which is all a consumer reading
/// up to the terminator would see — instead of panicking.
pub fn s2un(s: &str) -> (CString, u32) {
	// `split` always yields at least one (possibly empty) piece.
	let head = s.split('\0').next().unwrap_or("");
	let cs = CString::new(head).expect("text before the first NUL contains no NUL");
	// NOTE: the cast wraps for inputs of 4 GiB or more.
	let n = cs.as_bytes().len() as u32;
	(cs, n)
}
188
189/// Rust string to UTF-16 conversion.
190pub fn s2vec(s: &str) -> Vec<u16> {
191	s2vecn(s).0
192}
193
/// Rust string to UTF-16 conversion.
///
/// Returns the UTF-16 code units of `s` followed by a terminating 0, together
/// with the number of code units before the terminator.
///
/// * Characters outside the BMP are encoded as a surrogate pair (two units).
/// * Conversion stops at the first NUL character instead of panicking.
/// * A leading byte order mark (U+FEFF) is dropped.
/// * An empty result is returned as an empty vector **without** a terminator
///   and a count of 0.
pub fn s2vecn(s: &str) -> (Vec<u16>, u32) {
	// A `char` never needs more UTF-16 units than UTF-8 bytes, so `s.len()`
	// bounds the unit count; the extra slot is for the terminator.
	let mut out = Vec::with_capacity(s.len() + 1);
	out.extend(
		s.encode_utf16()
			.take_while(|&unit| unit != 0)
			.skip_while(|&unit| unit == 0xFEFF),
	);
	// NOTE: the cast wraps for results of 2^32 code units or more.
	let n = out.len() as u32;
	if n > 0 {
		out.push(0);
	}
	(out, n)
}
205
206use capi::sctypes::{UINT, LPVOID};
207
208/// Convert an incoming UTF-16 to `String`.
209pub(crate) extern "system" fn store_wstr(szstr: LPCWSTR, str_length: UINT, param: LPVOID) {
210	let s = self::w2sn(szstr, str_length as usize);
211	let out = param as *mut String;
212	unsafe { *out = s };
213}
214
215/// Convert an incoming UTF-8 to `String`.
216pub(crate) extern "system" fn store_astr(szstr: LPCSTR,  str_length: UINT, param: LPVOID) {
217	let s = self::u2sn(szstr, str_length as usize);
218	let out = param as *mut String;
219	unsafe { *out = s };
220}
221
222/// Convert an incoming html string (UTF-8 in fact) to `String`.
223pub(crate) extern "system" fn store_bstr(szstr: LPCBYTE, str_length: UINT, param: LPVOID) {
224	if szstr.is_null() || str_length == 0 {
225		return;
226	}
227	let s = unsafe { ::std::slice::from_raw_parts(szstr, str_length as usize) };
228	let pout = param as *mut Vec<u8>;
229	let out = unsafe {&mut *pout};
230	out.extend_from_slice(s);
231}
232
233
#[cfg(test)]
mod tests {
	#![allow(unused_imports)]

	use std::ffi::{CStr, CString};
	use capi::sctypes::{LPCWSTR, LPCSTR};
	use super::{wcslen, u2s, w2s, s2vec};

	/// `wcslen` accepts a null pointer and counts units up to the terminator.
	#[test]
	fn test_wcslen() {
		let null: LPCWSTR = ::std::ptr::null();
		assert_eq!(wcslen(null), 0);

		let terminator_only: Vec<u16> = vec![0];
		assert_eq!(wcslen(terminator_only.as_ptr()), 0);

		let two_spaces: Vec<u16> = vec![32, 32, 0];
		assert_eq!(wcslen(two_spaces.as_ptr()), 2);
	}

	/// `u2s` maps a null pointer to an empty string and copies C strings.
	#[test]
	fn test_u2s() {
		let null: LPCSTR = ::std::ptr::null();
		assert_eq!(u2s(null), String::new());

		let expected = "hi, there";
		let c_text = CString::new(expected).unwrap();
		assert_eq!(u2s(c_text.as_ptr()), expected);
	}

	/// `w2s` maps a null pointer to an empty string and copies wide strings.
	#[test]
	fn test_w2s() {
		let null: LPCWSTR = ::std::ptr::null();
		assert_eq!(w2s(null), String::new());

		// Two spaces followed by the terminator.
		let two_spaces: Vec<u16> = vec![32, 32, 0];
		assert_eq!(w2s(two_spaces.as_ptr()), "  ");
	}

	/// `s2vec` terminates non-empty results and leaves empty ones empty.
	#[test]
	fn s2w_test() {
		let cases: [(&str, &[u16]); 3] = [
			("", &[]),
			("A", &['A' as u16, 0]),
			("AB", &['A' as u16, 'B' as u16, 0]),
		];
		for &(text, expected) in cases.iter() {
			assert_eq!(s2vec(text), expected);
		}

		let (buf, len) = s2wn!("");
		assert_eq!(len, 0);
		assert_eq!(buf, []);
	}
}