1pub mod ffi {
2 use libc::{c_char, c_int, c_void, size_t};
6
7 #[allow(non_camel_case_types)]
8 pub type iconv_t = *mut c_void;
9 #[cfg_attr(windows, link(name = "iconv"))]
10 extern "C" {
11 #[cfg_attr(windows, link_name = "libiconv_open")]
12 pub fn iconv_open(__tocode: *const c_char, __fromcode: *const c_char) -> iconv_t;
13 #[cfg_attr(windows, link_name = "libiconv")]
14 pub fn iconv(
15 __cd: iconv_t,
16 __inbuf: *mut *mut c_char,
17 __inbytesleft: *mut size_t,
18 __outbuf: *mut *mut c_char,
19 __outbytesleft: *mut size_t,
20 ) -> size_t;
21 #[cfg_attr(windows, link_name = "libiconv_close")]
22 pub fn iconv_close(__cd: iconv_t) -> c_int;
23 }
24 }
26
27use libc::size_t;
28use std::io::{BufRead, Read, Write};
29
30use dyn_buf::VecBuf;
31
/// Value handed to `VecBuf::new` for every buffer this file creates.
// NOTE(review): presumably the minimum writable chunk size of a `VecBuf` —
// confirm against the `dyn_buf` documentation.
const MIN_WRITE: usize = 4096;
33
/// Owning wrapper around a raw conversion descriptor.
///
/// The descriptor is created in `Iconv::new` and closed by the `Drop`
/// implementation further down in this file.
pub struct Iconv {
    /// Raw descriptor returned by `ffi::iconv_open`.
    cd: ffi::iconv_t,
}
38
/// Failures reported by the conversion routines in this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IconvError {
    /// The requested pair of encodings cannot be converted.
    ConversionNotSupport,
    /// Any other failure, carrying the raw OS error number.
    OsError(i32),
    /// The input ends in the middle of a multibyte sequence.
    IncompleteInput,
    /// The input contains a sequence that is invalid in the source encoding.
    InvalidInput,
    /// The output buffer has no room for the next converted character.
    NotSufficientOutput,
}

impl IconvError {
    /// Converts this error into a [`std::io::Error`].
    ///
    /// `OsError` becomes a raw OS error; every other variant is wrapped as
    /// the payload of an `io::Error` with the following kind:
    ///
    /// * `ConversionNotSupport` → `Unsupported`
    /// * `InvalidInput` → `InvalidData`
    /// * `NotSufficientOutput`, `IncompleteInput` → `InvalidInput`
    pub fn into_io_error(self) -> std::io::Error {
        use std::io::{Error, ErrorKind};

        let kind = match self {
            IconvError::OsError(code) => return Error::from_raw_os_error(code),
            IconvError::ConversionNotSupport => ErrorKind::Unsupported,
            IconvError::InvalidInput => ErrorKind::InvalidData,
            IconvError::NotSufficientOutput | IconvError::IncompleteInput => {
                ErrorKind::InvalidInput
            }
        };
        Error::new(kind, self)
    }
}

/// Lets `?` lift an [`IconvError`] into an `io::Result` without an explicit
/// `map_err`; the mapping is exactly [`IconvError::into_io_error`].
impl From<IconvError> for std::io::Error {
    fn from(err: IconvError) -> Self {
        err.into_io_error()
    }
}

impl std::fmt::Display for IconvError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            // OS errors reuse the platform's own description.
            IconvError::OsError(code) => {
                return write!(f, "{}", std::io::Error::from_raw_os_error(*code));
            }
            IconvError::ConversionNotSupport => {
                "The conversion is not supported by the implementation"
            }
            IconvError::NotSufficientOutput => "There is not sufficient room in the output",
            IconvError::InvalidInput => {
                "An invalid multibyte sequence has been encountered in the input"
            }
            IconvError::IncompleteInput => {
                "An incomplete multibyte sequence has been encountered in the input"
            }
        };
        f.write_str(message)
    }
}

impl std::error::Error for IconvError {}
89
90pub fn iconv(input: &[u8], from_encoding: &str, to_encoding: &str) -> Result<Vec<u8>, IconvError> {
92 let mut c = Iconv::new(from_encoding, to_encoding)?;
93 let mut read = 0;
94 let mut output = VecBuf::new(MIN_WRITE);
95 loop {
96 match c.convert(&input[read..], output.prepare_at_least(0)) {
97 Ok((r, w, _)) => {
98 output.commit(w);
99 if read >= input.len() {
100 return Ok(output.into_vec());
101 }
102 read += r;
103 }
104 Err((r, w, IconvError::NotSufficientOutput)) => {
105 output.commit(w);
106 read += r;
107 output.grow(0);
108 }
109 Err((_, _, e)) => return Err(e),
110 }
111 }
112}
113
114pub fn encode(input: &str, encoding: &str) -> Result<Vec<u8>, IconvError> {
116 iconv(input.as_bytes(), "UTF-8", encoding)
117}
118
119pub fn decode(input: &[u8], encoding: &str) -> Result<String, IconvError> {
121 iconv(input, encoding, "UTF-8").map(|v| unsafe { String::from_utf8_unchecked(v) })
122}
123
124pub fn copy<R: Read, W: Write>(
125 input: R,
126 mut output: W,
127 from_encoding: &str,
128 to_encoding: &str,
129) -> std::io::Result<usize> {
130 let mut cr =
131 IconvReader::new(input, from_encoding, to_encoding).map_err(|e| e.into_io_error())?;
132 let mut w = 0;
133 loop {
134 let v = cr.fill_buf()?;
135 output.write_all(v)?;
136 let n = v.len();
137 cr.consume(n);
138 w += n;
139 if n == 0 {
140 return Ok(w);
141 }
142 }
143}
144
145impl Iconv {
146 pub fn new(from_encoding: &str, to_encoding: &str) -> Result<Iconv, IconvError> {
148 use std::ffi::CString;
149 let from_code = CString::new(from_encoding).unwrap();
150 let to_code = CString::new(to_encoding).unwrap();
151
152 let handle = unsafe { ffi::iconv_open(to_code.as_ptr(), from_code.as_ptr()) };
153 if handle as isize == -1 {
154 let e = std::io::Error::last_os_error().raw_os_error().unwrap();
155 return Err(if e == libc::EINVAL {
156 IconvError::ConversionNotSupport
157 } else {
158 IconvError::OsError(e)
159 });
160 }
161 Ok(Iconv { cd: handle })
162 }
163
164 pub fn reset(&mut self) {
166 use std::ptr::null_mut;
167 unsafe { ffi::iconv(self.cd, null_mut(), null_mut(), null_mut(), null_mut()) };
168 }
169
170 pub fn convert(
174 &mut self,
175 input: &[u8],
176 output: &mut [u8],
177 ) -> Result<(usize, usize, usize), (usize, usize, IconvError)> {
178 let input_left = input.len() as size_t;
179 let output_left = output.len() as size_t;
180
181 let input_ptr = input.as_ptr();
182 let output_ptr = output.as_ptr();
183
184 use std::mem::transmute;
185 let chars = unsafe {
186 ffi::iconv(
187 self.cd,
188 if input.is_empty() {
189 std::ptr::null_mut()
190 } else {
191 transmute(&input_ptr)
192 },
193 transmute(&input_left),
194 transmute(&output_ptr),
195 transmute(&output_left),
196 )
197 };
198 let bytes_read = input.len() - input_left as usize;
199 let bytes_written = output.len() - output_left as usize;
200
201 if chars as isize != -1 {
202 Ok((bytes_read, bytes_written, chars as usize))
203 } else {
204 let errno = std::io::Error::last_os_error().raw_os_error().unwrap();
205 Err((
206 bytes_read,
207 bytes_written,
208 match errno {
209 libc::E2BIG => IconvError::NotSufficientOutput,
210 libc::EINVAL => IconvError::IncompleteInput,
211 libc::EILSEQ => IconvError::InvalidInput,
212 _ => IconvError::OsError(errno),
213 },
214 ))
215 }
216 }
217}
218
219impl Drop for Iconv {
220 fn drop(&mut self) {
221 unsafe { ffi::iconv_close(self.cd) };
222 }
223}
224
225pub struct IconvReader<R: Read> {
226 iconv: Iconv,
227 reader: R,
228 input: VecBuf,
229 output: VecBuf,
230}
231
232impl<R: Read> IconvReader<R> {
233 pub fn new(reader: R, from_encoding: &str, to_encoding: &str) -> Result<Self, IconvError> {
234 let iconv = Iconv::new(from_encoding, to_encoding)?;
235 Ok(Self {
236 iconv,
237 reader,
238 input: VecBuf::new(MIN_WRITE),
239 output: VecBuf::new(MIN_WRITE),
240 })
241 }
242
243 pub fn into_inner(self) -> R {
244 self.reader
245 }
246}
247
248pub struct IconvWriter<W: Write> {
249 iconv: Iconv,
250 writer: W,
251 input: VecBuf,
252 output: VecBuf,
253}
254
255impl<W: Write> IconvWriter<W> {
256 pub fn new(writer: W, from_encoding: &str, to_encoding: &str) -> Result<Self, IconvError> {
257 let iconv = Iconv::new(from_encoding, to_encoding)?;
258 Ok(Self {
259 iconv,
260 writer,
261 input: VecBuf::new(MIN_WRITE),
262 output: VecBuf::new(MIN_WRITE),
263 })
264 }
265
266 pub fn into_inner(self) -> W {
267 self.writer
268 }
269}
270
271impl<R: Read> Read for IconvReader<R> {
272 fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
273 let mut wrote = 0;
274 loop {
275 let n = self.reader.read(self.input.prepare_at_least(0))?;
276 self.input.commit(n);
277
278 match self.iconv.convert(self.input.data(), &mut buf[wrote..]) {
279 Ok((r, w, _)) => {
280 self.input.consume(r);
281 wrote += w;
282 return Ok(wrote);
283 }
284 Err((r, w, e @ IconvError::NotSufficientOutput)) => {
285 self.input.consume(r);
286 wrote += w;
287 return if wrote > 0 {
288 Ok(wrote)
289 } else {
290 Err(e.into_io_error())
291 };
292 }
293 Err((r, w, e @ IconvError::IncompleteInput)) => {
294 self.input.consume(r);
295 wrote += w;
296 if n == 0 {
297 return if wrote > 0 {
298 Ok(wrote)
299 } else {
300 Err(e.into_io_error())
301 };
302 }
303 }
304 Err((_, _, e)) => return Err(e.into_io_error()),
305 }
306 }
307 }
308}
309
310impl<R: Read> BufRead for IconvReader<R> {
311 fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
312 if self.output.is_empty() {
313 let mut o = std::mem::take(&mut self.output);
314 let n = self.read(o.prepare_at_least(0))?;
315 o.commit(n);
316 let _ = std::mem::replace(&mut self.output, o);
317 }
318 Ok(self.output.data())
319 }
320
321 fn consume(&mut self, amt: usize) {
322 self.output.consume(amt)
323 }
324}
325
326impl<W: Write> Write for IconvWriter<W> {
327 fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
328 if self.input.is_empty() {
329 match self.iconv.convert(buf, self.output.prepare_at_least(0)) {
330 Ok((r, w, _)) | Err((r, w, IconvError::IncompleteInput)) => {
331 self.output.commit(w);
332
333 let n = self.writer.write(self.output.data())?;
334 self.output.consume(n);
335
336 Ok(r)
337 }
338 Err((_, _, e)) => Err(e.into_io_error()),
339 }
340 } else {
341 self.input.write_all(buf);
342
343 match self
344 .iconv
345 .convert(self.input.data(), self.output.prepare_at_least(0))
346 {
347 Ok((r, w, _)) | Err((r, w, IconvError::IncompleteInput)) => {
348 self.input.consume(r);
349 self.output.commit(w);
350
351 let n = self.writer.write(self.output.data())?;
352 self.output.consume(n);
353
354 Ok(buf.len())
355 }
356 Err((_, _, e)) => Err(e.into_io_error()),
357 }
358 }
359 }
360
361 fn flush(&mut self) -> std::io::Result<()> {
362 let _ = self.write(&[])?;
363
364 if !self.input.is_empty() {
365 return Err(IconvError::IncompleteInput.into_io_error());
366 }
367 let b = self.output.data();
368 self.writer.write_all(b)?;
369 let n = b.len();
370 self.output.consume(n);
371 self.writer.flush()
372 }
373
374 fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
375 let w = self.write(buf)?;
376 if w < buf.len() {
377 self.input.write_all(&buf[w..]);
378 }
379 Ok(())
380 }
381}
382
#[cfg(test)]
mod test {
    use std::{
        io,
        io::{BufReader, Read},
        iter,
    };

    use super::*;

    /// Two-character unit repeated throughout the streaming fixtures.
    const UNIT: &str = "噗哈";
    /// GBK encoding of [`UNIT`].
    const UNIT_GBK: [u8; 4] = [224, 219, 185, 254];
    /// GBK encoding of "哈哈".
    const HAHA_GBK: [u8; 4] = [0xb9, 0xfe, 0xb9, 0xfe];

    /// Builds "0噗哈1噗哈…" for `count` indices as UTF-8 text together with
    /// the GBK bytes the converters are expected to produce for it.
    fn fixture(count: u32) -> (String, Vec<u8>) {
        let mut utf8 = String::new();
        let mut gbk = Vec::new();
        for index in 0..count {
            let digits = index.to_string();
            utf8.push_str(&digits);
            utf8.push_str(UNIT);
            gbk.extend_from_slice(digits.as_bytes());
            gbk.extend_from_slice(&UNIT_GBK);
        }
        (utf8, gbk)
    }

    #[test]
    fn test_reader() {
        let (input, gbk) = fixture(1024);
        let source = BufReader::new(input.as_bytes());
        let mut cr = IconvReader::new(source, "UTF-8", "GBK").unwrap();

        let mut nread = 0;
        let mut k = 0;
        loop {
            // Vary the destination size between reads.
            k = (k + 1) % 10 + 1;
            let mut buf = [0u8; 11];
            let res = cr.read(&mut buf[..k]);
            println!("{:?}", res);
            match res {
                Ok(0) => {
                    assert_eq!(nread, gbk.len());
                    return;
                }
                Ok(n) => {
                    assert_eq!(&buf[..n], &gbk[nread..nread + n]);
                    nread += n;
                }
                Err(ref e) if e.kind() == io::ErrorKind::InvalidInput => return,
                _ => unreachable!(),
            }
        }
    }

    #[test]
    fn test_buf_reader() {
        let (input, gbk) = fixture(102400);
        let source = BufReader::new(input.as_bytes());
        let mut cr = IconvReader::new(source, "UTF-8", "GBK").unwrap();

        let mut nread = 0;
        loop {
            let chunk = cr.fill_buf().unwrap();
            let n = chunk.len();
            println!("{} {}", nread, n);
            if chunk.is_empty() {
                assert_eq!(nread, gbk.len());
                break;
            }

            assert_eq!(chunk, &gbk[nread..nread + n]);
            nread += n;

            cr.consume(n);
        }
    }

    #[test]
    fn test_copy() {
        let (input, gbk) = fixture(102400);
        let source = BufReader::new(input.as_bytes());

        let mut output = vec![];
        let written = copy(source, std::io::BufWriter::new(&mut output), "UTF-8", "GBK").unwrap();
        assert_eq!(written, output.len());
        assert_eq!(output, gbk);
    }

    #[test]
    fn test_writer() {
        let mut writer = IconvWriter::new(vec![], "UTF-8", "GBK").unwrap();
        let mut gbk: Vec<u8> = Vec::new();
        for index in 0..102400 {
            let digits = index.to_string();
            writer.write_all(digits.as_bytes()).unwrap();
            writer.write_all(UNIT.as_bytes()).unwrap();
            gbk.extend_from_slice(digits.as_bytes());
            gbk.extend_from_slice(&UNIT_GBK);
        }

        assert_eq!(&writer.into_inner(), &gbk);
    }

    #[test]
    fn test_encoder_normal() {
        assert!(encode("", "LATIN1").unwrap().is_empty());

        let a = "哈哈";
        assert_eq!(encode(a, "GBK").unwrap(), HAHA_GBK.to_vec());

        let b: String = iter::repeat(a).take(1024).collect();
        let encoded = encode(&b, "GBK").unwrap();
        for ch in encoded.chunks(4) {
            assert_eq!(ch, &HAHA_GBK[..]);
        }

        let c = vec![0xe5, 0x93, 0x88, 0xe5, 0x93, 0x88];
        assert_eq!(iconv(&c, "UTF-8", "GBK").unwrap(), HAHA_GBK.to_vec());
    }

    #[test]
    fn test_encoder_fail_creating_converter() {
        assert!(decode("".as_bytes(), "NOT_EXISTS").is_err());
    }

    #[test]
    fn test_encoder_ilseq() {
        let a = vec![0xff, 0xff, 0xff];
        let err = decode(&a, "GBK").unwrap_err();
        assert!(matches!(err, IconvError::InvalidInput));
    }

    #[test]
    fn test_encoder_invalid() {
        let a = vec![0xe5, 0x93, 0x88, 0xe5, 0x88];
        let err = decode(&a, "GBK").unwrap_err();
        assert!(matches!(err, IconvError::IncompleteInput));
    }

    #[test]
    fn test_decoder_normal() {
        let empty: Vec<u8> = Vec::new();
        assert_eq!(decode(&empty[..], "CP936").unwrap(), "".to_string());

        let a = vec![0xb9, 0xfe, 0xb9, 0xfe];
        assert_eq!(decode(&a, "GBK").unwrap(), "哈哈".to_string());
    }

    #[test]
    fn test_decoder_fail_creating_converter() {
        let empty: Vec<u8> = Vec::new();
        let err = decode(&empty[..], "NOT_EXSITS").unwrap_err();
        assert!(matches!(err, IconvError::ConversionNotSupport));
    }

    #[test]
    fn test_decoder_ilseq() {
        let a = vec![0xff, 0xff, 0xff];
        let err = decode(&a, "GBK").unwrap_err();
        assert!(matches!(err, IconvError::InvalidInput));
    }

    #[test]
    fn test_decoder_invalid() {
        let a = vec![0xb9, 0xfe, 0xb9];
        let err = decode(&a, "GBK").unwrap_err();
        assert!(matches!(err, IconvError::IncompleteInput));
    }

    #[test]
    fn test_caocao_joke() {
        // The same bytes read as "曹操" in BIG5 and as "变巨" in GBK.
        let big5 = encode("曹操", "BIG5").unwrap();
        let gbk = encode("变巨", "GBK").unwrap();
        assert_eq!(big5, gbk);
    }
}