iconv_compat_win_sys/
lib.rs

1pub mod ffi {
2    /* automatically generated by rust-bindgen and then manually modified :P
3     * $ bindgen /usr/include/iconv.h
4     */
5    use libc::{c_char, c_int, c_void, size_t};
6
7    #[allow(non_camel_case_types)]
8    pub type iconv_t = *mut c_void;
9    #[cfg_attr(windows, link(name = "iconv"))]
10    extern "C" {
11        #[cfg_attr(windows, link_name = "libiconv_open")]
12        pub fn iconv_open(__tocode: *const c_char, __fromcode: *const c_char) -> iconv_t;
13        #[cfg_attr(windows, link_name = "libiconv")]
14        pub fn iconv(
15            __cd: iconv_t,
16            __inbuf: *mut *mut c_char,
17            __inbytesleft: *mut size_t,
18            __outbuf: *mut *mut c_char,
19            __outbytesleft: *mut size_t,
20        ) -> size_t;
21        #[cfg_attr(windows, link_name = "libiconv_close")]
22        pub fn iconv_close(__cd: iconv_t) -> c_int;
23    }
24    /* automatically generated ends */
25}
26
27use libc::size_t;
28use std::io::{BufRead, Read, Write};
29
30use dyn_buf::VecBuf;
31
/// Size argument (4096) passed to every `VecBuf::new` call in this file.
/// NOTE(review): presumably the minimum writable chunk `VecBuf` keeps
/// available — confirm against the `dyn_buf` crate.
const MIN_WRITE: usize = 4096;
33
/// An iconv converter: an owned conversion descriptor.
///
/// Built by `Iconv::new`, which obtains the descriptor from
/// `ffi::iconv_open`; the `Drop` impl closes it with `ffi::iconv_close`.
pub struct Iconv {
    /// Raw descriptor returned by `ffi::iconv_open`.
    cd: ffi::iconv_t,
}
38
/// Errors reported while opening a converter or converting data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IconvError {
    /// The requested conversion pair is not supported (`iconv_open` failed
    /// with `EINVAL`).
    ConversionNotSupport,
    /// Any other failure, carrying the raw OS error number.
    OsError(i32),
    /// The input ends in the middle of a multibyte sequence (`EINVAL` from
    /// `iconv`).
    IncompleteInput,
    /// The input contains an invalid multibyte sequence (`EILSEQ`).
    InvalidInput,
    /// The output buffer has no room for the next converted character
    /// (`E2BIG`).
    NotSufficientOutput,
}
47
48impl IconvError {
49    pub fn into_io_error(self) -> std::io::Error {
50        match self {
51            IconvError::OsError(e) => std::io::Error::from_raw_os_error(e),
52            IconvError::ConversionNotSupport => {
53                std::io::Error::new(std::io::ErrorKind::Unsupported, self)
54            }
55            IconvError::NotSufficientOutput => {
56                std::io::Error::new(std::io::ErrorKind::InvalidInput, self)
57            }
58            IconvError::InvalidInput => std::io::Error::new(std::io::ErrorKind::InvalidData, self),
59            IconvError::IncompleteInput => {
60                std::io::Error::new(std::io::ErrorKind::InvalidInput, self)
61            }
62        }
63    }
64}
65
66impl std::fmt::Display for IconvError {
67    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
68        match self {
69            IconvError::OsError(e) => write!(f, "{}", std::io::Error::from_raw_os_error(*e)),
70            IconvError::ConversionNotSupport => {
71                write!(f, "The conversion is not supported by the implementation")
72            }
73            IconvError::NotSufficientOutput => {
74                write!(f, "There is not sufficient room in the output")
75            }
76            IconvError::InvalidInput => write!(
77                f,
78                "An invalid multibyte sequence has been encountered in the input"
79            ),
80            IconvError::IncompleteInput => write!(
81                f,
82                "An incomplete multibyte sequence has been encountered in the input"
83            ),
84        }
85    }
86}
87
// Marker impl: allows `IconvError` to be used as the payload of
// `std::io::Error::new`, as `into_io_error` does.
impl std::error::Error for IconvError {}
89
90/// convert `input` from `from_encoding` to `to_encoding`
91pub fn iconv(input: &[u8], from_encoding: &str, to_encoding: &str) -> Result<Vec<u8>, IconvError> {
92    let mut c = Iconv::new(from_encoding, to_encoding)?;
93    let mut read = 0;
94    let mut output = VecBuf::new(MIN_WRITE);
95    loop {
96        match c.convert(&input[read..], output.prepare_at_least(0)) {
97            Ok((r, w, _)) => {
98                output.commit(w);
99                if read >= input.len() {
100                    return Ok(output.into_vec());
101                }
102                read += r;
103            }
104            Err((r, w, IconvError::NotSufficientOutput)) => {
105                output.commit(w);
106                read += r;
107                output.grow(0);
108            }
109            Err((_, _, e)) => return Err(e),
110        }
111    }
112}
113
114/// convert `input` from UTF-8 to `encoding`
115pub fn encode(input: &str, encoding: &str) -> Result<Vec<u8>, IconvError> {
116    iconv(input.as_bytes(), "UTF-8", encoding)
117}
118
119/// convert `input` from `encoding` to UTF-8
120pub fn decode(input: &[u8], encoding: &str) -> Result<String, IconvError> {
121    iconv(input, encoding, "UTF-8").map(|v| unsafe { String::from_utf8_unchecked(v) })
122}
123
124pub fn copy<R: Read, W: Write>(
125    input: R,
126    mut output: W,
127    from_encoding: &str,
128    to_encoding: &str,
129) -> std::io::Result<usize> {
130    let mut cr =
131        IconvReader::new(input, from_encoding, to_encoding).map_err(|e| e.into_io_error())?;
132    let mut w = 0;
133    loop {
134        let v = cr.fill_buf()?;
135        output.write_all(v)?;
136        let n = v.len();
137        cr.consume(n);
138        w += n;
139        if n == 0 {
140            return Ok(w);
141        }
142    }
143}
144
145impl Iconv {
146    /// Creates a new Converter from `from_encoding` to `to_encoding`.
147    pub fn new(from_encoding: &str, to_encoding: &str) -> Result<Iconv, IconvError> {
148        use std::ffi::CString;
149        let from_code = CString::new(from_encoding).unwrap();
150        let to_code = CString::new(to_encoding).unwrap();
151
152        let handle = unsafe { ffi::iconv_open(to_code.as_ptr(), from_code.as_ptr()) };
153        if handle as isize == -1 {
154            let e = std::io::Error::last_os_error().raw_os_error().unwrap();
155            return Err(if e == libc::EINVAL {
156                IconvError::ConversionNotSupport
157            } else {
158                IconvError::OsError(e)
159            });
160        }
161        Ok(Iconv { cd: handle })
162    }
163
164    /// reset to the initial state
165    pub fn reset(&mut self) {
166        use std::ptr::null_mut;
167        unsafe { ffi::iconv(self.cd, null_mut(), null_mut(), null_mut(), null_mut()) };
168    }
169
170    /// Convert from input into output.
171    /// Returns Ok((bytes_read, bytes_written, number_of_chars_converted)).
172    ///      or Err((bytes_read, bytes_written, IconvError))
173    pub fn convert(
174        &mut self,
175        input: &[u8],
176        output: &mut [u8],
177    ) -> Result<(usize, usize, usize), (usize, usize, IconvError)> {
178        let input_left = input.len() as size_t;
179        let output_left = output.len() as size_t;
180
181        let input_ptr = input.as_ptr();
182        let output_ptr = output.as_ptr();
183
184        use std::mem::transmute;
185        let chars = unsafe {
186            ffi::iconv(
187                self.cd,
188                if input.is_empty() {
189                    std::ptr::null_mut()
190                } else {
191                    transmute(&input_ptr)
192                },
193                transmute(&input_left),
194                transmute(&output_ptr),
195                transmute(&output_left),
196            )
197        };
198        let bytes_read = input.len() - input_left as usize;
199        let bytes_written = output.len() - output_left as usize;
200
201        if chars as isize != -1 {
202            Ok((bytes_read, bytes_written, chars as usize))
203        } else {
204            let errno = std::io::Error::last_os_error().raw_os_error().unwrap();
205            Err((
206                bytes_read,
207                bytes_written,
208                match errno {
209                    libc::E2BIG => IconvError::NotSufficientOutput,
210                    libc::EINVAL => IconvError::IncompleteInput,
211                    libc::EILSEQ => IconvError::InvalidInput,
212                    _ => IconvError::OsError(errno),
213                },
214            ))
215        }
216    }
217}
218
219impl Drop for Iconv {
220    fn drop(&mut self) {
221        unsafe { ffi::iconv_close(self.cd) };
222    }
223}
224
/// A reader adapter that converts the bytes of an inner reader from one
/// encoding to another as they are read.
pub struct IconvReader<R: Read> {
    /// Converter applied to the data.
    iconv: Iconv,
    /// Source of the unconverted bytes.
    reader: R,
    /// Bytes read from `reader` that have not been converted yet.
    input: VecBuf,
    /// Converted bytes exposed through the `BufRead` implementation.
    output: VecBuf,
}
231
232impl<R: Read> IconvReader<R> {
233    pub fn new(reader: R, from_encoding: &str, to_encoding: &str) -> Result<Self, IconvError> {
234        let iconv = Iconv::new(from_encoding, to_encoding)?;
235        Ok(Self {
236            iconv,
237            reader,
238            input: VecBuf::new(MIN_WRITE),
239            output: VecBuf::new(MIN_WRITE),
240        })
241    }
242
243    pub fn into_inner(self) -> R {
244        self.reader
245    }
246}
247
/// A writer adapter that converts the bytes written to it from one encoding
/// to another before passing them to an inner writer.
pub struct IconvWriter<W: Write> {
    /// Converter applied to the data.
    iconv: Iconv,
    /// Destination of the converted bytes.
    writer: W,
    /// Accepted bytes that have not been converted yet.
    input: VecBuf,
    /// Converted bytes not yet accepted by `writer`.
    output: VecBuf,
}
254
255impl<W: Write> IconvWriter<W> {
256    pub fn new(writer: W, from_encoding: &str, to_encoding: &str) -> Result<Self, IconvError> {
257        let iconv = Iconv::new(from_encoding, to_encoding)?;
258        Ok(Self {
259            iconv,
260            writer,
261            input: VecBuf::new(MIN_WRITE),
262            output: VecBuf::new(MIN_WRITE),
263        })
264    }
265
266    pub fn into_inner(self) -> W {
267        self.writer
268    }
269}
270
271impl<R: Read> Read for IconvReader<R> {
272    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
273        let mut wrote = 0;
274        loop {
275            let n = self.reader.read(self.input.prepare_at_least(0))?;
276            self.input.commit(n);
277
278            match self.iconv.convert(self.input.data(), &mut buf[wrote..]) {
279                Ok((r, w, _)) => {
280                    self.input.consume(r);
281                    wrote += w;
282                    return Ok(wrote);
283                }
284                Err((r, w, e @ IconvError::NotSufficientOutput)) => {
285                    self.input.consume(r);
286                    wrote += w;
287                    return if wrote > 0 {
288                        Ok(wrote)
289                    } else {
290                        Err(e.into_io_error())
291                    };
292                }
293                Err((r, w, e @ IconvError::IncompleteInput)) => {
294                    self.input.consume(r);
295                    wrote += w;
296                    if n == 0 {
297                        return if wrote > 0 {
298                            Ok(wrote)
299                        } else {
300                            Err(e.into_io_error())
301                        };
302                    }
303                }
304                Err((_, _, e)) => return Err(e.into_io_error()),
305            }
306        }
307    }
308}
309
310impl<R: Read> BufRead for IconvReader<R> {
311    fn fill_buf(&mut self) -> std::io::Result<&[u8]> {
312        if self.output.is_empty() {
313            let mut o = std::mem::take(&mut self.output);
314            let n = self.read(o.prepare_at_least(0))?;
315            o.commit(n);
316            let _ = std::mem::replace(&mut self.output, o);
317        }
318        Ok(self.output.data())
319    }
320
321    fn consume(&mut self, amt: usize) {
322        self.output.consume(amt)
323    }
324}
325
326impl<W: Write> Write for IconvWriter<W> {
327    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
328        if self.input.is_empty() {
329            match self.iconv.convert(buf, self.output.prepare_at_least(0)) {
330                Ok((r, w, _)) | Err((r, w, IconvError::IncompleteInput)) => {
331                    self.output.commit(w);
332
333                    let n = self.writer.write(self.output.data())?;
334                    self.output.consume(n);
335
336                    Ok(r)
337                }
338                Err((_, _, e)) => Err(e.into_io_error()),
339            }
340        } else {
341            self.input.write_all(buf);
342
343            match self
344                .iconv
345                .convert(self.input.data(), self.output.prepare_at_least(0))
346            {
347                Ok((r, w, _)) | Err((r, w, IconvError::IncompleteInput)) => {
348                    self.input.consume(r);
349                    self.output.commit(w);
350
351                    let n = self.writer.write(self.output.data())?;
352                    self.output.consume(n);
353
354                    Ok(buf.len())
355                }
356                Err((_, _, e)) => Err(e.into_io_error()),
357            }
358        }
359    }
360
361    fn flush(&mut self) -> std::io::Result<()> {
362        let _ = self.write(&[])?;
363
364        if !self.input.is_empty() {
365            return Err(IconvError::IncompleteInput.into_io_error());
366        }
367        let b = self.output.data();
368        self.writer.write_all(b)?;
369        let n = b.len();
370        self.output.consume(n);
371        self.writer.flush()
372    }
373
374    fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
375        let w = self.write(buf)?;
376        if w < buf.len() {
377            self.input.write_all(&buf[w..]);
378        }
379        Ok(())
380    }
381}
382
// Tests for the converters. They exercise the real iconv library, using
// UTF-8, GBK/CP936, BIG5 and LATIN1.
#[cfg(test)]
mod test {
    use std::{
        io,
        io::{BufReader, Read},
        iter,
    };

    use super::*;

    /// Reads through `IconvReader::read` with request sizes cycling through
    /// 2, 4, 6, 8 and 10 bytes and compares each chunk with the expected GBK.
    ///
    /// NOTE(review): an `InvalidInput` error ends the test as a pass without
    /// checking that the whole input was converted.
    #[test]
    fn test_reader() {
        let a = "噗哈";
        let a_gbk = [224u8, 219, 185, 254];
        let mut input = String::new();
        let mut gbk: Vec<u8> = Vec::new();
        // Build the UTF-8 input and the expected GBK output side by side.
        for i in 0..1024 {
            let i = i.to_string();
            input.push_str(&i);
            input.push_str(a);
            gbk.extend(i.as_bytes());
            gbk.extend(a_gbk);
        }

        let r = BufReader::new(input.as_bytes());
        let mut cr = IconvReader::new(r, "UTF-8", "GBK").unwrap();

        let mut nread = 0;
        let mut k = 0;
        loop {
            k = (k + 1) % 10 + 1;
            let mut buf = [0u8; 11];
            let res = cr.read(&mut buf[..k]);
            println!("{:?}", res);
            match res {
                Ok(n) if n == 0 => {
                    assert_eq!(nread, gbk.len());
                    return;
                }
                Ok(n) => {
                    assert_eq!(&buf[..n], &gbk[nread..nread + n]);
                    nread += n;
                }
                Err(ref e) if e.kind() == io::ErrorKind::InvalidInput => {
                    return;
                }
                _ => {
                    unreachable!();
                }
            }
        }
    }

    /// Drains `IconvReader` through `fill_buf`/`consume` and checks every
    /// chunk against the expected GBK bytes.
    #[test]
    fn test_buf_reader() {
        let a = "噗哈";
        let a_gbk = [224u8, 219, 185, 254];
        let mut input = String::new();
        let mut gbk: Vec<u8> = Vec::new();
        for i in 0..102400 {
            let i = i.to_string();
            input.push_str(&i);
            input.push_str(a);
            gbk.extend(i.as_bytes());
            gbk.extend(a_gbk);
        }

        let r = BufReader::new(input.as_bytes());
        let mut cr = IconvReader::new(r, "UTF-8", "GBK").unwrap();

        let mut nread = 0;
        loop {
            let res = cr.fill_buf().unwrap();
            let n = res.len();
            println!("{} {}", nread, n);
            if res.is_empty() {
                assert_eq!(nread, gbk.len());
                break;
            }

            assert_eq!(res, &gbk[nread..nread + n]);
            nread += n;

            cr.consume(n);
        }
    }

    /// `copy` must report the number of bytes it wrote and produce the
    /// expected GBK output.
    #[test]
    fn test_copy() {
        let a = "噗哈";
        let a_gbk = [224u8, 219, 185, 254];
        let mut input = String::new();
        let mut gbk: Vec<u8> = Vec::new();
        for i in 0..102400 {
            let i = i.to_string();
            input.push_str(&i);
            input.push_str(a);
            gbk.extend(i.as_bytes());
            gbk.extend(a_gbk);
        }

        let r = BufReader::new(input.as_bytes());
        let mut output = vec![];
        let c = copy(r, std::io::BufWriter::new(&mut output), "UTF-8", "GBK").unwrap();
        assert_eq!(c, output.len());
        assert_eq!(output, gbk);
    }

    /// Writes many small pieces through `IconvWriter::write_all` into a `Vec`.
    ///
    /// NOTE(review): the writer is never flushed before `into_inner`, so the
    /// test relies on each write reaching the `Vec` immediately.
    #[test]
    fn test_writer() {
        let a = "噗哈";
        let a_gbk = [224u8, 219, 185, 254];
        let mut writer = IconvWriter::new(vec![], "UTF-8", "GBK").unwrap();
        let mut gbk: Vec<u8> = Vec::new();
        for i in 0..102400 {
            let i = i.to_string();
            writer.write_all(i.as_bytes()).unwrap();
            writer.write_all(a.as_bytes()).unwrap();
            gbk.extend(i.as_bytes());
            gbk.extend(a_gbk);
        }

        assert_eq!(&writer.into_inner(), &gbk);
    }

    /// `encode` on empty, short and long inputs, plus `iconv` on raw UTF-8
    /// bytes.
    #[test]
    fn test_encoder_normal() {
        assert!(encode("", "LATIN1").unwrap().is_empty());

        let a = "哈哈";
        assert_eq!(encode(a, "GBK").unwrap(), vec!(0xb9, 0xfe, 0xb9, 0xfe));

        let b = iter::repeat(a).take(1024).collect::<Vec<&str>>().join("");

        for ch in encode(&b, "GBK").unwrap().chunks(4) {
            assert_eq!(ch, &vec![0xb9, 0xfe, 0xb9, 0xfe][..]);
        }

        let c = vec![0xe5, 0x93, 0x88, 0xe5, 0x93, 0x88]; // utf8 bytes
        assert_eq!(
            iconv(&c, "UTF-8", "GBK").unwrap(),
            vec!(0xb9, 0xfe, 0xb9, 0xfe)
        );
    }

    /// An unknown encoding name must be rejected.
    #[test]
    fn test_encoder_fail_creating_converter() {
        assert!(decode("".as_bytes(), "NOT_EXISTS").is_err());
    }

    /// Bytes that are not valid in the source encoding give `InvalidInput`.
    #[test]
    fn test_encoder_ilseq() {
        let a = vec![0xff, 0xff, 0xff];
        assert!(matches!(
            decode(&a, "GBK").unwrap_err(),
            IconvError::InvalidInput
        ));
    }

    /// NOTE(review): the bytes are UTF-8 but are decoded as GBK here, and the
    /// test expects `IncompleteInput`.
    #[test]
    fn test_encoder_invalid() {
        let a = vec![0xe5, 0x93, 0x88, 0xe5, 0x88]; // incomplete utf8 bytes
        assert!(matches!(
            decode(&a, "GBK").unwrap_err(),
            IconvError::IncompleteInput
        ));
    }

    /// `decode` on empty input and on a short GBK string.
    #[test]
    fn test_decoder_normal() {
        let buf = Vec::new();
        let b = &buf[..];
        assert_eq!(decode(b, "CP936").unwrap(), "".to_string());

        let a = vec![0xb9, 0xfe, 0xb9, 0xfe];
        assert_eq!(decode(&a, "GBK").unwrap(), "哈哈".to_string());
    }

    /// An unknown encoding name maps to `ConversionNotSupport`.
    #[test]
    fn test_decoder_fail_creating_converter() {
        let buf = Vec::new();
        let b = &buf[..];
        assert!(matches!(
            decode(b, "NOT_EXSITS").unwrap_err(),
            IconvError::ConversionNotSupport
        ));
    }

    /// Invalid GBK bytes give `InvalidInput`.
    #[test]
    fn test_decoder_ilseq() {
        let a = vec![0xff, 0xff, 0xff];
        assert!(matches!(
            decode(&a, "GBK").unwrap_err(),
            IconvError::InvalidInput
        ));
    }

    /// A GBK string cut in the middle of a character gives `IncompleteInput`.
    #[test]
    fn test_decoder_invalid() {
        let a = vec![0xb9, 0xfe, 0xb9]; // incomplete gbk bytes
        assert!(matches!(
            decode(&a, "GBK").unwrap_err(),
            IconvError::IncompleteInput
        ));
    }

    /// The BIG5 encoding of "曹操" and the GBK encoding of "变巨" are the same
    /// bytes.
    #[test]
    fn test_caocao_joke() {
        let a = "曹操";
        let b = "变巨";
        assert_eq!(encode(a, "BIG5").unwrap(), encode(b, "GBK").unwrap());
    }
}