csv_lib/decoders/
decoders.rs

1use crate::encoders::big5::encode_big5;
2use crate::encoders::gbk::encode_gbk;
3use crate::encoders::shiftjis::encode_shiftjis;
4use crate::encoders::koi8r::encode_koi8r;
5use crate::encoders::windows1252::encode_windows1252;
6use crate::encoders::windows1251::encode_windows1251;
7use crate::encoders::latin9::encode_latin9;
8use crate::encoders::latin2::encode_latin2;
9use crate::encoders::latin1::encode_latin1;
10use crate::encoders::utf8::encode_utf8;
11use std::borrow::Cow;
12
13use crate::decoders::{
14    utf8::{decode_utf8},
15    latin1::{decode_latin1},
16    latin2::{decode_latin2},
17    latin9::{decode_latin9},
18    windows1251::{decode_windows1251},
19    windows1252::{decode_windows1252},
20    koi8r::{decode_koi8r},
21    shiftjis::{decode_shiftjis},
22    gbk::{decode_gbk},
23    big5::{decode_big5},
24};
25
/// ## Encoding Enum
///
/// Represents the available character set transformations.
/// Used to decode byte data into Unicode `str` and encode `str` into byte data.
///
/// The enum is a plain, field-less tag: it is `Copy`, can be compared with
/// `==`, and can be used as a key in hashed collections.
///
/// Note that `GBK` and `GB2312` are distinct variants (they compare unequal)
/// even though the `decode`/`encode` methods route both to the same GBK codec.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[allow(dead_code)]
pub enum Encoding {
    Utf8,
    Windows1252,
    ISO8859_1,
    ISO8859_15,
    Windows1251,
    KOI8R,
    ShiftJIS,
    GBK,
    GB2312,
    Big5,
    ISO8859_2,
}
45
46impl Encoding {
47    /// ## Decode Function
48    ///
49    /// Receives an input `&[u8]` and decodes it into a `Cow<'_, str>`.
50    ///
51    /// - No BOM removal is performed here.
52    /// - Automatically selects the appropriate decoding method based on `Decoder`.
53    #[allow(dead_code)]
54    pub fn decode (self, input: &[u8]) -> Cow<'_, str> {
55        match self {
56            Encoding::Utf8 => decode_utf8(input),
57            Encoding::ISO8859_1 => decode_latin1(input),
58            Encoding::ISO8859_2 => decode_latin2(input),
59            Encoding::ISO8859_15 => decode_latin9(input),
60            Encoding::Windows1251 => decode_windows1251(input),
61            Encoding::Windows1252 => decode_windows1252(input),
62            Encoding::KOI8R => decode_koi8r(input),
63            Encoding::ShiftJIS => decode_shiftjis(input),
64            Encoding::GBK | Encoding::GB2312 => decode_gbk(input),
65            Encoding::Big5 => decode_big5(input),
66        }
67    }
68
69    /// ## Decode with BOM Removal
70    ///
71    /// Receives an input `&[u8]` and decodes it, automatically removing UTF BOM markers if present.
72    ///
73    /// - Checks UTF-8 BOM (`EF BB BF`) at the byte level.
74    /// - After decoding, checks Unicode BOM (`\u{FEFF}`) and removes if found.
75    ///
76    /// ### Returns:
77    /// - A `Cow<'_, str>` representing the decoded string.
78    /// - A `bool` indicating whether a BOM was removed.
79    #[allow(dead_code)]
80    pub fn decode_with_bom_removal(self, input: &[u8]) -> (Cow<'_, str>, bool) {
81        let mut sliced_input = input;
82        let mut bom_removed = false;
83        //Bom remove
84        if input.starts_with(&[0xEF, 0xBB, 0xBF]) {
85            sliced_input = &input[3..];
86            bom_removed = true;
87        }
88
89        let decoded = self.decode(sliced_input);
90
91        if !bom_removed && decoded.starts_with('\u{FEFF}') {
92            return match decoded {
93                Cow::Borrowed(s) => (Cow::Borrowed(&s[1..]), true),
94                Cow::Owned(mut s) => {
95                    s.remove(0);
96                    (Cow::Owned(s), true)
97                }
98            }
99        }
100
101        (decoded, bom_removed)
102    }
103
104    /// ## Encode Function
105    /// Receives a `&str` and encodes it into a `Vec<u8>` using the specified encoding.
106    /// - Automatically selects the corresponding encoder based on `Decoder`.
107    /// - Characters not representable in the encoding are replaced with `?` when necessary.
108    #[allow(dead_code)]
109    pub fn encode (self, input: &str) -> Vec<u8> {
110        match self {
111            Encoding::Utf8 => encode_utf8(input),
112            Encoding::ISO8859_1 => encode_latin1(input),
113            Encoding::ISO8859_2 => encode_latin2(input),
114            Encoding::ISO8859_15 => encode_latin9(input),
115            Encoding::Windows1251 => encode_windows1251(input),
116            Encoding::Windows1252 => encode_windows1252(input),
117            Encoding::KOI8R => encode_koi8r(input),
118            Encoding::ShiftJIS => encode_shiftjis(input),
119            Encoding::GBK | Encoding::GB2312 => encode_gbk(input),
120            Encoding::Big5 => encode_big5(input),
121        }
122    }
123}