Skip to main content

tiberius/tds/
collation.rs

//! Legacy implementation of collations (or rather code pages), used to decode `varchar` data
//! coming from legacy databases.
//!
//! The mappings follow [1], a TDS driver published by Microsoft that contains tables taken from
//! the Katmai (SQL Server 2008) source code. [2] is helpful for mapping a code page such as
//! `CP1234` to the appropriate encoding.
//!
//! [1] <https://github.com/Microsoft/mssql-jdbc/blob/eb14f63077c47ef1fc1c690deb8cfab602baeb85/src/main/java/com/microsoft/sqlserver/jdbc/SQLCollation.java>
//! [2] <https://github.com/lifthrasiir/rust-encoding/blob/496823171f15d9b9446b2ec3fb7765f22346256b/src/label.rs#L282>
8
9use encoding_rs::Encoding;
10use std::fmt;
11
12use crate::error::Error;
13
14/// SQL Server collation metadata attached to character columns.
15#[derive(Debug, Clone, Copy, PartialEq, Eq)]
16pub struct Collation {
17    /// LCID ColFlags Version
18    info: u32,
19    /// Sortid
20    sort_id: u8,
21}
22
23impl Collation {
24    /// Creates collation metadata from the raw TDS collation parts.
25    pub fn new(info: u32, sort_id: u8) -> Self {
26        Self { info, sort_id }
27    }
28
29    /// return the locale id part of the LCID (the specification here uses ambiguous terms)
30    pub fn lcid(&self) -> u16 {
31        (self.info & 0xffff) as u16
32    }
33
34    /// The SQL Server sort ID.
35    pub fn sort_id(&self) -> u8 {
36        self.sort_id
37    }
38
39    /// The raw LCID, flags, and version bits.
40    pub fn info(&self) -> u32 {
41        self.info
42    }
43
44    /// return an encoding for a given collation
45    pub fn encoding(&self) -> crate::Result<&'static Encoding> {
46        let res = if self.sort_id == 0 {
47            lcid_to_encoding(self.lcid())
48        } else {
49            sortid_to_encoding(self.sort_id)
50        };
51
52        res.ok_or_else(|| {
53            Error::Encoding(
54                format!(
55                    "encoding: unspported encoding (LCID: {:#02x}, sort ID: {})",
56                    self.lcid(),
57                    self.sort_id(),
58                )
59                .into(),
60            )
61        })
62    }
63}
64
65impl fmt::Display for Collation {
66    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
67        match self.encoding() {
68            Ok(encoding) => write!(f, "{}", encoding.name()),
69            _ => write!(f, "None"),
70        }
71    }
72}
73
74/// https://github.com/Microsoft/mssql-jdbc/blob/eb14f63077c47ef1fc1c690deb8cfab602baeb85/src/main/java/com/microsoft/sqlserver/jdbc/SQLCollation.java#L102-L310
75/// maps an LCID (it's locale part which is only 2 bytes) to a codepage
76///
77/// generate the code below from source code:
78/// 1. (regex)replace: (.*?)\((.*?),(.*?)\) with $2 => $3
79/// 2. replace: Encoding.CP(.*?) with encoding::all::WINDOWS_$1
80/// 3. replace: Encoding.UNICODE with encoding::all::UTF16_LE
81//
82/// the unimplemented!() one's are not supported by rust-encoding
83pub fn lcid_to_encoding(locale: u16) -> Option<&'static Encoding> {
84    match locale {
85        0x0401 => Some(encoding_rs::WINDOWS_1256),
86        0x0402 => Some(encoding_rs::WINDOWS_1251),
87        0x0403 => Some(encoding_rs::WINDOWS_1252),
88        // CP950
89        0x0404 | 0x0c04 | 0x1404 => Some(encoding_rs::BIG5),
90        0x0405 => Some(encoding_rs::WINDOWS_1250),
91        0x0406 => Some(encoding_rs::WINDOWS_1252),
92        0x0407 => Some(encoding_rs::WINDOWS_1252),
93        0x0408 => Some(encoding_rs::WINDOWS_1253),
94        0x0409 => Some(encoding_rs::WINDOWS_1252),
95        0x040a => Some(encoding_rs::WINDOWS_1252),
96        0x040b => Some(encoding_rs::WINDOWS_1252),
97        0x040c => Some(encoding_rs::WINDOWS_1252),
98        0x040d => Some(encoding_rs::WINDOWS_1255),
99        0x040e => Some(encoding_rs::WINDOWS_1250),
100        0x040f => Some(encoding_rs::WINDOWS_1252),
101        0x0410 => Some(encoding_rs::WINDOWS_1252),
102        // CP932
103        0x0411 => Some(encoding_rs::SHIFT_JIS),
104        0x0412 => Some(encoding_rs::EUC_KR),
105        0x0413 => Some(encoding_rs::WINDOWS_1252),
106        0x0414 => Some(encoding_rs::WINDOWS_1252),
107        0x0415 => Some(encoding_rs::WINDOWS_1250),
108        0x0416 => Some(encoding_rs::WINDOWS_1252),
109        0x0417 => Some(encoding_rs::WINDOWS_1252),
110        0x0418 => Some(encoding_rs::WINDOWS_1250),
111        0x0419 => Some(encoding_rs::WINDOWS_1251),
112        0x041a => Some(encoding_rs::WINDOWS_1250),
113        0x041b => Some(encoding_rs::WINDOWS_1250),
114        0x041c => Some(encoding_rs::WINDOWS_1250),
115        0x041d => Some(encoding_rs::WINDOWS_1252),
116        0x041e => Some(encoding_rs::WINDOWS_874),
117        0x041f => Some(encoding_rs::WINDOWS_1254),
118        0x0420 => Some(encoding_rs::WINDOWS_1256),
119        0x0421 => Some(encoding_rs::WINDOWS_1252),
120        0x0422 => Some(encoding_rs::WINDOWS_1251),
121        0x0423 => Some(encoding_rs::WINDOWS_1251),
122        0x0424 => Some(encoding_rs::WINDOWS_1250),
123        0x0425 => Some(encoding_rs::WINDOWS_1257),
124        0x0426 => Some(encoding_rs::WINDOWS_1257),
125        0x0427 => Some(encoding_rs::WINDOWS_1257),
126        0x0428 => Some(encoding_rs::WINDOWS_1251),
127        0x0429 => Some(encoding_rs::WINDOWS_1256),
128        0x042a => Some(encoding_rs::WINDOWS_1258),
129        0x042b => Some(encoding_rs::WINDOWS_1252),
130        0x042c => Some(encoding_rs::WINDOWS_1254),
131        0x042d => Some(encoding_rs::WINDOWS_1252),
132        0x042e => Some(encoding_rs::WINDOWS_1252),
133        0x042f => Some(encoding_rs::WINDOWS_1251),
134        0x0432 => Some(encoding_rs::WINDOWS_1252),
135        0x0434 => Some(encoding_rs::WINDOWS_1252),
136        0x0435 => Some(encoding_rs::WINDOWS_1252),
137        0x0436 => Some(encoding_rs::WINDOWS_1252),
138        0x0437 => Some(encoding_rs::WINDOWS_1252),
139        0x0438 => Some(encoding_rs::WINDOWS_1252),
140        0x0439 => Some(encoding_rs::UTF_16LE),
141        0x043a => Some(encoding_rs::UTF_16LE),
142        0x043b => Some(encoding_rs::WINDOWS_1252),
143        0x043e => Some(encoding_rs::WINDOWS_1252),
144        0x043f => Some(encoding_rs::WINDOWS_1251),
145        0x0440 => Some(encoding_rs::WINDOWS_1251),
146        0x0441 => Some(encoding_rs::WINDOWS_1252),
147        0x0442 => Some(encoding_rs::WINDOWS_1250),
148        0x0443 => Some(encoding_rs::WINDOWS_1254),
149        0x0444 => Some(encoding_rs::WINDOWS_1251),
150        0x0445 => Some(encoding_rs::UTF_16LE),
151        0x0446 => Some(encoding_rs::UTF_16LE),
152        0x0447 => Some(encoding_rs::UTF_16LE),
153        0x0448 => Some(encoding_rs::UTF_16LE),
154        0x0449 => Some(encoding_rs::UTF_16LE),
155        0x044a => Some(encoding_rs::UTF_16LE),
156        0x044b => Some(encoding_rs::UTF_16LE),
157        0x044c => Some(encoding_rs::UTF_16LE),
158        0x044d => Some(encoding_rs::UTF_16LE),
159        0x044e => Some(encoding_rs::UTF_16LE),
160        0x044f => Some(encoding_rs::UTF_16LE),
161        0x0450 => Some(encoding_rs::WINDOWS_1251),
162        0x0451 => Some(encoding_rs::UTF_16LE),
163        0x0452 => Some(encoding_rs::WINDOWS_1252),
164        0x0453 => Some(encoding_rs::UTF_16LE),
165        0x0454 => Some(encoding_rs::UTF_16LE),
166        0x0456 => Some(encoding_rs::WINDOWS_1252),
167        0x0457 => Some(encoding_rs::UTF_16LE),
168        0x045a => Some(encoding_rs::UTF_16LE),
169        0x045b => Some(encoding_rs::UTF_16LE),
170        0x045d => Some(encoding_rs::WINDOWS_1252),
171        0x045e => Some(encoding_rs::WINDOWS_1252),
172        0x0461 => Some(encoding_rs::UTF_16LE),
173        0x0462 => Some(encoding_rs::WINDOWS_1252),
174        0x0463 => Some(encoding_rs::UTF_16LE),
175        0x0464 => Some(encoding_rs::WINDOWS_1252),
176        0x0465 => Some(encoding_rs::UTF_16LE),
177        0x0468 => Some(encoding_rs::WINDOWS_1252),
178        0x046a => Some(encoding_rs::WINDOWS_1252),
179        0x046b => Some(encoding_rs::WINDOWS_1252),
180        0x046c => Some(encoding_rs::WINDOWS_1252),
181        0x046d => Some(encoding_rs::WINDOWS_1251),
182        0x046e => Some(encoding_rs::WINDOWS_1252),
183        0x046f => Some(encoding_rs::WINDOWS_1252),
184        0x0470 => Some(encoding_rs::WINDOWS_1252),
185        0x0478 => Some(encoding_rs::WINDOWS_1252),
186        0x047a => Some(encoding_rs::WINDOWS_1252),
187        0x047c => Some(encoding_rs::WINDOWS_1252),
188        0x047e => Some(encoding_rs::WINDOWS_1252),
189        0x0480 => Some(encoding_rs::WINDOWS_1256),
190        0x0481 => Some(encoding_rs::UTF_16LE),
191        0x0482 => Some(encoding_rs::WINDOWS_1252),
192        0x0483 => Some(encoding_rs::WINDOWS_1252),
193        0x0484 => Some(encoding_rs::WINDOWS_1252),
194        0x0485 => Some(encoding_rs::WINDOWS_1251),
195        0x0486 => Some(encoding_rs::WINDOWS_1252),
196        0x0487 => Some(encoding_rs::WINDOWS_1252),
197        0x0488 => Some(encoding_rs::WINDOWS_1252),
198        0x048c => Some(encoding_rs::WINDOWS_1256),
199        0x0801 => Some(encoding_rs::WINDOWS_1256),
200        // CP936
201        0x0804 | 0x1004 => Some(encoding_rs::GB18030),
202        0x0807 => Some(encoding_rs::WINDOWS_1252),
203        0x0809 => Some(encoding_rs::WINDOWS_1252),
204        0x080a => Some(encoding_rs::WINDOWS_1252),
205        0x080c => Some(encoding_rs::WINDOWS_1252),
206        0x0810 => Some(encoding_rs::WINDOWS_1252),
207        0x0813 => Some(encoding_rs::WINDOWS_1252),
208        0x0814 => Some(encoding_rs::WINDOWS_1252),
209        0x0816 => Some(encoding_rs::WINDOWS_1252),
210        0x081a => Some(encoding_rs::WINDOWS_1250),
211        0x081d => Some(encoding_rs::WINDOWS_1252),
212        0x0827 => Some(encoding_rs::WINDOWS_1257),
213        0x082c => Some(encoding_rs::WINDOWS_1251),
214        0x082e => Some(encoding_rs::WINDOWS_1252),
215        0x083b => Some(encoding_rs::WINDOWS_1252),
216        0x083c => Some(encoding_rs::WINDOWS_1252),
217        0x083e => Some(encoding_rs::WINDOWS_1252),
218        0x0843 => Some(encoding_rs::WINDOWS_1251),
219        0x0845 => Some(encoding_rs::UTF_16LE),
220        0x0850 => Some(encoding_rs::WINDOWS_1251),
221        0x085d => Some(encoding_rs::WINDOWS_1252),
222        0x085f => Some(encoding_rs::WINDOWS_1252),
223        0x086b => Some(encoding_rs::WINDOWS_1252),
224        0x0c01 => Some(encoding_rs::WINDOWS_1256),
225        0x0c07 => Some(encoding_rs::WINDOWS_1252),
226        0x0c09 => Some(encoding_rs::WINDOWS_1252),
227        0x0c0a => Some(encoding_rs::WINDOWS_1252),
228        0x0c0c => Some(encoding_rs::WINDOWS_1252),
229        0x0c1a => Some(encoding_rs::WINDOWS_1251),
230        0x0c3b => Some(encoding_rs::WINDOWS_1252),
231        0x0c6b => Some(encoding_rs::WINDOWS_1252),
232        0x1001 => Some(encoding_rs::WINDOWS_1256),
233        0x1007 => Some(encoding_rs::WINDOWS_1252),
234        0x1009 => Some(encoding_rs::WINDOWS_1252),
235        0x100a => Some(encoding_rs::WINDOWS_1252),
236        0x100c => Some(encoding_rs::WINDOWS_1252),
237        0x101a => Some(encoding_rs::WINDOWS_1250),
238        0x103b => Some(encoding_rs::WINDOWS_1252),
239        0x1401 => Some(encoding_rs::WINDOWS_1256),
240        0x1407 => Some(encoding_rs::WINDOWS_1252),
241        0x1409 => Some(encoding_rs::WINDOWS_1252),
242        0x140a => Some(encoding_rs::WINDOWS_1252),
243        0x140c => Some(encoding_rs::WINDOWS_1252),
244        0x141a => Some(encoding_rs::WINDOWS_1250),
245        0x143b => Some(encoding_rs::WINDOWS_1252),
246        0x1801 => Some(encoding_rs::WINDOWS_1256),
247        0x1809 => Some(encoding_rs::WINDOWS_1252),
248        0x180a => Some(encoding_rs::WINDOWS_1252),
249        0x180c => Some(encoding_rs::WINDOWS_1252),
250        0x181a => Some(encoding_rs::WINDOWS_1250),
251        0x183b => Some(encoding_rs::WINDOWS_1252),
252        0x1c01 => Some(encoding_rs::WINDOWS_1256),
253        0x1c09 => Some(encoding_rs::WINDOWS_1252),
254        0x1c0a => Some(encoding_rs::WINDOWS_1252),
255        0x1c1a => Some(encoding_rs::WINDOWS_1251),
256        0x1c3b => Some(encoding_rs::WINDOWS_1252),
257        0x2001 => Some(encoding_rs::WINDOWS_1256),
258        0x2009 => Some(encoding_rs::WINDOWS_1252),
259        0x200a => Some(encoding_rs::WINDOWS_1252),
260        0x201a => Some(encoding_rs::WINDOWS_1251),
261        0x203b => Some(encoding_rs::WINDOWS_1252),
262        0x2401 => Some(encoding_rs::WINDOWS_1256),
263        0x2409 => Some(encoding_rs::WINDOWS_1252),
264        0x240a => Some(encoding_rs::WINDOWS_1252),
265        0x243b => Some(encoding_rs::WINDOWS_1252),
266        0x2801 => Some(encoding_rs::WINDOWS_1256),
267        0x2809 => Some(encoding_rs::WINDOWS_1252),
268        0x280a => Some(encoding_rs::WINDOWS_1252),
269        0x2c01 => Some(encoding_rs::WINDOWS_1256),
270        0x2c09 => Some(encoding_rs::WINDOWS_1252),
271        0x2c0a => Some(encoding_rs::WINDOWS_1252),
272        0x3001 => Some(encoding_rs::WINDOWS_1256),
273        0x3009 => Some(encoding_rs::WINDOWS_1252),
274        0x300a => Some(encoding_rs::WINDOWS_1252),
275        0x3401 => Some(encoding_rs::WINDOWS_1256),
276        0x3409 => Some(encoding_rs::WINDOWS_1252),
277        0x340a => Some(encoding_rs::WINDOWS_1252),
278        0x3801 => Some(encoding_rs::WINDOWS_1256),
279        0x380a => Some(encoding_rs::WINDOWS_1252),
280        0x3c01 => Some(encoding_rs::WINDOWS_1256),
281        0x3c0a => Some(encoding_rs::WINDOWS_1252),
282        0x4001 => Some(encoding_rs::WINDOWS_1256),
283        0x4009 => Some(encoding_rs::WINDOWS_1252),
284        0x400a => Some(encoding_rs::WINDOWS_1252),
285        0x4409 => Some(encoding_rs::WINDOWS_1252),
286        0x440a => Some(encoding_rs::WINDOWS_1252),
287        0x4809 => Some(encoding_rs::WINDOWS_1252),
288        0x480a => Some(encoding_rs::WINDOWS_1252),
289        0x4c0a => Some(encoding_rs::WINDOWS_1252),
290        0x500a => Some(encoding_rs::WINDOWS_1252),
291        0x540a => Some(encoding_rs::WINDOWS_1252),
292        _ => None,
293    }
294}
295
296/// [1] https://github.com/Microsoft/mssql-jdbc/blob/eb14f63077c47ef1fc1c690deb8cfab602baeb85/src/main/java/com/microsoft/sqlserver/jdbc/SQLCollation.java#L362-L482
297/// [2] https://msdn.microsoft.com/de-de/library/ms144250(v=sql.105).aspx
298///
299/// [2] does only contain 3/4 of the content [1] contains, so the source code is again the better source of information
300///
301/// generate the code below from source code:
302/// 1. (regex)replace .*\((.*?),.*?,(.*?)\) with $1 => $2
303/// 2. see above/as above
304pub fn sortid_to_encoding(sort_id: u8) -> Option<&'static Encoding> {
305    match sort_id {
306        // 30 | 31 | 32 | 33 | 34 | 35 => Some(encoding_rs::WINDOWS_437),
307        // 40 | 41 | 42 | 43 | 44 | 45 | 49 => Some(encoding_rs::WINDOWS_850),
308        50 => Some(encoding_rs::WINDOWS_1252),
309        51 => Some(encoding_rs::WINDOWS_1252),
310        52 => Some(encoding_rs::WINDOWS_1252),
311        53 => Some(encoding_rs::WINDOWS_1252),
312        54 => Some(encoding_rs::WINDOWS_1252),
313        // 55 | 56 | 57 | 58 | 59 | 60 | 61 => Some(encoding_rs::WINDOWS_850),
314        71 => Some(encoding_rs::WINDOWS_1252),
315        72 => Some(encoding_rs::WINDOWS_1252),
316        73 => Some(encoding_rs::WINDOWS_1252),
317        74 => Some(encoding_rs::WINDOWS_1252),
318        75 => Some(encoding_rs::WINDOWS_1252),
319        80 => Some(encoding_rs::WINDOWS_1250),
320        81 => Some(encoding_rs::WINDOWS_1250),
321        82 => Some(encoding_rs::WINDOWS_1250),
322        83 => Some(encoding_rs::WINDOWS_1250),
323        84 => Some(encoding_rs::WINDOWS_1250),
324        85 => Some(encoding_rs::WINDOWS_1250),
325        86 => Some(encoding_rs::WINDOWS_1250),
326        87 => Some(encoding_rs::WINDOWS_1250),
327        88 => Some(encoding_rs::WINDOWS_1250),
328        89 => Some(encoding_rs::WINDOWS_1250),
329        90 => Some(encoding_rs::WINDOWS_1250),
330        91 => Some(encoding_rs::WINDOWS_1250),
331        92 => Some(encoding_rs::WINDOWS_1250),
332        93 => Some(encoding_rs::WINDOWS_1250),
333        94 => Some(encoding_rs::WINDOWS_1250),
334        95 => Some(encoding_rs::WINDOWS_1250),
335        96 => Some(encoding_rs::WINDOWS_1250),
336        97 => Some(encoding_rs::WINDOWS_1250),
337        98 => Some(encoding_rs::WINDOWS_1250),
338        104 => Some(encoding_rs::WINDOWS_1251),
339        105 => Some(encoding_rs::WINDOWS_1251),
340        106 => Some(encoding_rs::WINDOWS_1251),
341        107 => Some(encoding_rs::WINDOWS_1251),
342        108 => Some(encoding_rs::WINDOWS_1251),
343        112 => Some(encoding_rs::WINDOWS_1253),
344        113 => Some(encoding_rs::WINDOWS_1253),
345        114 => Some(encoding_rs::WINDOWS_1253),
346        120 => Some(encoding_rs::WINDOWS_1253),
347        121 => Some(encoding_rs::WINDOWS_1253),
348        122 => Some(encoding_rs::WINDOWS_1253),
349        124 => Some(encoding_rs::WINDOWS_1253),
350        128 => Some(encoding_rs::WINDOWS_1254),
351        129 => Some(encoding_rs::WINDOWS_1254),
352        130 => Some(encoding_rs::WINDOWS_1254),
353        136 => Some(encoding_rs::WINDOWS_1255),
354        137 => Some(encoding_rs::WINDOWS_1255),
355        138 => Some(encoding_rs::WINDOWS_1255),
356        144 => Some(encoding_rs::WINDOWS_1256),
357        145 => Some(encoding_rs::WINDOWS_1256),
358        146 => Some(encoding_rs::WINDOWS_1256),
359        152 => Some(encoding_rs::WINDOWS_1257),
360        153 => Some(encoding_rs::WINDOWS_1257),
361        154 => Some(encoding_rs::WINDOWS_1257),
362        155 => Some(encoding_rs::WINDOWS_1257),
363        156 => Some(encoding_rs::WINDOWS_1257),
364        157 => Some(encoding_rs::WINDOWS_1257),
365        158 => Some(encoding_rs::WINDOWS_1257),
366        159 => Some(encoding_rs::WINDOWS_1257),
367        160 => Some(encoding_rs::WINDOWS_1257),
368        183 => Some(encoding_rs::WINDOWS_1252),
369        184 => Some(encoding_rs::WINDOWS_1252),
370        185 => Some(encoding_rs::WINDOWS_1252),
371        186 => Some(encoding_rs::WINDOWS_1252),
372        // CP 932
373        192 | 193 | 200 => Some(encoding_rs::SHIFT_JIS),
374        194 => Some(encoding_rs::EUC_KR),
375        195 => Some(encoding_rs::EUC_KR),
376        // CP950
377        196 | 197 | 202 => Some(encoding_rs::BIG5),
378        // CP936 (GB18030 is an extension of it with more chars), should be backwards-compatible)
379        198 | 199 | 203 => Some(encoding_rs::GB18030),
380        201 => Some(encoding_rs::BIG5),
381        204 => Some(encoding_rs::WINDOWS_874),
382        205 => Some(encoding_rs::WINDOWS_874),
383        206 => Some(encoding_rs::WINDOWS_874),
384        210 => Some(encoding_rs::WINDOWS_1252),
385        211 => Some(encoding_rs::WINDOWS_1252),
386        212 => Some(encoding_rs::WINDOWS_1252),
387        213 => Some(encoding_rs::WINDOWS_1252),
388        214 => Some(encoding_rs::WINDOWS_1252),
389        215 => Some(encoding_rs::WINDOWS_1252),
390        216 => Some(encoding_rs::WINDOWS_1252),
391        217 => Some(encoding_rs::WINDOWS_1252),
392        _ => None,
393    }
394}
395
396/* TODO
397#[cfg(test)]
398mod tests {
399    use futures_state_stream::StateStream;
400    use tokio::executor::current_thread;
401    use crate::tests::new_connection;
402
403    #[test]
404    fn select_nvarchar_collation_test() {
405        let c1 = new_connection();
406        let query = c1.simple_query(
407            "select cast(cast(N'cześć' as nvarchar(5)) collate Polish_CI_AI as varchar(5))",
408        );
409        let mut i = 0;
410        {
411            let future = query.for_each(|x| {
412                let val: &str = x.get(0);
413                assert_eq!(val, "cześć");
414                i += 1;
415                Ok(())
416            });
417            current_thread::block_on_all(future).unwrap();
418        }
419        assert_eq!(i, 1);
420    }
421}
422*/