Skip to main content

arrow_odbc/reader/
text.rs

1use std::{cmp::min, num::NonZeroUsize, sync::Arc};
2
3use arrow::array::{ArrayRef, StringBuilder};
4use encoding_rs::mem::convert_utf16_to_str;
5use odbc_api::{
6    DataType as OdbcDataType,
7    buffers::{AnySlice, BufferDesc},
8};
9
10use super::{ColumnFailure, MappingError, ReadStrategy};
11
12/// This function decides wether this column will be queried as narrow (assumed to be utf-8) or
13/// wide text (assumed to be utf-16). The reason we do not always use narrow is that the encoding
14/// dependends on the system locals which is usually not UTF-8 on windows systems. Furthermore we
15/// are trying to adapt the buffer size to the maximum string length the column could contain.
16pub fn choose_text_strategy(
17    sql_type: OdbcDataType,
18    lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
19    max_text_size: Option<usize>,
20    trim_fixed_sized_character_strings: bool,
21    text_encoding: TextEncoding,
22) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
23    let apply_buffer_limit = |len| match (len, max_text_size) {
24        (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
25        (None, Some(limit)) => Ok(limit),
26        (Some(len), None) => Ok(len),
27        (Some(len), Some(limit)) => Ok(min(len, limit)),
28    };
29    let is_fixed_sized_char = matches!(
30        sql_type,
31        OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
32    );
33    let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
34    let strategy: Box<dyn ReadStrategy + Send> = if text_encoding.use_utf16() {
35        let hex_len = sql_type
36            .utf16_len()
37            .map(Ok)
38            .or_else(|| lazy_display_size().transpose())
39            .transpose()
40            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
41        let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
42        wide_text_strategy(hex_len, trim)
43    } else {
44        let octet_len = sql_type
45            .utf8_len()
46            .map(Ok)
47            .or_else(|| lazy_display_size().transpose())
48            .transpose()
49            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
50        let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?;
51        // So far only Linux users seemed to have complained about panics due to garbage indices?
52        // Linux usually would use UTF-8, so we only invest work in working around this for narrow
53        // strategies
54        narrow_text_strategy(octet_len, trim)
55    };
56
57    Ok(strategy)
58}
59
/// Used to indicate the preferred encoding for text columns.
///
/// The default is [`Self::Auto`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum TextEncoding {
    /// Evaluates to [`Self::Utf16`] on Windows and [`Self::Utf8`] on other systems. We do this,
    /// because most systems, e.g. macOS and Linux, use UTF-8 as their default encoding, while
    /// Windows may still use Latin1 or some other extended ASCII as its narrow encoding. On the
    /// other hand many POSIX drivers are lacking in their support for wide function calls and
    /// UTF-16. So using [`Self::Utf16`] on Windows and [`Self::Utf8`] everywhere else is a good
    /// starting point.
    #[default]
    Auto,
    /// Use narrow characters (one byte) to encode text in payloads. ODBC lets the client choose
    /// the encoding, which should be based on the system locale. This is often not what is
    /// actually happening though. If we use narrow encoding, we assume the text to be UTF-8 and
    /// error if we find that not to be the case.
    Utf8,
    /// Use wide characters (two bytes) to encode text in payloads. ODBC defines the encoding to
    /// be always UTF-16.
    Utf16,
}

impl TextEncoding {
    /// Returns `true` if text should be fetched as wide characters (UTF-16) and `false` if it
    /// should be fetched as narrow characters (UTF-8).
    ///
    /// [`Self::Auto`] is resolved at compile time: it yields `true` if the target operating
    /// system is Windows and `false` otherwise.
    pub fn use_utf16(&self) -> bool {
        match self {
            Self::Auto => cfg!(target_os = "windows"),
            Self::Utf8 => false,
            Self::Utf16 => true,
        }
    }
}
94
95fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
96    Box::new(WideText::new(u16_len, trim))
97}
98
99fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
100    Box::new(NarrowText::new(octet_len, trim))
101}
102
/// Read strategy which requests text from the database as UTF-16 (wide characters) and emits it
/// as UTF-8. It exists because the narrow representation in ODBC is not guaranteed to be UTF-8,
/// but depends on the system locale instead.
pub struct WideText {
    /// Upper bound for the length of a single value, measured in `u16` code units. The
    /// terminating zero is not counted.
    max_str_len: usize,
    /// Whether whitespace surrounding each value is removed.
    trim: bool,
}

impl WideText {
    /// Creates a strategy for values of at most `max_str_len` UTF-16 code units (excluding the
    /// terminating zero). Values are trimmed if `trim` is `true`.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        WideText { max_str_len, trim }
    }
}
118
119impl ReadStrategy for WideText {
120    fn buffer_desc(&self) -> BufferDesc {
121        BufferDesc::WText {
122            max_str_len: self.max_str_len,
123        }
124    }
125
126    fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
127        let view = column_view.as_w_text_view().unwrap();
128        let item_capacity = view.len();
129        // Any utf-16 character could take up to 4 Bytes if represented as utf-8, but since mostly
130        // this is 1 to one, and also not every string is likeyl to use its maximum capacity, we
131        // rather accept the reallocation in these scenarios.
132        let data_capacity = self.max_str_len * item_capacity;
133        let mut builder = StringBuilder::with_capacity(item_capacity, data_capacity);
134
135        let mut converter = Utf16ToUtf8Converter::new();
136        for value in view.iter() {
137            let opt = if let Some(utf16) = value {
138                let slice = converter.utf16_to_utf8(utf16.as_slice());
139                let slice = if self.trim { slice.trim() } else { slice };
140                Some(slice)
141            } else {
142                None
143            };
144            builder.append_option(opt);
145        }
146        Ok(Arc::new(builder.finish()))
147    }
148}
149
150struct Utf16ToUtf8Converter {
151    // Buffer used to convert individual values from utf16 to utf8.
152    buf_utf8: String,
153}
154
155impl Utf16ToUtf8Converter {
156    fn new() -> Self {
157        Self {
158            buf_utf8: String::new(),
159        }
160    }
161
162    fn utf16_to_utf8(&mut self, utf16: &[u16]) -> &str {
163        let max_utf8_len = utf16.len() * 3;
164        // Clearing is necessary. Otherwise we may slice the stream in between two bytes of
165        // a multi-byte character which would violate the invariant of String and cause a
166        // panic.
167        self.buf_utf8.clear();
168        for _ in 0..max_utf8_len {
169            self.buf_utf8.push('\0');
170        }
171        let written = convert_utf16_to_str(utf16, &mut self.buf_utf8);
172        &self.buf_utf8[..written]
173    }
174}
175
/// Read strategy which requests text from the database as narrow characters and expects the
/// payload to be valid UTF-8.
pub struct NarrowText {
    /// Upper bound for the length of a single value in bytes. The terminating zero is not
    /// counted.
    max_str_len: usize,
    /// Whether whitespace surrounding each value is removed.
    trim: bool,
}

impl NarrowText {
    /// Creates a strategy for values of at most `max_str_len` bytes (excluding the terminating
    /// zero). Values are trimmed if `trim` is `true`.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        NarrowText { max_str_len, trim }
    }
}
188
189impl ReadStrategy for NarrowText {
190    fn buffer_desc(&self) -> BufferDesc {
191        BufferDesc::Text {
192            max_str_len: self.max_str_len,
193        }
194    }
195
196    fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
197        let view = column_view.as_text_view().unwrap();
198        let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
199        for value in view.iter() {
200            builder.append_option(
201                value
202                    .map(|bytes| {
203                        let untrimmed = simdutf8::basic::from_utf8(bytes).map_err(|_| {
204                            MappingError::InvalidUtf8 {
205                                lossy_value: String::from_utf8_lossy(bytes).into_owned(),
206                            }
207                        })?;
208                        Ok(if self.trim {
209                            untrimmed.trim()
210                        } else {
211                            untrimmed
212                        })
213                    })
214                    .transpose()?,
215            );
216        }
217        Ok(Arc::new(builder.finish()))
218    }
219}
220
#[cfg(test)]
mod tests {
    use odbc_api::buffers::{AnySlice, ColumnBuffer, TextColumn};

    use crate::reader::{MappingError, ReadStrategy as _, text::Utf16ToUtf8Converter};

    use super::NarrowText;

    /// When querying under Windows (implying UTF-16 encoding), a column with the value
    /// "Colt Telecom España S.A." was followed by a value whose maximum length is 18 bytes in
    /// UTF-8 encoding. This led to a panic due to a violation of the `String` invariant, because
    /// the reused buffer would have been split in the middle of the UTF-8 character "ñ" (which
    /// takes 2 bytes in UTF-8).
    ///
    /// See: <https://github.com/pacman82/arrow-odbc/issues/177>
    ///
    /// And also a similar issue in odbc2parquet:
    /// <https://github.com/pacman82/odbc2parquet/issues/862>
    #[test]
    fn do_not_split_buffer_accross_char_boundaries() {
        // Given a string with a multi-byte character at position 18 when encoded in UTF-8.
        let with_multibyte: Vec<u16> = "Colt Telecom España S.A.".encode_utf16().collect();
        // And a string value with 6 characters. The converter reserves 3 bytes for each UTF-16
        // code unit. 3 x 6 = 18, so the multi-byte character of the previous string would be
        // split if the same buffer contents were reused for both strings.
        let six_chars: Vec<u16> = "123456".encode_utf16().collect();

        // When converting both values in succession to UTF-8
        let mut converter = Utf16ToUtf8Converter::new();
        let first = converter.utf16_to_utf8(&with_multibyte).to_owned();
        let second = converter.utf16_to_utf8(&six_chars);

        // Then both strings are correct and no panic occurred (implied by reaching this line).
        assert_eq!(first, "Colt Telecom España S.A.");
        assert_eq!(second, "123456");
    }

    #[test]
    fn must_return_error_for_invalid_utf8() {
        // Given a column view with a single value which is not valid UTF-8: `0xc3` starts a two
        // byte sequence, but no continuation byte follows.
        let mut column = TextColumn::new(1, 10);
        column.set_value(0, Some(&b"Hello\xc3"[..]));
        let column_view = AnySlice::Text(column.view(1));

        // When filling an arrow array from it
        let strategy = NarrowText::new(5, false);
        let result = strategy.fill_arrow_array(column_view);

        // Then an error containing the lossy representation of the value is reported
        match result.unwrap_err() {
            MappingError::InvalidUtf8 { lossy_value } => assert_eq!(lossy_value, "Hello�"),
            _ => panic!("Not an InvalidUtf8 error"),
        }
    }
}