Skip to main content

arrow_odbc/reader/
text.rs

1use std::{cmp::min, num::NonZeroUsize, sync::Arc};
2
3use arrow::array::{ArrayRef, StringBuilder};
4use encoding_rs::mem::convert_utf16_to_str;
5use odbc_api::{
6    DataType as OdbcDataType,
7    buffers::{AnyColumnBufferSlice, BufferDesc},
8};
9
10use super::{ColumnFailure, MappingError, ReadStrategy};
11
12/// This function decides wether this column will be queried as narrow (assumed to be utf-8) or
13/// wide text (assumed to be utf-16). The reason we do not always use narrow is that the encoding
14/// dependends on the system locals which is usually not UTF-8 on windows systems. Furthermore we
15/// are trying to adapt the buffer size to the maximum string length the column could contain.
16pub fn choose_text_strategy(
17    sql_type: OdbcDataType,
18    lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
19    max_text_size: Option<usize>,
20    trim_fixed_sized_character_strings: bool,
21    text_encoding: TextEncoding,
22) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
23    let apply_buffer_limit = |len| match (len, max_text_size) {
24        (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
25        (None, Some(limit)) => Ok(limit),
26        (Some(len), None) => Ok(len),
27        (Some(len), Some(limit)) => Ok(min(len, limit)),
28    };
29    let is_fixed_sized_char = matches!(
30        sql_type,
31        OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
32    );
33    let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
34    let strategy: Box<dyn ReadStrategy + Send> = if text_encoding.use_utf16() {
35        let hex_len = sql_type
36            .utf16_len()
37            .map(Ok)
38            .or_else(|| lazy_display_size().transpose())
39            .transpose()
40            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
41        let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
42        wide_text_strategy(hex_len, trim)
43    } else {
44        let octet_len = sql_type
45            .utf8_len()
46            .map(Ok)
47            .or_else(|| lazy_display_size().transpose())
48            .transpose()
49            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
50        let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?;
51        // So far only Linux users seemed to have complained about panics due to garbage indices?
52        // Linux usually would use UTF-8, so we only invest work in working around this for narrow
53        // strategies
54        narrow_text_strategy(octet_len, trim)
55    };
56
57    Ok(strategy)
58}
59
/// Used to indicate the preferred encoding for text columns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TextEncoding {
    /// Evaluates to [`Self::Utf16`] on windows and [`Self::Utf8`] on other systems. We do this,
    /// because most systems e.g. MacOs and Linux use UTF-8 as their default encoding, while windows
    /// may still use a Latin1 or some other extended ASCII as their narrow encoding. On the other
    /// hand many Posix drivers are lacking in their support for wide function calls and UTF-16. So
    /// using [`Self::Utf16`] on windows and [`Self::Utf8`] everywhere else is a good starting
    /// point.
    ///
    /// This is the default.
    #[default]
    Auto,
    /// Use narrow characters (one byte) to encode text in payloads. ODBC lets the client choose the
    /// encoding which should be based on the system locale. This is often not what is actually
    /// happening though. If we use narrow encoding, we assume the text to be UTF-8 and error if we
    /// find that not to be the case.
    Utf8,
    /// Use wide characters (two bytes) to encode text in payloads. ODBC defines the encoding to
    /// be always UTF-16.
    Utf16,
}

impl TextEncoding {
    /// Returns `true` if text should be requested as UTF-16 and `false` if it should be requested
    /// as narrow text.
    ///
    /// [`Self::Auto`] is resolved at compile time: it yields `true` only if the target operating
    /// system is windows.
    pub fn use_utf16(&self) -> bool {
        match self {
            Self::Auto => cfg!(target_os = "windows"),
            Self::Utf8 => false,
            Self::Utf16 => true,
        }
    }
}
94
95fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
96    Box::new(WideText::new(u16_len, trim))
97}
98
99fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
100    Box::new(NarrowText::new(octet_len, trim))
101}
102
/// Strategy requesting the text from the database as UTF-16 (wide characters) and emitting it as
/// UTF-8. We use it, since the narrow representation in ODBC is not always guaranteed to be UTF-8,
/// but depends on the locale instead.
pub struct WideText {
    /// Upper bound for the length of a single value, counted in `u16` and not including the
    /// terminating zero.
    max_str_len: usize,
    /// Whether values are trimmed before they are emitted.
    trim: bool,
}

impl WideText {
    /// Creates a strategy for values of at most `max_str_len` UTF-16 code units. `trim` controls
    /// whether the values are trimmed.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        WideText { max_str_len, trim }
    }
}
118
119impl ReadStrategy for WideText {
120    fn buffer_desc(&self) -> BufferDesc {
121        BufferDesc::WText {
122            max_str_len: self.max_str_len,
123        }
124    }
125
126    fn fill_arrow_array(
127        &self,
128        column_view: AnyColumnBufferSlice,
129    ) -> Result<ArrayRef, MappingError> {
130        let view = column_view.as_wide_text().unwrap();
131        let item_capacity = view.len();
132        // Any utf-16 character could take up to 4 Bytes if represented as utf-8, but since mostly
133        // this is 1 to one, and also not every string is likeyl to use its maximum capacity, we
134        // rather accept the reallocation in these scenarios.
135        let data_capacity = self.max_str_len * item_capacity;
136        let mut builder = StringBuilder::with_capacity(item_capacity, data_capacity);
137
138        let mut converter = Utf16ToUtf8Converter::new();
139        for value in view.iter() {
140            let opt = if let Some(utf16) = value {
141                let slice = converter.utf16_to_utf8(utf16.as_slice());
142                let slice = if self.trim { slice.trim() } else { slice };
143                Some(slice)
144            } else {
145                None
146            };
147            builder.append_option(opt);
148        }
149        Ok(Arc::new(builder.finish()))
150    }
151}
152
153struct Utf16ToUtf8Converter {
154    // Buffer used to convert individual values from utf16 to utf8.
155    buf_utf8: String,
156}
157
158impl Utf16ToUtf8Converter {
159    fn new() -> Self {
160        Self {
161            buf_utf8: String::new(),
162        }
163    }
164
165    fn utf16_to_utf8(&mut self, utf16: &[u16]) -> &str {
166        let max_utf8_len = utf16.len() * 3;
167        // Clearing is necessary. Otherwise we may slice the stream in between two bytes of
168        // a multi-byte character which would violate the invariant of String and cause a
169        // panic.
170        self.buf_utf8.clear();
171        for _ in 0..max_utf8_len {
172            self.buf_utf8.push('\0');
173        }
174        let written = convert_utf16_to_str(utf16, &mut self.buf_utf8);
175        &self.buf_utf8[..written]
176    }
177}
178
/// Strategy requesting the text from the database as narrow text.
pub struct NarrowText {
    /// Upper bound for the length of a single value, counted in `u8` and not including the
    /// terminating zero.
    max_str_len: usize,
    /// Whether values are trimmed before they are emitted.
    trim: bool,
}

impl NarrowText {
    /// Creates a strategy for values of at most `max_str_len` bytes. `trim` controls whether the
    /// values are trimmed.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        NarrowText { max_str_len, trim }
    }
}
191
192impl ReadStrategy for NarrowText {
193    fn buffer_desc(&self) -> BufferDesc {
194        BufferDesc::Text {
195            max_str_len: self.max_str_len,
196        }
197    }
198
199    fn fill_arrow_array(
200        &self,
201        column_view: AnyColumnBufferSlice,
202    ) -> Result<ArrayRef, MappingError> {
203        let view = column_view.as_text().unwrap();
204        let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
205        for value in view.iter() {
206            builder.append_option(
207                value
208                    .map(|bytes| {
209                        let untrimmed = simdutf8::basic::from_utf8(bytes).map_err(|_| {
210                            MappingError::InvalidUtf8 {
211                                lossy_value: String::from_utf8_lossy(bytes).into_owned(),
212                            }
213                        })?;
214                        Ok(if self.trim {
215                            untrimmed.trim()
216                        } else {
217                            untrimmed
218                        })
219                    })
220                    .transpose()?,
221            );
222        }
223        Ok(Arc::new(builder.finish()))
224    }
225}
226
#[cfg(test)]
mod tests {
    use odbc_api::buffers::{AnyColumnBuffer, Slice, TextColumn};

    use crate::reader::{MappingError, ReadStrategy as _, text::Utf16ToUtf8Converter};

    use super::NarrowText;

    /// When querying under windows (implying UTF-16 encoding), a column with the value
    /// "Colt Telecom España S.A." was followed by a column with a maximum length of 18 bytes in
    /// UTF-8 encoding. This led to a panic due to a violation of the String invariant, because
    /// that would split the reused buffer in the middle of the UTF-8 character "ñ" (which is 2
    /// bytes in UTF-8).
    ///
    /// See: <https://github.com/pacman82/arrow-odbc/issues/177>
    ///
    /// And also a similar issue in odbc2parquet:
    /// <https://github.com/pacman82/odbc2parquet/issues/862>
    #[test]
    fn do_not_split_buffer_accross_char_boundaries() {
        // Given a string with a multibyte character at position 18 when encoded in UTF-8.
        let utf_16_with_multibyte = "Colt Telecom España S.A."
            .encode_utf16()
            .collect::<Vec<u16>>();
        // And a string value with 6 characters. The converter reserves 3 bytes per UTF-16 code
        // unit. 3 x 6 = 18, so this would cause the multibyte character in the previous string to
        // be split if we reuse the same buffer for both strings.
        let six = "123456".encode_utf16().collect::<Vec<u16>>();

        // When converting both values in succession to UTF-8
        let mut converter = Utf16ToUtf8Converter::new();
        let first = converter.utf16_to_utf8(&utf_16_with_multibyte).to_owned();
        let second = converter.utf16_to_utf8(&six);

        // Then both strings should be correct and no panic occurred (implied by reaching this
        // line).
        assert_eq!(first, "Colt Telecom España S.A.");
        assert_eq!(second, "123456");
    }

    /// Invalid UTF-8 in a narrow text column must surface as [`MappingError::InvalidUtf8`],
    /// carrying a lossy conversion of the offending value.
    #[test]
    fn must_return_error_for_invalid_utf8() {
        // Given a slice with invalid utf-8. `0xc3` starts a two byte sequence, which is never
        // completed.
        let mut column = TextColumn::new(1, 10);
        column.set_value(0, Some(&[b'H', b'e', b'l', b'l', b'o', 0xc3]));
        let column: Box<dyn AnyColumnBuffer> = Box::new(column);
        let column_view = column.slice(1);

        // When filling an arrow array from the column
        let strategy = NarrowText::new(5, false);
        let result = strategy.fill_arrow_array(column_view);

        // Then we get an error, in which the invalid byte is replaced by the replacement character
        let error = result.unwrap_err();
        let MappingError::InvalidUtf8 { lossy_value } = error else {
            panic!("Not an InvalidUtf8 error")
        };
        assert_eq!(lossy_value, "Hello�");
    }
}