arrow_odbc/reader/
text.rs

1use std::{char::decode_utf16, cmp::min, num::NonZeroUsize, sync::Arc};
2
3use arrow::array::{ArrayRef, StringBuilder};
4use odbc_api::{
5    DataType as OdbcDataType,
6    buffers::{AnySlice, BufferDesc},
7};
8
9use super::{ColumnFailure, MappingError, ReadStrategy};
10
11/// This function decides wether this column will be queried as narrow (assumed to be utf-8) or
12/// wide text (assumed to be utf-16). The reason we do not always use narrow is that the encoding
13/// dependends on the system locals which is usually not UTF-8 on windows systems. Furthermore we
14/// are trying to adapt the buffer size to the maximum string length the column could contain.
15pub fn choose_text_strategy(
16    sql_type: OdbcDataType,
17    lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
18    max_text_size: Option<usize>,
19    trim_fixed_sized_character_strings: bool,
20    text_encoding: TextEncoding,
21) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
22    let apply_buffer_limit = |len| match (len, max_text_size) {
23        (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
24        (None, Some(limit)) => Ok(limit),
25        (Some(len), None) => Ok(len),
26        (Some(len), Some(limit)) => Ok(min(len, limit)),
27    };
28    let is_fixed_sized_char = matches!(
29        sql_type,
30        OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
31    );
32    let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
33    let strategy: Box<dyn ReadStrategy + Send> = if text_encoding.use_utf16() {
34        let hex_len = sql_type
35            .utf16_len()
36            .map(Ok)
37            .or_else(|| lazy_display_size().transpose())
38            .transpose()
39            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
40        let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
41        wide_text_strategy(hex_len, trim)
42    } else {
43        let octet_len = sql_type
44            .utf8_len()
45            .map(Ok)
46            .or_else(|| lazy_display_size().transpose())
47            .transpose()
48            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
49        let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?;
50        // So far only Linux users seemed to have complained about panics due to garbage indices?
51        // Linux usually would use UTF-8, so we only invest work in working around this for narrow
52        // strategies
53        narrow_text_strategy(octet_len, trim)
54    };
55
56    Ok(strategy)
57}
58
/// Used to indicate the preferred encoding for text columns.
///
/// The default is [`Self::Auto`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum TextEncoding {
    /// Evaluates to [`Self::Utf16`] on Windows and [`Self::Utf8`] on other systems. We do this
    /// because most systems, e.g. macOS and Linux, use UTF-8 as their default encoding, while
    /// Windows may still use Latin1 or some other extended ASCII as its narrow encoding. On the
    /// other hand many POSIX drivers are lacking in their support for wide function calls and
    /// UTF-16. So using UTF-16 on Windows and UTF-8 everywhere else is a good starting point.
    #[default]
    Auto,
    /// Use narrow characters (one byte) to encode text in payloads. ODBC lets the client choose
    /// the encoding, which should be based on the system locale. This is often not what is
    /// actually happening though. If we use narrow encoding, we assume the text to be UTF-8 and
    /// error if we find that not to be the case.
    Utf8,
    /// Use wide characters (two bytes) to encode text in payloads. ODBC defines the encoding to
    /// be always UTF-16.
    Utf16,
}
83
84impl TextEncoding {
85    pub fn use_utf16(&self) -> bool {
86        match self {
87            Self::Auto => cfg!(target_os = "windows"),
88            Self::Utf8 => false,
89            Self::Utf16 => true,
90        }
91    }
92}
93
94fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
95    Box::new(WideText::new(u16_len, trim))
96}
97
98fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
99    Box::new(NarrowText::new(octet_len, trim))
100}
101
/// Strategy requesting the text from the database as UTF-16 (wide characters) and emitting it
/// as UTF-8. We use it, since the narrow representation in ODBC is not always guaranteed to be
/// UTF-8, but depends on the locale instead.
#[derive(Debug, Clone)]
pub struct WideText {
    /// Maximum string length in u16, excluding terminating zero
    max_str_len: usize,
    /// Whether the string should be trimmed.
    trim: bool,
}

impl WideText {
    /// Creates a new wide text strategy.
    ///
    /// * `max_str_len`: Maximum string length in u16, excluding terminating zero.
    /// * `trim`: Whether values should be trimmed before they are emitted.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        Self { max_str_len, trim }
    }
}
117
118impl ReadStrategy for WideText {
119    fn buffer_desc(&self) -> BufferDesc {
120        BufferDesc::WText {
121            max_str_len: self.max_str_len,
122        }
123    }
124
125    fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
126        let view = column_view.as_w_text_view().unwrap();
127        let item_capacity = view.len();
128        // Any utf-16 character could take up to 4 Bytes if represented as utf-8, but since mostly
129        // this is 1 to one, and also not every string is likeyl to use its maximum capacity, we
130        // rather accept the reallocation in these scenarios.
131        let data_capacity = self.max_str_len * item_capacity;
132        let mut builder = StringBuilder::with_capacity(item_capacity, data_capacity);
133        // Buffer used to convert individual values from utf16 to utf8.
134        let mut buf_utf8 = String::new();
135        for value in view.iter() {
136            buf_utf8.clear();
137            let opt = if let Some(utf16) = value {
138                for c in decode_utf16(utf16.as_slice().iter().cloned()) {
139                    buf_utf8.push(c.unwrap());
140                }
141                let slice = if self.trim {
142                    buf_utf8.trim()
143                } else {
144                    buf_utf8.as_str()
145                };
146                Some(slice)
147            } else {
148                None
149            };
150            builder.append_option(opt);
151        }
152        Ok(Arc::new(builder.finish()))
153    }
154}
155
/// Strategy requesting the text from the database as narrow characters (one byte per code unit)
/// and emitting it as UTF-8. The bytes are expected to already be valid UTF-8; values which are
/// not are reported as an error when the Arrow array is filled.
#[derive(Debug, Clone)]
pub struct NarrowText {
    /// Maximum string length in u8, excluding terminating zero
    max_str_len: usize,
    /// Whether the string should be trimmed.
    trim: bool,
}

impl NarrowText {
    /// Creates a new narrow text strategy.
    ///
    /// * `max_str_len`: Maximum string length in u8, excluding terminating zero.
    /// * `trim`: Whether values should be trimmed before they are emitted.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        Self { max_str_len, trim }
    }
}
168
169impl ReadStrategy for NarrowText {
170    fn buffer_desc(&self) -> BufferDesc {
171        BufferDesc::Text {
172            max_str_len: self.max_str_len,
173        }
174    }
175
176    fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
177        let view = column_view.as_text_view().unwrap();
178        let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
179        for value in view.iter() {
180            builder.append_option(
181                value
182                    .map(|bytes| {
183                        let untrimmed =
184                            std::str::from_utf8(bytes).map_err(|_| MappingError::InvalidUtf8 {
185                                lossy_value: String::from_utf8_lossy(bytes).into_owned(),
186                            })?;
187                        Ok(if self.trim {
188                            untrimmed.trim()
189                        } else {
190                            untrimmed
191                        })
192                    })
193                    .transpose()?,
194            );
195        }
196        Ok(Arc::new(builder.finish()))
197    }
198}
199
#[cfg(test)]
mod tests {
    use odbc_api::buffers::{AnySlice, ColumnBuffer, TextColumn};

    use crate::reader::{MappingError, ReadStrategy as _};

    use super::NarrowText;

    /// Filling an Arrow array from a narrow text buffer holding bytes which are not valid UTF-8
    /// must yield an error carrying a lossy representation of the offending value.
    #[test]
    fn must_return_error_for_invalid_utf8() {
        // Given a column buffer whose only value ends in a truncated multi byte sequence
        let invalid_utf8: &[u8] = &[b'H', b'e', b'l', b'l', b'o', 0xc3];
        let mut column = TextColumn::new(1, 10);
        column.set_value(0, Some(invalid_utf8));
        let column_view = AnySlice::Text(column.view(1));

        // When the narrow text strategy fills an arrow array from it
        let outcome = NarrowText::new(5, false).fill_arrow_array(column_view);

        // Then the invalid value is reported, with the broken sequence replaced
        match outcome.unwrap_err() {
            MappingError::InvalidUtf8 { lossy_value } => assert_eq!(lossy_value, "Hello�"),
            _ => panic!("Not an InvalidUtf8 error"),
        }
    }
}