Skip to main content

arrow_odbc/reader/
text.rs

1use std::{cmp::min, num::NonZeroUsize, sync::Arc};
2
3use arrow::array::{ArrayRef, StringBuilder};
4use encoding_rs::mem::convert_utf16_to_str;
5use odbc_api::{
6    DataType as OdbcDataType,
7    buffers::{AnySlice, BufferDesc},
8};
9
10use super::{ColumnFailure, MappingError, ReadStrategy};
11
12/// This function decides wether this column will be queried as narrow (assumed to be utf-8) or
13/// wide text (assumed to be utf-16). The reason we do not always use narrow is that the encoding
14/// dependends on the system locals which is usually not UTF-8 on windows systems. Furthermore we
15/// are trying to adapt the buffer size to the maximum string length the column could contain.
16pub fn choose_text_strategy(
17    sql_type: OdbcDataType,
18    lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
19    max_text_size: Option<usize>,
20    trim_fixed_sized_character_strings: bool,
21    text_encoding: TextEncoding,
22) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
23    let apply_buffer_limit = |len| match (len, max_text_size) {
24        (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
25        (None, Some(limit)) => Ok(limit),
26        (Some(len), None) => Ok(len),
27        (Some(len), Some(limit)) => Ok(min(len, limit)),
28    };
29    let is_fixed_sized_char = matches!(
30        sql_type,
31        OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
32    );
33    let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
34    let strategy: Box<dyn ReadStrategy + Send> = if text_encoding.use_utf16() {
35        let hex_len = sql_type
36            .utf16_len()
37            .map(Ok)
38            .or_else(|| lazy_display_size().transpose())
39            .transpose()
40            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
41        let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
42        wide_text_strategy(hex_len, trim)
43    } else {
44        let octet_len = sql_type
45            .utf8_len()
46            .map(Ok)
47            .or_else(|| lazy_display_size().transpose())
48            .transpose()
49            .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
50        let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?;
51        // So far only Linux users seemed to have complained about panics due to garbage indices?
52        // Linux usually would use UTF-8, so we only invest work in working around this for narrow
53        // strategies
54        narrow_text_strategy(octet_len, trim)
55    };
56
57    Ok(strategy)
58}
59
/// Used to indicate the preferred encoding for text columns.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum TextEncoding {
    /// Evaluates to [`Self::Utf16`] on Windows and [`Self::Utf8`] on other systems. We do this,
    /// because most systems, e.g. macOS and Linux, use UTF-8 as their default encoding, while
    /// Windows may still use Latin1 or some other extended ASCII as its narrow encoding. On the
    /// other hand many POSIX drivers are lacking in their support for wide function calls and
    /// UTF-16. So using [`Self::Utf16`] on Windows and [`Self::Utf8`] everywhere else is a good
    /// starting point.
    ///
    /// This is the default.
    #[default]
    Auto,
    /// Use narrow characters (one byte) to encode text in payloads. ODBC lets the client choose the
    /// encoding, which should be based on the system locale. This is often not what is actually
    /// happening though. If we use narrow encoding, we assume the text to be UTF-8 and error if we
    /// find that not to be the case.
    Utf8,
    /// Use wide characters (two bytes) to encode text in payloads. ODBC defines the encoding to
    /// be always UTF-16.
    Utf16,
}

impl TextEncoding {
    /// Returns `true` if text should be fetched as UTF-16 (wide characters).
    ///
    /// [`Self::Auto`] resolves at compile time: `true` if the target operating system is Windows,
    /// `false` otherwise.
    pub fn use_utf16(&self) -> bool {
        match self {
            Self::Auto => cfg!(target_os = "windows"),
            Self::Utf8 => false,
            Self::Utf16 => true,
        }
    }
}
94
95fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
96    Box::new(WideText::new(u16_len, trim))
97}
98
99fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
100    Box::new(NarrowText::new(octet_len, trim))
101}
102
/// Strategy requesting the text from the database as UTF-16 (wide characters) and emitting it as
/// UTF-8. We use it, since the narrow representation in ODBC is not always guaranteed to be UTF-8,
/// but depends on the locale instead.
pub struct WideText {
    /// Maximum string length in `u16` code units, excluding the terminating zero.
    max_str_len: usize,
    /// Whether values should be trimmed.
    trim: bool,
}

impl WideText {
    /// Creates a strategy for values of up to `max_str_len` UTF-16 code units (terminating zero
    /// not included). If `trim` is `true`, values are trimmed.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        WideText { max_str_len, trim }
    }
}
118
119impl ReadStrategy for WideText {
120    fn buffer_desc(&self) -> BufferDesc {
121        BufferDesc::WText {
122            max_str_len: self.max_str_len,
123        }
124    }
125
126    fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
127        let view = column_view.as_w_text_view().unwrap();
128        let item_capacity = view.len();
129        // Any utf-16 character could take up to 4 Bytes if represented as utf-8, but since mostly
130        // this is 1 to one, and also not every string is likeyl to use its maximum capacity, we
131        // rather accept the reallocation in these scenarios.
132        let data_capacity = self.max_str_len * item_capacity;
133        let mut builder = StringBuilder::with_capacity(item_capacity, data_capacity);
134
135        let mut converter = Utf16ToUtf8Converter::new();
136        for value in view.iter() {
137            let opt = if let Some(utf16) = value {
138                let slice = converter.utf16_to_utf8(utf16.as_slice());
139                let slice = if self.trim { slice.trim() } else { slice };
140                Some(slice)
141            } else {
142                None
143            };
144            builder.append_option(opt);
145        }
146        Ok(Arc::new(builder.finish()))
147    }
148}
149
150struct Utf16ToUtf8Converter {
151    // Buffer used to convert individual values from utf16 to utf8.
152    buf_utf8: String,
153}
154
155impl Utf16ToUtf8Converter {
156    fn new() -> Self {
157        Self {
158            buf_utf8: String::new(),
159        }
160    }
161
162    fn utf16_to_utf8(&mut self, utf16: &[u16]) -> &str {
163        let max_utf8_len = utf16.len() * 3;
164        // Pad buffer with up to the required size
165        if max_utf8_len > self.buf_utf8.len() {
166            let additional = max_utf8_len - self.buf_utf8.len();
167            self.buf_utf8.reserve(additional);
168            for _ in 0..additional {
169                self.buf_utf8.push('\0');
170            }
171        }
172        let written = convert_utf16_to_str(utf16, &mut self.buf_utf8[..max_utf8_len]);
173        &self.buf_utf8[..written]
174    }
175}
176
/// Strategy requesting the text from the database as narrow characters. The payload is expected
/// to be valid UTF-8.
pub struct NarrowText {
    /// Maximum string length in bytes, excluding the terminating zero.
    max_str_len: usize,
    /// Whether values should be trimmed.
    trim: bool,
}

impl NarrowText {
    /// Creates a strategy for values of up to `max_str_len` bytes (terminating zero not
    /// included). If `trim` is `true`, values are trimmed.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        NarrowText { max_str_len, trim }
    }
}
189
190impl ReadStrategy for NarrowText {
191    fn buffer_desc(&self) -> BufferDesc {
192        BufferDesc::Text {
193            max_str_len: self.max_str_len,
194        }
195    }
196
197    fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
198        let view = column_view.as_text_view().unwrap();
199        let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
200        for value in view.iter() {
201            builder.append_option(
202                value
203                    .map(|bytes| {
204                        let untrimmed = simdutf8::basic::from_utf8(bytes).map_err(|_| {
205                            MappingError::InvalidUtf8 {
206                                lossy_value: String::from_utf8_lossy(bytes).into_owned(),
207                            }
208                        })?;
209                        Ok(if self.trim {
210                            untrimmed.trim()
211                        } else {
212                            untrimmed
213                        })
214                    })
215                    .transpose()?,
216            );
217        }
218        Ok(Arc::new(builder.finish()))
219    }
220}
221
#[cfg(test)]
mod tests {
    use odbc_api::buffers::{AnySlice, ColumnBuffer, TextColumn};

    use crate::reader::{MappingError, ReadStrategy as _};

    use super::NarrowText;

    /// Narrow text which is not valid UTF-8 must surface as a mapping error rather than a panic
    /// or silently altered text.
    #[test]
    fn must_return_error_for_invalid_utf8() {
        // Given a column whose only value ends in the first byte of an incomplete two byte
        // UTF-8 sequence
        let invalid_utf8: &[u8] = &[b'H', b'e', b'l', b'l', b'o', 0xc3];
        let mut column = TextColumn::new(1, 10);
        column.set_value(0, Some(invalid_utf8));
        let column_view = AnySlice::Text(column.view(1));

        // When converting it into an Arrow array
        let result = NarrowText::new(5, false).fill_arrow_array(column_view);

        // Then the error reports a lossy version of the offending value
        match result.unwrap_err() {
            MappingError::InvalidUtf8 { lossy_value } => assert_eq!(lossy_value, "Hello�"),
            _ => panic!("Not an InvalidUtf8 error"),
        }
    }
}