arrow_odbc/reader/
text.rs1use std::{cmp::min, num::NonZeroUsize, sync::Arc};
2
3use arrow::array::{ArrayRef, StringBuilder};
4use encoding_rs::mem::convert_utf16_to_str;
5use odbc_api::{
6 DataType as OdbcDataType,
7 buffers::{AnySlice, BufferDesc},
8};
9
10use super::{ColumnFailure, MappingError, ReadStrategy};
11
12pub fn choose_text_strategy(
17 sql_type: OdbcDataType,
18 lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
19 max_text_size: Option<usize>,
20 trim_fixed_sized_character_strings: bool,
21 text_encoding: TextEncoding,
22) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
23 let apply_buffer_limit = |len| match (len, max_text_size) {
24 (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
25 (None, Some(limit)) => Ok(limit),
26 (Some(len), None) => Ok(len),
27 (Some(len), Some(limit)) => Ok(min(len, limit)),
28 };
29 let is_fixed_sized_char = matches!(
30 sql_type,
31 OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
32 );
33 let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
34 let strategy: Box<dyn ReadStrategy + Send> = if text_encoding.use_utf16() {
35 let hex_len = sql_type
36 .utf16_len()
37 .map(Ok)
38 .or_else(|| lazy_display_size().transpose())
39 .transpose()
40 .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
41 let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
42 wide_text_strategy(hex_len, trim)
43 } else {
44 let octet_len = sql_type
45 .utf8_len()
46 .map(Ok)
47 .or_else(|| lazy_display_size().transpose())
48 .transpose()
49 .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
50 let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?;
51 narrow_text_strategy(octet_len, trim)
55 };
56
57 Ok(strategy)
58}
59
/// Selects the encoding of the buffers used to fetch text columns from the data source.
///
/// The default is [`TextEncoding::Auto`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TextEncoding {
    /// Decide based on the compilation target: UTF-16 if the target operating system is Windows,
    /// UTF-8 otherwise.
    #[default]
    Auto,
    /// Always fetch text through narrow buffers, whose content is validated as UTF-8.
    Utf8,
    /// Always fetch text through wide buffers holding UTF-16.
    Utf16,
}
84
85impl TextEncoding {
86 pub fn use_utf16(&self) -> bool {
87 match self {
88 Self::Auto => cfg!(target_os = "windows"),
89 Self::Utf8 => false,
90 Self::Utf16 => true,
91 }
92 }
93}
94
95fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
96 Box::new(WideText::new(u16_len, trim))
97}
98
99fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
100 Box::new(NarrowText::new(octet_len, trim))
101}
102
/// Read strategy for text columns which are fetched through a wide text buffer and converted
/// from UTF-16 to UTF-8 while the Arrow array is filled.
pub struct WideText {
    /// Maximum length of a single value. Forwarded as `max_str_len` to the wide text buffer
    /// description and also used to estimate the capacity of the Arrow builder.
    max_str_len: usize,
    /// If `true`, surrounding whitespace is removed from every value before it is appended.
    trim: bool,
}

impl WideText {
    /// Creates a strategy for values of at most `max_str_len` elements. `trim` controls whether
    /// values are trimmed.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        Self { trim, max_str_len }
    }
}
118
119impl ReadStrategy for WideText {
120 fn buffer_desc(&self) -> BufferDesc {
121 BufferDesc::WText {
122 max_str_len: self.max_str_len,
123 }
124 }
125
126 fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
127 let view = column_view.as_w_text_view().unwrap();
128 let item_capacity = view.len();
129 let data_capacity = self.max_str_len * item_capacity;
133 let mut builder = StringBuilder::with_capacity(item_capacity, data_capacity);
134
135 let mut converter = Utf16ToUtf8Converter::new();
136 for value in view.iter() {
137 let opt = if let Some(utf16) = value {
138 let slice = converter.utf16_to_utf8(utf16.as_slice());
139 let slice = if self.trim { slice.trim() } else { slice };
140 Some(slice)
141 } else {
142 None
143 };
144 builder.append_option(opt);
145 }
146 Ok(Arc::new(builder.finish()))
147 }
148}
149
150struct Utf16ToUtf8Converter {
151 buf_utf8: String,
153}
154
155impl Utf16ToUtf8Converter {
156 fn new() -> Self {
157 Self {
158 buf_utf8: String::new(),
159 }
160 }
161
162 fn utf16_to_utf8(&mut self, utf16: &[u16]) -> &str {
163 let max_utf8_len = utf16.len() * 3;
164 self.buf_utf8.clear();
168 for _ in 0..max_utf8_len {
169 self.buf_utf8.push('\0');
170 }
171 let written = convert_utf16_to_str(utf16, &mut self.buf_utf8);
172 &self.buf_utf8[..written]
173 }
174}
175
/// Read strategy for text columns which are fetched through a narrow text buffer. The bytes of
/// each value are validated as UTF-8 while the Arrow array is filled.
pub struct NarrowText {
    /// Maximum length of a single value. Forwarded as `max_str_len` to the narrow text buffer
    /// description and also used to estimate the capacity of the Arrow builder.
    max_str_len: usize,
    /// If `true`, surrounding whitespace is removed from every value before it is appended.
    trim: bool,
}

impl NarrowText {
    /// Creates a strategy for values of at most `max_str_len` elements. `trim` controls whether
    /// values are trimmed.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        Self { trim, max_str_len }
    }
}
188
189impl ReadStrategy for NarrowText {
190 fn buffer_desc(&self) -> BufferDesc {
191 BufferDesc::Text {
192 max_str_len: self.max_str_len,
193 }
194 }
195
196 fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
197 let view = column_view.as_text_view().unwrap();
198 let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
199 for value in view.iter() {
200 builder.append_option(
201 value
202 .map(|bytes| {
203 let untrimmed = simdutf8::basic::from_utf8(bytes).map_err(|_| {
204 MappingError::InvalidUtf8 {
205 lossy_value: String::from_utf8_lossy(bytes).into_owned(),
206 }
207 })?;
208 Ok(if self.trim {
209 untrimmed.trim()
210 } else {
211 untrimmed
212 })
213 })
214 .transpose()?,
215 );
216 }
217 Ok(Arc::new(builder.finish()))
218 }
219}
220
#[cfg(test)]
mod tests {
    use odbc_api::buffers::{AnySlice, ColumnBuffer, TextColumn};

    use crate::reader::{MappingError, ReadStrategy as _, text::Utf16ToUtf8Converter};

    use super::NarrowText;

    /// The converter reuses its buffer between values. Converting a short ASCII value after a
    /// longer value containing a multi-byte character must yield exactly the short value.
    #[test]
    fn do_not_split_buffer_accross_char_boundaries() {
        // Given a value with a character occupying more than one byte in UTF-8 (`ñ`), followed
        // by a shorter value consisting only of ASCII characters.
        let with_multibyte: Vec<u16> = "Colt Telecom España S.A.".encode_utf16().collect();
        let ascii_digits: Vec<u16> = "123456".encode_utf16().collect();

        // When both are converted using the same converter. The first result must be copied,
        // since it borrows the buffer which is overwritten by the second conversion.
        let mut converter = Utf16ToUtf8Converter::new();
        let first_utf8 = converter.utf16_to_utf8(&with_multibyte).to_owned();
        let second_utf8 = converter.utf16_to_utf8(&ascii_digits);

        // Then both values survive the round trip unchanged.
        assert_eq!(first_utf8, "Colt Telecom España S.A.");
        assert_eq!(second_utf8, "123456");
    }

    /// Bytes which are not valid UTF-8 must result in an error rather than an Arrow array.
    #[test]
    fn must_return_error_for_invalid_utf8() {
        // Given a narrow text column with one value, ending in the first byte of an incomplete
        // two byte sequence.
        let mut column = TextColumn::new(1, 10);
        column.set_value(0, Some(b"Hello\xc3"));
        let column_view = AnySlice::Text(column.view(1));

        // When filling an Arrow array from it
        let strategy = NarrowText::new(5, false);
        let error = strategy.fill_arrow_array(column_view).unwrap_err();

        // Then the error contains the value with the invalid part replaced.
        let MappingError::InvalidUtf8 { lossy_value } = error else {
            panic!("Not an InvalidUtf8 error")
        };
        assert_eq!(lossy_value, "Hello�");
    }
}