arrow_odbc/reader/
text.rs1use std::{cmp::min, num::NonZeroUsize, sync::Arc};
2
3use arrow::array::{ArrayRef, StringBuilder};
4use encoding_rs::mem::convert_utf16_to_str;
5use odbc_api::{
6 DataType as OdbcDataType,
7 buffers::{AnyColumnBufferSlice, BufferDesc},
8};
9
10use super::{ColumnFailure, MappingError, ReadStrategy};
11
12pub fn choose_text_strategy(
17 sql_type: OdbcDataType,
18 lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
19 max_text_size: Option<usize>,
20 trim_fixed_sized_character_strings: bool,
21 text_encoding: TextEncoding,
22) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
23 let apply_buffer_limit = |len| match (len, max_text_size) {
24 (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
25 (None, Some(limit)) => Ok(limit),
26 (Some(len), None) => Ok(len),
27 (Some(len), Some(limit)) => Ok(min(len, limit)),
28 };
29 let is_fixed_sized_char = matches!(
30 sql_type,
31 OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
32 );
33 let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
34 let strategy: Box<dyn ReadStrategy + Send> = if text_encoding.use_utf16() {
35 let hex_len = sql_type
36 .utf16_len()
37 .map(Ok)
38 .or_else(|| lazy_display_size().transpose())
39 .transpose()
40 .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
41 let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
42 wide_text_strategy(hex_len, trim)
43 } else {
44 let octet_len = sql_type
45 .utf8_len()
46 .map(Ok)
47 .or_else(|| lazy_display_size().transpose())
48 .transpose()
49 .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
50 let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?;
51 narrow_text_strategy(octet_len, trim)
55 };
56
57 Ok(strategy)
58}
59
/// Controls which character encoding is used for the buffers text columns are fetched into.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TextEncoding {
    /// Pick the encoding based on the platform: UTF-16 if compiled for Windows, UTF-8 for every
    /// other target. This is the default.
    #[default]
    Auto,
    /// Always fetch text into narrow buffers (see [`TextEncoding::use_utf16`] returning `false`).
    Utf8,
    /// Always fetch text into wide (UTF-16) buffers.
    Utf16,
}

impl TextEncoding {
    /// `true` if text should be fetched as UTF-16, `false` if narrow buffers should be used.
    ///
    /// For [`TextEncoding::Auto`] the answer is fixed at compile time by the target operating
    /// system.
    pub fn use_utf16(&self) -> bool {
        match self {
            Self::Auto => cfg!(target_os = "windows"),
            Self::Utf8 => false,
            Self::Utf16 => true,
        }
    }
}
94
95fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
96 Box::new(WideText::new(u16_len, trim))
97}
98
99fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
100 Box::new(NarrowText::new(octet_len, trim))
101}
102
/// Strategy for fetching text through a wide character buffer and converting it into an Arrow
/// string array.
pub struct WideText {
    /// Maximum string length passed on to the wide text buffer description.
    max_str_len: usize,
    /// If `true`, every fetched value is trimmed before it is appended to the Arrow array.
    trim: bool,
}

impl WideText {
    /// Creates a strategy binding buffers with room for `max_str_len` per element. `trim`
    /// controls whether whitespace is stripped from the fetched values.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        WideText { max_str_len, trim }
    }
}
118
119impl ReadStrategy for WideText {
120 fn buffer_desc(&self) -> BufferDesc {
121 BufferDesc::WText {
122 max_str_len: self.max_str_len,
123 }
124 }
125
126 fn fill_arrow_array(
127 &self,
128 column_view: AnyColumnBufferSlice,
129 ) -> Result<ArrayRef, MappingError> {
130 let view = column_view.as_wide_text().unwrap();
131 let item_capacity = view.len();
132 let data_capacity = self.max_str_len * item_capacity;
136 let mut builder = StringBuilder::with_capacity(item_capacity, data_capacity);
137
138 let mut converter = Utf16ToUtf8Converter::new();
139 for value in view.iter() {
140 let opt = if let Some(utf16) = value {
141 let slice = converter.utf16_to_utf8(utf16.as_slice());
142 let slice = if self.trim { slice.trim() } else { slice };
143 Some(slice)
144 } else {
145 None
146 };
147 builder.append_option(opt);
148 }
149 Ok(Arc::new(builder.finish()))
150 }
151}
152
153struct Utf16ToUtf8Converter {
154 buf_utf8: String,
156}
157
158impl Utf16ToUtf8Converter {
159 fn new() -> Self {
160 Self {
161 buf_utf8: String::new(),
162 }
163 }
164
165 fn utf16_to_utf8(&mut self, utf16: &[u16]) -> &str {
166 let max_utf8_len = utf16.len() * 3;
167 self.buf_utf8.clear();
171 for _ in 0..max_utf8_len {
172 self.buf_utf8.push('\0');
173 }
174 let written = convert_utf16_to_str(utf16, &mut self.buf_utf8);
175 &self.buf_utf8[..written]
176 }
177}
178
/// Strategy for fetching text through a narrow (byte based) buffer and converting it into an
/// Arrow string array.
pub struct NarrowText {
    /// Maximum string length passed on to the narrow text buffer description.
    max_str_len: usize,
    /// If `true`, every fetched value is trimmed before it is appended to the Arrow array.
    trim: bool,
}

impl NarrowText {
    /// Creates a strategy binding buffers with room for `max_str_len` per element. `trim`
    /// controls whether whitespace is stripped from the fetched values.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        NarrowText { max_str_len, trim }
    }
}
191
192impl ReadStrategy for NarrowText {
193 fn buffer_desc(&self) -> BufferDesc {
194 BufferDesc::Text {
195 max_str_len: self.max_str_len,
196 }
197 }
198
199 fn fill_arrow_array(
200 &self,
201 column_view: AnyColumnBufferSlice,
202 ) -> Result<ArrayRef, MappingError> {
203 let view = column_view.as_text().unwrap();
204 let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
205 for value in view.iter() {
206 builder.append_option(
207 value
208 .map(|bytes| {
209 let untrimmed = simdutf8::basic::from_utf8(bytes).map_err(|_| {
210 MappingError::InvalidUtf8 {
211 lossy_value: String::from_utf8_lossy(bytes).into_owned(),
212 }
213 })?;
214 Ok(if self.trim {
215 untrimmed.trim()
216 } else {
217 untrimmed
218 })
219 })
220 .transpose()?,
221 );
222 }
223 Ok(Arc::new(builder.finish()))
224 }
225}
226
#[cfg(test)]
mod tests {
    use odbc_api::buffers::{AnyColumnBuffer, Slice, TextColumn};

    use crate::reader::{MappingError, ReadStrategy as _, text::Utf16ToUtf8Converter};

    use super::NarrowText;

    /// Reuses one converter for a value containing a multi-byte character followed by a shorter
    /// ASCII value. Both conversions must yield exactly their input text.
    #[test]
    fn do_not_split_buffer_accross_char_boundaries() {
        let with_multibyte: Vec<u16> = "Colt Telecom España S.A.".encode_utf16().collect();
        let ascii_only: Vec<u16> = "123456".encode_utf16().collect();

        let mut converter = Utf16ToUtf8Converter::new();
        // Copy the first result, since the second conversion overwrites the shared buffer.
        let first = converter.utf16_to_utf8(&with_multibyte).to_owned();
        let second = converter.utf16_to_utf8(&ascii_only);

        assert_eq!(first, "Colt Telecom España S.A.");
        assert_eq!(second, "123456");
    }

    /// A value ending in an incomplete UTF-8 sequence must be reported as
    /// `MappingError::InvalidUtf8`, carrying the lossy representation of the value.
    #[test]
    fn must_return_error_for_invalid_utf8() {
        // Given a column holding a single value, which ends with a dangling lead byte
        let mut text_column = TextColumn::new(1, 10);
        text_column.set_value(0, Some(b"Hello\xc3"));
        let boxed: Box<dyn AnyColumnBuffer> = Box::new(text_column);
        let column_view = boxed.slice(1);

        // When converting it into an arrow array
        let strategy = NarrowText::new(5, false);
        let error = strategy.fill_arrow_array(column_view).unwrap_err();

        // Then the error contains the lossy value
        let MappingError::InvalidUtf8 { lossy_value } = error else {
            panic!("Not an InvalidUtf8 error")
        };
        assert_eq!(lossy_value, "Hello�");
    }
}