arrow_odbc/reader/
text.rs1use std::{cmp::min, num::NonZeroUsize, sync::Arc};
2
3use arrow::array::{ArrayRef, StringBuilder};
4use encoding_rs::mem::convert_utf16_to_str;
5use odbc_api::{
6 DataType as OdbcDataType,
7 buffers::{AnySlice, BufferDesc},
8};
9
10use super::{ColumnFailure, MappingError, ReadStrategy};
11
12pub fn choose_text_strategy(
17 sql_type: OdbcDataType,
18 lazy_display_size: impl FnOnce() -> Result<Option<NonZeroUsize>, odbc_api::Error>,
19 max_text_size: Option<usize>,
20 trim_fixed_sized_character_strings: bool,
21 text_encoding: TextEncoding,
22) -> Result<Box<dyn ReadStrategy + Send>, ColumnFailure> {
23 let apply_buffer_limit = |len| match (len, max_text_size) {
24 (None, None) => Err(ColumnFailure::ZeroSizedColumn { sql_type }),
25 (None, Some(limit)) => Ok(limit),
26 (Some(len), None) => Ok(len),
27 (Some(len), Some(limit)) => Ok(min(len, limit)),
28 };
29 let is_fixed_sized_char = matches!(
30 sql_type,
31 OdbcDataType::Char { .. } | OdbcDataType::WChar { .. }
32 );
33 let trim = trim_fixed_sized_character_strings && is_fixed_sized_char;
34 let strategy: Box<dyn ReadStrategy + Send> = if text_encoding.use_utf16() {
35 let hex_len = sql_type
36 .utf16_len()
37 .map(Ok)
38 .or_else(|| lazy_display_size().transpose())
39 .transpose()
40 .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
41 let hex_len = apply_buffer_limit(hex_len.map(NonZeroUsize::get))?;
42 wide_text_strategy(hex_len, trim)
43 } else {
44 let octet_len = sql_type
45 .utf8_len()
46 .map(Ok)
47 .or_else(|| lazy_display_size().transpose())
48 .transpose()
49 .map_err(|source| ColumnFailure::UnknownStringLength { sql_type, source })?;
50 let octet_len = apply_buffer_limit(octet_len.map(NonZeroUsize::get))?;
51 narrow_text_strategy(octet_len, trim)
55 };
56
57 Ok(strategy)
58}
59
/// Controls which encoding is used for the buffers text columns are fetched into.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TextEncoding {
    /// Let the target platform decide: UTF-16 on Windows, UTF-8 everywhere else. This is the
    /// default.
    #[default]
    Auto,
    /// Always fetch text as UTF-8 (narrow buffers).
    Utf8,
    /// Always fetch text as UTF-16 (wide buffers).
    Utf16,
}

impl TextEncoding {
    /// `true` if text should be fetched using UTF-16 encoded (wide) buffers, `false` if UTF-8
    /// encoded (narrow) buffers should be used.
    ///
    /// For [`TextEncoding::Auto`] the answer depends on the compilation target and is `true` only
    /// for Windows.
    pub fn use_utf16(&self) -> bool {
        match self {
            Self::Auto => cfg!(target_os = "windows"),
            Self::Utf8 => false,
            Self::Utf16 => true,
        }
    }
}
94
95fn wide_text_strategy(u16_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
96 Box::new(WideText::new(u16_len, trim))
97}
98
99fn narrow_text_strategy(octet_len: usize, trim: bool) -> Box<dyn ReadStrategy + Send> {
100 Box::new(NarrowText::new(octet_len, trim))
101}
102
/// Read strategy for text columns which are fetched through a wide character buffer.
pub struct WideText {
    /// Maximum string length reported to the driver as part of the buffer description.
    max_str_len: usize,
    /// Whether whitespace surrounding each value is removed before it is appended to the array.
    trim: bool,
}

impl WideText {
    /// Creates a strategy with the given maximum string length and trim setting.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        WideText { max_str_len, trim }
    }
}
118
119impl ReadStrategy for WideText {
120 fn buffer_desc(&self) -> BufferDesc {
121 BufferDesc::WText {
122 max_str_len: self.max_str_len,
123 }
124 }
125
126 fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
127 let view = column_view.as_w_text_view().unwrap();
128 let item_capacity = view.len();
129 let data_capacity = self.max_str_len * item_capacity;
133 let mut builder = StringBuilder::with_capacity(item_capacity, data_capacity);
134
135 let mut converter = Utf16ToUtf8Converter::new();
136 for value in view.iter() {
137 let opt = if let Some(utf16) = value {
138 let slice = converter.utf16_to_utf8(utf16.as_slice());
139 let slice = if self.trim { slice.trim() } else { slice };
140 Some(slice)
141 } else {
142 None
143 };
144 builder.append_option(opt);
145 }
146 Ok(Arc::new(builder.finish()))
147 }
148}
149
150struct Utf16ToUtf8Converter {
151 buf_utf8: String,
153}
154
155impl Utf16ToUtf8Converter {
156 fn new() -> Self {
157 Self {
158 buf_utf8: String::new(),
159 }
160 }
161
162 fn utf16_to_utf8(&mut self, utf16: &[u16]) -> &str {
163 let max_utf8_len = utf16.len() * 3;
164 if max_utf8_len > self.buf_utf8.len() {
166 let additional = max_utf8_len - self.buf_utf8.len();
167 self.buf_utf8.reserve(additional);
168 for _ in 0..additional {
169 self.buf_utf8.push('\0');
170 }
171 }
172 let written = convert_utf16_to_str(utf16, &mut self.buf_utf8[..max_utf8_len]);
173 &self.buf_utf8[..written]
174 }
175}
176
/// Read strategy for text columns which are fetched through a narrow (byte) text buffer.
pub struct NarrowText {
    /// Maximum string length reported to the driver as part of the buffer description.
    max_str_len: usize,
    /// Whether whitespace surrounding each value is removed before it is appended to the array.
    trim: bool,
}

impl NarrowText {
    /// Creates a strategy with the given maximum string length and trim setting.
    pub fn new(max_str_len: usize, trim: bool) -> Self {
        NarrowText { max_str_len, trim }
    }
}
189
190impl ReadStrategy for NarrowText {
191 fn buffer_desc(&self) -> BufferDesc {
192 BufferDesc::Text {
193 max_str_len: self.max_str_len,
194 }
195 }
196
197 fn fill_arrow_array(&self, column_view: AnySlice) -> Result<ArrayRef, MappingError> {
198 let view = column_view.as_text_view().unwrap();
199 let mut builder = StringBuilder::with_capacity(view.len(), self.max_str_len * view.len());
200 for value in view.iter() {
201 builder.append_option(
202 value
203 .map(|bytes| {
204 let untrimmed = simdutf8::basic::from_utf8(bytes).map_err(|_| {
205 MappingError::InvalidUtf8 {
206 lossy_value: String::from_utf8_lossy(bytes).into_owned(),
207 }
208 })?;
209 Ok(if self.trim {
210 untrimmed.trim()
211 } else {
212 untrimmed
213 })
214 })
215 .transpose()?,
216 );
217 }
218 Ok(Arc::new(builder.finish()))
219 }
220}
221
#[cfg(test)]
mod tests {
    use odbc_api::buffers::{AnySlice, ColumnBuffer, TextColumn};

    use crate::reader::{MappingError, ReadStrategy as _};

    use super::NarrowText;

    /// A narrow text value which is not valid UTF-8 must be reported as an error carrying the
    /// lossy conversion of the value.
    #[test]
    fn must_return_error_for_invalid_utf8() {
        // Given a column with a single value ending in `0xc3`, the lead byte of a two byte
        // sequence which is never completed.
        let mut column = TextColumn::new(1, 10);
        let invalid_utf8: &[u8] = b"Hello\xc3";
        column.set_value(0, Some(invalid_utf8));
        let column_view = AnySlice::Text(column.view(1));

        // When filling an arrow array from it
        let strategy = NarrowText::new(5, false);
        let error = strategy.fill_arrow_array(column_view).unwrap_err();

        // Then the error contains the value with the invalid byte replaced
        match error {
            MappingError::InvalidUtf8 { lossy_value } => assert_eq!(lossy_value, "Hello�"),
            _ => panic!("Not an InvalidUtf8 error"),
        }
    }
}