reifydb_engine/function/blob/
utf8.rs

1// Copyright (c) reifydb.com 2025
2// This file is licensed under the AGPL-3.0-or-later, see license.md file
3
4use reifydb_core::value::column::ColumnData;
5use reifydb_type::{OwnedFragment, value::Blob};
6
7use crate::function::{ScalarFunction, ScalarFunctionContext};
8
/// Scalar function that turns a UTF-8 text column into a blob column.
///
/// The type carries no state; all behaviour lives in its `ScalarFunction`
/// implementation.
#[derive(Debug, Default)]
pub struct BlobUtf8;

impl BlobUtf8 {
	/// Creates the function object. Equivalent to `BlobUtf8::default()`.
	pub fn new() -> Self {
		Self
	}
}
16
17impl ScalarFunction for BlobUtf8 {
18	fn scalar(&self, ctx: ScalarFunctionContext) -> crate::Result<ColumnData> {
19		let columns = ctx.columns;
20		let row_count = ctx.row_count;
21
22		if columns.is_empty() {
23			return Ok(ColumnData::blob([]));
24		}
25
26		let column = columns.get(0).unwrap();
27
28		match &column.data() {
29			ColumnData::Utf8 {
30				container,
31				..
32			} => {
33				let mut result_data = Vec::with_capacity(container.data().len());
34
35				for i in 0..row_count {
36					if container.is_defined(i) {
37						let utf8_str = &container[i];
38						let blob = Blob::from_utf8(OwnedFragment::internal(utf8_str));
39						result_data.push(blob);
40					} else {
41						result_data.push(Blob::empty())
42					}
43				}
44
45				Ok(ColumnData::blob_with_bitvec(result_data, container.bitvec().clone()))
46			}
47			_ => unimplemented!("BlobUtf8 only supports text input"),
48		}
49	}
50}
51
#[cfg(test)]
mod tests {
	use reifydb_core::value::{
		column::{Column, Columns},
		container::Utf8Container,
	};
	use reifydb_type::{Fragment, value::constraint::bytes::MaxBytes};

	use super::*;

	/// Runs `BlobUtf8` over a single UTF-8 column named "input".
	///
	/// `utf8_data` supplies the row values and `bitvec` their definedness flags;
	/// the row count passed to the function is `utf8_data.len()`. Panics if the
	/// function returns an error, so callers only deal with the resulting column.
	fn run_blob_utf8(utf8_data: Vec<String>, bitvec: Vec<bool>) -> ColumnData {
		let row_count = utf8_data.len();
		let input_column = Column {
			name: Fragment::borrowed_internal("input"),
			data: ColumnData::Utf8 {
				container: Utf8Container::new(utf8_data, bitvec.into()),
				max_bytes: MaxBytes::MAX,
			},
		};
		let columns = Columns::new(vec![input_column]);
		let ctx = ScalarFunctionContext {
			columns: &columns,
			row_count,
		};

		let function = BlobUtf8::new();
		function.scalar(ctx).unwrap()
	}

	#[test]
	fn test_blob_utf8_simple_ascii() {
		let result = run_blob_utf8(vec!["Hello!".to_string()], vec![true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 1);
		assert!(container.is_defined(0));
		assert_eq!(container[0].as_bytes(), "Hello!".as_bytes());
	}

	#[test]
	fn test_blob_utf8_empty_string() {
		let result = run_blob_utf8(vec!["".to_string()], vec![true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 1);
		assert!(container.is_defined(0));
		assert_eq!(container[0].as_bytes(), &[] as &[u8]);
	}

	#[test]
	fn test_blob_utf8_unicode_characters() {
		// Test Unicode characters: emoji, accented chars, etc.
		let result = run_blob_utf8(vec!["Hello 🌍! Café naïve".to_string()], vec![true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 1);
		assert!(container.is_defined(0));
		assert_eq!(container[0].as_bytes(), "Hello 🌍! Café naïve".as_bytes());
	}

	#[test]
	fn test_blob_utf8_multibyte_characters() {
		// Test various multibyte UTF-8 characters
		let result = run_blob_utf8(vec!["日本語 中文 한국어 العربية".to_string()], vec![true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 1);
		assert!(container.is_defined(0));
		assert_eq!(container[0].as_bytes(), "日本語 中文 한국어 العربية".as_bytes());
	}

	#[test]
	fn test_blob_utf8_special_characters() {
		// Test special characters including newlines, tabs, etc.
		let result = run_blob_utf8(vec!["Line1\nLine2\tTabbed\r\nWindows".to_string()], vec![true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 1);
		assert!(container.is_defined(0));
		assert_eq!(container[0].as_bytes(), "Line1\nLine2\tTabbed\r\nWindows".as_bytes());
	}

	#[test]
	fn test_blob_utf8_multiple_rows() {
		let utf8_data = vec!["First".to_string(), "Second 🚀".to_string(), "Third café".to_string()];
		let result = run_blob_utf8(utf8_data, vec![true, true, true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 3);
		assert!(container.is_defined(0));
		assert!(container.is_defined(1));
		assert!(container.is_defined(2));

		assert_eq!(container[0].as_bytes(), "First".as_bytes());
		assert_eq!(container[1].as_bytes(), "Second 🚀".as_bytes());
		assert_eq!(container[2].as_bytes(), "Third café".as_bytes());
	}

	#[test]
	fn test_blob_utf8_with_null_data() {
		// The middle row is flagged undefined; its stored string is irrelevant.
		let utf8_data = vec!["First".to_string(), "".to_string(), "Third".to_string()];
		let result = run_blob_utf8(utf8_data, vec![true, false, true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 3);
		assert!(container.is_defined(0));
		assert!(!container.is_defined(1));
		assert!(container.is_defined(2));

		assert_eq!(container[0].as_bytes(), "First".as_bytes());
		assert_eq!(container[1].as_bytes(), [].as_slice() as &[u8]);
		assert_eq!(container[2].as_bytes(), "Third".as_bytes());
	}

	#[test]
	fn test_blob_utf8_json_data() {
		// Test JSON-like data which is common to store as UTF-8
		let utf8_data = vec![r#"{"name": "John", "age": 30, "city": "New York"}"#.to_string()];
		let result = run_blob_utf8(utf8_data, vec![true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 1);
		assert!(container.is_defined(0));
		assert_eq!(container[0].as_bytes(), r#"{"name": "John", "age": 30, "city": "New York"}"#.as_bytes());
	}

	#[test]
	fn test_blob_utf8_long_string() {
		// Test a longer string to verify no issues with size
		let long_string = "A".repeat(1000);
		let result = run_blob_utf8(vec![long_string.clone()], vec![true]);

		let ColumnData::Blob {
			container,
			..
		} = result
		else {
			panic!("Expected BLOB column data");
		};
		assert_eq!(container.len(), 1);
		assert!(container.is_defined(0));
		assert_eq!(container[0].as_bytes(), long_string.as_bytes());
		assert_eq!(container[0].as_bytes().len(), 1000);
	}
}