1use crate::codec::AvroFieldBuilder;
482use crate::errors::AvroError;
483use crate::reader::header::read_header;
484use crate::schema::{
485 AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, SCHEMA_METADATA_KEY,
486 SINGLE_OBJECT_MAGIC, Schema, SchemaStore,
487};
488use arrow_array::{RecordBatch, RecordBatchReader};
489use arrow_schema::{ArrowError, SchemaRef};
490use block::BlockDecoder;
491use header::Header;
492use indexmap::IndexMap;
493use record::RecordDecoder;
494use std::io::BufRead;
495
496mod block;
497mod cursor;
498mod header;
499mod record;
500mod vlq;
501
502#[cfg(feature = "async")]
503mod async_reader;
504
505#[cfg(feature = "object_store")]
506pub use async_reader::AvroObjectReader;
507#[cfg(feature = "async")]
508pub use async_reader::{AsyncAvroFileReader, AsyncFileReader};
509
510fn is_incomplete_data(err: &AvroError) -> bool {
511 matches!(
512 err,
513 AvroError::EOF(_) | AvroError::NeedMoreData(_) | AvroError::NeedMoreDataRange(_)
514 )
515}
516
/// Streaming decoder for Avro messages framed with a single-object or
/// Confluent-style prefix (magic bytes + schema fingerprint/id).
///
/// Rows accumulate until `batch_size` is reached or the decoder is flushed.
/// A prefix carrying a different fingerprint than the active one stages a
/// decoder swap that is applied once the in-progress batch has been emitted.
#[derive(Debug)]
pub struct Decoder {
    /// Row decoder for the writer schema currently in effect.
    active_decoder: RecordDecoder,
    /// Fingerprint the active decoder was registered under, if any.
    active_fingerprint: Option<Fingerprint>,
    /// Maximum number of rows per emitted `RecordBatch`.
    batch_size: usize,
    /// Rows still accepted before the current batch is full.
    remaining_capacity: usize,
    /// Idle decoders for the other known schema fingerprints.
    cache: IndexMap<Fingerprint, RecordDecoder>,
    /// How to interpret the fingerprint bytes found in message prefixes.
    fingerprint_algorithm: FingerprintAlgorithm,
    /// Decoder swap staged by a prefix whose fingerprint differs from the
    /// active one; applied when the current batch is (or becomes) empty.
    pending_schema: Option<(Fingerprint, RecordDecoder)>,
    /// True when a prefix has been consumed but its record body has not yet
    /// been decoded (i.e. we stopped mid-message awaiting more bytes).
    awaiting_body: bool,
}
650
impl Decoder {
    /// Assembles a `Decoder` from pre-built parts.
    ///
    /// The batch starts empty: `remaining_capacity` is initialized to the
    /// full `batch_size`.
    pub(crate) fn from_parts(
        batch_size: usize,
        active_decoder: RecordDecoder,
        active_fingerprint: Option<Fingerprint>,
        cache: IndexMap<Fingerprint, RecordDecoder>,
        fingerprint_algorithm: FingerprintAlgorithm,
    ) -> Self {
        Self {
            batch_size,
            remaining_capacity: batch_size,
            active_fingerprint,
            active_decoder,
            cache,
            fingerprint_algorithm,
            pending_schema: None,
            awaiting_body: false,
        }
    }

    /// Arrow schema of the batches produced by the currently active decoder.
    ///
    /// Note this can change after a schema switch is applied.
    pub fn schema(&self) -> SchemaRef {
        self.active_decoder.schema().clone()
    }

    /// Maximum number of rows per emitted batch.
    pub fn batch_size(&self) -> usize {
        self.batch_size
    }

    /// Decodes as many complete framed messages from `data` as the remaining
    /// batch capacity allows, returning the number of bytes consumed.
    ///
    /// Returns early (without error) when `data` ends mid-message or the
    /// batch fills; call again later with the unconsumed tail.
    pub fn decode(&mut self, data: &[u8]) -> Result<usize, AvroError> {
        let mut total_consumed = 0usize;
        while total_consumed < data.len() && self.remaining_capacity > 0 {
            if self.awaiting_body {
                // A prefix was consumed earlier; decode exactly one record body.
                match self.active_decoder.decode(&data[total_consumed..], 1) {
                    Ok(n) => {
                        self.remaining_capacity -= 1;
                        total_consumed += n;
                        self.awaiting_body = false;
                        continue;
                    }
                    // Truncated body: stop and keep `awaiting_body` set so the
                    // next call resumes at the body, not at a new prefix.
                    Err(ref e) if is_incomplete_data(e) => break,
                    Err(e) => return Err(e),
                };
            }
            match self.handle_prefix(&data[total_consumed..])? {
                // Some(0): prefix (or its fingerprint) is incomplete — need more data.
                Some(0) => break, Some(n) => {
                    total_consumed += n;
                    // Only swap schemas between batches, never mid-batch.
                    self.apply_pending_schema_if_batch_empty();
                    self.awaiting_body = true;
                }
                // None: input does not start with the expected magic bytes.
                None => {
                    return Err(AvroError::ParseError(
                        "Missing magic bytes and fingerprint".to_string(),
                    ));
                }
            }
        }
        Ok(total_consumed)
    }

    /// Dispatches prefix parsing on the configured fingerprint algorithm.
    ///
    /// Returns `Ok(None)` on a magic mismatch, `Ok(Some(0))` when more bytes
    /// are needed, and `Ok(Some(n))` when `n` prefix bytes were consumed.
    fn handle_prefix(&mut self, buf: &[u8]) -> Result<Option<usize>, AvroError> {
        match self.fingerprint_algorithm {
            // Avro single-object encoding: 2-byte magic + little-endian Rabin hash.
            FingerprintAlgorithm::Rabin => {
                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
                    Fingerprint::Rabin(u64::from_le_bytes(bytes))
                })
            }
            // Confluent wire format: magic byte + big-endian schema id.
            FingerprintAlgorithm::Id => self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
                Fingerprint::Id(u32::from_be_bytes(bytes))
            }),
            FingerprintAlgorithm::Id64 => {
                self.handle_prefix_common(buf, &CONFLUENT_MAGIC, |bytes| {
                    Fingerprint::Id64(u64::from_be_bytes(bytes))
                })
            }
            #[cfg(feature = "md5")]
            FingerprintAlgorithm::MD5 => {
                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
                    Fingerprint::MD5(bytes)
                })
            }
            #[cfg(feature = "sha256")]
            FingerprintAlgorithm::SHA256 => {
                self.handle_prefix_common(buf, &SINGLE_OBJECT_MAGIC, |bytes| {
                    Fingerprint::SHA256(bytes)
                })
            }
        }
    }

    /// Shared prefix logic: verify `magic`, then read an `N`-byte fingerprint
    /// and stage any schema switch it implies.
    fn handle_prefix_common<const MAGIC_LEN: usize, const N: usize>(
        &mut self,
        buf: &[u8],
        magic: &[u8; MAGIC_LEN],
        fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
    ) -> Result<Option<usize>, AvroError> {
        // Too short to even check the magic: signal "need more data".
        if buf.len() < MAGIC_LEN {
            return Ok(Some(0));
        }
        if &buf[..MAGIC_LEN] != magic {
            return Ok(None);
        }
        let consumed_fp = self.handle_fingerprint(&buf[MAGIC_LEN..], fingerprint_from)?;
        // An incomplete fingerprint maps back to "0 bytes consumed" so the
        // caller retries the whole prefix once more data arrives.
        Ok(Some(consumed_fp.map_or(0, |n| n + MAGIC_LEN)))
    }

    /// Reads the `N`-byte fingerprint following the magic. If it differs from
    /// the active fingerprint, the matching cached decoder is staged in
    /// `pending_schema`; an unknown fingerprint is an error.
    fn handle_fingerprint<const N: usize>(
        &mut self,
        buf: &[u8],
        fingerprint_from: impl FnOnce([u8; N]) -> Fingerprint,
    ) -> Result<Option<usize>, AvroError> {
        // Fingerprint truncated: `None` means "need more data".
        let Some(fingerprint_bytes) = buf.get(..N) else {
            return Ok(None); };
        let new_fingerprint = fingerprint_from(fingerprint_bytes.try_into().unwrap());
        if self.active_fingerprint != Some(new_fingerprint) {
            let Some(new_decoder) = self.cache.shift_remove(&new_fingerprint) else {
                return Err(AvroError::ParseError(format!(
                    "Unknown fingerprint: {new_fingerprint:?}"
                )));
            };
            self.pending_schema = Some((new_fingerprint, new_decoder));
            // If rows are already buffered, zero the capacity to force the
            // caller to flush before the schema switch takes effect.
            if self.remaining_capacity < self.batch_size {
                self.remaining_capacity = 0;
            }
        }
        Ok(Some(N))
    }

    /// Applies a staged schema switch, retiring the previous active decoder
    /// back into the cache.
    fn apply_pending_schema(&mut self) {
        if let Some((new_fingerprint, new_decoder)) = self.pending_schema.take() {
            if let Some(old_fingerprint) = self.active_fingerprint.replace(new_fingerprint) {
                let old_decoder = std::mem::replace(&mut self.active_decoder, new_decoder);
                // Remove-then-insert moves the retired entry to the end of the
                // IndexMap's iteration order (most recently retired last).
                self.cache.shift_remove(&old_fingerprint);
                self.cache.insert(old_fingerprint, old_decoder);
            } else {
                // No previous fingerprint: nothing to retire.
                self.active_decoder = new_decoder;
            }
        }
    }

    /// Applies a staged schema switch only when no rows are buffered, so a
    /// batch never mixes rows from two schemas.
    fn apply_pending_schema_if_batch_empty(&mut self) {
        if self.batch_is_empty() {
            self.apply_pending_schema();
        }
    }

    /// Emits the buffered rows (if any) and resets capacity for a new batch.
    fn flush_and_reset(&mut self) -> Result<Option<RecordBatch>, AvroError> {
        if self.batch_is_empty() {
            return Ok(None);
        }
        let batch = self.active_decoder.flush()?;
        self.remaining_capacity = self.batch_size;
        Ok(Some(batch))
    }

    /// Flushes buffered rows into a `RecordBatch` (or `None` if empty), then
    /// applies any staged schema switch so the next batch uses the new schema.
    pub fn flush(&mut self) -> Result<Option<RecordBatch>, AvroError> {
        let batch = self.flush_and_reset();
        self.apply_pending_schema();
        batch
    }

    /// Number of additional rows the current batch can still accept.
    pub fn capacity(&self) -> usize {
        self.remaining_capacity
    }

    /// True when the current batch has reached `batch_size` rows.
    pub fn batch_is_full(&self) -> bool {
        self.remaining_capacity == 0
    }

    /// True when no rows are buffered in the current batch.
    pub fn batch_is_empty(&self) -> bool {
        self.remaining_capacity == self.batch_size
    }

    /// Decodes up to `count` unframed records from an OCF block, capped by the
    /// remaining batch capacity. Returns `(bytes_consumed, records_decoded)`.
    fn decode_block(&mut self, data: &[u8], count: usize) -> Result<(usize, usize), AvroError> {
        let to_decode = std::cmp::min(count, self.remaining_capacity);
        if to_decode == 0 {
            return Ok((0, 0));
        }
        let consumed = self.active_decoder.decode(data, to_decode)?;
        self.remaining_capacity -= to_decode;
        Ok((consumed, to_decode))
    }

    /// Flush helper used by the OCF `Reader` path; no schema switching is
    /// involved because OCF files have a single writer schema.
    fn flush_block(&mut self) -> Result<Option<RecordBatch>, AvroError> {
        self.flush_and_reset()
    }
}
902
/// Builder for [`Reader`] (Avro Object Container Files) and [`Decoder`]
/// (framed single-object / Confluent messages).
#[derive(Debug)]
pub struct ReaderBuilder {
    /// Maximum rows per emitted `RecordBatch` (default 1024).
    batch_size: usize,
    /// Fail instead of coercing on certain schema-resolution mismatches.
    strict_mode: bool,
    /// Decode Avro strings to `Utf8View` instead of `Utf8`.
    utf8_view: bool,
    /// Optional reader schema to resolve writer data against.
    reader_schema: Option<AvroSchema>,
    /// Optional column projection, as indices into the writer schema fields.
    projection: Option<Vec<usize>>,
    /// Writer schemas keyed by fingerprint; required for `build_decoder`.
    writer_schema_store: Option<SchemaStore>,
    /// Fingerprint of the schema the decoder should start with.
    active_fingerprint: Option<Fingerprint>,
}
975
976impl Default for ReaderBuilder {
977 fn default() -> Self {
978 Self {
979 batch_size: 1024,
980 strict_mode: false,
981 utf8_view: false,
982 reader_schema: None,
983 projection: None,
984 writer_schema_store: None,
985 active_fingerprint: None,
986 }
987 }
988}
989
impl ReaderBuilder {
    /// Creates a builder with default settings (see the [`Default`] impl).
    pub fn new() -> Self {
        Self::default()
    }

    /// Builds a `RecordDecoder` for `writer_schema`, optionally resolving it
    /// against `reader_schema`, honoring the utf8view/strict-mode flags.
    fn make_record_decoder(
        &self,
        writer_schema: &Schema,
        reader_schema: Option<&Schema>,
    ) -> Result<RecordDecoder, AvroError> {
        let mut builder = AvroFieldBuilder::new(writer_schema);
        if let Some(reader_schema) = reader_schema {
            builder = builder.with_reader_schema(reader_schema);
        }
        let root = builder
            .with_utf8view(self.utf8_view)
            .with_strict_mode(self.strict_mode)
            .build()?;
        RecordDecoder::try_new_with_options(root.data_type())
    }

    /// Like [`Self::make_record_decoder`] but takes the `AvroSchema` wrapper,
    /// parsing the reader schema first.
    fn make_record_decoder_from_schemas(
        &self,
        writer_schema: &Schema,
        reader_schema: Option<&AvroSchema>,
    ) -> Result<RecordDecoder, AvroError> {
        let reader_schema_raw = reader_schema.map(|s| s.schema()).transpose()?;
        self.make_record_decoder(writer_schema, reader_schema_raw.as_ref())
    }

    /// Constructs the [`Decoder`].
    ///
    /// With a file `header` (OCF path) the writer schema comes from the
    /// header and no fingerprint machinery is needed. Without one, a
    /// `writer_schema_store` is required and one `RecordDecoder` is prepared
    /// per registered fingerprint.
    fn make_decoder(
        &self,
        header: Option<&Header>,
        reader_schema: Option<&AvroSchema>,
    ) -> Result<Decoder, AvroError> {
        if let Some(hdr) = header {
            let writer_schema = hdr.schema()?.ok_or_else(|| {
                AvroError::ParseError("No Avro schema present in file header".into())
            })?;
            // A projection is expressed by projecting the reader schema; when
            // none was supplied, the writer schema from the header is used as
            // the base to project from.
            let projected_reader_schema = self
                .projection
                .as_deref()
                .map(|projection| {
                    let base_schema = if let Some(reader_schema) = reader_schema {
                        reader_schema.clone()
                    } else {
                        let raw = hdr.get(SCHEMA_METADATA_KEY).ok_or_else(|| {
                            AvroError::ParseError(
                                "No Avro schema present in file header".to_string(),
                            )
                        })?;
                        let json_string = std::str::from_utf8(raw)
                            .map_err(|e| {
                                AvroError::ParseError(format!(
                                    "Invalid UTF-8 in Avro schema header: {e}"
                                ))
                            })?
                            .to_string();
                        AvroSchema::new(json_string)
                    };
                    base_schema.project(projection)
                })
                .transpose()?;
            let effective_reader_schema = projected_reader_schema.as_ref().or(reader_schema);
            let record_decoder =
                self.make_record_decoder_from_schemas(&writer_schema, effective_reader_schema)?;
            // OCF files have exactly one writer schema, so the fingerprint
            // cache is empty and the algorithm value is irrelevant.
            return Ok(Decoder::from_parts(
                self.batch_size,
                record_decoder,
                None,
                IndexMap::new(),
                FingerprintAlgorithm::Rabin,
            ));
        }
        let store = self.writer_schema_store.as_ref().ok_or_else(|| {
            AvroError::ParseError("Writer schema store required for raw Avro".into())
        })?;
        let fingerprints = store.fingerprints();
        if fingerprints.is_empty() {
            return Err(AvroError::ParseError(
                "Writer schema store must contain at least one schema".into(),
            ));
        }
        // Default to the first registered fingerprint when none was chosen.
        let start_fingerprint = self
            .active_fingerprint
            .or_else(|| fingerprints.first().copied())
            .ok_or_else(|| {
                AvroError::ParseError("Could not determine initial schema fingerprint".into())
            })?;
        let projection = self.projection.as_deref();
        // Project the explicit reader schema once, up front; otherwise each
        // writer schema is projected individually in the loop below.
        let projected_reader_schema = match (projection, reader_schema) {
            (Some(projection), Some(reader_schema)) => Some(reader_schema.project(projection)?),
            _ => None,
        };
        let mut cache = IndexMap::with_capacity(fingerprints.len().saturating_sub(1));
        let mut active_decoder: Option<RecordDecoder> = None;
        for fingerprint in store.fingerprints() {
            let avro_schema = match store.lookup(&fingerprint) {
                Some(schema) => schema,
                None => {
                    return Err(AvroError::General(format!(
                        "Fingerprint {fingerprint:?} not found in schema store",
                    )));
                }
            };
            let writer_schema = avro_schema.schema()?;
            let record_decoder = match projection {
                None => self.make_record_decoder_from_schemas(&writer_schema, reader_schema)?,
                Some(projection) => {
                    if let Some(ref pruned_reader_schema) = projected_reader_schema {
                        self.make_record_decoder_from_schemas(
                            &writer_schema,
                            Some(pruned_reader_schema),
                        )?
                    } else {
                        // No reader schema: derive one by projecting this
                        // writer schema itself.
                        let derived_reader_schema = avro_schema.project(projection)?;
                        self.make_record_decoder_from_schemas(
                            &writer_schema,
                            Some(&derived_reader_schema),
                        )?
                    }
                }
            };
            if fingerprint == start_fingerprint {
                active_decoder = Some(record_decoder);
            } else {
                cache.insert(fingerprint, record_decoder);
            }
        }
        let active_decoder = active_decoder.ok_or_else(|| {
            AvroError::General(format!(
                "Initial fingerprint {start_fingerprint:?} not found in schema store"
            ))
        })?;
        Ok(Decoder::from_parts(
            self.batch_size,
            active_decoder,
            Some(start_fingerprint),
            cache,
            store.fingerprint_algorithm(),
        ))
    }

    /// Sets the maximum number of rows per emitted `RecordBatch`.
    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
        self.batch_size = batch_size;
        self
    }

    /// Decodes Avro strings to `Utf8View` instead of `Utf8` when `true`.
    pub fn with_utf8_view(mut self, utf8_view: bool) -> Self {
        self.utf8_view = utf8_view;
        self
    }

    /// Returns whether `Utf8View` output is enabled.
    pub fn use_utf8view(&self) -> bool {
        self.utf8_view
    }

    /// Enables strict schema-resolution behavior.
    pub fn with_strict_mode(mut self, strict_mode: bool) -> Self {
        self.strict_mode = strict_mode;
        self
    }

    /// Sets the reader schema to resolve writer data against.
    pub fn with_reader_schema(mut self, schema: AvroSchema) -> Self {
        self.reader_schema = Some(schema);
        self
    }

    /// Selects and orders output columns by writer-schema field index.
    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
        self.projection = Some(projection);
        self
    }

    /// Supplies the writer schema store (required by [`Self::build_decoder`]).
    pub fn with_writer_schema_store(mut self, store: SchemaStore) -> Self {
        self.writer_schema_store = Some(store);
        self
    }

    /// Chooses which registered schema the decoder starts with.
    pub fn with_active_fingerprint(mut self, fp: Fingerprint) -> Self {
        self.active_fingerprint = Some(fp);
        self
    }

    /// Builds an OCF [`Reader`]: parses the file header from `reader`, then
    /// constructs the row decoder from it.
    pub fn build<R: BufRead>(self, mut reader: R) -> Result<Reader<R>, ArrowError> {
        let header = read_header(&mut reader)?;
        let decoder = self.make_decoder(Some(&header), self.reader_schema.as_ref())?;
        Ok(Reader {
            reader,
            header,
            decoder,
            block_decoder: BlockDecoder::default(),
            block_data: Vec::new(),
            block_count: 0,
            block_cursor: 0,
            finished: false,
        })
    }

    /// Builds a push-based [`Decoder`] for framed messages; requires a writer
    /// schema store to map fingerprints to schemas.
    pub fn build_decoder(self) -> Result<Decoder, ArrowError> {
        if self.writer_schema_store.is_none() {
            return Err(ArrowError::InvalidArgumentError(
                "Building a decoder requires a writer schema store".to_string(),
            ));
        }
        self.make_decoder(None, self.reader_schema.as_ref())
            .map_err(ArrowError::from)
    }
}
1308
/// Iterator-style reader for Avro Object Container Files, yielding
/// `RecordBatch`es of up to the configured batch size.
#[derive(Debug)]
pub struct Reader<R: BufRead> {
    /// Underlying buffered input positioned just past the file header.
    reader: R,
    /// Parsed OCF header (schema, codec, metadata).
    header: Header,
    /// Row decoder built from the header (and any reader schema/projection).
    decoder: Decoder,
    /// Incremental decoder for OCF block framing.
    block_decoder: BlockDecoder,
    /// Current block's (decompressed) payload bytes.
    block_data: Vec<u8>,
    /// Records remaining in the current block.
    block_count: usize,
    /// Byte offset of the next unread record within `block_data`.
    block_cursor: usize,
    /// Set once the underlying input is exhausted.
    finished: bool,
}
1329
impl<R: BufRead> Reader<R> {
    /// Arrow schema of the batches this reader produces.
    pub fn schema(&self) -> SchemaRef {
        self.decoder.schema()
    }

    /// The Avro file header parsed when this reader was built.
    pub fn avro_header(&self) -> &Header {
        &self.header
    }

    /// Reads records until the batch fills or input ends, then flushes.
    /// Returns `Ok(None)` once all rows have been emitted.
    fn read(&mut self) -> Result<Option<RecordBatch>, AvroError> {
        'outer: while !self.finished && !self.decoder.batch_is_full() {
            // Refill `block_data` once the previous block is fully consumed.
            while self.block_cursor == self.block_data.len() {
                let buf = self.reader.fill_buf()?;
                if buf.is_empty() {
                    self.finished = true;
                    break 'outer;
                }
                let consumed = self.block_decoder.decode(buf)?;
                self.reader.consume(consumed);
                if let Some(block) = self.block_decoder.flush() {
                    // Decompress only when the header declared a codec.
                    self.block_data = if let Some(ref codec) = self.header.compression()? {
                        let decompressed: Vec<u8> = codec.decompress(&block.data)?;
                        decompressed
                    } else {
                        block.data
                    };
                    self.block_count = block.count;
                    self.block_cursor = 0;
                } else if consumed == 0 {
                    // No progress and no complete block: input is truncated.
                    return Err(AvroError::ParseError(
                        "Could not decode next Avro block from partial data".to_string(),
                    ));
                }
            }
            if self.block_cursor < self.block_data.len() {
                let (consumed, records_decoded) = self
                    .decoder
                    .decode_block(&self.block_data[self.block_cursor..], self.block_count)?;
                self.block_cursor += consumed;
                self.block_count -= records_decoded;
            }
        }
        self.decoder.flush_block()
    }
}
1386
1387impl<R: BufRead> Iterator for Reader<R> {
1388 type Item = Result<RecordBatch, ArrowError>;
1389
1390 fn next(&mut self) -> Option<Self::Item> {
1391 self.read().map_err(ArrowError::from).transpose()
1392 }
1393}
1394
1395impl<R: BufRead> RecordBatchReader for Reader<R> {
1396 fn schema(&self) -> SchemaRef {
1397 self.schema()
1398 }
1399}
1400
1401#[cfg(test)]
1402mod test {
1403 use crate::codec::AvroFieldBuilder;
1404 use crate::reader::header::HeaderDecoder;
1405 use crate::reader::record::RecordDecoder;
1406 use crate::reader::{Decoder, Reader, ReaderBuilder};
1407 use crate::schema::{
1408 AVRO_ENUM_SYMBOLS_METADATA_KEY, AVRO_NAME_METADATA_KEY, AVRO_NAMESPACE_METADATA_KEY,
1409 AvroSchema, CONFLUENT_MAGIC, Fingerprint, FingerprintAlgorithm, PrimitiveType,
1410 SINGLE_OBJECT_MAGIC, SchemaStore,
1411 };
1412 use crate::test_util::arrow_test_data;
1413 use crate::writer::AvroWriter;
1414 use arrow_array::builder::{
1415 ArrayBuilder, BooleanBuilder, Float32Builder, Int32Builder, Int64Builder, ListBuilder,
1416 MapBuilder, StringBuilder, StructBuilder,
1417 };
1418 #[cfg(feature = "snappy")]
1419 use arrow_array::builder::{Float64Builder, MapFieldNames};
1420 use arrow_array::cast::AsArray;
1421 #[cfg(not(feature = "avro_custom_types"))]
1422 use arrow_array::types::Int64Type;
1423 #[cfg(feature = "avro_custom_types")]
1424 use arrow_array::types::{
1425 DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
1426 DurationSecondType,
1427 };
1428 use arrow_array::types::{Int32Type, IntervalMonthDayNanoType};
1429 use arrow_array::*;
1430 #[cfg(feature = "snappy")]
1431 use arrow_buffer::{Buffer, NullBuffer};
1432 use arrow_buffer::{IntervalMonthDayNano, OffsetBuffer, ScalarBuffer, i256};
1433 #[cfg(feature = "avro_custom_types")]
1434 use arrow_schema::{
1435 ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, TimeUnit, UnionFields,
1436 UnionMode,
1437 };
1438 #[cfg(not(feature = "avro_custom_types"))]
1439 use arrow_schema::{
1440 ArrowError, DataType, Field, FieldRef, Fields, IntervalUnit, Schema, UnionFields, UnionMode,
1441 };
1442 use bytes::Bytes;
1443 use futures::executor::block_on;
1444 use futures::{Stream, StreamExt, TryStreamExt, stream};
1445 use serde_json::{Value, json};
1446 use std::collections::HashMap;
1447 use std::fs::File;
1448 use std::io::{BufReader, Cursor};
1449 use std::sync::Arc;
1450
    /// Test files covering each compression codec, gated on crate features.
    fn files() -> impl Iterator<Item = &'static str> {
        [
            #[cfg(feature = "snappy")]
            "avro/alltypes_plain.avro",
            #[cfg(feature = "snappy")]
            "avro/alltypes_plain.snappy.avro",
            #[cfg(feature = "zstd")]
            "avro/alltypes_plain.zstandard.avro",
            #[cfg(feature = "bzip2")]
            "avro/alltypes_plain.bzip2.avro",
            #[cfg(feature = "xz")]
            "avro/alltypes_plain.xz.avro",
        ]
        .into_iter()
    }

    /// Reads an entire OCF file and concatenates all batches into one.
    fn read_file(path: &str, batch_size: usize, utf8_view: bool) -> RecordBatch {
        let file = File::open(path).unwrap();
        let reader = ReaderBuilder::new()
            .with_batch_size(batch_size)
            .with_utf8_view(utf8_view)
            .build(BufReader::new(file))
            .unwrap();
        let schema = reader.schema();
        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
        arrow::compute::concat_batches(&schema, &batches).unwrap()
    }

    /// Builds a strict-mode reader for `path`, returning any build error.
    fn read_file_strict(
        path: &str,
        batch_size: usize,
        utf8_view: bool,
    ) -> Result<Reader<BufReader<File>>, ArrowError> {
        let file = File::open(path)?;
        ReaderBuilder::new()
            .with_batch_size(batch_size)
            .with_utf8_view(utf8_view)
            .with_strict_mode(true)
            .build(BufReader::new(file))
    }

    /// Drives `decoder` from a byte stream and yields the flushed batch.
    ///
    /// NOTE(review): only the first chunk of `input` is consumed — adequate
    /// for the single-chunk streams used in these tests.
    fn decode_stream<S: Stream<Item = Bytes> + Unpin>(
        mut decoder: Decoder,
        mut input: S,
    ) -> impl Stream<Item = Result<RecordBatch, ArrowError>> {
        async_stream::try_stream! {
            if let Some(data) = input.next().await {
                let consumed = decoder.decode(&data)?;
                if consumed < data.len() {
                    Err(ArrowError::ParseError(
                        "did not consume all bytes".to_string(),
                    ))?;
                }
            }
            if let Some(batch) = decoder.flush()? {
                yield batch
            }
        }
    }

    /// Single-field record schema `{"a": <pt>}` for the given primitive type.
    fn make_record_schema(pt: PrimitiveType) -> AvroSchema {
        let js = format!(
            r#"{{"type":"record","name":"TestRecord","fields":[{{"name":"a","type":"{}"}}]}}"#,
            pt.as_ref()
        );
        AvroSchema::new(js)
    }
1519
    /// Registers an int-field and a long-field schema in a fresh store,
    /// returning the store, both fingerprints, and both schemas.
    fn make_two_schema_store() -> (
        SchemaStore,
        Fingerprint,
        Fingerprint,
        AvroSchema,
        AvroSchema,
    ) {
        let schema_int = make_record_schema(PrimitiveType::Int);
        let schema_long = make_record_schema(PrimitiveType::Long);
        let mut store = SchemaStore::new();
        let fp_int = store
            .register(schema_int.clone())
            .expect("register int schema");
        let fp_long = store
            .register(schema_long.clone())
            .expect("register long schema");
        (store, fp_int, fp_long, schema_int, schema_long)
    }

    /// Single-object prefix: magic bytes + little-endian Rabin fingerprint.
    /// Panics on any non-Rabin fingerprint variant.
    fn make_prefix(fp: Fingerprint) -> Vec<u8> {
        match fp {
            Fingerprint::Rabin(v) => {
                // 2-byte magic + 8-byte fingerprint.
                let mut out = Vec::with_capacity(2 + 8);
                out.extend_from_slice(&SINGLE_OBJECT_MAGIC);
                out.extend_from_slice(&v.to_le_bytes());
                out
            }
            Fingerprint::Id(v) => {
                panic!("make_prefix expects a Rabin fingerprint, got ({v})");
            }
            Fingerprint::Id64(v) => {
                panic!("make_prefix expects a Rabin fingerprint, got ({v})");
            }
            #[cfg(feature = "md5")]
            Fingerprint::MD5(v) => {
                panic!("make_prefix expects a Rabin fingerprint, got ({v:?})");
            }
            #[cfg(feature = "sha256")]
            Fingerprint::SHA256(id) => {
                panic!("make_prefix expects a Rabin fingerprint, got ({id:?})");
            }
        }
    }

    /// Builds a small-batch decoder over `store` starting at fingerprint `fp`.
    fn make_decoder(store: &SchemaStore, fp: Fingerprint, reader_schema: &AvroSchema) -> Decoder {
        ReaderBuilder::new()
            .with_batch_size(8)
            .with_reader_schema(reader_schema.clone())
            .with_writer_schema_store(store.clone())
            .with_active_fingerprint(fp)
            .build_decoder()
            .expect("decoder")
    }

    /// Confluent prefix: magic byte + big-endian u32 schema id, with room
    /// reserved for `additional` payload bytes.
    fn make_id_prefix(id: u32, additional: usize) -> Vec<u8> {
        let capacity = CONFLUENT_MAGIC.len() + size_of::<u32>() + additional;
        let mut out = Vec::with_capacity(capacity);
        out.extend_from_slice(&CONFLUENT_MAGIC);
        out.extend_from_slice(&id.to_be_bytes());
        out
    }

    /// Full Confluent-framed message: id prefix + zigzag-encoded value.
    fn make_message_id(id: u32, value: i64) -> Vec<u8> {
        let encoded_value = encode_zigzag(value);
        let mut msg = make_id_prefix(id, encoded_value.len());
        msg.extend_from_slice(&encoded_value);
        msg
    }

    /// Confluent prefix variant carrying a big-endian u64 schema id.
    fn make_id64_prefix(id: u64, additional: usize) -> Vec<u8> {
        let capacity = CONFLUENT_MAGIC.len() + size_of::<u64>() + additional;
        let mut out = Vec::with_capacity(capacity);
        out.extend_from_slice(&CONFLUENT_MAGIC);
        out.extend_from_slice(&id.to_be_bytes());
        out
    }

    /// Full message with 64-bit Confluent id prefix + zigzag-encoded value.
    fn make_message_id64(id: u64, value: i64) -> Vec<u8> {
        let encoded_value = encode_zigzag(value);
        let mut msg = make_id64_prefix(id, encoded_value.len());
        msg.extend_from_slice(&encoded_value);
        msg
    }

    /// Single-field record schema `{"v": <pt>}` for the given primitive type.
    fn make_value_schema(pt: PrimitiveType) -> AvroSchema {
        let json_schema = format!(
            r#"{{"type":"record","name":"S","fields":[{{"name":"v","type":"{}"}}]}}"#,
            pt.as_ref()
        );
        AvroSchema::new(json_schema)
    }
1611
1612 fn encode_zigzag(value: i64) -> Vec<u8> {
1613 let mut n = ((value << 1) ^ (value >> 63)) as u64;
1614 let mut out = Vec::new();
1615 loop {
1616 if (n & !0x7F) == 0 {
1617 out.push(n as u8);
1618 break;
1619 } else {
1620 out.push(((n & 0x7F) | 0x80) as u8);
1621 n >>= 7;
1622 }
1623 }
1624 out
1625 }
1626
1627 fn make_message(fp: Fingerprint, value: i64) -> Vec<u8> {
1628 let mut msg = make_prefix(fp);
1629 msg.extend_from_slice(&encode_zigzag(value));
1630 msg
1631 }
1632
    /// Extracts the writer schema from an OCF file header as JSON.
    fn load_writer_schema_json(path: &str) -> Value {
        let file = File::open(path).unwrap();
        let header = super::read_header(BufReader::new(file)).unwrap();
        let schema = header.schema().unwrap().unwrap();
        serde_json::to_value(&schema).unwrap()
    }

    /// Builds a reader schema from the file's writer schema, replacing the
    /// types of the named fields (e.g. to exercise type promotions).
    fn make_reader_schema_with_promotions(
        path: &str,
        promotions: &HashMap<&str, &str>,
    ) -> AvroSchema {
        let mut root = load_writer_schema_json(path);
        assert_eq!(root["type"], "record", "writer schema must be a record");
        let fields = root
            .get_mut("fields")
            .and_then(|f| f.as_array_mut())
            .expect("record has fields");
        for f in fields.iter_mut() {
            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
                continue;
            };
            if let Some(new_ty) = promotions.get(name) {
                let ty = f.get_mut("type").expect("field has a type");
                match ty {
                    Value::String(_) => {
                        *ty = Value::String((*new_ty).to_string());
                    }
                    // For unions, promote the first non-null branch only.
                    Value::Array(arr) => {
                        for b in arr.iter_mut() {
                            match b {
                                Value::String(s) if s != "null" => {
                                    *b = Value::String((*new_ty).to_string());
                                    break;
                                }
                                Value::Object(_) => {
                                    *b = Value::String((*new_ty).to_string());
                                    break;
                                }
                                _ => {}
                            }
                        }
                    }
                    Value::Object(_) => {
                        *ty = Value::String((*new_ty).to_string());
                    }
                    _ => {}
                }
            }
        }
        AvroSchema::new(root.to_string())
    }

    /// Builds a reader schema from the file's writer schema, replacing the
    /// symbol lists of the named enum fields.
    fn make_reader_schema_with_enum_remap(
        path: &str,
        remap: &HashMap<&str, Vec<&str>>,
    ) -> AvroSchema {
        let mut root = load_writer_schema_json(path);
        assert_eq!(root["type"], "record", "writer schema must be a record");
        let fields = root
            .get_mut("fields")
            .and_then(|f| f.as_array_mut())
            .expect("record has fields");

        fn to_symbols_array(symbols: &[&str]) -> Value {
            Value::Array(symbols.iter().map(|s| Value::String((*s).into())).collect())
        }

        // Overwrites `symbols` on an enum type, whether inline or in a union.
        fn update_enum_symbols(ty: &mut Value, symbols: &Value) {
            match ty {
                Value::Object(map) => {
                    if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
                        map.insert("symbols".to_string(), symbols.clone());
                    }
                }
                Value::Array(arr) => {
                    for b in arr.iter_mut() {
                        if let Value::Object(map) = b {
                            if matches!(map.get("type"), Some(Value::String(t)) if t == "enum") {
                                map.insert("symbols".to_string(), symbols.clone());
                            }
                        }
                    }
                }
                _ => {}
            }
        }
        for f in fields.iter_mut() {
            let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
                continue;
            };
            if let Some(new_symbols) = remap.get(name) {
                let symbols_val = to_symbols_array(new_symbols);
                let ty = f.get_mut("type").expect("field has a type");
                update_enum_symbols(ty, &symbols_val);
            }
        }
        AvroSchema::new(root.to_string())
    }

    /// Reads `path` resolved against `reader_schema`, concatenated into one batch.
    fn read_alltypes_with_reader_schema(path: &str, reader_schema: AvroSchema) -> RecordBatch {
        let file = File::open(path).unwrap();
        let reader = ReaderBuilder::new()
            .with_batch_size(1024)
            .with_utf8_view(false)
            .with_reader_schema(reader_schema)
            .build(BufReader::new(file))
            .unwrap();
        let schema = reader.schema();
        let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
        arrow::compute::concat_batches(&schema, &batches).unwrap()
    }

    /// Builds a reader schema keeping only `selected` fields, in the given
    /// order; panics if a name is absent from the writer schema.
    fn make_reader_schema_with_selected_fields_in_order(
        path: &str,
        selected: &[&str],
    ) -> AvroSchema {
        let mut root = load_writer_schema_json(path);
        assert_eq!(root["type"], "record", "writer schema must be a record");
        let writer_fields = root
            .get("fields")
            .and_then(|f| f.as_array())
            .expect("record has fields");
        let mut field_map: HashMap<String, Value> = HashMap::with_capacity(writer_fields.len());
        for f in writer_fields {
            if let Some(name) = f.get("name").and_then(|n| n.as_str()) {
                field_map.insert(name.to_string(), f.clone());
            }
        }
        let mut new_fields = Vec::with_capacity(selected.len());
        for name in selected {
            let f = field_map
                .get(*name)
                .unwrap_or_else(|| panic!("field '{name}' not found in writer schema"))
                .clone();
            new_fields.push(f);
        }
        root["fields"] = Value::Array(new_fields);
        AvroSchema::new(root.to_string())
    }
1773
1774 fn write_ocf(schema: &Schema, batches: &[RecordBatch]) -> Vec<u8> {
1775 let mut w = AvroWriter::new(Vec::<u8>::new(), schema.clone()).expect("writer");
1776 for b in batches {
1777 w.write(b).expect("write");
1778 }
1779 w.finish().expect("finish");
1780 w.into_inner()
1781 }
1782
    /// Projection without a reader schema: columns are selected and reordered
    /// purely by writer-schema index.
    #[test]
    fn ocf_projection_no_reader_schema_reorder() -> Result<(), Box<dyn std::error::Error>> {
        let writer_schema = Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("name", DataType::Utf8, false),
            Field::new("is_active", DataType::Boolean, false),
        ]);
        let batch = RecordBatch::try_new(
            Arc::new(writer_schema.clone()),
            vec![
                Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef,
                Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
                Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef,
            ],
        )?;
        let bytes = write_ocf(&writer_schema, &[batch]);
        // Select columns 2 and 0, in that order.
        let mut reader = ReaderBuilder::new()
            .with_projection(vec![2, 0])
            .build(Cursor::new(bytes))?;
        let out = reader.next().unwrap()?;
        assert_eq!(out.num_columns(), 2);
        assert_eq!(out.schema().field(0).name(), "is_active");
        assert_eq!(out.schema().field(1).name(), "id");
        let is_active = out.column(0).as_boolean();
        assert!(is_active.value(0));
        assert!(!is_active.value(1));
        let id = out.column(1).as_primitive::<Int32Type>();
        assert_eq!(id.value(0), 1);
        assert_eq!(id.value(1), 2);
        Ok(())
    }

    /// Projection combined with a reader schema exercising an aliased field
    /// (`full_name` for `name`) and a defaulted field absent from the writer.
    #[test]
    fn ocf_projection_with_reader_schema_alias_and_default()
    -> Result<(), Box<dyn std::error::Error>> {
        let writer_schema = Schema::new(vec![
            Field::new("id", DataType::Int64, false),
            Field::new("name", DataType::Utf8, false),
        ]);
        let batch = RecordBatch::try_new(
            Arc::new(writer_schema.clone()),
            vec![
                Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
                Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
            ],
        )?;
        let bytes = write_ocf(&writer_schema, &[batch]);
        let reader_json = r#"
        {
          "type": "record",
          "name": "topLevelRecord",
          "fields": [
            { "name": "id", "type": "long" },
            { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
            { "name": "is_active", "type": "boolean", "default": true }
          ]
        }"#;
        // Project the aliased field and the defaulted field only.
        let mut reader = ReaderBuilder::new()
            .with_reader_schema(AvroSchema::new(reader_json.to_string()))
            .with_projection(vec![1, 2])
            .build(Cursor::new(bytes))?;
        let out = reader.next().unwrap()?;
        assert_eq!(out.num_columns(), 2);
        assert_eq!(out.schema().field(0).name(), "full_name");
        assert_eq!(out.schema().field(1).name(), "is_active");
        let full_name = out.column(0).as_string::<i32>();
        assert_eq!(full_name.value(0), "a");
        assert_eq!(full_name.value(1), "b");
        let is_active = out.column(1).as_boolean();
        assert!(is_active.value(0));
        assert!(is_active.value(1));
        Ok(())
    }

    /// Out-of-bounds and duplicate projection indices must fail at build time.
    #[test]
    fn projection_errors_out_of_bounds_and_duplicate() -> Result<(), Box<dyn std::error::Error>> {
        let writer_schema = Schema::new(vec![
            Field::new("a", DataType::Int32, false),
            Field::new("b", DataType::Int32, false),
        ]);
        let batch = RecordBatch::try_new(
            Arc::new(writer_schema.clone()),
            vec![
                Arc::new(Int32Array::from(vec![1])) as ArrayRef,
                Arc::new(Int32Array::from(vec![2])) as ArrayRef,
            ],
        )?;
        let bytes = write_ocf(&writer_schema, &[batch]);
        // Index 2 does not exist in a two-field schema.
        let err = ReaderBuilder::new()
            .with_projection(vec![2])
            .build(Cursor::new(bytes.clone()))
            .unwrap_err();
        assert!(matches!(err, ArrowError::AvroError(_)));
        assert!(err.to_string().contains("out of bounds"));
        // The same index may not appear twice.
        let err = ReaderBuilder::new()
            .with_projection(vec![0, 0])
            .build(Cursor::new(bytes))
            .unwrap_err();
        assert!(matches!(err, ArrowError::AvroError(_)));
        assert!(err.to_string().contains("Duplicate projection index"));
        Ok(())
    }

    /// Projection indices apply to the reader schema when one is supplied:
    /// indices [1, 2] select `id` and `tinyint_col` out of the reordered
    /// reader schema [double_col, id, tinyint_col].
    #[test]
    #[cfg(feature = "snappy")]
    fn test_alltypes_plain_with_projection_and_reader_schema() {
        use std::fs::File;
        use std::io::BufReader;
        let path = arrow_test_data("avro/alltypes_plain.avro");
        let reader_schema = make_reader_schema_with_selected_fields_in_order(
            &path,
            &["double_col", "id", "tinyint_col"],
        );
        let file = File::open(&path).expect("open avro/alltypes_plain.avro");
        let reader = ReaderBuilder::new()
            .with_batch_size(1024)
            .with_reader_schema(reader_schema)
            .with_projection(vec![1, 2])
            .build(BufReader::new(file))
            .expect("build reader with projection and reader schema");
        let schema = reader.schema();
        assert_eq!(schema.fields().len(), 2);
        assert_eq!(schema.field(0).name(), "id");
        assert_eq!(schema.field(1).name(), "tinyint_col");
        let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
        assert_eq!(batches.len(), 1);
        let batch = &batches[0];
        assert_eq!(batch.num_rows(), 8);
        assert_eq!(batch.num_columns(), 2);
        let expected = RecordBatch::try_from_iter_with_nullable([
            (
                "id",
                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as ArrayRef,
                true,
            ),
            (
                "tinyint_col",
                Arc::new(Int32Array::from(vec![0, 1, 0, 1, 0, 1, 0, 1])) as ArrayRef,
                true,
            ),
        ])
        .unwrap();
        assert_eq!(
            batch, &expected,
            "Projected batch mismatch for alltypes_plain.avro with reader schema and projection [1, 2]"
        );
    }
1942
    /// Projection without a reader schema: indices [2, 0, 5] select and
    /// reorder writer columns to [tinyint_col, id, bigint_col].
    #[test]
    #[cfg(feature = "snappy")]
    fn test_alltypes_plain_with_projection() {
        use std::fs::File;
        use std::io::BufReader;
        let path = arrow_test_data("avro/alltypes_plain.avro");
        let file = File::open(&path).expect("open avro/alltypes_plain.avro");
        let reader = ReaderBuilder::new()
            .with_batch_size(1024)
            .with_projection(vec![2, 0, 5])
            .build(BufReader::new(file))
            .expect("build reader with projection");
        // Output columns follow projection order, not writer-schema order.
        let schema = reader.schema();
        assert_eq!(schema.fields().len(), 3);
        assert_eq!(schema.field(0).name(), "tinyint_col");
        assert_eq!(schema.field(1).name(), "id");
        assert_eq!(schema.field(2).name(), "bigint_col");
        let batches: Vec<RecordBatch> = reader.collect::<Result<Vec<_>, _>>().unwrap();
        assert_eq!(batches.len(), 1);
        let batch = &batches[0];
        assert_eq!(batch.num_rows(), 8);
        assert_eq!(batch.num_columns(), 3);
        let expected = RecordBatch::try_from_iter_with_nullable([
            (
                "tinyint_col",
                Arc::new(Int32Array::from(vec![0, 1, 0, 1, 0, 1, 0, 1])) as ArrayRef,
                true,
            ),
            (
                "id",
                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as ArrayRef,
                true,
            ),
            (
                "bigint_col",
                Arc::new(Int64Array::from(vec![0, 10, 0, 10, 0, 10, 0, 10])) as ArrayRef,
                true,
            ),
        ])
        .unwrap();
        assert_eq!(
            batch, &expected,
            "Projected batch mismatch for alltypes_plain.avro with projection [2, 0, 5]"
        );
    }
1988
1989 #[test]
1990 fn writer_string_reader_nullable_with_alias() -> Result<(), Box<dyn std::error::Error>> {
1991 let writer_schema = Schema::new(vec![
1992 Field::new("id", DataType::Int64, false),
1993 Field::new("name", DataType::Utf8, false),
1994 ]);
1995 let batch = RecordBatch::try_new(
1996 Arc::new(writer_schema.clone()),
1997 vec![
1998 Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
1999 Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef,
2000 ],
2001 )?;
2002 let bytes = write_ocf(&writer_schema, &[batch]);
2003 let reader_json = r#"
2004 {
2005 "type": "record",
2006 "name": "topLevelRecord",
2007 "fields": [
2008 { "name": "id", "type": "long" },
2009 { "name": "full_name", "type": ["null","string"], "aliases": ["name"], "default": null },
2010 { "name": "is_active", "type": "boolean", "default": true }
2011 ]
2012 }"#;
2013 let mut reader = ReaderBuilder::new()
2014 .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2015 .build(Cursor::new(bytes))?;
2016 let out = reader.next().unwrap()?;
2017 let full_name = out.column(1).as_string::<i32>();
2018 assert_eq!(full_name.value(0), "a");
2019 assert_eq!(full_name.value(1), "b");
2020 Ok(())
2021 }
2022
2023 #[test]
2024 fn writer_string_reader_string_null_order_second() -> Result<(), Box<dyn std::error::Error>> {
2025 let writer_schema = Schema::new(vec![Field::new("name", DataType::Utf8, false)]);
2027 let batch = RecordBatch::try_new(
2028 Arc::new(writer_schema.clone()),
2029 vec![Arc::new(StringArray::from(vec!["x", "y"])) as ArrayRef],
2030 )?;
2031 let bytes = write_ocf(&writer_schema, &[batch]);
2032
2033 let reader_json = r#"
2035 {
2036 "type":"record", "name":"topLevelRecord",
2037 "fields":[ { "name":"name", "type":["string","null"], "default":"x" } ]
2038 }"#;
2039
2040 let mut reader = ReaderBuilder::new()
2041 .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2042 .build(Cursor::new(bytes))?;
2043
2044 let out = reader.next().unwrap()?;
2045 assert_eq!(out.num_rows(), 2);
2046
2047 let name = out.column(0).as_string::<i32>();
2049 assert_eq!(name.value(0), "x");
2050 assert_eq!(name.value(1), "y");
2051
2052 Ok(())
2053 }
2054
2055 #[test]
2056 fn promotion_writer_int_reader_nullable_long() -> Result<(), Box<dyn std::error::Error>> {
2057 let writer_schema = Schema::new(vec![Field::new("v", DataType::Int32, false)]);
2059 let batch = RecordBatch::try_new(
2060 Arc::new(writer_schema.clone()),
2061 vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
2062 )?;
2063 let bytes = write_ocf(&writer_schema, &[batch]);
2064
2065 let reader_json = r#"
2067 {
2068 "type":"record", "name":"topLevelRecord",
2069 "fields":[ { "name":"v", "type":["null","long"], "default": null } ]
2070 }"#;
2071
2072 let mut reader = ReaderBuilder::new()
2073 .with_reader_schema(AvroSchema::new(reader_json.to_string()))
2074 .build(Cursor::new(bytes))?;
2075
2076 let out = reader.next().unwrap()?;
2077 assert_eq!(out.num_rows(), 3);
2078
2079 let v = out
2081 .column(0)
2082 .as_primitive::<arrow_array::types::Int64Type>();
2083 assert_eq!(v.values(), &[1, 2, 3]);
2084 assert!(
2085 out.column(0).nulls().is_none(),
2086 "expected no validity bitmap for all-valid column"
2087 );
2088
2089 Ok(())
2090 }
2091
    /// Applies many legal promotions at once (int→long, int→float,
    /// int/long/float→double, bytes→string) across every alltypes test file
    /// and checks the fully promoted batch value-for-value.
    #[test]
    fn test_alltypes_schema_promotion_mixed() {
        for file in files() {
            let file = arrow_test_data(file);
            let mut promotions: HashMap<&str, &str> = HashMap::new();
            promotions.insert("id", "long");
            promotions.insert("tinyint_col", "float");
            promotions.insert("smallint_col", "double");
            promotions.insert("int_col", "double");
            promotions.insert("bigint_col", "double");
            promotions.insert("float_col", "double");
            promotions.insert("date_string_col", "string");
            promotions.insert("string_col", "string");
            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
            let expected = RecordBatch::try_from_iter_with_nullable([
                (
                    "id",
                    Arc::new(Int64Array::from(vec![4i64, 5, 6, 7, 2, 3, 0, 1])) as _,
                    true,
                ),
                (
                    // bool_col is not promoted and keeps its writer type.
                    "bool_col",
                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
                    true,
                ),
                (
                    "tinyint_col",
                    Arc::new(Float32Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f32),
                    )) as _,
                    true,
                ),
                (
                    "smallint_col",
                    Arc::new(Float64Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f64),
                    )) as _,
                    true,
                ),
                (
                    "int_col",
                    Arc::new(Float64Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f64),
                    )) as _,
                    true,
                ),
                (
                    "bigint_col",
                    Arc::new(Float64Array::from_iter_values(
                        (0..8).map(|x| ((x % 2) * 10) as f64),
                    )) as _,
                    true,
                ),
                (
                    "float_col",
                    // float→double: expectation rounds 1.1 through f32 first,
                    // matching the widened on-disk f32 values.
                    Arc::new(Float64Array::from_iter_values(
                        (0..8).map(|x| ((x % 2) as f32 * 1.1f32) as f64),
                    )) as _,
                    true,
                ),
                (
                    "double_col",
                    Arc::new(Float64Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f64 * 10.1),
                    )) as _,
                    true,
                ),
                (
                    "date_string_col",
                    Arc::new(StringArray::from(vec![
                        "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
                        "01/01/09", "01/01/09",
                    ])) as _,
                    true,
                ),
                (
                    "string_col",
                    Arc::new(StringArray::from(
                        (0..8)
                            .map(|x| if x % 2 == 0 { "0" } else { "1" })
                            .collect::<Vec<_>>(),
                    )) as _,
                    true,
                ),
                (
                    "timestamp_col",
                    Arc::new(
                        TimestampMicrosecondArray::from_iter_values([
                            1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
                        .with_timezone("+00:00"),
                    ) as _,
                    true,
                ),
            ])
            .unwrap();
            assert_eq!(batch, expected, "mismatch for file {file}");
        }
    }
2199
    /// Only `bigint_col` is promoted (long→float); every other column keeps
    /// its writer type, so the byte columns remain Binary and the numeric
    /// columns keep their original widths.
    #[test]
    fn test_alltypes_schema_promotion_long_to_float_only() {
        for file in files() {
            let file = arrow_test_data(file);
            let mut promotions: HashMap<&str, &str> = HashMap::new();
            promotions.insert("bigint_col", "float");
            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
            let expected = RecordBatch::try_from_iter_with_nullable([
                (
                    "id",
                    Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
                    true,
                ),
                (
                    "bool_col",
                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
                    true,
                ),
                (
                    "tinyint_col",
                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                    true,
                ),
                (
                    "smallint_col",
                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                    true,
                ),
                (
                    "int_col",
                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                    true,
                ),
                (
                    // The single promoted column: long values read back as f32.
                    "bigint_col",
                    Arc::new(Float32Array::from_iter_values(
                        (0..8).map(|x| ((x % 2) * 10) as f32),
                    )) as _,
                    true,
                ),
                (
                    "float_col",
                    Arc::new(Float32Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f32 * 1.1),
                    )) as _,
                    true,
                ),
                (
                    "double_col",
                    Arc::new(Float64Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f64 * 10.1),
                    )) as _,
                    true,
                ),
                (
                    // Un-promoted bytes column: ASCII for "03/01/09" etc.
                    "date_string_col",
                    Arc::new(BinaryArray::from_iter_values([
                        [48, 51, 47, 48, 49, 47, 48, 57],
                        [48, 51, 47, 48, 49, 47, 48, 57],
                        [48, 52, 47, 48, 49, 47, 48, 57],
                        [48, 52, 47, 48, 49, 47, 48, 57],
                        [48, 50, 47, 48, 49, 47, 48, 57],
                        [48, 50, 47, 48, 49, 47, 48, 57],
                        [48, 49, 47, 48, 49, 47, 48, 57],
                        [48, 49, 47, 48, 49, 47, 48, 57],
                    ])) as _,
                    true,
                ),
                (
                    "string_col",
                    Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
                    true,
                ),
                (
                    "timestamp_col",
                    Arc::new(
                        TimestampMicrosecondArray::from_iter_values([
                            1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
                        .with_timezone("+00:00"),
                    ) as _,
                    true,
                ),
            ])
            .unwrap();
            assert_eq!(batch, expected, "mismatch for file {file}");
        }
    }
2296
    /// Only the two byte columns are promoted (bytes→string); the numeric
    /// columns keep their writer types.
    #[test]
    fn test_alltypes_schema_promotion_bytes_to_string_only() {
        for file in files() {
            let file = arrow_test_data(file);
            let mut promotions: HashMap<&str, &str> = HashMap::new();
            promotions.insert("date_string_col", "string");
            promotions.insert("string_col", "string");
            let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
            let batch = read_alltypes_with_reader_schema(&file, reader_schema);
            let expected = RecordBatch::try_from_iter_with_nullable([
                (
                    "id",
                    Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
                    true,
                ),
                (
                    "bool_col",
                    Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
                    true,
                ),
                (
                    "tinyint_col",
                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                    true,
                ),
                (
                    "smallint_col",
                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                    true,
                ),
                (
                    "int_col",
                    Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                    true,
                ),
                (
                    "bigint_col",
                    Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
                    true,
                ),
                (
                    "float_col",
                    Arc::new(Float32Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f32 * 1.1),
                    )) as _,
                    true,
                ),
                (
                    "double_col",
                    Arc::new(Float64Array::from_iter_values(
                        (0..8).map(|x| (x % 2) as f64 * 10.1),
                    )) as _,
                    true,
                ),
                (
                    // Promoted: the raw bytes now decode as UTF-8 strings.
                    "date_string_col",
                    Arc::new(StringArray::from(vec![
                        "03/01/09", "03/01/09", "04/01/09", "04/01/09", "02/01/09", "02/01/09",
                        "01/01/09", "01/01/09",
                    ])) as _,
                    true,
                ),
                (
                    "string_col",
                    Arc::new(StringArray::from(
                        (0..8)
                            .map(|x| if x % 2 == 0 { "0" } else { "1" })
                            .collect::<Vec<_>>(),
                    )) as _,
                    true,
                ),
                (
                    "timestamp_col",
                    Arc::new(
                        TimestampMicrosecondArray::from_iter_values([
                            1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
                        .with_timezone("+00:00"),
                    ) as _,
                    true,
                ),
            ])
            .unwrap();
            assert_eq!(batch, expected, "mismatch for file {file}");
        }
    }
2390
2391 #[test]
2392 #[cfg(feature = "snappy")]
2394 fn test_alltypes_illegal_promotion_bool_to_double_errors() {
2395 let file = arrow_test_data("avro/alltypes_plain.avro");
2396 let mut promotions: HashMap<&str, &str> = HashMap::new();
2397 promotions.insert("bool_col", "double"); let reader_schema = make_reader_schema_with_promotions(&file, &promotions);
2399 let file_handle = File::open(&file).unwrap();
2400 let result = ReaderBuilder::new()
2401 .with_reader_schema(reader_schema)
2402 .build(BufReader::new(file_handle));
2403 let err = result.expect_err("expected illegal promotion to error");
2404 let msg = err.to_string();
2405 assert!(
2406 msg.contains("Illegal promotion") || msg.contains("illegal promotion"),
2407 "unexpected error: {msg}"
2408 );
2409 }
2410
    /// A reader schema that reorders each enum's symbols: the decoded
    /// dictionary keys must be remapped so every row still refers to the same
    /// symbol text, and the enum metadata (symbols JSON, avro.name,
    /// avro.namespace) must reflect the reader schema.
    #[test]
    fn test_simple_enum_with_reader_schema_mapping() {
        let file = arrow_test_data("avro/simple_enum.avro");
        let mut remap: HashMap<&str, Vec<&str>> = HashMap::new();
        remap.insert("f1", vec!["d", "c", "b", "a"]);
        remap.insert("f2", vec!["h", "g", "f", "e"]);
        remap.insert("f3", vec!["k", "i", "j"]);
        let reader_schema = make_reader_schema_with_enum_remap(&file, &remap);
        let actual = read_alltypes_with_reader_schema(&file, reader_schema);
        let dict_type = DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
        // Keys index into the reader's (reordered) symbol list.
        let f1_keys = Int32Array::from(vec![3, 2, 1, 0]);
        let f1_vals = StringArray::from(vec!["d", "c", "b", "a"]);
        let f1 = DictionaryArray::<Int32Type>::try_new(f1_keys, Arc::new(f1_vals)).unwrap();
        let mut md_f1 = HashMap::new();
        md_f1.insert(
            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
            r#"["d","c","b","a"]"#.to_string(),
        );
        md_f1.insert("avro.name".to_string(), "enum1".to_string());
        md_f1.insert("avro.namespace".to_string(), "ns1".to_string());
        let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
        let f2_keys = Int32Array::from(vec![1, 0, 3, 2]);
        let f2_vals = StringArray::from(vec!["h", "g", "f", "e"]);
        let f2 = DictionaryArray::<Int32Type>::try_new(f2_keys, Arc::new(f2_vals)).unwrap();
        let mut md_f2 = HashMap::new();
        md_f2.insert(
            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
            r#"["h","g","f","e"]"#.to_string(),
        );
        md_f2.insert("avro.name".to_string(), "enum2".to_string());
        md_f2.insert("avro.namespace".to_string(), "ns2".to_string());
        let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
        // f3 is nullable: a None key encodes a null row.
        let f3_keys = Int32Array::from(vec![Some(2), Some(0), None, Some(1)]);
        let f3_vals = StringArray::from(vec!["k", "i", "j"]);
        let f3 = DictionaryArray::<Int32Type>::try_new(f3_keys, Arc::new(f3_vals)).unwrap();
        let mut md_f3 = HashMap::new();
        md_f3.insert(
            AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
            r#"["k","i","j"]"#.to_string(),
        );
        md_f3.insert("avro.name".to_string(), "enum3".to_string());
        md_f3.insert("avro.namespace".to_string(), "ns1".to_string());
        let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
        let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
        let expected = RecordBatch::try_new(
            expected_schema,
            vec![Arc::new(f1) as ArrayRef, Arc::new(f2), Arc::new(f3)],
        )
        .unwrap();
        assert_eq!(actual, expected);
    }
2468
2469 #[test]
2470 fn test_schema_store_register_lookup() {
2471 let schema_int = make_record_schema(PrimitiveType::Int);
2472 let schema_long = make_record_schema(PrimitiveType::Long);
2473 let mut store = SchemaStore::new();
2474 let fp_int = store.register(schema_int.clone()).unwrap();
2475 let fp_long = store.register(schema_long.clone()).unwrap();
2476 assert_eq!(store.lookup(&fp_int).cloned(), Some(schema_int));
2477 assert_eq!(store.lookup(&fp_long).cloned(), Some(schema_long));
2478 assert_eq!(store.fingerprint_algorithm(), FingerprintAlgorithm::Rabin);
2479 }
2480
2481 #[test]
2482 fn test_unknown_fingerprint_is_error() {
2483 let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2484 let unknown_fp = Fingerprint::Rabin(0xDEAD_BEEF_DEAD_BEEF);
2485 let prefix = make_prefix(unknown_fp);
2486 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2487 let err = decoder.decode(&prefix).expect_err("decode should error");
2488 let msg = err.to_string();
2489 assert!(
2490 msg.contains("Unknown fingerprint"),
2491 "unexpected message: {msg}"
2492 );
2493 }
2494
2495 #[test]
2496 fn test_handle_prefix_incomplete_magic() {
2497 let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2498 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2499 let buf = &SINGLE_OBJECT_MAGIC[..1];
2500 let res = decoder.handle_prefix(buf).unwrap();
2501 assert_eq!(res, Some(0));
2502 assert!(decoder.pending_schema.is_none());
2503 }
2504
2505 #[test]
2506 fn test_handle_prefix_magic_mismatch() {
2507 let (store, fp_int, _fp_long, _schema_int, schema_long) = make_two_schema_store();
2508 let mut decoder = make_decoder(&store, fp_int, &schema_long);
2509 let buf = [0xFFu8, 0x00u8, 0x01u8];
2510 let res = decoder.handle_prefix(&buf).unwrap();
2511 assert!(res.is_none());
2512 }
2513
    /// Full magic but only half of the 8-byte Rabin fingerprint: the decoder
    /// must consume nothing (Some(0)) and wait for more bytes.
    #[test]
    fn test_handle_prefix_incomplete_fingerprint() {
        let (store, fp_int, fp_long, _schema_int, schema_long) = make_two_schema_store();
        let mut decoder = make_decoder(&store, fp_int, &schema_long);
        // This test only makes sense for Rabin stores; any other variant is a
        // test-setup bug.
        let long_bytes = match fp_long {
            Fingerprint::Rabin(v) => v.to_le_bytes(),
            Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
            Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
            #[cfg(feature = "md5")]
            Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
            #[cfg(feature = "sha256")]
            Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
        };
        let mut buf = Vec::from(SINGLE_OBJECT_MAGIC);
        // Only 4 of the 8 fingerprint bytes follow the magic.
        buf.extend_from_slice(&long_bytes[..4]);
        let res = decoder.handle_prefix(&buf).unwrap();
        assert_eq!(res, Some(0));
        assert!(decoder.pending_schema.is_none());
    }
2533
    /// A complete magic + known fingerprint consumes the whole prefix and
    /// stages the matching cached decoder as `pending_schema`.
    #[test]
    fn test_handle_prefix_valid_prefix_switches_schema() {
        let (store, fp_int, fp_long, _schema_int, schema_long) = make_two_schema_store();
        let mut decoder = make_decoder(&store, fp_int, &schema_long);
        // Pre-populate the decoder cache so the switch target already exists.
        let writer_schema_long = schema_long.schema().unwrap();
        let root_long = AvroFieldBuilder::new(&writer_schema_long).build().unwrap();
        let long_decoder = RecordDecoder::try_new_with_options(root_long.data_type()).unwrap();
        let _ = decoder.cache.insert(fp_long, long_decoder);
        // Build a full single-object prefix: magic + little-endian fingerprint.
        let mut buf = Vec::from(SINGLE_OBJECT_MAGIC);
        match fp_long {
            Fingerprint::Rabin(v) => buf.extend_from_slice(&v.to_le_bytes()),
            Fingerprint::Id(id) => panic!("expected Rabin fingerprint, got ({id})"),
            Fingerprint::Id64(id) => panic!("expected Rabin fingerprint, got ({id})"),
            #[cfg(feature = "md5")]
            Fingerprint::MD5(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
            #[cfg(feature = "sha256")]
            Fingerprint::SHA256(v) => panic!("expected Rabin fingerprint, got ({v:?})"),
        }
        let consumed = decoder.handle_prefix(&buf).unwrap().unwrap();
        assert_eq!(consumed, buf.len());
        assert!(decoder.pending_schema.is_some());
        assert_eq!(decoder.pending_schema.as_ref().unwrap().0, fp_long);
    }
2557
    /// Projection with multiple writer schema versions and no reader schema:
    /// the projected column ("b", index 1) must survive a mid-stream schema
    /// switch from writer v1 to the extended writer v2.
    #[test]
    fn test_decoder_projection_multiple_writer_schemas_no_reader_schema()
    -> Result<(), Box<dyn std::error::Error>> {
        let writer_v1 = AvroSchema::new(
            r#"{"type":"record","name":"E","fields":[{"name":"a","type":"int"},{"name":"b","type":"string"}]}"#
                .to_string(),
        );
        let writer_v2 = AvroSchema::new(
            r#"{"type":"record","name":"E","fields":[{"name":"a","type":"long"},{"name":"b","type":"string"},{"name":"c","type":"int"}]}"#
                .to_string(),
        );
        let mut store = SchemaStore::new();
        let fp1 = store.register(writer_v1)?;
        let fp2 = store.register(writer_v2)?;
        let mut decoder = ReaderBuilder::new()
            .with_writer_schema_store(store)
            .with_active_fingerprint(fp1)
            .with_batch_size(8)
            .with_projection(vec![1])
            .build_decoder()?;
        // v1 body: a = 1 (zigzag varint), then b = "x" (zigzag length 1, bytes).
        let mut msg1 = make_prefix(fp1);
        msg1.extend_from_slice(&encode_zigzag(1)); msg1.push((1u8) << 1);
        msg1.extend_from_slice(b"x");
        // v2 body: a = 2, b = "y", plus the extra trailing field c = 7.
        let mut msg2 = make_prefix(fp2);
        msg2.extend_from_slice(&encode_zigzag(2)); msg2.push((1u8) << 1);
        msg2.extend_from_slice(b"y");
        msg2.extend_from_slice(&encode_zigzag(7)); decoder.decode(&msg1)?;
        let batch1 = decoder.flush()?.expect("batch1");
        assert_eq!(batch1.num_columns(), 1);
        assert_eq!(batch1.schema().field(0).name(), "b");
        let b1 = batch1.column(0).as_string::<i32>();
        assert_eq!(b1.value(0), "x");
        decoder.decode(&msg2)?;
        let batch2 = decoder.flush()?.expect("batch2");
        assert_eq!(batch2.num_columns(), 1);
        assert_eq!(batch2.schema().field(0).name(), "b");
        let b2 = batch2.column(0).as_string::<i32>();
        assert_eq!(b2.value(0), "y");
        Ok(())
    }
2604
2605 #[test]
2606 fn test_two_messages_same_schema() {
2607 let writer_schema = make_value_schema(PrimitiveType::Int);
2608 let reader_schema = writer_schema.clone();
2609 let mut store = SchemaStore::new();
2610 let fp = store.register(writer_schema).unwrap();
2611 let msg1 = make_message(fp, 42);
2612 let msg2 = make_message(fp, 11);
2613 let input = [msg1.clone(), msg2.clone()].concat();
2614 let mut decoder = ReaderBuilder::new()
2615 .with_batch_size(8)
2616 .with_reader_schema(reader_schema.clone())
2617 .with_writer_schema_store(store)
2618 .with_active_fingerprint(fp)
2619 .build_decoder()
2620 .unwrap();
2621 let _ = decoder.decode(&input).unwrap();
2622 let batch = decoder.flush().unwrap().expect("batch");
2623 assert_eq!(batch.num_rows(), 2);
2624 let col = batch
2625 .column(0)
2626 .as_any()
2627 .downcast_ref::<Int32Array>()
2628 .unwrap();
2629 assert_eq!(col.value(0), 42);
2630 assert_eq!(col.value(1), 11);
2631 }
2632
2633 #[test]
2634 fn test_two_messages_schema_switch() {
2635 let w_int = make_value_schema(PrimitiveType::Int);
2636 let w_long = make_value_schema(PrimitiveType::Long);
2637 let mut store = SchemaStore::new();
2638 let fp_int = store.register(w_int).unwrap();
2639 let fp_long = store.register(w_long).unwrap();
2640 let msg_int = make_message(fp_int, 1);
2641 let msg_long = make_message(fp_long, 123456789_i64);
2642 let mut decoder = ReaderBuilder::new()
2643 .with_batch_size(8)
2644 .with_writer_schema_store(store)
2645 .with_active_fingerprint(fp_int)
2646 .build_decoder()
2647 .unwrap();
2648 let _ = decoder.decode(&msg_int).unwrap();
2649 let batch1 = decoder.flush().unwrap().expect("batch1");
2650 assert_eq!(batch1.num_rows(), 1);
2651 assert_eq!(
2652 batch1
2653 .column(0)
2654 .as_any()
2655 .downcast_ref::<Int32Array>()
2656 .unwrap()
2657 .value(0),
2658 1
2659 );
2660 let _ = decoder.decode(&msg_long).unwrap();
2661 let batch2 = decoder.flush().unwrap().expect("batch2");
2662 assert_eq!(batch2.num_rows(), 1);
2663 assert_eq!(
2664 batch2
2665 .column(0)
2666 .as_any()
2667 .downcast_ref::<Int64Array>()
2668 .unwrap()
2669 .value(0),
2670 123456789_i64
2671 );
2672 }
2673
    /// Confluent-framed (4-byte id) variant of the same-schema test: two
    /// messages with the same registered id decode into one two-row batch.
    #[test]
    fn test_two_messages_same_schema_id() {
        let writer_schema = make_value_schema(PrimitiveType::Int);
        let reader_schema = writer_schema.clone();
        let id = 100u32;
        // Id stores are populated with `set` (explicit id) rather than `register`.
        let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
        let _ = store
            .set(Fingerprint::Id(id), writer_schema.clone())
            .expect("set id schema");
        let msg1 = make_message_id(id, 21);
        let msg2 = make_message_id(id, 22);
        let input = [msg1.clone(), msg2.clone()].concat();
        let mut decoder = ReaderBuilder::new()
            .with_batch_size(8)
            .with_reader_schema(reader_schema)
            .with_writer_schema_store(store)
            .with_active_fingerprint(Fingerprint::Id(id))
            .build_decoder()
            .unwrap();
        let _ = decoder.decode(&input).unwrap();
        let batch = decoder.flush().unwrap().expect("batch");
        assert_eq!(batch.num_rows(), 2);
        let col = batch
            .column(0)
            .as_any()
            .downcast_ref::<Int32Array>()
            .unwrap();
        assert_eq!(col.value(0), 21);
        assert_eq!(col.value(1), 22);
    }
2705
2706 #[test]
2707 fn test_unknown_id_fingerprint_is_error() {
2708 let writer_schema = make_value_schema(PrimitiveType::Int);
2709 let id_known = 7u32;
2710 let id_unknown = 9u32;
2711 let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2712 let _ = store
2713 .set(Fingerprint::Id(id_known), writer_schema.clone())
2714 .expect("set id schema");
2715 let mut decoder = ReaderBuilder::new()
2716 .with_batch_size(8)
2717 .with_reader_schema(writer_schema)
2718 .with_writer_schema_store(store)
2719 .with_active_fingerprint(Fingerprint::Id(id_known))
2720 .build_decoder()
2721 .unwrap();
2722 let prefix = make_id_prefix(id_unknown, 0);
2723 let err = decoder.decode(&prefix).expect_err("decode should error");
2724 let msg = err.to_string();
2725 assert!(
2726 msg.contains("Unknown fingerprint"),
2727 "unexpected message: {msg}"
2728 );
2729 }
2730
2731 #[test]
2732 fn test_handle_prefix_id_incomplete_magic() {
2733 let writer_schema = make_value_schema(PrimitiveType::Int);
2734 let id = 5u32;
2735 let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id);
2736 let _ = store
2737 .set(Fingerprint::Id(id), writer_schema.clone())
2738 .expect("set id schema");
2739 let mut decoder = ReaderBuilder::new()
2740 .with_batch_size(8)
2741 .with_reader_schema(writer_schema)
2742 .with_writer_schema_store(store)
2743 .with_active_fingerprint(Fingerprint::Id(id))
2744 .build_decoder()
2745 .unwrap();
2746 let buf = &CONFLUENT_MAGIC[..0]; let res = decoder.handle_prefix(buf).unwrap();
2748 assert_eq!(res, Some(0));
2749 assert!(decoder.pending_schema.is_none());
2750 }
2751
    /// Same as `test_two_messages_same_schema_id` but with 8-byte (Id64)
    /// schema ids in the message frames.
    #[test]
    fn test_two_messages_same_schema_id64() {
        let writer_schema = make_value_schema(PrimitiveType::Int);
        let reader_schema = writer_schema.clone();
        let id = 100u64;
        let mut store = SchemaStore::new_with_type(FingerprintAlgorithm::Id64);
        let _ = store
            .set(Fingerprint::Id64(id), writer_schema.clone())
            .expect("set id schema");
        let msg1 = make_message_id64(id, 21);
        let msg2 = make_message_id64(id, 22);
        let input = [msg1.clone(), msg2.clone()].concat();
        let mut decoder = ReaderBuilder::new()
            .with_batch_size(8)
            .with_reader_schema(reader_schema)
            .with_writer_schema_store(store)
            .with_active_fingerprint(Fingerprint::Id64(id))
            .build_decoder()
            .unwrap();
        let _ = decoder.decode(&input).unwrap();
        let batch = decoder.flush().unwrap().expect("batch");
        assert_eq!(batch.num_rows(), 2);
        let col = batch
            .column(0)
            .as_any()
            .downcast_ref::<Int32Array>()
            .unwrap();
        assert_eq!(col.value(0), 21);
        assert_eq!(col.value(1), 22);
    }
2783
    /// Table-driven check of `decode_stream`: a matching schema decodes to
    /// the expected batch, while a schema that mis-reads the body (long
    /// instead of string) must error with "did not consume all bytes".
    #[test]
    fn test_decode_stream_with_schema() {
        struct TestCase<'a> {
            name: &'a str,
            schema: &'a str,
            // None => decode must succeed; Some(s) => error must contain s.
            expected_error: Option<&'a str>,
        }
        let tests = vec![
            TestCase {
                name: "success",
                schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"string"}]}"#,
                expected_error: None,
            },
            TestCase {
                name: "valid schema invalid data",
                schema: r#"{"type":"record","name":"test","fields":[{"name":"f2","type":"long"}]}"#,
                expected_error: Some("did not consume all bytes"),
            },
        ];
        for test in tests {
            let avro_schema = AvroSchema::new(test.schema.to_string());
            let mut store = SchemaStore::new();
            let fp = store.register(avro_schema.clone()).unwrap();
            let prefix = make_prefix(fp);
            let record_val = "some_string";
            // Body = single-object prefix + Avro string (zigzag length, bytes).
            let mut body = prefix;
            body.push((record_val.len() as u8) << 1);
            body.extend_from_slice(record_val.as_bytes());
            let decoder_res = ReaderBuilder::new()
                .with_batch_size(1)
                .with_writer_schema_store(store)
                .with_active_fingerprint(fp)
                .build_decoder();
            // Some cases may already fail at build time; route that through
            // the same expected-error check.
            let decoder = match decoder_res {
                Ok(d) => d,
                Err(e) => {
                    if let Some(expected) = test.expected_error {
                        assert!(
                            e.to_string().contains(expected),
                            "Test '{}' failed at build – expected '{expected}', got '{e}'",
                            test.name
                        );
                        continue;
                    } else {
                        panic!("Test '{}' failed during build: {e}", test.name);
                    }
                }
            };
            let stream = Box::pin(stream::once(async { Bytes::from(body) }));
            let decoded_stream = decode_stream(decoder, stream);
            let batches_result: Result<Vec<RecordBatch>, ArrowError> =
                block_on(decoded_stream.try_collect());
            match (batches_result, test.expected_error) {
                (Ok(batches), None) => {
                    let batch =
                        arrow::compute::concat_batches(&batches[0].schema(), &batches).unwrap();
                    let expected_field = Field::new("f2", DataType::Utf8, false);
                    let expected_schema = Arc::new(Schema::new(vec![expected_field]));
                    let expected_array = Arc::new(StringArray::from(vec![record_val]));
                    let expected_batch =
                        RecordBatch::try_new(expected_schema, vec![expected_array]).unwrap();
                    assert_eq!(batch, expected_batch, "Test '{}'", test.name);
                }
                (Err(e), Some(expected)) => {
                    assert!(
                        e.to_string().contains(expected),
                        "Test '{}' – expected error containing '{expected}', got '{e}'",
                        test.name
                    );
                }
                (Ok(_), Some(expected)) => {
                    panic!(
                        "Test '{}' expected failure ('{expected}') but succeeded",
                        test.name
                    );
                }
                (Err(e), None) => {
                    panic!("Test '{}' unexpectedly failed with '{e}'", test.name);
                }
            }
        }
    }
2866
2867 #[test]
2868 fn test_utf8view_support() {
2869 struct TestHelper;
2870 impl TestHelper {
2871 fn with_utf8view(field: &Field) -> Field {
2872 match field.data_type() {
2873 DataType::Utf8 => {
2874 Field::new(field.name(), DataType::Utf8View, field.is_nullable())
2875 .with_metadata(field.metadata().clone())
2876 }
2877 _ => field.clone(),
2878 }
2879 }
2880 }
2881
2882 let field = TestHelper::with_utf8view(&Field::new("str_field", DataType::Utf8, false));
2883
2884 assert_eq!(field.data_type(), &DataType::Utf8View);
2885
2886 let array = StringViewArray::from(vec!["test1", "test2"]);
2887 let batch =
2888 RecordBatch::try_from_iter(vec![("str_field", Arc::new(array) as ArrayRef)]).unwrap();
2889
2890 assert!(batch.column(0).as_any().is::<StringViewArray>());
2891 }
2892
2893 fn make_reader_schema_with_default_fields(
2894 path: &str,
2895 default_fields: Vec<Value>,
2896 ) -> AvroSchema {
2897 let mut root = load_writer_schema_json(path);
2898 assert_eq!(root["type"], "record", "writer schema must be a record");
2899 root.as_object_mut()
2900 .expect("schema is a JSON object")
2901 .insert("fields".to_string(), Value::Array(default_fields));
2902 AvroSchema::new(root.to_string())
2903 }
2904
    #[test]
    fn test_schema_resolution_defaults_all_supported_types() {
        // The reader schema below contains only fields absent from the writer
        // schema, so every column must be materialized purely from its
        // declared default value, one column per supported Avro type.
        let path = "test/data/skippable_types.avro";
        // Duration defaults are 12-byte fixed values; all-zero bytes here.
        let duration_default = "\u{0000}".repeat(12);
        let reader_schema = make_reader_schema_with_default_fields(
            path,
            vec![
                serde_json::json!({"name":"d_bool","type":"boolean","default":true}),
                serde_json::json!({"name":"d_int","type":"int","default":42}),
                serde_json::json!({"name":"d_long","type":"long","default":12345}),
                serde_json::json!({"name":"d_float","type":"float","default":1.5}),
                serde_json::json!({"name":"d_double","type":"double","default":2.25}),
                serde_json::json!({"name":"d_bytes","type":"bytes","default":"XYZ"}),
                serde_json::json!({"name":"d_string","type":"string","default":"hello"}),
                serde_json::json!({"name":"d_date","type":{"type":"int","logicalType":"date"},"default":0}),
                serde_json::json!({"name":"d_time_ms","type":{"type":"int","logicalType":"time-millis"},"default":1000}),
                serde_json::json!({"name":"d_time_us","type":{"type":"long","logicalType":"time-micros"},"default":2000}),
                serde_json::json!({"name":"d_ts_ms","type":{"type":"long","logicalType":"local-timestamp-millis"},"default":0}),
                serde_json::json!({"name":"d_ts_us","type":{"type":"long","logicalType":"local-timestamp-micros"},"default":0}),
                serde_json::json!({"name":"d_decimal","type":{"type":"bytes","logicalType":"decimal","precision":10,"scale":2},"default":""}),
                serde_json::json!({"name":"d_fixed","type":{"type":"fixed","name":"F4","size":4},"default":"ABCD"}),
                serde_json::json!({"name":"d_enum","type":{"type":"enum","name":"E","symbols":["A","B","C"]},"default":"A"}),
                serde_json::json!({"name":"d_duration","type":{"type":"fixed","name":"Dur","size":12,"logicalType":"duration"},"default":duration_default}),
                serde_json::json!({"name":"d_uuid","type":{"type":"string","logicalType":"uuid"},"default":"00000000-0000-0000-0000-000000000000"}),
                serde_json::json!({"name":"d_array","type":{"type":"array","items":"int"},"default":[1,2,3]}),
                serde_json::json!({"name":"d_map","type":{"type":"map","values":"long"},"default":{"a":1,"b":2}}),
                serde_json::json!({"name":"d_record","type":{
                    "type":"record","name":"DefaultRec","fields":[
                        {"name":"x","type":"int"},
                        {"name":"y","type":["null","string"],"default":null}
                    ]
                },"default":{"x":7}}),
                serde_json::json!({"name":"d_nullable_null","type":["null","int"],"default":null}),
                serde_json::json!({"name":"d_nullable_value","type":["int","null"],"default":123}),
            ],
        );
        let actual = read_alltypes_with_reader_schema(path, reader_schema);
        let num_rows = actual.num_rows();
        assert!(num_rows > 0, "skippable_types.avro should contain rows");
        assert_eq!(
            actual.num_columns(),
            22,
            "expected exactly our defaulted fields"
        );
        // Build the expected arrays in reader-schema field order; every row
        // repeats the same default value.
        let mut arrays: Vec<Arc<dyn Array>> = Vec::with_capacity(22);
        arrays.push(Arc::new(BooleanArray::from_iter(std::iter::repeat_n(
            Some(true),
            num_rows,
        ))));
        arrays.push(Arc::new(Int32Array::from_iter_values(std::iter::repeat_n(
            42, num_rows,
        ))));
        arrays.push(Arc::new(Int64Array::from_iter_values(std::iter::repeat_n(
            12345, num_rows,
        ))));
        arrays.push(Arc::new(Float32Array::from_iter_values(
            std::iter::repeat_n(1.5f32, num_rows),
        )));
        arrays.push(Arc::new(Float64Array::from_iter_values(
            std::iter::repeat_n(2.25f64, num_rows),
        )));
        arrays.push(Arc::new(BinaryArray::from_iter_values(
            std::iter::repeat_n(b"XYZ".as_ref(), num_rows),
        )));
        arrays.push(Arc::new(StringArray::from_iter_values(
            std::iter::repeat_n("hello", num_rows),
        )));
        arrays.push(Arc::new(Date32Array::from_iter_values(
            std::iter::repeat_n(0, num_rows),
        )));
        arrays.push(Arc::new(Time32MillisecondArray::from_iter_values(
            std::iter::repeat_n(1_000, num_rows),
        )));
        arrays.push(Arc::new(Time64MicrosecondArray::from_iter_values(
            std::iter::repeat_n(2_000i64, num_rows),
        )));
        arrays.push(Arc::new(TimestampMillisecondArray::from_iter_values(
            std::iter::repeat_n(0i64, num_rows),
        )));
        arrays.push(Arc::new(TimestampMicrosecondArray::from_iter_values(
            std::iter::repeat_n(0i64, num_rows),
        )));
        // Decimal width depends on the `small_decimals` crate feature.
        #[cfg(feature = "small_decimals")]
        let decimal = Decimal64Array::from_iter_values(std::iter::repeat_n(0i64, num_rows))
            .with_precision_and_scale(10, 2)
            .unwrap();
        #[cfg(not(feature = "small_decimals"))]
        let decimal = Decimal128Array::from_iter_values(std::iter::repeat_n(0i128, num_rows))
            .with_precision_and_scale(10, 2)
            .unwrap();
        arrays.push(Arc::new(decimal));
        let fixed_iter = std::iter::repeat_n(Some(*b"ABCD"), num_rows);
        arrays.push(Arc::new(
            FixedSizeBinaryArray::try_from_sparse_iter_with_size(fixed_iter, 4).unwrap(),
        ));
        // Enum default "A" is dictionary key 0 for every row.
        let enum_keys = Int32Array::from_iter_values(std::iter::repeat_n(0, num_rows));
        let enum_values = StringArray::from_iter_values(["A", "B", "C"]);
        let enum_arr =
            DictionaryArray::<Int32Type>::try_new(enum_keys, Arc::new(enum_values)).unwrap();
        arrays.push(Arc::new(enum_arr));
        // All-zero duration bytes decode to a zero months/days/nanos interval.
        let duration_values = std::iter::repeat_n(
            Some(IntervalMonthDayNanoType::make_value(0, 0, 0)),
            num_rows,
        );
        let duration_arr: IntervalMonthDayNanoArray = duration_values.collect();
        arrays.push(Arc::new(duration_arr));
        // The all-zero UUID string decodes to 16 zero bytes.
        let uuid_bytes = [0u8; 16];
        let uuid_iter = std::iter::repeat_n(Some(uuid_bytes), num_rows);
        arrays.push(Arc::new(
            FixedSizeBinaryArray::try_from_sparse_iter_with_size(uuid_iter, 16).unwrap(),
        ));
        // Array default [1, 2, 3] repeated for every row.
        let item_field = Arc::new(Field::new(
            Field::LIST_FIELD_DEFAULT_NAME,
            DataType::Int32,
            false,
        ));
        let mut list_builder = ListBuilder::new(Int32Builder::new()).with_field(item_field);
        for _ in 0..num_rows {
            list_builder.values().append_value(1);
            list_builder.values().append_value(2);
            list_builder.values().append_value(3);
            list_builder.append(true);
        }
        arrays.push(Arc::new(list_builder.finish()));
        // Map default {"a": 1, "b": 2} repeated for every row.
        let values_field = Arc::new(Field::new("value", DataType::Int64, false));
        let mut map_builder = MapBuilder::new(
            Some(builder::MapFieldNames {
                entry: "entries".to_string(),
                key: "key".to_string(),
                value: "value".to_string(),
            }),
            StringBuilder::new(),
            Int64Builder::new(),
        )
        .with_values_field(values_field);
        for _ in 0..num_rows {
            let (keys, vals) = map_builder.entries();
            keys.append_value("a");
            vals.append_value(1);
            keys.append_value("b");
            vals.append_value(2);
            map_builder.append(true).unwrap();
        }
        arrays.push(Arc::new(map_builder.finish()));
        // Record default {x: 7}; field y falls back to its own default (null).
        let rec_fields: Fields = Fields::from(vec![
            Field::new("x", DataType::Int32, false),
            Field::new("y", DataType::Utf8, true),
        ]);
        let mut sb = StructBuilder::new(
            rec_fields.clone(),
            vec![
                Box::new(Int32Builder::new()),
                Box::new(StringBuilder::new()),
            ],
        );
        for _ in 0..num_rows {
            sb.field_builder::<Int32Builder>(0).unwrap().append_value(7);
            sb.field_builder::<StringBuilder>(1).unwrap().append_null();
            sb.append(true);
        }
        arrays.push(Arc::new(sb.finish()));
        // Nullable-union defaults: a null default vs. a concrete default 123.
        arrays.push(Arc::new(Int32Array::from_iter(std::iter::repeat_n(
            None::<i32>,
            num_rows,
        ))));
        arrays.push(Arc::new(Int32Array::from_iter_values(std::iter::repeat_n(
            123, num_rows,
        ))));
        let expected = RecordBatch::try_new(actual.schema(), arrays).unwrap();
        assert_eq!(
            actual, expected,
            "defaults should materialize correctly for all fields"
        );
    }
3079
3080 #[test]
3081 fn test_schema_resolution_default_enum_invalid_symbol_errors() {
3082 let path = "test/data/skippable_types.avro";
3083 let bad_schema = make_reader_schema_with_default_fields(
3084 path,
3085 vec![serde_json::json!({
3086 "name":"bad_enum",
3087 "type":{"type":"enum","name":"E","symbols":["A","B","C"]},
3088 "default":"Z"
3089 })],
3090 );
3091 let file = File::open(path).unwrap();
3092 let res = ReaderBuilder::new()
3093 .with_reader_schema(bad_schema)
3094 .build(BufReader::new(file));
3095 let err = res.expect_err("expected enum default validation to fail");
3096 let msg = err.to_string();
3097 let lower_msg = msg.to_lowercase();
3098 assert!(
3099 lower_msg.contains("enum")
3100 && (lower_msg.contains("symbol") || lower_msg.contains("default")),
3101 "unexpected error: {msg}"
3102 );
3103 }
3104
3105 #[test]
3106 fn test_schema_resolution_default_fixed_size_mismatch_errors() {
3107 let path = "test/data/skippable_types.avro";
3108 let bad_schema = make_reader_schema_with_default_fields(
3109 path,
3110 vec![serde_json::json!({
3111 "name":"bad_fixed",
3112 "type":{"type":"fixed","name":"F","size":4},
3113 "default":"ABC"
3114 })],
3115 );
3116 let file = File::open(path).unwrap();
3117 let res = ReaderBuilder::new()
3118 .with_reader_schema(bad_schema)
3119 .build(BufReader::new(file));
3120 let err = res.expect_err("expected fixed default validation to fail");
3121 let msg = err.to_string();
3122 let lower_msg = msg.to_lowercase();
3123 assert!(
3124 lower_msg.contains("fixed")
3125 && (lower_msg.contains("size")
3126 || lower_msg.contains("length")
3127 || lower_msg.contains("does not match")),
3128 "unexpected error: {msg}"
3129 );
3130 }
3131
3132 #[test]
3133 #[cfg(feature = "snappy")]
3135 fn test_alltypes_skip_writer_fields_keep_double_only() {
3136 let file = arrow_test_data("avro/alltypes_plain.avro");
3137 let reader_schema =
3138 make_reader_schema_with_selected_fields_in_order(&file, &["double_col"]);
3139 let batch = read_alltypes_with_reader_schema(&file, reader_schema);
3140 let expected = RecordBatch::try_from_iter_with_nullable([(
3141 "double_col",
3142 Arc::new(Float64Array::from_iter_values(
3143 (0..8).map(|x| (x % 2) as f64 * 10.1),
3144 )) as _,
3145 true,
3146 )])
3147 .unwrap();
3148 assert_eq!(batch, expected);
3149 }
3150
3151 #[test]
3152 #[cfg(feature = "snappy")]
3154 fn test_alltypes_skip_writer_fields_reorder_and_skip_many() {
3155 let file = arrow_test_data("avro/alltypes_plain.avro");
3156 let reader_schema =
3157 make_reader_schema_with_selected_fields_in_order(&file, &["timestamp_col", "id"]);
3158 let batch = read_alltypes_with_reader_schema(&file, reader_schema);
3159 let expected = RecordBatch::try_from_iter_with_nullable([
3160 (
3161 "timestamp_col",
3162 Arc::new(
3163 TimestampMicrosecondArray::from_iter_values([
3164 1235865600000000, 1235865660000000, 1238544000000000, 1238544060000000, 1233446400000000, 1233446460000000, 1230768000000000, 1230768060000000, ])
3173 .with_timezone("+00:00"),
3174 ) as _,
3175 true,
3176 ),
3177 (
3178 "id",
3179 Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
3180 true,
3181 ),
3182 ])
3183 .unwrap();
3184 assert_eq!(batch, expected);
3185 }
3186
    #[test]
    fn test_skippable_types_project_each_field_individually() {
        // Reads the file once with the full writer schema, then projects each
        // writer field one at a time and checks the single projected column
        // equals the corresponding column of the full read.
        let path = "test/data/skippable_types.avro";
        let full = read_file(path, 1024, false);
        let schema_full = full.schema();
        let num_rows = full.num_rows();
        let writer_json = load_writer_schema_json(path);
        assert_eq!(
            writer_json["type"], "record",
            "writer schema must be a record"
        );
        let fields_json = writer_json
            .get("fields")
            .and_then(|f| f.as_array())
            .expect("record has fields");
        assert_eq!(
            schema_full.fields().len(),
            fields_json.len(),
            "full read column count vs writer fields"
        );
        // Rebuilds a (Large)List array reusing the original offsets, values,
        // and validity but swapping in the projected element field, so that
        // element-level field differences don't cause spurious mismatches.
        fn rebuild_list_array_with_element(
            col: &ArrayRef,
            new_elem: Arc<Field>,
            is_large: bool,
        ) -> ArrayRef {
            if is_large {
                let list = col
                    .as_any()
                    .downcast_ref::<LargeListArray>()
                    .expect("expected LargeListArray");
                let offsets = list.offsets().clone();
                let values = list.values().clone();
                let validity = list.nulls().cloned();
                Arc::new(LargeListArray::try_new(new_elem, offsets, values, validity).unwrap())
            } else {
                let list = col
                    .as_any()
                    .downcast_ref::<ListArray>()
                    .expect("expected ListArray");
                let offsets = list.offsets().clone();
                let values = list.values().clone();
                let validity = list.nulls().cloned();
                Arc::new(ListArray::try_new(new_elem, offsets, values, validity).unwrap())
            }
        }
        for (idx, f) in fields_json.iter().enumerate() {
            let name = f
                .get("name")
                .and_then(|n| n.as_str())
                .unwrap_or_else(|| panic!("field at index {idx} has no name"));
            // Re-read the file with only this one field selected.
            let reader_schema = make_reader_schema_with_selected_fields_in_order(path, &[name]);
            let projected = read_alltypes_with_reader_schema(path, reader_schema);
            assert_eq!(
                projected.num_columns(),
                1,
                "projected batch should contain exactly the selected column '{name}'"
            );
            assert_eq!(
                projected.num_rows(),
                num_rows,
                "row count mismatch for projected column '{name}'"
            );
            let col_full = full.column(idx).clone();
            let full_field = schema_full.field(idx).as_ref().clone();
            let proj_field_ref = projected.schema().field(0).clone();
            let proj_field = proj_field_ref.as_ref();
            let top_meta = proj_field.metadata().clone();
            // For list types, adopt the projected element field; for all other
            // types, only carry over the projected column's top-level metadata.
            let (expected_field_ref, expected_col): (Arc<Field>, ArrayRef) =
                match (full_field.data_type(), proj_field.data_type()) {
                    (&DataType::List(_), DataType::List(proj_elem)) => {
                        let new_col =
                            rebuild_list_array_with_element(&col_full, proj_elem.clone(), false);
                        let nf = Field::new(
                            full_field.name().clone(),
                            proj_field.data_type().clone(),
                            full_field.is_nullable(),
                        )
                        .with_metadata(top_meta);
                        (Arc::new(nf), new_col)
                    }
                    (&DataType::LargeList(_), DataType::LargeList(proj_elem)) => {
                        let new_col =
                            rebuild_list_array_with_element(&col_full, proj_elem.clone(), true);
                        let nf = Field::new(
                            full_field.name().clone(),
                            proj_field.data_type().clone(),
                            full_field.is_nullable(),
                        )
                        .with_metadata(top_meta);
                        (Arc::new(nf), new_col)
                    }
                    _ => {
                        let nf = full_field.with_metadata(top_meta);
                        (Arc::new(nf), col_full)
                    }
                };

            let expected = RecordBatch::try_new(
                Arc::new(Schema::new(vec![expected_field_ref])),
                vec![expected_col],
            )
            .unwrap();
            assert_eq!(
                projected, expected,
                "projected column '{name}' mismatch vs full read column"
            );
        }
    }
3295
    #[test]
    fn test_union_fields_avro_nullable_and_general_unions() {
        // End-to-end union decoding: two-branch unions with "null" collapse
        // to plain nullable columns, while general unions decode to dense
        // Arrow UnionArrays with per-row branch selection.
        let path = "test/data/union_fields.avro";
        let batch = read_file(path, 1024, false);
        let schema = batch.schema();
        // Null-first nullable union -> nullable Int32 column.
        let idx = schema.index_of("nullable_int_nullfirst").unwrap();
        let a = batch.column(idx).as_primitive::<Int32Type>();
        assert_eq!(a.len(), 4);
        assert!(a.is_null(0));
        assert_eq!(a.value(1), 42);
        assert!(a.is_null(2));
        assert_eq!(a.value(3), 0);
        // Null-second nullable union -> nullable Utf8 column.
        let idx = schema.index_of("nullable_string_nullsecond").unwrap();
        let s = batch
            .column(idx)
            .as_any()
            .downcast_ref::<StringArray>()
            .expect("nullable_string_nullsecond should be Utf8");
        assert_eq!(s.len(), 4);
        assert_eq!(s.value(0), "s1");
        assert!(s.is_null(1));
        assert_eq!(s.value(2), "s3");
        assert!(s.is_valid(3));
        // The empty string is a present value, distinct from null.
        assert_eq!(s.value(3), "");
        // Multi-branch primitive union decodes as a dense UnionArray.
        let idx = schema.index_of("union_prim").unwrap();
        let u = batch
            .column(idx)
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("union_prim should be Union");
        let fields = match u.data_type() {
            DataType::Union(fields, mode) => {
                assert!(matches!(mode, UnionMode::Dense), "expect dense unions");
                fields
            }
            other => panic!("expected Union, got {other:?}"),
        };
        // Resolves a union child name to its type id.
        let tid_by_name = |name: &str| -> i8 {
            for (tid, f) in fields.iter() {
                if f.name() == name {
                    return tid;
                }
            }
            panic!("union child '{name}' not found");
        };
        // The four rows select long, int, float, double in that order.
        let expected_type_ids = vec![
            tid_by_name("long"),
            tid_by_name("int"),
            tid_by_name("float"),
            tid_by_name("double"),
        ];
        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
        assert_eq!(
            type_ids, expected_type_ids,
            "branch selection for union_prim rows"
        );
        // Dense layout: each child holds only the rows routed to it (one each).
        let longs = u
            .child(tid_by_name("long"))
            .as_any()
            .downcast_ref::<Int64Array>()
            .unwrap();
        assert_eq!(longs.len(), 1);
        let ints = u
            .child(tid_by_name("int"))
            .as_any()
            .downcast_ref::<Int32Array>()
            .unwrap();
        assert_eq!(ints.len(), 1);
        let floats = u
            .child(tid_by_name("float"))
            .as_any()
            .downcast_ref::<Float32Array>()
            .unwrap();
        assert_eq!(floats.len(), 1);
        let doubles = u
            .child(tid_by_name("double"))
            .as_any()
            .downcast_ref::<Float64Array>()
            .unwrap();
        assert_eq!(doubles.len(), 1);
        // bytes/string union stays a union (distinct Arrow types).
        let idx = schema.index_of("union_bytes_vs_string").unwrap();
        let u = batch
            .column(idx)
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("union_bytes_vs_string should be Union");
        let fields = match u.data_type() {
            DataType::Union(fields, _) => fields,
            other => panic!("expected Union, got {other:?}"),
        };
        let tid_by_name = |name: &str| -> i8 {
            for (tid, f) in fields.iter() {
                if f.name() == name {
                    return tid;
                }
            }
            panic!("union child '{name}' not found");
        };
        let tid_bytes = tid_by_name("bytes");
        let tid_string = tid_by_name("string");
        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
        assert_eq!(
            type_ids,
            vec![tid_bytes, tid_string, tid_string, tid_bytes],
            "branch selection for bytes/string union"
        );
        let s_child = u
            .child(tid_string)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!(s_child.len(), 2);
        assert_eq!(s_child.value(0), "hello");
        assert_eq!(s_child.value(1), "world");
        let b_child = u
            .child(tid_bytes)
            .as_any()
            .downcast_ref::<BinaryArray>()
            .unwrap();
        assert_eq!(b_child.len(), 2);
        assert_eq!(b_child.value(0), &[0x00, 0xFF, 0x7F]);
        // The empty byte string is a present value.
        assert_eq!(b_child.value(1), b"");
        // Union over enum, two records, array, and map; children are
        // identified structurally (by data type / field names), not by name.
        let idx = schema.index_of("union_enum_records_array_map").unwrap();
        let u = batch
            .column(idx)
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("union_enum_records_array_map should be Union");
        let fields = match u.data_type() {
            DataType::Union(fields, _) => fields,
            other => panic!("expected Union, got {other:?}"),
        };
        let mut tid_enum: Option<i8> = None;
        let mut tid_rec_a: Option<i8> = None;
        let mut tid_rec_b: Option<i8> = None;
        let mut tid_array: Option<i8> = None;
        for (tid, f) in fields.iter() {
            match f.data_type() {
                DataType::Dictionary(_, _) => tid_enum = Some(tid),
                DataType::Struct(childs) => {
                    // RecA has fields {a, b}; RecB has fields {x, y}.
                    if childs.len() == 2 && childs[0].name() == "a" && childs[1].name() == "b" {
                        tid_rec_a = Some(tid);
                    } else if childs.len() == 2
                        && childs[0].name() == "x"
                        && childs[1].name() == "y"
                    {
                        tid_rec_b = Some(tid);
                    }
                }
                DataType::List(_) => tid_array = Some(tid),
                _ => {}
            }
        }
        let (tid_enum, tid_rec_a, tid_rec_b, tid_array) = (
            tid_enum.expect("enum child"),
            tid_rec_a.expect("RecA child"),
            tid_rec_b.expect("RecB child"),
            tid_array.expect("array<long> child"),
        );
        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
        assert_eq!(
            type_ids,
            vec![tid_enum, tid_rec_a, tid_rec_b, tid_array],
            "branch selection for complex union"
        );
        let dict = u
            .child(tid_enum)
            .as_any()
            .downcast_ref::<DictionaryArray<Int32Type>>()
            .unwrap();
        assert_eq!(dict.len(), 1);
        assert!(dict.is_valid(0));
        let rec_a = u
            .child(tid_rec_a)
            .as_any()
            .downcast_ref::<StructArray>()
            .unwrap();
        assert_eq!(rec_a.len(), 1);
        let a_val = rec_a
            .column_by_name("a")
            .unwrap()
            .as_any()
            .downcast_ref::<Int32Array>()
            .unwrap();
        assert_eq!(a_val.value(0), 7);
        let b_val = rec_a
            .column_by_name("b")
            .unwrap()
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!(b_val.value(0), "x");
        let rec_b = u
            .child(tid_rec_b)
            .as_any()
            .downcast_ref::<StructArray>()
            .unwrap();
        let x_val = rec_b
            .column_by_name("x")
            .unwrap()
            .as_any()
            .downcast_ref::<Int64Array>()
            .unwrap();
        assert_eq!(x_val.value(0), 123_456_789_i64);
        let y_val = rec_b
            .column_by_name("y")
            .unwrap()
            .as_any()
            .downcast_ref::<BinaryArray>()
            .unwrap();
        assert_eq!(y_val.value(0), &[0xFF, 0x00]);
        let arr = u
            .child(tid_array)
            .as_any()
            .downcast_ref::<ListArray>()
            .unwrap();
        assert_eq!(arr.len(), 1);
        let first_values = arr.value(0);
        let longs = first_values.as_any().downcast_ref::<Int64Array>().unwrap();
        assert_eq!(longs.len(), 3);
        assert_eq!(longs.value(0), 1);
        assert_eq!(longs.value(1), 2);
        assert_eq!(longs.value(2), 3);
        // Union of a date logical type and a fixed(4), identified by type.
        let idx = schema.index_of("union_date_or_fixed4").unwrap();
        let u = batch
            .column(idx)
            .as_any()
            .downcast_ref::<UnionArray>()
            .expect("union_date_or_fixed4 should be Union");
        let fields = match u.data_type() {
            DataType::Union(fields, _) => fields,
            other => panic!("expected Union, got {other:?}"),
        };
        let mut tid_date: Option<i8> = None;
        let mut tid_fixed: Option<i8> = None;
        for (tid, f) in fields.iter() {
            match f.data_type() {
                DataType::Date32 => tid_date = Some(tid),
                DataType::FixedSizeBinary(4) => tid_fixed = Some(tid),
                _ => {}
            }
        }
        let (tid_date, tid_fixed) = (tid_date.expect("date"), tid_fixed.expect("fixed(4)"));
        let type_ids: Vec<i8> = u.type_ids().iter().copied().collect();
        assert_eq!(
            type_ids,
            vec![tid_date, tid_fixed, tid_date, tid_fixed],
            "branch selection for date/fixed4 union"
        );
        let dates = u
            .child(tid_date)
            .as_any()
            .downcast_ref::<Date32Array>()
            .unwrap();
        assert_eq!(dates.len(), 2);
        // Date32 values are days since the Unix epoch.
        assert_eq!(dates.value(0), 19_000);
        assert_eq!(dates.value(1), 0);
        let fixed = u
            .child(tid_fixed)
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();
        assert_eq!(fixed.len(), 2);
        assert_eq!(fixed.value(0), b"ABCD");
        assert_eq!(fixed.value(1), &[0x00, 0x11, 0x22, 0x33]);
    }
3563
3564 #[test]
3565 fn test_union_schema_resolution_all_type_combinations() {
3566 let path = "test/data/union_fields.avro";
3567 let baseline = read_file(path, 1024, false);
3568 let baseline_schema = baseline.schema();
3569 let mut root = load_writer_schema_json(path);
3570 assert_eq!(root["type"], "record", "writer schema must be a record");
3571 let fields = root
3572 .get_mut("fields")
3573 .and_then(|f| f.as_array_mut())
3574 .expect("record has fields");
3575 fn is_named_type(obj: &Value, ty: &str, nm: &str) -> bool {
3576 obj.get("type").and_then(|v| v.as_str()) == Some(ty)
3577 && obj.get("name").and_then(|v| v.as_str()) == Some(nm)
3578 }
3579 fn is_logical(obj: &Value, prim: &str, lt: &str) -> bool {
3580 obj.get("type").and_then(|v| v.as_str()) == Some(prim)
3581 && obj.get("logicalType").and_then(|v| v.as_str()) == Some(lt)
3582 }
3583 fn find_first(arr: &[Value], pred: impl Fn(&Value) -> bool) -> Option<Value> {
3584 arr.iter().find(|v| pred(v)).cloned()
3585 }
3586 fn prim(s: &str) -> Value {
3587 Value::String(s.to_string())
3588 }
3589 for f in fields.iter_mut() {
3590 let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
3591 continue;
3592 };
3593 match name {
3594 "nullable_int_nullfirst" => {
3596 f["type"] = json!(["int", "null"]);
3597 }
3598 "nullable_string_nullsecond" => {
3599 f["type"] = json!(["null", "string"]);
3600 }
3601 "union_prim" => {
3602 let orig = f["type"].as_array().unwrap().clone();
3603 let long = prim("long");
3604 let double = prim("double");
3605 let string = prim("string");
3606 let bytes = prim("bytes");
3607 let boolean = prim("boolean");
3608 assert!(orig.contains(&long));
3609 assert!(orig.contains(&double));
3610 assert!(orig.contains(&string));
3611 assert!(orig.contains(&bytes));
3612 assert!(orig.contains(&boolean));
3613 f["type"] = json!([long, double, string, bytes, boolean]);
3614 }
3615 "union_bytes_vs_string" => {
3616 f["type"] = json!(["string", "bytes"]);
3617 }
3618 "union_fixed_dur_decfix" => {
3619 let orig = f["type"].as_array().unwrap().clone();
3620 let fx8 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx8")).unwrap();
3621 let dur12 = find_first(&orig, |o| is_named_type(o, "fixed", "Dur12")).unwrap();
3622 let decfix16 =
3623 find_first(&orig, |o| is_named_type(o, "fixed", "DecFix16")).unwrap();
3624 f["type"] = json!([decfix16, dur12, fx8]);
3625 }
3626 "union_enum_records_array_map" => {
3627 let orig = f["type"].as_array().unwrap().clone();
3628 let enum_color = find_first(&orig, |o| {
3629 o.get("type").and_then(|v| v.as_str()) == Some("enum")
3630 })
3631 .unwrap();
3632 let rec_a = find_first(&orig, |o| is_named_type(o, "record", "RecA")).unwrap();
3633 let rec_b = find_first(&orig, |o| is_named_type(o, "record", "RecB")).unwrap();
3634 let arr = find_first(&orig, |o| {
3635 o.get("type").and_then(|v| v.as_str()) == Some("array")
3636 })
3637 .unwrap();
3638 let map = find_first(&orig, |o| {
3639 o.get("type").and_then(|v| v.as_str()) == Some("map")
3640 })
3641 .unwrap();
3642 f["type"] = json!([arr, map, rec_b, rec_a, enum_color]);
3643 }
3644 "union_date_or_fixed4" => {
3645 let orig = f["type"].as_array().unwrap().clone();
3646 let date = find_first(&orig, |o| is_logical(o, "int", "date")).unwrap();
3647 let fx4 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx4")).unwrap();
3648 f["type"] = json!([fx4, date]);
3649 }
3650 "union_time_millis_or_enum" => {
3651 let orig = f["type"].as_array().unwrap().clone();
3652 let time_ms =
3653 find_first(&orig, |o| is_logical(o, "int", "time-millis")).unwrap();
3654 let en = find_first(&orig, |o| {
3655 o.get("type").and_then(|v| v.as_str()) == Some("enum")
3656 })
3657 .unwrap();
3658 f["type"] = json!([en, time_ms]);
3659 }
3660 "union_time_micros_or_string" => {
3661 let orig = f["type"].as_array().unwrap().clone();
3662 let time_us =
3663 find_first(&orig, |o| is_logical(o, "long", "time-micros")).unwrap();
3664 f["type"] = json!(["string", time_us]);
3665 }
3666 "union_ts_millis_utc_or_array" => {
3667 let orig = f["type"].as_array().unwrap().clone();
3668 let ts_ms =
3669 find_first(&orig, |o| is_logical(o, "long", "timestamp-millis")).unwrap();
3670 let arr = find_first(&orig, |o| {
3671 o.get("type").and_then(|v| v.as_str()) == Some("array")
3672 })
3673 .unwrap();
3674 f["type"] = json!([arr, ts_ms]);
3675 }
3676 "union_ts_micros_local_or_bytes" => {
3677 let orig = f["type"].as_array().unwrap().clone();
3678 let lts_us =
3679 find_first(&orig, |o| is_logical(o, "long", "local-timestamp-micros"))
3680 .unwrap();
3681 f["type"] = json!(["bytes", lts_us]);
3682 }
3683 "union_uuid_or_fixed10" => {
3684 let orig = f["type"].as_array().unwrap().clone();
3685 let uuid = find_first(&orig, |o| is_logical(o, "string", "uuid")).unwrap();
3686 let fx10 = find_first(&orig, |o| is_named_type(o, "fixed", "Fx10")).unwrap();
3687 f["type"] = json!([fx10, uuid]);
3688 }
3689 "union_dec_bytes_or_dec_fixed" => {
3690 let orig = f["type"].as_array().unwrap().clone();
3691 let dec_bytes = find_first(&orig, |o| {
3692 o.get("type").and_then(|v| v.as_str()) == Some("bytes")
3693 && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3694 })
3695 .unwrap();
3696 let dec_fix = find_first(&orig, |o| {
3697 is_named_type(o, "fixed", "DecFix20")
3698 && o.get("logicalType").and_then(|v| v.as_str()) == Some("decimal")
3699 })
3700 .unwrap();
3701 f["type"] = json!([dec_fix, dec_bytes]);
3702 }
3703 "union_null_bytes_string" => {
3704 f["type"] = json!(["bytes", "string", "null"]);
3705 }
3706 "array_of_union" => {
3707 let obj = f
3708 .get_mut("type")
3709 .expect("array type")
3710 .as_object_mut()
3711 .unwrap();
3712 obj.insert("items".to_string(), json!(["string", "long"]));
3713 }
3714 "map_of_union" => {
3715 let obj = f
3716 .get_mut("type")
3717 .expect("map type")
3718 .as_object_mut()
3719 .unwrap();
3720 obj.insert("values".to_string(), json!(["double", "null"]));
3721 }
3722 "record_with_union_field" => {
3723 let rec = f
3724 .get_mut("type")
3725 .expect("record type")
3726 .as_object_mut()
3727 .unwrap();
3728 let rec_fields = rec.get_mut("fields").unwrap().as_array_mut().unwrap();
3729 let mut found = false;
3730 for rf in rec_fields.iter_mut() {
3731 if rf.get("name").and_then(|v| v.as_str()) == Some("u") {
3732 rf["type"] = json!(["string", "long"]); found = true;
3734 break;
3735 }
3736 }
3737 assert!(found, "field 'u' expected in HasUnion");
3738 }
3739 "union_ts_micros_utc_or_map" => {
3740 let orig = f["type"].as_array().unwrap().clone();
3741 let ts_us =
3742 find_first(&orig, |o| is_logical(o, "long", "timestamp-micros")).unwrap();
3743 let map = find_first(&orig, |o| {
3744 o.get("type").and_then(|v| v.as_str()) == Some("map")
3745 })
3746 .unwrap();
3747 f["type"] = json!([map, ts_us]);
3748 }
3749 "union_ts_millis_local_or_string" => {
3750 let orig = f["type"].as_array().unwrap().clone();
3751 let lts_ms =
3752 find_first(&orig, |o| is_logical(o, "long", "local-timestamp-millis"))
3753 .unwrap();
3754 f["type"] = json!(["string", lts_ms]);
3755 }
3756 "union_bool_or_string" => {
3757 f["type"] = json!(["string", "boolean"]);
3758 }
3759 _ => {}
3760 }
3761 }
3762 let reader_schema = AvroSchema::new(root.to_string());
3763 let resolved = read_alltypes_with_reader_schema(path, reader_schema);
3764
        // Maps an Arrow data type to a stable token naming the Avro union
        // branch it decodes from, so branch selection can be compared across
        // writer and reader schemas.
        fn branch_token(dt: &DataType) -> String {
            match dt {
                DataType::Null => "null".into(),
                DataType::Boolean => "boolean".into(),
                DataType::Int32 => "int".into(),
                DataType::Int64 => "long".into(),
                DataType::Float32 => "float".into(),
                DataType::Float64 => "double".into(),
                DataType::Binary => "bytes".into(),
                DataType::Utf8 => "string".into(),
                DataType::Date32 => "date".into(),
                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => "time-millis".into(),
                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => "time-micros".into(),
                // A timezone distinguishes UTC timestamps from local ones.
                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => if tz.is_some() {
                    "timestamp-millis"
                } else {
                    "local-timestamp-millis"
                }
                .into(),
                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => if tz.is_some() {
                    "timestamp-micros"
                } else {
                    "local-timestamp-micros"
                }
                .into(),
                DataType::Interval(IntervalUnit::MonthDayNano) => "duration".into(),
                DataType::FixedSizeBinary(n) => format!("fixed{n}"),
                DataType::Dictionary(_, _) => "enum".into(),
                DataType::Decimal128(p, s) => format!("decimal({p},{s})"),
                DataType::Decimal256(p, s) => format!("decimal({p},{s})"),
                #[cfg(feature = "small_decimals")]
                DataType::Decimal64(p, s) => format!("decimal({p},{s})"),
                // Structs are disambiguated by their field names (RecA vs RecB).
                DataType::Struct(fields) => {
                    if fields.len() == 2 && fields[0].name() == "a" && fields[1].name() == "b" {
                        "record:RecA".into()
                    } else if fields.len() == 2
                        && fields[0].name() == "x"
                        && fields[1].name() == "y"
                    {
                        "record:RecB".into()
                    } else {
                        "record".into()
                    }
                }
                DataType::List(_) => "array".into(),
                DataType::Map(_, _) => "map".into(),
                // Fallback: debug-render any type not covered above.
                other => format!("{other:?}"),
            }
        }
3814
3815 fn union_tokens(u: &UnionArray) -> (Vec<i8>, HashMap<i8, String>) {
3816 let fields = match u.data_type() {
3817 DataType::Union(fields, _) => fields,
3818 other => panic!("expected Union, got {other:?}"),
3819 };
3820 let mut dict: HashMap<i8, String> = HashMap::with_capacity(fields.len());
3821 for (tid, f) in fields.iter() {
3822 dict.insert(tid, branch_token(f.data_type()));
3823 }
3824 let ids: Vec<i8> = u.type_ids().iter().copied().collect();
3825 (ids, dict)
3826 }
3827
3828 fn expected_token(field_name: &str, writer_token: &str) -> String {
3829 match field_name {
3830 "union_prim" => match writer_token {
3831 "int" => "long".into(),
3832 "float" => "double".into(),
3833 other => other.into(),
3834 },
3835 "record_with_union_field.u" => match writer_token {
3836 "int" => "long".into(),
3837 other => other.into(),
3838 },
3839 _ => writer_token.into(),
3840 }
3841 }
3842
3843 fn get_union<'a>(
3844 rb: &'a RecordBatch,
3845 schema: arrow_schema::SchemaRef,
3846 fname: &str,
3847 ) -> &'a UnionArray {
3848 let idx = schema.index_of(fname).unwrap();
3849 rb.column(idx)
3850 .as_any()
3851 .downcast_ref::<UnionArray>()
3852 .unwrap_or_else(|| panic!("{fname} should be a Union"))
3853 }
3854
3855 fn assert_union_equivalent(field_name: &str, u_writer: &UnionArray, u_reader: &UnionArray) {
3856 let (ids_w, dict_w) = union_tokens(u_writer);
3857 let (ids_r, dict_r) = union_tokens(u_reader);
3858 assert_eq!(
3859 ids_w.len(),
3860 ids_r.len(),
3861 "{field_name}: row count mismatch between baseline and resolved"
3862 );
3863 for (i, (id_w, id_r)) in ids_w.iter().zip(ids_r.iter()).enumerate() {
3864 let w_tok = dict_w.get(id_w).unwrap();
3865 let want = expected_token(field_name, w_tok);
3866 let got = dict_r.get(id_r).unwrap();
3867 assert_eq!(
3868 got, &want,
3869 "{field_name}: row {i} resolved to wrong union branch (writer={w_tok}, expected={want}, got={got})"
3870 );
3871 }
3872 }
3873
3874 for (fname, dt) in [
3875 ("nullable_int_nullfirst", DataType::Int32),
3876 ("nullable_string_nullsecond", DataType::Utf8),
3877 ] {
3878 let idx_b = baseline_schema.index_of(fname).unwrap();
3879 let idx_r = resolved.schema().index_of(fname).unwrap();
3880 let col_b = baseline.column(idx_b);
3881 let col_r = resolved.column(idx_r);
3882 assert_eq!(
3883 col_b.data_type(),
3884 &dt,
3885 "baseline {fname} should decode as non-union with nullability"
3886 );
3887 assert_eq!(
3888 col_b.as_ref(),
3889 col_r.as_ref(),
3890 "{fname}: values must be identical regardless of null-branch order"
3891 );
3892 }
3893 let union_fields = [
3894 "union_prim",
3895 "union_bytes_vs_string",
3896 "union_fixed_dur_decfix",
3897 "union_enum_records_array_map",
3898 "union_date_or_fixed4",
3899 "union_time_millis_or_enum",
3900 "union_time_micros_or_string",
3901 "union_ts_millis_utc_or_array",
3902 "union_ts_micros_local_or_bytes",
3903 "union_uuid_or_fixed10",
3904 "union_dec_bytes_or_dec_fixed",
3905 "union_null_bytes_string",
3906 "union_ts_micros_utc_or_map",
3907 "union_ts_millis_local_or_string",
3908 "union_bool_or_string",
3909 ];
3910 for fname in union_fields {
3911 let u_b = get_union(&baseline, baseline_schema.clone(), fname);
3912 let u_r = get_union(&resolved, resolved.schema(), fname);
3913 assert_union_equivalent(fname, u_b, u_r);
3914 }
3915 {
3916 let fname = "array_of_union";
3917 let idx_b = baseline_schema.index_of(fname).unwrap();
3918 let idx_r = resolved.schema().index_of(fname).unwrap();
3919 let arr_b = baseline
3920 .column(idx_b)
3921 .as_any()
3922 .downcast_ref::<ListArray>()
3923 .expect("array_of_union should be a List");
3924 let arr_r = resolved
3925 .column(idx_r)
3926 .as_any()
3927 .downcast_ref::<ListArray>()
3928 .expect("array_of_union should be a List");
3929 assert_eq!(
3930 arr_b.value_offsets(),
3931 arr_r.value_offsets(),
3932 "{fname}: list offsets changed after resolution"
3933 );
3934 let u_b = arr_b
3935 .values()
3936 .as_any()
3937 .downcast_ref::<UnionArray>()
3938 .expect("array items should be Union");
3939 let u_r = arr_r
3940 .values()
3941 .as_any()
3942 .downcast_ref::<UnionArray>()
3943 .expect("array items should be Union");
3944 let (ids_b, dict_b) = union_tokens(u_b);
3945 let (ids_r, dict_r) = union_tokens(u_r);
3946 assert_eq!(ids_b.len(), ids_r.len(), "{fname}: values length mismatch");
3947 for (i, (id_b, id_r)) in ids_b.iter().zip(ids_r.iter()).enumerate() {
3948 let w_tok = dict_b.get(id_b).unwrap();
3949 let got = dict_r.get(id_r).unwrap();
3950 assert_eq!(
3951 got, w_tok,
3952 "{fname}: value {i} resolved to wrong branch (writer={w_tok}, got={got})"
3953 );
3954 }
3955 }
3956 {
3957 let fname = "map_of_union";
3958 let idx_b = baseline_schema.index_of(fname).unwrap();
3959 let idx_r = resolved.schema().index_of(fname).unwrap();
3960 let map_b = baseline
3961 .column(idx_b)
3962 .as_any()
3963 .downcast_ref::<MapArray>()
3964 .expect("map_of_union should be a Map");
3965 let map_r = resolved
3966 .column(idx_r)
3967 .as_any()
3968 .downcast_ref::<MapArray>()
3969 .expect("map_of_union should be a Map");
3970 assert_eq!(
3971 map_b.value_offsets(),
3972 map_r.value_offsets(),
3973 "{fname}: map value offsets changed after resolution"
3974 );
3975 let ent_b = map_b.entries();
3976 let ent_r = map_r.entries();
3977 let val_b_any = ent_b.column(1).as_ref();
3978 let val_r_any = ent_r.column(1).as_ref();
3979 let b_union = val_b_any.as_any().downcast_ref::<UnionArray>();
3980 let r_union = val_r_any.as_any().downcast_ref::<UnionArray>();
3981 if let (Some(u_b), Some(u_r)) = (b_union, r_union) {
3982 assert_union_equivalent(fname, u_b, u_r);
3983 } else {
3984 assert_eq!(
3985 val_b_any.data_type(),
3986 val_r_any.data_type(),
3987 "{fname}: value data types differ after resolution"
3988 );
3989 assert_eq!(
3990 val_b_any, val_r_any,
3991 "{fname}: value arrays differ after resolution (nullable value column case)"
3992 );
3993 let value_nullable = |m: &MapArray| -> bool {
3994 match m.data_type() {
3995 DataType::Map(entries_field, _sorted) => match entries_field.data_type() {
3996 DataType::Struct(fields) => {
3997 assert_eq!(fields.len(), 2, "entries struct must have 2 fields");
3998 assert_eq!(fields[0].name(), "key");
3999 assert_eq!(fields[1].name(), "value");
4000 fields[1].is_nullable()
4001 }
4002 other => panic!("Map entries field must be Struct, got {other:?}"),
4003 },
4004 other => panic!("expected Map data type, got {other:?}"),
4005 }
4006 };
4007 assert!(
4008 value_nullable(map_b),
4009 "{fname}: baseline Map value field should be nullable per Arrow spec"
4010 );
4011 assert!(
4012 value_nullable(map_r),
4013 "{fname}: resolved Map value field should be nullable per Arrow spec"
4014 );
4015 }
4016 }
4017 {
4018 let fname = "record_with_union_field";
4019 let idx_b = baseline_schema.index_of(fname).unwrap();
4020 let idx_r = resolved.schema().index_of(fname).unwrap();
4021 let rec_b = baseline
4022 .column(idx_b)
4023 .as_any()
4024 .downcast_ref::<StructArray>()
4025 .expect("record_with_union_field should be a Struct");
4026 let rec_r = resolved
4027 .column(idx_r)
4028 .as_any()
4029 .downcast_ref::<StructArray>()
4030 .expect("record_with_union_field should be a Struct");
4031 let u_b = rec_b
4032 .column_by_name("u")
4033 .unwrap()
4034 .as_any()
4035 .downcast_ref::<UnionArray>()
4036 .expect("field 'u' should be Union (baseline)");
4037 let u_r = rec_r
4038 .column_by_name("u")
4039 .unwrap()
4040 .as_any()
4041 .downcast_ref::<UnionArray>()
4042 .expect("field 'u' should be Union (resolved)");
4043 assert_union_equivalent("record_with_union_field.u", u_b, u_r);
4044 }
4045 }
4046
4047 #[test]
4048 fn test_union_fields_end_to_end_expected_arrays() {
4049 fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
4050 for (tid, f) in fields.iter() {
4051 if f.name() == want {
4052 return tid;
4053 }
4054 }
4055 panic!("union child '{want}' not found")
4056 }
4057
4058 fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
4059 for (tid, f) in fields.iter() {
4060 if pred(f.data_type()) {
4061 return tid;
4062 }
4063 }
4064 panic!("no union child matches predicate");
4065 }
4066
4067 fn uuid16_from_str(s: &str) -> [u8; 16] {
4068 fn hex(b: u8) -> u8 {
4069 match b {
4070 b'0'..=b'9' => b - b'0',
4071 b'a'..=b'f' => b - b'a' + 10,
4072 b'A'..=b'F' => b - b'A' + 10,
4073 _ => panic!("invalid hex"),
4074 }
4075 }
4076 let mut out = [0u8; 16];
4077 let bytes = s.as_bytes();
4078 let (mut i, mut j) = (0, 0);
4079 while i < bytes.len() {
4080 if bytes[i] == b'-' {
4081 i += 1;
4082 continue;
4083 }
4084 let hi = hex(bytes[i]);
4085 let lo = hex(bytes[i + 1]);
4086 out[j] = (hi << 4) | lo;
4087 j += 1;
4088 i += 2;
4089 }
4090 assert_eq!(j, 16, "uuid must decode to 16 bytes");
4091 out
4092 }
4093
        /// Builds a zero-length Arrow array whose type matches `dt`.
        ///
        /// Dense-union children must exist for every branch even when no row
        /// selects that branch; this supplies those placeholder children
        /// (recursing for unions nested inside lists and maps). Panics on any
        /// type this test suite does not exercise.
        fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
            match dt {
                DataType::Null => Arc::new(NullArray::new(0)),
                DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
                DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
                DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
                DataType::Float32 => Arc::new(arrow_array::Float32Array::from(Vec::<f32>::new())),
                DataType::Float64 => Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())),
                DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
                DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
                DataType::Date32 => Arc::new(arrow_array::Date32Array::from(Vec::<i32>::new())),
                DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
                    Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
                }
                DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
                    Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
                }
                // Timestamp children must carry the same timezone as the
                // schema field, or the union's type check would reject them.
                DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
                    let a = TimestampMillisecondArray::from(Vec::<i64>::new());
                    Arc::new(if let Some(tz) = tz {
                        a.with_timezone(tz.clone())
                    } else {
                        a
                    })
                }
                DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
                    let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
                    Arc::new(if let Some(tz) = tz {
                        a.with_timezone(tz.clone())
                    } else {
                        a
                    })
                }
                DataType::Interval(IntervalUnit::MonthDayNano) => {
                    Arc::new(arrow_array::IntervalMonthDayNanoArray::from(Vec::<
                        IntervalMonthDayNano,
                    >::new(
                    )))
                }
                DataType::FixedSizeBinary(n) => Arc::new(FixedSizeBinaryArray::new_null(*n, 0)),
                // Avro enums decode to Int32-keyed string dictionaries; this
                // test suite only ever produces that shape.
                DataType::Dictionary(k, v) => {
                    assert_eq!(**k, DataType::Int32, "expect int32 keys for enums");
                    let keys = Int32Array::from(Vec::<i32>::new());
                    let values = match v.as_ref() {
                        DataType::Utf8 => {
                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
                        }
                        other => panic!("unexpected dictionary value type {other:?}"),
                    };
                    Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
                }
                // Empty list: a single offset [0] and an empty values array of
                // the item type (recursing for union items).
                DataType::List(field) => {
                    let values: ArrayRef = match field.data_type() {
                        DataType::Int32 => {
                            Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
                        }
                        DataType::Int64 => {
                            Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
                        }
                        DataType::Utf8 => {
                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
                        }
                        DataType::Union(_, _) => {
                            let (uf, _) = if let DataType::Union(f, m) = field.data_type() {
                                (f.clone(), m)
                            } else {
                                unreachable!()
                            };
                            let children: Vec<ArrayRef> = uf
                                .iter()
                                .map(|(_, f)| empty_child_for(f.data_type()))
                                .collect();
                            Arc::new(
                                UnionArray::try_new(
                                    uf.clone(),
                                    ScalarBuffer::<i8>::from(Vec::<i8>::new()),
                                    Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
                                    children,
                                )
                                .unwrap(),
                            ) as ArrayRef
                        }
                        other => panic!("unsupported list item type: {other:?}"),
                    };
                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
                    Arc::new(ListArray::try_new(field.clone(), offsets, values, None).unwrap())
                }
                // Empty map: empty key/value entry struct plus offsets [0],
                // preserving the schema's `ordered` flag.
                DataType::Map(entry_field, ordered) => {
                    let DataType::Struct(childs) = entry_field.data_type() else {
                        panic!("map entries must be struct")
                    };
                    let key_field = &childs[0];
                    let val_field = &childs[1];
                    assert_eq!(key_field.data_type(), &DataType::Utf8);
                    let keys = StringArray::from(Vec::<&str>::new());
                    let vals: ArrayRef = match val_field.data_type() {
                        DataType::Float64 => {
                            Arc::new(arrow_array::Float64Array::from(Vec::<f64>::new())) as ArrayRef
                        }
                        DataType::Int64 => {
                            Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
                        }
                        DataType::Utf8 => {
                            Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
                        }
                        DataType::Union(uf, _) => {
                            let ch: Vec<ArrayRef> = uf
                                .iter()
                                .map(|(_, f)| empty_child_for(f.data_type()))
                                .collect();
                            Arc::new(
                                UnionArray::try_new(
                                    uf.clone(),
                                    ScalarBuffer::<i8>::from(Vec::<i8>::new()),
                                    Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
                                    ch,
                                )
                                .unwrap(),
                            ) as ArrayRef
                        }
                        other => panic!("unsupported map value type: {other:?}"),
                    };
                    let entries = StructArray::new(
                        Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
                        vec![Arc::new(keys) as ArrayRef, vals],
                        None,
                    );
                    let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
                    Arc::new(MapArray::new(
                        entry_field.clone(),
                        offsets,
                        entries,
                        None,
                        *ordered,
                    ))
                }
                other => panic!("empty_child_for: unhandled type {other:?}"),
            }
        }
4233
4234 fn mk_dense_union(
4235 fields: &UnionFields,
4236 type_ids: Vec<i8>,
4237 offsets: Vec<i32>,
4238 provide: impl Fn(&Field) -> Option<ArrayRef>,
4239 ) -> ArrayRef {
4240 let children: Vec<ArrayRef> = fields
4241 .iter()
4242 .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
4243 .collect();
4244
4245 Arc::new(
4246 UnionArray::try_new(
4247 fields.clone(),
4248 ScalarBuffer::<i8>::from(type_ids),
4249 Some(ScalarBuffer::<i32>::from(offsets)),
4250 children,
4251 )
4252 .unwrap(),
4253 ) as ArrayRef
4254 }
4255
4256 let date_a: i32 = 19_000;
4258 let time_ms_a: i32 = 13 * 3_600_000 + 45 * 60_000 + 30_000 + 123;
4259 let time_us_b: i64 = 23 * 3_600_000_000 + 59 * 60_000_000 + 59 * 1_000_000 + 999_999;
4260 let ts_ms_2024_01_01: i64 = 1_704_067_200_000;
4261 let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1000;
4262 let fx8_a: [u8; 8] = *b"ABCDEFGH";
4264 let fx4_abcd: [u8; 4] = *b"ABCD";
4265 let fx4_misc: [u8; 4] = [0x00, 0x11, 0x22, 0x33];
4266 let fx10_ascii: [u8; 10] = *b"0123456789";
4267 let fx10_aa: [u8; 10] = [0xAA; 10];
4268 let dur_a = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
4270 let dur_b = IntervalMonthDayNanoType::make_value(12, 31, 999_000_000);
4271 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
4273 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
4274 let dec_b_scale2_pos: i128 = 123_456; let dec_fix16_neg: i128 = -101; let dec_fix20_s4: i128 = 1_234_567_891_234; let dec_fix20_s4_neg: i128 = -123; let path = "test/data/union_fields.avro";
4280 let actual = read_file(path, 1024, false);
4281 let schema = actual.schema();
4282 let get_union = |name: &str| -> (UnionFields, UnionMode) {
4284 let idx = schema.index_of(name).unwrap();
4285 match schema.field(idx).data_type() {
4286 DataType::Union(f, m) => (f.clone(), *m),
4287 other => panic!("{name} should be a Union, got {other:?}"),
4288 }
4289 };
4290 let mut expected_cols: Vec<ArrayRef> = Vec::with_capacity(schema.fields().len());
4291 expected_cols.push(Arc::new(Int32Array::from(vec![
4293 None,
4294 Some(42),
4295 None,
4296 Some(0),
4297 ])));
4298 expected_cols.push(Arc::new(StringArray::from(vec![
4300 Some("s1"),
4301 None,
4302 Some("s3"),
4303 Some(""),
4304 ])));
4305 {
4307 let (uf, mode) = get_union("union_prim");
4308 assert!(matches!(mode, UnionMode::Dense));
4309 let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4310 let expected_names = vec![
4311 "boolean", "int", "long", "float", "double", "bytes", "string",
4312 ];
4313 assert_eq!(
4314 generated_names, expected_names,
4315 "Field names for union_prim are incorrect"
4316 );
4317 let tids = vec![
4318 tid_by_name(&uf, "long"),
4319 tid_by_name(&uf, "int"),
4320 tid_by_name(&uf, "float"),
4321 tid_by_name(&uf, "double"),
4322 ];
4323 let offs = vec![0, 0, 0, 0];
4324 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4325 "int" => Some(Arc::new(Int32Array::from(vec![-1])) as ArrayRef),
4326 "long" => Some(Arc::new(Int64Array::from(vec![1_234_567_890_123i64])) as ArrayRef),
4327 "float" => {
4328 Some(Arc::new(arrow_array::Float32Array::from(vec![1.25f32])) as ArrayRef)
4329 }
4330 "double" => {
4331 Some(Arc::new(arrow_array::Float64Array::from(vec![-2.5f64])) as ArrayRef)
4332 }
4333 _ => None,
4334 });
4335 expected_cols.push(arr);
4336 }
4337 {
4339 let (uf, _) = get_union("union_bytes_vs_string");
4340 let tids = vec![
4341 tid_by_name(&uf, "bytes"),
4342 tid_by_name(&uf, "string"),
4343 tid_by_name(&uf, "string"),
4344 tid_by_name(&uf, "bytes"),
4345 ];
4346 let offs = vec![0, 0, 1, 1];
4347 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4348 "bytes" => Some(
4349 Arc::new(BinaryArray::from(vec![&[0x00, 0xFF, 0x7F][..], &[][..]])) as ArrayRef,
4350 ),
4351 "string" => Some(Arc::new(StringArray::from(vec!["hello", "world"])) as ArrayRef),
4352 _ => None,
4353 });
4354 expected_cols.push(arr);
4355 }
4356 {
4358 let (uf, _) = get_union("union_fixed_dur_decfix");
4359 let tid_fx8 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(8)));
4360 let tid_dur = tid_by_dt(&uf, |dt| {
4361 matches!(
4362 dt,
4363 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)
4364 )
4365 });
4366 let tid_dec = tid_by_dt(&uf, |dt| match dt {
4367 #[cfg(feature = "small_decimals")]
4368 DataType::Decimal64(10, 2) => true,
4369 DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4370 _ => false,
4371 });
4372 let tids = vec![tid_fx8, tid_dur, tid_dec, tid_dur];
4373 let offs = vec![0, 0, 0, 1];
4374 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4375 DataType::FixedSizeBinary(8) => {
4376 let it = [Some(fx8_a)].into_iter();
4377 Some(Arc::new(
4378 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 8).unwrap(),
4379 ) as ArrayRef)
4380 }
4381 DataType::Interval(IntervalUnit::MonthDayNano) => {
4382 Some(Arc::new(arrow_array::IntervalMonthDayNanoArray::from(vec![
4383 dur_a, dur_b,
4384 ])) as ArrayRef)
4385 }
4386 #[cfg(feature = "small_decimals")]
4387 DataType::Decimal64(10, 2) => {
4388 let a = arrow_array::Decimal64Array::from_iter_values([dec_fix16_neg as i64]);
4389 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4390 }
4391 DataType::Decimal128(10, 2) => {
4392 let a = arrow_array::Decimal128Array::from_iter_values([dec_fix16_neg]);
4393 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4394 }
4395 DataType::Decimal256(10, 2) => {
4396 let a = arrow_array::Decimal256Array::from_iter_values([i256::from_i128(
4397 dec_fix16_neg,
4398 )]);
4399 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4400 }
4401 _ => None,
4402 });
4403 let generated_names: Vec<&str> = uf.iter().map(|(_, f)| f.name().as_str()).collect();
4404 let expected_names = vec!["Fx8", "Dur12", "DecFix16"];
4405 assert_eq!(
4406 generated_names, expected_names,
4407 "Data type names were not generated correctly for union_fixed_dur_decfix"
4408 );
4409 expected_cols.push(arr);
4410 }
4411 {
4413 let (uf, _) = get_union("union_enum_records_array_map");
4414 let tid_enum = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4415 let tid_reca = tid_by_dt(&uf, |dt| {
4416 if let DataType::Struct(fs) = dt {
4417 fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b"
4418 } else {
4419 false
4420 }
4421 });
4422 let tid_recb = tid_by_dt(&uf, |dt| {
4423 if let DataType::Struct(fs) = dt {
4424 fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y"
4425 } else {
4426 false
4427 }
4428 });
4429 let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4430 let tids = vec![tid_enum, tid_reca, tid_recb, tid_arr];
4431 let offs = vec![0, 0, 0, 0];
4432 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4433 DataType::Dictionary(_, _) => {
4434 let keys = Int32Array::from(vec![0i32]); let values =
4436 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
4437 Some(
4438 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4439 as ArrayRef,
4440 )
4441 }
4442 DataType::Struct(fs)
4443 if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
4444 {
4445 let a = Int32Array::from(vec![7]);
4446 let b = StringArray::from(vec!["x"]);
4447 Some(Arc::new(StructArray::new(
4448 fs.clone(),
4449 vec![Arc::new(a), Arc::new(b)],
4450 None,
4451 )) as ArrayRef)
4452 }
4453 DataType::Struct(fs)
4454 if fs.len() == 2 && fs[0].name() == "x" && fs[1].name() == "y" =>
4455 {
4456 let x = Int64Array::from(vec![123_456_789i64]);
4457 let y = BinaryArray::from(vec![&[0xFF, 0x00][..]]);
4458 Some(Arc::new(StructArray::new(
4459 fs.clone(),
4460 vec![Arc::new(x), Arc::new(y)],
4461 None,
4462 )) as ArrayRef)
4463 }
4464 DataType::List(field) => {
4465 let values = Int64Array::from(vec![1i64, 2, 3]);
4466 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
4467 Some(Arc::new(
4468 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4469 ) as ArrayRef)
4470 }
4471 DataType::Map(_, _) => None,
4472 other => panic!("unexpected child {other:?}"),
4473 });
4474 expected_cols.push(arr);
4475 }
4476 {
4478 let (uf, _) = get_union("union_date_or_fixed4");
4479 let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
4480 let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
4481 let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
4482 let offs = vec![0, 0, 1, 1];
4483 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4484 DataType::Date32 => {
4485 Some(Arc::new(arrow_array::Date32Array::from(vec![date_a, 0])) as ArrayRef)
4486 }
4487 DataType::FixedSizeBinary(4) => {
4488 let it = [Some(fx4_abcd), Some(fx4_misc)].into_iter();
4489 Some(Arc::new(
4490 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
4491 ) as ArrayRef)
4492 }
4493 _ => None,
4494 });
4495 expected_cols.push(arr);
4496 }
4497 {
4499 let (uf, _) = get_union("union_time_millis_or_enum");
4500 let tid_ms = tid_by_dt(&uf, |dt| {
4501 matches!(dt, DataType::Time32(arrow_schema::TimeUnit::Millisecond))
4502 });
4503 let tid_en = tid_by_dt(&uf, |dt| matches!(dt, DataType::Dictionary(_, _)));
4504 let tids = vec![tid_ms, tid_en, tid_en, tid_ms];
4505 let offs = vec![0, 0, 1, 1];
4506 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4507 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
4508 Some(Arc::new(Time32MillisecondArray::from(vec![time_ms_a, 0])) as ArrayRef)
4509 }
4510 DataType::Dictionary(_, _) => {
4511 let keys = Int32Array::from(vec![0i32, 1]); let values = Arc::new(StringArray::from(vec!["ON", "OFF"])) as ArrayRef;
4513 Some(
4514 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
4515 as ArrayRef,
4516 )
4517 }
4518 _ => None,
4519 });
4520 expected_cols.push(arr);
4521 }
4522 {
4524 let (uf, _) = get_union("union_time_micros_or_string");
4525 let tid_us = tid_by_dt(&uf, |dt| {
4526 matches!(dt, DataType::Time64(arrow_schema::TimeUnit::Microsecond))
4527 });
4528 let tid_s = tid_by_name(&uf, "string");
4529 let tids = vec![tid_s, tid_us, tid_s, tid_s];
4530 let offs = vec![0, 0, 1, 2];
4531 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4532 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
4533 Some(Arc::new(Time64MicrosecondArray::from(vec![time_us_b])) as ArrayRef)
4534 }
4535 DataType::Utf8 => {
4536 Some(Arc::new(StringArray::from(vec!["evening", "night", ""])) as ArrayRef)
4537 }
4538 _ => None,
4539 });
4540 expected_cols.push(arr);
4541 }
4542 {
4544 let (uf, _) = get_union("union_ts_millis_utc_or_array");
4545 let tid_ts = tid_by_dt(&uf, |dt| {
4546 matches!(
4547 dt,
4548 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)
4549 )
4550 });
4551 let tid_arr = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
4552 let tids = vec![tid_ts, tid_arr, tid_arr, tid_ts];
4553 let offs = vec![0, 0, 1, 1];
4554 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4555 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
4556 let a = TimestampMillisecondArray::from(vec![
4557 ts_ms_2024_01_01,
4558 ts_ms_2024_01_01 + 86_400_000,
4559 ]);
4560 Some(Arc::new(if let Some(tz) = tz {
4561 a.with_timezone(tz.clone())
4562 } else {
4563 a
4564 }) as ArrayRef)
4565 }
4566 DataType::List(field) => {
4567 let values = Int32Array::from(vec![0, 1, 2, -1, 0, 1]);
4568 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 6]));
4569 Some(Arc::new(
4570 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
4571 ) as ArrayRef)
4572 }
4573 _ => None,
4574 });
4575 expected_cols.push(arr);
4576 }
4577 {
4579 let (uf, _) = get_union("union_ts_micros_local_or_bytes");
4580 let tid_lts = tid_by_dt(&uf, |dt| {
4581 matches!(
4582 dt,
4583 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
4584 )
4585 });
4586 let tid_b = tid_by_name(&uf, "bytes");
4587 let tids = vec![tid_b, tid_lts, tid_b, tid_b];
4588 let offs = vec![0, 0, 1, 2];
4589 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4590 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None) => Some(Arc::new(
4591 TimestampMicrosecondArray::from(vec![ts_us_2024_01_01]),
4592 )
4593 as ArrayRef),
4594 DataType::Binary => Some(Arc::new(BinaryArray::from(vec![
4595 &b"\x11\x22\x33"[..],
4596 &b"\x00"[..],
4597 &b"\x10\x20\x30\x40"[..],
4598 ])) as ArrayRef),
4599 _ => None,
4600 });
4601 expected_cols.push(arr);
4602 }
4603 {
4605 let (uf, _) = get_union("union_uuid_or_fixed10");
4606 let tid_fx16 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
4607 let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
4608 let tids = vec![tid_fx16, tid_fx10, tid_fx16, tid_fx10];
4609 let offs = vec![0, 0, 1, 1];
4610 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4611 DataType::FixedSizeBinary(16) => {
4612 let it = [Some(uuid1), Some(uuid2)].into_iter();
4613 Some(Arc::new(
4614 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
4615 ) as ArrayRef)
4616 }
4617 DataType::FixedSizeBinary(10) => {
4618 let it = [Some(fx10_ascii), Some(fx10_aa)].into_iter();
4619 Some(Arc::new(
4620 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
4621 ) as ArrayRef)
4622 }
4623 _ => None,
4624 });
4625 expected_cols.push(arr);
4626 }
4627 {
4629 let (uf, _) = get_union("union_dec_bytes_or_dec_fixed");
4630 let tid_b10s2 = tid_by_dt(&uf, |dt| match dt {
4631 #[cfg(feature = "small_decimals")]
4632 DataType::Decimal64(10, 2) => true,
4633 DataType::Decimal128(10, 2) | DataType::Decimal256(10, 2) => true,
4634 _ => false,
4635 });
4636 let tid_f20s4 = tid_by_dt(&uf, |dt| {
4637 matches!(
4638 dt,
4639 DataType::Decimal128(20, 4) | DataType::Decimal256(20, 4)
4640 )
4641 });
4642 let tids = vec![tid_b10s2, tid_f20s4, tid_b10s2, tid_f20s4];
4643 let offs = vec![0, 0, 1, 1];
4644 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4645 #[cfg(feature = "small_decimals")]
4646 DataType::Decimal64(10, 2) => {
4647 let a = Decimal64Array::from_iter_values([dec_b_scale2_pos as i64, 0i64]);
4648 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4649 }
4650 DataType::Decimal128(10, 2) => {
4651 let a = Decimal128Array::from_iter_values([dec_b_scale2_pos, 0]);
4652 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4653 }
4654 DataType::Decimal256(10, 2) => {
4655 let a = Decimal256Array::from_iter_values([
4656 i256::from_i128(dec_b_scale2_pos),
4657 i256::from(0),
4658 ]);
4659 Some(Arc::new(a.with_precision_and_scale(10, 2).unwrap()) as ArrayRef)
4660 }
4661 DataType::Decimal128(20, 4) => {
4662 let a = Decimal128Array::from_iter_values([dec_fix20_s4_neg, dec_fix20_s4]);
4663 Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4664 }
4665 DataType::Decimal256(20, 4) => {
4666 let a = Decimal256Array::from_iter_values([
4667 i256::from_i128(dec_fix20_s4_neg),
4668 i256::from_i128(dec_fix20_s4),
4669 ]);
4670 Some(Arc::new(a.with_precision_and_scale(20, 4).unwrap()) as ArrayRef)
4671 }
4672 _ => None,
4673 });
4674 expected_cols.push(arr);
4675 }
4676 {
4678 let (uf, _) = get_union("union_null_bytes_string");
4679 let tid_n = tid_by_name(&uf, "null");
4680 let tid_b = tid_by_name(&uf, "bytes");
4681 let tid_s = tid_by_name(&uf, "string");
4682 let tids = vec![tid_n, tid_b, tid_s, tid_s];
4683 let offs = vec![0, 0, 0, 1];
4684 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4685 "null" => Some(Arc::new(arrow_array::NullArray::new(1)) as ArrayRef),
4686 "bytes" => Some(Arc::new(BinaryArray::from(vec![&b"\x01\x02"[..]])) as ArrayRef),
4687 "string" => Some(Arc::new(StringArray::from(vec!["text", "u"])) as ArrayRef),
4688 _ => None,
4689 });
4690 expected_cols.push(arr);
4691 }
4692 {
4694 let idx = schema.index_of("array_of_union").unwrap();
4695 let dt = schema.field(idx).data_type().clone();
4696 let (item_field, _) = match &dt {
4697 DataType::List(f) => (f.clone(), ()),
4698 other => panic!("array_of_union must be List, got {other:?}"),
4699 };
4700 let (uf, _) = match item_field.data_type() {
4701 DataType::Union(f, m) => (f.clone(), m),
4702 other => panic!("array_of_union items must be Union, got {other:?}"),
4703 };
4704 let tid_l = tid_by_name(&uf, "long");
4705 let tid_s = tid_by_name(&uf, "string");
4706 let type_ids = vec![tid_l, tid_s, tid_l, tid_s, tid_l, tid_l, tid_s, tid_l];
4707 let offsets = vec![0, 0, 1, 1, 2, 3, 2, 4];
4708 let values_union =
4709 mk_dense_union(&uf, type_ids, offsets, |f| match f.name().as_str() {
4710 "long" => {
4711 Some(Arc::new(Int64Array::from(vec![1i64, -5, 42, -1, 0])) as ArrayRef)
4712 }
4713 "string" => Some(Arc::new(StringArray::from(vec!["a", "", "z"])) as ArrayRef),
4714 _ => None,
4715 });
4716 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 5, 6, 8]));
4717 expected_cols.push(Arc::new(
4718 ListArray::try_new(item_field.clone(), list_offsets, values_union, None).unwrap(),
4719 ));
4720 }
4721 {
4723 let idx = schema.index_of("map_of_union").unwrap();
4724 let dt = schema.field(idx).data_type().clone();
4725 let (entry_field, ordered) = match &dt {
4726 DataType::Map(f, ordered) => (f.clone(), *ordered),
4727 other => panic!("map_of_union must be Map, got {other:?}"),
4728 };
4729 let DataType::Struct(entry_fields) = entry_field.data_type() else {
4730 panic!("map entries must be struct")
4731 };
4732 let key_field = entry_fields[0].clone();
4733 let val_field = entry_fields[1].clone();
4734 let keys = StringArray::from(vec!["a", "b", "x", "pi"]);
4735 let rounded_pi = (std::f64::consts::PI * 100_000.0).round() / 100_000.0;
4736 let values: ArrayRef = match val_field.data_type() {
4737 DataType::Union(uf, _) => {
4738 let tid_n = tid_by_name(uf, "null");
4739 let tid_d = tid_by_name(uf, "double");
4740 let tids = vec![tid_n, tid_d, tid_d, tid_d];
4741 let offs = vec![0, 0, 1, 2];
4742 mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4743 "null" => Some(Arc::new(NullArray::new(1)) as ArrayRef),
4744 "double" => Some(Arc::new(arrow_array::Float64Array::from(vec![
4745 2.5f64, -0.5f64, rounded_pi,
4746 ])) as ArrayRef),
4747 _ => None,
4748 })
4749 }
4750 DataType::Float64 => Arc::new(arrow_array::Float64Array::from(vec![
4751 None,
4752 Some(2.5),
4753 Some(-0.5),
4754 Some(rounded_pi),
4755 ])),
4756 other => panic!("unexpected map value type {other:?}"),
4757 };
4758 let entries = StructArray::new(
4759 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4760 vec![Arc::new(keys) as ArrayRef, values],
4761 None,
4762 );
4763 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 3, 4]));
4764 expected_cols.push(Arc::new(MapArray::new(
4765 entry_field,
4766 offsets,
4767 entries,
4768 None,
4769 ordered,
4770 )));
4771 }
4772 {
4774 let idx = schema.index_of("record_with_union_field").unwrap();
4775 let DataType::Struct(rec_fields) = schema.field(idx).data_type() else {
4776 panic!("record_with_union_field should be Struct")
4777 };
4778 let id = Int32Array::from(vec![1, 2, 3, 4]);
4779 let u_field = rec_fields.iter().find(|f| f.name() == "u").unwrap();
4780 let DataType::Union(uf, _) = u_field.data_type() else {
4781 panic!("u must be Union")
4782 };
4783 let tid_i = tid_by_name(uf, "int");
4784 let tid_s = tid_by_name(uf, "string");
4785 let tids = vec![tid_s, tid_i, tid_i, tid_s];
4786 let offs = vec![0, 0, 1, 1];
4787 let u = mk_dense_union(uf, tids, offs, |f| match f.name().as_str() {
4788 "int" => Some(Arc::new(Int32Array::from(vec![99, 0])) as ArrayRef),
4789 "string" => Some(Arc::new(StringArray::from(vec!["one", "four"])) as ArrayRef),
4790 _ => None,
4791 });
4792 let rec = StructArray::new(rec_fields.clone(), vec![Arc::new(id) as ArrayRef, u], None);
4793 expected_cols.push(Arc::new(rec));
4794 }
4795 {
4797 let (uf, _) = get_union("union_ts_micros_utc_or_map");
4798 let tid_ts = tid_by_dt(&uf, |dt| {
4799 matches!(
4800 dt,
4801 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some(_))
4802 )
4803 });
4804 let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
4805 let tids = vec![tid_ts, tid_map, tid_ts, tid_map];
4806 let offs = vec![0, 0, 1, 1];
4807 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4808 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
4809 let a = TimestampMicrosecondArray::from(vec![ts_us_2024_01_01, 0i64]);
4810 Some(Arc::new(if let Some(tz) = tz {
4811 a.with_timezone(tz.clone())
4812 } else {
4813 a
4814 }) as ArrayRef)
4815 }
4816 DataType::Map(entry_field, ordered) => {
4817 let DataType::Struct(fs) = entry_field.data_type() else {
4818 panic!("map entries must be struct")
4819 };
4820 let key_field = fs[0].clone();
4821 let val_field = fs[1].clone();
4822 assert_eq!(key_field.data_type(), &DataType::Utf8);
4823 assert_eq!(val_field.data_type(), &DataType::Int64);
4824 let keys = StringArray::from(vec!["k1", "k2", "n"]);
4825 let vals = Int64Array::from(vec![1i64, 2, 0]);
4826 let entries = StructArray::new(
4827 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
4828 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
4829 None,
4830 );
4831 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
4832 Some(Arc::new(MapArray::new(
4833 entry_field.clone(),
4834 offsets,
4835 entries,
4836 None,
4837 *ordered,
4838 )) as ArrayRef)
4839 }
4840 _ => None,
4841 });
4842 expected_cols.push(arr);
4843 }
4844 {
4846 let (uf, _) = get_union("union_ts_millis_local_or_string");
4847 let tid_ts = tid_by_dt(&uf, |dt| {
4848 matches!(
4849 dt,
4850 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None)
4851 )
4852 });
4853 let tid_s = tid_by_name(&uf, "string");
4854 let tids = vec![tid_s, tid_ts, tid_s, tid_s];
4855 let offs = vec![0, 0, 1, 2];
4856 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
4857 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None) => Some(Arc::new(
4858 TimestampMillisecondArray::from(vec![ts_ms_2024_01_01]),
4859 )
4860 as ArrayRef),
4861 DataType::Utf8 => {
4862 Some(
4863 Arc::new(StringArray::from(vec!["local midnight", "done", ""])) as ArrayRef,
4864 )
4865 }
4866 _ => None,
4867 });
4868 expected_cols.push(arr);
4869 }
4870 {
4872 let (uf, _) = get_union("union_bool_or_string");
4873 let tid_b = tid_by_name(&uf, "boolean");
4874 let tid_s = tid_by_name(&uf, "string");
4875 let tids = vec![tid_b, tid_s, tid_b, tid_s];
4876 let offs = vec![0, 0, 1, 1];
4877 let arr = mk_dense_union(&uf, tids, offs, |f| match f.name().as_str() {
4878 "boolean" => Some(Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef),
4879 "string" => Some(Arc::new(StringArray::from(vec!["no", "yes"])) as ArrayRef),
4880 _ => None,
4881 });
4882 expected_cols.push(arr);
4883 }
4884 let expected = RecordBatch::try_new(schema.clone(), expected_cols).unwrap();
4885 assert_eq!(
4886 actual, expected,
4887 "full end-to-end equality for union_fields.avro"
4888 );
4889 }
4890
4891 #[test]
4892 fn test_read_zero_byte_avro_file() {
4893 let batch = read_file("test/data/zero_byte.avro", 3, false);
4894 let schema = batch.schema();
4895 assert_eq!(schema.fields().len(), 1);
4896 let field = schema.field(0);
4897 assert_eq!(field.name(), "data");
4898 assert_eq!(field.data_type(), &DataType::Binary);
4899 assert!(field.is_nullable());
4900 assert_eq!(batch.num_rows(), 3);
4901 assert_eq!(batch.num_columns(), 1);
4902 let binary_array = batch
4903 .column(0)
4904 .as_any()
4905 .downcast_ref::<BinaryArray>()
4906 .unwrap();
4907 assert!(binary_array.is_null(0));
4908 assert!(binary_array.is_valid(1));
4909 assert_eq!(binary_array.value(1), b"");
4910 assert!(binary_array.is_valid(2));
4911 assert_eq!(binary_array.value(2), b"some bytes");
4912 }
4913
    #[test]
    fn test_alltypes() {
        // Canonical `alltypes_plain` expectation: 8 rows where every non-id
        // column alternates between two values keyed by `x % 2`, while `id`
        // records the writer's original row order.
        let expected = RecordBatch::try_from_iter_with_nullable([
            (
                "id",
                Arc::new(Int32Array::from(vec![4, 5, 6, 7, 2, 3, 0, 1])) as _,
                true,
            ),
            (
                "bool_col",
                Arc::new(BooleanArray::from_iter((0..8).map(|x| Some(x % 2 == 0)))) as _,
                true,
            ),
            (
                "tinyint_col",
                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                true,
            ),
            (
                "smallint_col",
                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                true,
            ),
            (
                "int_col",
                Arc::new(Int32Array::from_iter_values((0..8).map(|x| x % 2))) as _,
                true,
            ),
            (
                "bigint_col",
                Arc::new(Int64Array::from_iter_values((0..8).map(|x| (x % 2) * 10))) as _,
                true,
            ),
            (
                "float_col",
                Arc::new(Float32Array::from_iter_values(
                    (0..8).map(|x| (x % 2) as f32 * 1.1),
                )) as _,
                true,
            ),
            (
                "double_col",
                Arc::new(Float64Array::from_iter_values(
                    (0..8).map(|x| (x % 2) as f64 * 10.1),
                )) as _,
                true,
            ),
            (
                "date_string_col",
                // ASCII bytes for "MM/01/09" date strings, e.g. [48, 51, ...]
                // is "03/01/09"; two rows per month.
                Arc::new(BinaryArray::from_iter_values([
                    [48, 51, 47, 48, 49, 47, 48, 57],
                    [48, 51, 47, 48, 49, 47, 48, 57],
                    [48, 52, 47, 48, 49, 47, 48, 57],
                    [48, 52, 47, 48, 49, 47, 48, 57],
                    [48, 50, 47, 48, 49, 47, 48, 57],
                    [48, 50, 47, 48, 49, 47, 48, 57],
                    [48, 49, 47, 48, 49, 47, 48, 57],
                    [48, 49, 47, 48, 49, 47, 48, 57],
                ])) as _,
                true,
            ),
            (
                "string_col",
                // Single ASCII digit "0" or "1" per row.
                Arc::new(BinaryArray::from_iter_values((0..8).map(|x| [48 + x % 2]))) as _,
                true,
            ),
            (
                "timestamp_col",
                Arc::new(
                    // Pairs of UTC microsecond timestamps one minute apart,
                    // one pair per month in the same order as `date_string_col`.
                    TimestampMicrosecondArray::from_iter_values([
                        1235865600000000,
                        1235865660000000,
                        1238544000000000,
                        1238544060000000,
                        1233446400000000,
                        1233446460000000,
                        1230768000000000,
                        1230768060000000,
                    ])
                    .with_timezone("+00:00"),
                ) as _,
                true,
            ),
        ])
        .unwrap();

        // Every `alltypes` test file must decode identically whether read in a
        // single 8-row batch or in several smaller 3-row batches.
        for file in files() {
            let file = arrow_test_data(file);

            assert_eq!(read_file(&file, 8, false), expected);
            assert_eq!(read_file(&file, 3, false), expected);
        }
    }
5007
    #[test]
    #[cfg(feature = "snappy")]
    fn test_alltypes_dictionary() {
        // Two-row variant of the alltypes data. Despite the file name, the
        // decoded Arrow output uses plain (non-dictionary) columns.
        let file = "avro/alltypes_dictionary.avro";
        let expected = RecordBatch::try_from_iter_with_nullable([
            ("id", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
            (
                "bool_col",
                Arc::new(BooleanArray::from(vec![Some(true), Some(false)])) as _,
                true,
            ),
            (
                "tinyint_col",
                Arc::new(Int32Array::from(vec![0, 1])) as _,
                true,
            ),
            (
                "smallint_col",
                Arc::new(Int32Array::from(vec![0, 1])) as _,
                true,
            ),
            ("int_col", Arc::new(Int32Array::from(vec![0, 1])) as _, true),
            (
                "bigint_col",
                Arc::new(Int64Array::from(vec![0, 10])) as _,
                true,
            ),
            (
                "float_col",
                Arc::new(Float32Array::from(vec![0.0, 1.1])) as _,
                true,
            ),
            (
                "double_col",
                Arc::new(Float64Array::from(vec![0.0, 10.1])) as _,
                true,
            ),
            (
                "date_string_col",
                Arc::new(BinaryArray::from_iter_values([b"01/01/09", b"01/01/09"])) as _,
                true,
            ),
            (
                "string_col",
                Arc::new(BinaryArray::from_iter_values([b"0", b"1"])) as _,
                true,
            ),
            (
                "timestamp_col",
                Arc::new(
                    // 2009-01-01T00:00:00Z and 2009-01-01T00:01:00Z in
                    // microseconds since the epoch.
                    TimestampMicrosecondArray::from_iter_values([
                        1230768000000000,
                        1230768060000000,
                    ])
                    .with_timezone("+00:00"),
                ) as _,
                true,
            ),
        ])
        .unwrap();
        let file_path = arrow_test_data(file);
        // Reading the whole file in one batch...
        let batch_large = read_file(&file_path, 8, false);
        assert_eq!(
            batch_large, expected,
            "Decoded RecordBatch does not match for file {file}"
        );
        // ...must give the same result as reading with a smaller batch size.
        let batch_small = read_file(&file_path, 3, false);
        assert_eq!(
            batch_small, expected,
            "Decoded RecordBatch (batch size 3) does not match for file {file}"
        );
    }
5081
5082 #[test]
5083 fn test_alltypes_nulls_plain() {
5084 let file = "avro/alltypes_nulls_plain.avro";
5085 let expected = RecordBatch::try_from_iter_with_nullable([
5086 (
5087 "string_col",
5088 Arc::new(StringArray::from(vec![None::<&str>])) as _,
5089 true,
5090 ),
5091 ("int_col", Arc::new(Int32Array::from(vec![None])) as _, true),
5092 (
5093 "bool_col",
5094 Arc::new(BooleanArray::from(vec![None])) as _,
5095 true,
5096 ),
5097 (
5098 "bigint_col",
5099 Arc::new(Int64Array::from(vec![None])) as _,
5100 true,
5101 ),
5102 (
5103 "float_col",
5104 Arc::new(Float32Array::from(vec![None])) as _,
5105 true,
5106 ),
5107 (
5108 "double_col",
5109 Arc::new(Float64Array::from(vec![None])) as _,
5110 true,
5111 ),
5112 (
5113 "bytes_col",
5114 Arc::new(BinaryArray::from(vec![None::<&[u8]>])) as _,
5115 true,
5116 ),
5117 ])
5118 .unwrap();
5119 let file_path = arrow_test_data(file);
5120 let batch_large = read_file(&file_path, 8, false);
5121 assert_eq!(
5122 batch_large, expected,
5123 "Decoded RecordBatch does not match for file {file}"
5124 );
5125 let batch_small = read_file(&file_path, 3, false);
5126 assert_eq!(
5127 batch_small, expected,
5128 "Decoded RecordBatch (batch size 3) does not match for file {file}"
5129 );
5130 }
5131
5132 #[test]
5133 #[cfg(feature = "snappy")]
5135 fn test_binary() {
5136 let file = arrow_test_data("avro/binary.avro");
5137 let batch = read_file(&file, 8, false);
5138 let expected = RecordBatch::try_from_iter_with_nullable([(
5139 "foo",
5140 Arc::new(BinaryArray::from_iter_values(vec![
5141 b"\x00" as &[u8],
5142 b"\x01" as &[u8],
5143 b"\x02" as &[u8],
5144 b"\x03" as &[u8],
5145 b"\x04" as &[u8],
5146 b"\x05" as &[u8],
5147 b"\x06" as &[u8],
5148 b"\x07" as &[u8],
5149 b"\x08" as &[u8],
5150 b"\t" as &[u8],
5151 b"\n" as &[u8],
5152 b"\x0b" as &[u8],
5153 ])) as Arc<dyn Array>,
5154 true,
5155 )])
5156 .unwrap();
5157 assert_eq!(batch, expected);
5158 }
5159
    #[test]
    #[cfg(feature = "snappy")]
    fn test_decimal() {
        // Each tuple is (file, expected Arrow decimal type, expected extra
        // field metadata). With the `small_decimals` feature enabled,
        // precisions that fit are narrowed to Decimal32/Decimal64; without it,
        // everything below 256 bits decodes as Decimal128.
        #[cfg(feature = "small_decimals")]
        let files: [(&str, DataType, HashMap<String, String>); 8] = [
            (
                "avro/fixed_length_decimal.avro",
                DataType::Decimal128(25, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "avro/fixed_length_decimal_legacy.avro",
                DataType::Decimal64(13, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "avro/int32_decimal.avro",
                DataType::Decimal32(4, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "avro/int64_decimal.avro",
                DataType::Decimal64(10, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "test/data/int256_decimal.avro",
                DataType::Decimal256(76, 10),
                HashMap::new(),
            ),
            (
                "test/data/fixed256_decimal.avro",
                DataType::Decimal256(76, 10),
                HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
            ),
            (
                "test/data/fixed_length_decimal_legacy_32.avro",
                DataType::Decimal32(9, 2),
                HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
            ),
            (
                "test/data/int128_decimal.avro",
                DataType::Decimal128(38, 2),
                HashMap::new(),
            ),
        ];
        // Same files, but narrow decimals widen to Decimal128 when the
        // `small_decimals` feature is off.
        #[cfg(not(feature = "small_decimals"))]
        let files: [(&str, DataType, HashMap<String, String>); 8] = [
            (
                "avro/fixed_length_decimal.avro",
                DataType::Decimal128(25, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "avro/fixed_length_decimal_legacy.avro",
                DataType::Decimal128(13, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "avro/int32_decimal.avro",
                DataType::Decimal128(4, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "avro/int64_decimal.avro",
                DataType::Decimal128(10, 2),
                HashMap::from([
                    (
                        "avro.namespace".to_string(),
                        "topLevelRecord.value".to_string(),
                    ),
                    ("avro.name".to_string(), "fixed".to_string()),
                ]),
            ),
            (
                "test/data/int256_decimal.avro",
                DataType::Decimal256(76, 10),
                HashMap::new(),
            ),
            (
                "test/data/fixed256_decimal.avro",
                DataType::Decimal256(76, 10),
                HashMap::from([("avro.name".to_string(), "Decimal256Fixed".to_string())]),
            ),
            (
                "test/data/fixed_length_decimal_legacy_32.avro",
                DataType::Decimal128(9, 2),
                HashMap::from([("avro.name".to_string(), "Decimal32FixedLegacy".to_string())]),
            ),
            (
                "test/data/int128_decimal.avro",
                DataType::Decimal128(38, 2),
                HashMap::new(),
            ),
        ];
        for (file, expected_dt, mut metadata) in files {
            // Recover (precision, scale) from the expected type so the field
            // metadata assertions below can mirror it.
            let (precision, scale) = match expected_dt {
                DataType::Decimal32(p, s)
                | DataType::Decimal64(p, s)
                | DataType::Decimal128(p, s)
                | DataType::Decimal256(p, s) => (p, s),
                _ => unreachable!("Unexpected decimal type in test inputs"),
            };
            assert!(scale >= 0, "test data uses non-negative scales only");
            let scale_u32 = scale as u32;
            // "avro/..." paths come from the shared arrow-testing submodule;
            // other paths are local to this crate.
            let file_path: String = if file.starts_with("avro/") {
                arrow_test_data(file)
            } else {
                std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
                    .join(file)
                    .to_string_lossy()
                    .into_owned()
            };
            // Every file stores the decimal values 1, 2, ..., 24 (scaled by
            // 10^scale into the unscaled integer representation).
            let pow10: i128 = 10i128.pow(scale_u32);
            let values_i128: Vec<i128> = (1..=24).map(|n| (n as i128) * pow10).collect();
            // Build the expected column with the exact physical decimal width.
            let build_expected = |dt: &DataType, values: &[i128]| -> ArrayRef {
                match *dt {
                    #[cfg(feature = "small_decimals")]
                    DataType::Decimal32(p, s) => {
                        let it = values.iter().map(|&v| v as i32);
                        Arc::new(
                            Decimal32Array::from_iter_values(it)
                                .with_precision_and_scale(p, s)
                                .unwrap(),
                        )
                    }
                    #[cfg(feature = "small_decimals")]
                    DataType::Decimal64(p, s) => {
                        let it = values.iter().map(|&v| v as i64);
                        Arc::new(
                            Decimal64Array::from_iter_values(it)
                                .with_precision_and_scale(p, s)
                                .unwrap(),
                        )
                    }
                    DataType::Decimal128(p, s) => {
                        let it = values.iter().copied();
                        Arc::new(
                            Decimal128Array::from_iter_values(it)
                                .with_precision_and_scale(p, s)
                                .unwrap(),
                        )
                    }
                    DataType::Decimal256(p, s) => {
                        let it = values.iter().map(|&v| i256::from_i128(v));
                        Arc::new(
                            Decimal256Array::from_iter_values(it)
                                .with_precision_and_scale(p, s)
                                .unwrap(),
                        )
                    }
                    _ => unreachable!("Unexpected decimal type in test"),
                }
            };
            let actual_batch = read_file(&file_path, 8, false);
            // Nullability is taken from the decoded schema rather than
            // asserted, since it varies between the test files.
            let actual_nullable = actual_batch.schema().field(0).is_nullable();
            let expected_array = build_expected(&expected_dt, &values_i128);
            metadata.insert("precision".to_string(), precision.to_string());
            metadata.insert("scale".to_string(), scale.to_string());
            let field =
                Field::new("value", expected_dt.clone(), actual_nullable).with_metadata(metadata);
            let expected_schema = Arc::new(Schema::new(vec![field]));
            let expected_batch =
                RecordBatch::try_new(expected_schema.clone(), vec![expected_array]).unwrap();
            assert_eq!(
                actual_batch, expected_batch,
                "Decoded RecordBatch does not match for {file}"
            );
            let actual_batch_small = read_file(&file_path, 3, false);
            assert_eq!(
                actual_batch_small, expected_batch,
                "Decoded RecordBatch does not match for {file} with batch size 3"
            );
        }
    }
5381
    #[test]
    fn test_read_duration_logical_types_feature_toggle() -> Result<(), ArrowError> {
        // The same file decodes to different schemas depending on the
        // `avro_custom_types` feature: native Arrow Duration columns when it is
        // enabled, otherwise plain Int64 columns tagged with a `logicalType`
        // metadata entry naming the intended unit.
        let file_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("test/data/duration_logical_types.avro")
            .to_string_lossy()
            .into_owned();

        let actual_batch = read_file(&file_path, 4, false);

        let expected_batch = {
            #[cfg(feature = "avro_custom_types")]
            {
                // Feature on: one Duration column per time unit.
                let schema = Arc::new(Schema::new(vec![
                    Field::new(
                        "duration_time_nanos",
                        DataType::Duration(TimeUnit::Nanosecond),
                        false,
                    ),
                    Field::new(
                        "duration_time_micros",
                        DataType::Duration(TimeUnit::Microsecond),
                        false,
                    ),
                    Field::new(
                        "duration_time_millis",
                        DataType::Duration(TimeUnit::Millisecond),
                        false,
                    ),
                    Field::new(
                        "duration_time_seconds",
                        DataType::Duration(TimeUnit::Second),
                        false,
                    ),
                ]));

                let nanos = Arc::new(PrimitiveArray::<DurationNanosecondType>::from(vec![
                    10, 20, 30, 40,
                ])) as ArrayRef;
                let micros = Arc::new(PrimitiveArray::<DurationMicrosecondType>::from(vec![
                    100, 200, 300, 400,
                ])) as ArrayRef;
                let millis = Arc::new(PrimitiveArray::<DurationMillisecondType>::from(vec![
                    1000, 2000, 3000, 4000,
                ])) as ArrayRef;
                let seconds = Arc::new(PrimitiveArray::<DurationSecondType>::from(vec![1, 2, 3, 4]))
                    as ArrayRef;

                RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
            }
            #[cfg(not(feature = "avro_custom_types"))]
            {
                // Feature off: raw Int64 values, with the unit recorded only in
                // the field metadata under the "logicalType" key.
                let schema = Arc::new(Schema::new(vec![
                    Field::new("duration_time_nanos", DataType::Int64, false).with_metadata(
                        [(
                            "logicalType".to_string(),
                            "arrow.duration-nanos".to_string(),
                        )]
                        .into(),
                    ),
                    Field::new("duration_time_micros", DataType::Int64, false).with_metadata(
                        [(
                            "logicalType".to_string(),
                            "arrow.duration-micros".to_string(),
                        )]
                        .into(),
                    ),
                    Field::new("duration_time_millis", DataType::Int64, false).with_metadata(
                        [(
                            "logicalType".to_string(),
                            "arrow.duration-millis".to_string(),
                        )]
                        .into(),
                    ),
                    Field::new("duration_time_seconds", DataType::Int64, false).with_metadata(
                        [(
                            "logicalType".to_string(),
                            "arrow.duration-seconds".to_string(),
                        )]
                        .into(),
                    ),
                ]));

                let nanos =
                    Arc::new(PrimitiveArray::<Int64Type>::from(vec![10, 20, 30, 40])) as ArrayRef;
                let micros = Arc::new(PrimitiveArray::<Int64Type>::from(vec![100, 200, 300, 400]))
                    as ArrayRef;
                let millis = Arc::new(PrimitiveArray::<Int64Type>::from(vec![
                    1000, 2000, 3000, 4000,
                ])) as ArrayRef;
                let seconds =
                    Arc::new(PrimitiveArray::<Int64Type>::from(vec![1, 2, 3, 4])) as ArrayRef;

                RecordBatch::try_new(schema, vec![nanos, micros, millis, seconds])?
            }
        };

        assert_eq!(actual_batch, expected_batch);

        Ok(())
    }
5482
5483 #[test]
5484 #[cfg(feature = "snappy")]
5486 fn test_dict_pages_offset_zero() {
5487 let file = arrow_test_data("avro/dict-page-offset-zero.avro");
5488 let batch = read_file(&file, 32, false);
5489 let num_rows = batch.num_rows();
5490 let expected_field = Int32Array::from(vec![Some(1552); num_rows]);
5491 let expected = RecordBatch::try_from_iter_with_nullable([(
5492 "l_partkey",
5493 Arc::new(expected_field) as Arc<dyn Array>,
5494 true,
5495 )])
5496 .unwrap();
5497 assert_eq!(batch, expected);
5498 }
5499
5500 #[test]
5501 #[cfg(feature = "snappy")]
5503 fn test_list_columns() {
5504 let file = arrow_test_data("avro/list_columns.avro");
5505 let mut int64_list_builder = ListBuilder::new(Int64Builder::new());
5506 {
5507 {
5508 let values = int64_list_builder.values();
5509 values.append_value(1);
5510 values.append_value(2);
5511 values.append_value(3);
5512 }
5513 int64_list_builder.append(true);
5514 }
5515 {
5516 {
5517 let values = int64_list_builder.values();
5518 values.append_null();
5519 values.append_value(1);
5520 }
5521 int64_list_builder.append(true);
5522 }
5523 {
5524 {
5525 let values = int64_list_builder.values();
5526 values.append_value(4);
5527 }
5528 int64_list_builder.append(true);
5529 }
5530 let int64_list = int64_list_builder.finish();
5531 let mut utf8_list_builder = ListBuilder::new(StringBuilder::new());
5532 {
5533 {
5534 let values = utf8_list_builder.values();
5535 values.append_value("abc");
5536 values.append_value("efg");
5537 values.append_value("hij");
5538 }
5539 utf8_list_builder.append(true);
5540 }
5541 {
5542 utf8_list_builder.append(false);
5543 }
5544 {
5545 {
5546 let values = utf8_list_builder.values();
5547 values.append_value("efg");
5548 values.append_null();
5549 values.append_value("hij");
5550 values.append_value("xyz");
5551 }
5552 utf8_list_builder.append(true);
5553 }
5554 let utf8_list = utf8_list_builder.finish();
5555 let expected = RecordBatch::try_from_iter_with_nullable([
5556 ("int64_list", Arc::new(int64_list) as Arc<dyn Array>, true),
5557 ("utf8_list", Arc::new(utf8_list) as Arc<dyn Array>, true),
5558 ])
5559 .unwrap();
5560 let batch = read_file(&file, 8, false);
5561 assert_eq!(batch, expected);
5562 }
5563
    #[test]
    #[cfg(feature = "snappy")]
    fn test_nested_lists() {
        use arrow_data::ArrayDataBuilder;
        // Column "a" is a triply nested List<List<List<Utf8>>>; the expectation
        // is assembled bottom-up from raw offset/validity buffers so that nulls
        // at every nesting level are pinned down exactly.
        let file = arrow_test_data("avro/nested_lists.snappy.avro");
        // Innermost string values, concatenated across all inner lists.
        let inner_values = StringArray::from(vec![
            Some("a"),
            Some("b"),
            Some("c"),
            Some("d"),
            Some("a"),
            Some("b"),
            Some("c"),
            Some("d"),
            Some("e"),
            Some("a"),
            Some("b"),
            Some("c"),
            Some("d"),
            Some("e"),
            Some("f"),
        ]);
        // 13 inner lists; each adjacent offset pair slices `inner_values`.
        let inner_offsets = Buffer::from_slice_ref([0, 2, 3, 3, 4, 6, 8, 8, 9, 11, 13, 14, 14, 15]);
        // Validity for the 13 inner lists: `false` marks a null inner list.
        let inner_validity = [
            true, true, false, true, true, true, false, true, true, true, true, false, true,
        ];
        let inner_null_buffer = Buffer::from_iter(inner_validity.iter().copied());
        let inner_field = Field::new("item", DataType::Utf8, true);
        let inner_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(inner_field)))
            .len(13)
            .add_buffer(inner_offsets)
            .add_child_data(inner_values.to_data())
            .null_bit_buffer(Some(inner_null_buffer))
            .build()
            .unwrap();
        let inner_list_array = ListArray::from(inner_list_data);
        // 6 middle lists of inner lists, all non-null.
        let middle_offsets = Buffer::from_slice_ref([0, 2, 4, 6, 8, 11, 13]);
        let middle_validity = [true; 6];
        let middle_null_buffer = Buffer::from_iter(middle_validity.iter().copied());
        let middle_field = Field::new("item", inner_list_array.data_type().clone(), true);
        let middle_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(middle_field)))
            .len(6)
            .add_buffer(middle_offsets)
            .add_child_data(inner_list_array.to_data())
            .null_bit_buffer(Some(middle_null_buffer))
            .build()
            .unwrap();
        let middle_list_array = ListArray::from(middle_list_data);
        // 3 outer rows of two middle lists each; the bit pattern 0b111 marks
        // all three rows valid.
        let outer_offsets = Buffer::from_slice_ref([0, 2, 4, 6]);
        let outer_null_buffer = Buffer::from_slice_ref([0b111]);
        let outer_field = Field::new("item", middle_list_array.data_type().clone(), true);
        let outer_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(outer_field)))
            .len(3)
            .add_buffer(outer_offsets)
            .add_child_data(middle_list_array.to_data())
            .null_bit_buffer(Some(outer_null_buffer))
            .build()
            .unwrap();
        let a_expected = ListArray::from(outer_list_data);
        // Column "b" is the constant 1 for each of the three rows.
        let b_expected = Int32Array::from(vec![1, 1, 1]);
        let expected = RecordBatch::try_from_iter_with_nullable([
            ("a", Arc::new(a_expected) as Arc<dyn Array>, true),
            ("b", Arc::new(b_expected) as Arc<dyn Array>, true),
        ])
        .unwrap();
        // Decoding must be identical regardless of batch size.
        let left = read_file(&file, 8, false);
        assert_eq!(left, expected, "Mismatch for batch size=8");
        let left_small = read_file(&file, 3, false);
        assert_eq!(left_small, expected, "Mismatch for batch size=3");
    }
5634
    #[test]
    fn test_simple() {
        // (file, batch size, expected batch, alternate batch size) — each file
        // must decode to the same batch at both batch sizes.
        let tests = [
            ("avro/simple_enum.avro", 4, build_expected_enum(), 2),
            ("avro/simple_fixed.avro", 2, build_expected_fixed(), 1),
        ];

        // Expected output for simple_enum.avro: three Avro enum columns decoded
        // as Int32-keyed dictionaries, with the symbol list, enum name, and
        // namespace preserved in the field metadata.
        fn build_expected_enum() -> RecordBatch {
            let keys_f1 = Int32Array::from(vec![0, 1, 2, 3]);
            let vals_f1 = StringArray::from(vec!["a", "b", "c", "d"]);
            let f1_dict =
                DictionaryArray::<Int32Type>::try_new(keys_f1, Arc::new(vals_f1)).unwrap();
            let keys_f2 = Int32Array::from(vec![2, 3, 0, 1]);
            let vals_f2 = StringArray::from(vec!["e", "f", "g", "h"]);
            let f2_dict =
                DictionaryArray::<Int32Type>::try_new(keys_f2, Arc::new(vals_f2)).unwrap();
            // f3 is nullable: its third row carries no value.
            let keys_f3 = Int32Array::from(vec![Some(1), Some(2), None, Some(0)]);
            let vals_f3 = StringArray::from(vec!["i", "j", "k"]);
            let f3_dict =
                DictionaryArray::<Int32Type>::try_new(keys_f3, Arc::new(vals_f3)).unwrap();
            let dict_type =
                DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
            let mut md_f1 = HashMap::new();
            md_f1.insert(
                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
                r#"["a","b","c","d"]"#.to_string(),
            );
            md_f1.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum1".to_string());
            md_f1.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
            let f1_field = Field::new("f1", dict_type.clone(), false).with_metadata(md_f1);
            let mut md_f2 = HashMap::new();
            md_f2.insert(
                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
                r#"["e","f","g","h"]"#.to_string(),
            );
            md_f2.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum2".to_string());
            md_f2.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
            let f2_field = Field::new("f2", dict_type.clone(), false).with_metadata(md_f2);
            let mut md_f3 = HashMap::new();
            md_f3.insert(
                AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
                r#"["i","j","k"]"#.to_string(),
            );
            md_f3.insert(AVRO_NAME_METADATA_KEY.to_string(), "enum3".to_string());
            md_f3.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns1".to_string());
            let f3_field = Field::new("f3", dict_type.clone(), true).with_metadata(md_f3);
            let expected_schema = Arc::new(Schema::new(vec![f1_field, f2_field, f3_field]));
            RecordBatch::try_new(
                expected_schema,
                vec![
                    Arc::new(f1_dict) as Arc<dyn Array>,
                    Arc::new(f2_dict) as Arc<dyn Array>,
                    Arc::new(f3_dict) as Arc<dyn Array>,
                ],
            )
            .unwrap()
        }

        // Expected output for simple_fixed.avro: three Avro `fixed` columns
        // decoded as FixedSizeBinary of widths 5, 10, and 6, with fixed name
        // and namespace preserved in the field metadata. f3 is nullable.
        fn build_expected_fixed() -> RecordBatch {
            let f1 =
                FixedSizeBinaryArray::try_from_iter(vec![b"abcde", b"12345"].into_iter()).unwrap();
            let f2 =
                FixedSizeBinaryArray::try_from_iter(vec![b"fghijklmno", b"1234567890"].into_iter())
                    .unwrap();
            let f3 = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
                vec![Some(b"ABCDEF" as &[u8]), None].into_iter(),
                6,
            )
            .unwrap();

            let mut md_f1 = HashMap::new();
            md_f1.insert(
                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
                "fixed1".to_string(),
            );
            md_f1.insert(
                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
                "ns1".to_string(),
            );

            let mut md_f2 = HashMap::new();
            md_f2.insert(
                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
                "fixed2".to_string(),
            );
            md_f2.insert(
                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
                "ns2".to_string(),
            );

            let mut md_f3 = HashMap::new();
            md_f3.insert(
                crate::schema::AVRO_NAME_METADATA_KEY.to_string(),
                "fixed3".to_string(),
            );
            md_f3.insert(
                crate::schema::AVRO_NAMESPACE_METADATA_KEY.to_string(),
                "ns1".to_string(),
            );

            let expected_schema = Arc::new(Schema::new(vec![
                Field::new("f1", DataType::FixedSizeBinary(5), false).with_metadata(md_f1),
                Field::new("f2", DataType::FixedSizeBinary(10), false).with_metadata(md_f2),
                Field::new("f3", DataType::FixedSizeBinary(6), true).with_metadata(md_f3),
            ]));

            RecordBatch::try_new(
                expected_schema,
                vec![
                    Arc::new(f1) as Arc<dyn Array>,
                    Arc::new(f2) as Arc<dyn Array>,
                    Arc::new(f3) as Arc<dyn Array>,
                ],
            )
            .unwrap()
        }
        for (file_name, batch_size, expected, alt_batch_size) in tests {
            let file = arrow_test_data(file_name);
            let actual = read_file(&file, batch_size, false);
            assert_eq!(actual, expected);
            let actual2 = read_file(&file, alt_batch_size, false);
            assert_eq!(actual2, expected);
        }
    }
5761
5762 #[test]
5763 #[cfg(feature = "snappy")]
5764 fn test_single_nan() {
5765 let file = arrow_test_data("avro/single_nan.avro");
5766 let actual = read_file(&file, 1, false);
5767 use arrow_array::Float64Array;
5768 let schema = Arc::new(Schema::new(vec![Field::new(
5769 "mycol",
5770 DataType::Float64,
5771 true,
5772 )]));
5773 let col = Float64Array::from(vec![None]);
5774 let expected = RecordBatch::try_new(schema, vec![Arc::new(col)]).unwrap();
5775 assert_eq!(actual, expected);
5776 let actual2 = read_file(&file, 2, false);
5777 assert_eq!(actual2, expected);
5778 }
5779
    #[test]
    fn test_duration_uuid() {
        // Four rows with an Avro `duration` (month/day/millisecond triple,
        // mapped to Interval(MonthDayNano)) and an Avro `uuid` (mapped to
        // FixedSizeBinary(16)).
        let batch = read_file("test/data/duration_uuid.avro", 4, false);
        let schema = batch.schema();
        let fields = schema.fields();
        assert_eq!(fields.len(), 2);
        assert_eq!(fields[0].name(), "duration_field");
        assert_eq!(
            fields[0].data_type(),
            &DataType::Interval(IntervalUnit::MonthDayNano)
        );
        assert_eq!(fields[1].name(), "uuid_field");
        assert_eq!(fields[1].data_type(), &DataType::FixedSizeBinary(16));
        assert_eq!(batch.num_rows(), 4);
        assert_eq!(batch.num_columns(), 2);
        let duration_array = batch
            .column(0)
            .as_any()
            .downcast_ref::<IntervalMonthDayNanoArray>()
            .unwrap();
        // Avro durations carry milliseconds, so every nanosecond component
        // below is a whole multiple of 1_000_000.
        let expected_duration_array: IntervalMonthDayNanoArray = [
            Some(IntervalMonthDayNanoType::make_value(1, 15, 500_000_000)),
            Some(IntervalMonthDayNanoType::make_value(0, 5, 2_500_000_000)),
            Some(IntervalMonthDayNanoType::make_value(2, 0, 0)),
            Some(IntervalMonthDayNanoType::make_value(12, 31, 999_000_000)),
        ]
        .iter()
        .copied()
        .collect();
        assert_eq!(&expected_duration_array, duration_array);
        let uuid_array = batch
            .column(1)
            .as_any()
            .downcast_ref::<FixedSizeBinaryArray>()
            .unwrap();
        // Raw 16-byte UUID values as stored in the file.
        let expected_uuid_array = FixedSizeBinaryArray::try_from_sparse_iter_with_size(
            [
                Some([
                    0xfe, 0x7b, 0xc3, 0x0b, 0x4c, 0xe8, 0x4c, 0x5e, 0xb6, 0x7c, 0x22, 0x34, 0xa2,
                    0xd3, 0x8e, 0x66,
                ]),
                Some([
                    0xb3, 0x3f, 0x2a, 0xd7, 0x97, 0xb4, 0x4d, 0xe1, 0x8b, 0xfe, 0x94, 0x94, 0x1d,
                    0x60, 0x15, 0x6e,
                ]),
                Some([
                    0x5f, 0x74, 0x92, 0x64, 0x07, 0x4b, 0x40, 0x05, 0x84, 0xbf, 0x11, 0x5e, 0xa8,
                    0x4e, 0xd2, 0x0a,
                ]),
                Some([
                    0x08, 0x26, 0xcc, 0x06, 0xd2, 0xe3, 0x45, 0x99, 0xb4, 0xad, 0xaf, 0x5f, 0xa6,
                    0x90, 0x5c, 0xdb,
                ]),
            ]
            .into_iter(),
            16,
        )
        .unwrap();
        assert_eq!(&expected_uuid_array, uuid_array);
    }
5840
5841 #[test]
5842 #[cfg(feature = "snappy")]
5843 fn test_datapage_v2() {
5844 let file = arrow_test_data("avro/datapage_v2.snappy.avro");
5845 let batch = read_file(&file, 8, false);
5846 let a = StringArray::from(vec![
5847 Some("abc"),
5848 Some("abc"),
5849 Some("abc"),
5850 None,
5851 Some("abc"),
5852 ]);
5853 let b = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(4), Some(5)]);
5854 let c = Float64Array::from(vec![Some(2.0), Some(3.0), Some(4.0), Some(5.0), Some(2.0)]);
5855 let d = BooleanArray::from(vec![
5856 Some(true),
5857 Some(true),
5858 Some(true),
5859 Some(false),
5860 Some(true),
5861 ]);
5862 let e_values = Int32Array::from(vec![
5863 Some(1),
5864 Some(2),
5865 Some(3),
5866 Some(1),
5867 Some(2),
5868 Some(3),
5869 Some(1),
5870 Some(2),
5871 ]);
5872 let e_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 3, 3, 3, 6, 8]));
5873 let e_validity = Some(NullBuffer::from(vec![true, false, false, true, true]));
5874 let field_e = Arc::new(Field::new("item", DataType::Int32, true));
5875 let e = ListArray::new(field_e, e_offsets, Arc::new(e_values), e_validity);
5876 let expected = RecordBatch::try_from_iter_with_nullable([
5877 ("a", Arc::new(a) as Arc<dyn Array>, true),
5878 ("b", Arc::new(b) as Arc<dyn Array>, true),
5879 ("c", Arc::new(c) as Arc<dyn Array>, true),
5880 ("d", Arc::new(d) as Arc<dyn Array>, true),
5881 ("e", Arc::new(e) as Arc<dyn Array>, true),
5882 ])
5883 .unwrap();
5884 assert_eq!(batch, expected);
5885 }
5886
    /// End-to-end check of nested record decoding: structs, lists of structs,
    /// a nullable struct, and Avro name/namespace field metadata propagation.
    #[test]
    fn test_nested_records() {
        // f1: non-null struct with three children; the third is itself a
        // struct carrying record3/ns3 metadata.
        let f1_f1_1 = StringArray::from(vec!["aaa", "bbb"]);
        let f1_f1_2 = Int32Array::from(vec![10, 20]);
        let rounded_pi = (std::f64::consts::PI * 100.0).round() / 100.0;
        let f1_f1_3_1 = Float64Array::from(vec![rounded_pi, rounded_pi]);
        let f1_f1_3 = StructArray::from(vec![(
            Arc::new(Field::new("f1_3_1", DataType::Float64, false)),
            Arc::new(f1_f1_3_1) as Arc<dyn Array>,
        )]);
        let mut f1_3_md: HashMap<String, String> = HashMap::new();
        f1_3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns3".to_string());
        f1_3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record3".to_string());
        let f1_expected = StructArray::from(vec![
            (
                Arc::new(Field::new("f1_1", DataType::Utf8, false)),
                Arc::new(f1_f1_1) as Arc<dyn Array>,
            ),
            (
                Arc::new(Field::new("f1_2", DataType::Int32, false)),
                Arc::new(f1_f1_2) as Arc<dyn Array>,
            ),
            (
                Arc::new(
                    Field::new(
                        "f1_3",
                        DataType::Struct(Fields::from(vec![Field::new(
                            "f1_3_1",
                            DataType::Float64,
                            false,
                        )])),
                        false,
                    )
                    .with_metadata(f1_3_md),
                ),
                Arc::new(f1_f1_3) as Arc<dyn Array>,
            ),
        ]);
        // f2: list of structs { f2_1: bool, f2_2: float32 }.
        let f2_fields = [
            Field::new("f2_1", DataType::Boolean, false),
            Field::new("f2_2", DataType::Float32, false),
        ];
        let f2_struct_builder = StructBuilder::new(
            f2_fields
                .iter()
                .map(|f| Arc::new(f.clone()))
                .collect::<Vec<Arc<Field>>>(),
            vec![
                Box::new(BooleanBuilder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
                Box::new(Float32Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>,
            ],
        );
        let mut f2_list_builder = ListBuilder::new(f2_struct_builder);
        // Row 0: two entries, (true, 1.2) and (true, 2.2).
        {
            let struct_builder = f2_list_builder.values();
            struct_builder.append(true);
            {
                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
                b.append_value(true);
            }
            {
                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
                b.append_value(1.2_f32);
            }
            struct_builder.append(true);
            {
                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
                b.append_value(true);
            }
            {
                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
                b.append_value(2.2_f32);
            }
            f2_list_builder.append(true);
        }
        // Row 1: one entry, (false, 10.2).
        {
            let struct_builder = f2_list_builder.values();
            struct_builder.append(true);
            {
                let b = struct_builder.field_builder::<BooleanBuilder>(0).unwrap();
                b.append_value(false);
            }
            {
                let b = struct_builder.field_builder::<Float32Builder>(1).unwrap();
                b.append_value(10.2_f32);
            }
            f2_list_builder.append(true);
        }

        let list_array_with_nullable_items = f2_list_builder.finish();
        // ListBuilder produces a nullable item field; rebuild the list type
        // with a non-nullable item carrying the record4/ns4 metadata so it
        // matches the decoded schema.
        let mut f2_item_md: HashMap<String, String> = HashMap::new();
        f2_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record4".to_string());
        f2_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns4".to_string());
        let item_field = Arc::new(
            Field::new(
                "item",
                list_array_with_nullable_items.values().data_type().clone(),
                false, )
            .with_metadata(f2_item_md),
        );
        let list_data_type = DataType::List(item_field);
        let f2_array_data = list_array_with_nullable_items
            .to_data()
            .into_builder()
            .data_type(list_data_type)
            .build()
            .unwrap();
        let f2_expected = ListArray::from(f2_array_data);
        // f3: nullable struct — row 0 valid ("xyz"), row 1 null.
        let mut f3_struct_builder = StructBuilder::new(
            vec![Arc::new(Field::new("f3_1", DataType::Utf8, false))],
            vec![Box::new(StringBuilder::new()) as Box<dyn ArrayBuilder>],
        );
        f3_struct_builder.append(true);
        {
            let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
            b.append_value("xyz");
        }
        f3_struct_builder.append(false);
        {
            let b = f3_struct_builder.field_builder::<StringBuilder>(0).unwrap();
            b.append_null();
        }
        let f3_expected = f3_struct_builder.finish();
        // f4: list of nullable structs { f4_1: int64 } with per-row nulls.
        let f4_fields = [Field::new("f4_1", DataType::Int64, false)];
        let f4_struct_builder = StructBuilder::new(
            f4_fields
                .iter()
                .map(|f| Arc::new(f.clone()))
                .collect::<Vec<Arc<Field>>>(),
            vec![Box::new(Int64Builder::new()) as Box<dyn arrow_array::builder::ArrayBuilder>],
        );
        let mut f4_list_builder = ListBuilder::new(f4_struct_builder);
        // Row 0: [Some(200), null].
        {
            let struct_builder = f4_list_builder.values();
            struct_builder.append(true);
            {
                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
                b.append_value(200);
            }
            struct_builder.append(false);
            {
                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
                b.append_null();
            }
            f4_list_builder.append(true);
        }
        // Row 1: [null, Some(300)].
        {
            let struct_builder = f4_list_builder.values();
            struct_builder.append(false);
            {
                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
                b.append_null();
            }
            struct_builder.append(true);
            {
                let b = struct_builder.field_builder::<Int64Builder>(0).unwrap();
                b.append_value(300);
            }
            f4_list_builder.append(true);
        }
        let f4_expected = f4_list_builder.finish();
        // Re-type the f4 item field to attach record6/ns6 metadata.
        let mut f4_item_md: HashMap<String, String> = HashMap::new();
        f4_item_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns6".to_string());
        f4_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record6".to_string());
        let f4_item_field = Arc::new(
            Field::new("item", f4_expected.values().data_type().clone(), true)
                .with_metadata(f4_item_md),
        );
        let f4_list_data_type = DataType::List(f4_item_field);
        let f4_array_data = f4_expected
            .to_data()
            .into_builder()
            .data_type(f4_list_data_type)
            .build()
            .unwrap();
        let f4_expected = ListArray::from(f4_array_data);
        // Top-level field metadata for f1 and f3 (f2/f4 carry theirs on items).
        let mut f1_md: HashMap<String, String> = HashMap::new();
        f1_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record2".to_string());
        f1_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns2".to_string());
        let mut f3_md: HashMap<String, String> = HashMap::new();
        f3_md.insert(AVRO_NAMESPACE_METADATA_KEY.to_string(), "ns5".to_string());
        f3_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "record5".to_string());
        let expected_schema = Schema::new(vec![
            Field::new("f1", f1_expected.data_type().clone(), false).with_metadata(f1_md),
            Field::new("f2", f2_expected.data_type().clone(), false),
            Field::new("f3", f3_expected.data_type().clone(), true).with_metadata(f3_md),
            Field::new("f4", f4_expected.data_type().clone(), false),
        ]);
        let expected = RecordBatch::try_new(
            Arc::new(expected_schema),
            vec![
                Arc::new(f1_expected) as Arc<dyn Array>,
                Arc::new(f2_expected) as Arc<dyn Array>,
                Arc::new(f3_expected) as Arc<dyn Array>,
                Arc::new(f4_expected) as Arc<dyn Array>,
            ],
        )
        .unwrap();
        let file = arrow_test_data("avro/nested_records.avro");
        // Decode twice with different batch sizes; both must match.
        let batch_large = read_file(&file, 8, false);
        assert_eq!(
            batch_large, expected,
            "Decoded RecordBatch does not match expected data for nested records (batch size 8)"
        );
        let batch_small = read_file(&file, 3, false);
        assert_eq!(
            batch_small, expected,
            "Decoded RecordBatch does not match expected data for nested records (batch size 3)"
        );
    }
6102
    /// Decodes a file of repeated (list) data without Parquet-style
    /// annotations: six rows of `id` plus a nullable `phoneNumbers` struct
    /// wrapping a list of `{number, kind}` structs. Expected arrays are built
    /// directly from `ArrayData` so validity and offsets can be stated exactly.
    #[test]
    #[cfg(feature = "snappy")]
    fn test_repeated_no_annotation() {
        use arrow_data::ArrayDataBuilder;
        let file = arrow_test_data("avro/repeated_no_annotation.avro");
        let batch_large = read_file(&file, 8, false);
        let id_array = Int32Array::from(vec![1, 2, 3, 4, 5, 6]);
        // Five phone entries shared across the six rows via list offsets below.
        let number_array = Int64Array::from(vec![
            Some(5555555555),
            Some(1111111111),
            Some(1111111111),
            Some(2222222222),
            Some(3333333333),
        ]);
        let kind_array =
            StringArray::from(vec![None, Some("home"), Some("home"), None, Some("mobile")]);
        let phone_fields = Fields::from(vec![
            Field::new("number", DataType::Int64, true),
            Field::new("kind", DataType::Utf8, true),
        ]);
        let phone_struct_data = ArrayDataBuilder::new(DataType::Struct(phone_fields))
            .len(5)
            .child_data(vec![number_array.into_data(), kind_array.into_data()])
            .build()
            .unwrap();
        let phone_struct_array = StructArray::from(phone_struct_data);
        // Rows 0-1 are null lists; rows 2-5 have lengths 0, 1, 1, 3.
        let phone_list_offsets = Buffer::from_slice_ref([0i32, 0, 0, 0, 1, 2, 5]);
        let phone_list_validity = Buffer::from_iter([false, false, true, true, true, true]);
        let mut phone_item_md = HashMap::new();
        phone_item_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "phone".to_string());
        phone_item_md.insert(
            AVRO_NAMESPACE_METADATA_KEY.to_string(),
            "topLevelRecord.phoneNumbers".to_string(),
        );
        let phone_item_field = Field::new("item", phone_struct_array.data_type().clone(), true)
            .with_metadata(phone_item_md);
        let phone_list_data = ArrayDataBuilder::new(DataType::List(Arc::new(phone_item_field)))
            .len(6)
            .add_buffer(phone_list_offsets)
            .null_bit_buffer(Some(phone_list_validity))
            .child_data(vec![phone_struct_array.into_data()])
            .build()
            .unwrap();
        let phone_list_array = ListArray::from(phone_list_data);
        // The wrapping struct shares the same validity as the inner list.
        let phone_numbers_validity = Buffer::from_iter([false, false, true, true, true, true]);
        let phone_numbers_field = Field::new("phone", phone_list_array.data_type().clone(), true);
        let phone_numbers_struct_data =
            ArrayDataBuilder::new(DataType::Struct(Fields::from(vec![phone_numbers_field])))
                .len(6)
                .null_bit_buffer(Some(phone_numbers_validity))
                .child_data(vec![phone_list_array.into_data()])
                .build()
                .unwrap();
        let phone_numbers_struct_array = StructArray::from(phone_numbers_struct_data);
        let mut phone_numbers_md = HashMap::new();
        phone_numbers_md.insert(
            AVRO_NAME_METADATA_KEY.to_string(),
            "phoneNumbers".to_string(),
        );
        phone_numbers_md.insert(
            AVRO_NAMESPACE_METADATA_KEY.to_string(),
            "topLevelRecord".to_string(),
        );
        let id_field = Field::new("id", DataType::Int32, true);
        let phone_numbers_schema_field = Field::new(
            "phoneNumbers",
            phone_numbers_struct_array.data_type().clone(),
            true,
        )
        .with_metadata(phone_numbers_md);
        let expected_schema = Schema::new(vec![id_field, phone_numbers_schema_field]);
        let expected = RecordBatch::try_new(
            Arc::new(expected_schema),
            vec![
                Arc::new(id_array) as _,
                Arc::new(phone_numbers_struct_array) as _,
            ],
        )
        .unwrap();
        assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
        let batch_small = read_file(&file, 3, false);
        assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
    }
6194
    /// Decodes the Impala "nonnullable" fixture: one row containing a scalar
    /// id, lists, maps, a list of maps, and a deeply nested struct with
    /// name/namespace metadata at several levels. The expected batch is built
    /// with arrow builders, then list/map fields are given explicit `Field`s
    /// so metadata matches what the reader produces.
    #[test]
    #[cfg(feature = "snappy")]
    fn test_nonnullable_impala() {
        let file = arrow_test_data("avro/nonnullable.impala.avro");
        let id = Int64Array::from(vec![Some(8)]);
        // Int_Array: single row [-1].
        let mut int_array_builder = ListBuilder::new(Int32Builder::new());
        {
            let vb = int_array_builder.values();
            vb.append_value(-1);
        }
        int_array_builder.append(true); let int_array = int_array_builder.finish();
        // int_array_array: single row [[-1, -2], []].
        let mut iaa_builder = ListBuilder::new(ListBuilder::new(Int32Builder::new()));
        {
            let inner_list_builder = iaa_builder.values();
            {
                let vb = inner_list_builder.values();
                vb.append_value(-1);
                vb.append_value(-2);
            }
            inner_list_builder.append(true);
            inner_list_builder.append(true);
        }
        iaa_builder.append(true);
        let int_array_array = iaa_builder.finish();
        // Int_Map: single row {"k1": -1}.
        let field_names = MapFieldNames {
            entry: "entries".to_string(),
            key: "key".to_string(),
            value: "value".to_string(),
        };
        let mut int_map_builder =
            MapBuilder::new(Some(field_names), StringBuilder::new(), Int32Builder::new());
        {
            let (keys, vals) = int_map_builder.entries();
            keys.append_value("k1");
            vals.append_value(-1);
        }
        int_map_builder.append(true).unwrap(); let int_map = int_map_builder.finish();
        // int_map_array: single row [{}, {"k1": 1}, {}, {}].
        let field_names2 = MapFieldNames {
            entry: "entries".to_string(),
            key: "key".to_string(),
            value: "value".to_string(),
        };
        let mut ima_builder = ListBuilder::new(MapBuilder::new(
            Some(field_names2),
            StringBuilder::new(),
            Int32Builder::new(),
        ));
        {
            let map_builder = ima_builder.values();
            map_builder.append(true).unwrap();
            {
                let (keys, vals) = map_builder.entries();
                keys.append_value("k1");
                vals.append_value(1);
            }
            map_builder.append(true).unwrap();
            map_builder.append(true).unwrap();
            map_builder.append(true).unwrap();
        }
        ima_builder.append(true);
        let int_map_array_ = ima_builder.finish();
        // Avro name/namespace metadata for each nested record level.
        // NOTE(review): these literals duplicate AVRO_NAME_METADATA_KEY /
        // AVRO_NAMESPACE_METADATA_KEY used by sibling tests — presumably the
        // same strings; consider using the constants for consistency.
        let meta_nested_struct: HashMap<String, String> = [
            ("avro.name", "nested_Struct"),
            ("avro.namespace", "topLevelRecord"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_c: HashMap<String, String> = [
            ("avro.name", "c"),
            ("avro.namespace", "topLevelRecord.nested_Struct"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_d_item_struct: HashMap<String, String> = [
            ("avro.name", "D"),
            ("avro.namespace", "topLevelRecord.nested_Struct.c"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_g_value: HashMap<String, String> = [
            ("avro.name", "G"),
            ("avro.namespace", "topLevelRecord.nested_Struct"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        let meta_h: HashMap<String, String> = [
            ("avro.name", "h"),
            ("avro.namespace", "topLevelRecord.nested_Struct.G"),
        ]
        .into_iter()
        .map(|(k, v)| (k.to_string(), v.to_string()))
        .collect();
        // Field definitions for nested_Struct's children: a, B, c.D, G.
        let ef_struct_field = Arc::new(
            Field::new(
                "item",
                DataType::Struct(
                    vec![
                        Field::new("e", DataType::Int32, true),
                        Field::new("f", DataType::Utf8, true),
                    ]
                    .into(),
                ),
                true,
            )
            .with_metadata(meta_d_item_struct.clone()),
        );
        let d_inner_list_field = Arc::new(Field::new(
            "item",
            DataType::List(ef_struct_field.clone()),
            true,
        ));
        let d_field = Field::new("D", DataType::List(d_inner_list_field.clone()), true);
        let i_list_field = Arc::new(Field::new("item", DataType::Float64, true));
        let i_field = Field::new("i", DataType::List(i_list_field.clone()), true);
        let h_field = Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
            .with_metadata(meta_h.clone());
        let g_value_struct_field = Field::new(
            "value",
            DataType::Struct(vec![h_field.clone()].into()),
            true,
        )
        .with_metadata(meta_g_value.clone());
        let entries_struct_field = Field::new(
            "entries",
            DataType::Struct(
                vec![
                    Field::new("key", DataType::Utf8, false),
                    g_value_struct_field.clone(),
                ]
                .into(),
            ),
            false,
        );
        let a_field = Arc::new(Field::new("a", DataType::Int32, true));
        let b_field = Arc::new(Field::new(
            "B",
            DataType::List(Arc::new(Field::new("item", DataType::Int32, true))),
            true,
        ));
        let c_field = Arc::new(
            Field::new("c", DataType::Struct(vec![d_field.clone()].into()), true)
                .with_metadata(meta_c.clone()),
        );
        let g_field = Arc::new(Field::new(
            "G",
            DataType::Map(Arc::new(entries_struct_field.clone()), false),
            true,
        ));
        // Builders mirror the four fields above; custom Field instances are
        // attached so the produced types carry the expected metadata.
        let mut nested_sb = StructBuilder::new(
            vec![
                a_field.clone(),
                b_field.clone(),
                c_field.clone(),
                g_field.clone(),
            ],
            vec![
                Box::new(Int32Builder::new()),
                Box::new(ListBuilder::new(Int32Builder::new())),
                {
                    Box::new(StructBuilder::new(
                        vec![Arc::new(d_field.clone())],
                        vec![Box::new({
                            let ef_struct_builder = StructBuilder::new(
                                vec![
                                    Arc::new(Field::new("e", DataType::Int32, true)),
                                    Arc::new(Field::new("f", DataType::Utf8, true)),
                                ],
                                vec![
                                    Box::new(Int32Builder::new()),
                                    Box::new(StringBuilder::new()),
                                ],
                            );
                            let list_of_ef = ListBuilder::new(ef_struct_builder)
                                .with_field(ef_struct_field.clone());
                            ListBuilder::new(list_of_ef)
                        })],
                    ))
                },
                {
                    let map_field_names = MapFieldNames {
                        entry: "entries".to_string(),
                        key: "key".to_string(),
                        value: "value".to_string(),
                    };
                    let i_list_builder = ListBuilder::new(Float64Builder::new());
                    let h_struct_builder = StructBuilder::new(
                        vec![Arc::new(Field::new(
                            "i",
                            DataType::List(i_list_field.clone()),
                            true,
                        ))],
                        vec![Box::new(i_list_builder)],
                    );
                    let g_value_builder = StructBuilder::new(
                        vec![Arc::new(
                            Field::new("h", DataType::Struct(vec![i_field.clone()].into()), true)
                                .with_metadata(meta_h.clone()),
                        )],
                        vec![Box::new(h_struct_builder)],
                    );
                    let map_builder = MapBuilder::new(
                        Some(map_field_names),
                        StringBuilder::new(),
                        g_value_builder,
                    )
                    .with_values_field(Arc::new(
                        Field::new(
                            "value",
                            DataType::Struct(vec![h_field.clone()].into()),
                            true,
                        )
                        .with_metadata(meta_g_value.clone()),
                    ));

                    Box::new(map_builder)
                },
            ],
        );
        // Populate the single row: a=-1, B=[-1], c.D=[[(-1,"nonnullable")]],
        // G={} (one valid but empty map entry set).
        nested_sb.append(true);
        {
            let a_builder = nested_sb.field_builder::<Int32Builder>(0).unwrap();
            a_builder.append_value(-1);
        }
        {
            let b_builder = nested_sb
                .field_builder::<ListBuilder<Int32Builder>>(1)
                .unwrap();
            {
                let vb = b_builder.values();
                vb.append_value(-1);
            }
            b_builder.append(true);
        }
        {
            let c_struct_builder = nested_sb.field_builder::<StructBuilder>(2).unwrap();
            c_struct_builder.append(true);
            let d_list_builder = c_struct_builder
                .field_builder::<ListBuilder<ListBuilder<StructBuilder>>>(0)
                .unwrap();
            {
                let sub_list_builder = d_list_builder.values();
                {
                    let ef_struct = sub_list_builder.values();
                    ef_struct.append(true);
                    {
                        let e_b = ef_struct.field_builder::<Int32Builder>(0).unwrap();
                        e_b.append_value(-1);
                        let f_b = ef_struct.field_builder::<StringBuilder>(1).unwrap();
                        f_b.append_value("nonnullable");
                    }
                    sub_list_builder.append(true);
                }
                d_list_builder.append(true);
            }
        }
        {
            let g_map_builder = nested_sb
                .field_builder::<MapBuilder<StringBuilder, StructBuilder>>(3)
                .unwrap();
            g_map_builder.append(true).unwrap();
        }
        let nested_struct = nested_sb.finish();
        let schema = Arc::new(arrow_schema::Schema::new(vec![
            Field::new("ID", id.data_type().clone(), true),
            Field::new("Int_Array", int_array.data_type().clone(), true),
            Field::new("int_array_array", int_array_array.data_type().clone(), true),
            Field::new("Int_Map", int_map.data_type().clone(), true),
            Field::new("int_map_array", int_map_array_.data_type().clone(), true),
            Field::new("nested_Struct", nested_struct.data_type().clone(), true)
                .with_metadata(meta_nested_struct.clone()),
        ]));
        let expected = RecordBatch::try_new(
            schema,
            vec![
                Arc::new(id) as Arc<dyn Array>,
                Arc::new(int_array),
                Arc::new(int_array_array),
                Arc::new(int_map),
                Arc::new(int_map_array_),
                Arc::new(nested_struct),
            ],
        )
        .unwrap();
        // Decode with two batch sizes; both must reproduce the expected batch.
        let batch_large = read_file(&file, 8, false);
        assert_eq!(batch_large, expected, "Mismatch for batch_size=8");
        let batch_small = read_file(&file, 3, false);
        assert_eq!(batch_small, expected, "Mismatch for batch_size=3");
    }
6502
6503 #[test]
6504 fn test_nonnullable_impala_strict() {
6505 let file = arrow_test_data("avro/nonnullable.impala.avro");
6506 let err = read_file_strict(&file, 8, false).unwrap_err();
6507 assert!(err.to_string().contains(
6508 "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6509 ));
6510 }
6511
6512 #[test]
6513 #[cfg(feature = "snappy")]
6515 fn test_nullable_impala() {
6516 let file = arrow_test_data("avro/nullable.impala.avro");
6517 let batch1 = read_file(&file, 3, false);
6518 let batch2 = read_file(&file, 8, false);
6519 assert_eq!(batch1, batch2);
6520 let batch = batch1;
6521 assert_eq!(batch.num_rows(), 7);
6522 let id_array = batch
6523 .column(0)
6524 .as_any()
6525 .downcast_ref::<Int64Array>()
6526 .expect("id column should be an Int64Array");
6527 let expected_ids = [1, 2, 3, 4, 5, 6, 7];
6528 for (i, &expected_id) in expected_ids.iter().enumerate() {
6529 assert_eq!(id_array.value(i), expected_id, "Mismatch in id at row {i}",);
6530 }
6531 let int_array = batch
6532 .column(1)
6533 .as_any()
6534 .downcast_ref::<ListArray>()
6535 .expect("int_array column should be a ListArray");
6536 {
6537 let offsets = int_array.value_offsets();
6538 let start = offsets[0] as usize;
6539 let end = offsets[1] as usize;
6540 let values = int_array
6541 .values()
6542 .as_any()
6543 .downcast_ref::<Int32Array>()
6544 .expect("Values of int_array should be an Int32Array");
6545 let row0: Vec<Option<i32>> = (start..end).map(|i| Some(values.value(i))).collect();
6546 assert_eq!(
6547 row0,
6548 vec![Some(1), Some(2), Some(3)],
6549 "Mismatch in int_array row 0"
6550 );
6551 }
6552 let nested_struct = batch
6553 .column(5)
6554 .as_any()
6555 .downcast_ref::<StructArray>()
6556 .expect("nested_struct column should be a StructArray");
6557 let a_array = nested_struct
6558 .column_by_name("A")
6559 .expect("Field A should exist in nested_struct")
6560 .as_any()
6561 .downcast_ref::<Int32Array>()
6562 .expect("Field A should be an Int32Array");
6563 assert_eq!(a_array.value(0), 1, "Mismatch in nested_struct.A at row 0");
6564 assert!(
6565 !a_array.is_valid(1),
6566 "Expected null in nested_struct.A at row 1"
6567 );
6568 assert!(
6569 !a_array.is_valid(3),
6570 "Expected null in nested_struct.A at row 3"
6571 );
6572 assert_eq!(a_array.value(6), 7, "Mismatch in nested_struct.A at row 6");
6573 }
6574
6575 #[test]
6576 fn test_nullable_impala_strict() {
6577 let file = arrow_test_data("avro/nullable.impala.avro");
6578 let err = read_file_strict(&file, 8, false).unwrap_err();
6579 assert!(err.to_string().contains(
6580 "Found Avro union of the form ['T','null'], which is disallowed in strict_mode"
6581 ));
6582 }
6583
6584 #[test]
6585 fn test_nested_record_type_reuse() {
6586 let batch = read_file("test/data/nested_record_reuse.avro", 8, false);
6612 let schema = batch.schema();
6613
6614 assert_eq!(schema.fields().len(), 3);
6616 let fields = schema.fields();
6617 assert_eq!(fields[0].name(), "nested");
6618 assert_eq!(fields[1].name(), "nestedRecord");
6619 assert_eq!(fields[2].name(), "nestedArray");
6620 assert!(matches!(fields[0].data_type(), DataType::Struct(_)));
6621 assert!(matches!(fields[1].data_type(), DataType::Struct(_)));
6622 assert!(matches!(fields[2].data_type(), DataType::List(_)));
6623
6624 if let DataType::Struct(nested_fields) = fields[0].data_type() {
6626 assert_eq!(nested_fields.len(), 1);
6627 assert_eq!(nested_fields[0].name(), "nested_int");
6628 assert_eq!(nested_fields[0].data_type(), &DataType::Int32);
6629 }
6630
6631 assert_eq!(fields[0].data_type(), fields[1].data_type());
6633 if let DataType::List(array_field) = fields[2].data_type() {
6634 assert_eq!(array_field.data_type(), fields[0].data_type());
6635 }
6636
6637 assert_eq!(batch.num_rows(), 2);
6639 assert_eq!(batch.num_columns(), 3);
6640
6641 let nested_col = batch
6643 .column(0)
6644 .as_any()
6645 .downcast_ref::<StructArray>()
6646 .unwrap();
6647 let nested_int_array = nested_col
6648 .column_by_name("nested_int")
6649 .unwrap()
6650 .as_any()
6651 .downcast_ref::<Int32Array>()
6652 .unwrap();
6653 assert_eq!(nested_int_array.value(0), 42);
6654 assert_eq!(nested_int_array.value(1), 99);
6655
6656 let nested_record_col = batch
6658 .column(1)
6659 .as_any()
6660 .downcast_ref::<StructArray>()
6661 .unwrap();
6662 let nested_record_int_array = nested_record_col
6663 .column_by_name("nested_int")
6664 .unwrap()
6665 .as_any()
6666 .downcast_ref::<Int32Array>()
6667 .unwrap();
6668 assert_eq!(nested_record_int_array.value(0), 100);
6669 assert_eq!(nested_record_int_array.value(1), 200);
6670
6671 let nested_array_col = batch
6673 .column(2)
6674 .as_any()
6675 .downcast_ref::<ListArray>()
6676 .unwrap();
6677 assert_eq!(nested_array_col.len(), 2);
6678 let first_array_struct = nested_array_col.value(0);
6679 let first_array_struct_array = first_array_struct
6680 .as_any()
6681 .downcast_ref::<StructArray>()
6682 .unwrap();
6683 let first_array_int_values = first_array_struct_array
6684 .column_by_name("nested_int")
6685 .unwrap()
6686 .as_any()
6687 .downcast_ref::<Int32Array>()
6688 .unwrap();
6689 assert_eq!(first_array_int_values.len(), 3);
6690 assert_eq!(first_array_int_values.value(0), 1);
6691 assert_eq!(first_array_int_values.value(1), 2);
6692 assert_eq!(first_array_int_values.value(2), 3);
6693 }
6694
6695 #[test]
6696 fn test_enum_type_reuse() {
6697 let batch = read_file("test/data/enum_reuse.avro", 8, false);
6720 let schema = batch.schema();
6721
6722 assert_eq!(schema.fields().len(), 3);
6724 let fields = schema.fields();
6725 assert_eq!(fields[0].name(), "status");
6726 assert_eq!(fields[1].name(), "backupStatus");
6727 assert_eq!(fields[2].name(), "statusHistory");
6728 assert!(matches!(fields[0].data_type(), DataType::Dictionary(_, _)));
6729 assert!(matches!(fields[1].data_type(), DataType::Dictionary(_, _)));
6730 assert!(matches!(fields[2].data_type(), DataType::List(_)));
6731
6732 if let DataType::Dictionary(key_type, value_type) = fields[0].data_type() {
6733 assert_eq!(key_type.as_ref(), &DataType::Int32);
6734 assert_eq!(value_type.as_ref(), &DataType::Utf8);
6735 }
6736
6737 assert_eq!(fields[0].data_type(), fields[1].data_type());
6739 if let DataType::List(array_field) = fields[2].data_type() {
6740 assert_eq!(array_field.data_type(), fields[0].data_type());
6741 }
6742
6743 assert_eq!(batch.num_rows(), 2);
6745 assert_eq!(batch.num_columns(), 3);
6746
6747 let status_col = batch
6749 .column(0)
6750 .as_any()
6751 .downcast_ref::<DictionaryArray<Int32Type>>()
6752 .unwrap();
6753 let status_values = status_col
6754 .values()
6755 .as_any()
6756 .downcast_ref::<StringArray>()
6757 .unwrap();
6758
6759 assert_eq!(
6761 status_values.value(status_col.key(0).unwrap() as usize),
6762 "ACTIVE"
6763 );
6764 assert_eq!(
6765 status_values.value(status_col.key(1).unwrap() as usize),
6766 "PENDING"
6767 );
6768
6769 let backup_status_col = batch
6771 .column(1)
6772 .as_any()
6773 .downcast_ref::<DictionaryArray<Int32Type>>()
6774 .unwrap();
6775 let backup_status_values = backup_status_col
6776 .values()
6777 .as_any()
6778 .downcast_ref::<StringArray>()
6779 .unwrap();
6780
6781 assert_eq!(
6783 backup_status_values.value(backup_status_col.key(0).unwrap() as usize),
6784 "INACTIVE"
6785 );
6786 assert_eq!(
6787 backup_status_values.value(backup_status_col.key(1).unwrap() as usize),
6788 "ACTIVE"
6789 );
6790
6791 let status_history_col = batch
6793 .column(2)
6794 .as_any()
6795 .downcast_ref::<ListArray>()
6796 .unwrap();
6797 assert_eq!(status_history_col.len(), 2);
6798
6799 let first_array_dict = status_history_col.value(0);
6801 let first_array_dict_array = first_array_dict
6802 .as_any()
6803 .downcast_ref::<DictionaryArray<Int32Type>>()
6804 .unwrap();
6805 let first_array_values = first_array_dict_array
6806 .values()
6807 .as_any()
6808 .downcast_ref::<StringArray>()
6809 .unwrap();
6810
6811 assert_eq!(first_array_dict_array.len(), 3);
6813 assert_eq!(
6814 first_array_values.value(first_array_dict_array.key(0).unwrap() as usize),
6815 "PENDING"
6816 );
6817 assert_eq!(
6818 first_array_values.value(first_array_dict_array.key(1).unwrap() as usize),
6819 "ACTIVE"
6820 );
6821 assert_eq!(
6822 first_array_values.value(first_array_dict_array.key(2).unwrap() as usize),
6823 "INACTIVE"
6824 );
6825 }
6826
6827 #[test]
6828 fn test_bad_varint_bug_nullable_array_items() {
6829 use flate2::read::GzDecoder;
6830 use std::io::Read;
6831 let manifest_dir = env!("CARGO_MANIFEST_DIR");
6832 let gz_path = format!("{manifest_dir}/test/data/bad-varint-bug.avro.gz");
6833 let gz_file = File::open(&gz_path).expect("test file should exist");
6834 let mut decoder = GzDecoder::new(gz_file);
6835 let mut avro_bytes = Vec::new();
6836 decoder
6837 .read_to_end(&mut avro_bytes)
6838 .expect("should decompress");
6839 let reader_arrow_schema = Schema::new(vec![Field::new(
6840 "int_array",
6841 DataType::List(Arc::new(Field::new("element", DataType::Int32, true))),
6842 true,
6843 )])
6844 .with_metadata(HashMap::from([("avro.name".into(), "table".into())]));
6845 let reader_schema = AvroSchema::try_from(&reader_arrow_schema)
6846 .expect("should convert Arrow schema to Avro");
6847 let mut reader = ReaderBuilder::new()
6848 .with_reader_schema(reader_schema)
6849 .build(Cursor::new(avro_bytes))
6850 .expect("should build reader");
6851 let batch = reader
6852 .next()
6853 .expect("should have one batch")
6854 .expect("reading should succeed without bad varint error");
6855 assert_eq!(batch.num_rows(), 1);
6856 let list_col = batch
6857 .column(0)
6858 .as_any()
6859 .downcast_ref::<ListArray>()
6860 .expect("should be ListArray");
6861 assert_eq!(list_col.len(), 1);
6862 let values = list_col.values();
6863 let int_values = values.as_primitive::<Int32Type>();
6864 assert_eq!(int_values.len(), 2);
6865 assert_eq!(int_values.value(0), 1);
6866 assert_eq!(int_values.value(1), 2);
6867 }
6868
6869 fn corrupt_first_block_payload_byte(
6870 mut bytes: Vec<u8>,
6871 field_offset: usize,
6872 expected_original: u8,
6873 replacement: u8,
6874 ) -> Vec<u8> {
6875 let mut header_decoder = HeaderDecoder::default();
6876 let header_len = header_decoder.decode(&bytes).expect("decode header");
6877 assert!(header_decoder.flush().is_some(), "decode complete header");
6878
6879 let mut cursor = &bytes[header_len..];
6880 let (_, count_len) = crate::reader::vlq::read_varint(cursor).expect("decode block count");
6881 cursor = &cursor[count_len..];
6882 let (_, size_len) = crate::reader::vlq::read_varint(cursor).expect("decode block size");
6883 let data_start = header_len + count_len + size_len;
6884 let target = data_start + field_offset;
6885
6886 assert!(
6887 target < bytes.len(),
6888 "target byte offset {target} out of bounds for input length {}",
6889 bytes.len()
6890 );
6891 assert_eq!(
6892 bytes[target], expected_original,
6893 "unexpected original byte at payload offset {field_offset}"
6894 );
6895 bytes[target] = replacement;
6896 bytes
6897 }
6898
6899 #[test]
6900 fn ocf_projection_rejects_overflowing_varint_in_skipped_long_field() {
6901 let writer_schema = Schema::new(vec![
6905 Field::new("bad_long", DataType::Int64, false),
6906 Field::new("keep", DataType::Int32, false),
6907 ]);
6908 let batch = RecordBatch::try_new(
6909 Arc::new(writer_schema.clone()),
6910 vec![
6911 Arc::new(Int64Array::from(vec![i64::MIN])) as ArrayRef,
6912 Arc::new(Int32Array::from(vec![7])) as ArrayRef,
6913 ],
6914 )
6915 .expect("build writer batch");
6916 let bytes = write_ocf(&writer_schema, &[batch]);
6917 let mutated = corrupt_first_block_payload_byte(bytes, 9, 0x01, 0x02);
6918
6919 let err = ReaderBuilder::new()
6920 .build(Cursor::new(mutated.clone()))
6921 .expect("build full reader")
6922 .collect::<Result<Vec<_>, _>>()
6923 .expect_err("full decode should reject malformed varint");
6924 assert!(matches!(err, ArrowError::AvroError(_)));
6925 assert!(err.to_string().contains("bad varint"));
6926
6927 let err = ReaderBuilder::new()
6928 .with_projection(vec![1])
6929 .build(Cursor::new(mutated))
6930 .expect("build projected reader")
6931 .collect::<Result<Vec<_>, _>>()
6932 .expect_err("projection must also reject malformed skipped varint");
6933 assert!(matches!(err, ArrowError::AvroError(_)));
6934 assert!(err.to_string().contains("bad varint"));
6935 }
6936
6937 #[test]
6938 fn ocf_projection_rejects_i32_overflow_in_skipped_int_field() {
6939 let writer_schema = Schema::new(vec![
6943 Field::new("bad_int", DataType::Int32, false),
6944 Field::new("keep", DataType::Int64, false),
6945 ]);
6946 let batch = RecordBatch::try_new(
6947 Arc::new(writer_schema.clone()),
6948 vec![
6949 Arc::new(Int32Array::from(vec![i32::MIN])) as ArrayRef,
6950 Arc::new(Int64Array::from(vec![11])) as ArrayRef,
6951 ],
6952 )
6953 .expect("build writer batch");
6954 let bytes = write_ocf(&writer_schema, &[batch]);
6955 let mutated = corrupt_first_block_payload_byte(bytes, 4, 0x0f, 0x10);
6956
6957 let err = ReaderBuilder::new()
6958 .build(Cursor::new(mutated.clone()))
6959 .expect("build full reader")
6960 .collect::<Result<Vec<_>, _>>()
6961 .expect_err("full decode should reject int overflow");
6962 assert!(matches!(err, ArrowError::AvroError(_)));
6963 assert!(err.to_string().contains("varint overflow"));
6964
6965 let err = ReaderBuilder::new()
6966 .with_projection(vec![1])
6967 .build(Cursor::new(mutated))
6968 .expect("build projected reader")
6969 .collect::<Result<Vec<_>, _>>()
6970 .expect_err("projection must also reject skipped int overflow");
6971 assert!(matches!(err, ArrowError::AvroError(_)));
6972 assert!(err.to_string().contains("varint overflow"));
6973 }
6974
6975 #[test]
6976 fn comprehensive_e2e_test() {
6977 let path = "test/data/comprehensive_e2e.avro";
6978 let batch = read_file(path, 1024, false);
6979 let schema = batch.schema();
6980
6981 #[inline]
6982 fn tid_by_name(fields: &UnionFields, want: &str) -> i8 {
6983 for (tid, f) in fields.iter() {
6984 if f.name() == want {
6985 return tid;
6986 }
6987 }
6988 panic!("union child '{want}' not found");
6989 }
6990
6991 #[inline]
6992 fn tid_by_dt(fields: &UnionFields, pred: impl Fn(&DataType) -> bool) -> i8 {
6993 for (tid, f) in fields.iter() {
6994 if pred(f.data_type()) {
6995 return tid;
6996 }
6997 }
6998 panic!("no union child matches predicate");
6999 }
7000
7001 fn mk_dense_union(
7002 fields: &UnionFields,
7003 type_ids: Vec<i8>,
7004 offsets: Vec<i32>,
7005 provide: impl Fn(&Field) -> Option<ArrayRef>,
7006 ) -> ArrayRef {
7007 fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
7008 match dt {
7009 DataType::Null => Arc::new(NullArray::new(0)),
7010 DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
7011 DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
7012 DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
7013 DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
7014 DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
7015 DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
7016 DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
7017 DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
7018 DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
7019 Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
7020 }
7021 DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
7022 Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
7023 }
7024 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
7025 let a = TimestampMillisecondArray::from(Vec::<i64>::new());
7026 Arc::new(if let Some(tz) = tz {
7027 a.with_timezone(tz.clone())
7028 } else {
7029 a
7030 })
7031 }
7032 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
7033 let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
7034 Arc::new(if let Some(tz) = tz {
7035 a.with_timezone(tz.clone())
7036 } else {
7037 a
7038 })
7039 }
7040 DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
7041 IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
7042 ),
7043 DataType::FixedSizeBinary(sz) => Arc::new(
7044 FixedSizeBinaryArray::try_from_sparse_iter_with_size(
7045 std::iter::empty::<Option<Vec<u8>>>(),
7046 *sz,
7047 )
7048 .unwrap(),
7049 ),
7050 DataType::Dictionary(_, _) => {
7051 let keys = Int32Array::from(Vec::<i32>::new());
7052 let values = Arc::new(StringArray::from(Vec::<&str>::new()));
7053 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
7054 }
7055 DataType::Struct(fields) => {
7056 let children: Vec<ArrayRef> = fields
7057 .iter()
7058 .map(|f| empty_child_for(f.data_type()) as ArrayRef)
7059 .collect();
7060 Arc::new(StructArray::new(fields.clone(), children, None))
7061 }
7062 DataType::List(field) => {
7063 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
7064 Arc::new(
7065 ListArray::try_new(
7066 field.clone(),
7067 offsets,
7068 empty_child_for(field.data_type()),
7069 None,
7070 )
7071 .unwrap(),
7072 )
7073 }
7074 DataType::Map(entry_field, is_sorted) => {
7075 let (key_field, val_field) = match entry_field.data_type() {
7076 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7077 other => panic!("unexpected map entries type: {other:?}"),
7078 };
7079 let keys = StringArray::from(Vec::<&str>::new());
7080 let vals: ArrayRef = match val_field.data_type() {
7081 DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
7082 DataType::Boolean => {
7083 Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
7084 }
7085 DataType::Int32 => {
7086 Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
7087 }
7088 DataType::Int64 => {
7089 Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
7090 }
7091 DataType::Float32 => {
7092 Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
7093 }
7094 DataType::Float64 => {
7095 Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
7096 }
7097 DataType::Utf8 => {
7098 Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
7099 }
7100 DataType::Binary => {
7101 Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
7102 }
7103 DataType::Union(uf, _) => {
7104 let children: Vec<ArrayRef> = uf
7105 .iter()
7106 .map(|(_, f)| empty_child_for(f.data_type()))
7107 .collect();
7108 Arc::new(
7109 UnionArray::try_new(
7110 uf.clone(),
7111 ScalarBuffer::<i8>::from(Vec::<i8>::new()),
7112 Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
7113 children,
7114 )
7115 .unwrap(),
7116 ) as ArrayRef
7117 }
7118 other => panic!("unsupported map value type: {other:?}"),
7119 };
7120 let entries = StructArray::new(
7121 Fields::from(vec![
7122 key_field.as_ref().clone(),
7123 val_field.as_ref().clone(),
7124 ]),
7125 vec![Arc::new(keys) as ArrayRef, vals],
7126 None,
7127 );
7128 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
7129 Arc::new(MapArray::new(
7130 entry_field.clone(),
7131 offsets,
7132 entries,
7133 None,
7134 *is_sorted,
7135 ))
7136 }
7137 other => panic!("empty_child_for: unhandled type {other:?}"),
7138 }
7139 }
7140 let children: Vec<ArrayRef> = fields
7141 .iter()
7142 .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
7143 .collect();
7144 Arc::new(
7145 UnionArray::try_new(
7146 fields.clone(),
7147 ScalarBuffer::<i8>::from(type_ids),
7148 Some(ScalarBuffer::<i32>::from(offsets)),
7149 children,
7150 )
7151 .unwrap(),
7152 ) as ArrayRef
7153 }
7154
7155 #[inline]
7156 fn uuid16_from_str(s: &str) -> [u8; 16] {
7157 let mut out = [0u8; 16];
7158 let mut idx = 0usize;
7159 let mut hi: Option<u8> = None;
7160 for ch in s.chars() {
7161 if ch == '-' {
7162 continue;
7163 }
7164 let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
7165 if let Some(h) = hi {
7166 out[idx] = (h << 4) | v;
7167 idx += 1;
7168 hi = None;
7169 } else {
7170 hi = Some(v);
7171 }
7172 }
7173 assert_eq!(idx, 16, "UUID must decode to 16 bytes");
7174 out
7175 }
7176 let date_a: i32 = 19_000; let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
7178 let time_us_eod: i64 = 86_400_000_000 - 1;
7179 let ts_ms_2024_01_01: i64 = 1_704_067_200_000; let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
7181 let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
7182 let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
7183 let dur_large =
7184 IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
7185 let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
7186 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
7187 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
7188
7189 #[inline]
7190 fn push_like(
7191 reader_schema: &arrow_schema::Schema,
7192 name: &str,
7193 arr: ArrayRef,
7194 fields: &mut Vec<FieldRef>,
7195 cols: &mut Vec<ArrayRef>,
7196 ) {
7197 let src = reader_schema
7198 .field_with_name(name)
7199 .unwrap_or_else(|_| panic!("source schema missing field '{name}'"));
7200 let mut f = Field::new(name, arr.data_type().clone(), src.is_nullable());
7201 let md = src.metadata();
7202 if !md.is_empty() {
7203 f = f.with_metadata(md.clone());
7204 }
7205 fields.push(Arc::new(f));
7206 cols.push(arr);
7207 }
7208
7209 let mut fields: Vec<FieldRef> = Vec::new();
7210 let mut columns: Vec<ArrayRef> = Vec::new();
7211 push_like(
7212 schema.as_ref(),
7213 "id",
7214 Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef,
7215 &mut fields,
7216 &mut columns,
7217 );
7218 push_like(
7219 schema.as_ref(),
7220 "flag",
7221 Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef,
7222 &mut fields,
7223 &mut columns,
7224 );
7225 push_like(
7226 schema.as_ref(),
7227 "ratio_f32",
7228 Arc::new(Float32Array::from(vec![1.25f32, -0.0, 3.5, 9.75])) as ArrayRef,
7229 &mut fields,
7230 &mut columns,
7231 );
7232 push_like(
7233 schema.as_ref(),
7234 "ratio_f64",
7235 Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef,
7236 &mut fields,
7237 &mut columns,
7238 );
7239 push_like(
7240 schema.as_ref(),
7241 "count_i32",
7242 Arc::new(Int32Array::from(vec![7, -1, 0, 123])) as ArrayRef,
7243 &mut fields,
7244 &mut columns,
7245 );
7246 push_like(
7247 schema.as_ref(),
7248 "count_i64",
7249 Arc::new(Int64Array::from(vec![
7250 7_000_000_000i64,
7251 -2,
7252 0,
7253 -9_876_543_210i64,
7254 ])) as ArrayRef,
7255 &mut fields,
7256 &mut columns,
7257 );
7258 push_like(
7259 schema.as_ref(),
7260 "opt_i32_nullfirst",
7261 Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef,
7262 &mut fields,
7263 &mut columns,
7264 );
7265 push_like(
7266 schema.as_ref(),
7267 "opt_str_nullsecond",
7268 Arc::new(StringArray::from(vec![
7269 Some("alpha"),
7270 None,
7271 Some("s3"),
7272 Some(""),
7273 ])) as ArrayRef,
7274 &mut fields,
7275 &mut columns,
7276 );
7277 {
7278 let uf = match schema
7279 .field_with_name("tri_union_prim")
7280 .unwrap()
7281 .data_type()
7282 {
7283 DataType::Union(f, UnionMode::Dense) => f.clone(),
7284 other => panic!("tri_union_prim should be dense union, got {other:?}"),
7285 };
7286 let tid_i = tid_by_name(&uf, "int");
7287 let tid_s = tid_by_name(&uf, "string");
7288 let tid_b = tid_by_name(&uf, "boolean");
7289 let tids = vec![tid_i, tid_s, tid_b, tid_s];
7290 let offs = vec![0, 0, 0, 1];
7291 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7292 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
7293 DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
7294 DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
7295 _ => None,
7296 });
7297 push_like(
7298 schema.as_ref(),
7299 "tri_union_prim",
7300 arr,
7301 &mut fields,
7302 &mut columns,
7303 );
7304 }
7305
7306 push_like(
7307 schema.as_ref(),
7308 "str_utf8",
7309 Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef,
7310 &mut fields,
7311 &mut columns,
7312 );
7313 push_like(
7314 schema.as_ref(),
7315 "raw_bytes",
7316 Arc::new(BinaryArray::from(vec![
7317 b"\x00\x01".as_ref(),
7318 b"".as_ref(),
7319 b"\xFF\x00".as_ref(),
7320 b"\x10\x20\x30\x40".as_ref(),
7321 ])) as ArrayRef,
7322 &mut fields,
7323 &mut columns,
7324 );
7325 {
7326 let it = [
7327 Some(*b"0123456789ABCDEF"),
7328 Some([0u8; 16]),
7329 Some(*b"ABCDEFGHIJKLMNOP"),
7330 Some([0xAA; 16]),
7331 ]
7332 .into_iter();
7333 let arr =
7334 Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
7335 as ArrayRef;
7336 push_like(
7337 schema.as_ref(),
7338 "fx16_plain",
7339 arr,
7340 &mut fields,
7341 &mut columns,
7342 );
7343 }
7344 {
7345 #[cfg(feature = "small_decimals")]
7346 let dec10_2 = Arc::new(
7347 Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
7348 .with_precision_and_scale(10, 2)
7349 .unwrap(),
7350 ) as ArrayRef;
7351 #[cfg(not(feature = "small_decimals"))]
7352 let dec10_2 = Arc::new(
7353 Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
7354 .with_precision_and_scale(10, 2)
7355 .unwrap(),
7356 ) as ArrayRef;
7357 push_like(
7358 schema.as_ref(),
7359 "dec_bytes_s10_2",
7360 dec10_2,
7361 &mut fields,
7362 &mut columns,
7363 );
7364 }
7365 {
7366 #[cfg(feature = "small_decimals")]
7367 let dec20_4 = Arc::new(
7368 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
7369 .with_precision_and_scale(20, 4)
7370 .unwrap(),
7371 ) as ArrayRef;
7372 #[cfg(not(feature = "small_decimals"))]
7373 let dec20_4 = Arc::new(
7374 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
7375 .with_precision_and_scale(20, 4)
7376 .unwrap(),
7377 ) as ArrayRef;
7378 push_like(
7379 schema.as_ref(),
7380 "dec_fix_s20_4",
7381 dec20_4,
7382 &mut fields,
7383 &mut columns,
7384 );
7385 }
7386 {
7387 let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
7388 let arr =
7389 Arc::new(FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap())
7390 as ArrayRef;
7391 push_like(schema.as_ref(), "uuid_str", arr, &mut fields, &mut columns);
7392 }
7393 push_like(
7394 schema.as_ref(),
7395 "d_date",
7396 Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef,
7397 &mut fields,
7398 &mut columns,
7399 );
7400 push_like(
7401 schema.as_ref(),
7402 "t_millis",
7403 Arc::new(Time32MillisecondArray::from(vec![
7404 time_ms_a,
7405 0,
7406 1,
7407 86_400_000 - 1,
7408 ])) as ArrayRef,
7409 &mut fields,
7410 &mut columns,
7411 );
7412 push_like(
7413 schema.as_ref(),
7414 "t_micros",
7415 Arc::new(Time64MicrosecondArray::from(vec![
7416 time_us_eod,
7417 0,
7418 1,
7419 1_000_000,
7420 ])) as ArrayRef,
7421 &mut fields,
7422 &mut columns,
7423 );
7424 {
7425 let a = TimestampMillisecondArray::from(vec![
7426 ts_ms_2024_01_01,
7427 -1,
7428 ts_ms_2024_01_01 + 123,
7429 0,
7430 ])
7431 .with_timezone("+00:00");
7432 push_like(
7433 schema.as_ref(),
7434 "ts_millis_utc",
7435 Arc::new(a) as ArrayRef,
7436 &mut fields,
7437 &mut columns,
7438 );
7439 }
7440 {
7441 let a = TimestampMicrosecondArray::from(vec![
7442 ts_us_2024_01_01,
7443 1,
7444 ts_us_2024_01_01 + 456,
7445 0,
7446 ])
7447 .with_timezone("+00:00");
7448 push_like(
7449 schema.as_ref(),
7450 "ts_micros_utc",
7451 Arc::new(a) as ArrayRef,
7452 &mut fields,
7453 &mut columns,
7454 );
7455 }
7456 push_like(
7457 schema.as_ref(),
7458 "ts_millis_local",
7459 Arc::new(TimestampMillisecondArray::from(vec![
7460 ts_ms_2024_01_01 + 86_400_000,
7461 0,
7462 ts_ms_2024_01_01 + 789,
7463 123_456_789,
7464 ])) as ArrayRef,
7465 &mut fields,
7466 &mut columns,
7467 );
7468 push_like(
7469 schema.as_ref(),
7470 "ts_micros_local",
7471 Arc::new(TimestampMicrosecondArray::from(vec![
7472 ts_us_2024_01_01 + 123_456,
7473 0,
7474 ts_us_2024_01_01 + 101_112,
7475 987_654_321,
7476 ])) as ArrayRef,
7477 &mut fields,
7478 &mut columns,
7479 );
7480 {
7481 let v = vec![dur_small, dur_zero, dur_large, dur_2years];
7482 push_like(
7483 schema.as_ref(),
7484 "interval_mdn",
7485 Arc::new(IntervalMonthDayNanoArray::from(v)) as ArrayRef,
7486 &mut fields,
7487 &mut columns,
7488 );
7489 }
7490 {
7491 let keys = Int32Array::from(vec![1, 2, 3, 0]); let values = Arc::new(StringArray::from(vec![
7493 "UNKNOWN",
7494 "NEW",
7495 "PROCESSING",
7496 "DONE",
7497 ])) as ArrayRef;
7498 let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
7499 push_like(
7500 schema.as_ref(),
7501 "status",
7502 Arc::new(dict) as ArrayRef,
7503 &mut fields,
7504 &mut columns,
7505 );
7506 }
7507 {
7508 let list_field = match schema.field_with_name("arr_union").unwrap().data_type() {
7509 DataType::List(f) => f.clone(),
7510 other => panic!("arr_union should be List, got {other:?}"),
7511 };
7512 let uf = match list_field.data_type() {
7513 DataType::Union(f, UnionMode::Dense) => f.clone(),
7514 other => panic!("arr_union item should be union, got {other:?}"),
7515 };
7516 let tid_l = tid_by_name(&uf, "long");
7517 let tid_s = tid_by_name(&uf, "string");
7518 let tid_n = tid_by_name(&uf, "null");
7519 let type_ids = vec![
7520 tid_l, tid_s, tid_n, tid_l, tid_n, tid_s, tid_l, tid_l, tid_s, tid_n, tid_l,
7521 ];
7522 let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
7523 let values = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7524 DataType::Int64 => {
7525 Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
7526 }
7527 DataType::Utf8 => {
7528 Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
7529 }
7530 DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
7531 _ => None,
7532 });
7533 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
7534 let arr = Arc::new(ListArray::try_new(list_field, list_offsets, values, None).unwrap())
7535 as ArrayRef;
7536 push_like(schema.as_ref(), "arr_union", arr, &mut fields, &mut columns);
7537 }
7538 {
7539 let (entry_field, entries_fields, uf, is_sorted) =
7540 match schema.field_with_name("map_union").unwrap().data_type() {
7541 DataType::Map(entry_field, is_sorted) => {
7542 let fs = match entry_field.data_type() {
7543 DataType::Struct(fs) => fs.clone(),
7544 other => panic!("map entries must be struct, got {other:?}"),
7545 };
7546 let val_f = fs[1].clone();
7547 let uf = match val_f.data_type() {
7548 DataType::Union(f, UnionMode::Dense) => f.clone(),
7549 other => panic!("map value must be union, got {other:?}"),
7550 };
7551 (entry_field.clone(), fs, uf, *is_sorted)
7552 }
7553 other => panic!("map_union should be Map, got {other:?}"),
7554 };
7555 let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
7556 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
7557 let tid_null = tid_by_name(&uf, "null");
7558 let tid_d = tid_by_name(&uf, "double");
7559 let tid_s = tid_by_name(&uf, "string");
7560 let type_ids = vec![tid_d, tid_null, tid_s, tid_d, tid_d, tid_s];
7561 let offsets = vec![0, 0, 0, 1, 2, 1];
7562 let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
7563 let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7564 DataType::Float64 => {
7565 Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
7566 }
7567 DataType::Utf8 => {
7568 Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
7569 }
7570 DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7571 _ => None,
7572 });
7573 let entries = StructArray::new(
7574 entries_fields.clone(),
7575 vec![Arc::new(keys) as ArrayRef, vals],
7576 None,
7577 );
7578 let map =
7579 Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef;
7580 push_like(schema.as_ref(), "map_union", map, &mut fields, &mut columns);
7581 }
7582 {
7583 let fs = match schema.field_with_name("address").unwrap().data_type() {
7584 DataType::Struct(fs) => fs.clone(),
7585 other => panic!("address should be Struct, got {other:?}"),
7586 };
7587 let street = Arc::new(StringArray::from(vec![
7588 "100 Main",
7589 "",
7590 "42 Galaxy Way",
7591 "End Ave",
7592 ])) as ArrayRef;
7593 let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
7594 let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
7595 let arr = Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef;
7596 push_like(schema.as_ref(), "address", arr, &mut fields, &mut columns);
7597 }
7598 {
7599 let fs = match schema.field_with_name("maybe_auth").unwrap().data_type() {
7600 DataType::Struct(fs) => fs.clone(),
7601 other => panic!("maybe_auth should be Struct, got {other:?}"),
7602 };
7603 let user =
7604 Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
7605 let token_values: Vec<Option<&[u8]>> = vec![
7606 None, Some(b"\x01\x02\x03".as_ref()), None, Some(b"".as_ref()), ];
7611 let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
7612 let arr = Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef;
7613 push_like(
7614 schema.as_ref(),
7615 "maybe_auth",
7616 arr,
7617 &mut fields,
7618 &mut columns,
7619 );
7620 }
7621 {
7622 let uf = match schema
7623 .field_with_name("union_enum_record_array_map")
7624 .unwrap()
7625 .data_type()
7626 {
7627 DataType::Union(f, UnionMode::Dense) => f.clone(),
7628 other => panic!("union_enum_record_array_map should be union, got {other:?}"),
7629 };
7630 let mut tid_enum: Option<i8> = None;
7631 let mut tid_rec_a: Option<i8> = None;
7632 let mut tid_array: Option<i8> = None;
7633 let mut tid_map: Option<i8> = None;
7634 let mut map_entry_field: Option<FieldRef> = None;
7635 let mut map_sorted: bool = false;
7636 for (tid, f) in uf.iter() {
7637 match f.data_type() {
7638 DataType::Dictionary(_, _) => tid_enum = Some(tid),
7639 DataType::Struct(childs)
7640 if childs.len() == 2
7641 && childs[0].name() == "a"
7642 && childs[1].name() == "b" =>
7643 {
7644 tid_rec_a = Some(tid)
7645 }
7646 DataType::List(item) if matches!(item.data_type(), DataType::Int64) => {
7647 tid_array = Some(tid)
7648 }
7649 DataType::Map(ef, is_sorted) => {
7650 tid_map = Some(tid);
7651 map_entry_field = Some(ef.clone());
7652 map_sorted = *is_sorted;
7653 }
7654 _ => {}
7655 }
7656 }
7657 let (tid_enum, tid_rec_a, tid_array, tid_map) = (
7658 tid_enum.unwrap(),
7659 tid_rec_a.unwrap(),
7660 tid_array.unwrap(),
7661 tid_map.unwrap(),
7662 );
7663 let tids = vec![tid_enum, tid_rec_a, tid_array, tid_map];
7664 let offs = vec![0, 0, 0, 0];
7665 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7666 DataType::Dictionary(_, _) => {
7667 let keys = Int32Array::from(vec![0i32]);
7668 let values =
7669 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
7670 Some(
7671 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
7672 as ArrayRef,
7673 )
7674 }
7675 DataType::Struct(fs)
7676 if fs.len() == 2 && fs[0].name() == "a" && fs[1].name() == "b" =>
7677 {
7678 let a = Int32Array::from(vec![7]);
7679 let b = StringArray::from(vec!["rec"]);
7680 Some(Arc::new(StructArray::new(
7681 fs.clone(),
7682 vec![Arc::new(a), Arc::new(b)],
7683 None,
7684 )) as ArrayRef)
7685 }
7686 DataType::List(field) => {
7687 let values = Int64Array::from(vec![1i64, 2, 3]);
7688 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
7689 Some(Arc::new(
7690 ListArray::try_new(field.clone(), offsets, Arc::new(values), None).unwrap(),
7691 ) as ArrayRef)
7692 }
7693 DataType::Map(_, _) => {
7694 let entry_field = map_entry_field.clone().unwrap();
7695 let (key_field, val_field) = match entry_field.data_type() {
7696 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7697 _ => unreachable!(),
7698 };
7699 let keys = StringArray::from(vec!["k"]);
7700 let vals = StringArray::from(vec!["v"]);
7701 let entries = StructArray::new(
7702 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7703 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7704 None,
7705 );
7706 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
7707 Some(Arc::new(MapArray::new(
7708 entry_field.clone(),
7709 offsets,
7710 entries,
7711 None,
7712 map_sorted,
7713 )) as ArrayRef)
7714 }
7715 _ => None,
7716 });
7717 push_like(
7718 schema.as_ref(),
7719 "union_enum_record_array_map",
7720 arr,
7721 &mut fields,
7722 &mut columns,
7723 );
7724 }
7725 {
7726 let uf = match schema
7727 .field_with_name("union_date_or_fixed4")
7728 .unwrap()
7729 .data_type()
7730 {
7731 DataType::Union(f, UnionMode::Dense) => f.clone(),
7732 other => panic!("union_date_or_fixed4 should be union, got {other:?}"),
7733 };
7734 let tid_date = tid_by_dt(&uf, |dt| matches!(dt, DataType::Date32));
7735 let tid_fx4 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(4)));
7736 let tids = vec![tid_date, tid_fx4, tid_date, tid_fx4];
7737 let offs = vec![0, 0, 1, 1];
7738 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7739 DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
7740 DataType::FixedSizeBinary(4) => {
7741 let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
7742 Some(Arc::new(
7743 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
7744 ) as ArrayRef)
7745 }
7746 _ => None,
7747 });
7748 push_like(
7749 schema.as_ref(),
7750 "union_date_or_fixed4",
7751 arr,
7752 &mut fields,
7753 &mut columns,
7754 );
7755 }
7756 {
7757 let uf = match schema
7758 .field_with_name("union_interval_or_string")
7759 .unwrap()
7760 .data_type()
7761 {
7762 DataType::Union(f, UnionMode::Dense) => f.clone(),
7763 other => panic!("union_interval_or_string should be union, got {other:?}"),
7764 };
7765 let tid_dur = tid_by_dt(&uf, |dt| {
7766 matches!(dt, DataType::Interval(IntervalUnit::MonthDayNano))
7767 });
7768 let tid_str = tid_by_dt(&uf, |dt| matches!(dt, DataType::Utf8));
7769 let tids = vec![tid_dur, tid_str, tid_dur, tid_str];
7770 let offs = vec![0, 0, 1, 1];
7771 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7772 DataType::Interval(IntervalUnit::MonthDayNano) => Some(Arc::new(
7773 IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
7774 )
7775 as ArrayRef),
7776 DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
7777 "duration-as-text",
7778 "iso-8601-period-P1Y",
7779 ])) as ArrayRef),
7780 _ => None,
7781 });
7782 push_like(
7783 schema.as_ref(),
7784 "union_interval_or_string",
7785 arr,
7786 &mut fields,
7787 &mut columns,
7788 );
7789 }
7790 {
7791 let uf = match schema
7792 .field_with_name("union_uuid_or_fixed10")
7793 .unwrap()
7794 .data_type()
7795 {
7796 DataType::Union(f, UnionMode::Dense) => f.clone(),
7797 other => panic!("union_uuid_or_fixed10 should be union, got {other:?}"),
7798 };
7799 let tid_uuid = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(16)));
7800 let tid_fx10 = tid_by_dt(&uf, |dt| matches!(dt, DataType::FixedSizeBinary(10)));
7801 let tids = vec![tid_uuid, tid_fx10, tid_uuid, tid_fx10];
7802 let offs = vec![0, 0, 1, 1];
7803 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7804 DataType::FixedSizeBinary(16) => {
7805 let it = [Some(uuid1), Some(uuid2)].into_iter();
7806 Some(Arc::new(
7807 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
7808 ) as ArrayRef)
7809 }
7810 DataType::FixedSizeBinary(10) => {
7811 let fx10_a = [0xAAu8; 10];
7812 let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
7813 let it = [Some(fx10_a), Some(fx10_b)].into_iter();
7814 Some(Arc::new(
7815 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
7816 ) as ArrayRef)
7817 }
7818 _ => None,
7819 });
7820 push_like(
7821 schema.as_ref(),
7822 "union_uuid_or_fixed10",
7823 arr,
7824 &mut fields,
7825 &mut columns,
7826 );
7827 }
7828 {
7829 let list_field = match schema
7830 .field_with_name("array_records_with_union")
7831 .unwrap()
7832 .data_type()
7833 {
7834 DataType::List(f) => f.clone(),
7835 other => panic!("array_records_with_union should be List, got {other:?}"),
7836 };
7837 let kv_fields = match list_field.data_type() {
7838 DataType::Struct(fs) => fs.clone(),
7839 other => panic!("array_records_with_union items must be Struct, got {other:?}"),
7840 };
7841 let val_field = kv_fields
7842 .iter()
7843 .find(|f| f.name() == "val")
7844 .unwrap()
7845 .clone();
7846 let uf = match val_field.data_type() {
7847 DataType::Union(f, UnionMode::Dense) => f.clone(),
7848 other => panic!("KV.val should be union, got {other:?}"),
7849 };
7850 let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
7851 let tid_null = tid_by_name(&uf, "null");
7852 let tid_i = tid_by_name(&uf, "int");
7853 let tid_l = tid_by_name(&uf, "long");
7854 let type_ids = vec![tid_i, tid_null, tid_l, tid_null, tid_i];
7855 let offsets = vec![0, 0, 0, 1, 1];
7856 let vals = mk_dense_union(&uf, type_ids, offsets, |f| match f.data_type() {
7857 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
7858 DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
7859 DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
7860 _ => None,
7861 });
7862 let values_struct =
7863 Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None)) as ArrayRef;
7864 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
7865 let arr = Arc::new(
7866 ListArray::try_new(list_field, list_offsets, values_struct, None).unwrap(),
7867 ) as ArrayRef;
7868 push_like(
7869 schema.as_ref(),
7870 "array_records_with_union",
7871 arr,
7872 &mut fields,
7873 &mut columns,
7874 );
7875 }
7876 {
7877 let uf = match schema
7878 .field_with_name("union_map_or_array_int")
7879 .unwrap()
7880 .data_type()
7881 {
7882 DataType::Union(f, UnionMode::Dense) => f.clone(),
7883 other => panic!("union_map_or_array_int should be union, got {other:?}"),
7884 };
7885 let tid_map = tid_by_dt(&uf, |dt| matches!(dt, DataType::Map(_, _)));
7886 let tid_list = tid_by_dt(&uf, |dt| matches!(dt, DataType::List(_)));
7887 let map_child: ArrayRef = {
7888 let (entry_field, is_sorted) = match uf
7889 .iter()
7890 .find(|(tid, _)| *tid == tid_map)
7891 .unwrap()
7892 .1
7893 .data_type()
7894 {
7895 DataType::Map(ef, is_sorted) => (ef.clone(), *is_sorted),
7896 _ => unreachable!(),
7897 };
7898 let (key_field, val_field) = match entry_field.data_type() {
7899 DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
7900 _ => unreachable!(),
7901 };
7902 let keys = StringArray::from(vec!["x", "y", "only"]);
7903 let vals = Int32Array::from(vec![1, 2, 10]);
7904 let entries = StructArray::new(
7905 Fields::from(vec![key_field.as_ref().clone(), val_field.as_ref().clone()]),
7906 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
7907 None,
7908 );
7909 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
7910 Arc::new(MapArray::new(entry_field, moff, entries, None, is_sorted)) as ArrayRef
7911 };
7912 let list_child: ArrayRef = {
7913 let list_field = match uf
7914 .iter()
7915 .find(|(tid, _)| *tid == tid_list)
7916 .unwrap()
7917 .1
7918 .data_type()
7919 {
7920 DataType::List(f) => f.clone(),
7921 _ => unreachable!(),
7922 };
7923 let values = Int32Array::from(vec![1, 2, 3, 0]);
7924 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
7925 Arc::new(ListArray::try_new(list_field, offsets, Arc::new(values), None).unwrap())
7926 as ArrayRef
7927 };
7928 let tids = vec![tid_map, tid_list, tid_map, tid_list];
7929 let offs = vec![0, 0, 1, 1];
7930 let arr = mk_dense_union(&uf, tids, offs, |f| match f.data_type() {
7931 DataType::Map(_, _) => Some(map_child.clone()),
7932 DataType::List(_) => Some(list_child.clone()),
7933 _ => None,
7934 });
7935 push_like(
7936 schema.as_ref(),
7937 "union_map_or_array_int",
7938 arr,
7939 &mut fields,
7940 &mut columns,
7941 );
7942 }
7943 push_like(
7944 schema.as_ref(),
7945 "renamed_with_default",
7946 Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
7947 &mut fields,
7948 &mut columns,
7949 );
7950 {
7951 let fs = match schema.field_with_name("person").unwrap().data_type() {
7952 DataType::Struct(fs) => fs.clone(),
7953 other => panic!("person should be Struct, got {other:?}"),
7954 };
7955 let name =
7956 Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef;
7957 let age = Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef;
7958 let arr = Arc::new(StructArray::new(fs, vec![name, age], None)) as ArrayRef;
7959 push_like(schema.as_ref(), "person", arr, &mut fields, &mut columns);
7960 }
7961 let expected =
7962 RecordBatch::try_new(Arc::new(Schema::new(Fields::from(fields))), columns).unwrap();
7963 assert_eq!(
7964 expected, batch,
7965 "entire RecordBatch mismatch (schema, all columns, all rows)"
7966 );
7967 }
7968 #[test]
7969 fn comprehensive_e2e_resolution_test() {
7970 use serde_json::Value;
7971 use std::collections::HashMap;
7972
        // Builds a reader schema by loading the writer schema JSON from `path` and
        // mutating it to exercise many Avro schema-resolution features at once:
        // field renames resolved via aliases, numeric type promotion (int->long,
        // float->double), union branch reordering at top level / in array items /
        // in map values / inside nested records, nested-field renames with aliases,
        // record name+namespace aliasing, and a full reversal of field order.
        fn make_comprehensive_reader_schema(path: &str) -> AvroSchema {
            // Replaces a field's "type" with the plain string `new_ty`.
            // For a union (JSON array) it rewrites only the first non-"null"
            // string branch or the first object branch, so a "null" branch (and
            // hence optionality) is preserved across the promotion.
            fn set_type_string(f: &mut Value, new_ty: &str) {
                if let Some(ty) = f.get_mut("type") {
                    match ty {
                        Value::String(_) | Value::Object(_) => {
                            *ty = Value::String(new_ty.to_string());
                        }
                        Value::Array(arr) => {
                            for b in arr.iter_mut() {
                                match b {
                                    Value::String(s) if s != "null" => {
                                        *b = Value::String(new_ty.to_string());
                                        break;
                                    }
                                    Value::Object(_) => {
                                        *b = Value::String(new_ty.to_string());
                                        break;
                                    }
                                    _ => {}
                                }
                            }
                        }
                        _ => {}
                    }
                }
            }
            // Reverses the branch order of a top-level union field ("type" is a JSON array).
            fn reverse_union_array(f: &mut Value) {
                if let Some(arr) = f.get_mut("type").and_then(|t| t.as_array_mut()) {
                    arr.reverse();
                }
            }
            // Reverses the union branch order of an array type's "items".
            fn reverse_items_union(f: &mut Value) {
                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
                    if let Some(items) = obj.get_mut("items").and_then(|v| v.as_array_mut()) {
                        items.reverse();
                    }
                }
            }
            // Reverses the union branch order of a map type's "values".
            fn reverse_map_values_union(f: &mut Value) {
                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
                    if let Some(values) = obj.get_mut("values").and_then(|v| v.as_array_mut()) {
                        values.reverse();
                    }
                }
            }
            // Within a nested record type, reverses the union of the sub-field
            // named `field_name` (no-op if the field or union is absent).
            fn reverse_nested_union_in_record(f: &mut Value, field_name: &str) {
                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
                    if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
                        for ff in fields.iter_mut() {
                            if ff.get("name").and_then(|n| n.as_str()) == Some(field_name) {
                                if let Some(ty) = ff.get_mut("type") {
                                    if let Some(arr) = ty.as_array_mut() {
                                        arr.reverse();
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // Renames nested record field `old` to `new`, recording `old` as an
            // alias so Avro resolution still matches the writer's field.
            fn rename_nested_field_with_alias(f: &mut Value, old: &str, new: &str) {
                if let Some(obj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
                    if let Some(fields) = obj.get_mut("fields").and_then(|v| v.as_array_mut()) {
                        for ff in fields.iter_mut() {
                            if ff.get("name").and_then(|n| n.as_str()) == Some(old) {
                                ff["name"] = Value::String(new.to_string());
                                ff["aliases"] = Value::Array(vec![Value::String(old.to_string())]);
                            }
                        }
                    }
                }
            }
            let mut root = load_writer_schema_json(path);
            assert_eq!(root["type"], "record", "writer schema must be a record");
            let fields = root
                .get_mut("fields")
                .and_then(|f| f.as_array_mut())
                .expect("record has fields");
            // Apply one targeted mutation per field of interest; unnamed or
            // unlisted fields pass through unchanged.
            for f in fields.iter_mut() {
                let Some(name) = f.get("name").and_then(|n| n.as_str()) else {
                    continue;
                };
                match name {
                    // Top-level field renames resolved through aliases.
                    "id" => {
                        f["name"] = Value::String("identifier".into());
                        f["aliases"] = Value::Array(vec![Value::String("id".into())]);
                    }
                    "renamed_with_default" => {
                        f["name"] = Value::String("old_count".into());
                        f["aliases"] =
                            Value::Array(vec![Value::String("renamed_with_default".into())]);
                    }
                    // Numeric promotions permitted by Avro resolution rules.
                    "count_i32" => set_type_string(f, "long"),
                    "ratio_f32" => set_type_string(f, "double"),
                    // Union branch reordering in various positions.
                    "opt_str_nullsecond" => reverse_union_array(f),
                    "union_enum_record_array_map" => reverse_union_array(f),
                    "union_date_or_fixed4" => reverse_union_array(f),
                    "union_interval_or_string" => reverse_union_array(f),
                    "union_uuid_or_fixed10" => reverse_union_array(f),
                    "union_map_or_array_int" => reverse_union_array(f),
                    "maybe_auth" => reverse_nested_union_in_record(f, "token"),
                    "arr_union" => reverse_items_union(f),
                    "map_union" => reverse_map_values_union(f),
                    "address" => rename_nested_field_with_alias(f, "street", "street_name"),
                    // Rename the nested record type itself, keeping the writer's
                    // names reachable through record-level aliases.
                    "person" => {
                        if let Some(tobj) = f.get_mut("type").and_then(|t| t.as_object_mut()) {
                            tobj.insert("name".to_string(), Value::String("Person".into()));
                            tobj.insert(
                                "namespace".to_string(),
                                Value::String("com.example".into()),
                            );
                            tobj.insert(
                                "aliases".into(),
                                Value::Array(vec![
                                    Value::String("PersonV2".into()),
                                    Value::String("com.example.v2.PersonV2".into()),
                                ]),
                            );
                        }
                    }
                    _ => {}
                }
            }
            // Reverse the top-level field order so resolution must match by name,
            // not position.
            fields.reverse();
            AvroSchema::new(root.to_string())
        }
8117
8118 let path = "test/data/comprehensive_e2e.avro";
8119 let reader_schema = make_comprehensive_reader_schema(path);
8120 let batch = read_alltypes_with_reader_schema(path, reader_schema.clone());
8121
8122 const UUID_EXT_KEY: &str = "ARROW:extension:name";
8123 const UUID_LOGICAL_KEY: &str = "logicalType";
8124
8125 let uuid_md_top: Option<HashMap<String, String>> = batch
8126 .schema()
8127 .field_with_name("uuid_str")
8128 .ok()
8129 .and_then(|f| {
8130 let md = f.metadata();
8131 let has_ext = md.get(UUID_EXT_KEY).is_some();
8132 let is_uuid_logical = md
8133 .get(UUID_LOGICAL_KEY)
8134 .map(|v| v.trim_matches('"') == "uuid")
8135 .unwrap_or(false);
8136 if has_ext || is_uuid_logical {
8137 Some(md.clone())
8138 } else {
8139 None
8140 }
8141 });
8142
8143 let uuid_md_union: Option<HashMap<String, String>> = batch
8144 .schema()
8145 .field_with_name("union_uuid_or_fixed10")
8146 .ok()
8147 .and_then(|f| match f.data_type() {
8148 DataType::Union(uf, _) => uf
8149 .iter()
8150 .find(|(_, child)| child.name() == "uuid")
8151 .and_then(|(_, child)| {
8152 let md = child.metadata();
8153 let has_ext = md.get(UUID_EXT_KEY).is_some();
8154 let is_uuid_logical = md
8155 .get(UUID_LOGICAL_KEY)
8156 .map(|v| v.trim_matches('"') == "uuid")
8157 .unwrap_or(false);
8158 if has_ext || is_uuid_logical {
8159 Some(md.clone())
8160 } else {
8161 None
8162 }
8163 }),
8164 _ => None,
8165 });
8166
8167 let add_uuid_ext_top = |f: Field| -> Field {
8168 if let Some(md) = &uuid_md_top {
8169 f.with_metadata(md.clone())
8170 } else {
8171 f
8172 }
8173 };
8174 let add_uuid_ext_union = |f: Field| -> Field {
8175 if let Some(md) = &uuid_md_union {
8176 f.with_metadata(md.clone())
8177 } else {
8178 f
8179 }
8180 };
8181
8182 #[inline]
8183 fn uuid16_from_str(s: &str) -> [u8; 16] {
8184 let mut out = [0u8; 16];
8185 let mut idx = 0usize;
8186 let mut hi: Option<u8> = None;
8187 for ch in s.chars() {
8188 if ch == '-' {
8189 continue;
8190 }
8191 let v = ch.to_digit(16).expect("invalid hex digit in UUID") as u8;
8192 if let Some(h) = hi {
8193 out[idx] = (h << 4) | v;
8194 idx += 1;
8195 hi = None;
8196 } else {
8197 hi = Some(v);
8198 }
8199 }
8200 assert_eq!(idx, 16, "UUID must decode to 16 bytes");
8201 out
8202 }
8203
        // Builds a dense UnionArray over `fields` with the given `type_ids` and
        // `offsets` buffers. The `provide` closure supplies the child array for a
        // branch; any branch it returns `None` for gets a zero-length child built
        // by `empty_child_for`, so every union branch has a valid (possibly empty)
        // child array as UnionArray::try_new requires.
        fn mk_dense_union(
            fields: &UnionFields,
            type_ids: Vec<i8>,
            offsets: Vec<i32>,
            provide: impl Fn(&Field) -> Option<ArrayRef>,
        ) -> ArrayRef {
            // Produces an empty array of the requested DataType, recursing for
            // nested struct/list/map children. Panics on types this test suite
            // does not construct.
            fn empty_child_for(dt: &DataType) -> Arc<dyn Array> {
                match dt {
                    DataType::Null => Arc::new(NullArray::new(0)),
                    DataType::Boolean => Arc::new(BooleanArray::from(Vec::<bool>::new())),
                    DataType::Int32 => Arc::new(Int32Array::from(Vec::<i32>::new())),
                    DataType::Int64 => Arc::new(Int64Array::from(Vec::<i64>::new())),
                    DataType::Float32 => Arc::new(Float32Array::from(Vec::<f32>::new())),
                    DataType::Float64 => Arc::new(Float64Array::from(Vec::<f64>::new())),
                    DataType::Binary => Arc::new(BinaryArray::from(Vec::<&[u8]>::new())),
                    DataType::Utf8 => Arc::new(StringArray::from(Vec::<&str>::new())),
                    DataType::Date32 => Arc::new(Date32Array::from(Vec::<i32>::new())),
                    DataType::Time32(arrow_schema::TimeUnit::Millisecond) => {
                        Arc::new(Time32MillisecondArray::from(Vec::<i32>::new()))
                    }
                    DataType::Time64(arrow_schema::TimeUnit::Microsecond) => {
                        Arc::new(Time64MicrosecondArray::from(Vec::<i64>::new()))
                    }
                    // Timestamps must carry the branch's timezone (if any) for the
                    // DataType to match exactly.
                    DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, tz) => {
                        let a = TimestampMillisecondArray::from(Vec::<i64>::new());
                        Arc::new(if let Some(tz) = tz {
                            a.with_timezone(tz.clone())
                        } else {
                            a
                        })
                    }
                    DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, tz) => {
                        let a = TimestampMicrosecondArray::from(Vec::<i64>::new());
                        Arc::new(if let Some(tz) = tz {
                            a.with_timezone(tz.clone())
                        } else {
                            a
                        })
                    }
                    DataType::Interval(IntervalUnit::MonthDayNano) => Arc::new(
                        IntervalMonthDayNanoArray::from(Vec::<IntervalMonthDayNano>::new()),
                    ),
                    DataType::FixedSizeBinary(sz) => Arc::new(
                        FixedSizeBinaryArray::try_from_sparse_iter_with_size(
                            std::iter::empty::<Option<Vec<u8>>>(),
                            *sz,
                        )
                        .unwrap(),
                    ),
                    // Enum branches decode as Int32-keyed string dictionaries in
                    // this test suite, regardless of the declared key/value types.
                    DataType::Dictionary(_, _) => {
                        let keys = Int32Array::from(Vec::<i32>::new());
                        let values = Arc::new(StringArray::from(Vec::<&str>::new()));
                        Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
                    }
                    DataType::Struct(fields) => {
                        let children: Vec<ArrayRef> = fields
                            .iter()
                            .map(|f| empty_child_for(f.data_type()) as ArrayRef)
                            .collect();
                        Arc::new(StructArray::new(fields.clone(), children, None))
                    }
                    DataType::List(field) => {
                        // A single 0 offset encodes a list array with zero lists.
                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
                        Arc::new(
                            ListArray::try_new(
                                field.clone(),
                                offsets,
                                empty_child_for(field.data_type()),
                                None,
                            )
                            .unwrap(),
                        )
                    }
                    DataType::Map(entry_field, is_sorted) => {
                        // Map entries are a struct of (key, value); split the two
                        // child fields out so the value type can be dispatched on.
                        let (key_field, val_field) = match entry_field.data_type() {
                            DataType::Struct(fs) => (fs[0].clone(), fs[1].clone()),
                            other => panic!("unexpected map entries type: {other:?}"),
                        };
                        let keys = StringArray::from(Vec::<&str>::new());
                        let vals: ArrayRef = match val_field.data_type() {
                            DataType::Null => Arc::new(NullArray::new(0)) as ArrayRef,
                            DataType::Boolean => {
                                Arc::new(BooleanArray::from(Vec::<bool>::new())) as ArrayRef
                            }
                            DataType::Int32 => {
                                Arc::new(Int32Array::from(Vec::<i32>::new())) as ArrayRef
                            }
                            DataType::Int64 => {
                                Arc::new(Int64Array::from(Vec::<i64>::new())) as ArrayRef
                            }
                            DataType::Float32 => {
                                Arc::new(Float32Array::from(Vec::<f32>::new())) as ArrayRef
                            }
                            DataType::Float64 => {
                                Arc::new(Float64Array::from(Vec::<f64>::new())) as ArrayRef
                            }
                            DataType::Utf8 => {
                                Arc::new(StringArray::from(Vec::<&str>::new())) as ArrayRef
                            }
                            DataType::Binary => {
                                Arc::new(BinaryArray::from(Vec::<&[u8]>::new())) as ArrayRef
                            }
                            // Map values may themselves be unions; build an empty
                            // dense union with empty children for each branch.
                            DataType::Union(uf, _) => {
                                let children: Vec<ArrayRef> = uf
                                    .iter()
                                    .map(|(_, f)| empty_child_for(f.data_type()))
                                    .collect();
                                Arc::new(
                                    UnionArray::try_new(
                                        uf.clone(),
                                        ScalarBuffer::<i8>::from(Vec::<i8>::new()),
                                        Some(ScalarBuffer::<i32>::from(Vec::<i32>::new())),
                                        children,
                                    )
                                    .unwrap(),
                                ) as ArrayRef
                            }
                            other => panic!("unsupported map value type: {other:?}"),
                        };
                        let entries = StructArray::new(
                            Fields::from(vec![
                                key_field.as_ref().clone(),
                                val_field.as_ref().clone(),
                            ]),
                            vec![Arc::new(keys) as ArrayRef, vals],
                            None,
                        );
                        let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0]));
                        Arc::new(MapArray::new(
                            entry_field.clone(),
                            offsets,
                            entries,
                            None,
                            *is_sorted,
                        ))
                    }
                    other => panic!("empty_child_for: unhandled type {other:?}"),
                }
            }
            // One child per branch, in union-field order: caller-provided where
            // available, otherwise an empty placeholder of the branch's type.
            let children: Vec<ArrayRef> = fields
                .iter()
                .map(|(_, f)| provide(f).unwrap_or_else(|| empty_child_for(f.data_type())))
                .collect();
            Arc::new(
                UnionArray::try_new(
                    fields.clone(),
                    ScalarBuffer::<i8>::from(type_ids),
                    Some(ScalarBuffer::<i32>::from(offsets)),
                    children,
                )
                .unwrap(),
            ) as ArrayRef
        }
8357 let date_a: i32 = 19_000; let time_ms_a: i32 = 12 * 3_600_000 + 34 * 60_000 + 56_000 + 789;
8359 let time_us_eod: i64 = 86_400_000_000 - 1;
8360 let ts_ms_2024_01_01: i64 = 1_704_067_200_000; let ts_us_2024_01_01: i64 = ts_ms_2024_01_01 * 1_000;
8362 let dur_small = IntervalMonthDayNanoType::make_value(1, 2, 3_000_000_000);
8363 let dur_zero = IntervalMonthDayNanoType::make_value(0, 0, 0);
8364 let dur_large =
8365 IntervalMonthDayNanoType::make_value(12, 31, ((86_400_000 - 1) as i64) * 1_000_000);
8366 let dur_2years = IntervalMonthDayNanoType::make_value(24, 0, 0);
8367 let uuid1 = uuid16_from_str("fe7bc30b-4ce8-4c5e-b67c-2234a2d38e66");
8368 let uuid2 = uuid16_from_str("0826cc06-d2e3-4599-b4ad-af5fa6905cdb");
8369 let item_name = Field::LIST_FIELD_DEFAULT_NAME;
8370 let uf_tri = UnionFields::try_new(
8371 vec![0, 1, 2],
8372 vec![
8373 Field::new("int", DataType::Int32, false),
8374 Field::new("string", DataType::Utf8, false),
8375 Field::new("boolean", DataType::Boolean, false),
8376 ],
8377 )
8378 .unwrap();
8379 let uf_arr_items = UnionFields::try_new(
8380 vec![0, 1, 2],
8381 vec![
8382 Field::new("null", DataType::Null, false),
8383 Field::new("string", DataType::Utf8, false),
8384 Field::new("long", DataType::Int64, false),
8385 ],
8386 )
8387 .unwrap();
8388 let arr_items_field = Arc::new(Field::new(
8389 item_name,
8390 DataType::Union(uf_arr_items.clone(), UnionMode::Dense),
8391 true,
8392 ));
8393 let uf_map_vals = UnionFields::try_new(
8394 vec![0, 1, 2],
8395 vec![
8396 Field::new("string", DataType::Utf8, false),
8397 Field::new("double", DataType::Float64, false),
8398 Field::new("null", DataType::Null, false),
8399 ],
8400 )
8401 .unwrap();
8402 let map_entries_field = Arc::new(Field::new(
8403 "entries",
8404 DataType::Struct(Fields::from(vec![
8405 Field::new("key", DataType::Utf8, false),
8406 Field::new(
8407 "value",
8408 DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
8409 true,
8410 ),
8411 ])),
8412 false,
8413 ));
8414 let mut enum_md_color = {
8416 let mut m = HashMap::<String, String>::new();
8417 m.insert(
8418 crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8419 serde_json::to_string(&vec!["RED", "GREEN", "BLUE"]).unwrap(),
8420 );
8421 m
8422 };
8423 enum_md_color.insert(AVRO_NAME_METADATA_KEY.to_string(), "Color".to_string());
8424 enum_md_color.insert(
8425 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8426 "org.apache.arrow.avrotests.v1.types".to_string(),
8427 );
8428 let union_rec_a_fields = Fields::from(vec![
8429 Field::new("a", DataType::Int32, false),
8430 Field::new("b", DataType::Utf8, false),
8431 ]);
8432 let union_rec_b_fields = Fields::from(vec![
8433 Field::new("x", DataType::Int64, false),
8434 Field::new("y", DataType::Binary, false),
8435 ]);
8436 let union_map_entries = Arc::new(Field::new(
8437 "entries",
8438 DataType::Struct(Fields::from(vec![
8439 Field::new("key", DataType::Utf8, false),
8440 Field::new("value", DataType::Utf8, false),
8441 ])),
8442 false,
8443 ));
8444 let rec_a_md = {
8445 let mut m = HashMap::<String, String>::new();
8446 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecA".to_string());
8447 m.insert(
8448 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8449 "org.apache.arrow.avrotests.v1.types".to_string(),
8450 );
8451 m
8452 };
8453 let rec_b_md = {
8454 let mut m = HashMap::<String, String>::new();
8455 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "RecB".to_string());
8456 m.insert(
8457 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8458 "org.apache.arrow.avrotests.v1.types".to_string(),
8459 );
8460 m
8461 };
8462 let uf_union_big = UnionFields::try_new(
8463 vec![0, 1, 2, 3, 4],
8464 vec![
8465 Field::new(
8466 "map",
8467 DataType::Map(union_map_entries.clone(), false),
8468 false,
8469 ),
8470 Field::new(
8471 "array",
8472 DataType::List(Arc::new(Field::new(item_name, DataType::Int64, false))),
8473 false,
8474 ),
8475 Field::new(
8476 "org.apache.arrow.avrotests.v1.types.RecB",
8477 DataType::Struct(union_rec_b_fields.clone()),
8478 false,
8479 )
8480 .with_metadata(rec_b_md.clone()),
8481 Field::new(
8482 "org.apache.arrow.avrotests.v1.types.RecA",
8483 DataType::Struct(union_rec_a_fields.clone()),
8484 false,
8485 )
8486 .with_metadata(rec_a_md.clone()),
8487 Field::new(
8488 "org.apache.arrow.avrotests.v1.types.Color",
8489 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8490 false,
8491 )
8492 .with_metadata(enum_md_color.clone()),
8493 ],
8494 )
8495 .unwrap();
8496 let fx4_md = {
8497 let mut m = HashMap::<String, String>::new();
8498 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx4".to_string());
8499 m.insert(
8500 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8501 "org.apache.arrow.avrotests.v1".to_string(),
8502 );
8503 m
8504 };
8505 let uf_date_fixed4 = UnionFields::try_new(
8506 vec![0, 1],
8507 vec![
8508 Field::new(
8509 "org.apache.arrow.avrotests.v1.Fx4",
8510 DataType::FixedSizeBinary(4),
8511 false,
8512 )
8513 .with_metadata(fx4_md.clone()),
8514 Field::new("date", DataType::Date32, false),
8515 ],
8516 )
8517 .unwrap();
8518 let dur12u_md = {
8519 let mut m = HashMap::<String, String>::new();
8520 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12U".to_string());
8521 m.insert(
8522 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8523 "org.apache.arrow.avrotests.v1".to_string(),
8524 );
8525 m
8526 };
8527 let uf_dur_or_str = UnionFields::try_new(
8528 vec![0, 1],
8529 vec![
8530 Field::new("string", DataType::Utf8, false),
8531 Field::new(
8532 "org.apache.arrow.avrotests.v1.Dur12U",
8533 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano),
8534 false,
8535 )
8536 .with_metadata(dur12u_md.clone()),
8537 ],
8538 )
8539 .unwrap();
8540 let fx10_md = {
8541 let mut m = HashMap::<String, String>::new();
8542 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx10".to_string());
8543 m.insert(
8544 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8545 "org.apache.arrow.avrotests.v1".to_string(),
8546 );
8547 m
8548 };
8549 let uf_uuid_or_fx10 = UnionFields::try_new(
8550 vec![0, 1],
8551 vec![
8552 Field::new(
8553 "org.apache.arrow.avrotests.v1.Fx10",
8554 DataType::FixedSizeBinary(10),
8555 false,
8556 )
8557 .with_metadata(fx10_md.clone()),
8558 add_uuid_ext_union(Field::new("uuid", DataType::FixedSizeBinary(16), false)),
8559 ],
8560 )
8561 .unwrap();
8562 let uf_kv_val = UnionFields::try_new(
8563 vec![0, 1, 2],
8564 vec![
8565 Field::new("null", DataType::Null, false),
8566 Field::new("int", DataType::Int32, false),
8567 Field::new("long", DataType::Int64, false),
8568 ],
8569 )
8570 .unwrap();
8571 let kv_fields = Fields::from(vec![
8572 Field::new("key", DataType::Utf8, false),
8573 Field::new(
8574 "val",
8575 DataType::Union(uf_kv_val.clone(), UnionMode::Dense),
8576 true,
8577 ),
8578 ]);
8579 let kv_item_field = Arc::new(Field::new(
8580 item_name,
8581 DataType::Struct(kv_fields.clone()),
8582 false,
8583 ));
8584 let map_int_entries = Arc::new(Field::new(
8585 "entries",
8586 DataType::Struct(Fields::from(vec![
8587 Field::new("key", DataType::Utf8, false),
8588 Field::new("value", DataType::Int32, false),
8589 ])),
8590 false,
8591 ));
8592 let uf_map_or_array = UnionFields::try_new(
8593 vec![0, 1],
8594 vec![
8595 Field::new(
8596 "array",
8597 DataType::List(Arc::new(Field::new(item_name, DataType::Int32, false))),
8598 false,
8599 ),
8600 Field::new("map", DataType::Map(map_int_entries.clone(), false), false),
8601 ],
8602 )
8603 .unwrap();
8604 let mut enum_md_status = {
8605 let mut m = HashMap::<String, String>::new();
8606 m.insert(
8607 crate::schema::AVRO_ENUM_SYMBOLS_METADATA_KEY.to_string(),
8608 serde_json::to_string(&vec!["UNKNOWN", "NEW", "PROCESSING", "DONE"]).unwrap(),
8609 );
8610 m
8611 };
8612 enum_md_status.insert(AVRO_NAME_METADATA_KEY.to_string(), "Status".to_string());
8613 enum_md_status.insert(
8614 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8615 "org.apache.arrow.avrotests.v1.types".to_string(),
8616 );
8617 let mut dec20_md = HashMap::<String, String>::new();
8618 dec20_md.insert("precision".to_string(), "20".to_string());
8619 dec20_md.insert("scale".to_string(), "4".to_string());
8620 dec20_md.insert(AVRO_NAME_METADATA_KEY.to_string(), "DecFix20".to_string());
8621 dec20_md.insert(
8622 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8623 "org.apache.arrow.avrotests.v1.types".to_string(),
8624 );
8625 let mut dec10_md = HashMap::<String, String>::new();
8626 dec10_md.insert("precision".to_string(), "10".to_string());
8627 dec10_md.insert("scale".to_string(), "2".to_string());
8628 let fx16_top_md = {
8629 let mut m = HashMap::<String, String>::new();
8630 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Fx16".to_string());
8631 m.insert(
8632 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8633 "org.apache.arrow.avrotests.v1.types".to_string(),
8634 );
8635 m
8636 };
8637 let dur12_top_md = {
8638 let mut m = HashMap::<String, String>::new();
8639 m.insert(AVRO_NAME_METADATA_KEY.to_string(), "Dur12".to_string());
8640 m.insert(
8641 AVRO_NAMESPACE_METADATA_KEY.to_string(),
8642 "org.apache.arrow.avrotests.v1.types".to_string(),
8643 );
8644 m
8645 };
8646 #[cfg(feature = "small_decimals")]
8647 let dec20_dt = DataType::Decimal128(20, 4);
8648 #[cfg(not(feature = "small_decimals"))]
8649 let dec20_dt = DataType::Decimal128(20, 4);
8650 #[cfg(feature = "small_decimals")]
8651 let dec10_dt = DataType::Decimal64(10, 2);
8652 #[cfg(not(feature = "small_decimals"))]
8653 let dec10_dt = DataType::Decimal128(10, 2);
8654 let fields: Vec<FieldRef> = vec![
8655 Arc::new(Field::new(
8656 "person",
8657 DataType::Struct(Fields::from(vec![
8658 Field::new("name", DataType::Utf8, false),
8659 Field::new("age", DataType::Int32, false),
8660 ])),
8661 false,
8662 )),
8663 Arc::new(Field::new("old_count", DataType::Int32, false)),
8664 Arc::new(Field::new(
8665 "union_map_or_array_int",
8666 DataType::Union(uf_map_or_array.clone(), UnionMode::Dense),
8667 false,
8668 )),
8669 Arc::new(Field::new(
8670 "array_records_with_union",
8671 DataType::List(kv_item_field.clone()),
8672 false,
8673 )),
8674 Arc::new(Field::new(
8675 "union_uuid_or_fixed10",
8676 DataType::Union(uf_uuid_or_fx10.clone(), UnionMode::Dense),
8677 false,
8678 )),
8679 Arc::new(Field::new(
8680 "union_interval_or_string",
8681 DataType::Union(uf_dur_or_str.clone(), UnionMode::Dense),
8682 false,
8683 )),
8684 Arc::new(Field::new(
8685 "union_date_or_fixed4",
8686 DataType::Union(uf_date_fixed4.clone(), UnionMode::Dense),
8687 false,
8688 )),
8689 Arc::new(Field::new(
8690 "union_enum_record_array_map",
8691 DataType::Union(uf_union_big.clone(), UnionMode::Dense),
8692 false,
8693 )),
8694 Arc::new(Field::new(
8695 "maybe_auth",
8696 DataType::Struct(Fields::from(vec![
8697 Field::new("user", DataType::Utf8, false),
8698 Field::new("token", DataType::Binary, true), ])),
8700 false,
8701 )),
8702 Arc::new(Field::new(
8703 "address",
8704 DataType::Struct(Fields::from(vec![
8705 Field::new("street_name", DataType::Utf8, false),
8706 Field::new("zip", DataType::Int32, false),
8707 Field::new("country", DataType::Utf8, false),
8708 ])),
8709 false,
8710 )),
8711 Arc::new(Field::new(
8712 "map_union",
8713 DataType::Map(map_entries_field.clone(), false),
8714 false,
8715 )),
8716 Arc::new(Field::new(
8717 "arr_union",
8718 DataType::List(arr_items_field.clone()),
8719 false,
8720 )),
8721 Arc::new(
8722 Field::new(
8723 "status",
8724 DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
8725 false,
8726 )
8727 .with_metadata(enum_md_status.clone()),
8728 ),
8729 Arc::new(
8730 Field::new(
8731 "interval_mdn",
8732 DataType::Interval(IntervalUnit::MonthDayNano),
8733 false,
8734 )
8735 .with_metadata(dur12_top_md.clone()),
8736 ),
8737 Arc::new(Field::new(
8738 "ts_micros_local",
8739 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None),
8740 false,
8741 )),
8742 Arc::new(Field::new(
8743 "ts_millis_local",
8744 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None),
8745 false,
8746 )),
8747 Arc::new(Field::new(
8748 "ts_micros_utc",
8749 DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, Some("+00:00".into())),
8750 false,
8751 )),
8752 Arc::new(Field::new(
8753 "ts_millis_utc",
8754 DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, Some("+00:00".into())),
8755 false,
8756 )),
8757 Arc::new(Field::new(
8758 "t_micros",
8759 DataType::Time64(arrow_schema::TimeUnit::Microsecond),
8760 false,
8761 )),
8762 Arc::new(Field::new(
8763 "t_millis",
8764 DataType::Time32(arrow_schema::TimeUnit::Millisecond),
8765 false,
8766 )),
8767 Arc::new(Field::new("d_date", DataType::Date32, false)),
8768 Arc::new(add_uuid_ext_top(Field::new(
8769 "uuid_str",
8770 DataType::FixedSizeBinary(16),
8771 false,
8772 ))),
8773 Arc::new(Field::new("dec_fix_s20_4", dec20_dt, false).with_metadata(dec20_md.clone())),
8774 Arc::new(
8775 Field::new("dec_bytes_s10_2", dec10_dt, false).with_metadata(dec10_md.clone()),
8776 ),
8777 Arc::new(
8778 Field::new("fx16_plain", DataType::FixedSizeBinary(16), false)
8779 .with_metadata(fx16_top_md.clone()),
8780 ),
8781 Arc::new(Field::new("raw_bytes", DataType::Binary, false)),
8782 Arc::new(Field::new("str_utf8", DataType::Utf8, false)),
8783 Arc::new(Field::new(
8784 "tri_union_prim",
8785 DataType::Union(uf_tri.clone(), UnionMode::Dense),
8786 false,
8787 )),
8788 Arc::new(Field::new("opt_str_nullsecond", DataType::Utf8, true)),
8789 Arc::new(Field::new("opt_i32_nullfirst", DataType::Int32, true)),
8790 Arc::new(Field::new("count_i64", DataType::Int64, false)),
8791 Arc::new(Field::new("count_i32", DataType::Int64, false)),
8792 Arc::new(Field::new("ratio_f64", DataType::Float64, false)),
8793 Arc::new(Field::new("ratio_f32", DataType::Float64, false)),
8794 Arc::new(Field::new("flag", DataType::Boolean, false)),
8795 Arc::new(Field::new("identifier", DataType::Int64, false)),
8796 ];
8797 let expected_schema = Arc::new(arrow_schema::Schema::new(Fields::from(fields)));
8798 let mut cols: Vec<ArrayRef> = vec![
8799 Arc::new(StructArray::new(
8800 match expected_schema
8801 .field_with_name("person")
8802 .unwrap()
8803 .data_type()
8804 {
8805 DataType::Struct(fs) => fs.clone(),
8806 _ => unreachable!(),
8807 },
8808 vec![
8809 Arc::new(StringArray::from(vec!["Alice", "Bob", "Carol", "Dave"])) as ArrayRef,
8810 Arc::new(Int32Array::from(vec![30, 0, 25, 41])) as ArrayRef,
8811 ],
8812 None,
8813 )) as ArrayRef,
8814 Arc::new(Int32Array::from(vec![100, 42, 7, 42])) as ArrayRef,
8815 ];
8816 {
8817 let map_child: ArrayRef = {
8818 let keys = StringArray::from(vec!["x", "y", "only"]);
8819 let vals = Int32Array::from(vec![1, 2, 10]);
8820 let entries = StructArray::new(
8821 Fields::from(vec![
8822 Field::new("key", DataType::Utf8, false),
8823 Field::new("value", DataType::Int32, false),
8824 ]),
8825 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8826 None,
8827 );
8828 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3]));
8829 Arc::new(MapArray::new(
8830 map_int_entries.clone(),
8831 moff,
8832 entries,
8833 None,
8834 false,
8835 )) as ArrayRef
8836 };
8837 let list_child: ArrayRef = {
8838 let values = Int32Array::from(vec![1, 2, 3, 0]);
8839 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4]));
8840 Arc::new(
8841 ListArray::try_new(
8842 Arc::new(Field::new(item_name, DataType::Int32, false)),
8843 offsets,
8844 Arc::new(values),
8845 None,
8846 )
8847 .unwrap(),
8848 ) as ArrayRef
8849 };
8850 let tids = vec![1, 0, 1, 0];
8851 let offs = vec![0, 0, 1, 1];
8852 let arr = mk_dense_union(&uf_map_or_array, tids, offs, |f| match f.name().as_str() {
8853 "array" => Some(list_child.clone()),
8854 "map" => Some(map_child.clone()),
8855 _ => None,
8856 });
8857 cols.push(arr);
8858 }
8859 {
8860 let keys = Arc::new(StringArray::from(vec!["k1", "k2", "k", "k3", "x"])) as ArrayRef;
8861 let type_ids = vec![1, 0, 2, 0, 1];
8862 let offsets = vec![0, 0, 0, 1, 1];
8863 let vals = mk_dense_union(&uf_kv_val, type_ids, offsets, |f| match f.data_type() {
8864 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![5, -5])) as ArrayRef),
8865 DataType::Int64 => Some(Arc::new(Int64Array::from(vec![99i64])) as ArrayRef),
8866 DataType::Null => Some(Arc::new(NullArray::new(2)) as ArrayRef),
8867 _ => None,
8868 });
8869 let values_struct =
8870 Arc::new(StructArray::new(kv_fields.clone(), vec![keys, vals], None));
8871 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 2, 3, 4, 5]));
8872 let arr = Arc::new(
8873 ListArray::try_new(kv_item_field.clone(), list_offsets, values_struct, None)
8874 .unwrap(),
8875 ) as ArrayRef;
8876 cols.push(arr);
8877 }
8878 {
8879 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8881 let arr = mk_dense_union(&uf_uuid_or_fx10, type_ids, offs, |f| match f.data_type() {
8882 DataType::FixedSizeBinary(16) => {
8883 let it = [Some(uuid1), Some(uuid2)].into_iter();
8884 Some(Arc::new(
8885 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
8886 ) as ArrayRef)
8887 }
8888 DataType::FixedSizeBinary(10) => {
8889 let fx10_a = [0xAAu8; 10];
8890 let fx10_b = [0x00u8, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99];
8891 let it = [Some(fx10_a), Some(fx10_b)].into_iter();
8892 Some(Arc::new(
8893 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 10).unwrap(),
8894 ) as ArrayRef)
8895 }
8896 _ => None,
8897 });
8898 cols.push(arr);
8899 }
8900 {
8901 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8903 let arr = mk_dense_union(&uf_dur_or_str, type_ids, offs, |f| match f.data_type() {
8904 DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => Some(Arc::new(
8905 IntervalMonthDayNanoArray::from(vec![dur_small, dur_large]),
8906 )
8907 as ArrayRef),
8908 DataType::Utf8 => Some(Arc::new(StringArray::from(vec![
8909 "duration-as-text",
8910 "iso-8601-period-P1Y",
8911 ])) as ArrayRef),
8912 _ => None,
8913 });
8914 cols.push(arr);
8915 }
8916 {
8917 let type_ids = vec![1, 0, 1, 0]; let offs = vec![0, 0, 1, 1];
8919 let arr = mk_dense_union(&uf_date_fixed4, type_ids, offs, |f| match f.data_type() {
8920 DataType::Date32 => Some(Arc::new(Date32Array::from(vec![date_a, 0])) as ArrayRef),
8921 DataType::FixedSizeBinary(4) => {
8922 let it = [Some(*b"\x00\x11\x22\x33"), Some(*b"ABCD")].into_iter();
8923 Some(Arc::new(
8924 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 4).unwrap(),
8925 ) as ArrayRef)
8926 }
8927 _ => None,
8928 });
8929 cols.push(arr);
8930 }
8931 {
8932 let tids = vec![4, 3, 1, 0]; let offs = vec![0, 0, 0, 0];
8934 let arr = mk_dense_union(&uf_union_big, tids, offs, |f| match f.data_type() {
8935 DataType::Dictionary(_, _) => {
8936 let keys = Int32Array::from(vec![0i32]);
8937 let values =
8938 Arc::new(StringArray::from(vec!["RED", "GREEN", "BLUE"])) as ArrayRef;
8939 Some(
8940 Arc::new(DictionaryArray::<Int32Type>::try_new(keys, values).unwrap())
8941 as ArrayRef,
8942 )
8943 }
8944 DataType::Struct(fs) if fs == &union_rec_a_fields => {
8945 let a = Int32Array::from(vec![7]);
8946 let b = StringArray::from(vec!["rec"]);
8947 Some(Arc::new(StructArray::new(
8948 fs.clone(),
8949 vec![Arc::new(a) as ArrayRef, Arc::new(b) as ArrayRef],
8950 None,
8951 )) as ArrayRef)
8952 }
8953 DataType::List(_) => {
8954 let values = Int64Array::from(vec![1i64, 2, 3]);
8955 let offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3]));
8956 Some(Arc::new(
8957 ListArray::try_new(
8958 Arc::new(Field::new(item_name, DataType::Int64, false)),
8959 offsets,
8960 Arc::new(values),
8961 None,
8962 )
8963 .unwrap(),
8964 ) as ArrayRef)
8965 }
8966 DataType::Map(_, _) => {
8967 let keys = StringArray::from(vec!["k"]);
8968 let vals = StringArray::from(vec!["v"]);
8969 let entries = StructArray::new(
8970 Fields::from(vec![
8971 Field::new("key", DataType::Utf8, false),
8972 Field::new("value", DataType::Utf8, false),
8973 ]),
8974 vec![Arc::new(keys) as ArrayRef, Arc::new(vals) as ArrayRef],
8975 None,
8976 );
8977 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1]));
8978 Some(Arc::new(MapArray::new(
8979 union_map_entries.clone(),
8980 moff,
8981 entries,
8982 None,
8983 false,
8984 )) as ArrayRef)
8985 }
8986 _ => None,
8987 });
8988 cols.push(arr);
8989 }
8990 {
8991 let fs = match expected_schema
8992 .field_with_name("maybe_auth")
8993 .unwrap()
8994 .data_type()
8995 {
8996 DataType::Struct(fs) => fs.clone(),
8997 _ => unreachable!(),
8998 };
8999 let user =
9000 Arc::new(StringArray::from(vec!["alice", "bob", "carol", "dave"])) as ArrayRef;
9001 let token_values: Vec<Option<&[u8]>> = vec![
9002 None,
9003 Some(b"\x01\x02\x03".as_ref()),
9004 None,
9005 Some(b"".as_ref()),
9006 ];
9007 let token = Arc::new(BinaryArray::from(token_values)) as ArrayRef;
9008 cols.push(Arc::new(StructArray::new(fs, vec![user, token], None)) as ArrayRef);
9009 }
9010 {
9011 let fs = match expected_schema
9012 .field_with_name("address")
9013 .unwrap()
9014 .data_type()
9015 {
9016 DataType::Struct(fs) => fs.clone(),
9017 _ => unreachable!(),
9018 };
9019 let street = Arc::new(StringArray::from(vec![
9020 "100 Main",
9021 "",
9022 "42 Galaxy Way",
9023 "End Ave",
9024 ])) as ArrayRef;
9025 let zip = Arc::new(Int32Array::from(vec![12345, 0, 42424, 1])) as ArrayRef;
9026 let country = Arc::new(StringArray::from(vec!["US", "CA", "US", "GB"])) as ArrayRef;
9027 cols.push(Arc::new(StructArray::new(fs, vec![street, zip, country], None)) as ArrayRef);
9028 }
9029 {
9030 let keys = StringArray::from(vec!["a", "b", "c", "neg", "pi", "ok"]);
9031 let moff = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 3, 4, 4, 6]));
9032 let tid_s = 0; let tid_d = 1; let tid_n = 2; let type_ids = vec![tid_d, tid_n, tid_s, tid_d, tid_d, tid_s];
9036 let offsets = vec![0, 0, 0, 1, 2, 1];
9037 let pi_5dp = (std::f64::consts::PI * 100_000.0).trunc() / 100_000.0;
9038 let vals = mk_dense_union(&uf_map_vals, type_ids, offsets, |f| match f.data_type() {
9039 DataType::Float64 => {
9040 Some(Arc::new(Float64Array::from(vec![1.5f64, -0.5, pi_5dp])) as ArrayRef)
9041 }
9042 DataType::Utf8 => {
9043 Some(Arc::new(StringArray::from(vec!["yes", "true"])) as ArrayRef)
9044 }
9045 DataType::Null => Some(Arc::new(NullArray::new(1)) as ArrayRef),
9046 _ => None,
9047 });
9048 let entries = StructArray::new(
9049 Fields::from(vec![
9050 Field::new("key", DataType::Utf8, false),
9051 Field::new(
9052 "value",
9053 DataType::Union(uf_map_vals.clone(), UnionMode::Dense),
9054 true,
9055 ),
9056 ]),
9057 vec![Arc::new(keys) as ArrayRef, vals],
9058 None,
9059 );
9060 let map = Arc::new(MapArray::new(
9061 map_entries_field.clone(),
9062 moff,
9063 entries,
9064 None,
9065 false,
9066 )) as ArrayRef;
9067 cols.push(map);
9068 }
9069 {
9070 let type_ids = vec![
9071 2, 1, 0, 2, 0, 1, 2, 2, 1, 0,
9072 2, ];
9074 let offsets = vec![0, 0, 0, 1, 1, 1, 2, 3, 2, 2, 4];
9075 let values =
9076 mk_dense_union(&uf_arr_items, type_ids, offsets, |f| match f.data_type() {
9077 DataType::Int64 => {
9078 Some(Arc::new(Int64Array::from(vec![1i64, -3, 0, -1, 0])) as ArrayRef)
9079 }
9080 DataType::Utf8 => {
9081 Some(Arc::new(StringArray::from(vec!["x", "z", "end"])) as ArrayRef)
9082 }
9083 DataType::Null => Some(Arc::new(NullArray::new(3)) as ArrayRef),
9084 _ => None,
9085 });
9086 let list_offsets = OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 4, 7, 8, 11]));
9087 let arr = Arc::new(
9088 ListArray::try_new(arr_items_field.clone(), list_offsets, values, None).unwrap(),
9089 ) as ArrayRef;
9090 cols.push(arr);
9091 }
9092 {
9093 let keys = Int32Array::from(vec![1, 2, 3, 0]); let values = Arc::new(StringArray::from(vec![
9095 "UNKNOWN",
9096 "NEW",
9097 "PROCESSING",
9098 "DONE",
9099 ])) as ArrayRef;
9100 let dict = DictionaryArray::<Int32Type>::try_new(keys, values).unwrap();
9101 cols.push(Arc::new(dict) as ArrayRef);
9102 }
9103 cols.push(Arc::new(IntervalMonthDayNanoArray::from(vec![
9104 dur_small, dur_zero, dur_large, dur_2years,
9105 ])) as ArrayRef);
9106 cols.push(Arc::new(TimestampMicrosecondArray::from(vec![
9107 ts_us_2024_01_01 + 123_456,
9108 0,
9109 ts_us_2024_01_01 + 101_112,
9110 987_654_321,
9111 ])) as ArrayRef);
9112 cols.push(Arc::new(TimestampMillisecondArray::from(vec![
9113 ts_ms_2024_01_01 + 86_400_000,
9114 0,
9115 ts_ms_2024_01_01 + 789,
9116 123_456_789,
9117 ])) as ArrayRef);
9118 {
9119 let a = TimestampMicrosecondArray::from(vec![
9120 ts_us_2024_01_01,
9121 1,
9122 ts_us_2024_01_01 + 456,
9123 0,
9124 ])
9125 .with_timezone("+00:00");
9126 cols.push(Arc::new(a) as ArrayRef);
9127 }
9128 {
9129 let a = TimestampMillisecondArray::from(vec![
9130 ts_ms_2024_01_01,
9131 -1,
9132 ts_ms_2024_01_01 + 123,
9133 0,
9134 ])
9135 .with_timezone("+00:00");
9136 cols.push(Arc::new(a) as ArrayRef);
9137 }
9138 cols.push(Arc::new(Time64MicrosecondArray::from(vec![
9139 time_us_eod,
9140 0,
9141 1,
9142 1_000_000,
9143 ])) as ArrayRef);
9144 cols.push(Arc::new(Time32MillisecondArray::from(vec![
9145 time_ms_a,
9146 0,
9147 1,
9148 86_400_000 - 1,
9149 ])) as ArrayRef);
9150 cols.push(Arc::new(Date32Array::from(vec![date_a, 0, 1, 365])) as ArrayRef);
9151 {
9152 let it = [Some(uuid1), Some(uuid2), Some(uuid1), Some(uuid2)].into_iter();
9153 cols.push(Arc::new(
9154 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
9155 ) as ArrayRef);
9156 }
9157 {
9158 #[cfg(feature = "small_decimals")]
9159 let arr = Arc::new(
9160 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
9161 .with_precision_and_scale(20, 4)
9162 .unwrap(),
9163 ) as ArrayRef;
9164 #[cfg(not(feature = "small_decimals"))]
9165 let arr = Arc::new(
9166 Decimal128Array::from_iter_values([1_234_567_891_234i128, -420_000i128, 0, -1i128])
9167 .with_precision_and_scale(20, 4)
9168 .unwrap(),
9169 ) as ArrayRef;
9170 cols.push(arr);
9171 }
9172 {
9173 #[cfg(feature = "small_decimals")]
9174 let arr = Arc::new(
9175 Decimal64Array::from_iter_values([123456i64, -1, 0, 9_999_999_999i64])
9176 .with_precision_and_scale(10, 2)
9177 .unwrap(),
9178 ) as ArrayRef;
9179 #[cfg(not(feature = "small_decimals"))]
9180 let arr = Arc::new(
9181 Decimal128Array::from_iter_values([123456i128, -1, 0, 9_999_999_999i128])
9182 .with_precision_and_scale(10, 2)
9183 .unwrap(),
9184 ) as ArrayRef;
9185 cols.push(arr);
9186 }
9187 {
9188 let it = [
9189 Some(*b"0123456789ABCDEF"),
9190 Some([0u8; 16]),
9191 Some(*b"ABCDEFGHIJKLMNOP"),
9192 Some([0xAA; 16]),
9193 ]
9194 .into_iter();
9195 cols.push(Arc::new(
9196 FixedSizeBinaryArray::try_from_sparse_iter_with_size(it, 16).unwrap(),
9197 ) as ArrayRef);
9198 }
9199 cols.push(Arc::new(BinaryArray::from(vec![
9200 b"\x00\x01".as_ref(),
9201 b"".as_ref(),
9202 b"\xFF\x00".as_ref(),
9203 b"\x10\x20\x30\x40".as_ref(),
9204 ])) as ArrayRef);
9205 cols.push(Arc::new(StringArray::from(vec!["hello", "", "world", "✓ unicode"])) as ArrayRef);
9206 {
9207 let tids = vec![0, 1, 2, 1];
9208 let offs = vec![0, 0, 0, 1];
9209 let arr = mk_dense_union(&uf_tri, tids, offs, |f| match f.data_type() {
9210 DataType::Int32 => Some(Arc::new(Int32Array::from(vec![0])) as ArrayRef),
9211 DataType::Utf8 => Some(Arc::new(StringArray::from(vec!["hi", ""])) as ArrayRef),
9212 DataType::Boolean => Some(Arc::new(BooleanArray::from(vec![true])) as ArrayRef),
9213 _ => None,
9214 });
9215 cols.push(arr);
9216 }
9217 cols.push(Arc::new(StringArray::from(vec![
9218 Some("alpha"),
9219 None,
9220 Some("s3"),
9221 Some(""),
9222 ])) as ArrayRef);
9223 cols.push(Arc::new(Int32Array::from(vec![None, Some(42), None, Some(0)])) as ArrayRef);
9224 cols.push(Arc::new(Int64Array::from(vec![
9225 7_000_000_000i64,
9226 -2,
9227 0,
9228 -9_876_543_210i64,
9229 ])) as ArrayRef);
9230 cols.push(Arc::new(Int64Array::from(vec![7i64, -1, 0, 123])) as ArrayRef);
9231 cols.push(Arc::new(Float64Array::from(vec![2.5f64, -1.0, 7.0, -2.25])) as ArrayRef);
9232 cols.push(Arc::new(Float64Array::from(vec![1.25f64, -0.0, 3.5, 9.75])) as ArrayRef);
9233 cols.push(Arc::new(BooleanArray::from(vec![true, false, true, false])) as ArrayRef);
9234 cols.push(Arc::new(Int64Array::from(vec![1, 2, 3, 4])) as ArrayRef);
9235 let expected = RecordBatch::try_new(expected_schema, cols).unwrap();
9236 assert_eq!(
9237 expected, batch,
9238 "entire RecordBatch mismatch (schema, all columns, all rows)"
9239 );
9240 }
9241}