datafusion-functions 54.0.0

Function packages for the DataFusion query engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

use std::marker::PhantomData;
use std::mem::size_of;
use std::sync::Arc;

use datafusion_common::{Result, exec_datafusion_err, internal_err};

use arrow::array::{
    Array, ArrayAccessor, ArrayDataBuilder, ArrayRef, BinaryArray, ByteView,
    GenericStringArray, LargeStringArray, OffsetSizeTrait, StringArray, StringViewArray,
    make_view,
};
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer};
use arrow::datatypes::DataType;

/// Builder used by `concat`/`concat_ws` to assemble a [`StringArray`] one row
/// at a time from multiple input columns.
///
/// Each row is written via repeated `write` calls (one per input fragment)
/// followed by a single `append_offset` to commit the row.  The output null
/// buffer is computed in bulk by the caller and supplied to `finish`, avoiding
/// per-row NULL handling work.
///
/// For the common "produce one `&str` per row" pattern, prefer
/// `GenericStringArrayBuilder` instead.
pub(crate) struct ConcatStringBuilder {
    /// `i32` end offsets: a leading `0` followed by one entry per committed
    /// row (pushed by `append_offset`).
    offsets_buffer: MutableBuffer,
    /// Bytes of every fragment written so far, stored back to back.
    value_buffer: MutableBuffer,
    /// If true, a safety check is required during the `finish` call.
    ///
    /// Set by `write` whenever scalar or binary-array input is appended;
    /// `finish` then builds the array with validation instead of
    /// `build_unchecked`.
    tainted: bool,
}

impl ConcatStringBuilder {
    /// Creates an empty builder sized for `item_capacity` rows and
    /// `data_capacity` bytes of string data.
    ///
    /// # Panics
    ///
    /// Panics if `item_capacity + 1` overflows `usize`.
    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
        // The offsets buffer holds one entry more than there are rows because
        // it starts with a leading zero.
        let slots = item_capacity
            .checked_add(1)
            .expect("capacity integer overflow");
        let offsets_bytes = slots.saturating_mul(size_of::<i32>());

        let mut offsets_buffer = MutableBuffer::with_capacity(offsets_bytes);
        // SAFETY: capacity for at least one `i32` was requested above, so the
        // leading zero offset fits within the allocated bounds.
        unsafe { offsets_buffer.push_unchecked(0_i32) };

        let value_buffer = MutableBuffer::with_capacity(data_capacity);
        Self {
            offsets_buffer,
            value_buffer,
            tainted: false,
        }
    }

    /// Appends the bytes of row `i` of `column` to the row under construction.
    ///
    /// For the nullable array variants, the validity of row `i` is consulted
    /// only when `CHECK_VALID` is `true`; an invalid row then contributes no
    /// bytes. With `CHECK_VALID == false` the value is appended
    /// unconditionally. Scalar and binary-array input marks the builder as
    /// tainted so that `finish` validates the result.
    pub fn write<const CHECK_VALID: bool>(
        &mut self,
        column: &ColumnarValueRef,
        i: usize,
    ) {
        let out = &mut self.value_buffer;
        match column {
            ColumnarValueRef::Scalar(s) => {
                self.tainted = true;
                out.extend_from_slice(s);
            }
            ColumnarValueRef::NullableArray(array) => {
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableLargeStringArray(array) => {
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableStringViewArray(array) => {
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableBinaryArray(array) => {
                // Binary input taints the builder even when the row itself
                // is skipped.
                self.tainted = true;
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i));
                }
            }
            ColumnarValueRef::NonNullableArray(array) => {
                out.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableLargeStringArray(array) => {
                out.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableStringViewArray(array) => {
                out.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableBinaryArray(array) => {
                self.tainted = true;
                out.extend_from_slice(array.value(i));
            }
        }
    }

    /// Commits the current row by recording the value buffer's length as the
    /// row's end offset.
    ///
    /// # Errors
    ///
    /// Returns an error if the value buffer length does not fit in an `i32`.
    pub fn append_offset(&mut self) -> Result<()> {
        let end = i32::try_from(self.value_buffer.len())
            .map_err(|_| exec_datafusion_err!("byte array offset overflow"))?;
        self.offsets_buffer.push(end);
        Ok(())
    }

    /// Finalize the builder into a concrete [`StringArray`].
    ///
    /// # Errors
    ///
    /// Returns an error when:
    ///
    /// - the provided `null_buffer` length differs from the number of
    ///   committed rows;
    /// - the builder is tainted and validation of the assembled array fails.
    pub fn finish(self, null_buffer: Option<NullBuffer>) -> Result<StringArray> {
        // The offsets buffer carries a leading zero, hence the `- 1`.
        let rows = self.offsets_buffer.len() / size_of::<i32>() - 1;
        let mismatched = null_buffer
            .as_ref()
            .is_some_and(|nulls| nulls.len() != rows);
        if mismatched {
            return internal_err!(
                "Null buffer and offsets buffer must be the same length"
            );
        }

        let builder = ArrayDataBuilder::new(DataType::Utf8)
            .len(rows)
            .add_buffer(self.offsets_buffer.into())
            .add_buffer(self.value_buffer.into())
            .nulls(null_buffer);

        let array_data = if self.tainted {
            // Scalar or raw binary input with possibly invalid UTF-8 was
            // written, so let `ArrayDataBuilder` perform validation.
            builder.build()?
        } else {
            // SAFETY: all data that was appended was valid UTF8 and the values
            // and offsets were created correctly
            unsafe { builder.build_unchecked() }
        };
        Ok(StringArray::from(array_data))
    }
}

/// Builder used by `concat`/`concat_ws` to assemble a [`StringViewArray`] one
/// row at a time from multiple input columns.
///
/// Each row is written via repeated `write` calls (one per input
/// fragment) followed by a single `append_offset` to commit the row
/// as a single string view. The output null buffer is supplied by the caller
/// at `finish` time, avoiding per-row NULL handling work.
///
/// For the common "produce one `&str` per row" pattern, prefer
/// [`StringViewArrayBuilder`] instead.
pub(crate) struct ConcatStringViewBuilder {
    /// One 128-bit view per committed row.
    views: Vec<u128>,
    /// Single data buffer holding the bytes of every committed row longer
    /// than 12 bytes; shorter rows live only in their view.
    data: Vec<u8>,
    /// Scratch space for the row currently being written; cleared by
    /// `append_offset`.
    block: Vec<u8>,
    /// If true, a safety check is required during the `append_offset` call
    tainted: bool,
}

impl ConcatStringViewBuilder {
    /// Creates an empty builder with room for `item_capacity` views and
    /// `data_capacity` bytes of long-string data.
    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
        Self {
            views: Vec::with_capacity(item_capacity),
            data: Vec::with_capacity(data_capacity),
            block: vec![],
            tainted: false,
        }
    }

    /// Appends the bytes of row `i` of `column` to the row under construction.
    ///
    /// For the nullable array variants, the validity of row `i` is consulted
    /// only when `CHECK_VALID` is `true`; an invalid row then contributes no
    /// bytes. Scalar and binary-array input marks the current row as tainted
    /// so that `append_offset` validates it as UTF-8.
    pub fn write<const CHECK_VALID: bool>(
        &mut self,
        column: &ColumnarValueRef,
        i: usize,
    ) {
        match column {
            ColumnarValueRef::Scalar(s) => {
                self.block.extend_from_slice(s);
                self.tainted = true;
            }
            ColumnarValueRef::NullableArray(array) => {
                if !CHECK_VALID || array.is_valid(i) {
                    self.block.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableLargeStringArray(array) => {
                if !CHECK_VALID || array.is_valid(i) {
                    self.block.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableStringViewArray(array) => {
                if !CHECK_VALID || array.is_valid(i) {
                    self.block.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableBinaryArray(array) => {
                if !CHECK_VALID || array.is_valid(i) {
                    self.block.extend_from_slice(array.value(i));
                }
                self.tainted = true;
            }
            ColumnarValueRef::NonNullableArray(array) => {
                self.block.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableLargeStringArray(array) => {
                self.block.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableStringViewArray(array) => {
                self.block.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableBinaryArray(array) => {
                self.block.extend_from_slice(array.value(i));
                self.tainted = true;
            }
        }
    }

    /// Finalizes the current row by converting the accumulated data into a
    /// StringView and appending it to the views buffer.
    ///
    /// Rows of at most 12 bytes are stored inline in the view; longer rows
    /// are copied into the data buffer and referenced by offset.
    ///
    /// # Errors
    ///
    /// Returns an error when:
    ///
    /// - the row is tainted and its bytes are not valid UTF-8;
    /// - the row length or the data buffer offset exceeds `i32::MAX`. The
    ///   view stores both in 32-bit fields, and
    ///   `StringViewArrayBuilder::append_value` enforces the same bound, so
    ///   larger values are rejected instead of producing an out-of-spec or
    ///   truncated view.
    pub fn append_offset(&mut self) -> Result<()> {
        if self.tainted {
            std::str::from_utf8(&self.block)
                .map_err(|_| exec_datafusion_err!("invalid UTF-8 in binary literal"))?;
        }

        let v = &self.block;
        if v.len() > 12 {
            // `make_view` narrows the length to 32 bits without checking, so
            // validate it up front.
            i32::try_from(v.len()).map_err(|_| {
                exec_datafusion_err!("string view length exceeds i32::MAX")
            })?;
            // The value is non-negative after the `i32` check, so the cast
            // to `u32` is lossless.
            let offset = i32::try_from(self.data.len())
                .map_err(|_| exec_datafusion_err!("byte array offset overflow"))?
                as u32;
            self.data.extend_from_slice(v);
            self.views.push(make_view(v, 0, offset));
        } else {
            self.views.push(make_view(v, 0, 0));
        }

        // Reset the per-row state for the next row.
        self.block.clear();
        self.tainted = false;
        Ok(())
    }

    /// Finalize the builder into a concrete [`StringViewArray`].
    ///
    /// # Errors
    ///
    /// Returns an error when:
    ///
    /// - the provided `null_buffer` length does not match the row count.
    pub fn finish(self, null_buffer: Option<NullBuffer>) -> Result<StringViewArray> {
        if let Some(ref nulls) = null_buffer
            && nulls.len() != self.views.len()
        {
            return internal_err!(
                "Null buffer length ({}) must match row count ({})",
                nulls.len(),
                self.views.len()
            );
        }

        // Only attach a data buffer when at least one long row was written.
        let buffers: Vec<Buffer> = if self.data.is_empty() {
            vec![]
        } else {
            vec![Buffer::from(self.data)]
        };

        // SAFETY: views were constructed with correct lengths, offsets, and
        // prefixes. UTF-8 validity was checked in append_offset() for any row
        // where tainted data (e.g., binary literals) was appended.
        let array = unsafe {
            StringViewArray::new_unchecked(
                ScalarBuffer::from(self.views),
                buffers,
                null_buffer,
            )
        };
        Ok(array)
    }
}

/// Builder used by `concat`/`concat_ws` to assemble a [`LargeStringArray`] one
/// row at a time from multiple input columns. See [`ConcatStringBuilder`] for
/// details on the row-composition contract.
///
/// For the common "produce one `&str` per row" pattern, prefer
/// `GenericStringArrayBuilder` instead.
pub(crate) struct ConcatLargeStringBuilder {
    /// `i64` end offsets: a leading `0` followed by one entry per committed
    /// row (pushed by `append_offset`).
    offsets_buffer: MutableBuffer,
    /// Bytes of every fragment written so far, stored back to back.
    value_buffer: MutableBuffer,
    /// If true, a safety check is required during the `finish` call.
    ///
    /// Set by `write` whenever scalar or binary-array input is appended;
    /// `finish` then builds the array with validation instead of
    /// `build_unchecked`.
    tainted: bool,
}

impl ConcatLargeStringBuilder {
    /// Creates an empty builder sized for `item_capacity` rows and
    /// `data_capacity` bytes of string data.
    ///
    /// # Panics
    ///
    /// Panics if `item_capacity + 1` overflows `usize`.
    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
        // The offsets buffer holds one entry more than there are rows because
        // it starts with a leading zero.
        let slots = item_capacity
            .checked_add(1)
            .expect("capacity integer overflow");
        let offsets_bytes = slots.saturating_mul(size_of::<i64>());

        let mut offsets_buffer = MutableBuffer::with_capacity(offsets_bytes);
        // SAFETY: capacity for at least one `i64` was requested above, so the
        // leading zero offset fits within the allocated bounds.
        unsafe { offsets_buffer.push_unchecked(0_i64) };

        let value_buffer = MutableBuffer::with_capacity(data_capacity);
        Self {
            offsets_buffer,
            value_buffer,
            tainted: false,
        }
    }

    /// Appends the bytes of row `i` of `column` to the row under construction.
    ///
    /// For the nullable array variants, the validity of row `i` is consulted
    /// only when `CHECK_VALID` is `true`; an invalid row then contributes no
    /// bytes. With `CHECK_VALID == false` the value is appended
    /// unconditionally. Scalar and binary-array input marks the builder as
    /// tainted so that `finish` validates the result.
    pub fn write<const CHECK_VALID: bool>(
        &mut self,
        column: &ColumnarValueRef,
        i: usize,
    ) {
        let out = &mut self.value_buffer;
        match column {
            ColumnarValueRef::Scalar(s) => {
                self.tainted = true;
                out.extend_from_slice(s);
            }
            ColumnarValueRef::NullableArray(array) => {
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableLargeStringArray(array) => {
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableStringViewArray(array) => {
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i).as_bytes());
                }
            }
            ColumnarValueRef::NullableBinaryArray(array) => {
                // Binary input taints the builder even when the row itself
                // is skipped.
                self.tainted = true;
                let skip = CHECK_VALID && array.is_null(i);
                if !skip {
                    out.extend_from_slice(array.value(i));
                }
            }
            ColumnarValueRef::NonNullableArray(array) => {
                out.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableLargeStringArray(array) => {
                out.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableStringViewArray(array) => {
                out.extend_from_slice(array.value(i).as_bytes());
            }
            ColumnarValueRef::NonNullableBinaryArray(array) => {
                self.tainted = true;
                out.extend_from_slice(array.value(i));
            }
        }
    }

    /// Commits the current row by recording the value buffer's length as the
    /// row's end offset.
    ///
    /// # Errors
    ///
    /// Returns an error if the value buffer length does not fit in an `i64`.
    pub fn append_offset(&mut self) -> Result<()> {
        let end = i64::try_from(self.value_buffer.len())
            .map_err(|_| exec_datafusion_err!("byte array offset overflow"))?;
        self.offsets_buffer.push(end);
        Ok(())
    }

    /// Finalize the builder into a concrete [`LargeStringArray`].
    ///
    /// # Errors
    ///
    /// Returns an error when:
    ///
    /// - the provided `null_buffer` length differs from the number of
    ///   committed rows;
    /// - the builder is tainted and validation of the assembled array fails.
    pub fn finish(self, null_buffer: Option<NullBuffer>) -> Result<LargeStringArray> {
        // The offsets buffer carries a leading zero, hence the `- 1`.
        let rows = self.offsets_buffer.len() / size_of::<i64>() - 1;
        let mismatched = null_buffer
            .as_ref()
            .is_some_and(|nulls| nulls.len() != rows);
        if mismatched {
            return internal_err!(
                "Null buffer and offsets buffer must be the same length"
            );
        }

        let builder = ArrayDataBuilder::new(DataType::LargeUtf8)
            .len(rows)
            .add_buffer(self.offsets_buffer.into())
            .add_buffer(self.value_buffer.into())
            .nulls(null_buffer);

        let array_data = if self.tainted {
            // Scalar or raw binary input with possibly invalid UTF-8 was
            // written, so let `ArrayDataBuilder` perform validation.
            builder.build()?
        } else {
            // SAFETY: all data that was appended was valid Large UTF8 and the
            // values and offsets were created correctly
            unsafe { builder.build_unchecked() }
        };
        Ok(LargeStringArray::from(array_data))
    }
}

// ----------------------------------------------------------------------------
// Bulk-nulls builders
//
// These builders are similar to Arrow's `GenericStringBuilder` and
// `StringViewBuilder` but tuned for string UDFs along two axes:
//
//   * Bulk-NULL handling. The NULL bitmap is passed to `finish()` rather than
//     maintained per-row. Many string UDFs can compute the bitmap in bulk,
//     where this is significantly more efficient.
//   * Closure-based row emission. Beyond `append_value(&str)`, the builders
//     expose `append_with` (fragments written into the builder via a
//     `StringWriter`) and `append_byte_map` (byte-to-byte mapping of an input
//     slice), letting UDFs emit a row without first assembling it in a scratch
//     `String`.
// ----------------------------------------------------------------------------

/// Builder for a [`GenericStringArray<O>`]. Instantiate with `O = i32` for
/// [`StringArray`] (Utf8) or `O = i64` for [`LargeStringArray`] (LargeUtf8).
///
/// Rows are appended one at a time; the null bitmap is not tracked here and
/// is handed to `finish` by the caller.
pub(crate) struct GenericStringArrayBuilder<O: OffsetSizeTrait> {
    /// Offsets of type `O`: a leading `0` followed by one end offset per
    /// appended row.
    offsets_buffer: MutableBuffer,
    /// Bytes of all appended rows, stored back to back.
    value_buffer: MutableBuffer,
    /// Number of rows added through `append_placeholder`; `finish` checks it
    /// against the null count in a `debug_assert!`.
    placeholder_count: usize,
    /// Ties the builder to its offset type without storing a value of it.
    _phantom: PhantomData<O>,
}

impl<O: OffsetSizeTrait> GenericStringArrayBuilder<O> {
    /// Creates an empty builder sized for `item_capacity` rows and
    /// `data_capacity` bytes of string data.
    ///
    /// # Panics
    ///
    /// Panics if `item_capacity + 1` overflows `usize`.
    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
        // One slot more than the row count, for the leading zero offset.
        let slots = item_capacity
            .checked_add(1)
            .expect("capacity integer overflow");
        let offsets_bytes = slots.saturating_mul(size_of::<O>());

        let mut offsets_buffer = MutableBuffer::with_capacity(offsets_bytes);
        offsets_buffer.push(O::usize_as(0));

        let value_buffer = MutableBuffer::with_capacity(data_capacity);
        Self {
            offsets_buffer,
            value_buffer,
            placeholder_count: 0,
            _phantom: PhantomData,
        }
    }

    /// Closes the current row by recording the value buffer's length as the
    /// row's end offset.
    ///
    /// # Panics
    ///
    /// Panics if the value buffer length is not representable as `O`.
    #[inline]
    fn push_end_offset(&mut self) {
        let end =
            O::from_usize(self.value_buffer.len()).expect("byte array offset overflow");
        self.offsets_buffer.push(end);
    }

    /// See [`BulkNullStringArrayBuilder::append_value`].
    ///
    /// # Panics
    ///
    /// Panics if the cumulative byte length exceeds `O::MAX`.
    #[inline]
    pub fn append_value(&mut self, value: &str) {
        self.value_buffer.extend_from_slice(value.as_bytes());
        self.push_end_offset();
    }

    /// See [`BulkNullStringArrayBuilder::append_placeholder`].
    ///
    /// Adds a zero-length row and counts it as a placeholder.
    #[inline]
    pub fn append_placeholder(&mut self) {
        self.push_end_offset();
        self.placeholder_count += 1;
    }

    /// See [`BulkNullStringArrayBuilder::append_byte_map`].
    ///
    /// # Safety
    ///
    /// The bytes produced by applying `map` to each byte of `src`, in order,
    /// must form valid UTF-8.
    ///
    /// # Panics
    ///
    /// Panics if the cumulative byte length exceeds `O::MAX`.
    #[inline]
    pub unsafe fn append_byte_map<F: FnMut(u8) -> u8>(&mut self, src: &[u8], mut map: F) {
        let mapped = src.iter().map(|&byte| map(byte));
        self.value_buffer.extend(mapped);
        self.push_end_offset();
    }

    /// See [`BulkNullStringArrayBuilder::append_with`].
    ///
    /// # Panics
    ///
    /// Panics if the cumulative byte length exceeds `O::MAX`.
    #[inline]
    pub fn append_with<F>(&mut self, f: F)
    where
        F: FnOnce(&mut GenericStringWriter<'_>),
    {
        // The writer appends straight into the value buffer; whatever the
        // closure wrote becomes the new row.
        f(&mut GenericStringWriter {
            value_buffer: &mut self.value_buffer,
        });
        self.push_end_offset();
    }

    /// Finalize into a [`GenericStringArray<O>`] using the caller-supplied
    /// null buffer.
    ///
    /// # Errors
    ///
    /// Returns an error when `null_buffer.len()` does not match the number of
    /// appended rows.
    pub fn finish(
        self,
        null_buffer: Option<NullBuffer>,
    ) -> Result<GenericStringArray<O>> {
        // The offsets buffer carries a leading zero, hence the `- 1`.
        let row_count = self.offsets_buffer.len() / size_of::<O>() - 1;
        match &null_buffer {
            Some(n) if n.len() != row_count => {
                return internal_err!(
                    "Null buffer length ({}) must match row count ({row_count})",
                    n.len()
                );
            }
            _ => {}
        }

        let null_count = match &null_buffer {
            Some(n) => n.null_count(),
            None => 0,
        };
        debug_assert!(
            null_count >= self.placeholder_count,
            "{} placeholder rows but null buffer has {null_count} nulls",
            self.placeholder_count,
        );

        let builder = ArrayDataBuilder::new(GenericStringArray::<O>::DATA_TYPE)
            .len(row_count)
            .add_buffer(self.offsets_buffer.into())
            .add_buffer(self.value_buffer.into())
            .nulls(null_buffer);
        // SAFETY: values were appended from `&str`/`char` data, or through
        // `append_byte_map` whose caller guarantees valid UTF-8, so the value
        // buffer is valid UTF-8. Each offset is the buffer length at the time
        // the row was closed, so offsets are monotonically non-decreasing.
        let array_data = unsafe { builder.build_unchecked() };
        Ok(GenericStringArray::<O>::from(array_data))
    }
}

/// Starting size (8 KiB) for the long-string data block used by
/// `StringView`-style arrays; matches Arrow's `GenericByteViewBuilder`
/// default.
pub(crate) const STRING_VIEW_INIT_BLOCK_SIZE: u32 = 8 * 1024;
/// Maximum size (2 MiB) each long-string data block in a `StringView`-style
/// array grows to; matches Arrow's `GenericByteViewBuilder` default.
pub(crate) const STRING_VIEW_MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024;

/// Append-only writer handed to closures passed to `append_with`.
///
/// The closure emits a row as a sequence of fragments; the builder that
/// created the writer closes the row once the closure returns.
pub(crate) trait StringWriter {
    /// Appends the string slice `s` to the row being written.
    fn write_str(&mut self, s: &str);
    /// Appends the single character `c` to the row being written.
    fn write_char(&mut self, c: char);
}

/// [`StringWriter`] for [`GenericStringArrayBuilder`]. Writes go straight to
/// the value buffer.
pub(crate) struct GenericStringWriter<'a> {
    /// The builder's value buffer, borrowed for the duration of one
    /// `append_with` call.
    value_buffer: &'a mut MutableBuffer,
}

impl StringWriter for GenericStringWriter<'_> {
    /// Appends the UTF-8 bytes of `s` to the value buffer.
    #[inline(always)]
    fn write_str(&mut self, s: &str) {
        push_bytes_to_mutable_buffer(self.value_buffer, s.as_bytes());
    }

    /// Appends the UTF-8 encoding of `c` to the value buffer.
    #[inline(always)]
    fn write_char(&mut self, c: char) {
        push_char_to_mutable_buffer(self.value_buffer, c);
    }
}

/// Write `bytes` into `value_buffer`. For repeated small writes,
/// MutableBuffer::extend_from_slice can be slow (memcpy per call), so we extend
/// the buffer here directly and force inlining.
///
/// The buffer grows by exactly `bytes.len()`; an empty slice leaves its
/// contents unchanged.
#[inline(always)]
fn push_bytes_to_mutable_buffer(value_buffer: &mut MutableBuffer, bytes: &[u8]) {
    let n = bytes.len();
    let old_len = value_buffer.len();
    // Make room first so the raw write below stays inside the allocation.
    value_buffer.reserve(n);

    // SAFETY: we reserved `n` bytes; the source and destination do not alias
    // because `bytes` was passed in by the caller and `value_buffer` is owned.
    unsafe {
        let dst = value_buffer.as_mut_ptr().add(old_len);
        let src = bytes.as_ptr();
        // NOTE(review): every arm copies exactly `n` bytes; the constant-size
        // arms presumably let the compiler emit fixed-width copies for short
        // fragments instead of a general memcpy call — confirm with a
        // benchmark before simplifying.
        match n {
            0 => {}
            1 => std::ptr::copy_nonoverlapping(src, dst, 1),
            2 => std::ptr::copy_nonoverlapping(src, dst, 2),
            3 => std::ptr::copy_nonoverlapping(src, dst, 3),
            4 => std::ptr::copy_nonoverlapping(src, dst, 4),
            5 => std::ptr::copy_nonoverlapping(src, dst, 5),
            6 => std::ptr::copy_nonoverlapping(src, dst, 6),
            7 => std::ptr::copy_nonoverlapping(src, dst, 7),
            8 => std::ptr::copy_nonoverlapping(src, dst, 8),
            _ => std::ptr::copy_nonoverlapping(src, dst, n),
        }
        // Publish the newly written bytes as part of the buffer.
        value_buffer.set_len(old_len + n);
    }
}

/// Append the UTF-8 encoding of `c` to `value_buffer`, writing directly into
/// the buffer's spare capacity.
///
/// The buffer grows by `c.len_utf8()` bytes.
#[inline(always)]
fn push_char_to_mutable_buffer(value_buffer: &mut MutableBuffer, c: char) {
    let len = c.len_utf8();
    let old_len = value_buffer.len();
    // Make room first so the raw write below stays inside the allocation.
    value_buffer.reserve(len);

    // SAFETY: we reserved `len` bytes above, write valid UTF-8 into those
    // bytes, then update the initialized length to include them.
    unsafe {
        let dst = value_buffer.as_mut_ptr().add(old_len);
        if len == 1 {
            // A one-byte encoding means `c` is ASCII, so the cast is lossless.
            *dst = c as u8;
        } else {
            // Multi-byte characters are encoded in place into the reserved
            // tail of the buffer.
            c.encode_utf8(std::slice::from_raw_parts_mut(dst, len));
        }
        value_buffer.set_len(old_len + len);
    }
}

/// Builder for a [`StringViewArray`].
///
/// Short strings (≤ 12 bytes) are inlined into the view itself; long strings
/// are appended into an in-progress data block. When the in-progress block
/// fills up it is flushed into `completed` and a new block — double the size
/// of the last, capped at [`STRING_VIEW_MAX_BLOCK_SIZE`] — is started.
pub(crate) struct StringViewArrayBuilder {
    /// 128-bit views; `append_value` pushes exactly one per call.
    views: Vec<u128>,
    /// Data block currently receiving the bytes of long strings.
    in_progress: Vec<u8>,
    /// Data blocks that were flushed out of `in_progress`.
    completed: Vec<Buffer>,
    /// Current block-size target; starts at [`STRING_VIEW_INIT_BLOCK_SIZE`]
    /// and is doubled by `next_block_size` while it is below
    /// [`STRING_VIEW_MAX_BLOCK_SIZE`].
    block_size: u32,
    /// NOTE(review): presumably the number of rows added through
    /// `append_placeholder` — confirm against that method's body.
    placeholder_count: usize,
}

impl StringViewArrayBuilder {
    /// Creates an empty builder with room for `item_capacity` views.
    ///
    /// No data block is allocated here; the in-progress block starts out as
    /// an empty `Vec` and is reserved when the first long row arrives.
    pub fn with_capacity(item_capacity: usize) -> Self {
        Self {
            views: Vec::with_capacity(item_capacity),
            in_progress: Vec::new(),
            completed: Vec::new(),
            block_size: STRING_VIEW_INIT_BLOCK_SIZE,
            placeholder_count: 0,
        }
    }

    /// Doubles the block-size target, clamped to
    /// [`STRING_VIEW_MAX_BLOCK_SIZE`], and returns the new target.
    ///
    /// Once the target has reached the cap it is returned unchanged.
    fn next_block_size(&mut self) -> u32 {
        if self.block_size < STRING_VIEW_MAX_BLOCK_SIZE {
            // Clamp so the documented cap holds even when the initial size is
            // not a power-of-two fraction of the maximum.
            self.block_size = self
                .block_size
                .saturating_mul(2)
                .min(STRING_VIEW_MAX_BLOCK_SIZE);
        }
        self.block_size
    }

    /// See [`BulkNullStringArrayBuilder::append_value`].
    ///
    /// Values of at most 12 bytes are stored inline in the view; longer
    /// values are copied into the in-progress data block.
    ///
    /// # Panics
    ///
    /// Panics if the value length, the in-progress buffer offset, or the
    /// number of completed buffers exceeds `i32::MAX`. The ByteView spec
    /// uses signed 32-bit integers for these fields; exceeding `i32::MAX`
    /// would produce an array that does not round-trip through Arrow IPC
    /// (see <https://github.com/apache/arrow-rs/issues/6172>).
    #[inline]
    pub fn append_value(&mut self, value: &str) {
        let v = value.as_bytes();
        let length: u32 =
            i32::try_from(v.len()).expect("value length exceeds i32::MAX") as u32;
        if length <= 12 {
            self.views.push(make_view(v, 0, 0));
            return;
        }

        // No bytes of this row are in the block yet, so rotating is safe.
        self.ensure_long_capacity(length);

        let offset: u32 = i32::try_from(self.in_progress.len())
            .expect("offset exceeds i32::MAX") as u32;
        self.in_progress.extend_from_slice(v);
        self.views.push(self.make_long_view(length, offset, v));
    }

    /// See [`BulkNullStringArrayBuilder::append_placeholder`].
    ///
    /// Pushes an all-zero view and counts the row so that `finish` can
    /// sanity-check the null buffer in debug builds.
    #[inline]
    pub fn append_placeholder(&mut self) {
        // Zero-length inline view — `length` field is 0, no buffer ref.
        self.views.push(0);
        self.placeholder_count += 1;
    }

    /// Ensure the in-progress block has room for `length` more bytes,
    /// flushing the current block and starting a new (doubled) one if not.
    /// Caller must invoke this only when no bytes of the current row are
    /// yet in `in_progress` — flushing mid-row would orphan partial data.
    #[inline]
    fn ensure_long_capacity(&mut self, length: u32) {
        let required_cap = self.in_progress.len() + length as usize;
        if self.in_progress.capacity() < required_cap {
            self.flush_in_progress();
            // A row larger than the block target gets a block of its own size.
            let to_reserve = (length as usize).max(self.next_block_size() as usize);
            self.in_progress.reserve(to_reserve);
        }
    }

    /// Encode a long-form view referencing `length` bytes already written
    /// into the in-progress block at `offset`. `prefix_bytes` is the row's
    /// data slice (or any slice starting with the row's first 4 bytes).
    ///
    /// Built inline rather than going through Arrow's `make_view`: that
    /// function is `[inline(never)]` and has to handle short strings, so
    /// building the view here ourselves is faster.
    ///
    /// # Panics
    ///
    /// Panics if the number of completed buffers exceeds `i32::MAX`, or if
    /// `prefix_bytes` is shorter than 4 bytes.
    #[inline]
    fn make_long_view(&self, length: u32, offset: u32, prefix_bytes: &[u8]) -> u128 {
        // The in-progress block lands at index `completed.len()` when flushed.
        let buffer_index: u32 = i32::try_from(self.completed.len())
            .expect("buffer count exceeds i32::MAX")
            as u32;
        ByteView {
            length,
            // length > 12, so prefix_bytes has at least 4 bytes.
            prefix: u32::from_le_bytes(prefix_bytes[..4].try_into().unwrap()),
            buffer_index,
            offset,
        }
        .into()
    }

    /// See [`BulkNullStringArrayBuilder::append_byte_map`].
    ///
    /// # Safety
    ///
    /// The bytes produced by applying `map` to each byte of `src`, in order,
    /// must form valid UTF-8.
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`Self::append_value`]: if
    /// `src.len()`, the in-progress buffer offset, or the number of completed
    /// buffers exceeds `i32::MAX`.
    #[inline]
    pub unsafe fn append_byte_map<F: FnMut(u8) -> u8>(&mut self, src: &[u8], mut map: F) {
        let length: u32 =
            i32::try_from(src.len()).expect("value length exceeds i32::MAX") as u32;
        if length <= 12 {
            // Map into a stack buffer so the row never touches a data block.
            let mut bytes = [0u8; 12];
            for (d, &b) in bytes[..src.len()].iter_mut().zip(src) {
                *d = map(b);
            }
            self.views.push(make_view(&bytes[..src.len()], 0, 0));
            return;
        }

        self.ensure_long_capacity(length);

        let cursor = self.in_progress.len();
        let offset: u32 = i32::try_from(cursor).expect("offset exceeds i32::MAX") as u32;
        self.in_progress.extend(src.iter().map(|&b| map(b)));
        // The prefix is read back from the mapped bytes, not from `src`.
        self.views
            .push(self.make_long_view(length, offset, &self.in_progress[cursor..]));
    }

    /// See [`BulkNullStringArrayBuilder::append_with`].
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`Self::append_value`]: if the
    /// row's byte length, the in-progress buffer offset, or the number of
    /// completed buffers exceeds `i32::MAX`.
    #[inline]
    pub fn append_with<F>(&mut self, f: F)
    where
        F: FnOnce(&mut StringViewWriter<'_>),
    {
        let mut writer = StringViewWriter {
            inline_buf: [0u8; 12],
            inline_len: 0,
            spill_cursor: None,
            builder: self,
        };
        f(&mut writer);
        // Destructure to release the borrow on `self` and pull out the
        // inline-buffer state by-value. Copy types only; the &mut self is
        // dropped here, ending the borrow.
        let StringViewWriter {
            inline_buf,
            inline_len,
            spill_cursor,
            ..
        } = writer;

        match spill_cursor {
            // The row never exceeded 12 bytes: build an inline view.
            None => {
                self.views
                    .push(make_view(&inline_buf[..inline_len as usize], 0, 0));
            }
            // The row spilled: its bytes are `in_progress[start..]`.
            Some(start) => {
                let end = self.in_progress.len();
                let length: u32 = i32::try_from(end - start)
                    .expect("value length exceeds i32::MAX")
                    as u32;
                let offset: u32 =
                    i32::try_from(start).expect("offset exceeds i32::MAX") as u32;
                self.views.push(self.make_long_view(
                    length,
                    offset,
                    &self.in_progress[start..],
                ));
            }
        }
    }

    /// Moves a non-empty in-progress block into `completed`, leaving an
    /// empty `Vec` in its place. Does nothing when the block is empty.
    fn flush_in_progress(&mut self) {
        if !self.in_progress.is_empty() {
            let block = std::mem::take(&mut self.in_progress);
            self.completed.push(Buffer::from_vec(block));
        }
    }

    /// Finalize into a [`StringViewArray`] using the caller-supplied null
    /// buffer.
    ///
    /// In debug builds this also asserts that the null buffer contains at
    /// least as many nulls as placeholder rows were appended.
    ///
    /// # Errors
    ///
    /// Returns an error when `null_buffer.len()` does not match the number of
    /// appended rows.
    pub fn finish(mut self, null_buffer: Option<NullBuffer>) -> Result<StringViewArray> {
        if let Some(ref n) = null_buffer
            && n.len() != self.views.len()
        {
            return internal_err!(
                "Null buffer length ({}) must match row count ({})",
                n.len(),
                self.views.len()
            );
        }
        let null_count = null_buffer.as_ref().map_or(0, |n| n.null_count());
        debug_assert!(
            null_count >= self.placeholder_count,
            "{} placeholder rows but null buffer has {null_count} nulls",
            self.placeholder_count,
        );
        self.flush_in_progress();
        // SAFETY: every long-string view references bytes we wrote ourselves
        // into `self.completed`, with prefixes derived from those same bytes.
        // Inline views were built from valid `&str`. Placeholder views are
        // zero-length with no buffer reference.
        let array = unsafe {
            StringViewArray::new_unchecked(
                ScalarBuffer::from(self.views),
                self.completed,
                null_buffer,
            )
        };
        Ok(array)
    }
}

/// [`StringWriter`] for [`StringViewArrayBuilder`].
///
/// The writer accumulates the first up-to-12 bytes of a row in a stack
/// buffer; if the row stays inline-sized, it never touches the data block.
/// On the first write that would exceed 12 bytes, the stack buffer is
/// spilled into the builder's in-progress block and subsequent writes go
/// directly there.
pub(crate) struct StringViewWriter<'a> {
    /// Stack storage for the row's bytes while the row is at most 12 bytes.
    inline_buf: [u8; 12],
    /// Number of bytes of `inline_buf` written so far (never more than 12).
    inline_len: u8,
    /// `None` while the row fits inline; becomes `Some(start)` (offset of
    /// the row's first byte in `in_progress`) at first spill.
    spill_cursor: Option<usize>,
    /// Builder whose in-progress block receives the row once it spills.
    builder: &'a mut StringViewArrayBuilder,
}

impl StringViewWriter<'_> {
    /// Moves the row out of the inline stack buffer and into the builder's
    /// in-progress block, ahead of a write that brings the row to `row_len`
    /// bytes in total.
    ///
    /// `ensure_long_capacity` may flush the current block, which is safe
    /// because no row-data for this row is in it yet — the inline prefix is
    /// still in `inline_buf` when it runs.
    ///
    /// # Panics
    ///
    /// Panics if `row_len` exceeds `i32::MAX`. Checking here (instead of
    /// casting with `as`) avoids silently truncating the reservation size
    /// and fails before the oversized row is copied.
    #[inline]
    fn spill_inline_prefix(&mut self, row_len: usize) {
        let row_len: u32 =
            i32::try_from(row_len).expect("value length exceeds i32::MAX") as u32;
        self.builder.ensure_long_capacity(row_len);
        let cursor = self.builder.in_progress.len();
        self.builder
            .in_progress
            .extend_from_slice(&self.inline_buf[..self.inline_len as usize]);
        self.spill_cursor = Some(cursor);
    }
}

impl StringWriter for StringViewWriter<'_> {
    /// Appends `s` to the current row, spilling from the inline buffer to
    /// the builder's data block the first time the row would exceed 12 bytes.
    #[inline]
    fn write_str(&mut self, s: &str) {
        let bytes = s.as_bytes();
        if self.spill_cursor.is_none() {
            let inline_len = self.inline_len as usize;
            let new_len = inline_len + bytes.len();
            if new_len <= 12 {
                self.inline_buf[inline_len..new_len].copy_from_slice(bytes);
                self.inline_len = new_len as u8;
                return;
            }
            // First spill of this row.
            self.spill_inline_prefix(new_len);
        }

        // The row lives in the data block now; append directly.
        self.builder.in_progress.extend_from_slice(bytes);
    }

    /// Appends the UTF-8 encoding of `c` to the current row, with the same
    /// inline/spill behavior as [`write_str`](Self::write_str).
    #[inline]
    fn write_char(&mut self, c: char) {
        if self.spill_cursor.is_none() {
            let inline_len = self.inline_len as usize;
            let new_len = inline_len + c.len_utf8();
            if new_len <= 12 {
                c.encode_utf8(&mut self.inline_buf[inline_len..new_len]);
                self.inline_len = new_len as u8;
                return;
            }
            // First spill of this row.
            self.spill_inline_prefix(new_len);
        }

        push_char_to_vec(&mut self.builder.in_progress, c);
    }
}

/// Appends the UTF-8 encoding of `c` to `v`.
///
/// The character is first encoded into a 4-byte stack scratch buffer (the
/// maximum UTF-8 width) and the encoded bytes are then copied onto the `Vec`.
#[inline]
fn push_char_to_vec(v: &mut Vec<u8>, c: char) {
    let mut scratch = [0u8; 4];
    let encoded: &str = c.encode_utf8(&mut scratch);
    v.extend_from_slice(encoded.as_bytes());
}

/// Trait abstracting over the bulk-NULL string array builders.
///
/// Similar to Arrow's `StringLikeArrayBuilder`, this allows generic dispatch
/// over the three string array types (Utf8, LargeUtf8, Utf8View) when the
/// function body is uniform across them.
///
/// Three methods append a non-null row; which method to pick depends on how the
/// row is produced:
///
/// - [`append_value`](Self::append_value) pushes an already-finished `&str`.
///   Use it when the row is forwarded from an existing slice (e.g. an input
///   column) — there is nothing to elide.
/// - [`append_byte_map`](Self::append_byte_map) emits a row whose bytes are a
///   byte-to-byte mapping of an input slice. Output length is known up front
///   and the inner loop is straight-line, so this is the fastest path when the
///   shape fits.
/// - [`append_with`](Self::append_with) emits a row by feeding fragments to a
///   [`StringWriter`]. Use it when the row is computed from multiple sources or
///   when the output length is not known up front. Bytes are written directly
///   into the builder, so it is typically faster than assembling a `String` and
///   calling `append_value(&scratch)`.
///
/// For a NULL row, call [`append_placeholder`](Self::append_placeholder) to
/// advance the row count without writing into the value buffer; the caller MUST
/// clear the corresponding bit in the null buffer passed to
/// [`finish`](Self::finish).
pub(crate) trait BulkNullStringArrayBuilder {
    /// Per-builder concrete writer type, exposed as a GAT so generic callers
    /// can use the inherent (non-`dyn`) writer methods without vtable
    /// dispatch.
    type Writer<'a>: StringWriter
    where
        Self: 'a;

    /// Append `value` as the next row.
    ///
    /// # Panics
    ///
    /// Panics if the resulting array would exceed the per-implementation
    /// size limit. See the inherent method on each builder for specifics.
    fn append_value(&mut self, value: &str);

    /// Append an empty placeholder row. The corresponding slot MUST be masked
    /// as null by the null buffer passed to [`finish`](Self::finish).
    fn append_placeholder(&mut self);

    /// Append a row whose bytes are produced by `f` calling write methods on
    /// the supplied [`StringWriter`].
    ///
    /// The closure can call `write_str` or `write_char` on the supplied
    /// `StringWriter` zero or more times. Zero calls produces a row containing
    /// the empty string.
    ///
    /// # Panics
    ///
    /// See [`append_value`](Self::append_value).
    fn append_with<F>(&mut self, f: F)
    where
        F: for<'a> FnOnce(&mut Self::Writer<'a>);

    /// Append a row whose bytes are produced by mapping each byte of `src`
    /// through `map`, in order. Output length equals `src.len()`.
    ///
    /// Because the output length is known up front and the inner loop is
    /// straight-line, this is more efficient than
    /// [`append_with`](Self::append_with) for byte-to-byte mappings and
    /// autovectorizes well.
    ///
    /// # Safety
    ///
    /// The bytes produced by applying `map` to each byte of `src`, in order,
    /// must form valid UTF-8.
    ///
    /// # Panics
    ///
    /// See [`append_value`](Self::append_value).
    unsafe fn append_byte_map<F: FnMut(u8) -> u8>(&mut self, src: &[u8], map: F);

    /// Finalize into a concrete array, type-erased as an [`ArrayRef`], using
    /// the caller-supplied null buffer `nulls`.
    ///
    /// Every row added with [`append_placeholder`](Self::append_placeholder)
    /// must be marked null in `nulls`.
    ///
    /// # Errors
    ///
    /// Returns an error when `nulls.len()` does not match the number of
    /// appended rows.
    fn finish(self, nulls: Option<NullBuffer>) -> Result<ArrayRef>;
}

/// [`BulkNullStringArrayBuilder`] for the offset-based builder, generic over
/// the offset width `O`.
///
/// Every method forwards to the same-named associated function on
/// `GenericStringArrayBuilder<O>` through a fully qualified path.
/// NOTE(review): those targets are defined outside this view; presumably they
/// are the builder's inherent methods, which the qualified path selects in
/// preference to this trait impl — confirm.
impl<O: OffsetSizeTrait> BulkNullStringArrayBuilder for GenericStringArrayBuilder<O> {
    type Writer<'a> = GenericStringWriter<'a>;

    #[inline]
    fn append_value(&mut self, value: &str) {
        GenericStringArrayBuilder::<O>::append_value(self, value)
    }
    #[inline]
    fn append_placeholder(&mut self) {
        GenericStringArrayBuilder::<O>::append_placeholder(self)
    }
    #[inline]
    fn append_with<F>(&mut self, f: F)
    where
        F: for<'a> FnOnce(&mut Self::Writer<'a>),
    {
        GenericStringArrayBuilder::<O>::append_with(self, f)
    }
    #[inline]
    unsafe fn append_byte_map<F: FnMut(u8) -> u8>(&mut self, src: &[u8], map: F) {
        // SAFETY: contract forwarded — the caller of this trait method has
        // promised that the mapped bytes form valid UTF-8.
        unsafe { GenericStringArrayBuilder::<O>::append_byte_map(self, src, map) }
    }
    /// Finishes the builder and wraps the resulting array in an `Arc` so it
    /// can be returned as an [`ArrayRef`].
    fn finish(self, nulls: Option<NullBuffer>) -> Result<ArrayRef> {
        Ok(Arc::new(GenericStringArrayBuilder::<O>::finish(
            self, nulls,
        )?))
    }
}

/// [`BulkNullStringArrayBuilder`] for the view-based builder.
///
/// Every method forwards to the inherent method of the same name on
/// [`StringViewArrayBuilder`]. The calls are spelled with the type name
/// (`StringViewArrayBuilder::method(self, ..)`); inherent associated
/// functions take precedence over trait ones in such paths, so these do not
/// recurse into the trait impl.
impl BulkNullStringArrayBuilder for StringViewArrayBuilder {
    type Writer<'a> = StringViewWriter<'a>;

    #[inline]
    fn append_value(&mut self, value: &str) {
        StringViewArrayBuilder::append_value(self, value)
    }
    #[inline]
    fn append_placeholder(&mut self) {
        StringViewArrayBuilder::append_placeholder(self)
    }
    #[inline]
    fn append_with<F>(&mut self, f: F)
    where
        F: for<'a> FnOnce(&mut Self::Writer<'a>),
    {
        StringViewArrayBuilder::append_with(self, f)
    }
    #[inline]
    unsafe fn append_byte_map<F: FnMut(u8) -> u8>(&mut self, src: &[u8], map: F) {
        // SAFETY: contract forwarded — the caller of this trait method has
        // promised that the mapped bytes form valid UTF-8.
        unsafe { StringViewArrayBuilder::append_byte_map(self, src, map) }
    }
    /// Finishes the builder and wraps the resulting [`StringViewArray`] in an
    /// `Arc` so it can be returned as an [`ArrayRef`].
    fn finish(self, nulls: Option<NullBuffer>) -> Result<ArrayRef> {
        Ok(Arc::new(StringViewArrayBuilder::finish(self, nulls)?))
    }
}

/// Push a view for `substr` onto `views_buffer`.
///
/// A substring longer than 12 bytes reuses the data buffer of
/// `original_view`: the new view keeps the original `buffer_index` and
/// points at `original.offset + start_offset`. A substring of 12 bytes or
/// fewer is encoded inline and references no buffer.
///
/// Callers are responsible for their own null tracking.
///
/// # Requirements
///
/// This function is safe to call, but the result is only meaningful when
/// `original_view` is a valid view (the format described on
/// [`GenericByteViewArray`](arrow::array::GenericByteViewArray)) and `substr`
/// really is a substring of the value that view describes.
///
/// # Arguments
/// - views_buffer: The buffer to append the new view to
/// - original_view: The original view value
/// - substr: The substring to append. Must be a valid substring of the original view
/// - start_offset: The start offset of the substring in the view
///
/// LLVM is apparently overly eager to inline this function into some hot loops,
/// which bloats them and regresses performance, so we disable inlining for now.
#[inline(never)]
pub(crate) fn append_view(
    views_buffer: &mut Vec<u128>,
    original_view: &u128,
    substr: &str,
    start_offset: u32,
) {
    let bytes = substr.as_bytes();
    // Only long substrings need a buffer location; inline views use (0, 0).
    let (buffer_index, offset) = if bytes.len() > 12 {
        let parent = ByteView::from(*original_view);
        (parent.buffer_index, parent.offset + start_offset)
    } else {
        (0, 0)
    };
    views_buffer.push(make_view(bytes, buffer_index, offset));
}

/// Borrowed view of one string-like or binary argument: either a scalar's
/// raw bytes or a reference to an Arrow array.
///
/// Each array type comes in a `Nullable*` and a `NonNullable*` flavor. For
/// the `NonNullable*` variants (and `Scalar`), `is_valid` returns `true` and
/// `nulls` returns `None` without consulting the array; the `Nullable*`
/// variants defer to the array.
#[derive(Debug)]
pub(crate) enum ColumnarValueRef<'a> {
    /// A scalar value, held as a byte slice.
    Scalar(&'a [u8]),
    /// A [`StringArray`] whose validity must be checked per row.
    NullableArray(&'a StringArray),
    /// A [`StringArray`] treated as containing no nulls.
    NonNullableArray(&'a StringArray),
    /// A [`LargeStringArray`] whose validity must be checked per row.
    NullableLargeStringArray(&'a LargeStringArray),
    /// A [`LargeStringArray`] treated as containing no nulls.
    NonNullableLargeStringArray(&'a LargeStringArray),
    /// A [`StringViewArray`] whose validity must be checked per row.
    NullableStringViewArray(&'a StringViewArray),
    /// A [`StringViewArray`] treated as containing no nulls.
    NonNullableStringViewArray(&'a StringViewArray),
    /// A [`BinaryArray`] whose validity must be checked per row.
    NullableBinaryArray(&'a BinaryArray),
    /// A [`BinaryArray`] treated as containing no nulls.
    NonNullableBinaryArray(&'a BinaryArray),
}

impl ColumnarValueRef<'_> {
    /// Reports whether row `i` holds a non-null value.
    ///
    /// The `Nullable*` variants ask the underlying array; `Scalar` and the
    /// `NonNullable*` variants answer `true` without looking at `i`.
    #[inline]
    pub fn is_valid(&self, i: usize) -> bool {
        match self {
            Self::NullableArray(strings) => strings.is_valid(i),
            Self::NullableLargeStringArray(strings) => strings.is_valid(i),
            Self::NullableStringViewArray(views) => views.is_valid(i),
            Self::NullableBinaryArray(binary) => binary.is_valid(i),
            Self::Scalar(_)
            | Self::NonNullableArray(_)
            | Self::NonNullableLargeStringArray(_)
            | Self::NonNullableStringViewArray(_)
            | Self::NonNullableBinaryArray(_) => true,
        }
    }

    /// Returns a clone of the underlying array's null buffer, if any.
    ///
    /// `Scalar` and the `NonNullable*` variants always yield `None`; the
    /// `Nullable*` variants yield whatever the array's `nulls()` reports.
    #[inline]
    pub fn nulls(&self) -> Option<NullBuffer> {
        let borrowed: Option<&NullBuffer> = match self {
            Self::NullableArray(strings) => strings.nulls(),
            Self::NullableLargeStringArray(strings) => strings.nulls(),
            Self::NullableStringViewArray(views) => views.nulls(),
            Self::NullableBinaryArray(binary) => binary.nulls(),
            Self::Scalar(_)
            | Self::NonNullableArray(_)
            | Self::NonNullableLargeStringArray(_)
            | Self::NonNullableStringViewArray(_)
            | Self::NonNullableBinaryArray(_) => None,
        };
        borrowed.cloned()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Run `scenario` against `builder`, finish with a null buffer derived
    /// from `expected` (a bit is set wherever `expected[i].is_some()`), and
    /// assert the resulting array equals the corresponding
    /// `*Array::from(expected)`.
    ///
    /// The caller is responsible for driving NULLs in `scenario` — usually
    /// by calling `append_placeholder` at each index where `expected[i]` is
    /// `None`.
    fn run_scenario<B, F>(mut builder: B, expected: &[Option<&str>], scenario: F)
    where
        B: BulkNullStringArrayBuilder,
        F: FnOnce(&mut B),
    {
        scenario(&mut builder);
        let bits: Vec<bool> = expected.iter().map(|x| x.is_some()).collect();
        // Only pass a null buffer when at least one row is expected to be null.
        let nulls = if bits.iter().any(|v| !v) {
            Some(NullBuffer::from(bits))
        } else {
            None
        };
        let array = builder.finish(nulls).unwrap();
        let owned: Vec<Option<&str>> = expected.to_vec();
        // `finish` returns a type-erased array; compare against the matching
        // concrete array type built from `expected`.
        if let Some(a) = array.as_any().downcast_ref::<StringArray>() {
            assert_eq!(a, &StringArray::from(owned));
        } else if let Some(a) = array.as_any().downcast_ref::<LargeStringArray>() {
            assert_eq!(a, &LargeStringArray::from(owned));
        } else if let Some(a) = array.as_any().downcast_ref::<StringViewArray>() {
            assert_eq!(a, &StringViewArray::from(owned));
        } else {
            panic!("unexpected array type");
        }
    }

    /// Run `$scenario` against all three bulk-null builders, asserting each
    /// produces an array equivalent to `$expected`. `$scenario` is a closure
    /// `|builder| { ... }`; it is duplicated syntactically at each call site
    /// so the `BulkNullStringArrayBuilder::Writer` GAT can specialize per
    /// builder.
    macro_rules! check_on_all_builders {
        ($expected:expr, $scenario:expr $(,)?) => {{
            let expected = $expected;
            run_scenario(
                GenericStringArrayBuilder::<i32>::with_capacity(0, 0),
                expected,
                $scenario,
            );
            run_scenario(
                GenericStringArrayBuilder::<i64>::with_capacity(0, 0),
                expected,
                $scenario,
            );
            run_scenario(
                StringViewArrayBuilder::with_capacity(0),
                expected,
                $scenario,
            );
        }};
    }

    /// Append two rows to `builder`, then assert that `finish` returns an
    /// error when handed a null buffer with three entries.
    fn assert_finish_errs_on_length_mismatch<B>(mut builder: B)
    where
        B: BulkNullStringArrayBuilder,
    {
        builder.append_value("a");
        builder.append_value("b");
        let nulls = NullBuffer::from(vec![true, false, true]);
        assert!(builder.finish(Some(nulls)).is_err());
    }

    /// Requesting `usize::MAX` capacities must panic with a
    /// "capacity integer overflow" message.
    #[test]
    #[should_panic(expected = "capacity integer overflow")]
    fn test_overflow_concat_string_builder() {
        let _builder = ConcatStringBuilder::with_capacity(usize::MAX, usize::MAX);
    }

    /// Same overflow check as above, for the large-string variant.
    #[test]
    #[should_panic(expected = "capacity integer overflow")]
    fn test_overflow_concat_large_string_builder() {
        let _builder = ConcatLargeStringBuilder::with_capacity(usize::MAX, usize::MAX);
    }

    /// `append_value` interleaved with placeholders, covering both a row
    /// longer than 12 bytes and a short one.
    #[test]
    fn bulk_append_value_with_nulls() {
        check_on_all_builders!(
            &[
                Some("a string longer than twelve bytes"),
                None,
                Some("short"),
                None,
            ],
            |b| {
                b.append_value("a string longer than twelve bytes");
                b.append_placeholder();
                b.append_value("short");
                b.append_placeholder();
            },
        );
    }

    /// Finishing a builder with no rows yields an empty array.
    #[test]
    fn bulk_empty_builder() {
        check_on_all_builders!(&[], |_b| {});
    }

    /// A builder fed only placeholders yields an all-null array.
    #[test]
    fn bulk_all_placeholders() {
        check_on_all_builders!(&[None, None, None], |b| {
            b.append_placeholder();
            b.append_placeholder();
            b.append_placeholder();
        });
    }

    /// `append_value` without any nulls: short, empty and long rows.
    #[test]
    fn bulk_append_value_no_nulls() {
        check_on_all_builders!(
            &[
                Some("foo"),
                Some(""),
                Some("a string longer than twelve bytes")
            ],
            |b| {
                b.append_value("foo");
                b.append_value("");
                b.append_value("a string longer than twelve bytes");
            },
        );
    }

    /// `append_with` using `write_str`: single write, multiple writes, a long
    /// row, and a closure that writes nothing (empty string).
    #[test]
    fn bulk_append_with() {
        check_on_all_builders!(
            &[
                Some("hello"),
                None,
                Some("hello world"),
                Some("a long string of 25 bytes"),
                Some(""),
            ],
            |b| {
                b.append_with(|w| w.write_str("hello"));
                b.append_placeholder();
                b.append_with(|w| {
                    w.write_str("hello ");
                    w.write_str("world");
                });
                b.append_with(|w| w.write_str("a long string of 25 bytes"));
                b.append_with(|_w| {});
            },
        );
    }

    /// `append_with` using `write_char`, including a multi-byte character.
    #[test]
    fn bulk_append_with_chars() {
        check_on_all_builders!(&[Some("hé!"), Some("x")], |b| {
            b.append_with(|w| {
                w.write_char('h');
                w.write_char('é');
                w.write_char('!');
            });
            b.append_with(|w| w.write_char('x'));
        });
    }

    /// `append_byte_map` with an uppercase map, a selective replacement and
    /// an empty source slice.
    #[test]
    fn bulk_append_byte_map() {
        // SAFETY: ASCII inputs and ASCII outputs in every call.
        check_on_all_builders!(&[Some("HELLO"), Some("aXcaX"), Some("")], |b| unsafe {
            b.append_byte_map(b"hello", |x| x.to_ascii_uppercase());
            b.append_byte_map(b"abcab", |x| if x == b'b' { b'X' } else { x });
            b.append_byte_map(b"", |x| x);
        },);
    }

    /// Every builder must reject a null buffer whose length differs from the
    /// number of appended rows.
    #[test]
    fn bulk_finish_errors_on_null_buffer_length_mismatch() {
        assert_finish_errs_on_length_mismatch(
            GenericStringArrayBuilder::<i32>::with_capacity(2, 4),
        );
        assert_finish_errs_on_length_mismatch(
            GenericStringArrayBuilder::<i64>::with_capacity(2, 4),
        );
        assert_finish_errs_on_length_mismatch(StringViewArrayBuilder::with_capacity(2));
    }

    /// Debug builds only: a placeholder row that the null buffer marks as
    /// valid must trip the builder's placeholder assertion.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic(expected = "placeholder rows")]
    fn string_array_builder_placeholder_without_null_mask() {
        let mut builder = GenericStringArrayBuilder::<i32>::with_capacity(2, 4);
        builder.append_value("a");
        builder.append_placeholder();
        // Slot 1 is a placeholder but the null buffer doesn't mark it null.
        let nulls = NullBuffer::from(vec![true, true]);
        let _ = builder.finish(Some(nulls));
    }

    /// Debug builds only: a placeholder row with no null buffer at all must
    /// trip the builder's placeholder assertion.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic(expected = "placeholder rows")]
    fn string_array_builder_placeholder_with_none_null_buffer() {
        let mut builder = GenericStringArrayBuilder::<i32>::with_capacity(1, 4);
        builder.append_placeholder();
        let _ = builder.finish(None);
    }

    /// View-builder counterpart of
    /// `string_array_builder_placeholder_without_null_mask`.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic(expected = "placeholder rows")]
    fn string_view_array_builder_placeholder_without_null_mask() {
        let mut builder = StringViewArrayBuilder::with_capacity(2);
        builder.append_value("a");
        builder.append_placeholder();
        // Slot 1 is a placeholder but the null buffer doesn't mark it null.
        let nulls = NullBuffer::from(vec![true, true]);
        let _ = builder.finish(Some(nulls));
    }

    /// View-builder counterpart of
    /// `string_array_builder_placeholder_with_none_null_buffer`.
    #[test]
    #[cfg(debug_assertions)]
    #[should_panic(expected = "placeholder rows")]
    fn string_view_array_builder_placeholder_with_none_null_buffer() {
        let mut builder = StringViewArrayBuilder::with_capacity(1);
        builder.append_placeholder();
        let _ = builder.finish(None);
    }

    #[test]
    fn string_view_array_builder_append_with_inline() {
        // Rows that stay ≤ 12 bytes never touch the data block.
        let mut builder = StringViewArrayBuilder::with_capacity(4);
        let inputs = ["hello", "world!", "", "0123456789ab"];
        for s in &inputs {
            builder.append_with(|w| w.write_str(s));
        }
        let array = builder.finish(None).unwrap();
        assert_eq!(array.len(), inputs.len());
        for (i, s) in inputs.iter().enumerate() {
            assert_eq!(array.value(i), *s);
        }
        assert_eq!(array.data_buffers().len(), 0);
    }

    /// `append_byte_map` on the view builder: inline rows, one long row, and
    /// a row exactly at the 12-byte inline boundary. Only the long row should
    /// land in a data buffer.
    #[test]
    fn string_view_array_builder_append_byte_map() {
        let mut builder = StringViewArrayBuilder::with_capacity(4);
        // SAFETY: ASCII inputs and ASCII outputs in every call.
        unsafe {
            builder.append_byte_map(b"hello", |b| b.to_ascii_uppercase());
            builder.append_byte_map(b"a long string of 25 bytes", |b| {
                if b == b' ' { b'_' } else { b }
            });
            // 12 bytes — exactly at the inline boundary.
            builder.append_byte_map(b"abcdefghijkl", |b| b);
            builder.append_byte_map(b"", |b| b);
        }
        let array = builder.finish(None).unwrap();
        assert_eq!(array.value(0), "HELLO");
        assert_eq!(array.value(1), "a_long_string_of_25_bytes");
        assert_eq!(array.value(2), "abcdefghijkl");
        assert_eq!(array.value(3), "");
        assert_eq!(array.data_buffers().len(), 1);
        assert_eq!(array.data_buffers()[0].len(), 25);
    }

    #[test]
    fn string_view_array_builder_append_with_at_inline_boundary() {
        // Building exactly 12 bytes via several writes should still go inline.
        let mut builder = StringViewArrayBuilder::with_capacity(2);
        builder.append_with(|w| {
            w.write_str("hello");
            w.write_str(" world!");
        });
        builder.append_with(|w| {
            for _ in 0..6 {
                w.write_str("ab");
            }
        });
        let array = builder.finish(None).unwrap();
        assert_eq!(array.value(0), "hello world!");
        assert_eq!(array.value(1), "abababababab");
        assert_eq!(array.data_buffers().len(), 0);
    }

    #[test]
    fn string_view_array_builder_append_with_spill_on_overflow() {
        // 12 bytes from one write, +1 byte from another → spill at boundary.
        let mut builder = StringViewArrayBuilder::with_capacity(1);
        builder.append_with(|w| {
            w.write_str("hello world!");
            w.write_str("X");
        });
        let array = builder.finish(None).unwrap();
        assert_eq!(array.value(0), "hello world!X");
        assert_eq!(array.data_buffers().len(), 1);
        assert_eq!(array.data_buffers()[0].len(), 13);
    }

    #[test]
    fn string_view_array_builder_append_with_long_single_write() {
        // A single write larger than 12 bytes spills immediately with an
        // empty inline_buf prefix.
        let mut builder = StringViewArrayBuilder::with_capacity(1);
        builder.append_with(|w| w.write_str("a long string of 25 bytes"));
        let array = builder.finish(None).unwrap();
        assert_eq!(array.value(0), "a long string of 25 bytes");
        assert_eq!(array.data_buffers().len(), 1);
        assert_eq!(array.data_buffers()[0].len(), 25);
    }

    #[test]
    fn string_view_array_builder_append_with_many_small_writes_spilling() {
        // 30 × "ab" (60 bytes total): first 6 fit inline, remainder spills.
        let mut builder = StringViewArrayBuilder::with_capacity(1);
        builder.append_with(|w| {
            for _ in 0..30 {
                w.write_str("ab");
            }
        });
        let array = builder.finish(None).unwrap();
        assert_eq!(array.value(0), "ab".repeat(30));
        assert_eq!(array.data_buffers().len(), 1);
        assert_eq!(array.data_buffers()[0].len(), 60);
    }

    #[test]
    fn string_view_array_builder_append_with_chars() {
        // write_char with multi-byte UTF-8: row 0 stays inline (3 bytes),
        // row 1 spills (40 bytes).
        let mut builder = StringViewArrayBuilder::with_capacity(2);
        builder.append_with(|w| {
            w.write_char('é');
            w.write_char('!');
        });
        builder.append_with(|w| {
            for _ in 0..10 {
                w.write_char('🦀');
            }
        });
        let array = builder.finish(None).unwrap();
        assert_eq!(array.value(0), "é!");
        assert_eq!(array.value(1), "🦀".repeat(10));
    }

    #[test]
    fn string_view_array_builder_append_with_block_rotation() {
        // 40 long rows, 500 bytes each, exceeds the first doubled block
        // (~16 KiB). Forces the builder to rotate blocks between rows.
        const STR_LEN: usize = 500;
        const N: usize = 40;
        let s = "x".repeat(STR_LEN);
        let mut builder = StringViewArrayBuilder::with_capacity(N);
        for _ in 0..N {
            builder.append_with(|w| w.write_str(&s));
        }
        let array = builder.finish(None).unwrap();
        assert_eq!(array.len(), N);
        assert!(
            array.data_buffers().len() >= 2,
            "expected multiple data buffers, got {}",
            array.data_buffers().len()
        );
        // No bytes may be lost or duplicated across the rotation.
        let total: usize = array.data_buffers().iter().map(|b| b.len()).sum();
        assert_eq!(total, N * STR_LEN);
        for i in 0..N {
            assert_eq!(array.value(i), s);
        }
    }

    #[test]
    fn string_view_array_builder_flushes_full_blocks() {
        // Each value is 300 bytes. The first data block is 2 × STRING_VIEW_INIT_BLOCK_SIZE
        // = 16 KiB, so ~50 values saturate it and the rest spill into additional
        // blocks.
        let value = "x".repeat(300);
        let mut builder = StringViewArrayBuilder::with_capacity(100);
        for _ in 0..100 {
            builder.append_value(&value);
        }
        let array = builder.finish(None).unwrap();
        assert_eq!(array.len(), 100);
        assert!(
            array.data_buffers().len() > 1,
            "expected multiple data buffers, got {}",
            array.data_buffers().len()
        );
        for i in 0..100 {
            assert_eq!(array.value(i), value);
        }
    }
}