1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
use crate::prelude::*;
use arrow::array::{ArrayRef, BooleanBufferBuilder};
use arrow::datatypes::ToByteSlice;
use arrow::{
array::{Array, ArrayData, LargeListArray, LargeStringArray},
buffer::Buffer,
};
use itertools::Itertools;
use std::convert::TryFrom;
/// Expands list offsets into one row index per exploded value.
///
/// The first entry of `offsets` is skipped; every following entry is treated as the
/// exclusive end of the row at that position, and that row's index is pushed until the
/// running value count reaches it. Afterwards the result is padded up to `capacity`
/// entries with the index that follows the last processed row (`offsets.len() - 1`, or
/// `0` when `offsets` is empty).
///
/// If the offsets describe more than `capacity` values, no padding is added and the
/// returned vector is longer than `capacity`. Previously that case underflowed the
/// `capacity - count` subtraction (panic in debug builds, a near-endless push loop in
/// release builds).
pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec<usize> {
    let mut idx = Vec::with_capacity(capacity);
    // Number of values emitted so far; always equal to `idx.len()`.
    let mut count: i64 = 0;

    for (row, &end) in offsets.iter().skip(1).enumerate() {
        while count < end {
            count += 1;
            idx.push(row);
        }
    }

    // Index assigned to every value beyond the last offset that was seen.
    let last_idx = offsets.len().saturating_sub(1);
    // Pad only when short of `capacity`; never shrink and never underflow.
    if idx.len() < capacity {
        idx.resize(capacity, last_idx);
    }
    idx
}
impl ChunkExplode for ListChunked {
    /// Returns the child values of the list array as a `Series` named after `self`,
    /// together with a slice of `self.len()` `i64` values read from the array's first
    /// buffer (the list offsets).
    ///
    /// # Errors
    ///
    /// Returns `PolarsError::NoData` when the rechunked array yields no chunk.
    ///
    /// # Panics
    ///
    /// Panics if the values array cannot be converted into a `Series`.
    fn explode_and_offsets(&self) -> Result<(Series, &[i64])> {
        let rechunked = self.rechunk();
        let list_array: &LargeListArray = rechunked
            .downcast_iter()
            .next()
            .ok_or_else(|| PolarsError::NoData("cannot explode empty list".into()))?;

        let array_data = list_array.data();
        let child_values = list_array.values();

        // NOTE(review): the returned slice borrows for as long as `&self`, but the pointer
        // is taken from buffers reached through the local `rechunked`. This presumably
        // relies on `rechunk` sharing its buffers with `self` — confirm for multi-chunk input.
        let n_offsets = self.len();
        let offsets_start = array_data.buffers()[0].as_ptr() as *const i64;
        let offsets = unsafe { std::slice::from_raw_parts(offsets_start, n_offsets) };

        let exploded = Series::try_from((self.name(), child_values)).unwrap();
        Ok((exploded, offsets))
    }
}
impl ChunkExplode for Utf8Chunked {
    /// Explodes the string array into one single-character string per `char`.
    ///
    /// The result is a `LargeUtf8` series that reuses the original value buffer and places
    /// an offset at every character boundary. The second tuple element is a slice of
    /// `self.len()` `i64` values read from the original array's first buffer (the row
    /// offsets, expressed in bytes).
    ///
    /// When the array contains nulls, each output character receives the validity of the
    /// row whose byte range contains it. Validity is resolved per character, not per byte,
    /// so multi-byte UTF-8 data no longer misaligns the bitmap or underflows the padding
    /// computation.
    ///
    /// # Errors
    ///
    /// Returns `PolarsError::NoData` when the rechunked array yields no chunk.
    ///
    /// # Panics
    ///
    /// Panics if the rebuilt array cannot be converted into a `Series`.
    fn explode_and_offsets(&self) -> Result<(Series, &[i64])> {
        let ca = self.rechunk();
        let stringarr: &LargeStringArray = ca
            .downcast_iter()
            .next()
            .ok_or_else(|| PolarsError::NoData("cannot explode empty str".into()))?;
        let list_data = stringarr.data();
        let str_values_buf = stringarr.value_data();

        // NOTE(review): the returned slice borrows for as long as `&self`, but the pointer
        // is taken from buffers reached through the local `ca`. This presumably relies on
        // `rechunk` sharing its buffers with `self` — confirm for multi-chunk input.
        let offset_ptr = list_data.buffers()[0].as_ptr() as *const i64;
        let offsets = unsafe { std::slice::from_raw_parts(offset_ptr, self.len()) };

        // NOTE(review): assumes the value buffer of a LargeUtf8 array is valid UTF-8 — confirm.
        let str_data = unsafe { std::str::from_utf8_unchecked(str_values_buf.as_slice()) };

        // One offset per character start, plus the closing offset at the end of the buffer.
        let mut new_offsets = str_data.char_indices().map(|t| t.0 as i64).collect_vec();
        new_offsets.push(str_data.len() as i64);
        let n_values = new_offsets.len() - 1;

        let mut builder = ArrayData::builder(ArrowDataType::LargeUtf8)
            .len(n_values)
            .add_buffer(Buffer::from(new_offsets.to_byte_slice()))
            .add_buffer(str_values_buf);

        if self.null_count() > 0 {
            let mut bitmap_builder = BooleanBufferBuilder::new(n_values);
            // Row whose byte range contains the character currently being visited.
            let mut row = 0;
            for &char_start in &new_offsets[..n_values] {
                // Move past every row (including empty ones) that starts at or before this
                // character. Only `self.len()` offsets are available, so everything past
                // the last known row start belongs to the last row.
                while offsets
                    .get(row + 1)
                    .map_or(false, |&next_start| next_start <= char_start)
                {
                    row += 1;
                }
                bitmap_builder.append(stringarr.is_valid(row));
            }
            builder = builder.null_bit_buffer(bitmap_builder.finish());
        }

        let arr_data = builder.build();
        let new_arr = Arc::new(LargeStringArray::from(arr_data)) as ArrayRef;
        let s = Series::try_from((self.name(), new_arr)).unwrap();
        Ok((s, offsets))
    }
}