1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
use lru::LruCache;
use once_cell::sync::Lazy;
use regex::Regex;
use std::fmt;
use std::num::NonZeroUsize;
use std::sync::{Arc, Mutex};
include!("utils.rs");
/// Global cache for compiled regex patterns to avoid recompilation.
///
/// Cache size is limited to `REGEX_CACHE_CAPACITY` entries using an LRU policy
/// to bound memory usage.
const REGEX_CACHE_CAPACITY: usize = 128;
pub(crate) static REGEX_CACHE: Lazy<Mutex<LruCache<String, Arc<Regex>>>> = Lazy::new(|| {
let cap = NonZeroUsize::new(REGEX_CACHE_CAPACITY).unwrap();
Mutex::new(LruCache::new(cap))
});
/// Get or compile a regex pattern, using a global cache for performance.
///
/// Mutex poisoning is ignored by taking ownership of the inner value,
/// allowing continued use of the existing cache.
pub(crate) fn get_or_compile_regex(pattern: &str) -> Result<Arc<Regex>, regex::Error> {
let mut cache = REGEX_CACHE.lock().unwrap_or_else(|e| e.into_inner());
if let Some(regex) = cache.get(pattern) {
return Ok(regex.clone());
}
let compiled = Arc::new(Regex::new(pattern)?);
cache.put(pattern.to_string(), compiled.clone());
Ok(compiled)
}
#[derive(Debug)]
pub enum SelectorError {
InvalidRegex {
pattern: String,
source: regex::Error,
},
InvalidSelector {
selector: String,
reason: String,
},
}
impl fmt::Display for SelectorError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SelectorError::InvalidRegex { pattern, source } => {
write!(f, "Invalid regex pattern '{}': {}", pattern, source)
}
SelectorError::InvalidSelector { selector, reason } => {
write!(f, "Invalid selector '{}': {}", selector, reason)
}
}
}
}
impl std::error::Error for SelectorError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
SelectorError::InvalidRegex { source, .. } => Some(source),
SelectorError::InvalidSelector { .. } => None,
}
}
}
/// Keep track of user column and row selections
#[derive(Debug, Clone)]
pub struct Selector {
/// Index of first row to grab (start of range) - can be negative for Python-style indexing
pub start_idx: i64,
/// Resolved start index (converted from negative to positive if needed)
pub resolved_start_idx: usize,
/// Regex of first to grab (start of range)
pub start_regex: Arc<regex::Regex>,
/// Index of last row to grab (end of range) - can be negative for Python-style indexing
pub end_idx: i64,
/// Resolved end index (converted from negative to positive if needed)
pub resolved_end_idx: usize,
/// Regex of last row to grab (end of range)
pub end_regex: Arc<regex::Regex>,
/// Step size between start and end of range
pub step: usize,
/// Keep track of when to stop adding rows from range to output
pub stopped: bool,
/// Track if indices have been resolved for a given collection length
pub indices_resolved: bool,
}
impl Selector {
/// Create a new default selector
///
/// # Returns
///
/// Returns `Ok(Selector)` with default values or `Err(SelectorError)` if the default regex cannot be compiled.
/// This should never fail in practice since we use a known-good regex pattern.
pub fn new() -> Result<Selector, SelectorError> {
let default_regex = r".^";
let start_regex =
get_or_compile_regex(default_regex).map_err(|e| SelectorError::InvalidRegex {
pattern: default_regex.to_string(),
source: e,
})?;
let end_regex =
get_or_compile_regex(default_regex).map_err(|e| SelectorError::InvalidRegex {
pattern: default_regex.to_string(),
source: e,
})?;
Ok(Selector {
start_idx: 0,
resolved_start_idx: 0,
start_regex,
end_idx: i64::MAX,
resolved_end_idx: usize::MAX,
end_regex,
step: 1,
stopped: false,
indices_resolved: false,
})
}
/// Resolve negative indices based on collection length (Python-style indexing)
///
/// # Arguments
///
/// * `collection_length` - The length of the collection being indexed
///
/// # Examples
///
/// * `-1` with length 5 becomes index 4 (last item)
/// * `-2` with length 5 becomes index 3 (second to last)
///
/// Negative end indices are treated as exclusive bounds, so an end index of
/// `-1` with length 5 resolves to `3`, excluding the last item.
pub fn resolve_indices(&mut self, collection_length: usize) {
if self.indices_resolved {
return;
}
// Resolve start index
self.resolved_start_idx = if self.start_idx < 0 {
let abs_idx = (-self.start_idx) as usize;
if abs_idx > collection_length {
0 // Out of bounds negative index, clamp to start
} else {
collection_length.saturating_sub(abs_idx)
}
} else if self.start_idx == i64::MAX {
usize::MAX // Keep as usize::MAX for regex-based selection
} else if self.start_idx == 0 {
0 // Handle the special case of 0 (keep as 0, don't convert)
} else {
(self.start_idx - 1) as usize // Convert 1-based to 0-based for positive indices
};
// Resolve end index
self.resolved_end_idx = if self.end_idx < 0 {
let abs_idx = (-self.end_idx) as usize;
if abs_idx > collection_length {
self.resolved_start_idx = usize::MAX;
usize::MAX // Out of bounds negative index, yield no matches
} else {
let idx = collection_length.saturating_sub(abs_idx);
if self.start_idx != self.end_idx {
idx.saturating_sub(1)
} else {
idx
}
}
} else if self.end_idx == i64::MAX {
usize::MAX // Keep as usize::MAX for regex-based or unlimited selection
} else if self.end_idx == 0 {
0 // Handle the special case of 0 (keep as 0, don't convert)
} else {
(self.end_idx - 1) as usize // Convert 1-based to 0-based for positive indices
};
if self.resolved_start_idx > self.resolved_end_idx {
self.resolved_start_idx = usize::MAX;
self.resolved_end_idx = usize::MAX;
}
self.indices_resolved = true;
}
/// Reset resolution state - call when reusing selector for new collection
pub fn reset_resolution(&mut self) {
self.indices_resolved = false;
}
}
impl Default for Selector {
/// Defaults to implement a new selector without defining each field individually
///
/// # Panics
///
/// This will panic if the default regex pattern fails to compile, which should never happen.
/// For error handling, use `Selector::new()` instead.
fn default() -> Selector {
Selector::new().expect("Default selector regex should always compile")
}
}
impl PartialEq for Selector {
/// Enable checking the equality of two Selector structs
/// We do this by simply ensuring each field in the structs are equal
/// While this seems straight forward, it's necessary as `regex::Regex` does not have a
/// PartialEq implemented by default.
fn eq(&self, other: &Self) -> bool {
self.start_idx == other.start_idx
&& self.resolved_start_idx == other.resolved_start_idx
&& utils::regex_eq(&self.start_regex, &other.start_regex)
&& self.end_idx == other.end_idx
&& self.resolved_end_idx == other.resolved_end_idx
&& utils::regex_eq(&self.end_regex, &other.end_regex)
&& self.step == other.step
&& self.stopped == other.stopped
&& self.indices_resolved == other.indices_resolved
}
}
/// Parse either row or column selectors, turning Python-like list slicing syntax into vector of
/// Selector structs
///
/// # Errors
///
/// Returns `SelectorError::InvalidRegex` if any regex pattern fails to compile.
pub fn parse_selectors(selectors: &str) -> Result<Vec<Selector>, SelectorError> {
let mut sequences: Vec<Selector> = Vec::new();
// Iterate through selectors, which are separated by commas
for selector in selectors.split(",") {
let mut sequence = Selector::new()?;
let components: Vec<&str> = selector.split(":").collect();
// Handle completely empty selector (from consecutive commas like "1,,3")
if selector.is_empty() {
// Empty selector means select all - Python slice [:] semantics
sequence.start_idx = 0;
sequence.end_idx = i64::MAX;
sequence.step = 1;
sequences.push(sequence);
continue;
}
// Iterate through components in an individual selector, which are separated by colons
for (idx, component) in components.iter().enumerate() {
// Handle empty components with Python slice semantics
if component.is_empty() {
match idx {
0 => {
// Empty start component means start from beginning
sequence.start_idx = 0;
}
1 => {
// Empty end component means go to the end
sequence.end_idx = i64::MAX;
}
2 => {
// Empty step component means default step of 1
sequence.step = 1;
}
_ => {}
}
continue;
}
// Try to parse int from component. If we're successful, use that int as a start index,
// end index, or step. If parse() returns an error, use that component as a regex
// pattern to match to
let parsed_component = component.parse::<i64>();
match parsed_component {
Ok(_ok) => {
let raw_number = parsed_component.as_ref().unwrap();
match idx {
0 => {
// Store raw signed number - will be resolved later with collection length
sequence.start_idx = *raw_number;
// If this is the full selection, set this to the end index as well
if selector.matches(":").count() == 0 {
sequence.end_idx = *raw_number;
}
}
1 => {
// Store raw signed number - will be resolved later with collection length
sequence.end_idx = *raw_number;
}
2 => {
// Step value should NOT be negative and must be positive
if *raw_number <= 0 {
return Err(SelectorError::InvalidSelector {
selector: selector.to_string(),
reason:
"step size must be a positive integer greater than zero."
.to_string(),
});
}
sequence.step = *raw_number as usize;
}
_ => {
return Err(SelectorError::InvalidSelector {
selector: selector.to_string(),
reason: "A selector cannot be more than three components long"
.to_string(),
});
}
}
}
Err(_e) => {
let case_insensitive_regex = format!(r"(?i).*{}.*", &component);
match idx {
0 => {
sequence.start_regex = get_or_compile_regex(&case_insensitive_regex)
.map_err(|e| SelectorError::InvalidRegex {
pattern: case_insensitive_regex.clone(),
source: e,
})?;
// Set the start index to the i64 max to ensure it doesn't interfere
sequence.start_idx = i64::MAX;
// If this is the full selection, set this to the end regex as well
if selector.matches(":").count() == 0 {
sequence.end_regex = get_or_compile_regex(&case_insensitive_regex)
.map_err(|e| SelectorError::InvalidRegex {
pattern: case_insensitive_regex,
source: e,
})?;
}
}
1 => {
sequence.end_regex = get_or_compile_regex(&case_insensitive_regex)
.map_err(|e| SelectorError::InvalidRegex {
pattern: case_insensitive_regex,
source: e,
})?;
}
2 => {
return Err(SelectorError::InvalidSelector {
selector: selector.to_string(),
reason: "Step size must be an integer".to_string(),
});
}
_ => {
return Err(SelectorError::InvalidSelector {
selector: selector.to_string(),
reason: "A selector cannot be more than three components long"
.to_string(),
});
}
}
}
}
}
// Add parsed selector to vector
sequences.push(sequence);
}
// Return all selectors
Ok(sequences)
}
#[cfg(test)]
#[path = "selector_tests.rs"]
mod selector_tests;