use regex::Regex;
use smallvec::SmallVec;
mod debug;
mod parser;
mod template;
use dashmap::DashMap;
use fast_strip_ansi::strip_ansi_string;
use memchr::memchr_iter;
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::time::{Duration, Instant};
#[allow(deprecated)]
pub use crate::pipeline::template::{
MultiTemplate, RichFormatResult, SectionInfo, SectionType, Template, TemplateOutput,
};
pub use debug::DebugTracer;
/// Process-wide cache of compiled regexes, keyed by the exact pattern string
/// handed to `get_cached_regex`.
static REGEX_CACHE: Lazy<DashMap<String, Regex>> = Lazy::new(DashMap::new);
/// Key of `SPLIT_CACHE`: a 64-bit hash of the input text paired with the separator.
type SplitCacheKey = (u64, String);
/// Value of `SPLIT_CACHE`: the owned parts of a split, in order.
type SplitCacheValue = Vec<String>;
/// Process-wide memo of split results, read and filled by `get_cached_split`.
static SPLIT_CACHE: Lazy<DashMap<SplitCacheKey, SplitCacheValue>> = Lazy::new(DashMap::new);
/// Table of frequently used separators, mapping each one to an owned copy of
/// itself. Looked up by `get_interned_separator`.
static COMMON_SEPARATORS: Lazy<HashMap<&'static str, String>> = Lazy::new(|| {
    [" ", ",", "\n", "\t", ":", ";", "|", "-", "_", ""]
        .iter()
        .map(|&sep| (sep, sep.to_string()))
        .collect()
});
/// Returns an owned copy of `sep`, taken from `COMMON_SEPARATORS` when the
/// separator is listed there and built from `sep` directly otherwise.
fn get_interned_separator(sep: &str) -> String {
    match COMMON_SEPARATORS.get(sep) {
        Some(known) => known.clone(),
        None => sep.to_string(),
    }
}
/// Trims leading and trailing whitespace from an all-ASCII string.
///
/// Returns `None` when `s` contains any non-ASCII byte, so the caller can fall
/// back to a Unicode-aware trim. For ASCII input the result is identical to
/// `s.trim()`.
#[inline(always)]
fn ascii_trim(s: &str) -> Option<&str> {
    if !s.is_ascii() {
        return None;
    }
    // `char::is_ascii_whitespace` leaves out vertical tab (U+000B), which
    // `str::trim` does strip. Matching the whole U+0009..=U+000D range plus
    // space keeps this fast path in agreement with the Unicode fallback.
    Some(s.trim_matches(|c: char| matches!(c, '\t'..='\r' | ' ')))
}
/// Reverses an all-ASCII string byte by byte.
///
/// Returns `None` when `s` contains any non-ASCII byte, so the caller can fall
/// back to a character-wise reversal.
#[inline(always)]
fn ascii_reverse(s: &str) -> Option<String> {
    if !s.is_ascii() {
        return None;
    }
    let reversed: Vec<u8> = s.bytes().rev().collect();
    // SAFETY: every byte of an ASCII string is a complete UTF-8 scalar on its
    // own, so any reordering of those bytes is still valid UTF-8.
    Some(unsafe { String::from_utf8_unchecked(reversed) })
}
/// Returns `true` when `parts` joined with `separator` spells exactly `input`.
///
/// Joining the result of a split with its separator always reproduces the text
/// that was split, so this check is equivalent to comparing `input` with the
/// text the cached parts were originally produced from.
fn split_parts_match_input(parts: &[String], separator: &str, input: &str) -> bool {
    let mut rest = input;
    for (i, part) in parts.iter().enumerate() {
        if i > 0 {
            let Some(after_sep) = rest.strip_prefix(separator) else {
                return false;
            };
            rest = after_sep;
        }
        let Some(after_part) = rest.strip_prefix(part.as_str()) else {
            return false;
        };
        rest = after_part;
    }
    rest.is_empty()
}

/// Splits `input` on `separator`, memoising the result in `SPLIT_CACHE`.
///
/// The cache key holds only a 64-bit hash of `input`, so a hit is validated
/// against `input` before it is returned; two different inputs whose hashes
/// collide can therefore never receive each other's parts.
///
/// Results are stored only when `input` is at most 10 000 bytes long and the
/// split produced at most 1 000 parts.
///
/// NOTE(review): entries are never evicted, so the cache grows with the number
/// of distinct inputs seen — confirm this is acceptable for long-running use.
pub(crate) fn get_cached_split(input: &str, separator: &str) -> Vec<String> {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    input.hash(&mut hasher);
    let cache_key = (hasher.finish(), separator.to_string());

    // The map guard lives only for this statement, so no lock is held while
    // the split below runs or while the result is inserted.
    let cached = SPLIT_CACHE.get(&cache_key).and_then(|hit| {
        split_parts_match_input(hit.value(), separator, input).then(|| hit.value().clone())
    });
    if let Some(parts) = cached {
        return parts;
    }

    let parts: Vec<String> = if separator.len() == 1 {
        // A one-byte `&str` is necessarily ASCII, so every index reported by
        // `memchr_iter` is a valid char boundary of `input`.
        let sep_byte = separator.as_bytes()[0];
        let mut parts = Vec::with_capacity(16);
        let mut start = 0usize;
        for idx in memchr_iter(sep_byte, input.as_bytes()) {
            parts.push(input[start..idx].to_string());
            start = idx + 1;
        }
        parts.push(input[start..].to_string());
        parts
    } else {
        input.split(separator).map(str::to_string).collect()
    };

    if input.len() <= 10_000 && parts.len() <= 1_000 {
        SPLIT_CACHE.insert(cache_key, parts.clone());
    }
    parts
}
/// Returns a compiled regex for `pattern`, reusing the copy in `REGEX_CACHE`
/// when one exists and compiling and caching it otherwise.
///
/// # Errors
/// Returns `"Invalid regex: …"` when `pattern` does not compile.
fn get_cached_regex(pattern: &str) -> Result<Regex, String> {
    // Clone out of the map guard right away so it is released before any insert.
    let cached = REGEX_CACHE.get(pattern).map(|entry| entry.value().clone());
    if let Some(hit) = cached {
        return Ok(hit);
    }

    let compiled = match Regex::new(pattern) {
        Ok(re) => re,
        Err(e) => return Err(format!("Invalid regex: {e}")),
    };
    REGEX_CACHE
        .entry(pattern.to_string())
        .or_insert(compiled.clone());
    Ok(compiled)
}
/// The value flowing through a pipeline: either a single string or a list of strings.
#[derive(Debug, Clone)]
pub(crate) enum Value {
/// A single string.
Str(String),
/// An ordered list of strings.
List(Vec<String>),
}
/// A single pipeline step.
///
/// Every variant except `Map` is executed by `apply_single_operation`; `Map`
/// is executed by `apply_ops_internal`.
#[derive(Debug, Clone, Hash)]
pub enum StringOp {
/// Splits on `sep` (each item in turn when the value is a list), then selects parts with `range`.
Split { sep: String, range: RangeSpec },
/// Joins a list with `sep`; a string passes through unchanged.
Join { sep: String },
/// Regex replacement on a string. A `g` in `flags` replaces every match,
/// otherwise only the first; `i`, `m`, `s` and `x` are passed on as inline regex flags.
Replace {
pattern: String,
replacement: String,
flags: String,
},
/// Upper-cases a string.
Upper,
/// Lower-cases a string.
Lower,
/// Trims a string on the side(s) given by `direction`. When `chars` is empty
/// or only whitespace, whitespace is trimmed; otherwise any character in `chars` is.
Trim {
chars: String,
direction: TrimDirection,
},
/// Selects characters of a string by `range`.
Substring { range: RangeSpec },
/// Appends `suffix` to a string.
Append { suffix: String },
/// Prepends `prefix` to a string.
Prepend { prefix: String },
/// Puts `text` both before and after a string.
Surround { text: String },
/// Passes a string through `strip_ansi_string`.
StripAnsi,
/// Keeps list items matching the regex `pattern`; a string is kept when it
/// matches and replaced by an empty string otherwise.
Filter { pattern: String },
/// Drops list items matching the regex `pattern`; a string is replaced by an
/// empty string when it matches and kept otherwise.
FilterNot { pattern: String },
/// Selects list items by `range`.
Slice { range: RangeSpec },
/// Runs `operations` as a sub-pipeline on every item of a list.
Map {
operations: Box<SmallVec<[StringOp; 8]>>,
},
/// Sorts a list in the given direction.
Sort { direction: SortDirection },
/// Reverses the characters of a string or the item order of a list.
Reverse,
/// Removes duplicate list items, keeping the first occurrence of each.
Unique,
/// Pads a string with `char` until it is `width` characters long; longer
/// strings are left unchanged.
Pad {
width: usize,
char: char,
direction: PadDirection,
},
/// Extracts the first match of `pattern` from a string, or capture group
/// `group` of that match when given; yields an empty string when nothing matches.
RegexExtract {
pattern: String,
group: Option<usize>,
},
}
/// Selection of one position or a span of positions within a sequence.
#[derive(Debug, Clone, Copy, Hash)]
pub enum RangeSpec {
/// A single position; a negative value counts back from the end.
Index(isize),
/// Optional start, optional end, and whether the end position is inclusive.
/// A missing start means the beginning and a missing end means the end of the sequence.
Range(Option<isize>, Option<isize>, bool),
}
/// Which side(s) of a string `StringOp::Trim` strips.
#[derive(Debug, Clone, Copy, Hash)]
pub enum TrimDirection {
/// Strip both the start and the end.
Both,
/// Strip the start only.
Left,
/// Strip the end only.
Right,
}
/// Ordering used by `StringOp::Sort`.
#[derive(Debug, Clone, Copy, Hash)]
pub enum SortDirection {
/// Ascending order.
Asc,
/// Descending order (an ascending sort followed by a reversal).
Desc,
}
/// Where `StringOp::Pad` places the padding characters.
#[derive(Debug, Clone, Copy, Hash)]
pub enum PadDirection {
/// Padding goes before the string.
Left,
/// Padding goes after the string.
Right,
/// Padding is divided between both sides; when the amount is odd the extra
/// character goes after the string.
Both,
}
/// Converts a possibly negative index into a position in `0..=len`.
///
/// Negative values count back from `len`; whatever comes out is clamped so it
/// never falls below `0` or above `len`.
#[inline(always)]
fn resolve_index(idx: isize, len: usize) -> usize {
    let len_signed = len as isize;
    let upper = len_signed.max(0);
    let position = match idx {
        from_end if from_end < 0 => len_signed + from_end,
        from_start => from_start,
    };
    position.clamp(0, upper) as usize
}
/// Returns clones of the elements of `items` selected by `range`.
///
/// * `RangeSpec::Index` yields exactly one element; a position past the end is
///   pulled back to the last element.
/// * `RangeSpec::Range` yields the elements from the resolved start up to the
///   resolved end (one further when inclusive), capped at the slice length.
///
/// An empty `items`, a start at or past the end, or a start that is not before
/// the end all produce an empty vector.
fn apply_range<T: Clone>(items: &[T], range: &RangeSpec) -> Vec<T> {
    let len = items.len();
    let Some(last) = len.checked_sub(1) else {
        return Vec::new();
    };

    match *range {
        RangeSpec::Index(idx) => {
            let pos = resolve_index(idx, len).min(last);
            items.get(pos).cloned().into_iter().collect()
        }
        RangeSpec::Range(start, end, inclusive) => {
            let from = match start {
                Some(s) => resolve_index(s, len),
                None => 0,
            };
            if from >= len {
                return Vec::new();
            }
            let raw_end = match end {
                Some(e) => resolve_index(e, len),
                None => len,
            };
            let upto = if inclusive {
                raw_end.saturating_add(1)
            } else {
                raw_end
            }
            .min(len);
            // `get` yields `None` when `from > upto` and an empty slice when they are equal.
            items
                .get(from..upto)
                .map(<[T]>::to_vec)
                .unwrap_or_default()
        }
    }
}
/// Runs `ops` over `input` in order and returns the final text.
///
/// The working value starts as a string holding `input`. `StringOp::Map` is
/// executed here by recursing into this function for every list item; every
/// other operation is delegated to `apply_single_operation`. If the pipeline
/// ends on a list, its items are joined with the separator last recorded by
/// `apply_single_operation` (a single space when none was recorded).
///
/// Tracer callbacks are issued only when `debug` is `true` and `debug_tracer`
/// is `Some`.
///
/// # Errors
/// Returns the first error produced by any operation, or an error when `Map`
/// is applied to a value that is not a list.
pub fn apply_ops_internal(
    input: &str,
    ops: &[StringOp],
    debug: bool,
    debug_tracer: Option<DebugTracer>,
) -> Result<String, String> {
    // The tracer that is actually in effect for this run, if any.
    let tracer = if debug { debug_tracer.as_ref() } else { None };
    let total_steps = ops.len();

    let mut current = Value::Str(input.to_string());
    let mut join_sep = " ".to_string();
    let pipeline_clock = debug.then(Instant::now);

    if let Some(t) = tracer {
        t.pipeline_start(ops, &current);
    }

    for (idx, op) in ops.iter().enumerate() {
        let step_no = idx + 1;
        let step_clock = debug.then(Instant::now);
        let before = debug.then(|| current.clone());

        // Everything except `Map` is a plain single-value transformation.
        let StringOp::Map { operations } = op else {
            current = apply_single_operation(op, current, &mut join_sep)?;
            if let (Some(t), Some(before), Some(clock)) = (tracer, before.as_ref(), step_clock) {
                let elapsed = clock.elapsed();
                t.operation_step(step_no, total_steps, op, before, &current, elapsed);
            }
            continue;
        };

        // A map step is announced up front with a placeholder output; the
        // per-item callbacks below report the real work.
        if let (Some(t), Some(before)) = (tracer, before.as_ref()) {
            t.operation_step(
                step_no,
                total_steps,
                op,
                before,
                &Value::Str("processing...".to_string()),
                Duration::from_nanos(0),
            );
        }

        let Value::List(items) = current else {
            return Err("Map operation can only be applied to lists".to_string());
        };
        let item_count = items.len();
        let mut mapped = Vec::with_capacity(item_count);

        for (item_idx, item) in items.iter().enumerate() {
            if let Some(t) = tracer {
                t.map_item_start(item_idx + 1, item_count, item);
            }
            let sub_tracer = DebugTracer::sub_pipeline(debug);
            let outcome = apply_ops_internal(item, operations.as_slice(), debug, Some(sub_tracer));
            if let Some(t) = tracer {
                match &outcome {
                    Ok(output) => t.map_item_end(Ok(output)),
                    Err(e) => t.map_item_end(Err(e)),
                }
            }
            // Stop at the first failing item, after its end callback has fired.
            mapped.push(outcome?);
        }

        if let Some(t) = tracer {
            t.map_complete(item_count, mapped.len());
        }
        current = Value::List(mapped);
    }

    if let (Some(t), Some(clock)) = (tracer, pipeline_clock) {
        let total_elapsed = clock.elapsed();
        t.pipeline_end(&current, total_elapsed);
    }

    Ok(match current {
        Value::Str(text) => text,
        Value::List(items) if items.is_empty() => String::new(),
        Value::List(items) => items.join(&join_sep),
    })
}
/// Applies `transform` to a string value.
///
/// # Errors
/// Returns an error naming `op_name` when `val` is a list, pointing the user
/// at the `map:{…}` form.
fn apply_string_operation<F>(val: Value, transform: F, op_name: &str) -> Result<Value, String>
where
    F: FnOnce(String) -> String,
{
    match val {
        Value::Str(text) => Ok(Value::Str(transform(text))),
        Value::List(_) => Err(format!(
            "{} operation can only be applied to strings. Use map:{{{}}} for lists.",
            op_name,
            op_name.to_lowercase()
        )),
    }
}
/// Applies `transform` to a list value.
///
/// # Errors
/// Returns an error naming `op_name` when `val` is a string.
fn apply_list_operation<F>(val: Value, transform: F, op_name: &str) -> Result<Value, String>
where
    F: FnOnce(Vec<String>) -> Vec<String>,
{
    match val {
        Value::List(items) => Ok(Value::List(transform(items))),
        Value::Str(_) => Err(format!("{op_name} operation can only be applied to lists")),
    }
}
fn apply_single_operation(
op: &StringOp,
val: Value,
default_sep: &mut String,
) -> Result<Value, String> {
match op {
StringOp::Split { sep, range } => {
let parts: Vec<String> = match &val {
Value::Str(s) => {
get_cached_split(s, sep)
}
Value::List(list) => list.iter().flat_map(|s| get_cached_split(s, sep)).collect(),
};
*default_sep = get_interned_separator(sep);
let result = apply_range(&parts, range);
match range {
RangeSpec::Index(_) => {
if result.len() == 1 {
Ok(Value::Str(result[0].clone()))
} else if result.is_empty() {
Ok(Value::Str(String::new()))
} else {
Ok(Value::List(result))
}
}
_ => Ok(Value::List(result)),
}
}
StringOp::Join { sep } => {
let result = match val {
Value::List(list) => Value::Str(list.join(sep)),
Value::Str(s) => Value::Str(s), };
*default_sep = get_interned_separator(sep);
Ok(result)
}
StringOp::Slice { range } => {
apply_list_operation(val, |list| apply_range(&list, range), "Slice")
}
StringOp::Filter { pattern } => {
let re = get_cached_regex(pattern)?;
match val {
Value::List(list) => Ok(Value::List(
list.into_iter().filter(|s| re.is_match(s)).collect(),
)),
Value::Str(s) => Ok(Value::Str(if re.is_match(&s) { s } else { String::new() })),
}
}
StringOp::FilterNot { pattern } => {
let re = get_cached_regex(pattern)?;
match val {
Value::List(list) => Ok(Value::List(
list.into_iter().filter(|s| !re.is_match(s)).collect(),
)),
Value::Str(s) => Ok(Value::Str(if re.is_match(&s) { String::new() } else { s })),
}
}
StringOp::Sort { direction } => {
if let Value::List(mut list) = val {
match direction {
SortDirection::Asc => list.sort(),
SortDirection::Desc => {
list.sort();
list.reverse();
}
}
Ok(Value::List(list))
} else {
Err("Sort operation can only be applied to lists".to_string())
}
}
StringOp::Reverse => match val {
Value::Str(s) => Ok(Value::Str(
ascii_reverse(&s).unwrap_or_else(|| s.chars().rev().collect()),
)),
Value::List(mut list) => {
list.reverse();
Ok(Value::List(list))
}
},
StringOp::Unique => apply_list_operation(
val,
|list| {
let mut seen = std::collections::HashSet::new();
list.into_iter()
.filter(|item| seen.insert(item.clone()))
.collect()
},
"Unique",
),
StringOp::Substring { range } => {
if let Value::Str(s) = val {
if s.is_ascii() {
let bytes = s.as_bytes();
let result_bytes = apply_range(bytes, range);
let result = unsafe { String::from_utf8_unchecked(result_bytes) };
Ok(Value::Str(result))
} else {
let chars: Vec<char> = s.chars().collect();
let result: String = apply_range(&chars, range).into_iter().collect();
Ok(Value::Str(result))
}
} else {
Err("Substring operation can only be applied to strings. Use map:{substring:...} for lists.".to_string())
}
}
StringOp::Replace {
pattern,
replacement,
flags,
} => {
if let Value::Str(s) = val {
if !flags.contains('g')
&& !pattern.contains([
'\\', '.', '*', '+', '?', '^', '$', '|', '[', ']', '(', ')', '{', '}',
])
&& !s.contains(pattern)
{
return Ok(Value::Str(s));
}
let pattern_to_use = if flags.is_empty() {
pattern.clone()
} else {
let mut inline_flags = String::with_capacity(4);
for (flag, c) in [('i', 'i'), ('m', 'm'), ('s', 's'), ('x', 'x')] {
if flags.contains(flag) {
inline_flags.push(c);
}
}
if inline_flags.is_empty() {
pattern.clone()
} else {
format!("(?{inline_flags}){pattern}")
}
};
let re = get_cached_regex(&pattern_to_use)?;
let result = if flags.contains('g') {
re.replace_all(&s, replacement.as_str()).to_string()
} else {
re.replace(&s, replacement.as_str()).to_string()
};
Ok(Value::Str(result))
} else {
Err(
"Replace operation can only be applied to strings. Use map:{replace:...} for lists."
.to_string(),
)
}
}
StringOp::Upper => apply_string_operation(val, |s| s.to_uppercase(), "Upper"),
StringOp::Lower => apply_string_operation(val, |s| s.to_lowercase(), "Lower"),
StringOp::Trim { chars, direction } => {
if let Value::Str(s) = val {
let result = if chars.is_empty() || chars.trim().is_empty() {
match direction {
TrimDirection::Both => {
if let Some(trimmed) = ascii_trim(&s) {
trimmed.to_string()
} else {
s.trim().to_string()
}
}
TrimDirection::Left => s.trim_start().to_string(),
TrimDirection::Right => s.trim_end().to_string(),
}
} else {
let chars_to_trim: Vec<char> = chars.chars().collect();
match direction {
TrimDirection::Both => {
s.trim_matches(|c| chars_to_trim.contains(&c)).to_string()
}
TrimDirection::Left => s
.trim_start_matches(|c| chars_to_trim.contains(&c))
.to_string(),
TrimDirection::Right => s
.trim_end_matches(|c| chars_to_trim.contains(&c))
.to_string(),
}
};
Ok(Value::Str(result))
} else {
Err(
"Trim operation can only be applied to strings. Use map:{trim} for lists."
.to_string(),
)
}
}
StringOp::Append { suffix } => {
apply_string_operation(val, |s| format!("{s}{suffix}"), "Append")
}
StringOp::Prepend { prefix } => {
apply_string_operation(val, |s| format!("{prefix}{s}"), "Prepend")
}
StringOp::Surround { text } => {
apply_string_operation(val, |s| format!("{text}{s}{text}"), "Surround")
}
StringOp::StripAnsi => {
if let Value::Str(s) = val {
let result = strip_ansi_string(&s).into_owned();
Ok(Value::Str(result))
} else {
Err("StripAnsi operation can only be applied to strings. Use map:{strip_ansi} for lists.".to_string())
}
}
StringOp::Pad {
width,
char,
direction,
} => {
if let Value::Str(s) = val {
let current_len = s.chars().count();
let result = if current_len >= *width {
s
} else {
let padding_needed = *width - current_len;
match direction {
PadDirection::Left => {
format!("{}{s}", char.to_string().repeat(padding_needed))
}
PadDirection::Right => {
format!("{s}{}", char.to_string().repeat(padding_needed))
}
PadDirection::Both => {
let left_pad = padding_needed / 2;
let right_pad = padding_needed - left_pad;
format!(
"{}{s}{}",
char.to_string().repeat(left_pad),
char.to_string().repeat(right_pad)
)
}
}
};
Ok(Value::Str(result))
} else {
Err(
"Pad operation can only be applied to strings. Use map:{pad:...} for lists."
.to_string(),
)
}
}
StringOp::RegexExtract { pattern, group } => {
if let Value::Str(s) = val {
let re = get_cached_regex(pattern)?;
let result = if let Some(group_idx) = group {
re.captures(&s)
.and_then(|caps| caps.get(*group_idx))
.map(|m| m.as_str().to_string())
.unwrap_or_default()
} else {
re.find(&s)
.map(|m| m.as_str().to_string())
.unwrap_or_default()
};
Ok(Value::Str(result))
} else {
Err("RegexExtract operation can only be applied to strings. Use map:{regex_extract:...} for lists.".to_string())
}
}
StringOp::Map { .. } => Err("Map operations should be handled separately".to_string()),
}
}