use arrow::{
array::BooleanArray,
datatypes::{DataType, Field, Schema},
};
use arrow::{array::StringArray, record_batch::RecordBatch};
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_expr::{Operator, and, binary_expr, col, lit, or};
use datafusion_physical_expr::{
PhysicalExpr,
expressions::{BinaryExpr, Column},
planner::logical2physical,
};
use std::hint::black_box;
use std::sync::Arc;
/// Builds the named boolean arrays used as column `a` in the benchmark batches.
///
/// Every array has `len` elements. Seven cases are returned, in this order:
///
/// - a uniform array: `all_false` when `TEST_ALL_FALSE` is `true`, otherwise
///   `all_true`;
/// - five copies of that uniform array with exactly one element flipped, at
///   index `0`, `len - 1`, `len / 2`, `len / 4` and `(3 * len) / 4`
///   (`one_true_*` when `TEST_ALL_FALSE`, otherwise `one_false_*`);
/// - the opposite uniform array: `all_true_in_and` when `TEST_ALL_FALSE`,
///   otherwise `all_false_in_or`.
///
/// # Panics
///
/// Panics if `len` is zero, because the single-flip cases need at least one
/// element to flip.
fn generate_boolean_cases<const TEST_ALL_FALSE: bool>(
    len: usize,
) -> Vec<(String, BooleanArray)> {
    // State the precondition up front instead of failing later with an
    // anonymous index-out-of-bounds panic.
    assert!(
        len > 0,
        "generate_boolean_cases requires len > 0 to build the single-flip cases"
    );

    // Value stored in every slot of the "uniform" array; the flipped slot and
    // the final case use its negation.
    let fill = !TEST_ALL_FALSE;
    let (uniform_name, flipped_word, opposite_name) = if TEST_ALL_FALSE {
        ("all_false", "true", "all_true_in_and")
    } else {
        ("all_true", "false", "all_false_in_or")
    };

    // Name suffix and index of the single flipped element, in emission order.
    let flipped_positions = [
        ("first", 0),
        ("last", len - 1),
        ("middle", len / 2),
        ("middle_left", len / 4),
        ("middle_right", (3 * len) / 4),
    ];

    // Two uniform cases plus one case per flipped position.
    let mut cases = Vec::with_capacity(flipped_positions.len() + 2);

    cases.push((
        uniform_name.to_string(),
        BooleanArray::from(vec![fill; len]),
    ));

    for (position, index) in flipped_positions {
        let mut values = vec![fill; len];
        values[index] = !fill;
        cases.push((
            format!("one_{flipped_word}_{position}"),
            BooleanArray::from(values),
        ));
    }

    cases.push((
        opposite_name.to_string(),
        BooleanArray::from(vec![!fill; len]),
    ));

    cases
}
/// Registers the `short_circuit/and/*` and `short_circuit/or/*` benchmarks.
///
/// Each benchmark evaluates a physical `BinaryExpr` whose left operand is the
/// boolean column `a` and whose right operand combines two regex matches over
/// the string columns `b` and `c`:
///
/// - `a AND (b matches url_regex AND c matches code_regex)`
/// - `a OR  (b matches url_regex OR  c matches code_regex)`
///
/// One benchmark is registered per named batch returned by
/// `create_record_batch`, using 8192 rows of generated string data.
fn benchmark_binary_op_in_short_circuit(c: &mut Criterion) {
    // Column layout shared by every batch: one boolean and two string columns.
    let fields = vec![
        Field::new("a", DataType::Boolean, false),
        Field::new("b", DataType::Utf8, false),
        Field::new("c", DataType::Utf8, false),
    ];
    let schema = Arc::new(Schema::new(fields));

    let (b_values, c_values) = generate_test_strings(8192);
    let and_cases =
        create_record_batch::<true>(Arc::clone(&schema), &b_values, &c_values).unwrap();
    let or_cases =
        create_record_batch::<false>(Arc::clone(&schema), &b_values, &c_values).unwrap();

    // Logical predicates for the right-hand side; each call builds a fresh
    // expression so the AND and OR variants do not share nodes.
    let url_predicate = || {
        binary_expr(
            col("b"),
            Operator::RegexMatch,
            lit(r#"^https://(\w+\.)?example\.(com|org)/"#),
        )
    };
    let code_predicate = || {
        binary_expr(
            col("c"),
            Operator::RegexMatch,
            lit("```(rust|python|go)\nfn? main$$"),
        )
    };

    // Physical expression: a AND (url_predicate AND code_predicate).
    let rhs_and = logical2physical(&and(url_predicate(), code_predicate()), &schema);
    let expr_and = BinaryExpr::new(Arc::new(Column::new("a", 0)), Operator::And, rhs_and);

    // Physical expression: a OR (url_predicate OR code_predicate).
    let rhs_or = logical2physical(&or(url_predicate(), code_predicate()), &schema);
    let expr_or = BinaryExpr::new(Arc::new(Column::new("a", 0)), Operator::Or, rhs_or);

    for (name, batch) in and_cases {
        let id = format!("short_circuit/and/{name}");
        c.bench_function(&id, |bencher| {
            bencher.iter(|| expr_and.evaluate(black_box(&batch)).unwrap())
        });
    }

    for (name, batch) in or_cases {
        let id = format!("short_circuit/or/{name}");
        c.bench_function(&id, |bencher| {
            bencher.iter(|| expr_or.evaluate(black_box(&batch)).unwrap())
        });
    }
}
/// Produces `num_rows` URL strings and `num_rows` markdown strings.
///
/// Both vectors are filled by repeating a fixed table of five samples in order,
/// so row `i` holds sample `i % 5` of the respective table. The first vector
/// contains the URLs, the second the markdown documents.
fn generate_test_strings(num_rows: usize) -> (Vec<String>, Vec<String>) {
    const URL_SAMPLES: [&str; 5] = [
        "https://api.example.com/v2/users/12345/posts?category=tech&sort=date&lang=en-US",
        "https://cdn.example.net/assets/images/2023/08/15/sample-image-highres.jpg?width=1920&quality=85",
        "http://service.demo.org:8080/api/data/transactions/20230815123456.csv",
        "ftp://legacy.archive.example/backups/2023/Q3/database-dump.sql.gz",
        "https://docs.example.co.uk/reference/advanced-topics/concurrency/parallel-processing.md#implementation-details",
    ];

    const MARKDOWN_SAMPLES: [&str; 5] = [
        concat!(
            "# Advanced Topics in Computer Science\n\n",
            "## Summary\nThis article explores complex system design patterns and...\n\n",
            "```rust\nfn process_data(data: &mut [i32]) {\n // Parallel processing example\n data.par_iter_mut().for_each(|x| *x *= 2);\n}\n```\n\n",
            "## Performance Considerations\nWhen implementing concurrent systems...\n"
        ),
        concat!(
            "## API Documentation\n\n",
            "```json\n{\n \"endpoint\": \"/api/v2/users\",\n \"methods\": [\"GET\", \"POST\"],\n \"parameters\": {\n \"page\": \"number\"\n }\n}\n```\n\n",
            "# Authentication Guide\nSecure your API access using OAuth 2.0...\n"
        ),
        concat!(
            "# Data Processing Pipeline\n\n",
            "```python\nfrom multiprocessing import Pool\n\ndef main():\n with Pool(8) as p:\n results = p.map(process_item, data)\n```\n\n",
            "## Summary of Optimizations\n1. Batch processing\n2. Memory pooling\n3. Concurrent I/O operations\n"
        ),
        concat!(
            "# System Architecture Overview\n\n",
            "## Components\n- Load Balancer\n- Database Cluster\n- Cache Service\n\n",
            "```go\nfunc main() {\n router := gin.Default()\n router.GET(\"/api/health\", healthCheck)\n router.Run(\":8080\")\n}\n```\n"
        ),
        concat!(
            "## Configuration Reference\n\n",
            "```yaml\nserver:\n port: 8080\n max_threads: 32\n\ndatabase:\n url: postgres://user@prod-db:5432/main\n```\n\n",
            "# Deployment Strategies\nBlue-green deployment patterns with...\n"
        ),
    ];

    /// Repeats `samples` round-robin until `rows` owned strings are produced.
    fn repeat_samples(samples: &[&str], rows: usize) -> Vec<String> {
        samples
            .iter()
            .copied()
            .cycle()
            .take(rows)
            .map(String::from)
            .collect()
    }

    (
        repeat_samples(&URL_SAMPLES, num_rows),
        repeat_samples(&MARKDOWN_SAMPLES, num_rows),
    )
}
/// Builds one named `RecordBatch` per boolean case for the benchmark.
///
/// Column `a` comes from `generate_boolean_cases::<TEST_ALL_FALSE>` (sized to
/// `b_values.len()`); columns `b` and `c` are string arrays built from
/// `b_values` and `c_values`. The returned names are the case names produced
/// by `generate_boolean_cases`, in the same order.
///
/// # Errors
///
/// Returns the first error reported by `RecordBatch::try_new`.
// `schema` is taken by value to keep the existing call sites unchanged, even
// though it is only borrowed here; hence the lint expectation.
#[expect(clippy::needless_pass_by_value)]
fn create_record_batch<const TEST_ALL_FALSE: bool>(
    schema: Arc<Schema>,
    b_values: &[String],
    c_values: &[String],
) -> arrow::error::Result<Vec<(String, RecordBatch)>> {
    // The string columns are identical for every case, so build them once and
    // hand each batch a cheap `Arc` clone instead of re-copying every string
    // per case.
    let b_array: arrow::array::ArrayRef = Arc::new(StringArray::from_iter_values(b_values));
    let c_array: arrow::array::ArrayRef = Arc::new(StringArray::from_iter_values(c_values));

    generate_boolean_cases::<TEST_ALL_FALSE>(b_values.len())
        .into_iter()
        .map(|(name, a_array)| {
            let a_array: arrow::array::ArrayRef = Arc::new(a_array);
            RecordBatch::try_new(
                Arc::clone(&schema),
                vec![a_array, Arc::clone(&b_array), Arc::clone(&c_array)],
            )
            .map(|batch| (name, batch))
        })
        .collect()
}
// Declare the `benches` group containing the short-circuit benchmark and hand
// that group to Criterion's `criterion_main!` macro.
criterion_group!(benches, benchmark_binary_op_in_short_circuit);
criterion_main!(benches);