use std::fmt::Debug;
use std::ops::Range;
use fastfield_codecs::MonotonicallyMappableToU64;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
};
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
use crate::aggregation::{
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey,
};
use crate::schema::Type;
use crate::{DocId, TantivyError};
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
pub struct RangeAggregation {
pub field: String,
pub ranges: Vec<RangeAggregationRange>,
#[serde(default)]
pub keyed: bool,
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct RangeAggregationRange {
#[serde(skip_serializing_if = "Option::is_none", default)]
pub key: Option<String>,
#[serde(skip_serializing_if = "Option::is_none", default)]
pub from: Option<f64>,
#[serde(skip_serializing_if = "Option::is_none", default)]
pub to: Option<f64>,
}
impl From<Range<f64>> for RangeAggregationRange {
fn from(range: Range<f64>) -> Self {
let from = if range.start == f64::MIN {
None
} else {
Some(range.start)
};
let to = if range.end == f64::MAX {
None
} else {
Some(range.end)
};
RangeAggregationRange {
key: None,
from,
to,
}
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct InternalRangeAggregationRange {
key: Option<String>,
range: Range<u64>,
}
impl From<Range<u64>> for InternalRangeAggregationRange {
fn from(range: Range<u64>) -> Self {
InternalRangeAggregationRange { key: None, range }
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentRangeAndBucketEntry {
range: Range<u64>,
bucket: SegmentRangeBucketEntry,
}
#[derive(Clone, Debug, PartialEq)]
pub struct SegmentRangeCollector {
buckets: Vec<SegmentRangeAndBucketEntry>,
field_type: Type,
}
#[derive(Clone, PartialEq)]
pub(crate) struct SegmentRangeBucketEntry {
pub key: Key,
pub doc_count: u64,
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
pub from: Option<f64>,
pub to: Option<f64>,
}
impl Debug for SegmentRangeBucketEntry {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentRangeBucketEntry")
.field("key", &self.key)
.field("doc_count", &self.doc_count)
.field("from", &self.from)
.field("to", &self.to)
.finish()
}
}
impl SegmentRangeBucketEntry {
pub(crate) fn into_intermediate_bucket_entry(
self,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateRangeBucketEntry> {
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
} else {
Default::default()
};
Ok(IntermediateRangeBucketEntry {
key: self.key,
doc_count: self.doc_count,
sub_aggregation,
from: self.from,
to: self.to,
})
}
}
impl SegmentRangeCollector {
pub fn into_intermediate_bucket_result(
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
let field_type = self.field_type;
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets
.into_iter()
.map(move |range_bucket| {
Ok((
range_to_string(&range_bucket.range, &field_type)?,
range_bucket
.bucket
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
))
})
.collect::<crate::Result<_>>()?;
Ok(IntermediateBucketResult::Range(
IntermediateRangeBucketResult { buckets },
))
}
pub(crate) fn from_req_and_validate(
req: &RangeAggregation,
sub_aggregation: &AggregationsWithAccessor,
bucket_count: &BucketCount,
field_type: Type,
) -> crate::Result<Self> {
let buckets: Vec<_> = extend_validate_ranges(&req.ranges, &field_type)?
.iter()
.map(|range| {
let key = range
.key
.clone()
.map(|key| Ok(Key::Str(key)))
.unwrap_or_else(|| range_to_key(&range.range, &field_type))?;
let to = if range.range.end == u64::MAX {
None
} else {
Some(f64_from_fastfield_u64(range.range.end, &field_type))
};
let from = if range.range.start == u64::MIN {
None
} else {
Some(f64_from_fastfield_u64(range.range.start, &field_type))
};
let sub_aggregation = if sub_aggregation.is_empty() {
None
} else {
Some(SegmentAggregationResultsCollector::from_req_and_validate(
sub_aggregation,
)?)
};
Ok(SegmentRangeAndBucketEntry {
range: range.range.clone(),
bucket: SegmentRangeBucketEntry {
doc_count: 0,
sub_aggregation,
key,
from,
to,
},
})
})
.collect::<crate::Result<_>>()?;
bucket_count.add_count(buckets.len() as u32);
bucket_count.validate_bucket_count()?;
Ok(SegmentRangeCollector {
buckets,
field_type,
})
}
#[inline]
pub(crate) fn collect_block(
&mut self,
doc: &[DocId],
bucket_with_accessor: &BucketAggregationWithAccessor,
force_flush: bool,
) -> crate::Result<()> {
let mut iter = doc.chunks_exact(4);
let accessor = bucket_with_accessor
.accessor
.as_single()
.expect("unexpected fast field cardinality");
for docs in iter.by_ref() {
let val1 = accessor.get_val(docs[0]);
let val2 = accessor.get_val(docs[1]);
let val3 = accessor.get_val(docs[2]);
let val4 = accessor.get_val(docs[3]);
let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3);
let bucket_pos4 = self.get_bucket_pos(val4);
self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation)?;
self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation)?;
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?;
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
}
for &doc in iter.remainder() {
let val = accessor.get_val(doc);
let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, doc, &bucket_with_accessor.sub_aggregation)?;
}
if force_flush {
for bucket in &mut self.buckets {
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
sub_aggregation
.flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush)?;
}
}
}
Ok(())
}
#[inline]
fn increment_bucket(
&mut self,
bucket_pos: usize,
doc: DocId,
bucket_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<()> {
let bucket = &mut self.buckets[bucket_pos];
bucket.bucket.doc_count += 1;
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
sub_aggregation.collect(doc, bucket_with_accessor)?;
}
Ok(())
}
#[inline]
fn get_bucket_pos(&self, val: u64) -> usize {
let pos = self
.buckets
.binary_search_by_key(&val, |probe| probe.range.start)
.unwrap_or_else(|pos| pos - 1);
debug_assert!(self.buckets[pos].range.contains(&val));
pos
}
}
fn to_u64_range(
range: &RangeAggregationRange,
field_type: &Type,
) -> crate::Result<InternalRangeAggregationRange> {
let start = if let Some(from) = range.from {
f64_to_fastfield_u64(from, field_type)
.ok_or_else(|| TantivyError::InvalidArgument("invalid field type".to_string()))?
} else {
u64::MIN
};
let end = if let Some(to) = range.to {
f64_to_fastfield_u64(to, field_type)
.ok_or_else(|| TantivyError::InvalidArgument("invalid field type".to_string()))?
} else {
u64::MAX
};
Ok(InternalRangeAggregationRange {
key: range.key.clone(),
range: start..end,
})
}
fn extend_validate_ranges(
buckets: &[RangeAggregationRange],
field_type: &Type,
) -> crate::Result<Vec<InternalRangeAggregationRange>> {
let mut converted_buckets = buckets
.iter()
.map(|range| to_u64_range(range, field_type))
.collect::<crate::Result<Vec<_>>>()?;
converted_buckets.sort_by_key(|bucket| bucket.range.start);
if converted_buckets[0].range.start != u64::MIN {
converted_buckets.insert(0, (u64::MIN..converted_buckets[0].range.start).into());
}
if converted_buckets[converted_buckets.len() - 1].range.end != u64::MAX {
converted_buckets
.push((converted_buckets[converted_buckets.len() - 1].range.end..u64::MAX).into());
}
let find_hole = |converted_buckets: &[InternalRangeAggregationRange]| {
for (pos, ranges) in converted_buckets.windows(2).enumerate() {
if ranges[0].range.end > ranges[1].range.start {
return Err(TantivyError::InvalidArgument(format!(
"Overlapping ranges not supported range {:?}, range+1 {:?}",
ranges[0], ranges[1]
)));
}
if ranges[0].range.end != ranges[1].range.start {
return Ok(Some(pos));
}
}
Ok(None)
};
while let Some(hole_pos) = find_hole(&converted_buckets)? {
let new_range =
converted_buckets[hole_pos].range.end..converted_buckets[hole_pos + 1].range.start;
converted_buckets.insert(hole_pos + 1, new_range.into());
}
Ok(converted_buckets)
}
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> crate::Result<String> {
let to_str = |val: u64, is_start: bool| {
if (is_start && val == u64::MIN) || (!is_start && val == u64::MAX) {
Ok("*".to_string())
} else if *field_type == Type::Date {
let val = i64::from_u64(val);
format_date(val)
} else {
Ok(f64_from_fastfield_u64(val, field_type).to_string())
}
};
Ok(format!(
"{}-{}",
to_str(range.start, true)?,
to_str(range.end, false)?
))
}
pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> crate::Result<Key> {
Ok(Key::Str(range_to_string(range, field_type)?))
}
#[cfg(test)]
mod tests {
use fastfield_codecs::MonotonicallyMappableToU64;
use serde_json::Value;
use super::*;
use crate::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
};
use crate::aggregation::tests::{
exec_request, exec_request_with_query, get_test_index_2_segments,
get_test_index_with_num_docs,
};
pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>,
field_type: Type,
) -> SegmentRangeCollector {
let req = RangeAggregation {
field: "dummy".to_string(),
ranges,
..Default::default()
};
SegmentRangeCollector::from_req_and_validate(
&req,
&Default::default(),
&Default::default(),
field_type,
)
.expect("unexpected error")
}
#[test]
fn range_fraction_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(res["range"]["buckets"][0]["key"], "*-0");
assert_eq!(res["range"]["buckets"][0]["doc_count"], 0);
assert_eq!(res["range"]["buckets"][1]["key"], "0-0.1");
assert_eq!(res["range"]["buckets"][1]["doc_count"], 10);
assert_eq!(res["range"]["buckets"][2]["key"], "0.1-0.2");
assert_eq!(res["range"]["buckets"][2]["doc_count"], 10);
assert_eq!(res["range"]["buckets"][3]["key"], "0.2-*");
assert_eq!(res["range"]["buckets"][3]["doc_count"], 80);
Ok(())
}
#[test]
fn range_keyed_buckets_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
keyed: true,
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res,
json!({
"range": {
"buckets": {
"*-0": { "key": "*-0", "doc_count": 0, "to": 0.0},
"0-0.1": {"key": "0-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
"0.1-0.2": {"key": "0.1-0.2", "doc_count": 10, "from": 0.1, "to": 0.2},
"0.2-*": {"key": "0.2-*", "doc_count": 80, "from": 0.2},
}
}
})
);
Ok(())
}
#[test]
fn range_custom_key_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![
RangeAggregationRange {
key: Some("custom-key-0-to-0.1".to_string()),
from: Some(0f64),
to: Some(0.1f64),
},
RangeAggregationRange {
key: None,
from: Some(0.1f64),
to: Some(0.2f64),
},
],
keyed: false,
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res,
json!({
"range": {
"buckets": [
{"key": "*-0", "doc_count": 0, "to": 0.0},
{"key": "custom-key-0-to-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
{"key": "0.1-0.2", "doc_count": 10, "from": 0.1, "to": 0.2},
{"key": "0.2-*", "doc_count": 80, "from": 0.2}
]
}
})
);
Ok(())
}
#[test]
fn range_date_test_single_segment() -> crate::Result<()> {
range_date_test_with_opt(true)
}
#[test]
fn range_date_test_multi_segment() -> crate::Result<()> {
range_date_test_with_opt(false)
}
fn range_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;
let agg_req: Aggregations = vec![(
"date_ranges".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "date".to_string(),
ranges: vec![
RangeAggregationRange {
key: None,
from: None,
to: Some(1546300800000000.0f64),
},
RangeAggregationRange {
key: None,
from: Some(1546300800000000.0f64),
to: Some(1546387200000000.0f64),
},
],
keyed: false,
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let agg_res = exec_request(agg_req, &index)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(
res["date_ranges"]["buckets"][0]["from_as_string"],
Value::Null
);
assert_eq!(
res["date_ranges"]["buckets"][0]["key"],
"*-2019-01-01T00:00:00Z"
);
assert_eq!(
res["date_ranges"]["buckets"][1]["from_as_string"],
"2019-01-01T00:00:00Z"
);
assert_eq!(
res["date_ranges"]["buckets"][1]["to_as_string"],
"2019-01-02T00:00:00Z"
);
assert_eq!(
res["date_ranges"]["buckets"][2]["from_as_string"],
"2019-01-02T00:00:00Z"
);
assert_eq!(
res["date_ranges"]["buckets"][2]["to_as_string"],
Value::Null
);
Ok(())
}
#[test]
fn range_custom_key_keyed_buckets_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![RangeAggregationRange {
key: Some("custom-key-0-to-0.1".to_string()),
from: Some(0f64),
to: Some(0.1f64),
}],
keyed: true,
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res,
json!({
"range": {
"buckets": {
"*-0": { "key": "*-0", "doc_count": 0, "to": 0.0},
"custom-key-0-to-0.1": {"key": "custom-key-0-to-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
"0.1-*": {"key": "0.1-*", "doc_count": 90, "from": 0.1},
}
}
})
);
Ok(())
}
#[test]
fn bucket_test_extend_range_hole() {
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
let collector = get_collector_from_ranges(buckets, Type::F64);
let buckets = collector.buckets;
assert_eq!(buckets[0].range.start, u64::MIN);
assert_eq!(buckets[0].range.end, 10f64.to_u64());
assert_eq!(buckets[1].range.start, 10f64.to_u64());
assert_eq!(buckets[1].range.end, 20f64.to_u64());
assert_eq!(buckets[2].range.start, 20f64.to_u64());
assert_eq!(buckets[2].range.end, 30f64.to_u64());
assert_eq!(buckets[3].range.start, 30f64.to_u64());
assert_eq!(buckets[3].range.end, 40f64.to_u64());
}
#[test]
fn bucket_test_range_conversion_special_case() {
let buckets = vec![
(f64::MIN..10f64).into(),
(10f64..20f64).into(),
(20f64..f64::MAX).into(),
];
let collector = get_collector_from_ranges(buckets, Type::F64);
let buckets = collector.buckets;
assert_eq!(buckets[0].range.start, u64::MIN);
assert_eq!(buckets[0].range.end, 10f64.to_u64());
assert_eq!(buckets[1].range.start, 10f64.to_u64());
assert_eq!(buckets[1].range.end, 20f64.to_u64());
assert_eq!(buckets[2].range.start, 20f64.to_u64());
assert_eq!(buckets[2].range.end, u64::MAX);
assert_eq!(buckets.len(), 3);
}
#[test]
fn bucket_range_test_negative_vals() {
let buckets = vec![(-10f64..-1f64).into()];
let collector = get_collector_from_ranges(buckets, Type::F64);
let buckets = collector.buckets;
assert_eq!(&buckets[0].bucket.key.to_string(), "*--10");
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "-1-*");
}
#[test]
fn bucket_range_test_positive_vals() {
let buckets = vec![(0f64..10f64).into()];
let collector = get_collector_from_ranges(buckets, Type::F64);
let buckets = collector.buckets;
assert_eq!(&buckets[0].bucket.key.to_string(), "*-0");
assert_eq!(&buckets[buckets.len() - 1].bucket.key.to_string(), "10-*");
}
#[test]
fn range_binary_search_test_u64() {
let check_ranges = |ranges: Vec<RangeAggregationRange>| {
let collector = get_collector_from_ranges(ranges, Type::U64);
let search = |val: u64| collector.get_bucket_pos(val);
assert_eq!(search(u64::MIN), 0);
assert_eq!(search(9), 0);
assert_eq!(search(10), 1);
assert_eq!(search(11), 1);
assert_eq!(search(99), 1);
assert_eq!(search(100), 2);
assert_eq!(search(u64::MAX - 1), 2); };
let ranges = vec![(10.0..100.0).into()];
check_ranges(ranges);
let ranges = vec![
RangeAggregationRange {
key: None,
to: Some(10.0),
from: None,
},
(10.0..100.0).into(),
];
check_ranges(ranges);
let ranges = vec![
RangeAggregationRange {
key: None,
to: Some(10.0),
from: None,
},
(10.0..100.0).into(),
RangeAggregationRange {
key: None,
to: None,
from: Some(100.0),
},
];
check_ranges(ranges);
}
#[test]
fn range_binary_search_test_f64() {
let ranges = vec![(10.0..100.0).into()];
let collector = get_collector_from_ranges(ranges, Type::F64);
let search = |val: u64| collector.get_bucket_pos(val);
assert_eq!(search(u64::MIN), 0);
assert_eq!(search(9f64.to_u64()), 0);
assert_eq!(search(10f64.to_u64()), 1);
assert_eq!(search(11f64.to_u64()), 1);
assert_eq!(search(99f64.to_u64()), 1);
assert_eq!(search(100f64.to_u64()), 2);
assert_eq!(search(u64::MAX - 1), 2); }
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use itertools::Itertools;
use rand::seq::SliceRandom;
use rand::thread_rng;
use super::*;
use crate::aggregation::bucket::range::tests::get_collector_from_ranges;
const TOTAL_DOCS: u64 = 1_000_000u64;
const NUM_DOCS: u64 = 50_000u64;
fn get_collector_with_buckets(num_buckets: u64, num_docs: u64) -> SegmentRangeCollector {
let bucket_size = num_docs / num_buckets;
let mut buckets: Vec<RangeAggregationRange> = vec![];
for i in 0..num_buckets {
let bucket_start = (i * bucket_size) as f64;
buckets.push((bucket_start..bucket_start + bucket_size as f64).into())
}
get_collector_from_ranges(buckets, Type::U64)
}
fn get_rand_docs(total_docs: u64, num_docs_returned: u64) -> Vec<u64> {
let mut rng = thread_rng();
let all_docs = (0..total_docs - 1).collect_vec();
let mut vals = all_docs
.as_slice()
.choose_multiple(&mut rng, num_docs_returned as usize)
.cloned()
.collect_vec();
vals.sort();
vals
}
fn bench_range_binary_search(b: &mut test::Bencher, num_buckets: u64) {
let collector = get_collector_with_buckets(num_buckets, TOTAL_DOCS);
let vals = get_rand_docs(TOTAL_DOCS, NUM_DOCS);
b.iter(|| {
let mut bucket_pos = 0;
for val in &vals {
bucket_pos = collector.get_bucket_pos(*val);
}
bucket_pos
})
}
#[bench]
fn bench_range_100_buckets(b: &mut test::Bencher) {
bench_range_binary_search(b, 100)
}
#[bench]
fn bench_range_10_buckets(b: &mut test::Bencher) {
bench_range_binary_search(b, 10)
}
}