1use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
27use arrow_array::{BinaryArray, BinaryViewArray};
28use arrow_array::{Float64Array, Int32Array};
29use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef};
30use datafusion_common::{exec_datafusion_err, plan_err, DataFusionError, Result};
31use geo_types::{
32 Coord, Geometry, GeometryCollection, LineString, MultiLineString, MultiPoint, MultiPolygon,
33 Point, Polygon, Rect,
34};
35use rand::{distr::Uniform, rngs::StdRng, Rng, RngExt, SeedableRng};
36use sedona_common::sedona_internal_err;
37use sedona_geometry::types::GeometryTypeId;
38use sedona_schema::datatypes::{SedonaType, WKB_GEOMETRY};
39use std::f64::consts::PI;
40use std::sync::Arc;
41use wkb::writer::WriteOptions;
42use wkb::Endianness;
43
44#[derive(Debug, Clone)]
90pub struct RandomPartitionedDataBuilder {
91 pub seed: u64,
92 pub num_partitions: usize,
93 pub batches_per_partition: usize,
94 pub rows_per_batch: usize,
95 sedona_type: SedonaType,
96 null_rate: f64,
97 options: RandomGeometryOptions,
98}
99
100impl Default for RandomPartitionedDataBuilder {
101 fn default() -> Self {
102 let options = RandomGeometryOptions::new();
103
104 Self {
105 seed: 42,
106 num_partitions: 1,
107 batches_per_partition: 1,
108 rows_per_batch: 10,
109 sedona_type: WKB_GEOMETRY,
110 null_rate: 0.0,
111 options,
112 }
113 }
114}
115
116impl RandomPartitionedDataBuilder {
117 pub fn new() -> Self {
134 Self::default()
135 }
136
137 pub fn seed(mut self, seed: u64) -> Self {
146 self.seed = seed;
147 self
148 }
149
150 pub fn num_partitions(mut self, num_partitions: usize) -> Self {
159 self.num_partitions = num_partitions;
160 self
161 }
162
163 pub fn batches_per_partition(mut self, batches_per_partition: usize) -> Self {
171 self.batches_per_partition = batches_per_partition;
172 self
173 }
174
175 pub fn rows_per_batch(mut self, rows_per_batch: usize) -> Self {
183 self.rows_per_batch = rows_per_batch;
184 self
185 }
186
187 pub fn geometry_type(mut self, geom_type: GeometryTypeId) -> Self {
198 self.options.geom_type = geom_type;
199 self
200 }
201
202 pub fn sedona_type(mut self, sedona_type: SedonaType) -> Self {
210 self.sedona_type = sedona_type;
211 self
212 }
213
214 pub fn bounds(mut self, bounds: Rect) -> Self {
223 self.options.bounds = bounds;
224 self
225 }
226
227 pub fn size_range(mut self, size_range: (f64, f64)) -> Self {
236 self.options.size_range = size_range;
237 self
238 }
239
240 pub fn null_rate(mut self, null_rate: f64) -> Self {
246 self.null_rate = null_rate;
247 self
248 }
249
250 pub fn empty_rate(mut self, empty_rate: f64) -> Self {
256 self.options.empty_rate = empty_rate;
257 self
258 }
259
260 pub fn vertices_per_linestring_range(
269 mut self,
270 vertices_per_linestring_range: (usize, usize),
271 ) -> Self {
272 self.options.vertices_per_linestring_range = vertices_per_linestring_range;
273 self
274 }
275
276 pub fn num_parts_range(mut self, num_parts_range: (usize, usize)) -> Self {
283 self.options.num_parts_range = num_parts_range;
284 self
285 }
286
287 pub fn polygon_hole_rate(mut self, polygon_hole_rate: f64) -> Self {
294 self.options.polygon_hole_rate = polygon_hole_rate;
295 self
296 }
297
298 pub fn schema(&self) -> SchemaRef {
306 Arc::new(Schema::new(vec![
308 Field::new("id", DataType::Int32, false),
309 Field::new("dist", DataType::Float64, false),
310 self.sedona_type.to_storage_field("geometry", true).unwrap(),
311 ]))
312 }
313
314 pub fn build(&self) -> Result<(SchemaRef, Vec<Vec<RecordBatch>>)> {
335 let schema = self.schema();
337 let mut result = Vec::with_capacity(self.num_partitions);
338
339 for partition_idx in 0..self.num_partitions {
340 let rng = Self::default_rng(self.seed + partition_idx as u64);
341 let partition_batches = self
342 .partition_reader(rng, partition_idx)
343 .collect::<Result<Vec<_>, ArrowError>>()?;
344 result.push(partition_batches);
345 }
346
347 Ok((schema, result))
348 }
349
350 pub fn validate(&self) -> Result<()> {
356 self.options.validate()?;
357
358 if self.null_rate < 0.0 || self.null_rate > 1.0 {
359 return plan_err!(
360 "Expected null_rate between 0.0 and 1.0 but got {}",
361 self.null_rate
362 );
363 }
364
365 if self.rows_per_batch == 0 {
366 return plan_err!("Expected rows_per_batch > 0 but got 0");
367 }
368
369 if self.num_partitions == 0 {
370 return plan_err!("Expected num_partitions > 0 but got 0");
371 }
372
373 Ok(())
374 }
375
376 pub fn default_rng(seed: u64) -> impl Rng {
380 StdRng::seed_from_u64(seed)
381 }
382
383 pub fn partition_reader<R: Rng + Send + 'static>(
385 &self,
386 rng: R,
387 partition_idx: usize,
388 ) -> Box<dyn RecordBatchReader + Send> {
389 let reader = RandomPartitionedDataReader {
390 builder: self.clone(),
391 schema: self.schema(),
392 partition_idx,
393 batch_idx: 0,
394 rng,
395 };
396
397 Box::new(reader)
398 }
399
400 fn generate_batch<R: Rng>(
402 &self,
403 rng: &mut R,
404 schema: &SchemaRef,
405 partition_idx: usize,
406 batch_idx: usize,
407 ) -> Result<RecordBatch> {
408 self.validate()?;
410
411 let id_start =
413 (partition_idx * self.batches_per_partition + batch_idx) * self.rows_per_batch;
414 let ids: Vec<i32> = (0..self.rows_per_batch)
415 .map(|i| (id_start + i) as i32)
416 .collect();
417
418 let max_dist = self
420 .options
421 .bounds
422 .width()
423 .min(self.options.bounds.height());
424 let distance_dist = Uniform::new(0.0, max_dist).expect("valid input to Uniform::new()");
425 let distances: Vec<f64> = (0..self.rows_per_batch)
426 .map(|_| rng.sample(distance_dist))
427 .collect();
428
429 let wkb_geometries = (0..self.rows_per_batch)
431 .map(|_| -> Result<Option<Vec<u8>>> {
432 if rng.random_bool(self.null_rate) {
433 Ok(None)
434 } else {
435 Ok(Some(generate_random_wkb(rng, &self.options)?))
436 }
437 })
438 .collect::<Result<Vec<Option<Vec<u8>>>>>()?;
439
440 let id_array = Arc::new(Int32Array::from(ids));
442 let dist_array = Arc::new(Float64Array::from(distances));
443 let geometry_array = create_wkb_array(wkb_geometries, &self.sedona_type)?;
444
445 Ok(RecordBatch::try_new(
447 schema.clone(),
448 vec![id_array, dist_array, geometry_array],
449 )?)
450 }
451}
452
453fn create_wkb_array(
455 wkb_values: Vec<Option<Vec<u8>>>,
456 sedona_type: &SedonaType,
457) -> Result<ArrayRef> {
458 match sedona_type {
459 SedonaType::Wkb(_, _) => Ok(Arc::new(BinaryArray::from_iter(wkb_values))),
460 SedonaType::WkbView(_, _) => Ok(Arc::new(BinaryViewArray::from_iter(wkb_values))),
461 _ => sedona_internal_err!("create_wkb_array not implemented for {sedona_type:?}"),
462 }
463}
464
465struct RandomPartitionedDataReader<R> {
466 builder: RandomPartitionedDataBuilder,
467 schema: SchemaRef,
468 partition_idx: usize,
469 batch_idx: usize,
470 rng: R,
471}
472
473impl<R: Rng> RecordBatchReader for RandomPartitionedDataReader<R> {
474 fn schema(&self) -> SchemaRef {
475 self.builder.schema()
476 }
477}
478
479impl<R: Rng> Iterator for RandomPartitionedDataReader<R> {
480 type Item = std::result::Result<RecordBatch, ArrowError>;
481
482 fn next(&mut self) -> Option<Self::Item> {
483 if self.batch_idx == self.builder.batches_per_partition {
484 return None;
485 }
486
487 let maybe_batch = self
488 .builder
489 .generate_batch(
490 &mut self.rng,
491 &self.schema,
492 self.partition_idx,
493 self.batch_idx,
494 )
495 .map_err(|e| ArrowError::ExternalError(Box::new(e)));
496 self.batch_idx += 1;
497 Some(maybe_batch)
498 }
499}
500
501#[derive(Debug, Clone)]
503struct RandomGeometryOptions {
504 geom_type: GeometryTypeId,
505 bounds: Rect,
506 size_range: (f64, f64),
507 vertices_per_linestring_range: (usize, usize),
508 empty_rate: f64,
509 polygon_hole_rate: f64,
510 num_parts_range: (usize, usize),
511}
512
513impl RandomGeometryOptions {
514 fn new() -> Self {
515 Self {
516 geom_type: GeometryTypeId::Point,
517 empty_rate: 0.0,
518 bounds: Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 }),
519 size_range: (1.0, 10.0),
520 vertices_per_linestring_range: (4, 4),
521 polygon_hole_rate: 0.0,
522 num_parts_range: (1, 3),
523 }
524 }
525
526 fn validate(&self) -> Result<()> {
527 if self.bounds.width() <= 0.0 || self.bounds.height() <= 0.0 {
528 return plan_err!("Expected valid bounds but got {:?}", self.bounds);
529 }
530
531 if self.size_range.0 <= 0.0 || self.size_range.0 > self.size_range.1 {
532 return plan_err!("Expected valid size_range but got {:?}", self.size_range);
533 }
534
535 if self.vertices_per_linestring_range.0 == 0
536 || self.vertices_per_linestring_range.0 > self.vertices_per_linestring_range.1
537 {
538 return plan_err!(
539 "Expected valid vertices_per_linestring_range but got {:?}",
540 self.vertices_per_linestring_range
541 );
542 }
543
544 if !(0.0..=1.0).contains(&self.empty_rate) {
545 return plan_err!(
546 "Expected empty_rate between 0.0 and 1.0 but got {}",
547 self.empty_rate
548 );
549 }
550
551 if !(0.0..=1.0).contains(&self.polygon_hole_rate) {
552 return plan_err!(
553 "Expected polygon_hole_rate between 0.0 and 1.0 but got {}",
554 self.polygon_hole_rate
555 );
556 }
557
558 if self.num_parts_range.0 == 0 || self.num_parts_range.0 > self.num_parts_range.1 {
559 return plan_err!(
560 "Expected valid num_parts_range but got {:?}",
561 self.num_parts_range
562 );
563 }
564
565 Ok(())
566 }
567}
568
569impl Default for RandomGeometryOptions {
570 fn default() -> Self {
571 Self::new()
572 }
573}
574
575fn generate_random_wkb<R: rand::Rng>(
577 rng: &mut R,
578 options: &RandomGeometryOptions,
579) -> Result<Vec<u8>> {
580 let geometry = generate_random_geometry(rng, options)?;
581
582 let mut out: Vec<u8> = vec![];
584 wkb::writer::write_geometry(
585 &mut out,
586 &geometry,
587 &WriteOptions {
588 endianness: Endianness::LittleEndian,
589 },
590 )
591 .map_err(|e| DataFusionError::External(Box::new(e)))?;
592 Ok(out)
593}
594
595fn generate_random_geometry<R: rand::Rng>(
596 rng: &mut R,
597 options: &RandomGeometryOptions,
598) -> Result<Geometry> {
599 Ok(match options.geom_type {
600 GeometryTypeId::Point => Geometry::Point(generate_random_point(rng, options)?),
601 GeometryTypeId::LineString => {
602 Geometry::LineString(generate_random_linestring(rng, options)?)
603 }
604 GeometryTypeId::Polygon => Geometry::Polygon(generate_random_polygon(rng, options)?),
605 GeometryTypeId::MultiPoint => {
606 Geometry::MultiPoint(generate_random_multipoint(rng, options)?)
607 }
608 GeometryTypeId::MultiLineString => {
609 Geometry::MultiLineString(generate_random_multilinestring(rng, options)?)
610 }
611 GeometryTypeId::MultiPolygon => {
612 Geometry::MultiPolygon(generate_random_multipolygon(rng, options)?)
613 }
614 GeometryTypeId::GeometryCollection => {
615 Geometry::GeometryCollection(generate_random_geometrycollection(rng, options)?)
616 }
617 GeometryTypeId::Geometry => {
618 let mut copy_options = options.clone();
619 copy_options.geom_type = pick_random_geometry_type(rng);
620 generate_random_geometry(rng, ©_options)?
621 }
622 })
623}
624
625fn generate_random_point<R: rand::Rng>(
626 rng: &mut R,
627 options: &RandomGeometryOptions,
628) -> Result<Point> {
629 if rng.random_bool(options.empty_rate) {
630 Ok(Point::new(f64::NAN, f64::NAN))
634 } else {
635 let x_dist = Uniform::new(options.bounds.min().x, options.bounds.max().x)
637 .map_err(|e| exec_datafusion_err!("Invalid x bounds for random point: {e}"))?;
638 let y_dist = Uniform::new(options.bounds.min().y, options.bounds.max().y)
639 .map_err(|e| exec_datafusion_err!("Invalid y bounds for random point: {e}"))?;
640 let x = rng.sample(x_dist);
641 let y = rng.sample(y_dist);
642 Ok(Point::new(x, y))
643 }
644}
645
646fn generate_random_linestring<R: rand::Rng>(
647 rng: &mut R,
648 options: &RandomGeometryOptions,
649) -> Result<LineString> {
650 if rng.random_bool(options.empty_rate) {
651 Ok(LineString::new(vec![]))
652 } else {
653 let (center_x, center_y, half_size) = generate_random_circle(rng, options)?;
654 let vertices_dist = Uniform::new_inclusive(
655 options.vertices_per_linestring_range.0,
656 options.vertices_per_linestring_range.1,
657 )
658 .map_err(|e| exec_datafusion_err!("Invalid vertex count range for linestring: {e}"))?;
659 let num_vertices = rng.sample(vertices_dist).max(2);
661 let angle = rng.random_range(0.0..(2.0 * PI));
663 let coords =
664 generate_circular_vertices(angle, center_x, center_y, half_size, num_vertices, false)?;
665 Ok(LineString::from(coords))
666 }
667}
668
669fn generate_random_polygon<R: rand::Rng>(
670 rng: &mut R,
671 options: &RandomGeometryOptions,
672) -> Result<Polygon> {
673 if rng.random_bool(options.empty_rate) {
674 Ok(Polygon::new(LineString::new(vec![]), vec![]))
675 } else {
676 let (center_x, center_y, half_size) = generate_random_circle(rng, options)?;
677 let vertices_dist = Uniform::new_inclusive(
678 options.vertices_per_linestring_range.0,
679 options.vertices_per_linestring_range.1,
680 )
681 .map_err(|e| exec_datafusion_err!("Invalid vertex count range for polygon: {e}"))?;
682 let num_vertices = rng.sample(vertices_dist).max(3);
684
685 let angle = rng.random_range(0.0..=(2.0 * PI));
688 let coords =
689 generate_circular_vertices(angle, center_x, center_y, half_size, num_vertices, true)?;
690 let shell = LineString::from(coords);
691 let mut holes = Vec::new();
692
693 let add_hole = rng.random_bool(options.polygon_hole_rate);
695 let hole_scale_factor = rng.random_range(0.1..0.5);
696 if add_hole {
697 let new_size = half_size * hole_scale_factor;
698 let mut coords = generate_circular_vertices(
699 angle,
700 center_x,
701 center_y,
702 new_size,
703 num_vertices,
704 true,
705 )?;
706 coords.reverse();
707 holes.push(LineString::from(coords));
708 }
709
710 Ok(Polygon::new(shell, holes))
711 }
712}
713
714fn generate_random_multipoint<R: rand::Rng>(
715 rng: &mut R,
716 options: &RandomGeometryOptions,
717) -> Result<MultiPoint> {
718 if rng.random_bool(options.empty_rate) {
719 Ok(MultiPoint::new(vec![]))
720 } else {
721 let children = generate_random_children(rng, options, generate_random_point)?;
722 Ok(MultiPoint::new(children))
723 }
724}
725
726fn generate_random_multilinestring<R: rand::Rng>(
727 rng: &mut R,
728 options: &RandomGeometryOptions,
729) -> Result<MultiLineString> {
730 if rng.random_bool(options.empty_rate) {
731 Ok(MultiLineString::new(vec![]))
732 } else {
733 let children = generate_random_children(rng, options, generate_random_linestring)?;
734 Ok(MultiLineString::new(children))
735 }
736}
737
738fn generate_random_multipolygon<R: rand::Rng>(
739 rng: &mut R,
740 options: &RandomGeometryOptions,
741) -> Result<MultiPolygon> {
742 if rng.random_bool(options.empty_rate) {
743 Ok(MultiPolygon::new(vec![]))
744 } else {
745 let children = generate_random_children(rng, options, generate_random_polygon)?;
746 Ok(MultiPolygon::new(children))
747 }
748}
749
750fn generate_random_geometrycollection<R: rand::Rng>(
751 rng: &mut R,
752 options: &RandomGeometryOptions,
753) -> Result<GeometryCollection> {
754 if rng.random_bool(options.empty_rate) {
755 Ok(GeometryCollection::new_from(vec![]))
756 } else {
757 let children = generate_random_children(rng, options, generate_random_geometry)?;
758 Ok(GeometryCollection::new_from(children))
759 }
760}
761
762fn generate_random_children<R: Rng, T, F: Fn(&mut R, &RandomGeometryOptions) -> Result<T>>(
763 rng: &mut R,
764 options: &RandomGeometryOptions,
765 func: F,
766) -> Result<Vec<T>> {
767 let num_parts_dist =
768 Uniform::new_inclusive(options.num_parts_range.0, options.num_parts_range.1)
769 .map_err(|e| exec_datafusion_err!("Invalid part count range: {e}"))?;
770 let num_parts = rng.sample(num_parts_dist);
771
772 let (center_x, center_y, half_width) = generate_random_circle(rng, options)?;
774 let feature_bounds = Rect::new(
775 Coord {
776 x: center_x - half_width,
777 y: center_y - half_width,
778 },
779 Coord {
780 x: center_x + half_width,
781 y: center_y + half_width,
782 },
783 );
784
785 let child_bounds = generate_non_overlapping_sub_rectangles(num_parts, &feature_bounds);
786 let mut child_options = options.clone();
787 child_options.empty_rate = 0.0;
788
789 let mut children = Vec::new();
790 for bounds in child_bounds {
791 child_options.bounds = bounds;
792 let child_size = bounds.height().min(bounds.width());
793 child_options.size_range = (child_size * 0.9, child_size);
794
795 if options.geom_type == GeometryTypeId::GeometryCollection {
798 child_options.geom_type = pick_random_geometry_type(rng);
799 }
800 children.push(func(rng, &child_options)?);
801 }
802
803 Ok(children)
804}
805
806fn pick_random_geometry_type<R: Rng>(rng: &mut R) -> GeometryTypeId {
807 [
808 GeometryTypeId::Point,
809 GeometryTypeId::LineString,
810 GeometryTypeId::Polygon,
811 GeometryTypeId::MultiPoint,
812 GeometryTypeId::MultiLineString,
813 GeometryTypeId::MultiPolygon,
814 ][rng.random_range(0..6)]
815}
816
817fn generate_random_circle<R: rand::Rng>(
818 rng: &mut R,
819 options: &RandomGeometryOptions,
820) -> Result<(f64, f64, f64)> {
821 let size_dist = Uniform::new_inclusive(options.size_range.0, options.size_range.1)
823 .map_err(|e| exec_datafusion_err!("Invalid size range for random region: {e}"))?;
824 let size = rng.sample(size_dist);
825 let half_size = size / 2.0;
826 let height = options.bounds.height();
827 let width = options.bounds.width();
828
829 let center_x = if width >= size {
831 let center_x_dist = Uniform::new(
832 options.bounds.min().x + half_size,
833 options.bounds.max().x - half_size,
834 )
835 .map_err(|e| exec_datafusion_err!("Invalid x bounds for random circle center: {e}"))?;
836
837 rng.sample(center_x_dist)
838 } else {
839 options.bounds.min().x + width / 2.0
840 };
841
842 let center_y = if height >= size {
843 let center_y_dist = Uniform::new(
844 options.bounds.min().y + half_size,
845 options.bounds.max().y - half_size,
846 )
847 .map_err(|e| exec_datafusion_err!("Invalid y bounds for random circle center: {e}"))?;
848
849 rng.sample(center_y_dist)
850 } else {
851 options.bounds.min().y + height / 2.0
852 };
853
854 Ok((
855 center_x,
856 center_y,
857 half_size.min(height / 2.0).min(width / 2.0),
858 ))
859}
860
861fn generate_non_overlapping_sub_rectangles(num_parts: usize, bounds: &Rect) -> Vec<Rect> {
862 let mut tiles = vec![*bounds];
863 let mut n = 0;
864 while tiles.len() < num_parts {
865 let (largest_idx, _) = tiles
867 .iter()
868 .enumerate()
869 .map(|(i, rect)| (i, rect.height() * rect.width()))
870 .max_by(|(_, a1), (_, a2)| a1.partial_cmp(a2).unwrap())
871 .unwrap_or((0, 0.0));
872
873 let new_rects = if (n % 2) == 0 {
875 tiles[largest_idx].split_x()
876 } else {
877 tiles[largest_idx].split_y()
878 };
879
880 tiles.remove(largest_idx);
882 tiles.insert(largest_idx, new_rects[0]);
883 tiles.insert(largest_idx, new_rects[1]);
884 n += 1;
885 }
886
887 tiles
888}
889
890fn generate_circular_vertices(
891 mut angle: f64,
892 center_x: f64,
893 center_y: f64,
894 radius: f64,
895 num_vertices: usize,
896 closed: bool,
897) -> Result<Vec<Coord>> {
898 let mut out = Vec::new();
899
900 let dangle = 2.0 * PI / (num_vertices as f64).max(3.0);
901 for _ in 0..num_vertices {
902 out.push(Coord {
903 x: angle.cos() * radius + center_x,
904 y: angle.sin() * radius + center_y,
905 });
906 angle += dangle;
907 }
908
909 if closed {
910 out.push(out[0]);
911 }
912
913 Ok(out)
914}
915
916#[cfg(test)]
917mod tests {
918 use super::*;
919 use arrow_schema::DataType;
920 use geo_traits::{MultiLineStringTrait, MultiPolygonTrait};
921 use geo_types::Coord;
922 use rand::rngs::StdRng;
923 use rand::SeedableRng;
924 use rstest::rstest;
925 use sedona_geometry::{
926 analyze::analyze_geometry, bounds::wkb_bounds_xy, interval::IntervalTrait,
927 };
928
929 #[test]
930 fn test_generate_random_geometry_produces_valid_wkb() {
931 let bounds = Rect::new(Coord { x: 10.0, y: 10.0 }, Coord { x: 90.0, y: 90.0 });
932 let size_range = (1.0, 10.0);
933
934 let test_cases = vec![
936 (GeometryTypeId::Point, 42, 100, 20, 50), (GeometryTypeId::Polygon, 123, 50, 80, 200),
938 ];
939
940 for (geom_type, seed, iterations, min_size, max_size) in test_cases {
941 let mut rng = StdRng::seed_from_u64(seed);
942 let options = RandomGeometryOptions {
943 geom_type,
944 bounds,
945 size_range,
946 ..Default::default()
947 };
948
949 for _ in 0..iterations {
950 let wkb_bytes = generate_random_wkb(&mut rng, &options).unwrap();
951
952 assert!(!wkb_bytes.is_empty());
954 assert!(
955 wkb_bytes.len() >= min_size,
956 "WKB size {} is smaller than expected minimum {} for {:?}",
957 wkb_bytes.len(),
958 min_size,
959 geom_type
960 );
961 assert!(
962 wkb_bytes.len() <= max_size,
963 "WKB size {} is larger than expected maximum {} for {:?}",
964 wkb_bytes.len(),
965 max_size,
966 geom_type
967 );
968
969 wkb::reader::read_wkb(&wkb_bytes).unwrap();
971 }
972 }
973 }
974
975 #[test]
976 fn test_generate_random_geometry_deterministic() {
977 let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
978 let size_range = (1.0, 10.0);
979
980 let geom_types = [GeometryTypeId::Point, GeometryTypeId::Polygon];
981
982 let mut rng1 = StdRng::seed_from_u64(42);
984 let mut rng2 = StdRng::seed_from_u64(42);
985
986 for geom_type in geom_types {
987 let options = RandomGeometryOptions {
988 geom_type,
989 bounds,
990 size_range,
991 ..Default::default()
992 };
993 let wkb1 = generate_random_wkb(&mut rng1, &options).unwrap();
994 let wkb2 = generate_random_wkb(&mut rng2, &options).unwrap();
995
996 assert_eq!(wkb1, wkb2);
998 }
999 }
1000
1001 #[test]
1002 fn test_random_partitioned_data_builder_build_basic() {
1003 let (schema, partitions) = RandomPartitionedDataBuilder::new()
1004 .num_partitions(2)
1005 .batches_per_partition(3)
1006 .rows_per_batch(4)
1007 .null_rate(0.0) .build()
1009 .unwrap();
1010
1011 assert_eq!(schema.fields().len(), 3);
1013 assert_eq!(schema.field(0).name(), "id");
1014 assert_eq!(schema.field(0).data_type(), &DataType::Int32);
1015 assert_eq!(schema.field(1).name(), "dist");
1016 assert_eq!(schema.field(1).data_type(), &DataType::Float64);
1017 assert_eq!(schema.field(2).name(), "geometry");
1018
1019 assert_eq!(partitions.len(), 2); for partition in &partitions {
1023 assert_eq!(partition.len(), 3); for batch in partition {
1026 assert_eq!(batch.num_rows(), 4); assert_eq!(batch.num_columns(), 3);
1028 }
1029 }
1030 }
1031
1032 #[test]
1033 fn test_random_partitioned_data_builder_unique_ids() {
1034 let (_, partitions) = RandomPartitionedDataBuilder::new()
1035 .num_partitions(2)
1036 .batches_per_partition(2)
1037 .rows_per_batch(3)
1038 .build()
1039 .unwrap();
1040
1041 let mut all_ids = Vec::new();
1042
1043 for partition in &partitions {
1044 for batch in partition {
1045 let id_array = batch
1046 .column(0)
1047 .as_any()
1048 .downcast_ref::<Int32Array>()
1049 .unwrap();
1050 for i in 0..id_array.len() {
1051 all_ids.push(id_array.value(i));
1052 }
1053 }
1054 }
1055
1056 all_ids.sort();
1058 for i in 1..all_ids.len() {
1059 assert_ne!(
1060 all_ids[i - 1],
1061 all_ids[i],
1062 "Found duplicate ID: {}",
1063 all_ids[i]
1064 );
1065 }
1066
1067 for (i, &id) in all_ids.iter().enumerate() {
1069 assert_eq!(id, i as i32);
1070 }
1071 }
1072
1073 #[test]
1074 fn test_random_partitioned_data_builder_null_rate() {
1075 let (_, partitions) = RandomPartitionedDataBuilder::new()
1076 .rows_per_batch(100)
1077 .null_rate(0.5) .build()
1079 .unwrap();
1080
1081 let batch = &partitions[0][0];
1082 let geometry_array = batch.column(2);
1083
1084 let null_count = geometry_array.null_count();
1085 let total_count = geometry_array.len();
1086 let null_rate = null_count as f64 / total_count as f64;
1087
1088 assert!(
1090 (0.3..=0.7).contains(&null_rate),
1091 "Expected null rate around 0.5, got {null_rate}"
1092 );
1093 }
1094
1095 #[test]
1096 fn test_random_partitioned_data_builder_deterministic() {
1097 let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
1098
1099 let (schema1, partitions1) = RandomPartitionedDataBuilder::new()
1100 .seed(999)
1101 .num_partitions(2)
1102 .batches_per_partition(2)
1103 .rows_per_batch(5)
1104 .bounds(bounds)
1105 .build()
1106 .unwrap();
1107
1108 let (schema2, partitions2) = RandomPartitionedDataBuilder::new()
1109 .seed(999) .num_partitions(2)
1111 .batches_per_partition(2)
1112 .rows_per_batch(5)
1113 .bounds(bounds)
1114 .build()
1115 .unwrap();
1116
1117 assert_eq!(schema1, schema2);
1119
1120 assert_eq!(partitions1.len(), partitions2.len());
1122 for (partition1, partition2) in partitions1.iter().zip(partitions2.iter()) {
1123 assert_eq!(partition1.len(), partition2.len());
1124 for (batch1, batch2) in partition1.iter().zip(partition2.iter()) {
1125 let ids1 = batch1
1127 .column(0)
1128 .as_any()
1129 .downcast_ref::<Int32Array>()
1130 .unwrap();
1131 let ids2 = batch2
1132 .column(0)
1133 .as_any()
1134 .downcast_ref::<Int32Array>()
1135 .unwrap();
1136 assert_eq!(ids1, ids2);
1137
1138 let dists1 = batch1
1140 .column(1)
1141 .as_any()
1142 .downcast_ref::<Float64Array>()
1143 .unwrap();
1144 let dists2 = batch2
1145 .column(1)
1146 .as_any()
1147 .downcast_ref::<Float64Array>()
1148 .unwrap();
1149 assert_eq!(dists1, dists2);
1150 }
1151 }
1152 }
1153
1154 #[test]
1155 fn test_random_partitioned_data_builder_different_seeds() {
1156 let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
1157
1158 let (_, partitions1) = RandomPartitionedDataBuilder::new()
1159 .seed(111)
1160 .rows_per_batch(10)
1161 .bounds(bounds)
1162 .build()
1163 .unwrap();
1164
1165 let (_, partitions2) = RandomPartitionedDataBuilder::new()
1166 .seed(222) .rows_per_batch(10)
1168 .bounds(bounds)
1169 .build()
1170 .unwrap();
1171
1172 let dists1 = partitions1[0][0]
1174 .column(1)
1175 .as_any()
1176 .downcast_ref::<Float64Array>()
1177 .unwrap();
1178 let dists2 = partitions2[0][0]
1179 .column(1)
1180 .as_any()
1181 .downcast_ref::<Float64Array>()
1182 .unwrap();
1183
1184 let mut found_difference = false;
1186 for i in 0..dists1.len() {
1187 if (dists1.value(i) - dists2.value(i)).abs() > f64::EPSILON {
1188 found_difference = true;
1189 break;
1190 }
1191 }
1192 assert!(
1193 found_difference,
1194 "Expected different random data with different seeds"
1195 );
1196 }
1197
1198 #[test]
1199 fn test_random_linestring_num_vertices() {
1200 let mut rng = StdRng::seed_from_u64(123);
1201 let mut options = RandomGeometryOptions::new();
1202 options.vertices_per_linestring_range = (3, 3);
1203 for _ in 0..100 {
1204 let geom = generate_random_linestring(&mut rng, &options).unwrap();
1205 assert_eq!(geom.coords().count(), 3);
1206 }
1207
1208 options.vertices_per_linestring_range = (50, 50);
1209 for _ in 0..100 {
1210 let geom = generate_random_linestring(&mut rng, &options).unwrap();
1211 assert_eq!(geom.coords().count(), 50);
1212 }
1213 }
1214
1215 #[test]
1216 fn test_random_polygon_has_hole() {
1217 let mut rng = StdRng::seed_from_u64(123);
1218 let mut options = RandomGeometryOptions::new();
1219
1220 options.polygon_hole_rate = 0.0;
1221 for _ in 0..100 {
1222 let geom = generate_random_polygon(&mut rng, &options).unwrap();
1223 assert_eq!(geom.interiors().len(), 0);
1224 }
1225
1226 options.polygon_hole_rate = 1.0;
1227 for _ in 0..100 {
1228 let geom = generate_random_polygon(&mut rng, &options).unwrap();
1229 assert!(!geom.interiors().is_empty());
1230 }
1231 }
1232
1233 #[test]
1234 fn test_random_multipoint_part_count() {
1235 let mut rng = StdRng::seed_from_u64(123);
1236 let mut options = RandomGeometryOptions::new();
1237
1238 options.num_parts_range = (3, 3);
1239 for _ in 0..100 {
1240 let geom = generate_random_multipoint(&mut rng, &options).unwrap();
1241 assert_eq!(geom.len(), 3);
1242 }
1243
1244 options.num_parts_range = (10, 10);
1245 for _ in 0..100 {
1246 let geom = generate_random_multipoint(&mut rng, &options).unwrap();
1247 assert_eq!(geom.len(), 10);
1248 }
1249 }
1250
1251 #[test]
1252 fn test_random_multilinestring_part_count() {
1253 let mut rng = StdRng::seed_from_u64(123);
1254 let mut options = RandomGeometryOptions::new();
1255
1256 options.num_parts_range = (3, 3);
1257 for _ in 0..100 {
1258 let geom = generate_random_multilinestring(&mut rng, &options).unwrap();
1259 assert_eq!(geom.num_line_strings(), 3);
1260 }
1261
1262 options.num_parts_range = (10, 10);
1263 for _ in 0..100 {
1264 let geom = generate_random_multilinestring(&mut rng, &options).unwrap();
1265 assert_eq!(geom.num_line_strings(), 10);
1266 }
1267 }
1268
1269 #[test]
1270 fn test_random_multipolygon_part_count() {
1271 let mut rng = StdRng::seed_from_u64(123);
1272 let mut options = RandomGeometryOptions::new();
1273
1274 options.num_parts_range = (3, 3);
1275 for _ in 0..100 {
1276 let geom = generate_random_multipolygon(&mut rng, &options).unwrap();
1277 assert_eq!(geom.num_polygons(), 3);
1278 }
1279
1280 options.num_parts_range = (10, 10);
1281 for _ in 0..100 {
1282 let geom = generate_random_multipolygon(&mut rng, &options).unwrap();
1283 assert_eq!(geom.num_polygons(), 10);
1284 }
1285 }
1286
1287 #[test]
1288 fn test_random_geometrycollection_part_count() {
1289 let mut rng = StdRng::seed_from_u64(123);
1290 let mut options = RandomGeometryOptions::new();
1291
1292 options.num_parts_range = (3, 3);
1293 for _ in 0..100 {
1294 let geom = generate_random_geometrycollection(&mut rng, &options).unwrap();
1295 assert_eq!(geom.len(), 3);
1296 }
1297
1298 options.num_parts_range = (10, 10);
1299 for _ in 0..100 {
1300 let geom = generate_random_geometrycollection(&mut rng, &options).unwrap();
1301 assert_eq!(geom.len(), 10);
1302 }
1303 }
1304
1305 #[rstest]
1306 fn test_random_geometry_type(
1307 #[values(
1308 GeometryTypeId::Point,
1309 GeometryTypeId::LineString,
1310 GeometryTypeId::Polygon,
1311 GeometryTypeId::MultiPoint,
1312 GeometryTypeId::MultiLineString,
1313 GeometryTypeId::MultiPolygon,
1314 GeometryTypeId::GeometryCollection
1315 )]
1316 geom_type: GeometryTypeId,
1317 ) {
1318 let mut rng = StdRng::seed_from_u64(123);
1319 let mut options = RandomGeometryOptions::new();
1320 options.geom_type = geom_type;
1321
1322 options.empty_rate = 0.0;
1323 for _ in 0..100 {
1324 let geom = generate_random_wkb(&mut rng, &options).unwrap();
1325 let wkb = wkb::reader::read_wkb(&geom).unwrap();
1326 let analysis = analyze_geometry(&wkb).unwrap();
1327 assert_eq!(analysis.geometry_type.geometry_type(), geom_type);
1328 }
1329 }
1330
1331 #[rstest]
1332 fn test_random_emptiness(
1333 #[values(
1334 GeometryTypeId::Point,
1335 GeometryTypeId::LineString,
1336 GeometryTypeId::Polygon,
1337 GeometryTypeId::MultiPoint,
1338 GeometryTypeId::MultiLineString,
1339 GeometryTypeId::MultiPolygon,
1340 GeometryTypeId::GeometryCollection
1341 )]
1342 geom_type: GeometryTypeId,
1343 ) {
1344 let mut rng = StdRng::seed_from_u64(123);
1345 let mut options = RandomGeometryOptions::new();
1346 options.geom_type = geom_type;
1347
1348 options.empty_rate = 0.0;
1349 for _ in 0..100 {
1350 let geom = generate_random_wkb(&mut rng, &options).unwrap();
1351 let bounds = wkb_bounds_xy(&geom).unwrap();
1352 assert!(!bounds.x().is_empty());
1353 assert!(!bounds.y().is_empty());
1354
1355 assert!(
1356 bounds.x().lo() >= options.bounds.min().x
1357 && bounds.y().lo() >= options.bounds.min().y
1358 && bounds.x().hi() <= options.bounds.max().x
1359 && bounds.y().hi() <= options.bounds.max().y
1360 );
1361 }
1362
1363 options.empty_rate = 1.0;
1364 for _ in 0..100 {
1365 let geom = generate_random_wkb(&mut rng, &options).unwrap();
1366 let bounds = wkb_bounds_xy(&geom).unwrap();
1367 assert!(bounds.x().is_empty());
1368 assert!(bounds.y().is_empty());
1369 }
1370 }
1371
// Exercises `RandomPartitionedDataBuilder::validate()` with one invalid
// setting at a time and checks the exact error message produced for each.
#[test]
fn test_random_partitioned_data_builder_validation() {
    // Validates `builder`, requiring that it fails with exactly `expected` as
    // the rendered error. `#[track_caller]` makes a failure point at the
    // offending case below rather than at this helper.
    #[track_caller]
    fn expect_rejection(builder: RandomPartitionedDataBuilder, expected: &str) {
        let err = builder.validate().unwrap_err();
        assert_eq!(err.to_string(), expected);
    }

    // null_rate outside [0.0, 1.0]
    expect_rejection(
        RandomPartitionedDataBuilder::new().null_rate(-0.1),
        "Error during planning: Expected null_rate between 0.0 and 1.0 but got -0.1",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().null_rate(1.5),
        "Error during planning: Expected null_rate between 0.0 and 1.0 but got 1.5",
    );

    // Zero-sized batch / partition counts
    expect_rejection(
        RandomPartitionedDataBuilder::new().rows_per_batch(0),
        "Error during planning: Expected rows_per_batch > 0 but got 0",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().num_partitions(0),
        "Error during planning: Expected num_partitions > 0 but got 0",
    );

    // empty_rate outside [0.0, 1.0]
    expect_rejection(
        RandomPartitionedDataBuilder::new().empty_rate(-0.1),
        "Error during planning: Expected empty_rate between 0.0 and 1.0 but got -0.1",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().empty_rate(1.5),
        "Error during planning: Expected empty_rate between 0.0 and 1.0 but got 1.5",
    );

    // polygon_hole_rate outside [0.0, 1.0]
    expect_rejection(
        RandomPartitionedDataBuilder::new().polygon_hole_rate(-0.1),
        "Error during planning: Expected polygon_hole_rate between 0.0 and 1.0 but got -0.1",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().polygon_hole_rate(1.5),
        "Error during planning: Expected polygon_hole_rate between 0.0 and 1.0 but got 1.5",
    );

    // Rejected size_range values
    expect_rejection(
        RandomPartitionedDataBuilder::new().size_range((0.0, 10.0)),
        "Error during planning: Expected valid size_range but got (0.0, 10.0)",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().size_range((5.0, -1.0)),
        "Error during planning: Expected valid size_range but got (5.0, -1.0)",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().size_range((10.0, 5.0)),
        "Error during planning: Expected valid size_range but got (10.0, 5.0)",
    );

    // Rejected vertices_per_linestring_range values
    expect_rejection(
        RandomPartitionedDataBuilder::new().vertices_per_linestring_range((0, 5)),
        "Error during planning: Expected valid vertices_per_linestring_range but got (0, 5)",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().vertices_per_linestring_range((10, 5)),
        "Error during planning: Expected valid vertices_per_linestring_range but got (10, 5)",
    );

    // Rejected num_parts_range values
    expect_rejection(
        RandomPartitionedDataBuilder::new().num_parts_range((0, 5)),
        "Error during planning: Expected valid num_parts_range but got (0, 5)",
    );
    expect_rejection(
        RandomPartitionedDataBuilder::new().num_parts_range((10, 5)),
        "Error during planning: Expected valid num_parts_range but got (10, 5)",
    );

    // Bounds with zero width
    expect_rejection(
        RandomPartitionedDataBuilder::new().bounds(Rect::new(
            Coord { x: 10.0, y: 10.0 },
            Coord { x: 10.0, y: 20.0 },
        )),
        "Error during planning: Expected valid bounds but got RECT(10.0 10.0,10.0 20.0)",
    );

    // Bounds with zero height
    expect_rejection(
        RandomPartitionedDataBuilder::new().bounds(Rect::new(
            Coord { x: 10.0, y: 10.0 },
            Coord { x: 20.0, y: 10.0 },
        )),
        "Error during planning: Expected valid bounds but got RECT(10.0 10.0,20.0 10.0)",
    );
}
1550}