Skip to main content

sedona_testing/
datagen.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17//! Tools for generating random geometries
18//!
19//! The current options provided here were built to support basic correctness testing and
20//! benchmarking algorithmic complexity of a large number of scalar functions, many of which
21//! have various performance or correctness issues that arise from null features, empty features,
22//! polygons with holes, or collections with various numbers of sub-geometries.
23//! See <https://github.com/apache/sedona/pull/1680> for the Sedona/Java implementation of spider,
24//! which implements a number of other strategies for generating various geometry types.
25
26use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
27use arrow_array::{BinaryArray, BinaryViewArray};
28use arrow_array::{Float64Array, Int32Array};
29use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef};
30use datafusion_common::{exec_datafusion_err, plan_err, DataFusionError, Result};
31use geo_types::{
32    Coord, Geometry, GeometryCollection, LineString, MultiLineString, MultiPoint, MultiPolygon,
33    Point, Polygon, Rect,
34};
35use rand::{distr::Uniform, rngs::StdRng, Rng, RngExt, SeedableRng};
36use sedona_common::sedona_internal_err;
37use sedona_geometry::types::GeometryTypeId;
38use sedona_schema::datatypes::{SedonaType, WKB_GEOMETRY};
39use std::f64::consts::PI;
40use std::sync::Arc;
41use wkb::writer::WriteOptions;
42use wkb::Endianness;
43
/// Builder for generating test data partitions with random geometries.
///
/// This builder allows you to create deterministic test datasets with configurable
/// geometry types, data distribution, and partitioning for testing spatial operations.
///
/// The generated data includes:
///
/// - `id`: Unique integer identifier for each row
/// - `dist`: Random floating-point distance value sampled between 0.0 and the smaller
///   of the width and height of the bounds (0.0 to 100.0 with the default bounds)
/// - `geometry`: Random geometry data in the specified format (WKB or WKB View)
///
/// The strategy for generating geometries and their options are not stable and may change
/// as the needs of testing and benchmarking evolve or better strategies are discovered.
/// The strategy for generating random geometries is as follows:
///
/// - Points are uniformly distributed over the [Self::bounds] indicated
/// - Linestrings are generated by calculating the points in a circle of a randomly
///   chosen size (according to [Self::size_range]) with vertex count sampled using
///   [Self::vertices_per_linestring_range]. The start and end point of generated
///   linestrings are never connected.
/// - Polygons are generated using a closed version of the linestring generated.
///   They may or may not have a hole according to [Self::polygon_hole_rate].
/// - MultiPoint, MultiLineString, and MultiPolygon geometries are constructed
///   with the number of parts sampled according to [Self::num_parts_range].
///   The size of the entire feature is constrained to [Self::size_range],
///   and this space is subdivided to obtain the exact number of spaces needed.
///   Child features are generated using the global options except with sizes
///   sampled to approach the space given to them.
///
/// # Example
///
/// ```rust
/// use sedona_testing::datagen::RandomPartitionedDataBuilder;
/// use sedona_geometry::types::GeometryTypeId;
/// use geo_types::{Coord, Rect};
///
/// let (schema, partitions) = RandomPartitionedDataBuilder::new()
///     .seed(42)
///     .num_partitions(4)
///     .rows_per_batch(1000)
///     .geometry_type(GeometryTypeId::Polygon)
///     .bounds(Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 }))
///     .build()
///     .unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct RandomPartitionedDataBuilder {
    /// Base seed; partition `i` is generated from a generator seeded with `seed + i`
    pub seed: u64,
    /// Number of partitions produced by [Self::build]
    pub num_partitions: usize,
    /// Number of record batches produced for each partition
    pub batches_per_partition: usize,
    /// Number of rows in every record batch
    pub rows_per_batch: usize,
    /// Storage type of the `geometry` column (WKB or WKB View)
    sedona_type: SedonaType,
    /// Probability that a row's geometry is null
    null_rate: f64,
    /// Options forwarded to the individual geometry generators
    options: RandomGeometryOptions,
}
99
100impl Default for RandomPartitionedDataBuilder {
101    fn default() -> Self {
102        let options = RandomGeometryOptions::new();
103
104        Self {
105            seed: 42,
106            num_partitions: 1,
107            batches_per_partition: 1,
108            rows_per_batch: 10,
109            sedona_type: WKB_GEOMETRY,
110            null_rate: 0.0,
111            options,
112        }
113    }
114}
115
116impl RandomPartitionedDataBuilder {
117    /// Creates a new `RandomPartitionedDataBuilder` with default values.
118    ///
119    /// Default configuration:
120    ///
121    /// - seed: 42 (for deterministic results)
122    /// - num_partitions: 1
123    /// - batches_per_partition: 1
124    /// - rows_per_batch: 10
125    /// - geometry_type: Point
126    /// - bounds: (0,0) to (100,100)
127    /// - size_range: 1.0 to 10.0
128    /// - null_rate: 0.0 (no nulls)
129    /// - empty_rate: 0.0 (no empties)
130    /// - vertices_per_linestring_range
131    /// - num_parts_range: 1 to 3
132    /// - polygon_hole_rate: 0.0 (no polygons with holes)
133    pub fn new() -> Self {
134        Self::default()
135    }
136
137    /// Sets the random seed for deterministic data generation.
138    ///
139    /// Using the same seed will produce identical datasets, which is useful
140    /// for reproducible tests.
141    ///
142    /// # Arguments
143    ///
144    /// * `seed` - The random seed value
145    pub fn seed(mut self, seed: u64) -> Self {
146        self.seed = seed;
147        self
148    }
149
150    /// Sets the number of data partitions to generate.
151    ///
152    /// Each partition contains multiple batches of data. This is useful for
153    /// testing distributed processing scenarios.
154    ///
155    /// # Arguments
156    ///
157    /// * `num_partitions` - Number of partitions to create
158    pub fn num_partitions(mut self, num_partitions: usize) -> Self {
159        self.num_partitions = num_partitions;
160        self
161    }
162
163    /// Sets the number of batches per partition.
164    ///
165    /// Each batch is a `RecordBatch` containing the specified number of rows.
166    ///
167    /// # Arguments
168    ///
169    /// * `batches_per_partition` - Number of batches in each partition
170    pub fn batches_per_partition(mut self, batches_per_partition: usize) -> Self {
171        self.batches_per_partition = batches_per_partition;
172        self
173    }
174
175    /// Sets the number of rows per batch.
176    ///
177    /// This determines the size of each `RecordBatch` that will be generated.
178    ///
179    /// # Arguments
180    ///
181    /// * `rows_per_batch` - Number of rows in each batch
182    pub fn rows_per_batch(mut self, rows_per_batch: usize) -> Self {
183        self.rows_per_batch = rows_per_batch;
184        self
185    }
186
187    /// Sets the type of geometry to generate.
188    ///
189    /// Currently supports:
190    /// - `GeometryTypeId::Point`: Random points within the specified bounds
191    /// - `GeometryTypeId::Polygon`: Random diamond-shaped polygons
192    /// - Other types default to point generation
193    ///
194    /// # Arguments
195    ///
196    /// * `geom_type` - The geometry type to generate
197    pub fn geometry_type(mut self, geom_type: GeometryTypeId) -> Self {
198        self.options.geom_type = geom_type;
199        self
200    }
201
202    /// Sets the Sedona data type for the geometry column.
203    ///
204    /// This determines how the geometry data is stored (e.g., WKB or WKB View).
205    ///
206    /// # Arguments
207    ///
208    /// * `sedona_type` - The Sedona type for geometry storage
209    pub fn sedona_type(mut self, sedona_type: SedonaType) -> Self {
210        self.sedona_type = sedona_type;
211        self
212    }
213
214    /// Sets the spatial bounds for geometry generation.
215    ///
216    /// All generated geometries will be positioned within these bounds.
217    /// For polygons, the bounds are used to ensure the entire polygon fits within the area.
218    ///
219    /// # Arguments
220    ///
221    /// * `bounds` - Rectangle defining the spatial bounds (min_x, min_y, max_x, max_y)
222    pub fn bounds(mut self, bounds: Rect) -> Self {
223        self.options.bounds = bounds;
224        self
225    }
226
227    /// Sets the size range for generated geometries.
228    ///
229    /// For polygons, this controls the radius of the generated shapes.
230    /// For points, this parameter is not used.
231    ///
232    /// # Arguments
233    ///
234    /// * `size_range` - Tuple of (min_size, max_size) for geometry dimensions
235    pub fn size_range(mut self, size_range: (f64, f64)) -> Self {
236        self.options.size_range = size_range;
237        self
238    }
239
240    /// Sets the rate of null values in the geometry column.
241    ///
242    /// # Arguments
243    ///
244    /// * `null_rate` - Fraction of rows that should have null geometry (0.0 to 1.0)
245    pub fn null_rate(mut self, null_rate: f64) -> Self {
246        self.null_rate = null_rate;
247        self
248    }
249
250    /// Sets the rate of EMPTY geometries in the geometry column.
251    ///
252    /// # Arguments
253    ///
254    /// * `empty_rate` - Fraction of rows that should have empty geometry (0.0 to 1.0)
255    pub fn empty_rate(mut self, empty_rate: f64) -> Self {
256        self.options.empty_rate = empty_rate;
257        self
258    }
259
260    /// Sets the vertex count range
261    ///
262    /// # Arguments
263    ///
264    /// * `vertices_per_linestring_range` - The minimum and maximum (inclusive) number of vertices
265    ///   in linestring output. This also affects polygon output, although the actual number
266    ///   of vertices in the polygon ring will be one more than the range indicated here to
267    ///   close the polygon.
268    pub fn vertices_per_linestring_range(
269        mut self,
270        vertices_per_linestring_range: (usize, usize),
271    ) -> Self {
272        self.options.vertices_per_linestring_range = vertices_per_linestring_range;
273        self
274    }
275
276    /// Sets the number of parts range
277    ///
278    /// # Arguments
279    ///
280    /// * `num_parts_range` - The minimum and maximum (inclusive) number of parts
281    ///   in multi geometry and/or collection output.
282    pub fn num_parts_range(mut self, num_parts_range: (usize, usize)) -> Self {
283        self.options.num_parts_range = num_parts_range;
284        self
285    }
286
287    /// Sets the polygon hole rate
288    ///
289    /// # Arguments
290    ///
291    /// * `polygon_hole_rate` - Fraction of polygons that should have an interior
292    ///   ring. Currently only a single interior ring is possible.
293    pub fn polygon_hole_rate(mut self, polygon_hole_rate: f64) -> Self {
294        self.options.polygon_hole_rate = polygon_hole_rate;
295        self
296    }
297
298    /// The [SchemaRef] generated by this builder
299    ///
300    /// The resulting schema contains three columns:
301    ///
302    /// - `id`: Int32 - Unique sequential identifier for each row
303    /// - `dist`: Float64 - Random distance value between 0.0 and 100.0
304    /// - `geometry`: SedonaType - Random geometry data (WKB or WKB View format)
305    pub fn schema(&self) -> SchemaRef {
306        // Create schema
307        Arc::new(Schema::new(vec![
308            Field::new("id", DataType::Int32, false),
309            Field::new("dist", DataType::Float64, false),
310            self.sedona_type.to_storage_field("geometry", true).unwrap(),
311        ]))
312    }
313
314    /// Builds the random partitioned dataset with the configured parameters.
315    ///
316    /// Generates a deterministic dataset based on the seed and configuration.
317    /// The resulting schema contains three columns:
318    /// - `id`: Int32 - Unique sequential identifier for each row
319    /// - `dist`: Float64 - Random distance value between 0.0 and 100.0
320    /// - `geometry`: SedonaType - Random geometry data (WKB or WKB View format)
321    ///
322    /// # Returns
323    ///
324    /// A tuple containing:
325    /// - `SchemaRef`: Arrow schema for the generated data
326    /// - `Vec<Vec<RecordBatch>>`: Vector of partitions, each containing a vector of record batches
327    ///
328    /// # Errors
329    ///
330    /// Returns a `datafusion_common::Result` error if:
331    /// - RecordBatch creation fails
332    /// - Array conversion fails
333    /// - Schema creation fails
334    pub fn build(&self) -> Result<(SchemaRef, Vec<Vec<RecordBatch>>)> {
335        // Create a seeded random number generator for deterministic results
336        let schema = self.schema();
337        let mut result = Vec::with_capacity(self.num_partitions);
338
339        for partition_idx in 0..self.num_partitions {
340            let rng = Self::default_rng(self.seed + partition_idx as u64);
341            let partition_batches = self
342                .partition_reader(rng, partition_idx)
343                .collect::<Result<Vec<_>, ArrowError>>()?;
344            result.push(partition_batches);
345        }
346
347        Ok((schema, result))
348    }
349
350    /// Validate options
351    ///
352    /// This is called internally before generating batches to prevent panics from
353    /// occurring while creating random output; however, it may also be called
354    /// at a higher level to generate an error at a more relevant time.
355    pub fn validate(&self) -> Result<()> {
356        self.options.validate()?;
357
358        if self.null_rate < 0.0 || self.null_rate > 1.0 {
359            return plan_err!(
360                "Expected null_rate between 0.0 and 1.0 but got {}",
361                self.null_rate
362            );
363        }
364
365        if self.rows_per_batch == 0 {
366            return plan_err!("Expected rows_per_batch > 0 but got 0");
367        }
368
369        if self.num_partitions == 0 {
370            return plan_err!("Expected num_partitions > 0 but got 0");
371        }
372
373        Ok(())
374    }
375
376    /// Generate a [Rng] based on a seed
377    ///
378    /// Callers can also supply their own [Rng].
379    pub fn default_rng(seed: u64) -> impl Rng {
380        StdRng::seed_from_u64(seed)
381    }
382
383    /// Create a [RecordBatchReader] that reads a single partition
384    pub fn partition_reader<R: Rng + Send + 'static>(
385        &self,
386        rng: R,
387        partition_idx: usize,
388    ) -> Box<dyn RecordBatchReader + Send> {
389        let reader = RandomPartitionedDataReader {
390            builder: self.clone(),
391            schema: self.schema(),
392            partition_idx,
393            batch_idx: 0,
394            rng,
395        };
396
397        Box::new(reader)
398    }
399
400    /// Generate a single batch
401    fn generate_batch<R: Rng>(
402        &self,
403        rng: &mut R,
404        schema: &SchemaRef,
405        partition_idx: usize,
406        batch_idx: usize,
407    ) -> Result<RecordBatch> {
408        // Check for valid ranges to avoid panic in generation
409        self.validate()?;
410
411        // Generate IDs - make them unique across partitions and batches
412        let id_start =
413            (partition_idx * self.batches_per_partition + batch_idx) * self.rows_per_batch;
414        let ids: Vec<i32> = (0..self.rows_per_batch)
415            .map(|i| (id_start + i) as i32)
416            .collect();
417
418        // Generate random distances relevant to the bounds (0.0 and 100.0 by default)
419        let max_dist = self
420            .options
421            .bounds
422            .width()
423            .min(self.options.bounds.height());
424        let distance_dist = Uniform::new(0.0, max_dist).expect("valid input to Uniform::new()");
425        let distances: Vec<f64> = (0..self.rows_per_batch)
426            .map(|_| rng.sample(distance_dist))
427            .collect();
428
429        // Generate random geometries based on the geometry type
430        let wkb_geometries = (0..self.rows_per_batch)
431            .map(|_| -> Result<Option<Vec<u8>>> {
432                if rng.random_bool(self.null_rate) {
433                    Ok(None)
434                } else {
435                    Ok(Some(generate_random_wkb(rng, &self.options)?))
436                }
437            })
438            .collect::<Result<Vec<Option<Vec<u8>>>>>()?;
439
440        // Create Arrow arrays
441        let id_array = Arc::new(Int32Array::from(ids));
442        let dist_array = Arc::new(Float64Array::from(distances));
443        let geometry_array = create_wkb_array(wkb_geometries, &self.sedona_type)?;
444
445        // Create RecordBatch
446        Ok(RecordBatch::try_new(
447            schema.clone(),
448            vec![id_array, dist_array, geometry_array],
449        )?)
450    }
451}
452
453/// Create an ArrayRef from a vector of WKB bytes based on the sedona type
454fn create_wkb_array(
455    wkb_values: Vec<Option<Vec<u8>>>,
456    sedona_type: &SedonaType,
457) -> Result<ArrayRef> {
458    match sedona_type {
459        SedonaType::Wkb(_, _) => Ok(Arc::new(BinaryArray::from_iter(wkb_values))),
460        SedonaType::WkbView(_, _) => Ok(Arc::new(BinaryViewArray::from_iter(wkb_values))),
461        _ => sedona_internal_err!("create_wkb_array not implemented for {sedona_type:?}"),
462    }
463}
464
/// Lazily generates the record batches of a single partition.
///
/// Batches are produced one at a time by the iterator implementation until
/// `batch_idx` reaches the builder's `batches_per_partition`.
struct RandomPartitionedDataReader<R> {
    /// Copy of the builder holding the generation options
    builder: RandomPartitionedDataBuilder,
    /// Schema passed to every generated batch
    schema: SchemaRef,
    /// Index of the partition being read; used to offset generated ids
    partition_idx: usize,
    /// Index of the next batch to generate
    batch_idx: usize,
    /// Source of randomness for this partition
    rng: R,
}
472
473impl<R: Rng> RecordBatchReader for RandomPartitionedDataReader<R> {
474    fn schema(&self) -> SchemaRef {
475        self.builder.schema()
476    }
477}
478
479impl<R: Rng> Iterator for RandomPartitionedDataReader<R> {
480    type Item = std::result::Result<RecordBatch, ArrowError>;
481
482    fn next(&mut self) -> Option<Self::Item> {
483        if self.batch_idx == self.builder.batches_per_partition {
484            return None;
485        }
486
487        let maybe_batch = self
488            .builder
489            .generate_batch(
490                &mut self.rng,
491                &self.schema,
492                self.partition_idx,
493                self.batch_idx,
494            )
495            .map_err(|e| ArrowError::ExternalError(Box::new(e)));
496        self.batch_idx += 1;
497        Some(maybe_batch)
498    }
499}
500
/// Options for the current strategy influencing individual geometry constructors
#[derive(Debug, Clone)]
struct RandomGeometryOptions {
    /// Geometry type to generate; `Geometry` selects a random concrete type per feature
    geom_type: GeometryTypeId,
    /// Rectangle within which points are sampled and circles are positioned
    bounds: Rect,
    /// Inclusive (min, max) range from which the size of a feature is sampled
    size_range: (f64, f64),
    /// Inclusive (min, max) vertex count; raised to at least 2 for linestrings
    /// and at least 3 for polygons when sampled
    vertices_per_linestring_range: (usize, usize),
    /// Probability that a generated feature is empty
    empty_rate: f64,
    /// Probability that a generated polygon receives an interior ring
    polygon_hole_rate: f64,
    /// Inclusive (min, max) number of parts in multi geometries and collections
    num_parts_range: (usize, usize),
}
512
513impl RandomGeometryOptions {
514    fn new() -> Self {
515        Self {
516            geom_type: GeometryTypeId::Point,
517            empty_rate: 0.0,
518            bounds: Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 }),
519            size_range: (1.0, 10.0),
520            vertices_per_linestring_range: (4, 4),
521            polygon_hole_rate: 0.0,
522            num_parts_range: (1, 3),
523        }
524    }
525
526    fn validate(&self) -> Result<()> {
527        if self.bounds.width() <= 0.0 || self.bounds.height() <= 0.0 {
528            return plan_err!("Expected valid bounds but got {:?}", self.bounds);
529        }
530
531        if self.size_range.0 <= 0.0 || self.size_range.0 > self.size_range.1 {
532            return plan_err!("Expected valid size_range but got {:?}", self.size_range);
533        }
534
535        if self.vertices_per_linestring_range.0 == 0
536            || self.vertices_per_linestring_range.0 > self.vertices_per_linestring_range.1
537        {
538            return plan_err!(
539                "Expected valid vertices_per_linestring_range but got {:?}",
540                self.vertices_per_linestring_range
541            );
542        }
543
544        if !(0.0..=1.0).contains(&self.empty_rate) {
545            return plan_err!(
546                "Expected empty_rate between 0.0 and 1.0 but got {}",
547                self.empty_rate
548            );
549        }
550
551        if !(0.0..=1.0).contains(&self.polygon_hole_rate) {
552            return plan_err!(
553                "Expected polygon_hole_rate between 0.0 and 1.0 but got {}",
554                self.polygon_hole_rate
555            );
556        }
557
558        if self.num_parts_range.0 == 0 || self.num_parts_range.0 > self.num_parts_range.1 {
559            return plan_err!(
560                "Expected valid num_parts_range but got {:?}",
561                self.num_parts_range
562            );
563        }
564
565        Ok(())
566    }
567}
568
impl Default for RandomGeometryOptions {
    /// Equivalent to [RandomGeometryOptions::new].
    fn default() -> Self {
        Self::new()
    }
}
574
575/// Generate random geometry WKB bytes based on the geometry type
576fn generate_random_wkb<R: rand::Rng>(
577    rng: &mut R,
578    options: &RandomGeometryOptions,
579) -> Result<Vec<u8>> {
580    let geometry = generate_random_geometry(rng, options)?;
581
582    // Convert geometry to WKB
583    let mut out: Vec<u8> = vec![];
584    wkb::writer::write_geometry(
585        &mut out,
586        &geometry,
587        &WriteOptions {
588            endianness: Endianness::LittleEndian,
589        },
590    )
591    .map_err(|e| DataFusionError::External(Box::new(e)))?;
592    Ok(out)
593}
594
595fn generate_random_geometry<R: rand::Rng>(
596    rng: &mut R,
597    options: &RandomGeometryOptions,
598) -> Result<Geometry> {
599    Ok(match options.geom_type {
600        GeometryTypeId::Point => Geometry::Point(generate_random_point(rng, options)?),
601        GeometryTypeId::LineString => {
602            Geometry::LineString(generate_random_linestring(rng, options)?)
603        }
604        GeometryTypeId::Polygon => Geometry::Polygon(generate_random_polygon(rng, options)?),
605        GeometryTypeId::MultiPoint => {
606            Geometry::MultiPoint(generate_random_multipoint(rng, options)?)
607        }
608        GeometryTypeId::MultiLineString => {
609            Geometry::MultiLineString(generate_random_multilinestring(rng, options)?)
610        }
611        GeometryTypeId::MultiPolygon => {
612            Geometry::MultiPolygon(generate_random_multipolygon(rng, options)?)
613        }
614        GeometryTypeId::GeometryCollection => {
615            Geometry::GeometryCollection(generate_random_geometrycollection(rng, options)?)
616        }
617        GeometryTypeId::Geometry => {
618            let mut copy_options = options.clone();
619            copy_options.geom_type = pick_random_geometry_type(rng);
620            generate_random_geometry(rng, &copy_options)?
621        }
622    })
623}
624
625fn generate_random_point<R: rand::Rng>(
626    rng: &mut R,
627    options: &RandomGeometryOptions,
628) -> Result<Point> {
629    if rng.random_bool(options.empty_rate) {
630        // This is a bit of a hack because geo-types doesn't support empty point; however,
631        // this does work with respect to sending this directly to the WKB reader and getting
632        // the WKB result we want
633        Ok(Point::new(f64::NAN, f64::NAN))
634    } else {
635        // Generate random points within the specified bounds
636        let x_dist = Uniform::new(options.bounds.min().x, options.bounds.max().x)
637            .map_err(|e| exec_datafusion_err!("Invalid x bounds for random point: {e}"))?;
638        let y_dist = Uniform::new(options.bounds.min().y, options.bounds.max().y)
639            .map_err(|e| exec_datafusion_err!("Invalid y bounds for random point: {e}"))?;
640        let x = rng.sample(x_dist);
641        let y = rng.sample(y_dist);
642        Ok(Point::new(x, y))
643    }
644}
645
646fn generate_random_linestring<R: rand::Rng>(
647    rng: &mut R,
648    options: &RandomGeometryOptions,
649) -> Result<LineString> {
650    if rng.random_bool(options.empty_rate) {
651        Ok(LineString::new(vec![]))
652    } else {
653        let (center_x, center_y, half_size) = generate_random_circle(rng, options)?;
654        let vertices_dist = Uniform::new_inclusive(
655            options.vertices_per_linestring_range.0,
656            options.vertices_per_linestring_range.1,
657        )
658        .map_err(|e| exec_datafusion_err!("Invalid vertex count range for linestring: {e}"))?;
659        // Always sample in such a way that we end up with a valid linestring
660        let num_vertices = rng.sample(vertices_dist).max(2);
661        // Randomize starting angle (0 to 2 * PI)
662        let angle = rng.random_range(0.0..(2.0 * PI));
663        let coords =
664            generate_circular_vertices(angle, center_x, center_y, half_size, num_vertices, false)?;
665        Ok(LineString::from(coords))
666    }
667}
668
669fn generate_random_polygon<R: rand::Rng>(
670    rng: &mut R,
671    options: &RandomGeometryOptions,
672) -> Result<Polygon> {
673    if rng.random_bool(options.empty_rate) {
674        Ok(Polygon::new(LineString::new(vec![]), vec![]))
675    } else {
676        let (center_x, center_y, half_size) = generate_random_circle(rng, options)?;
677        let vertices_dist = Uniform::new_inclusive(
678            options.vertices_per_linestring_range.0,
679            options.vertices_per_linestring_range.1,
680        )
681        .map_err(|e| exec_datafusion_err!("Invalid vertex count range for polygon: {e}"))?;
682        // Always sample in such a way that we end up with a valid Polygon
683        let num_vertices = rng.sample(vertices_dist).max(3);
684
685        // Randomize starting angle (but use the same starting angle for both the shell
686        // and the hole to ensure a non-intersecting interior)
687        let angle = rng.random_range(0.0..=(2.0 * PI));
688        let coords =
689            generate_circular_vertices(angle, center_x, center_y, half_size, num_vertices, true)?;
690        let shell = LineString::from(coords);
691        let mut holes = Vec::new();
692
693        // Potentially add a hole based on probability
694        let add_hole = rng.random_bool(options.polygon_hole_rate);
695        let hole_scale_factor = rng.random_range(0.1..0.5);
696        if add_hole {
697            let new_size = half_size * hole_scale_factor;
698            let mut coords = generate_circular_vertices(
699                angle,
700                center_x,
701                center_y,
702                new_size,
703                num_vertices,
704                true,
705            )?;
706            coords.reverse();
707            holes.push(LineString::from(coords));
708        }
709
710        Ok(Polygon::new(shell, holes))
711    }
712}
713
714fn generate_random_multipoint<R: rand::Rng>(
715    rng: &mut R,
716    options: &RandomGeometryOptions,
717) -> Result<MultiPoint> {
718    if rng.random_bool(options.empty_rate) {
719        Ok(MultiPoint::new(vec![]))
720    } else {
721        let children = generate_random_children(rng, options, generate_random_point)?;
722        Ok(MultiPoint::new(children))
723    }
724}
725
726fn generate_random_multilinestring<R: rand::Rng>(
727    rng: &mut R,
728    options: &RandomGeometryOptions,
729) -> Result<MultiLineString> {
730    if rng.random_bool(options.empty_rate) {
731        Ok(MultiLineString::new(vec![]))
732    } else {
733        let children = generate_random_children(rng, options, generate_random_linestring)?;
734        Ok(MultiLineString::new(children))
735    }
736}
737
738fn generate_random_multipolygon<R: rand::Rng>(
739    rng: &mut R,
740    options: &RandomGeometryOptions,
741) -> Result<MultiPolygon> {
742    if rng.random_bool(options.empty_rate) {
743        Ok(MultiPolygon::new(vec![]))
744    } else {
745        let children = generate_random_children(rng, options, generate_random_polygon)?;
746        Ok(MultiPolygon::new(children))
747    }
748}
749
750fn generate_random_geometrycollection<R: rand::Rng>(
751    rng: &mut R,
752    options: &RandomGeometryOptions,
753) -> Result<GeometryCollection> {
754    if rng.random_bool(options.empty_rate) {
755        Ok(GeometryCollection::new_from(vec![]))
756    } else {
757        let children = generate_random_children(rng, options, generate_random_geometry)?;
758        Ok(GeometryCollection::new_from(children))
759    }
760}
761
762fn generate_random_children<R: Rng, T, F: Fn(&mut R, &RandomGeometryOptions) -> Result<T>>(
763    rng: &mut R,
764    options: &RandomGeometryOptions,
765    func: F,
766) -> Result<Vec<T>> {
767    let num_parts_dist =
768        Uniform::new_inclusive(options.num_parts_range.0, options.num_parts_range.1)
769            .map_err(|e| exec_datafusion_err!("Invalid part count range: {e}"))?;
770    let num_parts = rng.sample(num_parts_dist);
771
772    // Constrain this feature to the size range indicated in the option
773    let (center_x, center_y, half_width) = generate_random_circle(rng, options)?;
774    let feature_bounds = Rect::new(
775        Coord {
776            x: center_x - half_width,
777            y: center_y - half_width,
778        },
779        Coord {
780            x: center_x + half_width,
781            y: center_y + half_width,
782        },
783    );
784
785    let child_bounds = generate_non_overlapping_sub_rectangles(num_parts, &feature_bounds);
786    let mut child_options = options.clone();
787    child_options.empty_rate = 0.0;
788
789    let mut children = Vec::new();
790    for bounds in child_bounds {
791        child_options.bounds = bounds;
792        let child_size = bounds.height().min(bounds.width());
793        child_options.size_range = (child_size * 0.9, child_size);
794
795        // If GeometryCollection, pick a random geometry type
796        // Don't support nested GeometryCollection for now to avoid too much recursion
797        if options.geom_type == GeometryTypeId::GeometryCollection {
798            child_options.geom_type = pick_random_geometry_type(rng);
799        }
800        children.push(func(rng, &child_options)?);
801    }
802
803    Ok(children)
804}
805
806fn pick_random_geometry_type<R: Rng>(rng: &mut R) -> GeometryTypeId {
807    [
808        GeometryTypeId::Point,
809        GeometryTypeId::LineString,
810        GeometryTypeId::Polygon,
811        GeometryTypeId::MultiPoint,
812        GeometryTypeId::MultiLineString,
813        GeometryTypeId::MultiPolygon,
814    ][rng.random_range(0..6)]
815}
816
817fn generate_random_circle<R: rand::Rng>(
818    rng: &mut R,
819    options: &RandomGeometryOptions,
820) -> Result<(f64, f64, f64)> {
821    // Generate random circular polygons
822    let size_dist = Uniform::new_inclusive(options.size_range.0, options.size_range.1)
823        .map_err(|e| exec_datafusion_err!("Invalid size range for random region: {e}"))?;
824    let size = rng.sample(size_dist);
825    let half_size = size / 2.0;
826    let height = options.bounds.height();
827    let width = options.bounds.width();
828
829    // Ensure circle fits within bounds by constraining center position
830    let center_x = if width >= size {
831        let center_x_dist = Uniform::new(
832            options.bounds.min().x + half_size,
833            options.bounds.max().x - half_size,
834        )
835        .map_err(|e| exec_datafusion_err!("Invalid x bounds for random circle center: {e}"))?;
836
837        rng.sample(center_x_dist)
838    } else {
839        options.bounds.min().x + width / 2.0
840    };
841
842    let center_y = if height >= size {
843        let center_y_dist = Uniform::new(
844            options.bounds.min().y + half_size,
845            options.bounds.max().y - half_size,
846        )
847        .map_err(|e| exec_datafusion_err!("Invalid y bounds for random circle center: {e}"))?;
848
849        rng.sample(center_y_dist)
850    } else {
851        options.bounds.min().y + height / 2.0
852    };
853
854    Ok((
855        center_x,
856        center_y,
857        half_size.min(height / 2.0).min(width / 2.0),
858    ))
859}
860
861fn generate_non_overlapping_sub_rectangles(num_parts: usize, bounds: &Rect) -> Vec<Rect> {
862    let mut tiles = vec![*bounds];
863    let mut n = 0;
864    while tiles.len() < num_parts {
865        // Find the largest rectangle
866        let (largest_idx, _) = tiles
867            .iter()
868            .enumerate()
869            .map(|(i, rect)| (i, rect.height() * rect.width()))
870            .max_by(|(_, a1), (_, a2)| a1.partial_cmp(a2).unwrap())
871            .unwrap_or((0, 0.0));
872
873        // Mix up subdividing by x and y
874        let new_rects = if (n % 2) == 0 {
875            tiles[largest_idx].split_x()
876        } else {
877            tiles[largest_idx].split_y()
878        };
879
880        // Remove the largest rectangle and add its subdivisions
881        tiles.remove(largest_idx);
882        tiles.insert(largest_idx, new_rects[0]);
883        tiles.insert(largest_idx, new_rects[1]);
884        n += 1;
885    }
886
887    tiles
888}
889
890fn generate_circular_vertices(
891    mut angle: f64,
892    center_x: f64,
893    center_y: f64,
894    radius: f64,
895    num_vertices: usize,
896    closed: bool,
897) -> Result<Vec<Coord>> {
898    let mut out = Vec::new();
899
900    let dangle = 2.0 * PI / (num_vertices as f64).max(3.0);
901    for _ in 0..num_vertices {
902        out.push(Coord {
903            x: angle.cos() * radius + center_x,
904            y: angle.sin() * radius + center_y,
905        });
906        angle += dangle;
907    }
908
909    if closed {
910        out.push(out[0]);
911    }
912
913    Ok(out)
914}
915
916#[cfg(test)]
917mod tests {
918    use super::*;
919    use arrow_schema::DataType;
920    use geo_traits::{MultiLineStringTrait, MultiPolygonTrait};
921    use geo_types::Coord;
922    use rand::rngs::StdRng;
923    use rand::SeedableRng;
924    use rstest::rstest;
925    use sedona_geometry::{
926        analyze::analyze_geometry, bounds::wkb_bounds_xy, interval::IntervalTrait,
927    };
928
929    #[test]
930    fn test_generate_random_geometry_produces_valid_wkb() {
931        let bounds = Rect::new(Coord { x: 10.0, y: 10.0 }, Coord { x: 90.0, y: 90.0 });
932        let size_range = (1.0, 10.0);
933
934        // Test both Point and Polygon geometry types
935        let test_cases = vec![
936            (GeometryTypeId::Point, 42, 100, 20, 50), // (type, seed, iterations, min_size, max_size)
937            (GeometryTypeId::Polygon, 123, 50, 80, 200),
938        ];
939
940        for (geom_type, seed, iterations, min_size, max_size) in test_cases {
941            let mut rng = StdRng::seed_from_u64(seed);
942            let options = RandomGeometryOptions {
943                geom_type,
944                bounds,
945                size_range,
946                ..Default::default()
947            };
948
949            for _ in 0..iterations {
950                let wkb_bytes = generate_random_wkb(&mut rng, &options).unwrap();
951
952                // Verify WKB is not empty and has reasonable size
953                assert!(!wkb_bytes.is_empty());
954                assert!(
955                    wkb_bytes.len() >= min_size,
956                    "WKB size {} is smaller than expected minimum {} for {:?}",
957                    wkb_bytes.len(),
958                    min_size,
959                    geom_type
960                );
961                assert!(
962                    wkb_bytes.len() <= max_size,
963                    "WKB size {} is larger than expected maximum {} for {:?}",
964                    wkb_bytes.len(),
965                    max_size,
966                    geom_type
967                );
968
969                // Verify WKB can be parsed without error
970                wkb::reader::read_wkb(&wkb_bytes).unwrap();
971            }
972        }
973    }
974
975    #[test]
976    fn test_generate_random_geometry_deterministic() {
977        let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
978        let size_range = (1.0, 10.0);
979
980        let geom_types = [GeometryTypeId::Point, GeometryTypeId::Polygon];
981
982        // Generate with same seed twice
983        let mut rng1 = StdRng::seed_from_u64(42);
984        let mut rng2 = StdRng::seed_from_u64(42);
985
986        for geom_type in geom_types {
987            let options = RandomGeometryOptions {
988                geom_type,
989                bounds,
990                size_range,
991                ..Default::default()
992            };
993            let wkb1 = generate_random_wkb(&mut rng1, &options).unwrap();
994            let wkb2 = generate_random_wkb(&mut rng2, &options).unwrap();
995
996            // Should generate identical results
997            assert_eq!(wkb1, wkb2);
998        }
999    }
1000
1001    #[test]
1002    fn test_random_partitioned_data_builder_build_basic() {
1003        let (schema, partitions) = RandomPartitionedDataBuilder::new()
1004            .num_partitions(2)
1005            .batches_per_partition(3)
1006            .rows_per_batch(4)
1007            .null_rate(0.0) // No nulls for easier testing
1008            .build()
1009            .unwrap();
1010
1011        // Verify schema
1012        assert_eq!(schema.fields().len(), 3);
1013        assert_eq!(schema.field(0).name(), "id");
1014        assert_eq!(schema.field(0).data_type(), &DataType::Int32);
1015        assert_eq!(schema.field(1).name(), "dist");
1016        assert_eq!(schema.field(1).data_type(), &DataType::Float64);
1017        assert_eq!(schema.field(2).name(), "geometry");
1018
1019        // Verify partitions structure
1020        assert_eq!(partitions.len(), 2); // num_partitions
1021
1022        for partition in &partitions {
1023            assert_eq!(partition.len(), 3); // batches_per_partition
1024
1025            for batch in partition {
1026                assert_eq!(batch.num_rows(), 4); // rows_per_batch
1027                assert_eq!(batch.num_columns(), 3);
1028            }
1029        }
1030    }
1031
1032    #[test]
1033    fn test_random_partitioned_data_builder_unique_ids() {
1034        let (_, partitions) = RandomPartitionedDataBuilder::new()
1035            .num_partitions(2)
1036            .batches_per_partition(2)
1037            .rows_per_batch(3)
1038            .build()
1039            .unwrap();
1040
1041        let mut all_ids = Vec::new();
1042
1043        for partition in &partitions {
1044            for batch in partition {
1045                let id_array = batch
1046                    .column(0)
1047                    .as_any()
1048                    .downcast_ref::<Int32Array>()
1049                    .unwrap();
1050                for i in 0..id_array.len() {
1051                    all_ids.push(id_array.value(i));
1052                }
1053            }
1054        }
1055
1056        // Verify all IDs are unique
1057        all_ids.sort();
1058        for i in 1..all_ids.len() {
1059            assert_ne!(
1060                all_ids[i - 1],
1061                all_ids[i],
1062                "Found duplicate ID: {}",
1063                all_ids[i]
1064            );
1065        }
1066
1067        // Verify IDs are sequential starting from 0
1068        for (i, &id) in all_ids.iter().enumerate() {
1069            assert_eq!(id, i as i32);
1070        }
1071    }
1072
1073    #[test]
1074    fn test_random_partitioned_data_builder_null_rate() {
1075        let (_, partitions) = RandomPartitionedDataBuilder::new()
1076            .rows_per_batch(100)
1077            .null_rate(0.5) // 50% null rate
1078            .build()
1079            .unwrap();
1080
1081        let batch = &partitions[0][0];
1082        let geometry_array = batch.column(2);
1083
1084        let null_count = geometry_array.null_count();
1085        let total_count = geometry_array.len();
1086        let null_rate = null_count as f64 / total_count as f64;
1087
1088        // Allow some variance due to randomness (±20%)
1089        assert!(
1090            (0.3..=0.7).contains(&null_rate),
1091            "Expected null rate around 0.5, got {null_rate}"
1092        );
1093    }
1094
1095    #[test]
1096    fn test_random_partitioned_data_builder_deterministic() {
1097        let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
1098
1099        let (schema1, partitions1) = RandomPartitionedDataBuilder::new()
1100            .seed(999)
1101            .num_partitions(2)
1102            .batches_per_partition(2)
1103            .rows_per_batch(5)
1104            .bounds(bounds)
1105            .build()
1106            .unwrap();
1107
1108        let (schema2, partitions2) = RandomPartitionedDataBuilder::new()
1109            .seed(999) // Same seed
1110            .num_partitions(2)
1111            .batches_per_partition(2)
1112            .rows_per_batch(5)
1113            .bounds(bounds)
1114            .build()
1115            .unwrap();
1116
1117        // Schemas should be identical
1118        assert_eq!(schema1, schema2);
1119
1120        // All data should be identical
1121        assert_eq!(partitions1.len(), partitions2.len());
1122        for (partition1, partition2) in partitions1.iter().zip(partitions2.iter()) {
1123            assert_eq!(partition1.len(), partition2.len());
1124            for (batch1, batch2) in partition1.iter().zip(partition2.iter()) {
1125                // Compare IDs
1126                let ids1 = batch1
1127                    .column(0)
1128                    .as_any()
1129                    .downcast_ref::<Int32Array>()
1130                    .unwrap();
1131                let ids2 = batch2
1132                    .column(0)
1133                    .as_any()
1134                    .downcast_ref::<Int32Array>()
1135                    .unwrap();
1136                assert_eq!(ids1, ids2);
1137
1138                // Compare distances
1139                let dists1 = batch1
1140                    .column(1)
1141                    .as_any()
1142                    .downcast_ref::<Float64Array>()
1143                    .unwrap();
1144                let dists2 = batch2
1145                    .column(1)
1146                    .as_any()
1147                    .downcast_ref::<Float64Array>()
1148                    .unwrap();
1149                assert_eq!(dists1, dists2);
1150            }
1151        }
1152    }
1153
1154    #[test]
1155    fn test_random_partitioned_data_builder_different_seeds() {
1156        let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
1157
1158        let (_, partitions1) = RandomPartitionedDataBuilder::new()
1159            .seed(111)
1160            .rows_per_batch(10)
1161            .bounds(bounds)
1162            .build()
1163            .unwrap();
1164
1165        let (_, partitions2) = RandomPartitionedDataBuilder::new()
1166            .seed(222) // Different seed
1167            .rows_per_batch(10)
1168            .bounds(bounds)
1169            .build()
1170            .unwrap();
1171
1172        // Data should be different (distances should differ)
1173        let dists1 = partitions1[0][0]
1174            .column(1)
1175            .as_any()
1176            .downcast_ref::<Float64Array>()
1177            .unwrap();
1178        let dists2 = partitions2[0][0]
1179            .column(1)
1180            .as_any()
1181            .downcast_ref::<Float64Array>()
1182            .unwrap();
1183
1184        // At least some distances should be different
1185        let mut found_difference = false;
1186        for i in 0..dists1.len() {
1187            if (dists1.value(i) - dists2.value(i)).abs() > f64::EPSILON {
1188                found_difference = true;
1189                break;
1190            }
1191        }
1192        assert!(
1193            found_difference,
1194            "Expected different random data with different seeds"
1195        );
1196    }
1197
1198    #[test]
1199    fn test_random_linestring_num_vertices() {
1200        let mut rng = StdRng::seed_from_u64(123);
1201        let mut options = RandomGeometryOptions::new();
1202        options.vertices_per_linestring_range = (3, 3);
1203        for _ in 0..100 {
1204            let geom = generate_random_linestring(&mut rng, &options).unwrap();
1205            assert_eq!(geom.coords().count(), 3);
1206        }
1207
1208        options.vertices_per_linestring_range = (50, 50);
1209        for _ in 0..100 {
1210            let geom = generate_random_linestring(&mut rng, &options).unwrap();
1211            assert_eq!(geom.coords().count(), 50);
1212        }
1213    }
1214
1215    #[test]
1216    fn test_random_polygon_has_hole() {
1217        let mut rng = StdRng::seed_from_u64(123);
1218        let mut options = RandomGeometryOptions::new();
1219
1220        options.polygon_hole_rate = 0.0;
1221        for _ in 0..100 {
1222            let geom = generate_random_polygon(&mut rng, &options).unwrap();
1223            assert_eq!(geom.interiors().len(), 0);
1224        }
1225
1226        options.polygon_hole_rate = 1.0;
1227        for _ in 0..100 {
1228            let geom = generate_random_polygon(&mut rng, &options).unwrap();
1229            assert!(!geom.interiors().is_empty());
1230        }
1231    }
1232
1233    #[test]
1234    fn test_random_multipoint_part_count() {
1235        let mut rng = StdRng::seed_from_u64(123);
1236        let mut options = RandomGeometryOptions::new();
1237
1238        options.num_parts_range = (3, 3);
1239        for _ in 0..100 {
1240            let geom = generate_random_multipoint(&mut rng, &options).unwrap();
1241            assert_eq!(geom.len(), 3);
1242        }
1243
1244        options.num_parts_range = (10, 10);
1245        for _ in 0..100 {
1246            let geom = generate_random_multipoint(&mut rng, &options).unwrap();
1247            assert_eq!(geom.len(), 10);
1248        }
1249    }
1250
1251    #[test]
1252    fn test_random_multilinestring_part_count() {
1253        let mut rng = StdRng::seed_from_u64(123);
1254        let mut options = RandomGeometryOptions::new();
1255
1256        options.num_parts_range = (3, 3);
1257        for _ in 0..100 {
1258            let geom = generate_random_multilinestring(&mut rng, &options).unwrap();
1259            assert_eq!(geom.num_line_strings(), 3);
1260        }
1261
1262        options.num_parts_range = (10, 10);
1263        for _ in 0..100 {
1264            let geom = generate_random_multilinestring(&mut rng, &options).unwrap();
1265            assert_eq!(geom.num_line_strings(), 10);
1266        }
1267    }
1268
1269    #[test]
1270    fn test_random_multipolygon_part_count() {
1271        let mut rng = StdRng::seed_from_u64(123);
1272        let mut options = RandomGeometryOptions::new();
1273
1274        options.num_parts_range = (3, 3);
1275        for _ in 0..100 {
1276            let geom = generate_random_multipolygon(&mut rng, &options).unwrap();
1277            assert_eq!(geom.num_polygons(), 3);
1278        }
1279
1280        options.num_parts_range = (10, 10);
1281        for _ in 0..100 {
1282            let geom = generate_random_multipolygon(&mut rng, &options).unwrap();
1283            assert_eq!(geom.num_polygons(), 10);
1284        }
1285    }
1286
1287    #[test]
1288    fn test_random_geometrycollection_part_count() {
1289        let mut rng = StdRng::seed_from_u64(123);
1290        let mut options = RandomGeometryOptions::new();
1291
1292        options.num_parts_range = (3, 3);
1293        for _ in 0..100 {
1294            let geom = generate_random_geometrycollection(&mut rng, &options).unwrap();
1295            assert_eq!(geom.len(), 3);
1296        }
1297
1298        options.num_parts_range = (10, 10);
1299        for _ in 0..100 {
1300            let geom = generate_random_geometrycollection(&mut rng, &options).unwrap();
1301            assert_eq!(geom.len(), 10);
1302        }
1303    }
1304
1305    #[rstest]
1306    fn test_random_geometry_type(
1307        #[values(
1308            GeometryTypeId::Point,
1309            GeometryTypeId::LineString,
1310            GeometryTypeId::Polygon,
1311            GeometryTypeId::MultiPoint,
1312            GeometryTypeId::MultiLineString,
1313            GeometryTypeId::MultiPolygon,
1314            GeometryTypeId::GeometryCollection
1315        )]
1316        geom_type: GeometryTypeId,
1317    ) {
1318        let mut rng = StdRng::seed_from_u64(123);
1319        let mut options = RandomGeometryOptions::new();
1320        options.geom_type = geom_type;
1321
1322        options.empty_rate = 0.0;
1323        for _ in 0..100 {
1324            let geom = generate_random_wkb(&mut rng, &options).unwrap();
1325            let wkb = wkb::reader::read_wkb(&geom).unwrap();
1326            let analysis = analyze_geometry(&wkb).unwrap();
1327            assert_eq!(analysis.geometry_type.geometry_type(), geom_type);
1328        }
1329    }
1330
1331    #[rstest]
1332    fn test_random_emptiness(
1333        #[values(
1334            GeometryTypeId::Point,
1335            GeometryTypeId::LineString,
1336            GeometryTypeId::Polygon,
1337            GeometryTypeId::MultiPoint,
1338            GeometryTypeId::MultiLineString,
1339            GeometryTypeId::MultiPolygon,
1340            GeometryTypeId::GeometryCollection
1341        )]
1342        geom_type: GeometryTypeId,
1343    ) {
1344        let mut rng = StdRng::seed_from_u64(123);
1345        let mut options = RandomGeometryOptions::new();
1346        options.geom_type = geom_type;
1347
1348        options.empty_rate = 0.0;
1349        for _ in 0..100 {
1350            let geom = generate_random_wkb(&mut rng, &options).unwrap();
1351            let bounds = wkb_bounds_xy(&geom).unwrap();
1352            assert!(!bounds.x().is_empty());
1353            assert!(!bounds.y().is_empty());
1354
1355            assert!(
1356                bounds.x().lo() >= options.bounds.min().x
1357                    && bounds.y().lo() >= options.bounds.min().y
1358                    && bounds.x().hi() <= options.bounds.max().x
1359                    && bounds.y().hi() <= options.bounds.max().y
1360            );
1361        }
1362
1363        options.empty_rate = 1.0;
1364        for _ in 0..100 {
1365            let geom = generate_random_wkb(&mut rng, &options).unwrap();
1366            let bounds = wkb_bounds_xy(&geom).unwrap();
1367            assert!(bounds.x().is_empty());
1368            assert!(bounds.y().is_empty());
1369        }
1370    }
1371
1372    #[test]
1373    fn test_random_partitioned_data_builder_validation() {
1374        // Test invalid null_rate (< 0.0)
1375        let err = RandomPartitionedDataBuilder::new()
1376            .null_rate(-0.1)
1377            .validate()
1378            .unwrap_err();
1379        assert_eq!(
1380            err.to_string(),
1381            "Error during planning: Expected null_rate between 0.0 and 1.0 but got -0.1"
1382        );
1383
1384        // Test invalid null_rate (> 1.0)
1385        let err = RandomPartitionedDataBuilder::new()
1386            .null_rate(1.5)
1387            .validate()
1388            .unwrap_err();
1389        assert_eq!(
1390            err.to_string(),
1391            "Error during planning: Expected null_rate between 0.0 and 1.0 but got 1.5"
1392        );
1393
1394        // Test invalid rows_per_batch (0)
1395        let err = RandomPartitionedDataBuilder::new()
1396            .rows_per_batch(0)
1397            .validate()
1398            .unwrap_err();
1399        assert_eq!(
1400            err.to_string(),
1401            "Error during planning: Expected rows_per_batch > 0 but got 0"
1402        );
1403
1404        // Test invalid num_partitions (0)
1405        let err = RandomPartitionedDataBuilder::new()
1406            .num_partitions(0)
1407            .validate()
1408            .unwrap_err();
1409        assert_eq!(
1410            err.to_string(),
1411            "Error during planning: Expected num_partitions > 0 but got 0"
1412        );
1413
1414        // Test invalid empty_rate (< 0.0)
1415        let err = RandomPartitionedDataBuilder::new()
1416            .empty_rate(-0.1)
1417            .validate()
1418            .unwrap_err();
1419        assert_eq!(
1420            err.to_string(),
1421            "Error during planning: Expected empty_rate between 0.0 and 1.0 but got -0.1"
1422        );
1423
1424        // Test invalid empty_rate (> 1.0)
1425        let err = RandomPartitionedDataBuilder::new()
1426            .empty_rate(1.5)
1427            .validate()
1428            .unwrap_err();
1429        assert_eq!(
1430            err.to_string(),
1431            "Error during planning: Expected empty_rate between 0.0 and 1.0 but got 1.5"
1432        );
1433
1434        // Test invalid polygon_hole_rate (< 0.0)
1435        let err = RandomPartitionedDataBuilder::new()
1436            .polygon_hole_rate(-0.1)
1437            .validate()
1438            .unwrap_err();
1439        assert_eq!(
1440            err.to_string(),
1441            "Error during planning: Expected polygon_hole_rate between 0.0 and 1.0 but got -0.1"
1442        );
1443
1444        // Test invalid polygon_hole_rate (> 1.0)
1445        let err = RandomPartitionedDataBuilder::new()
1446            .polygon_hole_rate(1.5)
1447            .validate()
1448            .unwrap_err();
1449        assert_eq!(
1450            err.to_string(),
1451            "Error during planning: Expected polygon_hole_rate between 0.0 and 1.0 but got 1.5"
1452        );
1453
1454        // Test invalid size_range (min <= 0)
1455        let err = RandomPartitionedDataBuilder::new()
1456            .size_range((0.0, 10.0))
1457            .validate()
1458            .unwrap_err();
1459        assert_eq!(
1460            err.to_string(),
1461            "Error during planning: Expected valid size_range but got (0.0, 10.0)"
1462        );
1463
1464        // Test invalid size_range (max <= 0)
1465        let err = RandomPartitionedDataBuilder::new()
1466            .size_range((5.0, -1.0))
1467            .validate()
1468            .unwrap_err();
1469        assert_eq!(
1470            err.to_string(),
1471            "Error during planning: Expected valid size_range but got (5.0, -1.0)"
1472        );
1473
1474        // Test invalid size_range (min > max)
1475        let err = RandomPartitionedDataBuilder::new()
1476            .size_range((10.0, 5.0))
1477            .validate()
1478            .unwrap_err();
1479        assert_eq!(
1480            err.to_string(),
1481            "Error during planning: Expected valid size_range but got (10.0, 5.0)"
1482        );
1483
1484        // Test invalid vertices_per_linestring_range (min == 0)
1485        let err = RandomPartitionedDataBuilder::new()
1486            .vertices_per_linestring_range((0, 5))
1487            .validate()
1488            .unwrap_err();
1489        assert_eq!(
1490            err.to_string(),
1491            "Error during planning: Expected valid vertices_per_linestring_range but got (0, 5)"
1492        );
1493
1494        // Test invalid vertices_per_linestring_range (min > max)
1495        let err = RandomPartitionedDataBuilder::new()
1496            .vertices_per_linestring_range((10, 5))
1497            .validate()
1498            .unwrap_err();
1499        assert_eq!(
1500            err.to_string(),
1501            "Error during planning: Expected valid vertices_per_linestring_range but got (10, 5)"
1502        );
1503
1504        // Test invalid num_parts_range (min == 0)
1505        let err = RandomPartitionedDataBuilder::new()
1506            .num_parts_range((0, 5))
1507            .validate()
1508            .unwrap_err();
1509        assert_eq!(
1510            err.to_string(),
1511            "Error during planning: Expected valid num_parts_range but got (0, 5)"
1512        );
1513
1514        // Test invalid num_parts_range (min > max)
1515        let err = RandomPartitionedDataBuilder::new()
1516            .num_parts_range((10, 5))
1517            .validate()
1518            .unwrap_err();
1519        assert_eq!(
1520            err.to_string(),
1521            "Error during planning: Expected valid num_parts_range but got (10, 5)"
1522        );
1523
1524        // Test invalid bounds (zero width)
1525        let err = RandomPartitionedDataBuilder::new()
1526            .bounds(Rect::new(
1527                Coord { x: 10.0, y: 10.0 },
1528                Coord { x: 10.0, y: 20.0 },
1529            ))
1530            .validate()
1531            .unwrap_err();
1532        assert_eq!(
1533            err.to_string(),
1534            "Error during planning: Expected valid bounds but got RECT(10.0 10.0,10.0 20.0)"
1535        );
1536
1537        // Test invalid bounds (zero height)
1538        let err = RandomPartitionedDataBuilder::new()
1539            .bounds(Rect::new(
1540                Coord { x: 10.0, y: 10.0 },
1541                Coord { x: 20.0, y: 10.0 },
1542            ))
1543            .validate()
1544            .unwrap_err();
1545        assert_eq!(
1546            err.to_string(),
1547            "Error during planning: Expected valid bounds but got RECT(10.0 10.0,20.0 10.0)"
1548        );
1549    }
1550}