sedona_testing/
datagen.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17//! Tools for generating random geometries
18//!
19//! The current options provided here were built to support basic correctness testing and
20//! benchmarking algorithmic complexity of a large number of scalar functions, many of which
21//! have various performance or correctness issues that arise from null features, empty features,
22//! polygons with holes, or collections with various numbers of sub-geometries.
23//! See <https://github.com/apache/sedona/pull/1680> for the Sedona/Java implementation of spider,
24//! which implements a number of other strategies for generating various geometry types.
25
26use arrow_array::{ArrayRef, RecordBatch, RecordBatchReader};
27use arrow_array::{BinaryArray, BinaryViewArray};
28use arrow_array::{Float64Array, Int32Array};
29use arrow_schema::{ArrowError, DataType, Field, Schema, SchemaRef};
30use datafusion_common::Result;
31use geo_types::{
32    Coord, Geometry, GeometryCollection, LineString, MultiLineString, MultiPoint, MultiPolygon,
33    Point, Polygon, Rect,
34};
35use rand::distributions::Uniform;
36use rand::rngs::StdRng;
37use rand::{Rng, SeedableRng};
38use sedona_common::sedona_internal_err;
39use sedona_geometry::types::GeometryTypeId;
40use sedona_schema::datatypes::{SedonaType, WKB_GEOMETRY};
41use std::f64::consts::PI;
42use std::sync::Arc;
43use wkb::writer::WriteOptions;
44use wkb::Endianness;
45
46/// Builder for generating test data partitions with random geometries.
47///
48/// This builder allows you to create deterministic test datasets with configurable
49/// geometry types, data distribution, and partitioning for testing spatial operations.
50///
51/// The generated data includes:
52///
53/// - `id`: Unique integer identifier for each row
54/// - `dist`: Random floating-point distance value (0.0 to 100.0)
55/// - `geometry`: Random geometry data in the specified format (WKB or WKB View)
56///
57/// The strategy for generating geometries and their options are not stable and may change
58/// as the needs of testing and benchmarking evolve or better strategies are discovered.
59/// The strategy for generating random geometries is as follows:
60///
61/// - Points are uniformly distributed over the [Self::bounds] indicated
62/// - Linestrings are generated by calculating the points in a circle of a randomly
63///   chosen size (according to [Self::size_range]) with vertex count sampled using
64///   [Self::vertices_per_linestring_range]. The start and end point of generated
65///   linestrings are never connected.
66/// - Polygons are generated using a closed version of the linestring generated.
67///   They may or may not have a hole according to [Self::polygon_hole_rate].
68/// - MultiPoint, MultiLinestring, and MultiPolygon geometries are constructed
69///   with the number of parts sampled according to [Self::num_parts_range].
70///   The size of the entire feature is constrained to [Self::size_range],
71///   and this space is subdivided to obtain the exact number of spaces needed.
72///   Child features are generated using the global options except with sizes
73///   sampled to approach the space given to them.
74///
75/// # Example
76///
77/// ```rust
78/// use sedona_testing::datagen::RandomPartitionedDataBuilder;
79/// use sedona_geometry::types::GeometryTypeId;
80/// use geo_types::{Coord, Rect};
81///
82/// let (schema, partitions) = RandomPartitionedDataBuilder::new()
83///     .seed(42)
84///     .num_partitions(4)
85///     .rows_per_batch(1000)
86///     .geometry_type(GeometryTypeId::Polygon)
87///     .bounds(Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 }))
88///     .build()
89///     .unwrap();
90/// ```
91#[derive(Debug, Clone)]
92pub struct RandomPartitionedDataBuilder {
93    pub seed: u64,
94    pub num_partitions: usize,
95    pub batches_per_partition: usize,
96    pub rows_per_batch: usize,
97    sedona_type: SedonaType,
98    null_rate: f64,
99    options: RandomGeometryOptions,
100}
101
102impl Default for RandomPartitionedDataBuilder {
103    fn default() -> Self {
104        let options = RandomGeometryOptions::new();
105
106        Self {
107            seed: 42,
108            num_partitions: 1,
109            batches_per_partition: 1,
110            rows_per_batch: 10,
111            sedona_type: WKB_GEOMETRY,
112            null_rate: 0.0,
113            options,
114        }
115    }
116}
117
118impl RandomPartitionedDataBuilder {
119    /// Creates a new `RandomPartitionedDataBuilder` with default values.
120    ///
121    /// Default configuration:
122    ///
123    /// - seed: 42 (for deterministic results)
124    /// - num_partitions: 1
125    /// - batches_per_partition: 1
126    /// - rows_per_batch: 10
127    /// - geometry_type: Point
128    /// - bounds: (0,0) to (100,100)
129    /// - size_range: 1.0 to 10.0
130    /// - null_rate: 0.0 (no nulls)
131    /// - empty_rate: 0.0 (no empties)
132    /// - vertices_per_linestring_range
133    /// - num_parts_range: 1 to 3
134    /// - polygon_hole_rate: 0.0 (no polygons with holes)
135    pub fn new() -> Self {
136        Self::default()
137    }
138
139    /// Sets the random seed for deterministic data generation.
140    ///
141    /// Using the same seed will produce identical datasets, which is useful
142    /// for reproducible tests.
143    ///
144    /// # Arguments
145    ///
146    /// * `seed` - The random seed value
147    pub fn seed(mut self, seed: u64) -> Self {
148        self.seed = seed;
149        self
150    }
151
152    /// Sets the number of data partitions to generate.
153    ///
154    /// Each partition contains multiple batches of data. This is useful for
155    /// testing distributed processing scenarios.
156    ///
157    /// # Arguments
158    ///
159    /// * `num_partitions` - Number of partitions to create
160    pub fn num_partitions(mut self, num_partitions: usize) -> Self {
161        self.num_partitions = num_partitions;
162        self
163    }
164
165    /// Sets the number of batches per partition.
166    ///
167    /// Each batch is a `RecordBatch` containing the specified number of rows.
168    ///
169    /// # Arguments
170    ///
171    /// * `batches_per_partition` - Number of batches in each partition
172    pub fn batches_per_partition(mut self, batches_per_partition: usize) -> Self {
173        self.batches_per_partition = batches_per_partition;
174        self
175    }
176
177    /// Sets the number of rows per batch.
178    ///
179    /// This determines the size of each `RecordBatch` that will be generated.
180    ///
181    /// # Arguments
182    ///
183    /// * `rows_per_batch` - Number of rows in each batch
184    pub fn rows_per_batch(mut self, rows_per_batch: usize) -> Self {
185        self.rows_per_batch = rows_per_batch;
186        self
187    }
188
189    /// Sets the type of geometry to generate.
190    ///
191    /// Currently supports:
192    /// - `GeometryTypeId::Point`: Random points within the specified bounds
193    /// - `GeometryTypeId::Polygon`: Random diamond-shaped polygons
194    /// - Other types default to point generation
195    ///
196    /// # Arguments
197    ///
198    /// * `geom_type` - The geometry type to generate
199    pub fn geometry_type(mut self, geom_type: GeometryTypeId) -> Self {
200        self.options.geom_type = geom_type;
201        self
202    }
203
204    /// Sets the Sedona data type for the geometry column.
205    ///
206    /// This determines how the geometry data is stored (e.g., WKB or WKB View).
207    ///
208    /// # Arguments
209    ///
210    /// * `sedona_type` - The Sedona type for geometry storage
211    pub fn sedona_type(mut self, sedona_type: SedonaType) -> Self {
212        self.sedona_type = sedona_type;
213        self
214    }
215
216    /// Sets the spatial bounds for geometry generation.
217    ///
218    /// All generated geometries will be positioned within these bounds.
219    /// For polygons, the bounds are used to ensure the entire polygon fits within the area.
220    ///
221    /// # Arguments
222    ///
223    /// * `bounds` - Rectangle defining the spatial bounds (min_x, min_y, max_x, max_y)
224    pub fn bounds(mut self, bounds: Rect) -> Self {
225        self.options.bounds = bounds;
226        self
227    }
228
229    /// Sets the size range for generated geometries.
230    ///
231    /// For polygons, this controls the radius of the generated shapes.
232    /// For points, this parameter is not used.
233    ///
234    /// # Arguments
235    ///
236    /// * `size_range` - Tuple of (min_size, max_size) for geometry dimensions
237    pub fn size_range(mut self, size_range: (f64, f64)) -> Self {
238        self.options.size_range = size_range;
239        self
240    }
241
242    /// Sets the rate of null values in the geometry column.
243    ///
244    /// # Arguments
245    ///
246    /// * `null_rate` - Fraction of rows that should have null geometry (0.0 to 1.0)
247    pub fn null_rate(mut self, null_rate: f64) -> Self {
248        self.null_rate = null_rate;
249        self
250    }
251
252    /// Sets the rate of EMPTY geometries in the geometry column.
253    ///
254    /// # Arguments
255    ///
256    /// * `empty_rate` - Fraction of rows that should have empty geometry (0.0 to 1.0)
257    pub fn empty_rate(mut self, empty_rate: f64) -> Self {
258        self.options.empty_rate = empty_rate;
259        self
260    }
261
262    /// Sets the vertex count range
263    ///
264    /// # Arguments
265    ///
266    /// * `vertices_per_linestring_range` - The minimum and maximum (inclusive) number of vertices
267    ///   in linestring output. This also affects polygon output, although the actual number
268    ///   of vertices in the polygon ring will be one more than the range indicated here to
269    ///   close the polygon.
270    pub fn vertices_per_linestring_range(
271        mut self,
272        vertices_per_linestring_range: (usize, usize),
273    ) -> Self {
274        self.options.vertices_per_linestring_range = vertices_per_linestring_range;
275        self
276    }
277
278    /// Sets the number of parts range
279    ///
280    /// # Arguments
281    ///
282    /// * `num_parts_range` - The minimum and maximum (inclusive) number of parts
283    ///   in multi geometry and/or collection output.
284    pub fn num_parts_range(mut self, num_parts_range: (usize, usize)) -> Self {
285        self.options.num_parts_range = num_parts_range;
286        self
287    }
288
289    /// Sets the polygon hole rate
290    ///
291    /// # Arguments
292    ///
293    /// * `polygon_hole_rate` - Fraction of polygons that should have an interior
294    ///   ring. Currently only a single interior ring is possible.
295    pub fn polygon_hole_rate(mut self, polygon_hole_rate: f64) -> Self {
296        self.options.polygon_hole_rate = polygon_hole_rate;
297        self
298    }
299
300    /// The [SchemaRef] generated by this builder
301    ///
302    /// The resulting schema contains three columns:
303    ///
304    /// - `id`: Int32 - Unique sequential identifier for each row
305    /// - `dist`: Float64 - Random distance value between 0.0 and 100.0
306    /// - `geometry`: SedonaType - Random geometry data (WKB or WKB View format)
307    pub fn schema(&self) -> SchemaRef {
308        // Create schema
309        Arc::new(Schema::new(vec![
310            Field::new("id", DataType::Int32, false),
311            Field::new("dist", DataType::Float64, false),
312            self.sedona_type.to_storage_field("geometry", true).unwrap(),
313        ]))
314    }
315
316    /// Builds the random partitioned dataset with the configured parameters.
317    ///
318    /// Generates a deterministic dataset based on the seed and configuration.
319    /// The resulting schema contains three columns:
320    /// - `id`: Int32 - Unique sequential identifier for each row
321    /// - `dist`: Float64 - Random distance value between 0.0 and 100.0
322    /// - `geometry`: SedonaType - Random geometry data (WKB or WKB View format)
323    ///
324    /// # Returns
325    ///
326    /// A tuple containing:
327    /// - `SchemaRef`: Arrow schema for the generated data
328    /// - `Vec<Vec<RecordBatch>>`: Vector of partitions, each containing a vector of record batches
329    ///
330    /// # Errors
331    ///
332    /// Returns a `datafusion_common::Result` error if:
333    /// - RecordBatch creation fails
334    /// - Array conversion fails
335    /// - Schema creation fails
336    pub fn build(&self) -> Result<(SchemaRef, Vec<Vec<RecordBatch>>)> {
337        // Create a seeded random number generator for deterministic results
338        let schema = self.schema();
339        let mut result = Vec::with_capacity(self.num_partitions);
340
341        for partition_idx in 0..self.num_partitions {
342            let rng = Self::default_rng(self.seed + partition_idx as u64);
343            let partition_batches = self
344                .partition_reader(rng, partition_idx)
345                .collect::<Result<Vec<_>, ArrowError>>()?;
346            result.push(partition_batches);
347        }
348
349        Ok((schema, result))
350    }
351
352    /// Generate a [Rng] based on a seed
353    ///
354    /// Callers can also supply their own [Rng].
355    pub fn default_rng(seed: u64) -> impl Rng {
356        StdRng::seed_from_u64(seed)
357    }
358
359    /// Create a [RecordBatchReader] that reads a single partition
360    pub fn partition_reader<R: Rng + Send + 'static>(
361        &self,
362        rng: R,
363        partition_idx: usize,
364    ) -> Box<dyn RecordBatchReader + Send> {
365        let reader = RandomPartitionedDataReader {
366            builder: self.clone(),
367            schema: self.schema(),
368            partition_idx,
369            batch_idx: 0,
370            rng,
371        };
372
373        Box::new(reader)
374    }
375
376    /// Generate a single batch
377    fn generate_batch<R: Rng>(
378        &self,
379        rng: &mut R,
380        schema: &SchemaRef,
381        partition_idx: usize,
382        batch_idx: usize,
383    ) -> Result<RecordBatch> {
384        // Generate IDs - make them unique across partitions and batches
385        let id_start =
386            (partition_idx * self.batches_per_partition + batch_idx) * self.rows_per_batch;
387        let ids: Vec<i32> = (0..self.rows_per_batch)
388            .map(|i| (id_start + i) as i32)
389            .collect();
390
391        // Generate random distances between 0.0 and 100.0
392        let distance_dist = Uniform::new(0.0, 100.0);
393        let distances: Vec<f64> = (0..self.rows_per_batch)
394            .map(|_| rng.sample(distance_dist))
395            .collect();
396
397        // Generate random geometries based on the geometry type
398        let wkb_geometries: Vec<Option<Vec<u8>>> = (0..self.rows_per_batch)
399            .map(|_| {
400                if rng.sample(Uniform::new(0.0, 1.0)) < self.null_rate {
401                    None
402                } else {
403                    Some(generate_random_wkb(rng, &self.options))
404                }
405            })
406            .collect();
407
408        // Create Arrow arrays
409        let id_array = Arc::new(Int32Array::from(ids));
410        let dist_array = Arc::new(Float64Array::from(distances));
411        let geometry_array = create_wkb_array(wkb_geometries, &self.sedona_type)?;
412
413        // Create RecordBatch
414        Ok(RecordBatch::try_new(
415            schema.clone(),
416            vec![id_array, dist_array, geometry_array],
417        )?)
418    }
419}
420
421/// Create an ArrayRef from a vector of WKB bytes based on the sedona type
422fn create_wkb_array(
423    wkb_values: Vec<Option<Vec<u8>>>,
424    sedona_type: &SedonaType,
425) -> Result<ArrayRef> {
426    match sedona_type {
427        SedonaType::Wkb(_, _) => Ok(Arc::new(BinaryArray::from_iter(wkb_values))),
428        SedonaType::WkbView(_, _) => Ok(Arc::new(BinaryViewArray::from_iter(wkb_values))),
429        _ => sedona_internal_err!("create_wkb_array not implemented for {sedona_type:?}"),
430    }
431}
432
433struct RandomPartitionedDataReader<R> {
434    builder: RandomPartitionedDataBuilder,
435    schema: SchemaRef,
436    partition_idx: usize,
437    batch_idx: usize,
438    rng: R,
439}
440
441impl<R: Rng> RecordBatchReader for RandomPartitionedDataReader<R> {
442    fn schema(&self) -> SchemaRef {
443        self.builder.schema()
444    }
445}
446
447impl<R: Rng> Iterator for RandomPartitionedDataReader<R> {
448    type Item = std::result::Result<RecordBatch, ArrowError>;
449
450    fn next(&mut self) -> Option<Self::Item> {
451        if self.batch_idx == self.builder.batches_per_partition {
452            return None;
453        }
454
455        let maybe_batch = self
456            .builder
457            .generate_batch(
458                &mut self.rng,
459                &self.schema,
460                self.partition_idx,
461                self.batch_idx,
462            )
463            .map_err(|e| ArrowError::ExternalError(Box::new(e)));
464        self.batch_idx += 1;
465        Some(maybe_batch)
466    }
467}
468
469/// Options for the current strategy influencing individual geometry constructors
470#[derive(Debug, Clone)]
471struct RandomGeometryOptions {
472    geom_type: GeometryTypeId,
473    bounds: Rect,
474    size_range: (f64, f64),
475    vertices_per_linestring_range: (usize, usize),
476    empty_rate: f64,
477    polygon_hole_rate: f64,
478    num_parts_range: (usize, usize),
479}
480
481impl RandomGeometryOptions {
482    fn new() -> Self {
483        Self {
484            geom_type: GeometryTypeId::Point,
485            empty_rate: 0.0,
486            bounds: Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 }),
487            size_range: (1.0, 10.0),
488            vertices_per_linestring_range: (4, 4),
489            polygon_hole_rate: 0.0,
490            num_parts_range: (1, 3),
491        }
492    }
493}
494
495impl Default for RandomGeometryOptions {
496    fn default() -> Self {
497        Self::new()
498    }
499}
500
501/// Generate random geometry WKB bytes based on the geometry type
502fn generate_random_wkb<R: rand::Rng>(rng: &mut R, options: &RandomGeometryOptions) -> Vec<u8> {
503    let geometry = generate_random_geometry(rng, options);
504
505    // Convert geometry to WKB
506    let mut out: Vec<u8> = vec![];
507    wkb::writer::write_geometry(
508        &mut out,
509        &geometry,
510        &WriteOptions {
511            endianness: Endianness::LittleEndian,
512        },
513    )
514    .unwrap();
515    out
516}
517
518fn generate_random_geometry<R: rand::Rng>(
519    rng: &mut R,
520    options: &RandomGeometryOptions,
521) -> Geometry {
522    match options.geom_type {
523        GeometryTypeId::Point => Geometry::Point(generate_random_point(rng, options)),
524        GeometryTypeId::LineString => {
525            Geometry::LineString(generate_random_linestring(rng, options))
526        }
527        GeometryTypeId::Polygon => Geometry::Polygon(generate_random_polygon(rng, options)),
528        GeometryTypeId::MultiPoint => {
529            Geometry::MultiPoint(generate_random_multipoint(rng, options))
530        }
531        GeometryTypeId::MultiLineString => {
532            Geometry::MultiLineString(generate_random_multilinestring(rng, options))
533        }
534        GeometryTypeId::MultiPolygon => {
535            Geometry::MultiPolygon(generate_random_multipolygon(rng, options))
536        }
537        GeometryTypeId::GeometryCollection => {
538            Geometry::GeometryCollection(generate_random_geometrycollection(rng, options))
539        }
540        GeometryTypeId::Geometry => {
541            let mut copy_options = options.clone();
542            copy_options.geom_type = pick_random_geometry_type(rng);
543            generate_random_geometry(rng, &copy_options)
544        }
545    }
546}
547
548fn generate_random_point<R: rand::Rng>(rng: &mut R, options: &RandomGeometryOptions) -> Point {
549    if rng.gen_bool(options.empty_rate) {
550        // This is a bit of a hack because geo-types doesn't support empty point; however,
551        // this does work with respect to sending this directly to the WKB reader and getting
552        // the WKB result we want
553        Point::new(f64::NAN, f64::NAN)
554    } else {
555        // Generate random points within the specified bounds
556        let x_dist = Uniform::new(options.bounds.min().x, options.bounds.max().x);
557        let y_dist = Uniform::new(options.bounds.min().y, options.bounds.max().y);
558        let x = rng.sample(x_dist);
559        let y = rng.sample(y_dist);
560        Point::new(x, y)
561    }
562}
563
564fn generate_random_linestring<R: rand::Rng>(
565    rng: &mut R,
566    options: &RandomGeometryOptions,
567) -> LineString {
568    if rng.gen_bool(options.empty_rate) {
569        LineString::new(vec![])
570    } else {
571        let (center_x, center_y, half_size) = generate_random_circle(rng, options);
572        let vertices_dist = Uniform::new_inclusive(
573            options.vertices_per_linestring_range.0,
574            options.vertices_per_linestring_range.1,
575        );
576        // Always sample in such a way that we end up with a valid linestring
577        let num_vertices = rng.sample(vertices_dist).max(2);
578        let coords =
579            generate_circular_vertices(rng, center_x, center_y, half_size, num_vertices, false);
580        LineString::from(coords)
581    }
582}
583
584fn generate_random_polygon<R: rand::Rng>(rng: &mut R, options: &RandomGeometryOptions) -> Polygon {
585    if rng.gen_bool(options.empty_rate) {
586        Polygon::new(LineString::new(vec![]), vec![])
587    } else {
588        let (center_x, center_y, half_size) = generate_random_circle(rng, options);
589        let vertices_dist = Uniform::new_inclusive(
590            options.vertices_per_linestring_range.0,
591            options.vertices_per_linestring_range.1,
592        );
593        // Always sample in such a way that we end up with a valid Polygon
594        let num_vertices = rng.sample(vertices_dist).max(3);
595        let coords =
596            generate_circular_vertices(rng, center_x, center_y, half_size, num_vertices, true);
597        let shell = LineString::from(coords);
598        let mut holes = Vec::new();
599
600        // Potentially add a hole based on probability
601        let add_hole = rng.gen_bool(options.polygon_hole_rate);
602        let hole_scale_factor_dist = Uniform::new(0.1, 0.5);
603        let hole_scale_factor = rng.sample(hole_scale_factor_dist);
604        if add_hole {
605            let new_size = half_size * hole_scale_factor;
606            let mut coords =
607                generate_circular_vertices(rng, center_x, center_y, new_size, num_vertices, true);
608            coords.reverse();
609            holes.push(LineString::from(coords));
610        }
611
612        Polygon::new(shell, holes)
613    }
614}
615
616fn generate_random_multipoint<R: rand::Rng>(
617    rng: &mut R,
618    options: &RandomGeometryOptions,
619) -> MultiPoint {
620    if rng.gen_bool(options.empty_rate) {
621        MultiPoint::new(vec![])
622    } else {
623        let children = generate_random_children(rng, options, generate_random_point);
624        MultiPoint::new(children)
625    }
626}
627
628fn generate_random_multilinestring<R: rand::Rng>(
629    rng: &mut R,
630    options: &RandomGeometryOptions,
631) -> MultiLineString {
632    if rng.gen_bool(options.empty_rate) {
633        MultiLineString::new(vec![])
634    } else {
635        let children = generate_random_children(rng, options, generate_random_linestring);
636        MultiLineString::new(children)
637    }
638}
639
640fn generate_random_multipolygon<R: rand::Rng>(
641    rng: &mut R,
642    options: &RandomGeometryOptions,
643) -> MultiPolygon {
644    if rng.gen_bool(options.empty_rate) {
645        MultiPolygon::new(vec![])
646    } else {
647        let children = generate_random_children(rng, options, generate_random_polygon);
648        MultiPolygon::new(children)
649    }
650}
651
652fn generate_random_geometrycollection<R: rand::Rng>(
653    rng: &mut R,
654    options: &RandomGeometryOptions,
655) -> GeometryCollection {
656    if rng.gen_bool(options.empty_rate) {
657        GeometryCollection::new_from(vec![])
658    } else {
659        let children = generate_random_children(rng, options, generate_random_geometry);
660        GeometryCollection::new_from(children)
661    }
662}
663
664fn generate_random_children<R: Rng, T, F: Fn(&mut R, &RandomGeometryOptions) -> T>(
665    rng: &mut R,
666    options: &RandomGeometryOptions,
667    func: F,
668) -> Vec<T> {
669    let num_parts_dist =
670        Uniform::new_inclusive(options.num_parts_range.0, options.num_parts_range.1);
671    let num_parts = rng.sample(num_parts_dist);
672
673    // Constrain this feature to the size range indicated in the option
674    let (center_x, center_y, half_width) = generate_random_circle(rng, options);
675    let feature_bounds = Rect::new(
676        Coord {
677            x: center_x - half_width,
678            y: center_y - half_width,
679        },
680        Coord {
681            x: center_x + half_width,
682            y: center_y + half_width,
683        },
684    );
685
686    let child_bounds = generate_non_overlapping_sub_rectangles(num_parts, &feature_bounds);
687    let mut child_options = options.clone();
688    child_options.empty_rate = 0.0;
689
690    let mut children = Vec::new();
691    for bounds in child_bounds {
692        child_options.bounds = bounds;
693        let child_size = bounds.height().min(bounds.width());
694        child_options.size_range = (child_size * 0.9, child_size);
695
696        // If GeometryCollection, pick a random geometry type
697        // Don't support nested GeometryCollection for now to avoid too much recursion
698        if options.geom_type == GeometryTypeId::GeometryCollection {
699            child_options.geom_type = pick_random_geometry_type(rng);
700        }
701        children.push(func(rng, &child_options));
702    }
703
704    children
705}
706
707fn pick_random_geometry_type<R: Rng>(rng: &mut R) -> GeometryTypeId {
708    [
709        GeometryTypeId::Point,
710        GeometryTypeId::LineString,
711        GeometryTypeId::Polygon,
712        GeometryTypeId::MultiPoint,
713        GeometryTypeId::MultiLineString,
714        GeometryTypeId::MultiPolygon,
715    ][rng.gen_range(0..6)]
716}
717
718fn generate_random_circle<R: rand::Rng>(
719    rng: &mut R,
720    options: &RandomGeometryOptions,
721) -> (f64, f64, f64) {
722    // Generate random diamond polygons (rotated squares)
723    let size_dist = Uniform::new(options.size_range.0, options.size_range.1);
724    let half_size = rng.sample(size_dist) / 2.0;
725
726    // Ensure diamond fits within bounds by constraining center position
727    let center_x_dist = Uniform::new(
728        options.bounds.min().x + half_size,
729        options.bounds.max().x - half_size,
730    );
731    let center_y_dist = Uniform::new(
732        options.bounds.min().y + half_size,
733        options.bounds.max().y - half_size,
734    );
735    let center_x = rng.sample(center_x_dist);
736    let center_y = rng.sample(center_y_dist);
737
738    (center_x, center_y, half_size)
739}
740
741fn generate_non_overlapping_sub_rectangles(num_parts: usize, bounds: &Rect) -> Vec<Rect> {
742    let mut tiles = vec![*bounds];
743    let mut n = 0;
744    while tiles.len() < num_parts {
745        // Find the largest rectangle
746        let (largest_idx, _) = tiles
747            .iter()
748            .enumerate()
749            .map(|(i, rect)| (i, rect.height() * rect.width()))
750            .max_by(|(_, a1), (_, a2)| a1.partial_cmp(a2).unwrap())
751            .unwrap_or((0, 0.0));
752
753        // Mix up subdividing by x and y
754        let new_rects = if (n % 2) == 0 {
755            tiles[largest_idx].split_x()
756        } else {
757            tiles[largest_idx].split_y()
758        };
759
760        // Remove the largest rectangle and add its subdivisions
761        tiles.remove(largest_idx);
762        tiles.insert(largest_idx, new_rects[0]);
763        tiles.insert(largest_idx, new_rects[1]);
764        n += 1;
765    }
766
767    tiles
768}
769
770fn generate_circular_vertices<R: rand::Rng>(
771    rng: &mut R,
772    center_x: f64,
773    center_y: f64,
774    radius: f64,
775    num_vertices: usize,
776    closed: bool,
777) -> Vec<Coord> {
778    let mut out = Vec::new();
779
780    // Randomize starting angle (0 to 2 * PI)
781    let start_angle_dist = Uniform::new(0.0, 2.0 * PI);
782    let mut angle: f64 = rng.sample(start_angle_dist);
783
784    let dangle = 2.0 * PI / (num_vertices as f64).max(3.0);
785    for _ in 0..num_vertices {
786        out.push(Coord {
787            x: angle.cos() * radius + center_x,
788            y: angle.sin() * radius + center_y,
789        });
790        angle += dangle;
791    }
792
793    if closed {
794        out.push(out[0]);
795    }
796
797    out
798}
799
800#[cfg(test)]
801mod tests {
802    use super::*;
803    use arrow_schema::DataType;
804    use geo_traits::{MultiLineStringTrait, MultiPolygonTrait};
805    use geo_types::Coord;
806    use rand::rngs::StdRng;
807    use rand::SeedableRng;
808    use rstest::rstest;
809    use sedona_geometry::{
810        analyze::analyze_geometry, bounds::wkb_bounds_xy, interval::IntervalTrait,
811    };
812
813    #[test]
814    fn test_generate_random_geometry_produces_valid_wkb() {
815        let bounds = Rect::new(Coord { x: 10.0, y: 10.0 }, Coord { x: 90.0, y: 90.0 });
816        let size_range = (1.0, 10.0);
817
818        // Test both Point and Polygon geometry types
819        let test_cases = vec![
820            (GeometryTypeId::Point, 42, 100, 20, 50), // (type, seed, iterations, min_size, max_size)
821            (GeometryTypeId::Polygon, 123, 50, 80, 200),
822        ];
823
824        for (geom_type, seed, iterations, min_size, max_size) in test_cases {
825            let mut rng = StdRng::seed_from_u64(seed);
826            let options = RandomGeometryOptions {
827                geom_type,
828                bounds,
829                size_range,
830                ..Default::default()
831            };
832
833            for _ in 0..iterations {
834                let wkb_bytes = generate_random_wkb(&mut rng, &options);
835
836                // Verify WKB is not empty and has reasonable size
837                assert!(!wkb_bytes.is_empty());
838                assert!(
839                    wkb_bytes.len() >= min_size,
840                    "WKB size {} is smaller than expected minimum {} for {:?}",
841                    wkb_bytes.len(),
842                    min_size,
843                    geom_type
844                );
845                assert!(
846                    wkb_bytes.len() <= max_size,
847                    "WKB size {} is larger than expected maximum {} for {:?}",
848                    wkb_bytes.len(),
849                    max_size,
850                    geom_type
851                );
852
853                // Verify WKB can be parsed without error
854                wkb::reader::read_wkb(&wkb_bytes).unwrap();
855            }
856        }
857    }
858
859    #[test]
860    fn test_generate_random_geometry_deterministic() {
861        let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
862        let size_range = (1.0, 10.0);
863
864        let geom_types = [GeometryTypeId::Point, GeometryTypeId::Polygon];
865
866        // Generate with same seed twice
867        let mut rng1 = StdRng::seed_from_u64(42);
868        let mut rng2 = StdRng::seed_from_u64(42);
869
870        for geom_type in geom_types {
871            let options = RandomGeometryOptions {
872                geom_type,
873                bounds,
874                size_range,
875                ..Default::default()
876            };
877            let wkb1 = generate_random_wkb(&mut rng1, &options);
878            let wkb2 = generate_random_wkb(&mut rng2, &options);
879
880            // Should generate identical results
881            assert_eq!(wkb1, wkb2);
882        }
883    }
884
885    #[test]
886    fn test_random_partitioned_data_builder_build_basic() {
887        let (schema, partitions) = RandomPartitionedDataBuilder::new()
888            .num_partitions(2)
889            .batches_per_partition(3)
890            .rows_per_batch(4)
891            .null_rate(0.0) // No nulls for easier testing
892            .build()
893            .unwrap();
894
895        // Verify schema
896        assert_eq!(schema.fields().len(), 3);
897        assert_eq!(schema.field(0).name(), "id");
898        assert_eq!(schema.field(0).data_type(), &DataType::Int32);
899        assert_eq!(schema.field(1).name(), "dist");
900        assert_eq!(schema.field(1).data_type(), &DataType::Float64);
901        assert_eq!(schema.field(2).name(), "geometry");
902
903        // Verify partitions structure
904        assert_eq!(partitions.len(), 2); // num_partitions
905
906        for partition in &partitions {
907            assert_eq!(partition.len(), 3); // batches_per_partition
908
909            for batch in partition {
910                assert_eq!(batch.num_rows(), 4); // rows_per_batch
911                assert_eq!(batch.num_columns(), 3);
912            }
913        }
914    }
915
916    #[test]
917    fn test_random_partitioned_data_builder_unique_ids() {
918        let (_, partitions) = RandomPartitionedDataBuilder::new()
919            .num_partitions(2)
920            .batches_per_partition(2)
921            .rows_per_batch(3)
922            .build()
923            .unwrap();
924
925        let mut all_ids = Vec::new();
926
927        for partition in &partitions {
928            for batch in partition {
929                let id_array = batch
930                    .column(0)
931                    .as_any()
932                    .downcast_ref::<Int32Array>()
933                    .unwrap();
934                for i in 0..id_array.len() {
935                    all_ids.push(id_array.value(i));
936                }
937            }
938        }
939
940        // Verify all IDs are unique
941        all_ids.sort();
942        for i in 1..all_ids.len() {
943            assert_ne!(
944                all_ids[i - 1],
945                all_ids[i],
946                "Found duplicate ID: {}",
947                all_ids[i]
948            );
949        }
950
951        // Verify IDs are sequential starting from 0
952        for (i, &id) in all_ids.iter().enumerate() {
953            assert_eq!(id, i as i32);
954        }
955    }
956
957    #[test]
958    fn test_random_partitioned_data_builder_null_rate() {
959        let (_, partitions) = RandomPartitionedDataBuilder::new()
960            .rows_per_batch(100)
961            .null_rate(0.5) // 50% null rate
962            .build()
963            .unwrap();
964
965        let batch = &partitions[0][0];
966        let geometry_array = batch.column(2);
967
968        let null_count = geometry_array.null_count();
969        let total_count = geometry_array.len();
970        let null_rate = null_count as f64 / total_count as f64;
971
972        // Allow some variance due to randomness (±20%)
973        assert!(
974            (0.3..=0.7).contains(&null_rate),
975            "Expected null rate around 0.5, got {null_rate}"
976        );
977    }
978
979    #[test]
980    fn test_random_partitioned_data_builder_deterministic() {
981        let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
982
983        let (schema1, partitions1) = RandomPartitionedDataBuilder::new()
984            .seed(999)
985            .num_partitions(2)
986            .batches_per_partition(2)
987            .rows_per_batch(5)
988            .bounds(bounds)
989            .build()
990            .unwrap();
991
992        let (schema2, partitions2) = RandomPartitionedDataBuilder::new()
993            .seed(999) // Same seed
994            .num_partitions(2)
995            .batches_per_partition(2)
996            .rows_per_batch(5)
997            .bounds(bounds)
998            .build()
999            .unwrap();
1000
1001        // Schemas should be identical
1002        assert_eq!(schema1, schema2);
1003
1004        // All data should be identical
1005        assert_eq!(partitions1.len(), partitions2.len());
1006        for (partition1, partition2) in partitions1.iter().zip(partitions2.iter()) {
1007            assert_eq!(partition1.len(), partition2.len());
1008            for (batch1, batch2) in partition1.iter().zip(partition2.iter()) {
1009                // Compare IDs
1010                let ids1 = batch1
1011                    .column(0)
1012                    .as_any()
1013                    .downcast_ref::<Int32Array>()
1014                    .unwrap();
1015                let ids2 = batch2
1016                    .column(0)
1017                    .as_any()
1018                    .downcast_ref::<Int32Array>()
1019                    .unwrap();
1020                assert_eq!(ids1, ids2);
1021
1022                // Compare distances
1023                let dists1 = batch1
1024                    .column(1)
1025                    .as_any()
1026                    .downcast_ref::<Float64Array>()
1027                    .unwrap();
1028                let dists2 = batch2
1029                    .column(1)
1030                    .as_any()
1031                    .downcast_ref::<Float64Array>()
1032                    .unwrap();
1033                assert_eq!(dists1, dists2);
1034            }
1035        }
1036    }
1037
1038    #[test]
1039    fn test_random_partitioned_data_builder_different_seeds() {
1040        let bounds = Rect::new(Coord { x: 0.0, y: 0.0 }, Coord { x: 100.0, y: 100.0 });
1041
1042        let (_, partitions1) = RandomPartitionedDataBuilder::new()
1043            .seed(111)
1044            .rows_per_batch(10)
1045            .bounds(bounds)
1046            .build()
1047            .unwrap();
1048
1049        let (_, partitions2) = RandomPartitionedDataBuilder::new()
1050            .seed(222) // Different seed
1051            .rows_per_batch(10)
1052            .bounds(bounds)
1053            .build()
1054            .unwrap();
1055
1056        // Data should be different (distances should differ)
1057        let dists1 = partitions1[0][0]
1058            .column(1)
1059            .as_any()
1060            .downcast_ref::<Float64Array>()
1061            .unwrap();
1062        let dists2 = partitions2[0][0]
1063            .column(1)
1064            .as_any()
1065            .downcast_ref::<Float64Array>()
1066            .unwrap();
1067
1068        // At least some distances should be different
1069        let mut found_difference = false;
1070        for i in 0..dists1.len() {
1071            if (dists1.value(i) - dists2.value(i)).abs() > f64::EPSILON {
1072                found_difference = true;
1073                break;
1074            }
1075        }
1076        assert!(
1077            found_difference,
1078            "Expected different random data with different seeds"
1079        );
1080    }
1081
1082    #[test]
1083    fn test_random_linestring_num_vertices() {
1084        let mut rng = StdRng::seed_from_u64(123);
1085        let mut options = RandomGeometryOptions::new();
1086        options.vertices_per_linestring_range = (3, 3);
1087        for _ in 0..100 {
1088            let geom = generate_random_linestring(&mut rng, &options);
1089            assert_eq!(geom.coords().count(), 3);
1090        }
1091
1092        options.vertices_per_linestring_range = (50, 50);
1093        for _ in 0..100 {
1094            let geom = generate_random_linestring(&mut rng, &options);
1095            assert_eq!(geom.coords().count(), 50);
1096        }
1097    }
1098
1099    #[test]
1100    fn test_random_polygon_has_hole() {
1101        let mut rng = StdRng::seed_from_u64(123);
1102        let mut options = RandomGeometryOptions::new();
1103
1104        options.polygon_hole_rate = 0.0;
1105        for _ in 0..100 {
1106            let geom = generate_random_polygon(&mut rng, &options);
1107            assert_eq!(geom.interiors().len(), 0);
1108        }
1109
1110        options.polygon_hole_rate = 1.0;
1111        for _ in 0..100 {
1112            let geom = generate_random_polygon(&mut rng, &options);
1113            assert!(!geom.interiors().is_empty());
1114        }
1115    }
1116
1117    #[test]
1118    fn test_random_multipoint_part_count() {
1119        let mut rng = StdRng::seed_from_u64(123);
1120        let mut options = RandomGeometryOptions::new();
1121
1122        options.num_parts_range = (3, 3);
1123        for _ in 0..100 {
1124            let geom = generate_random_multipoint(&mut rng, &options);
1125            assert_eq!(geom.len(), 3);
1126        }
1127
1128        options.num_parts_range = (10, 10);
1129        for _ in 0..100 {
1130            let geom = generate_random_multipoint(&mut rng, &options);
1131            assert_eq!(geom.len(), 10);
1132        }
1133    }
1134
1135    #[test]
1136    fn test_random_multilinestring_part_count() {
1137        let mut rng = StdRng::seed_from_u64(123);
1138        let mut options = RandomGeometryOptions::new();
1139
1140        options.num_parts_range = (3, 3);
1141        for _ in 0..100 {
1142            let geom = generate_random_multilinestring(&mut rng, &options);
1143            assert_eq!(geom.num_line_strings(), 3);
1144        }
1145
1146        options.num_parts_range = (10, 10);
1147        for _ in 0..100 {
1148            let geom = generate_random_multilinestring(&mut rng, &options);
1149            assert_eq!(geom.num_line_strings(), 10);
1150        }
1151    }
1152
1153    #[test]
1154    fn test_random_multipolygon_part_count() {
1155        let mut rng = StdRng::seed_from_u64(123);
1156        let mut options = RandomGeometryOptions::new();
1157
1158        options.num_parts_range = (3, 3);
1159        for _ in 0..100 {
1160            let geom = generate_random_multipolygon(&mut rng, &options);
1161            assert_eq!(geom.num_polygons(), 3);
1162        }
1163
1164        options.num_parts_range = (10, 10);
1165        for _ in 0..100 {
1166            let geom = generate_random_multipolygon(&mut rng, &options);
1167            assert_eq!(geom.num_polygons(), 10);
1168        }
1169    }
1170
1171    #[test]
1172    fn test_random_geometrycollection_part_count() {
1173        let mut rng = StdRng::seed_from_u64(123);
1174        let mut options = RandomGeometryOptions::new();
1175
1176        options.num_parts_range = (3, 3);
1177        for _ in 0..100 {
1178            let geom = generate_random_geometrycollection(&mut rng, &options);
1179            assert_eq!(geom.len(), 3);
1180        }
1181
1182        options.num_parts_range = (10, 10);
1183        for _ in 0..100 {
1184            let geom = generate_random_geometrycollection(&mut rng, &options);
1185            assert_eq!(geom.len(), 10);
1186        }
1187    }
1188
1189    #[rstest]
1190    fn test_random_geometry_type(
1191        #[values(
1192            GeometryTypeId::Point,
1193            GeometryTypeId::LineString,
1194            GeometryTypeId::Polygon,
1195            GeometryTypeId::MultiPoint,
1196            GeometryTypeId::MultiLineString,
1197            GeometryTypeId::MultiPolygon,
1198            GeometryTypeId::GeometryCollection
1199        )]
1200        geom_type: GeometryTypeId,
1201    ) {
1202        let mut rng = StdRng::seed_from_u64(123);
1203        let mut options = RandomGeometryOptions::new();
1204        options.geom_type = geom_type;
1205
1206        options.empty_rate = 0.0;
1207        for _ in 0..100 {
1208            let geom = generate_random_wkb(&mut rng, &options);
1209            let wkb = wkb::reader::read_wkb(&geom).unwrap();
1210            let analysis = analyze_geometry(&wkb).unwrap();
1211            assert_eq!(analysis.geometry_type.geometry_type(), geom_type);
1212        }
1213    }
1214
1215    #[rstest]
1216    fn test_random_emptiness(
1217        #[values(
1218            GeometryTypeId::Point,
1219            GeometryTypeId::LineString,
1220            GeometryTypeId::Polygon,
1221            GeometryTypeId::MultiPoint,
1222            GeometryTypeId::MultiLineString,
1223            GeometryTypeId::MultiPolygon,
1224            GeometryTypeId::GeometryCollection
1225        )]
1226        geom_type: GeometryTypeId,
1227    ) {
1228        let mut rng = StdRng::seed_from_u64(123);
1229        let mut options = RandomGeometryOptions::new();
1230        options.geom_type = geom_type;
1231
1232        options.empty_rate = 0.0;
1233        for _ in 0..100 {
1234            let geom = generate_random_wkb(&mut rng, &options);
1235            let bounds = wkb_bounds_xy(&geom).unwrap();
1236            assert!(!bounds.x().is_empty());
1237            assert!(!bounds.y().is_empty());
1238
1239            assert!(
1240                bounds.x().lo() >= options.bounds.min().x
1241                    && bounds.y().lo() >= options.bounds.min().y
1242                    && bounds.x().hi() <= options.bounds.max().x
1243                    && bounds.y().hi() <= options.bounds.max().y
1244            );
1245        }
1246
1247        options.empty_rate = 1.0;
1248        for _ in 0..100 {
1249            let geom = generate_random_wkb(&mut rng, &options);
1250            let bounds = wkb_bounds_xy(&geom).unwrap();
1251            assert!(bounds.x().is_empty());
1252            assert!(bounds.y().is_empty());
1253        }
1254    }
1255}