# rust-data-processing 0.3.4
#
# Schema-first ingestion (CSV, JSON, Parquet, Excel) into an in-memory DataSet,
# plus Polars-backed pipelines, SQL, profiling, validation, and
# map/reduce-style processing.
#
# Spark integration: MinIO S3 handoff + local Spark standalone master (spark://).
#
# Usage:
#   cd integration_testing/Spark
#   cp .env.example .env
#   docker compose up -d
#   python3 run_spark_tests.py --no-rancher

# Reuse the MinIO compose file so its services come up together with Spark.
# NOTE(review): presumably this is also where the rdp-platform network used
# by the services in this file is declared — confirm.
include:
  - ../MinIO/docker-compose.yml

services:
  # Spark standalone master: publishes the master port (7077) and the Master UI
  # port (8080) on the host.
  spark-master:
    container_name: rdp-spark-master
    # Versioned tags now live under bitnamilegacy (see bitnami/containers#83267);
    # a 3.5.3 tag was never published.
    image: bitnamilegacy/spark:3.5.1
    # Quoted so YAML keeps the restart policy as the string "no", not a boolean.
    restart: "no"
    networks:
      - rdp-platform
    ports:
      # Host-side ports can be overridden with SPARK_MASTER_PORT / SPARK_UI_PORT.
      - "${SPARK_MASTER_PORT:-7077}:7077"
      - "${SPARK_UI_PORT:-8080}:8080"
    environment:
      SPARK_MODE: master
      # Every auth/encryption toggle is switched off; values are quoted so they
      # reach the container as the literal string "no".
      SPARK_RPC_AUTHENTICATION_ENABLED: "no"
      SPARK_RPC_ENCRYPTION_ENABLED: "no"
      SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED: "no"
      SPARK_SSL_ENABLED: "no"
    volumes:
      # Read-only mount of the Spark defaults file kept next to this compose file.
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf:ro
    healthcheck:
      # The image ships without curl, so the probe opens a TCP connection to the
      # Master UI port through bash's /dev/tcp instead.
      test:
        - CMD-SHELL
        - "bash -c 'echo > /dev/tcp/127.0.0.1/8080' || exit 1"
      start_period: 30s
      interval: 10s
      timeout: 5s
      retries: 24

  # Spark standalone worker: one core and 1G of memory, registering with the
  # spark-master service over the compose network.
  spark-worker:
    image: bitnamilegacy/spark:3.5.1
    container_name: rdp-spark-worker
    # Quoted so YAML keeps the restart policy as the string "no", not a boolean.
    restart: "no"
    depends_on:
      # Do not start until the master's healthcheck reports healthy.
      spark-master:
        condition: service_healthy
    environment:
      SPARK_MODE: worker
      SPARK_MASTER_URL: spark://spark-master:7077
      SPARK_WORKER_MEMORY: 1G
      SPARK_WORKER_CORES: "1"
      # Same four security toggles as spark-master, all explicitly off, so the
      # two nodes cannot drift apart if an image default ever changes.
      SPARK_RPC_AUTHENTICATION_ENABLED: "no"
      SPARK_RPC_ENCRYPTION_ENABLED: "no"
      SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED: "no"
      SPARK_SSL_ENABLED: "no"
    volumes:
      # Read-only mount of the same Spark defaults file the master uses.
      - ./conf/spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf:ro
    networks:
      - rdp-platform