// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.dataflow.v1beta3;
// field_behavior.proto supplies the (google.api.field_behavior) annotation
// used on output-only fields; any.proto and struct.proto supply the
// google.protobuf.Any and google.protobuf.Struct types used by fields below.
import "google/api/field_behavior.proto";
import "google/protobuf/any.proto";
import "google/protobuf/struct.proto";
// Per-language settings (namespaces, packages, class names) for generated
// code.
option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
option go_package = "google.golang.org/genproto/googleapis/dataflow/v1beta3;dataflow";
option java_multiple_files = true;
option java_outer_classname = "EnvironmentProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";
// The environment in which a Dataflow Job runs.
message Environment {
  // Prefix of the resources the system should use for temporary storage.
  // The system appends the suffix "/temp-{JOBNAME}" to this resource
  // prefix, where {JOBNAME} is the value of the job_name field. The
  // resulting bucket and object prefix serves as the prefix of the
  // resources used to store temporary data needed during the job
  // execution. NOTE: This overrides the value in taskrunner_settings.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 1;

  // Type of cluster manager API to use, in the form of the API service
  // name, e.g. "compute.googleapis.com". If unknown or unspecified, the
  // service will attempt to choose a reasonable default.
  string cluster_manager_api_service = 2;

  // Experiments to enable. Use this field for SDK related experiments, not
  // for service related experiments; the proper field for service related
  // experiments is service_options.
  repeated string experiments = 3;

  // Service options to enable. Use this field for service related
  // experiments only. When graduating to GA, these experiments should be
  // replaced by dedicated fields or become default (i.e. always on).
  repeated string service_options = 16;

  // If set, the Cloud KMS key identifier used to encrypt data at rest, AKA
  // a Customer Managed Encryption Key (CMEK).
  //
  // Format:
  //   projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY
  string service_kms_key_name = 12;

  // The worker pools. For the job to have workers, at least one "harness"
  // worker pool must be specified.
  repeated WorkerPool worker_pools = 4;

  // Description of the process that generated the request.
  google.protobuf.Struct user_agent = 5;

  // Structure describing which components of the service, and which of
  // their versions, are required in order to run the job.
  google.protobuf.Struct version = 6;

  // Dataset for the current project where various workflow related tables
  // are stored.
  //
  // The supported resource type is:
  //
  // Google BigQuery:
  //   bigquery.googleapis.com/{dataset}
  string dataset = 7;

  // Cloud Dataflow SDK pipeline options specified by the user. They are
  // passed through the service and used to recreate the SDK pipeline
  // options on the worker in a language agnostic and platform independent
  // way.
  google.protobuf.Struct sdk_pipeline_options = 8;

  // Experimental settings.
  google.protobuf.Any internal_experiments = 9;

  // Identity to run virtual machines as. Defaults to the default account.
  string service_account_email = 10;

  // The Flexible Resource Scheduling mode to run in.
  FlexResourceSchedulingGoal flex_resource_scheduling_goal = 11;

  // Compute Engine region
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1". Mutually
  // exclusive with worker_zone. If neither worker_region nor worker_zone is
  // specified, defaults to the control plane's region.
  string worker_region = 13;

  // Compute Engine zone
  // (https://cloud.google.com/compute/docs/regions-zones/regions-zones) in
  // which worker processing should occur, e.g. "us-west1-a". Mutually
  // exclusive with worker_region. If neither worker_region nor worker_zone
  // is specified, a zone in the control plane's region is chosen based on
  // available capacity.
  string worker_zone = 14;

  // Output only. Shuffle mode used for the job.
  ShuffleMode shuffle_mode = 15
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Debugging options, if any, to be supplied to the job.
  DebugOptions debug_options = 17;
}
// A package that must be installed in order for a worker to run the steps
// of the Cloud Dataflow job that will be assigned to its worker pool.
//
// Packages are the mechanism by which the Cloud Dataflow SDK causes code
// to be loaded onto the workers. For example, the Cloud Dataflow Java SDK
// might use this to install jars containing the user's code and all of the
// various dependencies (libraries, data files, etc.) required in order for
// that code to run.
message Package {
  // Name of the package.
  string name = 1;

  // Resource to read the package from. The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}
  //   bucket.storage.googleapis.com/
  string location = 2;
}
// The processing model used by a
// [Job][google.dataflow.v1beta3.Job], which determines the way the Job is
// managed by the Cloud Dataflow service (how workers are scheduled, how
// inputs are sharded, etc).
enum JobType {
  // The job type is unspecified, or unknown.
  JOB_TYPE_UNKNOWN = 0;

  // Batch job with a well-defined end point: data is read, data is
  // processed, data is written, and the job is done.
  JOB_TYPE_BATCH = 1;

  // Continuously streaming job with no end: data is read, processed, and
  // written continuously.
  JOB_TYPE_STREAMING = 2;
}
// The resource to optimize for in Flexible Resource Scheduling.
enum FlexResourceSchedulingGoal {
  // Run in the default mode.
  FLEXRS_UNSPECIFIED = 0;

  // Optimize for lower execution time.
  FLEXRS_SPEED_OPTIMIZED = 1;

  // Optimize for lower cost.
  FLEXRS_COST_OPTIMIZED = 2;
}
// The data disk used by a workflow job.
message Disk {
  // Disk size in GB. If zero or unspecified, the service will attempt to
  // choose a reasonable default.
  int32 size_gb = 1;

  // Disk storage type, as defined by Google Compute Engine. This must be a
  // disk type appropriate to the project and zone in which the workers
  // will run. If unknown or unspecified, the service will attempt to
  // choose a reasonable default.
  //
  // For example, the standard persistent disk type is a resource name
  // typically ending in "pd-standard". If SSD persistent disks are
  // available, the resource name typically ends with "pd-ssd". The actual
  // valid values are defined by the Google Compute Engine API, not by the
  // Cloud Dataflow API; consult the Google Compute Engine documentation
  // for more information about determining the set of available disk
  // types for a particular project and zone.
  //
  // Google Compute Engine Disk types are local to a particular project in
  // a particular zone, and so the resource name will typically look
  // something like this:
  //
  //   compute.googleapis.com/projects/project-id/zones/zone/diskTypes/pd-standard
  string disk_type = 2;

  // Directory in a VM where the disk is mounted.
  string mount_point = 3;
}
// Data to pass through to the worker harness.
message WorkerSettings {
  // Base URL for accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base URL to
  // use for resolving these relative URLs. The normative algorithm used is
  // defined by RFC 1808, "Relative Uniform Resource Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 1;

  // Whether work progress updates are sent to the service.
  bool reporting_enabled = 2;

  // Cloud Dataflow service path relative to the root URL, for example,
  // "dataflow/v1b3/projects".
  string service_path = 3;

  // Shuffle service path relative to the root URL, for example,
  // "shuffle/v1beta1".
  string shuffle_service_path = 4;

  // ID of the worker running this pipeline.
  string worker_id = 5;

  // Prefix of the resources the system should use for temporary storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 6;
}
// Configuration settings for the taskrunner.
message TaskRunnerSettings {
  // UNIX user ID on the worker VM to use for tasks launched by taskrunner;
  // e.g. "root".
  string task_user = 1;

  // UNIX group ID on the worker VM to use for tasks launched by
  // taskrunner; e.g. "wheel".
  string task_group = 2;

  // OAuth2 scopes to be requested by the taskrunner in order to access the
  // Cloud Dataflow API.
  repeated string oauth_scopes = 3;

  // Base URL for the taskrunner to use when accessing Google Cloud APIs.
  //
  // When workers access Google Cloud APIs, they logically do so via
  // relative URLs. If this field is specified, it supplies the base URL to
  // use for resolving these relative URLs. The normative algorithm used is
  // defined by RFC 1808, "Relative Uniform Resource Locators".
  //
  // If not specified, the default value is "http://www.googleapis.com/"
  string base_url = 4;

  // API version of the endpoint, e.g. "v1b3"
  string dataflow_api_version = 5;

  // Settings to pass to the parallel worker harness.
  WorkerSettings parallel_worker_settings = 6;

  // Location on the worker for task-specific subdirectories.
  string base_task_dir = 7;

  // Whether the taskrunner continues if an exception is hit.
  bool continue_on_exception = 8;

  // Whether taskrunner log info is sent to the Google Compute Engine VM
  // serial console.
  bool log_to_serialconsole = 9;

  // Whether taskrunner log info is also sent to stderr.
  bool alsologtostderr = 10;

  // Where to put logs. If this is not specified, the logs will not be
  // uploaded.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string log_upload_location = 11;

  // Directory on the VM in which to store logs.
  string log_dir = 12;

  // Prefix of the resources the taskrunner should use for temporary
  // storage.
  //
  // The supported resource type is:
  //
  // Google Cloud Storage:
  //   storage.googleapis.com/{bucket}/{object}
  //   bucket.storage.googleapis.com/{object}
  string temp_storage_prefix = 13;

  // Command used to launch the worker harness.
  string harness_command = 14;

  // File in which to store the workflow.
  string workflow_file_name = 15;

  // File in which to store preprocessing commands.
  string commandlines_file_name = 16;

  // ID string of the VM.
  string vm_id = 17;

  // Suggested backend language.
  string language_hint = 18;

  // Name of the streaming worker main class.
  string streaming_worker_main_class = 19;
}
// What happens to a resource when a Cloud Dataflow
// [google.dataflow.v1beta3.Job][google.dataflow.v1beta3.Job] has completed.
enum TeardownPolicy {
  // The teardown policy isn't specified, or is unknown.
  TEARDOWN_POLICY_UNKNOWN = 0;

  // Always tear down the resource.
  TEARDOWN_ALWAYS = 1;

  // Tear down the resource on success. This is useful for debugging
  // failures.
  TEARDOWN_ON_SUCCESS = 2;

  // Never tear down the resource. This is useful for debugging and
  // development.
  TEARDOWN_NEVER = 3;
}
// Default set of packages to be staged on a pool of workers.
enum DefaultPackageSet {
  // The default set of packages to stage is unknown, or unspecified.
  DEFAULT_PACKAGE_SET_UNKNOWN = 0;

  // No packages should be staged at the worker unless explicitly specified
  // by the job.
  DEFAULT_PACKAGE_SET_NONE = 1;

  // Stage packages typically useful to workers written in Java.
  DEFAULT_PACKAGE_SET_JAVA = 2;

  // Stage packages typically useful to workers written in Python.
  DEFAULT_PACKAGE_SET_PYTHON = 3;
}
// The algorithm used to determine the number of worker processes to run at
// any given point in time, based on the amount of data left to process,
// the number of workers, and how quickly existing workers are processing
// data.
enum AutoscalingAlgorithm {
  // The algorithm is unknown, or unspecified.
  AUTOSCALING_ALGORITHM_UNKNOWN = 0;

  // Disable autoscaling.
  AUTOSCALING_ALGORITHM_NONE = 1;

  // Increase worker count over time to reduce job execution time.
  AUTOSCALING_ALGORITHM_BASIC = 2;
}
// Autoscaling settings for a WorkerPool.
message AutoscalingSettings {
  // Algorithm to use for autoscaling.
  AutoscalingAlgorithm algorithm = 1;

  // Maximum number of workers to cap scaling at.
  int32 max_num_workers = 2;
}
// How IP addresses should be allocated to the worker machines.
enum WorkerIPAddressConfiguration {
  // The configuration is unknown, or unspecified.
  WORKER_IP_UNSPECIFIED = 0;

  // Workers should have public IP addresses.
  WORKER_IP_PUBLIC = 1;

  // Workers should have private IP addresses.
  WORKER_IP_PRIVATE = 2;
}
// An SDK harness container for executing Dataflow pipelines.
message SdkHarnessContainerImage {
  // Docker container image that resides in Google Container Registry.
  string container_image = 1;

  // If true, recommends that the Dataflow service use only one core per
  // SDK container instance with this image. If false (or unset),
  // recommends using more than one core per SDK container instance with
  // this image for efficiency. Note that the Dataflow service may choose
  // to override this property if needed.
  bool use_single_core_per_container = 2;

  // Environment ID for the Beam runner API proto Environment that
  // corresponds to the current SDK Harness.
  string environment_id = 3;

  // Set of capabilities enumerated in the above Environment proto. See
  // also
  // https://github.com/apache/beam/blob/master/model/pipeline/src/main/proto/beam_runner_api.proto
  repeated string capabilities = 4;
}
// One particular pool of Cloud Dataflow workers to be instantiated by the
// Cloud Dataflow service in order to perform the computations required by
// a job. Note that a workflow job may use multiple pools, in order to
// match the various computational requirements of the various stages of
// the job.
message WorkerPool {
  // Kind of the worker pool; currently only `harness` and `shuffle` are
  // supported.
  string kind = 1;

  // Number of Google Compute Engine workers in this pool needed to execute
  // the job. If zero or unspecified, the service will attempt to choose a
  // reasonable default.
  int32 num_workers = 2;

  // Packages to be installed on workers.
  repeated Package packages = 3;

  // Default package set to install. This allows the service to select a
  // default set of packages which are useful to worker harnesses written
  // in a particular language.
  DefaultPackageSet default_package_set = 4;

  // Machine type (e.g. "n1-standard-1"). If empty or unspecified, the
  // service will attempt to choose a reasonable default.
  string machine_type = 5;

  // Policy for determining when to tear down the worker pool.
  // Allowed values are: `TEARDOWN_ALWAYS`, `TEARDOWN_ON_SUCCESS`, and
  // `TEARDOWN_NEVER`.
  // `TEARDOWN_ALWAYS` means workers are always torn down regardless of
  // whether the job succeeds. `TEARDOWN_ON_SUCCESS` means workers are torn
  // down if the job succeeds. `TEARDOWN_NEVER` means the workers are never
  // torn down.
  //
  // If the workers are not torn down by the service, they will continue to
  // run and use Google Compute Engine VM resources in the user's project
  // until they are explicitly terminated by the user. Because of this,
  // Google recommends using the `TEARDOWN_ALWAYS` policy except for small,
  // manually supervised test jobs.
  //
  // If unknown or unspecified, the service will attempt to choose a
  // reasonable default.
  TeardownPolicy teardown_policy = 6;

  // Root disk size for VMs, in GB. If zero or unspecified, the service
  // will attempt to choose a reasonable default.
  int32 disk_size_gb = 7;

  // Root disk type for VMs. If empty or unspecified, the service will
  // attempt to choose a reasonable default.
  string disk_type = 16;

  // Fully qualified source image for disks.
  string disk_source_image = 8;

  // Zone to run the worker pools in. If empty or unspecified, the service
  // will attempt to choose a reasonable default.
  string zone = 9;

  // Settings passed through to Google Compute Engine workers when using
  // the standard Dataflow task runner. Users should ignore this field.
  TaskRunnerSettings taskrunner_settings = 10;

  // Action to take on host maintenance, as defined by the Google Compute
  // Engine API.
  string on_host_maintenance = 11;

  // Data disks that are used by a VM in this workflow.
  repeated Disk data_disks = 12;

  // Metadata to set on the Google Compute Engine VMs.
  map<string, string> metadata = 13;

  // Autoscaling settings for this WorkerPool.
  AutoscalingSettings autoscaling_settings = 14;

  // Extra arguments for this worker pool.
  google.protobuf.Any pool_args = 15;

  // Network to which VMs will be assigned. If empty or unspecified, the
  // service will use the network "default".
  string network = 17;

  // Subnetwork to which VMs will be assigned, if desired. Expected to be
  // of the form "regions/REGION/subnetworks/SUBNETWORK".
  string subnetwork = 19;

  // Required. Docker container image that executes the Cloud Dataflow
  // worker harness, residing in Google Container Registry.
  //
  // Deprecated for the Fn API path. Use sdk_harness_container_images
  // instead.
  string worker_harness_container_image = 18;

  // Number of threads per worker harness. If empty or unspecified, the
  // service will choose a number of threads (according to the number of
  // cores on the selected machine type for batch, or 1 by convention for
  // streaming).
  int32 num_threads_per_worker = 20;

  // Configuration for VM IPs.
  WorkerIPAddressConfiguration ip_configuration = 21;

  // Set of SDK harness containers needed to execute this pipeline. This
  // will only be set in the Fn API path. For non-cross-language pipelines
  // this should have only one entry. Cross-language pipelines will have
  // two or more entries.
  repeated SdkHarnessContainerImage sdk_harness_container_images = 22;
}
// The shuffle mode used by a
// [Job][google.dataflow.v1beta3.Job], which determines the approach by
// which data is shuffled during processing. More details in:
// https://cloud.google.com/dataflow/docs/guides/deploying-a-pipeline#dataflow-shuffle
enum ShuffleMode {
  // Shuffle mode information is not available.
  SHUFFLE_MODE_UNSPECIFIED = 0;

  // Shuffle is done on the worker VMs.
  VM_BASED = 1;

  // Shuffle is done on the service side.
  SERVICE_BASED = 2;
}
// Options that have an effect on the debugging of pipelines.
message DebugOptions {
  // When true, enables logging of the literal hot key to the user's Cloud
  // Logging.
  bool enable_hot_key_logging = 1;
}