// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
// Proto package that scopes every message declared in this file.
package google.dataflow.v1beta3;
// Code-generation options: each one sets the namespace / package (or file
// layout) that the corresponding language generator uses for this file.
option csharp_namespace = "Google.Cloud.Dataflow.V1Beta3";
// Go import path, followed after ';' by the Go package name.
option go_package = "google.golang.org/genproto/googleapis/dataflow/v1beta3;dataflow";
// Java: emit one source file per top-level type instead of nesting all of
// them inside the outer class named below.
option java_multiple_files = true;
option java_outer_classname = "StreamingProto";
option java_package = "com.google.dataflow.v1beta3";
option php_namespace = "Google\\Cloud\\Dataflow\\V1beta3";
option ruby_package = "Google::Cloud::Dataflow::V1beta3";
// Describes the global topology of a streaming Dataflow job: all of its
// computations together with their sharded locations.
message TopologyConfig {
  // Computations that belong to the streaming Dataflow job.
  repeated ComputationTopology computations = 1;

  // Data disks assigned to the streaming Dataflow job.
  repeated DataDiskAssignment data_disk_assignments = 2;

  // Mapping from user stage name to stable computation name.
  map<string, string> user_stage_to_computation_name_map = 3;

  // Size, in bits, of the keys that will be assigned to source messages.
  int32 forwarding_key_bits = 4;

  // Version number of the persistent state.
  int32 persistent_state_version = 5;
}
// A pubsub location used to move data into or out of a streaming Dataflow
// job.
message PubsubLocation {
  // Pubsub topic, written as
  // "pubsub.googleapis.com/topics/<project-id>/<topic-name>"
  string topic = 1;

  // Pubsub subscription, written as
  // "pubsub.googleapis.com/subscriptions/<project-id>/<subscription-name>"
  string subscription = 2;

  // Pubsub label from which record timestamps are extracted, when set.
  // When empty, record timestamps are generated upon arrival.
  string timestamp_label = 3;

  // Pubsub label from which record ids are extracted, when set.
  // When empty, record deduplication is strictly best effort.
  string id_label = 4;

  // Indicates whether the pipeline allows late-arriving data.
  // NOTE(review): the field name implies that `true` means late data is
  // dropped, i.e. not allowed. Confirm the polarity with the service owners.
  bool drop_late_data = 5;

  // When set, names the pubsub subscription used for tracking custom
  // timestamps for watermark estimation.
  string tracking_subscription = 6;

  // True when the client has requested to receive pubsub attributes.
  bool with_attributes = 7;
}
// Location of a streaming computation stage, used for stage-to-stage
// communication.
message StreamingStageLocation {
  // Identifier of the particular stream within the streaming Dataflow job.
  string stream_id = 1;
}
// Location of a streaming side input.
message StreamingSideInputLocation {
  // Tag identifying the particular side input within the streaming Dataflow
  // job.
  string tag = 1;

  // State family in which this side input is stored.
  string state_family = 2;
}
// Location of a custom source.
message CustomSourceLocation {
  // True when this source is stateful.
  bool stateful = 1;
}
// A stream of data, which is either an input to be processed by, or an
// output of, a streaming Dataflow job.
message StreamLocation {
  // Where the stream lives; exactly one of the alternatives below may be set.
  oneof location {
    // The stream belongs to another computation within the current streaming
    // Dataflow job.
    StreamingStageLocation streaming_stage_location = 1;

    // The stream is a pubsub stream.
    PubsubLocation pubsub_location = 2;

    // The stream is a streaming side input.
    StreamingSideInputLocation side_input_location = 3;

    // The stream is a custom source.
    CustomSourceLocation custom_source_location = 4;
  }
}
// Configuration of a single state family.
message StateFamilyConfig {
  // Value (name) of the state family.
  string state_family = 1;

  // True when this family corresponds to a read operation.
  bool is_read = 2;
}
// All configuration data for a particular Computation.
message ComputationTopology {
  // Field number 6 is not assigned to any field of this message. It is
  // reserved so that a future field cannot claim it and be confused with
  // data written under an earlier or non-public definition that may have
  // used it.
  reserved 6;

  // The system stage name.
  string system_stage_name = 1;

  // The ID of the computation.
  string computation_id = 5;

  // The key ranges processed by the computation.
  repeated KeyRangeLocation key_ranges = 2;

  // The inputs to the computation.
  repeated StreamLocation inputs = 3;

  // The outputs from the computation.
  repeated StreamLocation outputs = 4;

  // The state family values.
  repeated StateFamilyConfig state_families = 7;
}
// Location information for one key range of a sharded computation.
// Only UTF-8 character splits are currently supported, which keeps the
// encoding into JSON simple.
message KeyRangeLocation {
  // Start of the key range (inclusive).
  string start = 1;

  // End of the key range (exclusive).
  string end = 2;

  // Physical location of this range assignment, used for streaming
  // computation cross-worker message delivery.
  string delivery_endpoint = 3;

  // Name of the data disk holding the data for this range.
  // The name is local to the Google Cloud Platform project and uniquely
  // identifies the disk within that project, for example
  // "myproject-1014-104817-4c2-harness-0-disk-1".
  string data_disk = 5;

  // DEPRECATED. Location of the persistent state for this range, given as a
  // persistent directory in the worker local filesystem.
  string deprecated_persistent_directory = 4 [deprecated = true];
}
// Describes a mounted data disk.
message MountedDataDisk {
  // Name of the data disk.
  // The name is local to the Google Cloud Platform project and uniquely
  // identifies the disk within that project, for example
  // "myproject-1014-104817-4c2-harness-0-disk-1".
  string data_disk = 1;
}
// Data disk assignment for a given VM instance.
message DataDiskAssignment {
  // Name of the VM instance the data disks are mounted to, for example
  // "myproject-1014-104817-4c2-harness-0".
  string vm_instance = 1;

  // Mounted data disks. The order is important: a data disk's 0-based index
  // in this list defines which persistent directory the disk is mounted to,
  // for example the list of { "myproject-1014-104817-4c2-harness-0-disk-0" },
  // { "myproject-1014-104817-4c2-harness-0-disk-1" }.
  repeated string data_disks = 2;
}
// Data disk assignment information for one key range of a sharded
// computation.
// Only UTF-8 character splits are currently supported, which keeps the
// encoding into JSON simple.
message KeyRangeDataDiskAssignment {
  // Start of the key range (inclusive).
  string start = 1;

  // End of the key range (exclusive).
  string end = 2;

  // Name of the data disk holding the data for this range.
  // The name is local to the Google Cloud Platform project and uniquely
  // identifies the disk within that project, for example
  // "myproject-1014-104817-4c2-harness-0-disk-1".
  string data_disk = 3;
}
// Full or partial data disk assignment information for the ranges of a
// computation.
message StreamingComputationRanges {
  // ID of the computation.
  string computation_id = 1;

  // Data disk assignments for ranges that belong to this computation.
  repeated KeyRangeDataDiskAssignment range_assignments = 2;
}
// Snapshot configuration for the streaming appliance.
message StreamingApplianceSnapshotConfig {
  // Snapshot id of the snapshot being performed, when set.
  string snapshot_id = 1;

  // Endpoint that is used to import appliance state.
  string import_state_endpoint = 2;
}