googapis 0.6.0 - Docs.rs

/// The top-level message sent by the client for the `Recognize` method.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct RecognizeRequest {
    /// Required. Provides information to the recognizer that specifies how to
    /// process the request.
    #[prost(message, optional, tag = "1")]
    pub config: ::core::option::Option<RecognitionConfig>,
    /// Required. The audio data to be recognized.
    #[prost(message, optional, tag = "2")]
    pub audio: ::core::option::Option<RecognitionAudio>,
}
/// The top-level message sent by the client for the `LongRunningRecognize`
/// method.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct LongRunningRecognizeRequest {
    /// Required. Provides information to the recognizer that specifies how to
    /// process the request.
    #[prost(message, optional, tag = "1")]
    pub config: ::core::option::Option<RecognitionConfig>,
    /// Required. The audio data to be recognized.
    #[prost(message, optional, tag = "2")]
    pub audio: ::core::option::Option<RecognitionAudio>,
    /// Optional. Specifies an optional destination for the recognition results.
    #[prost(message, optional, tag = "4")]
    pub output_config: ::core::option::Option<TranscriptOutputConfig>,
}
/// Specifies an optional destination for the recognition results.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct TranscriptOutputConfig {
    #[prost(oneof = "transcript_output_config::OutputType", tags = "1")]
    pub output_type: ::core::option::Option<transcript_output_config::OutputType>,
}
/// Nested message and enum types in `TranscriptOutputConfig`.
pub mod transcript_output_config {
    #[derive(Clone, PartialEq, ::prost::Oneof)]
    pub enum OutputType {
        /// Specifies a Cloud Storage URI for the recognition results. Must be
        /// specified in the format: `gs://bucket_name/object_name`, and the bucket
        /// must already exist.
        #[prost(string, tag = "1")]
        GcsUri(::prost::alloc::string::String),
    }
}
/// The top-level message sent by the client for the `StreamingRecognize` method.
/// Multiple `StreamingRecognizeRequest` messages are sent. The first message
/// must contain a `streaming_config` message and must not contain
/// `audio_content`. All subsequent messages must contain `audio_content` and
/// must not contain a `streaming_config` message.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StreamingRecognizeRequest {
    /// The streaming request, which is either a streaming config or audio content.
    #[prost(oneof = "streaming_recognize_request::StreamingRequest", tags = "1, 2")]
    pub streaming_request: ::core::option::Option<streaming_recognize_request::StreamingRequest>,
}
/// Nested message and enum types in `StreamingRecognizeRequest`.
pub mod streaming_recognize_request {
    /// The streaming request, which is either a streaming config or audio content.
    #[derive(Clone, PartialEq, ::prost::Oneof)]
    pub enum StreamingRequest {
        /// Provides information to the recognizer that specifies how to process the
        /// request. The first `StreamingRecognizeRequest` message must contain a
        /// `streaming_config`  message.
        #[prost(message, tag = "1")]
        StreamingConfig(super::StreamingRecognitionConfig),
        /// The audio data to be recognized. Sequential chunks of audio data are sent
        /// in sequential `StreamingRecognizeRequest` messages. The first
        /// `StreamingRecognizeRequest` message must not contain `audio_content` data
        /// and all subsequent `StreamingRecognizeRequest` messages must contain
        /// `audio_content` data. The audio bytes must be encoded as specified in
        /// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
        /// pure binary representation (not base64). See
        /// [content limits](<https://cloud.google.com/speech-to-text/quotas#content>).
        #[prost(bytes, tag = "2")]
        AudioContent(::prost::alloc::vec::Vec<u8>),
    }
}
/// Provides information to the recognizer that specifies how to process the
/// request.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StreamingRecognitionConfig {
    /// Required. Provides information to the recognizer that specifies how to
    /// process the request.
    #[prost(message, optional, tag = "1")]
    pub config: ::core::option::Option<RecognitionConfig>,
    /// If `false` or omitted, the recognizer will perform continuous
    /// recognition (continuing to wait for and process audio even if the user
    /// pauses speaking) until the client closes the input stream (gRPC API) or
    /// until the maximum time limit has been reached. May return multiple
    /// `StreamingRecognitionResult`s with the `is_final` flag set to `true`.
    ///
    /// If `true`, the recognizer will detect a single spoken utterance. When it
    /// detects that the user has paused or stopped speaking, it will return an
    /// `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no
    /// more than one `StreamingRecognitionResult` with the `is_final` flag set to
    /// `true`.
    ///
    /// The `single_utterance` field can only be used with specified models,
    /// otherwise an error is thrown. The `model` field in \[`RecognitionConfig`][\]
    /// must be set to:
    ///
    /// * `command_and_search`
    /// * `phone_call` AND additional field `useEnhanced`=`true`
    /// * The `model` field is left undefined. In this case the API auto-selects
    ///   a model based on any other parameters that you set in
    ///   `RecognitionConfig`.
    #[prost(bool, tag = "2")]
    pub single_utterance: bool,
    /// If `true`, interim results (tentative hypotheses) may be
    /// returned as they become available (these interim results are indicated with
    /// the `is_final=false` flag).
    /// If `false` or omitted, only `is_final=true` result(s) are returned.
    #[prost(bool, tag = "3")]
    pub interim_results: bool,
}
/// Provides information to the recognizer that specifies how to process the
/// request.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct RecognitionConfig {
    /// Encoding of audio data sent in all `RecognitionAudio` messages.
    /// This field is optional for `FLAC` and `WAV` audio files and required
    /// for all other audio formats. For details, see \[AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding\].
    #[prost(enumeration = "recognition_config::AudioEncoding", tag = "1")]
    pub encoding: i32,
    /// Sample rate in Hertz of the audio data sent in all
    /// `RecognitionAudio` messages. Valid values are: 8000-48000.
    /// 16000 is optimal. For best results, set the sampling rate of the audio
    /// source to 16000 Hz. If that's not possible, use the native sample rate of
    /// the audio source (instead of re-sampling).
    /// This field is optional for FLAC and WAV audio files, but is
    /// required for all other audio formats. For details, see \[AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding\].
    #[prost(int32, tag = "2")]
    pub sample_rate_hertz: i32,
    /// The number of channels in the input audio data.
    /// ONLY set this for MULTI-CHANNEL recognition.
    /// Valid values for LINEAR16 and FLAC are `1`-`8`.
    /// Valid values for OGG_OPUS are '1'-'254'.
    /// Valid value for MULAW, AMR, AMR_WB and SPEEX_WITH_HEADER_BYTE is only `1`.
    /// If `0` or omitted, defaults to one channel (mono).
    /// Note: We only recognize the first channel by default.
    /// To perform independent recognition on each channel set
    /// `enable_separate_recognition_per_channel` to 'true'.
    #[prost(int32, tag = "7")]
    pub audio_channel_count: i32,
    /// This needs to be set to `true` explicitly and `audio_channel_count` > 1
    /// to get each channel recognized separately. The recognition result will
    /// contain a `channel_tag` field to state which channel that result belongs
    /// to. If this is not true, we will only recognize the first channel. The
    /// request is billed cumulatively for all channels recognized:
    /// `audio_channel_count` multiplied by the length of the audio.
    #[prost(bool, tag = "12")]
    pub enable_separate_recognition_per_channel: bool,
    /// Required. The language of the supplied audio as a
    /// \[BCP-47\](<https://www.rfc-editor.org/rfc/bcp/bcp47.txt>) language tag.
    /// Example: "en-US".
    /// See [Language
    /// Support](<https://cloud.google.com/speech-to-text/docs/languages>) for a list
    /// of the currently supported language codes.
    #[prost(string, tag = "3")]
    pub language_code: ::prost::alloc::string::String,
    /// Maximum number of recognition hypotheses to be returned.
    /// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
    /// within each `SpeechRecognitionResult`.
    /// The server may return fewer than `max_alternatives`.
    /// Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
    /// one. If omitted, will return a maximum of one.
    #[prost(int32, tag = "4")]
    pub max_alternatives: i32,
    /// If set to `true`, the server will attempt to filter out
    /// profanities, replacing all but the initial character in each filtered word
    /// with asterisks, e.g. "f***". If set to `false` or omitted, profanities
    /// won't be filtered out.
    #[prost(bool, tag = "5")]
    pub profanity_filter: bool,
    /// Array of \[SpeechContext][google.cloud.speech.v1.SpeechContext\].
    /// A means to provide context to assist the speech recognition. For more
    /// information, see
    /// [speech
    /// adaptation](<https://cloud.google.com/speech-to-text/docs/adaptation>).
    #[prost(message, repeated, tag = "6")]
    pub speech_contexts: ::prost::alloc::vec::Vec<SpeechContext>,
    /// If `true`, the top result includes a list of words and
    /// the start and end time offsets (timestamps) for those words. If
    /// `false`, no word-level time offset information is returned. The default is
    /// `false`.
    #[prost(bool, tag = "8")]
    pub enable_word_time_offsets: bool,
    /// If 'true', adds punctuation to recognition result hypotheses.
    /// This feature is only available in select languages. Setting this for
    /// requests in other languages has no effect at all.
    /// The default 'false' value does not add punctuation to result hypotheses.
    #[prost(bool, tag = "11")]
    pub enable_automatic_punctuation: bool,
    /// Config to enable speaker diarization and set additional
    /// parameters to make diarization better suited for your application.
    /// Note: When this is enabled, we send all the words from the beginning of the
    /// audio for the top alternative in every consecutive STREAMING responses.
    /// This is done in order to improve our speaker tags as our models learn to
    /// identify the speakers in the conversation over time.
    /// For non-streaming requests, the diarization results will be provided only
    /// in the top alternative of the FINAL SpeechRecognitionResult.
    #[prost(message, optional, tag = "19")]
    pub diarization_config: ::core::option::Option<SpeakerDiarizationConfig>,
    /// Metadata regarding this request.
    #[prost(message, optional, tag = "9")]
    pub metadata: ::core::option::Option<RecognitionMetadata>,
    /// Which model to select for the given request. Select the model
    /// best suited to your domain to get best results. If a model is not
    /// explicitly specified, then we auto-select a model based on the parameters
    /// in the RecognitionConfig.
    /// <table>
    ///   <tr>
    ///     <td><b>Model</b></td>
    ///     <td><b>Description</b></td>
    ///   </tr>
    ///   <tr>
    ///     <td><code>command_and_search</code></td>
    ///     <td>Best for short queries such as voice commands or voice search.</td>
    ///   </tr>
    ///   <tr>
    ///     <td><code>phone_call</code></td>
    ///     <td>Best for audio that originated from a phone call (typically
    ///     recorded at an 8khz sampling rate).</td>
    ///   </tr>
    ///   <tr>
    ///     <td><code>video</code></td>
    ///     <td>Best for audio that originated from video or includes multiple
    ///         speakers. Ideally the audio is recorded at a 16khz or greater
    ///         sampling rate. This is a premium model that costs more than the
    ///         standard rate.</td>
    ///   </tr>
    ///   <tr>
    ///     <td><code>default</code></td>
    ///     <td>Best for audio that is not one of the specific audio models.
    ///         For example, long-form audio. Ideally the audio is high-fidelity,
    ///         recorded at a 16khz or greater sampling rate.</td>
    ///   </tr>
    /// </table>
    #[prost(string, tag = "13")]
    pub model: ::prost::alloc::string::String,
    /// Set to true to use an enhanced model for speech recognition.
    /// If `use_enhanced` is set to true and the `model` field is not set, then
    /// an appropriate enhanced model is chosen if an enhanced model exists for
    /// the audio.
    ///
    /// If `use_enhanced` is true and an enhanced version of the specified model
    /// does not exist, then the speech is recognized using the standard version
    /// of the specified model.
    #[prost(bool, tag = "14")]
    pub use_enhanced: bool,
}
/// Nested message and enum types in `RecognitionConfig`.
pub mod recognition_config {
    /// The encoding of the audio data sent in the request.
    ///
    /// All encodings support only 1 channel (mono) audio, unless the
    /// `audio_channel_count` and `enable_separate_recognition_per_channel` fields
    /// are set.
    ///
    /// For best results, the audio source should be captured and transmitted using
    /// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
    /// recognition can be reduced if lossy codecs are used to capture or transmit
    /// audio, particularly if background noise is present. Lossy codecs include
    /// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, `MP3`.
    ///
    /// The `FLAC` and `WAV` audio file formats include a header that describes the
    /// included audio content. You can request recognition for `WAV` files that
    /// contain either `LINEAR16` or `MULAW` encoded audio.
    /// If you send `FLAC` or `WAV` audio file format in
    /// your request, you do not need to specify an `AudioEncoding`; the audio
    /// encoding format is determined from the file header. If you specify
    /// an `AudioEncoding` when you send  send `FLAC` or `WAV` audio, the
    /// encoding configuration must match the encoding described in the audio
    /// header; otherwise the request returns an
    /// \[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT\] error code.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum AudioEncoding {
        /// Not specified.
        EncodingUnspecified = 0,
        /// Uncompressed 16-bit signed little-endian samples (Linear PCM).
        Linear16 = 1,
        /// `FLAC` (Free Lossless Audio
        /// Codec) is the recommended encoding because it is
        /// lossless--therefore recognition is not compromised--and
        /// requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
        /// encoding supports 16-bit and 24-bit samples, however, not all fields in
        /// `STREAMINFO` are supported.
        Flac = 2,
        /// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
        Mulaw = 3,
        /// Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
        Amr = 4,
        /// Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
        AmrWb = 5,
        /// Opus encoded audio frames in Ogg container
        /// (\[OggOpus\](<https://wiki.xiph.org/OggOpus>)).
        /// `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
        OggOpus = 6,
        /// Although the use of lossy encodings is not recommended, if a very low
        /// bitrate encoding is required, `OGG_OPUS` is highly preferred over
        /// Speex encoding. The \[Speex\](<https://speex.org/>)  encoding supported by
        /// Cloud Speech API has a header byte in each block, as in MIME type
        /// `audio/x-speex-with-header-byte`.
        /// It is a variant of the RTP Speex encoding defined in
        /// [RFC 5574](<https://tools.ietf.org/html/rfc5574>).
        /// The stream is a sequence of blocks, one block per RTP packet. Each block
        /// starts with a byte containing the length of the block, in bytes, followed
        /// by one or more frames of Speex data, padded to an integral number of
        /// bytes (octets) as specified in RFC 5574. In other words, each RTP header
        /// is replaced with a single byte containing the block length. Only Speex
        /// wideband is supported. `sample_rate_hertz` must be 16000.
        SpeexWithHeaderByte = 7,
    }
}
/// Config to enable speaker diarization.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SpeakerDiarizationConfig {
    /// If 'true', enables speaker detection for each recognized word in
    /// the top alternative of the recognition result using a speaker_tag provided
    /// in the WordInfo.
    #[prost(bool, tag = "1")]
    pub enable_speaker_diarization: bool,
    /// Minimum number of speakers in the conversation. This range gives you more
    /// flexibility by allowing the system to automatically determine the correct
    /// number of speakers. If not set, the default value is 2.
    #[prost(int32, tag = "2")]
    pub min_speaker_count: i32,
    /// Maximum number of speakers in the conversation. This range gives you more
    /// flexibility by allowing the system to automatically determine the correct
    /// number of speakers. If not set, the default value is 6.
    #[prost(int32, tag = "3")]
    pub max_speaker_count: i32,
    /// Output only. Unused.
    #[deprecated]
    #[prost(int32, tag = "5")]
    pub speaker_tag: i32,
}
/// Description of audio data to be recognized.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct RecognitionMetadata {
    /// The use case most closely describing the audio content to be recognized.
    #[prost(enumeration = "recognition_metadata::InteractionType", tag = "1")]
    pub interaction_type: i32,
    /// The industry vertical to which this speech recognition request most
    /// closely applies. This is most indicative of the topics contained
    /// in the audio.  Use the 6-digit NAICS code to identify the industry
    /// vertical - see <https://www.naics.com/search/.>
    #[prost(uint32, tag = "3")]
    pub industry_naics_code_of_audio: u32,
    /// The audio type that most closely describes the audio being recognized.
    #[prost(enumeration = "recognition_metadata::MicrophoneDistance", tag = "4")]
    pub microphone_distance: i32,
    /// The original media the speech was recorded on.
    #[prost(enumeration = "recognition_metadata::OriginalMediaType", tag = "5")]
    pub original_media_type: i32,
    /// The type of device the speech was recorded with.
    #[prost(enumeration = "recognition_metadata::RecordingDeviceType", tag = "6")]
    pub recording_device_type: i32,
    /// The device used to make the recording.  Examples 'Nexus 5X' or
    /// 'Polycom SoundStation IP 6000' or 'POTS' or 'VoIP' or
    /// 'Cardioid Microphone'.
    #[prost(string, tag = "7")]
    pub recording_device_name: ::prost::alloc::string::String,
    /// Mime type of the original audio file.  For example `audio/m4a`,
    /// `audio/x-alaw-basic`, `audio/mp3`, `audio/3gpp`.
    /// A list of possible audio mime types is maintained at
    /// <http://www.iana.org/assignments/media-types/media-types.xhtml#audio>
    #[prost(string, tag = "8")]
    pub original_mime_type: ::prost::alloc::string::String,
    /// Description of the content. Eg. "Recordings of federal supreme court
    /// hearings from 2012".
    #[prost(string, tag = "10")]
    pub audio_topic: ::prost::alloc::string::String,
}
/// Nested message and enum types in `RecognitionMetadata`.
pub mod recognition_metadata {
    /// Use case categories that the audio recognition request can be described
    /// by.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum InteractionType {
        /// Use case is either unknown or is something other than one of the other
        /// values below.
        Unspecified = 0,
        /// Multiple people in a conversation or discussion. For example in a
        /// meeting with two or more people actively participating. Typically
        /// all the primary people speaking would be in the same room (if not,
        /// see PHONE_CALL)
        Discussion = 1,
        /// One or more persons lecturing or presenting to others, mostly
        /// uninterrupted.
        Presentation = 2,
        /// A phone-call or video-conference in which two or more people, who are
        /// not in the same room, are actively participating.
        PhoneCall = 3,
        /// A recorded message intended for another person to listen to.
        Voicemail = 4,
        /// Professionally produced audio (eg. TV Show, Podcast).
        ProfessionallyProduced = 5,
        /// Transcribe spoken questions and queries into text.
        VoiceSearch = 6,
        /// Transcribe voice commands, such as for controlling a device.
        VoiceCommand = 7,
        /// Transcribe speech to text to create a written document, such as a
        /// text-message, email or report.
        Dictation = 8,
    }
    /// Enumerates the types of capture settings describing an audio file.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum MicrophoneDistance {
        /// Audio type is not known.
        Unspecified = 0,
        /// The audio was captured from a closely placed microphone. Eg. phone,
        /// dictaphone, or handheld microphone. Generally if there speaker is within
        /// 1 meter of the microphone.
        Nearfield = 1,
        /// The speaker if within 3 meters of the microphone.
        Midfield = 2,
        /// The speaker is more than 3 meters away from the microphone.
        Farfield = 3,
    }
    /// The original media the speech was recorded on.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum OriginalMediaType {
        /// Unknown original media type.
        Unspecified = 0,
        /// The speech data is an audio recording.
        Audio = 1,
        /// The speech data originally recorded on a video.
        Video = 2,
    }
    /// The type of device the speech was recorded with.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum RecordingDeviceType {
        /// The recording device is unknown.
        Unspecified = 0,
        /// Speech was recorded on a smartphone.
        Smartphone = 1,
        /// Speech was recorded using a personal computer or tablet.
        Pc = 2,
        /// Speech was recorded over a phone line.
        PhoneLine = 3,
        /// Speech was recorded in a vehicle.
        Vehicle = 4,
        /// Speech was recorded outdoors.
        OtherOutdoorDevice = 5,
        /// Speech was recorded indoors.
        OtherIndoorDevice = 6,
    }
}
/// Provides "hints" to the speech recognizer to favor specific words and phrases
/// in the results.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SpeechContext {
    /// A list of strings containing words and phrases "hints" so that
    /// the speech recognition is more likely to recognize them. This can be used
    /// to improve the accuracy for specific words and phrases, for example, if
    /// specific commands are typically spoken by the user. This can also be used
    /// to add additional words to the vocabulary of the recognizer. See
    /// [usage limits](<https://cloud.google.com/speech-to-text/quotas#content>).
    ///
    /// List items can also be set to classes for groups of words that represent
    /// common concepts that occur in natural language. For example, rather than
    /// providing phrase hints for every month of the year, using the $MONTH class
    /// improves the likelihood of correctly transcribing audio that includes
    /// months.
    #[prost(string, repeated, tag = "1")]
    pub phrases: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
}
/// Contains audio data in the encoding specified in the `RecognitionConfig`.
/// Either `content` or `uri` must be supplied. Supplying both or neither
/// returns \[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT\]. See
/// [content limits](<https://cloud.google.com/speech-to-text/quotas#content>).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct RecognitionAudio {
    /// The audio source, which is either inline content or a Google Cloud
    /// Storage uri.
    #[prost(oneof = "recognition_audio::AudioSource", tags = "1, 2")]
    pub audio_source: ::core::option::Option<recognition_audio::AudioSource>,
}
/// Nested message and enum types in `RecognitionAudio`.
pub mod recognition_audio {
    /// The audio source, which is either inline content or a Google Cloud
    /// Storage uri.
    #[derive(Clone, PartialEq, ::prost::Oneof)]
    pub enum AudioSource {
        /// The audio data bytes encoded as specified in
        /// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
        /// pure binary representation, whereas JSON representations use base64.
        #[prost(bytes, tag = "1")]
        Content(::prost::alloc::vec::Vec<u8>),
        /// URI that points to a file that contains audio data bytes as specified in
        /// `RecognitionConfig`. The file must not be compressed (for example, gzip).
        /// Currently, only Google Cloud Storage URIs are
        /// supported, which must be specified in the following format:
        /// `gs://bucket_name/object_name` (other URI formats return
        /// \[google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT\]). For more information, see
        /// [Request URIs](<https://cloud.google.com/storage/docs/reference-uris>).
        #[prost(string, tag = "2")]
        Uri(::prost::alloc::string::String),
    }
}
/// The only message returned to the client by the `Recognize` method. It
/// contains the result as zero or more sequential `SpeechRecognitionResult`
/// messages.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct RecognizeResponse {
    /// Sequential list of transcription results corresponding to
    /// sequential portions of audio.
    #[prost(message, repeated, tag = "2")]
    pub results: ::prost::alloc::vec::Vec<SpeechRecognitionResult>,
    /// When available, billed audio seconds for the corresponding request.
    #[prost(message, optional, tag = "3")]
    pub total_billed_time: ::core::option::Option<::prost_types::Duration>,
}
/// The only message returned to the client by the `LongRunningRecognize` method.
/// It contains the result as zero or more sequential `SpeechRecognitionResult`
/// messages. It is included in the `result.response` field of the `Operation`
/// returned by the `GetOperation` call of the `google::longrunning::Operations`
/// service.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct LongRunningRecognizeResponse {
    /// Sequential list of transcription results corresponding to
    /// sequential portions of audio.
    #[prost(message, repeated, tag = "2")]
    pub results: ::prost::alloc::vec::Vec<SpeechRecognitionResult>,
    /// When available, billed audio seconds for the corresponding request.
    #[prost(message, optional, tag = "3")]
    pub total_billed_time: ::core::option::Option<::prost_types::Duration>,
}
/// Describes the progress of a long-running `LongRunningRecognize` call. It is
/// included in the `metadata` field of the `Operation` returned by the
/// `GetOperation` call of the `google::longrunning::Operations` service.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct LongRunningRecognizeMetadata {
    /// Approximate percentage of audio processed thus far. Guaranteed to be 100
    /// when the audio is fully processed and the results are available.
    #[prost(int32, tag = "1")]
    pub progress_percent: i32,
    /// Time when the request was received.
    #[prost(message, optional, tag = "2")]
    pub start_time: ::core::option::Option<::prost_types::Timestamp>,
    /// Time of the most recent processing update.
    #[prost(message, optional, tag = "3")]
    pub last_update_time: ::core::option::Option<::prost_types::Timestamp>,
    /// Output only. The URI of the audio file being transcribed. Empty if the audio was sent
    /// as byte content.
    #[prost(string, tag = "4")]
    pub uri: ::prost::alloc::string::String,
}
/// `StreamingRecognizeResponse` is the only message returned to the client by
/// `StreamingRecognize`. A series of zero or more `StreamingRecognizeResponse`
/// messages are streamed back to the client. If there is no recognizable
/// audio, and `single_utterance` is set to false, then no messages are streamed
/// back to the client.
///
/// Here's an example of a series of `StreamingRecognizeResponse`s that might be
/// returned while processing audio:
///
/// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
///
/// 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
///
/// 3. results { alternatives { transcript: "to be" } stability: 0.9 }
///    results { alternatives { transcript: " or not to be" } stability: 0.01 }
///
/// 4. results { alternatives { transcript: "to be or not to be"
///                             confidence: 0.92 }
///              alternatives { transcript: "to bee or not to bee" }
///              is_final: true }
///
/// 5. results { alternatives { transcript: " that's" } stability: 0.01 }
///
/// 6. results { alternatives { transcript: " that is" } stability: 0.9 }
///    results { alternatives { transcript: " the question" } stability: 0.01 }
///
/// 7. results { alternatives { transcript: " that is the question"
///                             confidence: 0.98 }
///              alternatives { transcript: " that was the question" }
///              is_final: true }
///
/// Notes:
///
/// - Only two of the above responses #4 and #7 contain final results; they are
///   indicated by `is_final: true`. Concatenating these together generates the
///   full transcript: "to be or not to be that is the question".
///
/// - The others contain interim `results`. #3 and #6 contain two interim
///   `results`: the first portion has a high stability and is less likely to
///   change; the second portion has a low stability and is very likely to
///   change. A UI designer might choose to show only high stability `results`.
///
/// - The specific `stability` and `confidence` values shown above are only for
///   illustrative purposes. Actual values may vary.
///
/// - In each response, only one of these fields will be set:
///     `error`,
///     `speech_event_type`, or
///     one or more (repeated) `results`.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StreamingRecognizeResponse {
    /// If set, returns a \[google.rpc.Status][google.rpc.Status\] message that
    /// specifies the error for the operation.
    #[prost(message, optional, tag = "1")]
    pub error: ::core::option::Option<super::super::super::rpc::Status>,
    /// This repeated list contains zero or more results that
    /// correspond to consecutive portions of the audio currently being processed.
    /// It contains zero or one `is_final=true` result (the newly settled portion),
    /// followed by zero or more `is_final=false` results (the interim results).
    #[prost(message, repeated, tag = "2")]
    pub results: ::prost::alloc::vec::Vec<StreamingRecognitionResult>,
    /// Indicates the type of speech event.
    #[prost(enumeration = "streaming_recognize_response::SpeechEventType", tag = "4")]
    pub speech_event_type: i32,
    /// When available, billed audio seconds for the stream.
    /// Set only if this is the last response in the stream.
    #[prost(message, optional, tag = "5")]
    pub total_billed_time: ::core::option::Option<::prost_types::Duration>,
}
/// Nested message and enum types in `StreamingRecognizeResponse`.
pub mod streaming_recognize_response {
    /// Indicates the type of speech event.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)]
    #[repr(i32)]
    pub enum SpeechEventType {
        /// No speech event specified.
        SpeechEventUnspecified = 0,
        /// This event indicates that the server has detected the end of the user's
        /// speech utterance and expects no additional speech. Therefore, the server
        /// will not process additional audio (although it may subsequently return
        /// additional results). The client should stop sending additional audio
        /// data, half-close the gRPC connection, and wait for any additional results
        /// until the server closes the gRPC connection. This event is only sent if
        /// `single_utterance` was set to `true`, and is not used otherwise.
        EndOfSingleUtterance = 1,
    }
}
/// A streaming speech recognition result corresponding to a portion of the audio
/// that is currently being processed.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct StreamingRecognitionResult {
    /// May contain one or more recognition hypotheses (up to the
    /// maximum specified in `max_alternatives`).
    /// These alternatives are ordered in terms of accuracy, with the top (first)
    /// alternative being the most probable, as ranked by the recognizer.
    #[prost(message, repeated, tag = "1")]
    pub alternatives: ::prost::alloc::vec::Vec<SpeechRecognitionAlternative>,
    /// If `false`, this `StreamingRecognitionResult` represents an
    /// interim result that may change. If `true`, this is the final time the
    /// speech service will return this particular `StreamingRecognitionResult`,
    /// the recognizer will not return any further hypotheses for this portion of
    /// the transcript and corresponding audio.
    #[prost(bool, tag = "2")]
    pub is_final: bool,
    /// An estimate of the likelihood that the recognizer will not
    /// change its guess about this interim result. Values range from 0.0
    /// (completely unstable) to 1.0 (completely stable).
    /// This field is only provided for interim results (`is_final=false`).
    /// The default of 0.0 is a sentinel value indicating `stability` was not set.
    #[prost(float, tag = "3")]
    pub stability: f32,
    /// Time offset of the end of this result relative to the
    /// beginning of the audio.
    #[prost(message, optional, tag = "4")]
    pub result_end_time: ::core::option::Option<::prost_types::Duration>,
    /// For multi-channel audio, this is the channel number corresponding to the
    /// recognized result for the audio from that channel.
    /// For audio_channel_count = N, its output values can range from '1' to 'N'.
    #[prost(int32, tag = "5")]
    pub channel_tag: i32,
    /// The \[BCP-47\](<https://www.rfc-editor.org/rfc/bcp/bcp47.txt>) language tag of
    /// the language in this result. This language code was detected to have the
    /// most likelihood of being spoken in the audio.
    #[prost(string, tag = "6")]
    pub language_code: ::prost::alloc::string::String,
}
/// A speech recognition result corresponding to a portion of the audio.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SpeechRecognitionResult {
    /// May contain one or more recognition hypotheses (up to the
    /// maximum specified in `max_alternatives`).
    /// These alternatives are ordered in terms of accuracy, with the top (first)
    /// alternative being the most probable, as ranked by the recognizer.
    #[prost(message, repeated, tag = "1")]
    pub alternatives: ::prost::alloc::vec::Vec<SpeechRecognitionAlternative>,
    /// For multi-channel audio, this is the channel number corresponding to the
    /// recognized result for the audio from that channel.
    /// For audio_channel_count = N, its output values can range from '1' to 'N'.
    #[prost(int32, tag = "2")]
    pub channel_tag: i32,
}
/// Alternative hypotheses (a.k.a. n-best list).
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct SpeechRecognitionAlternative {
    /// Transcript text representing the words that the user spoke.
    #[prost(string, tag = "1")]
    pub transcript: ::prost::alloc::string::String,
    /// The confidence estimate between 0.0 and 1.0. A higher number
    /// indicates an estimated greater likelihood that the recognized words are
    /// correct. This field is set only for the top alternative of a non-streaming
    /// result or, of a streaming result where `is_final=true`.
    /// This field is not guaranteed to be accurate and users should not rely on it
    /// to be always provided.
    /// The default of 0.0 is a sentinel value indicating `confidence` was not set.
    #[prost(float, tag = "2")]
    pub confidence: f32,
    /// A list of word-specific information for each recognized word.
    /// Note: When `enable_speaker_diarization` is true, you will see all the words
    /// from the beginning of the audio.
    #[prost(message, repeated, tag = "3")]
    pub words: ::prost::alloc::vec::Vec<WordInfo>,
}
/// Word-specific information for recognized words.
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct WordInfo {
    /// Time offset relative to the beginning of the audio,
    /// and corresponding to the start of the spoken word.
    /// This field is only set if `enable_word_time_offsets=true` and only
    /// in the top hypothesis.
    /// This is an experimental feature and the accuracy of the time offset can
    /// vary.
    #[prost(message, optional, tag = "1")]
    pub start_time: ::core::option::Option<::prost_types::Duration>,
    /// Time offset relative to the beginning of the audio,
    /// and corresponding to the end of the spoken word.
    /// This field is only set if `enable_word_time_offsets=true` and only
    /// in the top hypothesis.
    /// This is an experimental feature and the accuracy of the time offset can
    /// vary.
    #[prost(message, optional, tag = "2")]
    pub end_time: ::core::option::Option<::prost_types::Duration>,
    /// The word corresponding to this set of information.
    #[prost(string, tag = "3")]
    pub word: ::prost::alloc::string::String,
    /// Output only. A distinct integer value is assigned for every speaker within
    /// the audio. This field specifies which one of those speakers was detected to
    /// have spoken this word. Value ranges from '1' to diarization_speaker_count.
    /// speaker_tag is set if enable_speaker_diarization = 'true' and only in the
    /// top alternative.
    #[prost(int32, tag = "5")]
    pub speaker_tag: i32,
}
#[doc = r" Generated client implementations."]
pub mod speech_client {
    #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)]
    use tonic::codegen::*;
    #[doc = " Service that implements Google Cloud Speech API."]
    #[derive(Debug, Clone)]
    pub struct SpeechClient<T> {
        inner: tonic::client::Grpc<T>,
    }
    impl<T> SpeechClient<T>
    where
        T: tonic::client::GrpcService<tonic::body::BoxBody>,
        T::ResponseBody: Body + Send + 'static,
        T::Error: Into<StdError>,
        <T::ResponseBody as Body>::Error: Into<StdError> + Send,
    {
        pub fn new(inner: T) -> Self {
            let inner = tonic::client::Grpc::new(inner);
            Self { inner }
        }
        pub fn with_interceptor<F>(
            inner: T,
            interceptor: F,
        ) -> SpeechClient<InterceptedService<T, F>>
        where
            F: tonic::service::Interceptor,
            T: tonic::codegen::Service<
                http::Request<tonic::body::BoxBody>,
                Response = http::Response<
                    <T as tonic::client::GrpcService<tonic::body::BoxBody>>::ResponseBody,
                >,
            >,
            <T as tonic::codegen::Service<http::Request<tonic::body::BoxBody>>>::Error:
                Into<StdError> + Send + Sync,
        {
            SpeechClient::new(InterceptedService::new(inner, interceptor))
        }
        #[doc = r" Compress requests with `gzip`."]
        #[doc = r""]
        #[doc = r" This requires the server to support it otherwise it might respond with an"]
        #[doc = r" error."]
        pub fn send_gzip(mut self) -> Self {
            self.inner = self.inner.send_gzip();
            self
        }
        #[doc = r" Enable decompressing responses with `gzip`."]
        pub fn accept_gzip(mut self) -> Self {
            self.inner = self.inner.accept_gzip();
            self
        }
        #[doc = " Performs synchronous speech recognition: receive results after all audio"]
        #[doc = " has been sent and processed."]
        pub async fn recognize(
            &mut self,
            request: impl tonic::IntoRequest<super::RecognizeRequest>,
        ) -> Result<tonic::Response<super::RecognizeResponse>, tonic::Status> {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path =
                http::uri::PathAndQuery::from_static("/google.cloud.speech.v1.Speech/Recognize");
            self.inner.unary(request.into_request(), path, codec).await
        }
        #[doc = " Performs asynchronous speech recognition: receive results via the"]
        #[doc = " google.longrunning.Operations interface. Returns either an"]
        #[doc = " `Operation.error` or an `Operation.response` which contains"]
        #[doc = " a `LongRunningRecognizeResponse` message."]
        #[doc = " For more information on asynchronous speech recognition, see the"]
        #[doc = " [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize)."]
        pub async fn long_running_recognize(
            &mut self,
            request: impl tonic::IntoRequest<super::LongRunningRecognizeRequest>,
        ) -> Result<
            tonic::Response<super::super::super::super::longrunning::Operation>,
            tonic::Status,
        > {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path = http::uri::PathAndQuery::from_static(
                "/google.cloud.speech.v1.Speech/LongRunningRecognize",
            );
            self.inner.unary(request.into_request(), path, codec).await
        }
        #[doc = " Performs bidirectional streaming speech recognition: receive results while"]
        #[doc = " sending audio. This method is only available via the gRPC API (not REST)."]
        pub async fn streaming_recognize(
            &mut self,
            request: impl tonic::IntoStreamingRequest<Message = super::StreamingRecognizeRequest>,
        ) -> Result<
            tonic::Response<tonic::codec::Streaming<super::StreamingRecognizeResponse>>,
            tonic::Status,
        > {
            self.inner.ready().await.map_err(|e| {
                tonic::Status::new(
                    tonic::Code::Unknown,
                    format!("Service was not ready: {}", e.into()),
                )
            })?;
            let codec = tonic::codec::ProstCodec::default();
            let path = http::uri::PathAndQuery::from_static(
                "/google.cloud.speech.v1.Speech/StreamingRecognize",
            );
            self.inner.streaming(request.into_streaming_request(), path, codec).await
        }
    }
}