// DO NOT CHANGE THIS FILE!
// This proto is copied from
// http://google3/speech/soda/chrome/extended_soda_api.proto. That is the source
// of truth, and any changes should be submitted and approved there before being
// copied into here.
syntax = "proto2";
package speech.soda.api;
// Optimize generated output for Lite, since it's going to be running on
// end-user devices.
option optimize_for = LITE_RUNTIME;
option java_multiple_files = true;
// Next ID to use: 12
// Top-level configuration passed to SODA when a recognition session is set
// up. Field numbers and defaults are fixed by the upstream source of truth.
message SerializedSodaConfigMsg {
// Number of channels in RAW audio that will be provided to SODA.
optional int32 channel_count = 1;
// Sample rate of the provided audio, in Hz.
optional int32 sample_rate = 2;
// Maximum size of buffer to use in PipeStream, in bytes. By default, is 0,
// which means unlimited.
optional int32 max_buffer_bytes = 4 [default = 0];
// If set to true, forces the audio provider to simulate realtime audio
// provision. This only makes sense during testing, to simulate realtime audio
// providing from a big chunk of audio.
// This slows down audio provided to SODA to a maximum of real-time, which
// means more accurate endpointer behavior, but is unsuitable for execution in
// real production environments. Set with caution!
optional bool simulate_realtime_testonly = 5 [default = false];
// Config file location for the language pack. Deprecated; presumably
// superseded by language_pack_directory (field 7) -- confirm upstream.
optional string config_file_location = 3 [deprecated = true];
// API key used for call verification.
optional string api_key = 6;
// Directory of the language pack to use.
optional string language_pack_directory = 7;
// Which recognition use case SODA should run for this session.
enum RecognitionMode {
// Recognition mode was not specified.
UNKNOWN = 0;
// Intended for voice input for keyboard usage.
IME = 1;
// Intended to caption a stream of audio.
CAPTION = 2;
}
// What kind of recognition to execute here. Impacts model usage.
// Defaults to IME when unset.
optional RecognitionMode recognition_mode = 8 [default = IME];
// Whether terse_processor should force a new session after every final
// recognition result.
// This will cause the terse processor to stop processing new audio once an
// endpoint event is detected and wait for it to generate a final event using
// audio up to the endpoint. This will cause processing bursts when a new
// session starts.
optional bool reset_on_final_result = 9 [default = true];
// Whether to populate the timing_metrics field on Recognition and Endpoint
// events.
optional bool include_timing_metrics = 10 [default = true];
// Whether or not to request language identification (lang id) events.
optional bool enable_lang_id = 11 [default = false];
}
// Next id: 5
// Timing information attached to recognition and endpoint events. All fields
// are in microseconds.
message TimingMetrics {
// Epoch time of first audio buffer of main query that is fed into ASR,
// in microseconds.
// This is the wall time read from the system clock when the first audio
// buffer is received by the terse processor.
optional int64 audio_start_epoch_usec = 1;
// Start time in audio time from start of SODA session, in microseconds.
// This time measures the amount of audio input into SODA.
optional int64 audio_start_time_usec = 2;
// Elapsed wall time, in microseconds, since the first frame.
optional int64 elapsed_wall_time_usec = 3;
// Elapsed processed audio, in microseconds, from first frame after preamble.
optional int64 event_end_time_usec = 4;
}
// Next id: 5
// A single recognition result (partial, final, or prefetch) produced by SODA.
message SodaRecognitionResult {
// Hypotheses from recognition, in order of probability. We don't get the
// probability from SODA, so the only guarantee is that the first is the
// "best".
repeated string hypothesis = 1;
// Classifies how complete/committed this result is.
enum ResultType {
// Result type was not specified.
UNKNOWN = 0;
// Partial result of a speech segment so far; may be revised later.
PARTIAL = 1;
// Final result for this segment; will not change.
FINAL = 2;
// Prefetch is only sent for likely query strings. This won't happen for
// non-query mode SODA, but we add here for completeness.
PREFETCH = 3;
}
// What kind of result set this is.
optional ResultType result_type = 2;
// Reason a FINAL result was emitted.
enum FinalResultEndpointReason {
// Reason was not specified.
ENDPOINT_UNKNOWN = 0;
// End of speech from endpointer.
ENDPOINT_END_OF_SPEECH = 1;
// End of utterance from endpointer.
ENDPOINT_END_OF_UTTERANCE = 2;
// No more audio.
ENDPOINT_END_OF_AUDIO = 3;
// Final was generated because a hotword was detected.
ENDPOINT_ASR_RESET_BY_HOTWORD = 4;
// ASR was reset via the external API.
ENDPOINT_ASR_RESET_EXTERNAL = 5;
// Final recognition result was generated due to an error in ASR.
ENDPOINT_ASR_ERROR = 6;
}
// If this is a final result, why was the recognition marked final.
optional FinalResultEndpointReason endpoint_reason = 3;
// Timing information for the event. Only populated when
// SerializedSodaConfigMsg.include_timing_metrics is true.
optional TimingMetrics timing_metrics = 4;
}
// Next id: 3
// Event emitted when the endpointer detects a speech/audio boundary.
message SodaEndpointEvent {
// What endpoint type we're referring to here.
// NOTE(review): unusually for proto style, the zero value here is
// START_OF_SPEECH and UNKNOWN is 4; the numbering is fixed by the upstream
// source of truth and must not be changed.
enum EndpointType {
// A start-of-speech moment has been detected at this time. Audio currently
// contains speech.
START_OF_SPEECH = 0;
// End of speech has been detected by the endpointer, audio does not contain
// speech right now.
END_OF_SPEECH = 1;
// End of Audio due to an end-of-mic data event.
END_OF_AUDIO = 2;
// End of Utterance detected from the endpointer. Not used in
// Caption/Transcription.
END_OF_UTTERANCE = 3;
// Endpoint type was not specified.
UNKNOWN = 4;
}
// The boundary kind this event reports. Defaults to UNKNOWN when unset.
optional EndpointType endpoint_type = 1 [default = UNKNOWN];
// Timing information for the event. Only populated when
// SerializedSodaConfigMsg.include_timing_metrics is true.
optional TimingMetrics timing_metrics = 2;
}
// Periodic audio-level information reported during a SODA session.
message SodaAudioLevelInfo {
// Low-pass filtered RMS of the input audio, in range 0..1.
optional float rms = 1;
// Speech likelihood score, in range 0..1.
optional float audio_level = 2;
// Amount of audio, in microseconds, seen from start of SODA session until
// an audio level event.
// This value is only set when audio_level is set.
optional int64 audio_time_usec = 3;
}
// Language identification event emitted when lang id is enabled (see
// SerializedSodaConfigMsg.enable_lang_id).
message SodaLangIdEvent {
// Detected locale, e.g. "en-us" or "af-za".
optional string language = 1;
// Equal to the internal enum from langid confidence; the enum definition
// lives upstream and is not mirrored here.
optional int32 confidence_level = 2;
}
// Envelope message for all events SODA sends back to the client. Exactly one
// of the payload fields below is expected to be set, selected by soda_type.
message SodaResponse {
// Discriminator for which payload field (if any) is populated.
enum SodaMessageType {
// Message type was not specified.
UNKNOWN = 0;
// recognition_result is populated.
RECOGNITION = 1;
// Recognition has stopped; no payload field.
STOP = 2;
// SODA has shut down; no payload field.
SHUTDOWN = 3;
// SODA has started; no payload field.
START = 4;
// endpoint_event is populated.
ENDPOINT = 5;
// audio_level_info is populated.
AUDIO_LEVEL = 6;
// langid_event is populated.
LANGID = 7;
}
// The kind of event this response carries. Defaults to UNKNOWN when unset.
optional SodaMessageType soda_type = 1 [default = UNKNOWN];
// Set when type is RECOGNITION.
optional SodaRecognitionResult recognition_result = 2;
// Set when type is ENDPOINT.
optional SodaEndpointEvent endpoint_event = 3;
// Set when type is AUDIO_LEVEL.
optional SodaAudioLevelInfo audio_level_info = 4;
// Set when type is LANGID.
optional SodaLangIdEvent langid_event = 5;
}