# gigastt 1.0.0 - Docs.rs
---

# OpenAPI Specification version this document is written against.
openapi: "3.1.0"

# API metadata: title, document version, long-form Markdown overview,
# license and contact.
info:
  title: gigastt REST API
  # NOTE(review): the response examples elsewhere in this file show server
  # version "0.9.5" while this field says "1.0" — confirm both are intended.
  version: "1.0"
  # Markdown overview kept as a literal block scalar so that line breaks,
  # the fenced JSON sample and the status-code table are preserved verbatim.
  description: |
    Offline and streaming speech-to-text REST API powered by GigaAM v3 e2e_rnnt.
    On-device Russian speech recognition via ONNX Runtime.

    ## Endpoints

    - **Health & discovery**: `GET /health`, `GET /v1/models`
    - **File transcription**: `POST /v1/transcribe` — upload audio, receive full transcript
    - **Streaming transcription**: `POST /v1/transcribe/stream` — upload audio, receive SSE stream of partial/final segments
    - **Metrics**: `GET /metrics` — Prometheus text format (optional, behind `--metrics` flag)

    ## Audio Formats

    Supported upload formats: WAV, MP3, M4A/AAC, OGG/Vorbis, FLAC.
    All formats are decoded server-side via [symphonia](https://github.com/pdeljanov/Symphonia)
    and resampled to 16 kHz mono.

    ## Limits

    All defaults are configurable via CLI flags or environment variables
    (see `gigastt serve --help`):

    - `--body-limit-bytes` (default 50 MiB) — max upload size
    - `--pool-size` (default 4) — concurrent inference sessions
    - `--rate-limit-per-minute` (default 0 = off) — per-IP token bucket
    - File transcription cap: 10 minutes of audio
    - Pool saturation: REST returns `503 Service Unavailable` + `Retry-After: 30`

    ## Error Responses

    All error responses follow a consistent JSON schema:

    ```json
    {
      "error": "Human-readable description",
      "code": "machine_readable_code"
    }
    ```

    Common HTTP status codes:

    | Status | Code | Meaning |
    |--------|------|---------|
    | 400 | `empty_body` | Request body is empty |
    | 413 | `payload_too_large` | Body exceeds `--body-limit-bytes` |
    | 422 | `transcription_error` | Audio decoded but inference failed |
    | 422 | `invalid_audio` | Audio could not be decoded |
    | 503 | `timeout` | Pool saturated; retry after `Retry-After` |
    | 503 | `pool_closed` | Server shutting down |
    | 404 | `metrics_disabled` | `/metrics` called without `--metrics` flag |
  license:
    name: MIT
    url: https://github.com/ekhodzitsky/gigastt/blob/main/LICENSE
  contact:
    name: gigastt
    url: https://github.com/ekhodzitsky/gigastt

# Base URLs offered to API clients; both entries use port 9876.
servers:
  - description: Local development server (default)
    url: 'http://127.0.0.1:9876'
  # NOTE(review): 0.0.0.0 is normally a listen address rather than a connect
  # target — confirm this entry is meant to be used by clients as-is.
  - description: Docker container
    url: 'http://0.0.0.0:9876'

paths:
  # Liveness probe: reports status plus the loaded model and server version.
  /health:
    get:
      operationId: healthCheck
      summary: Health check
      description: |
        Returns service health status. Suitable for load balancer health checks
        and Docker `HEALTHCHECK`.
      responses:
        '200':
          description: Service is healthy
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HealthResponse'
              example: {status: ok, model: gigaam-v3-e2e-rnnt, version: '0.9.5'}

  # Discovery endpoint: model metadata, pool status and accepted formats.
  /v1/models:
    get:
      operationId: getModelInfo
      summary: Model information
      description: |
        Returns metadata about the loaded model, including capabilities,
        pool status, and supported audio formats.
      responses:
        '200':
          description: Model metadata
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ModelInfo'
              example:
                id: gigaam-v3-e2e-rnnt
                name: GigaAM v3 RNN-T
                version: '0.9.5'
                encoder: int8
                vocab_size: 1025
                sample_rate: 16000
                pool_size: 4
                pool_available: 3
                supported_formats: [wav, mp3, m4a, ogg, flac]
                supported_rates: [8000, 16000, 24000, 44100, 48000]
                diarization: false

  # POST /v1/transcribe — one-shot transcription of an uploaded audio file.
  # The request body is the audio file itself; the response is a single JSON
  # document with the transcript, per-word timings and the audio duration.
  /v1/transcribe:
    post:
      summary: Transcribe audio file
      description: |
        Upload an audio file and receive a complete transcription.
        Supports WAV, MP3, M4A/AAC, OGG/Vorbis, and FLAC.

        The entire file is processed in a single inference session.
        Max audio duration: 10 minutes.
      operationId: transcribeFile
      requestBody:
        required: true
        # One media type per supported container, each carrying raw bytes.
        content:
          audio/wav:
            schema:
              type: string
              format: binary
          audio/mpeg:
            schema:
              type: string
              format: binary
          audio/mp4:
            schema:
              type: string
              format: binary
          audio/ogg:
            schema:
              type: string
              format: binary
          audio/flac:
            schema:
              type: string
              format: binary
          application/octet-stream:
            schema:
              type: string
              format: binary
              # `description` is not a Media Type Object field, so the note
              # lives on the schema where it is valid.
              description: Raw audio bytes (format auto-detected)
      responses:
        "200":
          description: Transcription successful
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/TranscribeResponse"
              example:
                text: "привет как дела"
                words:
                  - word: "привет"
                    start: 0.5
                    end: 0.8
                  - word: "как"
                    start: 0.9
                    end: 1.0
                  - word: "дела"
                    start: 1.1
                    end: 1.4
                duration: 2.5
        # Error responses share the ErrorResponse body defined in components.
        "400":
          $ref: "#/components/responses/BadRequest"
        "413":
          $ref: "#/components/responses/PayloadTooLarge"
        "422":
          $ref: "#/components/responses/TranscriptionError"
        "503":
          $ref: "#/components/responses/ServiceUnavailable"

  # POST /v1/transcribe/stream — accepts the same upload media types as
  # /v1/transcribe, but delivers results as a Server-Sent Events stream.
  /v1/transcribe/stream:
    post:
      operationId: transcribeStream
      summary: Transcribe audio file (SSE streaming)
      description: |
        Upload an audio file and receive a stream of transcription results
        via Server-Sent Events (SSE). Results are emitted as partial and final
        segments as the audio is processed chunk-by-chunk.

        Each SSE event contains a JSON object with `type` discriminator:
        - `partial` — interim result (may change)
        - `final` — finalized utterance
        - `error` — processing error
      requestBody:
        required: true
        # Every accepted media type carries the raw file bytes.
        content:
          audio/wav:
            schema: {type: string, format: binary}
          audio/mpeg:
            schema: {type: string, format: binary}
          audio/mp4:
            schema: {type: string, format: binary}
          audio/ogg:
            schema: {type: string, format: binary}
          audio/flac:
            schema: {type: string, format: binary}
          application/octet-stream:
            schema: {type: string, format: binary}
      responses:
        '200':
          description: SSE stream of transcription segments
          content:
            text/event-stream:
              schema: {type: string}
              # Two events separated by a blank line: one partial, one final.
              example: |
                data: {"type": "partial", "text": "привет", "timestamp": 1712700000.123, "words": [{"word": "привет", "start": 0.5, "end": 0.8}]}

                data: {"type": "final", "text": "привет как дела", "timestamp": 1712700001.456, "words": [{"word": "привет", "start": 0.5, "end": 0.8}, {"word": "как", "start": 0.9, "end": 1.0}, {"word": "дела", "start": 1.1, "end": 1.4}]}
        '400':
          $ref: '#/components/responses/BadRequest'
        '413':
          $ref: '#/components/responses/PayloadTooLarge'
        '503':
          $ref: '#/components/responses/ServiceUnavailable'

  # Prometheus scrape target; responds 404 when metrics are not enabled.
  /metrics:
    get:
      operationId: getMetrics
      summary: Prometheus metrics
      description: |
        Exposes Prometheus-compatible metrics in text format.
        Only available when the server was started with `--metrics`.
      responses:
        '200':
          description: Prometheus metrics
          content:
            text/plain:
              schema: {type: string}
              example: |
                # HELP gigastt_http_requests_total Total HTTP requests
                # TYPE gigastt_http_requests_total counter
                gigastt_http_requests_total{method="POST",path="/v1/transcribe",status="200"} 42
        '404':
          $ref: '#/components/responses/MetricsDisabled'

components:
  schemas:
    # Body returned by GET /health; all three fields are required.
    HealthResponse:
      type: object
      required: [status, model, version]
      properties:
        status:
          description: Service status
          type: string
          example: ok
        model:
          description: Loaded model identifier
          type: string
          example: gigaam-v3-e2e-rnnt
        version:
          description: Server version (semver)
          type: string
          example: '0.9.5'

    # Body returned by GET /v1/models; every property listed is required.
    ModelInfo:
      type: object
      required: [id, name, version, encoder, vocab_size, sample_rate, pool_size,
                 pool_available, supported_formats, supported_rates, diarization]
      properties:
        id:
          type: string
          example: gigaam-v3-e2e-rnnt
        name:
          type: string
          example: GigaAM v3 RNN-T
        version:
          type: string
          example: '0.9.5'
        encoder:
          description: Encoder quantization type
          type: string
          enum:
            - fp32
            - int8
        vocab_size:
          type: integer
          example: 1025
        sample_rate:
          description: Internal processing sample rate in Hz
          type: integer
          example: 16000
        pool_size:
          description: Total inference session pool size
          type: integer
          example: 4
        pool_available:
          description: Currently available sessions
          type: integer
          example: 3
        supported_formats:
          description: Supported audio upload formats
          type: array
          items: {type: string}
          example:
            - wav
            - mp3
            - m4a
            - ogg
            - flac
        supported_rates:
          description: Supported input sample rates for WebSocket streaming
          type: array
          items: {type: integer}
          example:
            - 8000
            - 16000
            - 24000
            - 44100
            - 48000
        diarization:
          description: Whether speaker diarization is available
          type: boolean
          example: false

    # Success body of POST /v1/transcribe: transcript, word timings, duration.
    TranscribeResponse:
      type: object
      required: [text, words, duration]
      properties:
        text:
          description: Full transcript text
          type: string
          example: 'привет как дела'
        words:
          description: Word-level transcription with timestamps
          type: array
          items:
            $ref: '#/components/schemas/WordInfo'
        duration:
          description: Audio duration in seconds
          type: number
          format: double
          example: 2.5

    # One recognized word with its start and end times in seconds.
    WordInfo:
      type: object
      required: [word, start, end]
      properties:
        word:
          description: Recognized word
          type: string
          example: 'привет'
        start:
          description: Word start time in seconds
          type: number
          format: double
          example: 0.5
        end:
          description: Word end time in seconds
          type: number
          format: double
          example: 0.8

    # Shared error body: `error` and `code` are required, `retry_after_ms`
    # is optional.
    ErrorResponse:
      type: object
      required: [error, code]
      properties:
        error:
          description: Human-readable error description
          type: string
          example: 'Request body exceeds the configured size limit'
        code:
          description: Machine-readable error code
          type: string
          example: payload_too_large
        retry_after_ms:
          description: |
            Suggested retry delay in milliseconds. Present only for transient
            backpressure errors (pool saturation).
          type: integer
          example: 30000

  responses:
    # 400 — reusable response; the example shows the empty-body case.
    BadRequest:
      description: Bad request
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example: {error: 'Empty request body', code: empty_body}

    # 413 — reusable response for uploads over the configured size limit.
    PayloadTooLarge:
      description: Request body too large
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example:
            code: payload_too_large
            error: 'Request body exceeds the configured size limit'

    # 422 — the upload was received but no transcript could be produced.
    # The status-code table in `info.description` lists two codes for this
    # status, so both are shown as named examples.
    TranscriptionError:
      description: Transcription failed
      content:
        application/json:
          schema:
            $ref: "#/components/schemas/ErrorResponse"
          examples:
            transcription_error:
              summary: Audio decoded but inference failed
              value:
                error: "Transcription failed. Check audio format."
                code: transcription_error
            invalid_audio:
              summary: Audio could not be decoded
              value:
                # NOTE(review): message text is illustrative — align it with
                # the string the server actually emits.
                error: "Audio could not be decoded"
                code: invalid_audio

    # 503 — the server is busy (pool saturated) or shutting down.
    ServiceUnavailable:
      description: Server busy or shutting down
      headers:
        Retry-After:
          description: Seconds to wait before retry
          schema: {type: integer}
          example: 30
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          # One named example per error code returned with this status.
          examples:
            timeout:
              summary: Pool saturated
              value:
                code: timeout
                error: 'Server busy, try again later'
                retry_after_ms: 30000
            pool_closed:
              summary: Graceful shutdown in progress
              value:
                code: pool_closed
                error: 'Server is shutting down'

    # 404 — reusable response for /metrics when the endpoint is disabled.
    MetricsDisabled:
      description: Metrics endpoint is disabled
      content:
        application/json:
          schema:
            $ref: '#/components/schemas/ErrorResponse'
          example: {error: 'metrics endpoint disabled', code: metrics_disabled}