omniparse 0.4.0

A Rust toolkit for detecting and extracting metadata, text, and content from various file formats
Documentation
# syntax=docker/dockerfile:1.7
#
# Multi-stage build for the omniparse Axum web service. Built with every
# capability flag enabled so the published image is a single drop-in for
# all real-world inputs:
#
#   Feature        Effect
#   -----------    ---------------------------------------------------------
#   pdf            Strict-tier PDF parser (lopdf) + lenient raw_scan
#                  fallback (FlateDecode / LZWDecode / ASCII85Decode).
#                  Default-on; bundled via the implicit default set.
#   pdf-extract    Fourth-tier PDF parser (pdf-extract crate) — handles
#                  linearized PDFs and Identity-H + /ToUnicode CMaps that
#                  lopdf rejects (Lucidchart exports, Word/PowerPoint
#                  print-to-PDF, browser print-to-PDF).
#   ocr            Classical pure-Rust OCR pipeline (default off; pulled
#                  in transitively by ocr-ml and ocr-parallel).
#   ocr-ml         ML OCR backend via ocrs + rten with auto-downloaded,
#                  SHA-256-verified models at /opt/omniparse/models.
#   ocr-train      Train custom glyph prototypes from a TTF/OTF font for
#                  the classical OCR pipeline.
#   ocr-parallel   Per-region OCR recognition parallelized via rayon.
#                  Implies `ocr` + `parallel`.
#   async          tokio-backed async extraction (extract_from_path_async).
#   parallel       Rayon-backed batch processing (process_files_parallel).
#   markdown, svg, webp, epub, mp3
#                  Format parsers bundled by default.
#
# Ship the production `web_service_prod` example binary (the entrypoint), the
# minimal `web_service` demo, and the `omniparse` CLI, plus the pre-downloaded,
# SHA-256-verified rten models at `/opt/omniparse/models`.
# Override the model directory at runtime with
# `-e OMNIPARSE_OCR_MODELS=/path -v host/models:/path`.
#
# Build: docker build -t omniparse-web:dev .
# Run:   docker run --rm -p 8080:8080 omniparse-web:dev
#
# Cloud Run: the image listens on $PORT (default 8080) and emits structured
# Cloud Logging JSON automatically when K_SERVICE is set. See
# deploy/cloud-run/deploy.sh for a one-shot deploy script.

# ---------- cargo-chef base ----------
# Shared toolchain image for the planner and builder stages. cargo-chef
# splits dependency compilation from source compilation, so an edit that only
# touches code reuses the cached dependency layers instead of re-pulling and
# recompiling the whole dependency graph.
FROM rust:1-slim-bookworm AS chef
WORKDIR /src

# Build-time system packages. The apt lists are removed in the same layer so
# they never persist in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        ca-certificates \
        libssl-dev \
        pkg-config \
 && rm -rf /var/lib/apt/lists/*

# Install cargo-chef itself, constrained to the 0.1.x series.
RUN cargo install cargo-chef --locked --version '^0.1'

# ---------- recipe planner ----------
# Copy the build context in and have cargo-chef write recipe.json, the
# dependency description that the builder stage hands to `cargo chef cook`.
FROM chef AS planner
COPY . ./
RUN cargo chef prepare --recipe-path=recipe.json

# ---------- dependency build + binary build ----------
FROM chef AS builder

# Cargo feature set compiled into every artifact. It is declared once so the
# dependency pre-build (`cargo chef cook`) and the real build cannot drift
# apart; differing feature sets would defeat the cached dependency layer.
# Override with `--build-arg OMNIPARSE_FEATURES="..."`.
ARG OMNIPARSE_FEATURES="ocr-ml ocr-train ocr-parallel pdf-extract async parallel"

# Compile only the dependency graph described by the recipe, so this layer is
# reused as long as recipe.json and the feature set are unchanged.
# NOTE(review): if the examples rely on dev-dependencies, confirm whether
# `cargo chef cook` needs `--examples` to pre-build them as well.
COPY --from=planner /src/recipe.json recipe.json
RUN cargo chef cook --release --features "${OMNIPARSE_FEATURES}" --recipe-path recipe.json

# Bring in the real sources, then build the CLI and both web-service examples
# in a single cargo invocation.
COPY . .
RUN cargo build --release --features "${OMNIPARSE_FEATURES}" \
        --bin omniparse \
        --example web_service \
        --example web_service_prod

# ---------- model fetch + verify ----------
# Dedicated stage: the omniparse CLI built in the builder stage downloads and
# verifies the models here, and the runtime stage copies only the resulting
# model directory. Docker re-runs the download step whenever the CLI binary
# copied from the builder stage changes.
FROM debian:bookworm-slim AS models
ENV OMNIPARSE_OCR_MODELS=/opt/omniparse/models
# CA certificates are installed ahead of the download step (presumably the
# models are fetched over HTTPS; confirm against the CLI).
RUN apt-get update \
 && apt-get install -y --no-install-recommends ca-certificates \
 && rm -rf /var/lib/apt/lists/*
COPY --from=builder /src/target/release/omniparse /usr/local/bin/omniparse
RUN omniparse models download \
 && omniparse models verify

# ---------- runtime ----------
# Minimal distroless image. The base is tagged explicitly, because an untagged
# reference silently resolves to `latest`. The `nonroot` variant is used since
# the container runs as the unprivileged 65532 user set via USER below.
FROM gcr.io/distroless/cc-debian12:nonroot AS runtime
# Default model location, OCR backend selection, and listen port.
ENV OMNIPARSE_OCR_MODELS=/opt/omniparse/models \
    OMNIPARSE_OCR=ml \
    PORT=8080
# Models downloaded and verified in the `models` stage.
COPY --from=models  /opt/omniparse/models                          /opt/omniparse/models
# Ship the prod service, the minimal demo service, and the CLI. ENTRYPOINT
# picks the prod one; ops can override with
# `--entrypoint /usr/local/bin/web_service` for the minimal demo.
COPY --from=builder /src/target/release/examples/web_service       /usr/local/bin/web_service
COPY --from=builder /src/target/release/examples/web_service_prod  /usr/local/bin/web_service_prod
COPY --from=builder /src/target/release/omniparse                  /usr/local/bin/omniparse
# EXPOSE is documentation only; it matches the PORT default above.
EXPOSE 8080
# Numeric UID:GID so orchestrators can verify the container is not root.
USER 65532:65532
ENTRYPOINT ["/usr/local/bin/web_service_prod"]