# codescout 0.14.0

# Compose project name for this stack.
name: codescout-retrieval

# Three-profile retrieval stack: cpu / gpu / amd.
#
#   docker compose --profile cpu up -d     # CPU-only — any machine
#   docker compose --profile gpu up -d     # NVIDIA CUDA
#   docker compose --profile amd up -d     # AMD ROCm (RX 7xxx / MI series)
#
# Dense embedder is llama-server (gguf) on all three profiles — CodeRankEmbed-Q4_K_M
# scored highest on the legacy-natural bench (30/60) and runs ≤1GB VRAM.
# Sparse is splade-pp via TEI. CPU profile uses TEI's CPU image; GPU profile uses
# TEI's CUDA image; AMD profile builds TEI from source against rocm/pytorch:latest
# (see docker/sparse-amd/Dockerfile) — HF does not publish a prebuilt TEI ROCm image.
# Rerank is bge-reranker-base on CPU (smaller, ~250ms p95) and bge-reranker-v2-m3
# on GPU/AMD profiles (full quality, ~80ms p95).
#
# First run: place CodeRankEmbed-Q4_K_M.gguf and (for the amd profile only)
# bge-reranker-v2-m3-Q4_K_M.gguf in ${CODESCOUT_MODEL_DIR:-./models}. The gpu
# profile's reranker runs on TEI via --model-id and does not need the gguf file.
# Pull them from:
#   https://huggingface.co/nomic-ai/CodeRankEmbed-GGUF (Q4_K_M ≈ 90MB)
#   https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF (Q4_K_M ≈ 419MB)
#
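# Image tags below are what we tested. Most images are already pinned by sha256
# digest; the rocm/llama.cpp images used by the amd profile are still tag-only.
# Replace those tags with sha256 digests before treating that profile as
# production-stable.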

services:
  qdrant:
    # Vector store. It declares no `profiles:` key, so it is not tied to any
    # single profile.
    container_name: codescout-qdrant
    image: qdrant/qdrant:v1.17.0@sha256:f1c7272cdac52b38c1a0e89313922d940ba50afd90d593a1605dbbc214e66ffb
    restart: unless-stopped
    networks:
      - retrieval_net
    environment:
      QDRANT__LOG_LEVEL: INFO
    volumes:
      # Named volume keeps the storage directory across container recreation.
      - qdrant_storage:/qdrant/storage
    ports:
      # Both ports are published on the host loopback interface only.
      - "127.0.0.1:6333:6333"
      - "127.0.0.1:6334:6334"
    healthcheck:
      # Healthy once a TCP connection to port 6333 can be opened from inside
      # the container (bash /dev/tcp redirection, no HTTP client needed).
      test:
        - CMD-SHELL
        - "bash -c '</dev/tcp/127.0.0.1/6333'"
      interval: 10s
      timeout: 3s
      retries: 5

  # ------------------------------------------------------------------ DENSE
  # llama-server serving CodeRankEmbed Q4_K_M (137M params, dim 768).
  # The query prefix "Represent this query for searching relevant code: " is
  # required — see CODESCOUT_QUERY_PREFIX in .env.{cpu,gpu}.
  dense-cpu:
    # Dense embedder for the cpu profile: llama-server loading the
    # CodeRankEmbed Q4_K_M gguf from the mounted model directory.
    profiles:
      - cpu
    container_name: codescout-dense-cpu
    image: ghcr.io/ggml-org/llama.cpp:server@sha256:6b0a9b4fd7e3a9a55e959e5a74d47e11f8ccd4dfbc2556b7382a6516255dcc73
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Read-only model mount; the host path is overridable via CODESCOUT_MODEL_DIR.
      - ${CODESCOUT_MODEL_DIR:-./models}:/models:ro
    ports:
      # Host loopback 48081 -> container 8080.
      - "127.0.0.1:48081:8080"
    command:
      # Model file inside the /models mount.
      - --model
      - /models/CodeRankEmbed-Q4_K_M.gguf
      # Listen address and port inside the container.
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      # Embedding mode with mean pooling.
      - --embedding
      - --pooling
      - mean
      # Context, batch and concurrency sizing for the CPU profile.
      - --ctx-size
      - "8192"
      - --batch-size
      - "4096"
      - --ubatch-size
      - "4096"
      - --parallel
      - "8"
      - --threads
      - "4"
    healthcheck:
      # Polls the server's /health endpoint from inside the container.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1:8080/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 30s

  dense-gpu:
    # Dense embedder for the gpu profile: the CUDA build of llama-server
    # loading the same CodeRankEmbed Q4_K_M gguf as the other profiles.
    profiles:
      - gpu
    container_name: codescout-dense-gpu
    image: ghcr.io/ggml-org/llama.cpp:server-cuda@sha256:a04923d31b4ca0d95bd772a4b80c9112f29121014df64d3d80a16a136ca19672
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Read-only model mount; the host path is overridable via CODESCOUT_MODEL_DIR.
      - ${CODESCOUT_MODEL_DIR:-./models}:/models:ro
    ports:
      # Host loopback 48081 -> container 8080.
      - "127.0.0.1:48081:8080"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
    command:
      # Model file inside the /models mount.
      - --model
      - /models/CodeRankEmbed-Q4_K_M.gguf
      # Listen address and port inside the container.
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      # Embedding mode with mean pooling.
      - --embedding
      - --pooling
      - mean
      # Context, batch and concurrency sizing for the GPU profile.
      - --ctx-size
      - "65536"
      - --batch-size
      - "4096"
      - --ubatch-size
      - "4096"
      - --parallel
      - "16"
      # GPU offload and flash attention settings.
      - --n-gpu-layers
      - "999"
      - --flash-attn
      - "on"
    healthcheck:
      # Polls the server's /health endpoint from inside the container.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1:8080/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 60s

  dense-amd:
    # Dense embedder for the amd profile: the ROCm build of llama-server
    # loading the same CodeRankEmbed Q4_K_M gguf as the other profiles.
    profiles:
      - amd
    container_name: codescout-dense-amd
    # NOTE(review): this image is referenced by tag only (no sha256 digest).
    image: rocm/llama.cpp:llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Read-only model mount; the host path is overridable via CODESCOUT_MODEL_DIR.
      - ${CODESCOUT_MODEL_DIR:-./models}:/models:ro
    ports:
      # Host loopback 48081 -> container 8080.
      - "127.0.0.1:48081:8080"
    # Device nodes passed through for GPU access, plus the groups that own them.
    devices:
      - /dev/kfd
      - /dev/dri
    group_add:
      - video
      - render
    command:
      # Model file inside the /models mount.
      - --model
      - /models/CodeRankEmbed-Q4_K_M.gguf
      # Listen address and port inside the container.
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      # Embedding mode with mean pooling.
      - --embedding
      - --pooling
      - mean
      # Context, batch and concurrency sizing (same values as dense-gpu).
      - --ctx-size
      - "65536"
      - --batch-size
      - "4096"
      - --ubatch-size
      - "4096"
      - --parallel
      - "16"
      # GPU offload and flash attention settings.
      - --n-gpu-layers
      - "999"
      - --flash-attn
      - "on"
    healthcheck:
      # Polls the server's /health endpoint from inside the container.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1:8080/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 60s

  # ------------------------------------------------------------------ SPARSE
  # Splade_PP_en_v1 on TEI. Identical model across profiles; CPU latency
  # ~80ms is acceptable for the suite.
  sparse-cpu:
    # Sparse encoder for the cpu profile: Splade_PP_en_v1 on TEI's CPU image.
    profiles:
      - cpu
    container_name: codescout-sparse-cpu
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6@sha256:66db77d7856c9319bbfaf2c5b80a6d0e0ac9ff128ade09eaca1d9c20213617a4
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Shared named volume mounted at /data.
      - model_cache:/data
    ports:
      # Host loopback 48084 -> container 80.
      - "127.0.0.1:48084:80"
    command:
      - --model-id
      - prithivida/Splade_PP_en_v1
      - --pooling
      - splade
      - --dtype
      - float32
      - --auto-truncate
    healthcheck:
      # Polls /health on the container's port 80.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 30s

  sparse-gpu:
    # Sparse encoder for the gpu profile: Splade_PP_en_v1 on TEI's CUDA image,
    # run in float16.
    profiles:
      - gpu
    container_name: codescout-sparse-gpu
    # NOTE(review): the `86-` tag prefix looks like TEI's compute-capability
    # 8.6 build — confirm it matches the target GPU.
    image: ghcr.io/huggingface/text-embeddings-inference:86-1.8@sha256:65f792e790f976713a5d2ab2586d93d074203d1f0ec2045e87e60113fbd0e256
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Shared named volume mounted at /data.
      - model_cache:/data
    ports:
      # Host loopback 48084 -> container 80.
      - "127.0.0.1:48084:80"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
    command:
      - --model-id
      - prithivida/Splade_PP_en_v1
      - --pooling
      - splade
      - --dtype
      - float16
      - --auto-truncate
    healthcheck:
      # Polls /health on the container's port 80.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 30s

  sparse-amd:
    profiles: [amd]
    # Sparse encoder for the amd profile: Splade_PP_en_v1 on a locally built
    # TEI image.
    #
    # Built from docker/sparse-amd/Dockerfile because HuggingFace does not
    # publish a prebuilt TEI ROCm image (only TGI has rocm tags). Build target
    # is gfx1100 (RX 7000 / RDNA3); MI200/MI300 should also work via the same
    # rocm/pytorch:latest base. First build takes ~15-20 min.
    build:
      context: ./docker/sparse-amd
      dockerfile: Dockerfile
      args:
        # TEI source revision to build; the image tag below carries its
        # 10-character prefix.
        TEI_REF: 1588129f932125a780ab97ccb300e7774b02d230
        # Must match the documented build target above (gfx1100). Change this
        # value when building for a GPU with a different gfx architecture.
        # NOTE(review): MI200/MI300 presumably need their own arch value here
        # — confirm against docker/sparse-amd/Dockerfile.
        PYTORCH_ROCM_ARCH: gfx1100
    image: codescout/sparse-amd:tei-1588129f93
    container_name: codescout-sparse-amd
    restart: unless-stopped
    command:
      - --model-id
      - prithivida/Splade_PP_en_v1
      - --pooling
      - splade
      - --dtype
      - float16
      - --auto-truncate
      # Hostname and port are passed explicitly for the locally built image.
      - --hostname
      - 0.0.0.0
      - --port
      - "80"
    ports:
      # Host loopback 48084 -> container 80.
      - "127.0.0.1:48084:80"
    volumes:
      # Shared named volume mounted at /data.
      - model_cache:/data
    # Device nodes passed through for GPU access.
    devices:
      - /dev/kfd
      - /dev/dri
    group_add:
      # GIDs (not names) because the rocm/pytorch image's /etc/group doesn't
      # include a `render` entry — docker's `group_add` resolves names against
      # the image, not the host. 44 = video, 992 = render on this host (Ubuntu
      # 24.04 systemd default). Run `getent group video render` to confirm
      # GIDs on other hosts. Other amd services use names because their base
      # images (rocm/llama.cpp) have the group entries.
      - "44"
      - "992"
    shm_size: 8g
    healthcheck:
      # Polls /health on the container's port 80. This service has the longest
      # start_period in the file (120s).
      test: ["CMD-SHELL", "curl -fsS http://127.0.0.1/health || exit 1"]
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 120s
    networks: [retrieval_net]

  # ------------------------------------------------------------------ RERANK
  # CPU: bge-reranker-base (smaller, ~280MB, ~250ms p95 — usable on CPU).
  # GPU: bge-reranker-v2-m3 (568MB, ~80ms p95 — full quality).
  reranker-cpu:
    # Reranker for the cpu profile: bge-reranker-base on TEI's CPU image.
    profiles:
      - cpu
    container_name: codescout-reranker-cpu
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.6@sha256:66db77d7856c9319bbfaf2c5b80a6d0e0ac9ff128ade09eaca1d9c20213617a4
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Shared named volume mounted at /data.
      - model_cache:/data
    ports:
      # Host loopback 48083 -> container 80.
      - "127.0.0.1:48083:80"
    command:
      - --model-id
      - BAAI/bge-reranker-base
      - --dtype
      - float32
      - --auto-truncate
    healthcheck:
      # Polls /health on the container's port 80.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 30s

  reranker-gpu:
    # Reranker for the gpu profile: bge-reranker-v2-m3 on TEI's CUDA image,
    # selected by --model-id (no gguf file involved).
    profiles:
      - gpu
    container_name: codescout-reranker-gpu
    image: ghcr.io/huggingface/text-embeddings-inference:86-1.8@sha256:65f792e790f976713a5d2ab2586d93d074203d1f0ec2045e87e60113fbd0e256
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Shared named volume mounted at /data.
      - model_cache:/data
    ports:
      # Host loopback 48083 -> container 80.
      - "127.0.0.1:48083:80"
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for this container.
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
    command:
      - --model-id
      - BAAI/bge-reranker-v2-m3
      - --dtype
      - float16
      - --auto-truncate
    healthcheck:
      # Polls /health on the container's port 80.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 30s

  reranker-amd:
    # Reranker for the amd profile: the ROCm build of llama-server loading
    # the bge-reranker-v2-m3 Q4_K_M gguf from the mounted model directory.
    profiles:
      - amd
    container_name: codescout-reranker-amd
    # NOTE(review): this image is referenced by tag only (no sha256 digest).
    image: rocm/llama.cpp:llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server
    restart: unless-stopped
    networks:
      - retrieval_net
    volumes:
      # Read-only model mount; the host path is overridable via CODESCOUT_MODEL_DIR.
      - ${CODESCOUT_MODEL_DIR:-./models}:/models:ro
    ports:
      # Host loopback 48083 -> container 8080.
      - "127.0.0.1:48083:8080"
    # Device nodes passed through for GPU access, plus the groups that own them.
    devices:
      - /dev/kfd
      - /dev/dri
    group_add:
      - video
      - render
    command:
      # Model file inside the /models mount.
      - --model
      - /models/bge-reranker-v2-m3-Q4_K_M.gguf
      # Listen address and port inside the container.
      - --host
      - 0.0.0.0
      - --port
      - "8080"
      # Reranking mode with rank pooling.
      - --reranking
      - --pooling
      - rank
      # Context, batch and concurrency sizing.
      - --ctx-size
      - "8192"
      - --batch-size
      - "2048"
      - --ubatch-size
      - "2048"
      - --parallel
      - "8"
      # GPU offload and flash attention settings.
      - --n-gpu-layers
      - "999"
      - --flash-attn
      - "on"
    healthcheck:
      # Polls the server's /health endpoint from inside the container.
      test:
        - CMD-SHELL
        - "curl -fsS http://127.0.0.1:8080/health || exit 1"
      interval: 15s
      timeout: 5s
      retries: 10
      start_period: 60s

# Named volumes, both with default settings.
volumes:
  # Mounted by qdrant at /qdrant/storage.
  qdrant_storage: {}
  # Mounted at /data by the sparse-* services and by reranker-cpu / reranker-gpu.
  model_cache: {}

# Single bridge network; every service defined above lists it under `networks`.
networks:
  retrieval_net:
    driver: bridge