# Docker Compose definition for the codescout retrieval stack: a Qdrant
# instance plus dense, sparse and reranker model servers, each of which is
# declared in cpu / gpu / amd profile variants.
# Compose project name.
name: codescout-retrieval
# Service definitions.
services:
# Qdrant container. It declares no `profiles` key, so it is not tied to any
# of the cpu / gpu / amd profiles used by the model servers.
qdrant:
  container_name: codescout-qdrant
  image: 'qdrant/qdrant:v1.17.0@sha256:f1c7272cdac52b38c1a0e89313922d940ba50afd90d593a1605dbbc214e66ffb'
  restart: unless-stopped
  networks:
    - retrieval_net
  environment:
    QDRANT__LOG_LEVEL: 'INFO'
  volumes:
    # Named volume mounted at Qdrant's storage path.
    - 'qdrant_storage:/qdrant/storage'
  ports:
    # Both ports are published on the host loopback interface only.
    - '127.0.0.1:6333:6333'
    - '127.0.0.1:6334:6334'
  healthcheck:
    # Passes when bash can open a TCP connection to 127.0.0.1:6333 inside
    # the container (uses bash's /dev/tcp redirection).
    test:
      - CMD-SHELL
      - "bash -c '</dev/tcp/127.0.0.1/6333'"
    interval: 10s
    timeout: 3s
    retries: 5
# Dense embedding server for the `cpu` profile: llama.cpp server image run
# with --embedding against the CodeRankEmbed GGUF model.
# NOTE(review): confirm how the pinned llama.cpp build shares --ctx-size 8192
# between the 8 --parallel slots; the per-request token limit may be lower
# than 8192.
dense-cpu:
  container_name: codescout-dense-cpu
  image: 'ghcr.io/ggml-org/llama.cpp:server@sha256:6b0a9b4fd7e3a9a55e959e5a74d47e11f8ccd4dfbc2556b7382a6516255dcc73'
  profiles:
    - cpu
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 8080 is published on host loopback port 48081.
    - '127.0.0.1:48081:8080'
  volumes:
    # Host model directory (./models unless CODESCOUT_MODEL_DIR is set),
    # mounted read-only at /models.
    - '${CODESCOUT_MODEL_DIR:-./models}:/models:ro'
  # Server arguments, one flag (and its value, if any) per line.
  command:
    [
      '--model', '/models/CodeRankEmbed-Q4_K_M.gguf',
      '--host', '0.0.0.0',
      '--port', '8080',
      '--embedding',
      '--pooling', 'mean',
      '--ctx-size', '8192',
      '--batch-size', '4096',
      '--ubatch-size', '4096',
      '--parallel', '8',
      '--threads', '4'
    ]
  healthcheck:
    # Probes the server's /health endpoint from inside the container.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1:8080/health || exit 1'
    start_period: 30s
    interval: 15s
    timeout: 5s
    retries: 10
# Dense embedding server for the `gpu` profile: llama.cpp CUDA server image
# run with --embedding against the CodeRankEmbed GGUF model.
dense-gpu:
  container_name: codescout-dense-gpu
  image: 'ghcr.io/ggml-org/llama.cpp:server-cuda@sha256:a04923d31b4ca0d95bd772a4b80c9112f29121014df64d3d80a16a136ca19672'
  profiles:
    - gpu
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 8080 is published on host loopback port 48081.
    - '127.0.0.1:48081:8080'
  volumes:
    # Host model directory (./models unless CODESCOUT_MODEL_DIR is set),
    # mounted read-only at /models.
    - '${CODESCOUT_MODEL_DIR:-./models}:/models:ro'
  # Server arguments, one flag (and its value, if any) per line.
  command:
    [
      '--model', '/models/CodeRankEmbed-Q4_K_M.gguf',
      '--host', '0.0.0.0',
      '--port', '8080',
      '--embedding',
      '--pooling', 'mean',
      '--ctx-size', '65536',
      '--batch-size', '4096',
      '--ubatch-size', '4096',
      '--parallel', '16',
      '--n-gpu-layers', '999',
      '--flash-attn', 'on'
    ]
  deploy:
    resources:
      reservations:
        # Reserve one device from the nvidia driver with the gpu capability.
        devices:
          - capabilities:
              - gpu
            count: 1
            driver: nvidia
  healthcheck:
    # Probes the server's /health endpoint from inside the container.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1:8080/health || exit 1'
    start_period: 60s
    interval: 15s
    timeout: 5s
    retries: 10
# Dense embedding server for the `amd` profile: ROCm llama.cpp server image
# run with --embedding against the CodeRankEmbed GGUF model.
# NOTE(review): this image is referenced by tag only (no @sha256 digest),
# unlike the digest-pinned images elsewhere in this file - consider pinning.
dense-amd:
  container_name: codescout-dense-amd
  image: 'rocm/llama.cpp:llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server'
  profiles:
    - amd
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 8080 is published on host loopback port 48081.
    - '127.0.0.1:48081:8080'
  volumes:
    # Host model directory (./models unless CODESCOUT_MODEL_DIR is set),
    # mounted read-only at /models.
    - '${CODESCOUT_MODEL_DIR:-./models}:/models:ro'
  # Host device nodes passed through to the container.
  devices:
    - '/dev/kfd'
    - '/dev/dri'
  # Supplementary groups added to the container process, given by name.
  group_add:
    - 'video'
    - 'render'
  # Server arguments, one flag (and its value, if any) per line.
  command:
    [
      '--model', '/models/CodeRankEmbed-Q4_K_M.gguf',
      '--host', '0.0.0.0',
      '--port', '8080',
      '--embedding',
      '--pooling', 'mean',
      '--ctx-size', '65536',
      '--batch-size', '4096',
      '--ubatch-size', '4096',
      '--parallel', '16',
      '--n-gpu-layers', '999',
      '--flash-attn', 'on'
    ]
  healthcheck:
    # Probes the server's /health endpoint from inside the container.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1:8080/health || exit 1'
    start_period: 60s
    interval: 15s
    timeout: 5s
    retries: 10
# Sparse embedding server for the `cpu` profile: text-embeddings-inference
# CPU image serving prithivida/Splade_PP_en_v1 with splade pooling.
sparse-cpu:
  container_name: codescout-sparse-cpu
  image: 'ghcr.io/huggingface/text-embeddings-inference:cpu-1.6@sha256:66db77d7856c9319bbfaf2c5b80a6d0e0ac9ff128ade09eaca1d9c20213617a4'
  profiles:
    - cpu
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 80 is published on host loopback port 48084.
    - '127.0.0.1:48084:80'
  volumes:
    # Named volume model_cache mounted at /data.
    - 'model_cache:/data'
  command:
    - '--model-id'
    - 'prithivida/Splade_PP_en_v1'
    - '--pooling'
    - 'splade'
    - '--dtype'
    - 'float32'
    - '--auto-truncate'
  healthcheck:
    # Probes /health on the container's port 80.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1/health || exit 1'
    start_period: 30s
    interval: 15s
    timeout: 5s
    retries: 10
# Sparse embedding server for the `gpu` profile: text-embeddings-inference
# image (tag 86-1.8) serving prithivida/Splade_PP_en_v1 with splade pooling
# in float16.
sparse-gpu:
  container_name: codescout-sparse-gpu
  image: 'ghcr.io/huggingface/text-embeddings-inference:86-1.8@sha256:65f792e790f976713a5d2ab2586d93d074203d1f0ec2045e87e60113fbd0e256'
  profiles:
    - gpu
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 80 is published on host loopback port 48084.
    - '127.0.0.1:48084:80'
  volumes:
    # Named volume model_cache mounted at /data.
    - 'model_cache:/data'
  command:
    - '--model-id'
    - 'prithivida/Splade_PP_en_v1'
    - '--pooling'
    - 'splade'
    - '--dtype'
    - 'float16'
    - '--auto-truncate'
  deploy:
    resources:
      reservations:
        # Reserve one device from the nvidia driver with the gpu capability.
        devices:
          - capabilities:
              - gpu
            count: 1
            driver: nvidia
  healthcheck:
    # Probes /health on the container's port 80.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1/health || exit 1'
    start_period: 30s
    interval: 15s
    timeout: 5s
    retries: 10
# Sparse embedding server for the `amd` profile. Unlike the other services it
# is built locally from ./docker/sparse-amd (build args pin the TEI source
# revision and the ROCm GPU architecture) and tagged as
# codescout/sparse-amd:tei-1588129f93.
sparse-amd:
  profiles: [amd]
  build:
    context: ./docker/sparse-amd
    dockerfile: Dockerfile
    args:
      # Quoted so the all-hex revision is always read as a string.
      TEI_REF: "1588129f932125a780ab97ccb300e7774b02d230"
      # Hard-coded GPU architecture target for the build.
      PYTORCH_ROCM_ARCH: gfx1101
  image: codescout/sparse-amd:tei-1588129f93
  container_name: codescout-sparse-amd
  restart: unless-stopped
  command:
    - --model-id
    - prithivida/Splade_PP_en_v1
    - --pooling
    - splade
    - --dtype
    - float16
    - --auto-truncate
    - --hostname
    - 0.0.0.0
    - --port
    - "80"
  ports:
    # Container port 80 is published on host loopback port 48084.
    - "127.0.0.1:48084:80"
  volumes:
    # Named volume model_cache mounted at /data.
    - model_cache:/data
  # Host device nodes passed through to the container.
  devices:
    - /dev/kfd
    - /dev/dri
  # Supplementary numeric GIDs for the container process. Numeric group IDs
  # are host-specific, so they can be overridden from the environment; the
  # defaults are the values that were previously hard-coded here.
  # NOTE(review): presumably these are the host's `video` and `render` GIDs
  # (the other amd services add those groups by name) - confirm on the host,
  # e.g. with `getent group video render`.
  group_add:
    - "${CODESCOUT_VIDEO_GID:-44}"
    - "${CODESCOUT_RENDER_GID:-992}"
  shm_size: 8g
  healthcheck:
    # Probes /health on the container's port 80; this service has the longest
    # start_period (120s) in the file.
    test: ["CMD-SHELL", "curl -fsS http://127.0.0.1/health || exit 1"]
    interval: 15s
    timeout: 5s
    retries: 10
    start_period: 120s
  networks: [retrieval_net]
# Reranker server for the `cpu` profile: text-embeddings-inference CPU image
# serving BAAI/bge-reranker-base in float32.
reranker-cpu:
  container_name: codescout-reranker-cpu
  image: 'ghcr.io/huggingface/text-embeddings-inference:cpu-1.6@sha256:66db77d7856c9319bbfaf2c5b80a6d0e0ac9ff128ade09eaca1d9c20213617a4'
  profiles:
    - cpu
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 80 is published on host loopback port 48083.
    - '127.0.0.1:48083:80'
  volumes:
    # Named volume model_cache mounted at /data.
    - 'model_cache:/data'
  command:
    - '--model-id'
    - 'BAAI/bge-reranker-base'
    - '--dtype'
    - 'float32'
    - '--auto-truncate'
  healthcheck:
    # Probes /health on the container's port 80.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1/health || exit 1'
    start_period: 30s
    interval: 15s
    timeout: 5s
    retries: 10
# Reranker server for the `gpu` profile: text-embeddings-inference image
# (tag 86-1.8) serving BAAI/bge-reranker-v2-m3 in float16.
reranker-gpu:
  container_name: codescout-reranker-gpu
  image: 'ghcr.io/huggingface/text-embeddings-inference:86-1.8@sha256:65f792e790f976713a5d2ab2586d93d074203d1f0ec2045e87e60113fbd0e256'
  profiles:
    - gpu
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 80 is published on host loopback port 48083.
    - '127.0.0.1:48083:80'
  volumes:
    # Named volume model_cache mounted at /data.
    - 'model_cache:/data'
  command:
    - '--model-id'
    - 'BAAI/bge-reranker-v2-m3'
    - '--dtype'
    - 'float16'
    - '--auto-truncate'
  deploy:
    resources:
      reservations:
        # Reserve one device from the nvidia driver with the gpu capability.
        devices:
          - capabilities:
              - gpu
            count: 1
            driver: nvidia
  healthcheck:
    # Probes /health on the container's port 80.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1/health || exit 1'
    start_period: 30s
    interval: 15s
    timeout: 5s
    retries: 10
# Reranker server for the `amd` profile: ROCm llama.cpp server image run with
# --reranking against the bge-reranker-v2-m3 GGUF model.
# NOTE(review): this image is referenced by tag only (no @sha256 digest),
# unlike the digest-pinned images elsewhere in this file - consider pinning.
reranker-amd:
  container_name: codescout-reranker-amd
  image: 'rocm/llama.cpp:llama.cpp-b6652.amd0_rocm7.0.0_ubuntu24.04_server'
  profiles:
    - amd
  restart: unless-stopped
  networks:
    - retrieval_net
  ports:
    # Container port 8080 is published on host loopback port 48083.
    - '127.0.0.1:48083:8080'
  volumes:
    # Host model directory (./models unless CODESCOUT_MODEL_DIR is set),
    # mounted read-only at /models.
    - '${CODESCOUT_MODEL_DIR:-./models}:/models:ro'
  # Host device nodes passed through to the container.
  devices:
    - '/dev/kfd'
    - '/dev/dri'
  # Supplementary groups added to the container process, given by name.
  group_add:
    - 'video'
    - 'render'
  # Server arguments, one flag (and its value, if any) per line.
  command:
    [
      '--model', '/models/bge-reranker-v2-m3-Q4_K_M.gguf',
      '--host', '0.0.0.0',
      '--port', '8080',
      '--reranking',
      '--pooling', 'rank',
      '--ctx-size', '8192',
      '--batch-size', '2048',
      '--ubatch-size', '2048',
      '--parallel', '8',
      '--n-gpu-layers', '999',
      '--flash-attn', 'on'
    ]
  healthcheck:
    # Probes the server's /health endpoint from inside the container.
    test:
      - CMD-SHELL
      - 'curl -fsS http://127.0.0.1:8080/health || exit 1'
    start_period: 60s
    interval: 15s
    timeout: 5s
    retries: 10
# Named volumes, both declared with default settings: qdrant_storage is
# mounted by the qdrant service, model_cache by the text-embeddings-inference
# based services.
volumes:
  model_cache: {}
  qdrant_storage: {}
# The one network every service in this file attaches to; uses the bridge
# driver.
networks:
  retrieval_net: { driver: bridge }