# docker-compose-70b.yml
version: '3.6'
services:
  llama-gpt-api-70b:
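    # Presumably the prebuilt image can be used instead of building locally:
    # uncomment the image line below and comment out the build block.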
    # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-70b-chat:latest'
    build:
      context: ./api
      dockerfile: 70B.Dockerfile
    restart: on-failure
    environment:
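      # Container-internal path of the quantized 70B chat model; the api image
      # is assumed to place the weights under /models (not verified here).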
      MODEL: '/models/llama-2-70b-chat.bin'
      # Llama 2 70B uses grouped-query attention with a grouping factor of 8,
      # whereas 7B and 13B use 1. Currently, it's not possible to change this
      # with --n_gqa via llama-cpp-python in run.sh, so we expose it as an
      # environment variable.
      # See: https://github.com/abetlen/llama-cpp-python/issues/528
      # and: https://github.com/facebookresearch/llama/issues/407
      N_GQA: '8'
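      # mlock pins the model's memory so it can't be swapped out; this needs
      # the IPC_LOCK capability granted under cap_add below.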
      USE_MLOCK: 1
    cap_add:
      - IPC_LOCK
  llama-gpt-ui:
    image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
    ports:
      - 3000:3000
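    # With this mapping the chat UI should be reachable at http://localhost:3000
    # on the Docker host.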
    restart: on-failure
    environment:
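      # The UI talks to the local API through the OpenAI-compatible interface;
      # the key is a placeholder and is presumably never validated by the
      # llama-cpp-python server.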
      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
      - 'OPENAI_API_HOST=http://llama-gpt-api-70b:8000'
      - 'DEFAULT_MODEL=/models/llama-2-70b-chat.bin'
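      # Hold the UI until the API container answers on port 8000; loading the
      # 70B weights can take a while, hence the generous 600-second timeout.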
      - 'WAIT_HOSTS=llama-gpt-api-70b:8000'
      - 'WAIT_TIMEOUT=600'
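
# A minimal usage sketch (assuming this file sits in the repo root next to the
# ./api build context):
#
#   docker compose -f docker-compose-70b.yml up --build
#
# Once both containers are up, open http://localhost:3000 to chat with the
# 70B model.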