version: "3.8" services: ktransformers: image: docker.citory.tech/public/ktransformers:0.2.2rc1 container_name: ktransformers runtime: nvidia tty: true stdin_open: true ports: - 10580:10580 deploy: resources: reservations: devices: - driver: nvidia count: 2 capabilities: - gpu ipc: host volumes: - /home/deepgeek/data/data_local/server/ktransformers/models:/workspace/models - /home/deepgeek/data/data_local/server/ktransformers/ktransformers/website:/workspace/ktransformers/ktransformers/website env_file: - .env restart: unless-stopped entrypoint: [ "python3", "/workspace/ktransformers/ktransformers/server/main.py", "--gguf_path", "/workspace/models/DeepSeek-R1-GGUF/DeepSeek-R1-UD-Q2_K_XL", "--model_path", "/workspace/models/DeepSeek-R1", "--model_name", "deepseek-r1:671b", "--cpu_infer", "94", "--optimize_config_path", "/workspace/ktransformers/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml", "--max_new_tokens", "8192", "--cache_lens", "32768", "--total_context", "32768", "--cache_q4", "true", "--temperature", "0.6", "--top_p", "0.95", "--force_think", "--no-use_cuda_graph", "--host", "0.0.0.0", "--port", "10580" ] x-dockge: urls: - http://local.citory.tech:10580 networks: {}