📘
DeepSeek-OCR簡単起動docker-compose
- 作業用のディレクトリを用意してください
- Dockerfile, docker-compose.yml, config.pyをコピーして貼り付けてください
- work/inputs,work/outputsディレクトリを作ってください
- work/inputs/input.pngに認識させたい画像ファイルを配置してください
- docker-compose upでoutputsディレクトリに出力されます
- 他のファイル形式の場合はconfig.pyの設定を書き換えてinput.pngの部分を書き換えてください
- 日本語特化の場合はpromptに Language is Japanese. って追加すると少し精度上がります
Dockerfile
# CUDA 11.8 + cuDNN 8 *devel* base: the devel variant ships nvcc, which is
# required to build flash-attn from source below; 11.8 matches the cu118
# wheels used for torch and vllm.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# Basic build tooling; clean apt caches in the same layer to keep the image small.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
git \
wget \
python3-pip \
python3-setuptools \
python3-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Fetch the upstream DeepSeek-OCR sources (scripts + requirements.txt).
RUN git clone https://github.com/deepseek-ai/DeepSeek-OCR.git /DeepSeek-OCR
WORKDIR /DeepSeek-OCR
# Pin torch/vision/audio to the cu118 builds so they match the CUDA runtime.
RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
# vLLM 0.8.5 cu118 wheel installed from the GitHub release; install it BEFORE
# requirements.txt so the pinned torch/vllm combination is not overridden.
RUN wget https://github.com/vllm-project/vllm/releases/download/v0.8.5/vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl
RUN pip install vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl
RUN pip install -r requirements.txt
# flash-attn must compile against the already-installed torch,
# hence --no-build-isolation (it needs torch visible at build time).
RUN pip install flash-attn==2.7.3 --no-build-isolation
docker-compose.yml
# docker-compose.yml — runs the image-OCR script once with GPU access.
# Fixes vs. the pasted original: restored 2-space block indentation (the paste
# had flattened it into invalid YAML) and corrected the named-volume typo
# "hugggingface" -> "huggingface" (both the service mount and the top-level
# declaration, so they still refer to the same volume).
services:
  deepseek-ocr:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      # Override the repo's config with the local one (paths, prompt, sizes).
      - ./config.py:/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py
      # Host work dir: work/inputs/input.png in, work/outputs/ results out.
      - ./work:/work
      # Persistent HF cache so the model is downloaded only once.
      - huggingface:/huggingface
    working_dir: /DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm
    environment:
      # Point the Hugging Face cache at the named volume mounted above.
      - HF_HOME=/huggingface
    entrypoint:
      - bash
      - -c
      - python3 run_dpsk_ocr_image.py
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU (requires the NVIDIA Container Toolkit).
            - driver: nvidia
              count: 1
              capabilities: [gpu]

volumes:
  huggingface:
config.py
"""Runtime configuration for the DeepSeek-OCR vLLM scripts.

Resolution presets (choose one combination of BASE_SIZE / IMAGE_SIZE / CROP_MODE):
    Tiny:   base_size = 512,  image_size = 512,  crop_mode = False
    Small:  base_size = 640,  image_size = 640,  crop_mode = False
    Base:   base_size = 1024, image_size = 1024, crop_mode = False
    Large:  base_size = 1280, image_size = 1280, crop_mode = False
    Gundam: base_size = 1024, image_size = 640,  crop_mode = True
"""
from transformers import AutoTokenizer

# Active preset: "Gundam" (tiled crops at 640 px over a 1024 px base).
BASE_SIZE = 1024
IMAGE_SIZE = 640
CROP_MODE = True

# Tiling bounds when CROP_MODE is enabled.
MIN_CROPS = 2
MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.

MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64  # image pre-process (resize/padding) workers
PRINT_NUM_VIS_TOKENS = False
SKIP_REPEAT = True

# Hugging Face model id (or a local path to the downloaded weights).
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR'

# Input/output locations; which run_* script to use depends on the input:
#   .pdf                    -> run_dpsk_ocr_pdf.py
#   .jpg / .png / .jpeg     -> run_dpsk_ocr_image.py
#   Omnidocbench images dir -> run_dpsk_ocr_eval_batch.py
INPUT_PATH = '/work/inputs/input.png'
OUTPUT_PATH = '/work/outputs/'

PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.'
# Other commonly used prompts:
#   document:            <image>\n<|grounding|>Convert the document to markdown.
#   other image:         <image>\n<|grounding|>OCR this image.
#   without layouts:     <image>\nFree OCR.
#   figures in document: <image>\nParse the figure.
#   general:             <image>\nDescribe this image in detail.
#   rec:                 <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
#                        (e.g. '先天下之忧而忧')

# NOTE: loads (and on first run downloads) the tokenizer at import time.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
Discussion