📘
DeepSeek-OCR簡単起動docker-compose
- 作業用のディレクトリを用意してください
- Dockerfile, docker-compose.yml, config.pyをコピーして貼り付けてください
- work/inputs,work/outputsディレクトリを作ってください
- work/inputs/input.pngに認識させたい画像ファイルを配置してください
- docker-compose upでoutputsディレクトリに出力されます
- 他のファイル形式の場合はconfig.pyの設定を書き換えてinput.pngの部分を書き換えてください
- 日本語特化の場合はpromptに Language is Japanese. って追加すると少し精度上がります
Dockerfile
# CUDA 11.8 + cuDNN 8 *devel* base: the devel variant ships nvcc, which is
# required to build flash-attn from source below; 11.8 matches the cu118
# wheels used for torch and vllm.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
# Basic build tooling; clean apt caches in the same layer to keep the image small.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
git \
wget \
python3-pip \
python3-setuptools \
python3-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Fetch the upstream DeepSeek-OCR sources (scripts + requirements.txt).
RUN git clone https://github.com/deepseek-ai/DeepSeek-OCR.git /DeepSeek-OCR
WORKDIR /DeepSeek-OCR
# Pin torch/vision/audio to the cu118 builds so they match the CUDA runtime.
RUN pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cu118
# vLLM 0.8.5 cu118 wheel installed from the GitHub release; install it BEFORE
# requirements.txt so the pinned torch/vllm combination is not overridden.
RUN wget https://github.com/vllm-project/vllm/releases/download/v0.8.5/vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl
RUN pip install vllm-0.8.5+cu118-cp38-abi3-manylinux1_x86_64.whl
RUN pip install -r requirements.txt
# flash-attn must compile against the already-installed torch,
# hence --no-build-isolation (it needs torch visible at build time).
RUN pip install flash-attn==2.7.3 --no-build-isolation
docker-compose.yml
# docker-compose.yml — runs the image-OCR script once with GPU access.
# Fixes vs. the pasted original: restored 2-space block indentation (the paste
# had flattened it into invalid YAML) and corrected the named-volume typo
# "hugggingface" -> "huggingface" (both the service mount and the top-level
# declaration, so they still refer to the same volume).
services:
  deepseek-ocr:
    build:
      context: .
      dockerfile: Dockerfile
    volumes:
      # Override the repo's config with the local one (paths, prompt, sizes).
      - ./config.py:/DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm/config.py
      # Host work dir: work/inputs/input.png in, work/outputs/ results out.
      - ./work:/work
      # Persistent HF cache so the model is downloaded only once.
      - huggingface:/huggingface
    working_dir: /DeepSeek-OCR/DeepSeek-OCR-master/DeepSeek-OCR-vllm
    environment:
      # Point the Hugging Face cache at the named volume mounted above.
      - HF_HOME=/huggingface
    entrypoint:
      - bash
      - -c
      - python3 run_dpsk_ocr_image.py
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU (requires the NVIDIA Container Toolkit).
            - driver: nvidia
              count: 1
              capabilities: [gpu]

volumes:
  huggingface:
config.py
"""Runtime configuration for the DeepSeek-OCR vLLM scripts.

Resolution presets (choose one combination of BASE_SIZE / IMAGE_SIZE / CROP_MODE):
    Tiny:   base_size = 512,  image_size = 512,  crop_mode = False
    Small:  base_size = 640,  image_size = 640,  crop_mode = False
    Base:   base_size = 1024, image_size = 1024, crop_mode = False
    Large:  base_size = 1280, image_size = 1280, crop_mode = False
    Gundam: base_size = 1024, image_size = 640,  crop_mode = True
"""
from transformers import AutoTokenizer

# Active preset: "Gundam" (tiled crops at 640 px over a 1024 px base).
BASE_SIZE = 1024
IMAGE_SIZE = 640
CROP_MODE = True

# Tiling bounds when CROP_MODE is enabled.
MIN_CROPS = 2
MAX_CROPS = 6  # max:9; If your GPU memory is small, it is recommended to set it to 6.

MAX_CONCURRENCY = 100  # If you have limited GPU memory, lower the concurrency count.
NUM_WORKERS = 64  # image pre-process (resize/padding) workers
PRINT_NUM_VIS_TOKENS = False
SKIP_REPEAT = True

# Hugging Face model id (or a local path to the downloaded weights).
MODEL_PATH = 'deepseek-ai/DeepSeek-OCR'

# Input/output locations; which run_* script to use depends on the input:
#   .pdf                    -> run_dpsk_ocr_pdf.py
#   .jpg / .png / .jpeg     -> run_dpsk_ocr_image.py
#   Omnidocbench images dir -> run_dpsk_ocr_eval_batch.py
INPUT_PATH = '/work/inputs/input.png'
OUTPUT_PATH = '/work/outputs/'

PROMPT = '<image>\n<|grounding|>Convert the document to markdown.'
# PROMPT = '<image>\nFree OCR.'
# Other commonly used prompts:
#   document:            <image>\n<|grounding|>Convert the document to markdown.
#   other image:         <image>\n<|grounding|>OCR this image.
#   without layouts:     <image>\nFree OCR.
#   figures in document: <image>\nParse the figure.
#   general:             <image>\nDescribe this image in detail.
#   rec:                 <image>\nLocate <|ref|>xxxx<|/ref|> in the image.
#                        (e.g. '先天下之忧而忧')

# NOTE: loads (and on first run downloads) the tokenizer at import time.
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
Discussion