diff --git a/crawler/.dockerignore b/crawler/.dockerignore
new file mode 100644
index 0000000..1269488
--- /dev/null
+++ b/crawler/.dockerignore
@@ -0,0 +1 @@
+data
diff --git a/crawler/Dockerfile b/crawler/Dockerfile
new file mode 100644
index 0000000..ed4dc8d
--- /dev/null
+++ b/crawler/Dockerfile
@@ -0,0 +1,61 @@
+FROM python:3.13-slim
+
+# Make RUN fail when any stage of a pipeline fails, not only the last one;
+# the Poetry install below pipes curl into python3.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install system dependencies (sorted alphabetically). gcc is not listed
+# separately because build-essential already depends on it.
+# NOTE(review): python3-dev pulls in the distro's Python headers; confirm they
+# are needed alongside the interpreter shipped in this base image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    curl \
+    libgl1 \
+    libglib2.0-0 \
+    libopencv-dev \
+    python3-dev \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    && rm -rf /var/lib/apt/lists/*
+
+# Poetry settings: install location, in-project virtualenv (/app/.venv once
+# WORKDIR is /app), and no interactive prompts during the build.
+ENV POETRY_HOME=/opt/poetry \
+    POETRY_VIRTUALENVS_IN_PROJECT=true \
+    POETRY_NO_INTERACTION=1
+
+# Install Poetry with the official installer and link it onto PATH.
+# curl -f turns HTTP errors into a failed build instead of feeding an error
+# page to python3.
+# NOTE(review): the Poetry version is unpinned; consider setting POETRY_VERSION
+# for the installer to make builds reproducible.
+RUN curl -fsSL https://install.python-poetry.org | python3 - \
+    && ln -s /opt/poetry/bin/poetry /usr/local/bin/poetry \
+    && poetry --version
+
+WORKDIR /app
+
+# Copy only the files needed for dependency installation so the install layer
+# stays cached until pyproject.toml or poetry.lock change.
+COPY pyproject.toml poetry.lock ./
+
+# Install main-group dependencies without the project itself. The download
+# cache is redirected to a throwaway directory and removed in the same layer
+# so it does not end up in the image.
+RUN POETRY_CACHE_DIR=/tmp/poetry-cache poetry install --only main --no-root \
+    && rm -rf /tmp/poetry-cache
+
+# Put the in-project virtualenv first on PATH.
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Copy the application code.
+COPY . .
+
+# Make the CLI and the entrypoint script executable; the entrypoint below is
+# runall.sh, so it must carry the execute bit as well as main.py.
+RUN chmod +x /app/main.py /app/runall.sh
+
+# NOTE(review): no USER is set, so the container runs as root; consider a
+# non-root user once the write paths (e.g. the ignored data directory) are known.
+ENTRYPOINT ["/app/runall.sh"]