-
Notifications
You must be signed in to change notification settings - Fork 3
/
Dockerfile.content_harvester
31 lines (21 loc) · 1.41 KB
/
Dockerfile.content_harvester
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Base image: Python 3.12 on Debian bullseye, pulled from the public ECR
# mirror of the Docker library images.
FROM public.ecr.aws/docker/library/python:3.12-bullseye

# Install system dependencies.
# The apt package index is deleted in the same layer that downloads it;
# removing it in a later layer would not shrink the image.
# NOTE(review): --no-install-recommends is deliberately NOT added here, since
# dropping recommended packages could change ffmpeg/ghostscript behaviour --
# confirm before slimming further.
RUN apt-get update -qq \
    && apt-get install -y \
        ffmpeg \
        ghostscript \
        libtiff-tools \
    && rm -rf /var/lib/apt/lists/*
# Adjust the ImageMagick 6 policy file in a single layer:
#   1. By default we don't have permission to modify PDFs with ImageMagick,
#      so comment out the policy line that denies the PDF coder.
#   2. Increase the disk limit from 1GiB to 8GiB for harvesting very large
#      images: https://github.com/ucldc/rikolti/issues/818
#   3. Increase the max width from 16KP to 30KP for harvesting very long
#      images: https://github.com/ucldc/rikolti/issues/817
#
# sed exits successfully even when a pattern matches nothing, so each pattern
# accepts optional whitespace before the closing "/>" (` */>`). That way the
# edit still applies whether the policy file writes `value="1GiB"/>` or
# `value="1GiB" />`. In the first expression `&` re-inserts the matched line
# inside the XML comment markers.
RUN sed -i \
    -e 's|<policy domain="coder" rights="none" pattern="PDF" */>|<!--&-->|' \
    -e 's|<policy domain="resource" name="disk" value="1GiB" */>|<policy domain="resource" name="disk" value="8GiB" />|' \
    -e 's|<policy domain="resource" name="width" value="16KP" */>|<policy domain="resource" name="width" value="30KP" />|' \
    /etc/ImageMagick-6/policy.xml
# The working directory stays at / on purpose: the package is copied to
# /content_harvester and launched with `python3 -m content_harvester.by_page`,
# which resolves the package relative to the working directory.
WORKDIR /

# Copy only the requirements file first so the dependency layer is rebuilt
# only when requirements.txt changes, not on every source edit.
# --no-cache-dir keeps pip's download cache out of the image layer.
COPY content_harvester/requirements.txt ./
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir -r requirements.txt

# Application source and the shared utils package.
COPY content_harvester/ /content_harvester
COPY utils/ /rikolti/utils
RUN chmod +x /content_harvester/by_collection.py

# Alternative entrypoints, kept for reference:
# ENTRYPOINT [ "python", "-m", "content_harvester.by_registry_endpoint" ]
# ENTRYPOINT ["python3", "-m", "content_harvester.by_collection"]
ENTRYPOINT ["python3", "-m", "content_harvester.by_page"]