@@ -12,12 +12,12 @@ RUN apt-get -qq -y update \
12
12
# Python packages (installed mainly to pull in their dependencies)
13
13
python3-pip python3-dev python3-pil \
14
14
# tesseract
15
- tesseract-ocr libtesseract-dev libleptonica-dev pkg-config\
15
+ tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
16
16
# libraries
17
17
libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
18
18
zlib1g-dev libicu-dev libxml2-dev \
19
19
# package tools
20
- unrar p7zip-full \
20
+ unrar \
21
21
# audio & video metadata
22
22
libmediainfo-dev \
23
23
# image processing, djvu
@@ -116,41 +116,22 @@ ENV LANG='en_US.UTF-8' \
116
116
OMP_THREAD_LIMIT='1' \
117
117
OPENBLAS_NUM_THREADS='1'
118
118
119
- ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
119
+ # Build tesserocr 2.6.2 from source with C++14 (rewriting -std=c++11 in its setup.py)
120
+ # to make it compatible with Tesseract 5
121
+ RUN pip download --no-binary=:all: "tesserocr==2.6.2" \
122
+ && tar -xzf tesserocr-2.6.2.tar.gz \
123
+ && sed -i "s/-std=c++11/-std=c++14/" tesserocr-2.6.2/setup.py \
124
+ && cd tesserocr-2.6.2 \
125
+ && CXXFLAGS="-std=c++14" pip install --no-cache-dir .
120
126
121
127
# tesseract 5
122
128
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
123
129
124
130
RUN groupadd -g 1000 -r app \
125
131
&& useradd -m -u 1000 -s /bin/false -g app app
126
132
127
- # Download the ftm-typepredict model
128
- RUN mkdir /models/ && \
129
- curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
130
-
131
133
RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
132
134
RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel
133
135
134
- # Install spaCy
135
- RUN pip3 install --no-cache-dir spacy
136
136
# Install PyICU
137
137
RUN pip3 install --no-binary=:pyicu: pyicu
138
- # Install TesserOCR
139
- RUN pip3 install --no-binary=:tesserocr: tesserocr
140
-
141
- # Install default (small) spaCy models
142
- RUN python3 -m spacy download en_core_web_sm
143
- RUN python3 -m spacy download de_core_news_sm
144
- RUN python3 -m spacy download fr_core_news_sm
145
- RUN python3 -m spacy download es_core_news_sm
146
- RUN python3 -m spacy download ru_core_news_sm
147
- RUN python3 -m spacy download pt_core_news_sm
148
- RUN python3 -m spacy download ro_core_news_sm
149
- RUN python3 -m spacy download mk_core_news_sm
150
- RUN python3 -m spacy download el_core_news_sm
151
- RUN python3 -m spacy download pl_core_news_sm
152
- RUN python3 -m spacy download it_core_news_sm
153
- RUN python3 -m spacy download lt_core_news_sm
154
- RUN python3 -m spacy download nl_core_news_sm
155
- RUN python3 -m spacy download nb_core_news_sm
156
- RUN python3 -m spacy download da_core_news_sm
0 commit comments