Skip to content

Commit f162611

Browse files
authored
Make audio and video searchable (#5)
* Refactor ingest-file to prepare it for using nomenklatura (#2) * Remove README & LICENSE from .dockerignore * Refactor ingest-file to be compatible with nomenklatura * Make linter happy * Add poetry.lock * Build whisper.cpp * Add Whispercpp to Dockerfile from multi-stage build * Launch subprocess for transcription. Refactor error handling. * Make image build architecture-independent. Set model. * Aesthetic adjustments to Dockerfile * Apply transcription logic to audio and video. Add tests. * The Processing of audio/video isn't a failure if transcription fails * Make linter happy * Transcription timeout as env var * Fix wrong import from settings * Cast env var timeout to int
1 parent 3c86592 commit f162611

20 files changed

+2204
-23
lines changed

.dockerignore

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
.git
22
.github
3-
LICENSE
4-
README.md
53
__pycache__
64
convert
7-
docker-compose.yml
5+
docker-compose.yml

Dockerfile

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,43 @@
1-
FROM python:3.9-bookworm
2-
ENV DEBIAN_FRONTEND noninteractive
1+
#### BUILD WHISPER.CPP
2+
#----------------------------------
3+
FROM nvidia/cuda:11.6.2-devel-ubuntu20.04 AS build
34

4-
LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
5-
LABEL org.opencontainers.image.licenses MIT
6-
LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
5+
WORKDIR /usr/local/src
6+
RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \
7+
bash git make wget g++ ffmpeg cmake
8+
RUN git clone https://github.com/ggml-org/whisper.cpp --depth 1
9+
10+
# whisper.cpp setup
11+
WORKDIR /usr/local/src/whisper.cpp
12+
RUN WHISPER_CUBLAS=0 make -j
13+
RUN bash ./models/download-ggml-model.sh medium-q8_0
14+
15+
#### copy the compiled binaries to the image for prod
16+
# the image above will be discarded
17+
# ----------------------------------
18+
FROM python:3.11-slim
19+
20+
# copy whisper
21+
COPY --from=build /usr/local/src/whisper.cpp /whisper
22+
COPY --from=build /lib/*/libgomp.so.1 /whisper/build/src
23+
24+
# fix some libs
25+
ENV LD_LIBRARY_PATH=/whisper/build/src/:/whisper/build/ggml/src/
26+
27+
# ingest-file
28+
ENV DEBIAN_FRONTEND="noninteractive"
29+
30+
LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
31+
LABEL org.opencontainers.image.licenses="MIT"
32+
LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"
733

834
# Enable non-free archive for `unrar`.
935
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
1036
&& apt-get -qq -y update \
1137
&& apt-get -qq -y install build-essential locales \
1238
# python deps (mostly to install their dependencies)
13-
python3-dev \
39+
git python3-dev \
40+
pkg-config libicu-dev \
1441
# tesseract
1542
tesseract-ocr libtesseract-dev libleptonica-dev \
1643
# libraries
@@ -24,6 +51,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
2451
libtiff5-dev \
2552
libtiff-tools ghostscript librsvg2-bin jbig2dec \
2653
pst-utils libgif-dev \
54+
# necessary for python-magic
55+
libmagic1 \
2756
### tesseract
2857
tesseract-ocr-eng \
2958
tesseract-ocr-swa \
@@ -101,6 +130,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
101130
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
102131
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
103132
fonts-tlwg-purisa \
133+
ffmpeg \
104134
###
105135
&& apt-get -qq -y autoremove \
106136
&& apt-get clean \
@@ -121,6 +151,8 @@ RUN mkdir /models/ && \
121151
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
122152

123153
COPY requirements.txt /tmp/
154+
RUN pip3 install --no-cache-dir -q -U pip setuptools
155+
RUN pip3 install --no-binary=:pyicu: pyicu
124156
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
125157

126158
# Install spaCy models
@@ -143,7 +175,7 @@ RUN python3 -m spacy download el_core_news_sm \
143175

144176
COPY . /ingestors
145177
WORKDIR /ingestors
146-
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
178+
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
147179
RUN chown -R app:app /ingestors
148180

149181
ENV ARCHIVE_TYPE=file \

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ services:
1616
ingest-file:
1717
build:
1818
context: .
19-
image: ghcr.io/alephdata/ingest-file
19+
# image: ghcr.io/alephdata/ingest-file
2020
hostname: ingest
2121
tmpfs:
2222
- /tmp:mode=777

ingestors/analysis/extract.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from functools import lru_cache
44
from normality import collapse_spaces
55
from languagecodes import list_to_alpha3
6-
from fingerprints import clean_entity_name
6+
from fingerprints import clean_entity_prefix
77
from followthemoney.types import registry
88

99
from ingestors import settings
@@ -27,7 +27,7 @@
2727
def clean_name(text):
2828
if text is None or len(text) > NAME_MAX_LENGTH:
2929
return
30-
text = clean_entity_name(text)
30+
text = clean_entity_prefix(text)
3131
text = collapse_spaces(text)
3232
if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
3333
return

ingestors/documents/html.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
99
"HTML file ingestor class. Extracts the text from the web page."
10+
1011
MIME_TYPES = ["text/html"]
1112
EXTENSIONS = [
1213
"htm",

ingestors/exc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
class ProcessingException(Exception):
55
"A data-related error occuring during file processing."
6+
67
pass
78

89

ingestors/manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
193193
now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")
194194

195195
entity.set("processingStatus", self.STATUS_FAILURE)
196-
entity.set("processingAgent", get_distribution("ingest").version)
196+
entity.set("processingAgent", get_distribution("ingestors").version)
197197
entity.set("processedAt", now_string)
198198

199199
ingestor_class = None

ingestors/media/audio.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
import logging
2+
from datetime import datetime
23
from followthemoney import model
34
from pymediainfo import MediaInfo
5+
from normality import stringify
46

57
from ingestors.ingestor import Ingestor
68
from ingestors.support.timestamp import TimestampSupport
79
from ingestors.exc import ProcessingException
10+
from ingestors.support.transcription import TranscriptionSupport
811

912
log = logging.getLogger(__name__)
1013

1114

12-
class AudioIngestor(Ingestor, TimestampSupport):
15+
class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
1316
MIME_TYPES = [
1417
"audio/mpeg",
1518
"audio/mp3",
@@ -55,7 +58,23 @@ def ingest(self, file_path, entity):
5558
entity.add("samplingRate", track.sampling_rate)
5659
entity.add("duration", track.duration)
5760
except Exception as ex:
58-
raise ProcessingException("Could not read audio: %r", ex) from ex
61+
raise ProcessingException(f"Could not read audio: {ex}") from ex
62+
try:
63+
start = datetime.now()
64+
log.info(f"Attempting to transcribe {file_path}")
65+
self.transcribe(file_path, entity)
66+
elapsed_time = datetime.now() - start
67+
# use total_seconds() here: it keeps counting past 24h, whereas
68+
# timedelta.seconds alone is capped at [0,86400)
69+
elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
70+
log.info(
71+
f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
72+
)
73+
except Exception as ex:
74+
# If the transcription fails, the file processing should still count as a success.
75+
# The existence of a transcription is not mandatory, for now.
76+
entity.set("processingError", stringify(ex))
77+
log.error(ex)
5978

6079
@classmethod
6180
def match(cls, file_path, entity):

ingestors/media/video.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
import logging
2+
from datetime import datetime
23
from followthemoney import model
34
from pymediainfo import MediaInfo
5+
from normality import stringify
46

57
from ingestors.ingestor import Ingestor
68
from ingestors.support.timestamp import TimestampSupport
79
from ingestors.exc import ProcessingException
10+
from ingestors.support.transcription import TranscriptionSupport
811

912
log = logging.getLogger(__name__)
1013

1114

12-
class VideoIngestor(Ingestor, TimestampSupport):
15+
class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
1316
MIME_TYPES = [
1417
"application/x-shockwave-flash",
1518
"video/quicktime",
@@ -44,6 +47,23 @@ def ingest(self, file_path, entity):
4447
entity.add("duration", track.duration)
4548
except Exception as ex:
4649
raise ProcessingException("Could not read video: %r", ex) from ex
50+
try:
51+
start = datetime.now()
52+
log.info(f"Attempting to transcribe {file_path}")
53+
audio_only_file = self.extract_audio(file_path)
54+
self.transcribe(audio_only_file, entity)
55+
elapsed_time = datetime.now() - start
56+
# use total_seconds() here: it keeps counting past 24h, whereas
57+
# timedelta.seconds alone is capped at [0,86400)
58+
elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
59+
log.info(
60+
f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
61+
)
62+
except Exception as ex:
63+
# If the transcription fails, the file processing should still count as a success.
64+
# The existence of a transcription is not mandatory, for now.
65+
entity.set("processingError", stringify(ex))
66+
log.error(ex)
4767

4868
@classmethod
4969
def match(cls, file_path, entity):

ingestors/settings.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,11 @@
5757
SENTRY_CAPTURE_PROCESSING_EXCEPTIONS = env.to_bool(
5858
"SENTRY_CAPTURE_PROCESSING_EXCEPTIONS", False
5959
)
60+
61+
WHISPER_MODEL = env.get("INGESTORS_WHISPER_MODEL", "ggml-medium-q8_0.bin")
62+
# "auto" prompts the model to detect the language
63+
WHISPER_LANGUAGE = env.get("INGESTORS_WHISPER_LANGUAGE", "auto")
64+
# timeout expressed in seconds
65+
WHISPER_TRANSCRIPTION_TIMEOUT = env.get(
66+
"INGESTORS_WHISPER_TRANSCRIPTION_TIMEOUT", 60 * 60 * 2
67+
)

0 commit comments

Comments
 (0)