openaleph · catileptic · Apr 21, 2025 · Feb 28, 2025 · Mar 19, 2025 · Mar 20, 2025
diff --git a/.dockerignore b/.dockerignore
@@ -1,7 +1,5 @@
 .git
 .github
-LICENSE
-README.md
 __pycache__
 convert
-docker-compose.yml
+docker-compose.yml
diff --git a/Dockerfile b/Dockerfile
@@ -1,16 +1,43 @@
-FROM python:3.9-bookworm
-ENV DEBIAN_FRONTEND noninteractive
+#### BUILD WHISPER.CPP
+#----------------------------------
+FROM nvidia/cuda:11.6.2-devel-ubuntu20.04 AS build
 
-LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
-LABEL org.opencontainers.image.licenses MIT
-LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
+WORKDIR /usr/local/src
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        bash cmake ffmpeg g++ git make wget \
+    && rm -rf /var/lib/apt/lists/*
+RUN git clone https://github.com/ggml-org/whisper.cpp --depth 1
+# whisper.cpp setup; bound -j to the core count, a bare -j spawns unlimited jobs
+WORKDIR /usr/local/src/whisper.cpp
+RUN WHISPER_CUBLAS=0 make -j"$(nproc)"
+RUN bash ./models/download-ggml-model.sh medium-q8_0
+
+#### copy the compiled binaries to the image for prod (the build stage above is discarded)
+# ----------------------------------
+FROM python:3.11-slim
+
+# copy whisper (sources, build output and the downloaded model)
+COPY --from=build /usr/local/src/whisper.cpp /whisper
+# wildcard source: the destination must be a directory, hence the trailing slash
+COPY --from=build /lib/*/libgomp.so.1 /whisper/build/src/
+
+# let the dynamic loader find the copied shared libraries
+ENV LD_LIBRARY_PATH=/whisper/build/src/:/whisper/build/ggml/src/
+
+# ingest-file; ARG keeps apt non-interactive during the build without persisting into containers
+ARG DEBIAN_FRONTEND="noninteractive"
+
+LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
+LABEL org.opencontainers.image.licenses="MIT"
+LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"
 
 # Enable non-free archive for `unrar`.
 RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
     && apt-get -qq -y update \
     && apt-get -qq -y install build-essential locales \
     # python deps (mostly to install their dependencies)
-    python3-dev \
+    git python3-dev \
+    pkg-config libicu-dev \
     # tesseract
     tesseract-ocr libtesseract-dev libleptonica-dev \
     # libraries
@@ -24,6 +51,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
     libtiff5-dev \
     libtiff-tools ghostscript librsvg2-bin jbig2dec \
     pst-utils libgif-dev \
+    # necessary for python-magic
+    libmagic1 \
     ### tesseract
     tesseract-ocr-eng \
     tesseract-ocr-swa \
@@ -101,6 +130,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
     fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
     fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
     fonts-tlwg-purisa \
+    ffmpeg \
     ###
     && apt-get -qq -y autoremove \
     && apt-get clean \
@@ -121,6 +151,8 @@ RUN mkdir /models/ && \
     curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
 
 COPY requirements.txt /tmp/
+RUN pip3 install --no-cache-dir -q -U pip setuptools
+RUN pip3 install --no-binary=:pyicu: pyicu
 RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
 
 # Install spaCy models
@@ -143,7 +175,7 @@ RUN python3 -m spacy download el_core_news_sm \
 
 COPY . /ingestors
 WORKDIR /ingestors
-RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
+RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
 RUN chown -R app:app /ingestors
 
 ENV ARCHIVE_TYPE=file \

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -16,7 +16,7 @@ services:
   ingest-file:
     build:
       context: .
-    image: ghcr.io/alephdata/ingest-file
+    # image: ghcr.io/alephdata/ingest-file
     hostname: ingest
     tmpfs:
       - /tmp:mode=777

diff --git a/ingestors/analysis/extract.py b/ingestors/analysis/extract.py
@@ -3,7 +3,7 @@
 from functools import lru_cache
 from normality import collapse_spaces
 from languagecodes import list_to_alpha3
-from fingerprints import clean_entity_name
+from fingerprints import clean_entity_prefix
 from followthemoney.types import registry
 
 from ingestors import settings
@@ -27,7 +27,7 @@
 def clean_name(text):
     if text is None or len(text) > NAME_MAX_LENGTH:
         return
-    text = clean_entity_name(text)
+    text = clean_entity_prefix(text)
     text = collapse_spaces(text)
     if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
         return

diff --git a/ingestors/documents/html.py b/ingestors/documents/html.py
@@ -7,6 +7,7 @@
 
 class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
     "HTML file ingestor class. Extracts the text from the web page."
+
     MIME_TYPES = ["text/html"]
     EXTENSIONS = [
         "htm",

diff --git a/ingestors/exc.py b/ingestors/exc.py
@@ -3,6 +3,7 @@
 
 class ProcessingException(Exception):
     "A data-related error occuring during file processing."
+
     pass
 
 

diff --git a/ingestors/manager.py b/ingestors/manager.py
@@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
         now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")
 
         entity.set("processingStatus", self.STATUS_FAILURE)
-        entity.set("processingAgent", get_distribution("ingest").version)
+        entity.set("processingAgent", get_distribution("ingestors").version)
         entity.set("processedAt", now_string)
 
         ingestor_class = None

diff --git a/ingestors/media/audio.py b/ingestors/media/audio.py
@@ -1,15 +1,18 @@
 import logging
+from datetime import datetime
 from followthemoney import model
 from pymediainfo import MediaInfo
+from normality import stringify
 
 from ingestors.ingestor import Ingestor
 from ingestors.support.timestamp import TimestampSupport
 from ingestors.exc import ProcessingException
+from ingestors.support.transcription import TranscriptionSupport
 
 log = logging.getLogger(__name__)
 
 
-class AudioIngestor(Ingestor, TimestampSupport):
+class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
     MIME_TYPES = [
         "audio/mpeg",
         "audio/mp3",
@@ -55,7 +58,23 @@ def ingest(self, file_path, entity):
                     entity.add("samplingRate", track.sampling_rate)
                 entity.add("duration", track.duration)
         except Exception as ex:
-            raise ProcessingException("Could not read audio: %r", ex) from ex
+            raise ProcessingException(f"Could not read audio: {ex}") from ex
+        try:
+            start = datetime.now()
+            log.info(f"Attempting to transcribe {file_path}")
+            self.transcribe(file_path, entity)
+            elapsed_time = datetime.now() - start
+            # total_seconds() also counts whole days, so runs longer than 24h are
+            # reported correctly (timedelta.seconds alone is capped at [0, 86400))
+            elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
+            log.info(
+                f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
+            )
+        except Exception as ex:
+            # If the transcription fails, the file processing should still count as a success.
+            # The existence of a transcription is not mandatory, for now.
+            entity.set("processingError", stringify(ex))
+            log.error(ex)
 
     @classmethod
     def match(cls, file_path, entity):

diff --git a/ingestors/media/video.py b/ingestors/media/video.py
@@ -1,15 +1,18 @@
 import logging
+from datetime import datetime
 from followthemoney import model
 from pymediainfo import MediaInfo
+from normality import stringify
 
 from ingestors.ingestor import Ingestor
 from ingestors.support.timestamp import TimestampSupport
 from ingestors.exc import ProcessingException
+from ingestors.support.transcription import TranscriptionSupport
 
 log = logging.getLogger(__name__)
 
 
-class VideoIngestor(Ingestor, TimestampSupport):
+class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
     MIME_TYPES = [
         "application/x-shockwave-flash",
         "video/quicktime",
@@ -44,6 +47,23 @@ def ingest(self, file_path, entity):
                 entity.add("duration", track.duration)
         except Exception as ex:
             raise ProcessingException("Could not read video: %r", ex) from ex
+        try:
+            start = datetime.now()
+            log.info(f"Attempting to transcribe {file_path}")
+            audio_only_file = self.extract_audio(file_path)
+            self.transcribe(audio_only_file, entity)
+            elapsed_time = datetime.now() - start
+            # total_seconds() also counts whole days, so runs longer than 24h are
+            # reported correctly (timedelta.seconds alone is capped at [0, 86400))
+            elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
+            log.info(
+                f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
+            )
+        except Exception as ex:
+            # If the transcription fails, the file processing should still count as a success.
+            # The existence of a transcription is not mandatory, for now.
+            entity.set("processingError", stringify(ex))
+            log.error(ex)
 
     @classmethod
     def match(cls, file_path, entity):

diff --git a/ingestors/settings.py b/ingestors/settings.py
@@ -57,3 +57,11 @@
 SENTRY_CAPTURE_PROCESSING_EXCEPTIONS = env.to_bool(
     "SENTRY_CAPTURE_PROCESSING_EXCEPTIONS", False
 )
+
+WHISPER_MODEL = env.get("INGESTORS_WHISPER_MODEL", "ggml-medium-q8_0.bin")
+# "auto" prompts the model to detect the language
+WHISPER_LANGUAGE = env.get("INGESTORS_WHISPER_LANGUAGE", "auto")
+# timeout expressed in seconds; parsed as an int because it is used as a subprocess timeout
+WHISPER_TRANSCRIPTION_TIMEOUT = env.to_int(
+    "INGESTORS_WHISPER_TRANSCRIPTION_TIMEOUT", 60 * 60 * 2
+)
diff --git a/ingestors/support/transcription.py b/ingestors/support/transcription.py
@@ -0,0 +1,102 @@
+import json
+import logging
+import subprocess
+from pathlib import Path
+
+from ingestors import settings
+from ingestors.exc import ProcessingException
+
+log = logging.getLogger(__name__)
+
+
+class TranscriptionSupport:
+    """Provides helpers for transcribing audio and video files with whisper.cpp.
+
+    Intermediate files are written to ``WORK_DIR``. The WAV returned by
+    ``extract_audio`` is not removed here; callers should pass it to
+    ``delete_temporary_file`` once it has been transcribed.
+    """
+
+    WORK_DIR = Path("/ingestors")
+    WHISPER_CLI = "/whisper/build/bin/whisper-cli"
+    WHISPER_MODELS_DIR = Path("/whisper/models")
+
+    def _work_path(self, file_path):
+        # Use the file name up to its first dot, so suffixes added later
+        # (".wav", ".json") never replace a dotted part of the name.
+        return self.WORK_DIR / file_path.parts[-1].split(".")[0]
+
+    def extract_audio(self, file_path):
+        """Convert `file_path` to a 16 kHz mono 16-bit PCM WAV via ffmpeg.
+
+        Returns the WAV path. Raises `ProcessingException` if no file was
+        produced; errors raised by `subprocess.run` propagate unchanged.
+        """
+        audio_only_path = self._work_path(file_path).with_suffix(".wav")
+        # settings read from the environment may arrive as strings
+        timeout = int(settings.WHISPER_TRANSCRIPTION_TIMEOUT)
+        # https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#quick-start
+        # -nostdin/-y: overwrite a stale WAV instead of waiting on a prompt
+        cmd = ["ffmpeg", "-nostdin", "-y", "-i", file_path]
+        cmd += ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", audio_only_path]
+        subprocess.run(cmd, timeout=timeout, check=True)
+
+        if not audio_only_path.is_file():
+            raise ProcessingException("Audio extraction failed.")
+        return audio_only_path
+
+    def transcribe(self, file_path, entity):
+        """Transcribe `file_path` with whisper-cli and add the text to `entity`.
+
+        The timestamped segments are stored, one per line, in the entity's
+        `indexText` property. Raises `ProcessingException` when no JSON output
+        was written or it contains no segments.
+        """
+        output_path = self._work_path(file_path)
+        cmd = [
+            self.WHISPER_CLI,
+            "-m",
+            self.WHISPER_MODELS_DIR / settings.WHISPER_MODEL,
+            "-f",
+            file_path,
+            "-oj",
+            "-of",
+            output_path,
+            "-l",
+            # setting to "auto" sometimes transcribes audio in an unintended language
+            settings.WHISPER_LANGUAGE,
+        ]
+        log.info(cmd)
+        timeout = int(settings.WHISPER_TRANSCRIPTION_TIMEOUT)
+        subprocess.run(cmd, timeout=timeout, check=True)
+
+        # if the transcription succeeded, the output is written to a JSON
+        output_path = output_path.with_suffix(".json")
+        if not output_path.is_file():
+            raise ProcessingException(
+                f"Transcription failed. The file type might be unsupported for {file_path.parts[-1]}."
+            )
+        try:
+            # JSON is UTF-8; do not rely on the locale's default encoding
+            with open(output_path, "r", encoding="utf-8") as f:
+                transcription_dict = json.load(f)
+        finally:
+            # the JSON is removed even when it cannot be read or parsed
+            self.delete_temporary_file(output_path)
+
+        transcription_intervals = transcription_dict.get("transcription")
+        if not transcription_intervals:
+            raise ProcessingException(
+                f"Transcription failed, no output in file {output_path}."
+            )
+        # newline-separated so consecutive segments do not run into each other
+        segments = (
+            f"[{interval['timestamps']['from']} -> {interval['timestamps']['to']}] {interval['text'].strip()}"
+            for interval in transcription_intervals
+        )
+        entity.add("indexText", "\n".join(segments))
+
+    def delete_temporary_file(self, file_path):
+        """Delete `file_path` if it is a regular file; missing files are ignored."""
+        if file_path.is_file():
+            file_path.unlink(missing_ok=True)
diff --git a/ingestors/support/xml.py b/ingestors/support/xml.py
@@ -17,7 +17,7 @@ def get_xml_parser(self, **kwargs):
             recover=True,
             resolve_entities=False,
             no_network=True,
-            **kwargs
+            **kwargs,
         )
 
     def parse_xml_path(self, file_path, **kwargs):
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,6 +3,7 @@

		class ProcessingException(Exception):
		"A data-related error occuring during file processing."

		pass


Expand Down