Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
.git
.github
LICENSE
README.md
__pycache__
convert
docker-compose.yml
docker-compose.yml
11 changes: 8 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.9-bookworm
FROM python:3.11-slim
ENV DEBIAN_FRONTEND noninteractive

LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
Expand All @@ -10,7 +10,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
&& apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales \
# python deps (mostly to install their dependencies)
python3-dev \
git python3-dev \
pkg-config libicu-dev \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev \
# libraries
Expand All @@ -24,6 +25,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
libtiff5-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils libgif-dev \
# necessary for python-magic
libmagic1 \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
Expand Down Expand Up @@ -121,6 +124,8 @@ RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir -q -U pip setuptools
RUN pip3 install --no-binary=:pyicu: pyicu
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
Expand All @@ -143,7 +148,7 @@ RUN python3 -m spacy download el_core_news_sm \

COPY . /ingestors
WORKDIR /ingestors
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN chown -R app:app /ingestors

ENV ARCHIVE_TYPE=file \
Expand Down
6 changes: 1 addition & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,10 @@ services:
image: redis:alpine
command: ["redis-server", "--save", "3600", "10"]

rabbitmq:
image: rabbitmq:3.9-management-alpine

ingest-file:
build:
context: .
image: ghcr.io/alephdata/ingest-file
# image: ghcr.io/alephdata/ingest-file
hostname: ingest
tmpfs:
- /tmp:mode=777
Expand All @@ -34,4 +31,3 @@ services:
depends_on:
- postgres
- redis
- rabbitmq
4 changes: 2 additions & 2 deletions ingestors/analysis/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import lru_cache
from normality import collapse_spaces
from languagecodes import list_to_alpha3
from fingerprints import clean_entity_name
from fingerprints import clean_entity_prefix
from followthemoney.types import registry

from ingestors import settings
Expand All @@ -27,7 +27,7 @@
def clean_name(text):
if text is None or len(text) > NAME_MAX_LENGTH:
return
text = clean_entity_name(text)
text = clean_entity_prefix(text)
text = collapse_spaces(text)
if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
return
Expand Down
1 change: 1 addition & 0 deletions ingestors/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
"HTML file ingestor class. Extracts the text from the web page."

MIME_TYPES = ["text/html"]
EXTENSIONS = [
"htm",
Expand Down
1 change: 1 addition & 0 deletions ingestors/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

class ProcessingException(Exception):
    """A data-related error occurring during file processing."""

    pass


Expand Down
2 changes: 1 addition & 1 deletion ingestors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")

entity.set("processingStatus", self.STATUS_FAILURE)
entity.set("processingAgent", get_distribution("ingest").version)
entity.set("processingAgent", get_distribution("ingestors").version)
entity.set("processedAt", now_string)

ingestor_class = None
Expand Down
7 changes: 6 additions & 1 deletion ingestors/media/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from ingestors.ingestor import Ingestor
from ingestors.support.timestamp import TimestampSupport
from ingestors.exc import ProcessingException
from ingestors.support.transcription import TranscriptionSupport

log = logging.getLogger(__name__)


class AudioIngestor(Ingestor, TimestampSupport):
class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
MIME_TYPES = [
"audio/mpeg",
"audio/mp3",
Expand Down Expand Up @@ -54,6 +55,10 @@ def ingest(self, file_path, entity):
if track.sampling_rate:
entity.add("samplingRate", track.sampling_rate)
entity.add("duration", track.duration)
try:
self.transcribe(file_path, entity)
except Exception as ex:
log.error(f"Could not transcribe audio to text. {ex}")
except Exception as ex:
raise ProcessingException("Could not read audio: %r", ex) from ex

Expand Down
8 changes: 6 additions & 2 deletions ingestors/media/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from ingestors.ingestor import Ingestor
from ingestors.support.timestamp import TimestampSupport
from ingestors.exc import ProcessingException
from ingestors.support.transcription import TranscriptionSupport

log = logging.getLogger(__name__)


class VideoIngestor(Ingestor, TimestampSupport):
class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
MIME_TYPES = [
"application/x-shockwave-flash",
"video/quicktime",
Expand All @@ -29,7 +30,6 @@ class VideoIngestor(Ingestor, TimestampSupport):
def ingest(self, file_path, entity):
try:
entity.schema = model.get("Video")
log.info("[%r] flagged as video.", entity)
metadata = MediaInfo.parse(file_path)
for track in metadata.tracks:
entity.add("title", track.title)
Expand All @@ -42,6 +42,10 @@ def ingest(self, file_path, entity):
modified_at = self.parse_timestamp(track.file_last_modification_date)
entity.add("modifiedAt", modified_at)
entity.add("duration", track.duration)
try:
self.transcribe(file_path, entity)
except Exception as ex:
log.error(f"Could not transcribe audio to text. {ex}")
except Exception as ex:
raise ProcessingException("Could not read video: %r", ex) from ex

Expand Down
28 changes: 28 additions & 0 deletions ingestors/support/transcription-whisper-openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import logging

import whisper

log = logging.getLogger(__name__)

MODEL_SIZE = "medium"

class TranscriptionSupport:
    """Provides a helper for transcribing audio and video files.

    Uses the OpenAI Whisper reference implementation: the model is loaded
    on every call (no caching), audio is padded/trimmed to Whisper's fixed
    30-second context window, and the transcript is appended to the
    entity's ``bodyText`` property.
    """

    def transcribe(self, file_path, entity):
        """Transcribe speech from ``file_path`` and store the text on ``entity``.

        beam_size: https://stackoverflow.com/questions/22273119/what-does-the-beam-size-represent-in-the-beam-search-algorithm
        """
        # Progress messages are routine diagnostics, not emergencies:
        # use debug instead of critical so they don't flood alerting.
        log.debug("Loading Whisper model (size=%s)", MODEL_SIZE)
        model = whisper.load_model(MODEL_SIZE)
        log.debug("Loading audio file from: %s", file_path)
        audio = whisper.load_audio(file_path)
        # Whisper operates on fixed 30s windows; pad or trim to fit.
        audio = whisper.pad_or_trim(audio)
        log.debug("Transcribing: %s", file_path)
        result = model.transcribe(audio, verbose=True)

        # TODO chunking https://stackoverflow.com/a/57126101
        # can it return time stamps?

        # model.transcribe() returns a dict; the transcript lives under
        # the "text" key. The original `result.text` attribute access
        # would raise AttributeError on every call.
        entity.add("bodyText", result["text"])
50 changes: 50 additions & 0 deletions ingestors/support/transcription.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import gc
import logging

from faster_whisper import WhisperModel

log = logging.getLogger(__name__)

MODEL_SIZE = "large-v3"

class TranscriptionSupport:
    """Provides a helper for transcribing audio and video files.

    Backed by faster-whisper (CTranslate2). The model is created per call
    and explicitly torn down afterwards to release memory, since a cached
    model would otherwise keep gigabytes resident between ingests.
    """

    def transcribe(self, file_path, entity):
        """Transcribe ``file_path`` and add timestamped text to ``entity``.

        A description of the arguments for the WhisperModel init:
        https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py#L603
        beam_size: https://stackoverflow.com/questions/22273119/what-does-the-beam-size-represent-in-the-beam-search-algorithm
        """

        self.model = None

        try:
            # compute_type="float32"
            self.model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8", cpu_threads=1, num_workers=1)
            log.info("Transcription model initialized successfully.")

            segments, _ = self.model.transcribe(file_path, vad_filter=True, beam_size=5, no_speech_threshold=0.6, condition_on_previous_text=False)

            for segment in segments:
                entity.add("bodyText", f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
                log.info(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
        finally:
            self._del_model()

    def _del_model(self):
        # https://github.com/SYSTRAN/faster-whisper/issues/660
        # Guard against a failed init: if WhisperModel() raised above,
        # self.model is still None and the original unconditional
        # `self.model.model.unload_model()` would raise AttributeError
        # from the finally block, masking the real error.
        if self.model is None:
            return

        # Check hasattr BEFORE dereferencing — the original called
        # unload_model() first and only then guarded the deletion.
        if hasattr(self.model, "model"):
            self.model.model.unload_model()
            del self.model.model
        if hasattr(self.model, "feature_extractor"):
            del self.model.feature_extractor
        if hasattr(self.model, "hf_tokenizer"):
            del self.model.hf_tokenizer

        del self.model
        self.model = None

        gc.collect()

        log.info("Transcription model removed from memory.")
2 changes: 1 addition & 1 deletion ingestors/support/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_xml_parser(self, **kwargs):
recover=True,
resolve_entities=False,
no_network=True,
**kwargs
**kwargs,
)

def parse_xml_path(self, file_path, **kwargs):
Expand Down
Loading
Loading