openaleph
diff --git a/‎.dockerignore‎
Lines changed: 1 addition & 3 deletions b/‎.dockerignore‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎Dockerfile‎
Lines changed: 39 additions & 7 deletions b/‎Dockerfile‎
Lines changed: 39 additions & 7 deletions
diff --git a/‎docker-compose.yml‎
Lines changed: 1 addition & 1 deletion b/‎docker-compose.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ingestors/analysis/extract.py‎
Lines changed: 2 additions & 2 deletions b/‎ingestors/analysis/extract.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ingestors/documents/html.py‎
Lines changed: 1 addition & 0 deletions b/‎ingestors/documents/html.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ingestors/exc.py‎
Lines changed: 1 addition & 0 deletions b/‎ingestors/exc.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎ingestors/manager.py‎
Lines changed: 1 addition & 1 deletion b/‎ingestors/manager.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ingestors/media/audio.py‎
Lines changed: 21 additions & 2 deletions b/‎ingestors/media/audio.py‎
Lines changed: 21 additions & 2 deletions
diff --git a/‎ingestors/media/video.py‎
Lines changed: 21 additions & 1 deletion b/‎ingestors/media/video.py‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎ingestors/settings.py‎
Lines changed: 8 additions & 0 deletions b/‎ingestors/settings.py‎
Lines changed: 8 additions & 0 deletions
@@ -1,7 +1,5 @@
 .git
 .github
-LICENSE
-README.md
 __pycache__
 convert
-docker-compose.yml
+docker-compose.yml
@@ -1,16 +1,43 @@
-FROM python:3.9-bookworm
-ENV DEBIAN_FRONTEND noninteractive
+#### BUILD WHISPER.CPP
+#----------------------------------
+FROM nvidia/cuda:11.6.2-devel-ubuntu20.04 AS build
 
-LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
-LABEL org.opencontainers.image.licenses MIT
-LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
+WORKDIR /usr/local/src
+RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \
+        bash git make wget g++ ffmpeg cmake
+RUN git clone https://github.com/ggml-org/whisper.cpp --depth 1
+
+# whisper.cpp setup
+WORKDIR /usr/local/src/whisper.cpp
+RUN WHISPER_CUBLAS=0 make -j
+RUN bash ./models/download-ggml-model.sh medium-q8_0
+
+#### copy the compiled binaries to the image for prod
+# the image above will be discarded
+# ----------------------------------
+FROM python:3.11-slim
+
+# copy the whisper.cpp tree (binaries + downloaded model) from the build stage
+COPY --from=build /usr/local/src/whisper.cpp /whisper
+COPY --from=build /lib/*/libgomp.so.1 /whisper/build/src
+
+# let the dynamic linker find the whisper/ggml shared libs and the copied libgomp
+ENV LD_LIBRARY_PATH=/whisper/build/src/:/whisper/build/ggml/src/
+
+# ingest-file
+ENV DEBIAN_FRONTEND="noninteractive"
+
+LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
+LABEL org.opencontainers.image.licenses="MIT"
+LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"
 
 # Enable non-free archive for `unrar`.
 RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
     && apt-get -qq -y update \
     && apt-get -qq -y install build-essential locales \
     # python deps (mostly to install their dependencies)
-    python3-dev \
+    git python3-dev \
+    pkg-config libicu-dev \
     # tesseract
     tesseract-ocr libtesseract-dev libleptonica-dev \
     # libraries
@@ -24,6 +51,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
     libtiff5-dev \
     libtiff-tools ghostscript librsvg2-bin jbig2dec \
     pst-utils libgif-dev \
+    # necessary for python-magic
+    libmagic1 \
     ### tesseract
     tesseract-ocr-eng \
     tesseract-ocr-swa \
@@ -101,6 +130,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
     fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
     fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
     fonts-tlwg-purisa \
+    ffmpeg \
     ###
     && apt-get -qq -y autoremove \
     && apt-get clean \
@@ -121,6 +151,8 @@ RUN mkdir /models/ && \
     curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
 
 COPY requirements.txt /tmp/
+RUN pip3 install --no-cache-dir -q -U pip setuptools
+RUN pip3 install --no-binary=:pyicu: pyicu
 RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
 
 # Install spaCy models
@@ -143,7 +175,7 @@ RUN python3 -m spacy download el_core_news_sm \
 
 COPY . /ingestors
 WORKDIR /ingestors
-RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
+RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
 RUN chown -R app:app /ingestors
 
 ENV ARCHIVE_TYPE=file \
 
@@ -16,7 +16,7 @@ services:
   ingest-file:
     build:
       context: .
-    image: ghcr.io/alephdata/ingest-file
+    # image: ghcr.io/alephdata/ingest-file
     hostname: ingest
     tmpfs:
       - /tmp:mode=777
 
@@ -3,7 +3,7 @@
 from functools import lru_cache
 from normality import collapse_spaces
 from languagecodes import list_to_alpha3
-from fingerprints import clean_entity_name
+from fingerprints import clean_entity_prefix
 from followthemoney.types import registry
 
 from ingestors import settings
@@ -27,7 +27,7 @@
 def clean_name(text):
     if text is None or len(text) > NAME_MAX_LENGTH:
         return
-    text = clean_entity_name(text)
+    text = clean_entity_prefix(text)
     text = collapse_spaces(text)
     if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
         return
 
@@ -7,6 +7,7 @@
 
 class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
     "HTML file ingestor class. Extracts the text from the web page."
+
     MIME_TYPES = ["text/html"]
     EXTENSIONS = [
         "htm",
 
@@ -3,6 +3,7 @@
 
 class ProcessingException(Exception):
     "A data-related error occuring during file processing."
+
     pass
 
 
 
@@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
         now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")
 
         entity.set("processingStatus", self.STATUS_FAILURE)
-        entity.set("processingAgent", get_distribution("ingest").version)
+        entity.set("processingAgent", get_distribution("ingestors").version)
         entity.set("processedAt", now_string)
 
         ingestor_class = None
 
@@ -1,15 +1,18 @@
 import logging
+from datetime import datetime
 from followthemoney import model
 from pymediainfo import MediaInfo
+from normality import stringify
 
 from ingestors.ingestor import Ingestor
 from ingestors.support.timestamp import TimestampSupport
 from ingestors.exc import ProcessingException
+from ingestors.support.transcription import TranscriptionSupport
 
 log = logging.getLogger(__name__)
 
 
-class AudioIngestor(Ingestor, TimestampSupport):
+class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
     MIME_TYPES = [
         "audio/mpeg",
         "audio/mp3",
@@ -55,7 +58,23 @@ def ingest(self, file_path, entity):
                     entity.add("samplingRate", track.sampling_rate)
                 entity.add("duration", track.duration)
         except Exception as ex:
-            raise ProcessingException("Could not read audio: %r", ex) from ex
+            raise ProcessingException(f"Could not read audio: {ex}") from ex
+        try:
+            start = datetime.now()
+            log.info(f"Attempting to transcribe {file_path}")
+            self.transcribe(file_path, entity)
+            elapsed_time = datetime.now() - start
+            # total_seconds() is used instead of timedelta.seconds: the latter is
+            # capped at [0, 86400) and would wrap for runs longer than 24h
+            elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
+            log.info(
+                f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
+            )
+        except Exception as ex:
+            # If the transcription fails, the file processing should still count as a success.
+            # The existence of a transcription is not mandatory, for now.
+            entity.set("processingError", stringify(ex))
+            log.error(ex)
 
     @classmethod
     def match(cls, file_path, entity):
 
@@ -1,15 +1,18 @@
 import logging
+from datetime import datetime
 from followthemoney import model
 from pymediainfo import MediaInfo
+from normality import stringify
 
 from ingestors.ingestor import Ingestor
 from ingestors.support.timestamp import TimestampSupport
 from ingestors.exc import ProcessingException
+from ingestors.support.transcription import TranscriptionSupport
 
 log = logging.getLogger(__name__)
 
 
-class VideoIngestor(Ingestor, TimestampSupport):
+class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
     MIME_TYPES = [
         "application/x-shockwave-flash",
         "video/quicktime",
@@ -44,6 +47,23 @@ def ingest(self, file_path, entity):
                 entity.add("duration", track.duration)
         except Exception as ex:
             raise ProcessingException("Could not read video: %r", ex) from ex
+        try:
+            start = datetime.now()
+            log.info(f"Attempting to transcribe {file_path}")
+            audio_only_file = self.extract_audio(file_path)
+            self.transcribe(audio_only_file, entity)
+            elapsed_time = datetime.now() - start
+            # total_seconds() is used instead of timedelta.seconds: the latter is
+            # capped at [0, 86400) and would wrap for runs longer than 24h
+            elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
+            log.info(
+                f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
+            )
+        except Exception as ex:
+            # If the transcription fails, the file processing should still count as a success.
+            # The existence of a transcription is not mandatory, for now.
+            entity.set("processingError", stringify(ex))
+            log.error(ex)
 
     @classmethod
     def match(cls, file_path, entity):
 
@@ -57,3 +57,11 @@
 SENTRY_CAPTURE_PROCESSING_EXCEPTIONS = env.to_bool(
     "SENTRY_CAPTURE_PROCESSING_EXCEPTIONS", False
 )
+
+WHISPER_MODEL = env.get("INGESTORS_WHISPER_MODEL", "ggml-medium-q8_0.bin")
+# "auto" prompts the model to detect the language
+WHISPER_LANGUAGE = env.get("INGESTORS_WHISPER_LANGUAGE", "auto")
+# timeout expressed in seconds; parsed as int so an env override is not a str
+WHISPER_TRANSCRIPTION_TIMEOUT = env.to_int(
+    "INGESTORS_WHISPER_TRANSCRIPTION_TIMEOUT", 60 * 60 * 2
+)