Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
.git
.github
LICENSE
README.md
__pycache__
convert
docker-compose.yml
docker-compose.yml
46 changes: 39 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,43 @@
FROM python:3.9-bookworm
ENV DEBIAN_FRONTEND noninteractive
#### BUILD WHISPER.CPP
#----------------------------------
FROM nvidia/cuda:11.6.2-devel-ubuntu20.04 AS build

LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
LABEL org.opencontainers.image.licenses MIT
LABEL org.opencontainers.image.source https://github.com/alephdata/ingest-file
WORKDIR /usr/local/src
# Toolchain needed to fetch and compile whisper.cpp in this build stage.
# - `apt-get` is used instead of `apt`, whose CLI is not meant for scripts.
# - ca-certificates is listed explicitly because --no-install-recommends
#   would otherwise drop it, and the HTTPS clone/model download need it.
# - the apt lists are removed in the same layer that created them.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        cmake \
        ffmpeg \
        g++ \
        git \
        make \
        wget \
    && rm -rf /var/lib/apt/lists/*
RUN git clone https://github.com/ggml-org/whisper.cpp --depth 1

# whisper.cpp setup
WORKDIR /usr/local/src/whisper.cpp
# WHISPER_CUBLAS=0 is passed to the whisper.cpp build unchanged.
RUN WHISPER_CUBLAS=0 make -j
# Fetch the model file that the application loads by default.
RUN bash ./models/download-ggml-model.sh medium-q8_0

#### copy the compiled binaries to the image for prod
# the image above will be discarded
# ----------------------------------
FROM python:3.11-slim

# copy whisper
COPY --from=build /usr/local/src/whisper.cpp /whisper
COPY --from=build /lib/*/libgomp.so.1 /whisper/build/src

# fix some libs
ENV LD_LIBRARY_PATH=/whisper/build/src/:/whisper/build/ggml/src/

# ingest-file
ENV DEBIAN_FRONTEND="noninteractive"

LABEL org.opencontainers.image.title="FollowTheMoney File Ingestors"
LABEL org.opencontainers.image.licenses="MIT"
LABEL org.opencontainers.image.source="https://github.com/alephdata/ingest-file"

# Enable non-free archive for `unrar`.
RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/sources.list.d/nonfree.list \
&& apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales \
# python deps (mostly to install their dependencies)
python3-dev \
git python3-dev \
pkg-config libicu-dev \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev \
# libraries
Expand All @@ -24,6 +51,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
libtiff5-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils libgif-dev \
# necessary for python-magic
libmagic1 \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
Expand Down Expand Up @@ -101,6 +130,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
fonts-tlwg-purisa \
ffmpeg \
###
&& apt-get -qq -y autoremove \
&& apt-get clean \
Expand All @@ -121,6 +151,8 @@ RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

COPY requirements.txt /tmp/
# Upgrade the packaging tools before installing anything else.
RUN pip3 install --no-cache-dir -q -U pip setuptools
# Build pyicu from source. --no-binary takes plain package names (or the
# special values :all: / :none:), so the name is given without colons.
RUN pip3 install --no-cache-dir --no-binary=pyicu pyicu
# tesserocr is also built from source; everything else comes from requirements.txt.
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
Expand All @@ -143,7 +175,7 @@ RUN python3 -m spacy download el_core_news_sm \

COPY . /ingestors
WORKDIR /ingestors
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN chown -R app:app /ingestors

ENV ARCHIVE_TYPE=file \
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ services:
ingest-file:
build:
context: .
image: ghcr.io/alephdata/ingest-file
# image: ghcr.io/alephdata/ingest-file
hostname: ingest
tmpfs:
- /tmp:mode=777
Expand Down
4 changes: 2 additions & 2 deletions ingestors/analysis/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import lru_cache
from normality import collapse_spaces
from languagecodes import list_to_alpha3
from fingerprints import clean_entity_name
from fingerprints import clean_entity_prefix
from followthemoney.types import registry

from ingestors import settings
Expand All @@ -27,7 +27,7 @@
def clean_name(text):
if text is None or len(text) > NAME_MAX_LENGTH:
return
text = clean_entity_name(text)
text = clean_entity_prefix(text)
text = collapse_spaces(text)
if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
return
Expand Down
1 change: 1 addition & 0 deletions ingestors/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
"HTML file ingestor class. Extracts the text from the web page."

MIME_TYPES = ["text/html"]
EXTENSIONS = [
"htm",
Expand Down
1 change: 1 addition & 0 deletions ingestors/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

class ProcessingException(Exception):
"A data-related error occuring during file processing."

pass


Expand Down
2 changes: 1 addition & 1 deletion ingestors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")

entity.set("processingStatus", self.STATUS_FAILURE)
entity.set("processingAgent", get_distribution("ingest").version)
entity.set("processingAgent", get_distribution("ingestors").version)
entity.set("processedAt", now_string)

ingestor_class = None
Expand Down
23 changes: 21 additions & 2 deletions ingestors/media/audio.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import logging
from datetime import datetime
from followthemoney import model
from pymediainfo import MediaInfo
from normality import stringify

from ingestors.ingestor import Ingestor
from ingestors.support.timestamp import TimestampSupport
from ingestors.exc import ProcessingException
from ingestors.support.transcription import TranscriptionSupport

log = logging.getLogger(__name__)


class AudioIngestor(Ingestor, TimestampSupport):
class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
MIME_TYPES = [
"audio/mpeg",
"audio/mp3",
Expand Down Expand Up @@ -55,7 +58,23 @@ def ingest(self, file_path, entity):
entity.add("samplingRate", track.sampling_rate)
entity.add("duration", track.duration)
except Exception as ex:
raise ProcessingException("Could not read audio: %r", ex) from ex
raise ProcessingException(f"Could not read audio: {ex}") from ex
try:
start = datetime.now()
log.info(f"Attempting to transcribe {file_path}")
self.transcribe(file_path, entity)
elapsed_time = datetime.now() - start
# total_seconds() (unlike timedelta.seconds, which wraps at 24h) covers the
# whole elapsed interval, so durations longer than a day are reported correctly
elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
log.info(
f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
)
except Exception as ex:
# If the transcription fails, the file processing should still count as a success.
# The existence of a transcription is not mandatory, for now.
entity.set("processingError", stringify(ex))
log.error(ex)

@classmethod
def match(cls, file_path, entity):
Expand Down
22 changes: 21 additions & 1 deletion ingestors/media/video.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import logging
from datetime import datetime
from followthemoney import model
from pymediainfo import MediaInfo
from normality import stringify

from ingestors.ingestor import Ingestor
from ingestors.support.timestamp import TimestampSupport
from ingestors.exc import ProcessingException
from ingestors.support.transcription import TranscriptionSupport

log = logging.getLogger(__name__)


class VideoIngestor(Ingestor, TimestampSupport):
class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
MIME_TYPES = [
"application/x-shockwave-flash",
"video/quicktime",
Expand Down Expand Up @@ -44,6 +47,23 @@ def ingest(self, file_path, entity):
entity.add("duration", track.duration)
except Exception as ex:
raise ProcessingException("Could not read video: %r", ex) from ex
try:
start = datetime.now()
log.info(f"Attempting to transcribe {file_path}")
audio_only_file = self.extract_audio(file_path)
self.transcribe(audio_only_file, entity)
elapsed_time = datetime.now() - start
# total_seconds() (unlike timedelta.seconds, which wraps at 24h) covers the
# whole elapsed interval, so durations longer than a day are reported correctly
elapsed_time = divmod(elapsed_time.total_seconds(), 60)[0]
log.info(
f"Transcription duration: {elapsed_time} minutes (audio duration: {entity.get('duration')})"
)
except Exception as ex:
# If the transcription fails, the file processing should still count as a success.
# The existence of a transcription is not mandatory, for now.
entity.set("processingError", stringify(ex))
log.error(ex)

@classmethod
def match(cls, file_path, entity):
Expand Down
8 changes: 8 additions & 0 deletions ingestors/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,11 @@
SENTRY_CAPTURE_PROCESSING_EXCEPTIONS = env.to_bool(
"SENTRY_CAPTURE_PROCESSING_EXCEPTIONS", False
)

# File name of the whisper model to load.
WHISPER_MODEL = env.get("INGESTORS_WHISPER_MODEL", "ggml-medium-q8_0.bin")
# "auto" prompts the model to detect the language
WHISPER_LANGUAGE = env.get("INGESTORS_WHISPER_LANGUAGE", "auto")
# Timeout expressed in seconds. Cast to int here so the setting has the same
# type whether it comes from the default or from the environment, where the
# value would otherwise be a string.
WHISPER_TRANSCRIPTION_TIMEOUT = int(
    env.get("INGESTORS_WHISPER_TRANSCRIPTION_TIMEOUT", 60 * 60 * 2)
)
102 changes: 102 additions & 0 deletions ingestors/support/transcription.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import json
import logging
import subprocess
from pathlib import Path

from ingestors import settings
from ingestors.exc import ProcessingException

log = logging.getLogger(__name__)


class TranscriptionSupport:
    """Provides a helper for transcribing audio and video files.

    Audio is extracted with ``ffmpeg`` and transcribed with the
    ``whisper-cli`` binary; both are run as subprocesses bounded by
    ``settings.WHISPER_TRANSCRIPTION_TIMEOUT`` seconds.
    """

    def extract_audio(self, file_path):
        """Convert ``file_path`` into a 16 kHz mono 16-bit PCM WAV file.

        Args:
            file_path: path of the media file to convert.

        Returns:
            Path of the WAV file written under ``/ingestors``.

        Raises:
            subprocess.CalledProcessError: ffmpeg exited with a non-zero code.
            subprocess.TimeoutExpired: ffmpeg exceeded the configured timeout.
            ProcessingException: ffmpeg succeeded but produced no file.
        """
        file_path = Path(file_path)
        # The output name is the input name up to its first dot.
        # NOTE(review): inputs sharing that prefix map to the same output
        # path - confirm callers never process such files concurrently.
        audio_only_path = Path("/ingestors") / file_path.parts[-1].split(".")[0]
        audio_only_path = audio_only_path.with_suffix(".wav")

        # https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#quick-start
        cmd = [
            "ffmpeg",
            # Never wait for interactive input and overwrite a stale output
            # file instead of prompting, so the call cannot hang on a prompt.
            "-nostdin",
            "-y",
            "-i",
            file_path,
            "-ar",
            "16000",
            "-ac",
            "1",
            "-c:a",
            "pcm_s16le",
            audio_only_path,
        ]

        # The setting may be a string when it comes from the environment.
        subprocess.run(
            cmd, timeout=int(settings.WHISPER_TRANSCRIPTION_TIMEOUT), check=True
        )

        if not audio_only_path.is_file():
            raise ProcessingException("Audio extraction failed.")

        return audio_only_path

    def transcribe(self, file_path, entity):
        """Transcribe ``file_path`` and add the text to ``entity``.

        The transcription is written by ``whisper-cli`` to a JSON file under
        ``/ingestors``, read back, and stored in the entity's ``indexText``
        property with one ``[from -> to] text`` line per segment. The JSON
        file is removed afterwards, whether or not reading it succeeded.

        Args:
            file_path: path of the audio file to transcribe.
            entity: object whose ``add`` method receives the transcription.

        Raises:
            subprocess.CalledProcessError: whisper-cli exited with a non-zero code.
            subprocess.TimeoutExpired: whisper-cli exceeded the configured timeout.
            ProcessingException: no JSON output was produced or it was empty.
        """
        file_path = Path(file_path)
        model = settings.WHISPER_MODEL

        models_path = Path("/whisper/models")

        # Output path without extension; the JSON file is looked up below
        # with a ".json" suffix.
        output_path = Path("/ingestors") / file_path.parts[-1].split(".")[0]

        cmd = [
            "/whisper/build/bin/whisper-cli",
            "-m",
            models_path / model,
            "-f",
            file_path,
            "-oj",
            "-of",
            output_path,
            "-l",
            # setting to "auto" sometimes transcribes audio in an unintended language
            settings.WHISPER_LANGUAGE,
        ]

        log.info(cmd)
        subprocess.run(
            cmd, timeout=int(settings.WHISPER_TRANSCRIPTION_TIMEOUT), check=True
        )

        # if the transcription succeeded, the output is written to a JSON
        output_path = output_path.with_suffix(".json")
        if not output_path.is_file():
            raise ProcessingException(
                f"Transcription failed. The file type might be unsupported for {file_path.parts[-1]}."
            )

        try:
            with open(output_path, "r", encoding="utf-8") as f:
                transcription_dict = json.load(f)
        finally:
            # Remove the temporary file even when it cannot be parsed.
            self.delete_temporary_file(output_path)

        transcription_intervals = transcription_dict.get("transcription")
        if not transcription_intervals:
            raise ProcessingException(
                f"Transcription failed, no output in file {output_path}."
            )

        entity.add("indexText", self._format_intervals(transcription_intervals))

    @staticmethod
    def _format_intervals(intervals):
        """Render transcription segments as one ``[from -> to] text`` line each."""
        return "\n".join(
            f"[{interval['timestamps']['from']} -> {interval['timestamps']['to']}] "
            f"{interval['text'].strip()}"
            for interval in intervals
        )

    def delete_temporary_file(self, file_path):
        """Delete ``file_path`` if it is an existing regular file; otherwise do nothing."""
        file_path = Path(file_path)
        if not file_path.is_file():
            return

        file_path.unlink()
2 changes: 1 addition & 1 deletion ingestors/support/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_xml_parser(self, **kwargs):
recover=True,
resolve_entities=False,
no_network=True,
**kwargs
**kwargs,
)

def parse_xml_path(self, file_path, **kwargs):
Expand Down
Loading