Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
.git
.github
LICENSE
README.md
__pycache__
convert
docker-compose.yml
docker-compose.yml
11 changes: 8 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.9-bookworm
FROM python:3.11-slim
ENV DEBIAN_FRONTEND noninteractive

LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
Expand All @@ -10,7 +10,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
&& apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales \
# python deps (mostly to install their dependencies)
python3-dev \
git python3-dev \
pkg-config libicu-dev \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev \
# libraries
Expand All @@ -24,6 +25,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
libtiff5-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils libgif-dev \
# necessary for python-magic
libmagic1 \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
Expand Down Expand Up @@ -121,6 +124,8 @@ RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir -q -U pip setuptools
RUN pip3 install --no-binary=:pyicu: pyicu
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
Expand All @@ -143,7 +148,7 @@ RUN python3 -m spacy download el_core_news_sm \

COPY . /ingestors
WORKDIR /ingestors
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN chown -R app:app /ingestors

ENV ARCHIVE_TYPE=file \
Expand Down
6 changes: 1 addition & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,10 @@ services:
image: redis:alpine
command: ["redis-server", "--save", "3600", "10"]

rabbitmq:
image: rabbitmq:3.9-management-alpine

ingest-file:
build:
context: .
image: ghcr.io/alephdata/ingest-file
# image: ghcr.io/alephdata/ingest-file
hostname: ingest
tmpfs:
- /tmp:mode=777
Expand All @@ -34,4 +31,3 @@ services:
depends_on:
- postgres
- redis
- rabbitmq
4 changes: 2 additions & 2 deletions ingestors/analysis/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import lru_cache
from normality import collapse_spaces
from languagecodes import list_to_alpha3
from fingerprints import clean_entity_name
from fingerprints import clean_entity_prefix
from followthemoney.types import registry

from ingestors import settings
Expand All @@ -27,7 +27,7 @@
def clean_name(text):
if text is None or len(text) > NAME_MAX_LENGTH:
return
text = clean_entity_name(text)
text = clean_entity_prefix(text)
text = collapse_spaces(text)
if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
return
Expand Down
1 change: 1 addition & 0 deletions ingestors/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
"HTML file ingestor class. Extracts the text from the web page."

MIME_TYPES = ["text/html"]
EXTENSIONS = [
"htm",
Expand Down
1 change: 1 addition & 0 deletions ingestors/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

class ProcessingException(Exception):
    """A data-related error occurring during file processing."""

    pass


Expand Down
2 changes: 1 addition & 1 deletion ingestors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")

entity.set("processingStatus", self.STATUS_FAILURE)
entity.set("processingAgent", get_distribution("ingest").version)
entity.set("processingAgent", get_distribution("ingestors").version)
entity.set("processedAt", now_string)

ingestor_class = None
Expand Down
7 changes: 6 additions & 1 deletion ingestors/media/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from ingestors.ingestor import Ingestor
from ingestors.support.timestamp import TimestampSupport
from ingestors.exc import ProcessingException
from ingestors.support.transcription import TranscriptionSupport

log = logging.getLogger(__name__)


class AudioIngestor(Ingestor, TimestampSupport):
class AudioIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
MIME_TYPES = [
"audio/mpeg",
"audio/mp3",
Expand Down Expand Up @@ -54,6 +55,10 @@ def ingest(self, file_path, entity):
if track.sampling_rate:
entity.add("samplingRate", track.sampling_rate)
entity.add("duration", track.duration)
try:
self.transcribe(file_path, entity)
except Exception as ex:
log.error(f"Could not transcribe audio to text. {ex}")
except Exception as ex:
raise ProcessingException("Could not read audio: %r", ex) from ex

Expand Down
8 changes: 6 additions & 2 deletions ingestors/media/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,12 @@
from ingestors.ingestor import Ingestor
from ingestors.support.timestamp import TimestampSupport
from ingestors.exc import ProcessingException
from ingestors.support.transcription import TranscriptionSupport

log = logging.getLogger(__name__)


class VideoIngestor(Ingestor, TimestampSupport):
class VideoIngestor(Ingestor, TimestampSupport, TranscriptionSupport):
MIME_TYPES = [
"application/x-shockwave-flash",
"video/quicktime",
Expand All @@ -29,7 +30,6 @@ class VideoIngestor(Ingestor, TimestampSupport):
def ingest(self, file_path, entity):
try:
entity.schema = model.get("Video")
log.info("[%r] flagged as video.", entity)
metadata = MediaInfo.parse(file_path)
for track in metadata.tracks:
entity.add("title", track.title)
Expand All @@ -42,6 +42,10 @@ def ingest(self, file_path, entity):
modified_at = self.parse_timestamp(track.file_last_modification_date)
entity.add("modifiedAt", modified_at)
entity.add("duration", track.duration)
try:
self.transcribe(file_path, entity)
except Exception as ex:
log.error(f"Could not transcribe audio to text. {ex}")
except Exception as ex:
raise ProcessingException("Could not read video: %r", ex) from ex

Expand Down
28 changes: 28 additions & 0 deletions ingestors/support/transcription-whisper-openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import logging

import whisper

log = logging.getLogger(__name__)

MODEL_SIZE = "medium"

class TranscriptionSupport:
    """Provides a helper for transcribing audio and video files.

    Uses the OpenAI Whisper reference implementation: the model is loaded
    on every call (no caching), audio is padded/trimmed to Whisper's fixed
    30-second context window, and the transcript is appended to the
    entity's ``bodyText`` property.
    """

    def transcribe(self, file_path, entity):
        """Transcribe speech from ``file_path`` and store the text on ``entity``.

        beam_size: https://stackoverflow.com/questions/22273119/what-does-the-beam-size-represent-in-the-beam-search-algorithm
        """
        # Progress messages are routine diagnostics, not emergencies:
        # use debug instead of critical so they don't flood alerting.
        log.debug("Loading Whisper model (size=%s)", MODEL_SIZE)
        model = whisper.load_model(MODEL_SIZE)
        log.debug("Loading audio file from: %s", file_path)
        audio = whisper.load_audio(file_path)
        # Whisper operates on fixed 30s windows; pad or trim to fit.
        audio = whisper.pad_or_trim(audio)
        log.debug("Transcribing: %s", file_path)
        result = model.transcribe(audio, verbose=True)

        # TODO chunking https://stackoverflow.com/a/57126101
        # can it return time stamps?

        # model.transcribe() returns a dict; the transcript lives under
        # the "text" key. The original `result.text` attribute access
        # would raise AttributeError on every call.
        entity.add("bodyText", result["text"])
50 changes: 50 additions & 0 deletions ingestors/support/transcription.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import gc
import logging

from faster_whisper import WhisperModel

log = logging.getLogger(__name__)

MODEL_SIZE = "large-v3"

class TranscriptionSupport:
    """Provides a helper for transcribing audio and video files.

    Backed by faster-whisper (CTranslate2). The model is created per call
    and explicitly torn down afterwards to release memory, since a cached
    model would otherwise keep gigabytes resident between ingests.
    """

    def transcribe(self, file_path, entity):
        """Transcribe ``file_path`` and add timestamped text to ``entity``.

        A description of the arguments for the WhisperModel init:
        https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py#L603
        beam_size: https://stackoverflow.com/questions/22273119/what-does-the-beam-size-represent-in-the-beam-search-algorithm
        """

        self.model = None

        try:
            # compute_type="float32"
            self.model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8", cpu_threads=1, num_workers=1)
            log.info("Transcription model initialized successfully.")

            segments, _ = self.model.transcribe(file_path, vad_filter=True, beam_size=5, no_speech_threshold=0.6, condition_on_previous_text=False)

            for segment in segments:
                entity.add("bodyText", f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
                log.info(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
        finally:
            self._del_model()

    def _del_model(self):
        # https://github.com/SYSTRAN/faster-whisper/issues/660
        # Guard against a failed init: if WhisperModel() raised above,
        # self.model is still None and the original unconditional
        # `self.model.model.unload_model()` would raise AttributeError
        # from the finally block, masking the real error.
        if self.model is None:
            return

        # Check hasattr BEFORE dereferencing — the original called
        # unload_model() first and only then guarded the deletion.
        if hasattr(self.model, "model"):
            self.model.model.unload_model()
            del self.model.model
        if hasattr(self.model, "feature_extractor"):
            del self.model.feature_extractor
        if hasattr(self.model, "hf_tokenizer"):
            del self.model.hf_tokenizer

        del self.model
        self.model = None

        gc.collect()

        log.info("Transcription model removed from memory.")
2 changes: 1 addition & 1 deletion ingestors/support/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_xml_parser(self, **kwargs):
recover=True,
resolve_entities=False,
no_network=True,
**kwargs
**kwargs,
)

def parse_xml_path(self, file_path, **kwargs):
Expand Down
Loading
Loading