Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
data/model_type_prediction.ftz
data/servicelayer-archive
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
22 changes: 12 additions & 10 deletions ingestors/cli.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,21 @@
import sys
import click
import logging
import sys
from pprint import pprint
from ftmstore import get_dataset
from servicelayer.cache import get_redis, get_fakeredis
from servicelayer.logs import configure_logging
from servicelayer.jobs import Job, Dataset

import click
from ftmq.store.fragments import get_dataset
from servicelayer import settings as sl_settings
from servicelayer.archive.util import ensure_path
from servicelayer.cache import get_fakeredis, get_redis
from servicelayer.jobs import Dataset, Job
from servicelayer.logs import configure_logging
from servicelayer.tags import Tags

from ingestors import settings
from ingestors.manager import Manager
from ingestors.directory import DirectoryIngestor
from ingestors.analysis import Analyzer
from ingestors.worker import IngestWorker, OP_ANALYZE, OP_INGEST
from ingestors.directory import DirectoryIngestor
from ingestors.manager import Manager
from ingestors.worker import OP_ANALYZE, OP_INGEST, IngestWorker

log = logging.getLogger(__name__)
STAGES = [OP_ANALYZE, OP_INGEST]
Expand Down Expand Up @@ -50,7 +51,8 @@ def killthekitten():
conn.flushall()


def _ingest_path(db, conn, dataset, path, languages=[]):
def _ingest_path(db, conn, dataset, path, languages=None):
languages = languages or []
context = {"languages": languages}
job = Job.create(conn, dataset)
stage = job.get_stage(OP_INGEST)
Expand Down
11 changes: 6 additions & 5 deletions ingestors/email/vcard.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import logging

import vobject
from vobject.base import ParseError
from banal import ensure_list
from followthemoney import model
from followthemoney.util import sanitize_text
from vobject.base import Component, ContentLine, ParseError

from ingestors.exc import ProcessingException
from ingestors.ingestor import Ingestor
from ingestors.support.encoding import EncodingSupport
from ingestors.exc import ProcessingException

log = logging.getLogger(__name__)

Expand All @@ -17,9 +18,9 @@ class VCardIngestor(Ingestor, EncodingSupport):
EXTENSIONS = ["vcf", "vcard"]
SCORE = 10

def get_field(self, card, field):
items = ensure_list(card.contents.get(field))
return [i.value for i in items]
def get_field(self, card: Component, field: str):
items: list[ContentLine] = ensure_list(card.contents.get(field))
return [str(i.value) for i in items]

def ingest_card(self, entity, card):
person = self.manager.make_entity("Person")
Expand Down
28 changes: 14 additions & 14 deletions ingestors/manager.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
import magic
import logging
from timeit import default_timer
from tempfile import mkdtemp
from datetime import datetime
from pkg_resources import get_distribution
from tempfile import mkdtemp
from timeit import default_timer

from followthemoney import model
import magic
from banal import ensure_list
from followthemoney import model
from followthemoney.helpers import entity_filename
from followthemoney.namespace import Namespace
from ftmq.store.fragments.utils import safe_fragment
from normality import stringify
from pantomime import normalize_mimetype
from ftmstore.utils import safe_fragment
from pkg_resources import get_distribution
from prometheus_client import Counter, Histogram
from sentry_sdk import capture_exception
from servicelayer.archive import init_archive
from servicelayer.archive.util import ensure_path
from servicelayer.extensions import get_extensions
from sentry_sdk import capture_exception
from followthemoney.helpers import entity_filename
from followthemoney.namespace import Namespace
from prometheus_client import Counter, Histogram

from ingestors import settings
from ingestors.directory import DirectoryIngestor
from ingestors.exc import ProcessingException, ENCRYPTED_MSG
from ingestors.exc import ENCRYPTED_MSG, ProcessingException
from ingestors.util import filter_text, remove_directory
from ingestors import settings

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -68,9 +68,9 @@ class Manager(object):
"""Handles the lifecycle of an ingestor. This can be subclassed to embed it
into a larger processing framework."""

#: Indicates that during the processing no errors or failures occured.
#: Indicates that during the processing no errors or failures occurred.
STATUS_SUCCESS = "success"
#: Indicates occurance of errors during the processing.
#: Indicates occurrence of errors during the processing.
STATUS_FAILURE = "failure"

MAGIC = magic.Magic(mime=True)
Expand Down
18 changes: 2 additions & 16 deletions ingestors/settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as fts

TESTING = False

Expand Down Expand Up @@ -43,25 +42,12 @@
"INGESTORS_TYPE_MODEL_PATH", "/models/model_type_prediction.ftz"
)

# Use the environment variable set in aleph.env
fts.DATABASE_URI = env.get(
"FTM_STORE_URI", env.get("ALEPH_DATABASE_URI", fts.DATABASE_URI)
)

# Also store cached values in the SQL database
sls.TAGS_DATABASE_URI = fts.DATABASE_URI
sls.TAGS_DATABASE_URI = env.get("FTM_STORE_URI", env.get("ALEPH_DATABASE_URI"))

# ProcessingException is thrown whenever something goes wrong wiht
# ProcessingException is thrown whenever something goes wrong with
# parsing a file. Enable this with care, it can easily eat up the
# Sentry quota of events.
SENTRY_CAPTURE_PROCESSING_EXCEPTIONS = env.to_bool(
"SENTRY_CAPTURE_PROCESSING_EXCEPTIONS", False
)

WHISPER_MODEL = env.get("INGESTORS_WHISPER_MODEL", "ggml-medium-q8_0.bin")
# "auto" prompts the model to detect the language
WHISPER_LANGUAGE = env.get("INGESTORS_WHISPER_LANGUAGE", "auto")
# timeout expressed in seconds
WHISPER_TRANSCRIPTION_TIMEOUT = env.get(
"INGESTORS_WHISPER_TRANSCRIPTION_TIMEOUT", 60 * 60 * 2
)
16 changes: 8 additions & 8 deletions ingestors/support/email.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import logging
import re
import types
import logging
from email.utils import getaddresses, parsedate_to_datetime

from banal import ensure_list
from normality import stringify
from ftmstore.utils import safe_fragment
from email.utils import parsedate_to_datetime, getaddresses
from normality import safe_filename, ascii_text
from followthemoney.types import registry
from ftmq.store.fragments.utils import safe_fragment
from normality import ascii_text, safe_filename, stringify

from ingestors.support.html import HTMLSupport
from ingestors.support.cache import CacheSupport
from ingestors.support.html import HTMLSupport
from ingestors.support.temp import TempFileSupport

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -195,8 +195,8 @@ def extract_msg_headers(self, entity, msg):
sender = self.get_header_identities(msg, "Sender", "X-Sender")
self.apply_identities(entity, sender, "emitters", "sender")

froms = self.get_header_identities(msg, "From", "X-From")
self.apply_identities(entity, froms, "emitters", "from")
froms = self.get_header_identities(msg, "From", "X-From") # codespell:ignore
self.apply_identities(entity, froms, "emitters", "from") # codespell:ignore

tos = self.get_header_identities(msg, "To", "Resent-To")
self.apply_identities(entity, tos, "recipients", "to")
Expand Down
9 changes: 5 additions & 4 deletions ingestors/worker.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import logging

from followthemoney import model
from ftmstore import get_dataset
from servicelayer.worker import Worker
from servicelayer.logs import apply_task_context
from ftmq.store.fragments import get_dataset
from prometheus_client import Info
from servicelayer.logs import apply_task_context
from servicelayer.worker import Worker

from ingestors import __version__
from ingestors.manager import Manager
from ingestors.analysis import Analyzer
from ingestors.manager import Manager

log = logging.getLogger(__name__)
OP_INGEST = "ingest"
Expand Down
Loading
Loading