Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
.git
.github
LICENSE
README.md
__pycache__
convert
docker-compose.yml
docker-compose.yml
11 changes: 8 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.9-bookworm
FROM python:3.11-slim
ENV DEBIAN_FRONTEND noninteractive

LABEL org.opencontainers.image.title "FollowTheMoney File Ingestors"
Expand All @@ -10,7 +10,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
&& apt-get -qq -y update \
&& apt-get -qq -y install build-essential locales \
# python deps (mostly to install their dependencies)
python3-dev \
git python3-dev \
pkg-config libicu-dev \
# tesseract
tesseract-ocr libtesseract-dev libleptonica-dev \
# libraries
Expand All @@ -24,6 +25,8 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
libtiff5-dev \
libtiff-tools ghostscript librsvg2-bin jbig2dec \
pst-utils libgif-dev \
# necessary for python-magic
libmagic1 \
### tesseract
tesseract-ocr-eng \
tesseract-ocr-swa \
Expand Down Expand Up @@ -121,6 +124,8 @@ RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

COPY requirements.txt /tmp/
RUN pip3 install --no-cache-dir -q -U pip setuptools
RUN pip3 install --no-binary=:pyicu: pyicu
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt

# Install spaCy models
Expand All @@ -143,7 +148,7 @@ RUN python3 -m spacy download el_core_news_sm \

COPY . /ingestors
WORKDIR /ingestors
RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN pip3 install --no-cache-dir --config-settings editable_mode=compat --use-pep517 -e /ingestors
RUN chown -R app:app /ingestors

ENV ARCHIVE_TYPE=file \
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ services:
ingest-file:
build:
context: .
image: ghcr.io/alephdata/ingest-file
# image: ghcr.io/alephdata/ingest-file
hostname: ingest
tmpfs:
- /tmp:mode=777
Expand Down
4 changes: 2 additions & 2 deletions ingestors/analysis/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from functools import lru_cache
from normality import collapse_spaces
from languagecodes import list_to_alpha3
from fingerprints import clean_entity_name
from fingerprints import clean_entity_prefix
from followthemoney.types import registry

from ingestors import settings
Expand All @@ -27,7 +27,7 @@
def clean_name(text):
if text is None or len(text) > NAME_MAX_LENGTH:
return
text = clean_entity_name(text)
text = clean_entity_prefix(text)
text = collapse_spaces(text)
if text is None or len(text) <= NAME_MIN_LENGTH or " " not in text:
return
Expand Down
1 change: 1 addition & 0 deletions ingestors/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

class HTMLIngestor(Ingestor, EncodingSupport, HTMLSupport):
"HTML file ingestor class. Extracts the text from the web page."

MIME_TYPES = ["text/html"]
EXTENSIONS = [
"htm",
Expand Down
1 change: 1 addition & 0 deletions ingestors/exc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

class ProcessingException(Exception):
"A data-related error occurring during file processing."

pass


Expand Down
2 changes: 1 addition & 1 deletion ingestors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ def ingest(self, file_path, entity, **kwargs):
now_string = now.strftime("%Y-%m-%dT%H:%M:%S.%f")

entity.set("processingStatus", self.STATUS_FAILURE)
entity.set("processingAgent", get_distribution("ingest").version)
entity.set("processingAgent", get_distribution("ingestors").version)
entity.set("processedAt", now_string)

ingestor_class = None
Expand Down
2 changes: 1 addition & 1 deletion ingestors/support/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_xml_parser(self, **kwargs):
recover=True,
resolve_entities=False,
no_network=True,
**kwargs
**kwargs,
)

def parse_xml_path(self, file_path, **kwargs):
Expand Down
1,849 changes: 1,849 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

108 changes: 108 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Package metadata and pinned runtime dependencies for the "ingestors" distribution.
[project]
name = "ingestors"
version = "3.22.0"
description = "Ingestors extract the contents of mixed unstructured documents into structured (followthemoney) data. "
authors = [
{ name = "Friedrich Lindenberg", email = "friedrich@pudo.org" },
{ name = "OCCRP Data Team", email = "data@occrp.org" },
{ name = "ID.IO", email = "hi@investigativedata.org" },
]
readme = "README.md"
license = "AGPL-3.0"
requires-python = ">=3.11,<4.0"
dependencies = [
"banal (==1.0.6)",
"normality (==2.5.0)",
"pantomime (==0.6.1)",
"followthemoney (==3.5.9)",
"followthemoney-store[postgresql] (>=3.1.0,<3.2.0)",
# NOTE(review): this tracks the tip of the "main" branch rather than a tag or
# commit, so two builds may resolve to different code — consider pinning.
"servicelayer @ git+https://github.com/investigativedata/servicelayer.git@main",
"languagecodes (==1.1.1)",
"countrytagger (==0.1.2)",
"pyicu (==2.12)",
"google-cloud-vision (==3.7.2)",
"tesserocr (==2.7.1)",
# spacy is pinned because spacy 3.8 requires numpy >2, which breaks fasttext.
"spacy (==3.6.1)",
# numpy is capped below 2.0 because otherwise spacy requires an incompatible numpy.
"numpy (<2.0)",
"fingerprints (==1.2.3)",
"fasttext (==0.9.2)",
"pika (==1.3.2)",
"nomenklatura (==3.15.2)",
"dbf (==0.99.9)",
"pymediainfo (==6.1.0)",
# python-magic needs the libmagic1 system library to be installed in the image.
"python-magic (==0.4.27)",
"rarfile (==4.2)",
"xlrd (==2.0.1)",
"openpyxl (==3.1.2)",
"odfpy (==1.4.1)",
"faust-cchardet (==2.1.19)",
"lxml (==5.0.0)",
"olefile (==0.47)",
"Pillow (==10.1.0)",
"vobject (==0.9.6.1)",
"msglite (==0.30.0)",
"icalendar (==5.0.12)",
"cryptography (==41.0.7)",
"requests[security] (==2.31.0)",
"pymupdf (==1.21.1)",
"prometheus-client (==0.17.1)",
"sentry_sdk (==2.0.1)",
# servicelayer extras requirements
"boto3 (>=1.11.9,<2.0.0)",
"grpcio (>=1.32.0,<2.0.0)",
"google-cloud-storage (>=1.31.0,<3.0.0)"
]

# Console entry point: installs the `ingestors` command, which calls the `cli`
# object in ingestors/cli.py.
#
# The command is intentionally declared only here and not repeated under
# [project.gui-scripts]: a GUI script with the same name would collide with the
# console launcher, and on Windows a GUI launcher runs detached from the
# terminal, which hides the output of a command-line tool.
[project.scripts]
ingestors = "ingestors.cli:cli"

# Entry points registered under the "ingestors" group. Each key is a short
# ingestor name and each value is the "module:Class" reference implementing it.
# NOTE(review): presumably these are discovered by entry-point lookup at
# runtime — confirm against the code that loads ingestors.
[project.entry-points."ingestors"]
ignore = "ingestors.ignore:IgnoreIngestor"
html = "ingestors.documents.html:HTMLIngestor"
xml = "ingestors.documents.xml:XMLIngestor"
plain = "ingestors.documents.plain:PlainTextIngestor"
office = "ingestors.documents.office:DocumentIngestor"
opendoc = "ingestors.documents.opendoc:OpenDocumentIngestor"
ooxml = "ingestors.documents.ooxml:OfficeOpenXMLIngestor"
djvu = "ingestors.documents.djvu:DjVuIngestor"
pdf = "ingestors.documents.pdf:PDFIngestor"
rar = "ingestors.packages.rar:RARIngestor"
zip = "ingestors.packages.zip:ZipIngestor"
tar = "ingestors.packages.tar:TarIngestor"
7z = "ingestors.packages:SevenZipIngestor"
gz = "ingestors.packages:GzipIngestor"
bz2 = "ingestors.packages:BZ2Ingestor"
pst = "ingestors.email.outlookpst:OutlookPSTIngestor"
olm = "ingestors.email.olm:OutlookOLMArchiveIngestor"
opfmsg = "ingestors.email.olm:OutlookOLMMessageIngestor"
olemsg = "ingestors.email.outlookmsg:OutlookMsgIngestor"
msg = "ingestors.email.msg:RFC822Ingestor"
emlx = "ingestors.email.emlx:AppleEmlxIngestor"
vcard = "ingestors.email.vcard:VCardIngestor"
calendar = "ingestors.email.calendar:CalendarIngestor"
csv = "ingestors.tabular.csv:CSVIngestor"
access = "ingestors.tabular.access:AccessIngestor"
sqlite = "ingestors.tabular.sqlite:SQLiteIngestor"
xls = "ingestors.tabular.xls:ExcelIngestor"
xlsx = "ingestors.tabular.xlsx:ExcelXMLIngestor"
ods = "ingestors.tabular.ods:OpenOfficeSpreadsheetIngestor"
mbox = "ingestors.email.mbox:MboxFileIngestor"
dbf = "ingestors.tabular.dbf:DBFIngestor"
image = "ingestors.media.image:ImageIngestor"
tiff = "ingestors.media.tiff:TIFFIngestor"
svg = "ingestors.media.svg:SVGIngestor"
audio = "ingestors.media.audio:AudioIngestor"
video = "ingestors.media.video:VideoIngestor"
json = "ingestors.misc.jsonfile:JSONIngestor"

# Build the package with poetry-core as the build backend.
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

# Poetry "dev" dependency group: development-only packages pinned to exact versions.
[tool.poetry.group.dev.dependencies]
pytest = "8.2.0"
pytest-cov = "5.0.0"
click = "8.1.7"
8 changes: 5 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@ normality==2.5.0
pantomime==0.6.1
followthemoney==3.5.9
followthemoney-store[postgresql]==3.1.0
servicelayer[google,amazon]==1.23.0
servicelayer @ git+https://github.com/investigativedata/servicelayer.git
languagecodes==1.1.1
countrytagger==0.1.2
pyicu==2.12
google-cloud-vision==3.7.2
tesserocr==2.7.1
spacy==3.6.1 # pinned because spacy 3.8 requires numpy >2 which breaks fasttext (see https://groups.google.com/g/fasttext-library/c/4EOM0-S6xHU)
numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy
fingerprints==1.1.1
fingerprints==1.2.3
fasttext==0.9.2
pika==1.3.2
nomenklatura==3.15.2

# Development
pytest==8.2.0
Expand All @@ -28,7 +29,7 @@ rarfile==4.2
xlrd==2.0.1
openpyxl==3.1.2
odfpy==1.4.1
cchardet==2.1.7
faust-cchardet==2.1.19
lxml==5.0.0
olefile==0.47
Pillow==10.1.0
Expand All @@ -42,3 +43,4 @@ pymupdf==1.21.1

prometheus-client==0.17.1
sentry_sdk==2.0.1