Skip to content

Commit ceb997d

Browse files
authored
TST: Add workflow tests (#1287)
1 parent e909d8c commit ceb997d

File tree

1 file changed: 60 additions and 7 deletions.

tests/test_workflows.py

+60 -7
Original file line number | Diff line number | Diff line change
@@ -418,42 +418,88 @@ def test_get_metadata(url, name):
418418

419419

420420
@pytest.mark.parametrize(
421-
("url", "name"),
421+
("url", "name", "strict", "exception"),
422422
[
423423
(
424424
"https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf",
425425
"tika-938702.pdf",
426+
False,
427+
(PdfReadError, "Unexpected end of stream"),
426428
),
427429
(
428430
"https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf",
429431
"tika-942358.pdf",
432+
False,
433+
None,
430434
),
431435
(
432436
"https://corpora.tika.apache.org/base/docs/govdocs1/911/911260.pdf",
433437
"tika-911260.pdf",
438+
False,
439+
None,
434440
),
435441
(
436442
"https://corpora.tika.apache.org/base/docs/govdocs1/992/992472.pdf",
437443
"tika-992472.pdf",
444+
False,
445+
None,
438446
),
439447
(
440448
"https://corpora.tika.apache.org/base/docs/govdocs1/978/978477.pdf",
441449
"tika-978477.pdf",
450+
False,
451+
None,
442452
),
443453
(
444454
"https://corpora.tika.apache.org/base/docs/govdocs1/960/960317.pdf",
445455
"tika-960317.pdf",
456+
False,
457+
None,
446458
),
447459
(
448460
"https://corpora.tika.apache.org/base/docs/govdocs1/930/930513.pdf",
449461
"tika-930513.pdf",
462+
False,
463+
None,
464+
),
465+
(
466+
"https://corpora.tika.apache.org/base/docs/govdocs1/918/918113.pdf",
467+
"tika-918113.pdf",
468+
True,
469+
None,
470+
),
471+
(
472+
"https://corpora.tika.apache.org/base/docs/govdocs1/940/940704.pdf",
473+
"tika-940704.pdf",
474+
True,
475+
None,
476+
),
477+
(
478+
"https://corpora.tika.apache.org/base/docs/govdocs1/976/976488.pdf",
479+
"tika-976488.pdf",
480+
True,
481+
None,
482+
),
483+
(
484+
"https://corpora.tika.apache.org/base/docs/govdocs1/948/948176.pdf",
485+
"tika-948176.pdf",
486+
True,
487+
None,
450488
),
451489
],
452490
)
453-
def test_extract_text(url, name):
491+
def test_extract_text(url, name, strict, exception):
454492
data = BytesIO(get_pdf_from_url(url, name=name))
455-
reader = PdfReader(data)
456-
reader.metadata
493+
reader = PdfReader(data, strict=strict)
494+
if not exception:
495+
for page in reader.pages:
496+
page.extract_text()
497+
else:
498+
exc, exc_text = exception
499+
with pytest.raises(exc) as ex_info:
500+
for page in reader.pages:
501+
page.extract_text()
502+
assert ex_info.value.args[0] == exc_text
457503

458504

459505
@pytest.mark.parametrize(
@@ -481,21 +527,28 @@ def test_compress_raised(url, name):
481527

482528

483529
@pytest.mark.parametrize(
484-
("url", "name"),
530+
("url", "name", "strict"),
485531
[
486532
(
487533
"https://corpora.tika.apache.org/base/docs/govdocs1/915/915194.pdf",
488534
"tika-915194.pdf",
535+
False,
489536
),
490537
(
491538
"https://corpora.tika.apache.org/base/docs/govdocs1/950/950337.pdf",
492539
"tika-950337.pdf",
540+
False,
541+
),
542+
(
543+
"https://corpora.tika.apache.org/base/docs/govdocs1/962/962292.pdf",
544+
"tika-962292.pdf",
545+
True,
493546
),
494547
],
495548
)
496-
def test_compress(url, name):
549+
def test_compress(url, name, strict):
497550
data = BytesIO(get_pdf_from_url(url, name=name))
498-
reader = PdfReader(data)
551+
reader = PdfReader(data, strict=strict)
499552
# TODO: which page exactly?
500553
# TODO: Is it reasonable to have an exception here?
501554
for page in reader.pages:

0 commit comments

Comments
 (0)