@@ -418,42 +418,88 @@ def test_get_metadata(url, name):
418
418
419
419
420
420
@pytest.mark.parametrize(
    ("url", "name", "strict", "exception"),
    [
        # Each case: download URL, local cache name, PdfReader strict flag,
        # and either None or an (exception type, first-arg message) pair.
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf",
            "tika-938702.pdf",
            False,
            (PdfReadError, "Unexpected end of stream"),
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf",
            "tika-942358.pdf",
            False,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/911/911260.pdf",
            "tika-911260.pdf",
            False,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/992/992472.pdf",
            "tika-992472.pdf",
            False,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/978/978477.pdf",
            "tika-978477.pdf",
            False,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/960/960317.pdf",
            "tika-960317.pdf",
            False,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/930/930513.pdf",
            "tika-930513.pdf",
            False,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/918/918113.pdf",
            "tika-918113.pdf",
            True,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/940/940704.pdf",
            "tika-940704.pdf",
            True,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/976/976488.pdf",
            "tika-976488.pdf",
            True,
            None,
        ),
        (
            "https://corpora.tika.apache.org/base/docs/govdocs1/948/948176.pdf",
            "tika-948176.pdf",
            True,
            None,
        ),
    ],
)
def test_extract_text(url, name, strict, exception):
    """Run text extraction over every page of a downloaded PDF.

    ``strict`` is forwarded to ``PdfReader``. ``exception`` is either
    ``None`` (extraction must complete) or an ``(exception type, message)``
    pair, in which case extraction must raise that type and the first
    argument of the raised exception must equal the message.
    """
    pdf_bytes = get_pdf_from_url(url, name=name)
    reader = PdfReader(BytesIO(pdf_bytes), strict=strict)

    def extract_every_page():
        # reader.pages is only touched here, so any error raised while
        # listing pages stays inside the pytest.raises block below.
        for page in reader.pages:
            page.extract_text()

    if exception:
        expected_type, expected_message = exception
        with pytest.raises(expected_type) as caught:
            extract_every_page()
        assert caught.value.args[0] == expected_message
        return

    extract_every_page()
457
503
458
504
459
505
@pytest .mark .parametrize (
@@ -481,21 +527,28 @@ def test_compress_raised(url, name):
481
527
482
528
483
529
@pytest .mark .parametrize (
484
- ("url" , "name" ),
530
+ ("url" , "name" , "strict" ),
485
531
[
486
532
(
487
533
"https://corpora.tika.apache.org/base/docs/govdocs1/915/915194.pdf" ,
488
534
"tika-915194.pdf" ,
535
+ False ,
489
536
),
490
537
(
491
538
"https://corpora.tika.apache.org/base/docs/govdocs1/950/950337.pdf" ,
492
539
"tika-950337.pdf" ,
540
+ False ,
541
+ ),
542
+ (
543
+ "https://corpora.tika.apache.org/base/docs/govdocs1/962/962292.pdf" ,
544
+ "tika-962292.pdf" ,
545
+ True ,
493
546
),
494
547
],
495
548
)
496
- def test_compress (url , name ):
549
+ def test_compress (url , name , strict ):
497
550
data = BytesIO (get_pdf_from_url (url , name = name ))
498
- reader = PdfReader (data )
551
+ reader = PdfReader (data , strict = strict )
499
552
# TODO: which page exactly?
500
553
# TODO: Is it reasonable to have an exception here?
501
554
for page in reader .pages :
0 commit comments