diff --git a/CHANGELOG.md b/CHANGELOG.md index 366a02d18..c41f56340 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,24 @@ # CHANGELOG +## Version 2.10.4, 2022-08-28 + +### Robustness (ROB) +- Fix errors/warnings on no /Resources within extract_text (#1276) +- Add required line separators in ContentStream ArrayObjects (#1281) + +### Maintenance (MAINT) +- Use NameObject idempotency (#1290) + +### Testing (TST) +- Rectangle deletion (#1289) +- Add workflow tests (#1287) +- Remove files after tests ran (#1286) + +### Packaging (PKG) +- Add minimum version for typing_extensions requirement (#1277) + +Full Changelog: https://github.com/py-pdf/PyPDF2/compare/2.10.3...2.10.4 + ## Version 2.10.3, 2022-08-21 ### Robustness (ROB) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 50eb2540b..a0ec2945c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -11,6 +11,7 @@ history and [GitHubs 'Contributors' feature](https://github.com/py-pdf/PyPDF2/gr ## Contributors to the pyPdf / PyPDF2 project +* [DL6ER](https://github.com/DL6ER) * [JianzhengLuo](https://github.com/JianzhengLuo) * [Karvonen, Harry](https://github.com/Hatell/) * [KourFrost](https://github.com/KourFrost) diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py index 77a15ab32..45bf36662 100644 --- a/PyPDF2/_page.py +++ b/PyPDF2/_page.py @@ -97,8 +97,7 @@ def getRectangle( def _set_rectangle(self: Any, name: str, value: Union[RectangleObject, float]) -> None: - if not isinstance(name, NameObject): - name = NameObject(name) + name = NameObject(name) self[name] = value @@ -1140,7 +1139,15 @@ def _extract_text( cmaps: Dict[ str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]] ] = {} - resources_dict = cast(DictionaryObject, obj["/Resources"]) + try: + objr = obj + while NameObject("/Resources") not in objr: + # /Resources can be inherited sometimes so we look to parents + objr = objr["/Parent"].get_object() + # if there are no more parents, no /Resources is available => an exception will be raised + 
resources_dict = cast(DictionaryObject, objr["/Resources"]) + except Exception: + return "" # no resources means no text is possible (no font) we consider the file as not damaged, no need to check for TJ or Tj if "/Font" in resources_dict: for f in cast(DictionaryObject, resources_dict["/Font"]): cmaps[f] = build_char_map(f, space_width, obj) diff --git a/PyPDF2/_version.py b/PyPDF2/_version.py index 21e33ba2c..e3b571e0f 100644 --- a/PyPDF2/_version.py +++ b/PyPDF2/_version.py @@ -1 +1 @@ -__version__ = "2.10.3" +__version__ = "2.10.4" diff --git a/PyPDF2/generic/_data_structures.py b/PyPDF2/generic/_data_structures.py index 85c4e8819..283b33b22 100644 --- a/PyPDF2/generic/_data_structures.py +++ b/PyPDF2/generic/_data_structures.py @@ -679,6 +679,8 @@ def __init__( data = b"" for s in stream: data += b_(s.get_object().get_data()) + if not data.endswith(b"\n"):  # bytes[-1] is an int (never == b"\n") and fails on empty data; endswith handles both + data += b"\n" stream_bytes = BytesIO(data) else: stream_data = stream.get_data() diff --git a/setup.cfg b/setup.cfg index bded39e8a..2c0eebe8f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,7 @@ packages = PyPDF2.generic python_requires = >=3.6 install_requires = - typing_extensions; python_version < '3.10' + typing_extensions >= 3.10.0.0; python_version < '3.10' [options.extras_require] crypto = PyCryptodome diff --git a/tests/bench.py b/tests/bench.py index 28329494b..4ae9bb2d1 100644 --- a/tests/bench.py +++ b/tests/bench.py @@ -1,4 +1,3 @@ -import os from pathlib import Path import PyPDF2 @@ -83,12 +82,12 @@ def merge(): merger.set_page_layout("/SinglePage") merger.set_page_mode("/UseThumbs") - tmp_path = "dont_commit_merged.pdf" - merger.write(tmp_path) + write_path = "dont_commit_merged.pdf" + merger.write(write_path) merger.close() # Check if outline is correct - reader = PyPDF2.PdfReader(tmp_path) + reader = PyPDF2.PdfReader(write_path) assert [ el.title for el in reader._get_outline() if isinstance(el, Destination) ] == [ @@ -105,9 +104,6 @@ def merge(): "True", ] - # Clean up - os.remove(tmp_path) 
- def test_merge(benchmark): """ diff --git a/tests/test_basic_features.py b/tests/test_basic_features.py deleted file mode 100644 index bdc65d074..000000000 --- a/tests/test_basic_features.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from pathlib import Path - -from PyPDF2 import PdfReader, PdfWriter - -TESTS_ROOT = Path(__file__).parent.resolve() -PROJECT_ROOT = TESTS_ROOT.parent -RESOURCE_ROOT = PROJECT_ROOT / "resources" - - -def test_basic_features(): - pdf_path = RESOURCE_ROOT / "crazyones.pdf" - reader = PdfReader(pdf_path) - writer = PdfWriter() - - assert len(reader.pages) == 1 - - # add page 1 from input1 to output document, unchanged - writer.add_page(reader.pages[0]) - - # add page 2 from input1, but rotated clockwise 90 degrees - writer.add_page(reader.pages[0].rotate(90)) - - # add page 3 from input1, but first add a watermark from another PDF: - page3 = reader.pages[0] - watermark_pdf = pdf_path - watermark = PdfReader(watermark_pdf) - page3.merge_page(watermark.pages[0]) - writer.add_page(page3) - - # add page 4 from input1, but crop it to half size: - page4 = reader.pages[0] - page4.mediabox.upper_right = ( - page4.mediabox.right / 2, - page4.mediabox.top / 2, - ) - writer.add_page(page4) - - # add some Javascript to launch the print window on opening this PDF. 
- # the password dialog may prevent the print dialog from being shown, - # comment the the encription lines, if that's the case, to try this out - writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") - - # encrypt your new PDF and add a password - password = "secret" - writer.encrypt(password) - - # finally, write "output" to PyPDF2-output.pdf - tmp_path = "PyPDF2-output.pdf" - with open(tmp_path, "wb") as output_stream: - writer.write(output_stream) - - # cleanup - os.remove(tmp_path) diff --git a/tests/test_generic.py b/tests/test_generic.py index 6a82df22c..5cb1ae5d1 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -579,17 +579,23 @@ def test_name_object_read_from_stream_unicode_error(): # L588 page.extract_text() -def test_bool_repr(): +def test_bool_repr(tmp_path): url = "https://corpora.tika.apache.org/base/docs/govdocs1/932/932449.pdf" name = "tika-932449.pdf" reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) - with open("tmp-fields-report.txt", "w") as fp: + write_path = tmp_path / "tmp-fields-report.txt" + with open(write_path, "w") as fp: fields = reader.get_fields(fileobj=fp) assert fields - - # cleanup - os.remove("tmp-fields-report.txt") + assert list(fields.keys()) == ["USGPOSignature"] + with open(write_path) as fp: + data = fp.read() + assert data.startswith( + "Field Name: USGPOSignature\nField Type: Signature\nField Flags: 1\n" + "Value: {'/Type': '/Sig', '/Filter': '/Adobe.PPKLite', " + "'/SubFilter':" + ) @patch("PyPDF2._reader.logger_warning") diff --git a/tests/test_page.py b/tests/test_page.py index 2a9c97b00..40906bd3e 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -238,6 +238,13 @@ def test_extract_text_single_quote_op(): page.extract_text() +def test_no_ressources_on_text_extract(): + url = "https://github.com/py-pdf/PyPDF2/files/9428434/TelemetryTX_EM.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf"))) + for page in reader.pages: + 
page.extract_text() + + def test_iss_1142(): # check fix for problem of context save/restore (q/Q) url = "https://github.com/py-pdf/PyPDF2/files/9150656/ST.2019.PDF" @@ -265,6 +272,11 @@ def test_iss_1142(): "https://github.com/py-pdf/PyPDF2/files/9150656/ST.2019.PDF", "iss_1134.pdf", ), + # iss 1: + ( + "https://github.com/py-pdf/PyPDF2/files/9432350/Work.Flow.From.Check.to.QA.pdf", + "WFCA.pdf", + ), ], ) def test_extract_text_page_pdf(url, name): @@ -280,7 +292,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog): for page in reader.pages: page.extract_text() warn_msgs = normalize_warnings(caplog.text) - assert warn_msgs == [" impossible to decode XFormObject /Meta203"] + assert warn_msgs == [""] # text extraction recognise no text def test_extract_text_operator_t_star(): # L1266, L1267 diff --git a/tests/test_workflows.py b/tests/test_workflows.py index 672ba90b3..0a7b1efb8 100644 --- a/tests/test_workflows.py +++ b/tests/test_workflows.py @@ -29,6 +29,50 @@ sys.path.append(str(PROJECT_ROOT)) +def test_basic_features(tmp_path): + pdf_path = RESOURCE_ROOT / "crazyones.pdf" + reader = PdfReader(pdf_path) + writer = PdfWriter() + + assert len(reader.pages) == 1 + + # add page 1 from input1 to output document, unchanged + writer.add_page(reader.pages[0]) + + # add page 2 from input1, but rotated clockwise 90 degrees + writer.add_page(reader.pages[0].rotate(90)) + + # add page 3 from input1, but first add a watermark from another PDF: + page3 = reader.pages[0] + watermark_pdf = pdf_path + watermark = PdfReader(watermark_pdf) + page3.merge_page(watermark.pages[0]) + writer.add_page(page3) + + # add page 4 from input1, but crop it to half size: + page4 = reader.pages[0] + page4.mediabox.upper_right = ( + page4.mediabox.right / 2, + page4.mediabox.top / 2, + ) + del page4.mediabox + writer.add_page(page4) + + # add some Javascript to launch the print window on opening this PDF. 
+ # the password dialog may prevent the print dialog from being shown, + # comment out the encryption lines, if that's the case, to try this out + writer.add_js("this.print({bUI:true,bSilent:false,bShrinkToFit:true});") + + # encrypt your new PDF and add a password + password = "secret" + writer.encrypt(password) + + # finally, write "output" to PyPDF2-output.pdf + write_path = tmp_path / "PyPDF2-output.pdf" + with open(write_path, "wb") as output_stream: + writer.write(output_stream) + + def test_dropdown_items(): inputfile = RESOURCE_ROOT / "libreoffice-form.pdf" reader = PdfReader(inputfile) @@ -321,7 +365,7 @@ def test_overlay(base_path, overlay_path): writer.write(fp) # Cleanup - os.remove("dont_commit_overlay.pdf") + os.remove("dont_commit_overlay.pdf") # remove this line for manual inspection of the output @pytest.mark.parametrize( @@ -333,16 +377,13 @@ def test_overlay(base_path, overlay_path): ) ], ) -def test_merge_with_warning(url, name): +def test_merge_with_warning(tmp_path, url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) merger = PdfMerger() merger.append(reader) # This could actually be a performance bottleneck: - merger.write("tmp.merged.pdf") - - # Cleanup - os.remove("tmp.merged.pdf") + merger.write(tmp_path / "tmp.merged.pdf") @pytest.mark.parametrize( @@ -354,15 +395,12 @@ def test_merge_with_warning(url, name): ) ], ) -def test_merge(url, name): +def test_merge(tmp_path, url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) merger = PdfMerger() merger.append(reader) - merger.write("tmp.merged.pdf") - - # Cleanup - os.remove("tmp.merged.pdf") + merger.write(tmp_path / "tmp.merged.pdf") @pytest.mark.parametrize( @@ -381,42 +419,88 @@ def test_get_metadata(url, name): @pytest.mark.parametrize( - ("url", "name"), + ("url", "name", "strict", "exception"), [ ( "https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf", "tika-938702.pdf", + False, + (PdfReadError, "Unexpected end of stream"), ), ( 
"https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf", "tika-942358.pdf", + False, + None, ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/911/911260.pdf", "tika-911260.pdf", + False, + None, ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/992/992472.pdf", "tika-992472.pdf", + False, + None, ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/978/978477.pdf", "tika-978477.pdf", + False, + None, ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/960/960317.pdf", "tika-960317.pdf", + False, + None, ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/930/930513.pdf", "tika-930513.pdf", + False, + None, + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/918/918113.pdf", + "tika-918113.pdf", + True, + None, + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/940/940704.pdf", + "tika-940704.pdf", + True, + None, + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/976/976488.pdf", + "tika-976488.pdf", + True, + None, + ), + ( + "https://corpora.tika.apache.org/base/docs/govdocs1/948/948176.pdf", + "tika-948176.pdf", + True, + None, ), ], ) -def test_extract_text(url, name): +def test_extract_text(url, name, strict, exception): data = BytesIO(get_pdf_from_url(url, name=name)) - reader = PdfReader(data) - reader.metadata + reader = PdfReader(data, strict=strict) + if not exception: + for page in reader.pages: + page.extract_text() + else: + exc, exc_text = exception + with pytest.raises(exc) as ex_info: + for page in reader.pages: + page.extract_text() + assert ex_info.value.args[0] == exc_text @pytest.mark.parametrize( @@ -444,21 +528,28 @@ def test_compress_raised(url, name): @pytest.mark.parametrize( - ("url", "name"), + ("url", "name", "strict"), [ ( "https://corpora.tika.apache.org/base/docs/govdocs1/915/915194.pdf", "tika-915194.pdf", + False, ), ( "https://corpora.tika.apache.org/base/docs/govdocs1/950/950337.pdf", "tika-950337.pdf", + False, + ), + ( + 
"https://corpora.tika.apache.org/base/docs/govdocs1/962/962292.pdf", + "tika-962292.pdf", + True, ), ], ) -def test_compress(url, name): +def test_compress(url, name, strict): data = BytesIO(get_pdf_from_url(url, name=name)) - reader = PdfReader(data) + reader = PdfReader(data, strict=strict) # TODO: which page exactly? # TODO: Is it reasonable to have an exception here? for page in reader.pages: @@ -474,18 +565,16 @@ def test_compress(url, name): ), ], ) -def test_get_fields_warns(caplog, url, name): +def test_get_fields_warns(tmp_path, caplog, url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) - with open("tmp.txt", "w") as fp: + write_path = tmp_path / "tmp.txt" + with open(write_path, "w") as fp: retrieved_fields = reader.get_fields(fileobj=fp) assert retrieved_fields == {} assert normalize_warnings(caplog.text) == ["Object 2 0 not defined."] - # Cleanup - os.remove("tmp.txt") - @pytest.mark.parametrize( ("url", "name"), @@ -496,17 +585,15 @@ def test_get_fields_warns(caplog, url, name): ), ], ) -def test_get_fields_no_warning(url, name): +def test_get_fields_no_warning(tmp_path, url, name): data = BytesIO(get_pdf_from_url(url, name=name)) reader = PdfReader(data) - with open("tmp.txt", "w") as fp: + write_path = tmp_path / "tmp.txt" + with open(write_path, "w") as fp: retrieved_fields = reader.get_fields(fileobj=fp) assert len(retrieved_fields) == 10 - # Cleanup - os.remove("tmp.txt") - def test_scale_rectangle_indirect_object(): url = "https://corpora.tika.apache.org/base/docs/govdocs1/999/999944.pdf" diff --git a/tests/test_writer.py b/tests/test_writer.py index d83db2002..9c8f0dae3 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -384,6 +384,8 @@ def test_fill_form(): with open(tmp_filename, "wb") as output_stream: writer.write(output_stream) + os.remove(tmp_filename) # cleanup + @pytest.mark.parametrize( ("use_128bit", "user_pwd", "owner_pwd"), @@ -595,14 +597,18 @@ def test_io_streams(): def 
test_regression_issue670(): + tmp_file = "dont_commit_issue670.pdf" filepath = RESOURCE_ROOT / "crazyones.pdf" reader = PdfReader(filepath, strict=False) for _ in range(2): writer = PdfWriter() writer.add_page(reader.pages[0]) - with open("dont_commit_issue670.pdf", "wb") as f_pdf: + with open(tmp_file, "wb") as f_pdf: writer.write(f_pdf) + # cleanup + os.remove(tmp_file) + def test_issue301(): """