ROB: Fix errors/warnings on no /Resources within extract_text (#1276)

pubpub-zz · web-flow · commit af9c01b94c0a · 2022-08-28T12:26:50.000+02:00
Look for /Resources in parents. Closes #1272 (in text). Closes #1269 (in Xform).
diff --git a/PyPDF2/_page.py b/PyPDF2/_page.py
@@ -1140,7 +1140,15 @@ def _extract_text(
         cmaps: Dict[
             str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]]
         ] = {}
-        resources_dict = cast(DictionaryObject, obj["/Resources"])
+        try:
+            objr = obj
+            while NameObject("/Resources") not in objr:
+                # /Resources can be inherited sometimes so we look to parents
+                objr = objr["/Parent"].get_object()
+                # if there is no parent, no /Resources is available => an exception will be raised
+            resources_dict = cast(DictionaryObject, objr["/Resources"])
+        except Exception:
+            return ""  # no resources means no text is possible (no font) we consider the file as not damaged, no need to check for TJ or Tj
         if "/Font" in resources_dict:
             for f in cast(DictionaryObject, resources_dict["/Font"]):
                 cmaps[f] = build_char_map(f, space_width, obj)
diff --git a/tests/test_page.py b/tests/test_page.py
@@ -238,6 +238,13 @@ def test_extract_text_single_quote_op():
         page.extract_text()
 
 
+def test_no_ressources_on_text_extract():
+    url = "https://github.com/py-pdf/PyPDF2/files/9428434/TelemetryTX_EM.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf")))
+    for page in reader.pages:
+        page.extract_text()
+
+
 def test_iss_1142():
     # check fix for problem of context save/restore (q/Q)
     url = "https://github.com/py-pdf/PyPDF2/files/9150656/ST.2019.PDF"
@@ -285,7 +292,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog):
     for page in reader.pages:
         page.extract_text()
     warn_msgs = normalize_warnings(caplog.text)
-    assert warn_msgs == [" impossible to decode XFormObject /Meta203"]
+    assert warn_msgs == [""]  # text extraction recognise no text
 
 
 def test_extract_text_operator_t_star():  # L1266, L1267