Skip to content

Commit af9c01b

Browse files
authored
ROB: Fix errors/warnings on no /Resources within extract_text (#1276)
Look for /Resources in parents. Closes #1272 (in text). Closes #1269 (in XForm).
1 parent ceb997d commit af9c01b

File tree

2 files changed

+17
-2
lines changed

2 files changed

+17
-2
lines changed

PyPDF2/_page.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -1140,7 +1140,15 @@ def _extract_text(
11401140
cmaps: Dict[
11411141
str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str]]
11421142
] = {}
1143-
resources_dict = cast(DictionaryObject, obj["/Resources"])
1143+
try:
1144+
objr = obj
1145+
while NameObject("/Resources") not in objr:
1146+
# /Resources can be inherited sometimes so we look to parents
1147+
objr = objr["/Parent"].get_object()
1148+
# if there are no more parents, no /Resources is available => an exception will be raised
1149+
resources_dict = cast(DictionaryObject, objr["/Resources"])
1150+
except Exception:
1151+
return "" # no resources means no text is possible (no font) we consider the file as not damaged, no need to check for TJ or Tj
11441152
if "/Font" in resources_dict:
11451153
for f in cast(DictionaryObject, resources_dict["/Font"]):
11461154
cmaps[f] = build_char_map(f, space_width, obj)

tests/test_page.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,13 @@ def test_extract_text_single_quote_op():
238238
page.extract_text()
239239

240240

241+
def test_no_ressources_on_text_extract():
242+
url = "https://github.com/py-pdf/PyPDF2/files/9428434/TelemetryTX_EM.pdf"
243+
reader = PdfReader(BytesIO(get_pdf_from_url(url, name="tika-964029.pdf")))
244+
for page in reader.pages:
245+
page.extract_text()
246+
247+
241248
def test_iss_1142():
242249
# check fix for problem of context save/restore (q/Q)
243250
url = "https://github.com/py-pdf/PyPDF2/files/9150656/ST.2019.PDF"
@@ -285,7 +292,7 @@ def test_extract_text_page_pdf_impossible_decode_xform(caplog):
285292
for page in reader.pages:
286293
page.extract_text()
287294
warn_msgs = normalize_warnings(caplog.text)
288-
assert warn_msgs == [" impossible to decode XFormObject /Meta203"]
295+
assert warn_msgs == [""] # text extraction recognise no text
289296

290297

291298
def test_extract_text_operator_t_star(): # L1266, L1267

0 commit comments

Comments
 (0)