-
Notifications
You must be signed in to change notification settings - Fork 1.4k
/
Copy pathtest_cmap.py
319 lines (268 loc) · 9.58 KB
/
test_cmap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
"""Test the pypdf_cmap module."""
from io import BytesIO
from pathlib import Path
import pytest
from pypdf import PdfReader, PdfWriter
from pypdf._cmap import build_char_map
from pypdf.generic import ArrayObject, IndirectObject, NameObject, NullObject
from . import get_data_from_url
# Resolved absolute path of the directory that contains this test module.
TESTS_ROOT = Path(__file__).parent.resolve()
# Directory one level above the tests directory.
PROJECT_ROOT = TESTS_ROOT.parent
# "resources" directory below the project root; tests in this module load
# local sample PDFs (e.g. "crazyones.pdf") from here.
RESOURCE_ROOT = PROJECT_ROOT / "resources"
@pytest.mark.enable_socket
@pytest.mark.slow
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # Exercises compute_space_width.
        (None, "tika-923406.pdf", False),
        # Exercises _parse_to_unicode_process_rg, in both parsing modes.
        (None, "tika-959173.pdf", False),
        (None, "tika-959173.pdf", True),
        # Regression sample for issue #1718.
        (None, "iss1718.pdf", False),
    ],
)
def test_text_extraction_slow(caplog, url: str, name: str, strict: bool):
    """
    Extract the text of every page and require that nothing was logged.

    Same check as the fast variant, but for samples marked as slow.
    Every parametrized case passes ``None`` as ``url`` and identifies the
    sample by ``name`` only.
    """
    pdf_stream = BytesIO(get_data_from_url(url, name=name))
    document = PdfReader(pdf_stream, strict=strict)
    for current_page in document.pages:
        current_page.extract_text()
    # Any captured log record (warning or otherwise) fails the test.
    assert caplog.text == ""
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "strict"),
    [
        # bfchar_on_2_chars, see issue #1293.
        (None, "ASurveyofImageClassificationBasedTechniques.pdf", False),
        # L40, get_font_width_from_default.
        (None, "tika-908104.pdf", False),
        # multiline_bfrange; regression sample for issue #1285.
        (None, "The%20lean%20times%20in%20the%20Peruvian%20economy.pdf", False),
        (None, "Giacalone.pdf", False),
    ],
)
def test_text_extraction_fast(caplog, url: str, name: str, strict: bool):
    """
    Extract the text of every page and require that nothing was logged.

    Every parametrized case passes ``None`` as ``url`` and identifies the
    sample by ``name`` only.
    """
    pdf_stream = BytesIO(get_data_from_url(url, name=name))
    document = PdfReader(pdf_stream, strict=strict)
    for current_page in document.pages:
        current_page.extract_text()
    # Any captured log record (warning or otherwise) fails the test.
    assert caplog.text == ""
@pytest.mark.enable_socket
def test_parse_encoding_advanced_encoding_not_implemented(caplog):
    """Extracting all pages of the sample logs the 'not implemented' message."""
    sample = get_data_from_url(name="tika-957144.pdf")
    document = PdfReader(BytesIO(sample))
    for current_page in document.pages:
        current_page.extract_text()
    # The expected message names "/WinAnsEncoding"; the correctly spelled
    # encoding name would be /WinAnsiEncoding.
    expected_message = "Advanced encoding /WinAnsEncoding not implemented yet"
    assert expected_message in caplog.text
@pytest.mark.enable_socket
def test_ascii_charset():
    """Issue #1312: the first page's text must not contain "/a"."""
    sample = get_data_from_url(name="ascii charset.pdf")
    first_page_text = PdfReader(BytesIO(sample)).pages[0].extract_text()
    assert "/a" not in first_page_text
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_nb", "within_text"),
    [
        # An empty expected string is contained in any text, so this case
        # only checks that extraction completes.
        (None, "cmap1370.pdf", 0, ""),
        (
            None,
            "02voc.pdf",
            2,
            "Document delineation and character sequence decoding",
        ),
    ],
    ids=["iss1370", "iss1379"],
)
def test_text_extraction_of_specific_pages(
    url: str, name: str, page_nb: int, within_text
):
    """The text extracted from page ``page_nb`` contains ``within_text``."""
    document = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    page_text = document.pages[page_nb].extract_text()
    assert within_text in page_text
@pytest.mark.enable_socket
def test_iss1533():
    """The char map built for font "/F" on page 0 maps "\\x01" to "Ü"."""
    document = PdfReader(BytesIO(get_data_from_url(name="iss1533.pdf")))
    # Extraction itself must complete without raising.
    document.pages[0].extract_text()
    char_map_result = build_char_map("/F", 200, document.pages[0])
    # Element 3 of the returned tuple is the mapping that is inspected here.
    mapping = char_map_result[3]
    assert mapping["\x01"] == "Ü"
@pytest.mark.enable_socket
@pytest.mark.parametrize(
    ("url", "name", "page_index", "within_text", "caplog_text"),
    [
        (None, "tstUCS2.pdf", 1, ["2 / 12", "S0490520090001", "于博"], ""),
        (
            None,
            "tst-GBK_EUC.pdf",
            0,
            ["NJA", "中华男科学杂志"],
            "Multiple definitions in dictionary at byte 0x5cb42 for key /MediaBox\n",
        ),
    ],
)
def test_cmap_encodings(caplog, url, name, page_index, within_text, caplog_text):
    """
    Check extracted text fragments and the expected log output for a page.

    Every string in ``within_text`` must appear in the text of page
    ``page_index``, and ``caplog_text`` must appear in the captured log.
    """
    document = PdfReader(BytesIO(get_data_from_url(url, name=name)))
    # Extraction must complete without raising.
    page_text = document.pages[page_index].extract_text()
    for fragment in within_text:
        assert fragment in page_text
    assert caplog_text in caplog.text
@pytest.mark.enable_socket
def test_latex():
    """All expected math symbols appear in the text of the first page."""
    expected_symbols = ("α", "β", "γ", "ϕ", "φ", "ℏ", "∫", "∂", "·", "×")
    document = PdfReader(BytesIO(get_data_from_url(name="math_latex.pdf")))
    # Extraction must complete without raising.
    page_text = document.pages[0].extract_text()
    for symbol in expected_symbols:
        assert symbol in page_text
    # Note: ϕ and φ appear to be swapped in LaTeX.
@pytest.mark.enable_socket
def test_unixxx_glyphs():
    """All expected strings appear in the text of the first page."""
    expected_fragments = ("闫耀庭", "龚龑", "张江水", "1′′.2")
    document = PdfReader(BytesIO(get_data_from_url(name="unixxx_glyphs.pdf")))
    # Extraction must complete without raising.
    page_text = document.pages[0].extract_text()
    for fragment in expected_fragments:
        assert fragment in page_text
@pytest.mark.enable_socket
def test_cmap_compute_space_width():
    """Issue 2137: extracting the first page of the sample must not raise."""
    # Original file URL: https://arxiv.org/pdf/2005.05909.pdf
    # The URL from the GitHub issue is too long to pass the code type check,
    # so the arXiv URL is the one recorded above. For reference, the issue URL:
    # https://github.com/py-pdf/pypdf/files/12489914/Morris.et.al.-.2020.-.TextAttack.A.Framework.for.Adversarial.Attacks.Data.Augmentation.and.Adversarial.Training.in.NLP.pdf
    sample = get_data_from_url(name="TextAttack_paper.pdf")
    first_page = PdfReader(BytesIO(sample)).pages[0]
    first_page.extract_text()  # only checks that no exception is raised
@pytest.mark.enable_socket
def test_tabs_in_cmap():
    """Issue #2173: extracting the first page of the sample must not raise."""
    sample = get_data_from_url(name="iss2173.pdf")
    first_page = PdfReader(BytesIO(sample)).pages[0]
    first_page.extract_text()
@pytest.mark.enable_socket
def test_ignoring_non_put_entries():
    """Issue #2290: extracting the first page of the sample must not raise."""
    sample = get_data_from_url(name="iss2290.pdf")
    first_page = PdfReader(BytesIO(sample)).pages[0]
    first_page.extract_text()
@pytest.mark.enable_socket
def test_eten_b5():
    """
    Issue #2356: the text of the first page starts with the expected header.

    The check used to be a bare ``startswith(...)`` expression whose result
    was thrown away, so the test could only fail if extraction raised. It is
    now a real assertion.
    """
    # NOTE(review): the fixture is named after #2290 although this test is
    # for #2356 -- confirm that "iss2290.pdf" is the intended sample.
    reader = PdfReader(BytesIO(get_data_from_url(name="iss2290.pdf")))
    extracted = reader.pages[0].extract_text()
    assert extracted.startswith("1/7 \n富邦新終身壽險")
def test_missing_entries_in_cmap():
    """
    Issue #2702: text extraction with an unresolvable /ToUnicode reference.

    The issue was observed on damaged PDFs. The file from the report is not
    used because it is too slow/long; the same error is reproduced here by
    modifying crazyones.pdf.
    """
    document = PdfReader(RESOURCE_ROOT / "crazyones.pdf")
    first_page = document.pages[0]
    # Point /ToUnicode of font /F1 at object number 99999999.
    dangling_ref = IndirectObject(99999999, 0, document)
    font = first_page["/Resources"]["/Font"]["/F1"]
    font[NameObject("/ToUnicode")] = dangling_ref
    # Extraction must complete without raising.
    first_page.extract_text()
def test_null_missing_width():
    """Coverage for #2792: empty /Widths and a null /MissingWidth."""
    pdf_writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    first_page = pdf_writer.pages[0]
    font = first_page["/Resources"]["/Font"]["/F1"]
    # Replace the width table with an empty array ...
    font[NameObject("/Widths")] = ArrayObject()
    # ... and set the descriptor's /MissingWidth to a null object.
    descriptor = font["/FontDescriptor"]
    descriptor[NameObject("/MissingWidth")] = NullObject()
    # Extraction must complete without raising.
    first_page.extract_text()
@pytest.mark.enable_socket
def test_unigb_utf16():
    """Cf #2812: the expected title appears in the text of the second page."""
    sample = get_data_from_url(
        "https://github.com/user-attachments/files/16767536/W020240105322424121296.pdf",
        name="iss2812.pdf",
    )
    second_page_text = PdfReader(BytesIO(sample)).pages[1].extract_text()
    assert "《中国能源展望 2060(2024 年版)》编写委员会" in second_page_text
@pytest.mark.enable_socket
def test_too_many_differences():
    """Cf #2836: the first page of the sample extracts to an empty string."""
    sample = get_data_from_url(
        "https://github.com/user-attachments/files/16911741/dumb_extract_text_crash.pdf",
        name="iss2836.pdf",
    )
    first_page_text = PdfReader(BytesIO(sample)).pages[0].extract_text()
    assert first_page_text == ""
@pytest.mark.enable_socket
def test_iss2925():
    """The expected sentence fragment appears in the text of page index 3."""
    sample = get_data_from_url(
        "https://github.com/user-attachments/files/17621508/2305.09315.pdf",
        name="iss2925.pdf",
    )
    page_text = PdfReader(BytesIO(sample)).pages[3].extract_text()
    expected = "slicing on the PDG to extract the relevant contextual"
    assert expected in page_text
@pytest.mark.enable_socket
def test_iss2966():
    """Regression test for issue #2966: indirect objects in fonts."""
    sample = get_data_from_url(
        "https://github.com/user-attachments/files/17904233/repro_out.pdf",
        name="iss2966.pdf",
    )
    first_page_text = PdfReader(BytesIO(sample)).pages[0].extract_text()
    assert "Lorem ipsum dolor sit amet" in first_page_text
@pytest.mark.enable_socket
def test_binascii_odd_length_string(caplog):
    """
    Tests for #2216.

    The first page still yields the expected text, and the skipped broken
    line is reported in the captured log.
    """
    sample = get_data_from_url(
        "https://github.com/user-attachments/files/18199642/iss2216.pdf",
        name="iss2216.pdf",
    )
    first_page = PdfReader(BytesIO(sample)).pages[0]
    page_text = first_page.extract_text()
    assert "\n(Many other theorems may\n" in page_text
    expected_log = "Skipping broken line b'143f 143f 10300': Odd-length string\n"
    assert expected_log in caplog.text
@pytest.mark.enable_socket
def test_standard_encoding(caplog):
    """
    Tests for #3156.

    The first page extracts to exactly "Lorem ipsum" and no
    "Advanced encoding" message is logged.
    """
    sample = get_data_from_url(
        "https://github.com/user-attachments/files/18983503/standard-encoding.pdf",
        name="issue3156.pdf",
    )
    first_page = PdfReader(BytesIO(sample)).pages[0]
    page_text = first_page.extract_text()
    assert page_text == "Lorem ipsum"
    assert "Advanced encoding" not in caplog.text
@pytest.mark.enable_socket
def test_function_in_font_widths(caplog):
    """
    Tests for #3153.

    Page index 455 still yields the expected text, and the non-numeric
    width value is reported in the captured log.
    """
    sample = get_data_from_url(
        "https://github.com/user-attachments/files/18945709/Marseille_pypdf_level_0.2._compressed.pdf",
        name="issue3153.pdf",
    )
    target_page = PdfReader(BytesIO(sample)).pages[455]
    page_text = target_page.extract_text()
    assert "La vulnérabilité correspond aux conséquences potentielles" in page_text
    expected_log = "Expected numeric value for width, got {'/Bounds': [0.25, 0.25],"
    assert expected_log in caplog.text