@@ -1349,7 +1349,7 @@ def test_issue8271(self):
1349
1349
# with start byte of a 2-byte sequence
1350
1350
(b'\xc2 ' , FFFD ), # only the start byte
1351
1351
(b'\xc2 \xc2 ' , FFFD * 2 ), # 2 start bytes
1352
- (b'\xc2 \xc2 \xc2 ' , FFFD * 3 ), # 2 start bytes
1352
+ (b'\xc2 \xc2 \xc2 ' , FFFD * 3 ), # 3 start bytes
1353
1353
(b'\xc2 \x41 ' , FFFD + 'A' ), # invalid continuation byte
1354
1354
# with start byte of a 3-byte sequence
1355
1355
(b'\xe1 ' , FFFD ), # only the start byte
@@ -1419,6 +1419,226 @@ def test_issue8271(self):
1419
1419
self .assertEqual (seq .decode ('utf-8' , 'ignore' ),
1420
1420
res .replace ('\uFFFD ' , '' ))
1421
1421
1422
def to_bytestring(self, seq):
    """
    Convert a string of whitespace-separated hexadecimal byte values into
    a bytes object.
    E.g. 'E1 80 41' becomes the three bytes 0xE1, 0x80, 0x41.
    """
    return bytes(int(c, 16) for c in seq.split())
1424
+
1425
def assertCorrectUTF8Decoding(self, seq, res, err):
    """
    Check that an invalid UTF-8 sequence raises an UnicodeDecodeError when
    'strict' is used, returns res when 'replace' is used, and returns res
    with every U+FFFD removed when 'ignore' is used.

    seq is the invalid byte sequence, res is the string expected with
    errors='replace', and err is a substring that must appear in the
    message of the UnicodeDecodeError.  The 'replace' and 'ignore' checks
    are repeated with seq embedded between b'aaaa' and b'bbbb'.
    """
    with self.assertRaises(UnicodeDecodeError) as cm:
        seq.decode('utf-8')
    exc = cm.exception

    self.assertIn(err, str(exc))
    self.assertEqual(seq.decode('utf-8', 'replace'), res)
    self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'replace'),
                     'aaaa' + res + 'bbbb')
    # With 'ignore' the undecodable bytes are dropped instead of being
    # replaced, so the expected result is res without the U+FFFD characters.
    res = res.replace('\ufffd', '')
    self.assertEqual(seq.decode('utf-8', 'ignore'), res)
    self.assertEqual((b'aaaa' + seq + b'bbbb').decode('utf-8', 'ignore'),
                     'aaaa' + res + 'bbbb')
1443
+
1444
def test_invalid_start_byte(self):
    """
    Test that an 'invalid start byte' error is raised when the first byte
    is not in the ASCII range or is not a valid start byte of a 2-, 3-, or
    4-bytes sequence. The invalid start byte is replaced with a single
    U+FFFD when errors='replace'.
    E.g. <80> is a continuation byte and can appear only after a start byte.
    """
    FFFD = '\ufffd'
    # Iterating over a bytes object yields ints, so each value is wrapped
    # back into a one-byte bytes object before being decoded.
    for byte in b'\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
        self.assertCorrectUTF8Decoding(bytes([byte]), FFFD,
                                       'invalid start byte')
1456
+
1457
def test_unexpected_end_of_data(self):
    """
    Test that an 'unexpected end of data' error is raised when the string
    ends after a start byte of a 2-, 3-, or 4-bytes sequence without having
    enough continuation bytes.  The incomplete sequence is replaced with a
    single U+FFFD when errors='replace'.
    E.g. in the sequence <F3 80 80>, F3 is the start byte of a 4-bytes
    sequence, but it's followed by only 2 valid continuation bytes and the
    last continuation byte is missing.
    Note: the continuation bytes must be all valid, if one of them is
    invalid another error will be raised.
    """
    sequences = [
        'C2', 'DF',
        'E0 A0', 'E0 BF', 'E1 80', 'E1 BF', 'EC 80', 'EC BF',
        'ED 80', 'ED 9F', 'EE 80', 'EE BF', 'EF 80', 'EF BF',
        'F0 90', 'F0 BF', 'F0 90 80', 'F0 90 BF', 'F0 BF 80', 'F0 BF BF',
        'F1 80', 'F1 BF', 'F1 80 80', 'F1 80 BF', 'F1 BF 80', 'F1 BF BF',
        'F3 80', 'F3 BF', 'F3 80 80', 'F3 80 BF', 'F3 BF 80', 'F3 BF BF',
        'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
    ]
    FFFD = '\ufffd'
    for seq in sequences:
        # Every truncated sequence collapses into exactly one U+FFFD.
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), FFFD,
                                       'unexpected end of data')
1482
+
1483
def test_invalid_cb_for_2bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte of a 2-bytes sequence is invalid.  The start byte
    is replaced by a single U+FFFD and the second byte is handled
    separately when errors='replace'.
    E.g. in the sequence <C2 41>, C2 is the start byte of a 2-bytes
    sequence, but 41 is not a valid continuation byte because it's the
    ASCII letter 'A'.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Each entry is (hex bytes, expected result with errors='replace').
    # An ASCII second byte is decoded as itself; a second byte in C0..FF is
    # itself an error and yields a second U+FFFD.
    sequences = [
        ('C2 00', FFFD + '\x00'), ('C2 7F', FFFD + '\x7f'),
        ('C2 C0', FFFDx2), ('C2 FF', FFFDx2),
        ('DF 00', FFFD + '\x00'), ('DF 7F', FFFD + '\x7f'),
        ('DF C0', FFFDx2), ('DF FF', FFFDx2),
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
                                       'invalid continuation byte')
1504
+
1505
def test_invalid_cb_for_3bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 3-bytes sequence are invalid.  When
    errors='replace', if the first continuation byte is valid, the first
    two bytes (start byte + 1st cb) are replaced by a single U+FFFD and the
    third byte is handled separately, otherwise only the start byte is
    replaced with a U+FFFD and the other continuation bytes are handled
    separately.
    E.g. in the sequence <E1 80 41>, E1 is the start byte of a 3-bytes
    sequence, 80 is a valid continuation byte, but 41 is not a valid cb
    because it's the ASCII letter 'A'.
    Note: when the start byte is E0 or ED, the valid ranges for the first
    continuation byte are limited to A0..BF and 80..9F respectively.
    Python 2 used to consider all the bytes in range 80..BF valid when the
    start byte was ED.  This is fixed in Python 3.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Each entry is (hex bytes, expected result with errors='replace').
    sequences = [
        # start byte E0 (first continuation byte restricted to A0..BF)
        ('E0 00', FFFD + '\x00'), ('E0 7F', FFFD + '\x7f'),
        ('E0 80', FFFDx2), ('E0 9F', FFFDx2),
        ('E0 C0', FFFDx2), ('E0 FF', FFFDx2),
        ('E0 A0 00', FFFD + '\x00'), ('E0 A0 7F', FFFD + '\x7f'),
        ('E0 A0 C0', FFFDx2), ('E0 A0 FF', FFFDx2),
        ('E0 BF 00', FFFD + '\x00'), ('E0 BF 7F', FFFD + '\x7f'),
        ('E0 BF C0', FFFDx2), ('E0 BF FF', FFFDx2),
        # start byte E1
        ('E1 00', FFFD + '\x00'), ('E1 7F', FFFD + '\x7f'),
        ('E1 C0', FFFDx2), ('E1 FF', FFFDx2),
        ('E1 80 00', FFFD + '\x00'), ('E1 80 7F', FFFD + '\x7f'),
        ('E1 80 C0', FFFDx2), ('E1 80 FF', FFFDx2),
        ('E1 BF 00', FFFD + '\x00'), ('E1 BF 7F', FFFD + '\x7f'),
        ('E1 BF C0', FFFDx2), ('E1 BF FF', FFFDx2),
        # start byte EC
        ('EC 00', FFFD + '\x00'), ('EC 7F', FFFD + '\x7f'),
        ('EC C0', FFFDx2), ('EC FF', FFFDx2),
        ('EC 80 00', FFFD + '\x00'), ('EC 80 7F', FFFD + '\x7f'),
        ('EC 80 C0', FFFDx2), ('EC 80 FF', FFFDx2),
        ('EC BF 00', FFFD + '\x00'), ('EC BF 7F', FFFD + '\x7f'),
        ('EC BF C0', FFFDx2), ('EC BF FF', FFFDx2),
        # start byte ED (first continuation byte restricted to 80..9F)
        ('ED 00', FFFD + '\x00'), ('ED 7F', FFFD + '\x7f'),
        ('ED A0', FFFDx2), ('ED BF', FFFDx2),  # see the docstring note
        ('ED C0', FFFDx2), ('ED FF', FFFDx2),
        ('ED 80 00', FFFD + '\x00'), ('ED 80 7F', FFFD + '\x7f'),
        ('ED 80 C0', FFFDx2), ('ED 80 FF', FFFDx2),
        ('ED 9F 00', FFFD + '\x00'), ('ED 9F 7F', FFFD + '\x7f'),
        ('ED 9F C0', FFFDx2), ('ED 9F FF', FFFDx2),
        # start byte EE
        ('EE 00', FFFD + '\x00'), ('EE 7F', FFFD + '\x7f'),
        ('EE C0', FFFDx2), ('EE FF', FFFDx2),
        ('EE 80 00', FFFD + '\x00'), ('EE 80 7F', FFFD + '\x7f'),
        ('EE 80 C0', FFFDx2), ('EE 80 FF', FFFDx2),
        ('EE BF 00', FFFD + '\x00'), ('EE BF 7F', FFFD + '\x7f'),
        ('EE BF C0', FFFDx2), ('EE BF FF', FFFDx2),
        # start byte EF
        ('EF 00', FFFD + '\x00'), ('EF 7F', FFFD + '\x7f'),
        ('EF C0', FFFDx2), ('EF FF', FFFDx2),
        ('EF 80 00', FFFD + '\x00'), ('EF 80 7F', FFFD + '\x7f'),
        ('EF 80 C0', FFFDx2), ('EF 80 FF', FFFDx2),
        ('EF BF 00', FFFD + '\x00'), ('EF BF 7F', FFFD + '\x7f'),
        ('EF BF C0', FFFDx2), ('EF BF FF', FFFDx2),
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
                                       'invalid continuation byte')
1562
+
1563
def test_invalid_cb_for_4bytes_seq(self):
    """
    Test that an 'invalid continuation byte' error is raised when the
    continuation byte(s) of a 4-bytes sequence are invalid.  When
    errors='replace', the start byte and all the following valid
    continuation bytes are replaced with a single U+FFFD, and all the bytes
    starting from the first invalid continuation byte (included) are
    handled separately.
    E.g. in the sequence <F1 80 41>, F1 is the start byte of a 4-bytes
    sequence, 80 is a valid continuation byte, but 41 is not a valid cb
    because it's the ASCII letter 'A'.
    Note: when the start byte is F0 or F4, the valid ranges for the first
    continuation byte are limited to 90..BF and 80..8F respectively.
    """
    FFFD = '\ufffd'
    FFFDx2 = FFFD * 2
    # Each entry is (hex bytes, expected result with errors='replace').
    sequences = [
        # start byte F0 (first continuation byte restricted to 90..BF)
        ('F0 00', FFFD + '\x00'), ('F0 7F', FFFD + '\x7f'),
        ('F0 80', FFFDx2), ('F0 8F', FFFDx2),
        ('F0 C0', FFFDx2), ('F0 FF', FFFDx2),
        ('F0 90 00', FFFD + '\x00'), ('F0 90 7F', FFFD + '\x7f'),
        ('F0 90 C0', FFFDx2), ('F0 90 FF', FFFDx2),
        ('F0 BF 00', FFFD + '\x00'), ('F0 BF 7F', FFFD + '\x7f'),
        ('F0 BF C0', FFFDx2), ('F0 BF FF', FFFDx2),
        ('F0 90 80 00', FFFD + '\x00'), ('F0 90 80 7F', FFFD + '\x7f'),
        ('F0 90 80 C0', FFFDx2), ('F0 90 80 FF', FFFDx2),
        ('F0 90 BF 00', FFFD + '\x00'), ('F0 90 BF 7F', FFFD + '\x7f'),
        ('F0 90 BF C0', FFFDx2), ('F0 90 BF FF', FFFDx2),
        ('F0 BF 80 00', FFFD + '\x00'), ('F0 BF 80 7F', FFFD + '\x7f'),
        ('F0 BF 80 C0', FFFDx2), ('F0 BF 80 FF', FFFDx2),
        ('F0 BF BF 00', FFFD + '\x00'), ('F0 BF BF 7F', FFFD + '\x7f'),
        ('F0 BF BF C0', FFFDx2), ('F0 BF BF FF', FFFDx2),
        # start byte F1
        ('F1 00', FFFD + '\x00'), ('F1 7F', FFFD + '\x7f'),
        ('F1 C0', FFFDx2), ('F1 FF', FFFDx2),
        ('F1 80 00', FFFD + '\x00'), ('F1 80 7F', FFFD + '\x7f'),
        ('F1 80 C0', FFFDx2), ('F1 80 FF', FFFDx2),
        ('F1 BF 00', FFFD + '\x00'), ('F1 BF 7F', FFFD + '\x7f'),
        ('F1 BF C0', FFFDx2), ('F1 BF FF', FFFDx2),
        ('F1 80 80 00', FFFD + '\x00'), ('F1 80 80 7F', FFFD + '\x7f'),
        ('F1 80 80 C0', FFFDx2), ('F1 80 80 FF', FFFDx2),
        ('F1 80 BF 00', FFFD + '\x00'), ('F1 80 BF 7F', FFFD + '\x7f'),
        ('F1 80 BF C0', FFFDx2), ('F1 80 BF FF', FFFDx2),
        ('F1 BF 80 00', FFFD + '\x00'), ('F1 BF 80 7F', FFFD + '\x7f'),
        ('F1 BF 80 C0', FFFDx2), ('F1 BF 80 FF', FFFDx2),
        ('F1 BF BF 00', FFFD + '\x00'), ('F1 BF BF 7F', FFFD + '\x7f'),
        ('F1 BF BF C0', FFFDx2), ('F1 BF BF FF', FFFDx2),
        # start byte F3
        ('F3 00', FFFD + '\x00'), ('F3 7F', FFFD + '\x7f'),
        ('F3 C0', FFFDx2), ('F3 FF', FFFDx2),
        ('F3 80 00', FFFD + '\x00'), ('F3 80 7F', FFFD + '\x7f'),
        ('F3 80 C0', FFFDx2), ('F3 80 FF', FFFDx2),
        ('F3 BF 00', FFFD + '\x00'), ('F3 BF 7F', FFFD + '\x7f'),
        ('F3 BF C0', FFFDx2), ('F3 BF FF', FFFDx2),
        ('F3 80 80 00', FFFD + '\x00'), ('F3 80 80 7F', FFFD + '\x7f'),
        ('F3 80 80 C0', FFFDx2), ('F3 80 80 FF', FFFDx2),
        ('F3 80 BF 00', FFFD + '\x00'), ('F3 80 BF 7F', FFFD + '\x7f'),
        ('F3 80 BF C0', FFFDx2), ('F3 80 BF FF', FFFDx2),
        ('F3 BF 80 00', FFFD + '\x00'), ('F3 BF 80 7F', FFFD + '\x7f'),
        ('F3 BF 80 C0', FFFDx2), ('F3 BF 80 FF', FFFDx2),
        ('F3 BF BF 00', FFFD + '\x00'), ('F3 BF BF 7F', FFFD + '\x7f'),
        ('F3 BF BF C0', FFFDx2), ('F3 BF BF FF', FFFDx2),
        # start byte F4 (first continuation byte restricted to 80..8F)
        ('F4 00', FFFD + '\x00'), ('F4 7F', FFFD + '\x7f'),
        ('F4 90', FFFDx2), ('F4 BF', FFFDx2),
        ('F4 C0', FFFDx2), ('F4 FF', FFFDx2),
        ('F4 80 00', FFFD + '\x00'), ('F4 80 7F', FFFD + '\x7f'),
        ('F4 80 C0', FFFDx2), ('F4 80 FF', FFFDx2),
        ('F4 8F 00', FFFD + '\x00'), ('F4 8F 7F', FFFD + '\x7f'),
        ('F4 8F C0', FFFDx2), ('F4 8F FF', FFFDx2),
        ('F4 80 80 00', FFFD + '\x00'), ('F4 80 80 7F', FFFD + '\x7f'),
        ('F4 80 80 C0', FFFDx2), ('F4 80 80 FF', FFFDx2),
        ('F4 80 BF 00', FFFD + '\x00'), ('F4 80 BF 7F', FFFD + '\x7f'),
        ('F4 80 BF C0', FFFDx2), ('F4 80 BF FF', FFFDx2),
        ('F4 8F 80 00', FFFD + '\x00'), ('F4 8F 80 7F', FFFD + '\x7f'),
        ('F4 8F 80 C0', FFFDx2), ('F4 8F 80 FF', FFFDx2),
        ('F4 8F BF 00', FFFD + '\x00'), ('F4 8F BF 7F', FFFD + '\x7f'),
        ('F4 8F BF C0', FFFDx2), ('F4 8F BF FF', FFFDx2)
    ]
    for seq, res in sequences:
        self.assertCorrectUTF8Decoding(self.to_bytestring(seq), res,
                                       'invalid continuation byte')
1641
+
1422
1642
def test_codecs_idna (self ):
1423
1643
# Test whether trailing dot is preserved
1424
1644
self .assertEqual ("www.python.org." .encode ("idna" ), b"www.python.org." )
0 commit comments