Skip to content

Commit 72da7b2

Browse files
authored
BUG: Fix read_xml raising syntax error when reading XML with Chinese tags (pandas-dev#47905)
* BUG: pd.read_xml read chinese tag throw Syntax error * Fix parser issue in new tests; adjust typing
1 parent e12fd56 commit 72da7b2

File tree

4 files changed

+116
-22
lines changed

4 files changed

+116
-22
lines changed

doc/source/whatsnew/v1.4.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Bug fixes
2626
~~~~~~~~~
2727
- The :class:`errors.FutureWarning` raised when passing arguments (other than ``filepath_or_buffer``) as positional in :func:`read_csv` is now raised at the correct stacklevel (:issue:`47385`)
2828
- Bug in :meth:`DataFrame.to_sql` when ``method`` was a ``callable`` that did not return an ``int`` and would raise a ``TypeError`` (:issue:`46891`)
29+
- Bug in :func:`read_xml` raising ``XMLSyntaxError`` when reading XML files with Chinese character tags (:issue:`47902`)
2930

3031
.. ---------------------------------------------------------------------------
3132

pandas/io/xml.py

+23-22
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
)
1313

1414
from pandas._typing import (
15+
TYPE_CHECKING,
1516
CompressionOptions,
1617
ConvertersArg,
1718
DtypeArg,
@@ -46,6 +47,14 @@
4647
)
4748
from pandas.io.parsers import TextParser
4849

50+
if TYPE_CHECKING:
51+
from xml.etree.ElementTree import Element
52+
53+
from lxml.etree import (
54+
_Element,
55+
_XSLTResultTree,
56+
)
57+
4958

5059
@doc(
5160
storage_options=_shared_docs["storage_options"],
@@ -410,7 +419,7 @@ def _validate_names(self) -> None:
410419

411420
def _parse_doc(
412421
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
413-
) -> bytes:
422+
) -> Element | _Element:
414423
"""
415424
Build tree from path_or_buffer.
416425
@@ -427,18 +436,15 @@ class _EtreeFrameParser(_XMLFrameParser):
427436
"""
428437

429438
def parse_data(self) -> list[dict[str, str | None]]:
430-
from xml.etree.ElementTree import (
431-
XML,
432-
iterparse,
433-
)
439+
from xml.etree.ElementTree import iterparse
434440

435441
if self.stylesheet is not None:
436442
raise ValueError(
437443
"To use stylesheet, you need lxml installed and selected as parser."
438444
)
439445

440446
if self.iterparse is None:
441-
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
447+
self.xml_doc = self._parse_doc(self.path_or_buffer)
442448
self._validate_path()
443449
elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
444450

@@ -503,11 +509,10 @@ def _validate_names(self) -> None:
503509

504510
def _parse_doc(
505511
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
506-
) -> bytes:
512+
) -> Element:
507513
from xml.etree.ElementTree import (
508514
XMLParser,
509515
parse,
510-
tostring,
511516
)
512517

513518
handle_data = get_data_from_filepath(
@@ -519,9 +524,9 @@ def _parse_doc(
519524

520525
with preprocess_data(handle_data) as xml_data:
521526
curr_parser = XMLParser(encoding=self.encoding)
522-
r = parse(xml_data, parser=curr_parser)
527+
doc = parse(xml_data, parser=curr_parser)
523528

524-
return tostring(r.getroot())
529+
return doc.getroot()
525530

526531

527532
class _LxmlFrameParser(_XMLFrameParser):
@@ -539,17 +544,14 @@ def parse_data(self) -> list[dict[str, str | None]]:
539544
validate xpath, names, optionally parse and run XSLT,
540545
and parse original or transformed XML and return specific nodes.
541546
"""
542-
from lxml.etree import (
543-
XML,
544-
iterparse,
545-
)
547+
from lxml.etree import iterparse
546548

547549
if self.iterparse is None:
548-
self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
550+
self.xml_doc = self._parse_doc(self.path_or_buffer)
549551

550552
if self.stylesheet:
551-
self.xsl_doc = XML(self._parse_doc(self.stylesheet))
552-
self.xml_doc = XML(self._transform_doc())
553+
self.xsl_doc = self._parse_doc(self.stylesheet)
554+
self.xml_doc = self._transform_doc()
553555

554556
self._validate_path()
555557
elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
@@ -607,12 +609,11 @@ def _validate_names(self) -> None:
607609

608610
def _parse_doc(
609611
self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
610-
) -> bytes:
612+
) -> _Element:
611613
from lxml.etree import (
612614
XMLParser,
613615
fromstring,
614616
parse,
615-
tostring,
616617
)
617618

618619
handle_data = get_data_from_filepath(
@@ -637,9 +638,9 @@ def _parse_doc(
637638
else:
638639
doc = parse(xml_data, parser=curr_parser)
639640

640-
return tostring(doc)
641+
return doc
641642

642-
def _transform_doc(self) -> bytes:
643+
def _transform_doc(self) -> _XSLTResultTree:
643644
"""
644645
Transform original tree using stylesheet.
645646
@@ -652,7 +653,7 @@ def _transform_doc(self) -> bytes:
652653
transformer = XSLT(self.xsl_doc)
653654
new_doc = transformer(self.xml_doc)
654655

655-
return bytes(new_doc)
656+
return new_doc
656657

657658

658659
def get_data_from_filepath(
pandas/tests/io/data/xml/doc_ch_utf.xml

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE fragmentdoc [
3+
<!ELEMENT qafragment (qa+)>
4+
<!ELEMENT qa ( q, a )>
5+
<!ELEMENT q ( #PCDATA | title | name)*>
6+
<!ATTLIST q speaker CDATA #REQUIRED>
7+
<!ELEMENT a ( #PCDATA | title | name)*>
8+
<!ATTLIST a speaker CDATA #REQUIRED>
9+
<!ELEMENT name (#PCDATA)>
10+
<!ELEMENT title (#PCDATA)>
11+
<!ENTITY C4-4F71 "Sorry, this is Big5 only">
12+
]>
13+
14+
<qafragment>
15+
<qa>
16+
<問 speaker="Opponent">問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正</問>
17+
<答 speaker="吉藏">答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申
18+
故<title>大品經</title> <name>善吉</name> 致問 何等是菩薩道 何等非菩薩道
19+
<name>佛</name>答云 有所得非菩薩道 無所得是菩薩道</答>
20+
</qa>
21+
<qa>
22+
<問 speaker="Opponent">問 既破有得申無得 亦應但破性執申假名以不</問>
23+
<a speaker="吉藏">答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也</a>
24+
</qa>
25+
<qa>
26+
<問 speaker="Opponent">問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶</問>
27+
<答 speaker="吉藏">答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破</答>
28+
</qa>
29+
</qafragment>

pandas/tests/io/xml/test_xml.py

+63
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,40 @@ def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
423423
tm.assert_frame_equal(df_str, df_expected)
424424

425425

426+
def test_string_charset(parser):
427+
txt = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
428+
429+
df_str = read_xml(txt, parser=parser)
430+
431+
df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0])
432+
433+
tm.assert_frame_equal(df_str, df_expected)
434+
435+
436+
def test_file_charset(datapath, parser):
437+
xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml")
438+
439+
df_file = read_xml(datapath(xml_file), parser=parser)
440+
441+
df_expected = DataFrame(
442+
{
443+
"問": [
444+
"問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
445+
"問 既破有得申無得 亦應但破性執申假名以不",
446+
"問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
447+
],
448+
"答": [
449+
"答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故",
450+
None,
451+
"答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
452+
],
453+
"a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None],
454+
}
455+
)
456+
457+
tm.assert_frame_equal(df_file, df_expected)
458+
459+
426460
def test_file_handle_close(datapath, parser):
427461
xml_file = datapath("io", "data", "xml", "books.xml")
428462

@@ -1086,6 +1120,35 @@ def test_stylesheet_buffered_reader(datapath, mode):
10861120
tm.assert_frame_equal(df_kml, df_style)
10871121

10881122

1123+
@td.skip_if_no("lxml")
1124+
def test_style_charset():
1125+
xml = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
1126+
1127+
xsl = """\
1128+
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
1129+
<xsl:output omit-xml-declaration="yes" indent="yes"/>
1130+
<xsl:strip-space elements="*"/>
1131+
1132+
<xsl:template match="node()|@*">
1133+
<xsl:copy>
1134+
<xsl:apply-templates select="node()|@*"/>
1135+
</xsl:copy>
1136+
</xsl:template>
1137+
1138+
<xsl:template match="中文標籤">
1139+
<根>
1140+
<xsl:apply-templates />
1141+
</根>
1142+
</xsl:template>
1143+
1144+
</xsl:stylesheet>"""
1145+
1146+
df_orig = read_xml(xml)
1147+
df_style = read_xml(xml, stylesheet=xsl)
1148+
1149+
tm.assert_frame_equal(df_orig, df_style)
1150+
1151+
10891152
@td.skip_if_no("lxml")
10901153
def test_not_stylesheet(datapath):
10911154
from lxml.etree import XSLTParseError

0 commit comments

Comments
 (0)