Skip to content

Commit e69b1ff

Browse files
committed
- Fixed bug #49687 (utf8_decode vulnerabilities and deficiencies in the number
of reported malformed sequences). (Gustavo) #Made a public interface for get_next_char/utf-8 in trunk to use in utf8_decode. #In PHP 5.3, trunk's get_next_char was copied to xml.c because 5.3's #get_next_char is different and is not prepared to recover appropriately from #errors.
1 parent da400e7 commit e69b1ff

File tree

4 files changed

+49
-32
lines changed

4 files changed

+49
-32
lines changed

ext/standard/html.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,9 @@ ZEND_EXTERN_MODULE_GLOBALS(mbstring)
9292

9393
/* {{{ get_next_char
9494
*/
95-
static unsigned int get_next_char(
95+
static inline unsigned int get_next_char(
9696
enum entity_charset charset,
97-
unsigned char *str,
97+
const unsigned char *str,
9898
size_t str_len,
9999
size_t *cursor,
100100
int *status)
@@ -352,6 +352,18 @@ static unsigned int get_next_char(
352352
}
353353
/* }}} */
354354

355+
/* {{{ php_next_utf8_char
356+
* Public interface for get_next_char used with UTF-8 */
357+
PHPAPI unsigned int php_next_utf8_char(
358+
const unsigned char *str,
359+
size_t str_len,
360+
size_t *cursor,
361+
int *status)
362+
{
363+
return get_next_char(cs_utf_8, str, str_len, cursor, status);
364+
}
365+
/* }}} */
366+
355367
/* {{{ entity_charset determine_charset
356368
* returns the charset identifier based on current locale or a hint.
357369
* defaults to UTF-8 */

ext/standard/html.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,5 +57,6 @@ PHP_FUNCTION(get_html_translation_table);
5757
PHPAPI char *php_escape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
5858
PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC);
5959
PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
60+
PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, int *status);
6061

6162
#endif /* HTML_H */

ext/xml/tests/bug49687.phpt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
--TEST--
2+
Bug #49687 Several utf8_decode deficiencies and vulnerabilities
3+
--SKIPIF--
4+
<?php
5+
require_once("skipif.inc");
6+
if (!extension_loaded('xml')) die ("skip xml extension not available");
7+
?>
8+
--FILE--
9+
<?php
10+
11+
$tests = array(
12+
"\x41\xC2\x3E\x42",
13+
"\xE3\x80\x22",
14+
"\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98",
15+
);
16+
foreach ($tests as $t) {
17+
echo bin2hex(utf8_decode($t)), "\n";
18+
}
19+
echo "Done.\n";
20+
--EXPECT--
21+
413f3e42
22+
3f22
23+
413f3f423f433f3f
24+
Done.

ext/xml/xml.c

Lines changed: 10 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "zend_variables.h"
3333
#include "ext/standard/php_string.h"
3434
#include "ext/standard/info.h"
35+
#include "ext/standard/html.h"
3536

3637
#if HAVE_XML
3738

@@ -662,7 +663,7 @@ PHPAPI char *xml_utf8_encode(const char *s, int len, int *newlen, const XML_Char
662663
/* {{{ xml_utf8_decode */
663664
PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_Char *encoding)
664665
{
665-
int pos = len;
666+
size_t pos = 0;
666667
char *newbuf = emalloc(len + 1);
667668
unsigned int c;
668669
char (*decoder)(unsigned short) = NULL;
@@ -681,36 +682,15 @@ PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_
681682
newbuf[*newlen] = '\0';
682683
return newbuf;
683684
}
684-
while (pos > 0) {
685-
c = (unsigned char)(*s);
686-
if (c >= 0xf0) { /* four bytes encoded, 21 bits */
687-
if(pos-4 >= 0) {
688-
c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63);
689-
} else {
690-
c = '?';
691-
}
692-
s += 4;
693-
pos -= 4;
694-
} else if (c >= 0xe0) { /* three bytes encoded, 16 bits */
695-
if(pos-3 >= 0) {
696-
c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63);
697-
} else {
698-
c = '?';
699-
}
700-
s += 3;
701-
pos -= 3;
702-
} else if (c >= 0xc0) { /* two bytes encoded, 11 bits */
703-
if(pos-2 >= 0) {
704-
c = ((s[0]&63)<<6) | (s[1]&63);
705-
} else {
706-
c = '?';
707-
}
708-
s += 2;
709-
pos -= 2;
710-
} else {
711-
s++;
712-
pos--;
685+
686+
while (pos < (size_t)len) {
687+
int status = FAILURE;
688+
c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status);
689+
690+
if (status == FAILURE || c > 0xFFU) {
691+
c = '?';
713692
}
693+
714694
newbuf[*newlen] = decoder ? decoder(c) : c;
715695
++*newlen;
716696
}

0 commit comments

Comments
 (0)