diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php
index 3ac1372fdca1e..5eb467280a5a5 100644
--- a/src/wp-includes/compat.php
+++ b/src/wp-includes/compat.php
@@ -110,6 +110,153 @@ function _is_utf8_charset( $charset_slug ) {
 	);
 }
 
+if ( ! function_exists( 'mb_chr' ) ) :
+	/**
+	 * Compat function to mimic mb_chr().
+	 *
+	 * @ignore
+	 * @since 7.1.0
+	 *
+	 * @see _mb_chr()
+	 *
+	 * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT.
+	 * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
+	 * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure.
+	 */
+	function mb_chr( $codepoint, $encoding = null ) {
+		return _mb_chr( $codepoint, $encoding );
+	}
+endif;
+
+/**
+ * Internal compat function to mimic mb_chr().
+ *
+ * @ignore
+ * @since 7.1.0
+ *
+ * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT.
+ * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
+ * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure.
+ */
+function _mb_chr( $codepoint, $encoding = null ) {
+	if ( ! is_int( $codepoint ) || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) {
+		return false;
+	}
+
+	// Pre-check to ensure a valid code point.
+	if (
+		$codepoint < 0 ||
+		( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) ||
+		$codepoint > 0x10FFFF
+	) {
+		return false;
+	}
+
+	// One byte (ASCII): 0xxxxxxx.
+	if ( $codepoint <= 0x7F ) {
+		return chr( $codepoint );
+	}
+
+	// Two bytes: 110xxxxx 10xxxxxx.
+	if ( $codepoint <= 0x7FF ) {
+		$byte1 = chr( ( $codepoint >> 6 ) | 0xC0 );
+		$byte2 = chr( $codepoint & 0x3F | 0x80 );
+
+		return "{$byte1}{$byte2}";
+	}
+
+	// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+	if ( $codepoint <= 0xFFFF ) {
+		$byte1 = chr( ( $codepoint >> 12 ) | 0xE0 );
+		$byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+		$byte3 = chr( $codepoint & 0x3F | 0x80 );
+
+		return "{$byte1}{$byte2}{$byte3}";
+	}
+
+	// Any values above U+10FFFF are eliminated above in the pre-check.
+	$byte1 = chr( ( $codepoint >> 18 ) | 0xF0 );
+	$byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 );
+	$byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+	$byte4 = chr( $codepoint & 0x3F | 0x80 );
+
+	return "{$byte1}{$byte2}{$byte3}{$byte4}";
+}
+
+if ( ! function_exists( 'mb_ord' ) ) :
+	/**
+	 * Compat function to mimic mb_ord().
+	 *
+	 * @ignore
+	 * @since 7.1.0
+	 *
+	 * @see _mb_ord()
+	 *
+	 * @param string       $string   Return the code point at the start of this string.
+	 * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
+	 * @return int|false The Unicode code point for the first character of string or false on failure.
+	 */
+	function mb_ord( $string, $encoding = null ) {
+		return _mb_ord( $string, $encoding );
+	}
+endif;
+
+/**
+ * Internal compat function to mimic mb_ord().
+ *
+ * @ignore
+ * @since 7.1.0
+ *
+ * @param string       $string   Return the code point at the start of this string.
+ * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
+ * @return int|false The Unicode code point for the first character of string or false on failure.
+ */
+function _mb_ord( $string, $encoding = null ) {
+	if ( ! is_string( $string ) || '' === $string || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) {
+		return false;
+	}
+
+	$byte_length    = 0;
+	$invalid_length = 0;
+	$found_count    = _wp_scan_utf8( $string, $byte_length, $invalid_length, null, 1 );
+
+	if ( 1 !== $found_count ) {
+		return false;
+	}
+
+	// These are valid code points, so no further validation is required.
+	$b0 = ord( $string[0] );
+
+	switch ( $byte_length ) {
+		case 1:
+			return $b0;
+
+		case 2:
+			return (
+				( ( $b0 & 0x1F ) << 6 ) |
+				( ( ord( $string[1] ) & 0x3F ) )
+			);
+
+		case 3:
+			return (
+				( ( $b0 & 0x0F ) << 12 ) |
+				( ( ord( $string[1] ) & 0x3F ) << 6 ) |
+				( ( ord( $string[2] ) & 0x3F ) )
+			);
+
+		case 4:
+			return (
+				( ( $b0 & 0x07 ) << 18 ) |
+				( ( ord( $string[1] ) & 0x3F ) << 12 ) |
+				( ( ord( $string[2] ) & 0x3F ) << 6 ) |
+				( ( ord( $string[3] ) & 0x3F ) )
+			);
+	}
+
+	// Any other byte length is unexpected; fail explicitly rather than returning null.
+	return false;
+}
+
 if ( ! function_exists( 'mb_substr' ) ) :
 	/**
 	 * Compat function to mimic mb_substr().
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
index bd62f311aef13..d902f4b7cabc4 100644
--- a/src/wp-includes/html-api/class-wp-html-decoder.php
+++ b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -424,40 +424,16 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
 	 * @return string Converted code point, or `�` if invalid.
 	 */
 	public static function code_point_to_utf8_bytes( $code_point ): string {
-		// Pre-check to ensure a valid code point.
-		if (
-			$code_point <= 0 ||
-			( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
-			$code_point > 0x10FFFF
-		) {
-			return '�';
-		}
-
-		if ( $code_point <= 0x7F ) {
-			return chr( $code_point );
-		}
-
-		if ( $code_point <= 0x7FF ) {
-			$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
-			$byte2 = chr( $code_point & 0x3F | 0x80 );
-
-			return "{$byte1}{$byte2}";
-		}
-
-		if ( $code_point <= 0xFFFF ) {
-			$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
-			$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-			$byte3 = chr( $code_point & 0x3F | 0x80 );
-
-			return "{$byte1}{$byte2}{$byte3}";
-		}
-
-		// Any values above U+10FFFF are eliminated above in the pre-check.
-		$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
-		$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
-		$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-		$byte4 = chr( $code_point & 0x3F | 0x80 );
+		/*
+		 * `mb_chr()` encodes zero as a NUL byte, whereas this method has always
+		 * reported zero and negative inputs as invalid. Preserve that behavior.
+		 */
+		if ( $code_point <= 0 ) {
+			return '�';
+		}
+
+		$string = mb_chr( $code_point );
 
-		return "{$byte1}{$byte2}{$byte3}{$byte4}";
+		return false !== $string ? $string : '�';
 	}
 }
diff --git a/tests/phpunit/tests/compat/mbChr.php b/tests/phpunit/tests/compat/mbChr.php
new file mode 100644
index 0000000000000..6862ed9170479
--- /dev/null
+++ b/tests/phpunit/tests/compat/mbChr.php
@@ -0,0 +1,28 @@
+assertSame(
+				mb_chr( $code_point ),
+				_mb_chr( $code_point ),
+				'Failed to properly encode the code point as a UTF-8 string.'
+			);
+		}
+
+		$this->assertFalse( _mb_chr( ord( 'A' ), 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertFalse( _mb_chr( ord( 'A' ), 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertSame( 'A', _mb_chr( ord( 'A' ), 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
+	}
+}
diff --git a/tests/phpunit/tests/compat/mbOrd.php b/tests/phpunit/tests/compat/mbOrd.php
new file mode 100644
index 0000000000000..214547498d643
--- /dev/null
+++ b/tests/phpunit/tests/compat/mbOrd.php
@@ -0,0 +1,42 @@
+assertSame(
+					$code_point,
+					_mb_ord( mb_chr( $code_point ) ),
+					'Failed to properly decode the code point from the string.'
+				);
+			}
+		}
+
+		$this->assertFalse( _mb_ord( '' ), 'Should have failed on empty string.' );
+		$this->assertFalse( _mb_ord( 'hi', 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertFalse( _mb_ord( 'hi', 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertSame( ord( 'A' ), _mb_ord( 'A', 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
+		$this->assertFalse( _mb_ord( "\xC0" ), 'Should have rejected invalid UTF-8 code point.' );
+		$this->assertFalse( _mb_ord( "\xED\xA0\x80" ), 'Should have rejected unpaired surrogate half.' );
+		$this->assertFalse( _mb_ord( substr( "\xED\xA0\x80", 0, 2 ) ), 'Should have rejected truncated multi-byte sequence.' );
+	}
+}