From 500e1f0cc1afe4b171a9dfda9fc8bddfe8bb73a2 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 25 May 2026 15:53:14 -0700 Subject: [PATCH] Charset: Polyfill mb_ord() and mb_chr(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions are useful primitives but missing when the mbstring extension isn’t available. This patch adds polyfills for those few environments where this is the case so that WordPress code can unconditionally call them. --- src/wp-includes/compat.php | 141 ++++++++++++++++++ .../html-api/class-wp-html-decoder.php | 36 +---- tests/phpunit/tests/compat/mbChr.php | 28 ++++ tests/phpunit/tests/compat/mbOrd.php | 41 +++++ 4 files changed, 212 insertions(+), 34 deletions(-) create mode 100644 tests/phpunit/tests/compat/mbChr.php create mode 100644 tests/phpunit/tests/compat/mbOrd.php diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php index 3ac1372fdca1e..5eb467280a5a5 100644 --- a/src/wp-includes/compat.php +++ b/src/wp-includes/compat.php @@ -110,6 +110,147 @@ function _is_utf8_charset( $charset_slug ) { ); } +if ( ! function_exists( 'mb_chr' ) ) : + /** + * Compat function to mimic mb_chr(). + * + * @ignore + * @since 7.1.0 + * + * @see _mb_chr() + * + * @param int $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT + * @param "UTF-8"|null $encoding Must be 'UTF-8' or null. + * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure. + */ + function mb_chr( $codepoint, $encoding = null ) { + return _mb_chr( $codepoint, $encoding ); + } +endif; + +/** + * Internal compat function to mimic mb_chr(). + * + * @ignore + * @since 7.1.0 + * + * @param int $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT + * @param "UTF-8"|null $encoding Must be 'UTF-8' or null. 
+ * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure. + */ +function _mb_chr( $codepoint, $encoding = null ) { + if ( ! is_int( $codepoint ) || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) { + return false; + } + + // Pre-check to ensure a valid code point: not negative, not a surrogate (U+D800 through U+DFFF), and not above U+10FFFF. + if ( + $codepoint < 0 || + ( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) || + $codepoint > 0x10FFFF + ) { + return false; + } + + if ( $codepoint <= 0x7F ) { + return chr( $codepoint ); + } + + if ( $codepoint <= 0x7FF ) { + $byte1 = chr( ( $codepoint >> 6 ) | 0xC0 ); + $byte2 = chr( $codepoint & 0x3F | 0x80 ); + + return "{$byte1}{$byte2}"; + } + + if ( $codepoint <= 0xFFFF ) { + $byte1 = chr( ( $codepoint >> 12 ) | 0xE0 ); + $byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 ); + $byte3 = chr( $codepoint & 0x3F | 0x80 ); + + return "{$byte1}{$byte2}{$byte3}"; + } + + // Any values above U+10FFFF are eliminated above in the pre-check, so what remains (U+10000 through U+10FFFF) is written as four bytes. + $byte1 = chr( ( $codepoint >> 18 ) | 0xF0 ); + $byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 ); + $byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 ); + $byte4 = chr( $codepoint & 0x3F | 0x80 ); + + return "{$byte1}{$byte2}{$byte3}{$byte4}"; +} + +if ( ! function_exists( 'mb_ord' ) ) : + /** + * Compat function to mimic mb_ord(). + * + * @ignore + * @since 7.1.0 + * + * @see _mb_ord() + * + * @param string $string Return the code point at the start of this string. + * @param "UTF-8"|null $encoding Must be 'UTF-8' or null. + * @return int|false The Unicode code point for the first character of string or false on failure. + */ + function mb_ord( $string, $encoding = null ) { + return _mb_ord( $string, $encoding ); + } +endif; + +/** + * Internal compat function to mimic mb_ord(). + * + * @ignore + * @since 7.1.0 + * + * @param string $string Return the code point at the start of this string. + * @param "UTF-8"|null $encoding Must be 'UTF-8' or null. 
+ * @return int|false The Unicode code point for the first character of string or false on failure. + */ +function _mb_ord( $string, $encoding = null ) { + if ( ! is_string( $string ) || '' === $string || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) { + return false; + } + + $byte_length = 0; + $invalid_length = 0; + $found_count = _wp_scan_utf8( $string, $byte_length, $invalid_length, null, 1 ); + + if ( 1 !== $found_count ) { + return false; + } + + // Exactly one code point was found, so no further validation is required. NOTE(review): assumes _wp_scan_utf8() left that code point's byte length in $byte_length - confirm against the helper. + $b0 = ord( $string[0] ); + + switch ( $byte_length ) { + case 1: + return $b0; + + case 2: + return ( + ( ( $b0 & 0x1F ) << 6 ) | + ( ( ord( $string[1] ) & 0x3F ) ) + ); + + case 3: + return ( + ( ( $b0 & 0x0F ) << 12 ) | + ( ( ord( $string[1] ) & 0x3F ) << 6 ) | + ( ( ord( $string[2] ) & 0x3F ) ) + ); + + case 4: + return ( + ( ( $b0 & 0x07 ) << 18 ) | + ( ( ord( $string[1] ) & 0x3F ) << 12 ) | + ( ( ord( $string[2] ) & 0x3F ) << 6 ) | + ( ( ord( $string[3] ) & 0x3F ) ) + ); + } +} + if ( ! function_exists( 'mb_substr' ) ) : /** * Compat function to mimic mb_substr(). diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index bd62f311aef13..d902f4b7cabc4 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -424,40 +424,8 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat * @return string Converted code point, or `�` if invalid. */ public static function code_point_to_utf8_bytes( $code_point ): string { - // Pre-check to ensure a valid code point. 
- if ( - $code_point <= 0 || - ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) || - $code_point > 0x10FFFF - ) { - return '�'; - } - - if ( $code_point <= 0x7F ) { - return chr( $code_point ); - } - - if ( $code_point <= 0x7FF ) { - $byte1 = chr( ( $code_point >> 6 ) | 0xC0 ); - $byte2 = chr( $code_point & 0x3F | 0x80 ); - - return "{$byte1}{$byte2}"; - } - - if ( $code_point <= 0xFFFF ) { - $byte1 = chr( ( $code_point >> 12 ) | 0xE0 ); - $byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 ); - $byte3 = chr( $code_point & 0x3F | 0x80 ); - - return "{$byte1}{$byte2}{$byte3}"; - } - - // Any values above U+10FFFF are eliminated above in the pre-check. - $byte1 = chr( ( $code_point >> 18 ) | 0xF0 ); - $byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 ); - $byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 ); - $byte4 = chr( $code_point & 0x3F | 0x80 ); + $string = $code_point > 0 ? mb_chr( $code_point ) : false; /* mb_chr( 0 ) yields a NUL byte; keep mapping U+0000 to U+FFFD as the removed `<= 0` pre-check did. */ - return "{$byte1}{$byte2}{$byte3}{$byte4}"; + return false !== $string ? $string : '�'; } } diff --git a/tests/phpunit/tests/compat/mbChr.php b/tests/phpunit/tests/compat/mbChr.php new file mode 100644 index 0000000000000..6862ed9170479 --- /dev/null +++ b/tests/phpunit/tests/compat/mbChr.php @@ -0,0 +1,28 @@ +assertSame( + mb_chr( $code_point ), + _mb_chr( $code_point ), + 'Failed to properly encode the code point as a string.' + ); + } + + $this->assertFalse( _mb_chr( ord( 'A' ), 'latin1' ), 'Should have rejected non-UTF-8 encoding.' ); + $this->assertFalse( _mb_chr( ord( 'A' ), 'utf8' ), 'Should have rejected non-UTF-8 encoding.' ); + $this->assertSame( 'A', _mb_chr( ord( 'A' ), 'UTF-8' ), 'Should have accepted UTF-8 encoding.' ); + } +} diff --git a/tests/phpunit/tests/compat/mbOrd.php b/tests/phpunit/tests/compat/mbOrd.php new file mode 100644 index 0000000000000..214547498d643 --- /dev/null +++ b/tests/phpunit/tests/compat/mbOrd.php @@ -0,0 +1,41 @@ +assertSame( + $code_point, + _mb_ord( mb_chr( $code_point ) ), + 'Failed to properly decode the code point from the string.' 
+ ); + } + } + + $this->assertFalse( _mb_ord( '' ), 'Should have failed on empty string.' ); + $this->assertFalse( _mb_ord( 'hi', 'latin1' ), 'Should have rejected non-UTF-8 encoding.' ); + $this->assertFalse( _mb_ord( 'hi', 'utf8' ), 'Should have rejected non-UTF-8 encoding.' ); + $this->assertSame( ord( 'A' ), _mb_ord( 'A', 'UTF-8' ), 'Should have accepted UTF-8 encoding.' ); + $this->assertFalse( _mb_ord( "\xC0" ), 'Should have rejected invalid UTF-8 code point.' ); /* 0xC0 is a byte that never occurs in valid UTF-8. */ + $this->assertFalse( _mb_ord( substr( "\xED\xA0\x80", 0, 2 ) ), 'Should have rejected unpaired surrogate half.' ); /* ED A0 80 is what U+D800 would encode to; only its first two bytes are passed here. */ + } +}