Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions src/wp-includes/compat.php
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,147 @@
);
}

if ( ! function_exists( 'mb_chr' ) ) :
/**
* Compat function to mimic mb_chr().
*
* Declared only when no mb_chr() function already exists. All of the work
* is delegated to the internal _mb_chr() implementation.
*
* @ignore
* @since 7.1.0
*
* @see _mb_chr()
*
* @param int $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT.
* @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
* @return string|false A string containing the requested character, if it can be represented in the specified encoding, or false on failure.
*/
function mb_chr( $codepoint, $encoding = null ) {
return _mb_chr( $codepoint, $encoding );
}
endif;

/**
 * Internal compat function to mimic mb_chr().
 *
 * Encodes a single Unicode code point as a UTF-8 byte string. Only UTF-8
 * output is supported: any encoding other than null or the exact string
 * 'UTF-8' is refused.
 *
 * @ignore
 * @since 7.1.0
 *
 * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT.
 * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
 * @return string|false The UTF-8 bytes for the code point, or false when the
 *                      code point is not an integer, is negative, is a
 *                      surrogate half, lies above U+10FFFF, or when the
 *                      encoding is unsupported.
 */
function _mb_chr( $codepoint, $encoding = null ) {
	$wants_utf8 = null === $encoding || 'UTF-8' === $encoding;

	if ( ! $wants_utf8 || ! is_int( $codepoint ) ) {
		return false;
	}

	// Surrogate halves and anything outside of U+0000..U+10FFFF cannot be encoded.
	$is_surrogate = 0xD800 <= $codepoint && 0xDFFF >= $codepoint;

	if ( $is_surrogate || 0 > $codepoint || 0x10FFFF < $codepoint ) {
		return false;
	}

	// ASCII is stored as a single byte.
	if ( 0x80 > $codepoint ) {
		return chr( $codepoint );
	}

	// Pick the number of continuation bytes and the marker bits of the lead byte.
	if ( 0x800 > $codepoint ) {
		$tail_count  = 1;
		$lead_marker = 0xC0;
	} elseif ( 0x10000 > $codepoint ) {
		$tail_count  = 2;
		$lead_marker = 0xE0;
	} else {
		// Any values above U+10FFFF were eliminated by the range check.
		$tail_count  = 3;
		$lead_marker = 0xF0;
	}

	// The lead byte carries the highest bits left over after the six-bit groups.
	$encoded = chr( $lead_marker | ( $codepoint >> ( 6 * $tail_count ) ) );

	// Each continuation byte is `10xxxxxx`, emitted from the highest group down.
	for ( $shift = 6 * ( $tail_count - 1 ); $shift >= 0; $shift -= 6 ) {
		$encoded .= chr( 0x80 | ( ( $codepoint >> $shift ) & 0x3F ) );
	}

	return $encoded;
}

if ( ! function_exists( 'mb_ord' ) ) :
/**
* Compat function to mimic mb_ord().
*
* Declared only when no mb_ord() function already exists. All of the work
* is delegated to the internal _mb_ord() implementation.
*
* @ignore
* @since 7.1.0
*
* @see _mb_ord()
*
* @param string $string String whose first character's code point is returned.
* @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
* @return int|false The Unicode code point for the first character of the string, or false on failure.
*/
function mb_ord( $string, $encoding = null ) {
return _mb_ord( $string, $encoding );
}
endif;

/**
 * Internal compat function to mimic mb_ord().
 *
 * Decodes the UTF-8 byte sequence at the start of the given string into its
 * Unicode code point. Only UTF-8 input is supported: any encoding other than
 * null or the exact string 'UTF-8' is refused.
 *
 * @ignore
 * @since 7.1.0
 *
 * @param string       $string   String whose first character's code point is returned.
 * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
 * @return int|false The Unicode code point for the first character of the
 *                   string, or false when the input is not a string, is
 *                   empty, does not start with exactly one scannable code
 *                   point, or when the encoding is unsupported.
 */
function _mb_ord( $string, $encoding = null ) {
	if ( ! is_string( $string ) || '' === $string || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) {
		return false;
	}

	/*
	 * Scan at most one code point from the start of the string.
	 * NOTE(review): relies on _wp_scan_utf8() leaving the number of bytes
	 * it consumed in $byte_length; confirm against that function's contract.
	 */
	$byte_length    = 0;
	$invalid_length = 0;
	$found_count    = _wp_scan_utf8( $string, $byte_length, $invalid_length, null, 1 );

	if ( 1 !== $found_count ) {
		return false;
	}

	// The scanner accepted the sequence, so the bytes are combined without re-validating them.
	$b0 = ord( $string[0] );

	switch ( $byte_length ) {
		case 1:
			return $b0;

		case 2:
			return (
				( ( $b0 & 0x1F ) << 6 ) |
				( ( ord( $string[1] ) & 0x3F ) )
			);

		case 3:
			return (
				( ( $b0 & 0x0F ) << 12 ) |
				( ( ord( $string[1] ) & 0x3F ) << 6 ) |
				( ( ord( $string[2] ) & 0x3F ) )
			);

		case 4:
			return (
				( ( $b0 & 0x07 ) << 18 ) |
				( ( ord( $string[1] ) & 0x3F ) << 12 ) |
				( ( ord( $string[2] ) & 0x3F ) << 6 ) |
				( ( ord( $string[3] ) & 0x3F ) )
			);
	}

	/*
	 * A single UTF-8 code point is one to four bytes long, so this is not
	 * expected to be reached. Failing explicitly keeps the `int|false`
	 * contract instead of implicitly returning null.
	 */
	return false;
}

if ( ! function_exists( 'mb_substr' ) ) :
/**
* Compat function to mimic mb_substr().
Expand Down
36 changes: 2 additions & 34 deletions src/wp-includes/html-api/class-wp-html-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -424,40 +424,8 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
* @return string Converted code point, or `�` if invalid.
*/
public static function code_point_to_utf8_bytes( $code_point ): string {
	/*
	 * Zero and negative values are answered with the replacement character
	 * before delegating, because mb_chr() would encode U+0000 as a NUL byte.
	 */
	if ( $code_point <= 0 ) {
		return '�';
	}

	/*
	 * mb_chr() returns false for surrogate halves and for values above
	 * U+10FFFF. The encoding is named explicitly so the result is UTF-8
	 * regardless of the configured mbstring internal encoding.
	 */
	$string = mb_chr( $code_point, 'UTF-8' );

	return false !== $string ? $string : '�';
}
}
28 changes: 28 additions & 0 deletions tests/phpunit/tests/compat/mbChr.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?php

/**
 * Tests for the mb_chr() polyfill.
 *
 * @group compat
 *
 * @covers ::mb_chr
 */
class Tests_Compat_mbChr extends WP_UnitTestCase {
	/**
	 * Ensures that the mb_chr() polyfill matches the behavior of mb_chr()
	 * for the supported UTF-8 encoding.
	 *
	 * Every code point from U+0000 through U+10FFFF is compared, including
	 * the surrogate range. Values outside of that range and the handling
	 * of the `$encoding` argument are spot-checked afterwards.
	 *
	 * @ticket 65342
	 */
	public function test_mb_chr_polyfill_matches_spec() {
		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
			$this->assertSame(
				mb_chr( $code_point ),
				_mb_chr( $code_point ),
				'Failed to properly encode the code point into a string.'
			);
		}

		// Out-of-range values are never visited by the loop above.
		$this->assertFalse( _mb_chr( -1 ), 'Should have rejected negative code point.' );
		$this->assertFalse( _mb_chr( 0x110000 ), 'Should have rejected code point above U+10FFFF.' );

		$this->assertFalse( _mb_chr( ord( 'A' ), 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertFalse( _mb_chr( ord( 'A' ), 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertSame( 'A', _mb_chr( ord( 'A' ), 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
	}
}
41 changes: 41 additions & 0 deletions tests/phpunit/tests/compat/mbOrd.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?php

/**
 * Tests for the mb_ord() polyfill.
 *
 * @group compat
 *
 * @covers ::mb_ord
 */
class Tests_Compat_mbOrd extends WP_UnitTestCase {
	/**
	 * Ensures that the mb_ord() polyfill matches the behavior of mb_ord()
	 * for the supported UTF-8 encoding.
	 *
	 * Every code point that mb_chr() can encode is round-tripped through
	 * the polyfill. Invalid inputs and the handling of the `$encoding`
	 * argument are spot-checked afterwards.
	 *
	 * @ticket 65342
	 */
	public function test_mb_ord_polyfill_matches_spec() {
		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
			/*
			 * Some code points cannot be constructed in UTF-8 because they
			 * are invalid; notably the surrogate halves. While they could be
			 * manually constructed here using the direct UTF-8 encoder without
			 * its constraints, it’s sufficient to test the positive cases here
			 * and spot-check an unpaired and incorrectly-converted surrogate
			 * half below.
			 */
			$character = mb_chr( $code_point );

			if ( false !== $character ) {
				$this->assertSame(
					$code_point,
					_mb_ord( $character ),
					'Failed to properly decode the code point from the string.'
				);
			}
		}

		$this->assertFalse( _mb_ord( '' ), 'Should have failed on empty string.' );
		$this->assertFalse( _mb_ord( 'hi', 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertFalse( _mb_ord( 'hi', 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
		$this->assertSame( ord( 'A' ), _mb_ord( 'A', 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
		$this->assertFalse( _mb_ord( "\xC0" ), 'Should have rejected invalid UTF-8 code point.' );

		// U+D800 written out as three bytes: a complete but forbidden surrogate half.
		$this->assertFalse( _mb_ord( "\xED\xA0\x80" ), 'Should have rejected unpaired surrogate half.' );

		// The same sequence cut short after two bytes.
		$this->assertFalse( _mb_ord( substr( "\xED\xA0\x80", 0, 2 ) ), 'Should have rejected truncated byte sequence.' );
	}
}
Loading