From 500e1f0cc1afe4b171a9dfda9fc8bddfe8bb73a2 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Mon, 25 May 2026 15:53:14 -0700
Subject: [PATCH] Charset: Polyfill mb_ord() and mb_chr().
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These functions are useful primitives but missing when the mbstring
extension isn’t available. This patch adds polyfills for those few
environments where this is the case so that WordPress code can
unconditionally call them.
---
 src/wp-includes/compat.php                    | 141 ++++++++++++++++++
 .../html-api/class-wp-html-decoder.php        |  36 +----
 tests/phpunit/tests/compat/mbChr.php          |  28 ++++
 tests/phpunit/tests/compat/mbOrd.php          |  41 +++++
 4 files changed, 212 insertions(+), 34 deletions(-)
 create mode 100644 tests/phpunit/tests/compat/mbChr.php
 create mode 100644 tests/phpunit/tests/compat/mbOrd.php

diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php
index 3ac1372fdca1e..5eb467280a5a5 100644
--- a/src/wp-includes/compat.php
+++ b/src/wp-includes/compat.php
@@ -110,6 +110,147 @@ function _is_utf8_charset( $charset_slug ) {
 	);
 }
 
+if ( ! function_exists( 'mb_chr' ) ) :
+	/**
+	 * Compat function to mimic mb_chr(); only defined when mb_chr() does not already exist.
+	 *
+	 * @ignore
+	 * @since 7.1.0
+	 *
+	 * @see _mb_chr()
+	 *
+	 * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT.
+	 * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
+	 * @return string|false A string containing the requested character, or false on failure; see _mb_chr() for the failure cases.
+	 */
+	function mb_chr( $codepoint, $encoding = null ) {
+		return _mb_chr( $codepoint, $encoding );
+	}
+endif;
+
+/**
+ * Encodes a Unicode code point as UTF-8 bytes, mimicking mb_chr() for UTF-8 only.
+ *
+ * @ignore
+ * @since 7.1.0
+ *
+ * @param int          $codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT.
+ * @param "UTF-8"|null $encoding  Must be 'UTF-8' or null.
+ * @return string|false The UTF-8 bytes for the code point, or false for a non-integer, an unsupported encoding, or an invalid code point.
+ */
+function _mb_chr( $codepoint, $encoding = null ) {
+	if ( ! is_int( $codepoint ) || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) {
+		return false;
+	}
+
+	// Reject negative values, surrogate halves (U+D800–U+DFFF), and values above U+10FFFF.
+	if (
+		$codepoint < 0 ||
+		( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) ||
+		$codepoint > 0x10FFFF
+	) {
+		return false;
+	}
+
+	if ( $codepoint <= 0x7F ) {
+		return chr( $codepoint );
+	}
+
+	if ( $codepoint <= 0x7FF ) {
+		$byte1 = chr( ( $codepoint >> 6 ) | 0xC0 );
+		$byte2 = chr( $codepoint & 0x3F | 0x80 );
+
+		return "{$byte1}{$byte2}";
+	}
+
+	if ( $codepoint <= 0xFFFF ) {
+		$byte1 = chr( ( $codepoint >> 12 ) | 0xE0 );
+		$byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+		$byte3 = chr( $codepoint & 0x3F | 0x80 );
+
+		return "{$byte1}{$byte2}{$byte3}";
+	}
+
+	// Four-byte sequence; values above U+10FFFF were already rejected by the pre-check.
+	$byte1 = chr( ( $codepoint >> 18 ) | 0xF0 );
+	$byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 );
+	$byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
+	$byte4 = chr( $codepoint & 0x3F | 0x80 );
+
+	return "{$byte1}{$byte2}{$byte3}{$byte4}";
+}
+
+if ( ! function_exists( 'mb_ord' ) ) :
+	/**
+	 * Compat function to mimic mb_ord(); only defined when mb_ord() does not already exist.
+	 *
+	 * @ignore
+	 * @since 7.1.0
+	 *
+	 * @see _mb_ord()
+	 *
+	 * @param string       $string   Return the code point at the start of this string.
+	 * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
+	 * @return int|false The Unicode code point for the first character of string, or false on failure; see _mb_ord() for the failure cases.
+	 */
+	function mb_ord( $string, $encoding = null ) {
+		return _mb_ord( $string, $encoding );
+	}
+endif;
+
+/**
+ * Decodes the code point at the start of a UTF-8 string, mimicking mb_ord() for UTF-8 only.
+ *
+ * @ignore
+ * @since 7.1.0
+ *
+ * @param string       $string   Return the code point at the start of this string.
+ * @param "UTF-8"|null $encoding Must be 'UTF-8' or null.
+ * @return int|false The code point, or false for a non-string, an empty string, an unsupported encoding, or when the scan does not find exactly one code point.
+ */
+function _mb_ord( $string, $encoding = null ) {
+	if ( ! is_string( $string ) || '' === $string || ( isset( $encoding ) && 'UTF-8' !== $encoding ) ) {
+		return false;
+	}
+
+	$byte_length    = 0;
+	$invalid_length = 0;
+	$found_count    = _wp_scan_utf8( $string, $byte_length, $invalid_length, null, 1 );
+
+	if ( 1 !== $found_count ) {
+		return false;
+	}
+
+	// One code point was found, so no further validation is done. NOTE(review): presumably $byte_length is 1–4 here (the switch has no default) — confirm against _wp_scan_utf8().
+	$b0 = ord( $string[0] );
+
+	switch ( $byte_length ) {
+		case 1:
+			return $b0;
+
+		case 2:
+			return (
+				( ( $b0 & 0x1F ) << 6 ) |
+				( ( ord( $string[1] ) & 0x3F ) )
+			);
+
+		case 3:
+			return (
+				( ( $b0 & 0x0F ) << 12 ) |
+				( ( ord( $string[1] ) & 0x3F ) << 6 ) |
+				( ( ord( $string[2] ) & 0x3F ) )
+			);
+
+		case 4:
+			return (
+				( ( $b0 & 0x07 ) << 18 ) |
+				( ( ord( $string[1] ) & 0x3F ) << 12 ) |
+				( ( ord( $string[2] ) & 0x3F ) << 6 ) |
+				( ( ord( $string[3] ) & 0x3F ) )
+			);
+	}
+}
+
 if ( ! function_exists( 'mb_substr' ) ) :
 	/**
 	 * Compat function to mimic mb_substr().
diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php
index bd62f311aef13..d902f4b7cabc4 100644
--- a/src/wp-includes/html-api/class-wp-html-decoder.php
+++ b/src/wp-includes/html-api/class-wp-html-decoder.php
@@ -424,40 +424,8 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
 	 * @return string Converted code point, or `�` if invalid.
 	 */
 	public static function code_point_to_utf8_bytes( $code_point ): string {
-		// Pre-check to ensure a valid code point.
-		if (
-			$code_point <= 0 ||
-			( $code_point >= 0xD800 && $code_point <= 0xDFFF ) ||
-			$code_point > 0x10FFFF
-		) {
-			return '�';
-		}
-
-		if ( $code_point <= 0x7F ) {
-			return chr( $code_point );
-		}
-
-		if ( $code_point <= 0x7FF ) {
-			$byte1 = chr( ( $code_point >> 6 ) | 0xC0 );
-			$byte2 = chr( $code_point & 0x3F | 0x80 );
-
-			return "{$byte1}{$byte2}";
-		}
-
-		if ( $code_point <= 0xFFFF ) {
-			$byte1 = chr( ( $code_point >> 12 ) | 0xE0 );
-			$byte2 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-			$byte3 = chr( $code_point & 0x3F | 0x80 );
-
-			return "{$byte1}{$byte2}{$byte3}";
-		}
-
-		// Any values above U+10FFFF are eliminated above in the pre-check.
-		$byte1 = chr( ( $code_point >> 18 ) | 0xF0 );
-		$byte2 = chr( ( $code_point >> 12 ) & 0x3F | 0x80 );
-		$byte3 = chr( ( $code_point >> 6 ) & 0x3F | 0x80 );
-		$byte4 = chr( $code_point & 0x3F | 0x80 );
+		$string = $code_point > 0 ? mb_chr( $code_point, 'UTF-8' ) : false; // Force UTF-8 (native mb_chr() defaults to mb_internal_encoding()); U+0000 stays invalid as before.
 
-		return "{$byte1}{$byte2}{$byte3}{$byte4}";
+		return false !== $string ? $string : '�';
 	}
 }
diff --git a/tests/phpunit/tests/compat/mbChr.php b/tests/phpunit/tests/compat/mbChr.php
new file mode 100644
index 0000000000000..6862ed9170479
--- /dev/null
+++ b/tests/phpunit/tests/compat/mbChr.php
@@ -0,0 +1,28 @@
+<?php
+
+/**
+ * @group compat
+ *
+ * @covers ::mb_chr
+ */
+class Tests_Compat_mbChr extends WP_UnitTestCase {
+	/**
+	 * Ensures that the _mb_chr() polyfill matches mb_chr( …, 'UTF-8' ) for every
+	 * code point, and that it only accepts the exact 'UTF-8' encoding name.
+	 *
+	 * @ticket 65342
+	 */
+	public function test_mb_chr_polyfill_matches_spec() {
+		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
+			$this->assertSame(
+				mb_chr( $code_point, 'UTF-8' ),
+				_mb_chr( $code_point ),
+				'Failed to properly encode the code point into a string.'
+			);
+		}
+
+		$this->assertFalse( _mb_chr( ord( 'A' ), 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertFalse( _mb_chr( ord( 'A' ), 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertSame( 'A', _mb_chr( ord( 'A' ), 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
+	}
+}
diff --git a/tests/phpunit/tests/compat/mbOrd.php b/tests/phpunit/tests/compat/mbOrd.php
new file mode 100644
index 0000000000000..214547498d643
--- /dev/null
+++ b/tests/phpunit/tests/compat/mbOrd.php
@@ -0,0 +1,41 @@
+<?php
+
+/**
+ * @group compat
+ *
+ * @covers ::mb_ord
+ */
+class Tests_Compat_mbOrd extends WP_UnitTestCase {
+	/**
+	 * Ensures that the _mb_ord() polyfill round-trips every code point that
+	 * mb_chr( …, 'UTF-8' ) can encode, and rejects unsupported or invalid input.
+	 *
+	 * @ticket 65342
+	 */
+	public function test_mb_ord_polyfill_matches_spec() {
+		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
+			/*
+			 * Some code points cannot be constructed in UTF-8 because they
+			 * are invalid; notably the surrogate halves. While they could be
+			 * manually constructed here using the direct UTF-8 encoder without
+			 * its constraints, it’s sufficient to test the positive cases here
+			 * and spot-check an unpaired and incorrectly-converted surrogate
+			 * half below.
+			 */
+			if ( false !== mb_chr( $code_point, 'UTF-8' ) ) {
+				$this->assertSame(
+					$code_point,
+					_mb_ord( mb_chr( $code_point, 'UTF-8' ) ),
+					'Failed to properly decode the code point from the string.'
+				);
+			}
+		}
+
+		$this->assertFalse( _mb_ord( '' ), 'Should have failed on empty string.' );
+		$this->assertFalse( _mb_ord( 'hi', 'latin1' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertFalse( _mb_ord( 'hi', 'utf8' ), 'Should have rejected non-UTF-8 encoding.' );
+		$this->assertSame( ord( 'A' ), _mb_ord( 'A', 'UTF-8' ), 'Should have accepted UTF-8 encoding.' );
+		$this->assertFalse( _mb_ord( "\xC0" ), 'Should have rejected invalid UTF-8 code point.' );
+		$this->assertFalse( _mb_ord( substr( "\xED\xA0\x80", 0, 2 ) ), 'Should have rejected unpaired surrogate half.' );
+	}
+}