Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions src/wp-includes/class-wp-token-map.php
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,10 @@ public static function from_precomputed_table( $state ): ?WP_Token_Map {
* @return bool Whether there's an entry for the given word in the map.
*/
public function contains( string $word, string $case_sensitivity = 'case-sensitive' ): bool {
if ( str_contains( $word, "\x00" ) ) {
return false;
}

$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;

if ( $this->key_length >= strlen( $word ) ) {
Expand Down Expand Up @@ -533,9 +537,17 @@ public function read_token( string $text, int $offset = 0, &$matched_token_byte_

// Search for a long word first, if the text is long enough, and if that fails, a short one.
if ( $text_length > $this->key_length ) {
$group_key = substr( $text, $offset, $this->key_length );
/*
* Keys cannot contain null bytes, which is taken care of for the full words,
* but here it’s required to reject group keys with null bytes so that the
* lookup doesn’t get off track when scanning the group string.
*/
if ( strcspn( $text, "\x00", $offset, $this->key_length ) < $this->key_length ) {
return null;
}

$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
$group_key = substr( $text, $offset, $this->key_length );
$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
if ( false === $group_at ) {
// Perhaps a short word then.
return strlen( $this->small_words ) > 0
Expand Down
6 changes: 5 additions & 1 deletion src/wp-includes/html-api/class-wp-html-open-elements.php
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,11 @@ public function after_element_pop( WP_HTML_Token $item ): void {
* When adding support for new elements, expand this switch to trap
* cases where the precalculated value needs to change.
*/
switch ( $item->node_name ) {
$namespaced_name = 'html' === $item->namespace
? $item->node_name
: "{$item->namespace} {$item->node_name}";

switch ( $namespaced_name ) {
case 'APPLET':
case 'BUTTON':
case 'CAPTION':
Expand Down
61 changes: 54 additions & 7 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -813,8 +813,14 @@ private function next_visitable_token(): bool {
* until there are events or until there are no more
* tokens works in the meantime and isn't obviously wrong.
*/
if ( empty( $this->element_queue ) && $this->step() ) {
return $this->next_visitable_token();
if ( empty( $this->element_queue ) ) {
if ( $this->step() ) {
return $this->next_visitable_token();
}

if ( isset( $this->last_error ) ) {
return false;
}
}

// Process the next event on the queue.
Expand Down Expand Up @@ -1401,6 +1407,7 @@ public function serialize_token(): string {
$tag_name = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
$in_html = 'html' === $this->get_namespace();
$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();
$qualified_name = str_replace( "\x00", "\u{FFFD}", $qualified_name );

if ( $this->is_tag_closer() ) {
$html .= "</{$qualified_name}>";
Expand All @@ -1414,15 +1421,36 @@ public function serialize_token(): string {
}

$html .= "<{$qualified_name}";

$previous_attribute_was_true = false;
$seen_attribute_names = array();
foreach ( $attribute_names as $attribute_name ) {
$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
$qualified_attribute_name = $this->get_qualified_attribute_name( $attribute_name );
$qualified_attribute_name = str_replace( "\x00", "\u{FFFD}", $qualified_attribute_name );
$qualified_attribute_name = wp_scrub_utf8( $qualified_attribute_name );
if ( isset( $seen_attribute_names[ $qualified_attribute_name ] ) ) {
continue;
} else {
$seen_attribute_names[ $qualified_attribute_name ] = true;
}

if (
$previous_attribute_was_true &&
isset( $qualified_attribute_name[0] ) &&
'=' === $qualified_attribute_name[0]
) {
$html .= '=""';
}

$html .= " {$qualified_attribute_name}";
$value = $this->get_attribute( $attribute_name );

if ( is_string( $value ) ) {
$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"';
}

$html = str_replace( "\x00", "\u{FFFD}", $html );
$previous_attribute_was_true = true === $value;
$html = str_replace( "\x00", "\u{FFFD}", $html );
}

if ( ! $in_html && $this->has_self_closing_flag() ) {
Expand Down Expand Up @@ -2667,8 +2695,7 @@ private function step_in_body(): bool {
*/
case '-FORM':
if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) {
$node = $this->state->form_element;
$this->state->form_element = null;
$node = $this->state->form_element;

/*
* > If node is null or if the stack of open elements does not have node
Expand All @@ -2681,10 +2708,20 @@ private function step_in_body(): bool {
null === $node ||
! $this->state->stack_of_open_elements->has_element_in_scope( 'FORM' )
) {
// Parse error: ignore the token.
/*
* Parse error: ignore the token.
*
* Keep the form pointer intact when the end tag is ignored, such as
* when a FORM closing tag appears inside an SVG TITLE integration
* point. Otherwise the ignored token changes parser state in a way
* that serialization cannot represent, allowing a later FORM opener
* to appear in the first normalization pass and disappear on the second.
*/
return $this->step();
}

$this->state->form_element = null;

$this->generate_implied_end_tags();
if ( $node !== $this->state->stack_of_open_elements->current_node() ) {
// @todo Indicate a parse error once it's possible. This error does not impact the logic here.
Expand Down Expand Up @@ -3492,6 +3529,16 @@ private function step_in_table(): bool {
$this->state->form_element = $this->state->current_token;
$this->state->stack_of_open_elements->pop();
return true;

/*
* > Anything else
*
* A FORM end tag in table insertion mode is processed through the "in body"
* rules with foster parenting enabled. Because this token does not insert
* DOM content, the in-body handling is sufficient here.
*/
case '-FORM':
return $this->step_in_body();
}

/*
Expand Down
8 changes: 7 additions & 1 deletion src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1424,7 +1424,7 @@ private function skip_rcdata( string $tag_name ): bool {
$this->tag_name_starts_at = $at;

// Fail if there is no possible tag closer.
if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
if ( false === $at || ( $at + 2 + $tag_length ) > $doc_length ) {
return false;
}

Expand Down Expand Up @@ -1815,6 +1815,12 @@ private function parse_next_tag(): bool {

// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
$span_of_dashes = strspn( $html, '-', $closer_at );
if ( $doc_length <= $span_of_dashes + $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;

return false;
}

if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
/*
* @todo When implementing `set_modifiable_text()` ensure that updates to this token
Expand Down
25 changes: 25 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlDecoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,31 @@ public static function data_edge_cases() {
);
}

/**
 * Verifies that decoding a text node whose ampersand is immediately followed
 * by a NULL byte raises no native PHP errors and leaves the text untouched.
 *
 * A temporary error handler collects anything raised while decoding so the
 * assertions can report the exact messages.
 *
 * @ticket {TICKET_NUMBER}
 */
public function test_character_reference_with_null_byte_does_not_emit_native_errors() {
	$captured_errors = array();

	// Collect each error as an "errno: message" string; returning true marks it as handled.
	$record_error = static function ( int $errno, string $errstr ) use ( &$captured_errors ) {
		$captured_errors[] = "{$errno}: {$errstr}";
		return true;
	};

	set_error_handler( $record_error );

	try {
		$decoded_text = WP_HTML_Decoder::decode_text_node( "&\x00b" );
	} finally {
		// Put the previous handler back even if decoding throws.
		restore_error_handler();
	}

	// Comparing against an empty array (rather than assertEmpty()) makes PHPUnit print the captured messages.
	$this->assertSame( array(), $captured_errors );
	$this->assertSame( "&\x00b", $decoded_text, 'Should have decoded the text without changing it.' );
}

/**
* Ensures proper detection of attribute prefixes ignoring ASCII case.
*
Expand Down
174 changes: 174 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessor-serialize.php
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,180 @@ public function test_normalize_special_leading_newline_handling( string $input,
$this->assertEqualHTML( $expected, $normalized_twice );
}

/**
 * Verifies that normalizing fuzzer-discovered inputs raises no native PHP
 * errors and produces the expected result.
 *
 * @ticket {TICKET_NUMBER}
 *
 * @dataProvider data_provider_fuzzer_native_error_cases
 *
 * @param string      $input    HTML input.
 * @param string|null $expected Expected normalized output, or null when unsupported.
 */
public function test_normalize_fuzzer_cases_do_not_emit_native_errors( string $input, ?string $expected ) {
	$captured_errors = array();

	// Collect each error as an "errno: message" string; returning true marks it as handled.
	$record_error = static function ( int $errno, string $errstr ) use ( &$captured_errors ) {
		$captured_errors[] = "{$errno}: {$errstr}";
		return true;
	};

	/*
	 * Only native PHP warnings/notices are of interest here. Unsupported HTML
	 * may deliberately call wp_trigger_error() under WP_DEBUG; that is a
	 * separate concern, so it is filtered off for the duration of the call.
	 */
	add_filter( 'wp_trigger_error_trigger_error', '__return_false' );
	set_error_handler( $record_error );

	try {
		$normalized_html = WP_HTML_Processor::normalize( $input );
	} finally {
		// Undo both temporary hooks even if normalization throws.
		restore_error_handler();
		remove_filter( 'wp_trigger_error_trigger_error', '__return_false' );
	}

	// Comparing against an empty array (rather than assertEmpty()) makes PHPUnit print the captured messages.
	$this->assertSame( array(), $captured_errors );
	$this->assertSame( $expected, $normalized_html, 'Should have normalized the input.' );
}

/**
 * Data provider.
 *
 * Each dataset is a pair: the HTML input, then the expected normalized
 * output (null when the input is expected to be unsupported).
 *
 * @return array[]
 */
public static function data_provider_fuzzer_native_error_cases() {
	$datasets = array();

	// Expected to be unsupported, hence the null expectation.
	$datasets['Unsupported active formatting'] = array( '<A><I><A>', null );

	return $datasets;
}

/**
 * Verifies that fuzzer-discovered inputs normalize to HTML which the HTML
 * Processor can itself normalize again, without any native PHP errors.
 *
 * @ticket {TICKET_NUMBER}
 *
 * @dataProvider data_provider_normalized_fuzzer_cases_that_should_remain_supported
 *
 * @param string $input HTML input.
 */
public function test_normalized_fuzzer_cases_should_remain_supported( string $input ) {
	$captured_errors = array();

	// Collect each error as an "errno: message" string; returning true marks it as handled.
	$record_error = static function ( int $errno, string $errstr ) use ( &$captured_errors ) {
		$captured_errors[] = "{$errno}: {$errstr}";
		return true;
	};

	$first_pass  = null;
	$second_pass = null;

	set_error_handler( $record_error );

	try {
		$first_pass = WP_HTML_Processor::normalize( $input );

		// Only a string result can be fed back in for the second pass.
		if ( is_string( $first_pass ) ) {
			$second_pass = WP_HTML_Processor::normalize( $first_pass );
		}
	} finally {
		// Put the previous handler back even if normalization throws.
		restore_error_handler();
	}

	// Comparing against an empty array (rather than assertEmpty()) makes PHPUnit print the captured messages.
	$this->assertSame( array(), $captured_errors );
	$this->assertIsString( $first_pass, 'Input HTML should normalize successfully.' );
	$this->assertIsString(
		$second_pass,
		'Normalized HTML should remain supported by the HTML Processor.'
	);
}

/**
 * Data provider.
 *
 * Each dataset holds a single argument: the HTML input to normalize.
 *
 * @return array[]
 */
public static function data_provider_normalized_fuzzer_cases_that_should_remain_supported() {
	$inputs_by_name = array(
		'FORM in TABLE'                        => '<table><form>',
		'Mixed-case FORM in TABLE'             => '<TABLE><Form>',
		'FORM in TABLE after SCRIPT'           => '<table><script></script><form>',
		'Unclosed SVG TITLE after P in EM'     => '<em><p><svg><title>',
		'Unclosed SVG TITLE after P in STRONG' => '<strong><p><svg ><title>',
	);

	// Wrap each input in its own argument list; array_map() keeps the dataset names as keys.
	return array_map(
		static function ( string $html ): array {
			return array( $html );
		},
		$inputs_by_name
	);
}

/**
 * Verifies that normalizing fuzzer-discovered inputs is idempotent: a second
 * normalization pass over the first pass's output must return it unchanged,
 * and no native PHP errors may be raised along the way.
 *
 * @ticket {TICKET_NUMBER}
 *
 * @dataProvider data_provider_normalized_fuzzer_cases_that_should_be_idempotent
 *
 * @param string $input HTML input.
 */
public function test_normalized_fuzzer_cases_should_be_idempotent( string $input ) {
	$captured_errors = array();

	// Collect each error as an "errno: message" string; returning true marks it as handled.
	$record_error = static function ( int $errno, string $errstr ) use ( &$captured_errors ) {
		$captured_errors[] = "{$errno}: {$errstr}";
		return true;
	};

	$first_pass  = null;
	$second_pass = null;

	set_error_handler( $record_error );

	try {
		$first_pass = WP_HTML_Processor::normalize( $input );

		// Only a string result can be fed back in for the second pass.
		if ( is_string( $first_pass ) ) {
			$second_pass = WP_HTML_Processor::normalize( $first_pass );
		}
	} finally {
		// Put the previous handler back even if normalization throws.
		restore_error_handler();
	}

	// Comparing against an empty array (rather than assertEmpty()) makes PHPUnit print the captured messages.
	$this->assertSame( array(), $captured_errors );
	$this->assertIsString( $first_pass, 'Input HTML should normalize successfully.' );
	$this->assertSame(
		$first_pass,
		$second_pass,
		'Normalizing already-normalized HTML should not change it.'
	);
}

/**
 * Data provider.
 *
 * Each dataset holds a single argument: the HTML input to normalize.
 *
 * @return array[]
 */
public static function data_provider_normalized_fuzzer_cases_that_should_be_idempotent() {
	$inputs_by_name = array(
		'Malformed quoted attribute boundary'       => '<A "/=>',
		'Duplicate attribute after bare attribute'  => '<A V=5 R V=""=>',
		'Duplicate DATA-ID after numeric attribute' => '<E DATA-ID=1 1 DATA-ID=""=>',
		'Duplicate attribute before tag end'        => '<R V=5 R V=5 =>',
		'NULL byte in foreign tag name'             => "<SVG><L\x00 D>",
		'Malformed closing-looking attribute'       => '<a </=>',
		'Malformed self-closing attribute'          => '<a h/=>',
		'Duplicate ID with quote boundary'          => '<d ID=""" ID=""=>',
		'Mixed-case duplicate TITLE'                => "<d TITLE=\"\"' title=\"\"=>",
		'Colon before self-closing slash'           => '<e :/=>',
		'Duplicate class after bare attribute'      => "<e class=y d class=''=>",
		'Duplicate DATA-ID after hyphen'            => '<e data-id=1 - data-id="">',
		'Duplicate title after quotes'              => "<e title=''' title=\"\"=>",
		'FORM with SVG TITLE text edge'             => "<form ><svg ><title \"'></form><form>",
		'FORM with TABLE and SCRIPT'                => '<form id><table te"><script></script><td srce" ID/></form><form claslicate">',
		'FORM with TABLE CAPTION'                   => '<form><table><caption></form><form >',
		'Short malformed G attribute C'             => '<g c/=>',
		'Short malformed G attribute S'             => '<g s/=>',
		'Duplicate SRC boundary'                    => '<g src=""g src="">',
		'Short malformed H attribute'               => '<h f/=>',
		'Malformed SRC equals boundary'             => '<i src=""= src=""=">',
		'Malformed slash in tag opener'             => '<i/t/=>',
		'Malformed L colon attribute'               => '<l :/=>',
		'Malformed L less-than attribute'           => '<l/</=>',
		'Malformed N less-than attribute'           => '<n </=>',
		'Unclosed SVG TITLE after P'                => '<p><svg><title>',
		'Duplicate ALT boundary'                    => '<r alt=\'\'d alt=""=>',
		'NULL byte in SVG child tag'                => "<svg><l\x00 '>",
		'NULL byte before slash in SVG child tag'   => "<svg><l\x00/r>",
	);

	// Wrap each input in its own argument list; array_map() keeps the dataset names as keys.
	return array_map(
		static function ( string $html ): array {
			return array( $html );
		},
		$inputs_by_name
	);
}

/**
* Data provider.
*
Expand Down
Loading
Loading