wip

2026-01-04 17:50:08 -06:00
parent 7e45ce0756
commit acc8ac87a0
4131 changed files with 232562 additions and 250244 deletions
@@ -36,7 +36,7 @@
 * @see https://html.spec.whatwg.org/#the-doctype
 *
 * DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
- * and an indication of which document compatability mode they would imply if an HTML parser
+ * and an indication of which document compatibility mode they would imply if an HTML parser
 * hadn't already determined it from other information.
 *
 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
@@ -50,6 +50,8 @@
 *
 * @since 6.7.0
 *
+ * @access private
+ *
 * @see WP_HTML_Processor
 */
 class WP_HTML_Doctype_Info {
@@ -126,14 +128,14 @@ class WP_HTML_Doctype_Info {
 	public $system_identifier = null;

 	/**
-	 * Which document compatability mode this DOCTYPE declaration indicates.
+	 * Which document compatibility mode this DOCTYPE declaration indicates.
 	 *
 	 * This value should be considered "read only" and not modified.
 	 *
-	 * When an HTML parser has not already set the document compatability mode,
-	 * (e.g. "quirks" or "no-quirks" mode), it will infer if from the properties
+	 * When an HTML parser has not already set the document compatibility mode,
+	 * (e.g. "quirks" or "no-quirks" mode), it will be inferred from the properties
 	 * of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
-	 * indicate one of three possible document compatability modes:
+	 * indicate one of three possible document compatibility modes:
 	 *
 	 *  - "no-quirks" and "limited-quirks" modes (also called "standards" mode).
 	 *  - "quirks" mode (also called `CSS1Compat` mode).
@@ -148,7 +150,7 @@ class WP_HTML_Doctype_Info {
 	 *
 	 * @var string One of "no-quirks", "limited-quirks", or "quirks".
 	 */
-	public $indicated_compatability_mode;
+	public $indicated_compatibility_mode;

 	/**
 	 * Constructor.
@@ -192,7 +194,7 @@ class WP_HTML_Doctype_Info {
 		 * > The force-quirks flag is set to on.
 		 */
 		if ( $force_quirks_flag ) {
-			$this->indicated_compatability_mode = 'quirks';
+			$this->indicated_compatibility_mode = 'quirks';
 			return;
 		}

@@ -201,7 +203,7 @@ class WP_HTML_Doctype_Info {
 		 * public or system identifiers; short-circuit to avoid extra parsing.
 		 */
 		if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) {
-			$this->indicated_compatability_mode = 'no-quirks';
+			$this->indicated_compatibility_mode = 'no-quirks';
 			return;
 		}

@@ -212,7 +214,7 @@ class WP_HTML_Doctype_Info {
 		 * the document in upper case; thus no conversion is required here.
 		 */
 		if ( 'html' !== $name ) {
-			$this->indicated_compatability_mode = 'quirks';
+			$this->indicated_compatibility_mode = 'quirks';
 			return;
 		}

@@ -240,7 +242,7 @@ class WP_HTML_Doctype_Info {
 			'-/w3c/dtd html 4.0 transitional/en' === $public_identifier ||
 			'html' === $public_identifier
 		) {
-			$this->indicated_compatability_mode = 'quirks';
+			$this->indicated_compatibility_mode = 'quirks';
 			return;
 		}

@@ -248,7 +250,7 @@ class WP_HTML_Doctype_Info {
 		 * > The system identifier is set to…
 		 */
 		if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) {
-			$this->indicated_compatability_mode = 'quirks';
+			$this->indicated_compatibility_mode = 'quirks';
 			return;
 		}

@@ -257,7 +259,7 @@ class WP_HTML_Doctype_Info {
 		 * If the public identifier is empty, none of the following conditions will match.
 		 */
 		if ( '' === $public_identifier ) {
-			$this->indicated_compatability_mode = 'no-quirks';
+			$this->indicated_compatibility_mode = 'no-quirks';
 			return;
 		}

@@ -325,7 +327,7 @@ class WP_HTML_Doctype_Info {
 			str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html 2.0//' ) ||
 			str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html//' )
 		) {
-			$this->indicated_compatability_mode = 'quirks';
+			$this->indicated_compatibility_mode = 'quirks';
 			return;
 		}

@@ -338,7 +340,7 @@ class WP_HTML_Doctype_Info {
 				str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 			)
 		) {
-			$this->indicated_compatability_mode = 'quirks';
+			$this->indicated_compatibility_mode = 'quirks';
 			return;
 		}

@@ -354,7 +356,7 @@ class WP_HTML_Doctype_Info {
 			str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
 			str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' )
 		) {
-			$this->indicated_compatability_mode = 'limited-quirks';
+			$this->indicated_compatibility_mode = 'limited-quirks';
 			return;
 		}

@@ -367,11 +369,11 @@ class WP_HTML_Doctype_Info {
 				str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
 			)
 		) {
-			$this->indicated_compatability_mode = 'limited-quirks';
+			$this->indicated_compatibility_mode = 'limited-quirks';
 			return;
 		}

-		$this->indicated_compatability_mode = 'no-quirks';
+		$this->indicated_compatibility_mode = 'no-quirks';
 	}

 	/**
@@ -385,15 +387,15 @@ class WP_HTML_Doctype_Info {
 	 *
 	 *     // Normative HTML DOCTYPE declaration.
 	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
-	 *     'no-quirks' === $doctype->indicated_compatability_mode;
+	 *     'no-quirks' === $doctype->indicated_compatibility_mode;
 	 *
 	 *     // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
 	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
-	 *     'quirks' === $doctype->indicated_compatability_mode;
+	 *     'quirks' === $doctype->indicated_compatibility_mode;
 	 *
 	 *     // Textual quirks present in raw HTML are handled appropriately.
 	 *     $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
-	 *     'no-quirks' === $doctype->indicated_compatability_mode;
+	 *     'no-quirks' === $doctype->indicated_compatibility_mode;
 	 *
 	 *     // Anything other than a proper DOCTYPE declaration token fails to parse.
 	 *     null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
@@ -297,6 +297,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			return null;
 		}

+		if ( ! is_string( $html ) ) {
+			_doing_it_wrong(
+				__METHOD__,
+				__( 'The HTML parameter must be a string.' ),
+				'6.9.0'
+			);
+			return null;
+		}
+
 		$context_processor = static::create_full_parser( "<!DOCTYPE html>{$context}", $encoding );
 		if ( null === $context_processor ) {
 			return null;
@@ -339,6 +348,14 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 		if ( 'UTF-8' !== $known_definite_encoding ) {
 			return null;
 		}
+		if ( ! is_string( $html ) ) {
+			_doing_it_wrong(
+				__METHOD__,
+				__( 'The HTML parameter must be a string.' ),
+				'6.9.0'
+			);
+			return null;
+		}

 		$processor                             = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
 		$processor->state->encoding            = $known_definite_encoding;
@@ -1304,10 +1321,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	 * @see static::serialize()
 	 *
 	 * @since 6.7.0
+	 * @since 6.9.0 Converted from protected to public method.
 	 *
 	 * @return string Serialization of token, or empty string if no serialization exists.
 	 */
-	protected function serialize_token(): string {
+	public function serialize_token(): string {
 		$html       = '';
 		$token_type = $this->get_token_type();

@@ -1469,7 +1487,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			 */
 			case 'html':
 				$doctype = $this->get_doctype_info();
-				if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
+				if ( null !== $doctype && 'quirks' === $doctype->indicated_compatibility_mode ) {
 					$this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
 				}

@@ -1760,6 +1778,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 			case '+META':
 				$this->insert_html_element( $this->state->current_token );

+				// All following conditions depend on "tentative" encoding confidence.
+				if ( 'tentative' !== $this->state->encoding_confidence ) {
+					return true;
+				}
+
 				/*
 				 * > If the active speculative HTML parser is null, then:
 				 * >   - If the element has a charset attribute, and getting an encoding from
@@ -1767,7 +1790,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				 * >     tentative, then change the encoding to the resulting encoding.
 				 */
 				$charset = $this->get_attribute( 'charset' );
-				if ( is_string( $charset ) && 'tentative' === $this->state->encoding_confidence ) {
+				if ( is_string( $charset ) ) {
 					$this->bail( 'Cannot yet process META tags with charset to determine encoding.' );
 				}

@@ -1784,8 +1807,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 				if (
 					is_string( $http_equiv ) &&
 					is_string( $content ) &&
-					0 === strcasecmp( $http_equiv, 'Content-Type' ) &&
-					'tentative' === $this->state->encoding_confidence
+					0 === strcasecmp( $http_equiv, 'Content-Type' )
 				) {
 					$this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' );
 				}
@@ -5268,13 +5290,30 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *
-	 * For boolean attributes special handling is provided:
+	 * This function handles all necessary HTML encoding. Provide normal, unescaped string values.
+	 * The HTML API will encode the strings appropriately so that the browser will interpret them
+	 * as the intended value.
+	 *
+	 * Example:
+	 *
+	 *     // Renders “Eggs & Milk” in a browser, encoded as `<abbr title="Eggs &amp; Milk">`.
+	 *     $processor->set_attribute( 'title', 'Eggs & Milk' );
+	 *
+	 *     // Renders “Eggs &amp; Milk” in a browser, encoded as `<abbr title="Eggs &amp;amp; Milk">`.
+	 *     $processor->set_attribute( 'title', 'Eggs &amp; Milk' );
+	 *
+	 *     // Renders `true` as `<abbr title>`.
+	 *     $processor->set_attribute( 'title', true );
+	 *
+	 *     // Renders without the attribute for `false` as `<abbr>`.
+	 *     $processor->set_attribute( 'title', false );
+	 *
+	 * Special handling is provided for boolean attribute values:
 	 *  - When `true` is passed as the value, then only the attribute name is added to the tag.
 	 *  - When `false` is passed, the attribute gets removed if it existed before.
 	 *
-	 * For string attributes, the value is escaped using the `esc_attr` function.
-	 *
 	 * @since 6.6.0 Subclassed for the HTML Processor.
+	 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
 	 *
 	 * @param string      $name  The attribute name to target.
 	 * @param string|bool $value The new attribute value.
@@ -834,6 +834,14 @@ class WP_HTML_Tag_Processor {
 	 * @param string $html HTML to process.
 	 */
 	public function __construct( $html ) {
+		if ( ! is_string( $html ) ) {
+			_doing_it_wrong(
+				__METHOD__,
+				__( 'The HTML parameter must be a string.' ),
+				'6.9.0'
+			);
+			$html = '';
+		}
 		$this->html = $html;
 	}

@@ -1496,13 +1504,48 @@ class WP_HTML_Tag_Processor {
 		while ( false !== $at && $at < $doc_length ) {
 			$at += strcspn( $html, '-<', $at );

+			/*
+			 * Optimization: Terminating a complete script element requires at least eight
+			 * additional bytes in the document. Some checks below may cause local escaped
+			 * state transitions when processing shorter strings, but those transitions are
+			 * irrelevant if the script tag is incomplete and the function must return false.
+			 *
+			 * This may need updating if those transitions become significant or are exported from
+			 * this function in some way, such as when building safe methods to embed JavaScript
+			 * or data inside a SCRIPT element.
+			 *
+			 *     $at may be here.
+			 *        ↓
+			 *     ...</script>
+			 *         ╰──┬───╯
+			 *     $at + 8 additional bytes are required for a non-false return value.
+			 *
+			 * This single check eliminates the need to check lengths for the shorter spans:
+			 *
+			 *           $at may be here.
+			 *                  ↓
+			 *     <script><!-- --></script>
+			 *                   ├╯
+			 *             $at + 2 additional characters does not require a length check.
+			 *
+			 * The transition from "escaped" to "unescaped" is not relevant if the document ends:
+			 *
+			 *           $at may be here.
+			 *                  ↓
+			 *     <script><!-- -->[[END-OF-DOCUMENT]]
+			 *                   ╰──┬───╯
+			 *             $at + 8 additional bytes is not satisfied, return false.
+			 */
+			if ( $at + 8 >= $doc_length ) {
+				return false;
+			}
+
 			/*
 			 * For all script states a "-->"  transitions
 			 * back into the normal unescaped script mode,
 			 * even if that's the current state.
 			 */
 			if (
-				$at + 2 < $doc_length &&
 				'-' === $html[ $at ] &&
 				'-' === $html[ $at + 1 ] &&
 				'>' === $html[ $at + 2 ]
@@ -1512,10 +1555,6 @@ class WP_HTML_Tag_Processor {
 				continue;
 			}

-			if ( $at + 1 >= $doc_length ) {
-				return false;
-			}
-
 			/*
 			 * Everything of interest past here starts with "<".
 			 * Check this character and advance position regardless.
@@ -1525,25 +1564,33 @@ class WP_HTML_Tag_Processor {
 			}

 			/*
-			 * Unlike with "-->", the "<!--" only transitions
-			 * into the escaped mode if not already there.
-			 *
-			 * Inside the escaped modes it will be ignored; and
-			 * should never break out of the double-escaped
-			 * mode and back into the escaped mode.
-			 *
-			 * While this requires a mode change, it does not
-			 * impact the parsing otherwise, so continue
-			 * parsing after updating the state.
+			 * "<!--" only transitions from _unescaped_ to _escaped_. This byte sequence is only
+			 * significant in the _unescaped_ state and is ignored in any other state.
 			 */
 			if (
-				$at + 2 < $doc_length &&
+				'unescaped' === $state &&
 				'!' === $html[ $at ] &&
 				'-' === $html[ $at + 1 ] &&
 				'-' === $html[ $at + 2 ]
 			) {
-				$at   += 3;
-				$state = 'unescaped' === $state ? 'escaped' : $state;
+				$at += 3;
+
+				/*
+				 * The parser is ready to enter the _escaped_ state, but may remain in the
+				 * _unescaped_ state. This occurs when "<!--" is immediately followed by a
+				 * sequence of 0 or more "-" followed by ">". This is similar to abruptly closed
+				 * HTML comments like "<!-->" or "<!--->".
+				 *
+				 * Note that this check may advance the position significantly and requires a
+				 * length check to prevent bad offsets on inputs like `<script><!---------`.
+				 */
+				$at += strspn( $html, '-', $at );
+				if ( $at < $doc_length && '>' === $html[ $at ] ) {
+					++$at;
+					continue;
+				}
+
+				$state = 'escaped';
 				continue;
 			}

@@ -1561,7 +1608,6 @@ class WP_HTML_Tag_Processor {
 			 * proceed scanning to the next potential token in the text.
 			 */
 			if ( ! (
-				$at + 6 < $doc_length &&
 				( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
 				( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
 				( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
@@ -1579,13 +1625,32 @@ class WP_HTML_Tag_Processor {
 			 * "<script123" should not end a script region even though
 			 * "<script" is found within the text.
 			 */
-			if ( $at + 6 >= $doc_length ) {
-				continue;
-			}
 			$at += 6;
 			$c   = $html[ $at ];
-			if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
-				++$at;
+			if (
+				/**
+				 * These characters trigger state transitions of interest:
+				 *
+				 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state}
+				 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state}
+				 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state}
+				 * - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state}
+				 *
+				 * The "\r" character is not present in the above references. However, "\r" must be
+				 * treated the same as "\n". This is because the HTML Standard requires newline
+				 * normalization during preprocessing which applies this replacement.
+				 *
+				 * - @see https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
+				 * - @see https://infra.spec.whatwg.org/#normalize-newlines
+				 */
+				'>' !== $c &&
+				' ' !== $c &&
+				"\n" !== $c &&
+				'/' !== $c &&
+				"\t" !== $c &&
+				"\f" !== $c &&
+				"\r" !== $c
+			) {
 				continue;
 			}

@@ -1611,8 +1676,6 @@ class WP_HTML_Tag_Processor {
 				}

 				if ( $this->bytes_already_parsed >= $doc_length ) {
-					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
-
 					return false;
 				}

@@ -3683,10 +3746,22 @@ class WP_HTML_Tag_Processor {
 	 *         $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
 	 *     }
 	 *
+	 * This function handles all necessary HTML encoding. Provide normal, unescaped string values.
+	 * The HTML API will encode the strings appropriately so that the browser will interpret them
+	 * as the intended value.
+	 *
+	 * Example:
+	 *
+	 *     // Renders as “Eggs & Milk” in a browser, encoded as `<p>Eggs &amp; Milk</p>`.
+	 *     $processor->set_modifiable_text( 'Eggs & Milk' );
+	 *
+	 *     // Renders as “Eggs &amp; Milk” in a browser, encoded as `<p>Eggs &amp;amp; Milk</p>`.
+	 *     $processor->set_modifiable_text( 'Eggs &amp; Milk' );
+	 *
 	 * @since 6.7.0
+	 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
 	 *
 	 * @param string $plaintext_content New text content to represent in the matched token.
-	 *
 	 * @return bool Whether the text was able to update.
 	 */
 	public function set_modifiable_text( string $plaintext_content ): bool {
@@ -3694,7 +3769,16 @@ class WP_HTML_Tag_Processor {
 			$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
 				$this->text_starts_at,
 				$this->text_length,
-				htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 )
+				strtr(
+					$plaintext_content,
+					array(
+						'<' => '&lt;',
+						'>' => '&gt;',
+						'&' => '&amp;',
+						'"' => '&quot;',
+						"'" => '&apos;',
+					)
+				)
 			);

 			return true;
@@ -3725,17 +3809,28 @@ class WP_HTML_Tag_Processor {

 		switch ( $this->get_tag() ) {
 			case 'SCRIPT':
-				/*
+				/**
 				 * This is over-protective, but ensures the update doesn't break
-				 * out of the SCRIPT element. A more thorough check would need to
-				 * ensure that the script closing tag doesn't exist, and isn't
-				 * also "hidden" inside the script double-escaped state.
+				 * the HTML structure of the SCRIPT element.
 				 *
-				 * It may seem like replacing `</script` with `<\/script` would
-				 * properly escape these things, but this could mask regex patterns
-				 * that previously worked. Resolve this by not sending `</script`
+				 * More thorough analysis could track the HTML tokenizer states
+				 * to ensure that the SCRIPT element closes at the expected
+				 * SCRIPT close tag, as is done in {@see ::skip_script_data()}.
+				 *
+				 * A SCRIPT element could be closed prematurely by contents
+				 * like `</script>`. A SCRIPT element could be prevented from
+				 * closing by contents like `<!--<script>`.
+				 *
+				 * The following strings are essential for dangerous content,
+				 * although they are insufficient on their own. This trade-off
+				 * prevents dangerous scripts from being sent to the browser.
+				 * It is also unlikely to produce HTML that may confuse more
+				 * basic HTML tooling.
 				 */
-				if ( false !== stripos( $plaintext_content, '</script' ) ) {
+				if (
+					false !== stripos( $plaintext_content, '</script' ) ||
+					false !== stripos( $plaintext_content, '<script' )
+				) {
 					return false;
 				}

@@ -3797,14 +3892,31 @@ class WP_HTML_Tag_Processor {
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *
-	 * For boolean attributes special handling is provided:
+	 * This function handles all necessary HTML encoding. Provide normal, unescaped string values.
+	 * The HTML API will encode the strings appropriately so that the browser will interpret them
+	 * as the intended value.
+	 *
+	 * Example:
+	 *
+	 *     // Renders “Eggs & Milk” in a browser, encoded as `<abbr title="Eggs &amp; Milk">`.
+	 *     $processor->set_attribute( 'title', 'Eggs & Milk' );
+	 *
+	 *     // Renders “Eggs &amp; Milk” in a browser, encoded as `<abbr title="Eggs &amp;amp; Milk">`.
+	 *     $processor->set_attribute( 'title', 'Eggs &amp; Milk' );
+	 *
+	 *     // Renders `true` as `<abbr title>`.
+	 *     $processor->set_attribute( 'title', true );
+	 *
+	 *     // Renders without the attribute for `false` as `<abbr>`.
+	 *     $processor->set_attribute( 'title', false );
+	 *
+	 * Special handling is provided for boolean attribute values:
 	 *  - When `true` is passed as the value, then only the attribute name is added to the tag.
 	 *  - When `false` is passed, the attribute gets removed if it existed before.
 	 *
-	 * For string attributes, the value is escaped using the `esc_attr` function.
-	 *
 	 * @since 6.2.0
 	 * @since 6.2.1 Fix: Only create a single update for multiple calls with case-variant attribute names.
+	 * @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
 	 *
 	 * @param string      $name  The attribute name to target.
 	 * @param string|bool $value The new attribute value.
@@ -3818,41 +3930,32 @@ class WP_HTML_Tag_Processor {
 			return false;
 		}

-		/*
+		$name_length = strlen( $name );
+
+		/**
 		 * WordPress rejects more characters than are strictly forbidden
 		 * in HTML5. This is to prevent additional security risks deeper
-		 * in the WordPress and plugin stack. Specifically the
-		 * less-than (<) greater-than (>) and ampersand (&) aren't allowed.
+		 * in the WordPress and plugin stack. Specifically the following
+		 * are not allowed to be set as part of an HTML attribute name:
 		 *
-		 * The use of a PCRE match enables looking for specific Unicode
-		 * code points without writing a UTF-8 decoder. Whereas scanning
-		 * for one-byte characters is trivial (with `strcspn`), scanning
-		 * for the longer byte sequences would be more complicated. Given
-		 * that this shouldn't be in the hot path for execution, it's a
-		 * reasonable compromise in efficiency without introducing a
-		 * noticeable impact on the overall system.
+		 *  - less-than “<”
+		 *  - ampersand “&”
 		 *
 		 * @see https://html.spec.whatwg.org/#attributes-2
-		 *
-		 * @todo As the only regex pattern maybe we should take it out?
-		 *       Are Unicode patterns available broadly in Core?
 		 */
-		if ( preg_match(
-			'~[' .
-				// Syntax-like characters.
-				'"\'>&</ =' .
-				// Control characters.
-				'\x{00}-\x{1F}' .
-				// HTML noncharacters.
-				'\x{FDD0}-\x{FDEF}' .
-				'\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' .
-				'\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' .
-				'\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' .
-				'\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' .
-				'\x{10FFFE}\x{10FFFF}' .
-			']~Ssu',
-			$name
-		) ) {
+		if (
+			0 === $name_length ||
+			// Syntax-like characters.
+			strcspn( $name, '"\'>&</ =' ) !== $name_length ||
+			// Control characters.
+			strcspn(
+				$name,
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" .
+				"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
+			) !== $name_length ||
+			// Unicode noncharacters.
+			wp_has_noncharacters( $name )
+		) {
 			_doing_it_wrong(
 				__METHOD__,
 				__( 'Invalid attribute name.' ),
@@ -3876,12 +3979,23 @@ class WP_HTML_Tag_Processor {
 		} else {
 			$comparable_name = strtolower( $name );

-			/*
-			 * Escape URL attributes.
+			/**
+			 * Escape attribute values appropriately.
 			 *
 			 * @see https://html.spec.whatwg.org/#attributes-3
 			 */
-			$escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes(), true ) ? esc_url( $value ) : esc_attr( $value );
+			$escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes(), true )
+				? esc_url( $value )
+				: strtr(
+					$value,
+					array(
+						'<' => '&lt;',
+						'>' => '&gt;',
+						'&' => '&amp;',
+						'"' => '&quot;',
+						"'" => '&apos;',
+					)
+				);

 			// If the escaping functions wiped out the update, reject it and indicate it was rejected.
 			if ( '' === $escaped_new_value && '' !== $value ) {
@@ -4504,7 +4618,7 @@ class WP_HTML_Tag_Processor {
 	const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';

 	/**
-	 * No-quirks mode document compatability mode.
+	 * No-quirks mode document compatibility mode.
 	 *
 	 * > In no-quirks mode, the behavior is (hopefully) the desired behavior
 	 * > described by the modern HTML and CSS specifications.
@@ -4519,7 +4633,7 @@ class WP_HTML_Tag_Processor {
 	const NO_QUIRKS_MODE = 'no-quirks-mode';

 	/**
-	 * Quirks mode document compatability mode.
+	 * Quirks mode document compatibility mode.
 	 *
 	 * > In quirks mode, layout emulates behavior in Navigator 4 and Internet
 	 * > Explorer 5. This is essential in order to support websites that were