wip
This commit is contained in:
@@ -36,7 +36,7 @@
|
||||
* @see https://html.spec.whatwg.org/#the-doctype
|
||||
*
|
||||
* DOCTYPE declarations comprise four properties: a name, public identifier, system identifier,
|
||||
* and an indication of which document compatability mode they would imply if an HTML parser
|
||||
* and an indication of which document compatibility mode they would imply if an HTML parser
|
||||
* hadn't already determined it from other information.
|
||||
*
|
||||
* @see https://html.spec.whatwg.org/#the-initial-insertion-mode
|
||||
@@ -50,6 +50,8 @@
|
||||
*
|
||||
* @since 6.7.0
|
||||
*
|
||||
* @access private
|
||||
*
|
||||
* @see WP_HTML_Processor
|
||||
*/
|
||||
class WP_HTML_Doctype_Info {
|
||||
@@ -126,14 +128,14 @@ class WP_HTML_Doctype_Info {
|
||||
public $system_identifier = null;
|
||||
|
||||
/**
|
||||
* Which document compatability mode this DOCTYPE declaration indicates.
|
||||
* Which document compatibility mode this DOCTYPE declaration indicates.
|
||||
*
|
||||
* This value should be considered "read only" and not modified.
|
||||
*
|
||||
* When an HTML parser has not already set the document compatability mode,
|
||||
* (e.g. "quirks" or "no-quirks" mode), it will infer if from the properties
|
||||
* When an HTML parser has not already set the document compatibility mode,
|
||||
* (e.g. "quirks" or "no-quirks" mode), it will be inferred from the properties
|
||||
* of the appropriate DOCTYPE declaration, if one exists. The DOCTYPE can
|
||||
* indicate one of three possible document compatability modes:
|
||||
* indicate one of three possible document compatibility modes:
|
||||
*
|
||||
* - "no-quirks" and "limited-quirks" modes (also called "standards" mode).
|
||||
* - "quirks" mode (also called `CSS1Compat` mode).
|
||||
@@ -148,7 +150,7 @@ class WP_HTML_Doctype_Info {
|
||||
*
|
||||
* @var string One of "no-quirks", "limited-quirks", or "quirks".
|
||||
*/
|
||||
public $indicated_compatability_mode;
|
||||
public $indicated_compatibility_mode;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
@@ -192,7 +194,7 @@ class WP_HTML_Doctype_Info {
|
||||
* > The force-quirks flag is set to on.
|
||||
*/
|
||||
if ( $force_quirks_flag ) {
|
||||
$this->indicated_compatability_mode = 'quirks';
|
||||
$this->indicated_compatibility_mode = 'quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -201,7 +203,7 @@ class WP_HTML_Doctype_Info {
|
||||
* public or system identifiers; short-circuit to avoid extra parsing.
|
||||
*/
|
||||
if ( 'html' === $name && null === $public_identifier && null === $system_identifier ) {
|
||||
$this->indicated_compatability_mode = 'no-quirks';
|
||||
$this->indicated_compatibility_mode = 'no-quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -212,7 +214,7 @@ class WP_HTML_Doctype_Info {
|
||||
* the document in upper case; thus no conversion is required here.
|
||||
*/
|
||||
if ( 'html' !== $name ) {
|
||||
$this->indicated_compatability_mode = 'quirks';
|
||||
$this->indicated_compatibility_mode = 'quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -240,7 +242,7 @@ class WP_HTML_Doctype_Info {
|
||||
'-/w3c/dtd html 4.0 transitional/en' === $public_identifier ||
|
||||
'html' === $public_identifier
|
||||
) {
|
||||
$this->indicated_compatability_mode = 'quirks';
|
||||
$this->indicated_compatibility_mode = 'quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -248,7 +250,7 @@ class WP_HTML_Doctype_Info {
|
||||
* > The system identifier is set to…
|
||||
*/
|
||||
if ( 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' === $system_identifier ) {
|
||||
$this->indicated_compatability_mode = 'quirks';
|
||||
$this->indicated_compatibility_mode = 'quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -257,7 +259,7 @@ class WP_HTML_Doctype_Info {
|
||||
* If the public identifier is empty, none of the following conditions will match.
|
||||
*/
|
||||
if ( '' === $public_identifier ) {
|
||||
$this->indicated_compatability_mode = 'no-quirks';
|
||||
$this->indicated_compatibility_mode = 'no-quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -325,7 +327,7 @@ class WP_HTML_Doctype_Info {
|
||||
str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html 2.0//' ) ||
|
||||
str_starts_with( $public_identifier, '-//webtechs//dtd mozilla html//' )
|
||||
) {
|
||||
$this->indicated_compatability_mode = 'quirks';
|
||||
$this->indicated_compatibility_mode = 'quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -338,7 +340,7 @@ class WP_HTML_Doctype_Info {
|
||||
str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
|
||||
)
|
||||
) {
|
||||
$this->indicated_compatability_mode = 'quirks';
|
||||
$this->indicated_compatibility_mode = 'quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -354,7 +356,7 @@ class WP_HTML_Doctype_Info {
|
||||
str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 frameset//' ) ||
|
||||
str_starts_with( $public_identifier, '-//w3c//dtd xhtml 1.0 transitional//' )
|
||||
) {
|
||||
$this->indicated_compatability_mode = 'limited-quirks';
|
||||
$this->indicated_compatibility_mode = 'limited-quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -367,11 +369,11 @@ class WP_HTML_Doctype_Info {
|
||||
str_starts_with( $public_identifier, '-//w3c//dtd html 4.01 transitional//' )
|
||||
)
|
||||
) {
|
||||
$this->indicated_compatability_mode = 'limited-quirks';
|
||||
$this->indicated_compatibility_mode = 'limited-quirks';
|
||||
return;
|
||||
}
|
||||
|
||||
$this->indicated_compatability_mode = 'no-quirks';
|
||||
$this->indicated_compatibility_mode = 'no-quirks';
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -385,15 +387,15 @@ class WP_HTML_Doctype_Info {
|
||||
*
|
||||
* // Normative HTML DOCTYPE declaration.
|
||||
* $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!DOCTYPE html>' );
|
||||
* 'no-quirks' === $doctype->indicated_compatability_mode;
|
||||
* 'no-quirks' === $doctype->indicated_compatibility_mode;
|
||||
*
|
||||
* // A nonsensical DOCTYPE is still valid, and will indicate "quirks" mode.
|
||||
* $doctype = WP_HTML_Doctype_Info::from_doctype_token( '<!doctypeJSON SILLY "nonsense\'>' );
|
||||
* 'quirks' === $doctype->indicated_compatability_mode;
|
||||
* 'quirks' === $doctype->indicated_compatibility_mode;
|
||||
*
|
||||
* // Textual quirks present in raw HTML are handled appropriately.
|
||||
* $doctype = WP_HTML_Doctype_Info::from_doctype_token( "<!DOCTYPE\nhtml\n>" );
|
||||
* 'no-quirks' === $doctype->indicated_compatability_mode;
|
||||
* 'no-quirks' === $doctype->indicated_compatibility_mode;
|
||||
*
|
||||
* // Anything other than a proper DOCTYPE declaration token fails to parse.
|
||||
* null === WP_HTML_Doctype_Info::from_doctype_token( ' <!DOCTYPE>' );
|
||||
|
||||
@@ -297,6 +297,15 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
return null;
|
||||
}
|
||||
|
||||
if ( ! is_string( $html ) ) {
|
||||
_doing_it_wrong(
|
||||
__METHOD__,
|
||||
__( 'The HTML parameter must be a string.' ),
|
||||
'6.9.0'
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
$context_processor = static::create_full_parser( "<!DOCTYPE html>{$context}", $encoding );
|
||||
if ( null === $context_processor ) {
|
||||
return null;
|
||||
@@ -339,6 +348,14 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
if ( 'UTF-8' !== $known_definite_encoding ) {
|
||||
return null;
|
||||
}
|
||||
if ( ! is_string( $html ) ) {
|
||||
_doing_it_wrong(
|
||||
__METHOD__,
|
||||
__( 'The HTML parameter must be a string.' ),
|
||||
'6.9.0'
|
||||
);
|
||||
return null;
|
||||
}
|
||||
|
||||
$processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
|
||||
$processor->state->encoding = $known_definite_encoding;
|
||||
@@ -1304,10 +1321,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
* @see static::serialize()
|
||||
*
|
||||
* @since 6.7.0
|
||||
* @since 6.9.0 Converted from protected to public method.
|
||||
*
|
||||
* @return string Serialization of token, or empty string if no serialization exists.
|
||||
*/
|
||||
protected function serialize_token(): string {
|
||||
public function serialize_token(): string {
|
||||
$html = '';
|
||||
$token_type = $this->get_token_type();
|
||||
|
||||
@@ -1469,7 +1487,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
*/
|
||||
case 'html':
|
||||
$doctype = $this->get_doctype_info();
|
||||
if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
|
||||
if ( null !== $doctype && 'quirks' === $doctype->indicated_compatibility_mode ) {
|
||||
$this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
|
||||
}
|
||||
|
||||
@@ -1760,6 +1778,11 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
case '+META':
|
||||
$this->insert_html_element( $this->state->current_token );
|
||||
|
||||
// All following conditions depend on "tentative" encoding confidence.
|
||||
if ( 'tentative' !== $this->state->encoding_confidence ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* > If the active speculative HTML parser is null, then:
|
||||
* > - If the element has a charset attribute, and getting an encoding from
|
||||
@@ -1767,7 +1790,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
* > tentative, then change the encoding to the resulting encoding.
|
||||
*/
|
||||
$charset = $this->get_attribute( 'charset' );
|
||||
if ( is_string( $charset ) && 'tentative' === $this->state->encoding_confidence ) {
|
||||
if ( is_string( $charset ) ) {
|
||||
$this->bail( 'Cannot yet process META tags with charset to determine encoding.' );
|
||||
}
|
||||
|
||||
@@ -1784,8 +1807,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
if (
|
||||
is_string( $http_equiv ) &&
|
||||
is_string( $content ) &&
|
||||
0 === strcasecmp( $http_equiv, 'Content-Type' ) &&
|
||||
'tentative' === $this->state->encoding_confidence
|
||||
0 === strcasecmp( $http_equiv, 'Content-Type' )
|
||||
) {
|
||||
$this->bail( 'Cannot yet process META tags with http-equiv Content-Type to determine encoding.' );
|
||||
}
|
||||
@@ -5268,13 +5290,30 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
|
||||
/**
|
||||
* Updates or creates a new attribute on the currently matched tag with the passed value.
|
||||
*
|
||||
* For boolean attributes special handling is provided:
|
||||
* This function handles all necessary HTML encoding. Provide normal, unescaped string values.
|
||||
* The HTML API will encode the strings appropriately so that the browser will interpret them
|
||||
* as the intended value.
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
* // Renders “Eggs & Milk” in a browser, encoded as `<abbr title="Eggs &amp; Milk">`.
|
||||
* $processor->set_attribute( 'title', 'Eggs & Milk' );
|
||||
*
|
||||
* // Renders “Eggs &amp; Milk” in a browser, encoded as `<abbr title="Eggs &amp;amp; Milk">`.
|
||||
* $processor->set_attribute( 'title', 'Eggs &amp; Milk' );
|
||||
*
|
||||
* // Renders `true` as `<abbr title>`.
|
||||
* $processor->set_attribute( 'title', true );
|
||||
*
|
||||
* // Renders without the attribute for `false` as `<abbr>`.
|
||||
* $processor->set_attribute( 'title', false );
|
||||
*
|
||||
* Special handling is provided for boolean attribute values:
|
||||
* - When `true` is passed as the value, then only the attribute name is added to the tag.
|
||||
* - When `false` is passed, the attribute gets removed if it existed before.
|
||||
*
|
||||
* For string attributes, the value is escaped using the `esc_attr` function.
|
||||
*
|
||||
* @since 6.6.0 Subclassed for the HTML Processor.
|
||||
* @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
|
||||
*
|
||||
* @param string $name The attribute name to target.
|
||||
* @param string|bool $value The new attribute value.
|
||||
|
||||
@@ -834,6 +834,14 @@ class WP_HTML_Tag_Processor {
|
||||
* @param string $html HTML to process.
|
||||
*/
|
||||
public function __construct( $html ) {
|
||||
if ( ! is_string( $html ) ) {
|
||||
_doing_it_wrong(
|
||||
__METHOD__,
|
||||
__( 'The HTML parameter must be a string.' ),
|
||||
'6.9.0'
|
||||
);
|
||||
$html = '';
|
||||
}
|
||||
$this->html = $html;
|
||||
}
|
||||
|
||||
@@ -1496,13 +1504,48 @@ class WP_HTML_Tag_Processor {
|
||||
while ( false !== $at && $at < $doc_length ) {
|
||||
$at += strcspn( $html, '-<', $at );
|
||||
|
||||
/*
|
||||
* Optimization: Terminating a complete script element requires at least eight
|
||||
* additional bytes in the document. Some checks below may cause local escaped
|
||||
* state transitions when processing shorter strings, but those transitions are
|
||||
* irrelevant if the script tag is incomplete and the function must return false.
|
||||
*
|
||||
* This may need updating if those transitions become significant or exported from
|
||||
* this function in some way, such as when building safe methods to embed JavaScript
|
||||
* or data inside a SCRIPT element.
|
||||
*
|
||||
* $at may be here.
|
||||
* ↓
|
||||
* ...</script>
|
||||
* ╰──┬───╯
|
||||
* $at + 8 additional bytes are required for a non-false return value.
|
||||
*
|
||||
* This single check eliminates the need to check lengths for the shorter spans:
|
||||
*
|
||||
* $at may be here.
|
||||
* ↓
|
||||
* <script><!-- --></script>
|
||||
* ├╯
|
||||
* $at + 2 additional characters does not require a length check.
|
||||
*
|
||||
* The transition from "escaped" to "unescaped" is not relevant if the document ends:
|
||||
*
|
||||
* $at may be here.
|
||||
* ↓
|
||||
* <script><!-- -->[[END-OF-DOCUMENT]]
|
||||
* ╰──┬───╯
|
||||
* $at + 8 additional bytes is not satisfied, return false.
|
||||
*/
|
||||
if ( $at + 8 >= $doc_length ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* For all script states a "-->" transitions
|
||||
* back into the normal unescaped script mode,
|
||||
* even if that's the current state.
|
||||
*/
|
||||
if (
|
||||
$at + 2 < $doc_length &&
|
||||
'-' === $html[ $at ] &&
|
||||
'-' === $html[ $at + 1 ] &&
|
||||
'>' === $html[ $at + 2 ]
|
||||
@@ -1512,10 +1555,6 @@ class WP_HTML_Tag_Processor {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( $at + 1 >= $doc_length ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Everything of interest past here starts with "<".
|
||||
* Check this character and advance position regardless.
|
||||
@@ -1525,25 +1564,33 @@ class WP_HTML_Tag_Processor {
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlike with "-->", the "<!--" only transitions
|
||||
* into the escaped mode if not already there.
|
||||
*
|
||||
* Inside the escaped modes it will be ignored; and
|
||||
* should never break out of the double-escaped
|
||||
* mode and back into the escaped mode.
|
||||
*
|
||||
* While this requires a mode change, it does not
|
||||
* impact the parsing otherwise, so continue
|
||||
* parsing after updating the state.
|
||||
* "<!--" only transitions from _unescaped_ to _escaped_. This byte sequence is only
|
||||
* significant in the _unescaped_ state and is ignored in any other state.
|
||||
*/
|
||||
if (
|
||||
$at + 2 < $doc_length &&
|
||||
'unescaped' === $state &&
|
||||
'!' === $html[ $at ] &&
|
||||
'-' === $html[ $at + 1 ] &&
|
||||
'-' === $html[ $at + 2 ]
|
||||
) {
|
||||
$at += 3;
|
||||
$state = 'unescaped' === $state ? 'escaped' : $state;
|
||||
$at += 3;
|
||||
|
||||
/*
|
||||
* The parser is ready to enter the _escaped_ state, but may remain in the
|
||||
* _unescaped_ state. This occurs when "<!--" is immediately followed by a
|
||||
* sequence of 0 or more "-" followed by ">". This is similar to abruptly closed
|
||||
* HTML comments like "<!-->" or "<!--->".
|
||||
*
|
||||
* Note that this check may advance the position significantly and requires a
|
||||
* length check to prevent bad offsets on inputs like `<script><!---------`.
|
||||
*/
|
||||
$at += strspn( $html, '-', $at );
|
||||
if ( $at < $doc_length && '>' === $html[ $at ] ) {
|
||||
++$at;
|
||||
continue;
|
||||
}
|
||||
|
||||
$state = 'escaped';
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1561,7 +1608,6 @@ class WP_HTML_Tag_Processor {
|
||||
* proceed scanning to the next potential token in the text.
|
||||
*/
|
||||
if ( ! (
|
||||
$at + 6 < $doc_length &&
|
||||
( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
|
||||
( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
|
||||
( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
|
||||
@@ -1579,13 +1625,32 @@ class WP_HTML_Tag_Processor {
|
||||
* "<script123" should not end a script region even though
|
||||
* "<script" is found within the text.
|
||||
*/
|
||||
if ( $at + 6 >= $doc_length ) {
|
||||
continue;
|
||||
}
|
||||
$at += 6;
|
||||
$c = $html[ $at ];
|
||||
if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
|
||||
++$at;
|
||||
if (
|
||||
/**
|
||||
* These characters trigger state transitions of interest:
|
||||
*
|
||||
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state}
|
||||
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state}
|
||||
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state}
|
||||
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state}
|
||||
*
|
||||
* The "\r" character is not present in the above references. However, "\r" must be
|
||||
* treated the same as "\n". This is because the HTML Standard requires newline
|
||||
* normalization during preprocessing which applies this replacement.
|
||||
*
|
||||
* - @see https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
|
||||
* - @see https://infra.spec.whatwg.org/#normalize-newlines
|
||||
*/
|
||||
'>' !== $c &&
|
||||
' ' !== $c &&
|
||||
"\n" !== $c &&
|
||||
'/' !== $c &&
|
||||
"\t" !== $c &&
|
||||
"\f" !== $c &&
|
||||
"\r" !== $c
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1611,8 +1676,6 @@ class WP_HTML_Tag_Processor {
|
||||
}
|
||||
|
||||
if ( $this->bytes_already_parsed >= $doc_length ) {
|
||||
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3683,10 +3746,22 @@ class WP_HTML_Tag_Processor {
|
||||
* $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
|
||||
* }
|
||||
*
|
||||
* This function handles all necessary HTML encoding. Provide normal, unescaped string values.
|
||||
* The HTML API will encode the strings appropriately so that the browser will interpret them
|
||||
* as the intended value.
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
* // Renders as “Eggs & Milk” in a browser, encoded as `<p>Eggs &amp; Milk</p>`.
|
||||
* $processor->set_modifiable_text( 'Eggs & Milk' );
|
||||
*
|
||||
* // Renders as “Eggs &amp; Milk” in a browser, encoded as `<p>Eggs &amp;amp; Milk</p>`.
|
||||
* $processor->set_modifiable_text( 'Eggs &amp; Milk' );
|
||||
*
|
||||
* @since 6.7.0
|
||||
* @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
|
||||
*
|
||||
* @param string $plaintext_content New text content to represent in the matched token.
|
||||
*
|
||||
* @return bool Whether the text was able to update.
|
||||
*/
|
||||
public function set_modifiable_text( string $plaintext_content ): bool {
|
||||
@@ -3694,7 +3769,16 @@ class WP_HTML_Tag_Processor {
|
||||
$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
|
||||
$this->text_starts_at,
|
||||
$this->text_length,
|
||||
htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 )
|
||||
strtr(
|
||||
$plaintext_content,
|
||||
array(
|
||||
'<' => '&lt;',
|
||||
'>' => '&gt;',
|
||||
'&' => '&amp;',
|
||||
'"' => '&quot;',
|
||||
"'" => '&apos;',
|
||||
)
|
||||
)
|
||||
);
|
||||
|
||||
return true;
|
||||
@@ -3725,17 +3809,28 @@ class WP_HTML_Tag_Processor {
|
||||
|
||||
switch ( $this->get_tag() ) {
|
||||
case 'SCRIPT':
|
||||
/*
|
||||
/**
|
||||
* This is over-protective, but ensures the update doesn't break
|
||||
* out of the SCRIPT element. A more thorough check would need to
|
||||
* ensure that the script closing tag doesn't exist, and isn't
|
||||
* also "hidden" inside the script double-escaped state.
|
||||
* the HTML structure of the SCRIPT element.
|
||||
*
|
||||
* It may seem like replacing `</script` with `<\/script` would
|
||||
* properly escape these things, but this could mask regex patterns
|
||||
* that previously worked. Resolve this by not sending `</script`
|
||||
* More thorough analysis could track the HTML tokenizer states
|
||||
* to ensure that the SCRIPT element closes at the expected
|
||||
* SCRIPT close tag as is done in {@see ::skip_script_data()}.
|
||||
*
|
||||
* A SCRIPT element could be closed prematurely by contents
|
||||
* like `</script>`. A SCRIPT element could be prevented from
|
||||
* closing by contents like `<!--<script>`.
|
||||
*
|
||||
* The following strings are essential for dangerous content,
|
||||
* although they are insufficient on their own. This trade-off
|
||||
* prevents dangerous scripts from being sent to the browser.
|
||||
* It is also unlikely to produce HTML that may confuse more
|
||||
* basic HTML tooling.
|
||||
*/
|
||||
if ( false !== stripos( $plaintext_content, '</script' ) ) {
|
||||
if (
|
||||
false !== stripos( $plaintext_content, '</script' ) ||
|
||||
false !== stripos( $plaintext_content, '<script' )
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3797,14 +3892,31 @@ class WP_HTML_Tag_Processor {
|
||||
/**
|
||||
* Updates or creates a new attribute on the currently matched tag with the passed value.
|
||||
*
|
||||
* For boolean attributes special handling is provided:
|
||||
* This function handles all necessary HTML encoding. Provide normal, unescaped string values.
|
||||
* The HTML API will encode the strings appropriately so that the browser will interpret them
|
||||
* as the intended value.
|
||||
*
|
||||
* Example:
|
||||
*
|
||||
* // Renders “Eggs & Milk” in a browser, encoded as `<abbr title="Eggs &amp; Milk">`.
|
||||
* $processor->set_attribute( 'title', 'Eggs & Milk' );
|
||||
*
|
||||
* // Renders “Eggs &amp; Milk” in a browser, encoded as `<abbr title="Eggs &amp;amp; Milk">`.
|
||||
* $processor->set_attribute( 'title', 'Eggs &amp; Milk' );
|
||||
*
|
||||
* // Renders `true` as `<abbr title>`.
|
||||
* $processor->set_attribute( 'title', true );
|
||||
*
|
||||
* // Renders without the attribute for `false` as `<abbr>`.
|
||||
* $processor->set_attribute( 'title', false );
|
||||
*
|
||||
* Special handling is provided for boolean attribute values:
|
||||
* - When `true` is passed as the value, then only the attribute name is added to the tag.
|
||||
* - When `false` is passed, the attribute gets removed if it existed before.
|
||||
*
|
||||
* For string attributes, the value is escaped using the `esc_attr` function.
|
||||
*
|
||||
* @since 6.2.0
|
||||
* @since 6.2.1 Fix: Only create a single update for multiple calls with case-variant attribute names.
|
||||
* @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
|
||||
*
|
||||
* @param string $name The attribute name to target.
|
||||
* @param string|bool $value The new attribute value.
|
||||
@@ -3818,41 +3930,32 @@ class WP_HTML_Tag_Processor {
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
$name_length = strlen( $name );
|
||||
|
||||
/**
|
||||
* WordPress rejects more characters than are strictly forbidden
|
||||
* in HTML5. This is to prevent additional security risks deeper
|
||||
* in the WordPress and plugin stack. Specifically the
|
||||
* less-than (<) greater-than (>) and ampersand (&) aren't allowed.
|
||||
* in the WordPress and plugin stack. Specifically the following
|
||||
* are not allowed to be set as part of an HTML attribute name:
|
||||
*
|
||||
* The use of a PCRE match enables looking for specific Unicode
|
||||
* code points without writing a UTF-8 decoder. Whereas scanning
|
||||
* for one-byte characters is trivial (with `strcspn`), scanning
|
||||
* for the longer byte sequences would be more complicated. Given
|
||||
* that this shouldn't be in the hot path for execution, it's a
|
||||
* reasonable compromise in efficiency without introducing a
|
||||
* noticeable impact on the overall system.
|
||||
* - greater-than “>”
|
||||
* - ampersand “&”
|
||||
*
|
||||
* @see https://html.spec.whatwg.org/#attributes-2
|
||||
*
|
||||
* @todo As the only regex pattern maybe we should take it out?
|
||||
* Are Unicode patterns available broadly in Core?
|
||||
*/
|
||||
if ( preg_match(
|
||||
'~[' .
|
||||
// Syntax-like characters.
|
||||
'"\'>&</ =' .
|
||||
// Control characters.
|
||||
'\x{00}-\x{1F}' .
|
||||
// HTML noncharacters.
|
||||
'\x{FDD0}-\x{FDEF}' .
|
||||
'\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}' .
|
||||
'\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}' .
|
||||
'\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}' .
|
||||
'\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}' .
|
||||
'\x{10FFFE}\x{10FFFF}' .
|
||||
']~Ssu',
|
||||
$name
|
||||
) ) {
|
||||
if (
|
||||
0 === $name_length ||
|
||||
// Syntax-like characters.
|
||||
strcspn( $name, '"\'>&</ =' ) !== $name_length ||
|
||||
// Control characters.
|
||||
strcspn(
|
||||
$name,
|
||||
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F" .
|
||||
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F"
|
||||
) !== $name_length ||
|
||||
// Unicode noncharacters.
|
||||
wp_has_noncharacters( $name )
|
||||
) {
|
||||
_doing_it_wrong(
|
||||
__METHOD__,
|
||||
__( 'Invalid attribute name.' ),
|
||||
@@ -3876,12 +3979,23 @@ class WP_HTML_Tag_Processor {
|
||||
} else {
|
||||
$comparable_name = strtolower( $name );
|
||||
|
||||
/*
|
||||
* Escape URL attributes.
|
||||
/**
|
||||
* Escape attribute values appropriately.
|
||||
*
|
||||
* @see https://html.spec.whatwg.org/#attributes-3
|
||||
*/
|
||||
$escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes(), true ) ? esc_url( $value ) : esc_attr( $value );
|
||||
$escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes(), true )
|
||||
? esc_url( $value )
|
||||
: strtr(
|
||||
$value,
|
||||
array(
|
||||
'<' => '&lt;',
|
||||
'>' => '&gt;',
|
||||
'&' => '&amp;',
|
||||
'"' => '&quot;',
|
||||
"'" => '&apos;',
|
||||
)
|
||||
);
|
||||
|
||||
// If the escaping functions wiped out the update, reject it and indicate it was rejected.
|
||||
if ( '' === $escaped_new_value && '' !== $value ) {
|
||||
@@ -4504,7 +4618,7 @@ class WP_HTML_Tag_Processor {
|
||||
const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
|
||||
|
||||
/**
|
||||
* No-quirks mode document compatability mode.
|
||||
* No-quirks mode document compatibility mode.
|
||||
*
|
||||
* > In no-quirks mode, the behavior is (hopefully) the desired behavior
|
||||
* > described by the modern HTML and CSS specifications.
|
||||
@@ -4519,7 +4633,7 @@ class WP_HTML_Tag_Processor {
|
||||
const NO_QUIRKS_MODE = 'no-quirks-mode';
|
||||
|
||||
/**
|
||||
* Quirks mode document compatability mode.
|
||||
* Quirks mode document compatibility mode.
|
||||
*
|
||||
* > In quirks mode, layout emulates behavior in Navigator 4 and Internet
|
||||
* > Explorer 5. This is essential in order to support websites that were
|
||||
|
||||
Reference in New Issue
Block a user