nt',
__( 'Unable to retrieve body from response at this URL.' ),
array( 'status' => WP_Http::NOT_FOUND )
);
}
return $remote_body;
}
/**
* Parses the title tag contents from the provided HTML.
*
* @since 5.9.0
*
* @param string $html The HTML from the remote website at URL.
* @return string The title tag contents on success. Empty string if not found.
*/
private function get_title( $html ) {
	// Capture everything between the opening and closing title tags.
	// - The opening tag may carry attributes, hence `[^>]*` before its `>`.
	// - The closing tag tolerates stray whitespace around the slash.
	// - Modifiers: `i` = case-insensitive, `s` = `.` also matches newlines (multi-line titles).
	$pattern = '#<title[^>]*>(.*?)<\s*/\s*title>#is';

	preg_match( $pattern, $html, $match_title );

	// No title element, or an empty one: report "not found".
	if ( empty( $match_title[1] ) || ! is_string( $match_title[1] ) ) {
		return '';
	}

	$title = trim( $match_title[1] );

	// Decode entities and strip any markup before handing the title back.
	return $this->prepare_metadata_for_output( $title );
}
/**
* Parses the site icon from the provided HTML.
*
* @since 5.9.0
*
* @param string $html The HTML from the remote website at URL.
* @param string $url The target website URL.
* @return string The icon URI on success. Empty string if not found.
*/
private function get_icon( $html, $url ) {
	// Grab the icon's link element: a `<link>` tag whose rel attribute is
	// "icon", "shortcut icon" or "icon shortcut" (quoted or unquoted).
	// Modifiers: `i` = case-insensitive, `s` = dot matches newlines, `U` = ungreedy by default.
	$pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
	preg_match( $pattern, $html, $element );
	if ( empty( $element[0] ) || ! is_string( $element[0] ) ) {
		return '';
	}
	$element = trim( $element[0] );

	// Get the icon's href value. Group 1 is the opening quote (if any),
	// group 2 is the attribute value, and `\1` requires the matching closing quote.
	$pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
	preg_match( $pattern, $element, $icon );
	if ( empty( $icon[2] ) || ! is_string( $icon[2] ) ) {
		return '';
	}
	$icon = trim( $icon[2] );

	// If the icon is a data URL, return it untouched: there is nothing to resolve.
	$parsed_icon = parse_url( $icon );
	if ( isset( $parsed_icon['scheme'] ) && 'data' === $parsed_icon['scheme'] ) {
		return $icon;
	}

	// Without a usable target URL there is no base to resolve a relative icon against.
	if ( ! is_string( $url ) || '' === $url ) {
		return $icon;
	}

	// Attempt to convert relative URLs to absolute, using the site root
	// (scheme + host) of the target URL as the base.
	$parsed_url = parse_url( $url );
	if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
		$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
		$icon     = WP_Http::make_absolute_url( $icon, $root_url );
	}

	return $icon;
}
/**
* Parses the meta description from the provided HTML.
*
* @since 5.9.0
*
* @param array $meta_elements {
* A multi-dimensional indexed array on success, else empty array.
*
* @type string[] $0 Meta elements with a content attribute.
* @type string[] $1 Content attribute's opening quotation mark.
* @type string[] $2 Content attribute's value for each meta element.
* }
* @return string The meta description contents on success. Empty string if not found.
*/
private function get_description( $meta_elements ) {
	// Nothing to search when no meta elements were captured.
	if ( empty( $meta_elements[0] ) ) {
		return '';
	}

	// Look for a meta element named "description" or "og:description".
	$raw_description = $this->get_metadata_from_meta_element(
		$meta_elements,
		'name',
		'(?:description|og:description)'
	);

	// An empty string means no such element was found; otherwise clean the value for output.
	return '' === $raw_description
		? ''
		: $this->prepare_metadata_for_output( $raw_description );
}
/**
* Parses the Open Graph (OG) Image from the provided HTML.
*
* See: https://ogp.me/.
*
* @since 5.9.0
*
* @param array $meta_elements {
* A multi-dimensional indexed array on success, else empty array.
*
* @type string[] $0 Meta elements with a content attribute.
* @type string[] $1 Content attribute's opening quotation mark.
* @type string[] $2 Content attribute's value for each meta element.
* }
* @param string $url The target website URL.
* @return string The OG image on success. Empty string if not found.
*/
private function get_image( $meta_elements, $url ) {
	// Look for a meta element whose property is "og:image" or "og:image:url".
	$og_image = $this->get_metadata_from_meta_element(
		$meta_elements,
		'property',
		'(?:og:image|og:image:url)'
	);

	// No Open Graph image declared.
	if ( '' === $og_image ) {
		return '';
	}

	// A base URL needs both a scheme and a host; without them, hand the value back as found.
	$url_parts = parse_url( $url );
	if ( ! isset( $url_parts['scheme'], $url_parts['host'] ) ) {
		return $og_image;
	}

	// Resolve a possibly relative image path against the site root of the target URL.
	$site_root = $url_parts['scheme'] . '://' . $url_parts['host'] . '/';

	return WP_Http::make_absolute_url( $og_image, $site_root );
}
/**
* Prepares the metadata by:
* - stripping all HTML tags and tag entities.
* - converting non-tag entities into characters.
*
* @since 5.9.0
*
* @param string $metadata The metadata content to prepare.
* @return string The prepared metadata.
*/
private function prepare_metadata_for_output( $metadata ) {
	// Step 1: turn entities (including quotes) back into characters, using the site's charset.
	$decoded = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) );

	// Step 2: remove all tags, including any that only became tags after decoding.
	return wp_strip_all_tags( $decoded );
}
/**
* Utility function to build cache key for a given URL.
*
* @since 5.9.0
*
* @param string $url The URL for which to build a cache key.
* @return string The cache key.
*/
private function build_cache_key_for_url( $url ) {
	// Hash the URL so the key has a fixed length regardless of how long the URL is.
	$url_hash = md5( $url );

	return 'g_url_details_response_' . $url_hash;
}
/**
* Utility function to retrieve a value from the cache at a given key.
*
* @since 5.9.0
*
* @param string $key The cache key.
* @return mixed The value from the cache.
*/
private function get_cache( $key ) {
	// Delegates to the site transient API; whatever it yields is passed straight through.
	$cached_value = get_site_transient( $key );

	return $cached_value;
}
/**
* Utility function to cache a given data set at a given cache key.
*
* @since 5.9.0
*
* @param string $key The cache key under which to store the value.
* @param string $data The data to be stored at the given cache key.
* @return bool True when transient set. False if not set.
*/
private function set_cache( $key, $data = '' ) {
	/**
	 * Filters the cache expiration.
	 *
	 * Lets callers change how long, in seconds, the data retrieved for a URL
	 * stays cached. The default passed in is one hour.
	 *
	 * @since 5.9.0
	 *
	 * @param int $ttl The time until cache expiration in seconds.
	 */
	$expires_in = apply_filters( 'rest_url_details_cache_expiration', HOUR_IN_SECONDS );

	// Store the data as a site transient under the given key.
	return set_site_transient( $key, $data, $expires_in );
}
/**
* Retrieves the head element section.
*
* @since 5.9.0
*
* @param string $html The string of HTML to parse.
* @return string The `<head>..</head>` section on success. Given `$html` if not found.
*/
private function get_document_head( $html ) {
	// Find the opening `<head>` tag. The needle omits the closing `>` so that
	// an opening tag carrying attributes is found too.
	$head_start = strpos( $html, '<head' );
	if ( false === $head_start ) {
		// Didn't find it. Return the original HTML.
		return $html;
	}

	// Find the closing `</head>` tag, looking only after the opening tag.
	$head_end = strpos( $html, '</head>', $head_start );
	if ( false === $head_end ) {
		// Didn't find it. Fall back to the opening `<body>` tag as the end marker.
		$head_end = strpos( $html, '<body', $head_start );

		// Neither marker exists. Return the original HTML.
		if ( false === $head_end ) {
			return $html;
		}
	}

	// Extract from the opening tag up to (not including) the end marker.
	// The third argument of substr() is a length, hence the subtraction.
	// Then add the closing tag so the result is a complete head section.
	$head_html  = substr( $html, $head_start, $head_end - $head_start );
	$head_html .= '</head>';

	return $head_html;
}

/**
 * Gets all the meta tag elements that have a 'content' attribute.
 *
 * @since 5.9.0
 *
 * @param string $html The string of HTML to be parsed.
 * @return array {
 *     A multi-dimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 */
private function get_meta_with_content_elements( $html ) {
	/*
	 * Parse all meta elements with a content attribute.
	 *
	 * Why first search for the content attribute rather than directly searching for name=description element?
	 * tl;dr The content attribute's value will be truncated when it contains a > symbol.
	 *
	 * The content attribute's value (i.e. the description to get) can have HTML in it and be well-formed as
	 * it's a string to the browser. Imagine what happens when attempting to match for the name=description
	 * first. Hmm, if a > or /> symbol is in the content attribute's value, then it terminates the match
	 * as the element's closing symbol. But wait, it's in the content attribute and is not the end of the
	 * element. This is a limitation of using regex. It can't determine "wait a minute this is inside of quotation".
	 * If this happens, what gets matched is not the entire element or all of the content.
	 *
	 * Why not search for the name=description and then content="(.*)"?
	 * The attribute order could be opposite. Plus, additional attributes may exist including being between
	 * the name and content attributes.
	 *
	 * Why not lookahead?
	 * Lookahead is not constrained to stay within the element. The first meta element it finds may not
	 * include the name or content, but rather could be from a different element downstream.
	 */
	$pattern = '#<meta\s' .

		/*
		 * Allows for additional attributes before the content attribute.
		 * Searches for anything other than > symbol.
		 */
		'[^>]*' .

		/*
		 * Find the content attribute. When found, capture its value (.*).
		 *
		 * Allows for (a) single or double quotes and (b) whitespace in the value.
		 *
		 * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
		 * i.e \1, for the closing quotation mark?
		 * To ensure the closing quotation mark matches the opening one. Why? Attribute values
		 * can contain quotation marks, such as an apostrophe in the content.
		 */
		'content=(["\']??)(.*)\1' .

		/*
		 * Allows for additional attributes after the content attribute.
		 * Searches for anything other than > symbol.
		 */
		'[^>]*' .

		/*
		 * \/?> searches for the closing > symbol, which can be in either /> or > format.
		 * # ends the pattern.
		 */
		'\/?>#' .

		/*
		 * These are the options:
		 * - i : case insensitive
		 * - s : allows newline characters for the . match (needed for multiline elements)
		 * - U means non-greedy matching
		 */
		'isU';

	preg_match_all( $pattern, $html, $elements );

	return $elements;
}
/**
* Gets the metadata from a target meta element.
*
* @since 5.9.0
*
* @param array $meta_elements {
* A multi-dimensional indexed array on success, else empty array.
*
* @type string[] $0 Meta elements with a content attribute.
* @type string[] $1 Content attribute's opening quotation mark.
* @type string[] $2 Content attribute's value for each meta element.
* }
* @param string $attr Attribute that identifies the element with the target metadata.
* @param string $attr_value The attribute's value that identifies the element with the target metadata.
* @return string The metadata on success. Empty string if not found.
*/
private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
	// Without any captured meta elements there is nothing to inspect.
	if ( empty( $meta_elements[0] ) ) {
		return '';
	}

	// Matches the identifying attribute and its value inside a single meta element.
	// - The value may be unquoted, single-quoted or double-quoted, with optional
	//   whitespace around it.
	// - The opening quote is captured and back-referenced with \1 so the closing
	//   quote must be the same character; values may themselves contain the other
	//   kind of quote (an apostrophe, for instance).
	// - Modifiers: `i` = case-insensitive, `s` = dot matches newlines, `U` = ungreedy.
	$attribute_regex = '#' . $attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' . '#isU';

	foreach ( $meta_elements[0] as $position => $meta_tag ) {
		// Not the element we are after; keep looking.
		if ( ! preg_match( $attribute_regex, $meta_tag ) ) {
			continue;
		}

		// First matching element wins. Its content value sits at the same
		// position in the parallel array of captured content values.
		$has_content = isset( $meta_elements[2][ $position ] ) && is_string( $meta_elements[2][ $position ] );

		return $has_content ? trim( $meta_elements[2][ $position ] ) : '';
	}

	// No element carried the requested attribute/value pair.
	return '';
}
}