Skip to content

Commit

Permalink
Merge pull request #519 from brianlmoon/all_ldjson_data
Browse files Browse the repository at this point in the history
Load all JSON-LD data
  • Loading branch information
oscarotero committed Jun 28, 2023
2 parents 4913408 + 631ea03 commit 530593a
Show file tree
Hide file tree
Showing 80 changed files with 4,402 additions and 1,389 deletions.
60 changes: 55 additions & 5 deletions src/LinkedData.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
namespace Embed;

use Exception;
use ML\JsonLD\JsonLD;
use ML\JsonLD\Document as LdDocument;
use ML\JsonLD\DocumentInterface;
use ML\JsonLD\GraphInterface;
Expand All @@ -16,7 +17,9 @@ class LinkedData

private ?DocumentInterface $document;

private function get(string ...$keys)
private array $allData;

public function get(string ...$keys)
{
$graph = $this->getGraph();

Expand All @@ -39,6 +42,15 @@ private function get(string ...$keys)
return null;
}

/**
 * Returns the contents of the $allData property, populating it on first use.
 *
 * When the property has not been assigned yet, fetchData() is invoked so
 * it can fill it; later calls reuse the stored value without fetching again.
 *
 * @return array The stored linked-data entries, or an empty array when
 *               nothing was stored
 */
public function getAll(): array
{
    if (!isset($this->allData)) {
        $this->fetchData();
    }

    // fetchData() is protected and may be overridden; an override that never
    // assigns $allData would otherwise trigger an "accessed before
    // initialization" Error on this typed property. `??` reads it safely.
    return $this->allData ?? [];
}

private function getGraph(string $name = null): ?GraphInterface
{
if (!isset($this->document)) {
Expand All @@ -50,20 +62,58 @@ private function getGraph(string $name = null): ?GraphInterface
}
}

return $this->document->getGraph();
return $this->document->getGraph($name);
}

protected function fetchData(): array
{
$this->allData = [];

$document = $this->extractor->getDocument();
$content = $document->select('.//script', ['type' => 'application/ld+json'])->str();
$nodes = $document->select('.//script', ['type' => 'application/ld+json'])->strAll();

if (empty($content)) {
if (empty($nodes)) {
return [];
}

try {
return json_decode($content, true) ?: [];
$data = [];
$request_uri = (string)$this->extractor->getUri();
foreach ($nodes as $node) {
$ldjson = json_decode($node, true);
if (!empty($ldjson)) {

// some pages with multiple ld+json blocks will put
// each block into an array (Flickr does this). Most
// appear to put an object in each ld+json block. To
// prevent them from stepping on one another, the ones
// that are not arrays will be put into an array.
if (!array_is_list($ldjson)) {
$ldjson = [$ldjson];
}

foreach ($ldjson as $node) {
if (empty($data)) {
$data = $node;
} elseif (isset($node['mainEntityOfPage'])) {
$url = '';
if (is_string($node['mainEntityOfPage'])) {
$url = $node['mainEntityOfPage'];
} elseif (isset($node['mainEntityOfPage']['@id'])) {
$url = $node['mainEntityOfPage']['@id'];
}
if (!empty($url) && $url == $request_uri) {
$data = $node;
}
}
}


$this->allData = array_merge($this->allData, $ldjson);
}
}

return $data;
} catch (Exception $exception) {
return [];
}
Expand Down
24 changes: 23 additions & 1 deletion src/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function clean(string $value, bool $allowHTML = false): ?string
$value = trim($value);

if (!$allowHTML) {
$value = html_entity_decode($value);
$value = html_entity_decode($value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401);
$value = strip_tags($value);
}

Expand Down Expand Up @@ -154,3 +154,25 @@ function isEmpty(mixed ...$values): bool

return false;
}

if (!function_exists('array_is_list')) {
    /**
     * Polyfill for https://www.php.net/manual/en/function.array-is-list.php
     * (the native function is only available in PHP 8.1+).
     *
     * An array counts as a list when, in iteration order, its keys are
     * exactly the integers 0, 1, 2, ... with no gaps. An empty array is
     * a list.
     *
     * @param array $array The array to inspect
     *
     * @return bool True when $array is a list, false otherwise
     */
    function array_is_list(array $array): bool
    {
        $expected = 0;

        foreach ($array as $key => $unused) {
            // Strict comparison: the key must be the integer at this position.
            if ($key !== $expected) {
                return false;
            }

            $expected++;
        }

        return true;
    }
}
2 changes: 1 addition & 1 deletion tests/PagesTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ public function testSoundCloud()

public function testSpotify()
{
$this->assertEmbed('https://play.spotify.com/album/7s66wU1XJ2NsUuWM2NKiUV');
$this->assertEmbed('https://open.spotify.com/album/7s66wU1XJ2NsUuWM2NKiUV');
$this->assertEmbed('https://play.spotify.com/album/7s66wU1XJ2NsUuWM2NKiUV');
}

public function testTwitch()
Expand Down
1 change: 1 addition & 0 deletions tests/PagesTestCase.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ private static function getData(Extractor $extractor): array
if (method_exists($extractor, 'getApi')) {
$data['api'] = $extractor->getApi()->all();
}
$data['allLinkedData'] = $extractor->getLinkedData()->getAll();

return $data;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?php
declare(strict_types = 1);

return [
'headers' => [
'content-type' => [
'text/html; charset=utf-8'
],
'cache-control' => [
'no-cache, no-store, max-age=0, must-revalidate'
],
'pragma' => [
'no-cache'
],
'expires' => [
'Mon, 01 Jan 1990 00:00:00 GMT'
],
'date' => [
'Sun, 04 Apr 2021 15:20:47 GMT'
],
'p3p' => [
'CP="This is not a P3P policy! See g.co/p3phelp for more info."'
],
'content-security-policy' => [
'script-src \'nonce-g9/eaYJePAYsVh50Jyl0EQ\' \'unsafe-inline\';object-src \'none\';base-uri \'self\';report-uri /_/IdentityNotFoundHttp/cspreport;worker-src \'self\'',
'script-src \'nonce-g9/eaYJePAYsVh50Jyl0EQ\' \'self\' https://apis.google.com https://ssl.gstatic.com https://www.google.com https://www.gstatic.com https://www.google-analytics.com;report-uri /_/IdentityNotFoundHttp/cspreport'
],
'content-encoding' => [
'gzip'
],
'server' => [
'ESF'
],
'x-xss-protection' => [
'0'
],
'x-content-type-options' => [
'nosniff'
],
'set-cookie' => [
'NID=212=dAlC8GKROGQ6cWC5EeQ92vga0m4ReROz1kMl9BrboOg7GfaE3zvV7pmmgCmsXsJ7vya8tJGI4jioBfUTai-FbFjJPm264-_PY9-GEu66UJhCsRvBiDJVz3O5Ckjox4e0LsT9RZ2vuLADiJTrbw7nzwn4qwyWUF3duIq6_ZUnLhA; expires=Mon, 04-Oct-2021 15:20:47 GMT; path=/; domain=.google.es; Secure; HttpOnly; SameSite=none'
],
'alt-svc' => [
'h3-29=":443"; ma=2592000,h3-T051=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"'
],
'Content-Location' => [
'https://consent.google.es/\'https:/consent.google.es/ml?continue=https://www.google.es/maps/place/Tordoia,%2BA%2BCoru%25C3%25B1a/@43.0871207,-8.5710004,12z/data%3D!3m1!4b1!4m2!3m1!1s0xd2ef4006f1ef489:0x404f58273ca55a0&gl=ES&hl=es&pc=m&src=1&rffu=true\''
],
'X-Request-Time' => [
'0.112 ms'
]
],
'statusCode' => 404,
'reasonPhrase' => 'Not Found',
'body' => '<html lang=en><meta charset=utf-8><meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width"><title>Error 404 (No se ha encontrado.)!!1</title><style nonce="g9/eaYJePAYsVh50Jyl0EQ">*{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{color:#222;text-align:unset;margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px;}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}pre{white-space:pre-wrap;}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}</style><div id="af-error-container"><a href=//www.google.com><span id=logo aria-label=Google></span></a><p><b>404.</b> <ins>Se trata de un error.</ins><p>No se ha encontrado la URL solicitada en este servidor. <ins>Esa es toda la información de la que disponemos.</ins></div>'
];
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
<?php
declare(strict_types = 1);

return [
'headers' => [
'content-type' => [
'text/html; charset=utf-8'
],
'cache-control' => [
'no-cache, no-store, max-age=0, must-revalidate'
],
'pragma' => [
'no-cache'
],
'expires' => [
'Mon, 01 Jan 1990 00:00:00 GMT'
],
'date' => [
'Sun, 04 Apr 2021 15:23:17 GMT'
],
'content-security-policy' => [
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'unsafe-inline\';object-src \'none\';base-uri \'self\';report-uri /_/IdentityNotFoundHttp/cspreport;worker-src \'self\'',
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'self\' https://apis.google.com https://ssl.gstatic.com https://www.google.com https://www.gstatic.com https://www.google-analytics.com;report-uri /_/IdentityNotFoundHttp/cspreport'
],
'content-encoding' => [
'gzip'
],
'server' => [
'ESF'
],
'x-xss-protection' => [
'0'
],
'x-content-type-options' => [
'nosniff'
],
'alt-svc' => [
'h3-29=":443"; ma=2592000,h3-T051=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"'
],
'Content-Location' => [
'https://consent.youtube.com/\'https:/consent.youtube.com/ml?continue=https://www.youtube.com/channel/UCuZeHD5SGecQomz2pVDHGzg&gl=ES&hl=es&pc=yt&uxe=23983172&src=1&rffu=true\''
],
'X-Request-Time' => [
'0.127 ms'
]
],
'statusCode' => 404,
'reasonPhrase' => 'Not Found',
'body' => '<html lang=en><meta charset=utf-8><meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width"><title>Error 404 (No se ha encontrado.)!!1</title><style nonce="vsx63Xooae1XpA8j6kEGNA">*{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{color:#222;text-align:unset;margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px;}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}pre{white-space:pre-wrap;}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}</style><div id="af-error-container"><a href=//www.google.com><span id=logo aria-label=Google></span></a><p><b>404.</b> <ins>Se trata de un error.</ins><p>No se ha encontrado la URL solicitada en este servidor. <ins>Esa es toda la información de la que disponemos.</ins></div>'
];
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
<?php
declare(strict_types = 1);

return [
'headers' => [
'content-type' => [
'text/html; charset=utf-8'
],
'cache-control' => [
'no-cache, no-store, max-age=0, must-revalidate'
],
'pragma' => [
'no-cache'
],
'expires' => [
'Mon, 01 Jan 1990 00:00:00 GMT'
],
'date' => [
'Sun, 04 Apr 2021 15:23:17 GMT'
],
'content-security-policy' => [
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'unsafe-inline\';object-src \'none\';base-uri \'self\';report-uri /_/IdentityNotFoundHttp/cspreport;worker-src \'self\'',
'script-src \'nonce-vsx63Xooae1XpA8j6kEGNA\' \'self\' https://apis.google.com https://ssl.gstatic.com https://www.google.com https://www.gstatic.com https://www.google-analytics.com;report-uri /_/IdentityNotFoundHttp/cspreport'
],
'content-encoding' => [
'gzip'
],
'server' => [
'ESF'
],
'x-xss-protection' => [
'0'
],
'x-content-type-options' => [
'nosniff'
],
'alt-svc' => [
'h3-29=":443"; ma=2592000,h3-T051=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"'
],
'Content-Location' => [
'https://consent.youtube.com/\'https:/consent.youtube.com/ml?continue=https://www.youtube.com/channel/UCuZeHD5SGecQomz2pVDHGzg&gl=ES&hl=es&pc=yt&uxe=23983172&src=1&rffu=true\''
],
'X-Request-Time' => [
'0.127 ms'
]
],
'statusCode' => 404,
'reasonPhrase' => 'Not Found',
'body' => '<html lang=en><meta charset=utf-8><meta name=viewport content="initial-scale=1, minimum-scale=1, width=device-width"><title>Error 404 (No se ha encontrado.)!!1</title><style nonce="vsx63Xooae1XpA8j6kEGNA">*{margin:0;padding:0}html,code{font:15px/22px arial,sans-serif}html{background:#fff;color:#222;padding:15px}body{color:#222;text-align:unset;margin:7% auto 0;max-width:390px;min-height:180px;padding:30px 0 15px;}* > body{background:url(//www.google.com/images/errors/robot.png) 100% 5px no-repeat;padding-right:205px}p{margin:11px 0 22px;overflow:hidden}pre{white-space:pre-wrap;}ins{color:#777;text-decoration:none}a img{border:0}@media screen and (max-width:772px){body{background:none;margin-top:0;max-width:none;padding-right:0}}#logo{background:url(//www.google.com/images/branding/googlelogo/1x/googlelogo_color_150x54dp.png) no-repeat;margin-left:-5px}@media only screen and (min-resolution:192dpi){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat 0% 0%/100% 100%;-moz-border-image:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) 0}}@media only screen and (-webkit-min-device-pixel-ratio:2){#logo{background:url(//www.google.com/images/branding/googlelogo/2x/googlelogo_color_150x54dp.png) no-repeat;-webkit-background-size:100% 100%}}#logo{display:inline-block;height:54px;width:150px}</style><div id="af-error-container"><a href=//www.google.com><span id=logo aria-label=Google></span></a><p><b>404.</b> <ins>Se trata de un error.</ins><p>No se ha encontrado la URL solicitada en este servidor. <ins>Esa es toda la información de la que disponemos.</ins></div>'
];
Loading

0 comments on commit 530593a

Please sign in to comment.