Skip to content

Commit 5dd178a

Browse files
authored
Improve GA/consent manager detection by not restricting to curl and caching result (matomo-org#20008)
* Added persistent caching of site content detection data, changed explicit curl usage to getHTTPTransport(), code tidy and added additional comments * SiteContentDetector is no longer a singleton, caching is now done by required property, fixed mock and tests * Removed unused use
1 parent 151ebfd commit 5dd178a

File tree

3 files changed

+185
-53
lines changed

3 files changed

+185
-53
lines changed

core/SiteContentDetector.php

Lines changed: 177 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*/
99
namespace Piwik;
1010

11-
use Piwik\Container\StaticContainer;
11+
use Matomo\Cache\Lazy;
1212

1313
/**
1414
* This class provides detection functions for specific content on a site. It can be used to easily detect the
@@ -19,7 +19,7 @@
1919
*
2020
* Usage:
2121
*
22-
* $contentDetector = SiteContentDetector::getInstance();
22+
* $contentDetector = new SiteContentDetector();
2323
* $contentDetector->detectContent([SiteContentDetector::GA3]);
2424
* if ($contentDetector->ga3) {
2525
* // site is using GA3
@@ -46,20 +46,25 @@ class SiteContentDetector
4646
public $gtm; // True if GTM was detected on the site
4747

4848
private $siteData;
49-
private $siteId;
5049

51-
/**
52-
* @return SiteContentDetector
53-
*/
54-
public static function getInstance(): SiteContentDetector
50+
/** @var Lazy */
51+
private $cache;
52+
53+
public function __construct(?Lazy $cache = null)
5554
{
56-
return StaticContainer::get('Piwik\SiteContentDetector');
55+
if ($cache === null) {
56+
$this->cache = Cache::getLazyCache();
57+
} else {
58+
$this->cache = $cache;
59+
}
5760
}
5861

5962
/**
6063
* Reset the detection properties
64+
*
65+
* @return void
6166
*/
62-
private function resetDetectionProperties() : void
67+
private function resetDetectionProperties(): void
6368
{
6469
$this->consentManagerId = null;
6570
$this->consentManagerUrl = null;
@@ -78,55 +83,161 @@ private function resetDetectionProperties() : void
7883
* will speed up the detection check
7984
* @param ?int $idSite Override the site ID, will use the site from the current request if null
8085
* @param string|null $siteData String containing the site data to search, if blank then data will be retrieved
81-
* from the current request site via cURL
82-
* @param int $timeOut How long to wait for the site to response, defaults to 60 seconds
86+
* from the current request site via an http request
87+
* @param int $timeOut How long to wait for the site to response, defaults to 5 seconds
88+
* @return void
8389
*/
8490
public function detectContent(array $detectContent = [SiteContentDetector::ALL_CONTENT],
85-
?int $idSite = null, ?string $siteData = null, int $timeOut = 60)
91+
?int $idSite = null, ?string $siteData = null, int $timeOut = 5): void
8692
{
8793

88-
// If the site data was already retrieved and stored in this object and it is for the same site id and we're
89-
// not being passed a specific sitedata parameter then avoid making another request and just return
90-
if ($siteData === null && $this->siteData != null && $idSite == $this->siteId) {
91-
return;
92-
}
93-
9494
$this->resetDetectionProperties();
9595

96-
// No site data was passed or previously retrieved, so grab the current site main page as a string
97-
if ($siteData === null) {
98-
96+
// If site data was passed in, then just run the detection checks against it and return.
97+
if ($siteData) {
98+
$this->siteData = $siteData;
99+
$this->detectionChecks($detectContent);
100+
return;
101+
}
99102

100-
if ($idSite === null) {
101-
if (!isset($_REQUEST['idSite'])) {
102-
return;
103-
}
104-
$idSite = Common::getRequestVar('idSite', null, 'int');
105-
if (!$idSite) {
106-
return;
107-
}
103+
// Get the site id from the request object if not explicitly passed
104+
if ($idSite === null) {
105+
if (!isset($_REQUEST['idSite'])) {
106+
return;
108107
}
109-
110-
$url = Site::getMainUrlFor($idSite);
111-
if (!$url) {
108+
$idSite = Common::getRequestVar('idSite', null, 'int');
109+
if (!$idSite) {
112110
return;
113111
}
112+
}
114113

115-
try {
116-
$siteData = Http::sendHttpRequestBy('curl', $url, $timeOut, null, null,
117-
null, 0, false, true);
118-
} catch (\Exception $e) {
114+
// Check and load previously cached site content detection data if it exists
115+
$cacheKey = 'SiteContentDetector_'.$idSite;
116+
$requiredProperties = $this->getRequiredProperties($detectContent);
117+
$siteContentDetectionCache = $this->cache->fetch($cacheKey);
118+
if ($siteContentDetectionCache !== false) {
119+
if ($this->checkCacheHasRequiredProperties($requiredProperties, $siteContentDetectionCache)) {
120+
$this->loadRequiredPropertiesFromCache($requiredProperties, $siteContentDetectionCache);
121+
return;
119122
}
120-
121123
}
122124

125+
// No cache hit, no passed data, so make a request for the site content
126+
$siteData = $this->requestSiteData($idSite, $timeOut);
127+
123128
// Abort if still no site data
124129
if ($siteData === null || $siteData === '') {
125130
return;
126131
}
127132

128133
$this->siteData = $siteData;
129-
$this->siteId = $idSite;
134+
135+
// We now have site data to analyze, so run the detection checks
136+
$this->detectionChecks($detectContent);
137+
138+
// A request was made to get this data and it isn't currently cached, so write it to the cache now
139+
$cacheLife = (60 * 60 * 24 * 7);
140+
$this->savePropertiesToCache($cacheKey, $requiredProperties, $cacheLife);
141+
}
142+
143+
/**
144+
* Returns an array of properties required by the detect content array
145+
*
146+
* @param array $detectContent
147+
*
148+
* @return array
149+
*/
150+
private function getRequiredProperties(array $detectContent): array
151+
{
152+
$requiredProperties = [];
153+
if (in_array(SiteContentDetector::CONSENT_MANAGER, $detectContent) || in_array(SiteContentDetector::ALL_CONTENT, $detectContent)) {
154+
$requiredProperties = array_merge($requiredProperties, ['consentManagerId', 'consentManagerName', 'consentManagerUrl', 'isConnected']);
155+
}
156+
if (in_array(SiteContentDetector::GA3, $detectContent) || in_array(SiteContentDetector::ALL_CONTENT, $detectContent)) {
157+
$requiredProperties[] = 'ga3';
158+
}
159+
if (in_array(SiteContentDetector::GA4, $detectContent) || in_array(SiteContentDetector::ALL_CONTENT, $detectContent)) {
160+
$requiredProperties[] = 'ga4';
161+
}
162+
if (in_array(SiteContentDetector::GTM, $detectContent) || in_array(SiteContentDetector::ALL_CONTENT, $detectContent)) {
163+
$requiredProperties[] = 'gtm';
164+
}
165+
166+
return $requiredProperties;
167+
}
168+
169+
/**
170+
* Checks that all required properties are in the cache array
171+
*
172+
* @param array $properties
173+
* @param array $cache
174+
*
175+
* @return bool
176+
*/
177+
private function checkCacheHasRequiredProperties(array $properties, array $cache): bool
178+
{
179+
foreach ($properties as $prop) {
180+
if (!array_key_exists($prop, $cache)) {
181+
return false;
182+
}
183+
}
184+
return true;
185+
}
186+
187+
/**
188+
* Load object properties from the cache array
189+
*
190+
* @param array $properties
191+
* @param array $cache
192+
*
193+
* @return void
194+
*/
195+
private function loadRequiredPropertiesFromCache(array $properties, array $cache): void
196+
{
197+
foreach ($properties as $prop) {
198+
if (!array_key_exists($prop, $cache)) {
199+
continue;
200+
}
201+
$this->{$prop} = $cache[$prop];
202+
}
203+
}
204+
205+
/**
206+
* Save properties to the cache
207+
*
208+
* @param string $cacheKey
209+
* @param array $properties
210+
* @param int $cacheLife
211+
*
212+
* @return void
213+
*/
214+
private function savePropertiesToCache(string $cacheKey, array $properties, int $cacheLife): void
215+
{
216+
217+
$cacheData = [];
218+
219+
// Load any existing cached values
220+
$siteContentDetectionCache = $this->cache->fetch($cacheKey);
221+
if (is_array($siteContentDetectionCache)) {
222+
$cacheData = $siteContentDetectionCache;
223+
}
224+
225+
foreach ($properties as $prop) {
226+
$cacheData[$prop] = $this->{$prop};
227+
}
228+
229+
$this->cache->save($cacheKey, $cacheData, $cacheLife);
230+
}
231+
232+
/**
233+
* Run various detection checks for site content
234+
*
235+
* @param array $detectContent Array of detection types used to filter the checks that are run
236+
*
237+
* @return void
238+
*/
239+
private function detectionChecks($detectContent): void
240+
{
130241

131242
if (in_array(SiteContentDetector::CONSENT_MANAGER, $detectContent) || in_array(SiteContentDetector::ALL_CONTENT, $detectContent)) {
132243
$this->detectConsentManager();
@@ -143,7 +254,30 @@ public function detectContent(array $detectContent = [SiteContentDetector::ALL_C
143254
if (in_array(SiteContentDetector::GTM, $detectContent) || in_array(SiteContentDetector::ALL_CONTENT, $detectContent)) {
144255
$this->detectGTM();
145256
}
257+
}
146258

259+
/**
260+
* Retrieve data from the specified site using an HTTP request
261+
*
262+
* @param int $idSite
263+
* @param int $timeOut
264+
*
265+
* @return string|null
266+
*/
267+
private function requestSiteData(int $idSite, int $timeOut): ?string
268+
{
269+
$siteData = null;
270+
$url = Site::getMainUrlFor($idSite);
271+
if (!$url) {
272+
return null;
273+
}
274+
275+
try {
276+
$siteData = Http::sendHttpRequestBy(Http::getTransportMethod(), $url, $timeOut, null, null,
277+
null, 0, false, true);
278+
} catch (\Exception $e) {
279+
}
280+
return $siteData;
147281
}
148282

149283
/**
@@ -153,7 +287,7 @@ public function detectContent(array $detectContent = [SiteContentDetector::ALL_C
153287
*
154288
* @return void
155289
*/
156-
private function detectConsentManager() : void
290+
private function detectConsentManager(): void
157291
{
158292

159293
$defs = SiteContentDetector::getConsentManagerDefinitions();
@@ -191,7 +325,7 @@ private function detectConsentManager() : void
191325
*
192326
* @return void
193327
*/
194-
private function detectGA3() : void
328+
private function detectGA3(): void
195329
{
196330
if (strpos($this->siteData, '(i,s,o,g,r,a,m)') !== false) {
197331
$this->ga3 = true;
@@ -203,7 +337,7 @@ private function detectGA3() : void
203337
*
204338
* @return void
205339
*/
206-
private function detectGA4() : void
340+
private function detectGA4(): void
207341
{
208342
if (strpos($this->siteData, 'gtag.js') !== false) {
209343
$this->ga4 = true;
@@ -215,7 +349,7 @@ private function detectGA4() : void
215349
*
216350
* @return void
217351
*/
218-
private function detectGTM() : void
352+
private function detectGTM(): void
219353
{
220354
if (strpos($this->siteData, 'gtm.js') !== false) {
221355
$this->gtm = true;
@@ -228,7 +362,7 @@ private function detectGTM() : void
228362
*
229363
* @return array[]
230364
*/
231-
public static function getConsentManagerDefinitions() : array
365+
public static function getConsentManagerDefinitions(): array
232366
{
233367
return [
234368

tests/PHPUnit/Framework/Mock/FakeSiteContentDetector.php

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,17 @@ class FakeSiteContentDetector extends SiteContentDetector
1212
public function __construct($mockData = [])
1313
{
1414
$this->mockData = $mockData;
15+
parent::__construct(null);
1516
}
1617

1718
public function detectContent(array $detectContent = [SiteContentDetector::ALL_CONTENT],
18-
?int $idSite = null, ?string $siteData = null, int $timeOut = 60)
19+
?int $idSite = null, ?string $siteData = null, int $timeOut = 60): void
1920
{
20-
2121
foreach ($this->mockData as $property => $value) {
2222
if (property_exists($this, $property)) {
2323
$this->{$property} = $value;
2424
}
2525
}
26-
27-
return true;
2826
}
2927

3028
}

tests/PHPUnit/Unit/SiteContentDetectorTest.php

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public function test_detectsConsentManager_NotConnected()
2121
{
2222
$siteData = '<html lang="en"><head><title>A site</title></head><script src="https://osano.com/uhs9879874hthg.js"></script></head><body>A site</body></html>';
2323

24-
$scd = SiteContentDetector::getInstance();
24+
$scd = new SiteContentDetector();
2525
$scd->detectContent([SiteContentDetector::ALL_CONTENT], null, $siteData);
2626

2727
$this->assertEquals('osano', $scd->consentManagerId);
@@ -31,7 +31,7 @@ public function test_detectsConsentManager_NotConnected()
3131
public function test_detectsConsentManager_Connected()
3232
{
3333
$siteData = "<html lang='en'><head><title>A site</title></head><script src='https://osano.com/uhs9879874hthg.js'></script><script>Osano.cm.addEventListener('osano-cm-consent-changed', (change) => { console.log('cm-change'); consentSet(change); });</script></><body>A site</body></html>";
34-
$scd = SiteContentDetector::getInstance();
34+
$scd = new SiteContentDetector();
3535
$scd->detectContent([SiteContentDetector::ALL_CONTENT], null, $siteData);
3636

3737
$this->assertEquals('osano', $scd->consentManagerId);
@@ -48,7 +48,7 @@ public function test_detectsGA3_IfPresent()
4848
ga('create', 'UA-xxxxxxxx-x', 'xxxxxx.com');
4949
ga('send', 'pageview');
5050
</script></head><body>A site</body></html>";
51-
$scd = SiteContentDetector::getInstance();
51+
$scd = new SiteContentDetector();
5252
$scd->detectContent([SiteContentDetector::GA3], null, $siteData);
5353

5454
$this->assertEmpty($scd->consentManagerId);
@@ -68,7 +68,7 @@ function gtag(){window.dataLayer.push(arguments);}
6868
gtag('config', 'GA_TRACKING_ID');
6969
</script>
7070
</head><body>A site</body></html>";
71-
$scd = SiteContentDetector::getInstance();
71+
$scd = new SiteContentDetector();
7272
$scd->detectContent([SiteContentDetector::GA4], null, $siteData);
7373

7474
$this->assertFalse($scd->ga3);
@@ -87,7 +87,7 @@ public function test_detectsGTM_IfPresent()
8787
})(window,document,'script','dataLayer','GTM-NRTVJJC');</script>
8888
<!-- End Google Tag Manager -->
8989
</head><body>A site</body></html>";
90-
$scd = SiteContentDetector::getInstance();
90+
$scd = new SiteContentDetector();
9191
$scd->detectContent([SiteContentDetector::GTM], null, $siteData);
9292

9393
$this->assertFalse($scd->ga3);
@@ -98,7 +98,7 @@ public function test_detectsGTM_IfPresent()
9898
public function test_doesNotDetectsGA_IfNotPresent()
9999
{
100100
$siteData = "<html lang=\"en\"><head><title>A site</title><script><script>console.log('abc');</script></head><body>A site</body></html>";
101-
$scd = SiteContentDetector::getInstance();
101+
$scd = new SiteContentDetector();
102102
$scd->detectContent([SiteContentDetector::ALL_CONTENT], null, $siteData);
103103

104104
$this->assertFalse($scd->ga3);

0 commit comments

Comments
 (0)