forked from simplehtmldom/simplehtmldom
-
Notifications
You must be signed in to change notification settings - Fork 0
/
HtmlWeb.php
134 lines (110 loc) · 3.07 KB
/
HtmlWeb.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
<?php namespace simplehtmldom;
/**
* Website: http://sourceforge.net/projects/simplehtmldom/
* Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
*
* Licensed under The MIT License
* See the LICENSE file in the project root for more information.
*
* Authors:
* S.C. Chen
* John Schlick
* Rus Carroll
* logmanoriginal
*
* Contributors:
* Yousuke Kumakura
* Vadim Voituk
* Antcs
*
* Version $Rev$
*/
include_once 'HtmlDocument.php';
class HtmlWeb {
    /**
     * Download a web page and parse it into a DOM.
     *
     * Only absolute HTTP and HTTPS URLs are accepted. The page is fetched with
     * cURL when the extension is loaded, otherwise through the fopen URL
     * wrappers when allow_url_fopen is enabled.
     *
     * @param string $url Absolute URL of the page to load
     * @return HtmlDocument|null The DOM for the web page, or null if
     * - the URL does not pass FILTER_VALIDATE_URL or has no scheme,
     * - the URL does not specify the HTTP or HTTPS protocol,
     * - the cURL extension is not loaded and allow_url_fopen=Off (an error
     *   is logged in this case),
     * - the transfer fails, the final status code is not 200, or the
     *   response body is larger than MAX_FILE_SIZE.
     */
    public function load($url)
    {
        if(!filter_var($url, FILTER_VALIDATE_URL)) {
            return null;
        }

        $scheme = parse_url($url, PHP_URL_SCHEME);

        if(!$scheme) {
            return null;
        }

        // Refuse everything except plain web protocols (no file://, ftp://, ...)
        if(!in_array(strtolower($scheme), array('http', 'https'), true)) {
            return null;
        }

        if(extension_loaded('curl')) {
            return $this->load_curl($url);
        }

        if(ini_get('allow_url_fopen')) {
            return $this->load_fopen($url);
        }

        error_log(__FUNCTION__ . ' requires either the cURL extension or allow_url_fopen=On in php.ini');

        return null;
    }

    /**
     * cURL implementation of load
     *
     * @param string $url URL that was already validated by load()
     * @return HtmlDocument|null The DOM for the web page, or null if the
     * handle cannot be created, the transfer fails, the final status code is
     * not 200, or the body is larger than MAX_FILE_SIZE.
     */
    private function load_curl($url)
    {
        $ch = curl_init();

        if($ch === false) {
            return null;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);

        // There is no guarantee this request will be fulfilled
        // -- https://www.php.net/manual/en/function.curl-setopt.php
        curl_setopt($ch, CURLOPT_BUFFERSIZE, MAX_FILE_SIZE);

        // These headers are preferences only, the server may ignore them
        $header = array(
            'Accept: text/html', // Prefer HTML format
            'Accept-Charset: utf-8', // Prefer UTF-8 encoding
        );

        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

        $doc = curl_exec($ch);
        $code = curl_getinfo($ch, CURLINFO_RESPONSE_CODE);

        // Release the handle before any early return so every path closes it
        curl_close($ch);

        // curl_exec() returns false on failure, which can still happen after
        // a 200 status line was received (e.g. the connection drops mid-body)
        if($doc === false || $code !== 200) {
            return null;
        }

        if(strlen($doc) > MAX_FILE_SIZE) {
            return null;
        }

        return new HtmlDocument($doc);
    }

    /**
     * fopen implementation of load
     *
     * @param string $url URL that was already validated by load()
     * @return HtmlDocument|null The DOM for the web page, or null if the
     * request fails, the final status code is not 200, or the body is larger
     * than MAX_FILE_SIZE.
     */
    private function load_fopen($url)
    {
        // These headers are preferences only, the server may ignore them
        $context = stream_context_create(array('http' => array(
            'header' => array(
                'Accept: text/html', // Prefer HTML format
                'Accept-Charset: utf-8', // Prefer UTF-8 encoding
            ),
            'ignore_errors' => true // Always fetch content
        )));

        // Read one byte more than allowed so oversized bodies can be detected
        $doc = file_get_contents($url, false, $context, 0, MAX_FILE_SIZE + 1);

        // file_get_contents() returns false on failure (e.g. the host cannot
        // be resolved); there is nothing to parse in that case
        if($doc === false) {
            return null;
        }

        if(isset($http_response_header)) {
            foreach($http_response_header as $rh) {
                // https://stackoverflow.com/a/1442526
                $parts = explode(' ', $rh, 3);

                if(isset($parts[1]) && preg_match('/HTTP\/\d\.\d/', $parts[0])) {
                    $code = $parts[1];
                }
            } // Last code is final status

            if(!isset($code) || $code !== '200') {
                return null;
            }
        }

        if(strlen($doc) > MAX_FILE_SIZE) {
            return null;
        }

        return new HtmlDocument($doc);
    }
}