Skip to content

Commit

Permalink
Parse updated IUAM Javascript challenge
Browse files Browse the repository at this point in the history
  • Loading branch information
Anorov committed Apr 5, 2018
1 parent 22400ee commit 798f167
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 21 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Dependencies

* Python 2.6 - 3.x
* **[Requests](https://github.com/kennethreitz/requests)** >= 2.0
* **Node.js** is required for (safe) Javascript execution.
* **[Node.js](https://nodejs.org/)**
* Your computer or server may already have it (check with `node -v`). If not, you can install it with `apt-get install nodejs` on Ubuntu and Debian. Otherwise, please read [Node's installation instructions](https://nodejs.org/en/download/package-manager/).

`python setup.py install` will install the Python dependencies automatically. Node is the only application you need to install yourself.
Expand Down Expand Up @@ -86,7 +86,7 @@ Unfortunately, not all of Requests' session attributes are easily transferable,

Normally, when a browser is faced with a Cloudflare IUAM challenge page, Cloudflare requires the browser to wait 5 seconds before submitting the challenge answer. If a website is under heavy load, the submitted answer may sometimes be rejected. One solution is to increase the delay (perhaps to 10 or 15 seconds, depending on the website). If you would like to override this delay, pass the `delay` keyword argument to `create_scraper()` or `CloudflareScraper()`.

There is no need to override this delay unless cloudflare-scrape is generates an error recommending you increase the delay.
There is no need to override this delay unless cloudflare-scrape generates an error recommending you increase the delay.

```python
scraper = cfscrape.create_scraper(delay=10)
Expand Down
36 changes: 18 additions & 18 deletions cfscrape/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@
except ImportError:
from urllib.parse import urlparse

__version__ = "1.9.4"
__version__ = "1.9.5"

DEFAULT_USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0"
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/65.0.3325.181 Chrome/65.0.3325.181 Safari/537.36",
"Mozilla/5.0 (Linux; Android 7.0; Moto G (5) Build/NPPS25.137-93-8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.137 Mobile Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:59.0) Gecko/20100101 Firefox/59.0",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"
]

DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
Expand All @@ -34,16 +36,16 @@
ANSWER_ACCEPT_ERROR = """\
The challenge answer was not properly accepted by Cloudflare. This can occur if \
the target website is under heavy load, or if Cloudflare is experiencing issues. You can
potentially resolve this by increasing the challenge answer delay (default: 5 seconds). \
For example: cfscrape.create_scraper(delay=10)
potentially resolve this by increasing the challenge answer delay (default: 8 seconds). \
For example: cfscrape.create_scraper(delay=15)
If increasing the delay does not help, please open a GitHub issue at \
https://github.com/Anorov/cloudflare-scrape/issues\
"""

class CloudflareScraper(Session):
def __init__(self, *args, **kwargs):
self.delay = kwargs.pop("delay", 5)
self.delay = kwargs.pop("delay", 8)
super(CloudflareScraper, self).__init__(*args, **kwargs)

if "requests" in self.headers["User-Agent"]:
Expand All @@ -64,8 +66,6 @@ def request(self, method, url, *args, **kwargs):
# Check if Cloudflare anti-bot is on
if self.is_cloudflare_challenge(resp):
resp = self.solve_cf_challenge(resp, **kwargs)
if self.is_cloudflare_challenge(resp):
raise ValueError(ANSWER_ACCEPT_ERROR)

return resp

Expand Down Expand Up @@ -94,7 +94,7 @@ def solve_cf_challenge(self, resp, **original_kwargs):
raise ValueError("Unable to parse Cloudflare anti-bots page: %s %s" % (e.message, BUG_REPORT))

# Solve the Javascript challenge
params["jschl_answer"] = str(self.solve_challenge(body) + len(domain))
params["jschl_answer"] = self.solve_challenge(body, domain)

# Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
Expand All @@ -109,21 +109,21 @@ def solve_cf_challenge(self, resp, **original_kwargs):
return self.request(method, redirect_url, **original_kwargs)
return self.request(method, redirect.headers["Location"], **original_kwargs)

def solve_challenge(self, body):
def solve_challenge(self, body, domain):
try:
js = re.search(r"setTimeout\(function\(\){\s+(var "
"s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n", body).group(1)
except Exception:
raise ValueError("Unable to identify Cloudflare IUAM Javascript on website. %s" % BUG_REPORT)

js = re.sub(r"a\.value = (parseInt\(.+?\)).+", r"\1", js)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js)
js = re.sub(r"a\.value = (.+ \+ t\.length).+", r"\1", js)
js = re.sub(r"\s{3,}[a-z](?: = |\.).+", "", js).replace("t.length", str(len(domain)))

# Strip characters that could be used to exit the string context
# These characters are not currently used in Cloudflare's arithmetic snippet
js = re.sub(r"[\n\\']", "", js)

if "parseInt" not in js:
if "toFixed" not in js:
raise ValueError("Error parsing Cloudflare IUAM Javascript challenge. %s" % BUG_REPORT)

# Use vm.runInNewContext to safely evaluate code
Expand All @@ -134,15 +134,15 @@ def solve_challenge(self, body):
result = subprocess.check_output(["node", "-e", js]).strip()
except OSError as e:
if e.errno == 2:
raise EnvironmentError("Missing Node.js runtime. Node is required. Please read the cfscrape"
raise EnvironmentError("Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cfscrape"
" README's Dependencies section: https://github.com/Anorov/cloudflare-scrape#dependencies.")
raise
except Exception:
logging.error("Error executing Cloudflare IUAM Javascript. %s" % BUG_REPORT)
raise

try:
result = int(result)
float(result)
except Exception:
raise ValueError("Cloudflare IUAM challenge returned unexpected answer. %s" % BUG_REPORT)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@
url = 'https://github.com/Anorov/cloudflare-scrape',
keywords = ['cloudflare', 'scraping'],
include_package_data = True,
install_requires = ['PyExecJS >= 1.4.0', 'requests >= 2.0.0']
install_requires = ['requests >= 2.0.0']
)

0 comments on commit 798f167

Please sign in to comment.