From 14c09349630ed2084691be0b460d3e53ca2ad6e8 Mon Sep 17 00:00:00 2001 From: Christoph Rohrer Date: Sun, 7 May 2017 12:12:37 +0200 Subject: [PATCH] fixes problems with redirects and SSL. Adds old crawling mechanism as fallback. This fixes weird redirect problems with FluxFM. --- package.json | 2 +- radioCrawler.js | 124 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 85 insertions(+), 41 deletions(-) diff --git a/package.json b/package.json index ad75916..b18e0ac 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "spotifyRadioPlaylist", - "version": "2.5.1", + "version": "2.5.2", "license": "MIT", "repository": { "type": "git", diff --git a/radioCrawler.js b/radioCrawler.js index 01e0011..4df2f95 100644 --- a/radioCrawler.js +++ b/radioCrawler.js @@ -2,13 +2,15 @@ * Created by chris on 06.01.16. */ "use strict"; +var http = require('http'); var https = require('https'); var Promise = require('bluebird'); var fs = require('fs'); var cheerio = require('cheerio'); +var URL = require('url'); var logger = require('./logger'); var Horseman = require('node-horseman'); -var horseman = new Horseman(); +var horseman = new Horseman({ignoreSSLErrors: true}); var config = JSON.parse(fs.readFileSync('config.json', 'utf8')); String.prototype.trimEx = function() {return this.trim().replace(/^\s?-\s/, '').toUpperCase()}; // we compare our strings later in uppercase @@ -41,55 +43,97 @@ function getTracks(playlistName, trackserviceUrl){ return new Promise((resolve, reject) => { console.log('getting tracks from radio trackservice'); horseman - .userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0') .open(url) .waitForSelector(playlistConfig.radioEntrySelector) .html() - .then((html) => { - let $ = cheerio.load(html), - searchInArtist = playlistConfig.removeFromArtistString || '', - searchInTitle = playlistConfig.removeFromTitleString || '', - tracks = []; - - $(playlistConfig.radioEntrySelector).each(function(i, elem){ - let $entry = $(this), - $title, // cheerio-object - $artist, // cheerio-object - title, // string - artist; // string - - if (playlistConfig.searchLinear) { - // Stations like the old page of ORF FM4 have strange markup and need linear search - $title = $entry.nextAll(playlistConfig.radioTitleSelector).first(); - $artist = $entry.nextAll(playlistConfig.radioArtistSelector).first(); - } else { - // Most other station playlists feature nested markup - $title = $entry.find(playlistConfig.radioTitleSelector); - $artist = $entry.find(playlistConfig.radioArtistSelector); - } - - title = $title.text().replace(searchInTitle, ''); - artist = $artist.text().replace(searchInArtist, ''); + .then(searchInHtml) + .then((tracks) => resolve(tracks)) + .close() + .catch(() => { + logger.log('error requesting trackservice using horseman.', playlistName); + try { + let httpx = (URL.parse(url).protocol === 'http:') ? http : https; + let trackserviceReq = httpx.request(url, function(res) { + let html = ''; + + if(res.statusCode === 302){ + console.log('following redirect to ' + res.headers.location); + resolve(getTracks(playlistName, res.headers.location)); + return; + } + if(res.statusCode !== 200){ + let error = 'Trackservice Error: Status '+res.statusCode; + logger.log(error, playlistName); + reject(error); + process.exit(1); + return; + } + + res.setEncoding('utf8'); + res.on('data', function (chunk) { + html += chunk; + }); + res.on('end', function() { + searchInHtml(html).then(resolve); + }); + }); - tracks.push({ - title: title, - artist: artist + trackserviceReq.on('error', function(e) { + logger.log('problem with trackservice request: ' + e.message, playlistName); + reject(); }); - }); - if(tracks.length === 0){ - logger.log('no tracks found on radio trackservice.', playlistName); - return reject(); + trackserviceReq.end(); + } + catch(e) { + logger.log('error requesting trackservice using http.', playlistName); + reject(); } - resolve(tracks); - }) - .close() - .catch(() => { - logger.log('error requesting trackservice.', playlistName); - reject(); }); }); + + function searchInHtml(html){ + return new Promise((resolve, reject) => { + let $ = cheerio.load(html), + searchInArtist = playlistConfig.removeFromArtistString || '', + searchInTitle = playlistConfig.removeFromTitleString || '', + tracks = []; + + $(playlistConfig.radioEntrySelector).each(function(i, elem){ + let $entry = $(this), + $title, // cheerio-object + $artist, // cheerio-object + title, // string + artist; // string + + if (playlistConfig.searchLinear) { + // Stations like the old page of ORF FM4 have strange markup and need linear search + $title = $entry.nextAll(playlistConfig.radioTitleSelector).first(); + $artist = $entry.nextAll(playlistConfig.radioArtistSelector).first(); + } else { + // Most other station playlists feature nested markup + $title = $entry.find(playlistConfig.radioTitleSelector); + $artist = $entry.find(playlistConfig.radioArtistSelector); + } + + title = $title.text().replace(searchInTitle, ''); + artist = $artist.text().replace(searchInArtist, ''); + + tracks.push({ + title: title, + artist: artist + }); + }); + + if(tracks.length === 0){ + logger.log('no tracks found on radio trackservice.', playlistName); + return reject(); + } + + resolve(tracks); + }); + } } function getOrfBroadcasts(broadcastsUrl){