Skip to content

Commit

Permalink
fixes problems with redirects and SSL. Adds old crawling mechanism as…
Browse files Browse the repository at this point in the history
… fallback. This fixes weird redirect problems with FluxFM.
  • Loading branch information
crohrer committed May 7, 2017
1 parent 3a6027a commit 14c0934
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 41 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "spotifyRadioPlaylist",
"version": "2.5.1",
"version": "2.5.2",
"license": "MIT",
"repository": {
"type": "git",
Expand Down
124 changes: 84 additions & 40 deletions radioCrawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
* Created by chris on 06.01.16.
*/
"use strict";
var http = require('http');
var https = require('https');
var Promise = require('bluebird');
var fs = require('fs');
var cheerio = require('cheerio');
var URL = require('url');
var logger = require('./logger');
var Horseman = require('node-horseman');
var horseman = new Horseman();
var horseman = new Horseman({ignoreSSLErrors: true});
var config = JSON.parse(fs.readFileSync('config.json', 'utf8'));

/**
 * Normalizes a string for comparison: strips surrounding whitespace, drops a
 * leading "- " separator if one is present, and upper-cases the result.
 *
 * @returns {string} the normalized, upper-cased string
 */
String.prototype.trimEx = function() {
    const withoutPadding = this.trim();
    const withoutLeadingDash = withoutPadding.replace(/^\s?-\s/, '');

    // we compare our strings later in uppercase
    return withoutLeadingDash.toUpperCase();
};
Expand Down Expand Up @@ -41,55 +43,97 @@ function getTracks(playlistName, trackserviceUrl){
return new Promise((resolve, reject) => {
console.log('getting tracks from radio trackservice');
horseman
.userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0')
.open(url)
.waitForSelector(playlistConfig.radioEntrySelector)
.html()
.then((html) => {
let $ = cheerio.load(html),
searchInArtist = playlistConfig.removeFromArtistString || '',
searchInTitle = playlistConfig.removeFromTitleString || '',
tracks = [];

$(playlistConfig.radioEntrySelector).each(function(i, elem){
let $entry = $(this),
$title, // cheerio-object
$artist, // cheerio-object
title, // string
artist; // string

if (playlistConfig.searchLinear) {
// Stations like the old page of ORF FM4 have strange markup and need linear search
$title = $entry.nextAll(playlistConfig.radioTitleSelector).first();
$artist = $entry.nextAll(playlistConfig.radioArtistSelector).first();
} else {
// Most other station playlists feature nested markup
$title = $entry.find(playlistConfig.radioTitleSelector);
$artist = $entry.find(playlistConfig.radioArtistSelector);
}

title = $title.text().replace(searchInTitle, '');
artist = $artist.text().replace(searchInArtist, '');
.then(searchInHtml)
.then((tracks) => resolve(tracks))
.close()
.catch(() => {
logger.log('error requesting trackservice using horseman.', playlistName);
try {
let httpx = (URL.parse(url).protocol === 'http:') ? http : https;
let trackserviceReq = httpx.request(url, function(res) {
let html = '';

if(res.statusCode === 302){
console.log('following redirect to ' + res.headers.location);
resolve(getTracks(playlistName, res.headers.location));
return;
}
if(res.statusCode !== 200){
let error = 'Trackservice Error: Status '+res.statusCode;
logger.log(error, playlistName);
reject(error);
process.exit(1);
return;
}

res.setEncoding('utf8');
res.on('data', function (chunk) {
html += chunk;
});
res.on('end', function() {
searchInHtml(html).then(resolve);
});
});

tracks.push({
title: title,
artist: artist
trackserviceReq.on('error', function(e) {
logger.log('problem with trackservice request: ' + e.message, playlistName);
reject();
});
});

if(tracks.length === 0){
logger.log('no tracks found on radio trackservice.', playlistName);
return reject();
trackserviceReq.end();
}
catch(e) {
logger.log('error requesting trackservice using http.', playlistName);
reject();
}

resolve(tracks);
})
.close()
.catch(() => {
logger.log('error requesting trackservice.', playlistName);
reject();
});
});

/**
 * Extracts the track list from the markup of a radio trackservice page.
 *
 * Reads its selectors and the optional "remove from string" settings from
 * `playlistConfig`, and uses `playlistName` for logging; both come from
 * outside this function.
 *
 * The result is built with the file's `Promise` (bluebird) on purpose, so
 * callers keep receiving the same promise type as before.
 *
 * @param {string} html - markup of the trackservice page
 * @returns {Promise<Array<{title: string, artist: string}>>} resolves with one
 *     entry per element matching `playlistConfig.radioEntrySelector`; rejects
 *     with an Error when no element matched.
 */
function searchInHtml(html){
    return new Promise((resolve, reject) => {
        const $ = cheerio.load(html);
        const searchInArtist = playlistConfig.removeFromArtistString || '';
        const searchInTitle = playlistConfig.removeFromTitleString || '';
        const tracks = [];

        $(playlistConfig.radioEntrySelector).each((i, elem) => {
            const $entry = $(elem);
            let $title;  // cheerio-object
            let $artist; // cheerio-object

            if (playlistConfig.searchLinear) {
                // Stations like the old page of ORF FM4 have strange markup and need linear search
                $title = $entry.nextAll(playlistConfig.radioTitleSelector).first();
                $artist = $entry.nextAll(playlistConfig.radioArtistSelector).first();
            } else {
                // Most other station playlists feature nested markup
                $title = $entry.find(playlistConfig.radioTitleSelector);
                $artist = $entry.find(playlistConfig.radioArtistSelector);
            }

            tracks.push({
                title: $title.text().replace(searchInTitle, ''),
                artist: $artist.text().replace(searchInArtist, '')
            });
        });

        if (tracks.length === 0) {
            const message = 'no tracks found on radio trackservice.';
            logger.log(message, playlistName);
            // Reject with a real Error (not undefined) so the failure carries a
            // message and stack wherever it surfaces.
            return reject(new Error(message));
        }

        resolve(tracks);
    });
}
}

function getOrfBroadcasts(broadcastsUrl){
Expand Down

0 comments on commit 14c0934

Please sign in to comment.