diff --git a/config.example.json b/config.example.json index adca8fa..bec8871 100644 --- a/config.example.json +++ b/config.example.json @@ -14,7 +14,16 @@ "fm4": { "playlistId": "", "radioTrackserviceUrl": "https://audioapi.orf.at/fm4/api/json/current/broadcasts", - "fm4Api": true + "orfApi": true + }, + "fip": { + "playlistId": "", + "radioTrackserviceUrl": "http://www.fipradio.fr/archives-antenne", + "radioEntrySelector": ".list-song > .son", + "radioTitleSelector": ".titre_title", + "removeFromTitleString": "", + "radioArtistSelector": ".titre_artiste", + "removeFromArtistString": "par :" } } } diff --git a/package.json b/package.json index 74aa97f..0ce5e96 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "spotifyRadioPlaylist", - "version": "2.4.0", + "version": "2.5.0", "license": "MIT", "repository": { "type": "git", @@ -8,7 +8,8 @@ }, "dependencies": { "bluebird": "^3.5.0", - "cheerio": "^0.19.0" + "cheerio": "^0.19.0", + "node-horseman": "^3.3.0" }, "scripts": { "start": "node ./main.js" diff --git a/radioCrawler.js b/radioCrawler.js index cf91eb4..c0ac2e0 100644 --- a/radioCrawler.js +++ b/radioCrawler.js @@ -2,16 +2,16 @@ * Created by chris on 06.01.16. 
*/ "use strict"; -var http = require('http'); var https = require('https'); var Promise = require('bluebird'); var fs = require('fs'); var cheerio = require('cheerio'); var logger = require('./logger'); +var Horseman = require('node-horseman'); +var horseman = new Horseman(); var config = JSON.parse(fs.readFileSync('config.json', 'utf8')); String.prototype.trimEx = function() {return this.trim().replace(/^\s?-\s/, '').toUpperCase()}; // we compare our strings later in uppercase -String.prototype.isEmpty = function() {return (!this || !this.length)}; /** * getTracks @@ -22,11 +22,11 @@ String.prototype.isEmpty = function() {return (!this || !this.length)}; function getTracks(playlistName, trackserviceUrl){ let playlistConfig = config.playlists[playlistName]; let url = trackserviceUrl || playlistConfig.radioTrackserviceUrl; - if(playlistConfig.fm4Api){ - return getFm4Broadcasts(url) + if(playlistConfig.orfApi){ + return getOrfBroadcasts(url) .then(broadcasts => { console.log('getting tracks from API for '+broadcasts.length+' broadcasts'); - return broadcasts.map(broadcast => getFm4BroadcastTracks(broadcast)); + return broadcasts.map(broadcast => getOrfBroadcastTracks(broadcast)); }) .then(AllBroadcastsWithTracks => Promise.all(AllBroadcastsWithTracks)) .then(broadcasts => { @@ -40,28 +40,15 @@ function getTracks(playlistName, trackserviceUrl){ return new Promise((resolve, reject) => { console.log('getting tracks from radio trackservice'); - let trackserviceReq = http.request(url, function(res) { - let html = ''; - - if(res.statusCode === 302){ - console.log('following redirect to ' + res.headers.location); - resolve(getTracks(playlistName, res.headers.location)); - return; - } - if(res.statusCode !== 200){ - let error = 'Trackservice Error: Status '+res.statusCode; - logger.log(error, playlistName); - reject(error); - process.exit(1); - return; - } - - res.setEncoding('utf8'); - res.on('data', function (chunk) { - html += chunk; - }); - res.on('end', function() { + 
horseman + .userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0') + .open(url) + .waitForSelector(playlistConfig.radioEntrySelector) + .html() + .then((html) => { let $ = cheerio.load(html), + searchInArtist = playlistConfig.removeFromArtistString || '', + searchInTitle = playlistConfig.removeFromTitleString || '', tracks = []; $(playlistConfig.radioEntrySelector).each(function(i, elem){ @@ -81,8 +68,8 @@ function getTracks(playlistName, trackserviceUrl){ $artist = $entry.find(playlistConfig.radioArtistSelector); } - title = $title.text(); - artist = $artist.text(); + title = $title.text().replace(searchInTitle, ''); + artist = $artist.text().replace(searchInArtist, ''); tracks.push({ title: title, @@ -92,24 +79,16 @@ function getTracks(playlistName, trackserviceUrl){ if(tracks.length === 0){ logger.log('no tracks found on radio trackservice.', playlistName); - return; - process.exit(1); + return reject(); } resolve(tracks); - }); - }); - - trackserviceReq.on('error', function(e) { - logger.log('problem with trackservice request: ' + e.message, playlistName); - process.exit(1); - }); - - trackserviceReq.end(); + }) + .close(); }); } -function getFm4Broadcasts(broadcastsUrl){ +function getOrfBroadcasts(broadcastsUrl){ return new Promise((resolve, reject) => { https.get(broadcastsUrl, (res) => { if(res.statusCode !== 200){ @@ -137,7 +116,7 @@ function getFm4Broadcasts(broadcastsUrl){ }); } -function getFm4BroadcastTracks(broadcast){ +function getOrfBroadcastTracks(broadcast){ return new Promise((resolve, reject) => { https.get(broadcast.href, (res) => { if(res.statusCode !== 200){ @@ -149,8 +128,8 @@ function getFm4BroadcastTracks(broadcast){ res.on('data', (chunk) => { rawData += chunk; }); res.on('end', () => { try { - var data = JSON.parse(rawData); - var tracks = data.items + let data = JSON.parse(rawData); + let tracks = data.items .map(broadcastItem => { return { title: broadcastItem.title, @@ -180,9 +159,9 @@ function 
cleanTracks(tracks){ tracks .filter(track => track.artist && track.title) .forEach((track) => { - var isUnique = true; - var artist = track.artist.trimEx(); - var title = track.title.trimEx(); + let isUnique = true; + let artist = track.artist.trimEx(); + let title = track.title.trimEx(); // check for duplicates cleanedTracks.forEach(function(cleanTrack){ diff --git a/readme.md b/readme.md index 99ba7c0..7158fca 100644 --- a/readme.md +++ b/readme.md @@ -34,6 +34,28 @@ You may want to run this on a server via cronjob every X minutes or so (dependin 4. Run `npm install` 5. Configure your cronjob to run `node main.js ` every X minutes (don't forget to change to the correct directory first! - this can be done with a bash script) +## Adding new Radio Stations to the config + +Currently there are two different types of crawling available: + +1. Special API crawling for ORF APIs like `https://audioapi.orf.at/fm4/api/json/current/broadcasts`. +2. Classic crawling for HTML track listings. This covers most radio stations. + +For the first type (ORF API), only three parameters are required: +* `playlistId`: Spotify ID of the playlist to add new tracks to +* `orfApi: true`: flag to activate this mode +* `radioTrackserviceUrl`: API URL + +For the second type, there are also some optional parameters available. This is the default crawling mode. +* `playlistId`: Spotify ID of the playlist to add new tracks to +* `radioTrackserviceUrl`: URL of the page with track listings +* `radioEntrySelector`: jQuery-style selector for the enclosing element that contains the info for exactly one track (e.g. `.list-item`). This is not used when the `searchLinear` option is set. +* `searchLinear: true`: optional & experimental. Set this to true when tracks are not wrapped in individual elements but are listed one after another. The website this mode was implemented for no longer exists, so this mode is not well tested at the moment. 
+* `radioTitleSelector`: jQuery-style selector for the title text. +* `removeFromTitleString`: optional. String to remove from the title (e.g. `Title:`). This is necessary when the website does not properly separate the title from other text. +* `radioArtistSelector`: jQuery-style selector for the artist text. +* `removeFromArtistString`: optional. String to remove from the artist (e.g. `Artist:`). This is necessary when the website does not properly separate the artist from other text. + ## Updates To update run `git pull` in your installation and you will get the latest changes. diff --git a/yarn.lock b/yarn.lock index ace7103..e06c489 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2,7 +2,7 @@ # yarn lockfile v1 -bluebird@^3.5.0: +bluebird@^3.0.1, bluebird@^3.5.0: version "3.5.0" resolved "https://registry.yarnpkg.com/bluebird/-/bluebird-3.5.0.tgz#791420d7f551eea2897453a8a77653f96606d67c" @@ -20,6 +20,14 @@ cheerio@^0.19.0: htmlparser2 "~3.8.1" lodash "^3.2.0" +clone@^1.0.2: + version "1.0.2" + resolved "https://registry.yarnpkg.com/clone/-/clone-1.0.2.tgz#260b7a99ebb1edfe247538175f783243cb19d149" + +cookies.txt@^0.1.1: + version "0.1.2" + resolved "https://registry.yarnpkg.com/cookies.txt/-/cookies.txt-0.1.2.tgz#a8b249d9ee9699305362d604b7e0ac0f20e7519e" + core-util-is@~1.0.0: version "1.0.2" resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.2.tgz#b5fd54220aa2bc5ab57aab7140c940754503c1a7" @@ -37,6 +45,22 @@ css-what@1.0: version "1.0.0" resolved "https://registry.yarnpkg.com/css-what/-/css-what-1.0.0.tgz#d7cc2df45180666f99d2b14462639469e00f736c" +data-uri-to-buffer@0.0.4: + version "0.0.4" + resolved "https://registry.yarnpkg.com/data-uri-to-buffer/-/data-uri-to-buffer-0.0.4.tgz#46e13ab9da8e309745c8d01ce547213ebdb2fe3f" + +debug@^2.1.1, debug@^2.2.0: + version "2.6.6" + resolved "https://registry.yarnpkg.com/debug/-/debug-2.6.6.tgz#a9fa6fbe9ca43cf1e79f73b75c0189cbb7d6db5a" + dependencies: + ms "0.7.3" + +defaults@~1.0.0: + version 
"1.0.3" + resolved "https://registry.yarnpkg.com/defaults/-/defaults-1.0.3.tgz#c656051e9817d9ff08ed881477f3fe4019f3ef7d" + dependencies: + clone "^1.0.2" + dom-serializer@0, dom-serializer@~0.1.0: version "0.1.0" resolved "https://registry.yarnpkg.com/dom-serializer/-/dom-serializer-0.1.0.tgz#073c697546ce0780ce23be4a28e293e40bc30c82" @@ -44,11 +68,7 @@ dom-serializer@0, dom-serializer@~0.1.0: domelementtype "~1.1.1" entities "~1.1.1" -domelementtype@1: - version "1.3.0" - resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.3.0.tgz#b17aed82e8ab59e52dd9c19b1756e0fc187204c2" - -domelementtype@~1.1.1: +domelementtype@1, domelementtype@~1.1.1: version "1.1.3" resolved "https://registry.yarnpkg.com/domelementtype/-/domelementtype-1.1.3.tgz#bd28773e2642881aec51544924299c5cd822185b" @@ -101,6 +121,28 @@ lodash@^3.2.0: version "3.10.1" resolved "https://registry.yarnpkg.com/lodash/-/lodash-3.10.1.tgz#5bf45e8e49ba4189e17d482789dfd15bd140b7b6" +ms@0.7.3: + version "0.7.3" + resolved "https://registry.yarnpkg.com/ms/-/ms-0.7.3.tgz#708155a5e44e33f5fd0fc53e81d0d40a91be1fff" + +node-horseman@^3.3.0: + version "3.3.0" + resolved "https://registry.yarnpkg.com/node-horseman/-/node-horseman-3.3.0.tgz#86100369437de794f669e0f52fd94cf945981bb2" + dependencies: + bluebird "^3.0.1" + clone "^1.0.2" + cookies.txt "^0.1.1" + data-uri-to-buffer "0.0.4" + debug "^2.1.1" + defaults "~1.0.0" + node-phantom-simple "^2.2.4" + +node-phantom-simple@^2.2.4: + version "2.2.4" + resolved "https://registry.yarnpkg.com/node-phantom-simple/-/node-phantom-simple-2.2.4.tgz#4fc4effbb02f241fb5082bd4fbab398e4aecb64d" + dependencies: + debug "^2.2.0" + nth-check@~1.0.0: version "1.0.1" resolved "https://registry.yarnpkg.com/nth-check/-/nth-check-1.0.1.tgz#9929acdf628fc2c41098deab82ac580cf149aae4"