-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-books.js
67 lines (62 loc) · 1.97 KB
/
get-books.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#! node
const puppeteer = require('puppeteer');
const fs = require('fs');
const request = require('request');
/**
 * Downloads the resource at `uri` and writes it to `filename`.
 *
 * A HEAD request is issued first so the content type and length can be
 * logged, then the body is streamed to disk.
 *
 * @param {string} uri - URL of the resource to fetch.
 * @param {string} filename - Path of the file to write.
 * @returns {Promise<void>} Resolves once the output file stream has closed;
 *   rejects if the HEAD request, the download, or the file write fails.
 */
function download(uri, filename) {
  return new Promise((resolve, reject) => {
    // The callback's response parameter must not reuse the resolver's name:
    // shadowing it would register the response object as the 'close' listener.
    request.head(uri, (err, response) => {
      if (err) {
        reject(err);
        return;
      }
      console.log('content-type:', response.headers['content-type']);
      console.log('content-length:', response.headers['content-length']);
      request(uri)
        .on('error', reject) // network failure while downloading
        .pipe(fs.createWriteStream(filename))
        .on('error', reject) // failure while writing to disk
        .on('close', () => resolve());
    });
  });
}
/**
 * Script entry point.
 *
 * Loads the Agatha Christie bibliography page, reads title / year / link for
 * each row of the first `table.wikitable`, then visits every linked page to
 * pick up the first infobox image (if any). The merged records are written
 * to `<destPath>/data.json`.
 *
 * The output directory is taken from the first command-line argument and
 * defaults to the current working directory.
 */
(async () => {
  console.log('Going to work');
  const rootUrl = 'https://en.m.wikipedia.org/wiki/Agatha_Christie_bibliography';
  // `destPath` was previously referenced without ever being defined, which
  // made the final write throw a ReferenceError on every run.
  const destPath = process.argv[2] || '.';
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(rootUrl);
    const books = await page.evaluate(() => {
      const table = document.querySelector('table.wikitable');
      if (!table) {
        throw new Error('No table.wikitable found on the page');
      }
      return Array.from(table.querySelectorAll('tbody tr'))
        // Keep only data rows: they need both a header cell (title) and a
        // data cell (year); pure header rows have no <td>.
        .filter((tr) => tr.querySelector('th') && tr.querySelector('td'))
        .map((tr) => {
          const anchor = tr.querySelector('th a');
          return {
            title: tr.querySelector('th').textContent,
            // Some titles may not link to a page of their own.
            link: anchor ? anchor.href : null,
            year: tr.querySelector('td').textContent,
          };
        });
    });
    await page.close();

    const data = await Promise.all(books.map(async (book) => {
      if (!book.link) {
        // Nothing to visit; keep the record without image information.
        return Object.assign({}, book);
      }
      const bookPage = await browser.newPage();
      try {
        await bookPage.goto(book.link);
        const img = await bookPage.evaluate(() => {
          const image = document.querySelector('.infobox img');
          if (!image) {
            return undefined;
          }
          const { src } = image;
          return {
            imgSrc: src,
            // Last path segment of the image URL, used as the file name.
            imgPath: src.split('/').pop(),
          };
        });
        console.log(`Fetched infos for ${book.title}`);
        // `book` is applied last so its fields win over the image fields.
        return Object.assign({}, img, book);
      } finally {
        // Release the tab as soon as it is done instead of keeping every
        // page open until the browser shuts down.
        await bookPage.close();
      }
    }));

    fs.mkdirSync(destPath, { recursive: true });
    fs.writeFileSync(`${destPath}/data.json`, JSON.stringify(data));
  } catch (e) {
    console.warn('Error', e);
    // Report the failure to the calling shell as well as the log.
    process.exitCode = 1;
  } finally {
    console.log('Going to sleep');
    await browser.close();
  }
})();