Merge pull request #35 from spencermountain/dev
Dev
Showing 34 changed files with 86,674 additions and 447 deletions.
@@ -0,0 +1,30 @@
{
  "plugins": [
  ],
  "quotes": {
    "type": "single",
    "avoidEscape": false
  },
  "whiteSpace": {
    "before": {
      "ParameterList": -1,
      "ParameterComma": -1,
      "FunctionDeclarationOpeningBrace": -1,
      "FunctionDeclarationClosingBrace": -1,
      "ForStatementExpressionOpening": -1
    },
    "after": {
      "FunctionName": -1,
      "ParameterComma": 1,
      "FunctionReservedWord": -1,
      "ParameterList": -1,
      "FunctionDeclarationOpeningBrace": -1,
      "PropertyName": -1
    }
  },
  "lineBreak": {
    "before": {
      "EndOfFile": 1
    }
  }
}
@@ -0,0 +1,40 @@
{
  "env": {
    "es6": true
  },
  "parserOptions": {
    "ecmaVersion": 2017,
    "sourceType": "module",
    "ecmaFeatures": { },
  },
  "rules": {
    "no-cond-assign": 2,
    "no-var": 0,
    "prefer-const": 0,
    "no-extra-parens": 0,
    "no-dupe-keys": 2,
    "no-unreachable": 2,
    "eqeqeq": 1,
    "keyword-spacing": 0,
    "no-native-reassign": 2,
    "no-redeclare": 2,
    "radix": 1,
    "indent": 0,
    "quotes": [
      0,
      "single",
      "avoid-escape"
    ],
    "no-shadow": 2,
    "no-unused-vars": 1,
    "no-lonely-if": 1,
    "no-use-before-define": 2,
    "no-bitwise": 2,
    "no-dupe-class-members": 2,
    "guard-for-in": 1,
    "consistent-return": 2,
    "no-octal-escape": 2,
    "no-constant-condition": 1,
    "no-unused-expressions": 2
  }
}
@@ -1,93 +1,127 @@
# A whole Wikipedia dump, in mongodb.
put your hefty [wikipedia dump](https://dumps.wikimedia.org) into mongo, with fully-parsed wikiscript - without thinking, without loading it into memory, grepping, unzipping, or other crazy command-line nonsense.

<div align="center">
  <h3>dumpster-dive</h3>
  <a href="https://npmjs.org/package/dumpster-dive">
    <img src="https://img.shields.io/npm/v/dumpster-dive.svg?style=flat-square" />
  </a>
  <a href="https://www.codacy.com/app/spencerkelly86/dumpster-dive">
    <img src="https://api.codacy.com/project/badge/grade/6fad3c588d3d4c97ab8a9abf9f2a5a01" />
  </a>
  <div>wikipedia dump parser</div>
  <sub>
    by
    <a href="http://spencermounta.in/">Spencer Kelly</a>, <a href="https://github.com/devrim">Devrim Yasar</a>,
    and
    <a href="https://github.com/spencermountain/wtf_wikipedia/graphs/contributors">
      others
    </a>
  </sub>
</div>
<p></p>

<div align="center">
  gets a wikipedia <a href="https://dumps.wikimedia.org">xml dump</a> into mongo,
  <div>so you can mess-around.</div>

  <h2 align="center">💂 Yup 💂</h2>
  <div><sup>do it on your laptop.</sup></div>
</div>

![image](https://user-images.githubusercontent.com/399657/39391259-b57ca9e0-4a6e-11e8-8b33-2064e5fc187e.png)
`dumpster-dive` is a **nodejs** script that puts a **highly-queryable** wikipedia on your computer in a nice afternoon.

It uses [worker-nodes](https://github.com/allegro/node-worker-nodes) to process pages in parallel, and [wtf_wikipedia](https://github.com/spencermountain/wtf_wikipedia) to turn ***wikiscript*** into whatever json.

<div align="center">
  -- <b>en-wikipedia</b> takes about 7 hours, end-to-end --
</div>

```bash
npm install -g dumpster-dive
```

### 😎 API
```js
var dumpster = require('dumpster-dive')
dumpster({ file: './enwiki-latest-pages-articles.xml', db: 'enwiki' }, callback)
```

### Command-Line:
```bash
dumpster /path/to/my-wikipedia-article-dump.xml --citations=false --html=true
```

*then check out the articles in mongo:*
````bash
$ mongo          # enter the mongo shell
use enwiki       # grab the database

db.wikipedia.find({title:"Toronto"})[0].categories
# [ "Former colonial capitals in Canada",
#   "Populated places established in 1793" ...]
db.wikipedia.count({type:"redirect"})
# 124,999...
db.wikipedia.count()
# 4,926,056...
````

# Steps:

### 1️⃣ you can do this.
you can do this.
just a few Gb. you can do this.

### 2️⃣ get ready
Install [nodejs](https://nodejs.org/en/) and [mongodb](https://docs.mongodb.com/manual/installation/)

```bash
# start mongo
mongod --config /mypath/to/mongod.conf
# install this script
npm install -g dumpster-dive
# (that gives you the global command `dumpster`)
```

### 3️⃣ download a wikipedia
The Afrikaans wikipedia (around 47,000 artikels) only takes a few minutes to download, and 5 mins to load into mongo on a macbook:
```bash
# download an xml dump (38mb, couple minutes)
wget https://dumps.wikimedia.org/afwiki/latest/afwiki-latest-pages-articles.xml.bz2
```
the english/german ones are bigger. Use whichever xml dump you'd like. The [download page](https://dumps.wikimedia.org) is weird, but you'll want the most-common dump format, without historical diffs or images, which is `${LANG}wiki-latest-pages-articles.xml.bz2`

### 4️⃣ unzip it
i know, this sucks. but it makes the parser so much faster. On a macbook, unzipping en-wikipedia takes an hour or so. Eat some lunch.

### 5️⃣ OK, start it off
```bash
# load it into mongo (10-15 minutes)
dumpster ./afwiki-latest-pages-articles.xml
```

### 6️⃣ take a bath
just put some [epsom salts](https://www.youtube.com/watch?v=QSlIHCu2Smw) in there, it feels great. You deserve a break once in a while.

The en-wiki dump should take a few hours. Maybe 8. Should be done before dinner.

The console will update you every couple seconds to let you know where it's at.

### 7️⃣ done!
go check-out the data! to view your data in the mongo console:
````javascript
$ mongo
use afwiki //your db name

//show a random page
db.wikipedia.find().skip(200).limit(2)

//count the redirects (~5,000 in afrikaans)
db.wikipedia.count({type:"redirect"})

//find a specific page
db.wikipedia.findOne({title:"Toronto"}).categories
````

### Same for the English wikipedia:
the english wikipedia will work under the same process, but the download will take an afternoon, and the loading/parsing a couple hours. The en wikipedia dump is a 13 GB file (for [enwiki-20170901-pages-articles.xml.bz2](https://dumps.wikimedia.org/enwiki/20170901/enwiki-20170901-pages-articles.xml.bz2)), and becomes a pretty legit mongo collection uncompressed. It's something like 51GB, but mongo can do it 💪.
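
you can kick the big one off from a script too; a quick sketch (the path, db name, and option values here are just placeholders):
```js
var dumpster = require('dumpster-dive')

// same one-liner, pointed at the (unzipped) english dump
dumpster({
  file: './enwiki-latest-pages-articles.xml',
  db: 'enwiki',
  skip_redirects: true,
  skip_disambig: true
}, () => console.log('done!'))
```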

### Options:
dumpster follows all the conventions of [wtf_wikipedia](https://github.com/spencermountain/wtf_wikipedia), and you can pass-in any fields for it to include in its json.
* **human-readable plaintext** ***--plaintext***
```js
dumpster({file:'./myfile.xml.bz2', db: 'enwiki', plaintext:true, categories:false})
/*
[{
  _id:'Toronto',

@@ -96,51 +130,30 @@

}]
*/
```
#### go faster with Redis **--worker**
there is a much faster way (even 10x) to import all pages into mongodb, but it's a little more complex. It requires redis installed on your computer, and running a worker in a separate process.

It also gives you a cool dashboard, to watch the progress.
````bash
# install redis
sudo apt-get install redis-server # (or `brew install redis` on a mac)

# clone the repo
git clone git@github.com:spencermountain/wikipedia-to-mongodb.git && cd wikipedia-to-mongodb

# load pages into the job queue
bin/wp2mongo.js ./afwiki-latest-pages-articles.xml.bz2 --worker

# start processing jobs (parsing articles and saving to mongodb) on all CPUs
node src/worker.js

# you can preview processing jobs in the kue dashboard (localhost:3000)
node node_modules/kue/bin/kue-dashboard -p 3000
````

* **disambiguation pages / redirects** ***--skip_disambig***, ***--skip_redirects***
by default, dumpster skips entries in the dump that aren't full-on articles; you can turn that off like this:
```js
let obj = {
  file: './path/enwiki-latest-pages-articles.xml.bz2',
  db: 'enwiki',
  skip_first: 1000, // ignore the first 1k pages
  verbose: true, // print each article title
  skip_redirects: false,
  skip_disambig: false
}
dumpster(obj, () => console.log('done!'))
```

* **reducing file-size:**
you can tell wtf_wikipedia what you want it to parse, and which data you don't need:
```bash
dumpster ./my-wiki-dump.xml --infoboxes=false --citations=false --categories=false --links=false
```

## how it works:
this library uses (a rough sketch of the pipeline follows this list):
* [unbzip2-stream](https://github.com/regular/unbzip2-stream) to stream-uncompress the gnarly bz2 file
* [xml-stream](https://github.com/assistunion/xml-stream) to stream-parse its xml format
* [line-by-line](https://www.npmjs.com/package/line-by-line) to stream the gnarly xml file
* [wtf_wikipedia](https://github.com/spencermountain/wtf_wikipedia) to brute-parse the article wikiscript contents into JSON.
* [redis](http://redis.io/) to (optionally) put wikiscript parsing on separate threads :metal:
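
put together, a stripped-down, single-process version of that pipeline looks something like the sketch below. It's illustrative only: the real thing batches its writes and parses pages in worker processes, and the `wtf(...).json()` call assumes a recent wtf_wikipedia API.
```js
// minimal sketch: stream the unzipped dump, cut out <page> blocks, parse them
// (illustrative only, not dumpster-dive's actual code)
const LineByLine = require('line-by-line')
const wtf = require('wtf_wikipedia')

const lr = new LineByLine('./afwiki-latest-pages-articles.xml')
let buffer = []
let inPage = false

lr.on('line', (line) => {
  if (line.indexOf('<page>') !== -1) {
    inPage = true
    buffer = []
  }
  if (inPage) {
    buffer.push(line)
  }
  if (line.indexOf('</page>') !== -1) {
    inPage = false
    const xml = buffer.join('\n')
    const title = (xml.match(/<title>([^<]*)<\/title>/) || [])[1]
    const wikitext = (xml.match(/<text[^>]*>([\s\S]*?)<\/text>/) || [])[1] || ''
    const json = wtf(wikitext).json() // wikiscript -> { sections, categories, infoboxes... }
    // dumpster-dive would queue { _id: title, ...json } here,
    // and insertMany() the batch into the `wikipedia` collection
    console.log(title)
  }
})
lr.on('end', () => console.log('done!'))
```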

## Addendum:
### \_ids
since wikimedia makes all pages have globally unique titles, we also use them for the mongo `_id` fields.

@@ -157,4 +170,6 @@ $ --> \u0024
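
that means a page's title doubles as its primary key. For example, in the mongo console (a small illustrative lookup, not from the README itself):
````javascript
// fetch a page directly by its title-as-_id
db.wikipedia.findOne({ _id: "Toronto" })

// or grab just one field from it
db.wikipedia.findOne({ _id: "Toronto" }).categories
````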

This library should also work on other wikis with standard xml dumps from [MediaWiki](https://www.mediawiki.org/wiki/MediaWiki). I haven't tested them, but wtf_wikipedia supports all sorts of non-standard wiktionary/wikivoyage templates, and if you can get a bz-compressed xml dump from your wiki, this should work fine. Open an issue if you find something weird.

### PRs welcome!
This is an important project, come [help us out](./contributing.md).

MIT

@@ -0,0 +1,71 @@
#! /usr/bin/env node
var dumpster = require('../src')
var yargs = require('yargs')
var argv = yargs
  .usage('dumpster <xml filepath> [options]')
  .example('dumpster ./my/wikipedia-dump.xml --plaintext true --categories false')
  .describe('batch_size', 'how many articles to write to mongo at once [1000]')
  .describe('skip_disambig', 'avoid storing disambiguation pages [true]')
  .describe('skip_redirects', 'avoid storing redirect pages [true]')
  .describe('categories', 'include category data? [true]')
  .describe('citations', 'include references/citations? [true]')
  .describe('coordinates', 'include coordinate data? [true]')
  .describe('infoboxes', 'include infobox data? [true]')
  .describe('images', 'include image data? [true]')
  .describe('markdown', 'include markdown output [false]')
  .describe('html', 'include html output [false]')
  .describe('latex', 'include latex output [false]')
  .argv;

const defaults = {
  batch_size: 1000,
  skip_disambig: true,
  skip_redirects: true,

  title: true,
  pageID: true,
  categories: true,
  citations: true,
  coordinates: true,
  infoboxes: true,
  sections: true,
  images: true,

  plaintext: false,
  html: false,
  markdown: false,
  latex: false,
};
const toBool = {
  'true': true,
  'false': false,
}

let file = argv['_'][0]
//set defaults to given arguments
let options = Object.assign({}, defaults)
Object.keys(options).forEach((k) => {
  if (argv.hasOwnProperty(k) && argv[k] !== undefined) {
    //coerce strings to booleans
    if (toBool.hasOwnProperty(argv[k])) {
      argv[k] = toBool[argv[k]]
    }
    options[k] = argv[k]
  }
})

//grab the wiki file
if (!file) {
  console.log('please supply a filename to the wikipedia article dump')
  process.exit(1)
}
//try to make-up the language name for the db
let db = 'wikipedia'
if (file.match(/-latest-pages-articles/)) {
  db = file.match(/([a-z]+)-latest/) || []
  db = db[1] || 'wikipedia'
}
options.file = file
options.db = db
// console.log(options)
dumpster(options)
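
the flags above map onto the same options object the API accepts; a hypothetical equivalent call (option names from the `defaults` object above, values just for illustration):
```js
// roughly equivalent to: dumpster ./afwiki-latest-pages-articles.xml --plaintext true --batch_size 500
var dumpster = require('dumpster-dive')
dumpster({
  file: './afwiki-latest-pages-articles.xml',
  db: 'afwiki', // the CLI guesses this from the filename
  plaintext: true,
  batch_size: 500
})
```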