Skip to content

Commit 0e95bfb

Browse files
committed
first commit
0 parents  commit 0e95bfb

File tree

4 files changed

+119
-0
lines changed

4 files changed

+119
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
node_modules

package.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"name": "dcmr",
3+
"version": "0.0.0",
4+
"main": "index.js",
5+
"scripts": {
6+
"test": "echo \"Error: no test specified\" && exit 1"
7+
},
8+
"license": "BSD",
9+
"description": "ERROR: No README.md file found!",
10+
"dependencies": {
11+
"cheerio": "~0.10.1",
12+
"get": "~1.2.1",
13+
"underscore": "~1.4.1",
14+
"url":"*",
15+
"request": "~2.11.4"
16+
}
17+
}

readme.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# DCMR
2+
3+
Scraper borrowed from https://github.com/tmcw/dcmr.
4+
5+
To come:
6+
7+
>> Parser into [Code XML](https://github.com/JoshData/dc-code-prototype) Schema.
8+
9+
>> Simple-generator
10+
11+
>> SimpleDCMR to come.

scrape/scrape.js

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env node
2+
3+
var cheerio = require('cheerio'),
4+
fs = require('fs'),
5+
url = require('url'),
6+
get = require('get'),
7+
_ = require('underscore'),
8+
request = require('request');
9+
10+
/**
 * Entry point of the crawl: fetch the DCMR "search by title" index page and
 * follow every title link found on it.
 *
 * Each anchor whose href matches `/Gateway/TitleHome.aspx?` is resolved
 * against the index URL and handed to chapterhome(). A failed request is
 * rethrown from the request callback.
 */
function start() {
  var u = 'http://www.dcregs.dc.gov/Search/DCMRSearchByTitle.aspx';
  request(u, function(err, response, body) {
    if (err) throw err;

    var $ = cheerio.load(body);
    var togo = [];
    $('a').each(function(i, a) {
      // Anchors without an href would otherwise make .match() throw a
      // TypeError and abort the whole crawl, so skip them explicitly.
      var href = a.attribs && a.attribs.href;
      if (href && href.match(/\/Gateway\/TitleHome\.aspx\?/g)) {
        togo.push(url.resolve(u, href));
      }
    });

    // Only the side effect matters here, so iterate rather than map.
    togo.forEach(function(link) {
      chapterhome(link);
    });
  });
}
26+
27+
/**
 * Fetch one title page and follow every chapter link found on it.
 *
 * Each anchor whose href matches `ChapterHome.aspx?` is resolved against the
 * page URL and handed to rulehome(). A failed request is rethrown from the
 * request callback.
 *
 * @param {string} u - URL of the title page to fetch.
 */
function chapterhome(u) {
  request(u, function(err, response, body) {
    if (err) throw err;
    console.log('to chapter step');

    var $ = cheerio.load(body);
    var togo = [];
    $('a').each(function(i, a) {
      // Anchors without an href would otherwise make .match() throw a
      // TypeError and abort the whole crawl, so skip them explicitly.
      var href = a.attribs && a.attribs.href;
      if (href && href.match(/ChapterHome\.aspx\?/)) {
        togo.push(url.resolve(u, href));
      }
    });

    // Only the side effect matters here, so iterate rather than map.
    togo.forEach(function(link) {
      rulehome(link);
    });
  });
}
43+
44+
/**
 * Fetch one chapter page and follow every rule link found on it.
 *
 * Each anchor whose href matches `RuleHome.aspx?` is resolved against the
 * page URL and handed to download(). A failed request is rethrown from the
 * request callback.
 *
 * @param {string} u - URL of the chapter page to fetch.
 */
function rulehome(u) {
  request(u, function(err, response, body) {
    if (err) throw err;

    console.log('to rule step');
    var $ = cheerio.load(body);
    var togo = [];
    $('a').each(function(i, a) {
      // Anchors without an href would otherwise make .match() throw a
      // TypeError and abort the whole crawl, so skip them explicitly.
      var href = a.attribs && a.attribs.href;
      if (href && href.match(/RuleHome\.aspx\?/)) {
        togo.push(url.resolve(u, href));
      }
    });

    // Only the side effect matters here, so iterate rather than map.
    togo.forEach(function(link) {
      download(link);
    });
  });
}
60+
61+
/**
 * Fetch one rule page and queue every document download link found on it.
 *
 * Each anchor whose href matches `Download.aspx?` is resolved against the
 * page URL, paired with the page title as `[title, downloadUrl]`, and handed
 * to dl(). A failed request is rethrown from the request callback.
 *
 * @param {string} u - URL of the rule page to fetch.
 */
function download(u) {
  request(u, function(err, response, body) {
    if (err) throw err;

    console.log('to dl step');
    var $ = cheerio.load(body);

    // The title is the same for every link on the page, so read it once.
    // Only the first CRLF is removed, as before, to keep the stored titles
    // unchanged.
    var title = $('title').text().replace(/\r\n/, '');

    var togo = [];
    $('a').each(function(i, a) {
      // Some anchors on these pages carry no href; skip them rather than
      // letting .match() throw.
      var href = a.attribs && a.attribs.href;
      if (href && href.match(/Download\.aspx\?/)) {
        togo.push([title, url.resolve(u, href)]);
      }
    });

    // Only the side effect matters here, so iterate rather than map.
    togo.forEach(function(item) {
      dl(item);
    });
  });
}
79+
80+
// Running count of downloads started; used to number the files in docs/.
var i = 0;

/**
 * Save one document and its title under docs/.
 *
 * The document at u[1] is streamed to `docs/<n>.doc` and the title u[0] is
 * written to `docs/<n>.title`, where <n> is this call's sequence number
 * (starting at 1). Download errors are logged, not thrown.
 *
 * @param {Array} u - `[title, downloadUrl]` pair as built by download().
 */
function dl(u) {
  // Take this call's number once and use the local copy for both paths, so
  // the two file names cannot drift apart if the counter changes.
  var id = ++i;
  var base = 'docs/' + id;

  // writeFileSync does not create missing directories, so a fresh checkout
  // without docs/ would fail with ENOENT on the first document.
  if (!fs.existsSync('docs')) {
    fs.mkdirSync('docs');
  }

  get(u[1]).toDisk(base + '.doc', function(err) {
    if (err) console.log(err);
  });
  fs.writeFileSync(base + '.title', u[0]);
}
89+
90+
// Run the scraper: start() requests the DCMR title index and the crawl
// proceeds from there through the request callbacks.
start();

0 commit comments

Comments
 (0)