-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathScraper.rb
237 lines (198 loc) · 6.87 KB
/
Scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#Kevin Wojcik
#CCS CS 130a
#Spring 2012
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'seed-fu'
require 'thread'
URL = "http://www.slugbooks.com"
MUTEX = Mutex.new
#Scrape a course from a relative URL
#
# course_url - relative URL of the course page on slugbooks.com
# writer     - SeedFu::Writer used to append one fixture row per book
# course     - human-readable course name stored with each book
#
# Returns nil on a fetch/parse error; otherwise iterates every book on
# the page, downloading its cover image and recording its metadata.
def scrape_course(course_url, writer, course)
  begin
    #Get the course specific web page
    course_doc = Nokogiri::HTML(open(URL + course_url))
  rescue StandardError => e
    # Rescue StandardError, not Exception: rescuing Exception also
    # swallows SignalException/SystemExit (e.g. Ctrl-C).
    puts "Error in scrape_course(#{URL}#{course_url})"
    puts e
    return nil
  end
  #Go through each book for the course
  course_doc.css(".first").each do |book|
    #Some non books use the same CSS class so we need to filter those
    next if book.at_css(".title").nil?
    #All the attributes for a specific book
    children = book.element_children
    #Relative URL for the image of the book
    url = children[0].css('img')[0]['src']
    #Book title: everything before the "|" separator, whitespace collapsed
    title = children[1].text.gsub(/\s+/, ' ')[0, children[1].text.index("|") - 1]
    #Author: skip the leading 4-char "by: " prefix, collapse whitespace
    author = children[3].text[4, children[3].text.length].gsub(/\s+/, ' ')
    #ISBN: 13 chars starting at offset 6 (skips the "ISBN: " label);
    #the trailing "\n\n" is preserved for output-format compatibility.
    isbn = children[4].text[6, 13].gsub(/\s+/, ' ') + "\n\n"
    #Log the parsed fields
    puts url
    puts title
    puts author
    puts isbn
    our_url = "images/" + isbn.strip + ".jpg"
    #Download the cover image. 'wb' (binary) mode so image bytes are not
    #newline-translated on Windows; block forms guarantee both handles
    #are closed even if a write raises (the original leaked them).
    open(URL + url) do |imagefile|
      File.open(our_url, 'wb') { |f| f.write(imagefile.read) }
    end
    #SeedFu writers are shared across threads here; synchronize releases
    #the mutex even if writer.add raises (lock/unlock would deadlock the
    #other threads in that case).
    MUTEX.synchronize do
      writer.add(:course => course, :title => title, :author => author, :isbn => isbn, :url => our_url)
    end
  end
end
#Scrape a department from a relative URL
#
# department_url - relative URL of the department page
# filename       - path of the SeedFu fixture file to create
#
# Returns nil on a fetch/parse error. If the department contains no
# courses, the (empty) fixture file is deleted again.
def scrape_department(department_url, filename)
  begin
    #Open the web page for a particular department
    department_doc = Nokogiri::HTML(open(URL + department_url))
  rescue StandardError => e
    # StandardError, not Exception, so signals and exit still propagate.
    puts "Error in scrape_department(#{URL}#{department_url})"
    puts e
    return nil
  end
  num_scraped = 0
  puts "creating writer " + filename
  SeedFu::Writer.write(filename, :class_name => 'BookCatalogEntrie') do |writer|
    #Each course in the department is an <li> under .middleclasslinks
    department_doc.css(".middleclasslinks").css('li').each do |course|
      coursename = course.text.gsub(/\s+/, ' ').strip
      #Relative URL for the course page
      course_url = course.css('a')[0]['href']
      #Scrape the course, writing its books through our writer
      scrape_course(course_url, writer, coursename)
      num_scraped += 1
    end
  end
  #Remove the fixture file when no courses were found. Delete by path
  #rather than opening a File handle first — the original opened the
  #file read-only and never closed that handle before unlinking it.
  File.delete(filename) if num_scraped == 0 && File.exist?(filename)
end
#Scrape a school from a relative URL
#
# school_url - relative URL of the school page
#
# Spawns one thread per department (I/O-bound work, so threads help
# despite the GVL) and joins them all before returning.
# Returns nil on a fetch/parse error.
def scrape_school(school_url)
  begin
    #Open the page for a particular school
    school_doc = Nokogiri::HTML(open(URL + school_url))
  rescue StandardError => e
    # StandardError, not Exception, so signals and exit still propagate.
    puts "Error in scrape_school(#{URL}#{school_url})"
    puts e
    return nil
  end
  thread_list = []
  #Go through each department at a school
  school_doc.css(".bottomlinks").css('li').each_with_index do |department, idx|
    #Relative URL for the department page
    department_url = department.css('a')[0]['href']
    #Fixture filename encodes the department index plus the school URL,
    #with '/' replaced so it stays a single path component.
    fixture = "db/fixtures/seed_script_#{idx}#{school_url.gsub('/', '...')}.rb"
    #Pass values as Thread.new arguments so each thread captures its own
    #copies instead of sharing the loop's locals.
    thread_list << Thread.new(department_url, fixture) do |dep_url, file|
      scrape_department(dep_url, file)
    end
  end
  #Wait for every department scrape to finish
  thread_list.each(&:join)
end
#Scrape a state from a relative URL
#
# state_url - relative URL of the state page
#
# Scrapes every school listed for the state, sequentially.
# Returns nil on a fetch/parse error.
def scrape_state(state_url)
  begin
    #Open the page for a particular state
    states_doc = Nokogiri::HTML(open(URL + state_url))
  rescue StandardError => e
    # StandardError, not Exception, so signals and exit still propagate.
    puts "Error in scrape_state(#{URL}#{state_url})"
    puts e
    return nil
  end
  #Go through each school in the state
  states_doc.css(".middlelinks").css('li').each do |school|
    #Relative URL for the school page
    school_url = school.css('a')[0]['href']
    #Scrape the school
    scrape_school(school_url)
  end
end
#Scrape all of slugbooks
#
# url - absolute URL of the slugbooks landing page
#
# Walks every US state link at the bottom of the page and scrapes it.
# Returns nil on a fetch/parse error.
def scrape_all(url)
  begin
    #Open the web page
    doc = Nokogiri::HTML(open(url))
  rescue StandardError => e
    puts "Error in scrape_all(#{url})"
    # Bug fix: Exception has no #text method, so the original
    # `puts e.text` raised NoMethodError inside the rescue.
    puts e.message
    # Bug fix: without this return, doc is nil below and doc.css
    # raised NoMethodError after the rescue ran.
    return nil
  end
  #Get all the URLS at the bottom of the page, each representing a US state
  doc.css(".bottomlinks").css('li').each do |state|
    #Build the new URL from the base and the one listed in the <a href="..."/>
    state_url = state.css('a')[0]['href']
    #Scrape the state
    scrape_state(state_url)
  end
end
#Help message for the command line
HELP = "[-state url_for_state] Parses only data for that specific state
[-school url_for_school] Parses only data for the school. Takes precedence over state
[-department url_for_department] Parses only data for the department. Takes precedence over school
[-course url_for_course] Parses only data for the course. Takes precedence over department\n"
#Stores the URLs for the specific request if used
# NOTE(review): these four variables are only read by the commented-out
# dispatch at the bottom; with the CLI parsing disabled they are dead
# assignments kept for when the parsing is re-enabled.
command_line_state = nil
command_line_school = nil
command_line_department = nil
command_line_course = nil
#Check for the -h argument
#if ARGV.length == 1 and ARGV[0] == "-h"
#	puts HELP
#	exit
#end
#Check to make sure that we have pairs of arguments
#if ARGV.length % 2 != 0
#	puts "Bad parameters"
#	exit
#end
#Parse command line arguments
#ARGV.each_index do |i|
#	if i % 2 == 1
#		next
#	end
#	if ARGV[i] == "-state"
#		command_line_state = ARGV[i+1]
#	elsif ARGV[i] == "-school"
#		command_line_school = ARGV[i+1]
#	elsif ARGV[i] == "-department"
#		command_line_department = ARGV[i+1]
#	elsif ARGV[i] == "-course"
#		command_line_course = ARGV[i+1]
#	else
#		puts "Unknown parameter " + ARGV[i]
#		exit
#	end
#end
#Do the requested scrape
# NOTE(review): if the dispatch below is ever re-enabled, the
# scrape_course and scrape_department calls reference `writer` and
# `course` locals that are never defined at this scope, and
# scrape_department now takes (url, filename) — both calls would need
# fixing before uncommenting.
# if command_line_course != nil
# 	scrape_course(command_line_course, writer, course)
# elsif command_line_department != nil
# 	scrape_department(command_line_department, writer)
# elsif command_line_school != nil
# 	scrape_school(command_line_school)
# elsif command_line_state != nil
# 	scrape_state(command_line_state)
# else
# 	scrape_all(URL)
#Currently hard-coded to scrape only UCSB instead of the full site.
scrape_school("/UCSB/UCSB-Textbooks.html")
#end