-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapollogetter.rb
132 lines (116 loc) · 4.31 KB
/
apollogetter.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Author: Marius Schuller
# MIT License, (c) 2014
#
# TODO:
# - Make it usable
# - Make it use threads
# - Make it use proxies?
# - Make mission selectable (e.g. "Only Apollo 1 please!")
# - Make it output a list of all URLs, separated by # <mission_name>
# - Make an easy-to-use gallery out of the downloaded images
require 'rubygems'
require 'fileutils'
require 'net/http'
require 'mechanize'
require 'pry'
# Scrapes the Apollo image gallery on apolloarchive.com and collects, per
# mission, one download URL for every image: the "Hi-Res" link when the
# image's frame page offers one, otherwise the "Standard" link.
#
# The whole workflow lives in the class body, so it runs as soon as this file
# is loaded. Collected URLs are written to links/<mission>/urls.txt, one URL
# per line; missions without any URL get no file.
class ApolloGetter
  url = "http://www.apolloarchive.com/apollo_gallery.html"
  tableurl = "http://www.apolloarchive.com/apg_subject_index-test.php?gallery="
  # Frame page of a single image:
  #   http://www.apolloarchive.com/apg_thumbnail-test.php?ptr=<img id>&imageID=<name>
  dlurl = "http://www.apolloarchive.com/apg_thumbnail-test.php?ptr="
  dlurl_post = "&imageID="
  # Mission key (also the output directory name) => value appended to
  # tableurl as the "gallery" query parameter.
  missions = {'mg' => 'MG', 'apmisc' => 'EA', 'ap1' => '1', 'ap7' => '7',
              'ap8' => '8', 'ap9' => '9', 'ap10' => '10', 'ap11' => '11',
              'ap12' => '12', 'ap13' => '13', 'ap14' => '14',
              'ap15' => '15', 'ap16' => '16', 'ap17' => '17',
              'sv' => 'saturnv', 'pa' => 'PA'}
  # NOTE(review): linknames is filled below but not consumed anywhere in this
  # class yet; presumably groundwork for the "mission selectable" TODO.
  linknames = []
  # mission key => array of image names (link texts of the index table)
  mission_images = {}
  # mission key => array of download URLs
  geturls = {}
  missions.each_key do |name|
    mission_images[name] = []
    geturls[name] = []
  end

  a = Mechanize.new
  a.get(url) do |hp|
    selector = a.click(hp.frame_with(:src => "apg_selector.html").click)
    # Collect the first argument of every JS mission link in the selector
    # frame, skipping the search and magazine links. Links without an href or
    # without the expected "('<name>'," shape are ignored instead of raising.
    selector.links.each do |l|
      href = l.href.to_s
      next if href.match(/search|by_magazin/)
      js_argument = href.match(/\(\'(.+)\',/)
      linknames << js_argument[1] if js_argument
    end

    puts "Getting mission_images!"
    missions.each do |name, folder_name|
      print "Searching for mission #{name}... "
      index_page = a.get(tableurl + folder_name)
      index_page.links.each do |link|
        mission_images[name] << link.text
      end
      # The trailing six links of the index page are not image links
      # (per the original author's note), so drop them.
      mission_images[name].pop(6)
      if mission_images[name].empty?
        # XXX: Nothing is found for Saturn V
        puts "nothing found!"
      else
        puts "found #{mission_images[name].size} images!"
      end
    end

    # XXX: Reset the agent, otherwise it seems to get confused by old content.
    a.reset

    # This is very slow: according to the original author the archive only
    # allows one request every two seconds, so the lookups cannot run ahead of
    # the downloads. Proxies and/or threads might help (see file TODO list).
    puts "Looking up deeplinks for downloading..."
    mission_images.each do |mission, image_table|
      print mission
      # One frame URL per image; ptr is the 1-based position in the table.
      resolutionlist = image_table.each_with_index.map do |image_name, index|
        "#{dlurl}#{index + 1}#{dlurl_post}#{image_name}"
      end
      puts " #{resolutionlist.size}"

      begin
        resolutionlist.each do |single_image_frame|
          # Fetch the frame page exactly once and inspect it locally; every
          # extra request counts against the server's rate limit.
          frame_page = a.get(single_image_frame)
          download_link = frame_page.links_with(:text => "Hi-Res").first ||
                          frame_page.links_with(:text => "Standard").first
          geturls[mission] << download_link.href.to_s if download_link
        end
      rescue SignalException
        # Interrupted (e.g. Ctrl-C): show what was collected so far, stop
        # looking up further missions and fall through to writing the files.
        puts geturls
        break
      end
    end

    missions.each_key do |mission|
      # Don't write a file when there is nothing to write into it
      next if geturls[mission].empty?
      target_dir = "links/#{mission}"
      begin
        # File.open does not create missing directories, so make sure the
        # per-mission directory exists first.
        FileUtils.mkdir_p(target_dir)
        File.open("#{target_dir}/urls.txt", "w") do |urlfile|
          puts "Writing links to file..."
          geturls[mission].each do |u|
            urlfile.write(u + "\n")
          end
        end
      rescue SystemCallError => e
        # Keep going with the remaining missions if one file cannot be written.
        puts "Unable to open file! (#{e.message})"
      end
    end
  end
end