-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
109 lines (80 loc) · 2.81 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
from flask import Flask
from flask import render_template
from flask import request
from BeautifulSoup import BeautifulSoup
import Image
import requests
import cStringIO
import json
import re
import urllib
import random
import time
# The Flask application object; the route decorators below register views on it.
app = Flask(__name__)
# Google Images search URL template; the %s placeholder is filled with the
# search word.
GOOGLE_IMAGES_URL = 'https://www.google.com/search?tbm=isch&q=%s'
# Folder where the resized copies of the retrieved images are saved.
IMAGES_FOLDER = "static/retrieved_images"
# Text file the random search word is drawn from, one candidate per line.
WORD_LIST_FILE = "wordlist.txt"
@app.route('/')
def hello():
    """Serve the site root by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
@app.route('/getImageUrls')
def chooseWordAndGetImages():
    """Pick a random word, scrape Google Images for it and save thumbnails.

    Query parameters:
        numberOfImagesRequested: maximum number of thumbnails to produce
            (required, integer).

    Returns:
        A JSON string ``{"urls": [...], "word": "..."}`` where each URL is
        the path of a saved 256x256 image followed by ``?<timestamp>``.
        Fewer URLs than requested are returned when not enough candidate
        images could be downloaded, decoded and saved.
        If the query parameter is missing or not an integer, a JSON error
        body with status 400 is returned; if the word list holds no usable
        word, a JSON error body with status 500 is returned.
    """
    try:
        numberOfImagesRequested = int(
            request.args.get("numberOfImagesRequested"))
    except (TypeError, ValueError):
        # TypeError: parameter absent (int(None)); ValueError: not a number.
        error = {"error": "numberOfImagesRequested must be an integer"}
        return json.dumps(error), 400

    size = 256, 256

    # grab a random word from the list
    word = _choose_random_word()
    if word is None:
        return json.dumps({"error": "word list is empty"}), 500
    app.logger.debug("Random word is '%s'", word)

    output_urls = []
    # TODO: make all of the requests for images happen at the same time
    for i, url in enumerate(_scrape_image_urls(word)):
        # If we've collected enough images, stop looking for more
        if len(output_urls) >= numberOfImagesRequested:
            break
        filename = _save_thumbnail(i, url, size)
        if filename is None:
            continue
        # appending a timestamp to bust the browser's cache
        output_urls.append("%s?%d" % (filename, int(time.time() + 300)))

    # This is what we'll return to the client
    retval = {"urls": output_urls, "word": word}
    return json.dumps(retval)


# Extracts the original image URL from a Google Images result link.
_IMGURL_RE = re.compile('imgurl=(.*?)&')
# Extracts the image subtype (used as file extension) from a Content-Type.
_IMAGE_TYPE_RE = re.compile('image/([a-zA-Z]+)')


def _choose_random_word():
    """Return a random non-blank line of WORD_LIST_FILE, or None if none."""
    # The with-block guarantees the file handle is closed.
    with open(WORD_LIST_FILE) as word_list_file_handle:
        stripped = [line.strip() for line in word_list_file_handle]
    # Blank lines (e.g. a trailing empty line) would give an empty search.
    words = [candidate for candidate in stripped if candidate]
    if not words:
        return None
    return random.choice(words)


def _scrape_image_urls(word):
    """Return the image URLs linked from the Google Images page for word."""
    # construct url for google images; quote the word so spaces, '&' or '#'
    # in it cannot corrupt the query string
    images_url = GOOGLE_IMAGES_URL % urllib.quote_plus(word)

    # get the content of the page
    app.logger.debug("Requesting %s...", images_url)
    r = requests.get(images_url)

    # scrape page for links to images
    soup = BeautifulSoup(r.content)
    results = soup.findAll('a', href=re.compile('imgurl'))
    input_urls = []
    for result in results:
        matches = _IMGURL_RE.findall(str(result))
        # A link can mention 'imgurl' without the 'imgurl=...&' form the
        # pattern needs; skip it instead of failing on matches[0].
        if matches:
            input_urls.append(urllib.unquote(matches[0]))
    return input_urls


def _save_thumbnail(index, url, size):
    """Download url, resize it to size and save it; return the path or None.

    None is returned (and the image skipped) when the download fails, the
    response does not declare an image Content-Type, or the image cannot
    be decoded or written.
    """
    try:
        r = requests.get(url)
    except requests.exceptions.RequestException:
        # Covers connection errors, invalid URLs, too many redirects, ...
        return None

    # Grab the extension out of the Content-Type header; a missing header
    # is treated like a non-image one.
    found = _IMAGE_TYPE_RE.findall(r.headers.get("Content-Type", ""))
    if not found:
        return None
    extension = found[0]
    app.logger.debug(
        "Retrieved image %d (%s) from [%s]...", index, extension, url)

    filename = "%s/image%d.%s" % (IMAGES_FOLDER, index, extension)
    try:
        im = Image.open(cStringIO.StringIO(r.content))
        # Saving is inside the try as well: writing can fail for image
        # modes or extensions the imaging library cannot encode.
        im.resize(size).save(filename)
    except (IOError, KeyError, ValueError):
        app.logger.warning('Something failed. Continuing...')
        return None
    return filename
if __name__ == '__main__':
    # Bind to PORT if defined, otherwise default to 5000.
    port = int(os.environ.get('PORT', 5000))
    # The server listens on every interface (0.0.0.0). Flask's debug mode
    # turns on an interactive debugger that allows running arbitrary code
    # from the browser, so it must not be on unconditionally; enable it
    # explicitly for local development with FLASK_DEBUG=1.
    debug = os.environ.get('FLASK_DEBUG', '').lower() in (
        '1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=port, debug=debug)