Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change selection of top image #122

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ env/
._*
venv/
goose_extractor.egg-info/
.env
7 changes: 3 additions & 4 deletions goose/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,11 @@ def crawl(self, crawl_candidate):
# let's process it
if self.article.top_node is not None:

# video handeling
# video handling
self.video_extractor.get_videos()

# image handeling
if self.config.enable_image_fetching:
self.get_image()
# image handling
self.get_image()

# post cleanup
self.article.top_node = self.extractor.post_cleanup()
Expand Down
41 changes: 32 additions & 9 deletions goose/images/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,27 +81,37 @@ def __init__(self, config, article):
)

def get_best_image(self, doc, topNode):
    """\
    returns the single best Image for the article,
    trying extraction strategies from cheapest to most costly:
    known page elements, curated meta tags, then a content scan.

    NOTE(review): `doc` is unused here but kept for interface
    compatibility with existing callers; `topNode` is only consumed
    by the large-image content scan.
    """
    # first check for known occurrences
    image = self.check_known_elements()
    if image:
        return image

    # then check for curated tags
    image = self.check_meta_tag()
    if image:
        return image

    # then make best (and most costly) guess — the content scan may
    # fetch images over the network, so it is gated behind the
    # enable_image_fetching config flag
    if self.config.enable_image_fetching:
        image = self.check_large_images(topNode, 0, 0)
        if image:
            return image

    # nothing found: return an empty Image rather than None
    return Image()

def check_meta_tag(self):
# check link tag
image = self.check_link_tag()
# check opengraph tag
image = self.check_opengraph_tag()
if image:
return image

# check opengraph tag
image = self.check_opengraph_tag()
# check twitter card tag
image = self.check_twitter_card_tag()
if image:
return image

# check link tag
image = self.check_link_tag()
if image:
return image

Expand Down Expand Up @@ -213,7 +223,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"):
# check if we have a local image
# in order to add more information
# on the Image object
local_image = self.get_local_image(image.src)
local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None
if local_image:
image.bytes = local_image.bytes
image.height = local_image.height
Expand Down Expand Up @@ -337,6 +347,19 @@ def check_opengraph_tag(self):
return self.get_image(item, src, extraction_type='opengraph')
return None

def check_twitter_card_tag(self):
    """\
    looks for a twitter:image card meta tag in the raw
    document and builds an Image from its content attribute;
    returns None when no usable tag is found
    """
    candidates = self.parser.getElementsByTag(
        self.article.raw_doc,
        tag='meta',
        attr='property',
        value='twitter:image',
    )
    for candidate in candidates:
        content = self.parser.getAttribute(candidate, attr='content')
        if not content:
            # tag present but empty — keep scanning
            continue
        return self.get_image(candidate, content, extraction_type='twitter')
    return None

def get_local_image(self, src):
"""\
returns the bytes of the image file on disk
Expand Down
2 changes: 1 addition & 1 deletion goose/images/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self):
self.width = 0

# what kind of image extraction was used for this?
# bestGuess, linkTag, openGraph tags?
# bestGuess, linkTag, openGraph tags, twitter card?
self.extraction_type = "NA"

# stores how many bytes this image is.
Expand Down
2 changes: 1 addition & 1 deletion goose/images/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def clean_src_string(self, src):
def fetch(self, http_client, src):
try:
req = urllib2.Request(src)
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=15)
data = f.read()
return data
except:
Expand Down