From ea0583c38420d17cb5c35c17be6d6b6dc3a554f8 Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:22:08 +0100 Subject: [PATCH 1/5] try to find images using least costly methods save full image retrieval until last and make configurable (instead of all images on/off) use curated tags (opengraph, twitter card) where possible --- goose/crawler.py | 7 +++---- goose/images/extractors.py | 39 ++++++++++++++++++++++++++++++-------- goose/images/image.py | 2 +- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/goose/crawler.py b/goose/crawler.py index 211d410e..56995998 100644 --- a/goose/crawler.py +++ b/goose/crawler.py @@ -117,12 +117,11 @@ def crawl(self, crawl_candidate): # let's process it if self.article.top_node is not None: - # video handeling + # video handling self.video_extractor.get_videos() - # image handeling - if self.config.enable_image_fetching: - self.get_image() + # image handling + self.get_image() # post cleanup self.article.top_node = self.extractor.post_cleanup() diff --git a/goose/images/extractors.py b/goose/images/extractors.py index 08a207b0..12142549 100644 --- a/goose/images/extractors.py +++ b/goose/images/extractors.py @@ -81,27 +81,37 @@ def __init__(self, config, article): ) def get_best_image(self, doc, topNode): + # first check for known occurrences image = self.check_known_elements() if image: return image - image = self.check_large_images(topNode, 0, 0) - if image: - return image - + # then check for curated tags image = self.check_meta_tag() if image: return image + + # then make best (and most costly) guess + if self.config.enable_image_fetching: + image = self.check_large_images(topNode, 0, 0) + if image: + return image + return Image() def check_meta_tag(self): - # check link tag - image = self.check_link_tag() + # check opengraph tag + image = self.check_opengraph_tag() if image: return image - # check opengraph tag - image = self.check_opengraph_tag() + # check twitter card tag + image = self.check_twitter_card_tag() + if image: + return image + + # check link tag + image = self.check_link_tag() if image: return image @@ -337,6 +347,19 @@ def check_opengraph_tag(self): return self.get_image(item, src, extraction_type='opengraph') return None + def check_twitter_card_tag(self): + """\ + checks to see if we were able to + find twitter card tags on this page + """ + node = self.article.raw_doc + meta = self.parser.getElementsByTag(node, tag='meta', attr='property', value='twitter:image') + for item in meta: + src = self.parser.getAttribute(item, attr='content') + if src: + return self.get_image(item, src, extraction_type='twitter') + return None + def get_local_image(self, src): """\ returns the bytes of the image file on disk diff --git a/goose/images/image.py b/goose/images/image.py index 351e3396..f0da4037 100644 --- a/goose/images/image.py +++ b/goose/images/image.py @@ -42,7 +42,7 @@ def __init__(self): self.width = 0 # what kind of image extraction was used for this? - # bestGuess, linkTag, openGraph tags? + # bestGuess, linkTag, openGraph tags, twitter card? self.extraction_type = "NA" # stores how many bytes this image is. From 3fd05812af4c01dad9eb310c026901bf5f8d230b Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:24:13 +0100 Subject: [PATCH 2/5] ignore .env --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index bea68953..ca768ce2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ env/ ._* venv/ goose_extractor.egg-info/ +.env \ No newline at end of file From 2ac82364b5b68666bace4f09abf3b5d3bea7fd6e Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:26:45 +0100 Subject: [PATCH 3/5] added socket timeout to image retrieval handles 444 back from nginx a little better --- goose/images/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/images/utils.py b/goose/images/utils.py index 2767416f..2ff69d57 100644 --- a/goose/images/utils.py +++ b/goose/images/utils.py @@ -113,7 +113,7 @@ def clean_src_string(self, src): def fetch(self, http_client, src): try: req = urllib2.Request(src) - f = urllib2.urlopen(req) + f = urllib2.urlopen(req, timeout=30) data = f.read() return data except: From ad5d08727eade88e483dc33c13d3a7165085357e Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Fri, 18 Jul 2014 16:31:48 +0100 Subject: [PATCH 4/5] made timeout 15 seconds --- goose/images/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/images/utils.py b/goose/images/utils.py index 2ff69d57..aa9dba39 100644 --- a/goose/images/utils.py +++ b/goose/images/utils.py @@ -113,7 +113,7 @@ def clean_src_string(self, src): def fetch(self, http_client, src): try: req = urllib2.Request(src) - f = urllib2.urlopen(req, timeout=30) + f = urllib2.urlopen(req, timeout=15) data = f.read() return data except: From d18ed20d21c6a4dbee6ab4d91f32cdc68e242608 Mon Sep 17 00:00:00 2001 From: Matt Jackson Date: Thu, 24 Jul 2014 12:57:13 +0100 Subject: [PATCH 5/5] only check image dimensions if image fetching enabled --- goose/images/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/goose/images/extractors.py b/goose/images/extractors.py index 12142549..34e03c19 100644 --- a/goose/images/extractors.py +++ b/goose/images/extractors.py @@ -223,7 +223,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"): # check if we have a local image # in order to add more information # on the Image object - local_image = self.get_local_image(image.src) + local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None if local_image: image.bytes = local_image.bytes image.height = local_image.height