Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change selection of top image #122

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ env/
._*
venv/
goose_extractor.egg-info/
.env
7 changes: 3 additions & 4 deletions goose/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,11 @@ def crawl(self, crawl_candidate):
# let's process it
if self.article.top_node is not None:

# video handeling
# video handling
self.video_extractor.get_videos()

# image handeling
if self.config.enable_image_fetching:
self.get_image()
# image handling
self.get_image()

# post cleanup
self.article.top_node = self.extractor.post_cleanup()
Expand Down
41 changes: 32 additions & 9 deletions goose/images/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,27 +81,37 @@ def __init__(self, config, article):
)

def get_best_image(self, doc, topNode):
    """\
    returns the single best Image for the article,
    trying extraction strategies from cheapest to most costly:
    known page elements, curated meta tags, then a content scan.

    NOTE(review): `doc` is unused here but kept for interface
    compatibility with existing callers; `topNode` is only consumed
    by the large-image content scan.
    """
    # first check for known occurrences
    image = self.check_known_elements()
    if image:
        return image

    # then check for curated tags
    image = self.check_meta_tag()
    if image:
        return image

    # then make best (and most costly) guess — the content scan may
    # fetch images over the network, so it is gated behind the
    # enable_image_fetching config flag
    if self.config.enable_image_fetching:
        image = self.check_large_images(topNode, 0, 0)
        if image:
            return image

    # nothing found: return an empty Image rather than None
    return Image()

def check_meta_tag(self):
# check link tag
image = self.check_link_tag()
# check opengraph tag
image = self.check_opengraph_tag()
if image:
return image

# check opengraph tag
image = self.check_opengraph_tag()
# check twitter card tag
image = self.check_twitter_card_tag()
if image:
return image

# check link tag
image = self.check_link_tag()
if image:
return image

Expand Down Expand Up @@ -213,7 +223,7 @@ def get_image(self, element, src, score=100, extraction_type="N/A"):
# check if we have a local image
# in order to add more information
# on the Image object
local_image = self.get_local_image(image.src)
local_image = self.get_local_image(image.src) if self.config.enable_image_fetching else None
if local_image:
image.bytes = local_image.bytes
image.height = local_image.height
Expand Down Expand Up @@ -337,6 +347,19 @@ def check_opengraph_tag(self):
return self.get_image(item, src, extraction_type='opengraph')
return None

def check_twitter_card_tag(self):
    """\
    looks for a twitter:image card meta tag in the raw
    document and builds an Image from its content attribute;
    returns None when no usable tag is found
    """
    candidates = self.parser.getElementsByTag(
        self.article.raw_doc,
        tag='meta',
        attr='property',
        value='twitter:image',
    )
    for candidate in candidates:
        content = self.parser.getAttribute(candidate, attr='content')
        if not content:
            # tag present but empty — keep scanning
            continue
        return self.get_image(candidate, content, extraction_type='twitter')
    return None

def get_local_image(self, src):
"""\
returns the bytes of the image file on disk
Expand Down
2 changes: 1 addition & 1 deletion goose/images/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self):
self.width = 0

# what kind of image extraction was used for this?
# bestGuess, linkTag, openGraph tags?
# bestGuess, linkTag, openGraph tags, twitter card?
self.extraction_type = "NA"

# stores how many bytes this image is.
Expand Down
2 changes: 1 addition & 1 deletion goose/images/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def clean_src_string(self, src):
def fetch(self, http_client, src):
try:
req = urllib2.Request(src)
f = urllib2.urlopen(req)
f = urllib2.urlopen(req, timeout=15)
data = f.read()
return data
except:
Expand Down