Cache entry content in db (#86)
* accept stdin html in extract_article.js

* add a separate scraping module

* move article extraction to scraping module

* remove local_links feature

* rename body to content_short

* rename in code

* add full content field

* store content in db after first fetch

* handle article extraction from html

* skip content from agenda

* add prefetch task

* add prefetch helper

* add content fields to search

* fix outdated field name

* fix outdated method name

* fix fetch content logic
facundoolano authored Jan 3, 2024
1 parent 86d9d09 commit d373d18
Showing 15 changed files with 291 additions and 179 deletions.
README.md: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ class LobstersParser(BaseParser):
     def is_compatible(_feed_url, feed_data):
         return 'lobste.rs' in feed_data['feed'].get('link', '')
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # A 'Comments' link is only present on external link submissions
         if 'Comments' in entry['summary']:
             url = self.parse_content_url(entry)
feedi/config/default.py: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 DELETE_OLD_CRON_HOURS = '*/12'
 
 SKIP_RECENTLY_UPDATED_MINUTES = 10
+CONTENT_PREFETCH_MINUTES = '*/15'
 RSS_SKIP_OLDER_THAN_DAYS = 7
 DELETE_AFTER_DAYS = 7
 RSS_MINIMUM_ENTRY_AMOUNT = 5
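The new CONTENT_PREFETCH_MINUTES setting is a cron-style minutes field for the periodic content prefetch added by this commit. The prefetch task itself is not part of the loaded diff; below is a minimal sketch of how it could tie the setting to the new Entry.fetch_content helper (see models.py further down), assuming a huey-style scheduler. The import paths, task name, and batch size are illustrative assumptions, not the actual feedi/tasks.py code.

# Hedged sketch only: decorator, import locations and query shape are assumptions.
from huey import crontab

from feedi.config import default as config
from feedi.models import db, Entry
from feedi.tasks import huey   # hypothetical: wherever the huey instance lives


@huey.periodic_task(crontab(minute=config.CONTENT_PREFETCH_MINUTES))
def prefetch_content():
    # Scrape and cache the full article for recent entries that still lack it,
    # so the reader view doesn't have to fetch on first open.
    entries = (db.session.query(Entry)
               .filter(Entry.content_url.isnot(None), Entry.content_full.is_(None))
               .order_by(Entry.sort_date.desc())
               .limit(50))
    for entry in entries:
        entry.fetch_content()  # stores content_full and commits (see models.py below)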
feedi/extract_article.js: 11 additions & 1 deletion
@@ -13,9 +13,19 @@ function parseAndPrint(dom) {
   process.stdout.write(JSON.stringify(article), process.exit);
 }
 
+async function read(stream) {
+  const chunks = [];
+  for await (const chunk of stream) chunks.push(chunk);
+  return Buffer.concat(chunks).toString('utf8');
+}
+
 const {values, positionals} = util.parseArgs({
   allowPositionals: true
 });
 const url = positionals[0];
 
-JSDOM.fromURL(url).then(parseAndPrint);
+if (url) {
+  JSDOM.fromURL(url).then(parseAndPrint);
+} else {
+  read(process.stdin).then(s => new JSDOM(s)).then(parseAndPrint);
+}
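With the new stdin branch, a caller that already downloaded the page (for example from the entry's content_url) can pipe the HTML in instead of letting JSDOM re-fetch the URL. A rough sketch of how the new scraping module might call the script from Python; the helper name and script path are assumptions for illustration.

import json
import subprocess


def extract_article(html):
    # Hypothetical helper: pipe already-fetched HTML into extract_article.js
    # (stdin mode above) and parse the Readability JSON it prints to stdout.
    result = subprocess.run(['node', 'feedi/extract_article.js'],
                            input=html.encode('utf-8'),
                            capture_output=True, check=True)
    return json.loads(result.stdout)


# usage sketch: article = extract_article(html); article['content']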
feedi/filters.py: 2 additions & 2 deletions
@@ -80,7 +80,7 @@ def sanitize_content(html, truncate=True):
 # to make the text hide on overflow
 @app.template_filter('entry_excerpt')
 def entry_excerpt(entry):
-    if not entry.body:
+    if not entry.content_short:
         return ''
 
     if entry.content_url and entry.title:
@@ -90,7 +90,7 @@ def entry_excerpt(entry):
     else:
         title = entry.feed.name
 
-    body_text = BeautifulSoup(entry.body, 'lxml').text
+    body_text = BeautifulSoup(entry.content_short, 'lxml').text
 
     # truncate according to display title length so all entries
     # have aproximately the same length
feedi/models.py: 16 additions & 4 deletions
@@ -13,7 +13,7 @@
 from sqlalchemy.ext.hybrid import hybrid_property
 
 import feedi.parsers as parsers
-from feedi.requests import get_favicon
+from feedi import scraping
 
 # TODO consider adding explicit support for url columns
 
@@ -303,7 +303,7 @@ def fetch_entry_data(self, _force=False):
 
     def load_icon(self):
         ""
-        self.icon_url = get_favicon(self.url)
+        self.icon_url = scraping.get_favicon(self.url)
 
     @classmethod
     def frequency_rank_query(cls):
@@ -471,9 +471,12 @@ class Entry(db.Model):
     avatar_url = sa.Column(
         sa.String, doc="The url of the avatar image to be displayed for the entry.")
 
-    body = sa.Column(sa.String, doc="The content to be displayed in the feed preview. HTML is supported. \
+    content_short = sa.Column(sa.String, doc="The content to be displayed in the feed preview. HTML is supported. \
         For article entries, it would be an excerpt of the full article content.")
 
+    content_full = sa.orm.deferred(sa.Column(
+        sa.String, doc="The content to be displayed in the reader, e.g. the cleaned full article HTML."))
+
     target_url = sa.Column(
         sa.String, doc="The URL to open when accessing the entry at its source. \
         NULL is interpreted as the entry cannot be open at the source.")
@@ -541,6 +544,14 @@ def has_distinct_user(self):
         """
         return self.avatar_url and (self.display_name or self.username)
 
+    def fetch_content(self):
+        if self.content_url and not self.content_full:
+            try:
+                self.content_full = scraping.extract(self.content_url)['content']
+                db.session.commit()
+            except Exception:
+                pass
+
     @classmethod
     def _filtered_query(cls, user_id, hide_seen=False, favorited=None,
                         feed_name=None, username=None, folder=None,
@@ -577,7 +588,8 @@ def _filtered_query(cls, user_id, hide_seen=False, favorited=None,
             # Poor Text Search™
             query = query.filter(cls.title.contains(text) |
                                  cls.username.contains(text) |
-                                 cls.body.contains(text))
+                                 cls.content_short.contains(text) |
+                                 cls.content_full.contains(text))
 
         return query
 
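Entry.fetch_content above scrapes content_url at most once, stores the result in the new deferred content_full column, and commits, so later reads come from the database instead of the network. A hedged sketch of how a reader view could rely on that cache-on-first-read behavior; the route, import locations, and template name are invented for the example.

# Hedged sketch: route, imports and fallback logic are illustrative only.
from flask import abort, render_template

from feedi.app import app            # hypothetical import location
from feedi.models import db, Entry


@app.route('/entries/<int:entry_id>')
def entry_reader(entry_id):
    entry = db.session.get(Entry, entry_id)
    if entry is None:
        abort(404)
    # First visit scrapes and caches content_full; later visits are a plain column read.
    entry.fetch_content()
    return render_template('reader.html',
                           content=entry.content_full or entry.content_short)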
feedi/parsers/custom.py: 6 additions & 6 deletions
@@ -3,7 +3,8 @@
 
 import dateparser
 from bs4 import BeautifulSoup
-from feedi.requests import CachingRequestsMixin, requests
+from feedi.requests import requests
+from feedi.scraping import CachingRequestsMixin
 
 
 def fetch(feed_name, url):
@@ -54,9 +55,8 @@ def fetch(self):
                 'username': item['additions'].split(';')[0].split('Por ')[-1],
                 'display_date': created,
                 'sort_date': created,
-                'body': item['synopsis'],
+                'content_short': item['synopsis'],
                 'media_url': item['image']['url'],
-                'content_url': content_url,
                 'target_url': content_url,
                 'raw_data': json.dumps(item)
             })
@@ -87,7 +87,7 @@ def fetch(self):
                 'username': item['editor'],
                 'display_date': datetime.datetime.fromisoformat(item['dateCreated']),
                 'sort_date': datetime.datetime.fromisoformat(item['dateModified']),
-                'body': item['description'],
+                'content_short': item['description'],
                 'media_url': item['image'],
                 'target_url': item['url'],
                 # this website does very funky things with the html that can't be parsed in the reader.
@@ -121,7 +121,7 @@ def fetch(self):
                 'username': author,
                 'display_date': date,
                 'sort_date': date,
-                'body': article.find(class_='newsSummary').text,
+                'content_short': article.find(class_='newsSummary').text,
                 'media_url': article.find('img')['src'],
                 'target_url': content_url,
                 'content_url': content_url,
@@ -158,7 +158,7 @@ def fetch(self):
                 'username': article['byline'],
                 'display_date': pub_date,
                 'sort_date': pub_date,
-                'body': self.fetch_meta(article_url, 'og:description', 'description'),
+                'content_short': self.fetch_meta(article_url, 'og:description', 'description'),
                 'media_url': self.fetch_meta(article_url, 'og:image', 'twitter:image'),
                 'content_url': article_url,
             })
feedi/parsers/mastodon.py: 5 additions & 5 deletions
@@ -76,7 +76,7 @@ def fetch_toots(server_url, access_token, newer_than=None, limit=None):
         entry['avatar_url'] = toot['account']['avatar']
         entry['username'] = toot['account']['acct']
         entry['display_name'] = display_name(toot)
-        entry['body'] = toot['content']
+        entry['content_short'] = toot['content']
         entry['remote_id'] = toot['id']
         entry['display_date'] = toot['created_at']
 
@@ -97,10 +97,10 @@ def fetch_toots(server_url, access_token, newer_than=None, limit=None):
 
         # show (read-only) poll options
         if toot.get('poll'):
-            entry['body'] += '<ul>'
+            entry['content_short'] += '<ul>'
             for option in toot['poll']['options']:
-                entry['body'] += f'<li>{option["title"]}</li>'
-            entry['body'] += '</ul>'
+                entry['content_short'] += f'<li>{option["title"]}</li>'
+            entry['content_short'] += '</ul>'
 
         entries.append(entry)
 
@@ -138,7 +138,7 @@ def fetch_notifications(server_url, access_token, newer_than=None, limit=None):
                  'avatar_url': notification['account']['avatar'],
                  'username': notification['account']['acct'],
                  'display_name': display_name(notification),
-                 'body': body}
+                 'content_short': body}
 
     # NOTE: we could attempt to render the source toot in the body as the mastodon web ui does,
    # but I'm guessing that more often than not that would result in useless messages spamming the feed.
feedi/parsers/rss.py: 19 additions & 15 deletions
@@ -9,8 +9,8 @@
 
 import feedparser
 from bs4 import BeautifulSoup
-from feedi.requests import (USER_AGENT, CachingRequestsMixin, extract_meta,
-                            get_favicon, requests)
+from feedi.requests import USER_AGENT, requests
+from feedi.scraping import CachingRequestsMixin, extract_meta, get_favicon
 
 logger = logging.getLogger(__name__)
 
@@ -55,8 +55,8 @@ class RSSParser(CachingRequestsMixin):
     for custom feed presentation.
     """
 
-    FIELDS = ['title', 'avatar_url', 'username', 'body', 'media_url', 'remote_id',
-              'display_date', 'sort_date', 'comments_url', 'target_url', 'content_url', 'header']
+    FIELDS = ['title', 'avatar_url', 'username', 'content_short', 'content_full', 'media_url', 'remote_id',
+              'display_date', 'sort_date', 'comments_url', 'target_url', 'content_url', 'header',]
 
     @staticmethod
     def is_compatible(_feed_url):
@@ -197,7 +197,7 @@ def parse_avatar_url(self, entry):
             logger.debug('found entry-level avatar %s', url)
             return url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         summary = entry.get('summary')
         if summary:
             summary = html.unescape(summary)
@@ -218,6 +218,10 @@ def parse_body(self, entry):
         # on the view side if necessary (so it applies regardless of the parser implementation)
         return str(soup)
 
+    def parse_content_full(self, _entry):
+        # by default skip the full content parsing since it's too expensive to do on every article
+        return None
+
     def parse_media_url(self, entry):
         # first try to get it in standard feed fields
         if 'media_thumbnail' in entry:
@@ -346,7 +350,7 @@ class RedditInboxParser(RSSParser):
     def is_compatible(feed_url):
         return 'reddit.com/message' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         return entry['content'][0]['value']
 
     def parse_title(self, entry):
@@ -361,7 +365,7 @@ def is_compatible(feed_url):
         # looks like reddit but not like the inbox feed
         return 'reddit.com' in feed_url and 'reddit.com/message' not in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         soup = BeautifulSoup(entry['summary'], 'lxml')
         link_anchor = soup.find("a", string="[link]")
         comments_anchor = soup.find("a", string="[comments]")
@@ -399,7 +403,7 @@ class LobstersParser(RSSParser):
     def is_compatible(feed_url):
         return 'lobste.rs' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # fill summary from source for link-only posts
         if 'Comments' in entry['summary']:
             url = self.parse_content_url(entry)
@@ -416,7 +420,7 @@ class HackerNewsParser(RSSParser):
     def is_compatible(feed_url):
         return 'news.ycombinator.com' in feed_url or 'hnrss.org' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # fill summary from source for link-only posts
         if 'Article URL' in entry['summary']:
             url = self.parse_content_url(entry)
@@ -432,7 +436,7 @@ class GithubFeedParser(RSSParser):
     def is_compatible(feed_url):
         return 'github.com' in feed_url and 'private.atom' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         return entry['title']
 
     def parse_username(self, entry):
@@ -464,7 +468,7 @@ class GoodreadsFeedParser(RSSParser):
     def is_compatible(feed_url):
         return 'goodreads.com' in feed_url and '/home/index_rss' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # some updates come with escaped html entities
         summary = html.unescape(entry['summary'])
         soup = BeautifulSoup(summary, 'lxml')
@@ -502,7 +506,7 @@ def is_compatible(feed_url):
     def should_skip(entry):
         return 'publi' in entry['title'] or entry['title'].lower().startswith('crisis en el aire')
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         return self.fetch_meta(entry['link'], 'og:description', 'description')
 
 
@@ -511,7 +515,7 @@ class ACMQueueParser(RSSParser):
     def is_compatible(feed_url):
         return 'queue.acm.org' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         content = self.request(entry['link'])
         soup = BeautifulSoup(content, 'lxml')
         title = soup.find('h1')
@@ -531,7 +535,7 @@ class WikiFeaturedParser(RSSParser):
     def is_compatible(feed_url):
         return 'wikipedia.org' in feed_url and 'featuredfeed' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         soup = BeautifulSoup(entry['summary'], 'lxml')
         return str(soup.find('p'))
 
@@ -545,7 +549,7 @@ class IndieBlogParser(RSSParser):
     def is_compatible(_feed_url):
         return 'indieblog.page' in _feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         soup = BeautifulSoup(entry['summary'], 'lxml')
         body = soup.blockquote
         body.name = 'p'
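parse_content_full returns None in the base RSSParser (see the hunk above), so full-content parsing stays opt-in per parser. A hypothetical override for a feed that already ships the complete post inline; the class name and URL check are made up for the example, and it reuses the same entry['content'] shape the RedditInboxParser reads above.

class FullContentBlogParser(RSSParser):
    # Hypothetical parser, not part of this commit: opt in to full content
    # when the feed already includes the complete post body, avoiding a scrape.

    @staticmethod
    def is_compatible(feed_url):
        return 'blog.example.com' in feed_url

    def parse_content_full(self, entry):
        if entry.get('content'):
            return entry['content'][0]['value']
        return None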
[Diff listing truncated; the remaining changed files were not loaded in this view.]
