Cache entry content in db (#86)
* accept stdin html in extract_article.js

* add a separate scraping module

* move article extraction to scraping module

* remove local_links feature

* rename body to content_short

* rename in code

* add full content field

* store content in db after first fetch

* handle article extraction from html

* skip content from agenda

* add prefetch task

* add prefetch helper

* add content fields to search

* fix outdated field name

* fix outdated method name

* fix fetch content logic
facundoolano authored Jan 3, 2024
1 parent 86d9d09 commit d373d18
Showing 15 changed files with 291 additions and 179 deletions.
README.md: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ class LobstersParser(BaseParser):
     def is_compatible(_feed_url, feed_data):
         return 'lobste.rs' in feed_data['feed'].get('link', '')
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # A 'Comments' link is only present on external link submissions
         if 'Comments' in entry['summary']:
             url = self.parse_content_url(entry)
feedi/config/default.py: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 DELETE_OLD_CRON_HOURS = '*/12'
 
 SKIP_RECENTLY_UPDATED_MINUTES = 10
+CONTENT_PREFETCH_MINUTES = '*/15'
 RSS_SKIP_OLDER_THAN_DAYS = 7
 DELETE_AFTER_DAYS = 7
 RSS_MINIMUM_ENTRY_AMOUNT = 5
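The new CONTENT_PREFETCH_MINUTES setting is a cron-style minutes field for the periodic content prefetch added by this commit. The prefetch task itself is not part of the loaded diff; below is a minimal sketch of how it could tie the setting to the new Entry.fetch_content helper (see models.py further down), assuming a huey-style scheduler. The import paths, task name, and batch size are illustrative assumptions, not the actual feedi/tasks.py code.

# Hedged sketch only: decorator, import locations and query shape are assumptions.
from huey import crontab

from feedi.config import default as config
from feedi.models import db, Entry
from feedi.tasks import huey   # hypothetical: wherever the huey instance lives


@huey.periodic_task(crontab(minute=config.CONTENT_PREFETCH_MINUTES))
def prefetch_content():
    # Scrape and cache the full article for recent entries that still lack it,
    # so the reader view doesn't have to fetch on first open.
    entries = (db.session.query(Entry)
               .filter(Entry.content_url.isnot(None), Entry.content_full.is_(None))
               .order_by(Entry.sort_date.desc())
               .limit(50))
    for entry in entries:
        entry.fetch_content()  # stores content_full and commits (see models.py below)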
feedi/extract_article.js: 11 additions & 1 deletion
@@ -13,9 +13,19 @@ function parseAndPrint(dom) {
   process.stdout.write(JSON.stringify(article), process.exit);
 }
 
+async function read(stream) {
+  const chunks = [];
+  for await (const chunk of stream) chunks.push(chunk);
+  return Buffer.concat(chunks).toString('utf8');
+}
+
 const {values, positionals} = util.parseArgs({
   allowPositionals: true
 });
 const url = positionals[0];
 
-JSDOM.fromURL(url).then(parseAndPrint);
+if (url) {
+  JSDOM.fromURL(url).then(parseAndPrint);
+} else {
+  read(process.stdin).then(s => new JSDOM(s)).then(parseAndPrint);
+}
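With the new stdin branch, a caller that already downloaded the page (for example from the entry's content_url) can pipe the HTML in instead of letting JSDOM re-fetch the URL. A rough sketch of how the new scraping module might call the script from Python; the helper name and script path are assumptions for illustration.

import json
import subprocess


def extract_article(html):
    # Hypothetical helper: pipe already-fetched HTML into extract_article.js
    # (stdin mode above) and parse the Readability JSON it prints to stdout.
    result = subprocess.run(['node', 'feedi/extract_article.js'],
                            input=html.encode('utf-8'),
                            capture_output=True, check=True)
    return json.loads(result.stdout)


# usage sketch: article = extract_article(html); article['content']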
feedi/filters.py: 2 additions & 2 deletions
@@ -80,7 +80,7 @@ def sanitize_content(html, truncate=True):
 # to make the text hide on overflow
 @app.template_filter('entry_excerpt')
 def entry_excerpt(entry):
-    if not entry.body:
+    if not entry.content_short:
         return ''
 
     if entry.content_url and entry.title:
@@ -90,7 +90,7 @@ def entry_excerpt(entry):
     else:
         title = entry.feed.name
 
-    body_text = BeautifulSoup(entry.body, 'lxml').text
+    body_text = BeautifulSoup(entry.content_short, 'lxml').text
 
     # truncate according to display title length so all entries
     # have aproximately the same length
feedi/models.py: 16 additions & 4 deletions
@@ -13,7 +13,7 @@
 from sqlalchemy.ext.hybrid import hybrid_property
 
 import feedi.parsers as parsers
-from feedi.requests import get_favicon
+from feedi import scraping
 
 # TODO consider adding explicit support for url columns
 
@@ -303,7 +303,7 @@ def fetch_entry_data(self, _force=False):
 
     def load_icon(self):
         ""
-        self.icon_url = get_favicon(self.url)
+        self.icon_url = scraping.get_favicon(self.url)
 
     @classmethod
     def frequency_rank_query(cls):
@@ -471,9 +471,12 @@ class Entry(db.Model):
     avatar_url = sa.Column(
         sa.String, doc="The url of the avatar image to be displayed for the entry.")
 
-    body = sa.Column(sa.String, doc="The content to be displayed in the feed preview. HTML is supported. \
+    content_short = sa.Column(sa.String, doc="The content to be displayed in the feed preview. HTML is supported. \
         For article entries, it would be an excerpt of the full article content.")
 
+    content_full = sa.orm.deferred(sa.Column(
+        sa.String, doc="The content to be displayed in the reader, e.g. the cleaned full article HTML."))
+
     target_url = sa.Column(
         sa.String, doc="The URL to open when accessing the entry at its source. \
         NULL is interpreted as the entry cannot be open at the source.")
@@ -541,6 +544,14 @@ def has_distinct_user(self):
         """
         return self.avatar_url and (self.display_name or self.username)
 
+    def fetch_content(self):
+        if self.content_url and not self.content_full:
+            try:
+                self.content_full = scraping.extract(self.content_url)['content']
+                db.session.commit()
+            except Exception:
+                pass
+
     @classmethod
     def _filtered_query(cls, user_id, hide_seen=False, favorited=None,
                         feed_name=None, username=None, folder=None,
@@ -577,7 +588,8 @@ def _filtered_query(cls, user_id, hide_seen=False, favorited=None,
             # Poor Text Search™
             query = query.filter(cls.title.contains(text) |
                                  cls.username.contains(text) |
-                                 cls.body.contains(text))
+                                 cls.content_short.contains(text) |
+                                 cls.content_full.contains(text))
 
         return query
 
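Entry.fetch_content above scrapes content_url at most once, stores the result in the new deferred content_full column, and commits, so later reads come from the database instead of the network. A hedged sketch of how a reader view could rely on that cache-on-first-read behavior; the route, import locations, and template name are invented for the example.

# Hedged sketch: route, imports and fallback logic are illustrative only.
from flask import abort, render_template

from feedi.app import app            # hypothetical import location
from feedi.models import db, Entry


@app.route('/entries/<int:entry_id>')
def entry_reader(entry_id):
    entry = db.session.get(Entry, entry_id)
    if entry is None:
        abort(404)
    # First visit scrapes and caches content_full; later visits are a plain column read.
    entry.fetch_content()
    return render_template('reader.html',
                           content=entry.content_full or entry.content_short)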
feedi/parsers/custom.py: 6 additions & 6 deletions
@@ -3,7 +3,8 @@
 
 import dateparser
 from bs4 import BeautifulSoup
-from feedi.requests import CachingRequestsMixin, requests
+from feedi.requests import requests
+from feedi.scraping import CachingRequestsMixin
 
 
 def fetch(feed_name, url):
@@ -54,9 +55,8 @@ def fetch(self):
                 'username': item['additions'].split(';')[0].split('Por ')[-1],
                 'display_date': created,
                 'sort_date': created,
-                'body': item['synopsis'],
+                'content_short': item['synopsis'],
                 'media_url': item['image']['url'],
-                'content_url': content_url,
                 'target_url': content_url,
                 'raw_data': json.dumps(item)
             })
@@ -87,7 +87,7 @@ def fetch(self):
                 'username': item['editor'],
                 'display_date': datetime.datetime.fromisoformat(item['dateCreated']),
                 'sort_date': datetime.datetime.fromisoformat(item['dateModified']),
-                'body': item['description'],
+                'content_short': item['description'],
                 'media_url': item['image'],
                 'target_url': item['url'],
                 # this website does very funky things with the html that can't be parsed in the reader.
@@ -121,7 +121,7 @@ def fetch(self):
                 'username': author,
                 'display_date': date,
                 'sort_date': date,
-                'body': article.find(class_='newsSummary').text,
+                'content_short': article.find(class_='newsSummary').text,
                 'media_url': article.find('img')['src'],
                 'target_url': content_url,
                 'content_url': content_url,
@@ -158,7 +158,7 @@ def fetch(self):
                 'username': article['byline'],
                 'display_date': pub_date,
                 'sort_date': pub_date,
-                'body': self.fetch_meta(article_url, 'og:description', 'description'),
+                'content_short': self.fetch_meta(article_url, 'og:description', 'description'),
                 'media_url': self.fetch_meta(article_url, 'og:image', 'twitter:image'),
                 'content_url': article_url,
             })
feedi/parsers/mastodon.py: 5 additions & 5 deletions
@@ -76,7 +76,7 @@ def fetch_toots(server_url, access_token, newer_than=None, limit=None):
         entry['avatar_url'] = toot['account']['avatar']
         entry['username'] = toot['account']['acct']
         entry['display_name'] = display_name(toot)
-        entry['body'] = toot['content']
+        entry['content_short'] = toot['content']
         entry['remote_id'] = toot['id']
         entry['display_date'] = toot['created_at']
 
@@ -97,10 +97,10 @@ def fetch_toots(server_url, access_token, newer_than=None, limit=None):
 
         # show (read-only) poll options
         if toot.get('poll'):
-            entry['body'] += '<ul>'
+            entry['content_short'] += '<ul>'
             for option in toot['poll']['options']:
-                entry['body'] += f'<li>{option["title"]}</li>'
-            entry['body'] += '</ul>'
+                entry['content_short'] += f'<li>{option["title"]}</li>'
+            entry['content_short'] += '</ul>'
 
         entries.append(entry)
 
@@ -138,7 +138,7 @@ def fetch_notifications(server_url, access_token, newer_than=None, limit=None):
                  'avatar_url': notification['account']['avatar'],
                  'username': notification['account']['acct'],
                  'display_name': display_name(notification),
-                 'body': body}
+                 'content_short': body}
 
     # NOTE: we could attempt to render the source toot in the body as the mastodon web ui does,
    # but I'm guessing that more often than not that would result in useless messages spamming the feed.
feedi/parsers/rss.py: 19 additions & 15 deletions
@@ -9,8 +9,8 @@
 
 import feedparser
 from bs4 import BeautifulSoup
-from feedi.requests import (USER_AGENT, CachingRequestsMixin, extract_meta,
-                            get_favicon, requests)
+from feedi.requests import USER_AGENT, requests
+from feedi.scraping import CachingRequestsMixin, extract_meta, get_favicon
 
 logger = logging.getLogger(__name__)
 
@@ -55,8 +55,8 @@ class RSSParser(CachingRequestsMixin):
     for custom feed presentation.
     """
 
-    FIELDS = ['title', 'avatar_url', 'username', 'body', 'media_url', 'remote_id',
-              'display_date', 'sort_date', 'comments_url', 'target_url', 'content_url', 'header']
+    FIELDS = ['title', 'avatar_url', 'username', 'content_short', 'content_full', 'media_url', 'remote_id',
+              'display_date', 'sort_date', 'comments_url', 'target_url', 'content_url', 'header',]
 
     @staticmethod
     def is_compatible(_feed_url):
@@ -197,7 +197,7 @@ def parse_avatar_url(self, entry):
             logger.debug('found entry-level avatar %s', url)
             return url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         summary = entry.get('summary')
         if summary:
             summary = html.unescape(summary)
@@ -218,6 +218,10 @@ def parse_body(self, entry):
         # on the view side if necessary (so it applies regardless of the parser implementation)
         return str(soup)
 
+    def parse_content_full(self, _entry):
+        # by default skip the full content parsing since it's too expensive to do on every article
+        return None
+
     def parse_media_url(self, entry):
         # first try to get it in standard feed fields
         if 'media_thumbnail' in entry:
@@ -346,7 +350,7 @@ class RedditInboxParser(RSSParser):
     def is_compatible(feed_url):
         return 'reddit.com/message' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         return entry['content'][0]['value']
 
     def parse_title(self, entry):
@@ -361,7 +365,7 @@ def is_compatible(feed_url):
         # looks like reddit but not like the inbox feed
         return 'reddit.com' in feed_url and 'reddit.com/message' not in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         soup = BeautifulSoup(entry['summary'], 'lxml')
         link_anchor = soup.find("a", string="[link]")
         comments_anchor = soup.find("a", string="[comments]")
@@ -399,7 +403,7 @@ class LobstersParser(RSSParser):
     def is_compatible(feed_url):
         return 'lobste.rs' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # fill summary from source for link-only posts
         if 'Comments' in entry['summary']:
             url = self.parse_content_url(entry)
@@ -416,7 +420,7 @@ class HackerNewsParser(RSSParser):
     def is_compatible(feed_url):
         return 'news.ycombinator.com' in feed_url or 'hnrss.org' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # fill summary from source for link-only posts
         if 'Article URL' in entry['summary']:
             url = self.parse_content_url(entry)
@@ -432,7 +436,7 @@ class GithubFeedParser(RSSParser):
     def is_compatible(feed_url):
         return 'github.com' in feed_url and 'private.atom' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         return entry['title']
 
     def parse_username(self, entry):
@@ -464,7 +468,7 @@ class GoodreadsFeedParser(RSSParser):
     def is_compatible(feed_url):
         return 'goodreads.com' in feed_url and '/home/index_rss' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         # some updates come with escaped html entities
         summary = html.unescape(entry['summary'])
         soup = BeautifulSoup(summary, 'lxml')
@@ -502,7 +506,7 @@ def is_compatible(feed_url):
     def should_skip(entry):
         return 'publi' in entry['title'] or entry['title'].lower().startswith('crisis en el aire')
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         return self.fetch_meta(entry['link'], 'og:description', 'description')
 
 
@@ -511,7 +515,7 @@ class ACMQueueParser(RSSParser):
     def is_compatible(feed_url):
         return 'queue.acm.org' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         content = self.request(entry['link'])
         soup = BeautifulSoup(content, 'lxml')
         title = soup.find('h1')
@@ -531,7 +535,7 @@ class WikiFeaturedParser(RSSParser):
     def is_compatible(feed_url):
         return 'wikipedia.org' in feed_url and 'featuredfeed' in feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         soup = BeautifulSoup(entry['summary'], 'lxml')
         return str(soup.find('p'))
 
@@ -545,7 +549,7 @@ class IndieBlogParser(RSSParser):
     def is_compatible(_feed_url):
         return 'indieblog.page' in _feed_url
 
-    def parse_body(self, entry):
+    def parse_content_short(self, entry):
         soup = BeautifulSoup(entry['summary'], 'lxml')
         body = soup.blockquote
         body.name = 'p'
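parse_content_full returns None in the base RSSParser (see the hunk above), so full-content parsing stays opt-in per parser. A hypothetical override for a feed that already ships the complete post inline; the class name and URL check are made up for the example, and it reuses the same entry['content'] shape the RedditInboxParser reads above.

class FullContentBlogParser(RSSParser):
    # Hypothetical parser, not part of this commit: opt in to full content
    # when the feed already includes the complete post body, avoiding a scrape.

    @staticmethod
    def is_compatible(feed_url):
        return 'blog.example.com' in feed_url

    def parse_content_full(self, entry):
        if entry.get('content'):
            return entry['content'][0]['value']
        return None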
[Diff listing truncated; the remaining changed files were not loaded in this view.]
