Test view entry (#88)
* update test config

* view content test and updates to subprocess calling

* update test target to support running a single test

* datafile helper

* debug

* Revert "debug"

This reverts commit e977784.

* Revert "datafile helper"

This reverts commit 0783bb4.

* maybe https?

* debug

* add node
facundoolano authored Jan 4, 2024
1 parent c90d08f commit 51258a7
Showing 8 changed files with 102 additions and 15 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/python-app.yml
@@ -23,8 +23,12 @@ jobs:
       uses: actions/setup-python@v3
       with:
         python-version: "3.9.18"
+    - name: Use Node.js
+      uses: actions/setup-node@v3
+      with:
+        node-version: '20.x'
     - name: Install dependencies
-      run: make deps-dev
+      run: make deps-dev node_modules
     - name: Lint with flake8
       run: make lint
     - name: Test with pytest
4 changes: 3 additions & 1 deletion Makefile
@@ -19,8 +19,10 @@ deps-dev: deps
 node_modules:
 	npm install || true
 
+# make test
+# make test TEST=test_feed_ad
 test:
-	$(venv) FLASK_ENV=testing pytest --disable-warnings
+	$(venv) FLASK_ENV=testing pytest --disable-warnings -v $(if $(TEST),-k $(TEST))
 
 lint:
 	$(venv) flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude venv,migrations
6 changes: 3 additions & 3 deletions feedi/extract_article.js
@@ -1,7 +1,7 @@
 #!/usr/bin/env node
-// dumb node.js script that fetches and parses the HTML from the given urls
-// and passes it to the readability package to clean it up
-// The clean HTML is printed to stdout
+// node.js script that parses the HTML from the given urls and passes it to the readability package
+// to clean it up. If no url is passed, the HTML document is expected from stdin.
+// A JSON document is printed to stdout with some metadata and the cleaned up HTML in the 'content' field
 
 const { JSDOM } = require("jsdom");
 const { Readability } = require('@mozilla/readability');
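Not part of the commit, but for context: a minimal Python sketch of driving the script over stdin, mirroring the subprocess call that feedi/scraping.py makes below. The fixture path is just an illustration; any HTML document works.

import json
import subprocess

# read an already-fetched HTML document (here, the test fixture added in this commit)
with open("tests/sample.html", "rb") as f:
    html = f.read()

# pipe it to the node script over stdin; it prints a JSON document to stdout
result = subprocess.run(["feedi/extract_article.js"], input=html,
                        capture_output=True, check=True)

article = json.loads(result.stdout)
cleaned_html = article["content"]  # cleaned-up article HTML, per the comment above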
8 changes: 6 additions & 2 deletions feedi/models.py
@@ -2,6 +2,7 @@
 
 import datetime
 import json
+import logging
 import urllib
 
 import sqlalchemy as sa
@@ -20,6 +21,9 @@
 db = SQLAlchemy()
 
 
+logger = logging.getLogger(__name__)
+
+
 def init_db(app):
     db.init_app(app)
 
@@ -548,8 +552,8 @@ def fetch_content(self):
         if self.content_url and not self.content_full:
             try:
                 self.content_full = scraping.extract(self.content_url)['content']
-            except Exception:
-                pass
+            except Exception as e:
+                logger.debug("failed to fetch content %s", e)
 
     @classmethod
     def _filtered_query(cls, user_id, hide_seen=False, favorited=None,
13 changes: 6 additions & 7 deletions feedi/scraping.py
@@ -78,15 +78,14 @@ def extract(url=None, html=None):
     # article content than all the python libraries I've tried... even than the readabilipy
     # one, which is a wrapper of it. so resorting to running a node.js script on a subprocess
     # for parsing the article. sadly this adds a dependency on node and a few npm packages
-    if html:
-        r = subprocess.run(["feedi/extract_article.js"], input=html,
-                           capture_output=True, check=True)
-    elif url:
-        r = subprocess.run(["feedi/extract_article.js", url],
-                           capture_output=True, text=True, check=True)
-    else:
+    if url:
+        html = requests.get(url).content
+    elif not html:
         raise ValueError('Expected either url or html')
+
+    r = subprocess.run(["feedi/extract_article.js"], input=html,
+                       capture_output=True, check=True)
 
     article = json.loads(r.stdout)
 
     # load lazy images by putting the data-src into src and stripping other attrs
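A hedged sketch (not in the diff) of how the refactored extract() can now be called either way, per the url=None, html=None signature shown in the hunk header. It assumes html is passed as raw bytes (as requests' response.content would be), that the import path follows the repo layout, and the example URL is purely illustrative.

from feedi import scraping

# let extract() fetch the page itself with requests
article = scraping.extract(url='https://example.com/post')

# or hand it pre-fetched HTML (bytes) and skip the network round trip
with open('tests/sample.html', 'rb') as f:
    article = scraping.extract(html=f.read())

print(article['content'][:200])  # cleaned-up article body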
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -81,7 +81,7 @@ def mock_feed(domain, items):
         entry.updated(item['date'])
         entry.description(item.get('description', 'default description'))
 
-        mock_request(entry_url, body='<p>content!</p>')
+        mock_request(entry_url, body=item.get('body', '<p>content!</p>'))
 
     rssfeed = fg.rss_str()
     mock_request(base_url)
53 changes: 53 additions & 0 deletions tests/sample.html
@@ -0,0 +1,53 @@
<!DOCTYPE html>
<html>
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta charset="utf-8">
<title>Reclaiming the Web with a Personal Reader</title>
<link rel="stylesheet" href="/assets/css/main.css">
<link type="application/atom+xml" rel="alternate" href="https://olano.dev/feed.xml" title="olano.dev" />
<meta name="author" content="Facundo Olano">
<meta property="og:article:author" content="Facundo Olano">
<meta property="og:site_name" content="olano.dev">
<meta property="og:title" content="Reclaiming the Web with a Personal Reader">
<meta name="twitter:title" content="Reclaiming the Web with a Personal Reader">
<meta name="description" content="There’s a kind of zen flow that programmers unblock when they experience their software daily as an end user. There’s no better catalyst for ideas and experimentation, no better prioritization driver than having to face the bugs, annoyances, and limitations of an application first-hand.">
<meta name="og:description" content="There’s a kind of zen flow that programmers unblock when they experience their software daily as an end user. There’s no better catalyst for ideas and experimentation, no better prioritization driver than having to face the bugs, annoyances, and limitations of an application first-hand.">
<meta name="twitter:description" content="There’s a kind of zen flow that programmers unblock when they experience their software daily as an end user. There’s no better catalyst for ideas and experimentation, no better prioritization driver than having to face the bugs, annoyances, and limitations of an application first-hand.">
<meta property="og:type" content="article">
<meta property="og:article:published_time" content="2023-12-12T00:00:00-03:00">
<meta property="og:url" content="https://olano.dev/2023-12-12-reclaiming-the-web-with-a-personal-reader/">
<link rel="canonical" href="https://olano.dev/2023-12-12-reclaiming-the-web-with-a-personal-reader/">
<meta property="og:image" content="https://olano.dev/assets/img/feedi1.png">
<meta name="twitter:image" content="https://olano.dev/assets/img/feedi1.png">
<meta name="twitter:card" content="summary_large_image">
</head>
<body>
<nav class="text-center">
<a href="/">olano.dev</a>
<a href="/blog">/blog</a>
</nav>

<div class="content layout-post" lang="en">
<h1>Reclaiming the Web with a Personal Reader</h1>
<br/>
<br/>

<p><strong>As a user</strong>, I had some ideas of what I wanted from this project.</p>

<p><strong>As a developer</strong>, I wanted to test some of the ideas I’d been ruminating on for over a year. Although I hadn’t yet formulated it in those terms, I wanted to apply what I expressed in <a href="../2023-11-30-code-is-run-more-than-read">another post</a> as: <code class="language-plaintext highlighter-rouge">user &gt; ops &gt; dev</code>. This meant that, when prioritizing tasks or making design trade-offs, I would choose ease of operation over development convenience, and I would put user experience above everything else.</p>

<h2 id="design">Design</h2>

<p>Given that mental framework, I needed to make some initial technical decisions.</p>


</div>

<p class="text-center">
<span class="date">12/12/2023</span>
<span class="tags"><a href="/blog/tags#software">#software</a></span>
</p>

</body>
</html>
25 changes: 25 additions & 0 deletions tests/test_routes.py
@@ -1,3 +1,5 @@
+# coding: utf-8
+
 import datetime as dt
 import re
 
@@ -242,6 +244,29 @@ def test_entries_not_mixed_between_users(client):


def test_view_entry_content(client):
    # create feed with a sample entry
    with open('tests/sample.html') as sample:
        body = sample.read()
    response = create_feed(client, 'olano.dev', [{'title': 'reclaiming-the-web',
                                                  'date': '2023-12-12T00:00:00-03:00',
                                                  'description': 'short content',
                                                  'body': body}])
    assert 'reclaiming-the-web' in response.text
    assert 'short content' in response.text
    entry_url = re.search(r'/entries/(\d+)', response.text).group(0)
    response = client.get(entry_url)

    assert 'reclaiming-the-web' in response.text
    assert 'I had some ideas of what I wanted' in response.text


def test_add_external_entry(client):
    # mock response to an arbitrary url
    # add a standalone entry for that url
    # extract redirected entry url
    # verify content parsed
    # add same url again
    # verify that redirected entry url is the same as before
    # TODO
    pass

