From a68f5d64de14c02368ba7b9b7dd0a1118cd15417 Mon Sep 17 00:00:00 2001
From: Stijn Peeters <stijn.peeters@uva.nl>
Date: Mon, 14 Oct 2024 12:21:17 +0200
Subject: [PATCH] Threads data source

---
 datasources/threads/DESCRIPTION.md    |  9 ++++
 datasources/threads/__init__.py       | 12 +++++
 datasources/threads/search_threads.py | 78 +++++++++++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 datasources/threads/DESCRIPTION.md
 create mode 100644 datasources/threads/__init__.py
 create mode 100644 datasources/threads/search_threads.py

diff --git a/datasources/threads/DESCRIPTION.md b/datasources/threads/DESCRIPTION.md
new file mode 100644
index 00000000..22f95bba
--- /dev/null
+++ b/datasources/threads/DESCRIPTION.md
@@ -0,0 +1,9 @@
+The Threads data source can be used to manipulate data collected from [Threads](https://threads.net) - Meta's
+microblogging platform - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected
+with the browser extension; 4CAT cannot collect data on its own. After collecting data with Zeeschuimer it can be 
+uploaded to 4CAT for further processing and analysis. See the Zeeschuimer documentation for more information on how to 
+collect data with it.
+
+Data is collected as it is formatted internally by Threads' website. Posts are stored as (large) JSON objects; it 
+will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure
+is relatively straightforward and contains some data not included in the CSV exports.
\ No newline at end of file
diff --git a/datasources/threads/__init__.py b/datasources/threads/__init__.py
new file mode 100644
index 00000000..a4f01942
--- /dev/null
+++ b/datasources/threads/__init__.py
@@ -0,0 +1,12 @@
+"""
+Initialize Threads data source
+"""
+
+# An init_datasource function is expected to be available to initialize this
+# data source. A default function that does this is available from the
+# backend helpers library.
+from common.lib.helpers import init_datasource
+
+# Internal identifier for this data source, and its name (presumably the user-facing label - confirm)
+DATASOURCE = "threads"
+NAME = "Threads"
\ No newline at end of file
diff --git a/datasources/threads/search_threads.py b/datasources/threads/search_threads.py
new file mode 100644
index 00000000..02c8c2de
--- /dev/null
+++ b/datasources/threads/search_threads.py
@@ -0,0 +1,78 @@
+"""
+Import scraped Threads data
+
+It's prohibitively difficult to scrape data from Threads within 4CAT itself due
+to its aggressive rate limiting. Instead, import data collected elsewhere.
+"""
+from datetime import datetime
+from urllib.parse import urlparse, parse_qs, unquote
+import re
+
+from backend.lib.search import Search
+from common.lib.item_mapping import MappedItem
+
+
+class SearchThreads(Search):
+    """
+    Import scraped Threads data
+    """
+    type = "threads-search"  # job ID
+    category = "Search"  # category
+    title = "Import scraped Threads data"  # title displayed in UI
+    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
+    extension = "ndjson"  # extension of result file, used internally and in UI
+    is_from_zeeschuimer = True
+
+    # not available as a processor for existing datasets
+    accepts = [None]
+    references = [
+        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
+        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
+    ]
+
+    def get_items(self, query):
+        """
+        Run custom search - not available for Threads, always raises NotImplementedError
+        """
+        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")
+
+    @staticmethod
+    def map_item(post):
+        """
+        Map a Threads post to a 4CAT item, leaving the post object itself unchanged
+
+        :param dict post:  Post object, as formatted internally by Threads
+        :return MappedItem:  Flat mapping of the post's data
+        """
+        post_timestamp = datetime.fromtimestamp(post["taken_at"])
+
+        # carousel posts hold several media objects; any other post is treated as its own single media object
+        media = post.get("carousel_media") or [post]
+        image_urls = [m["image_versions2"]["candidates"][0]["url"] for m in media if (m.get("image_versions2") or {}).get("candidates")]
+        video_urls = [m["video_versions"][0]["url"] for m in media if m.get("video_versions")]
+
+        # the link target is read from the `u` query parameter; without one the URL is used as stored
+        link_preview = post["text_post_app_info"].get("link_preview_attachment") or {}
+        linked_url = link_preview.get("url") or ""
+        linked_url = parse_qs(urlparse(linked_url).query).get("u", [linked_url])[-1]
+
+        return MappedItem({
+            "id": post["code"],
+            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
+            "body": post["caption"]["text"] if post["caption"] else "",
+            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
+            "author": post["user"]["username"],
+            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
+            "author_avatar": post["user"].get("profile_pic_url"),
+            "image_url": ",".join(image_urls),
+            "video_url": ",".join(video_urls),
+            "link_url": linked_url,
+            "link_thumbnail_url": link_preview.get("image_url") or "",
+            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
+            "likes": post["like_count"],
+            "reposts": post["text_post_app_info"]["repost_count"],
+            "replies": post["text_post_app_info"]["direct_reply_count"],
+            "quotes": post["text_post_app_info"]["quote_count"],
+            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "",
+            "unix_timestamp": int(post_timestamp.timestamp()),
+        })