Skip to content

Commit

Permalink
Threads data source
Browse files Browse the repository at this point in the history
  • Loading branch information
stijn-uva committed Oct 14, 2024
1 parent c27fbbe commit a68f5d6
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 0 deletions.
9 changes: 9 additions & 0 deletions datasources/threads/DESCRIPTION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
The Threads data source can be used to manipulate data collected from [Threads](https://threads.net) - Meta's
microblogging platform - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected
with the browser extension; 4CAT cannot collect data on its own. After collecting data with Zeeschuimer it can be
uploaded to 4CAT for further processing and analysis. See the Zeeschuimer documentation for more information on how to
collect data with it.

Data is collected as it is formatted internally by Threads' website. Posts are stored as (large) JSON objects; it
will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure
is relatively straightforward and contains some data not included in the CSV exports.
12 changes: 12 additions & 0 deletions datasources/threads/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""
Initialize Threads data source
"""

# An init_datasource function is expected to be available to initialize this
# data source. A default function that does this is available from the
# backend helpers library.
from common.lib.helpers import init_datasource

# Internal identifier for this data source
DATASOURCE = "threads"
# Name of this data source - presumably the human-readable label shown in the
# interface; TODO confirm against how init_datasource uses it
NAME = "Threads"
78 changes: 78 additions & 0 deletions datasources/threads/search_threads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Import scraped Threads data

It's prohibitively difficult to scrape data from Threads within 4CAT itself due
to its aggressive rate limiting. Instead, import data collected elsewhere.
"""
from datetime import datetime
from urllib.parse import urlparse, parse_qs, unquote
import re

from backend.lib.search import Search
from common.lib.item_mapping import MappedItem


class SearchThreads(Search):
    """
    Import scraped Threads data

    This data source cannot collect data by itself (see `get_items`); it
    exists so that Threads posts captured with an external tool can be
    imported and mapped to a flat item format via `map_item`.
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads

        :param query:  Search query parameters (unused)
        :raises NotImplementedError:  Always; data can only be imported
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def map_item(post):
        """
        Map a raw Threads post object to a flat item

        The input object is not modified, so the same post can safely be
        mapped more than once.

        :param dict post:  Post object as captured from the Threads website
        :return MappedItem:  Flattened post with string/number values only
        """
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        # a carousel post holds one media object per slide; any other post
        # carries its media on the post object itself, so treat it as a
        # single-slide carousel to share the extraction logic
        media_items = post.get("carousel_media") or [post]
        image_urls = []
        video_urls = []
        for media in media_items:
            # only the first listed version of each image/video is used; read
            # it by index instead of pop() so the source data stays intact
            candidates = (media.get("image_versions2") or {}).get("candidates")
            if candidates:
                image_urls.append(candidates[0]["url"])

            video_versions = media.get("video_versions")
            if video_versions:
                video_urls.append(video_versions[0]["url"])

        linked_url = ""
        link_thumbnail = ""
        link_preview = post["text_post_app_info"].get("link_preview_attachment")
        if link_preview:
            linked_url = link_preview["url"]
            # the attachment URL appears to be a redirect carrying the actual
            # target in its `u` query parameter; if that parameter is absent,
            # keep the URL as it was captured rather than failing
            redirect_targets = parse_qs(urlparse(linked_url).query).get("u")
            if redirect_targets:
                linked_url = redirect_targets[-1]
            link_thumbnail = link_preview.get("image_url")

        # caption may be missing or null for posts without text
        caption = post.get("caption")
        body = caption["text"] if caption else ""

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": body,
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post.get("is_paid_partnership") else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", body)),
            "unix_timestamp": int(post_timestamp.timestamp()),
        })

0 comments on commit a68f5d6

Please sign in to comment.