-
Notifications
You must be signed in to change notification settings - Fork 64
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
99 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
The Threads data source can be used to manipulate data collected from [Threads](https://threads.net) - Meta's | ||
microblogging platform - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected | ||
with the browser extension; 4CAT cannot collect data on its own. After collecting data with Zeeschuimer it can be | ||
uploaded to 4CAT for further processing and analysis. See the Zeeschuimer documentation for more information on how to | ||
collect data with it. | ||
|
||
Data is collected as it is formatted internally by Threads' website. Posts are stored as (large) JSON objects; it | ||
will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure | ||
is relatively straightforward and contains some data not included in the CSV exports. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
""" | ||
Initialize Threads data source | ||
""" | ||
|
||
# An init_datasource function is expected to be available to initialize this | ||
# data source. A default function that does this is available from the | ||
# backend helpers library. | ||
from common.lib.helpers import init_datasource | ||
|
||
# Internal identifier for this data source | ||
DATASOURCE = "threads" | ||
# Name of this data source; presumably the label shown to users - TODO confirm | ||
NAME = "Threads" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
""" | ||
Import scraped Threads data | ||
|
||
It's prohibitively difficult to scrape data from Threads within 4CAT itself due | ||
to its aggressive rate limiting. Instead, import data collected elsewhere. | ||
""" | ||
from datetime import datetime | ||
from urllib.parse import urlparse, parse_qs, unquote | ||
import re | ||
|
||
from backend.lib.search import Search | ||
from common.lib.item_mapping import MappedItem | ||
|
||
|
||
class SearchThreads(Search):
    """
    Import scraped Threads data

    This class does not collect anything itself: `get_items()` always raises.
    It only describes the data source and maps imported Threads post objects
    to flat items via `map_item()`.
    """
    type = "threads-search"  # job ID
    category = "Search"  # category
    title = "Import scraped Threads data"  # title displayed in UI
    description = "Import Threads data collected with an external tool such as Zeeschuimer."  # description displayed in UI
    extension = "ndjson"  # extension of result file, used internally and in UI
    is_from_zeeschuimer = True

    # not available as a processor for existing datasets
    accepts = [None]
    references = [
        "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)",
        "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)"
    ]

    def get_items(self, query):
        """
        Run custom search

        Not available for Threads

        :param query:  Query parameters (unused)
        :raises NotImplementedError:  Always; data can only be imported
        """
        raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere")

    @staticmethod
    def _first_media_url(versions):
        """
        Get the URL of the first entry of a list of media versions

        The list is only read, never modified, so the post object passed to
        `map_item()` stays intact and can be mapped again.

        :param versions:  List of dictionaries with a `url` key; may be empty
        or `None`
        :return:  URL of the first entry, or `None` if there is no entry
        """
        if not versions:
            return None
        return versions[0]["url"]

    @staticmethod
    def map_item(post):
        """
        Map a Threads post object to a flat item

        :param dict post:  Post object, as captured from the Threads website
        :return MappedItem:  Flattened item with one value per output column
        """
        first_url = SearchThreads._first_media_url
        post_timestamp = datetime.fromtimestamp(post["taken_at"])

        if post.get("carousel_media"):
            # multiple media items: take the first version of each item
            image_urls = []
            video_urls = []
            for media in post["carousel_media"]:
                image_url = first_url((media.get("image_versions2") or {}).get("candidates"))
                video_url = first_url(media.get("video_versions"))
                if image_url:
                    image_urls.append(image_url)
                if video_url:
                    video_urls.append(video_url)
        else:
            image_url = first_url((post.get("image_versions2") or {}).get("candidates"))
            video_url = first_url(post.get("video_versions"))
            image_urls = [image_url] if image_url else []
            video_urls = [video_url] if video_url else []

        linked_url = ""
        link_thumbnail = ""
        attachment = post["text_post_app_info"].get("link_preview_attachment")
        if attachment:
            # the attachment URL is a redirect link carrying the actual
            # target in its `u` query parameter; parse_qs already decodes the
            # percent-encoding. If there is no such parameter, keep the URL
            # as it is instead of failing.
            linked_url = attachment["url"]
            targets = parse_qs(urlparse(linked_url).query).get("u")
            if targets:
                linked_url = targets[-1]
            link_thumbnail = attachment.get("image_url")

        caption = post["caption"]["text"] if post["caption"] else ""

        return MappedItem({
            "id": post["code"],
            "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}",
            "body": caption,
            "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "author": post["user"]["username"],
            "author_is_verified": "yes" if post["user"].get("is_verified") else "no",
            "author_avatar": post["user"].get("profile_pic_url"),
            "image_url": ",".join(image_urls),
            "video_url": ",".join(video_urls),
            "link_url": linked_url,
            "link_thumbnail_url": link_thumbnail if link_thumbnail else "",
            "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no",
            "likes": post["like_count"],
            "reposts": post["text_post_app_info"]["repost_count"],
            "replies": post["text_post_app_info"]["direct_reply_count"],
            "quotes": post["text_post_app_info"]["quote_count"],
            "hashtags": ",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", caption)) if caption else "",
            "unix_timestamp": int(post_timestamp.timestamp()),
        })