Skip to content

Commit

Permalink
Make sure to also add the last threads in the import scripts for 4chan CSVs
Browse files (browse the repository at this point in the history)
  • Loading branch information
sal-uva committed Sep 28, 2021
1 parent 0929fa9 commit c0543e4
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 40 deletions.
5 changes: 5 additions & 0 deletions helper-scripts/import_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,11 @@
print(" (%i threads waiting to commit)" % len(threads))
db.commit()

# Add the last threads as well
print("Adding leftover threads")
for thread in threads.values():
db.insert("threads_4chan", data=thread, commit=False, safe=safe)

db.commit()

print("Done")
86 changes: 46 additions & 40 deletions helper-scripts/import_v_dump.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@


import argparse
import json
import time
Expand Down Expand Up @@ -30,45 +32,75 @@

csvnone = re.compile(r"^N$")

if args.skip_duplicates:
db_ids = db.fetchall("SELECT id FROM posts_4chan WHERE board ='%s';" % args.board)
added_ids = [[v for k, v in added_id.items()][0] for added_id in db_ids]
print(len(added_ids), "posts for %s already added" % args.board)

seen_post_ids = set()
with open(args.input, encoding="utf-8") as inputfile:

fieldnames = ("doc_id", "media_id", "poster_ip", "num", "subnum", "thread_num", "op", "timestamp", "timestamp_expired", "preview_orig", "preview_w", "preview_h", "media_filename", "media_w", "media_h", "media_size", "media_hash", "media_orig", "spoiler", "deleted", "capcode", "email", "name", "trip", "title", "comment", "delpass", "sticky", "locked", "poster_hash", "poster_country", "exif")

reader = csv.DictReader(inputfile, fieldnames=fieldnames, doublequote=False, escapechar="\\", strict=True)


posts = 0
skipped = 0
threads = {}
duplicates = 0

# Skip rows if needed. Can be useful when importing didn't go correctly.
if args.offset:
print("Skipping")
[next(reader, None) for item in range(args.offset)]

for post in reader:

post = {k: csvnone.sub("", post[k]) if post[k] else None for k in post}

posts += 1
if post["media_filename"] and len({"media_w", "media_h", "preview_h", "preview_w"} - set(post.keys())) == 0:
dimensions = {"w": post["media_w"], "h": post["media_h"], "tw": post["preview_w"], "th": post["preview_h"]}
else:
dimensions = {}

if post["subnum"] != "0":
# ghost post
continue

# Even when we're skipping posts, first store thread data.
# These might not have been updated yet.
if post["thread_num"] not in threads:
threads[post["thread_num"]] = {
"id": post["thread_num"],
"board": args.board,
"timestamp": 0,
"timestamp_scraped": int(time.time()),
"timestamp_modified": 0,
"num_unique_ips": -1,
"num_images": 0,
"num_replies": 0,
"limit_bump": False,
"limit_image": False,
"is_sticky": False,
"is_closed": False,
"post_last": 0
}

if post["op"] == "1":
threads[post["thread_num"]]["timestamp"] = post["timestamp"]
threads[post["thread_num"]]["is_sticky"] = post["sticky"] == "1"
threads[post["thread_num"]]["is_closed"] = post["locked"] == "1"

threads[post["thread_num"]]["num_replies"] += 1
threads[post["thread_num"]]["post_last"] = post["num"]
threads[post["thread_num"]]["timestamp_modified"] = post["timestamp"]

# Duplicate posts
if args.skip_duplicates:
if int(post["num"]) in added_ids:

check_id = db.fetchone("SELECT id FROM posts_4chan WHERE id = %s;" % int(post["num"]))
if check_id:
skipped += 1
if skipped % 10000 == 0:
print("Skippped " + str(skipped) + " posts.")
continue

if post["media_filename"] and len({"media_w", "media_h", "preview_h", "preview_w"} - set(post.keys())) == 0:
dimensions = {"w": post["media_w"], "h": post["media_h"], "tw": post["preview_w"], "th": post["preview_h"]}
else:
dimensions = {}

seen_post_ids.add(post["num"])
post_data = {
Expand All @@ -90,37 +122,11 @@
"image_dimensions": json.dumps(dimensions)
}

if post["thread_num"] not in threads:
threads[post["thread_num"]] = {
"id": post["thread_num"],
"board": args.board,
"timestamp": 0,
"timestamp_scraped": int(time.time()),
"timestamp_modified": 0,
"num_unique_ips": -1,
"num_images": 0,
"num_replies": 0,
"limit_bump": False,
"limit_image": False,
"is_sticky": False,
"is_closed": False,
"post_last": 0
}

if post["op"] == "1":
threads[post["thread_num"]]["timestamp"] = post["timestamp"]
threads[post["thread_num"]]["is_sticky"] = post["sticky"] == "1"
threads[post["thread_num"]]["is_closed"] = post["locked"] == "1"

if post["media_filename"]:
threads[post["thread_num"]]["num_images"] += 1

threads[post["thread_num"]]["num_replies"] += 1
threads[post["thread_num"]]["post_last"] = post["num"]
threads[post["thread_num"]]["timestamp_modified"] = post["timestamp"]

post_data = {k: str(v).replace("\x00", "") for k, v in post_data.items()}
new_id = db.insert("posts_4chan", post_data, commit=False, safe=False, return_field="id_seq")
new_id = db.insert("posts_4chan", post_data, commit=False, safe=True, return_field="id_seq")

if post["deleted"] != "0":
db.insert("posts_4chan_deleted", {"id_seq": new_id, "timestamp_deleted": post["deleted"]})
Expand All @@ -133,9 +139,9 @@

nthreads = 0
for thread in threads.values():
db.insert("threads_4chan", data=thread, commit=False, safe=False)
db.insert("threads_4chan", data=thread, commit=False, safe=True)
if nthreads > 0 and nthreads % 10000 == 0:
print("Committing threads %i - %i" % (posts - 10000, posts))
print("Committing threads %i - %i" % (nthreads - 10000, nthreads))
db.commit()
nthreads += 1

Expand Down

0 comments on commit c0543e4

Please sign in to comment.