Skip to content

Commit

Permalink
Make sure to also add the last threads in the import scripts for 4chan CSVs
Browse files (browse the repository at this point in the history)
  • Loading branch information
sal-uva committed Sep 28, 2021
1 parent 0929fa9 commit c0543e4
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 40 deletions.
5 changes: 5 additions & 0 deletions helper-scripts/import_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,11 @@
print(" (%i threads waiting to commit)" % len(threads))
db.commit()

# Add the last threads as well
print("Adding leftover threads")
for thread in threads.values():
db.insert("threads_4chan", data=thread, commit=False, safe=safe)

db.commit()

print("Done")
86 changes: 46 additions & 40 deletions helper-scripts/import_v_dump.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@


import argparse
import json
import time
Expand Down Expand Up @@ -30,45 +32,75 @@

csvnone = re.compile(r"^N$")

if args.skip_duplicates:
db_ids = db.fetchall("SELECT id FROM posts_4chan WHERE board ='%s';" % args.board)
added_ids = [[v for k, v in added_id.items()][0] for added_id in db_ids]
print(len(added_ids), "posts for %s already added" % args.board)

seen_post_ids = set()
with open(args.input, encoding="utf-8") as inputfile:

fieldnames = ("doc_id", "media_id", "poster_ip", "num", "subnum", "thread_num", "op", "timestamp", "timestamp_expired", "preview_orig", "preview_w", "preview_h", "media_filename", "media_w", "media_h", "media_size", "media_hash", "media_orig", "spoiler", "deleted", "capcode", "email", "name", "trip", "title", "comment", "delpass", "sticky", "locked", "poster_hash", "poster_country", "exif")

reader = csv.DictReader(inputfile, fieldnames=fieldnames, doublequote=False, escapechar="\\", strict=True)


posts = 0
skipped = 0
threads = {}
duplicates = 0

# Skip rows if needed. Can be useful when importing didn't go correctly.
if args.offset:
print("Skipping")
[next(reader, None) for item in range(args.offset)]

for post in reader:

post = {k: csvnone.sub("", post[k]) if post[k] else None for k in post}

posts += 1
if post["media_filename"] and len({"media_w", "media_h", "preview_h", "preview_w"} - set(post.keys())) == 0:
dimensions = {"w": post["media_w"], "h": post["media_h"], "tw": post["preview_w"], "th": post["preview_h"]}
else:
dimensions = {}

if post["subnum"] != "0":
# ghost post
continue

# Even when we're skipping posts, first store thread data.
# These might not have been updated yet.
if post["thread_num"] not in threads:
threads[post["thread_num"]] = {
"id": post["thread_num"],
"board": args.board,
"timestamp": 0,
"timestamp_scraped": int(time.time()),
"timestamp_modified": 0,
"num_unique_ips": -1,
"num_images": 0,
"num_replies": 0,
"limit_bump": False,
"limit_image": False,
"is_sticky": False,
"is_closed": False,
"post_last": 0
}

if post["op"] == "1":
threads[post["thread_num"]]["timestamp"] = post["timestamp"]
threads[post["thread_num"]]["is_sticky"] = post["sticky"] == "1"
threads[post["thread_num"]]["is_closed"] = post["locked"] == "1"

threads[post["thread_num"]]["num_replies"] += 1
threads[post["thread_num"]]["post_last"] = post["num"]
threads[post["thread_num"]]["timestamp_modified"] = post["timestamp"]

# Duplicate posts
if args.skip_duplicates:
if int(post["num"]) in added_ids:

check_id = db.fetchone("SELECT id FROM posts_4chan WHERE id = %s;" % int(post["num"]))
if check_id:
skipped += 1
if skipped % 10000 == 0:
print("Skippped " + str(skipped) + " posts.")
continue

if post["media_filename"] and len({"media_w", "media_h", "preview_h", "preview_w"} - set(post.keys())) == 0:
dimensions = {"w": post["media_w"], "h": post["media_h"], "tw": post["preview_w"], "th": post["preview_h"]}
else:
dimensions = {}

seen_post_ids.add(post["num"])
post_data = {
Expand All @@ -90,37 +122,11 @@
"image_dimensions": json.dumps(dimensions)
}

if post["thread_num"] not in threads:
threads[post["thread_num"]] = {
"id": post["thread_num"],
"board": args.board,
"timestamp": 0,
"timestamp_scraped": int(time.time()),
"timestamp_modified": 0,
"num_unique_ips": -1,
"num_images": 0,
"num_replies": 0,
"limit_bump": False,
"limit_image": False,
"is_sticky": False,
"is_closed": False,
"post_last": 0
}

if post["op"] == "1":
threads[post["thread_num"]]["timestamp"] = post["timestamp"]
threads[post["thread_num"]]["is_sticky"] = post["sticky"] == "1"
threads[post["thread_num"]]["is_closed"] = post["locked"] == "1"

if post["media_filename"]:
threads[post["thread_num"]]["num_images"] += 1

threads[post["thread_num"]]["num_replies"] += 1
threads[post["thread_num"]]["post_last"] = post["num"]
threads[post["thread_num"]]["timestamp_modified"] = post["timestamp"]

post_data = {k: str(v).replace("\x00", "") for k, v in post_data.items()}
new_id = db.insert("posts_4chan", post_data, commit=False, safe=False, return_field="id_seq")
new_id = db.insert("posts_4chan", post_data, commit=False, safe=True, return_field="id_seq")

if post["deleted"] != "0":
db.insert("posts_4chan_deleted", {"id_seq": new_id, "timestamp_deleted": post["deleted"]})
Expand All @@ -133,9 +139,9 @@

nthreads = 0
for thread in threads.values():
db.insert("threads_4chan", data=thread, commit=False, safe=False)
db.insert("threads_4chan", data=thread, commit=False, safe=True)
if nthreads > 0 and nthreads % 10000 == 0:
print("Committing threads %i - %i" % (posts - 10000, posts))
print("Committing threads %i - %i" % (nthreads - 10000, nthreads))
db.commit()
nthreads += 1

Expand Down

0 comments on commit c0543e4

Please sign in to comment.