
Commit

#1056: Write PCM tool to refill all lost DISP-S1 products from ASF back into the S3 bucket. 80% complete
philipjyoon committed Jan 15, 2025
1 parent 3450916 commit 83af64d
Showing 9 changed files with 84 additions and 3 deletions.
File renamed without changes.
File renamed without changes.
Binary file added report/opera_validator/data_wrong.pkl
File renamed without changes.
File renamed without changes.
@@ -97,7 +97,7 @@ def parallel_fetch(url, params, page_num, page_size, downloaded_batches):
    with downloaded_batches.get_lock(): # Safely increment the count
        downloaded_batches.value += 1
    return batch_granules
-def generate_url_params(start, end, endpoint = 'OPS', provider = 'ASF', short_name = 'OPERA_L2_RTC-S1_V1', window_length_days = 30, timestamp_type = 'temporal'):
+def generate_url_params(start, end, endpoint = 'OPS', provider = 'ASF', short_name = 'OPERA_L2_RTC-S1_V1', window_length_days = 30, timestamp_type = 'temporal', extra_params = None):
"""
Generates URL parameters for querying granules from CMR (Common Metadata Repository) based on provided criteria.
@@ -145,9 +145,12 @@ def generate_url_params(start, end, endpoint = 'OPS', provider = 'ASF', short_na
    else: # default time query type if not provided or set to temporal
        params['temporal'] = f"{start},{end}"

+    # Merge any caller-supplied CMR query parameters (e.g. native-id filters)
+    if extra_params:
+        params.update(extra_params)

    return base_url, params

-def retrieve_r3_products(smallest_date, greatest_date, endpoint, shortname):
+def retrieve_r3_products(smallest_date, greatest_date, endpoint, shortname, extra_params = None):

    # Convert timestamps to strings in ISO 8601 format
    smallest_date_iso = smallest_date.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3]
@@ -160,7 +163,8 @@ def retrieve_r3_products(smallest_date, greatest_date, endpoint, shortname):
        endpoint=endpoint,
        provider='', # leave blank
        short_name=shortname, # Use the specific product short name
-        timestamp_type='temporal' # Ensure this matches the query requirements
+        timestamp_type='temporal', # Ensure this matches the query requirements
+        extra_params=extra_params
    )

    # Update the params dictionary directly to include any specific parameters needed
Expand Down
File renamed without changes.
77 changes: 77 additions & 0 deletions tools/download_from_daac.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python3

from collections import defaultdict
import sys
import datetime
import argparse
import boto3

from report.opera_validator.opv_util import retrieve_r3_products

_DISP_S1_PRODUCT_TYPE = "OPERA_L3_DISP-S1_V1"

'''
Queries the DAAC for DISP-S1 products and downloads them into a specified S3 bucket path. Only works for DISP-S1 for now
but can easily be generalized to other products as desired.
'''
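
# Example invocation (bucket path and file name are hypothetical, not from the commit):
#   ./download_from_daac.py --s3-destination my-bucket/disp-s1 --frame-list-file frames.txt --dry-run --verbose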

parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", help="If set, print out verbose information.", required=False, default=False)
parser.add_argument("--dry-run", action="store_true", help="If set, do not actually copy any files.", required=False, default=False)
parser.add_argument("--daac-endpoint", required=False, choices=['UAT', 'OPS'], default='OPS', help='CMR endpoint venue')
parser.add_argument("--s3-destination", dest="s3_destination", help="S3 bucket name and path to write files to", required=True)
parser.add_argument("--frame-list-file", dest="frame_list_file", help="DISP-S1 frames to ", required=True)
args = parser.parse_args()

smallest_date = datetime.datetime.strptime("1999-12-31T23:59:59.999999Z", "%Y-%m-%dT%H:%M:%S.%fZ")
greatest_date = datetime.datetime.strptime("2099-01-01T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%fZ")

# Open the frame_list_file text file and parse out all the frame numbers in there. They can be separated by commas or newlines.
frames_to_download = []
with open(args.frame_list_file, "r") as f:
    for line in f:
        frames_to_download.extend([int(frame) for frame in line.strip().split(",") if frame.strip()])
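# e.g. a file containing "831, 832" on one line and "8882" on the next (hypothetical frames) yields [831, 832, 8882]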

filtered_disp_s1 = {}
frame_to_count = defaultdict(int)

for frame in frames_to_download:

    native_id_pattern = "OPERA_L3_DISP-S1_IW_F%05d*" % frame
    if args.verbose:
        print(f"Searching for DISP-S1 products with native-id pattern: {native_id_pattern}")
    extra_params = {"native-id[]": native_id_pattern, "options[native-id][pattern]": "true"}
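    # For a hypothetical frame 832, this is equivalent to the CMR query string:
    #   native-id[]=OPERA_L3_DISP-S1_IW_F00832*&options[native-id][pattern]=true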

    # Retrieve all DISP-S1 products for this frame from CMR within the acquisition time range
    all_disp_s1 = retrieve_r3_products(smallest_date, greatest_date, args.daac_endpoint, _DISP_S1_PRODUCT_TYPE, extra_params=extra_params)
    for disp_s1 in all_disp_s1:

        # The frame number is already encoded in the native-id pattern, so no AdditionalAttributes lookup is needed.
        # Need to perform a secondary filter on the temporal range. Not sure if we always need to do this or only temporarily so.
        actual_temporal_time = datetime.datetime.strptime(
            disp_s1.get("umm").get("TemporalExtent")['RangeDateTime']['EndingDateTime'], "%Y-%m-%dT%H:%M:%SZ")
        if smallest_date <= actual_temporal_time <= greatest_date:
            # If umm.RelatedUrls contains a "URL" that starts with "s3" and has a "Format" field value of "netCDF-4", store that URL
            for related_url in disp_s1.get("umm").get("RelatedUrls"):
                if related_url.get("URL").startswith("s3") and related_url.get("Format") == "netCDF-4":
                    filtered_disp_s1[disp_s1.get("umm").get("GranuleUR")] = related_url.get("URL")
                    frame_to_count[frame] += 1
                    break

print(f"Found {len(filtered_disp_s1.keys())} DISP-S1 products:")
for frame, count in frame_to_count.items():
print(f"Frame {frame}: {count} products")
if args.verbose:
for granule_id, url in filtered_disp_s1.items():
print(f"{granule_id}: {url}")
print(f"Found {len(filtered_disp_s1.keys())} DISP-S1 products:")

if args.dry_run:
    print("Dry run. Not copying any files.")
    sys.exit(0)

# Download all the S3 files to the local working directory
s3 = boto3.client('s3')
for granule_id, url in filtered_disp_s1.items():
    # An s3://bucket/key URL splits into bucket (index 2), key (index 3 onward), and local filename (last component)
    s3.download_file(url.split("/")[2], "/".join(url.split("/")[3:]), url.split("/")[-1])
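
# NOTE: --s3-destination is not yet consumed above (the commit is "80% complete").
# A minimal sketch of the remaining step, assuming a destination of the form
# "bucket-name/prefix" (an assumed format, not the committed implementation):
dest_bucket, _, dest_prefix = args.s3_destination.partition("/")
for url in filtered_disp_s1.values():
    local_file = url.split("/")[-1]
    dest_key = f"{dest_prefix.rstrip('/')}/{local_file}" if dest_prefix else local_file
    s3.upload_file(local_file, dest_bucket, dest_key)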
