Skip to content

Commit

Permalink
Further iterations
Browse files Browse the repository at this point in the history
  • Loading branch information
dormant-user committed Feb 28, 2024
1 parent f9749be commit 43f9177
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 4 deletions.
34 changes: 30 additions & 4 deletions s3/dumper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from tqdm import tqdm

from s3.logger import LOGGER
from s3.exceptions import InvalidDelimiter, convert_to_folder_structure


class Downloader:
Expand All @@ -25,13 +26,14 @@ class Downloader:
}
)

def __init__(self, bucket_name: str = None,
def __init__(self, bucket_name: str,
download_dir: str = None,
region_name: str = os.environ.get("AWS_DEFAULT_REGION"),
profile_name: str = os.environ.get("PROFILE_NAME"),
aws_access_key_id: str = os.environ.get("AWS_ACCESS_KEY_ID"),
aws_secret_access_key: str = os.environ.get("AWS_SECRET_ACCESS_KEY"),
logger: logging.Logger = LOGGER):
logger: logging.Logger = LOGGER,
delimiter: str = None):
"""Initiates all the necessary args.
Args:
Expand All @@ -54,6 +56,7 @@ def __init__(self, bucket_name: str = None,
self.bucket_name = bucket_name
self.buckets = [bucket_.name for bucket_ in self.s3.buckets.all()]
self.bucket = None
self.delimiter = delimiter

def init(self) -> None:
"""Instantiates the bucket instance.
Expand Down Expand Up @@ -92,8 +95,23 @@ def _get_objects(self) -> List[str]:
if not os.path.isdir(self.download_dir):
os.makedirs(name=self.download_dir)
self.logger.info(f"Created {os.path.abspath(path=self.download_dir)}")
objects = [obj.key for obj in self.bucket.objects.all()]
self.logger.info(f"Nuber of objects found in {self.bucket_name}: {len(objects)}")
all_s3 = self.bucket.objects.all()
if self.delimiter:
objects = [obj.key for obj in all_s3 if obj.key.startswith(self.delimiter)]
if not objects:
available = set()
for obj in all_s3:
paths = obj.key.split('/')
if len(paths) > 1: # folder like hierarchy
available.add('/'.join(paths[0:-1]))
if available: # this means hierarchical structure is present but just not with the same condition
raise InvalidDelimiter(self.delimiter, self.bucket_name, available)
self.logger.info(
f"Nuber of objects found in {self.bucket_name} limited to {self.delimiter!r}: {len(objects)}"
)
else:
objects = [obj.key for obj in all_s3]
self.logger.info(f"Nuber of objects found in {self.bucket_name}: {len(objects)}")
return objects

def _downloader(self, file: str) -> NoReturn:
Expand Down Expand Up @@ -140,3 +158,11 @@ def run_in_parallel(self, threads: int = 5) -> NoReturn:
unit="files", leave=True))
self.logger.info(f"Run Time: {round(float(time.perf_counter()), 2)}s")
self.exit()

def get_bucket_structure(self):
    """Build a tree-style text rendering of every object key in the bucket.

    Calls ``self.init()`` first, then reads the key of each object in
    ``self.bucket`` and passes the collected keys to
    ``convert_to_folder_structure``.

    Returns:
        Whatever ``convert_to_folder_structure`` returns for the collected keys.
    """
    self.init()
    # The keys are collected into a set rather than a list; per the original
    # author's note a list would yield the same result.
    object_keys = {s3_object.key for s3_object in self.bucket.objects.all()}
    return convert_to_folder_structure(object_keys)

def print_bucket_structure(self):
    """Print the result of ``get_bucket_structure`` to standard output."""
    structure = self.get_bucket_structure()
    print(structure)
41 changes: 41 additions & 0 deletions s3/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
class S3Error(Exception):
    """Base exception from which the other errors in this module derive."""


class BucketNotFound(S3Error):
    """S3Error subclass that adds no behaviour of its own.

    Presumably raised when a requested bucket does not exist; the raise site
    is not part of this module, so confirm against the caller.
    """


def convert_to_folder_structure(input_set):
    """Render '/'-separated keys as a tree drawing.

    Each key is split on ``/`` and merged into a nested dictionary, which is
    then drawn with box-drawing connectors (``├──``, ``└──``, ``│``).

    Args:
        input_set: Iterable of key strings, e.g. ``{"dir/sub/file.txt"}``.

    Returns:
        str:
        One newline-terminated line per distinct path segment, or an empty
        string when ``input_set`` is empty.
    """
    tree = {}
    for key in input_set:
        node = tree
        for segment in key.split('/'):
            # Keys with a trailing or doubled '/' yield empty segments;
            # skipping them avoids blank-named entries in the drawing.
            if segment:
                node = node.setdefault(segment, {})

    def render(level, indent=''):
        # Sorting makes the output independent of set iteration order.
        names = sorted(level)
        final = len(names) - 1
        pieces = []
        for position, name in enumerate(names):
            # Both continuation indents are as wide as the 4-column connector
            # so that nested entries line up under their parent's name.
            if position == final:
                branch, child_indent = '└── ', indent + '    '
            else:
                branch, child_indent = '├── ', indent + '│   '
            pieces.append(indent + branch + name + '\n')
            children = level[name]
            if children:
                pieces.append(render(children, child_indent))
        return ''.join(pieces)

    return render(tree)


class InvalidDelimiter(S3Error):
    """Error reporting that a delimiter was not found in a bucket.

    The exception message lists the available values and a tree rendering of
    them produced by ``convert_to_folder_structure``.
    """

    def __init__(self, delimiter, bucket_name, available):
        """Store the error context and build the exception message.

        Args:
            delimiter: The delimiter that was not found.
            bucket_name: Name of the bucket reported in the message.
            available: Collection of available values shown in the message.
        """
        self.delimiter = delimiter
        self.bucket_name = bucket_name
        self.available = available
        message = self.format_error_message()
        super().__init__(message)

    def format_error_message(self):
        """Return the message text built from the stored attributes."""
        tree = convert_to_folder_structure(self.available)
        return (
            f"\n\n\t{self.delimiter!r} was not found in {self.bucket_name}.\n\t"
            f"Available: {self.available}\n\n{tree}"
        )

0 comments on commit 43f9177

Please sign in to comment.