Skip to content

Commit

Permalink
Merge pull request #14 from Zipstack/feat/recursive-processing-of-inp…
Browse files Browse the repository at this point in the history
…ut-files

feat: Added option to recurse sub-directories
  • Loading branch information
ritwik-g authored Dec 17, 2024
2 parents adc6aeb + 90d783c commit 502b1f4
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 22 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# File Processing Script

This script processes files in a specified directory using an API, logs results in a local SQLite database, and provides options for retrying failed or pending files. It includes features for skipping specific files, generating reports, and running multiple API calls in parallel.
This script processes files recursively from a specified directory using an API, logs results in a local SQLite database, and provides options for retrying failed or pending files. It includes features for skipping specific files, generating reports, and running multiple API calls in parallel.

## Features

Expand Down Expand Up @@ -61,6 +61,7 @@ This will display detailed usage information.
- `-p`, `--parallel_call_count`: Number of parallel API calls (default: 10).
- `--csv_report`: Path to export the detailed report as a CSV file.
- `--db_path`: Path where the SQLite DB file is stored (default: './file_processing.db')
- `--recursive`: Recursively identify and process files from the input folder path (default: False)
- `--retry_failed`: Retry processing of failed files.
- `--retry_pending`: Retry processing of pending files by making new requests.
- `--skip_pending`: Skip processing of pending files.
Expand Down
56 changes: 35 additions & 21 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class Arguments:
input_folder_path: str = ""
db_path: str = ""
parallel_call_count: int = 5
recurse_input_folder: bool = False
retry_failed: bool = False
retry_pending: bool = False
skip_pending: bool = False
Expand Down Expand Up @@ -463,11 +464,15 @@ def process_file(


def load_folder(args: Arguments):
files = [
os.path.join(args.input_folder_path, f)
for f in os.listdir(args.input_folder_path)
if os.path.isfile(os.path.join(args.input_folder_path, f))
]
files = []
for root, _, filenames in os.walk(args.input_folder_path):
for f in filenames:
file_path = os.path.join(root, f)
if os.path.isfile(file_path):
files.append(file_path)
if not args.recurse_input_folder:
break
logger.debug(f"Loaded '{len(files)}' files from '{args.input_folder_path}': {files}")

with Manager() as manager, Pool(args.parallel_call_count) as executor:
success_count = manager.Value("i", 0) # Shared integer for success count
Expand Down Expand Up @@ -501,6 +506,24 @@ def load_folder(args: Arguments):
pbar.close()


def api_deployment_batch_run(args: Arguments):
    """Orchestrate one batch run against the API deployment.

    Sets up the tracking database, processes every file discovered under the
    input folder, then prints the run summary and any reports the caller
    asked for via ``args``.
    """
    logger.warning(f"Running with params: {args}")

    # Make sure the SQLite tracking DB exists before any file is processed.
    init_db(args=args)

    # Discover and process the files (parallelism is handled inside).
    load_folder(args=args)

    # The summary is unconditional; the detailed report and CSV export are
    # opt-in via CLI flags.
    print_summary(args=args)

    if args.print_report:
        print_report(args=args)
        # Files resumed from a pending state re-enter mid-flight, so their
        # recorded elapsed time is unreliable — warn the operator.
        logger.warning(
            "Elapsed time calculation of a file which was resumed"
            " from pending state will not be correct"
        )

    if args.csv_report:
        export_report_to_csv(args=args)


def main():
parser = argparse.ArgumentParser(description="Process files using Unstract's API deployment")
parser.add_argument(
Expand Down Expand Up @@ -564,6 +587,12 @@ def main():
type=str,
help='Path to export the detailed report as a CSV file',
)
parser.add_argument(
"--recursive",
dest="recurse_input_folder",
action="store_true",
help="Recursively identify and process files from the input folder path (default: False)",
)
parser.add_argument(
"--retry_failed",
dest="retry_failed",
Expand Down Expand Up @@ -625,22 +654,7 @@ def main():
ch.setFormatter(formatter)
logging.basicConfig(level=args.log_level, handlers=[ch])

logger.warning(f"Running with params: {args}")

init_db(args=args) # Initialize DB

load_folder(args=args)

print_summary(args=args) # Print summary at the end
if args.print_report:
print_report(args=args)
logger.warning(
"Elapsed time calculation of a file which was resumed"
" from pending state will not be correct"
)

if args.csv_report:
export_report_to_csv(args=args)
api_deployment_batch_run(args=args)


if __name__ == "__main__":
Expand Down

0 comments on commit 502b1f4

Please sign in to comment.