diff --git a/.baseline_version b/.baseline_version
new file mode 100644
index 0000000..a9c729a
--- /dev/null
+++ b/.baseline_version
@@ -0,0 +1 @@
+0.4.2-baseline-version
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..5573fb6
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,15 @@
+# EditorConfig is awesome: http://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[*.{yml,yaml}]
+indent_style = space
+indent_size = 2
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..74d81cc
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+*.smk linguist-language=Python
+Snakefile linguist-language=Python
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..f0c554b
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,18 @@
+name: docs
+on:
+ workflow_dispatch:
+ push:
+ paths:
+ - 'docs/**'
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: actions/setup-python@v2
+ with:
+ python-version: 3.9
+ - run: pip install --upgrade pip
+ - run: pip install -r docs/requirements.txt
+ - run: mkdocs gh-deploy --force
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
new file mode 100644
index 0000000..ce3a292
--- /dev/null
+++ b/.github/workflows/main.yaml
@@ -0,0 +1,34 @@
+name: tests
+
+on:
+ workflow_dispatch:
+ push:
+ branches:
+ - master
+ - main
+ pull_request:
+    branches-ignore: []
+
+jobs:
+ Dry_Run_and_Lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: docker://snakemake/snakemake:v5.24.2
+ - name: Dry Run with test data
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 \
+ /opt2/baseline run --input \
+ /opt2/.tests/WT_S1.R1.fastq.gz /opt2/.tests/WT_S1.R2.fastq.gz \
+ /opt2/.tests/WT_S2_R1.fastq.gz /opt2/.tests/WT_S2_R2.fastq.gz \
+ /opt2/.tests/WT_S3_1.fastq.gz /opt2/.tests/WT_S3_2.fastq.gz \
+ /opt2/.tests/WT_S4_R1.001.fastq.gz /opt2/.tests/WT_S4_R2.001.fastq.gz \
+ --output /opt2/output --mode local --dry-run
+ - name: View the pipeline config file
+ run: |
+ echo "Generated config file for pipeline...." && cat $PWD/output/config.json
+ - name: Lint Workflow
+ continue-on-error: true
+ run: |
+ docker run -v $PWD:/opt2 snakemake/snakemake:v5.24.2 snakemake --lint -s /opt2/output/workflow/Snakefile -d /opt2/output || \
+        echo 'There may have been a few warnings or errors. Please read through the log to determine if they are harmless.'
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a403fea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,171 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+site/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# DS_Store
+.DS_Store
+._*
+**/.DS_Store
+**/._*
+
+.snakemake*
+**/.snakemake*
+.venv
+.venv/*
+
+# Pipeline Results or Output
+results/
+output/
+tmp/
+scratch/
+
+# mkdocs documentation
+site/
+
+# Pipeline generated files or directories
+.tests/*/
+.snakemake/
+
+# Cached Java directories
+.oracle_jre_usage/
+.java/
+
+# GNU Parallel
+.parallel/
+
+# Temp files
+*.tmp
+**/*.tmp
+
+# Test script and test
+# output directories
+test.sh
+test_*/
+tmp_*/
diff --git a/.tests/WT_S1.R1.fastq.gz b/.tests/WT_S1.R1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S1.R2.fastq.gz b/.tests/WT_S1.R2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S2_R1.fastq.gz b/.tests/WT_S2_R1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S2_R2.fastq.gz b/.tests/WT_S2_R2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S3_1.fastq.gz b/.tests/WT_S3_1.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S3_2.fastq.gz b/.tests/WT_S3_2.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S4_R1.001.fastq.gz b/.tests/WT_S4_R1.001.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/.tests/WT_S4_R2.001.fastq.gz b/.tests/WT_S4_R2.001.fastq.gz
new file mode 100644
index 0000000..e69de29
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..0b11cb7
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.1.0] - 2022-08-22
+### Added
+ - Recommended [scaffold](https://github.com/OpenOmics/baseline) for building a snakemake pipeline
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d532cd9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 OpenOmics
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1c9e9da
--- /dev/null
+++ b/README.md
@@ -0,0 +1,108 @@
+# baseline 🔬
+
+**_long pipeline name_**
+
+[![tests](https://github.com/OpenOmics/baseline/workflows/tests/badge.svg)](https://github.com/OpenOmics/baseline/actions/workflows/main.yaml) [![docs](https://github.com/OpenOmics/baseline/workflows/docs/badge.svg)](https://github.com/OpenOmics/baseline/actions/workflows/docs.yml) [![GitHub issues](https://img.shields.io/github/issues/OpenOmics/baseline?color=brightgreen)](https://github.com/OpenOmics/baseline/issues) [![GitHub license](https://img.shields.io/github/license/OpenOmics/baseline)](https://github.com/OpenOmics/baseline/blob/main/LICENSE)
+
+This is the home of the pipeline, baseline. Its long-term goals: to accurately ...insert goal, to infer ...insert goal, and to boldly ...insert goal like no pipeline before!
+
+## Overview
+Welcome to baseline! Before getting started, we highly recommend reading through [baseline's documentation](https://openomics.github.io/baseline/).
+
+The **`./baseline`** pipeline is composed of several inter-related sub commands to set up and run the pipeline across different systems. Each of the available sub commands performs a different function:
+
+ * [baseline run](https://openomics.github.io/baseline/usage/run/): Run the baseline pipeline with your input files.
+ * [baseline unlock](https://openomics.github.io/baseline/usage/unlock/): Unlocks a previous run's output directory.
+ * [baseline install](https://openomics.github.io/baseline/usage/install/): Download reference files locally.
+ * [baseline cache](https://openomics.github.io/baseline/usage/cache/): Cache remote resources locally, coming soon!
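+
+Each sub command also prints its own detailed usage information via the `-h`/`--help` flag, for example:
+```bash
+# General usage and list of sub commands
+./baseline -h
+# Detailed usage for the run sub command
+./baseline run --help
+```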
+
+**baseline** is a comprehensive ...insert long description. It relies on technologies like [Singularity1](https://singularity.lbl.gov/) to maintain the highest level of reproducibility. The pipeline consists of a series of data processing and quality-control steps orchestrated by [Snakemake2](https://snakemake.readthedocs.io/en/stable/), a flexible and scalable workflow management system, to submit jobs to a cluster.
+
+The pipeline is compatible with data generated from Illumina short-read sequencing technologies. As input, it accepts a set of FastQ files and can be run locally on a compute instance or on-premise using a cluster. A user can define the method or mode of execution. The pipeline can submit jobs to a cluster using a job scheduler like SLURM (more coming soon!). A hybrid approach ensures the pipeline is accessible to all users.
+
+Before getting started, we highly recommend reading through the [usage](https://openomics.github.io/baseline/usage/run/) section of each available sub command.
+
+For more information about issues or troubleshooting a problem, please check out our [FAQ](https://openomics.github.io/baseline/faq/questions/) prior to [opening an issue on Github](https://github.com/OpenOmics/baseline/issues).
+
+## Dependencies
+**Requires:** `singularity>=3.5` `snakemake>=6.0`
+
+At the moment, the pipeline uses a mixture of environment modules and docker images; however, this will be changing soon! In the very near future, the pipeline will only use docker images. With that being said, [snakemake](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html) and [singularity](https://singularity.lbl.gov/all-releases) must be installed on the target system. Snakemake orchestrates the execution of each step in the pipeline. To guarantee the highest level of reproducibility, each step of the pipeline will rely on versioned images from [DockerHub](https://hub.docker.com/orgs/nciccbr/repositories). Snakemake uses singularity to pull these images onto the local filesystem prior to job execution, and as such, snakemake and singularity will be the only two dependencies in the future.
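+
+As a quick sanity check before running anything, you can confirm that both dependencies are available on your `$PATH` and meet the minimum versions listed above (the `module load` line mirrors the Biowulf example in the next section and may differ or be unnecessary on other systems):
+```bash
+# Load dependencies if your cluster uses environment modules
+module load snakemake singularity
+# Verify the installed versions
+snakemake --version      # expecting >= 6.0
+singularity --version    # expecting >= 3.5
+```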
+
+## Installation
+Please clone this repository to your local filesystem using the following command:
+```bash
+# Clone Repository from Github
+git clone https://github.com/OpenOmics/baseline.git
+# Change your working directory
+cd baseline/
+# Add dependencies to $PATH
+# Biowulf users should run
+module load snakemake singularity
+# Get usage information
+./baseline -h
+```
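+
+After installation, a reasonable first step is a dry run against the small FastQ files bundled under `.tests/`; this mirrors the smoke test in the CI workflow and only prints the steps that would be executed:
+```bash
+# Dry-run the pipeline locally with the bundled test data
+./baseline run --input .tests/*.R?.fastq.gz \
+               --output output/ \
+               --mode local \
+               --dry-run
+```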
+
+## Contribute
+This site is a living document, created for and by members like you. baseline is maintained by the members of OpenOmics and is improved by continuous feedback! We encourage you to contribute new content and make improvements to existing content via a pull request to our [GitHub repository](https://github.com/OpenOmics/baseline).
+
+
+## Cite
+
+If you use this software, please cite it as below:
+
+**@BibText**
+
+```text
+Citation coming soon!
+```
+
+**@APA**
+
+```text
+Citation coming soon!
+```
+
+## References
+**1.** Kurtzer GM, Sochat V, Bauer MW (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE 12(5): e0177459.
+**2.** Köster, J. and S. Rahmann (2018). "Snakemake-a scalable bioinformatics workflow engine." Bioinformatics 34(20): 3600.
diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..6e8bf73
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/baseline b/baseline
new file mode 100755
index 0000000..bb3cd87
--- /dev/null
+++ b/baseline
@@ -0,0 +1,700 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+
+"""
+ABOUT: This is the main entry for the pipeline.
+REQUIRES:
+ - python>=3.6
+ - snakemake (recommended>=6.0.0)
+ - singularity (recommended==latest)
+DISCLAIMER:
+ PUBLIC DOMAIN NOTICE
+ NIAID Collaborative Bioinformatics Resource (NCBR)
+ National Institute of Allergy and Infectious Diseases (NIAID)
+This software/database is a "United States Government Work" under
+the terms of the United States Copyright Act. It was written as
+part of the author's official duties as a United States Government
+employee and thus cannot be copyrighted. This software is freely
+available to the public for use.
+Although all reasonable efforts have been taken to ensure the
+accuracy and reliability of the software and data, NCBR do not and
+cannot warrant the performance or results that may be obtained by
+using this software or data. NCBR and NIH disclaim all warranties,
+express or implied, including warranties of performance,
+merchantability or fitness for any particular purpose.
+Please cite the author and NIH resources like the "Biowulf Cluster"
+in any work or product based on this material.
+USAGE:
+ $ baseline [OPTIONS]
+EXAMPLE:
+ $ baseline run --input *.R?.fastq.gz --output output/
+"""
+
+# Python standard library
+from __future__ import print_function
+import sys, os, subprocess, re, json, textwrap
+
+# 3rd party imports from pypi
+import argparse # standard library since python/3.2; a 3rd party backport exists on pypi for older versions
+
+# Local imports
+from src import version
+from src.run import init, setup, bind, dryrun, runner
+from src.shells import bash
+from src.utils import (
+ Colors,
+ err,
+ exists,
+ fatal,
+ hashed,
+ permissions,
+ check_cache,
+ require)
+
+
+# Pipeline Metadata
+__version__ = version
+__authors__ = 'Skyler Kuhn'
+__email__ = 'skyler.kuhn@nih.gov'
+__home__ = os.path.dirname(os.path.abspath(__file__))
+_name = os.path.basename(sys.argv[0])
+_description = 'An awesome baseline pipeline'
+
+
+def unlock(sub_args):
+ """Unlocks a previous runs output directory. If snakemake fails ungracefully,
+ it maybe required to unlock the working directory before proceeding again.
+ This is rare but it does occasionally happen. Maybe worth add a --force
+ option to delete the '.snakemake/' directory in the future.
+ @param sub_args :
+ Parsed arguments for unlock sub-command
+ """
+ print("Unlocking the pipeline's output directory...")
+ outdir = sub_args.output
+
+ try:
+ unlock_output = subprocess.check_output([
+ 'snakemake', '--unlock',
+ '--cores', '1',
+ '--configfile=config.json'
+ ], cwd = outdir,
+ stderr=subprocess.STDOUT)
+ except subprocess.CalledProcessError as e:
+ # Unlocking process returned a non-zero exit code
+ sys.exit("{}\n{}".format(e, e.output))
+
+ print("Successfully unlocked the pipeline's working directory!")
+
+
+def run(sub_args):
+ """Initialize, setup, and run the pipeline.
+    Calls init() to create the output directory and copy over pipeline resources,
+    setup() to create the pipeline config file, dryrun() to ensure there are no issues
+    before running the pipeline, and finally runner() to execute the Snakemake workflow.
+ @param sub_args :
+ Parsed arguments for run sub-command
+ """
+ # Step 0. Check for required dependencies
+    # The pipeline has only two requirements:
+ # snakemake and singularity
+ require(['snakemake', 'singularity'], ['snakemake', 'singularity'])
+
+ # Step 1. Initialize working directory,
+ # copy over required resources to run
+ # the pipeline
+ git_repo = __home__
+ input_files = init(
+ repo_path = git_repo,
+ output_path = sub_args.output,
+ links = sub_args.input
+ )
+
+ # Step 2. Setup pipeline for execution,
+ # dynamically create config.json config
+ # file from user inputs and base config
+ # templates
+ config = setup(sub_args,
+ ifiles = input_files,
+ repo_path = git_repo,
+ output_path = sub_args.output
+ )
+
+ # Step 3. Resolve docker/singularity bind
+ # paths from the config file.
+ bindpaths = bind(
+ sub_args,
+ config = config
+ )
+
+ config['bindpaths'] = bindpaths
+
+ # Step 4. Save config to output directory
+ with open(os.path.join(sub_args.output, 'config.json'), 'w') as fh:
+ json.dump(config, fh, indent = 4, sort_keys = True)
+
+ # Optional Step: Dry-run pipeline
+ if sub_args.dry_run:
+ # Dryrun pipeline
+ dryrun_output = dryrun(outdir = sub_args.output) # python3 returns byte-string representation
+ print("\nDry-running {} pipeline:\n{}".format(_name, dryrun_output.decode("utf-8")))
+ sys.exit(0)
+
+ # Step 5. Orchestrate pipeline execution,
+    # run the pipeline locally on a compute node
+ # for debugging purposes or submit the master
+ # job to the job scheduler, SLURM, and create
+ # logging file
+ if not exists(os.path.join(sub_args.output, 'logfiles')):
+ # Create directory for logfiles
+ os.makedirs(os.path.join(sub_args.output, 'logfiles'))
+ if sub_args.mode == 'local':
+ log = os.path.join(sub_args.output, 'logfiles', 'snakemake.log')
+ else:
+ log = os.path.join(sub_args.output, 'logfiles', 'master.log')
+ logfh = open(log, 'w')
+ mjob = runner(mode = sub_args.mode,
+ outdir = sub_args.output,
+ # additional_bind_paths = all_bind_paths,
+ alt_cache = sub_args.singularity_cache,
+ threads = int(sub_args.threads),
+ jobname = sub_args.job_name,
+ submission_script=os.path.join(__home__, 'src', 'run.sh'),
+ logger = logfh,
+ additional_bind_paths = ",".join(bindpaths),
+ tmp_dir = sub_args.tmp_dir,
+ )
+
+ # Step 6. Wait for subprocess to complete,
+ # this is blocking and not asynchronous
+ if not sub_args.silent:
+ print("\nRunning {} pipeline in '{}' mode...".format(_name, sub_args.mode))
+ mjob.wait()
+ logfh.close()
+
+ # Step 7. Relay information about submission
+ # of the master job or the exit code of the
+ # pipeline that ran in local mode
+ if sub_args.mode == 'local':
+ if int(mjob.returncode) == 0:
+ print('{} pipeline has successfully completed'.format(_name))
+ else:
+ fatal('{} pipeline failed. Please see {} for more information.'.format(_name,
+ os.path.join(sub_args.output, 'logfiles', 'snakemake.log')))
+ elif sub_args.mode == 'slurm':
+ jobid = open(os.path.join(sub_args.output, 'logfiles', 'mjobid.log')).read().strip()
+ if not sub_args.silent:
+ if int(mjob.returncode) == 0:
+ print('Successfully submitted master job: ', end="")
+ else:
+ fatal('Error occurred when submitting the master job.')
+ print(jobid)
+
+
+def cache(sub_args):
+ """Caches remote resources or reference files stored on DockerHub and S3.
+ Local SIFs will be created from images defined in 'config/containers/images.json'.
+ @TODO: add option to cache other shared S3 resources (i.e. kraken db and fqscreen indices)
+ @param sub_args :
+ Parsed arguments for unlock sub-command
+ """
+ print(sub_args)
+    fatal('NotImplementedError... Coming Soon!')
+
+
+def parsed_arguments(name, description):
+ """Parses user-provided command-line arguments. Requires argparse and textwrap
+ package. argparse was added to standard lib in python 3.5 and textwrap was added
+ in python 3.5. To create custom help formatting for subparsers a docstring is
+ used create the help message for required options. argparse does not support named
+ subparser groups, which is normally what would be used to accomphish this reformatting.
+ As so, the help message for require options must be suppressed. If a new required arg
+ is added to a subparser, it must be added to the docstring and the usage statement
+ also must be updated.
+ @param name :
+ Name of the pipeline or command-line tool
+ @param description :
+ Short description of pipeline or command-line tool
+ """
+ # Add styled name and description
+ c = Colors
+ styled_name = "{0}{1}{2}baseline{3}".format(c.bold, c.bg_black, c.cyan, c.end)
+ description = "{0}{1}{2}".format(c.bold, description, c.end)
+
+ # Create a top-level parser
+ parser = argparse.ArgumentParser(description = '{}: {}'.format(styled_name, description))
+
+    # Adding version information
+ parser.add_argument('--version', action = 'version', version='%(prog)s {}'.format(__version__))
+
+ # Create sub-command parser
+ subparsers = parser.add_subparsers(help='List of available sub-commands')
+
+ # Sub-parser for the "run" sub-command
+ # Grouped sub-parser arguments are currently
+ # not supported: https://bugs.python.org/issue9341
+ # Here is a work around to create more useful help message for named
+ # options that are required! Please note: if a required arg is added the
+ # description below should be updated (i.e. update usage and add new option)
+ required_run_options = textwrap.dedent("""\
+ {0}: {1}
+
+ {3}{4}Synopsis:{5}
+ $ {2} run [--help] \\
+ [--dry-run] [--job-name JOB_NAME] [--mode {{slurm,local}}] \\
+ [--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \\
+ [--silent] [--threads THREADS] [--tmp-dir TMP_DIR] \\
+ --input INPUT [INPUT ...] \\
+ --output OUTPUT
+
+ Optional arguments are shown in square brackets above.
+
+ {3}{4}Description:{5}
+          To run the ...long pipeline name with your raw data, please provide
+          a space separated list of FastQ files (globbing is supported) and an output
+ directory to store results.
+
+ {3}{4}Required arguments:{5}
+ --input INPUT [INPUT ...]
+ Input FastQ file(s) to process. The pipeline does NOT
+ support single-end data. FastQ files for one or more
+ samples can be provided. Multiple input FastQ files
+                              should be separated by a space. Globbing for multiple
+                              files is also supported.
+ Example: --input .tests/*.R?.fastq.gz
+ --output OUTPUT
+ Path to an output directory. This location is where
+ the pipeline will create all of its output files, also
+ known as the pipeline's working directory. If the user
+ provided working directory has not been initialized,
+ it will be created automatically.
+ Example: --output /data/$USER/output
+
+ {3}{4}Analysis options:{5}
+ ...coming soon!
+
+ {3}{4}Orchestration options:{5}
+ --mode {{slurm,local}}
+                              Method of execution. Valid options for this mode
+                              include: local or slurm. Additional modes of execution
+                              are coming soon, default: slurm.
+ Here is a brief description of each mode:
+ • local: uses local method of execution. local runs
+                              will run serially on a compute instance. This is useful
+                              for testing, debugging, or when a user does not have
+ access to a high performance computing environment.
+ If this option is not provided, it will default to a
+ slurm mode of execution.
+ • slurm: uses slurm execution backend. This method
+ will submit jobs to a cluster using sbatch. It is
+                              recommended to run the pipeline in this mode as it
+ will be significantly faster.
+ Example: --mode slurm
+ --job-name JOB_NAME
+ Overrides the name of the pipeline's master job. When
+                              submitting the pipeline to a job scheduler, this option
+ overrides the default name of the master job. This can
+ be useful for tracking the progress or status of a run,
+ default: pl:{2}.
+ Example: --job-name {2}_03-14.1592
+ --dry-run
+ Does not execute anything. Only displays what steps in
+ the pipeline remain or will be run.
+ Example: --dry-run
+ --silent
+                              Silence standard output. This reduces the amount
+ of information displayed to standard output when the
+ master job is submitted to the job scheduler. Only the
+ job id of the master job is returned.
+ Example: --silent
+ --singularity-cache SINGULARITY_CACHE
+ Overrides the $SINGULARITY_CACHEDIR variable. Images
+ from remote registries are cached locally on the file
+ system. By default, the singularity cache is set to:
+ '/path/to/output/directory/.singularity/'. Please note
+ that this cache cannot be shared across users.
+ Example: --singularity-cache /data/$USER
+ --sif-cache SIF_CACHE
+ Path where a local cache of SIFs are stored. This cache
+ can be shared across users if permissions are properly
+ setup. If a SIF does not exist in the SIF cache, the
+ image will be pulled from Dockerhub. {2} cache
+ sub command can be used to create a local SIF cache.
+ Please see {2} cache for more information.
+ Example: --sif-cache /data/$USER/sifs/
+ --tmp-dir TMP_DIR
+ Path on the file system for writing temporary output
+ files. By default, the temporary directory is set to
+ '/lscratch/$SLURM_JOBID' for backwards compatibility
+ with the NIH's Biowulf cluster; however, if you are
+ running the pipeline on another cluster, this option
+ will need to be specified. Ideally, this path should
+ point to a dedicated location on the filesystem for
+ writing tmp files. On many systems, this location is
+ set to somewhere in /scratch. If you need to inject a
+ variable into this string that should NOT be expanded,
+                              please quote this option's value in single quotes.
+ Example: --tmp-dir '/scratch/$USER/'
+ --threads THREADS
+ Max number of threads for local processes. It is
+                              recommended to set this value to the maximum number
+ of CPUs available on the host machine, default: 2.
+                              Example: --threads 16
+
+ {3}{4}Misc Options:{5}
+ -h, --help Show usage information, help message, and exit.
+ Example: --help
+ """.format(styled_name, description, name, c.bold, c.url, c.end, c.italic))
+
+ # Display example usage in epilog
+ run_epilog = textwrap.dedent("""\
+ {2}{3}Example:{4}
+ # Step 1.) Grab an interactive node,
+ # do not run on head node!
+ srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash
+ module purge
+ module load singularity snakemake
+
+ # Step 2A.) Dry-run the pipeline
+ ./{0} run --input .tests/*.R?.fastq.gz \\
+ --output /data/$USER/output \\
+ --mode slurm \\
+ --dry-run
+
+ # Step 2B.) Run the {0} pipeline
+ # The slurm mode will submit jobs to
+ # the cluster. It is recommended running
+ # the pipeline in this mode.
+ ./{0} run --input .tests/*.R?.fastq.gz \\
+ --output /data/$USER/output \\
+ --mode slurm
+
+ {2}{3}Version:{4}
+ {1}
+ """.format(name, __version__, c.bold, c.url, c.end))
+
+    # Suppressing help message of required args to work around the lack of named sub-parser groups
+ subparser_run = subparsers.add_parser('run',
+ help = 'Run the {} pipeline with input files.'.format(name),
+ usage = argparse.SUPPRESS,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description = required_run_options,
+ epilog = run_epilog,
+ add_help=False
+ )
+
+ # Required Arguments
+ # Input FastQ files
+ subparser_run.add_argument(
+ '--input',
+ # Check if the file exists and if it is readable
+ type = lambda file: permissions(parser, file, os.R_OK),
+ required = True,
+ nargs = '+',
+ help = argparse.SUPPRESS
+ )
+
+ # Output Directory, i.e
+ # working directory
+ subparser_run.add_argument(
+ '--output',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = True,
+ help = argparse.SUPPRESS
+ )
+
+ # Optional Arguments
+ # Add custom help message
+ subparser_run.add_argument(
+ '-h', '--help',
+ action='help',
+ help=argparse.SUPPRESS
+ )
+
+ # Analysis options
+ # ... add here
+
+ # Orchestration Options
+ # Execution Method, run locally
+ # on a compute node or submit to
+ # a supported job scheduler, etc.
+ subparser_run.add_argument(
+ '--mode',
+ type = str,
+ required = False,
+ default = "slurm",
+ choices = ['slurm', 'local'],
+ help = argparse.SUPPRESS
+ )
+
+ # Name of master job
+ subparser_run.add_argument(
+ '--job-name',
+ type = str,
+ required = False,
+ default = 'pl:{}'.format(name),
+ help = argparse.SUPPRESS
+ )
+
+ # Dry-run
+ # Do not execute the workflow,
+ # prints what steps remain
+ subparser_run.add_argument(
+ '--dry-run',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Silent output mode
+ subparser_run.add_argument(
+ '--silent',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Singularity cache directory,
+ # default uses output directory
+ subparser_run.add_argument(
+ '--singularity-cache',
+ type = lambda option: check_cache(parser, os.path.abspath(os.path.expanduser(option))),
+ required = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Local SIF cache directory,
+ # default pull from Dockerhub
+ subparser_run.add_argument(
+ '--sif-cache',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = False,
+ help = argparse.SUPPRESS
+ )
+
+ # Base directory to write
+ # temporary/intermediate files
+ subparser_run.add_argument(
+ '--tmp-dir',
+ type = str,
+ required = False,
+ default = '/lscratch/$SLURM_JOBID/',
+ help = argparse.SUPPRESS
+ )
+
+ # Number of threads for the
+    # pipeline's main process
+ # This is only applicable for
+ # local rules or when running
+ # in local mode.
+ subparser_run.add_argument(
+ '--threads',
+ type = int,
+ required = False,
+ default = 2,
+ help = argparse.SUPPRESS
+ )
+
+ # Sub-parser for the "unlock" sub-command
+ # Grouped sub-parser arguments are currently
+ # not supported: https://bugs.python.org/issue9341
+ # Here is a work around to create more useful help message for named
+ # options that are required! Please note: if a required arg is added the
+ # description below should be updated (i.e. update usage and add new option)
+ required_unlock_options = textwrap.dedent("""\
+ {0}: {1}
+
+ {3}{4}Synopsis:{5}
+ $ {2} unlock [-h] --output OUTPUT
+
+ Optional arguments are shown in square brackets above.
+
+ {3}{4}Description:{5}
+          If the pipeline fails ungracefully, it may be required to unlock
+ the working directory before proceeding again. Please verify that
+ the pipeline is not running before running this command. If the
+ pipeline is still running, the workflow manager will report the
+ working directory is locked. This is normal behavior. Do NOT run
+ this command if the pipeline is still running.
+
+ {3}{4}Required arguments:{5}
+ --output OUTPUT Path to a previous run's output directory
+ to unlock. This will remove a lock on the
+ working directory. Please verify that the
+ pipeline is not running before running
+ this command.
+ Example: --output /data/$USER/output
+
+ {3}{4}Misc Options:{5}
+ -h, --help Show usage information, help message,
+ and exit.
+ Example: --help
+ """.format(styled_name, description, name, c.bold, c.url, c.end))
+
+ # Display example usage in epilog
+ unlock_epilog = textwrap.dedent("""\
+ {2}{3}Example:{4}
+ # Unlock output directory of pipeline
+ {0} unlock --output /data/$USER/output
+
+ {2}{3}Version:{4}
+ {1}
+ """.format(name, __version__, c.bold, c.url, c.end))
+
+    # Suppressing help message of required args to work around the lack of named sub-parser groups
+ subparser_unlock = subparsers.add_parser(
+ 'unlock',
+        help = 'Unlocks the output directory of a previous run.',
+ usage = argparse.SUPPRESS,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description = required_unlock_options,
+ epilog = unlock_epilog,
+ add_help = False
+ )
+
+ # Required Arguments
+ # Output Directory (analysis working directory)
+ subparser_unlock.add_argument(
+ '--output',
+ type = str,
+ required = True,
+ help = argparse.SUPPRESS
+ )
+
+ # Add custom help message
+ subparser_unlock.add_argument(
+ '-h', '--help',
+ action='help',
+ help=argparse.SUPPRESS
+ )
+
+
+ # Sub-parser for the "cache" sub-command
+ # Grouped sub-parser arguments are
+ # not supported: https://bugs.python.org/issue9341
+ # Here is a work around to create more useful help message for named
+ # options that are required! Please note: if a required arg is added the
+ # description below should be updated (i.e. update usage and add new option)
+ required_cache_options = textwrap.dedent("""\
+ {0}: {1}
+
+ {3}{4}Synopsis:{5}
+ $ {2} cache [-h] [--dry-run] --sif-cache SIF_CACHE
+
+ Optional arguments are shown in square brackets above.
+
+ {3}{4}Description:{5}
+          Creates a local cache of resources hosted on DockerHub or AWS S3.
+ These resources are normally pulled onto the filesystem when the
+ pipeline runs; however, due to network issues or DockerHub pull
+ rate limits, it may make sense to pull the resources once so a
+ shared cache can be created. It is worth noting that a singularity
+ cache cannot normally be shared across users. Singularity strictly
+ enforces that a cache is owned by the user. To get around this
+ issue, the cache subcommand can be used to create local SIFs on
+ the filesystem from images on DockerHub.
+
+ {3}{4}Required arguments:{5}
+ --sif-cache SIF_CACHE
+ Path where a local cache of SIFs will be
+ stored. Images defined in containers.json
+ will be pulled into the local filesystem.
+ The path provided to this option can be
+ passed to the --sif-cache option of the
+ run sub command. Please see {2}
+ run sub command for more information.
+ Example: --sif-cache /data/$USER/cache
+
+ {3}{4}Orchestration options:{5}
+ --dry-run Does not execute anything. Only displays
+ what remote resources would be pulled.
+ Example: --dry-run
+
+ {3}{4}Misc Options:{5}
+ -h, --help Show usage information, help message,
+ and exits.
+ Example: --help
+
+ """.format(styled_name, description, name, c.bold, c.url, c.end))
+
+ # Display example usage in epilog
+ cache_epilog = textwrap.dedent("""\
+ {2}{3}Example:{4}
+ # Cache remote resources of pipeline
+ {0} cache --sif-cache /data/$USER/cache
+
+ {2}{3}Version:{4}
+ {1}
+ """.format(name, __version__, c.bold, c.url, c.end))
+
+    # Suppressing help message of required args to work around the lack of named sub-parser groups
+ subparser_cache = subparsers.add_parser(
+ 'cache',
+ help = 'Cache remote resources locally.',
+ usage = argparse.SUPPRESS,
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description = required_cache_options,
+ epilog = cache_epilog,
+ add_help = False
+ )
+
+ # Required Arguments
+ # Output Directory (analysis working directory)
+ subparser_cache.add_argument(
+ '--sif-cache',
+ type = lambda option: os.path.abspath(os.path.expanduser(option)),
+ required = True,
+ help = argparse.SUPPRESS
+ )
+
+ # Optional Arguments
+ # Dry-run cache command (do not pull any remote resources)
+ subparser_cache.add_argument(
+ '--dry-run',
+ action = 'store_true',
+ required = False,
+ default = False,
+ help=argparse.SUPPRESS
+ )
+
+ # Add custom help message
+ subparser_cache.add_argument(
+ '-h', '--help',
+ action='help',
+ help=argparse.SUPPRESS
+ )
+
+ # Define handlers for each sub-parser
+ subparser_run.set_defaults(func = run)
+ subparser_unlock.set_defaults(func = unlock)
+ subparser_cache.set_defaults(func = cache)
+
+ # Parse command-line args
+ args = parser.parse_args()
+ return args
+
+
+def main():
+
+ # Sanity check for usage
+ if len(sys.argv) == 1:
+ # Nothing was provided
+ fatal('Invalid usage: {} [-h] [--version] ...'.format(_name))
+
+ # Collect args for sub-command
+ args = parsed_arguments(
+ name = _name,
+ description = _description
+ )
+
+ # Display version information
+ err('{} ({})'.format(_name, __version__))
+
+ # Mediator method to call sub-command's set handler function
+ args.func(args)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/config/cluster.json b/config/cluster.json
new file mode 100644
index 0000000..545162f
--- /dev/null
+++ b/config/cluster.json
@@ -0,0 +1,8 @@
+{
+ "__default__": {
+ "threads": "4",
+ "mem": "8g",
+ "partition": "norm",
+ "time": "0-04:00:00"
+ }
+}
diff --git a/config/config.json b/config/config.json
new file mode 100644
index 0000000..7056c94
--- /dev/null
+++ b/config/config.json
@@ -0,0 +1,4 @@
+{
+ "options": {
+ }
+}
diff --git a/config/containers.json b/config/containers.json
new file mode 100644
index 0000000..46aecee
--- /dev/null
+++ b/config/containers.json
@@ -0,0 +1,4 @@
+{
+ "images": {
+ }
+}
diff --git a/config/genome.json b/config/genome.json
new file mode 100644
index 0000000..00c1ed7
--- /dev/null
+++ b/config/genome.json
@@ -0,0 +1,4 @@
+{
+ "references": {
+ }
+}
diff --git a/config/modules.json b/config/modules.json
new file mode 100644
index 0000000..7a587e7
--- /dev/null
+++ b/config/modules.json
@@ -0,0 +1,4 @@
+{
+ "tools": {
+ }
+}
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 0000000..2f30bcd
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,35 @@
+## Steps for Building Docker Images
+
+Directly below are instructions for building an image using the provided Dockerfile:
+
+```bash
+# See listing of images on computer
+docker image ls
+
+# Build from Dockerfile
+docker build --no-cache -f example.dockerfile --tag=example:v0.1.0 .
+
+# Testing, take a peek inside
+docker run -ti example:v0.1.0 /bin/bash
+
+# Updating Tag before pushing to DockerHub
+docker tag example:v0.1.0 skchronicles/example:v0.1.0
+docker tag example:v0.1.0 skchronicles/example # latest
+
+# Check out new tag(s)
+docker image ls
+
+# Push new tagged image to DockerHub
+docker push skchronicles/example:v0.1.0
+docker push skchronicles/example:latest
+```
+
+### Other Recommended Steps
+
+Scan your image for known vulnerabilities:
+
+```bash
+docker scan example:v0.1.0
+```
+
+> **Please Note**: Any references to `skchronicles` should be replaced with your username if you would also like to push the image to a non-org account.
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..7e964c7
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,33 @@
+# Build documentation
+
+> **Please Note:** When a commit touching the `docs/` directory is pushed, it triggers a [github actions workflow](https://github.com/OpenOmics/baseline/actions) to build the static site and push it to the gh-pages branch.
+
+### Installation
+```bash
+# Clone the Repository
+git clone https://github.com/OpenOmics/baseline.git
+cd baseline/
+# Create a virtual environment
+python3 -m venv .venv
+# Activate the virtual environment
+. .venv/bin/activate
+# Update pip
+pip install --upgrade pip
+# Download Dependencies
+pip install -r docs/requirements.txt
+```
+
+### Preview while editing
+MkDocs includes a preview server, so you can view your updates live as you write your documentation. The server automatically rebuilds the site whenever a file is edited and saved.
+```bash
+# Activate the virtual environment
+. .venv/bin/activate
+# Start serving your documentation
+mkdocs serve
+```
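+
+> **Please Note:** By default, `mkdocs serve` hosts the live preview at `http://127.0.0.1:8000/`.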
+
+### Build static site
+Once you are content with your changes, you can build the static site:
+```bash
+mkdocs build
+```
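+
+By default, the rendered site is written to the `site/` directory, which is already excluded from version control via the project's `.gitignore`.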
diff --git a/docs/assets/favicon/favicon.ico b/docs/assets/favicon/favicon.ico
new file mode 100644
index 0000000..e85006a
Binary files /dev/null and b/docs/assets/favicon/favicon.ico differ
diff --git a/docs/assets/icons/doc-book.svg b/docs/assets/icons/doc-book.svg
new file mode 100644
index 0000000..10ced62
--- /dev/null
+++ b/docs/assets/icons/doc-book.svg
@@ -0,0 +1,9 @@
+
+
diff --git a/docs/css/extra.css b/docs/css/extra.css
new file mode 100644
index 0000000..5c0c7aa
--- /dev/null
+++ b/docs/css/extra.css
@@ -0,0 +1,24 @@
+@keyframes heart {
+ 0%, 40%, 80%, 100% {
+ transform: scale(1);
+ }
+ 20%, 60% {
+ transform: scale(1.15);
+ }
+}
+
+.heart {
+ animation: heart 1500ms infinite;
+}
+
+[data-md-color-scheme="slate"] {
+ --md-primary-fg-color: #1A1B23DE;
+ --md-typeset-a-color: #b1b9ed;
+}
+
+.md-typeset .admonition.custom-grid-button,
+.md-typeset details.custom-grid-button {
+ border-color: var(--md-code-bg-color);
+ border-width: 2px;
+ width: 45%;
+}
\ No newline at end of file
diff --git a/docs/faq/questions.md b/docs/faq/questions.md
new file mode 100644
index 0000000..40c323d
--- /dev/null
+++ b/docs/faq/questions.md
@@ -0,0 +1,4 @@
+# Frequently Asked Questions
+
+This page is still under construction. If you need immediate help, please [open an issue](https://github.com/OpenOmics/baseline/issues) on Github!
+
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..086219d
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,91 @@
+
+ This is the home of the pipeline, baseline. Its long-term goals: to accurately ...insert goal, to infer ...insert goal, and to boldly ...insert goal like no pipeline before!
+