diff --git a/advert/Course_Advert.odt b/advert/Course_Advert.odt new file mode 100644 index 0000000..1538dc0 Binary files /dev/null and b/advert/Course_Advert.odt differ diff --git a/handout/README.txt b/handout/README.txt new file mode 100644 index 0000000..82b0952 --- /dev/null +++ b/handout/README.txt @@ -0,0 +1,41 @@ +Copyright (c) 2024, Snakemake Teaching Alliance + Christian Meesters & JGU Mainz + All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of Christian Meesters or the JGU Mainz nor the names of + its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +SAMPLE SCRIPTS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CHRISTIAN MEESTERS OR THE JGU +BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------- + +Please note: + +* This handout is not intended to be used as reference material. 
+* In particular, the recipient has been informed that people new + to Snakemake and HPC should partake in this course themselves as (some) + material can and will be outdated, eventually. +* This is a handout release: Slides are merged and some (graphical) + content has been dropped. +* Sample scripts and (where applicable) solutions are provided. The above + notice applies. + + +Please direct suggestions for improvements to https://github.com/cmeesters/snakemake-hpc-teaching-material/issues . diff --git a/images/logos/gwdg.jpg b/images/logos/gwdg.jpg new file mode 100644 index 0000000..940af10 Binary files /dev/null and b/images/logos/gwdg.jpg differ diff --git a/images/misc/data_center.png b/images/misc/data_center.png new file mode 100644 index 0000000..8a3dce3 Binary files /dev/null and b/images/misc/data_center.png differ diff --git a/images/misc/latency-definition.jpg b/images/misc/latency-definition.jpg new file mode 100644 index 0000000..14c6e47 Binary files /dev/null and b/images/misc/latency-definition.jpg differ diff --git a/pack_release.py b/pack_release.py new file mode 100755 index 0000000..d163a91 --- /dev/null +++ b/pack_release.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +import argparse +import sys +import os +import shutil +import subprocess +import shlex +import zipfile + +def run_pdflatex(master): + pwd = os.getcwd() + if os.sep in master: # probably a path + basename = os.path.basename(master) + dirname = os.path.dirname(master) + os.chdir(dirname) + master = basename + call = r'pdflatex -synctex=1 -interaction=nonstopmode "\def\ishandout{1} \input{' + master + r'}"' + subprocess.call(shlex.split(call)) + os.chdir(pwd) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--version', required = True, + help = "indicate the handout version, e.g. 
edition") + parser.add_argument('--master-tex', required = True, + help = "indicate a TeX master document") + parser.add_argument('--sample-directory', required = True, + help = "path to directory with script files (cloze and solution") + args = parser.parse_args() + + master = args.master_tex + + print(f"Typesetting Handout Version for '{master}'") + #run_pdflatex(master) + + handout_version = os.path.splitext(master)[0] + '.pdf' + final_place = os.path.basename(handout_version) + + file_list=[(handout_version, final_place), + ('handout/README.txt', 'README.txt')] + opj = os.path.join + for root, dirs, files in os.walk(args.sample_directory): + for fname in files: + # restrict to sample files, no helper files + if 'copy' in fname or 'README' in fname: continue + file_list.append((opj(root,fname), opj(*root.split('/')[1:],fname))) + + buildzipfname = 'snakemake_intro_%s.zip' % args.version + z = zipfile.ZipFile(buildzipfname, 'w', compression = zipfile.ZIP_DEFLATED) + for item in file_list: + z.write(item[0], item[1]) + z.close() + diff --git a/setup/copy_script_mogonII.sh b/setup/copy_script_mogonII.sh index bde45ae..9f2d10e 100644 --- a/setup/copy_script_mogonII.sh +++ b/setup/copy_script_mogonII.sh @@ -6,10 +6,12 @@ CLUSTER_ALIAS="mogon_nox" # nox stands for "no X11 forwarding" BASEPATH="/lustre/project/hpckurs/workflows" # repeated used # creating remote directory: -ssh mogon_nox "mkdir -p ${BASEPATH}" +ssh ${CLUSTER_ALIAS} "mkdir -p ${BASEPATH}" -scp condarc "${CLUSTER_ALIAS}:/lustre/project/hpckurs/workflows/condarc" -scp get_tutorial.sh "${CLUSTER_ALIAS}:/lustre/project/hpckurs/workflows/get_tutorial.sh" +scp condarc "${CLUSTER_ALIAS}:${BASEPATH}/condarc" +scp get_tutorial.sh "${CLUSTER_ALIAS}:${BASEPATH}/get_tutorial.sh" +scp install_micromamba.sh "${CLUSTER_ALIAS}:${BASEPATH}/install_micromamba.sh" +scp environment.yaml "${CLUSTER_ALIAS}:${BASEPATH}/environment.yaml" -rsync -rtlv --chmod=D755 "tutorial" "${CLUSTER_ALIAS}:/lustre/project/hpckurs/workflows" 
+rsync -rtlv --chmod=D755 "tutorial" "${CLUSTER_ALIAS}:${BASEPATH}" rsync -rtlv --chmod=D755 "solutions" "${CLUSTER_ALIAS}:/lustre/project/hpckurs" diff --git a/setup/environment.yaml b/setup/environment.yaml new file mode 100644 index 0000000..25f8bf5 --- /dev/null +++ b/setup/environment.yaml @@ -0,0 +1,18 @@ +channels: + - conda-forge + - bioconda +dependencies: + - snakemake-minimal >=8.4.4 + - snakemake-executor-plugin-slurm + - snakemake-storage-plugin-fs + - jinja2 + - matplotlib + - graphviz + - bcftools =1.19 + - samtools =1.19.2 + - bwa =0.7.17 + # - pysam =0.22 + # at the time of writing - 7. Feb 24 - pysam will require + # a lower python version than snakemake, install pysam + # using pip + - pygments diff --git a/setup/install_micromamba.sh b/setup/install_micromamba.sh new file mode 100644 index 0000000..16646b1 --- /dev/null +++ b/setup/install_micromamba.sh @@ -0,0 +1 @@ +"${SHELL}" <(curl -L micro.mamba.pm/install.sh) diff --git a/setup/tutorial/01_Snakefile b/setup/tutorial/01_Snakefile index 329f04b..a2df533 100644 --- a/setup/tutorial/01_Snakefile +++ b/setup/tutorial/01_Snakefile @@ -1,3 +1,9 @@ +# This is our first "rule" - it +# serves as a template to proceed. +# +# All other templates are in the same +# tutorial folder. 
+ rule bwa_map: input: "data/genome.fa", diff --git a/slides/Snakemake_HPC_Creators.tex b/slides/Snakemake_HPC_Creators.tex index ce368bf..8d333bd 100644 --- a/slides/Snakemake_HPC_Creators.tex +++ b/slides/Snakemake_HPC_Creators.tex @@ -2,6 +2,12 @@ % Snakemake-Intro for Workflow Creators for HPC Users % %-----------------------------------------------------------------% +% this code will compile the document as handout with +% $ pdflatex -synctex=1 -interaction=nonstopmode "\def\ishandout{1} \input{Snakemake_HPC_Creators.tex}" +\ifdefined\ishandout + \PassOptionsToClass{handout}{beamer} +\fi + \documentclass[english,xcolor=pdftex,dvipsnames]{beamer} % to typeset only a few slide sets, set them here during development @@ -74,7 +80,10 @@ %\include{creators/Uploading_Workflows} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\include{common/Workflow_Parameterization_for_HPC} +\include{creators/Workflow_Parameterization_for_HPC} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\include{common/Software_Provisioning_with_Snakemake} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \include{common/using_wrappers} diff --git a/slides/Snakemake_HPC_Users.tex b/slides/Snakemake_HPC_Users.tex index 4f852ed..d033e89 100644 --- a/slides/Snakemake_HPC_Users.tex +++ b/slides/Snakemake_HPC_Users.tex @@ -2,6 +2,13 @@ % Snakemake-Intro for HPC Users % %-------------------------------------------------------% + +% this code will compile the document as handout with +% $ pdflatex -synctex=1 -interaction=nonstopmode "\def\ishandout{1} \input{Snakemake_HPC_Users.tex}" +\ifdefined\ishandout +\PassOptionsToClass{handout}{beamer} +\fi + \documentclass[english,xcolor=pdftex,dvipsnames]{beamer} % to typeset only a few slide sets, set them here during development diff --git a/slides/common/Hello_World_HPC_MogonII.tex b/slides/common/Hello_World_HPC_MogonII.tex index ae6f178..d5e6346 100644 --- 
a/slides/common/Hello_World_HPC_MogonII.tex +++ b/slides/common/Hello_World_HPC_MogonII.tex @@ -19,7 +19,7 @@ Save the script as \texttt{hello\_world.sh} and submit it with the following statement: \end{task} \begin{lstlisting}[language=Bash, style=Shell, basicstyle=\footnotesize] - $ sbatch hello_world.sh +$ sbatch hello_world.sh \end{lstlisting} \end{onlyenv} \begin{onlyenv}<2> diff --git a/slides/common/Reports.tex b/slides/common/Reports.tex index 23fff78..009c033 100644 --- a/slides/common/Reports.tex +++ b/slides/common/Reports.tex @@ -89,7 +89,7 @@ \subsection{Benchmarking} \end{itemize} \pause \begin{warning} - We conclude: \Snakemake{}'s benchmarking capabilities are limited! + We conclude: \Snakemake's benchmarking capabilities are limited, but a reasonable way to get basic benchmarks. \end{warning} \end{frame} @@ -106,12 +106,55 @@ \subsection{Reporting} This will generate a file called ``\altverb{report.html}'', which you can visualize with a browser. \end{frame} + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\begin{frame} - \frametitle{\Snakemake{} Reports and Cluster Jobs} - \begin{alertblock}{Why reports about Cluster Jobs can be misleading.} - When reporting about a cluster or cloud job, \Snakemake{} shepherd job on the login-node will measure the wall time from submit time to the finish time, not the executing job(s).\newline - This means: Times can be greatly exaggerated! - \end{alertblock} -\end{frame} +\begin{frame}[fragile] + \frametitle{\Snakemake{} Reports - adding Output} + \begin{docs} + Each output file that shall be part of the report has to be marked with the \altverb{report} flag, which optionally points to a caption in \lhref{https://docutils.sourceforge.io/docs/user/rst/quickstart.html}{restructured text format}. 
+ \end{docs} + An example for our workflow would be: + \begin{lstlisting}[language=Python,style=Python] +rule plot_quals: + input: + "calls/all.vcf" + output: + @report("plots/quals.svg",@ + @caption="report/qual.rst")@ + script: + "scripts/plot-quals.py" + \end{lstlisting} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\HandsOn{Writing an Annotation}} + Let's write the file \altverb{report/qual.rst}! It shall contain our caption. + \pause + Our solution might(!) look like this: + \begin{lstlisting} +Number of variations (deviations from reference) +per experimental record. + \end{lstlisting} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\HandsOn{Adding our Figure to the Report}} + Now add the highlighted lines: + \begin{lstlisting}[language=Python,style=Python] +rule plot_quals: + input: + "calls/all.vcf" + output: + @report("plots/quals.svg",@ + @caption="report/qual.rst")@ + script: + "scripts/plot-quals.py" + \end{lstlisting} + \begin{task}{Re-Run our report generator:} + \altverb{snakemake --report} + \end{task} +\end{frame} + diff --git a/slides/common/Software_Provisioning_with_Snakemake.tex b/slides/common/Software_Provisioning_with_Snakemake.tex new file mode 100644 index 0000000..65692c9 --- /dev/null +++ b/slides/common/Software_Provisioning_with_Snakemake.tex @@ -0,0 +1,199 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Software Provisioning with Snakemake} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{Outline} + \begin{columns}[t] + \begin{column}{.5\textwidth} + \tableofcontents[sections={1-7},currentsection] + \end{column} + \begin{column}{.5\textwidth} + \tableofcontents[sections={8-15},currentsection] + \end{column} + \end{columns} +\end{frame} + 
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{What is this about?} + \begin{question}[Questions] + \begin{itemize} + \item Conda, Container, Environment Modules?! What is the Fuss? + \end{itemize} + \end{question} + \begin{docs}[Objectives] + \begin{enumerate} + \item Learn about the possible provisioning methods of \Snakemake. + \item Learn to distinguish these methods. + \end{enumerate} + \end{docs} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Your Options} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{Snakemake's Software Provisioning - Overview} + \Snakemake{} basically offers 3 software deployment methods: + \begin{itemize}[<+->] + \item Container + \item Environment Modules + \item Conda (and its derivatives) + \end{itemize} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Apptainer} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Using Containers} + Within a rule we can use the \altverb{container} directive to select a container image + \begin{lstlisting}[language=Python,style=Python] +rule NAME: + input: "table.txt" + output: "plots/myplot.pdf" + @container:@ + @"docker://joseespinosa/docker-r-ggplot2"@ + script: + "scripts/plot-stuff.R" + \end{lstlisting} + \begin{docs}{Allowed Container Flavours} + Allowed image URLs need to be supported by Apptainer (e.\,g., \altverb{shub://} and \altverb{docker://}). \altverb{docker://} is preferred. 
+ \end{docs} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Using Containers II} + When executing \Snakemake{} with + \begin{lstlisting}[language=Bash, style=Shell] +snakemake --software-deployment-method apptainer +# or the shorthand version +snakemake --sdm apptainer + \end{lstlisting} + it will execute the job within a container that is spawned from the given image. +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\Interlude{Containerizing your Workflow}} + A \Snakemake{} workflow with conda environments for each rule can automatically generate a container image specification (as a Dockerfile) that contains all required environments: + \begin{lstlisting}[language=Bash, style=Shell] +snakemake --containerize > Dockerfile + \end{lstlisting} + \begin{question}{Which is the Purpose?} + \begin{itemize} + \item archive a container for publication. + \item deploy and ship a container to a $3^{\mathsf{rd}}$ party + \end{itemize} + \end{question} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Conda} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Integrated Package Management} + \begin{columns} + \begin{column}{0.5\textwidth} + In rules we can use the \altverb{conda} directive to specify our environment. 
+ \begin{lstlisting}[language=Python,style=Python,basicstyle=\footnotesize] +rule NAME: + input: + "table.txt" + output: + "plots/myplot.pdf" + @conda:@ + "envs/ggplot.yaml" + script: + "scripts/plot-stuff.R" + \end{lstlisting} + \end{column} + \begin{column}{0.5\textwidth} + with the following \lhref{}{environment definition}: + \begin{lstlisting}[style=Plain] +channels: +- r +dependencies: +- r=3.3.1 +- r-ggplot2=2.1.0 + \end{lstlisting} + \end{column} + \end{columns} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Important Conda Flags} + To use Conda add + \begin{lstlisting}[language=Bash, style=Shell] +--software-deployment-method conda +# or short +--sdm conda + \end{lstlisting} + \pause + It is possible to use different Conda frontends: + \begin{lstlisting}[language=Bash, style=Shell] +--conda-frontend mamba # or micromamba + \end{lstlisting} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Environment Modules} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Using Environment Modules} + You can mix Conda and Environment Modules: + \begin{lstlisting}[language=Python,style=Python] +rule bwa: + input: + "genome.fa" + "reads.fq" + output: + "mapped.bam" + conda: + "envs/bwa.yaml" + @envmodules:@ + "bio/bwa/0.7.9", + "bio/samtools/1.9" + shell: + "bwa mem {input} | samtools view -Sbh - > {output}" + \end{lstlisting} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Using Environment Modules II} + When \Snakemake{} is executed with + \begin{lstlisting}[language=Bash, style=Shell] +snakemake --use-envmodules + \end{lstlisting} + it will load the defined modules in the given order, instead of using the also defined conda environment. 
+ \pause + \begin{hint} + It is recommended to specify Conda environments alongside - only then containerizing a workflow works! + \end{hint} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Summing Up} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Software Provisioning Summary} + You can + \begin{itemize} + \item manually install environment (done in this course) + \item use \Snakemake's features for conda, env. modules, containers + \end{itemize} + See + \begin{lstlisting}[language=Bash, style=Shell] +snakemake --help + \end{lstlisting} + for an overview or \lhref{https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html}{the official documentation}. +\end{frame} \ No newline at end of file diff --git a/slides/common/Why_Workflows.tex b/slides/common/Why_Workflows.tex index 3749f84..a58e75c 100644 --- a/slides/common/Why_Workflows.tex +++ b/slides/common/Why_Workflows.tex @@ -146,6 +146,7 @@ \section{Why Workflows} \includegraphics[width=0.85\textwidth]{Snakemake/reproducibility_full.png} \end{figure} \end{onlyenv} + \footnotesize{\lhref{https://doi.org/10.12688/f1000research.29032.2}{From the official \Snakemake-paper.}} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -153,7 +154,7 @@ \section{Why Workflows} \frametitle{\Snakemake} \begin{figure} \centering - \caption{\textbf{>370k} downloads since 2015\newline + \caption*{\textbf{>370k} downloads since 2015\newline \textbf{>1300} citations\newline \textbf{>7} citations per week since 2021} \includegraphics[width=0.6\textwidth]{Snakemake/paper_wall.png} @@ -175,7 +176,7 @@ \section{Why Workflows} \begin{column}{0.5\textwidth} \begin{figure} \includegraphics[width=\textwidth]{Snakemake/Snakemake_Workflow_Catalog.png} - \caption{Screenshot of the Workflow Catalogue} + \caption*{Screenshot of the Workflow Catalogue} 
\end{figure} \end{column} \end{columns} diff --git a/slides/common/Workflow_Parameterization_for_HPC.tex b/slides/common/Workflow_Parameterization_for_HPC.tex index b17c3f9..646da02 100644 --- a/slides/common/Workflow_Parameterization_for_HPC.tex +++ b/slides/common/Workflow_Parameterization_for_HPC.tex @@ -19,54 +19,240 @@ \section{Parametizing your Workflow - II} \frametitle{What is this about?} \begin{question}[Questions] \begin{itemize} - \item How do we add execution parameters? - \item How do we tune scientific parameters? + \item How do we avoid I/O contention? + \item How do we account for file system latency? \end{itemize} \end{question} \begin{docs}[Objectives] \begin{enumerate} - \item Learn to use parameters relevant for the batch systems. - \item Learn how to tune \Snakemake{} on the command line. - \item Learn how to tune \Snakemake{} with configuration files. + \item Learn how to tune \Snakemake{} to mitigate I/O contention. + \item Learn how to configure \Snakemake{} to allow for file system latency. 
\end{enumerate} \end{docs} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{The Configuration File} +\begin{frame} + \frametitle{\Interlude{What is Random Access?}} + \vspace{-1.5em} + \begin{figure} + \centering + \begin{tikzpicture} + \node[name=sa] at (3.5,10) { \lhref{https://en.wikipedia.org/wiki/Random_access}{Sequential Access} }; + \foreach[count=\y from 2] \x in {1,...,11}{ + \draw[thick,black,midway,draw] (\x,8.75) rectangle node[name=s\x] {} (\y,8.25); + } + \foreach[count=\y from 2] \x in {1, ..., 10} { + \path[->] ([yshift=.6em]s\x.north west) edge[bend left=30] ([yshift=.6em]s\y.north west); + } + % \path[->] ([yshift=.6em]s1.north) edge [bend left=30] ([yshift=.6em]s2.north) ; + + \node[name=ra] at (3.5,7) { Random Access }; + \foreach[count=\y from 2] \x in {1,...,11}{ + \draw[thick,black,midway,draw] (\x,5.75) rectangle node[name=r\x] {} (\y,5.25); + } + + \path[->] ([yshift=.6em]r1.north) edge [bend left=30] ([yshift=.6em]r5.north) ; + \path[->] ([yshift=.6em]r5.north) edge [bend right=60] ([yshift=.6em]r2.north) ; + \path[->] ([yshift=.6em]r2.north) edge [bend left=30] ([yshift=.6em]r3.north) ; + \path[->] ([yshift=.6em]r3.north) edge [bend left=30] ([yshift=.6em]r11.north) ; + \path[->] ([yshift=.6em]r11.north) edge [bend right=30] ([yshift=.6em]r7.north) ; + \path[->] ([yshift=.6em]r7.north) edge [bend right=30] ([yshift=.6em]r6.north) ; + \path[->] ([yshift=.6em]r6.north) edge [bend left=60] ([yshift=.6em]r8.north) ; + \path[->] ([yshift=.6em]r8.north) edge [bend right=50] ([yshift=.6em]r4.north) ; + \end{tikzpicture} + \end{figure} +\end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\begin{frame}[fragile] - \frametitle{The \texttt{Snakemake} \texttt{resources} Section} - \texttt{Snakemake} rules provide an additional \altverb{resource} section: - \begin{lstlisting}[language=Python,style=Python] -rule : - ... 
- resources: - partition='parallel', - mem_mb=1800, - cpus_per_task=4 - \end{lstlisting} - \begin{hint} - Note the \textbf{,}! - \end{hint} +\begin{frame} + \frametitle{\Interlude{What is Random Access?}} + \begin{question} + \begin{itemize} + \item What causes Random Access? + \item Why is it harmful? What can we do? + \end{itemize} + \end{question} \pause - \begin{docs} - You \emph{may} define \emph{any} resource keyword within any rule. - \end{docs} + \begin{columns}[t] + \begin{column}{0.5\textwidth} + Causes: + \hrule + \begin{itemize} + \item a number of (threaded) apps accessing the same file space (e.g. reference data) + \item a number of apps accessing a file space exceeding the file system cache size + \end{itemize} + Will slow parallel file systems and your data analysis! + \end{column} + \begin{column}{0.5\textwidth} + Remedies: + \hrule + \begin{itemize} + \item copy data to/from compute nodes equipped with SSD + \item use a RAM disk (RAM = random access memory) - which many clusters provide + \end{itemize} + \end{column} + \end{columns} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} - \frametitle{The \texttt{Snakemake} \texttt{resources} Section - its Downside} - \begin{block}{Every Resource Spec needs a Change per Rule???} - You might have noticed, this specification per rule is most untidy. \texttt{Snakemake}'s design principle is: ship workflows which run \emph{everywhere} \& \emph{every time}. - \newline \pause - Relax: Every parameter can be specified: - \begin{itemize} - \item in a \texttt{Snakefile}, - \item on the command line, - \item and re-usable in configuration files! 
- \end{itemize} - \end{block} -\end{frame} \ No newline at end of file + \frametitle{\Interlude{What is File System Latency?}} + \centering + \begin{tikzpicture}[line cap=rect,line width=3pt, + datastore/.style={draw, rounded rectangle, rounded rectangle east arc=concave, rounded rectangle arc length=150}, + ] + \tikzstyle{storage} = [rectangle, minimum width=3cm, minimum height=1cm, text width=3cm, text centered, draw=black] + + \filldraw [fill=cyan] (0,0) circle [radius=1cm]; + \foreach \angle [count=\xi] in {60,30,...,-270} + { + \draw[line width=0.5pt] (\angle:0.9cm) -- (\angle:1cm); + \node[font=\small] at (\angle:0.68cm) {\textsf{\xi}}; + } + \foreach \angle in {0,90,180,270} + \draw[line width=1pt] (\angle:0.8cm) -- (\angle:1cm); + \draw (0,0) -- (120:0.4cm); + \draw (0,0) -- (90:.5cm); + + \node (sto1) [datastore] at (-4, 0) {Storage}; + \node at (4, 0) {\includegraphics[width=.25\textwidth]{misc/data_center.png}}; + \end{tikzpicture} + \begin{docs}{File System Latency} + The time it takes from the file system to the client and back. + \end{docs} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{\Interlude{What is File System Latency? II}} + \begin{docs}{Background} + Some clusters use NFS (Network File System). There, file system latency \emph{can} be an issue.\newline + \pause + On parallel file systems, the latency usually is very low. + \end{docs} +\end{frame} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Global Workflow Configuration} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\Snakemake{} Profiles} + \begin{hint} + Profiles can shorten your command lines and be an easy remedy for the described issues! 
+ \end{hint} + \pause + Two kinds of profiles are supported: + \begin{itemize}[<+->] + \item A global profile that is defined in a system-wide or user-specific configuration directory (on Linux, this will be \altverb{\~/.config/snakemake} and \altverb{/etc/xdg/snakemake}, you can find the answer for your system via \altverb{snakemake --help}). + \item A workflow specific profile that is defined via a flag (\altverb{--workflow-profile}) or searched in a default location (profile/default) in the working directory or next to the \altverb{Snakefile}. + \end{itemize} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Your Profile} + \begin{onlyenv}<1| handout:0> + Our first line forces the use of the \lhref{https://yte-template-engine.github.io/}{\bf{Y}AML \bf{T}emplate \bf{E}ngine} to parse the following lines accordingly. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<2| handout:0> + Now, our default executor is SLURM. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm + \end{lstlisting} + No more, \altverb{snakemake --executor slurm ...}! + \end{onlyenv} + \begin{onlyenv}<3| handout:0> + The next line tells Snakemake to wait for a minute, if output files are not present. This is more than enough time, even for NFS-Filesystems. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +latency-wait: 60 + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<4| handout:0> + The entire rest, will tell the storage plugin (\altverb{snakemake-storage-plugin-fs}) to stage in to the node-local storage on Mogon, for \emph{every} job and to copy back your results. When dealing with I/O intensive jobs, this can boost your performance tremendously. 
+ \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +latency-wait: 60 +default-storage-provider: fs +shared-fs-usage: + - persistence + - sources + - source-cache +local-storage-prefix: /localscratch/$SLURM_JOB_ID + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<5| handout:0> + %TODO remove this part, once the snakemake release is ready + \begin{warning} + Currently, we are working on a way to annotate susceptible I/O patterns; the highlighted parts are not yet functional. + \end{warning} + %You may copy this setup from \texttt{\configparam{pathtosetup}/config.yaml} to the \altverb{\~/.config/snakemake}-folder - unless your local admins provide a cluster-wide configuration. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +@latency-wait: 60@ +@default-storage-provider: fs@ +@shared-fs-usage:@ +@ - persistence@ +@ - sources@ +@ - source-cache@ +@local-storage-prefix: /localscratch/$SLURM_JOB_ID@ + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<6| handout:1> + The complete configuration ought to be in \altverb{\~/.config/snakemake/config.yaml} + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +latency-wait: 60 +default-storage-provider: fs +shared-fs-usage: +- persistence +- sources +- source-cache +local-storage-prefix: /localscratch/$SLURM_JOB_ID + \end{lstlisting} + \end{onlyenv} +\end{frame} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\HandsOn{Running with a Profile I}} + Enter + \begin{lstlisting}[language=Bash, style=Shell] +export SNAKEMAKE_PROFILE="$HOME/.config/snakemake" + \end{lstlisting} + in your \altverb{.bashrc} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\HandsOn{Running with a Profile II}} + Copy the configuration file: + \begin{lstlisting}[language=Bash, style=Shell] +$ mkdir 
-p ~/.config/snakemake +$ cp ~/workflows/config.yaml \ +> ~/.config/snakemake/. + \end{lstlisting} + Activate the new settings: + \begin{lstlisting}[language=Bash, style=Shell] +$ source ~/.bashrc + \end{lstlisting} + And run the workflow - just for fun: + \begin{lstlisting}[language=Bash, style=Shell] +$ snakemake -F # just this one flag! + \end{lstlisting} +\end{frame} + + + diff --git a/slides/common/contributions.tex b/slides/common/contributions.tex index 67ad7ac..45716ff 100644 --- a/slides/common/contributions.tex +++ b/slides/common/contributions.tex @@ -16,9 +16,12 @@ \section{Contributors} \end{itemize} \end{column} \begin{column}{.5\textwidth} - \begin{itemize} - \item no name yet to fill the list - \end{itemize} + \begin{itemize} + \item Aasish Kumar Sharma \includegraphics[height=\baselineskip]{logos/gwdg.jpg} + \end{itemize} + + Will hopefully find more names, soon. If you want to contribute - get in touch! + \end{column} \end{columns} diff --git a/slides/common/editor_gedit_mainz.tex b/slides/common/editor_gedit_mainz.tex index a860214..e3d9d7e 100644 --- a/slides/common/editor_gedit_mainz.tex +++ b/slides/common/editor_gedit_mainz.tex @@ -92,7 +92,7 @@ \section{Using the \texttt{gedit} Editor on \texttt{Mogon}} $ #or $ gedit @&@ \end{lstlisting} - The \altverb{@} starts the process in the background, if you forgot it: + The \altverb{&} starts the process in the background, if you forgot it: \begin{lstlisting}[language=Bash, style=Shell] $ [Ctrl+z] $ bg diff --git a/slides/common/preamble.tex b/slides/common/preamble.tex index 708222d..714607f 100644 --- a/slides/common/preamble.tex +++ b/slides/common/preamble.tex @@ -69,6 +69,7 @@ \usepackage{subfig} \usepackage{tikz} \usetikzlibrary{arrows,shapes,backgrounds,positioning,shadows,decorations,trees,decorations.pathreplacing} +%\usepackage{tikzpeople} \usepackage{smartdiagram} @@ -90,7 +91,7 @@ %%%%%%%%%%%%%%%%% %% PLEASE NOTE %% %%%%%%%%%%%%%%%%% -% Multiple frames containing ``Hand Out'' or 
``Interlude'' should be started: +% frames containing ``Hand Out'' or ``Interlude'' should be started: % \setcounter{preframe_handson}{\value{handson}} % \begin{frame}[fragile] @@ -104,10 +105,7 @@ % \setcounter{interlude}{\value{preframe_interlude}} % \frametitle{Interlude -- Parameter Extension} -% respectively. Single frames may just start with - -% \begin{frame}[fragile] -% \frametitle{\HandsOn{Using \texttt{find}}} +% respectively. \newcounter{handson} \setcounter{handson}{1} @@ -348,10 +346,12 @@ \MyDat[\arabic{datacount},3]}% } -%\newcommand{\pathtosetup}[1]{\path{/lustre/project/hpckurs/workflows/#1}} -\newcommand{\pathtosetup}[1]{\texttt{\configparam{pathtosetup}/#1}} -\newcommand{\pathtoclozure}[1]{\texttt{\configparam{pathtoclozure}/#1}} -\newcommand{\pathtosolutions}[1]{\texttt{\configparam{pathtosolutions}/#1}} + + +%\newcommand{\pathtoexercise}[1]{\path{/lustre/project/m2_jgu-ngstraing/workflows/#1}} +%\newcommand{\pathtoexercise}[1]{\path{ \DTLfetch{data}{thekey}{#1}{thevalue} }} +\newcommand{\pathtoclozure}[1]{\path{workflows/tutorial/#1}} +\newcommand{\pathtosolutions}[1]{\path{/lustre/project/hpckurs/solutions/#1}} \setcounter{tocdepth}{1} diff --git a/slides/common/software_environment.tex b/slides/common/software_environment.tex index b5b7321..73bc777 100644 --- a/slides/common/software_environment.tex +++ b/slides/common/software_environment.tex @@ -161,7 +161,6 @@ \subsection{Using Conda} \end{columns} \end{frame} - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Installing Conda} @@ -196,6 +195,8 @@ \subsection{Using Conda} Please run \begin{lstlisting}[language=Bash, style=Shell] $ "${SHELL}" <(curl -L micro.mamba.pm/install.sh) +# or in your sample folder +$ bash install_micromamba.sh \end{lstlisting} \end{frame} @@ -222,8 +223,8 @@ \subsection{Using Conda} \begin{lstlisting}[language=Bash, style=Shell, basicstyle=\tiny, breaklines=true] # >>> mamba initialize >>> # !! 
Contents within this block are managed by 'mamba init' !! -export MAMBA_EXE='/home/cm/.local/bin/micromamba'; -export MAMBA_ROOT_PREFIX='/home/cm/micromamba'; +export MAMBA_EXE='/home//.local/bin/micromamba'; +export MAMBA_ROOT_PREFIX='/home/'; __mamba_setup="$("$MAMBA_EXE" shell hook --shell bash --root-prefix "$MAMBA_ROOT_PREFIX" 2> /dev/null)" if [ $? -eq 0 ]; then eval "$__mamba_setup" @@ -237,13 +238,13 @@ \subsection{Using Conda} \end{column} \begin{column}{0.5\textwidth} \pause - Please edit your ``\texttt{\textasciitilde/.bashrc}'' file and put part in a function, to re-gain manual control: + {\footnotesize Please edit your ``\texttt{\textasciitilde/.bashrc}'' file and put part in a function, to re-gain manual control:} \begin{lstlisting}[language=Bash, style=Shell, basicstyle=\tiny, breaklines=true] @function conda_initialize {@ # >>> mamba initialize >>> # !! Contents within this block are managed by 'mamba init' !! -export MAMBA_EXE='/home/cm/.local/bin/micromamba'; -export MAMBA_ROOT_PREFIX='/home/cm/micromamba'; +export MAMBA_EXE='/home//.local/bin/micromamba'; +export MAMBA_ROOT_PREFIX='/home/'; __mamba_setup="$("$MAMBA_EXE" shell hook --shell bash --root-prefix "$MAMBA_ROOT_PREFIX" 2> /dev/null)" if [ $? -eq 0 ]; then eval "$__mamba_setup" @@ -254,7 +255,6 @@ \subsection{Using Conda} # <<< mamba initialize <<< @}@ \end{lstlisting} - \bcattention Add the highlighted lines! and your login will be faster! Also, this allows to separate the module environment and conda-related environments safely. \end{column} \end{columns} \end{frame} diff --git a/slides/config/config.dat b/slides/config/config.dat index 26b6b7e..6a42eb0 100644 --- a/slides/config/config.dat +++ b/slides/config/config.dat @@ -5,9 +5,8 @@ condarcfile = "common/condarc_mogon.tex" % % these are the path names to contain sample data % see the README for explanations. 
-pathtosetup = /lustre/project/hpckurs/workflows -pathtoclozure = workflows/tutorial -pathtosolutions = /lustre/project/hpckurs/solutions +pathtosetup = /lustre/project/m2\_jgu-ngstraining/workflows +pathtosolutions = /lustre/project/hpckurs/m2\_jgu-ngstraining % Editors recommendations are a matter of taste and technological % setups (e.g. on-demand setups). Hence, specify an editor-slide, % here: diff --git a/slides/creators/A_First_Workflow.tex b/slides/creators/A_First_Workflow.tex index d641a5f..966e7b4 100644 --- a/slides/creators/A_First_Workflow.tex +++ b/slides/creators/A_First_Workflow.tex @@ -35,7 +35,7 @@ \section{A first Workflow} \frametitle{Before we begin \ldots} \begin{exampleblock}{Working with closure Files} To ease the excercises and save typing time, all exercises are supplied as cloze texts.\linebreak - \Snakemake{} relies on a file called \Snakemake{} to be present. You can either rename your cloze texts like + \Snakemake{} relies on a file called \altverb{Snakefile} to be present. You can either rename your cloze texts like \begin{lstlisting}[language=Bash, style=Shell] $ cp _Snakefile Snakefile \end{lstlisting} @@ -50,9 +50,10 @@ \section{A first Workflow} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] - \frametitle{Specifiying \texttt{Snakefile}s (Details)} - \begin{docs}[\texttt{Snakemake}s Default \texttt{Snakefile}] - The workflow definition in form of a snakefile. Usually, you should not need to specify this. By default, \texttt{Snakemake} will search for "\altverb{Snakefile}", "\altverb{snakefile}", "\altverb{workflow/Snakefile}", "\altverb{workflow/snakefile}" beneath the current working directory, in this order.\newline + \frametitle{Specifying Snakefiles (Details)} + % impossible to use \Snakemake command in environment header + \begin{docs}[Snakemake's default \altverb{Snakefile}] + The workflow definition in form of a snakefile. 
Usually, you should not need to specify this. By default, \Snakemake{} will search for "\altverb{Snakefile}", "\altverb{snakefile}","\altverb{workflow/Snakefile}", \\"\altverb{workflow/snakefile}" beneath the current working directory, in this order.\newline When using a different layout, you can use \begin{lstlisting}[language=Bash, style=Shell] $ snakemake \ @@ -100,7 +101,7 @@ \subsection{A first Step or ``Rule''} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] - \frametitle{Our first \altverb{Snakefile}!} + \frametitle{Our first Snakefile!} \begin{onlyenv}<1| handout:0> Our first ``\altverb{rule}''; we want to map reads onto a reference genome. \begin{lstlisting}[language=Python,style=Python] @@ -186,16 +187,16 @@ \subsection{A first Step or ``Rule''} " | samtools view -Sb - >" " mapped_reads/A.bam" \end{lstlisting} - You will find working content in the file \pathtoclozure{01\_Snakefile}, too. + You will find working content in the file \altverb{01_Snakefile} in your tutorial folder, too. \end{onlyenv} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} - \frametitle{\altverb{Snakefile}s are Python Files} + \frametitle{Snakefiles are Python Files} \begin{block}{Some Background} - \begin{enumerate} - \item Like Python, you can use either tabs or spaces for indentation — don’t mix! Consensus is to only use spaces. + \begin{enumerate}[<+->] + \item Like Python, you can use either tabs or spaces for indentation — don’t mix! Consensus is to only use \emph{spaces}. \item Together, the target, dependencies, and actions form a rule. A rule is a recipe for how to make things. 
\end{enumerate} \end{block} @@ -203,12 +204,12 @@ \subsection{A first Step or ``Rule''} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] - \frametitle{Testing our \altverb{Snakefile}} + \frametitle{Testing our Snakefile} When a workflow is executed, \Snakemake{} tries to generate given target files. Target files can be specified via the command line. By executing \begin{lstlisting}[language=Bash, style=Shell] $ snakemake -np mapped_reads/A.bam \end{lstlisting} - in the working directory containing the \altverb{Snakefile}, we tell \Snakemake{} to generate the target file \altverb{mapped_reads/A.bam}.\newline + in the working directory containing the \altverb{Snakefile}, we tell \\\Snakemake{} to generate the target file \altverb{mapped_reads/A.bam}.\newline We are using \begin{itemize}[<+->] \item \altverb{-n/--dry-run} to show the \emph{planned} execution and @@ -245,12 +246,6 @@ \subsection{A first Step or ``Rule''} \begin{lstlisting}[style=Plain, basicstyle=\footnotesize] Nothing to be done (all requested files are present and up to date). \end{lstlisting} - \pause - Now, do: - \begin{lstlisting}[language=Bash, style=Shell] -$ touch mapped_reads/A.bam - \end{lstlisting} - And run \Snakemake{} once more. \begin{question} What happens? Why? \end{question} @@ -304,10 +299,10 @@ \subsection{A first Step or ``Rule''} output: result.txt # The following command will concatenate # the input files. - shell: "cat {input} > {output} + shell: "cat {input} > {output}" \end{lstlisting} \begin{docs} - If the in-/output has multiple lines, like in this example, \texttt{Snakemake} will concatenate them, separated by a whitespace. In other words \altverb{\{input\}} will contain \altverb{replicate_1.txt replicate_2.txt}. + If the in-/output has multiple lines, like in this example, \Snakemake{} will concatenate them, separated by a whitespace. 
In other words \altverb{\{input\}} will contain \altverb{replicate_1.txt replicate_2.txt}. \end{docs} \end{frame} @@ -318,7 +313,7 @@ \subsection{A first Step or ``Rule''} Your task is simple to introduce the \altverb{\{input\}} and \altverb{\{output\}} wildcards in our current \altverb{bwa_map} rule. Replace the input and output within the \altverb{shell} directive using these wildcards. \end{task} \begin{hint} - Remember, \texttt{Snakemake} will concatenate multiline in- or output. Refer to \altverb{02\_Snakefile} in your tutorial folder for the task setting. + Remember, \Snakemake{} will concatenate multiline in- or output. Refer to \altverb{02_Snakefile} in your tutorial folder for the task setting. \end{hint} \end{frame} @@ -339,7 +334,7 @@ \subsection{A first Step or ``Rule''} " {output}" \end{lstlisting} Since the rule has multiple input files, \Snakemake{} will concatenate them, separated by a whitespace. In other words, \Snakemake{} will replace \altverb{\{input\}} with \altverb{data/genome.fa data/samples/A.fastq} before executing the command.\newline - A working example can be found in \altverb{02_Snakefile}. + A working example can be found in \altverb{02_Snakefile} in the solution folder. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -347,8 +342,8 @@ \subsection{A first Step or ``Rule''} \frametitle{\HandsOn{Further Workflow Decoration}} (Usually), experiments consists of more than one input, e.\,g. several replicas might be present.\newline \begin{task}{Introducing the \altverb{sample} Wildcard} - \texttt{Snakemake} allows generalizing rules by using named wildcards, too. Simply replace the \altverb{A} in the second input file and in the output file with the wildcard \altverb{\{sample\}}.\newline - You may refer to the \pathtoclozure{03\_Snakefile} template. + \Snakemake{} allows generalizing rules by using named wildcards, too. 
Simply replace the \altverb{A} in the second input file and in the output file with the wildcard \altverb{\{sample\}}.\newline + You may refer to the \altverb{03_Snakefile} template in your tutorial folder. \end{task} \end{frame} @@ -370,28 +365,6 @@ \subsection{A first Step or ``Rule''} When \Snakemake{} determines that this rule can be applied to generate a target file by replacing the wildcard \altverb{\{sample\}} in the output file with an appropriate value, it will propagate that value to all occurrences of \altverb{\{sample\}} in the input files and thereby determine the necessary input for the resulting job. \end{frame} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\begin{frame}[fragile] - \frametitle{\HandsOn{Mini-Task: add the \texttt{sample} Wildcard}} - \vspace{-0.5em} - \begin{task} - Simply refer to your current \texttt{Snakefile} or to the cloze text in \pathtoclozure{03\_Snakefile}. Replace the \altverb{A} in the second input file and in the output file with the wildcard \newline "\altverb{\{sample\}}". - Then run: - \begin{lstlisting}[language=Bash, style=Shell] -$ snakemake -np mapped_reads/B.bam - \end{lstlisting} - \texttt{Snakemake} will determine that the rule \altverb{bwa_map} can be applied to generate the target file by replacing the wildcard \altverb{\{sample\}} with the value \altverb{B}. - \vspace{-0.5em} - \begin{hint} - If your workflow does not work, find the solution in\newline\pathtosolutions{\\03\_Snakefile}. 
- \end{hint} - \end{task} - \begin{docs}{Background} - When \texttt{Snakemake} determines that this rule can be applied to generate a target file by replacing the wildcard \altverb{\{sample\}} in the output file with an appropriate value, it will propagate that value to all occurrences of \altverb{\{sample\}} in the input files and thereby determine the necessary input for the resulting job.\newline - \pathtosolutions{03\_Snakefile} - \end{docs} -\end{frame} - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Execution} @@ -437,7 +410,7 @@ \subsection{A first Step or ``Rule''} "-O bam {input} > {output}" \end{lstlisting} \begin{task} - Please refer to your template \newline \pathtoclozure{04\_Snakefile} and fill in \altverb{input} and \altverb{output}. + Please refer to your template \newline \altverb{04_Snakefile} in the tutorial folder and fill in \altverb{input} and \altverb{output}. \end{task} \end{frame} @@ -454,7 +427,7 @@ \subsection{A first Step or ``Rule''} "samtools sort -T sorted_reads/{wildcards.sample} " "-O bam {input} > {output}" \end{lstlisting} - You will find this solution in \pathtosolutions{\\04\_Snakefile} + You will find this solution in \pathtosolutions{04_Snakefile} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -542,17 +515,17 @@ \subsection{A first Step or ``Rule''} Now, we can add the following rule to our \altverb{Snakefile}: \begin{lstlisting}[language=Python,style=Python,basicstyle=\footnotesize] rule bcftools_call: - input: - fa="data/genome.fa", - bam=@expand("sorted_reads/{sample}.bam", sample=SAMPLES),@ - bai=@expand("sorted_reads/{sample}.bam.bai", sample=SAMPLES)@ - output: - "calls/all.vcf" - shell: - "bcftools mpileup -f {input.fa} {input.bam} | " - "bcftools call -mv - > {output}" + input: + fa="data/genome.fa", + bam=@expand("sorted_reads/{sample}.bam", sample=SAMPLES),@ + 
bai=@expand("sorted_reads/{sample}.bam.bai", sample=SAMPLES)@ + output: + "calls/all.vcf" + shell: + "bcftools mpileup -f {input.fa} {input.bam} | " + "bcftools call -mv - > {output}" \end{lstlisting} - We will meet this rule in \pathtoclozure{05\_Snakefile} -- in the next exercise. + We will meet this rule in \altverb{05_Snakefile} of our tutorial folder -- in the next exercise. \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -563,12 +536,12 @@ \subsection{Adding a Target Rule} \frametitle{Introducing: Target Rules} \begin{question}[Why Target Rules?] So far, we always executed the workflow by specifying a target file at the command line. How cumbersome!\newline - I would be better to make \texttt{Snakemake} figure out, which jobs to run. Right? + It would be better to make \Snakemake{} figure out, which jobs to run. Right? \end{question} \pause \begin{docs} - We remember: \texttt{Snakemake} will automatically determine for a given rule, which expected outcomes are missing and execute all necessary rules, accordingly.\newline\pause - The ``trick'' is that a workflow can have a ``target'' rule, which specifies the \emph{final} output(s) of a workflow. Any invokation of \texttt{Snakemake} will then execute \emph{all} rules of a workflow. + We remember: \Snakemake{} will automatically determine for a given rule, which expected outcomes are missing and execute all necessary rules, accordingly.\newline\pause + The ``trick'' is that a workflow can have a ``target'' rule, which specifies the \emph{final} output(s) of a workflow. Any invocation of Snakemake will then execute \emph{all} rules of a workflow. \end{docs} \end{frame} @@ -576,7 +549,7 @@ \subsection{Adding a Target Rule} \begin{frame}[fragile] \frametitle{\HandsOn{Target Rule Practice}} \begin{docs} - If no target is given at the command line, \texttt{Snakemake} will define the first rule of the \texttt{Snakefile} as the target. 
+ If no target is given at the command line, \Snakemake{} will define the first rule of the \altverb{Snakefile} as the target. \end{docs} Conventionally, this rule is named \altverb{all}. This means that we add a rule at the top of our workflow:\newline \begin{onlyenv}<1| handout:0> @@ -590,7 +563,7 @@ \subsection{Adding a Target Rule} \end{onlyenv} \begin{onlyenv}<2| handout:1> \begin{task} - Take the template \pathtoclozure{05\_Snakefile} and fill in the target in the \altverb{all} rule. Try to run the workflow! + Take the template \altverb{05_Snakefile} and fill in the target in the \altverb{all} rule. Try to run the workflow! \end{task} \end{onlyenv} \end{frame} @@ -602,11 +575,11 @@ \subsection{Adding a Target Rule} \begin{lstlisting}[language=Python,style=Python] rule all: input: - "calls/all.vcf" + "calls/all.vcf" \end{lstlisting} \pause \begin{hint} Essentially, you can add all output files you want to keep (e.\,g. plots, final results) to \altverb{all}. \end{hint} - The solution can be found at \pathtosolutions{\\05\_Snakefile} + The solution can be found at \pathtosolutions{05_Snakefile} \end{frame} diff --git a/slides/creators/Decorating_the_Workflow.tex b/slides/creators/Decorating_the_Workflow.tex index ac435d2..7fefe41 100644 --- a/slides/creators/Decorating_the_Workflow.tex +++ b/slides/creators/Decorating_the_Workflow.tex @@ -123,10 +123,6 @@ \subsection{The Configuration File} \begin{docs} You can store a yaml file with \emph{your} workflow configuration -- which may be combined with the desinger's configuration. \end{docs} - \pause - \begin{warning} - It is better to specify fully qualified paths to your data! The tilde is only there to make a point! - \end{warning} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -141,7 +137,7 @@ \subsection{The Configuration File} configfile: "config/config.yaml" \end{lstlisting} \begin{question} - How do pick up resource parameters in out \altverb{Snakefile}?! 
+ How can you pick up that info in your \altverb{Snakefile}?! \end{question} \end{frame} @@ -165,7 +161,7 @@ \subsection{The Configuration File} rule bcftools_call: input: bam=expand("sorted_reads/{sample}.bam", - sample=@config["samples"]@), + sample=@config["samples"]@), ... \end{lstlisting} \end{frame} @@ -226,22 +222,13 @@ \subsection{The Configuration File} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{\HandsOn{Adding a new Sample}} -<<<<<<< HEAD \begin{task} - Refer to \pathtoclozure{\\07\_Snakefile} as the current workflow example containing the mentioned input function.\newline - In the \altverb{data/samples} folder, there is an additional sample \altverb{C.fastq}. Add that sample to the config file and see how \texttt{Snakemake} wants to recompute the part of the workflow belonging to the new sample, when invoking with + Refer to your tutorial file \altverb{07_Snakefile} as the current workflow example containing the mentioned input function.\newline + In the \altverb{data/samples} folder, there is an additional sample \altverb{C.fastq}. Add that sample to the config file and see how \Snakemake{} wants to recompute the part of the workflow belonging to the new sample, when invoking with \begin{lstlisting}[language=Bash, style=Shell] $ snakemake -n --forcerun bcftools_call \end{lstlisting} \end{task} - -======= - In the \altverb{data/samples} folder, there is an additional sample \altverb{C.fastq}. Add that sample to the config file and see how \Snakemake{} wants to recompute the part of the workflow belonging to the new sample, when invoking with - \begin{lstlisting}[language=Bash, style=Shell] -$ snakemake -n --forcerun bcftools_call - \end{lstlisting} - Copy the file XXX as your \altverb{Snakefile}. 
->>>>>>> main \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -252,7 +239,7 @@ \subsection{The Configuration File} \end{warning} In our workflow, it is reasonable to annotate aligned reads with so-called read groups, that contain metadata like the sample name. \newline To do so, we add to our \altverb{bwa_map}-rule: - \begin{lstlisting}[style=Plain,basicstyle=\footnotesize] + \begin{lstlisting}[style=Plain,basicstyle=\small] rule bwa_map: input: "data/genome.fa", @@ -271,7 +258,7 @@ \subsection{The Configuration File} \begin{frame}[fragile] \frametitle{Rule Parameters} This is usually part of the configuration file(s), too.\newline - In your config file (\altverb{r""} stands for "raw strings"): \begin{lstlisting}[language=Python,style=Python] bwa_map: rg=r"@RG\tID:{sample}\tSM:{sample}" @@ -280,9 +267,9 @@ \subsection{The Configuration File} \begin{lstlisting}[language=Python,style=Python] rule bwa_map: params: - rg=config['bwa_map']['rg'] + @rg=config['bwa_map']['rg']@ \end{lstlisting} - Now, your workflow is configurable! + You have a configurable workflow! \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -293,7 +280,7 @@ \subsection{The Configuration File} \end{hint} \pause \begin{task} - Log into the workflow: Name all rules, which merit gathering logs. + Look into the workflow: Name all rules, which merit gathering logs. \end{task} \pause \begin{docs} @@ -312,7 +299,7 @@ \subsection{The Configuration File} get_bwa_map_input_fastqs output: "mapped_reads/{sample}.bam" - log: "logs/bwa_mem/{sample.log}" + log: "logs/bwa_mem/{sample}.log" # we combine stdout & stdout shell: "(bwa mem -R '{params.rg}' {input} " @@ -335,7 +322,7 @@ \subsection{The Configuration File} \end{lstlisting} Implement the log directive for all rules you consider worth being logged. What do you observe? \newline - You can take the \pathtoclozure{//xxx} sample file to proceed. 
+ You can take the \altverb{08_Snakefile} sample file to proceed. \end{task} \end{frame} @@ -415,7 +402,7 @@ \subsection{The Command Line} \Snakemake{} lets you select various executors. Not happy with HPC clusters? Pay for a cloud \lhref{https://snakemake.readthedocs.io/en/stable/executor_tutorial/tutorial.html}{Google Lifescience, Tibanna, Kubernetes, \ldots} \newline Meanwhile we select the most prominent HPC batch system by: \begin{lstlisting}[language=Bash, style=Shell] -$ snakemake --slurm +$ snakemake --executor slurm \end{lstlisting} Now, \emph{every} rule will submit its jobs as HPC compute jobs. \begin{hint} @@ -428,7 +415,7 @@ \subsection{The Command Line} \frametitle{Default Resources for \texttt{SLURM}} Without specifying our SLURM-account and a (default) partition, submitting batch jobs will fail. \Snakemake{} allows to define so-called default resources (using \altverb{--default-resources}). With them our minimal command line becomes: \begin{lstlisting}[language=Bash, style=Shell, basicstyle=\footnotesize] -$ snakemake --slurm \ +$ snakemake --executor slurm \ > --default-resources slurm_account=m2_jgu-ngstraining \ > slurm_partition=smp \end{lstlisting} @@ -451,7 +438,7 @@ \subsection{The Command Line} \pause \Snakemake{} offers a semaphore to throttle resource usage, called \altverb{--jobs/-j}. We can now write \altverb{-j unlimited} in place for \altverb{--cores 1}. Let us try \begin{lstlisting}[language=Bash, style=Shell, basicstyle=\footnotesize] -$ snakemake --slurm \ +$ snakemake --executor slurm \ > --default-resources \ > slurm_account=m2_jgu-ngstraining \ > slurm_partition=smp @@ -468,11 +455,11 @@ \subsection{The Command Line} \item hold track of your job status (frequency of checks can be adjusted) \item cancel your jobs, when itself is stopped \item track resource consumption (generated with \altverb{--report [FILE]}) - \item with \altverb{-j unlimited} we allow for an unlimited jumber of spawned jobs! 
+ \item with \altverb{-j unlimited} we allow for an unlimited number of spawned jobs! \end{itemize} \pause \begin{warning} - Unlimited number of jobs may yield in I/O contention and too many calls to check the job status. Use with care for both issues there is a remedy, which we will meet later! + An "unlimited" number of jobs may result in I/O contention and too many calls to check the job status. Use with care; for both issues there is a remedy, which we will meet later! \end{warning} \end{frame} @@ -481,7 +468,7 @@ \subsection{The Command Line} \frametitle{\HandsOn{Running the Worklow as Cluster Jobs}} Our workflow is {\tiny tiny}. It does not merit cluster execution. Yet, for the purpose of this course, please run (with \altverb{-F} to \emph{enforce} execution): \begin{lstlisting}[language=Bash, style=Shell, basicstyle=\footnotesize] -$ snakemake --slurm \ +$ snakemake --executor slurm \ > --default-resources slurm_account=m2_jgu-ngstraining \ > slurm_partition=smp \ > -j unlimited @-F@ @@ -500,13 +487,12 @@ \subsection{The Command Line} \end{question} \pause \begin{itemize} - \item ``real data'' tend to be biffer - insufficient memory + \item ``real data'' tend to be bigger - insufficient memory \item ``real runs'' tend to take longer - insufficient wall time \item ``real runs'' tend to run into I/O issues - insufficient workflows \end{itemize} \end{frame} - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Cluster Configuration} @@ -535,7 +521,7 @@ \subsection{The Command Line} \begin{lstlisting}[language=Bash, style=Shell, basicstyle=\footnotesize] $ # first copy the template profile: $ cp -r ../profile . 
-$ snakemake --slurm \ +$ snakemake --executor slurm \ > --default-resources slurm_account=m2_jgu-ngstraining \ > slurm_partition=smp \ > -j unlimited @--workflow-profile ./profile/ -F@ diff --git a/slides/creators/Finishing_and_Execution.tex b/slides/creators/Finishing_and_Execution.tex index 189f3fe..0e5b7ce 100644 --- a/slides/creators/Finishing_and_Execution.tex +++ b/slides/creators/Finishing_and_Execution.tex @@ -22,58 +22,18 @@ \section{Finishing and Executing the Workflow} \end{question} \begin{docs}[Objectives] \begin{enumerate} - \item Extending the target idea. \item Running simple workflows. \end{enumerate} \end{docs} \end{frame} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\subsection{Adding a Target Rule} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\begin{frame} - \frametitle{Why Target Rules} - So far, we always executed the workflow by specifying a target file at the command line. How cumbersome!\newline - \begin{docs} - We remember: \Snakemake{} will automatically determine for a given rule, which expected outcomes are missing and execute all necessary rules, accordingly.\newline\pause - The ``trick'' is that a workflow can have a ``target'' rule, which specifies the \emph{final} output(s) of a workflow. Any invokation of \Snakemake{} will then execute \emph{all} rules of a workflow. - \end{docs} -\end{frame} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\begin{frame}[fragile] - \frametitle{Best Practice} - \begin{docs} - If no target is given at the command line, \Snakemake{} will define the first rule of the \altverb{Snakefile} as the target. - \end{docs} - Conventially, this rule is named \altverb{all}. This means that we add a rule at the top of our workflow:\newline - \begin{onlyenv}<1| handout:0> - \begin{question} - Which is our target file? 
- \end{question} - \begin{lstlisting}[language=Python,style=Python] - rule all: - input: - \end{lstlisting} - \end{onlyenv} - \begin{onlyenv}<2| handout:1> - Our target rule is: - \begin{lstlisting}[language=Python,style=Python] - rule all: - input: - "plots/quals.svg" - \end{lstlisting} - \end{onlyenv} -\end{frame} - %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \subsection{Running the final Workflow} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Our ``final'' Workflow} - \footnotesize{Refer to \pathtosolutions{\\06\_Snakefile} as the workflow solution, so far.} + \footnotesize{Refer to \pathtosolutions{06_Snakefile} as the workflow solution, so far.} \vspace{-1em} \begin{columns} \begin{column}{0.5\textwidth} @@ -152,10 +112,10 @@ \subsection{Running the final Workflow} \pause Some targets are already present, we want the entire workflow again: \begin{lstlisting}[language=Bash, style=Shell] -$ snakemake -j4 --forcerun +$ snakemake -c4 --forcerun \end{lstlisting} \begin{question} - What do you observe? Why \altverb{-j4}? + What do you observe? Why \altverb{-c4}? \end{question} \end{frame} diff --git a/slides/creators/Getting_Started.tex b/slides/creators/Getting_Started.tex index 53d5b3e..15725ca 100644 --- a/slides/creators/Getting_Started.tex +++ b/slides/creators/Getting_Started.tex @@ -17,18 +17,9 @@ \section{The Tutorial Scenario} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} \frametitle{What is this about?} - \begin{question}[Questions] - \begin{itemize} - \item How can workflow development be started? - \item What is a good directory layout? - \item How does a workflow description look like? 
- \end{itemize} - \end{question} - \begin{docs}[Objectives] + \begin{docs}[Objective] \begin{enumerate} - \item Introduce you to conceptualization - \item First demonstration of directory layouts - \item Introducing a first (very basic) workflow + \item Introduce to the Tutorial Scenario \end{enumerate} \end{docs} \end{frame} diff --git a/slides/creators/Python_in_Snakemake.tex b/slides/creators/Python_in_Snakemake.tex index 185ad50..51c0859 100644 --- a/slides/creators/Python_in_Snakemake.tex +++ b/slides/creators/Python_in_Snakemake.tex @@ -36,7 +36,7 @@ \subsection{Python Code in Snakefiles} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame} - \frametitle{Why Python in a \altverb{Snakefile}?} + \frametitle{Why Python in a Snakefile?} Sometimes we do \emph{not} want to run $3^{\mathsf{rd}}$ party code, but run the occasional script for data manipulation or plotting or \ldots \pause \begin{docs} @@ -49,7 +49,7 @@ \subsection{Supported Functions} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] - \frametitle{Using functions in \altverb{Snakefile}s} + \frametitle{Using functions in Snakefiles} We already met the \altverb{expand()}-function. Let's revisit it in detail! \begin{task} Start Python and follow along! @@ -83,7 +83,7 @@ \subsection{Supported Functions} \pause Wow! So, easy! \begin{question} - What happend? What happens if you leave \altverb{.replicas} away? + What happened? What happens if you leave \altverb{.replicas} away? 
\end{question} \end{frame} @@ -210,7 +210,7 @@ \subsection{Snakemake and external Scripts} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{We want to plot our Results!} - We now add the following rule to your \altverb{Snakefile} we could plot our variant statistics (see \pathtoclozure{06\_Sankefile}): + If we now add the following rule to our \altverb{Snakefile}, we can plot our variant statistics (see \altverb{06_Snakefile} in our tutorial folder): \begin{lstlisting}[language=Python,style=Python] rule plot_quals: input: @@ -331,6 +331,7 @@ \subsection{Snakemake and external Scripts} plt.hist(quals) plt.savefig(snakemake.output[0]) \end{lstlisting} + To be saved as \altverb{scripts/plot-quals.py}. \end{onlyenv} \end{frame} diff --git a/slides/creators/Sample_Data.tex b/slides/creators/Sample_Data.tex index 1a01ad1..a368112 100644 --- a/slides/creators/Sample_Data.tex +++ b/slides/creators/Sample_Data.tex @@ -32,7 +32,7 @@ \section{Getting your Sample Data} \begin{frame}[fragile] \frametitle{\HandsOn{Getting Your Course Material I}} We shall copy a few install scripts (which also will download some sample data).\newline - Please copy the directory \configparam{pathtosetup}.\newline + Please copy the directory "\texttt{\configparam{pathtosetup}}".\newline \begin{hint} Remember, to copy an entire directory, you can use: \begin{lstlisting}[language=Bash, style=Shell] @@ -58,9 +58,11 @@ \section{Getting your Sample Data} \dirtree{% .1 {.}. .2 {condarc}. - .2 {get\_data.sh}. + .2 {get\_tutorial.sh}. + .2 {install\_micromamba.sh}. .2 {tutorial}. - .3 {template\_files}. + .3 {01\_Snakefile}. + .3 {02\_Snakefile}. .3 {\ldots}. 
}} \end{minipage} @@ -71,9 +73,9 @@ \section{Getting your Sample Data} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{\HandsOn{Obtaining Your Tutorial Sample Data}} - Please run the \altverb{get_data.sh} script you just copied: + Please run the \altverb{get_tutorial.sh} script you just copied: \begin{lstlisting}[language=Bash, style=Shell,basicstyle=\footnotesize] -$ bash get_data.sh +$ bash get_tutorial.sh \end{lstlisting} \begin{task} This script will download and unpack the sample data for this course. Please take a look in this script to understand it. Where are your sample data after running this script? diff --git a/slides/creators/Workflow_Parameterization_for_HPC.tex b/slides/creators/Workflow_Parameterization_for_HPC.tex new file mode 100644 index 0000000..646da02 --- /dev/null +++ b/slides/creators/Workflow_Parameterization_for_HPC.tex @@ -0,0 +1,258 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\section{Parameterizing your Workflow - II} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{Outline} + \begin{columns}[t] + \begin{column}{.5\textwidth} + \tableofcontents[sections={1-7},currentsection] + \end{column} + \begin{column}{.5\textwidth} + \tableofcontents[sections={8-15},currentsection] + \end{column} + \end{columns} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{What is this about?} + \begin{question}[Questions] + \begin{itemize} + \item How do we avoid I/O contention? + \item How do we account for file system latency? + \end{itemize} + \end{question} + \begin{docs}[Objectives] + \begin{enumerate} + \item Learn how to tune \Snakemake{} to mitigate I/O contention. + \item Learn how to configure \Snakemake{} to allow for file system latency. 
+ \end{enumerate} + \end{docs} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{\Interlude{What is Random Access?}} + \vspace{-1.5em} + \begin{figure} + \centering + \begin{tikzpicture} + \node[name=sa] at (3.5,10) { \lhref{https://en.wikipedia.org/wiki/Random_access}{Sequential Access} }; + \foreach[count=\y from 2] \x in {1,...,11}{ + \draw[thick,black,midway,draw] (\x,8.75) rectangle node[name=s\x] {} (\y,8.25); + } + \foreach[count=\y from 2] \x in {1, ..., 10} { + \path[->] ([yshift=.6em]s\x.north west) edge[bend left=30] ([yshift=.6em]s\y.north west); + } + % \path[->] ([yshift=.6em]s1.north) edge [bend left=30] ([yshift=.6em]s2.north) ; + + \node[name=ra] at (3.5,7) { Random Access }; + \foreach[count=\y from 2] \x in {1,...,11}{ + \draw[thick,black,midway,draw] (\x,5.75) rectangle node[name=r\x] {} (\y,5.25); + } + + \path[->] ([yshift=.6em]r1.north) edge [bend left=30] ([yshift=.6em]r5.north) ; + \path[->] ([yshift=.6em]r5.north) edge [bend right=60] ([yshift=.6em]r2.north) ; + \path[->] ([yshift=.6em]r2.north) edge [bend left=30] ([yshift=.6em]r3.north) ; + \path[->] ([yshift=.6em]r3.north) edge [bend left=30] ([yshift=.6em]r11.north) ; + \path[->] ([yshift=.6em]r11.north) edge [bend right=30] ([yshift=.6em]r7.north) ; + \path[->] ([yshift=.6em]r7.north) edge [bend right=30] ([yshift=.6em]r6.north) ; + \path[->] ([yshift=.6em]r6.north) edge [bend left=60] ([yshift=.6em]r8.north) ; + \path[->] ([yshift=.6em]r8.north) edge [bend right=50] ([yshift=.6em]r4.north) ; + \end{tikzpicture} + \end{figure} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{\Interlude{What is Random Access?}} + \begin{question} + \begin{itemize} + \item What causes Random Access? + \item Why is it harmful? What can we do? 
+ \end{itemize} + \end{question} + \pause + \begin{columns}[t] + \begin{column}{0.5\textwidth} + Causes: + \hrule + \begin{itemize} + \item a number of (threaded) apps accessing the same file space (e.g. reference data) + \item a number of apps accessing a file space exceeding the file system cache size + \end{itemize} + Both will slow down parallel file systems and your data analysis! + \end{column} + \begin{column}{0.5\textwidth} + Remedies: + \hrule + \begin{itemize} + \item copy data to/from compute nodes equipped with SSD + \item use a RAM disk (RAM = random access memory) - which many clusters provide + \end{itemize} + \end{column} + \end{columns} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{\Interlude{What is File System Latency?}} + \centering + \begin{tikzpicture}[line cap=rect,line width=3pt, + datastore/.style={draw, rounded rectangle, rounded rectangle east arc=concave, rounded rectangle arc length=150}, + ] + \tikzstyle{storage} = [rectangle, minimum width=3cm, minimum height=1cm, text width=3cm, text centered, draw=black] + + \filldraw [fill=cyan] (0,0) circle [radius=1cm]; + \foreach \angle [count=\xi] in {60,30,...,-270} + { + \draw[line width=0.5pt] (\angle:0.9cm) -- (\angle:1cm); + \node[font=\small] at (\angle:0.68cm) {\textsf{\xi}}; + } + \foreach \angle in {0,90,180,270} + \draw[line width=1pt] (\angle:0.8cm) -- (\angle:1cm); + \draw (0,0) -- (120:0.4cm); + \draw (0,0) -- (90:.5cm); + + \node (sto1) [datastore] at (-4, 0) {Storage}; + \node at (4, 0) {\includegraphics[width=.25\textwidth]{misc/data_center.png}}; + \end{tikzpicture} + \begin{docs}[File System Latency] + The time a request takes to travel from the client to the file system and back. + \end{docs} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame} + \frametitle{\Interlude{What is File System Latency? 
II}} + \begin{docs}[Background] + Some clusters use NFS (Network File System). There, file system latency \emph{can} be an issue.\newline + \pause + On parallel file systems, the latency is usually very low. + \end{docs} +\end{frame} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\subsection{Global Workflow Configuration} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\Snakemake{} Profiles} + \begin{hint} + Profiles can shorten your command lines and be an easy remedy for the described issues! + \end{hint} + \pause + Two kinds of profiles are supported: + \begin{itemize}[<+->] + \item A global profile that is defined in a system-wide or user-specific configuration directory (on Linux, this will be \altverb{\~/.config/snakemake} and \altverb{/etc/xdg/snakemake}; you can find the answer for your system via \altverb{snakemake --help}). + \item A workflow specific profile that is defined via a flag (\altverb{--workflow-profile}) or searched in a default location (profiles/default) in the working directory or next to the \altverb{Snakefile}. + \end{itemize} +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{Your Profile} + \begin{onlyenv}<1| handout:0> + Our first line forces the use of the \lhref{https://yte-template-engine.github.io/}{\textbf{Y}AML \textbf{T}emplate \textbf{E}ngine} to parse the following lines accordingly. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<2| handout:0> + Now, our default executor is SLURM. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm + \end{lstlisting} + No more \altverb{snakemake --executor slurm ...}! 
+ \end{onlyenv} + \begin{onlyenv}<3| handout:0> + The next line tells Snakemake to wait for up to a minute if output files are not yet present. 
This is more than enough time, even for NFS file systems. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +latency-wait: 60 + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<4| handout:0> + The remaining lines tell the storage plugin (\altverb{snakemake-storage-plugin-fs}) to stage input data in to the node-local storage on Mogon for \emph{every} job and to copy back your results. When dealing with I/O-intensive jobs, this can boost your performance tremendously. + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +latency-wait: 60 +default-storage-provider: fs +shared-fs-usage: + - persistence + - sources + - source-cache +local-storage-prefix: /localscratch/$SLURM_JOB_ID + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<5| handout:0> + %TODO remove this part, once the snakemake release is ready + \begin{warning} + Currently, we are working on a way to annotate susceptible I/O patterns; the highlighted parts are not yet functional. + \end{warning} + %You may copy this setup from \texttt{\configparam{pathtosetup}/config.yaml} to the \altverb{\~/.config/snakemake}-folder - unless your local admins provide a cluster-wide configuration. 
+ \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +@latency-wait: 60@ +@default-storage-provider: fs@ +@shared-fs-usage:@ +@ - persistence@ +@ - sources@ +@ - source-cache@ +@local-storage-prefix: /localscratch/$SLURM_JOB_ID@ + \end{lstlisting} + \end{onlyenv} + \begin{onlyenv}<6| handout:1> + The complete configuration ought to be in \altverb{\~/.config/snakemake/config.yaml}: + \begin{lstlisting}[language=Bash, style=Shell] +__use_yte__: true +executor: slurm +latency-wait: 60 +default-storage-provider: fs +shared-fs-usage: +- persistence +- sources +- source-cache +local-storage-prefix: /localscratch/$SLURM_JOB_ID + \end{lstlisting} + \end{onlyenv} +\end{frame} + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\HandsOn{Running with a Profile I}} + Enter + \begin{lstlisting}[language=Bash, style=Shell] +export SNAKEMAKE_PROFILE="$HOME/.config/snakemake" + \end{lstlisting} + in your \altverb{.bashrc}. +\end{frame} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +\begin{frame}[fragile] + \frametitle{\HandsOn{Running with a Profile II}} + Copy the configuration file: + \begin{lstlisting}[language=Bash, style=Shell] +$ mkdir -p ~/.config/snakemake +$ cp ~/workflows/config.yaml \ +> ~/.config/snakemake/. + \end{lstlisting} + Activate the new settings: + \begin{lstlisting}[language=Bash, style=Shell] +$ source ~/.bashrc + \end{lstlisting} + And run the workflow - just for fun: + \begin{lstlisting}[language=Bash, style=Shell] +$ snakemake -F # just this one flag! + \end{lstlisting} +\end{frame} + + +