From aa03a3512129571f3c2e95267518f953bd2bf3ca Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Aug 2024 14:05:14 +0100 Subject: [PATCH 01/10] Updated the URL in the documentation too --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 0bc8e11..21c5d13 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -87,7 +87,7 @@ The pipeline can be tested locally using a provided small test dataset: ``` cd ${GENOMEASSEMBLY_TEST_DATA} -curl https://darwin.cog.sanger.ac.uk/genomeassembly_test_data.tar.gz | tar xzf - +curl https://tolit.cog.sanger.ac.uk/test-data/resources/genomeassembly/genomeassembly_test_data.tar.gz | tar xzf - git clone git@github.com:sanger-tol/genomeassembly.git cd genomeassembly/ From aab9bd6990ec1250e31e4e8805dde86ef566a66c Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Aug 2024 14:37:51 +0100 Subject: [PATCH 02/10] Properly stage the .crai to support remote CRAM files --- assets/test_github.yaml | 4 ++-- bin/generate_cram_csv.sh | 7 ++++--- modules/local/generate_cram_csv.nf | 2 +- subworkflows/local/hic_mapping.nf | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/assets/test_github.yaml b/assets/test_github.yaml index 3bf048f..fa4d678 100644 --- a/assets/test_github.yaml +++ b/assets/test_github.yaml @@ -4,10 +4,10 @@ dataset: reads: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/ pacbio: reads: - - reads: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta + - reads: https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta HiC: reads: - - reads: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2#7.sub.cram + - reads: 
https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2%237.sub.cram hic_motif: GATC,GANTC,CTNAG,TTAA hic_aligner: minimap2 busco: diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh index 81eaad3..15e5169 100755 --- a/bin/generate_cram_csv.sh +++ b/bin/generate_cram_csv.sh @@ -12,8 +12,9 @@ for cram in "$@"; do rgline=$(samtools view -H $cram|grep "RG"|sed 's/\t/\\t/g'|sed "s/'//g") crampath=$(readlink -f ${cram}) + craipath=$(readlink -f ${cram}.crai) - ncontainers=$(zcat ${crampath}.crai|wc -l) + ncontainers=$(zcat ${craipath} | wc -l) base=$(basename $cram .cram) from=0 @@ -22,7 +23,7 @@ for cram in "$@"; do while [ $to -lt $ncontainers ] do - echo $crampath,${crampath}.crai,${from},${to},${base},${chunkn},${rgline} + echo $crampath,${craipath},${from},${to},${base},${chunkn},${rgline} from=$((to+1)) ((to+=10000)) ((chunkn++)) @@ -30,7 +31,7 @@ for cram in "$@"; do if [ $from -le $ncontainers ] then - echo $crampath,${crampath}.crai,${from},${ncontainers},${base},${chunkn},${rgline} + echo $crampath,${craipath},${from},${ncontainers},${base},${chunkn},${rgline} ((chunkn++)) fi done diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf index 860dfe6..45c8e90 100644 --- a/modules/local/generate_cram_csv.nf +++ b/modules/local/generate_cram_csv.nf @@ -13,7 +13,7 @@ process GENERATE_CRAM_CSV { 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: - tuple val(meta), path(crampaths, stageAs: "?/*") + tuple val(meta), path(crampaths, stageAs: "?/*"), path(craipaths, stageAs: "?/*") output: diff --git a/subworkflows/local/hic_mapping.nf b/subworkflows/local/hic_mapping.nf index 692c8be..c8786ba 100644 --- a/subworkflows/local/hic_mapping.nf +++ b/subworkflows/local/hic_mapping.nf @@ -42,7 +42,7 @@ workflow HIC_MAPPING { reference_tuple .join( hic_reads_path ) .map { meta, ref, hic_reads_path -> - tuple([ id: meta.id, single_end: true], hic_reads_path) } + tuple([ id: 
meta.id, single_end: true], hic_reads_path, hic_reads_path.collect { p -> p.resolveSibling(p.name + ".crai") } ) } .set { get_reads_input } // From 83a19a06426a8094ccadd5676482b8d3105ad566 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Aug 2024 16:23:36 +0100 Subject: [PATCH 03/10] Now works on remote URLs --- assets/test_github.yaml | 2 +- modules/local/longranger/align/main.nf | 4 ++-- subworkflows/local/prepare_input.nf | 2 +- workflows/genomeassembly.nf | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/assets/test_github.yaml b/assets/test_github.yaml index fa4d678..6af8639 100644 --- a/assets/test_github.yaml +++ b/assets/test_github.yaml @@ -1,7 +1,7 @@ dataset: id: baUndUnlc1 illumina_10X: - reads: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/ + reads: https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/ pacbio: reads: - reads: https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta diff --git a/modules/local/longranger/align/main.nf b/modules/local/longranger/align/main.nf index 728cd9a..0f3009a 100644 --- a/modules/local/longranger/align/main.nf +++ b/modules/local/longranger/align/main.nf @@ -11,7 +11,7 @@ process LONGRANGER_ALIGN { input: tuple val(meta), path(reference) - path(fastqs) + path(fastqs, stageAs: "10X_inputs/*") output: tuple val(meta), path("${meta.id}/outs/possorted_bam.bam"), emit: bam @@ -26,7 +26,7 @@ process LONGRANGER_ALIGN { def args = task.ext.args ?: '' def sample = "${meta.id}" """ - longranger align --id=$sample --fastqs=$fastqs \ + longranger align --id=$sample --fastqs=10X_inputs \ --sample=$sample --reference=$reference \ ${args} diff --git a/subworkflows/local/prepare_input.nf b/subworkflows/local/prepare_input.nf index 6d04e69..383a14f 100644 --- a/subworkflows/local/prepare_input.nf +++ b/subworkflows/local/prepare_input.nf @@ 
-40,7 +40,7 @@ workflow PREPARE_INPUT { .multiMap { data -> id_ch : (data.id ? [id: data.id] : []) illumina_10X_ch : ( data.illumina_10X ? [ [id: data.id ], - file(data.illumina_10X.reads, checkIfExists: true), + files(data.illumina_10X.reads + "*", checkIfExists: true), data.illumina_10X.kmer_pref ? data.illumina_10X.kmer_pref : [] ] : [] ) pacbio_ch: ( data.pacbio ? [ [id: data.id ], diff --git a/workflows/genomeassembly.nf b/workflows/genomeassembly.nf index 03eaeaf..a59ec08 100644 --- a/workflows/genomeassembly.nf +++ b/workflows/genomeassembly.nf @@ -245,7 +245,7 @@ workflow GENOMEASSEMBLY { // // LOGIC: REFACTOR ILLUMINA CHANNEL TO PASS IT INTO THE POLISHING SUBWORKFLOW // - PREPARE_INPUT.out.illumina_10X.map{ meta, reads, kmers -> [reads] } + PREPARE_INPUT.out.illumina_10X.map{ meta, reads, kmers -> reads } .set{ illumina_10X_ch } // From fcca06cf74548f64dcb5e86b8bd6ad783a94a353 Mon Sep 17 00:00:00 2001 From: Matthieu Muffato Date: Fri, 9 Aug 2024 17:03:27 +0100 Subject: [PATCH 04/10] These files shouldn't be there --- .../batch_summary.txt | 2 - .../logs/busco.log | 107 ------------------ modules/nf-core/busco/main.nf_ | 83 -------------- 3 files changed, 192 deletions(-) delete mode 100644 modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt delete mode 100644 modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log delete mode 100644 modules/nf-core/busco/main.nf_ diff --git a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt b/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt deleted file mode 100644 index 226d64f..0000000 --- a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt +++ /dev/null @@ -1,2 +0,0 @@ -Input_file Dataset Complete Single Duplicated Fragmented Missing n_markers Scaffold N50 Contigs N50 Percent gaps Number of scaffolds -iyVesGerm1_scaffolds_final.fa Run failed; check logs diff --git 
a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log b/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log deleted file mode 100644 index 583cbbf..0000000 --- a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log +++ /dev/null @@ -1,107 +0,0 @@ -2023-03-31 12:35:30 DEBUG:busco.run_BUSCO Command line: /usr/local/bin/busco --cpu 2 --in input_seqs --out iyVesGerm1-insecta_odb10-busco --out_path /lustre/scratch124/tol/projects/darwin/users/kk16/development/nextflow/dev/genomeassembly/./workflows/../subworkflows/local/../../modules/nf-core/busco --lineage_dataset /lustre/scratch123/tol/resources/busco/v5/lineages/insecta_odb10 --mode genome -2023-03-31 12:35:30 INFO:busco.run_BUSCO ***** Start a BUSCO v5.4.3 analysis, current time: 03/31/2023 12:35:30 ***** -2023-03-31 12:35:30 DEBUG:busco.ConfigManager Getting config file -2023-03-31 12:35:30 INFO:busco.ConfigManager Configuring BUSCO with local environment -2023-03-31 12:35:30 INFO:busco.BuscoConfig Mode is genome -2023-03-31 12:35:30 INFO:busco.BuscoDownloadManager Downloading information on latest versions of BUSCO data... -2023-03-31 12:35:32 DEBUG:busco.BuscoConfig State of BUSCO config before run: -2023-03-31 12:35:32 DEBUG:busco.BuscoConfig {'_allow_no_value': False, - '_comment_prefixes': ('#', ';'), - '_converters': , - '_defaults': {}, - '_delimiters': ('=', ':'), - '_dict': , - '_empty_lines_in_values': True, - '_inline_comment_prefixes': (), - '_input_filepath': '/lustre/scratch124/tol/projects/darwin/users/kk16/development/nextflow/dev/genomeassembly/work/06/5d8ceb4658862a248b20dcc4c3b27a/input_seqs', - '_interpolation': , - '_mode': 'genome', - '_optcre': re.compile('\n (?P