From 3231ba76cb0fefd05e064a3845ca6a9ff8084def Mon Sep 17 00:00:00 2001 From: ens-ftricomi Date: Fri, 12 Apr 2024 17:38:10 +0100 Subject: [PATCH] fix variable declarations --- pipelines/nextflow/modules/build_metadata.nf | 5 +- .../modules/copy_output_to_ensembl_ftp.nf | 5 +- .../nextflow/modules/omark/omark_output.nf | 9 +- pipelines/nextflow/modules/utils.nf | 215 +----------------- pipelines/nextflow/subworkflows/run_busco.nf | 4 +- .../subworkflows/run_ensembl_stats.nf | 5 +- pipelines/nextflow/subworkflows/run_omark.nf | 9 +- pipelines/nextflow/workflows/main.nf | 10 +- 8 files changed, 28 insertions(+), 234 deletions(-) diff --git a/pipelines/nextflow/modules/build_metadata.nf b/pipelines/nextflow/modules/build_metadata.nf index 78ec17c..d13b376 100644 --- a/pipelines/nextflow/modules/build_metadata.nf +++ b/pipelines/nextflow/modules/build_metadata.nf @@ -25,11 +25,12 @@ process BUILD_METADATA { output: - stdout + tuple val(db_meta) script: """ + echo "gca,taxon_id" echo "$gca,\$$taxon_id" """ -} \ No newline at end of file +} diff --git a/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf b/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf index 1a7c1d3..960de89 100644 --- a/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf +++ b/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf @@ -15,19 +15,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -include { make_publish_dir } from '../utils.nf' process COPY_OUTPUT_TO_ENSEMBL_FTP { // rename busco summary file in _gca_busco_short_summary.txt tag "$db.species:$db.gca" label 'default' - publishDir { make_publish_dir(db.publish_dir, ${params.project}, 'statistics') }, mode: 'copy' + publishDir "${params.production_ftp_dir}/${db_meta.publish_dir}/statistics", mode: 'copy' input: tuple val(db_meta), path(summary_file) - output: - path(${params.production_ftp_dir}/${db_meta.publish_dir}/statistics) script: """ diff --git a/pipelines/nextflow/modules/omark/omark_output.nf b/pipelines/nextflow/modules/omark/omark_output.nf index 13b4991..c83d647 100644 --- a/pipelines/nextflow/modules/omark/omark_output.nf +++ b/pipelines/nextflow/modules/omark/omark_output.nf @@ -15,19 +15,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -include { make_publish_dir } from '../utils.nf' +//include { make_publish_dir } from '../utils.nf' process OMARK_OUTPUT { // rename busco summary file in _gca_busco_short_summary.txt tag "$db.species:$db.gca" label 'default' - publishDir { make_publish_dir(db.publish_dir, project, 'statistics') }, mode: 'copy' + publishDir "${db.publish_dir}/statistics", mode: 'copy' input: tuple val(db), path(summary_file, stageAs: "short_summary_from_busco_run.txt") - val(datatype) - val(project) - path("proteins_detailed_summary.txt"), emit: summary_file output: path("*_proteins_detailed_summary.txt"), emit:summary_file @@ -39,4 +36,4 @@ process OMARK_OUTPUT { def gca = db.gca.toLowerCase().replaceAll(/\./, "v").replaceAll(/_/, "") summary_name = [species, gca, "omark", "proteins_detailed_summary.txt"].join("_") """ -} \ No newline at end of file +} diff --git a/pipelines/nextflow/modules/utils.nf b/pipelines/nextflow/modules/utils.nf index 4ca8c74..9b26f86 100644 --- a/pipelines/nextflow/modules/utils.nf +++ b/pipelines/nextflow/modules/utils.nf @@ -25,225 +25,16 @@ def make_publish_dir(publish_dir,
project, name) { return list.join("/") } - - - - - - - - - - - - - - - - -import groovy.sql.Sql -import java.time.LocalDateTime -import java.time.format.DateTimeFormatter - -def checkTaxonomy(String jdbcUrl, String username, String password, String taxonId) { - def sql = Sql.newInstance(jdbcUrl, username, password) - - try { - def query = "SELECT * FROM meta WHERE taxon_id = '${taxonId}'" - def result = sql.rows(query) - return result.size() > 0 - } catch (Exception ex) { - ex.printStackTrace()} - finally { - sql.close() - } -} - -def getLastCheckedDate(String jdbcUrl, String username, String password, String taxonId) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def lastCheckedDate = null - - try { - def query = "SELECT last_check FROM meta WHERE taxon_id = '${taxonId}'" - def result = sql.rows(query) - - if (result.size() > 0) { - // Assuming 'last_check' is a date-like column - // Adjust the date format pattern based on the actual format in your database - def dateFormat = new SimpleDateFormat("yyyy-MM-dd") // Adjust the format if needed - lastCheckedDate = dateFormat.parse(result[0].last_check) - } - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - - return lastCheckedDate -} - -def insertMetaRecord(String jdbcUrl, String username, String password, String taxonId) { - def sql = Sql.newInstance(jdbcUrl, username, password) - - try { - // Get the current date and time - def currentDate = LocalDateTime.now() - def dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") - def formattedDate = currentDate.format(dateFormatter) - - // Execute the SQL INSERT statement - def insertQuery = "INSERT INTO meta (taxon_id, last_checked_date) VALUES ('${taxonId}', '${formattedDate}')" - sql.executeUpdate(insertQuery, 'meta_id') - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - -} -def updateLastCheckedDate(String jdbcUrl, String username, String password, String taxonId) { - def sql = 
Sql.newInstance(jdbcUrl, username, password) - - try { - // Get the current date and time - def currentDate = LocalDateTime.now() - def dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") - def formattedDate = currentDate.format(dateFormatter) - - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE meta SET last_checked_date = '${formattedDate}' WHERE taxon_id = '${taxonId}'" - sql.executeUpdate(updateQuery) - } catch (Exception ex) { - ex.printStackTrace() - }finally { - sql.close() - } - -} -def build_ncbi_path(gca, assembly_name) { - final gca_splitted = gca.replaceAll("_","").tokenize(".")[0].split("(?<=\\G.{3})").join('/') - return 'https://ftp.ncbi.nlm.nih.gov/genomes/all' + '/' + gca_splitted + '/' + "$gca" +'_' + assembly_name.replaceAll(" ","_") + '/' + "$gca" + '_' + assembly_name.replaceAll(" ","_") + '_genomic.fna.gz' -} - -def getPairedFastqsURL(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - try { - def query = "SELECT url FROM file INNER JOIN run ON run_id WHERE run_accession = '${run_accession}'" - def result = sql.rows(query) - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - - return result +def buildMetadata(gca, taxon_id) { + def db_meta = ["gca": gca, "taxon_id": taxon_id] + return db_meta } -def checkFastqc(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def query = """ SELECT basic_statistics, per_base_sequence_quality, per_sequence_quality_scores, \ - per_base_sequence_content - FROM data_files df - INNER JOIN run r on df.run_id =r.run_id - WHERE r.run_id= '${run_accession}' - """ - def qc_status = null - try { - def result = sql.rows(query) - // Process the results - results.each { row -> - def basicStatistics = row.basic_statistics - def perBaseSequenceQuality = row.per_base_sequence_quality - def perSequenceQualityScores 
= row.per_sequence_quality_scores - def perBaseSequenceContent = row.per_base_sequence_content - if (basicStatistics=='PASS' && perBaseSequenceQuality='PASS' && - perSequenceQualityScores='PASS' && perBaseSequenceContent='PASS') { - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE RUN set qc_status = 'QC_PASS' WHERE run_id= '${run_accession}'" - sql.executeUpdate(updateQuery) - qc_status = 'QC_PASS' - } - else { - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE RUN set qc_status = 'QC_FAIL' WHERE run_id= '${run_accession}'" - sql.executeUpdate(updateQuery) - qc_status = 'QC_FAIL' - } - } - } catch (Exception ex) { - ex.printStackTrace()} - finally { - sql.close() - } - return qc_status -} -def checkOverrepresentedSequences(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def query = """ SELECT overrepresented_sequences - FROM data_files df - INNER JOIN run r on df.run_id =r.run_id - WHERE r.run_id= '${run_accession}' - """ - def overrepresented_sequences = null - try { - def result = sql.rows(query) - // Process the results - results.each { row -> - def OverrepresentedSequences = row.overrepresented_sequences - - if (OverrepresentedSequences=='WARN' OR OverrepresentedSequences=='FAIL') { - overrepresented_sequences = True - } - else { - overrepresented_sequences = False - } - } - } catch (Exception ex) { - ex.printStackTrace()} - finally { - sql.close() - } - return overrepresented_sequences -} -def concatString(string1, string2, string3){ - return string1 + '_'+string2 + '_'+string3 -} - -def calculateIndexBases(genomeFile) { - def indexBases = Math.min(14, Math.floor((Math.log(genomeFile, 2) / 2) - 1)) - return indexBases -} -def getRunId(String jdbcUrl, String username, String password, String run_accession, String gca, String percentage_mapped) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def run_id = null - try { - def query = 
"SELECT run_id FROM run WHERE run_accession = '${run_accession}'" - run_id = sql.rows(query) - return run_id - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - -} -def updateFastqcStatus(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - try { - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE run SET qc_status = 'ALIGNED' WHERE run_accession = '${run_accession}'" - sql.executeUpdate(updateQuery) - } catch (Exception ex) { - ex.printStackTrace() - }finally { - sql.close() - } -} \ No newline at end of file diff --git a/pipelines/nextflow/subworkflows/run_busco.nf b/pipelines/nextflow/subworkflows/run_busco.nf index c788630..e4635aa 100644 --- a/pipelines/nextflow/subworkflows/run_busco.nf +++ b/pipelines/nextflow/subworkflows/run_busco.nf @@ -49,7 +49,9 @@ include { CLEANING } from '../modules/cleaning.nf' */ workflow RUN_BUSCO{ take: - tuple val(db_meta), val(busco_mode), bool(copyToFtp) + db_meta + busco_mode + copyToFtp main: // Get the closest Busco dataset from the taxonomy classification stored in db meta table diff --git a/pipelines/nextflow/subworkflows/run_ensembl_stats.nf b/pipelines/nextflow/subworkflows/run_ensembl_stats.nf index 7d2e5a1..e2867cb 100644 --- a/pipelines/nextflow/subworkflows/run_ensembl_stats.nf +++ b/pipelines/nextflow/subworkflows/run_ensembl_stats.nf @@ -18,7 +18,7 @@ limitations under the License. 
nextflow.enable.dsl=2 -includeConfig '../../../workflows/nextflow.config' +//includeConfig '../../../workflows/nextflow.config' /* @@ -43,7 +43,8 @@ include { CLEANING } from '../modules/cleaning.nf' workflow RUN_ENSEMBL_STATS{ take: - tuple val(dbname),val(db_meta) + dbname + db_meta main: diff --git a/pipelines/nextflow/subworkflows/run_omark.nf b/pipelines/nextflow/subworkflows/run_omark.nf index c63096a..dde8702 100644 --- a/pipelines/nextflow/subworkflows/run_omark.nf +++ b/pipelines/nextflow/subworkflows/run_omark.nf @@ -18,8 +18,8 @@ limitations under the License. nextflow.enable.dsl=2 -includeConfig '../../../workflows/nextflow.config' -includeConfig '../conf/omark.config' +//includeConfig '../../../workflows/nextflow.config' +//includeConfig '../conf/omark.config' /* @@ -47,7 +47,8 @@ include { CLEANING } from '../modules/cleaning.nf' workflow RUN_OMARK{ take: - tuple val(dbname),val(db_meta) + dbname + db_meta main: // @@ -63,7 +64,7 @@ workflow RUN_OMARK{ // omarkOutput = OMARK (omamerOutput) - omarkSummaryFile = OMARK_OUTPUT(db_meta, omarkOutput, params.project) + omarkSummaryFile = OMARK_OUTPUT(db_meta, omarkOutput) if (params.copyToFtp) { COPY_OMARK_OUTPUT(db_meta, omarkSummaryFile) } diff --git a/pipelines/nextflow/workflows/main.nf b/pipelines/nextflow/workflows/main.nf index 2db898a..afa1be3 100644 --- a/pipelines/nextflow/workflows/main.nf +++ b/pipelines/nextflow/workflows/main.nf @@ -101,6 +101,7 @@ include { RUN_ENSEMBL_STATS } from '../subworkflows/run_ensembl_stats.nf' include { BUILD_METADATA } from '../modules/build_metadata.nf' include { SPECIES_METADATA } from '../modules/species_metadata.nf' +include { buildMetadata } from '../modules/utils.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -119,12 +120,15 @@ workflow STATISTICS{ if (params.run_busco_ncbi) { // Read data from the CSV file, split it, and map each row to extract GCA and taxon values - data = 
Channel.fromPath(params.csvFile).splitCsv().map { row -> + data = Channel.fromPath(params.csvFile).splitCsv(sep:',').map { row -> def gca = row[0] def taxon = row[1] def busco_mode = 'genome' def copyToFtp = false - db_meta = BUILD_METADATA(gca,taxon_id) + println("GCA: $gca, Taxon: $taxon") + def db_meta = buildMetadata(gca, taxon) + + //def db_meta = BUILD_METADATA(gca,taxon_id) RUN_BUSCO(db_meta, busco_mode, copyToFtp) } } @@ -152,4 +156,4 @@ workflow STATISTICS{ exec "rm -rf ${params.cacheDir}/*" } -} \ No newline at end of file +}