From 3231ba76cb0fefd05e064a3845ca6a9ff8084def Mon Sep 17 00:00:00 2001 From: ens-ftricomi Date: Fri, 12 Apr 2024 17:38:10 +0100 Subject: [PATCH] fix variable declarations --- pipelines/nextflow/modules/build_metadata.nf | 5 +- .../modules/copy_output_to_ensembl_ftp.nf | 5 +- .../nextflow/modules/omark/omark_output.nf | 9 +- pipelines/nextflow/modules/utils.nf | 215 +----------------- pipelines/nextflow/subworkflows/run_busco.nf | 4 +- .../subworkflows/run_ensembl_stats.nf | 5 +- pipelines/nextflow/subworkflows/run_omark.nf | 9 +- pipelines/nextflow/workflows/main.nf | 10 +- 8 files changed, 28 insertions(+), 234 deletions(-) diff --git a/pipelines/nextflow/modules/build_metadata.nf b/pipelines/nextflow/modules/build_metadata.nf index 78ec17c..d13b376 100644 --- a/pipelines/nextflow/modules/build_metadata.nf +++ b/pipelines/nextflow/modules/build_metadata.nf @@ -25,11 +25,12 @@ process BUILD_METADATA { output: - stdout + tuple val(db_meta) script: """ + echo "gca,taxon_id" echo "$gca,\$$taxon_id" """ -} \ No newline at end of file +} diff --git a/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf b/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf index 1a7c1d3..960de89 100644 --- a/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf +++ b/pipelines/nextflow/modules/copy_output_to_ensembl_ftp.nf @@ -15,19 +15,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -include { make_publish_dir } from '../utils.nf' process COPY_OUTPUT_TO_ENSEMBL_FTP { // rename busco summary file in _gca_busco_short_summary.txt tag "$db.species:$db.gca" label 'default' - publishDir { make_publish_dir(db.publish_dir, ${params.project}, 'statistics') }, mode: 'copy' + publishDir "${params.production_ftp_dir}/${db_meta.publish_dir}/statistics", mode: 'copy' input: tuple val(db_meta), path(summary_file) - output: - path(${params.production_ftp_dir}/${db_meta.publish_dir}/statistics) script: """ diff --git a/pipelines/nextflow/modules/omark/omark_output.nf b/pipelines/nextflow/modules/omark/omark_output.nf index 13b4991..c83d647 100644 --- a/pipelines/nextflow/modules/omark/omark_output.nf +++ b/pipelines/nextflow/modules/omark/omark_output.nf @@ -15,19 +15,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -include { make_publish_dir } from '../utils.nf' +//include { make_publish_dir } from '../utils.nf' process OMARK_OUTPUT { // rename busco summary file in _gca_busco_short_summary.txt tag "$db.species:$db.gca" label 'default' - publishDir { make_publish_dir(db.publish_dir, project, 'statistics') }, mode: 'copy' + publishDir "${db.publish_dir}/statistics", mode: 'copy' input: tuple val(db), path(summary_file, stageAs: "short_summary_from_busco_run.txt") - val(datatype) - val(project) - path("proteins_detailed_summary.txt"), emit: summary_file output: path("*_proteins_detailed_summary.txt"), emit:summary_file @@ -39,4 +36,4 @@ process OMARK_OUTPUT { def gca = db.gca.toLowerCase().replaceAll(/\./, "v").replaceAll(/_/, "") summary_name = [species, gca, "omark", "proteins_detailed_summary.txt"].join("_") """ -} \ No newline at end of file +} diff --git a/pipelines/nextflow/modules/utils.nf b/pipelines/nextflow/modules/utils.nf index 4ca8c74..9b26f86 100644 --- a/pipelines/nextflow/modules/utils.nf +++ b/pipelines/nextflow/modules/utils.nf @@ -25,225 +25,16 @@ def make_publish_dir(publish_dir,
project, name) { return list.join("/") } - - - - - - - - - - - - - - - - -import groovy.sql.Sql -import java.time.LocalDateTime -import java.time.format.DateTimeFormatter - -def checkTaxonomy(String jdbcUrl, String username, String password, String taxonId) { - def sql = Sql.newInstance(jdbcUrl, username, password) - - try { - def query = "SELECT * FROM meta WHERE taxon_id = '${taxonId}'" - def result = sql.rows(query) - return result.size() > 0 - } catch (Exception ex) { - ex.printStackTrace()} - finally { - sql.close() - } -} - -def getLastCheckedDate(String jdbcUrl, String username, String password, String taxonId) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def lastCheckedDate = null - - try { - def query = "SELECT last_check FROM meta WHERE taxon_id = '${taxonId}'" - def result = sql.rows(query) - - if (result.size() > 0) { - // Assuming 'last_check' is a date-like column - // Adjust the date format pattern based on the actual format in your database - def dateFormat = new SimpleDateFormat("yyyy-MM-dd") // Adjust the format if needed - lastCheckedDate = dateFormat.parse(result[0].last_check) - } - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - - return lastCheckedDate -} - -def insertMetaRecord(String jdbcUrl, String username, String password, String taxonId) { - def sql = Sql.newInstance(jdbcUrl, username, password) - - try { - // Get the current date and time - def currentDate = LocalDateTime.now() - def dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") - def formattedDate = currentDate.format(dateFormatter) - - // Execute the SQL INSERT statement - def insertQuery = "INSERT INTO meta (taxon_id, last_checked_date) VALUES ('${taxonId}', '${formattedDate}')" - sql.executeUpdate(insertQuery, 'meta_id') - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - -} -def updateLastCheckedDate(String jdbcUrl, String username, String password, String taxonId) { - def sql = 
Sql.newInstance(jdbcUrl, username, password) - - try { - // Get the current date and time - def currentDate = LocalDateTime.now() - def dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd") - def formattedDate = currentDate.format(dateFormatter) - - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE meta SET last_checked_date = '${formattedDate}' WHERE taxon_id = '${taxonId}'" - sql.executeUpdate(updateQuery) - } catch (Exception ex) { - ex.printStackTrace() - }finally { - sql.close() - } - -} -def build_ncbi_path(gca, assembly_name) { - final gca_splitted = gca.replaceAll("_","").tokenize(".")[0].split("(?<=\\G.{3})").join('/') - return 'https://ftp.ncbi.nlm.nih.gov/genomes/all' + '/' + gca_splitted + '/' + "$gca" +'_' + assembly_name.replaceAll(" ","_") + '/' + "$gca" + '_' + assembly_name.replaceAll(" ","_") + '_genomic.fna.gz' -} - -def getPairedFastqsURL(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - try { - def query = "SELECT url FROM file INNER JOIN run ON run_id WHERE run_accession = '${run_accession}'" - def result = sql.rows(query) - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - - return result +def buildMetadata(gca, taxon_id) { + def db_meta = ["gca": gca, "taxon_id": taxon_id] + return db_meta } -def checkFastqc(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def query = """ SELECT basic_statistics, per_base_sequence_quality, per_sequence_quality_scores, \ - per_base_sequence_content - FROM data_files df - INNER JOIN run r on df.run_id =r.run_id - WHERE r.run_id= '${run_accession}' - """ - def qc_status = null - try { - def result = sql.rows(query) - // Process the results - results.each { row -> - def basicStatistics = row.basic_statistics - def perBaseSequenceQuality = row.per_base_sequence_quality - def perSequenceQualityScores 
= row.per_sequence_quality_scores - def perBaseSequenceContent = row.per_base_sequence_content - if (basicStatistics=='PASS' && perBaseSequenceQuality='PASS' && - perSequenceQualityScores='PASS' && perBaseSequenceContent='PASS') { - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE RUN set qc_status = 'QC_PASS' WHERE run_id= '${run_accession}'" - sql.executeUpdate(updateQuery) - qc_status = 'QC_PASS' - } - else { - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE RUN set qc_status = 'QC_FAIL' WHERE run_id= '${run_accession}'" - sql.executeUpdate(updateQuery) - qc_status = 'QC_FAIL' - } - } - } catch (Exception ex) { - ex.printStackTrace()} - finally { - sql.close() - } - return qc_status -} -def checkOverrepresentedSequences(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def query = """ SELECT overrepresented_sequences - FROM data_files df - INNER JOIN run r on df.run_id =r.run_id - WHERE r.run_id= '${run_accession}' - """ - def overrepresented_sequences = null - try { - def result = sql.rows(query) - // Process the results - results.each { row -> - def OverrepresentedSequences = row.overrepresented_sequences - - if (OverrepresentedSequences=='WARN' OR OverrepresentedSequences=='FAIL') { - overrepresented_sequences = True - } - else { - overrepresented_sequences = False - } - } - } catch (Exception ex) { - ex.printStackTrace()} - finally { - sql.close() - } - return overrepresented_sequences -} -def concatString(string1, string2, string3){ - return string1 + '_'+string2 + '_'+string3 -} - -def calculateIndexBases(genomeFile) { - def indexBases = Math.min(14, Math.floor((Math.log(genomeFile, 2) / 2) - 1)) - return indexBases -} -def getRunId(String jdbcUrl, String username, String password, String run_accession, String gca, String percentage_mapped) { - def sql = Sql.newInstance(jdbcUrl, username, password) - def run_id = null - try { - def query = 
"SELECT run_id FROM run WHERE run_accession = '${run_accession}'" - run_id = sql.rows(query) - return run_id - } catch (Exception ex) { - ex.printStackTrace() - } finally { - sql.close() - } - -} -def updateFastqcStatus(String jdbcUrl, String username, String password, String run_accession) { - def sql = Sql.newInstance(jdbcUrl, username, password) - try { - // Execute the SQL UPDATE statement - def updateQuery = "UPDATE run SET qc_status = 'ALIGNED' WHERE run_accession = '${run_accession}'" - sql.executeUpdate(updateQuery) - } catch (Exception ex) { - ex.printStackTrace() - }finally { - sql.close() - } -} \ No newline at end of file diff --git a/pipelines/nextflow/subworkflows/run_busco.nf b/pipelines/nextflow/subworkflows/run_busco.nf index c788630..e4635aa 100644 --- a/pipelines/nextflow/subworkflows/run_busco.nf +++ b/pipelines/nextflow/subworkflows/run_busco.nf @@ -49,7 +49,9 @@ include { CLEANING } from '../modules/cleaning.nf' */ workflow RUN_BUSCO{ take: - tuple val(db_meta), val(busco_mode), bool(copyToFtp) + db_meta + busco_mode + copyToFtp main: // Get the closest Busco dataset from the taxonomy classification stored in db meta table diff --git a/pipelines/nextflow/subworkflows/run_ensembl_stats.nf b/pipelines/nextflow/subworkflows/run_ensembl_stats.nf index 7d2e5a1..e2867cb 100644 --- a/pipelines/nextflow/subworkflows/run_ensembl_stats.nf +++ b/pipelines/nextflow/subworkflows/run_ensembl_stats.nf @@ -18,7 +18,7 @@ limitations under the License. 
nextflow.enable.dsl=2 -includeConfig '../../../workflows/nextflow.config' +//includeConfig '../../../workflows/nextflow.config' /* @@ -43,7 +43,8 @@ include { CLEANING } from '../modules/cleaning.nf' workflow RUN_ENSEMBL_STATS{ take: - tuple val(dbname),val(db_meta) + dbname + db_meta main: diff --git a/pipelines/nextflow/subworkflows/run_omark.nf b/pipelines/nextflow/subworkflows/run_omark.nf index c63096a..dde8702 100644 --- a/pipelines/nextflow/subworkflows/run_omark.nf +++ b/pipelines/nextflow/subworkflows/run_omark.nf @@ -18,8 +18,8 @@ limitations under the License. nextflow.enable.dsl=2 -includeConfig '../../../workflows/nextflow.config' -includeConfig '../conf/omark.config' +//includeConfig '../../../workflows/nextflow.config' +//includeConfig '../conf/omark.config' /* @@ -47,7 +47,8 @@ include { CLEANING } from '../modules/cleaning.nf' workflow RUN_OMARK{ take: - tuple val(dbname),val(db_meta) + dbname + db_meta main: // @@ -63,7 +64,7 @@ workflow RUN_OMARK{ // omarkOutput = OMARK (omamerOutput) - omarkSummaryFile = OMARK_OUTPUT(db_meta, omarkOutput, params.project) + omarkSummaryFile = OMARK_OUTPUT(db_meta, omarkOutput) if (params.copyToFtp) { COPY_OMARK_OUTPUT(db_meta, omarkSummaryFile) } diff --git a/pipelines/nextflow/workflows/main.nf b/pipelines/nextflow/workflows/main.nf index 2db898a..afa1be3 100644 --- a/pipelines/nextflow/workflows/main.nf +++ b/pipelines/nextflow/workflows/main.nf @@ -101,6 +101,7 @@ include { RUN_ENSEMBL_STATS } from '../subworkflows/run_ensembl_stats.nf' include { BUILD_METADATA } from '../modules/build_metadata.nf' include { SPECIES_METADATA } from '../modules/species_metadata.nf' +include { buildMetadata } from '../modules/utils.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -119,12 +120,15 @@ workflow STATISTICS{ if (params.run_busco_ncbi) { // Read data from the CSV file, split it, and map each row to extract GCA and taxon values - data = 
Channel.fromPath(params.csvFile).splitCsv().map { row -> + data = Channel.fromPath(params.csvFile).splitCsv(sep:',').map { row -> def gca = row[0] def taxon = row[1] def busco_mode = 'genome' def copyToFtp = false - db_meta = BUILD_METADATA(gca,taxon_id) + println("GCA: $gca, Taxon: $taxon") + def db_meta = buildMetadata(gca, taxon) + + //def db_meta = BUILD_METADATA(gca,taxon_id) RUN_BUSCO(db_meta, busco_mode, copyToFtp) } } @@ -152,4 +156,4 @@ workflow STATISTICS{ exec "rm -rf ${params.cacheDir}/*" } -} \ No newline at end of file +}