#!/usr/bin/env bash
set -ueo pipefail
# when stdout is a terminal (i.e. run interactively)
if [ -t 1 ]; then
# prefix each command with timestamp in trace mode
PS4='\033[0D\033[1;37m$(date +"%F %T")>\033[0m '
function colored() { echo -en "\033[${1}m${2}\033[0m"; }
else
PS4='$(date +"%F %T")> '
function colored() { echo -n "$2"; }
fi
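# 'colored' prints its second argument wrapped in the given ANSI SGR code
# when stdout is a terminal, and as plain text otherwise, e.g.
#   colored "1;31" "failed"   # bold red on a TTY, plain "failed" in a pipe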
# emit message with timestamp
log() {
colored "1;37" "$(date +'%F %T')> "
echo "$1"
}
# define 'untrace' command to disable trace mode
unalias -a
shopt -s expand_aliases
alias untrace='{ set +x; } 2> /dev/null'
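# the command group lets the redirection swallow the trace line that
# 'set +x' itself would print, so steps end cleanly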
# start a named processing step and enable trace mode
run() {
untrace
colored "1;1" "[$1]"
echo ""
set -x
}
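# typical use inside a processing step (illustrative step name):
#   run my-step     # prints "[my-step]" and turns tracing on
#   some-command    # echoed with a timestamp (via PS4) before running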
# ---- processing steps ----
do_validate() {
GENERAL_PARAMS="--details --trimId --summary --format csv --defaultRecordType BOOKS"
OUTPUT_PARAMS="--outputDir ${OUTPUT_DIR} --detailsFileName issue-details.csv --summaryFileName issue-summary.csv"
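# strip the indexing-only options from TYPE_PARAMS; they belong to the
# 'index' step and are presumably not accepted by ./validate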
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run validate
./validate ${GENERAL_PARAMS} ${OUTPUT_PARAMS} ${PARAMS} ${MARC_DIR}/$MASK 2> ${LOG_DIR}/validate.log
}
do_prepare_solr() {
run prepare-solr
./prepare-solr $NAME 2> ${LOG_DIR}/solr.log
}
do_index() {
run index
untrace
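# strip the validation-only options (--emptyLargeCollectors,
# --ignorableIssueTypes) which the indexer does not use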
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+//g')
# HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
# if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
# PARAMS="${PARAMS} --solrForScoresUrl ${NAME}_validation"
# fi
PARAMS="${PARAMS} --outputDir ${OUTPUT_DIR}"
ONLY_INDEX=$(echo ${PARAMS} | grep -c -P -e '--onlyIndex' || true)
if [[ "${ONLY_INDEX}" == "0" ]]; then
CORE=${NAME}_dev
else
PARAMS=$(echo ${PARAMS} | sed -r 's/\s*--onlyIndex//')
CORE=${NAME}
fi
./index --core ${CORE} --file-path ${MARC_DIR} --file-mask $MASK ${PARAMS} --trimId 2>> ${LOG_DIR}/solr.log
}
do_postprocess_solr() {
run postprocess-solr
./postprocess-solr $NAME 2>> ${LOG_DIR}/solr.log
}
do_completeness() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run completeness
./completeness --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/completeness.log
}
do_completeness_sqlite() {
run completeness_sqlite
untrace
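# HAS_GROUP_PARAM is 1 if TYPE_PARAMS contains '--groupBy <value>', 0
# otherwise; '|| true' keeps grep's non-zero exit from tripping set -e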
HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
bash scripts/sqlite/completeness.sqlite.sh ${OUTPUT_DIR} ${HAS_GROUP_PARAM}
}
do_classifications() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run classifications
./classifications --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/classifications.log
Rscript scripts/classifications/classifications-type.R ${OUTPUT_DIR}
}
do_authorities() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run authorities
./authorities --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/authorities.log
}
do_tt_completeness() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run tt-completeness
./tt-completeness --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ --trimId ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/tt-completeness.log
Rscript scripts/tt-histogram/tt-histogram.R ${OUTPUT_DIR} &>> ${LOG_DIR}/tt-completeness.log
# for large files
# php scripts/tt-histogram/tt-histogram.php ${OUTPUT_DIR} &>> ${LOG_DIR}/tt-completeness.log
}
do_shelf_ready_completeness() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run shelf-ready-completeness
./shelf-ready-completeness \
--defaultRecordType BOOKS \
${PARAMS} \
--outputDir ${OUTPUT_DIR}/ \
--trimId ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/shelf-ready-completeness.log
Rscript scripts/shelf-ready/shelf-ready-histogram.R ${OUTPUT_DIR} &>> ${LOG_DIR}/shelf-ready-completeness.log
# for large files
# php scripts/shelf-ready-histogram.php ${OUTPUT_DIR} &>> ${LOG_DIR}/shelf-ready-completeness.log
}
do_bl_classification() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run bl-classification
./bl-classification \
--defaultRecordType BOOKS \
${PARAMS} \
--outputDir ${OUTPUT_DIR}/ \
--trimId ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/bl-classification.log
}
do_serial_score() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run serial-score
./serial-score --defaultRecordType BOOKS \
${PARAMS} \
--outputDir ${OUTPUT_DIR}/ \
--trimId ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/serial-score.log
Rscript scripts/serial-score/serial-score-histogram.R ${OUTPUT_DIR} &>> ${LOG_DIR}/serial-score.log
}
do_format() {
run format
./formatter --defaultRecordType BOOKS ${MARC_DIR}/${MASK}
}
do_functional_analysis() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run functional-analysis
./functional-analysis --defaultRecordType BOOKS \
${PARAMS} \
--outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/functional-analysis.log
}
do_network_analysis() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run network-analysis
./network-analysis --defaultRecordType BOOKS \
${PARAMS} \
--outputDir ${OUTPUT_DIR}/ \
${MARC_DIR}/${MASK} 2> ${LOG_DIR}/network-analysis.log
# network.csv (concept, id) ->
# network-by-concepts.csv (concept, count, ids)
# network-by-record.csv (id, count, concepts)
# network-statistics.csv (type, total, single, multi)
Rscript scripts/network-transform.R ${OUTPUT_DIR} &>> ${LOG_DIR}/network-analysis.log
# network-by-concepts (concept, count, ids) ->
# network-pairs.csv (id1 id2)
# network-nodes.csv (id, id)
./network-analysis --outputDir ${OUTPUT_DIR} \
--action pairing \
&>> ${LOG_DIR}/network-analysis.log
untrace
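# count how often each pair occurs, then keep only the two ids of each
# unique pair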
sort network-pairs.csv | uniq -c | sort -nr > network-pairs-uniq-with-count.csv
awk '{print $2 " " $3}' network-pairs-uniq-with-count.csv > network-pairs-all.csv
log "ziping output"
PWD=$(pdw)
cd ${OUTPUT_DIR}
zip network-input network-nodes.csv network-nodes-???.csv network-pairs-???.csv network-by-concepts-tags.csv
cd $PWD
log "upload output"
scp ${OUTPUT_DIR}/network-input.zip [email protected]:/roedel/pkiraly/network/input
# spark-shell -I scripts/network.scala --conf spark.driver.metadata.qa.dir="${OUTPUT_DIR}"
# ./network-export.sh ${OUTPUT_DIR}
}
do_pareto() {
run pareto
Rscript scripts/pareto/frequency-range.R ${OUTPUT_DIR} &> ${LOG_DIR}/pareto.log
untrace
. ./common-variables
if [[ "${WEB_DIR:-}" != "" ]]; then
mkdir -p $WEB_DIR/images
ln -s ${OUTPUT_DIR}/img $WEB_DIR/images/${NAME}
fi
}
do_marc_history() {
if [ "${SCHEMA}" == "PICA" ]; then
SELECTOR='011@$a;001A$0|extractPicaDate'
else
SELECTOR="008~7-10;008~0-5"
fi
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run marc-history
./formatter --selector "$SELECTOR" --defaultRecordType BOOKS ${PARAMS} --separator "," \
--outputDir ${OUTPUT_DIR}/ --fileName "marc-history.csv" ${MARC_DIR}/${MASK} &> ${LOG_DIR}/marc-history.log
# | grep -v '008~7-10,008~0-5' \
untrace
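# group identical rows and prepend their count: 'uniq -c' emits
# '<padded count> <value>'; the seds turn that into 'count,value' CSV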
tail -n +2 ${OUTPUT_DIR}/marc-history.csv \
| sort \
| uniq -c \
| sed -r 's/([0-9]) ([0-9uc xwticrka])/\1,\2/' \
| sed 's, ,,g' > ${OUTPUT_DIR}/marc-history-grouped.csv
set -x
Rscript scripts/marc-history/marc-history-grouped.R ${OUTPUT_DIR} &>> ${LOG_DIR}/marc-history.log
}
do_record_patterns() {
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run record-patterns
Rscript scripts/record-patterns/top-fields.R ${OUTPUT_DIR} &>> ${LOG_DIR}/top-fields.log
./record-patterns --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} &> ${LOG_DIR}/record-patterns.log
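# group identical field patterns: keep the header with a 'count' column,
# drop rows containing '$', count duplicates, sort by frequency, and
# turn the 'uniq -c' prefix into a leading CSV column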
head -1 ${OUTPUT_DIR}/record-patterns.csv | sed -e 's/^/count,/' > ${OUTPUT_DIR}/record-patterns-grouped.csv
cat ${OUTPUT_DIR}/record-patterns.csv \
| grep -v "\\$" \
| sort \
| uniq -c \
| sort -n -r \
| sed -r 's/^ *([0-9]+) /\1,/' >> ${OUTPUT_DIR}/record-patterns-grouped.csv
}
do_version_link() {
run version-link
untrace
if [[ "${VERSION:-}" != "" ]]; then
OUTPUT_LINK=${BASE_OUTPUT_DIR}/${NAME}
if [[ -e ${OUTPUT_LINK} ]]; then
rm ${OUTPUT_LINK}
fi
ln -s ${OUTPUT_DIR} ${OUTPUT_LINK}
fi
}
do_validate_sqlite() {
run "validate sqlite"
php scripts/sqlite/normalize-issue-details.php ${OUTPUT_DIR} &> ${LOG_DIR}/sqlite.log
untrace
# log "delete DB"
# if [[ -e ${OUTPUT_DIR}/qa_catalogue.sqlite ]]; then
# rm ${OUTPUT_DIR}/qa_catalogue.sqlite
# fi
HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
log "create DB structure"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/qa_catalogue.sqlite.sql
else
log "create DB structure (grouped)"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/qa_catalogue.grouped.sqlite.sql
fi
log "create importable files"
tail -n +2 ${OUTPUT_DIR}/issue-details-normalized.csv > ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
tail -n +2 ${OUTPUT_DIR}/issue-summary.csv > ${OUTPUT_DIR}/issue-summary_noheader.csv
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
tail -n +2 ${OUTPUT_DIR}/id-groupid.csv > ${OUTPUT_DIR}/id-groupid_noheader.csv
fi
log "import issue details"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/issue-details-normalized_noheader.csv issue_details
EOF
log "import issue summary"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/issue-summary_noheader.csv issue_summary
EOF
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
log "import id_groupid"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/id-groupid_noheader.csv id_groupid
EOF
fi
log "delete importable files"
rm ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
rm ${OUTPUT_DIR}/issue-details-normalized.csv
rm ${OUTPUT_DIR}/issue-summary_noheader.csv
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
rm ${OUTPUT_DIR}/id-groupid_noheader.csv
fi
SOLR_FOR_SCORES_URL=$(echo $TYPE_PARAMS | grep -P -o --regexp='--solrForScoresUrl \K([^ ]+)' || true)
ONLY_INDEX=$(echo ${TYPE_PARAMS} | grep -c -P -e '--onlyIndex' || true)
if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
log "index"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/modify-tables.sql &>> ${LOG_DIR}/sqlite.log
if [[ "${SOLR_FOR_SCORES_URL}" != "" ]]; then
echo "index at ${SOLR_FOR_SCORES_URL}"
# index id-groupid.csv and issue-details.csv
scripts/sqlite/index-issue-details.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${ONLY_INDEX} ${SOLR_FOR_SCORES_URL}
fi
else
log "index (grouped)"
scripts/sqlite/index-issue-details.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${ONLY_INDEX} ${SOLR_FOR_SCORES_URL}
scripts/sqlite/calculate-aggregated-numbers.grouped.sh ${OUTPUT_DIR} ${NAME} ${HAS_GROUP_PARAM} ${SOLR_FOR_SCORES_URL}
fi
}
do_mysql() {
run mysql
php scripts/sqlite/normalize-issue-details.php ${OUTPUT_DIR} &> ${LOG_DIR}/mysql.log
untrace
# log "delete DB"
# if [[ -e ${OUTPUT_DIR}/qa_catalogue.sqlite ]]; then
# rm ${OUTPUT_DIR}/qa_catalogue.sqlite
# fi
mysql --defaults-extra-file=mysql.client.cnf ${NAME} -e "DROP TABLE IF EXISTS issue_details"
mysql --defaults-extra-file=mysql.client.cnf ${NAME} -e "DROP TABLE IF EXISTS issue_summary"
mysql --defaults-extra-file=mysql.client.cnf ${NAME} -e "DROP TABLE IF EXISTS issue_groups"
mysql --defaults-extra-file=mysql.client.cnf ${NAME} -e "DROP TABLE IF EXISTS id_groupid"
HAS_GROUP_PARAM=$(echo ${TYPE_PARAMS} | grep -c -P -e '--groupBy [^-]' || true)
if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
log "create DB structure"
mysql --defaults-extra-file=mysql.client.cnf ${NAME} < scripts/sqlite/qa_catalogue.mysql.sql
else
log "create DB structure (grouped)"
mysql --defaults-extra-file=mysql.client.cnf ${NAME} < scripts/sqlite/qa_catalogue.grouped.mysql.sql
fi
log "create importable files"
# tail -n +2 ${OUTPUT_DIR}/issue-details-normalized.csv > ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
# tail -n +2 ${OUTPUT_DIR}/issue-summary.csv > ${OUTPUT_DIR}/issue-summary_noheader.csv
# if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
# tail -n +2 ${OUTPUT_DIR}/id-groupid.csv > ${OUTPUT_DIR}/id-groupid_noheader.csv
# fi
log "import issue details"
MYSQL_SEC_DIR=$(mysql --defaults-extra-file=mysql.client.cnf ${NAME} -e 'SHOW VARIABLES LIKE "secure_file_priv"\G' | grep 'Value:' | sed 's,\s*Value: ,,')
sudo cp ${OUTPUT_DIR}/issue-details-normalized.csv ${MYSQL_SEC_DIR}
mysql --defaults-extra-file=mysql.client.cnf ${NAME} -f << EOF
LOAD DATA INFILE '${MYSQL_SEC_DIR}/issue-details-normalized.csv'
INTO TABLE issue_details
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
EOF
# sudo rm ${MYSQL_SEC_DIR}/issue-details-normalized.csv
log "import issue summary"
sudo cp ${OUTPUT_DIR}/issue-summary.csv ${MYSQL_SEC_DIR}
mysql --defaults-extra-file=mysql.client.cnf ${NAME} -f << EOF
LOAD DATA INFILE '${MYSQL_SEC_DIR}/issue-summary.csv'
INTO TABLE issue_summary
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
EOF
# sudo rm ${MYSQL_SEC_DIR}/issue-summary.csv
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
log "import id_groupid"
sudo cp ${OUTPUT_DIR}/id-groupid.csv ${MYSQL_SEC_DIR}
sudo mysql --defaults-extra-file=mysql.client.cnf ${NAME} -f << EOF
LOAD DATA INFILE '${MYSQL_SEC_DIR}/id-groupid.csv'
INTO TABLE id_groupid
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 LINES;
EOF
# sudo rm ${MYSQL_SEC_DIR}/id-groupid.csv
fi
log "delete importable files"
# rm ${OUTPUT_DIR}/issue-details-normalized_noheader.csv
# rm ${OUTPUT_DIR}/issue-summary_noheader.csv
# if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
# rm ${OUTPUT_DIR}/id-groupid_noheader.csv
# fi
if [[ "${HAS_GROUP_PARAM}" == "0" ]]; then
log "index"
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/sqlite/modify-tables.sql &>> ${LOG_DIR}/mysql.log
else
log "index (grouped)"
mysql --defaults-extra-file=mysql.client.cnf ${NAME} < scripts/sqlite/modify-tables.grouped.mysql.sql &>> ${LOG_DIR}/mysql.log
fi
}
do_export_schema_files() {
mkdir -p avram-schemas
run avram
untrace
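# 'jq .' pretty-prints the exported JSON before it is written to disk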
./export-schema --withSubfieldCodelists | jq . > avram-schemas/marc-schema.json
./export-schema --withSubfieldCodelists --solrFieldType mixed --withSelfDescriptiveCode | jq . > avram-schemas/marc-schema-with-solr.json
./export-schema --withSubfieldCodelists --solrFieldType mixed --withSelfDescriptiveCode --withLocallyDefinedFields | jq . > avram-schemas/marc-schema-with-solr-and-extensions.json
log "files generated at 'avram-schemas' directory: marc-schema.json, marc-schema-with-solr.json, marc-schema-with-solr-and-extensions.json"
}
do_shacl4bib() {
# note: the SHACL-specific parameters (--shaclConfigurationFile, --shaclOutputType, --shaclOutputFile) are not handled here
PARAMS=$(echo ${TYPE_PARAMS} | sed -r 's/\s*--emptyLargeCollectors|\s*--ignorableIssueTypes [^ ]+|\s*--(indexWithTokenizedField|indexFieldCounts|solrUrl)//g')
run shacl4bib
echo " ./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/shacl4bib.log"
./shacl4bib --defaultRecordType BOOKS ${PARAMS} --outputDir ${OUTPUT_DIR}/ ${MARC_DIR}/${MASK} 2> ${LOG_DIR}/shacl4bib.log
Rscript scripts/shacl4bib/shacl4bib.R ${OUTPUT_DIR}
log "import issue details"
echo "DROP TABLE shacl;" > scripts/shacl4bib/shacl4bib.sql
head -1 ${OUTPUT_DIR}/shacl4bib.csv \
| sed 's/,/" VARCHAR(2), "/g' \
| sed 's/^/CREATE TABLE shacl( "/' \
| sed 's/id" VARCHAR(2)/id" VARCHAR(100)/' \
| sed 's/$/" VARCHAR(2)\n);/' \
| sed 's/,/,\n /g' >> scripts/shacl4bib/shacl4bib.sql
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite < scripts/shacl4bib/shacl4bib.sql
rm scripts/shacl4bib/shacl4bib.sql
tail -n +2 ${OUTPUT_DIR}/shacl4bib.csv > ${OUTPUT_DIR}/shacl4bib-noheader.csv
sqlite3 ${OUTPUT_DIR}/qa_catalogue.sqlite << EOF
.mode csv
.import ${OUTPUT_DIR}/shacl4bib-noheader.csv shacl
EOF
}
do_all_analyses() {
analysis_tasks=$(echo "${ANALYSES}" | tr , ' ')
for task in $analysis_tasks; do
declare -F "do_$task" > /dev/null || fatal "unknown analysis task: $task"
done
for task in $analysis_tasks; do
do_$task
echo "##"
echo "## THE RESULT IS $?"
echo "##"
done
}
do_all_solr() {
do_prepare_solr
do_index
do_postprocess_solr
# if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
# php scripts/sqlite/solr-copy-ids-from-validation.php ${NAME}_validation ${NAME} 2>> ${LOG_DIR}/solr.log
# fi
}
# ---- usage and execution of processing steps ----
help() {
ME=$0
cat <<END
Run QA catalogue analyses
${ME} [VARIABLES] <COMMAND[,COMMAND...]>
Commands:
validate record validation
validate-sqlite import result of validation to SQLite
completeness completeness analysis
completeness-sqlite import grouped output of completeness to SQLite
classifications classification analysis
authorities authority analysis
tt-completeness Thompson-Traill completeness analysis
shelf-ready-completeness shelf-ready completeness analysis
bl-classification British Library's quality classification
serial-score serial score analysis
format search and format records
functional-analysis FRBR functional requirement analysis
network-analysis network analysis
pareto Pareto analysis
marc-history generate the cataloguing history chart
record-patterns record patterns
prepare-solr prepare indexing
index indexing with Solr
postprocess-solr postprocess indexing (swap NAME and NAME_dev indexes)
export-schema-files export schema files
shacl4bib run SHACL-like validation
all-analyses run all analytical tasks (or those set via ANALYSES)
all-solr run all indexing tasks
all run all tasks (analyses and indexing)
config show configuration
help show this help message
Variables of the setdir.sh script:
BASE_INPUT_DIR the base input directory (default: ./input)
BASE_OUTPUT_DIR the base output directory (default: ./output)
Environment variables:
NAME the machine readable name of the dataset. Usually the same as the MARC_DIR variable
MARC_DIR the name of the subdirectory where the bibliographical data are stored. If not set,
it will be set to BASE_INPUT_DIR/NAME.
MASK the mask of the bibliographical data files (e.g. marc*.mrc.gz)
OUTPUT_DIR the name of the subdirectory where the output of the analysis will be stored. If not
set, it defaults to BASE_OUTPUT_DIR/NAME, or to BASE_OUTPUT_DIR/NAME-VERSION if
VERSION is set.
TYPE_PARAMS all parameters of analyses which are described in the documentation
ANALYSES a comma separated list of analyses (commands) to execute
SCHEMA the metadata schema (MARC21 (default) or PICA)
WEB_DIR the directory of the qa-catalogue-web
LOG_DIR the directory where log files are written (default: BASE_LOG_DIR/NAME)
UPDATE the date and time (in YYYY-mm-dd H:M:S format) of the last data update.
It will be stored in OUTPUT_DIR/last-update.csv
VERSION a version number for the source data (e.g. the date of the update). If set, the actual
OUTPUT_DIR path will contain it, and a symbolic link will be created to the latest version
(from BASE_OUTPUT_DIR/NAME to BASE_OUTPUT_DIR/NAME-VERSION).
more info: https://github.com/pkiraly/qa-catalogue
END
}
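# example invocation (hypothetical dataset name and file mask):
#   NAME=loc MASK=*.mrc.gz ./common-script validate,completeness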
config() {
set +u
echo "NAME=$NAME"
echo "MARC_DIR=$MARC_DIR"
echo "MASK=$MASK"
echo "OUTPUT_DIR=$OUTPUT_DIR"
echo "TYPE_PARAMS=$TYPE_PARAMS"
echo "ANALYSES=$ANALYSES"
echo "SCHEMA=$SCHEMA"
echo "WEB_DIR=$WEB_DIR"
echo "LOG_DIR=$LOG_DIR"
echo "UPDATE=$UPDATE"
cat ./common-variables
}
fatal() {
colored "1;31" "$1"
exit 1
}
# initialize environment, set default values
# TODO: remove this because it's already done in ./qa-catalogue
NAME=${NAME:-$(basename $0 .sh)}
BASE_INPUT_DIR=${BASE_INPUT_DIR:-./input}
BASE_OUTPUT_DIR=${BASE_OUTPUT_DIR:-./output}
BASE_LOG_DIR=${BASE_LOG_DIR:-./logs}
MARC_DIR=${MARC_DIR:-$BASE_INPUT_DIR/$NAME}
SCHEMA=${SCHEMA:-MARC21}
LOG_DIR=${BASE_LOG_DIR}/${NAME}
if [[ "${VERSION:-}" != "" ]]; then
OUTPUT_DIR=${BASE_OUTPUT_DIR}/${NAME}-${VERSION}
else
OUTPUT_DIR=${BASE_OUTPUT_DIR}/${NAME}
fi
# which tasks to run on `all-analyses`
if [ "${SCHEMA}" == "PICA" ]; then
ALL_ANALYSES=validate,validate_sqlite,completeness,completeness_sqlite,classifications,authorities,marc_history
# automatically set unless already set
[[ "$TYPE_PARAMS" =~ --marcFormat|-f ]] || TYPE_PARAMS="--marcFormat PICA_NORMALIZED $TYPE_PARAMS"
[[ "$TYPE_PARAMS" =~ --schemaType|-w ]] || TYPE_PARAMS="--schemaType PICA $TYPE_PARAMS"
else
ALL_ANALYSES=validate,validate_sqlite,completeness,completeness_sqlite,classifications,authorities,tt_completeness,shelf_ready_completeness,serial_score,functional_analysis,pareto,marc_history
fi
ANALYSES=${ANALYSES:-$ALL_ANALYSES}
tasks="${1:-help}"
datatask=
# Check whether data is going to be processed
for task in ${tasks//,/ }; do
if [[ ! "$task" =~ ^(help|config|export-schema-files)$ ]]; then
datatask=true
fi
done
# check directories for processing commands
if [[ "$datatask" = true ]]; then
mkdir -p $LOG_DIR
mkdir -p $OUTPUT_DIR
log "input: $MARC_DIR/$MASK"
log "output: $OUTPUT_DIR"
log "logs: $LOG_DIR"
ls ${MARC_DIR}/${MASK} &> /dev/null || fatal "Missing input files: ${MARC_DIR}/${MASK}!\n"
if [[ -n "${UPDATE:-}" ]]; then
log "update: $UPDATE"
echo "${UPDATE}" > "${OUTPUT_DIR}/last-update.csv"
fi
fi
for task in ${tasks//,/ }; do
case $task in
validate) do_validate ; do_validate_sqlite ;;
validate-sqlite) do_validate_sqlite ;;
prepare-solr) do_prepare_solr ;;
index) do_index ;;
postprocess-solr) do_postprocess_solr ;;
completeness) do_completeness ; do_completeness_sqlite ;;
completeness-sqlite) do_completeness_sqlite ;;
classifications) do_classifications ;;
authorities) do_authorities ;;
tt-completeness) do_tt_completeness ;;
shelf-ready-completeness) do_shelf_ready_completeness ;;
bl-classification) do_bl_classification ;;
serial-score) do_serial_score ;;
format) do_format ;;
functional-analysis) do_functional_analysis ;;
network-analysis) do_network_analysis ;;
pareto) do_pareto ;;
marc-history) do_marc_history ;;
record-patterns) do_record_patterns ;;
mysql) do_mysql ;;
export-schema-files) do_export_schema_files ;;
shacl4bib) do_shacl4bib ;;
all-analyses) do_all_analyses ;;
all-solr) do_all_solr ;;
all) do_all_analyses ; do_all_solr ;;
version-link) do_version_link ;;
config) config ;;
help) help ;;
*) fatal "unknown command: $1"
esac
done
untrace
if [[ "$datatask" = true ]]; then
sec=$SECONDS
log "DONE in $(printf '%02d:%02d:%02d\n' $((sec/3600)) $((sec%3600/60)) $((sec%60)))"
fi