Skip to content

Commit

Permalink
adding studies without samples
Browse files Browse the repository at this point in the history
  • Loading branch information
michael-grace committed Nov 30, 2021
1 parent 6b9a91e commit d3683a6
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 78 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ WORKDIR /app
COPY requirements.txt ./

RUN pip3 install -r requirements.txt
RUN pip3 install git+https://gitlab.internal.sanger.ac.uk/hgi-projects/uploadtogenestack@2.7#egg=uploadtogenestack
RUN pip3 install git+https://gitlab.internal.sanger.ac.uk/hgi-projects/uploadtogenestack@2.8#egg=uploadtogenestack

# Copying Python scripts, and built frontend
COPY . .
Expand Down
158 changes: 83 additions & 75 deletions api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import csv
import importlib.metadata
from json.decoder import JSONDecodeError
import os
from pathlib import Path
import time
import typing as T

Expand All @@ -42,13 +44,16 @@
# we define in the config which genestack server we're using
gs_config = uploadtogenestack.GenestackStudy.get_gs_config(
config.GENESTACK_SERVER)
ssh_key_path = f"{os.environ['HOME']}/.ssh/id_rsa_genestack"

# as we need to pull files from the S3 bucket, we need a connection to the bucket
# to start off with. this can throw an exception right at the start of the program
# if we can't access the bucket, so a public policy will need setting so we
# can start the app
try:
s3_bucket = uploadtogenestack.S3BucketUtils(gs_config["genestackbucket"])
s3_bucket = uploadtogenestack.S3BucketUtils(
gs_config["genestackbucket"],
ssh_key_filepath=ssh_key_path)
except (
botocore.exceptions.ClientError,
uploadtogenestack.genestackassist.BucketPermissionDenied,
Expand Down Expand Up @@ -114,83 +119,84 @@ def all_studies() -> Response:
if "Study Title" not in body or body["Study Title"] == "":
body["Study Title"] = body["Study Source"]

# We need to download the sample file from the S3 bucket and
# store it locally so it can get uploaded.
# Once it has been uploaded, we don't care about it anymore,
# so we'll just store it in /tmp
sample_file = f"/tmp/samples_{int(time.time()*1000)}.tsv"
sample_file: T.Optional[Path] = None

with s3.S3PublicPolicy(s3_bucket):

# Getting Data from S3
s3_bucket.download_file(body["Sample File"], sample_file)

# Although Sample File is passed to us in our API,
if body.get("Sample File"):
# We need to download the sample file from the S3 bucket and
# store it locally so it can get uploaded.
# Once it has been uploaded, we don't care about it anymore,
# so we'll just store it in /tmp
sample_file = f"/tmp/samples_{int(time.time()*1000)}.tsv"

# Getting Data from S3
s3_bucket.download_file(body["Sample File"], sample_file)

# Renaming Sample File Columns

# The user has the opportunity to rename columns in the sample file,
# or create new columns in the sample file before it gets uploaded.
# We get passed {old: ..., new: ..., colValue: ...} objects giving us
# the column name to rename, the new column name and the value that should
# be in the column. Leaving colValue blank shows we want to use the values
# that are already in that column (which can be different in every row).
# Filling in colValue means we want the same value in each cell, which
# can be used when making new columns. This involves leaving `old` as
# an empty string

# The other important thing is that if a column isn't included, it'll be
# deleted. We don't give the user the option to delete columns, so we need
# to ensure that ALL current columns are also included. For this, we read
# the headers of the sample file, and add those records in, keeping old and
# new the same, and leaving colValue blank, so it uses the already existing
# values.

# Then we open a file to write this all to, how the uploadtogenestack package
# expects it to be. This is a `|` separated file, with a header row:
# old|new|fillvalue
# where fillvalue is what we've called colValue up to now
# Then we can pass the samples file, this new temp file to the package, and
# get back the path of a new samples file, which we'll use later on.

# all this is under the assumption that we're going to rename anything,
# hence `if len(body["renamedColumns"]) != 0:`
if len(body["renamedColumns"]) != 0:
with open(sample_file, encoding="UTF-8") as samples:
reader = csv.reader(samples, delimiter="\t")
headers = next(reader)

for header in headers:
if header not in [x["old"] for x in body["renamedColumns"]]:
body["renamedColumns"].append({
"old": header,
"new": header,
"colValue": ""
})

tmp_rename_fp: str = f"/tmp/gs-rename-{int(time.time()*1000)}.tsv"
with open(tmp_rename_fp, "w", encoding="UTF-8") as tmp_rename:
tmp_rename.write("old|new|fillvalue\n")
for col_rename in body["renamedColumns"]:
tmp_rename.write(
"|".join([
col_rename["old"],
col_rename["new"],
col_rename["colValue"]
if col_rename["colValue"] != ""
else "[fillvalue]"
]) + "\n")

if uploadtogenestack.GenestackUploadUtils.check_suggested_columns(
tmp_rename_fp,
sample_file
):
sample_file = uploadtogenestack.GenestackUploadUtils. \
renamesamplefilecolumns(sample_file, tmp_rename_fp)

# Although these are passed to us in our API,
# it would be invalid in what we pass to genestack, so we
# need rid of it now we've downloaded the file from S3
del body["Sample File"]

# Renaming Sample File Columns

# The user has the opportunity to rename columns in the sample file,
# or create new columns in the sample file before it gets uploaded.
# We get passed {old: ..., new: ..., colValue: ...} objects giving us
# the column name to rename, the new column name and the value that should
# be in the column. Leaving colValue blank shows we want to use the values
# that are already in that column (which can be different in every row).
# Filling in colValue means we want the same value in each cell, which
# can be used when making new columns. This involves leaving `old` as
# an empty string

# The other important thing is that if a column isn't included, it'll be
# deleted. We don't give the user the option to delete columns, so we need
# to ensure that ALL current columns are also included. For this, we read
# the headers of the sample file, and add those records in, keeping old and
# new the same, and leaving colValue blank, so it uses the already existing
# values.

# Then we open a file to write this all to, how the uploadtogenestack package
# expects it to be. This is a `|` separated file, with a header row:
# old|new|fillvalue
# where fillvalue is what we've called colValue up to now
# Then we can pass the samples file, this new temp file to the package, and
# get back the path of a new samples file, which we'll use later on.

# all this is under the assumption that we're going to rename anything,
# hence `if len(body["renamedColumns"]) != 0:`
if len(body["renamedColumns"]) != 0:
with open(sample_file, encoding="UTF-8") as samples:
reader = csv.reader(samples, delimiter="\t")
headers = next(reader)

for header in headers:
if header not in [x["old"] for x in body["renamedColumns"]]:
body["renamedColumns"].append({
"old": header,
"new": header,
"colValue": ""
})

tmp_rename_fp: str = f"/tmp/gs-rename-{int(time.time()*1000)}.tsv"
with open(tmp_rename_fp, "w", encoding="UTF-8") as tmp_rename:
tmp_rename.write("old|new|fillvalue\n")
for col_rename in body["renamedColumns"]:
tmp_rename.write(
"|".join([
col_rename["old"],
col_rename["new"],
col_rename["colValue"]
if col_rename["colValue"] != ""
else "[fillvalue]"
]) + "\n")

if uploadtogenestack.GenestackUploadUtils.check_suggested_columns(
tmp_rename_fp,
sample_file
):
sample_file = uploadtogenestack.GenestackUploadUtils. \
renamesamplefilecolumns(sample_file, tmp_rename_fp)

del body["renamedColumns"]

# Creating Metadata TSV
Expand All @@ -210,7 +216,8 @@ def all_studies() -> Response:
samplefile=sample_file,
genestackserver=config.GENESTACK_SERVER,
genestacktoken=token,
studymetadata=tmp_fp
studymetadata=tmp_fp,
ssh_key_filepath=ssh_key_path
)

return create_response({"accession": study.study_accession}, 201)
Expand Down Expand Up @@ -352,7 +359,8 @@ def all_signals(study_id: str) -> Response:
study_genestackaccession=study_id,
genestackserver=config.GENESTACK_SERVER,
genestacktoken=token,
signal_dict=body
signal_dict=body,
ssh_key_filepath=ssh_key_path
)

return CREATED
Expand Down
2 changes: 1 addition & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

# Configuration Settings

VERSION = "1.0"
VERSION = "1.0.1"

# this is the end of the base url, identical to frontend/.env
# this only affects the swagger ui
Expand Down
2 changes: 1 addition & 1 deletion frontend/pages/studies/index.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ const NewStudy = () => {
var fields = t.data.template
.filter((e) => !e.isReadOnly && e.dataType == "study")
.map((e) => ({ name: e.name, required: e.isRequired }));
fields.unshift({ name: "Sample File", required: true });
fields.unshift({ name: "Sample File", required: false });
setNewStudy(
fields.reduce((xs, x) => ({ ...xs, [x.name]: "" }), {
renamedColumns: [],
Expand Down

0 comments on commit d3683a6

Please sign in to comment.