Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Script to download reference genomes #34

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Data
proteomes/
datasets/*
!datasets/sample_data/

Expand Down
56 changes: 56 additions & 0 deletions download_reference_proteome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import urllib.request
import urllib.error
from datetime import datetime
import argparse

def download_proteome(proteome_id, date=None):
    """Download the reviewed entries of a UniProt proteome as a FASTA file.

    The file is written into the directory named by the ``PROTEOMES_DIR``
    environment variable (created if it does not exist) as
    ``{organism}_uniprotkb_proteome_{proteome_id}_{date}.fasta``, where
    ``organism`` is derived from the ``OS=`` field of the first FASTA header.

    Args:
        proteome_id: UniProt proteome identifier, e.g. ``"UP000000589"``.
        date: Optional date string used verbatim in the file name. Defaults
            to the current date formatted as ``YYYY_MM_DD``.

    Raises:
        EnvironmentError: If ``PROTEOMES_DIR`` is unset or empty.
        ValueError: If the response body is empty or its first header has no
            ``OS=`` field, so no organism name can be derived.
        Exception: If the request fails or returns a non-200 status.
    """
    # URL to fetch the proteome (query: proteome:<id> AND reviewed=true)
    url = (
        "https://rest.uniprot.org/uniprotkb/stream?download=true&format=fasta"
        f"&query=%28%28proteome%3A{proteome_id}%29+AND+reviewed%3Dtrue%29"
    )

    # Directory to save the proteome file
    proteomes_dir = os.getenv("PROTEOMES_DIR")
    if not proteomes_dir:
        raise EnvironmentError("PROTEOMES_DIR environment variable is not set.")

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(proteomes_dir, exist_ok=True)

    # Fetch the proteome data; keep the try body limited to the network call
    try:
        with urllib.request.urlopen(url) as response:
            if response.status != 200:
                raise Exception(f"Failed to download proteome: {response.status}")
            data = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Chain the original error so the traceback keeps the root cause
        raise Exception(f"Failed to download proteome: {e.reason}") from e

    organism = _organism_from_fasta(data, proteome_id)

    # Use the provided date or the current date
    current_date = date if date else datetime.now().strftime("%Y_%m_%d")

    # Construct file name
    file_name = f"{organism}_uniprotkb_proteome_{proteome_id}_{current_date}.fasta"
    file_path = os.path.join(proteomes_dir, file_name)

    # Write with the same encoding the payload was decoded with, rather than
    # relying on the platform default
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(data)

    print(f"Proteome saved to: {file_path}")


def _organism_from_fasta(fasta_text, proteome_id):
    """Return a file-name-safe organism name parsed from the first FASTA header."""
    header = fasta_text.split("\n", 1)[0]
    if not header.strip():
        raise ValueError(
            f"No FASTA entries were returned for proteome {proteome_id}."
        )
    if "OS=" not in header:
        raise ValueError(
            f"Could not find an organism (OS=) field in FASTA header: {header!r}"
        )

    # The organism name sits between the "OS=" and "OX=" header fields
    organism = header.split("OS=")[1].split("OX=")[0].strip()
    organism = organism.replace(" ", "_").lower()

    # A path separator inside the name would be treated as a subdirectory
    # when the file is opened, so neutralise it
    for separator in ("/", "\\"):
        organism = organism.replace(separator, "_")
    return organism

if __name__ == "__main__":
    # Command-line entry point: parse the proteome ID and optional date,
    # then download the proteome.
    parser = argparse.ArgumentParser(description="Download a reference genome from UniProt by proteome ID.")
    parser.add_argument("proteome_id", type=str, help="The UniProt proteome ID (e.g., UP000000589)")
    parser.add_argument("--date", type=str, help="Optional date for file naming (format: YYYY_MM_DD). Defaults to current date.")

    # Parse arguments
    args = parser.parse_args()

    # Forward --date as well; it was previously parsed but never used, so the
    # file name always carried the current date. When --date is omitted,
    # args.date is None and download_proteome falls back to today's date.
    download_proteome(args.proteome_id, date=args.date)