Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create a python download script for Non-unix platforms. #1082

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ In order to download the model weights and tokenizer, please visit the [Meta web

Once your request is approved, you will receive a signed URL over email. Then run the download.sh script, passing the URL provided when prompted to start the download.

Pre-requisites: Make sure you have `wget` and `md5sum` installed. Then run the script: `./download.sh`.
Pre-requisites: Make sure you have `wget` and `md5sum` installed. Then run the script: `./download.sh` on Unix systems, or `python download.py` on other platforms.

Keep in mind that the links expire after 24 hours and a certain amount of downloads. If you start seeing errors such as `403: Forbidden`, you can always re-request a link.

Expand All @@ -38,7 +38,7 @@ You can follow the steps below to quickly get up and running with Llama 2 models

4. Once registered, you will get an email with a URL to download the models. You will need this URL when you run the download.sh script.

5. Once you get the email, navigate to your downloaded llama repository and run the download.sh script.
5. Once you get the email, navigate to your downloaded llama repository and run the download.sh script on Unix platforms, or the download.py script on other platforms.
- Make sure to grant execution permissions to the download.sh script
- During this process, you will be prompted to enter the URL from the email.
- Do not use the “Copy Link” option but rather make sure to manually copy the link from the email.
Expand Down
103 changes: 103 additions & 0 deletions download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env python3

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import os
import requests
import hashlib
from tqdm import tqdm # for showing download progress

# --- Interactive setup: collect the signed URL and the model selection. ---
presigned_url = input("Enter the URL from email: ")
print("")

model_size = input("Enter the list of models to download without spaces (7B,13B,70B,7B-chat,13B-chat,70B-chat), or press Enter for all: ")
# Build the destination path portably: the original ".\models" literal is
# Windows-only AND "\m" is an invalid escape sequence (SyntaxWarning on
# modern Python) — ironic for a script meant for non-Unix portability.
target_folder = os.path.join(".", "models")  # where all files should end up
headers = {'User-Agent': 'wget'}  # Specify wget useragent, as meta only allows downloads from wget useragent
os.makedirs(target_folder, exist_ok=True)

if model_size == "":
    model_size = "7B,13B,70B,7B-chat,13B-chat,70B-chat"


def download_small_file(url_key, file_name):
    """Download one small (non-streamed) file into target_folder.

    url_key is substituted for the '*' placeholder in the presigned URL;
    file_name is the name written under target_folder.
    Raises requests.HTTPError on a non-2xx response so that an expired
    link does not silently save an error page as the file.
    """
    response = requests.get(presigned_url.replace('*', url_key), headers=headers)
    response.raise_for_status()
    with open(os.path.join(target_folder, file_name), 'wb') as out_file:
        out_file.write(response.content)


print("Downloading LICENSE and Acceptable Usage Policy")
download_small_file("LICENSE", "LICENSE")
download_small_file("USE_POLICY.md", "USE_POLICY.md")

print("Downloading tokenizer")
download_small_file("tokenizer.model", "tokenizer.model")
download_small_file("tokenizer_checklist.chk", "tokenizer_checklist.chk")

def check_md5(file_path, checksum):
    """Return True if the MD5 hex digest of file_path equals checksum.

    Hashes the file in 1 MiB chunks instead of reading it whole: the
    consolidated.*.pth shards verified by this script are multiple
    gigabytes, and the original one-shot f.read() loaded each entire
    shard into memory just to checksum it.
    """
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            md5.update(chunk)
    return md5.hexdigest() == checksum

def check_checksums(folder_path, checklist_path):
    """Verify every file listed in an md5sum-style checklist file.

    Each non-empty line of checklist_path must hold "<md5-hex> <file-name>".
    Prints "<file>: OK" or "<file>: FAILED" per entry and returns True only
    when every listed file exists and matches its checksum.

    Robustness over the original: blank/trailing lines in the checklist are
    skipped instead of raising ValueError, and a missing file is reported
    as FAILED instead of raising FileNotFoundError mid-verification.
    """
    all_ok = True
    with open(checklist_path) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing newline-only lines
            checksum, file_name = line.strip().split()
            file_path = os.path.join(folder_path, file_name)
            try:
                ok = check_md5(file_path, checksum)
            except FileNotFoundError:
                ok = False  # a file that never downloaded counts as failed
            if ok:
                print(f"{file_name}: OK")
            else:
                print(f"{file_name}: FAILED")
                all_ok = False
    return all_ok

# Verify the tokenizer files that were just downloaded.
check_checksums(target_folder, os.path.join(target_folder, "tokenizer_checklist.chk"))

# Highest shard index and on-disk directory name for each model choice.
# Shards are numbered consolidated.00.pth .. consolidated.NN.pth.
MODEL_SHARDS = {
    "7B": (0, "llama-2-7b"),
    "7B-chat": (0, "llama-2-7b-chat"),
    "13B": (1, "llama-2-13b"),
    "13B-chat": (1, "llama-2-13b-chat"),
    "70B": (7, "llama-2-70b"),
    "70B-chat": (7, "llama-2-70b-chat"),
}

for model in model_size.split(','):
    model = model.strip()
    if model not in MODEL_SHARDS:
        # The original if/elif chain had no else: a typo raised NameError on
        # the first iteration or silently re-downloaded the PREVIOUS model on
        # later iterations. Skip unknown entries explicitly instead.
        print(f"Unknown model '{model}', skipping. Valid choices: {', '.join(MODEL_SHARDS)}")
        continue
    shard, model_path = MODEL_SHARDS[model]

    print(f"Downloading {model_path}")
    os.makedirs(os.path.join(target_folder, model_path), exist_ok=True)

    for s in range(shard + 1):
        print(f"Downloading shard {s + 1} of shards {shard + 1}")  # display shard progress
        # Stream the multi-GB shard to disk instead of buffering it in memory.
        consolidated_response = requests.get(
            presigned_url.replace('*', f"{model_path}/consolidated.{s:02d}.pth"),
            headers=headers,
            stream=True,
        )
        total_cons = int(consolidated_response.headers.get('content-length', 0))

        with open(os.path.join(target_folder, model_path, f"consolidated.{s:02d}.pth"), 'wb') as consolidated_file, tqdm(
            total=total_cons,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:  # use tqdm to show real-time progress of download
            for data in consolidated_response.iter_content(chunk_size=1024):
                size = consolidated_file.write(data)
                bar.update(size)

    params_response = requests.get(presigned_url.replace('*', f"{model_path}/params.json"), headers=headers)
    with open(os.path.join(target_folder, model_path, "params.json"), 'wb') as params_file:
        params_file.write(params_response.content)
    checklist_response = requests.get(presigned_url.replace('*', f"{model_path}/checklist.chk"), headers=headers)
    with open(os.path.join(target_folder, model_path, "checklist.chk"), 'wb') as checklist_file:
        checklist_file.write(checklist_response.content)

    print("Checking checksums")
    check_checksums(os.path.join(target_folder, model_path), os.path.join(target_folder, model_path, "checklist.chk"))

1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ torch
fairscale
fire
sentencepiece
tqdm