I want to download all the FASTQ files of a BaseSpace project directly to a Linux server, without first downloading them to a local PC. I found three references:
1. Download files from Illumina’s BaseSpace
https://gist.github.com/lh3/54f535b11a9ee5d3be8e
2. Use the Python Run Downloader
https://help.basespace.illumina.com/articles/tutorials/using-the-python-run-downloader/
3. API
https://developer.basespace.illumina.com/docs/content/documentation/rest-api/api-reference
However, the Python Run Downloader downloads files by Run ID, whereas I want to download them by project. Therefore, I modified the script to meet my requirement.
from urllib2 import Request, urlopen, URLError
import json
import math
import sys
import os
import socket
import optparse
def arg_parser():
    """Parse the command line and return the parsed options.

    Both ``-p`` (project ID) and ``-a`` (access token) are mandatory. If
    either one is missing, the usage line is printed and the process exits
    with status 1 so that calling shells can detect the failure.

    Returns:
        The ``optparse`` options object; callers read ``projid`` and
        ``accesstoken`` from it.
    """
    parser = optparse.OptionParser()
    parser.add_option('-p', dest='projid', help='Project ID: required')
    parser.add_option('-a', dest='accesstoken', help='Access Token: required')
    options, _args = parser.parse_args()

    # optparse leaves an option at None when it was not given on the
    # command line.
    if options.projid is None or options.accesstoken is None:
        print("Usage: BaseSpaceRunDownloader_vN.py -p <ProjID> -a <AccessToken>")
        sys.exit(1)
    return options
def restrequest(rawrequest):
    """Fetch a URL and return its body decoded as JSON.

    Args:
        rawrequest: Complete request URL, including any query string.

    Returns:
        The object produced by ``json.loads`` on the response body.

    Note:
        When ``urlopen`` raises ``URLError`` the error is printed and the
        process exits with status 1. A body that is not valid JSON is not
        handled here; the ``json.loads`` exception propagates.
    """
    request = Request(rawrequest)
    try:
        response = urlopen(request)
        try:
            json_string = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        json_obj = json.loads(json_string)
    except URLError as e:
        print('Got an error code: %s' % (e,))
        sys.exit(1)
    return json_obj
def downloadrestrequest(rawrequest, path, max_attempts=5, timeout=60,
                        chunk_size=1024 * 1024):
    """Download one URL to ``<ProjID>/<path>``, retrying on network errors.

    The target directory is created if needed. The response is streamed to
    disk in chunks instead of being read into memory in one piece, and the
    output file is only opened once the request has succeeded. Every
    attempt rewrites the file from the beginning.

    Args:
        rawrequest: Complete download URL.
        path: File path relative to the project directory; it is appended to
            the module-level ``ProjID``.
        max_attempts: Maximum number of attempts (at least one is made).
        timeout: Socket timeout in seconds passed to ``urlopen``.
        chunk_size: Number of bytes read from the response per iteration.

    Raises:
        URLError, socket.error: re-raised from the final attempt when every
            attempt has failed. A partially written file may be left behind
            in that case.
    """
    # Plain concatenation (not os.path.join) so that the result always stays
    # below the project directory, whatever ``path`` starts with.
    target = ProjID + os.sep + path
    dirname = os.path.dirname(target)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)

    attempts = max(1, max_attempts)
    for attempt in range(1, attempts + 1):
        try:
            response = urlopen(rawrequest, timeout=timeout)
            try:
                with open(target, 'wb') as outfile:
                    # read() returns an empty bytes object at end of stream.
                    for chunk in iter(lambda: response.read(chunk_size), b''):
                        outfile.write(chunk)
            finally:
                response.close()
            return
        except URLError as e:
            print('Got an error code: %s' % (e,))
            # A loop with a limit replaces the former unbounded recursion,
            # which never terminated cleanly on a permanent error.
            if attempt >= attempts:
                raise
        except socket.error:
            print('Got a socket error: retrying')
            if attempt >= attempts:
                raise
# Script body: list the project's biosamples, collect the datasets that
# reference them, list the files of every dataset, then download each file.
options = arg_parser()
ProjID = options.projid  # also read as a global by downloadrestrequest()
AccessToken = options.accesstoken

hreflist = []         # HrefFiles URL of every dataset
hrefcontentlist = []  # HrefContent (download URL) of every file
pathlist = []         # Path of every file, parallel to hrefcontentlist
samplelist = []       # Id of every biosample in the project

#Step 1: Find the Biosample ID from project id first
#assume fewer than 1000 samples in a project
request = 'https://api.basespace.illumina.com/v2/biosamples?projectid=%s&access_token=%s&limit=1000' %(ProjID,AccessToken)
json_obj = restrequest(request)
for sample in json_obj['Items']:
    samplelist.append(sample['Id'])
samplecsv = ','.join([str(i) for i in samplelist])
print(samplecsv)

#Step 2: Call API to get datasets based on biosample
# The first call is only used to read the total dataset count; the datasets
# themselves are then fetched page by page, 1000 per request.
request = 'https://api.basespace.illumina.com/v2/datasets?inputbiosamples=%s&access_token=%s' %(samplecsv,AccessToken)
json_obj = restrequest(request)
totalCount = int(json_obj['Paging']['TotalCount'])
noffsets = int(math.ceil(float(totalCount)/1000.0))
for page in range(noffsets):
    offset = 1000*page
    request = 'https://api.basespace.illumina.com/v2/datasets?inputbiosamples=%s&access_token=%s&limit=1000&Offset=%s' %(samplecsv,AccessToken,offset)
    json_obj = restrequest(request)
    for dataset in json_obj['Items']:
        hreflist.append(dataset['HrefFiles'])

#Step 3: Get the download filepath (HrefContent) and filename (Path)
#normally two files per dataset in our case
for href in hreflist:
    request = '%s?access_token=%s'%(href,AccessToken)
    json_obj = restrequest(request)
    for fileitem in json_obj['Items']:
        hrefcontentlist.append(fileitem['HrefContent'])
        pathlist.append(fileitem['Path'])

#Step 4: Download every file
# Iterate over the per-file lists. Looping over len(hreflist) (one entry per
# dataset) stopped early whenever a dataset held more than one file.
for hrefcontent, path in zip(hrefcontentlist, pathlist):
    request = '%s?access_token=%s'%(hrefcontent,AccessToken)
    # Show the URL without the access token so the credential does not end
    # up in terminal scrollback or log files.
    print(hrefcontent)
    print('downloading %s' %(path,))
    downloadrestrequest(request, path)
The script can be started with:
python BaseSpaceRunDownloader_v2.py -p $ProjId -a $AccessToken
Notes:
- ProjId – get it from the BaseSpace website, e.g. ProjId 66706640 in this case.
- AccessToken – obtain it by following the reference links above. There is no need to submit an App for review!

Files are saved to a subdirectory named <ProjId>.
Hope it helps!!