Download FASTQ files from Basespace by script

I want to download all FASTQ files of a given project from BaseSpace directly to a Linux server, without first downloading them to a local PC. I found three references:

1. Download files from Illumina’s BaseSpace
https://gist.github.com/lh3/54f535b11a9ee5d3be8e

2. Use the Python Run Downloader
https://help.basespace.illumina.com/articles/tutorials/using-the-python-run-downloader/

3. API
https://developer.basespace.illumina.com/docs/content/documentation/rest-api/api-reference

However, the Python Run Downloader downloads files by Run ID, whereas I want to download them by project. Therefore, I modified the script to meet my requirement.


from urllib2 import Request, urlopen, URLError
import json
import math
import sys
import os
import socket
import optparse

def arg_parser():
    """Parse the command line and return the parsed options.

    Both ``-p`` (project ID) and ``-a`` (access token) are mandatory. When
    either one is missing, a usage line is printed and the process exits
    with a non-zero status so that a calling shell or pipeline can detect
    the failure.

    Returns:
        The ``optparse`` values object exposing ``projid`` and
        ``accesstoken`` as strings.
    """
    parser = optparse.OptionParser()
    parser.add_option('-p', dest='projid', help='Project ID: required')
    parser.add_option('-a', dest='accesstoken', help='Access Token: required')
    options, _args = parser.parse_args()

    # optparse leaves an option at None when it was not supplied.
    if options.projid is None or options.accesstoken is None:
        print("Usage: BaseSpaceRunDownloader_vN.py -p <ProjID> -a <AccessToken>")
        sys.exit(1)

    return options

def restrequest(rawrequest):
    """Fetch the URL *rawrequest* and return its body decoded as JSON.

    Args:
        rawrequest: Complete request URL, including any query string.

    Returns:
        The object produced by ``json.loads`` on the response body.

    On a ``URLError`` the error is printed and the process exits with a
    non-zero status. A body that is not valid JSON raises the exception
    from ``json.loads``; it is not caught here.
    """
    request = Request(rawrequest)

    try:
        response = urlopen(request)
        try:
            json_string = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    except URLError as e:
        print('Got an error code: %s' % (e,))
        sys.exit(1)

    return json.loads(json_string)

def downloadrestrequest(rawrequest, path, projid=None, max_retries=None, timeout=1):
    """Download the URL *rawrequest* into ``<projid>/<path>``.

    The body is streamed to disk in fixed-size chunks, so the whole file is
    never held in memory. A failed attempt is retried from the beginning in
    a loop, and the output file is truncated on each new attempt.

    Args:
        rawrequest: Complete download URL, including any query string.
        path: Output path relative to the project directory; missing parent
            directories are created.
        projid: Name of the top-level output directory. When omitted, the
            module-level ``ProjID`` is used.
        max_retries: Maximum number of retries after the first failed
            attempt. ``None`` (the default) retries until the download
            succeeds.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Raises:
        URLError or socket.error: the last error seen, once *max_retries*
            retries have been used up.
    """
    if projid is None:
        projid = ProjID

    target = str(projid) + os.sep + path
    dirname = os.path.dirname(target)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)

    chunk_size = 1024 * 1024
    failures = 0

    while True:
        try:
            response = urlopen(rawrequest, timeout=timeout)
            try:
                # 'wb' truncates, so a retry discards any partial data.
                with open(target, 'wb') as outfile:
                    while True:
                        chunk = response.read(chunk_size)
                        if not chunk:
                            break
                        outfile.write(chunk)
            finally:
                response.close()
            return
        except (URLError, socket.error) as e:
            if isinstance(e, URLError):
                print('Got an error code: %s' % (e,))
            else:
                print('Got a socket error: retrying')
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise

# Main script: list every FASTQ file of a BaseSpace project and download it.
options = arg_parser()

ProjID = options.projid
AccessToken = options.accesstoken

hreflist = []          # one HrefFiles URL per dataset
hrefcontentlist = []   # one download URL (HrefContent) per file
pathlist = []          # output path per file, parallel to hrefcontentlist
samplelist = []        # biosample IDs that belong to the project

# Step 1: Find the biosample IDs from the project ID.
# Assumes fewer than 1000 samples in a project (single page of results).
request = 'https://api.basespace.illumina.com/v2/biosamples?projectid=%s&access_token=%s&limit=1000' % (ProjID, AccessToken)
json_obj = restrequest(request)
for item in json_obj['Items']:
    samplelist.append(item['Id'])
nSamples = len(samplelist)

if not samplelist:
    # Without this guard the dataset query below would be sent with an
    # empty biosample filter.
    print('No biosamples found for project %s' % ProjID)
    sys.exit(1)

samplecsv = ','.join([str(i) for i in samplelist])
print(samplecsv)

# Step 2: Call the API to get the datasets that belong to those biosamples.
# The first call is only used to learn the total count for paging.
request = 'https://api.basespace.illumina.com/v2/datasets?inputbiosamples=%s&access_token=%s' % (samplecsv, AccessToken)
json_obj = restrequest(request)
totalCount = int(json_obj['Paging']['TotalCount'])
noffsets = int(math.ceil(float(totalCount) / 1000.0))

for index in range(noffsets):
    offset = 1000 * index
    request = 'https://api.basespace.illumina.com/v2/datasets?inputbiosamples=%s&access_token=%s&limit=1000&Offset=%s' % (samplecsv, AccessToken, offset)
    json_obj = restrequest(request)
    for item in json_obj['Items']:
        hreflist.append(item['HrefFiles'])

# Step 3: Get the download URL (HrefContent) and file name (Path) of every
# file in every dataset; normally two files per dataset in our case.
# NOTE(review): this listing is not paged; confirm that the API's default
# page size always covers all files of a dataset.
for href in hreflist:
    request = '%s?access_token=%s' % (href, AccessToken)
    json_obj = restrequest(request)
    for item in json_obj['Items']:
        hrefcontentlist.append(item['HrefContent'])
        pathlist.append(item['Path'])

# Step 4: Download every file. Iterate over the per-file lists rather than
# the per-dataset list, otherwise only the first len(hreflist) files are
# downloaded. The request URL is not printed because it contains the
# access token.
for hrefcontent, path in zip(hrefcontentlist, pathlist):
    request = '%s?access_token=%s' % (hrefcontent, AccessToken)
    print('downloading %s' % (path,))
    downloadrestrequest(request, path)

 

The script is written for Python 2 (it uses urllib2 and print statements) and can be started with:

python BaseSpaceRunDownloader_v2.py -p $ProjId -a $AccessToken

 

Notes:

  1. ProjId – get it from the BaseSpace website, e.g. ProjId: 66706640 in this case.
  2. AccessToken – obtain it as described in the reference links above. There is no need to submit an app for review!

 

Basespace-project

Files are saved to a subdirectory named <ProjId>.

Hope it helps!!

 

 

 

Leave a comment