Galaxy Download Scripts
@mtekman · Last active October 8, 2020

#!/usr/bin/env python3
'''This script downloads datasets from public or user histories, using
a variety of different strategies, namely:

* If the user provides a URL: either import the history, or select
  it publicly (but with restricted access, i.e. slow dataset matching)
* If the user provides a title: either select one from the user's list,
  or search the public histories for a matching title (again with
  restricted access).

--MCT
'''
__VERSION__ = "0.3"

import json
import sys
from os import getenv
from time import sleep
import requests
from bioblend import galaxy

def help_text():
    '''Generic help text.'''
    print('''
    KEY=<your-api-key> python %s <HIST_COMMAND> --matching STR

    where HIST_COMMAND is one of:
      --url-import HIST_URL      Slow to import history, fast to match datasets.
      --url-public HIST_URL      Fast to retrieve history, slow to match datasets.
      --title-public HIST_TITLE  Fast to retrieve history, slow to match datasets.
                                 Uses the first matching public history.
      --title-user HIST_TITLE    Fast to retrieve history, fast to match datasets.
                                 Requires the user to already own the history.
                                 Uses the first matching user history.

    KEY is your Galaxy API key from https://usegalaxy.eu/user/api_key
    HIST_URL and HIST_TITLE must be exact strings if used.
    STR is a regex when used with the fast match methods; the slow match
    methods treat it as a plain substring.
    ''' % sys.argv[0])
    sys.exit(-1)

def parse_args():
    '''Quickly parse args; no need for argparse.'''
    if len(sys.argv) != 5:
        help_text()
    hcomm = sys.argv[1]
    hdata = sys.argv[2]
    mcomm = sys.argv[3]
    mdata = sys.argv[4]
    if not (hcomm in ("--title-user", "--url-import",
                      "--url-public", "--title-public")
            and mcomm == "--matching"):
        help_text()
    return hcomm, hdata, mdata

def get_request(hist_url):
    '''GET request wrapper.'''
    try:
        return requests.get(hist_url, timeout=3)
    except requests.ConnectionError:
        print("Failed to get contents for url: %s" % hist_url)
        sys.exit(-1)

def get_hid_from_public_url(hist_url):
'''Return history ID for a public Galaxy URL.'''
req = get_request(hist_url)
## Search for history id in page source
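    ## For illustration (hypothetical id): the embedded page config contains
    ## a line resembling `data: { ... id : "1a2b3c4d5e6f" ... }`, so splitting
    ## on ' id : ' and then on '"' recovers the history id.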
lines = str(req.content).splitlines()
hhid = None
for line in lines:
if "data: {" in line:
hhid = line.split(' id : ')[1].split('"')[1]
break
return hhid

def get_hid_from_public_title(hist_title):
    '''Return the first history ID for a public Galaxy history with title HIST_TITLE.
    Method suggested by @mvdbeek via @simonbray.'''
req = get_request("https://usegalaxy.eu/api/histories/published?q=slug&qv=%s" % hist_title)
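    ## For illustration (hypothetical response): the endpoint returns a JSON
    ## array of matching histories, e.g.
    ##   [{"id": "1a2b3c4d5e6f", "name": "2020-09-28-update", ...}]
    ## and we take the id of the first match.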
data = json.loads(req.content)[0]
return data['id']

def get_hid_from_user_title(hist_title):
'''Return first history ID for a user Galaxy history with exact title HIST_TITLE.'''
res = hc.get_histories(name=hist_title)
if len(res) == 0:
print("No histories with that title found in your histories")
sys.exit(-1)
return res[0]['id']

def get_hid_by_import(hist_url, tries_max=100, tries_interval=5):
    '''If given a public HIST_URL, import it for the user and return the ID.

    TRIES_MAX = maximum number of tries to find the history
    TRIES_INTERVAL = sleep interval in seconds between tries

    This is incredibly slow because the import_history function is async, so
    we need to manually poll the user history periodically to check that the
    import is complete.

    Avoid this method entirely if you are happy to use a non-bioblend way of
    getting the history_id from a URL (see: get_hid_from_public_url).
    '''
print(hc.import_history(url=hist_url)['message'])
hist_title = hist_url.split('/')[-1]
# import_history is async, so we need to manually
# poll until we're sure it's imported
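    # (with the defaults: up to 100 polls at 5-second intervals, so the
    #  import is given at most ~8 minutes to appear)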
tries = 0
while tries < tries_max:
tries += 1
        # Look for the imported copy, which Galaxy names "imported: <title>"
        hist = gi.histories.get_histories(name="imported: " + hist_title)
if len(hist) > 0:
break
sleep(tries_interval)
if tries >= tries_max:
print("Unable to find imported history")
sys.exit(-1)
hist = hist[0]
print("Found history '%s' after %s tries (%s seconds)" % (
hist['name'], tries, tries * tries_interval))
return hist['id']

def match_datasets_fast(hhid, name_filter):
    '''Grab the dataset ids for all datasets in history HHID whose name
    matches the string or regex NAME_FILTER.

    This ONLY works if the user owns the history (e.g. after importing it
    via `get_hid_by_import`); otherwise it fails with a 403 error,
    "HistoryDatasetAssociation is not accessible by user".

    I find this behaviour strange, since it can be bypassed by slowly
    looping through the history and checking the dataset names manually.
    I will file a bug.
    '''
if name_filter == "None":
name_filter = None
datasets = hc.show_matching_datasets(hhid, name_filter=name_filter)
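    ## For illustration: a NAME_FILTER of ".*Concat.*" keeps every dataset
    ## whose name contains "Concat" (bioblend treats the filter as a regex).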
if len(datasets) == 0:
print("No datasets for '%s' found, try a regex." % name_filter)
sys.exit(-1)
return [(x['id'],x['name']) for x in datasets]

## M A I N ##
if __name__ == "__main__":
    history_command, url_or_title, dataset_match = parse_args()
    # Read the API key from the environment rather than hardcoding it
    oskey = getenv("KEY")
    if oskey is None:
        help_text()
    gi = galaxy.GalaxyInstance(url="https://usegalaxy.eu", key=oskey)
    dc = galaxy.datasets.DatasetClient(gi)
    hc = galaxy.histories.HistoryClient(gi)
# Get history ID
HID = None
if history_command == "--title-public":
HID = get_hid_from_public_title(url_or_title)
elif history_command == "--title-user":
HID = get_hid_from_user_title(url_or_title)
elif history_command == "--url-public":
HID = get_hid_from_public_url(url_or_title)
elif history_command == "--url-import":
HID = get_hid_by_import(url_or_title)
if HID is None:
print("Could not determine history id")
sys.exit(-1)
# Match datasets
if history_command in ("--url-import", "--title-user"):
## We can run the fast match function if the user
## owns the history
print("Using fast matching method")
set_names_ids = match_datasets_fast(HID, dataset_match)
lset = len(set_names_ids)
for num, se in enumerate(set_names_ids):
se_id, se_name = se
print("Downloading: '%s' [%d / %d] " % (se_name, num + 1, lset),
flush=True)
            dc.download_dataset(se_id, ".", use_default_filename=True)
else:
## We must slowly iterate through the history
print("Using slow matching method")
tmp = hc.show_history(HID, contents=True)
datasets = list(filter(lambda x: ('state' in x) and
(x['state'] == 'ok') and
(dataset_match in x['name']), tmp))
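        ## For illustration: `--matching illumina` keeps any finished ('ok')
        ## dataset whose name contains the substring "illumina".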
lset = len(datasets)
for num, se in enumerate(datasets):
print("Downloading: '%s' [%d / %d] " % (se['name'], num + 1, lset),
flush=True)
            dc.download_dataset(se['id'], ".", use_default_filename=True)

# Tests
# [public histories, slow matching]
# ./galaxy_download_datasets.py --title-public 2020-09-28-update --matching SnpEff
# ./galaxy_download_datasets.py --url-public \
#     https://usegalaxy.eu/u/sars-cov2-bot/h/2020-09-28-update --matching illumina
# [private histories, fast matching]
# ./galaxy_download_datasets.py --title-user \
#     "STARSolo noconcat vs concat" --matching ".*Concat.*"
# ./galaxy_download_datasets.py --url-import \
#     https://usegalaxy.eu/u/sars-cov2-bot/h/2020-09-28-update --matching illumina
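#
# Each of the above assumes the API key is exported in the environment, e.g.:
# KEY=<your-api-key> ./galaxy_download_datasets.py --title-public 2020-09-28-update --matching SnpEff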