Galaxy Download Scripts
#!/usr/bin/env python3
__VERSION__ = "0.3"
'''This script downloads datasets from public or user histories, using
a variety of different strategies, namely:
 * If the user provides a URL: Either import the history, or select
   it publicly (but with restricted access, i.e. slow dataset matching)
 * If the user provides a title: Either select one from the user's list,
   or search the public histories for a matching title (again with
   restricted access).
--MCT
'''
import json
import sys
from os import getenv
from time import sleep

import requests
from bioblend import galaxy
def help_text():
    '''Print generic usage text and exit.'''
    print('''
 KEY=<your-api-key> python %s <HIST_COMMAND> --matching STR

 where HIST_COMMAND is one of:
   --url-import HIST_URL      Slow to import history, fast to match datasets.
   --url-public HIST_URL      Fast to retrieve history, slow to match datasets.
   --title-public HIST_TITLE  Fast to retrieve history, slow to match datasets.
                              Uses first matching public history.
   --title-user HIST_TITLE    Fast to retrieve history, fast to match datasets.
                              Requires user to already own the history.
                              Uses first matching user history.

 KEY is your galaxy api key from https://usegalaxy.eu/user/api_key
 HIST_URL and HIST_TITLE must be exact strings if used.
 STR is a regex when used with the fast match methods; otherwise it is
 an exact substring for the slow match methods.
''' % sys.argv[0])
    sys.exit(-1)
def parse_args():
    '''Quickly parse args; no need for argparse.'''
    if len(sys.argv) != 5:
        help_text()
    hcomm = sys.argv[1]
    hdata = sys.argv[2]
    mcomm = sys.argv[3]
    mdata = sys.argv[4]
    if not (hcomm in ("--title-user", "--url-import",
                      "--url-public", "--title-public")
            and mcomm == "--matching"):
        help_text()
    return hcomm, hdata, mdata
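# Illustrative call (hypothetical history title and pattern), showing how
# the four positional arguments map onto the returned tuple:
#
#   $ KEY=... python galaxy_download_datasets.py --title-user "My RNA-seq" --matching ".*bam$"
#   # parse_args() -> ("--title-user", "My RNA-seq", ".*bam$")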
def get_request(hist_url):
    '''GET request wrapper that exits on connection failure.'''
    try:
        return requests.get(hist_url, timeout=3)
    except requests.ConnectionError:
        print("Failed to get contents for url: %s" % hist_url)
        sys.exit(-1)
def get_hid_from_public_url(hist_url):
    '''Return history ID for a public Galaxy URL.'''
    req = get_request(hist_url)
    ## Search for the history id in the page source
    lines = str(req.content).splitlines()
    hhid = None
    for line in lines:
        if "data: {" in line:
            hhid = line.split(' id : ')[1].split('"')[1]
            break
    return hhid
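# The split logic above assumes the page source embeds the history id in a
# line shaped roughly like the following (illustrative only; the real markup
# may differ between Galaxy versions):
#
#   data: { id : "1cd8e2f6b131e891", ... }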
def get_hid_from_public_title(hist_title):
    '''Return the first history ID for a public Galaxy history with title HIST_TITLE.
    Method suggested by @mvdbeek via @simonbray.'''
    req = get_request("https://usegalaxy.eu/api/histories/published?q=slug&qv=%s" % hist_title)
    data = json.loads(req.content)[0]
    return data['id']
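# The published-histories endpoint returns a JSON list of history records,
# of which only the first match's 'id' is used. A response might look
# something like this (field values are illustrative only):
#
#   [{"id": "1cd8e2f6b131e891", "name": "2020-09-28-update", ...}, ...]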
def get_hid_from_user_title(hist_title):
    '''Return the first history ID for a user Galaxy history with the exact title HIST_TITLE.'''
    res = hc.get_histories(name=hist_title)
    if len(res) == 0:
        print("No histories with that title found in your histories")
        sys.exit(-1)
    return res[0]['id']
def get_hid_by_import(hist_url, tries_max=100, tries_interval=5):
    '''If given a public HIST_URL, import it for the user and return the ID.
    TRIES_MAX = maximum number of tries to find the history
    TRIES_INTERVAL = sleep interval in seconds between tries

    This is incredibly slow because the import_history function is async, so
    we need to manually poll the user history periodically to check that the
    import is complete.

    Avoid this method entirely if you are happy to use a non-bioblend solution
    of getting the history_id from a URL (see: get_hid_from_public_url).
    '''
    print(hc.import_history(url=hist_url)['message'])
    hist_title = hist_url.split('/')[-1]
    # import_history is async, so we need to manually
    # poll until we're sure it's imported
    tries = 0
    hist = []
    while tries < tries_max:
        tries += 1
        # Imported histories are prefixed with "imported: "
        hist = hc.get_histories(name="imported: " + hist_title)
        #hist = list(filter(lambda x: x['name'].startswith(hist_title), hl))[0]
        if len(hist) > 0:
            break
        sleep(tries_interval)
    if len(hist) == 0:
        print("Unable to find imported history")
        sys.exit(-1)
    hist = hist[0]
    print("Found history '%s' after %s tries (%s seconds)" % (
        hist['name'], tries, tries * tries_interval))
    return hist['id']
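# Usage sketch (hypothetical URL; tune the polling knobs to taste):
#
#   hid = get_hid_by_import("https://usegalaxy.eu/u/someuser/h/some-history",
#                           tries_max=20, tries_interval=10)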
def match_datasets_fast(hhid, name_filter):
    '''Grab the dataset ids for all datasets within history HHID whose
    names match the string or regex NAME_FILTER.

    This ONLY works if the user has imported the history (e.g.
    via `get_hid_by_import`), otherwise it errors out with a 403 error:
    "HistoryDatasetAssociation is not accessible by user"

    I find this behaviour to be strange, since it can be bypassed by
    slowly looping through the history and checking the dataset names
    manually. I will file a bug.
    '''
    # A literal "None" on the command line means "match everything"
    if name_filter == "None":
        name_filter = None
    datasets = hc.show_matching_datasets(hhid, name_filter=name_filter)
    if len(datasets) == 0:
        print("No datasets for '%s' found, try a regex." % name_filter)
        sys.exit(-1)
    return [(x['id'], x['name']) for x in datasets]
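# The return value is a list of (id, name) tuples, e.g. (ids illustrative):
#
#   [("f2db41e1fa331b3e", "SnpEff on data 12"),
#    ("33b43b4e7093c91f", "SnpEff on data 13")]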
## M A I N ##
if __name__ == "__main__":
    history_command, url_or_title, dataset_match = parse_args()
    # Fall back to a baked-in key if KEY is not set in the environment
    oskey = getenv("KEY", default='ad77d47874dfdc5bf26f6a016c9ed375')
    gi = galaxy.GalaxyInstance(url="https://usegalaxy.eu", key=oskey)
    dc = galaxy.datasets.DatasetClient(gi)
    hc = galaxy.histories.HistoryClient(gi)
    # Get history ID
    HID = None
    if history_command == "--title-public":
        HID = get_hid_from_public_title(url_or_title)
    elif history_command == "--title-user":
        HID = get_hid_from_user_title(url_or_title)
    elif history_command == "--url-public":
        HID = get_hid_from_public_url(url_or_title)
    elif history_command == "--url-import":
        HID = get_hid_by_import(url_or_title)
    if HID is None:
        print("Could not determine history id")
        sys.exit(-1)
    # Match datasets
    if history_command in ("--url-import", "--title-user"):
        ## We can run the fast match function if the user
        ## owns the history
        print("Using fast matching method")
        set_names_ids = match_datasets_fast(HID, dataset_match)
        lset = len(set_names_ids)
        for num, se in enumerate(set_names_ids):
            se_id, se_name = se
            print("Downloading: '%s' [%d / %d]" % (se_name, num + 1, lset),
                  flush=True)
            dc.download_dataset(se_id, ".", use_default_filename=True)
    else:
        ## We must slowly iterate through the history,
        ## matching dataset names by exact substring
        print("Using slow matching method")
        tmp = hc.show_history(HID, contents=True)
        datasets = list(filter(lambda x: ('state' in x) and
                               (x['state'] == 'ok') and
                               (dataset_match in x['name']), tmp))
        lset = len(datasets)
        for num, se in enumerate(datasets):
            print("Downloading: '%s' [%d / %d]" % (se['name'], num + 1, lset),
                  flush=True)
            dc.download_dataset(se['id'], ".", use_default_filename=True)
# Tests
# [public histories, slow matching]
# ./galaxy_download_datasets.py --title-public 2020-09-28-update --matching SnpEff
# ./galaxy_download_datasets.py --url-public \
#       https://usegalaxy.eu/u/sars-cov2-bot/h/2020-09-28-update --matching illumina
# [private histories, fast matching]
# ./galaxy_download_datasets.py --title-user \
#       "STARSolo noconcat vs concat" --matching ".*Concat.*"
# ./galaxy_download_datasets.py --url-import \
#       https://usegalaxy.eu/u/sars-cov2-bot/h/2020-09-28-update --matching illumina
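# Note: download_dataset's second positional argument is bioblend's file_path;
# passing "." with use_default_filename=True writes each dataset into the
# current directory under its server-side name. To collect downloads elsewhere,
# point file_path at another directory (hypothetical path):
#
#   dc.download_dataset(dsid, "downloads/", use_default_filename=True)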