Galaxy Download Scripts
@mtekman · Last active October 8, 2020

#!/usr/bin/env python3
'''This script downloads datasets from public or user histories, using
a variety of different strategies, namely:

* If the user provides a URL: either import the history, or select
  it publicly (but with restricted access, i.e. slow dataset matching)
* If the user provides a title: either select one from the user's list,
  or search the public histories for a matching title (again with
  restricted access).

--MCT
'''
__VERSION__ = "0.3"

import json
import sys
from os import getenv
from time import sleep
import requests
from bioblend import galaxy

def help_text():
    '''Generic help text.'''
    print('''
    KEY=<your-api-key> python %s <HIST_COMMAND> --matching STR

    where HIST_COMMAND is one of:
      --url-import HIST_URL      Slow to import history, fast to match datasets.
      --url-public HIST_URL      Fast to retrieve history, slow to match datasets.
      --title-public HIST_TITLE  Fast to retrieve history, slow to match datasets.
                                 Uses the first matching public history.
      --title-user HIST_TITLE    Fast to retrieve history, fast to match datasets.
                                 Requires the user to already own the history.
                                 Uses the first matching user history.

    KEY is your Galaxy API key from https://usegalaxy.eu/user/api_key
    HIST_URL and HIST_TITLE must be exact strings if used.
    STR is a regex when used with the fast match methods; the slow match
    methods treat it as a plain substring.
    ''' % sys.argv[0])
    sys.exit(-1)

def parse_args():
    '''Quickly parse args; no need for argparse.'''
    if len(sys.argv) != 5:
        help_text()
    hcomm = sys.argv[1]
    hdata = sys.argv[2]
    mcomm = sys.argv[3]
    mdata = sys.argv[4]
    if not (hcomm in ("--title-user", "--url-import",
                      "--url-public", "--title-public")
            and mcomm == "--matching"):
        help_text()
    return hcomm, hdata, mdata

def get_request(hist_url):
    '''GET request wrapper.'''
    try:
        return requests.get(hist_url, timeout=3)
    except requests.ConnectionError:
        print("Failed to get contents for url: %s" % hist_url)
        sys.exit(-1)

def get_hid_from_public_url(hist_url):
'''Return history ID for a public Galaxy URL.'''
req = get_request(hist_url)
## Search for history id in page source
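    ## For illustration (hypothetical id): the embedded page config contains
    ## a line resembling `data: { ... id : "1a2b3c4d5e6f" ... }`, so splitting
    ## on ' id : ' and then on '"' recovers the history id.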
lines = str(req.content).splitlines()
hhid = None
for line in lines:
if "data: {" in line:
hhid = line.split(' id : ')[1].split('"')[1]
break
return hhid

def get_hid_from_public_title(hist_title):
    '''Return the first history ID for a public Galaxy history with title HIST_TITLE.
    Method suggested by @mvdbeek via @simonbray.'''
req = get_request("https://usegalaxy.eu/api/histories/published?q=slug&qv=%s" % hist_title)
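    ## For illustration (hypothetical response): the endpoint returns a JSON
    ## array of matching histories, e.g.
    ##   [{"id": "1a2b3c4d5e6f", "name": "2020-09-28-update", ...}]
    ## and we take the id of the first match.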
data = json.loads(req.content)[0]
return data['id']

def get_hid_from_user_title(hist_title):
'''Return first history ID for a user Galaxy history with exact title HIST_TITLE.'''
res = hc.get_histories(name=hist_title)
if len(res) == 0:
print("No histories with that title found in your histories")
sys.exit(-1)
return res[0]['id']

def get_hid_by_import(hist_url, tries_max=100, tries_interval=5):
    '''If given a public HIST_URL, import it for the user and return the ID.

    TRIES_MAX = maximum number of tries to find the history
    TRIES_INTERVAL = sleep interval in seconds between tries

    This is incredibly slow because the import_history function is async, so
    we need to manually poll the user history periodically to check that the
    import is complete.

    Avoid this method entirely if you are happy to use a non-bioblend way of
    getting the history_id from a URL (see: get_hid_from_public_url).
    '''
print(hc.import_history(url=hist_url)['message'])
hist_title = hist_url.split('/')[-1]
# import_history is async, so we need to manually
# poll until we're sure it's imported
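    # (with the defaults: up to 100 polls at 5-second intervals, so the
    #  import is given at most ~8 minutes to appear)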
tries = 0
while tries < tries_max:
tries += 1
        # Look for the imported copy, which Galaxy names "imported: <title>"
        hist = gi.histories.get_histories(name="imported: " + hist_title)
if len(hist) > 0:
break
sleep(tries_interval)
if tries >= tries_max:
print("Unable to find imported history")
sys.exit(-1)
hist = hist[0]
print("Found history '%s' after %s tries (%s seconds)" % (
hist['name'], tries, tries * tries_interval))
return hist['id']

def match_datasets_fast(hhid, name_filter):
    '''Grab the dataset ids for all datasets in history HHID whose name
    matches the string or regex NAME_FILTER.

    This ONLY works if the user owns the history (e.g. after importing it
    via `get_hid_by_import`); otherwise it fails with a 403 error,
    "HistoryDatasetAssociation is not accessible by user".

    I find this behaviour strange, since it can be bypassed by slowly
    looping through the history and checking the dataset names manually.
    I will file a bug.
    '''
if name_filter == "None":
name_filter = None
datasets = hc.show_matching_datasets(hhid, name_filter=name_filter)
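    ## For illustration: a NAME_FILTER of ".*Concat.*" keeps every dataset
    ## whose name contains "Concat" (bioblend treats the filter as a regex).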
if len(datasets) == 0:
print("No datasets for '%s' found, try a regex." % name_filter)
sys.exit(-1)
return [(x['id'],x['name']) for x in datasets]

## M A I N ##
if __name__ == "__main__":
    history_command, url_or_title, dataset_match = parse_args()
    # Read the API key from the environment rather than hardcoding it
    oskey = getenv("KEY")
    if oskey is None:
        help_text()
    gi = galaxy.GalaxyInstance(url="https://usegalaxy.eu", key=oskey)
    dc = galaxy.datasets.DatasetClient(gi)
    hc = galaxy.histories.HistoryClient(gi)
# Get history ID
HID = None
if history_command == "--title-public":
HID = get_hid_from_public_title(url_or_title)
elif history_command == "--title-user":
HID = get_hid_from_user_title(url_or_title)
elif history_command == "--url-public":
HID = get_hid_from_public_url(url_or_title)
elif history_command == "--url-import":
HID = get_hid_by_import(url_or_title)
if HID is None:
print("Could not determine history id")
sys.exit(-1)
# Match datasets
if history_command in ("--url-import", "--title-user"):
## We can run the fast match function if the user
## owns the history
print("Using fast matching method")
set_names_ids = match_datasets_fast(HID, dataset_match)
lset = len(set_names_ids)
for num, se in enumerate(set_names_ids):
se_id, se_name = se
print("Downloading: '%s' [%d / %d] " % (se_name, num + 1, lset),
flush=True)
            dc.download_dataset(se_id, ".", use_default_filename=True)
else:
## We must slowly iterate through the history
print("Using slow matching method")
tmp = hc.show_history(HID, contents=True)
datasets = list(filter(lambda x: ('state' in x) and
(x['state'] == 'ok') and
(dataset_match in x['name']), tmp))
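        ## For illustration: `--matching illumina` keeps any finished ('ok')
        ## dataset whose name contains the substring "illumina".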
lset = len(datasets)
for num, se in enumerate(datasets):
print("Downloading: '%s' [%d / %d] " % (se['name'], num + 1, lset),
flush=True)
            dc.download_dataset(se['id'], ".", use_default_filename=True)

# Tests
# [public histories, slow matching]
# ./galaxy_download_datasets.py --title-public 2020-09-28-update --matching SnpEff
# ./galaxy_download_datasets.py --url-public \
#     https://usegalaxy.eu/u/sars-cov2-bot/h/2020-09-28-update --matching illumina
# [private histories, fast matching]
# ./galaxy_download_datasets.py --title-user \
#     "STARSolo noconcat vs concat" --matching ".*Concat.*"
# ./galaxy_download_datasets.py --url-import \
#     https://usegalaxy.eu/u/sars-cov2-bot/h/2020-09-28-update --matching illumina
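#
# Each of the above assumes the API key is exported in the environment, e.g.:
# KEY=<your-api-key> ./galaxy_download_datasets.py --title-public 2020-09-28-update --matching SnpEff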