Last active
September 28, 2022 16:37
-
-
Save TimoRoth/4c0eded7004b06659e53345e3f64dbdf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import tarfile | |
import requests | |
import io | |
class SeekableHttpStream:
    """Read-only, seekable file-like object backed by HTTP range requests.

    The object exposes ``read``/``tell``/``seek`` so it can be handed to
    consumers that expect a seekable binary file (the script in this file
    passes it to ``tarfile.open(fileobj=...)``).  Every sized read that
    cannot be answered from the internal buffer downloads the missing bytes
    plus up to ``buf_size`` bytes of read-ahead in a single request.

    Attributes:
        transferred: Total number of body bytes downloaded so far
            (informational only).
    """

    def __init__(self, url, buf_size=10 * 1024 * 1024):
        """Open *url* and determine the remote file size with a HEAD request.

        Args:
            url: Address of the remote file.
            buf_size: Number of read-ahead bytes requested on every fetch.

        Raises:
            requests.HTTPError: If the HEAD request reports an error status.
            KeyError: If the response carries no ``content-length`` header.
        """
        self._url = url
        self._buf_size = buf_size
        self._pos = 0        # current logical read position
        self._buf = None     # read-ahead bytes, or None when there are none
        self._buf_pos = -1   # absolute file offset of self._buf[0]
        self.transferred = 0
        self._ses = requests.Session()
        # Session.head() does not follow redirects unless asked to; without
        # this the size would be taken from the redirect response itself.
        r = self._ses.head(self._url, allow_redirects=True)
        r.raise_for_status()
        self._size = int(r.headers["content-length"])

    def _fetch(self, start, end=None):
        """Download bytes ``start..end`` (inclusive) and return them.

        ``end=None`` requests everything from *start* to the end of the file.
        The downloaded byte count is added to ``transferred``.
        """
        last = "" if end is None else end
        r = self._ses.get(self._url, headers={"Range": f"bytes={start}-{last}"})
        r.raise_for_status()
        data = r.content
        self.transferred += len(data)
        if r.status_code == 200:
            # A plain 200 (instead of 206 Partial Content) means the server
            # ignored the Range header and sent the whole file; cut out the
            # part that was actually asked for.
            data = data[start:] if end is None else data[start:end + 1]
        return data

    def read(self, size=-1):
        """Return up to *size* bytes from the current position.

        A negative *size* or ``None`` reads everything up to the end of the
        file.  ``b""`` is returned once the position is at or beyond the end.

        Raises:
            requests.HTTPError: If a range request reports an error status.
        """
        # If the request just asks for the entire file, just download the whole range:
        if size is None or size < 0:
            self._buf = None
            if self._pos >= self._size:
                # Nothing left; a Range request starting here would be
                # unsatisfiable, so answer locally.
                return b""
            data = self._fetch(self._pos)
            # Advance by what was really received rather than by a header value.
            self._pos += len(data)
            return data

        # Calculate position in our buffer, and use any matching data:
        offset = self._pos - self._buf_pos
        if self._buf and 0 <= offset < len(self._buf):
            res = self._buf[offset:offset + size]
        else:
            self._buf = None
            res = b""

        # Check if the entire request was served from buffer:
        if len(res) >= size:
            self._pos += len(res)
            return res

        # Calculate amount of remaining data to fetch
        rem = size - len(res)
        newpos = self._pos + len(res)
        if newpos >= self._size:
            # End of file reached: hand back whatever the buffer supplied
            # (possibly nothing) without issuing an unsatisfiable request.
            self._pos = newpos
            return res

        # Fetch remaining data plus a new full buffer
        data = self._fetch(newpos, newpos + rem + self._buf_size - 1)

        # Fill remaining request data and buffer
        res += data[:rem]
        self._buf = data[rem:]

        # Both buffer and reading are at the same new pos now
        self._pos += len(res)
        self._buf_pos = self._pos
        return res

    def tell(self):
        """Return the current absolute read position."""
        return self._pos

    def seek(self, offset, whence=io.SEEK_SET):
        """Move the read position and return the new absolute position.

        Args:
            offset: Byte offset, interpreted relative to *whence*.
            whence: ``io.SEEK_SET``, ``io.SEEK_CUR`` or ``io.SEEK_END``.

        Raises:
            ValueError: If *whence* is not one of the three constants, or if
                the resulting position would be negative.
        """
        if whence == io.SEEK_SET:
            newpos = offset
        elif whence == io.SEEK_CUR:
            newpos = self._pos + offset
        elif whence == io.SEEK_END:
            newpos = self._size + offset
        else:
            raise ValueError(f"invalid whence value: {whence!r}")
        if newpos < 0:
            # Reject before touching state so a failed seek changes nothing.
            raise ValueError(f"negative seek position: {newpos}")
        self._pos = newpos
        return self._pos

    def readable(self):
        """Return True: the stream supports ``read``."""
        return True

    def seekable(self):
        """Return True: the stream supports ``seek`` and ``tell``."""
        return True

    def close(self):
        """Drop the read-ahead buffer and close the underlying HTTP session."""
        self._buf = None
        self._ses.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
# Demo: extract a single member from a remote tar archive through the
# range-request stream, then report how much of the file was downloaded.
s = SeekableHttpStream("https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/centerlines/RGI62/b_160/L1/RGI60-02/RGI60-02.00.tar")
# Mode "r:" opens the archive for reading without compression; the `with`
# block makes sure the TarFile is closed once the member has been read.
with tarfile.open(fileobj=s, mode="r:") as t:
    m = t.getmember('RGI60-02.00/RGI60-02.00751.tar.gz')
    print("Got member: " + m.name)
    f = t.extractfile(m)
    d = f.read()
print(len(d))
# Downloaded bytes as a fraction of the remote file size.
print(s.transferred/s._size)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment