Created
October 4, 2018 00:55
-
-
Save tsudoko/54e9367b8cf15132edc4e2e60a603632 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Characters that separate srcset candidates: space, tab, LF, FF, CR.
_SPACE = "\u0020\u0009\u000a\u000c\u000d"

# Scanner states used by urls().
_POS_OUTSIDE = 0     # between candidates, skipping whitespace and stray commas
_POS_URL = 1         # collecting the characters of a URL
_POS_DESCRIPTOR = 2  # after a URL, skipping descriptors up to the next comma


def urls(srcset: str):
    """Yield the URL of each image candidate in a ``srcset`` attribute value.

    URLs may themselves contain commas, so the value cannot simply be split
    on ``","``; it is scanned one character at a time with a three-state
    machine instead.

    A URL runs up to the next whitespace character. Trailing commas are
    stripped from it; if it had any, the candidate has no descriptor and the
    next candidate starts right away. Otherwise everything up to the next
    comma is skipped as descriptor text. Descriptors are not validated, and
    parentheses inside them get no special treatment, so a comma inside a
    parenthesised descriptor still ends the candidate.

    Args:
        srcset: The raw attribute value.

    Yields:
        Each candidate URL, in order of appearance, without trailing commas.
    """
    pos = _POS_OUTSIDE
    chars = []  # characters of the URL currently being collected

    for c in srcset:
        if pos == _POS_OUTSIDE:
            # Whitespace and stray commas between candidates are ignored.
            if c in _SPACE or c == ",":
                continue
            chars.append(c)
            pos = _POS_URL
        elif pos == _POS_URL:
            if c not in _SPACE:
                chars.append(c)
                continue
            # Whitespace terminates the URL.
            url = "".join(chars)
            chars.clear()
            # A trailing comma means "no descriptor": the candidate is over.
            pos = _POS_OUTSIDE if url.endswith(",") else _POS_DESCRIPTOR
            yield url.rstrip(",")
        elif pos == _POS_DESCRIPTOR and c == ",":
            pos = _POS_OUTSIDE

    # The input ended while a URL was still being collected.
    if chars:
        yield "".join(chars).rstrip(",")
# Reference for the test markup below (a pinned revision of the
# web-platform-tests srcset parsing test):
# https://raw.githubusercontent.com/web-platform-tests/wpt/704d739c5a54658e6ea09efaa76824b134b36504/html/semantics/embedded-content/the-img-element/srcset/parse-a-srcset-attribute.html
#
# Cases about splitting a srcset value into candidates. test() reads each
# <img>: `srcset` is the input and `data-expect` is the URL that should be
# extracted (an empty `data-expect` means no URL is expected).
# NOTE(review): several cases look identical here yet expect different
# results, so some srcset values presumably contain non-printing characters.
# Edit this literal only in an editor that reveals them.
_splitting_loop = r""" | |
<img srcset='' data-expect=''> | |
<img srcset=',' data-expect=''> | |
<img srcset=',,,' data-expect=''> | |
<img srcset=' data:,a 1x ' data-expect='data:,a'> | |
<img srcset='		data:,a		1x		' data-expect='data:,a'> | |
<img srcset='

data:,a

1x

' data-expect='data:,a'> | |
<img srcset='data:,a1x' data-expect='data:,a1x' data-resolve> | |
<!-- <img srcset='data:,a1x' data-expect='data:,a'> <!-- this one fails on lxml; srcset in the parsed tag is empty --> | |
<img srcset='

data:,a

1x

' data-expect='data:,a'> | |
<img srcset='data:,a1x' data-expect='data:,a1x' data-resolve> | |
<img srcset='data:,a1x' data-expect='data:,a1x' data-resolve> | |
<img srcset='data:,a1x' data-expect='data:,a1x' data-resolve> | |
<img srcset='data:,a' data-expect='data:,a'> | |
<img srcset='data:,a ' data-expect='data:,a'> | |
<img srcset='data:,a ,' data-expect='data:,a'> | |
<img srcset='data:,a,' data-expect='data:,a'> | |
<img srcset='data:,a, ' data-expect='data:,a'> | |
<img srcset='data:,a,,,' data-expect='data:,a'> | |
<img srcset='data:,a,, , ' data-expect='data:,a'> | |
<img srcset=' data:,a' data-expect='data:,a'> | |
<img srcset=',,,data:,a' data-expect='data:,a'> | |
<img srcset=' , ,,data:,a' data-expect='data:,a'> | |
<img srcset=' data:,a' data-expect=' data:,a' data-resolve> | |
<img srcset='data:,a ' data-expect='data:,a ' data-resolve> | |
""" | |
# Cases whose srcset values carry descriptor text after the URL, including
# parentheses, brackets, braces, quotes, backslashes and comment-like
# sequences. test() reads each <img>: `srcset` is the input and `data-expect`
# is the URL that should be extracted (empty means no URL is expected).
# test() only warns, and still counts a success, when urls() returns a
# superset of the expected URL, so cases that need descriptor validation can
# still pass.
_descriptor_tokenizer = r""" | |
<img srcset='data:,a 1x' data-expect='data:,a'> | |
<img srcset='data:,a 1x ' data-expect='data:,a'> | |
<img srcset='data:,a 1x,' data-expect='data:,a'> | |
<img srcset='data:,a ( , data:,b 1x, ), data:,c' data-expect='data:,c'> | |
<img srcset='data:,a ((( , data:,b 1x, ), data:,c' data-expect='data:,c'> | |
<img srcset='data:,a [ , data:,b 1x, ], data:,c' data-expect='data:,b'> | |
<img srcset='data:,a { , data:,b 1x, }, data:,c' data-expect='data:,b'> | |
<img srcset='data:,a " , data:,b 1x, ", data:,c' data-expect='data:,b'> | |
<img srcset='data:,a \,data:;\,b, data:,c' data-expect='data:;\,b'> | |
<img srcset='data:,a, data:,b (' data-expect='data:,a'> | |
<img srcset='data:,a, data:,b ( ' data-expect='data:,a'> | |
<img srcset='data:,a, data:,b (,' data-expect='data:,a'> | |
<img srcset='data:,a, data:,b (x' data-expect='data:,a'> | |
<img srcset='data:,a, data:,b ()' data-expect='data:,a'> | |
<img srcset='data:,a (, data:,b' data-expect=''> | |
<img srcset='data:,a /*, data:,b, data:,c */' data-expect='data:,b'> | |
<img srcset='data:,a //, data:,b' data-expect='data:,b'> | |
""" | |
def test():
    """Run urls() over the embedded test markup and print a report.

    For each markup document, every ``<img>`` is checked: the ``srcset``
    attribute is fed to urls() and the set of results is compared with the
    ``data-expect`` attribute (an empty value means no URL is expected).

    * An exact match counts as a success.
    * A superset of the expected URL also counts as a success, but prints a
      ``(w)`` line, because extra strictly-invalid URLs are tolerated.
    * Anything else prints an ``(F)`` line.

    A ``success/total`` summary is printed after each document.
    Requires the third-party ``bs4`` package.
    """
    import bs4  # imported here so only the self-test depends on bs4

    for _html in (_splitting_loop, _descriptor_tokenizer):
        # Name the parser explicitly. Without it bs4 picks whichever parser
        # is installed and emits GuessedAtParserWarning, so results could
        # differ between machines. "html.parser" ships with Python.
        soup = bs4.BeautifulSoup(_html, "html.parser")
        total = 0
        success = 0

        for img in soup.find_all("img"):
            total += 1
            srcset = img["srcset"]
            expected_url = img["data-expect"]

            parsed = set(urls(srcset))
            expect = {expected_url} if expected_url else set()

            if expect == parsed:
                success += 1
            elif expect.issubset(parsed):
                # we don't mind getting some strictly invalid descriptors/URLs
                # as long as the valid ones are extracted correctly
                print(f"{total:2d}", "(w) extracted too much:", repr(srcset), "→", parsed)
                success += 1
            else:
                print(
                    f"{total:2d}", "(F)",
                    repr(srcset),
                    "→",
                    parsed,
                    "(expected", [expected_url], ")"
                )

        print(f"{success}/{total}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment