Last active
March 25, 2025 17:12
-
-
Save oconnor663/c69cb4dbffb9b13bbced3fe8ce2181ac to your computer and use it in GitHub Desktop.
A demonstration of GIL-releasing hashing leading to a data race in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
import hashlib | |
import threading | |
def hash_buf(buf):
    """Return the SHA-256 digest of *buf* as a hexadecimal string.

    *buf* is handed straight to ``hashlib.sha256``; the callers in this
    file pass a ``bytearray``.
    """
    hasher = hashlib.sha256(buf)
    return hasher.hexdigest()
def hash_buf_on_bg_thread(buf):
    """Start hashing *buf* on a new thread and return without waiting.

    Returns a ``(thread, result)`` pair. ``result`` is a one-element list
    whose only slot stays ``None`` until the worker stores the hex digest
    there, so read ``result[0]`` only after ``thread.join()``.
    """
    # A one-element list serves as a mutable cell the worker can write into.
    outbox = [None]

    def worker():
        outbox[0] = hash_buf(buf)

    bg_thread = threading.Thread(target=worker)
    bg_thread.start()
    return bg_thread, outbox
def main():
    """Race two buffer writes against a background hash and report.

    Hashes a zeroed one-megabyte buffer, starts a second hash of the same
    buffer on another thread, sets the first and last bytes while that
    thread may still be running, hashes once more, and compares the three
    digests.
    """
    data = bytearray(1_000_000)

    # Baseline digest, taken while every byte is still zero.
    digest_before = hash_buf(data)
    print("hash before:", digest_before)

    # Hash the same buffer again, this time on a background thread.
    worker, outbox = hash_buf_on_bg_thread(data)

    # Two unsynchronized stores, front then back, issued while the worker
    # is hashing. The worker will probably not see the first one (it may
    # already have read the start of the buffer, or this core may not have
    # flushed the store yet) but will see the second.
    data[0] = 1
    data[-1] = 1

    # Wait for the worker, then show the digest it computed.
    worker.join()
    digest_during = outbox[0]
    print("hash in bg: ", digest_during)

    # Digest of the buffer once both writes have definitely landed.
    digest_after = hash_buf(data)
    print("hash after: ", digest_after)
    print()

    torn_read = digest_during not in (digest_before, digest_after)
    if torn_read:
        # The hashing C code saw one write but not the other. Because it
        # releases the GIL for the whole hashing loop (so the loop has no
        # synchronization points), there must be a data race.
        print("This is a data race! Technically undefined behavior!")
    else:
        # The hashing C code saw both writes or neither, which is also
        # what we could see if it held the GIL across the hashing loop.
        print("This is fine.")
# Run the demonstration only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment