Created: September 11, 2020 07:06
Save sharnoff/5dc5000fca80a2ab0f78b2786b75c2eb to your computer and use it in GitHub Desktop.
Triggering code for tensorflow/tensorflow issue #43119
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf

# These are the "givens" from the issue - essentially they're set to mimic
# the actual values that I ran into this issue with.
# (Note: the original gist imported tensorflow twice; the redundant second
# import has been dropped.)

# Shuffle-buffer size chosen to be large enough that the requested buffer
# allocation overflows inside TensorFlow (the point of this repro).
ABSURDLY_LARGE_NUMBER = 2 * 10**17
# Scratch file the repro writes its tiny dataset to.
YOUR_CSV_PATH = "temp-data.csv"
# Name of the label column in the generated CSV.
LABEL_COLUMN = "label"
# Number of numeric input columns; the columns are simply named "0".."299".
INPUT_SIZE = 300
INPUT_COLUMNS = [str(x) for x in range(INPUT_SIZE)]
# This function is the culprit -- setting `shuffle_buffer_size` to a number
# large enough to cause the requested allocation size to overflow turns out to
# break things down the line. More information in the issue.
def get_dataset(file_path, **kwargs):
    """Build a batched CSV dataset from `file_path`.

    The shuffle buffer is deliberately set to ABSURDLY_LARGE_NUMBER to
    trigger the overflow described in the issue. Extra keyword arguments
    are forwarded to `make_csv_dataset` unchanged.
    """
    columns = [LABEL_COLUMN] + INPUT_COLUMNS
    return tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=1,
        num_epochs=1,
        label_name=LABEL_COLUMN,
        select_columns=columns,
        header=True,
        shuffle=True,
        shuffle_buffer_size=ABSURDLY_LARGE_NUMBER,
        **kwargs,
    )
# In order to be able to run `get_dataset`, we'll first write a little csv to a file
#
# Aside from a header, all of the values in the dataset will be zero, because it
# doesn't actually matter what's there
header_cols = [LABEL_COLUMN] + INPUT_COLUMNS
header = ','.join(header_cols)
line = ','.join('0' for _ in header_cols)
with open(YOUR_CSV_PATH, "w+") as f:
    # One header row, one all-zeros data row.
    f.write(header + '\n' + line)
# And there we go! That should be enough to get us going :)
################################################################################
# The next few pieces are mostly adapted from this tutorial:
# https://www.tensorflow.org/tutorials/load_data/csv
class PackNumericFeatures(object):
    """Dataset map-fn: pack the named feature columns into one tensor.

    Removes each column in `names` from the features dict and replaces
    them with a single 'numeric' tensor of shape (batch_size, INPUT_SIZE).
    """

    def __init__(self, names):
        self.names = names
        # Must agree with the batch_size used in `get_dataset` (1).
        self.batch_size = 1

    def __call__(self, features, labels):
        # Pop each named column, cast to uint8, and stack into one tensor.
        packed = [tf.cast(features.pop(name), tf.uint8) for name in self.names]
        features['numeric'] = tf.reshape(packed, (self.batch_size, INPUT_SIZE))
        return features, labels
def packed_dataset(file_path):
    """Load the CSV at `file_path` and pack its numeric columns."""
    raw = get_dataset(file_path)
    return raw.map(PackNumericFeatures(INPUT_COLUMNS))
# Feature column describing the packed 'numeric' input vector for the model.
numeric_column = tf.feature_column.numeric_column('numeric', shape=[INPUT_SIZE])
################################################################################
# And then we get to constructing the model itself. Not too much to see here:
# a DenseFeatures layer reading the packed 'numeric' column, then one unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.DenseFeatures([numeric_column]))
model.add(tf.keras.layers.Dense(1, activation='relu'))

model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.SGD(),
    metrics=['accuracy'],
)

# Finally, when we try to evaluate this model (using a dataset prepared badly),
# our good friend Mr. Segfault shows up :)
model.evaluate(packed_dataset(YOUR_CSV_PATH))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment