Last active
August 1, 2023 19:41
-
-
Save vpj/e03c32819641dd65e0e70e563a56be42 to your computer and use it in GitHub Desktop.
Distributed learning for keras models with tensorflow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
The code is inspired by François Chollet's answer to the following Quora question[1] and the distributed TensorFlow tutorial[2]. | |
It runs the Keras MNIST mlp example across multiple servers. | |
This sample code runs multiple processes on a single host. It can be configured | |
to run on multiple hosts simply by changing the host names given in *ClusterSpec*. | |
Training the model: | |
Start the parameter server | |
python keras_distributed.py --job_name="ps" --task_index=0 | |
Start the three workers | |
python keras_distributed.py --job_name="worker" --task_index=0 | |
python keras_distributed.py --job_name="worker" --task_index=1 | |
python keras_distributed.py --job_name="worker" --task_index=2 | |
[1] https://www.quora.com/What-is-the-state-of-distributed-learning-multi-GPU-and-across-multiple-hosts-in-Keras-and-what-are-the-future-plans | |
[2] https://www.tensorflow.org/deploy/distributed | |
''' | |
import tensorflow as tf | |
import keras | |
# Command-line flags: which job this process belongs to and its index in it
tf.app.flags.DEFINE_string(
    "job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer(
    "task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS

# Cluster layout: one parameter server and three workers.
# Swap "localhost" for real host names to spread the jobs over several hosts.
ps_hosts = ["localhost:2222"]
worker_hosts = ["localhost:2223", "localhost:2224", "localhost:2225"]
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

# In-process server for the (job_name, task_index) chosen on the command line
server = tf.train.Server(
    cluster,
    job_name=FLAGS.job_name,
    task_index=FLAGS.task_index)

# Training configuration
batch_size = 128            # examples per training step
learning_rate = 0.0005      # step size handed to the optimizer
training_iterations = 100   # not referenced by the code visible in this file
num_classes = 10            # not referenced by the code visible in this file
log_frequency = 10          # print a progress line every this many iterations
def load_data():
    """Load MNIST into the module-level ``mnist`` variable.

    The data set is read from the ``MNIST_data`` directory with one-hot
    encoded labels, and a confirmation line is printed once it is available.
    """
    global mnist
    from tensorflow.examples.tutorials.mnist import input_data as mnist_reader

    data_dir = 'MNIST_data'
    mnist = mnist_reader.read_data_sets(data_dir, one_hot=True)
    print("Data loaded")
def create_model():
    """Build the Keras MLP: two 512-unit ReLU layers with dropout, softmax out.

    The model takes 784-element input vectors and produces 10 softmax
    outputs. Its summary is printed before it is returned.
    """
    from keras.layers import Dense, Dropout
    from keras.models import Sequential

    # Layers are listed in the order they are stacked
    layer_stack = [
        Dense(512, activation='relu', input_shape=(784,)),
        Dropout(0.2),
        Dense(512, activation='relu'),
        Dropout(0.2),
        Dense(10, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.summary()
    return network
def create_optimizer(model, targets, global_step=None):
    """Build the loss and the RMSProp training op for a Keras model.

    ``model.compile`` / ``model.fit`` cannot be used in this setup, so the
    loss, gradients and update op are wired up by hand.

    Args:
        model: Keras model whose ``output`` tensor holds the predictions.
        targets: tensor/placeholder with the one-hot labels.
        global_step: optional step-counter variable. When given, it is
            incremented by one on every gradient update; when ``None`` no
            counter is touched.

    Returns:
        Tuple ``(train_op, total_loss, predictions)``. Evaluating
        ``train_op`` applies one gradient update and yields the loss value;
        ``total_loss`` yields the loss without training.
    """
    predictions = model.output
    loss = tf.reduce_mean(
        keras.losses.categorical_crossentropy(targets, predictions))

    # Add regularization penalties (none in this example). The sum is built
    # as a new tensor: the loss is not a variable, so it cannot be updated
    # in place with tf.assign_add.
    total_loss = loss
    for regularizer_loss in model.losses:
        total_loss = total_loss + regularizer_loss

    optimizer = tf.train.RMSPropOptimizer(learning_rate)

    # Barrier: run the model's own update ops (e.g. batch-norm moving
    # averages) before the gradients are computed.
    with tf.control_dependencies(model.updates):
        barrier = tf.no_op(name="update_barrier")

    with tf.control_dependencies([barrier]):
        grads = optimizer.compute_gradients(
            total_loss,
            model.trainable_weights)
        grad_updates = optimizer.apply_gradients(
            grads, global_step=global_step)

    # Fetching train_op forces the gradient update, then returns the loss
    with tf.control_dependencies([grad_updates]):
        train_op = tf.identity(total_loss, name="train")

    return (train_op, total_loss, predictions)
def train(train_op, total_loss, global_step, step):
    """Run a single training step and periodically print progress.

    Relies on the module-level ``mnist``, ``sess``, ``model``, ``targets``,
    ``batch_size`` and ``log_frequency``.

    Args:
        train_op: op that applies one update and evaluates to the batch loss.
        total_loss: loss tensor, evaluated on the test set when logging.
        global_step: step-counter variable; its value is fetched for the log.
        step: this worker's local iteration number.
    """
    import time

    start_time = time.time()
    batch_x, batch_y = mnist.train.next_batch(batch_size)

    # Perform one update on the batch and read the shared step counter
    loss_value, step_value = sess.run(
        [train_op, global_step],
        feed_dict={
            model.inputs[0]: batch_x,
            targets: batch_y})

    # Accumulate training time across calls so the reported figure really is
    # an average over the steps since the previous log line.
    train.window_seconds = (
        getattr(train, "window_seconds", 0.0) + (time.time() - start_time))
    train.window_steps = getattr(train, "window_steps", 0) + 1

    if step % log_frequency == 0:
        avg_step_ms = train.window_seconds * 1000 / train.window_steps
        train.window_seconds = 0.0
        train.window_steps = 0

        # This evaluates the loss (not classification accuracy) on the test
        # set, so it is labelled as a loss.
        # NOTE(review): if the caller fixed the Keras learning phase to 1,
        # dropout is still active during this evaluation -- confirm.
        test_loss = sess.run(total_loss,
                             feed_dict={
                                 model.inputs[0]: mnist.test.images,
                                 targets: mnist.test.labels})
        print("Step: %d," % (step_value + 1),
              " Iteration: %2d," % step,
              " Cost: %.4f," % loss_value,
              " TestLoss: %.4f" % test_loss,
              " AvgTime: %3.2fms" % float(avg_step_ms))
if FLAGS.job_name == "ps":
    # Parameter server: only hosts variables, so block and serve requests
    server.join()
elif FLAGS.job_name == "worker":
    load_data()

    # Place variables on the parameter server and ops on this worker
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):
        # NOTE: the learning phase is fixed to "training" for the whole
        # graph, so dropout is also active when the test set is evaluated.
        keras.backend.set_learning_phase(1)
        # Variables are initialized by the Supervisor's init_op, not by Keras
        keras.backend.manual_variable_initialization(True)
        model = create_model()
        targets = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
        train_op, total_loss, predictions = create_optimizer(model, targets)

        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Nothing else advances global_step, which is why every worker used
        # to report "Step: 1" forever. Chain an increment onto the training
        # op so the shared counter moves once per executed training step.
        with tf.control_dependencies([train_op]):
            increment_global_step = tf.assign_add(global_step, 1)
        with tf.control_dependencies([increment_global_step]):
            train_op = tf.identity(train_op, name="train_and_count")

        init_op = tf.global_variables_initializer()

    # Task 0 is the chief: it initializes variables and writes checkpoints
    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                             global_step=global_step,
                             logdir="/tmp/train_logs",
                             save_model_secs=600,
                             init_op=init_op)

    print("Waiting for other servers")
    with sv.managed_session(server.target) as sess:
        keras.backend.set_session(sess)
        step = 0
        while not sv.should_stop() and step < 1000000:
            train(train_op, total_loss, global_step, step)
            step += 1

    sv.stop()
    print("done")
This code does not account for the change in behavior for the dropout layer between train and test learning phases.
I'm having trouble with this code. It keeps printing "step 1" in all three of the workers I start.
Here is the exact code I'm running and the outputs ( I called the script 3_DistributedTraining_Keras.py)
SUMMARY
Run from terminal as:
- Start the parameter server
- python 3_DistributedTraining_Keras.py --job_name="ps" --task_index=0
- Start the three workers
- python 3_DistributedTraining_Keras.py --job_name="worker" --task_index=0
- python 3_DistributedTraining_Keras.py --job_name="worker" --task_index=1
- python 3_DistributedTraining_Keras.py --job_name="worker" --task_index=2
Properties:
- I don't quite understand the output - it's always "step 1"
Code:
import tensorflow as tf
import keras
# Command-line flags: which job this process belongs to and its index in it
tf.app.flags.DEFINE_string(
    "job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer(
    "task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS

# Cluster layout: one parameter server and three workers.
# Swap "localhost" for real host names to spread the jobs over several hosts.
ps_hosts = ["localhost:2222"]
worker_hosts = ["localhost:2223", "localhost:2224", "localhost:2225"]
cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

# In-process server for the (job_name, task_index) chosen on the command line
server = tf.train.Server(
    cluster,
    job_name=FLAGS.job_name,
    task_index=FLAGS.task_index)

# Training configuration
batch_size = 128            # examples per training step
learning_rate = 0.0005      # step size handed to the optimizer
training_iterations = 100   # not referenced by the code visible in this file
num_classes = 10            # not referenced by the code visible in this file
log_frequency = 10          # print a progress line every this many iterations
def load_data():
    """Load MNIST into the module-level ``mnist`` variable.

    The data set is read from the ``MNIST_data`` directory with one-hot
    encoded labels, and a confirmation line is printed once it is available.
    """
    global mnist
    from tensorflow.examples.tutorials.mnist import input_data as mnist_reader

    data_dir = 'MNIST_data'
    mnist = mnist_reader.read_data_sets(data_dir, one_hot=True)
    print("Data loaded")
def create_model():
    """Build the Keras MLP: two 512-unit ReLU layers with dropout, softmax out.

    The model takes 784-element input vectors and produces 10 softmax
    outputs. Its summary is printed before it is returned.
    """
    from keras.layers import Dense, Dropout
    from keras.models import Sequential

    # Layers are listed in the order they are stacked
    layer_stack = [
        Dense(512, activation='relu', input_shape=(784,)),
        Dropout(0.2),
        Dense(512, activation='relu'),
        Dropout(0.2),
        Dense(10, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.summary()
    return network
def create_optimizer(model, targets, global_step=None):
    """Build the loss and the RMSProp training op for a Keras model.

    ``model.compile`` / ``model.fit`` cannot be used in this setup, so the
    loss, gradients and update op are wired up by hand.

    Args:
        model: Keras model whose ``output`` tensor holds the predictions.
        targets: tensor/placeholder with the one-hot labels.
        global_step: optional step-counter variable. When given, it is
            incremented by one on every gradient update; when ``None`` no
            counter is touched.

    Returns:
        Tuple ``(train_op, total_loss, predictions)``. Evaluating
        ``train_op`` applies one gradient update and yields the loss value;
        ``total_loss`` yields the loss without training.
    """
    predictions = model.output
    loss = tf.reduce_mean(
        keras.losses.categorical_crossentropy(targets, predictions))

    # Add regularization penalties (none in this example). The sum is built
    # as a new tensor: the loss is not a variable, so it cannot be updated
    # in place with tf.assign_add.
    total_loss = loss
    for regularizer_loss in model.losses:
        total_loss = total_loss + regularizer_loss

    optimizer = tf.train.RMSPropOptimizer(learning_rate)

    # Barrier: run the model's own update ops (e.g. batch-norm moving
    # averages) before the gradients are computed.
    with tf.control_dependencies(model.updates):
        barrier = tf.no_op(name="update_barrier")

    with tf.control_dependencies([barrier]):
        grads = optimizer.compute_gradients(
            total_loss,
            model.trainable_weights)
        grad_updates = optimizer.apply_gradients(
            grads, global_step=global_step)

    # Fetching train_op forces the gradient update, then returns the loss
    with tf.control_dependencies([grad_updates]):
        train_op = tf.identity(total_loss, name="train")

    return (train_op, total_loss, predictions)
def train(train_op, total_loss, global_step, step):
    """Run a single training step and periodically print progress.

    Relies on the module-level ``mnist``, ``sess``, ``model``, ``targets``,
    ``batch_size`` and ``log_frequency``.

    Args:
        train_op: op that applies one update and evaluates to the batch loss.
        total_loss: loss tensor, evaluated on the test set when logging.
        global_step: step-counter variable; its value is fetched for the log.
        step: this worker's local iteration number.
    """
    import time

    start_time = time.time()
    batch_x, batch_y = mnist.train.next_batch(batch_size)

    # Perform one update on the batch and read the shared step counter
    loss_value, step_value = sess.run(
        [train_op, global_step],
        feed_dict={
            model.inputs[0]: batch_x,
            targets: batch_y})

    # Accumulate training time across calls so the reported figure really is
    # an average over the steps since the previous log line.
    train.window_seconds = (
        getattr(train, "window_seconds", 0.0) + (time.time() - start_time))
    train.window_steps = getattr(train, "window_steps", 0) + 1

    if step % log_frequency == 0:
        avg_step_ms = train.window_seconds * 1000 / train.window_steps
        train.window_seconds = 0.0
        train.window_steps = 0

        # This evaluates the loss (not classification accuracy) on the test
        # set, so it is labelled as a loss.
        # NOTE(review): if the caller fixed the Keras learning phase to 1,
        # dropout is still active during this evaluation -- confirm.
        test_loss = sess.run(total_loss,
                             feed_dict={
                                 model.inputs[0]: mnist.test.images,
                                 targets: mnist.test.labels})
        print("Step: %d," % (step_value + 1),
              " Iteration: %2d," % step,
              " Cost: %.4f," % loss_value,
              " TestLoss: %.4f" % test_loss,
              " AvgTime: %3.2fms" % float(avg_step_ms))
if FLAGS.job_name == "ps":
    # Parameter server: only hosts variables, so block and serve requests
    server.join()
elif FLAGS.job_name == "worker":
    load_data()

    # Place variables on the parameter server and ops on this worker
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):
        # NOTE: the learning phase is fixed to "training" for the whole
        # graph, so dropout is also active when the test set is evaluated.
        keras.backend.set_learning_phase(1)
        # Variables are initialized by the Supervisor's init_op, not by Keras
        keras.backend.manual_variable_initialization(True)
        model = create_model()
        targets = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
        train_op, total_loss, predictions = create_optimizer(model, targets)

        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Nothing else advances global_step, which is why every worker used
        # to report "Step: 1" forever. Chain an increment onto the training
        # op so the shared counter moves once per executed training step.
        with tf.control_dependencies([train_op]):
            increment_global_step = tf.assign_add(global_step, 1)
        with tf.control_dependencies([increment_global_step]):
            train_op = tf.identity(train_op, name="train_and_count")

        init_op = tf.global_variables_initializer()

    # Task 0 is the chief: it initializes variables and writes checkpoints
    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                             global_step=global_step,
                             logdir="/tmp/train_logs",
                             save_model_secs=600,
                             init_op=init_op)

    print("Waiting for other servers")
    with sv.managed_session(server.target) as sess:
        keras.backend.set_session(sess)
        step = 0
        while not sv.should_stop() and step < 1000000:
            train(train_op, total_loss, global_step, step)
            step += 1

    sv.stop()
    print("done")
RUN OUTPUTS
This output is confusing
PS 0 Output:
PS 0 output:
Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:42.146444 140444513494848 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.
2019-07-31 14:35:42.147097: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:42.170729: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:42.171781: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c1907a73a0 executing computations on platform Host. Devices:
2019-07-31 14:35:42.171848: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:42.174299: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:42.174351: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:42.176022: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2222
WORKER 0 Output:
WORKER 0 output:
Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:55.569238 140370433529664 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.
2019-07-31 14:35:55.569718: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:55.591176: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:55.592915: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55611188a460 executing computations on platform Host. Devices:
2019-07-31 14:35:55.593149: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:55.595517: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:55.595558: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:55.600802: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2223
W0731 14:35:56.677980 140370433529664 deprecation.py:323] From 3_DistributedTraining_Keras.py:52: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0731 14:35:56.679641 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0731 14:35:56.681064 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
W0731 14:35:57.140425 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
W0731 14:35:57.141411 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
W0731 14:35:57.231534 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Data loaded
W0731 14:35:57.736789 140370433529664 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.
W0731 14:35:57.781671 140370433529664 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_1 (Dense) (None, 512) 401920
_________________________________________________________________
dropout_1 (Dropout) (None, 512) 0
_________________________________________________________________
dense_2 (Dense) (None, 512) 262656
_________________________________________________________________
dropout_2 (Dropout) (None, 512) 0
_________________________________________________________________
dense_3 (Dense) (None, 10) 5130
=================================================================
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
W0731 14:35:58.057884 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1423: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0731 14:35:58.295027 140370433529664 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0731 14:35:58.453169 140370433529664 deprecation.py:323] From 3_DistributedTraining_Keras.py:150: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
Waiting for other servers
Step: 1, Iteration: 0, Cost: 2.3329, Accuracy: 2.3423 AvgTime: 39.85ms
Step: 1, Iteration: 10, Cost: 2.3359, Accuracy: 2.3367 AvgTime: 3.37ms
Step: 1, Iteration: 20, Cost: 2.3393, Accuracy: 2.3282 AvgTime: 3.36ms
Step: 1, Iteration: 30, Cost: 2.3277, Accuracy: 2.3131 AvgTime: 2.73ms
Step: 1, Iteration: 40, Cost: 2.2992, Accuracy: 2.2895 AvgTime: 3.15ms
Step: 1, Iteration: 50, Cost: 2.2419, Accuracy: 2.2522 AvgTime: 3.77ms
Step: 1, Iteration: 60, Cost: 2.2233, Accuracy: 2.1950 AvgTime: 2.47ms
Step: 1, Iteration: 70, Cost: 2.0709, Accuracy: 2.1041 AvgTime: 2.75ms
Step: 1, Iteration: 80, Cost: 1.9741, Accuracy: 1.9618 AvgTime: 3.46ms
Step: 1, Iteration: 90, Cost: 1.7745, Accuracy: 1.7105 AvgTime: 3.10ms
Step: 1, Iteration: 100, Cost: 1.4761, Accuracy: 1.3430 AvgTime: 3.14ms
Step: 1, Iteration: 110, Cost: 0.9557, Accuracy: 0.9771 AvgTime: 2.75ms
Step: 1, Iteration: 120, Cost: 0.7194, Accuracy: 0.7386 AvgTime: 4.08ms
Step: 1, Iteration: 130, Cost: 0.6084, Accuracy: 0.6058 AvgTime: 3.89ms
Step: 1, Iteration: 140, Cost: 0.6284, Accuracy: 0.5267 AvgTime: 2.95ms
Step: 1, Iteration: 150, Cost: 0.6367, Accuracy: 0.5186 AvgTime: 2.90ms
Step: 1, Iteration: 160, Cost: 0.4401, Accuracy: 0.4379 AvgTime: 3.46ms
Step: 1, Iteration: 170, Cost: 0.4324, Accuracy: 0.3888 AvgTime: 2.76ms
Step: 1, Iteration: 180, Cost: 0.4951, Accuracy: 0.3730 AvgTime: 3.34ms
Step: 1, Iteration: 190, Cost: 0.5490, Accuracy: 0.3541 AvgTime: 2.93ms
Step: 1, Iteration: 200, Cost: 0.3574, Accuracy: 0.3452 AvgTime: 2.78ms
Step: 1, Iteration: 210, Cost: 0.3405, Accuracy: 0.3523 AvgTime: 2.88ms
Step: 1, Iteration: 220, Cost: 0.4327, Accuracy: 0.3089 AvgTime: 4.03ms
Step: 1, Iteration: 230, Cost: 0.3922, Accuracy: 0.2930 AvgTime: 2.70ms
Step: 1, Iteration: 240, Cost: 0.2659, Accuracy: 0.2849 AvgTime: 2.99ms
Step: 1, Iteration: 250, Cost: 0.5087, Accuracy: 0.2948 AvgTime: 3.91ms
Step: 1, Iteration: 260, Cost: 0.2864, Accuracy: 0.2718 AvgTime: 2.98ms
Step: 1, Iteration: 270, Cost: 0.1802, Accuracy: 0.2785 AvgTime: 2.58ms
Step: 1, Iteration: 280, Cost: 0.2760, Accuracy: 0.2509 AvgTime: 2.55ms
Step: 1, Iteration: 290, Cost: 0.1846, Accuracy: 0.2383 AvgTime: 3.75ms
Step: 1, Iteration: 300, Cost: 0.4293, Accuracy: 0.2611 AvgTime: 2.68ms
Step: 1, Iteration: 310, Cost: 0.2681, Accuracy: 0.2281 AvgTime: 3.03ms
Step: 1, Iteration: 320, Cost: 0.2917, Accuracy: 0.2271 AvgTime: 2.87ms
Step: 1, Iteration: 330, Cost: 0.2379, Accuracy: 0.2331 AvgTime: 3.86ms
Step: 1, Iteration: 340, Cost: 0.3044, Accuracy: 0.2149 AvgTime: 2.78ms
Step: 1, Iteration: 350, Cost: 0.1893, Accuracy: 0.2080 AvgTime: 14.89ms
Step: 1, Iteration: 360, Cost: 0.1554, Accuracy: 0.1917 AvgTime: 7.54ms
Step: 1, Iteration: 370, Cost: 0.2736, Accuracy: 0.1855 AvgTime: 6.93ms
Step: 1, Iteration: 380, Cost: 0.2343, Accuracy: 0.1846 AvgTime: 7.23ms
Step: 1, Iteration: 390, Cost: 0.2085, Accuracy: 0.1787 AvgTime: 9.16ms
Step: 1, Iteration: 400, Cost: 0.0910, Accuracy: 0.1561 AvgTime: 6.01ms
Step: 1, Iteration: 410, Cost: 0.2396, Accuracy: 0.1581 AvgTime: 7.86ms
Step: 1, Iteration: 420, Cost: 0.2121, Accuracy: 0.1534 AvgTime: 6.35ms
Step: 1, Iteration: 430, Cost: 0.1069, Accuracy: 0.1415 AvgTime: 5.18ms
Step: 1, Iteration: 440, Cost: 0.0921, Accuracy: 0.1407 AvgTime: 6.24ms
Step: 1, Iteration: 450, Cost: 0.2879, Accuracy: 0.1395 AvgTime: 5.01ms
Step: 1, Iteration: 460, Cost: 0.0621, Accuracy: 0.1356 AvgTime: 6.41ms
Step: 1, Iteration: 470, Cost: 0.0545, Accuracy: 0.1272 AvgTime: 6.20ms
Step: 1, Iteration: 480, Cost: 0.1028, Accuracy: 0.1294 AvgTime: 4.86ms
Step: 1, Iteration: 490, Cost: 0.0735, Accuracy: 0.1329 AvgTime: 5.35ms
Step: 1, Iteration: 500, Cost: 0.2314, Accuracy: 0.1214 AvgTime: 6.54ms
Step: 1, Iteration: 510, Cost: 0.1395, Accuracy: 0.1489 AvgTime: 6.54ms
Step: 1, Iteration: 520, Cost: 0.0821, Accuracy: 0.1201 AvgTime: 4.28ms
Step: 1, Iteration: 530, Cost: 0.1252, Accuracy: 0.1169 AvgTime: 3.14ms
Step: 1, Iteration: 540, Cost: 0.1443, Accuracy: 0.1175 AvgTime: 6.44ms
Step: 1, Iteration: 550, Cost: 0.1121, Accuracy: 0.1134 AvgTime: 7.80ms
Step: 1, Iteration: 560, Cost: 0.0879, Accuracy: 0.1157 AvgTime: 6.35ms
Step: 1, Iteration: 570, Cost: 0.0860, Accuracy: 0.1206 AvgTime: 5.99ms
Step: 1, Iteration: 580, Cost: 0.0624, Accuracy: 0.1001 AvgTime: 5.28ms
Step: 1, Iteration: 590, Cost: 0.0835, Accuracy: 0.1107 AvgTime: 6.77ms
Step: 1, Iteration: 600, Cost: 0.0793, Accuracy: 0.0997 AvgTime: 6.92ms
Step: 1, Iteration: 610, Cost: 0.0886, Accuracy: 0.1087 AvgTime: 6.16ms
Step: 1, Iteration: 620, Cost: 0.0593, Accuracy: 0.1035 AvgTime: 6.40ms
Step: 1, Iteration: 630, Cost: 0.1416, Accuracy: 0.1104 AvgTime: 6.04ms
Step: 1, Iteration: 640, Cost: 0.0872, Accuracy: 0.0974 AvgTime: 6.47ms
Step: 1, Iteration: 650, Cost: 0.1216, Accuracy: 0.1080 AvgTime: 7.88ms
Step: 1, Iteration: 660, Cost: 0.0221, Accuracy: 0.1064 AvgTime: 5.23ms
Step: 1, Iteration: 670, Cost: 0.0683, Accuracy: 0.1044 AvgTime: 5.77ms
Step: 1, Iteration: 680, Cost: 0.0622, Accuracy: 0.0976 AvgTime: 6.75ms
Step: 1, Iteration: 690, Cost: 0.0619, Accuracy: 0.0982 AvgTime: 6.31ms
Step: 1, Iteration: 700, Cost: 0.0932, Accuracy: 0.0959 AvgTime: 7.28ms
Step: 1, Iteration: 710, Cost: 0.1297, Accuracy: 0.0986 AvgTime: 5.82ms
Step: 1, Iteration: 720, Cost: 0.0256, Accuracy: 0.0974 AvgTime: 6.17ms
Step: 1, Iteration: 730, Cost: 0.0519, Accuracy: 0.1098 AvgTime: 8.14ms
Step: 1, Iteration: 740, Cost: 0.1260, Accuracy: 0.1044 AvgTime: 6.30ms
Step: 1, Iteration: 750, Cost: 0.0849, Accuracy: 0.0933 AvgTime: 7.17ms
Step: 1, Iteration: 760, Cost: 0.1534, Accuracy: 0.0957 AvgTime: 7.38ms
Step: 1, Iteration: 770, Cost: 0.0844, Accuracy: 0.0881 AvgTime: 7.31ms
Step: 1, Iteration: 780, Cost: 0.0275, Accuracy: 0.0918 AvgTime: 5.93ms
Step: 1, Iteration: 790, Cost: 0.0378, Accuracy: 0.0937 AvgTime: 5.22ms
Step: 1, Iteration: 800, Cost: 0.1158, Accuracy: 0.0941 AvgTime: 6.59ms
Step: 1, Iteration: 810, Cost: 0.0494, Accuracy: 0.0939 AvgTime: 5.55ms
Step: 1, Iteration: 820, Cost: 0.0642, Accuracy: 0.0919 AvgTime: 6.05ms
Step: 1, Iteration: 830, Cost: 0.1243, Accuracy: 0.0877 AvgTime: 6.80ms
Step: 1, Iteration: 840, Cost: 0.0408, Accuracy: 0.0822 AvgTime: 5.83ms
Step: 1, Iteration: 850, Cost: 0.0465, Accuracy: 0.0955 AvgTime: 7.31ms
Step: 1, Iteration: 860, Cost: 0.1252, Accuracy: 0.0861 AvgTime: 6.74ms
Step: 1, Iteration: 870, Cost: 0.0487, Accuracy: 0.0919 AvgTime: 5.77ms
Step: 1, Iteration: 880, Cost: 0.0459, Accuracy: 0.0918 AvgTime: 6.60ms
Step: 1, Iteration: 890, Cost: 0.0443, Accuracy: 0.0812 AvgTime: 6.51ms
Step: 1, Iteration: 900, Cost: 0.0634, Accuracy: 0.0888 AvgTime: 7.34ms
Step: 1, Iteration: 910, Cost: 0.0304, Accuracy: 0.0806 AvgTime: 3.48ms
Step: 1, Iteration: 920, Cost: 0.0203, Accuracy: 0.0835 AvgTime: 1.95ms
^Z
[1]+ Stopped python3.7 3_DistributedTraining_Keras.py --job_name="worker" --task_index=0
WORKER 1 Output:
WORKER 1 output:
Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:49.526217 139767475332928 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.
2019-07-31 14:35:49.527295: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:49.558593: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:49.563598: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x561bc176c460 executing computations on platform Host. Devices:
2019-07-31 14:35:49.563664: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:49.565682: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:49.565875: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:49.569324: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2224
W0731 14:35:50.856184 139767475332928 deprecation.py:323] From 3_DistributedTraining_Keras.py:52: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0731 14:35:50.856578 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0731 14:35:50.856914 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
W0731 14:35:51.327049 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
W0731 14:35:51.328959 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
W0731 14:35:51.427278 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Data loaded
W0731 14:35:51.817702 139767475332928 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.
W0731 14:35:51.869127 139767475332928 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_1 (Dense) (None, 512) 401920
_________________________________________________________________
dropout_1 (Dropout) (None, 512) 0
_________________________________________________________________
dense_2 (Dense) (None, 512) 262656
_________________________________________________________________
dropout_2 (Dropout) (None, 512) 0
_________________________________________________________________
dense_3 (Dense) (None, 10) 5130
=================================================================
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
W0731 14:35:52.149781 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1423: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0731 14:35:52.404864 139767475332928 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0731 14:35:52.554406 139767475332928 deprecation.py:323] From 3_DistributedTraining_Keras.py:150: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
Waiting for other servers
Step: 1, Iteration: 0, Cost: 0.2173, Accuracy: 0.2109 AvgTime: 131.15ms
Step: 1, Iteration: 10, Cost: 0.1339, Accuracy: 0.1955 AvgTime: 8.02ms
Step: 1, Iteration: 20, Cost: 0.3180, Accuracy: 0.1812 AvgTime: 9.24ms
Step: 1, Iteration: 30, Cost: 0.2283, Accuracy: 0.1766 AvgTime: 7.34ms
Step: 1, Iteration: 40, Cost: 0.1412, Accuracy: 0.1755 AvgTime: 7.67ms
Step: 1, Iteration: 50, Cost: 0.1774, Accuracy: 0.1597 AvgTime: 7.21ms
Step: 1, Iteration: 60, Cost: 0.0932, Accuracy: 0.1559 AvgTime: 5.60ms
Step: 1, Iteration: 70, Cost: 0.1590, Accuracy: 0.2572 AvgTime: 5.78ms
Step: 1, Iteration: 80, Cost: 0.1236, Accuracy: 0.1754 AvgTime: 6.21ms
Step: 1, Iteration: 90, Cost: 0.0850, Accuracy: 0.1614 AvgTime: 5.17ms
Step: 1, Iteration: 100, Cost: 0.1484, Accuracy: 0.1531 AvgTime: 5.34ms
Step: 1, Iteration: 110, Cost: 0.1179, Accuracy: 0.1272 AvgTime: 6.42ms
Step: 1, Iteration: 120, Cost: 0.1411, Accuracy: 0.1315 AvgTime: 6.00ms
Step: 1, Iteration: 130, Cost: 0.1211, Accuracy: 0.1333 AvgTime: 6.73ms
Step: 1, Iteration: 140, Cost: 0.0522, Accuracy: 0.1198 AvgTime: 5.69ms
Step: 1, Iteration: 150, Cost: 0.1244, Accuracy: 0.1447 AvgTime: 5.23ms
Step: 1, Iteration: 160, Cost: 0.1077, Accuracy: 0.1173 AvgTime: 6.49ms
Step: 1, Iteration: 170, Cost: 0.1420, Accuracy: 0.1162 AvgTime: 5.26ms
Step: 1, Iteration: 180, Cost: 0.0956, Accuracy: 0.1074 AvgTime: 7.24ms
Step: 1, Iteration: 190, Cost: 0.1855, Accuracy: 0.1113 AvgTime: 5.88ms
Step: 1, Iteration: 200, Cost: 0.0764, Accuracy: 0.1101 AvgTime: 8.10ms
Step: 1, Iteration: 210, Cost: 0.0315, Accuracy: 0.1318 AvgTime: 5.23ms
Step: 1, Iteration: 220, Cost: 0.1038, Accuracy: 0.1200 AvgTime: 5.42ms
Step: 1, Iteration: 230, Cost: 0.0580, Accuracy: 0.1029 AvgTime: 5.99ms
Step: 1, Iteration: 240, Cost: 0.0765, Accuracy: 0.1068 AvgTime: 6.40ms
Step: 1, Iteration: 250, Cost: 0.1655, Accuracy: 0.1030 AvgTime: 5.89ms
Step: 1, Iteration: 260, Cost: 0.0815, Accuracy: 0.1023 AvgTime: 6.38ms
Step: 1, Iteration: 270, Cost: 0.0593, Accuracy: 0.1095 AvgTime: 5.73ms
Step: 1, Iteration: 280, Cost: 0.1392, Accuracy: 0.0979 AvgTime: 6.25ms
Step: 1, Iteration: 290, Cost: 0.0720, Accuracy: 0.0939 AvgTime: 5.22ms
Step: 1, Iteration: 300, Cost: 0.1074, Accuracy: 0.1013 AvgTime: 6.45ms
Step: 1, Iteration: 310, Cost: 0.1495, Accuracy: 0.1061 AvgTime: 7.42ms
Step: 1, Iteration: 320, Cost: 0.0561, Accuracy: 0.1017 AvgTime: 6.39ms
Step: 1, Iteration: 330, Cost: 0.1275, Accuracy: 0.0980 AvgTime: 5.33ms
Step: 1, Iteration: 340, Cost: 0.0255, Accuracy: 0.0902 AvgTime: 6.38ms
Step: 1, Iteration: 350, Cost: 0.0542, Accuracy: 0.0900 AvgTime: 5.57ms
Step: 1, Iteration: 360, Cost: 0.1004, Accuracy: 0.0918 AvgTime: 5.77ms
Step: 1, Iteration: 370, Cost: 0.0864, Accuracy: 0.0920 AvgTime: 5.29ms
Step: 1, Iteration: 380, Cost: 0.0445, Accuracy: 0.0908 AvgTime: 6.40ms
Step: 1, Iteration: 390, Cost: 0.1194, Accuracy: 0.0931 AvgTime: 6.35ms
Step: 1, Iteration: 400, Cost: 0.0528, Accuracy: 0.0944 AvgTime: 7.30ms
Step: 1, Iteration: 410, Cost: 0.0455, Accuracy: 0.0890 AvgTime: 6.10ms
Step: 1, Iteration: 420, Cost: 0.0811, Accuracy: 0.0866 AvgTime: 6.93ms
Step: 1, Iteration: 430, Cost: 0.0851, Accuracy: 0.0881 AvgTime: 5.05ms
Step: 1, Iteration: 440, Cost: 0.0598, Accuracy: 0.0826 AvgTime: 5.60ms
Step: 1, Iteration: 450, Cost: 0.1386, Accuracy: 0.0960 AvgTime: 5.10ms
Step: 1, Iteration: 460, Cost: 0.0387, Accuracy: 0.0922 AvgTime: 6.45ms
Step: 1, Iteration: 470, Cost: 0.0139, Accuracy: 0.0833 AvgTime: 5.73ms
Step: 1, Iteration: 480, Cost: 0.0370, Accuracy: 0.0885 AvgTime: 5.66ms
Step: 1, Iteration: 490, Cost: 0.0151, Accuracy: 0.0863 AvgTime: 6.04ms
Step: 1, Iteration: 500, Cost: 0.0458, Accuracy: 0.0860 AvgTime: 6.72ms
Step: 1, Iteration: 510, Cost: 0.0332, Accuracy: 0.0995 AvgTime: 8.07ms
Step: 1, Iteration: 520, Cost: 0.0347, Accuracy: 0.0982 AvgTime: 6.08ms
^CTraceback (most recent call last):
File "3_DistributedTraining_Keras.py", line 157, in <module>
train(train_op, total_loss, global_step, step)
File "3_DistributedTraining_Keras.py", line 118, in train
targets: mnist.test.labels})
File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 949, in run
run_metadata_ptr)
File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1172, in _run
feed_dict_tensor, options, run_metadata)
File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1356, in _do_call
return fn(*args)
File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1341, in _run_fn
target_list, run_metadata)
File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1434, in _call_tf_sessionrun
run_metadata)
KeyboardInterrupt
^Z
[1]+ Stopped python3.7 3_DistributedTraining_Keras.py --job_name="worker" --task_index=1
WORKER 2 Output:
WORKER 2 output:
Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:51.255171 140154457438016 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.
2019-07-31 14:35:51.257948: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:51.286525: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:51.286844: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x561d7721a460 executing computations on platform Host. Devices:
2019-07-31 14:35:51.286869: I tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:51.288613: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:51.288631: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:51.289537: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2225
W0731 14:35:52.914685 140154457438016 deprecation.py:323] From 3_DistributedTraining_Keras.py:52: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0731 14:35:52.915120 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0731 14:35:52.915565 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
W0731 14:35:53.339766 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
W0731 14:35:53.340977 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
W0731 14:35:53.428383 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Data loaded
W0731 14:35:53.791190 140154457438016 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.
W0731 14:35:53.841835 140154457438016 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_1 (Dense) (None, 512) 401920
_________________________________________________________________
dropout_1 (Dropout) (None, 512) 0
_________________________________________________________________
dense_2 (Dense) (None, 512) 262656
_________________________________________________________________
dropout_2 (Dropout) (None, 512) 0
_________________________________________________________________
dense_3 (Dense) (None, 10) 5130
=================================================================
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
W0731 14:35:54.131115 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1423: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0731 14:35:54.364125 140154457438016 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0731 14:35:54.570417 140154457438016 deprecation.py:323] From 3_DistributedTraining_Keras.py:150: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
Waiting for other servers
Step: 1, Iteration: 0, Cost: 0.2419, Accuracy: 0.2078 AvgTime: 128.85ms
Step: 1, Iteration: 10, Cost: 0.3279, Accuracy: 0.2523 AvgTime: 9.07ms
Step: 1, Iteration: 20, Cost: 0.2569, Accuracy: 0.2523 AvgTime: 6.31ms
Step: 1, Iteration: 30, Cost: 0.1381, Accuracy: 0.2024 AvgTime: 7.35ms
Step: 1, Iteration: 40, Cost: 0.1605, Accuracy: 0.1673 AvgTime: 6.83ms
Step: 1, Iteration: 50, Cost: 0.1673, Accuracy: 0.1972 AvgTime: 4.73ms
Step: 1, Iteration: 60, Cost: 0.2967, Accuracy: 0.2268 AvgTime: 6.00ms
Step: 1, Iteration: 70, Cost: 0.3198, Accuracy: 0.1862 AvgTime: 7.29ms
Step: 1, Iteration: 80, Cost: 0.1704, Accuracy: 0.1752 AvgTime: 4.52ms
Step: 1, Iteration: 90, Cost: 0.1098, Accuracy: 0.1448 AvgTime: 4.79ms
Step: 1, Iteration: 100, Cost: 0.1639, Accuracy: 0.1544 AvgTime: 5.35ms
Step: 1, Iteration: 110, Cost: 0.1918, Accuracy: 0.1670 AvgTime: 5.80ms
Step: 1, Iteration: 120, Cost: 0.1761, Accuracy: 0.1344 AvgTime: 5.51ms
Step: 1, Iteration: 130, Cost: 0.0982, Accuracy: 0.1440 AvgTime: 4.79ms
Step: 1, Iteration: 140, Cost: 0.1023, Accuracy: 0.1425 AvgTime: 6.43ms
Step: 1, Iteration: 150, Cost: 0.1394, Accuracy: 0.1168 AvgTime: 7.08ms
Step: 1, Iteration: 160, Cost: 0.0528, Accuracy: 0.1153 AvgTime: 6.02ms
Step: 1, Iteration: 170, Cost: 0.0762, Accuracy: 0.1162 AvgTime: 5.03ms
Step: 1, Iteration: 180, Cost: 0.1318, Accuracy: 0.1172 AvgTime: 4.80ms
Step: 1, Iteration: 190, Cost: 0.0767, Accuracy: 0.1140 AvgTime: 5.67ms
Step: 1, Iteration: 200, Cost: 0.0308, Accuracy: 0.1056 AvgTime: 5.85ms
Step: 1, Iteration: 210, Cost: 0.1512, Accuracy: 0.1070 AvgTime: 6.82ms
Step: 1, Iteration: 220, Cost: 0.1435, Accuracy: 0.1104 AvgTime: 5.16ms
Step: 1, Iteration: 230, Cost: 0.0523, Accuracy: 0.1194 AvgTime: 5.47ms
Step: 1, Iteration: 240, Cost: 0.0741, Accuracy: 0.1011 AvgTime: 6.94ms
Step: 1, Iteration: 250, Cost: 0.0468, Accuracy: 0.1207 AvgTime: 6.44ms
Step: 1, Iteration: 260, Cost: 0.0629, Accuracy: 0.1012 AvgTime: 5.15ms
Step: 1, Iteration: 270, Cost: 0.1853, Accuracy: 0.0958 AvgTime: 7.51ms
Step: 1, Iteration: 280, Cost: 0.0583, Accuracy: 0.0932 AvgTime: 6.40ms
Step: 1, Iteration: 290, Cost: 0.0445, Accuracy: 0.1011 AvgTime: 6.16ms
Step: 1, Iteration: 300, Cost: 0.1400, Accuracy: 0.0997 AvgTime: 6.11ms
Step: 1, Iteration: 310, Cost: 0.0613, Accuracy: 0.0969 AvgTime: 6.11ms
Step: 1, Iteration: 320, Cost: 0.0854, Accuracy: 0.0921 AvgTime: 6.13ms
Step: 1, Iteration: 330, Cost: 0.0611, Accuracy: 0.0950 AvgTime: 4.06ms
Step: 1, Iteration: 340, Cost: 0.0936, Accuracy: 0.0949 AvgTime: 6.53ms
Step: 1, Iteration: 350, Cost: 0.1440, Accuracy: 0.1392 AvgTime: 6.24ms
Step: 1, Iteration: 360, Cost: 0.0144, Accuracy: 0.0963 AvgTime: 7.43ms
Step: 1, Iteration: 370, Cost: 0.0853, Accuracy: 0.1159 AvgTime: 4.50ms
Step: 1, Iteration: 380, Cost: 0.0935, Accuracy: 0.1051 AvgTime: 4.50ms
Step: 1, Iteration: 390, Cost: 0.0355, Accuracy: 0.1064 AvgTime: 5.98ms
Step: 1, Iteration: 400, Cost: 0.0590, Accuracy: 0.0945 AvgTime: 5.91ms
Step: 1, Iteration: 410, Cost: 0.0952, Accuracy: 0.0982 AvgTime: 5.27ms
Step: 1, Iteration: 420, Cost: 0.3120, Accuracy: 0.0899 AvgTime: 5.07ms
Step: 1, Iteration: 430, Cost: 0.0367, Accuracy: 0.0937 AvgTime: 6.50ms
Step: 1, Iteration: 440, Cost: 0.0351, Accuracy: 0.0941 AvgTime: 6.23ms
Step: 1, Iteration: 450, Cost: 0.0805, Accuracy: 0.0895 AvgTime: 5.38ms
Step: 1, Iteration: 460, Cost: 0.0705, Accuracy: 0.0950 AvgTime: 6.06ms
Step: 1, Iteration: 470, Cost: 0.0208, Accuracy: 0.0925 AvgTime: 5.52ms
Step: 1, Iteration: 480, Cost: 0.0083, Accuracy: 0.0976 AvgTime: 4.99ms
Step: 1, Iteration: 490, Cost: 0.0235, Accuracy: 0.0851 AvgTime: 5.91ms
Step: 1, Iteration: 500, Cost: 0.0242, Accuracy: 0.0861 AvgTime: 6.27ms
Step: 1, Iteration: 510, Cost: 0.0483, Accuracy: 0.0927 AvgTime: 4.62ms
Step: 1, Iteration: 520, Cost: 0.2033, Accuracy: 0.1109 AvgTime: 5.73ms
Step: 1, Iteration: 530, Cost: 0.0694, Accuracy: 0.0898 AvgTime: 6.71ms
Step: 1, Iteration: 540, Cost: 0.0781, Accuracy: 0.0820 AvgTime: 4.75ms
^Z
[1]+ Stopped python3.7 3_DistributedTraining_Keras.py --job_name="worker" --task_index=2
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the good example implementation. I have deployed the code on the 4 servers I have (1 master, 3 slaves) and started training. It ran successfully.
Please help me with these questions:
Question 1: Is it necessary to deploy the code on every server and start the training on the slave servers? Or is this how distributed TensorFlow is designed?
Question 2: I don't see any response on the master server after successful training. Why is that? Will I be able to see any message after successful training?