vpj/keras_distributed.py

vamsinimmala1992 · 2018-12-07T21:36:59Z

Thanks for the good example implementation. I have deployed the code on 4 servers i have (1-master 3- slaves) and started training. It successfully ran.
Please help me with this questions

question 1: Is it necessary to deploy the code on every server and start the training on the slave servers? or is this how distributed tensorflow designed?

question 2: I don't any response in master slave after successful training. why is that? Will I able to see any message after successful training?

tnybny · 2019-04-16T21:37:56Z

This code does not account for the change in behavior for the dropout layer between train and test learning phases.

katarina-cavar · 2019-08-23T13:30:57Z

I'm having trouble with this code. It keeps printing "step 1" in all three of the workers I start.

Here is the exact code I'm running and the outputs ( I called the script 3_DistributedTraining_Keras.py)

SUMMARY

Run from terminal as:

Start the parameter server

python 3_DistributedTraining_Keras.py --job_name="ps" --task_index=0

Start the three workers

python 3_DistributedTraining_Keras.py --job_name="worker" --task_index=0
python 3_DistributedTraining_Keras.py --job_name="worker" --task_index=1
python 3_DistributedTraining_Keras.py --job_name="worker" --task_index=2

Properties:

i don't quite understand the output - it's always "step 1"

Code:

import tensorflow as tf
import keras

# Define input flags to identify the job and task
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
FLAGS = tf.app.flags.FLAGS

# Create a tensorflow cluster
# Replace localhost with the host names if you are running on multiple hosts
cluster = tf.train.ClusterSpec({"ps": ["localhost:2222"],
                                "worker": [	"localhost:2223",
                                            "localhost:2224",
                                            "localhost:2225"]})

# Start the server
server = tf.train.Server(cluster,
                         job_name=FLAGS.job_name,
                         task_index=FLAGS.task_index)

# Configurations
batch_size = 128
learning_rate = 0.0005
training_iterations = 100
num_classes = 10
log_frequency = 10

# Load mnist data
def load_data():
    global mnist
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    print("Data loaded")

# Create Keras model
def create_model():
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784,)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))

    model.summary()

    return model

# Create the optimizer
# We cannot use model.compile and model.fit
def create_optimizer(model, targets):
    predictions = model.output
    loss = tf.reduce_mean(
        keras.losses.categorical_crossentropy(targets, predictions))

    # Only if you have regularizers, not in this example
    total_loss = loss * 1.0  # Copy
    for regularizer_loss in model.losses:
        tf.assign_add(total_loss, regularizer_loss)

    optimizer = tf.train.RMSPropOptimizer(learning_rate)

    # Barrier to compute gradients after updating moving avg of batch norm
    with tf.control_dependencies(model.updates):
        barrier = tf.no_op(name="update_barrier")

    with tf.control_dependencies([barrier]):
        grads = optimizer.compute_gradients(
            total_loss,
            model.trainable_weights)
        grad_updates = optimizer.apply_gradients(grads)

    with tf.control_dependencies([grad_updates]):
        train_op = tf.identity(total_loss, name="train")

    return (train_op, total_loss, predictions)

# Train the model (a single step)
def train(train_op, total_loss, global_step, step):
        import time
        start_time = time.time()
        batch_x, batch_y = mnist.train.next_batch(batch_size)

        # perform the operations we defined earlier on batch
        loss_value, step_value = sess.run(
            [train_op, global_step],
            feed_dict={
                model.inputs[0]: batch_x,
                targets: batch_y})

        if step % log_frequency == 0:
            elapsed_time = time.time() - start_time
            start_time = time.time()
            accuracy = sess.run(total_loss,
                                feed_dict={
                                    model.inputs[0]: mnist.test.images,
                                    targets: mnist.test.labels})
            print("Step: %d," % (step_value + 1),
                  " Iteration: %2d," % step,
                  " Cost: %.4f," % loss_value,
                  " Accuracy: %.4f" % accuracy,
                  " AvgTime: %3.2fms" % float(elapsed_time * 1000 / log_frequency))


if FLAGS.job_name == "ps":
    server.join()
elif FLAGS.job_name == "worker":
    load_data()

    # Assign operations to local server
    with tf.device(tf.train.replica_device_setter(
            worker_device="/job:worker/task:%d" % FLAGS.task_index,
            cluster=cluster)):
        keras.backend.set_learning_phase(1)
        keras.backend.manual_variable_initialization(True)
        model = create_model()
        targets = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
        train_op, total_loss, predictions = create_optimizer(model, targets)

        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        init_op = tf.global_variables_initializer()

    sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
                             global_step=global_step,
                             logdir="/tmp/train_logs",
                             save_model_secs=600,
                             init_op=init_op)

    print("Waiting for other servers")
    with sv.managed_session(server.target) as sess:
        keras.backend.set_session(sess)
        step = 0
        while not sv.should_stop() and step < 1000000:
            train(train_op, total_loss, global_step, step)
            step += 1

    sv.stop()
    print("done")

RUN OUTPUTS

This output is confusing

PS 0 Output:

PS 0 output:

Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:42.146444 140444513494848 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.

2019-07-31 14:35:42.147097: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:42.170729: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:42.171781: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55c1907a73a0 executing computations on platform Host. Devices:
2019-07-31 14:35:42.171848: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:42.174299: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:42.174351: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:42.176022: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2222

WORKER 0 Output:

WORKER 0 output:

Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:55.569238 140370433529664 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.

2019-07-31 14:35:55.569718: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:55.591176: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:55.592915: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55611188a460 executing computations on platform Host. Devices:
2019-07-31 14:35:55.593149: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:55.595517: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:55.595558: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:55.600802: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2223
W0731 14:35:56.677980 140370433529664 deprecation.py:323] From 3_DistributedTraining_Keras.py:52: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0731 14:35:56.679641 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0731 14:35:56.681064 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
W0731 14:35:57.140425 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
W0731 14:35:57.141411 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
W0731 14:35:57.231534 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Data loaded
W0731 14:35:57.736789 140370433529664 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0731 14:35:57.781671 140370433529664 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 512)               401920    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                5130      
=================================================================
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
W0731 14:35:58.057884 140370433529664 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1423: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0731 14:35:58.295027 140370433529664 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0731 14:35:58.453169 140370433529664 deprecation.py:323] From 3_DistributedTraining_Keras.py:150: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
Waiting for other servers
Step: 1,  Iteration:  0,  Cost: 2.3329,  Accuracy: 2.3423  AvgTime: 39.85ms
Step: 1,  Iteration: 10,  Cost: 2.3359,  Accuracy: 2.3367  AvgTime: 3.37ms
Step: 1,  Iteration: 20,  Cost: 2.3393,  Accuracy: 2.3282  AvgTime: 3.36ms
Step: 1,  Iteration: 30,  Cost: 2.3277,  Accuracy: 2.3131  AvgTime: 2.73ms
Step: 1,  Iteration: 40,  Cost: 2.2992,  Accuracy: 2.2895  AvgTime: 3.15ms
Step: 1,  Iteration: 50,  Cost: 2.2419,  Accuracy: 2.2522  AvgTime: 3.77ms
Step: 1,  Iteration: 60,  Cost: 2.2233,  Accuracy: 2.1950  AvgTime: 2.47ms
Step: 1,  Iteration: 70,  Cost: 2.0709,  Accuracy: 2.1041  AvgTime: 2.75ms
Step: 1,  Iteration: 80,  Cost: 1.9741,  Accuracy: 1.9618  AvgTime: 3.46ms
Step: 1,  Iteration: 90,  Cost: 1.7745,  Accuracy: 1.7105  AvgTime: 3.10ms
Step: 1,  Iteration: 100,  Cost: 1.4761,  Accuracy: 1.3430  AvgTime: 3.14ms
Step: 1,  Iteration: 110,  Cost: 0.9557,  Accuracy: 0.9771  AvgTime: 2.75ms
Step: 1,  Iteration: 120,  Cost: 0.7194,  Accuracy: 0.7386  AvgTime: 4.08ms
Step: 1,  Iteration: 130,  Cost: 0.6084,  Accuracy: 0.6058  AvgTime: 3.89ms
Step: 1,  Iteration: 140,  Cost: 0.6284,  Accuracy: 0.5267  AvgTime: 2.95ms
Step: 1,  Iteration: 150,  Cost: 0.6367,  Accuracy: 0.5186  AvgTime: 2.90ms
Step: 1,  Iteration: 160,  Cost: 0.4401,  Accuracy: 0.4379  AvgTime: 3.46ms
Step: 1,  Iteration: 170,  Cost: 0.4324,  Accuracy: 0.3888  AvgTime: 2.76ms
Step: 1,  Iteration: 180,  Cost: 0.4951,  Accuracy: 0.3730  AvgTime: 3.34ms
Step: 1,  Iteration: 190,  Cost: 0.5490,  Accuracy: 0.3541  AvgTime: 2.93ms
Step: 1,  Iteration: 200,  Cost: 0.3574,  Accuracy: 0.3452  AvgTime: 2.78ms
Step: 1,  Iteration: 210,  Cost: 0.3405,  Accuracy: 0.3523  AvgTime: 2.88ms
Step: 1,  Iteration: 220,  Cost: 0.4327,  Accuracy: 0.3089  AvgTime: 4.03ms
Step: 1,  Iteration: 230,  Cost: 0.3922,  Accuracy: 0.2930  AvgTime: 2.70ms
Step: 1,  Iteration: 240,  Cost: 0.2659,  Accuracy: 0.2849  AvgTime: 2.99ms
Step: 1,  Iteration: 250,  Cost: 0.5087,  Accuracy: 0.2948  AvgTime: 3.91ms
Step: 1,  Iteration: 260,  Cost: 0.2864,  Accuracy: 0.2718  AvgTime: 2.98ms
Step: 1,  Iteration: 270,  Cost: 0.1802,  Accuracy: 0.2785  AvgTime: 2.58ms
Step: 1,  Iteration: 280,  Cost: 0.2760,  Accuracy: 0.2509  AvgTime: 2.55ms
Step: 1,  Iteration: 290,  Cost: 0.1846,  Accuracy: 0.2383  AvgTime: 3.75ms
Step: 1,  Iteration: 300,  Cost: 0.4293,  Accuracy: 0.2611  AvgTime: 2.68ms
Step: 1,  Iteration: 310,  Cost: 0.2681,  Accuracy: 0.2281  AvgTime: 3.03ms
Step: 1,  Iteration: 320,  Cost: 0.2917,  Accuracy: 0.2271  AvgTime: 2.87ms
Step: 1,  Iteration: 330,  Cost: 0.2379,  Accuracy: 0.2331  AvgTime: 3.86ms
Step: 1,  Iteration: 340,  Cost: 0.3044,  Accuracy: 0.2149  AvgTime: 2.78ms
Step: 1,  Iteration: 350,  Cost: 0.1893,  Accuracy: 0.2080  AvgTime: 14.89ms
Step: 1,  Iteration: 360,  Cost: 0.1554,  Accuracy: 0.1917  AvgTime: 7.54ms
Step: 1,  Iteration: 370,  Cost: 0.2736,  Accuracy: 0.1855  AvgTime: 6.93ms
Step: 1,  Iteration: 380,  Cost: 0.2343,  Accuracy: 0.1846  AvgTime: 7.23ms
Step: 1,  Iteration: 390,  Cost: 0.2085,  Accuracy: 0.1787  AvgTime: 9.16ms
Step: 1,  Iteration: 400,  Cost: 0.0910,  Accuracy: 0.1561  AvgTime: 6.01ms
Step: 1,  Iteration: 410,  Cost: 0.2396,  Accuracy: 0.1581  AvgTime: 7.86ms
Step: 1,  Iteration: 420,  Cost: 0.2121,  Accuracy: 0.1534  AvgTime: 6.35ms
Step: 1,  Iteration: 430,  Cost: 0.1069,  Accuracy: 0.1415  AvgTime: 5.18ms
Step: 1,  Iteration: 440,  Cost: 0.0921,  Accuracy: 0.1407  AvgTime: 6.24ms
Step: 1,  Iteration: 450,  Cost: 0.2879,  Accuracy: 0.1395  AvgTime: 5.01ms
Step: 1,  Iteration: 460,  Cost: 0.0621,  Accuracy: 0.1356  AvgTime: 6.41ms
Step: 1,  Iteration: 470,  Cost: 0.0545,  Accuracy: 0.1272  AvgTime: 6.20ms
Step: 1,  Iteration: 480,  Cost: 0.1028,  Accuracy: 0.1294  AvgTime: 4.86ms
Step: 1,  Iteration: 490,  Cost: 0.0735,  Accuracy: 0.1329  AvgTime: 5.35ms
Step: 1,  Iteration: 500,  Cost: 0.2314,  Accuracy: 0.1214  AvgTime: 6.54ms
Step: 1,  Iteration: 510,  Cost: 0.1395,  Accuracy: 0.1489  AvgTime: 6.54ms
Step: 1,  Iteration: 520,  Cost: 0.0821,  Accuracy: 0.1201  AvgTime: 4.28ms
Step: 1,  Iteration: 530,  Cost: 0.1252,  Accuracy: 0.1169  AvgTime: 3.14ms
Step: 1,  Iteration: 540,  Cost: 0.1443,  Accuracy: 0.1175  AvgTime: 6.44ms
Step: 1,  Iteration: 550,  Cost: 0.1121,  Accuracy: 0.1134  AvgTime: 7.80ms
Step: 1,  Iteration: 560,  Cost: 0.0879,  Accuracy: 0.1157  AvgTime: 6.35ms
Step: 1,  Iteration: 570,  Cost: 0.0860,  Accuracy: 0.1206  AvgTime: 5.99ms
Step: 1,  Iteration: 580,  Cost: 0.0624,  Accuracy: 0.1001  AvgTime: 5.28ms
Step: 1,  Iteration: 590,  Cost: 0.0835,  Accuracy: 0.1107  AvgTime: 6.77ms
Step: 1,  Iteration: 600,  Cost: 0.0793,  Accuracy: 0.0997  AvgTime: 6.92ms
Step: 1,  Iteration: 610,  Cost: 0.0886,  Accuracy: 0.1087  AvgTime: 6.16ms
Step: 1,  Iteration: 620,  Cost: 0.0593,  Accuracy: 0.1035  AvgTime: 6.40ms
Step: 1,  Iteration: 630,  Cost: 0.1416,  Accuracy: 0.1104  AvgTime: 6.04ms
Step: 1,  Iteration: 640,  Cost: 0.0872,  Accuracy: 0.0974  AvgTime: 6.47ms
Step: 1,  Iteration: 650,  Cost: 0.1216,  Accuracy: 0.1080  AvgTime: 7.88ms
Step: 1,  Iteration: 660,  Cost: 0.0221,  Accuracy: 0.1064  AvgTime: 5.23ms
Step: 1,  Iteration: 670,  Cost: 0.0683,  Accuracy: 0.1044  AvgTime: 5.77ms
Step: 1,  Iteration: 680,  Cost: 0.0622,  Accuracy: 0.0976  AvgTime: 6.75ms
Step: 1,  Iteration: 690,  Cost: 0.0619,  Accuracy: 0.0982  AvgTime: 6.31ms
Step: 1,  Iteration: 700,  Cost: 0.0932,  Accuracy: 0.0959  AvgTime: 7.28ms
Step: 1,  Iteration: 710,  Cost: 0.1297,  Accuracy: 0.0986  AvgTime: 5.82ms
Step: 1,  Iteration: 720,  Cost: 0.0256,  Accuracy: 0.0974  AvgTime: 6.17ms
Step: 1,  Iteration: 730,  Cost: 0.0519,  Accuracy: 0.1098  AvgTime: 8.14ms
Step: 1,  Iteration: 740,  Cost: 0.1260,  Accuracy: 0.1044  AvgTime: 6.30ms
Step: 1,  Iteration: 750,  Cost: 0.0849,  Accuracy: 0.0933  AvgTime: 7.17ms
Step: 1,  Iteration: 760,  Cost: 0.1534,  Accuracy: 0.0957  AvgTime: 7.38ms
Step: 1,  Iteration: 770,  Cost: 0.0844,  Accuracy: 0.0881  AvgTime: 7.31ms
Step: 1,  Iteration: 780,  Cost: 0.0275,  Accuracy: 0.0918  AvgTime: 5.93ms
Step: 1,  Iteration: 790,  Cost: 0.0378,  Accuracy: 0.0937  AvgTime: 5.22ms
Step: 1,  Iteration: 800,  Cost: 0.1158,  Accuracy: 0.0941  AvgTime: 6.59ms
Step: 1,  Iteration: 810,  Cost: 0.0494,  Accuracy: 0.0939  AvgTime: 5.55ms
Step: 1,  Iteration: 820,  Cost: 0.0642,  Accuracy: 0.0919  AvgTime: 6.05ms
Step: 1,  Iteration: 830,  Cost: 0.1243,  Accuracy: 0.0877  AvgTime: 6.80ms
Step: 1,  Iteration: 840,  Cost: 0.0408,  Accuracy: 0.0822  AvgTime: 5.83ms
Step: 1,  Iteration: 850,  Cost: 0.0465,  Accuracy: 0.0955  AvgTime: 7.31ms
Step: 1,  Iteration: 860,  Cost: 0.1252,  Accuracy: 0.0861  AvgTime: 6.74ms
Step: 1,  Iteration: 870,  Cost: 0.0487,  Accuracy: 0.0919  AvgTime: 5.77ms
Step: 1,  Iteration: 880,  Cost: 0.0459,  Accuracy: 0.0918  AvgTime: 6.60ms
Step: 1,  Iteration: 890,  Cost: 0.0443,  Accuracy: 0.0812  AvgTime: 6.51ms
Step: 1,  Iteration: 900,  Cost: 0.0634,  Accuracy: 0.0888  AvgTime: 7.34ms
Step: 1,  Iteration: 910,  Cost: 0.0304,  Accuracy: 0.0806  AvgTime: 3.48ms
Step: 1,  Iteration: 920,  Cost: 0.0203,  Accuracy: 0.0835  AvgTime: 1.95ms
^Z
[1]+  Stopped                 python3.7 3_DistributedTraining_Keras.py --job_name="worker" --task_index=0

WORKER 1 Output:

WORKER 1 output:

Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:49.526217 139767475332928 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.

2019-07-31 14:35:49.527295: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:49.558593: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:49.563598: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x561bc176c460 executing computations on platform Host. Devices:
2019-07-31 14:35:49.563664: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:49.565682: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:49.565875: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:49.569324: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2224
W0731 14:35:50.856184 139767475332928 deprecation.py:323] From 3_DistributedTraining_Keras.py:52: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0731 14:35:50.856578 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0731 14:35:50.856914 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
W0731 14:35:51.327049 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
W0731 14:35:51.328959 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
W0731 14:35:51.427278 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Data loaded
W0731 14:35:51.817702 139767475332928 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0731 14:35:51.869127 139767475332928 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 512)               401920    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                5130      
=================================================================
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
W0731 14:35:52.149781 139767475332928 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1423: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0731 14:35:52.404864 139767475332928 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0731 14:35:52.554406 139767475332928 deprecation.py:323] From 3_DistributedTraining_Keras.py:150: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
Waiting for other servers
Step: 1,  Iteration:  0,  Cost: 0.2173,  Accuracy: 0.2109  AvgTime: 131.15ms
Step: 1,  Iteration: 10,  Cost: 0.1339,  Accuracy: 0.1955  AvgTime: 8.02ms
Step: 1,  Iteration: 20,  Cost: 0.3180,  Accuracy: 0.1812  AvgTime: 9.24ms
Step: 1,  Iteration: 30,  Cost: 0.2283,  Accuracy: 0.1766  AvgTime: 7.34ms
Step: 1,  Iteration: 40,  Cost: 0.1412,  Accuracy: 0.1755  AvgTime: 7.67ms
Step: 1,  Iteration: 50,  Cost: 0.1774,  Accuracy: 0.1597  AvgTime: 7.21ms
Step: 1,  Iteration: 60,  Cost: 0.0932,  Accuracy: 0.1559  AvgTime: 5.60ms
Step: 1,  Iteration: 70,  Cost: 0.1590,  Accuracy: 0.2572  AvgTime: 5.78ms
Step: 1,  Iteration: 80,  Cost: 0.1236,  Accuracy: 0.1754  AvgTime: 6.21ms
Step: 1,  Iteration: 90,  Cost: 0.0850,  Accuracy: 0.1614  AvgTime: 5.17ms
Step: 1,  Iteration: 100,  Cost: 0.1484,  Accuracy: 0.1531  AvgTime: 5.34ms
Step: 1,  Iteration: 110,  Cost: 0.1179,  Accuracy: 0.1272  AvgTime: 6.42ms
Step: 1,  Iteration: 120,  Cost: 0.1411,  Accuracy: 0.1315  AvgTime: 6.00ms
Step: 1,  Iteration: 130,  Cost: 0.1211,  Accuracy: 0.1333  AvgTime: 6.73ms
Step: 1,  Iteration: 140,  Cost: 0.0522,  Accuracy: 0.1198  AvgTime: 5.69ms
Step: 1,  Iteration: 150,  Cost: 0.1244,  Accuracy: 0.1447  AvgTime: 5.23ms
Step: 1,  Iteration: 160,  Cost: 0.1077,  Accuracy: 0.1173  AvgTime: 6.49ms
Step: 1,  Iteration: 170,  Cost: 0.1420,  Accuracy: 0.1162  AvgTime: 5.26ms
Step: 1,  Iteration: 180,  Cost: 0.0956,  Accuracy: 0.1074  AvgTime: 7.24ms
Step: 1,  Iteration: 190,  Cost: 0.1855,  Accuracy: 0.1113  AvgTime: 5.88ms
Step: 1,  Iteration: 200,  Cost: 0.0764,  Accuracy: 0.1101  AvgTime: 8.10ms
Step: 1,  Iteration: 210,  Cost: 0.0315,  Accuracy: 0.1318  AvgTime: 5.23ms
Step: 1,  Iteration: 220,  Cost: 0.1038,  Accuracy: 0.1200  AvgTime: 5.42ms
Step: 1,  Iteration: 230,  Cost: 0.0580,  Accuracy: 0.1029  AvgTime: 5.99ms
Step: 1,  Iteration: 240,  Cost: 0.0765,  Accuracy: 0.1068  AvgTime: 6.40ms
Step: 1,  Iteration: 250,  Cost: 0.1655,  Accuracy: 0.1030  AvgTime: 5.89ms
Step: 1,  Iteration: 260,  Cost: 0.0815,  Accuracy: 0.1023  AvgTime: 6.38ms
Step: 1,  Iteration: 270,  Cost: 0.0593,  Accuracy: 0.1095  AvgTime: 5.73ms
Step: 1,  Iteration: 280,  Cost: 0.1392,  Accuracy: 0.0979  AvgTime: 6.25ms
Step: 1,  Iteration: 290,  Cost: 0.0720,  Accuracy: 0.0939  AvgTime: 5.22ms
Step: 1,  Iteration: 300,  Cost: 0.1074,  Accuracy: 0.1013  AvgTime: 6.45ms
Step: 1,  Iteration: 310,  Cost: 0.1495,  Accuracy: 0.1061  AvgTime: 7.42ms
Step: 1,  Iteration: 320,  Cost: 0.0561,  Accuracy: 0.1017  AvgTime: 6.39ms
Step: 1,  Iteration: 330,  Cost: 0.1275,  Accuracy: 0.0980  AvgTime: 5.33ms
Step: 1,  Iteration: 340,  Cost: 0.0255,  Accuracy: 0.0902  AvgTime: 6.38ms
Step: 1,  Iteration: 350,  Cost: 0.0542,  Accuracy: 0.0900  AvgTime: 5.57ms
Step: 1,  Iteration: 360,  Cost: 0.1004,  Accuracy: 0.0918  AvgTime: 5.77ms
Step: 1,  Iteration: 370,  Cost: 0.0864,  Accuracy: 0.0920  AvgTime: 5.29ms
Step: 1,  Iteration: 380,  Cost: 0.0445,  Accuracy: 0.0908  AvgTime: 6.40ms
Step: 1,  Iteration: 390,  Cost: 0.1194,  Accuracy: 0.0931  AvgTime: 6.35ms
Step: 1,  Iteration: 400,  Cost: 0.0528,  Accuracy: 0.0944  AvgTime: 7.30ms
Step: 1,  Iteration: 410,  Cost: 0.0455,  Accuracy: 0.0890  AvgTime: 6.10ms
Step: 1,  Iteration: 420,  Cost: 0.0811,  Accuracy: 0.0866  AvgTime: 6.93ms
Step: 1,  Iteration: 430,  Cost: 0.0851,  Accuracy: 0.0881  AvgTime: 5.05ms
Step: 1,  Iteration: 440,  Cost: 0.0598,  Accuracy: 0.0826  AvgTime: 5.60ms
Step: 1,  Iteration: 450,  Cost: 0.1386,  Accuracy: 0.0960  AvgTime: 5.10ms
Step: 1,  Iteration: 460,  Cost: 0.0387,  Accuracy: 0.0922  AvgTime: 6.45ms
Step: 1,  Iteration: 470,  Cost: 0.0139,  Accuracy: 0.0833  AvgTime: 5.73ms
Step: 1,  Iteration: 480,  Cost: 0.0370,  Accuracy: 0.0885  AvgTime: 5.66ms
Step: 1,  Iteration: 490,  Cost: 0.0151,  Accuracy: 0.0863  AvgTime: 6.04ms
Step: 1,  Iteration: 500,  Cost: 0.0458,  Accuracy: 0.0860  AvgTime: 6.72ms
Step: 1,  Iteration: 510,  Cost: 0.0332,  Accuracy: 0.0995  AvgTime: 8.07ms
Step: 1,  Iteration: 520,  Cost: 0.0347,  Accuracy: 0.0982  AvgTime: 6.08ms
^CTraceback (most recent call last):
  File "3_DistributedTraining_Keras.py", line 157, in <module>
    train(train_op, total_loss, global_step, step)
  File "3_DistributedTraining_Keras.py", line 118, in train
    targets: mnist.test.labels})
  File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 949, in run
    run_metadata_ptr)
  File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1172, in _run
    feed_dict_tensor, options, run_metadata)
  File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1350, in _do_run
    run_metadata)
  File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1356, in _do_call
    return fn(*args)
  File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1341, in _run_fn
    target_list, run_metadata)
  File "/home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/client/session.py", line 1434, in _call_tf_sessionrun
    run_metadata)
KeyboardInterrupt
^Z
[1]+  Stopped                 python3.7 3_DistributedTraining_Keras.py --job_name="worker" --task_index=1

WORKER 2 Output:

WORKER 2 output:

Using TensorFlow backend.
WARNING: Logging before flag parsing goes to stderr.
W0731 14:35:51.255171 140154457438016 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.train.Server is deprecated. Please use tf.distribute.Server instead.

2019-07-31 14:35:51.257948: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2019-07-31 14:35:51.286525: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2494245000 Hz
2019-07-31 14:35:51.286844: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x561d7721a460 executing computations on platform Host. Devices:
2019-07-31 14:35:51.286869: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>
2019-07-31 14:35:51.288613: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job ps -> {0 -> localhost:2222}
2019-07-31 14:35:51.288631: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:254] Initialize GrpcChannelCache for job worker -> {0 -> localhost:2223, 1 -> localhost:2224, 2 -> localhost:2225}
2019-07-31 14:35:51.289537: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:2225
W0731 14:35:52.914685 140154457438016 deprecation.py:323] From 3_DistributedTraining_Keras.py:52: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0731 14:35:52.915120 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0731 14:35:52.915565 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-images-idx3-ubyte.gz
W0731 14:35:53.339766 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
W0731 14:35:53.340977 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
W0731 14:35:53.428383 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Data loaded
W0731 14:35:53.791190 140154457438016 module_wrapper.py:136] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0731 14:35:53.841835 140154457438016 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 512)               401920    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                5130      
=================================================================
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
_________________________________________________________________
W0731 14:35:54.131115 140154457438016 deprecation.py:323] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/ops/math_grad.py:1423: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0731 14:35:54.364125 140154457438016 deprecation.py:506] From /home/katarina/miniconda3/lib/python3.7/site-packages/tensorflow_core/python/training/rmsprop.py:119: calling Ones.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0731 14:35:54.570417 140154457438016 deprecation.py:323] From 3_DistributedTraining_Keras.py:150: Supervisor.__init__ (from tensorflow.python.training.supervisor) is deprecated and will be removed in a future version.
Instructions for updating:
Please switch to tf.train.MonitoredTrainingSession
Waiting for other servers
Step: 1,  Iteration:  0,  Cost: 0.2419,  Accuracy: 0.2078  AvgTime: 128.85ms
Step: 1,  Iteration: 10,  Cost: 0.3279,  Accuracy: 0.2523  AvgTime: 9.07ms
Step: 1,  Iteration: 20,  Cost: 0.2569,  Accuracy: 0.2523  AvgTime: 6.31ms
Step: 1,  Iteration: 30,  Cost: 0.1381,  Accuracy: 0.2024  AvgTime: 7.35ms
Step: 1,  Iteration: 40,  Cost: 0.1605,  Accuracy: 0.1673  AvgTime: 6.83ms
Step: 1,  Iteration: 50,  Cost: 0.1673,  Accuracy: 0.1972  AvgTime: 4.73ms
Step: 1,  Iteration: 60,  Cost: 0.2967,  Accuracy: 0.2268  AvgTime: 6.00ms
Step: 1,  Iteration: 70,  Cost: 0.3198,  Accuracy: 0.1862  AvgTime: 7.29ms
Step: 1,  Iteration: 80,  Cost: 0.1704,  Accuracy: 0.1752  AvgTime: 4.52ms
Step: 1,  Iteration: 90,  Cost: 0.1098,  Accuracy: 0.1448  AvgTime: 4.79ms
Step: 1,  Iteration: 100,  Cost: 0.1639,  Accuracy: 0.1544  AvgTime: 5.35ms
Step: 1,  Iteration: 110,  Cost: 0.1918,  Accuracy: 0.1670  AvgTime: 5.80ms
Step: 1,  Iteration: 120,  Cost: 0.1761,  Accuracy: 0.1344  AvgTime: 5.51ms
Step: 1,  Iteration: 130,  Cost: 0.0982,  Accuracy: 0.1440  AvgTime: 4.79ms
Step: 1,  Iteration: 140,  Cost: 0.1023,  Accuracy: 0.1425  AvgTime: 6.43ms
Step: 1,  Iteration: 150,  Cost: 0.1394,  Accuracy: 0.1168  AvgTime: 7.08ms
Step: 1,  Iteration: 160,  Cost: 0.0528,  Accuracy: 0.1153  AvgTime: 6.02ms
Step: 1,  Iteration: 170,  Cost: 0.0762,  Accuracy: 0.1162  AvgTime: 5.03ms
Step: 1,  Iteration: 180,  Cost: 0.1318,  Accuracy: 0.1172  AvgTime: 4.80ms
Step: 1,  Iteration: 190,  Cost: 0.0767,  Accuracy: 0.1140  AvgTime: 5.67ms
Step: 1,  Iteration: 200,  Cost: 0.0308,  Accuracy: 0.1056  AvgTime: 5.85ms
Step: 1,  Iteration: 210,  Cost: 0.1512,  Accuracy: 0.1070  AvgTime: 6.82ms
Step: 1,  Iteration: 220,  Cost: 0.1435,  Accuracy: 0.1104  AvgTime: 5.16ms
Step: 1,  Iteration: 230,  Cost: 0.0523,  Accuracy: 0.1194  AvgTime: 5.47ms
Step: 1,  Iteration: 240,  Cost: 0.0741,  Accuracy: 0.1011  AvgTime: 6.94ms
Step: 1,  Iteration: 250,  Cost: 0.0468,  Accuracy: 0.1207  AvgTime: 6.44ms
Step: 1,  Iteration: 260,  Cost: 0.0629,  Accuracy: 0.1012  AvgTime: 5.15ms
Step: 1,  Iteration: 270,  Cost: 0.1853,  Accuracy: 0.0958  AvgTime: 7.51ms
Step: 1,  Iteration: 280,  Cost: 0.0583,  Accuracy: 0.0932  AvgTime: 6.40ms
Step: 1,  Iteration: 290,  Cost: 0.0445,  Accuracy: 0.1011  AvgTime: 6.16ms
Step: 1,  Iteration: 300,  Cost: 0.1400,  Accuracy: 0.0997  AvgTime: 6.11ms
Step: 1,  Iteration: 310,  Cost: 0.0613,  Accuracy: 0.0969  AvgTime: 6.11ms
Step: 1,  Iteration: 320,  Cost: 0.0854,  Accuracy: 0.0921  AvgTime: 6.13ms
Step: 1,  Iteration: 330,  Cost: 0.0611,  Accuracy: 0.0950  AvgTime: 4.06ms
Step: 1,  Iteration: 340,  Cost: 0.0936,  Accuracy: 0.0949  AvgTime: 6.53ms
Step: 1,  Iteration: 350,  Cost: 0.1440,  Accuracy: 0.1392  AvgTime: 6.24ms
Step: 1,  Iteration: 360,  Cost: 0.0144,  Accuracy: 0.0963  AvgTime: 7.43ms
Step: 1,  Iteration: 370,  Cost: 0.0853,  Accuracy: 0.1159  AvgTime: 4.50ms
Step: 1,  Iteration: 380,  Cost: 0.0935,  Accuracy: 0.1051  AvgTime: 4.50ms
Step: 1,  Iteration: 390,  Cost: 0.0355,  Accuracy: 0.1064  AvgTime: 5.98ms
Step: 1,  Iteration: 400,  Cost: 0.0590,  Accuracy: 0.0945  AvgTime: 5.91ms
Step: 1,  Iteration: 410,  Cost: 0.0952,  Accuracy: 0.0982  AvgTime: 5.27ms
Step: 1,  Iteration: 420,  Cost: 0.3120,  Accuracy: 0.0899  AvgTime: 5.07ms
Step: 1,  Iteration: 430,  Cost: 0.0367,  Accuracy: 0.0937  AvgTime: 6.50ms
Step: 1,  Iteration: 440,  Cost: 0.0351,  Accuracy: 0.0941  AvgTime: 6.23ms
Step: 1,  Iteration: 450,  Cost: 0.0805,  Accuracy: 0.0895  AvgTime: 5.38ms
Step: 1,  Iteration: 460,  Cost: 0.0705,  Accuracy: 0.0950  AvgTime: 6.06ms
Step: 1,  Iteration: 470,  Cost: 0.0208,  Accuracy: 0.0925  AvgTime: 5.52ms
Step: 1,  Iteration: 480,  Cost: 0.0083,  Accuracy: 0.0976  AvgTime: 4.99ms
Step: 1,  Iteration: 490,  Cost: 0.0235,  Accuracy: 0.0851  AvgTime: 5.91ms
Step: 1,  Iteration: 500,  Cost: 0.0242,  Accuracy: 0.0861  AvgTime: 6.27ms
Step: 1,  Iteration: 510,  Cost: 0.0483,  Accuracy: 0.0927  AvgTime: 4.62ms
Step: 1,  Iteration: 520,  Cost: 0.2033,  Accuracy: 0.1109  AvgTime: 5.73ms
Step: 1,  Iteration: 530,  Cost: 0.0694,  Accuracy: 0.0898  AvgTime: 6.71ms
Step: 1,  Iteration: 540,  Cost: 0.0781,  Accuracy: 0.0820  AvgTime: 4.75ms
^Z
[1]+  Stopped                 python3.7 3_DistributedTraining_Keras.py --job_name="worker" --task_index=2

vpj/keras_distributed.py

vamsinimmala1992 commented Dec 7, 2018

tnybny commented Apr 16, 2019

katarina-cavar commented Aug 23, 2019 •

edited

Loading

	'''
	The code is inspired from François Chollet's answer to the following quora question[1] and distributed tensorflow tutorial[2].

	It runs the Keras MNIST mlp example across multiple servers.

	This sample code runs multiple processes on a single host. It can be configured
	to run on multiple hosts simply by chaning the host names given in ClusterSpec.

	Training the model:

	Start the parameter server
	python keras_distributed.py --job_name="ps" --task_index=0

	Start the three workers
	python keras_distributed.py --job_name="worker" --task_index=0
	python keras_distributed.py --job_name="worker" --task_index=1
	python keras_distributed.py --job_name="worker" --task_index=2

	[1] https://www.quora.com/What-is-the-state-of-distributed-learning-multi-GPU-and-across-multiple-hosts-in-Keras-and-what-are-the-future-plans
	[2] https://www.tensorflow.org/deploy/distributed
	'''

	import tensorflow as tf
	import keras

	# Define input flags to identify the job and task
	tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
	tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
	FLAGS = tf.app.flags.FLAGS

	# Create a tensorflow cluster
	# Replace localhost with the host names if you are running on multiple hosts
	cluster = tf.train.ClusterSpec({"ps": ["localhost:2222"],
	"worker": [ "localhost:2223",
	"localhost:2224",
	"localhost:2225"]})

	# Start the server
	server = tf.train.Server(cluster,
	job_name=FLAGS.job_name,
	task_index=FLAGS.task_index)

	# Configurations
	batch_size = 128
	learning_rate = 0.0005
	training_iterations = 100
	num_classes = 10
	log_frequency = 10

	# Load mnist data
	def load_data():
	global mnist
	from tensorflow.examples.tutorials.mnist import input_data
	mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
	print("Data loaded")

	# Create Keras model
	def create_model():
	from keras.models import Sequential
	from keras.layers import Dense, Dropout
	model = Sequential()
	model.add(Dense(512, activation='relu', input_shape=(784,)))
	model.add(Dropout(0.2))
	model.add(Dense(512, activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(10, activation='softmax'))

	model.summary()

	return model

	# Create the optimizer
	# We cannot use model.compile and model.fit
	def create_optimizer(model, targets):
	predictions = model.output
	loss = tf.reduce_mean(
	keras.losses.categorical_crossentropy(targets, predictions))

	# Only if you have regularizers, not in this example
	total_loss = loss * 1.0 # Copy
	for regularizer_loss in model.losses:
	tf.assign_add(total_loss, regularizer_loss)

	optimizer = tf.train.RMSPropOptimizer(learning_rate)

	# Barrier to compute gradients after updating moving avg of batch norm
	with tf.control_dependencies(model.updates):
	barrier = tf.no_op(name="update_barrier")

	with tf.control_dependencies([barrier]):
	grads = optimizer.compute_gradients(
	total_loss,
	model.trainable_weights)
	grad_updates = optimizer.apply_gradients(grads)

	with tf.control_dependencies([grad_updates]):
	train_op = tf.identity(total_loss, name="train")

	return (train_op, total_loss, predictions)

	# Train the model (a single step)
	def train(train_op, total_loss, global_step, step):
	import time
	start_time = time.time()
	batch_x, batch_y = mnist.train.next_batch(batch_size)

	# perform the operations we defined earlier on batch
	loss_value, step_value = sess.run(
	[train_op, global_step],
	feed_dict={
	model.inputs[0]: batch_x,
	targets: batch_y})

	if step % log_frequency == 0:
	elapsed_time = time.time() - start_time
	start_time = time.time()
	accuracy = sess.run(total_loss,
	feed_dict={
	model.inputs[0]: mnist.test.images,
	targets: mnist.test.labels})
	print("Step: %d," % (step_value + 1),
	" Iteration: %2d," % step,
	" Cost: %.4f," % loss_value,
	" Accuracy: %.4f" % accuracy,
	" AvgTime: %3.2fms" % float(elapsed_time * 1000 / log_frequency))


	if FLAGS.job_name == "ps":
	server.join()
	elif FLAGS.job_name == "worker":
	load_data()

	# Assign operations to local server
	with tf.device(tf.train.replica_device_setter(
	worker_device="/job:worker/task:%d" % FLAGS.task_index,
	cluster=cluster)):
	keras.backend.set_learning_phase(1)
	keras.backend.manual_variable_initialization(True)
	model = create_model()
	targets = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")
	train_op, total_loss, predictions = create_optimizer(model, targets)

	global_step = tf.get_variable('global_step', [],
	initializer=tf.constant_initializer(0),
	trainable=False)
	init_op = tf.global_variables_initializer()

	sv = tf.train.Supervisor(is_chief=(FLAGS.task_index == 0),
	global_step=global_step,
	logdir="/tmp/train_logs",
	save_model_secs=600,
	init_op=init_op)

	print("Waiting for other servers")
	with sv.managed_session(server.target) as sess:
	keras.backend.set_session(sess)
	step = 0
	while not sv.should_stop() and step < 1000000:
	train(train_op, total_loss, global_step, step)
	step += 1

	sv.stop()
	print("done")

vpj/keras_distributed.py

vamsinimmala1992 commented Dec 7, 2018

tnybny commented Apr 16, 2019

katarina-cavar commented Aug 23, 2019 • edited Loading

SUMMARY

RUN OUTPUTS

PS 0 output:

WORKER 0 output:

WORKER 1 output:

WORKER 2 output:

katarina-cavar commented Aug 23, 2019 •

edited

Loading