Skip to content

Instantly share code, notes, and snippets.

@lapp0
Created November 29, 2024 08:06
Show Gist options
  • Save lapp0/ff6f10c3cd6d0aefb28a49681a44b78c to your computer and use it in GitHub Desktop.
Save lapp0/ff6f10c3cd6d0aefb28a49681a44b78c to your computer and use it in GitHub Desktop.
import os
import sys
# Snapshot this script's own source immediately so it can be written to the log later.
with open(sys.argv[0]) as source_file:
    code = source_file.read()
import uuid
import glob
import time
import contextlib
from dataclasses import dataclass
from typing import Optional
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
# Use of FlexAttention contributed by @KoszarskyB
from torch.nn.attention.flex_attention import flex_attention, create_block_mask
flex_attention = torch.compile(flex_attention, dynamic=False)
create_block_mask = torch.compile(create_block_mask, dynamic=False)
# -----------------------------------------------------------------------------
# Muon optimizer
def zeropower_via_svd(G, steps=None):
    """
    Orthogonalize G exactly via its singular value decomposition.

    With the reduced SVD G = U S V^T, this returns U V^T, i.e. G with every singular value
    replaced by one.

    Arguments:
        G: The 2D matrix to orthogonalize.
        steps: Unused; accepted only so this backend shares a call signature with the
            iterative one.
    """
    # torch.linalg.svd replaces the deprecated Tensor.svd(); it returns V already transposed (Vh).
    U, _, Vh = torch.linalg.svd(G, full_matrices=False)
    return U @ Vh
@torch.compile
def zeropower_via_newtonschulz5(G, steps=10, eps=1e-7):
"""
Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
zero even beyond the point where the iteration no longer converges all the way to one everywhere
on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
performance at all relative to UV^T, where USV^T = G is the SVD.
"""
assert len(G.shape) == 2
a, b, c = (3.4445, -4.7750, 2.0315)
X = G.bfloat16()
X /= (X.norm() + eps) # ensure top singular value <= 1
if G.size(0) > G.size(1):
X = X.T
for _ in range(steps):
A = X @ X.T
B = b * A + c * A @ A # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
X = a * X + B @ X
if G.size(0) > G.size(1):
X = X.T
return X
# Registry mapping each Muon `backend` name to its orthogonalization routine.
zeropower_backends = {
    'svd': zeropower_via_svd,
    'newtonschulz5': zeropower_via_newtonschulz5,
}
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        backend: The chosen backend for the orthogonalization step. (recommended: 'newtonschulz5')
        backend_steps: The number of iteration steps to use in the backend, if it is iterative.
    """

    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True,
                 backend='newtonschulz5', backend_steps=5):
        defaults = {
            'lr': lr,
            'momentum': momentum,
            'nesterov': nesterov,
            'backend': backend,
            'backend_steps': backend_steps,
        }
        super().__init__(params, defaults)

    def step(self):
        """
        Apply one update to every parameter group.

        The work is split across processes: this process only computes updates for parameters
        whose index within the group satisfies `index % WORLD_SIZE == RANK` (both read from the
        environment). The updates are written into a flat bfloat16 buffer, summed across
        processes with an all-reduce, and then applied to every parameter.
        """
        world_size = int(os.environ['WORLD_SIZE'])
        rank = int(os.environ['RANK'])

        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            orthogonalize = zeropower_backends[group['backend']]
            group_params = group['params']

            # generate weight updates in distributed fashion; slots owned by other ranks stay zero
            flat_size = sum(p.numel() for p in group_params)
            updates_flat = torch.zeros(flat_size, device='cuda', dtype=torch.bfloat16)
            offset = 0
            for index, p in enumerate(group_params):
                count = p.numel()
                # luckily this will perfectly distribute a transformer with multiple of 4 layers to 8 GPUs
                if index % world_size == rank:
                    grad = p.grad
                    assert grad is not None
                    state = self.state[p]
                    if 'momentum_buffer' not in state:
                        state['momentum_buffer'] = torch.zeros_like(grad)
                    buf = state['momentum_buffer']
                    buf.mul_(momentum).add_(grad)
                    update = grad.add(buf, alpha=momentum) if group['nesterov'] else buf
                    update = orthogonalize(update, steps=group['backend_steps'])
                    # scale up updates for matrices with more rows than columns
                    update *= max(1, update.size(0)/update.size(1))**0.5
                    updates_flat[offset:offset+count] = update.flatten()
                offset += count

            # sync updates across devices. we are not memory-constrained so can do this simple deserialization
            dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM)

            # deserialize and apply updates
            offset = 0
            for p in group_params:
                count = p.numel()
                update = updates_flat[offset:offset+count].view_as(p.data).type_as(p.data)
                p.data.add_(update, alpha=-lr)
                offset += count
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions for the GPT-2 model
def norm(x):
return F.rms_norm(x, (x.size(-1),))
class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)
class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied to a 4D tensor, with positions taken from dimension 1.

    The cos/sin tables are stored in bfloat16 and rebuilt only when the sequence length differs
    from the cached one.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.dim = dim
        self.base = base
        # lazily-filled cache; everything is (re)computed on the first forward call
        self.inv_freq = None
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def _refresh_cache(self, seq_len, device):
        # Rebuild the inverse frequencies and the per-position cos/sin tables.
        exponents = torch.arange(0, self.dim, 2, device=device).float() / self.dim
        self.inv_freq = 1.0 / (self.base ** exponents)
        self.seq_len_cached = seq_len
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        angles = torch.outer(positions, self.inv_freq)
        self.cos_cached = angles.cos().bfloat16()
        self.sin_cached = angles.sin().bfloat16()

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            self._refresh_cache(seq_len, x.device)
        # add singleton dims at positions 0 and 2 so the tables broadcast against the 4D input
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        assert x.ndim == 4 # multihead attention
        half = x.shape[3]//2
        lower, upper = x[..., :half], x[..., half:]
        rotated_lower = lower * cos + upper * sin
        rotated_upper = lower * (-sin) + upper * cos
        return torch.cat([rotated_lower, rotated_upper], 3).type_as(x)
class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention built on FlexAttention, with QK-norm, rotary embeddings and a
    learned mix between this layer's values and the values `v1` handed in by the caller.
    """

    def __init__(self, dim, n_head, flex_kernel_options=None):
        super().__init__()
        assert dim % n_head == 0
        self.n_head = n_head
        # query / key / value projections
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # value residual lambda
        self.lamb = nn.Parameter(torch.tensor(0.5)) # @Grad62304977
        # rotary embeddings
        head_dim = dim // n_head
        self.rotary = Rotary(head_dim)
        # output projection
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977
        # flex_attention kernel_options
        self.flex_kernel_options = flex_kernel_options

    def forward(self, x, v1, block_mask):
        """Return `(attention output, v1)`; when `v1` is None this layer's own values are returned as v1."""
        B, T = x.shape[:2] # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        per_head = (B, T, self.n_head, -1)
        q = self.c_q(x).view(per_head)
        k = self.c_k(x).view(per_head)
        v = self.c_v(x).view(per_head)
        if v1 is None:
            # This happens if we are in the first block. v needs to be accessed by subsequent blocks
            v1 = v
        v = (1 - self.lamb) * v + self.lamb * v1.view_as(v) # @Grad62304977
        # QK norm suggested by @Grad62304977, followed by the rotary position embedding
        q = self.rotary(norm(q))
        k = self.rotary(norm(k))
        # flex_attention is called with the head dimension ahead of the sequence dimension
        attended = flex_attention(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            block_mask=block_mask,
            kernel_options=self.flex_kernel_options
        )
        # re-assemble all head outputs side by side
        merged = attended.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(merged), v1
class MLP(nn.Module):
    """Feed-forward block: expand to 4x width, apply squared ReLU, project back to `dim`."""

    def __init__(self, dim):
        super().__init__()
        hidden_dim = 4 * dim
        self.c_fc = CastedLinear(dim, hidden_dim)
        self.c_proj = CastedLinear(hidden_dim, dim)
        self.c_proj.weight.data.zero_() # zero init suggested by @Grad62304977

    def forward(self, x):
        # ReLU^2: https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        hidden = F.relu(self.c_fc(x)).square()
        return self.c_proj(hidden)
class Block(nn.Module):
    """
    One transformer layer: a learned blend of the running stream `x` with the initial embedding
    `x0`, followed by attention and MLP sub-layers, each applied to a normalized input and added
    back residually.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.n_embd, config.n_head, config.flex_kernel_options)
        self.mlp = MLP(config.n_embd)
        # mixing weights for (x, x0); initialised so the blend starts out as plain x
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, v1, x0, block_mask):
        """Return `(updated x, v1)`, where v1 is whatever the attention layer hands back."""
        weight_x, weight_x0 = self.lambdas[0], self.lambdas[1]
        x = weight_x * x + weight_x0 * x0
        attn_out, v1 = self.attn(norm(x), v1, block_mask)
        x = x + attn_out
        x = x + self.mlp(norm(x))
        return x, v1
# -----------------------------------------------------------------------------
# The main GPT-2 model
@dataclass
class GPTConfig:
    """Architecture settings for the GPT model."""
    # GPT-2 has only 50257 unique tokens; the vocabulary is padded up to the nearest
    # multiple of 128 for efficiency. suggested by @Grad62304977; this originates from
    # Karpathy's experiments.
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 6  # head dim 128 suggested by @Grad62304977
    n_embd: int = 768
    # optional kernel_options dict passed through to flex_attention
    flex_kernel_options: Optional[dict] = None
class GPT(nn.Module):
    """
    The full language model: token embedding, a stack of `Block`s arranged as an encoder half and
    a decoder half joined by learned skip connections, and an output head. `forward` returns the
    cross-entropy loss directly.
    """

    def __init__(self, config):
        super().__init__()
        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.n_layer // 2 # Half of the layers for encoder
        self.num_decoder_layers = config.n_layer - self.num_encoder_layers # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))

        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
        })
        self.lm_head = CastedLinear(config.n_embd, config.vocab_size)
        self.lm_head.weight.data.zero_() # @Grad62304977

    def forward(self, idx, target, attn_blocksize):
        """
        Compute the loss for one sequence.

        Arguments:
            idx: 1D tensor of input token ids (a batch dimension of size 1 is added internally).
            target: Tensor of target token ids, flattened before the loss.
            attn_blocksize: Attention window; a query may only attend to keys fewer than this
                many positions behind it.
        """
        # Every occurrence of token 50256 starts a new document id (cumulative count).
        docs = (idx == 50256).cumsum(0)

        def document_causal_mask(b, h, q_idx, kv_idx):
            # allowed iff causal, within the same document, and inside the sliding window
            is_causal = q_idx >= kv_idx
            same_document = docs[q_idx] == docs[kv_idx]
            in_window = q_idx - kv_idx < attn_blocksize
            return is_causal & same_document & in_window

        S = len(idx)
        block_mask = create_block_mask(document_causal_mask, None, None, S, S, device="cuda", _compile=True)

        # forward the GPT model itself
        x = self.transformer.wte(idx[None]) # token embeddings of shape (b, t, n_embd)
        x = norm(x) # @Grad62304977
        x0 = x
        v1 = None
        blocks = self.transformer.h

        # Encoder pass - process only the first half of the blocks, keeping each output
        # for the U-Net skip connections
        skip_connections = []
        for layer in range(self.num_encoder_layers):
            x, v1 = blocks[layer](x, v1, x0, block_mask)
            skip_connections.append(x)

        # Decoder pass - process the remaining blocks with weighted skip connections
        # (popped last-in-first-out)
        for offset in range(self.num_decoder_layers):
            x = x + self.skip_weights[offset] * skip_connections.pop()
            x, v1 = blocks[self.num_encoder_layers + offset](x, v1, x0, block_mask)

        logits = self.lm_head(norm(x))
        logits = 30 * torch.tanh(logits / 30) # soft-cap logits to (-30, 30); @Grad62304977
        logits = logits.float()
        flat_logits = logits.view(-1, logits.size(-1))
        return F.cross_entropy(flat_logits, target.view(-1))
# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader
def _peek_data_shard(filename):
    """
    Read only the header of a data shard and return the token count it claims.

    The header is 256 int32 values: entry 0 is the magic number 20240520, entry 1 the format
    version (must be 1), and entry 2 the number of tokens.

    Returns:
        The claimed token count as a plain Python int.

    Raises:
        ValueError: if the file is too short to contain a full header.
        SystemExit: (exit code 1, after printing hints) if the magic number does not match.
        AssertionError: if the version is not 1.
    """
    header_nbytes = 256 * 4 # 256 int32 integers (4 bytes each)
    with open(filename, "rb") as f:
        raw = f.read(header_nbytes)
    # Fail with a clear message instead of an IndexError further down.
    if len(raw) < header_nbytes:
        raise ValueError(f"{filename}: expected a {header_nbytes}-byte header but only read {len(raw)} bytes")
    header = np.frombuffer(raw, dtype=np.int32)
    if header[0] != 20240520:
        print("ERROR: magic number mismatch in the data .bin file!")
        print("---> HINT: Are you passing in a correct file with --input_bin?")
        print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README")
        print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try")
        # sys.exit rather than the bare `exit` helper, which only exists when the `site` module is loaded
        sys.exit(1)
    assert header[1] == 1, "unsupported version"
    # for now just return the number of tokens (claimed)
    return int(header[2])
def _load_data_shard(filename):
    """
    Load a whole data shard and return its tokens as a uint16 numpy array.

    The file starts with a header of 256 int32 values (magic number, version, token count);
    everything after the header is read as uint16 tokens. Asserts that the magic number and
    version are as expected and that the number of tokens read equals the header's count.
    """
    header_nbytes = 256 * 4 # 256 int32 integers (4 bytes each)
    with open(filename, "rb") as shard:
        header = np.frombuffer(shard.read(header_nbytes), dtype=np.int32)
        assert header[0] == 20240520, "magic number mismatch in the data .bin file"
        assert header[1] == 1, "unsupported version"
        claimed_ntok = header[2]
        # the rest of the file is the token payload, stored as uint16
        payload = shard.read()
    tokens = np.frombuffer(payload, dtype=np.uint16)
    assert len(tokens) == claimed_ntok, "number of tokens read does not match header?"
    return tokens
class DistributedDataLoader:
    """
    Streams fixed-length token windows from a set of shard files, one window per process.

    Within a shard, process `r` reads the `T + 1` tokens starting at its own offset
    (`r * T` past the shared base position); every call moves the position forward by
    `T * num_processes`. Shards are visited in sorted filename order, wrapping around.
    """

    def __init__(self, filename_pattern, T, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.T = T

        # glob files that match the pattern
        self.files = sorted(glob.glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"

        # validate every shard header and count the total number of tokens;
        # each shard must hold at least one window for every process
        required_tokens = num_processes * T + 1
        running_total = 0
        for path in self.files:
            claimed = _peek_data_shard(path)
            assert claimed >= required_tokens
            running_total += int(claimed)
        self.ntok_total = running_total

        self.reset()

    def reset(self):
        """Go back to the start of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self): # advance to next data shard
        """Load the next shard (wrapping around) and rewind to this process's first window."""
        self.current_shard = (self.current_shard + 1) % len(self.files)
        self.current_position = self.process_rank * self.T
        self.tokens = _load_data_shard(self.files[self.current_shard])

    def next_batch(self):
        """Return `(inputs, targets)` on the GPU; targets are the inputs shifted by one token."""
        stride = self.T * self.num_processes
        start = self.current_position
        window = self.tokens[start:start+self.T+1]
        window = torch.tensor(window.astype(np.int32), dtype=torch.long)
        inputs, targets = window[:-1], window[1:]
        # advance current position and load next shard if necessary
        self.current_position = start + stride
        if self.current_position + stride >= len(self.tokens):
            self.advance()
        return inputs.cuda(), targets.cuda()
# -----------------------------------------------------------------------------
# int main
@dataclass
class Hyperparameters:
    """Run-level settings: data locations, optimization schedule, and evaluation cadence."""
    # --- data ---
    input_bin: str = 'data/fineweb10B/fineweb_train_*.bin'  # input .bin to train on
    input_val_bin: str = 'data/fineweb10B/fineweb_val_*.bin'  # input .bin to eval validation loss on
    # --- optimization ---
    batch_size: int = 8  # batch size, in sequences, across all devices
    sequence_length: int = 64*1024  # sequence length, in tokens
    num_iterations: int = 1750  # number of iterations to run
    warmup_iters: int = 0
    cooldown_iters: int = 640  # number of iterations of linear warmup/cooldown for triangular or trapezoidal schedule
    weight_decay: float = 0
    # --- evaluation and logging ---
    val_loss_every: int = 125  # every how many steps to evaluate val loss? 0 for only at the end
    val_tokens: int = 10485760  # how many tokens of validation data? it's important to keep this fixed for consistent comparisons
    save_every: int = 0  # every how many steps to save the checkpoint? 0 for only at the end
# Choose the run configuration from the first command-line argument:
# "1x4090" selects the single-RTX-4090 preset, anything else (or nothing) the 8xH100 default.
if sys.argv[1:2] == ["1x4090"]:
    args = Hyperparameters(batch_size=16, sequence_length=32*1024)
    model_config = GPTConfig(
        flex_kernel_options={
            # forward
            "BLOCK_M": 64, "BLOCK_N": 64,
            # backwards
            "BLOCK_M1": 32, "BLOCK_N1": 64, "BLOCK_M2": 64, "BLOCK_N2": 32,
        }
    )
else:
    args = Hyperparameters() # default 8xH100
    model_config = GPTConfig()
# set up DDP (distributed data parallel). RANK / LOCAL_RANK / WORLD_SIZE are env variables set by torchrun
assert torch.cuda.is_available()
dist.init_process_group(backend='nccl')
ddp_rank, ddp_local_rank, ddp_world_size = (
    int(os.environ[key]) for key in ('RANK', 'LOCAL_RANK', 'WORLD_SIZE')
)
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
print(f"using device: {device}")
# the rank-0 process is the one that does logging, checkpointing etc.
master_process = ddp_rank == 0

# begin logging: only the master process creates a run directory and a log file
logfile = None
if master_process:
    run_id = str(uuid.uuid4())
    logdir = 'logs/%s/' % run_id
    os.makedirs(logdir, exist_ok=True)
    logfile = 'logs/%s.txt' % run_id
    # create the log file; it opens with this script's own source followed by a separator line
    with open(logfile, "w") as f:
        f.writelines([code, '='*100 + '\n'])
def print0(s, logonly=False):
    """
    Append `s` plus a newline to the log file, and also print it unless `logonly` is set.

    Does nothing at all on processes other than the master process.
    """
    if not master_process:
        return
    with open(logfile, "a") as log_handle:
        if not logonly:
            print(s)
        log_handle.write(s+'\n')
# Record the software/hardware environment: the PyTorch and CUDA versions go to console and log,
# the full `nvidia-smi` output goes to the log file only.
print0(f"Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:")
import subprocess
# capture_output=True is shorthand for piping both stdout and stderr
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)
# convenience variables
T = args.sequence_length
tokens_per_step = T * ddp_world_size
# number of steps to take in the val loop; val_tokens must split evenly across steps
assert args.val_tokens % tokens_per_step == 0
val_steps = args.val_tokens // tokens_per_step
# steps of gradient accumulation required to attain the desired global batch size
assert args.batch_size % ddp_world_size == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, T, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, T, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.ntok_total} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.ntok_total} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training batch up front; the training loop fetches each following one itself
x, y = train_loader.next_batch()

# build the model in bfloat16 on the GPU, then put the CastedLinear layers back in float32
model = GPT(model_config).cuda().bfloat16()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
if hasattr(config, "coordinate_descent_tuning"):
    config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module # always contains the "raw" unwrapped model

# init the optimizer(s): Adam for the embedding, the output head and the <2D parameters,
# Muon for the 2D matrices inside the transformer blocks
adam_kwargs = dict(betas=(0.8, 0.95), fused=True)
optimizer1 = torch.optim.Adam([raw_model.transformer.wte.weight], lr=0.6, **adam_kwargs)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.008, **adam_kwargs)
params = list(raw_model.transformer.h.parameters())
matrix_params, scalar_params = [], []
for p in params:
    if p.ndim == 2:
        matrix_params.append(p)
    elif p.ndim < 2:
        scalar_params.append(p)
scalar_params.append(raw_model.skip_weights)
optimizer3 = Muon(matrix_params, lr=0.05, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.04, **adam_kwargs) # note that this learning rate is neither sensitive nor tuned
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it`.

    The schedule has three phases, all configured through `args`:
      1) linear warmup over the first `warmup_iters` iterations,
      2) a constant multiplier of 1.0,
      3) linear cooldown to 0 over the last `cooldown_iters` iterations.

    `it` may equal `args.num_iterations` (the scheduler is stepped once more after the final
    training step), so a run configured with `cooldown_iters == 0` must not divide by zero there.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while; also covers the no-cooldown case at it == num_iterations
    if it < args.num_iterations - args.cooldown_iters or args.cooldown_iters <= 0:
        return 1.0
    # 3) linear cooldown
    return (args.num_iterations - it) / args.cooldown_iters
# one LambdaLR per optimizer, all driven by the same get_lr multiplier
schedulers = [
    torch.optim.lr_scheduler.LambdaLR(optimizer, get_lr)
    for optimizer in optimizers
]
# Start training loop: zero the accumulated training time, then start the clock
# once all queued GPU work has finished
training_time_ms = 0
torch.cuda.synchronize()
t0 = time.time()
# begin training
# Main loop. Each iteration optionally validates, optionally checkpoints, and then (except on the
# extra final iteration) runs one optimizer step built from `train_accumulation_steps` micro-batches.
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.time()
    # number of steps counted towards step_avg; NaN until the timer reset above has taken effect
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val
    # Set the attention blocksize for the current step, in chunks of 64. By @fernbear.bsky.social
    # (grows linearly with step from 64 at step 0 to 1792 at the final step, rounded down to a multiple of 64)
    attn_blocksize = torch.tensor(64*((step/args.num_iterations * (1792 - 64) + 64)//64), dtype=torch.int, device='cuda')
    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.time() - t0)
        # run validation batches, always starting from the beginning of the validation data
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                x_val, y_val = val_loader.next_batch()
                val_loss += model(x_val, y_val, attn_blocksize=attn_blocksize)
        # average across processes first, then across validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.time()
    # checkpointing happens on the master process only; with save_every == 0 only on the last step
    if master_process and (last_step or (args.save_every > 0 and step % args.save_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.time() - t0)
        # save the state of the training process
        log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
        torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.time()
    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break
    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps+1):
        # gradients are only synchronized across processes on the final micro-batch
        ctx = model.no_sync() if i < train_accumulation_steps else contextlib.nullcontext()
        with ctx: # there's no need to sync gradients every accumulation step
            # forward pass
            loss = model(x, y, attn_blocksize=attn_blocksize)
            # advance the dataset for the next batch
            x, y = train_loader.next_batch()
            # backward pass
            loss.backward()
        # only the most recent micro-batch's loss is kept for logging
        train_loss = loss.detach()
    # turn the accumulated gradient sum into a mean over micro-batches
    for p in model.parameters():
        p.grad /= train_accumulation_steps
    # momentum warmup for Muon: linear from 0.85 to 0.95 over the first 300 steps
    frac = min(step/300, 1)
    optimizer3.param_groups[0]['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    #dist.all_reduce(train_loss, op=dist.ReduceOp.AVG) # all-reducing the training loss would be more correct in terms of logging, but slower
    # approximate because the clock is read without a torch.cuda.synchronize() here
    approx_time = training_time_ms + 1000 * (time.time() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_loss:{train_loss.item():.4f} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")
# After training: the master process prints (to console only, not the log file) the peak
# CUDA memory allocated, converted from bytes to MiB.
if master_process:
    print(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")
# -------------------------------------------------------------------------
# clean up nice: tear down the process group created by dist.init_process_group
dist.destroy_process_group()
====================================================================================================
Running pytorch 2.6.0.dev20241126+cu124 compiled for CUDA 12.4
nvidia-smi:
Fri Nov 29 04:53:49 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.76 Driver Version: 550.76 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4090 On | 00000000:61:00.0 Off | Off |
| 0% 31C P8 26W / 450W | 4MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
====================================================================================================
Training DataLoader: total number of tokens: 1000000000 across 10 files
Validation DataLoader: total number of tokens: 100000000 across 1 files
====================================================================================================
step:0/1750 val_loss:10.8259 train_time:0ms step_avg:nanms
step:1/1750 train_loss:10.8258 train_time:44047ms step_avg:nanms
step:2/1750 train_loss:10.1070 train_time:47871ms step_avg:nanms
step:3/1750 train_loss:8.1877 train_time:51741ms step_avg:nanms
step:4/1750 train_loss:7.8481 train_time:55607ms step_avg:nanms
step:5/1750 train_loss:7.3925 train_time:59471ms step_avg:nanms
step:6/1750 train_loss:7.2481 train_time:63343ms step_avg:nanms
step:7/1750 train_loss:6.8819 train_time:67223ms step_avg:nanms
step:8/1750 train_loss:6.6728 train_time:71119ms step_avg:nanms
step:9/1750 train_loss:6.9420 train_time:75001ms step_avg:nanms
step:10/1750 train_loss:6.4015 train_time:78883ms step_avg:nanms
step:11/1750 train_loss:6.3886 train_time:3883ms step_avg:nanms
step:12/1750 train_loss:6.3826 train_time:7776ms step_avg:nanms
step:13/1750 train_loss:6.3659 train_time:11663ms step_avg:3887.51ms
step:14/1750 train_loss:6.3089 train_time:15553ms step_avg:3888.20ms
step:15/1750 train_loss:6.3283 train_time:19443ms step_avg:3888.55ms
step:16/1750 train_loss:6.2602 train_time:23331ms step_avg:3888.52ms
step:17/1750 train_loss:6.2402 train_time:27221ms step_avg:3888.72ms
step:18/1750 train_loss:6.1276 train_time:31112ms step_avg:3889.05ms
step:19/1750 train_loss:6.1371 train_time:35004ms step_avg:3889.36ms
step:20/1750 train_loss:5.9751 train_time:38895ms step_avg:3889.53ms
step:21/1750 train_loss:6.0147 train_time:42785ms step_avg:3889.58ms
step:22/1750 train_loss:5.9479 train_time:46677ms step_avg:3889.73ms
step:23/1750 train_loss:5.8518 train_time:50567ms step_avg:3889.78ms
step:24/1750 train_loss:5.9653 train_time:54460ms step_avg:3889.97ms
step:25/1750 train_loss:6.0085 train_time:58352ms step_avg:3890.11ms
step:26/1750 train_loss:6.2903 train_time:62245ms step_avg:3890.32ms
step:27/1750 train_loss:5.7136 train_time:66139ms step_avg:3890.52ms
step:28/1750 train_loss:5.6314 train_time:70031ms step_avg:3890.63ms
step:29/1750 train_loss:5.6734 train_time:73927ms step_avg:3890.89ms
step:30/1750 train_loss:5.6962 train_time:77822ms step_avg:3891.11ms
step:31/1750 train_loss:5.5028 train_time:81719ms step_avg:3891.40ms
step:32/1750 train_loss:5.7125 train_time:85616ms step_avg:3891.65ms
step:33/1750 train_loss:5.4414 train_time:89512ms step_avg:3891.83ms
step:34/1750 train_loss:5.6516 train_time:93410ms step_avg:3892.07ms
step:35/1750 train_loss:5.4558 train_time:97307ms step_avg:3892.30ms
step:36/1750 train_loss:5.5557 train_time:101206ms step_avg:3892.55ms
step:37/1750 train_loss:5.4063 train_time:105107ms step_avg:3892.86ms
step:38/1750 train_loss:5.3617 train_time:109006ms step_avg:3893.09ms
step:39/1750 train_loss:5.4849 train_time:112904ms step_avg:3893.25ms
step:40/1750 train_loss:5.4548 train_time:116802ms step_avg:3893.40ms
step:41/1750 train_loss:5.4738 train_time:120702ms step_avg:3893.61ms
step:42/1750 train_loss:5.3319 train_time:124600ms step_avg:3893.76ms
step:43/1750 train_loss:5.4276 train_time:128502ms step_avg:3893.99ms
step:44/1750 train_loss:5.2833 train_time:132401ms step_avg:3894.16ms
step:45/1750 train_loss:5.2759 train_time:136302ms step_avg:3894.36ms
step:46/1750 train_loss:5.2702 train_time:140202ms step_avg:3894.50ms
step:47/1750 train_loss:5.4754 train_time:144101ms step_avg:3894.63ms
step:48/1750 train_loss:5.4112 train_time:148000ms step_avg:3894.74ms
step:49/1750 train_loss:5.2523 train_time:151899ms step_avg:3894.84ms
step:50/1750 train_loss:5.2674 train_time:155798ms step_avg:3894.95ms
step:51/1750 train_loss:5.1538 train_time:159696ms step_avg:3895.03ms
step:52/1750 train_loss:5.0841 train_time:163595ms step_avg:3895.12ms
step:53/1750 train_loss:4.9812 train_time:167493ms step_avg:3895.18ms
step:54/1750 train_loss:5.2741 train_time:171387ms step_avg:3895.16ms
step:55/1750 train_loss:5.1322 train_time:175285ms step_avg:3895.22ms
step:56/1750 train_loss:5.1577 train_time:179183ms step_avg:3895.28ms
step:57/1750 train_loss:5.3600 train_time:183082ms step_avg:3895.35ms
step:58/1750 train_loss:5.0982 train_time:186980ms step_avg:3895.41ms
step:59/1750 train_loss:5.0987 train_time:190873ms step_avg:3895.38ms
step:60/1750 train_loss:4.9204 train_time:194767ms step_avg:3895.34ms
step:61/1750 train_loss:4.9409 train_time:198662ms step_avg:3895.34ms
step:62/1750 train_loss:5.0497 train_time:202556ms step_avg:3895.31ms
step:63/1750 train_loss:4.8994 train_time:206450ms step_avg:3895.29ms
step:64/1750 train_loss:4.9898 train_time:210347ms step_avg:3895.32ms
step:65/1750 train_loss:5.1226 train_time:214244ms step_avg:3895.34ms
step:66/1750 train_loss:5.2665 train_time:218138ms step_avg:3895.31ms
step:67/1750 train_loss:5.3679 train_time:222032ms step_avg:3895.30ms
step:68/1750 train_loss:5.0360 train_time:225926ms step_avg:3895.28ms
step:69/1750 train_loss:5.0196 train_time:229825ms step_avg:3895.34ms
step:70/1750 train_loss:5.0452 train_time:233721ms step_avg:3895.35ms
step:71/1750 train_loss:4.9577 train_time:237614ms step_avg:3895.31ms
step:72/1750 train_loss:4.9461 train_time:241507ms step_avg:3895.28ms
step:73/1750 train_loss:4.9612 train_time:245403ms step_avg:3895.28ms
step:74/1750 train_loss:5.1006 train_time:249298ms step_avg:3895.28ms
step:75/1750 train_loss:5.1172 train_time:253198ms step_avg:3895.35ms
step:76/1750 train_loss:5.0083 train_time:257097ms step_avg:3895.41ms
step:77/1750 train_loss:4.8464 train_time:260993ms step_avg:3895.42ms
step:78/1750 train_loss:4.8726 train_time:264891ms step_avg:3895.46ms
step:79/1750 train_loss:4.7200 train_time:268788ms step_avg:3895.47ms
step:80/1750 train_loss:4.8832 train_time:272683ms step_avg:3895.47ms
step:81/1750 train_loss:4.8341 train_time:276577ms step_avg:3895.45ms
step:82/1750 train_loss:4.8660 train_time:280469ms step_avg:3895.40ms
step:83/1750 train_loss:4.9373 train_time:284363ms step_avg:3895.38ms
step:84/1750 train_loss:4.7855 train_time:288258ms step_avg:3895.37ms
step:85/1750 train_loss:5.0314 train_time:292152ms step_avg:3895.35ms
step:86/1750 train_loss:4.7514 train_time:296042ms step_avg:3895.28ms
step:87/1750 train_loss:4.9885 train_time:299935ms step_avg:3895.26ms
step:88/1750 train_loss:5.6188 train_time:303830ms step_avg:3895.26ms
step:89/1750 train_loss:4.7681 train_time:307723ms step_avg:3895.23ms
step:90/1750 train_loss:4.6971 train_time:311616ms step_avg:3895.20ms
step:91/1750 train_loss:5.1873 train_time:315505ms step_avg:3895.12ms
step:92/1750 train_loss:4.7056 train_time:319397ms step_avg:3895.09ms
step:93/1750 train_loss:4.7128 train_time:323291ms step_avg:3895.08ms
step:94/1750 train_loss:4.9405 train_time:327182ms step_avg:3895.02ms
step:95/1750 train_loss:4.8284 train_time:331072ms step_avg:3894.97ms
step:96/1750 train_loss:4.6037 train_time:334965ms step_avg:3894.94ms
step:97/1750 train_loss:5.2248 train_time:338854ms step_avg:3894.87ms
step:98/1750 train_loss:4.5490 train_time:342747ms step_avg:3894.85ms
step:99/1750 train_loss:4.6016 train_time:346639ms step_avg:3894.82ms
step:100/1750 train_loss:4.6563 train_time:350529ms step_avg:3894.77ms
step:101/1750 train_loss:4.9597 train_time:354418ms step_avg:3894.70ms
step:102/1750 train_loss:4.6806 train_time:358309ms step_avg:3894.66ms
step:103/1750 train_loss:4.5464 train_time:362201ms step_avg:3894.63ms
step:104/1750 train_loss:4.6158 train_time:366095ms step_avg:3894.62ms
step:105/1750 train_loss:4.4007 train_time:369985ms step_avg:3894.58ms
step:106/1750 train_loss:4.5399 train_time:373877ms step_avg:3894.55ms
step:107/1750 train_loss:4.6112 train_time:377769ms step_avg:3894.52ms
step:108/1750 train_loss:4.6995 train_time:381659ms step_avg:3894.47ms
step:109/1750 train_loss:4.7862 train_time:385552ms step_avg:3894.46ms
step:110/1750 train_loss:5.0967 train_time:389444ms step_avg:3894.44ms
step:111/1750 train_loss:4.3489 train_time:393336ms step_avg:3894.41ms
step:112/1750 train_loss:4.7046 train_time:397226ms step_avg:3894.38ms
step:113/1750 train_loss:4.8625 train_time:401119ms step_avg:3894.36ms
step:114/1750 train_loss:4.5787 train_time:405010ms step_avg:3894.32ms
step:115/1750 train_loss:5.5714 train_time:408901ms step_avg:3894.30ms
step:116/1750 train_loss:4.5840 train_time:412793ms step_avg:3894.27ms
step:117/1750 train_loss:4.6161 train_time:416681ms step_avg:3894.22ms
step:118/1750 train_loss:4.3161 train_time:420571ms step_avg:3894.18ms
step:119/1750 train_loss:4.5028 train_time:424462ms step_avg:3894.15ms
step:120/1750 train_loss:4.5046 train_time:428352ms step_avg:3894.11ms
step:121/1750 train_loss:4.5220 train_time:432240ms step_avg:3894.06ms
step:122/1750 train_loss:4.5112 train_time:436129ms step_avg:3894.01ms
step:123/1750 train_loss:4.4959 train_time:440018ms step_avg:3893.96ms
step:124/1750 train_loss:4.4130 train_time:443906ms step_avg:3893.91ms
step:125/1750 train_loss:4.4503 train_time:447794ms step_avg:3893.86ms
step:125/1750 val_loss:4.4595 train_time:447794ms step_avg:3893.86ms
step:126/1750 train_loss:4.5119 train_time:451688ms step_avg:3893.86ms
step:127/1750 train_loss:4.3738 train_time:455575ms step_avg:3893.80ms
step:128/1750 train_loss:4.4086 train_time:459461ms step_avg:3893.73ms
step:129/1750 train_loss:4.4630 train_time:463349ms step_avg:3893.69ms
step:130/1750 train_loss:4.2837 train_time:467236ms step_avg:3893.64ms
step:131/1750 train_loss:4.2326 train_time:471222ms step_avg:3894.40ms
step:132/1750 train_loss:4.6199 train_time:475206ms step_avg:3895.13ms
step:133/1750 train_loss:4.3850 train_time:479189ms step_avg:3895.84ms
step:134/1750 train_loss:4.2735 train_time:483175ms step_avg:3896.57ms
step:135/1750 train_loss:4.2820 train_time:487160ms step_avg:3897.28ms
step:136/1750 train_loss:4.4734 train_time:491146ms step_avg:3897.98ms
step:137/1750 train_loss:4.2571 train_time:495131ms step_avg:3898.67ms
step:138/1750 train_loss:4.4815 train_time:499114ms step_avg:3899.33ms
step:139/1750 train_loss:4.3787 train_time:503101ms step_avg:3900.00ms
step:140/1750 train_loss:4.4227 train_time:507084ms step_avg:3900.64ms
step:141/1750 train_loss:4.3991 train_time:511066ms step_avg:3901.27ms
step:142/1750 train_loss:4.3546 train_time:515052ms step_avg:3901.91ms
step:143/1750 train_loss:4.5474 train_time:519039ms step_avg:3902.55ms
step:144/1750 train_loss:4.2149 train_time:523024ms step_avg:3903.17ms
step:145/1750 train_loss:4.4677 train_time:527007ms step_avg:3903.76ms
step:146/1750 train_loss:4.2476 train_time:530995ms step_avg:3904.37ms
step:147/1750 train_loss:4.3444 train_time:534978ms step_avg:3904.95ms
step:148/1750 train_loss:4.7406 train_time:538964ms step_avg:3905.54ms
step:149/1750 train_loss:4.4862 train_time:542952ms step_avg:3906.13ms
step:150/1750 train_loss:4.1949 train_time:546938ms step_avg:3906.70ms
step:151/1750 train_loss:4.4441 train_time:550924ms step_avg:3907.26ms
step:152/1750 train_loss:4.1321 train_time:554913ms step_avg:3907.84ms
step:153/1750 train_loss:4.3307 train_time:558903ms step_avg:3908.41ms
step:154/1750 train_loss:4.2995 train_time:562890ms step_avg:3908.96ms
step:155/1750 train_loss:4.1960 train_time:566877ms step_avg:3909.49ms
step:156/1750 train_loss:4.3172 train_time:570863ms step_avg:3910.02ms
step:157/1750 train_loss:4.2111 train_time:574850ms step_avg:3910.54ms
step:158/1750 train_loss:4.2953 train_time:578835ms step_avg:3911.05ms
step:159/1750 train_loss:4.1764 train_time:582819ms step_avg:3911.54ms
step:160/1750 train_loss:4.2173 train_time:586806ms step_avg:3912.04ms
step:161/1750 train_loss:4.3154 train_time:590792ms step_avg:3912.53ms
step:162/1750 train_loss:4.1942 train_time:594774ms step_avg:3912.99ms
step:163/1750 train_loss:4.2440 train_time:598755ms step_avg:3913.43ms
step:164/1750 train_loss:4.3209 train_time:602738ms step_avg:3913.88ms
step:165/1750 train_loss:4.6274 train_time:606722ms step_avg:3914.34ms
step:166/1750 train_loss:4.2160 train_time:610705ms step_avg:3914.78ms
step:167/1750 train_loss:4.2404 train_time:614688ms step_avg:3915.21ms
step:168/1750 train_loss:4.2526 train_time:618666ms step_avg:3915.61ms
step:169/1750 train_loss:4.2308 train_time:622647ms step_avg:3916.02ms
step:170/1750 train_loss:4.1024 train_time:626631ms step_avg:3916.44ms
step:171/1750 train_loss:4.3391 train_time:630618ms step_avg:3916.88ms
step:172/1750 train_loss:4.3204 train_time:634602ms step_avg:3917.29ms
step:173/1750 train_loss:4.1553 train_time:638584ms step_avg:3917.69ms
step:174/1750 train_loss:4.2613 train_time:642565ms step_avg:3918.08ms
step:175/1750 train_loss:4.1633 train_time:646545ms step_avg:3918.45ms
step:176/1750 train_loss:4.2691 train_time:650531ms step_avg:3918.86ms
step:177/1750 train_loss:4.1723 train_time:654510ms step_avg:3919.22ms
step:178/1750 train_loss:4.2656 train_time:658490ms step_avg:3919.58ms
step:179/1750 train_loss:4.0780 train_time:662471ms step_avg:3919.94ms
step:180/1750 train_loss:4.1223 train_time:666450ms step_avg:3920.30ms
step:181/1750 train_loss:4.3319 train_time:670434ms step_avg:3920.67ms
step:182/1750 train_loss:4.1957 train_time:674417ms step_avg:3921.03ms
step:183/1750 train_loss:4.1540 train_time:678401ms step_avg:3921.39ms
step:184/1750 train_loss:3.9753 train_time:682384ms step_avg:3921.75ms
step:185/1750 train_loss:4.2734 train_time:686369ms step_avg:3922.11ms
step:186/1750 train_loss:4.2276 train_time:690356ms step_avg:3922.47ms
step:187/1750 train_loss:4.1941 train_time:694342ms step_avg:3922.84ms
step:188/1750 train_loss:4.3555 train_time:698325ms step_avg:3923.17ms
step:189/1750 train_loss:4.2411 train_time:702309ms step_avg:3923.51ms
step:190/1750 train_loss:4.2542 train_time:706291ms step_avg:3923.84ms
step:191/1750 train_loss:4.1004 train_time:710277ms step_avg:3924.18ms
step:192/1750 train_loss:4.3047 train_time:714259ms step_avg:3924.50ms
step:193/1750 train_loss:4.1080 train_time:718247ms step_avg:3924.85ms
step:194/1750 train_loss:4.1732 train_time:722225ms step_avg:3925.13ms
step:195/1750 train_loss:4.1443 train_time:726203ms step_avg:3925.42ms
step:196/1750 train_loss:4.2819 train_time:730179ms step_avg:3925.70ms
step:197/1750 train_loss:4.0339 train_time:734156ms step_avg:3925.97ms
step:198/1750 train_loss:4.2717 train_time:738134ms step_avg:3926.25ms
step:199/1750 train_loss:4.1600 train_time:742109ms step_avg:3926.50ms
step:200/1750 train_loss:4.0068 train_time:746084ms step_avg:3926.76ms
step:201/1750 train_loss:4.1011 train_time:750065ms step_avg:3927.04ms
step:202/1750 train_loss:4.2772 train_time:754043ms step_avg:3927.31ms
step:203/1750 train_loss:4.4687 train_time:758020ms step_avg:3927.56ms
step:204/1750 train_loss:4.0606 train_time:761999ms step_avg:3927.83ms
step:205/1750 train_loss:4.1754 train_time:765976ms step_avg:3928.08ms
step:206/1750 train_loss:3.9538 train_time:769952ms step_avg:3928.33ms
step:207/1750 train_loss:4.1080 train_time:773931ms step_avg:3928.58ms
step:208/1750 train_loss:4.2817 train_time:777910ms step_avg:3928.84ms
step:209/1750 train_loss:4.0813 train_time:781888ms step_avg:3929.08ms
step:210/1750 train_loss:4.2787 train_time:785871ms step_avg:3929.36ms
step:211/1750 train_loss:4.1150 train_time:789847ms step_avg:3929.59ms
step:212/1750 train_loss:4.2054 train_time:793823ms step_avg:3929.82ms
step:213/1750 train_loss:4.1890 train_time:797801ms step_avg:3930.05ms
step:214/1750 train_loss:4.0723 train_time:801778ms step_avg:3930.28ms
step:215/1750 train_loss:4.3999 train_time:805756ms step_avg:3930.52ms
step:216/1750 train_loss:4.0543 train_time:809736ms step_avg:3930.76ms
step:217/1750 train_loss:4.1324 train_time:813714ms step_avg:3930.98ms
step:218/1750 train_loss:3.9840 train_time:817690ms step_avg:3931.20ms
step:219/1750 train_loss:4.1002 train_time:821665ms step_avg:3931.41ms
step:220/1750 train_loss:3.9125 train_time:825642ms step_avg:3931.63ms
step:221/1750 train_loss:3.9795 train_time:829621ms step_avg:3931.85ms
step:222/1750 train_loss:3.9198 train_time:833599ms step_avg:3932.07ms
step:223/1750 train_loss:3.9641 train_time:837581ms step_avg:3932.30ms
step:224/1750 train_loss:3.9237 train_time:841559ms step_avg:3932.52ms
step:225/1750 train_loss:4.0251 train_time:845538ms step_avg:3932.74ms
step:226/1750 train_loss:4.1501 train_time:849515ms step_avg:3932.94ms
step:227/1750 train_loss:4.0975 train_time:853491ms step_avg:3933.14ms
step:228/1750 train_loss:4.2110 train_time:857470ms step_avg:3933.35ms
step:229/1750 train_loss:4.1701 train_time:861446ms step_avg:3933.54ms
step:230/1750 train_loss:3.9404 train_time:865426ms step_avg:3933.75ms
step:231/1750 train_loss:3.9791 train_time:869405ms step_avg:3933.96ms
step:232/1750 train_loss:4.1811 train_time:873379ms step_avg:3934.14ms
step:233/1750 train_loss:4.0523 train_time:877360ms step_avg:3934.35ms
step:234/1750 train_loss:4.2126 train_time:881338ms step_avg:3934.54ms
step:235/1750 train_loss:3.9620 train_time:885320ms step_avg:3934.75ms
step:236/1750 train_loss:3.9099 train_time:889299ms step_avg:3934.95ms
step:237/1750 train_loss:4.1435 train_time:893276ms step_avg:3935.13ms
step:238/1750 train_loss:4.1521 train_time:897256ms step_avg:3935.33ms
step:239/1750 train_loss:4.0015 train_time:901230ms step_avg:3935.50ms
step:240/1750 train_loss:3.8644 train_time:905208ms step_avg:3935.69ms
step:241/1750 train_loss:4.1873 train_time:909190ms step_avg:3935.89ms
step:242/1750 train_loss:4.0268 train_time:913168ms step_avg:3936.07ms
step:243/1750 train_loss:3.9770 train_time:917146ms step_avg:3936.25ms
step:244/1750 train_loss:4.2102 train_time:921126ms step_avg:3936.43ms
step:245/1750 train_loss:4.0339 train_time:925104ms step_avg:3936.61ms
step:246/1750 train_loss:4.0810 train_time:929082ms step_avg:3936.79ms
step:247/1750 train_loss:4.0278 train_time:933062ms step_avg:3936.97ms
step:248/1750 train_loss:4.0468 train_time:937040ms step_avg:3937.14ms
step:249/1750 train_loss:4.1599 train_time:941020ms step_avg:3937.32ms
step:250/1750 train_loss:4.0054 train_time:945002ms step_avg:3937.51ms
step:250/1750 val_loss:4.0413 train_time:945003ms step_avg:3937.51ms
step:251/1750 train_loss:4.0131 train_time:948987ms step_avg:3937.71ms
step:252/1750 train_loss:3.9986 train_time:952968ms step_avg:3937.89ms
step:253/1750 train_loss:3.8192 train_time:956950ms step_avg:3938.07ms
step:254/1750 train_loss:3.9264 train_time:960929ms step_avg:3938.23ms
step:255/1750 train_loss:4.2648 train_time:964909ms step_avg:3938.41ms
step:256/1750 train_loss:4.0192 train_time:968895ms step_avg:3938.60ms
step:257/1750 train_loss:4.0314 train_time:972875ms step_avg:3938.77ms
step:258/1750 train_loss:4.4592 train_time:976860ms step_avg:3938.95ms
step:259/1750 train_loss:3.9903 train_time:980843ms step_avg:3939.13ms
step:260/1750 train_loss:4.0916 train_time:984822ms step_avg:3939.29ms
step:261/1750 train_loss:3.6722 train_time:988888ms step_avg:3939.79ms
step:262/1750 train_loss:4.1279 train_time:992965ms step_avg:3940.34ms
step:263/1750 train_loss:4.0630 train_time:997028ms step_avg:3940.82ms
step:264/1750 train_loss:3.9424 train_time:1001092ms step_avg:3941.31ms
step:265/1750 train_loss:3.8082 train_time:1005161ms step_avg:3941.81ms
step:266/1750 train_loss:4.0844 train_time:1009222ms step_avg:3942.27ms
step:267/1750 train_loss:3.9161 train_time:1013285ms step_avg:3942.74ms
step:268/1750 train_loss:4.0290 train_time:1017355ms step_avg:3943.24ms
step:269/1750 train_loss:3.9258 train_time:1021420ms step_avg:3943.71ms
step:270/1750 train_loss:3.7799 train_time:1025477ms step_avg:3944.14ms
step:271/1750 train_loss:4.0630 train_time:1029535ms step_avg:3944.58ms
step:272/1750 train_loss:3.9813 train_time:1033602ms step_avg:3945.05ms
step:273/1750 train_loss:3.9704 train_time:1037662ms step_avg:3945.48ms
step:274/1750 train_loss:4.0409 train_time:1041727ms step_avg:3945.94ms
step:275/1750 train_loss:4.2380 train_time:1045797ms step_avg:3946.40ms
step:276/1750 train_loss:3.8676 train_time:1049862ms step_avg:3946.85ms
step:277/1750 train_loss:3.9427 train_time:1053932ms step_avg:3947.31ms
step:278/1750 train_loss:3.9109 train_time:1057991ms step_avg:3947.73ms
step:279/1750 train_loss:3.8155 train_time:1062054ms step_avg:3948.16ms
step:280/1750 train_loss:3.9252 train_time:1066121ms step_avg:3948.60ms
step:281/1750 train_loss:3.8149 train_time:1070183ms step_avg:3949.01ms
step:282/1750 train_loss:3.9998 train_time:1074257ms step_avg:3949.47ms
step:283/1750 train_loss:3.8466 train_time:1078322ms step_avg:3949.90ms
step:284/1750 train_loss:3.9967 train_time:1082382ms step_avg:3950.30ms
step:285/1750 train_loss:4.1541 train_time:1086441ms step_avg:3950.69ms
step:286/1750 train_loss:3.9642 train_time:1090512ms step_avg:3951.13ms
step:287/1750 train_loss:4.0665 train_time:1094570ms step_avg:3951.52ms
step:288/1750 train_loss:3.8688 train_time:1098628ms step_avg:3951.90ms
step:289/1750 train_loss:3.9533 train_time:1102698ms step_avg:3952.32ms
step:290/1750 train_loss:4.0459 train_time:1106762ms step_avg:3952.72ms
step:291/1750 train_loss:4.0309 train_time:1110823ms step_avg:3953.11ms
step:292/1750 train_loss:4.0480 train_time:1114883ms step_avg:3953.49ms
step:293/1750 train_loss:4.0411 train_time:1118939ms step_avg:3953.85ms
step:294/1750 train_loss:3.9803 train_time:1123000ms step_avg:3954.22ms
step:295/1750 train_loss:3.8292 train_time:1127061ms step_avg:3954.60ms
step:296/1750 train_loss:3.7409 train_time:1131135ms step_avg:3955.02ms
step:297/1750 train_loss:4.0000 train_time:1135204ms step_avg:3955.42ms
step:298/1750 train_loss:4.2449 train_time:1139266ms step_avg:3955.78ms
step:299/1750 train_loss:3.8651 train_time:1143335ms step_avg:3956.18ms
step:300/1750 train_loss:3.9862 train_time:1147401ms step_avg:3956.55ms
step:301/1750 train_loss:3.8961 train_time:1151461ms step_avg:3956.91ms
step:302/1750 train_loss:3.9898 train_time:1155521ms step_avg:3957.26ms
step:303/1750 train_loss:4.0578 train_time:1159577ms step_avg:3957.60ms
step:304/1750 train_loss:3.9084 train_time:1163635ms step_avg:3957.94ms
step:305/1750 train_loss:3.8397 train_time:1167693ms step_avg:3958.28ms
step:306/1750 train_loss:3.8988 train_time:1171752ms step_avg:3958.62ms
step:307/1750 train_loss:4.2073 train_time:1175813ms step_avg:3958.97ms
step:308/1750 train_loss:3.9082 train_time:1179875ms step_avg:3959.31ms
step:309/1750 train_loss:3.7460 train_time:1183939ms step_avg:3959.66ms
step:310/1750 train_loss:4.1074 train_time:1187996ms step_avg:3959.99ms
step:311/1750 train_loss:3.9712 train_time:1192064ms step_avg:3960.34ms
step:312/1750 train_loss:4.1178 train_time:1196123ms step_avg:3960.67ms
step:313/1750 train_loss:3.9788 train_time:1200187ms step_avg:3961.01ms
step:314/1750 train_loss:3.8735 train_time:1204246ms step_avg:3961.33ms
step:315/1750 train_loss:3.8405 train_time:1208305ms step_avg:3961.66ms
step:316/1750 train_loss:3.9735 train_time:1212364ms step_avg:3961.97ms
step:317/1750 train_loss:3.9746 train_time:1216424ms step_avg:3962.29ms
step:318/1750 train_loss:3.8373 train_time:1220491ms step_avg:3962.63ms
step:319/1750 train_loss:3.8479 train_time:1224549ms step_avg:3962.94ms
step:320/1750 train_loss:3.9406 train_time:1228611ms step_avg:3963.26ms
step:321/1750 train_loss:3.7471 train_time:1232666ms step_avg:3963.56ms
step:322/1750 train_loss:3.9257 train_time:1236724ms step_avg:3963.86ms
step:323/1750 train_loss:4.1676 train_time:1240781ms step_avg:3964.16ms
step:324/1750 train_loss:3.8284 train_time:1244840ms step_avg:3964.46ms
step:325/1750 train_loss:4.2600 train_time:1248902ms step_avg:3964.77ms
step:326/1750 train_loss:4.0591 train_time:1252951ms step_avg:3965.03ms
step:327/1750 train_loss:3.9483 train_time:1257009ms step_avg:3965.33ms
step:328/1750 train_loss:4.0175 train_time:1261077ms step_avg:3965.65ms
step:329/1750 train_loss:4.0041 train_time:1265150ms step_avg:3965.99ms
step:330/1750 train_loss:4.1031 train_time:1269208ms step_avg:3966.28ms
step:331/1750 train_loss:3.8939 train_time:1273266ms step_avg:3966.56ms
step:332/1750 train_loss:3.6446 train_time:1277319ms step_avg:3966.83ms
step:333/1750 train_loss:3.8092 train_time:1281370ms step_avg:3967.09ms
step:334/1750 train_loss:3.8732 train_time:1285423ms step_avg:3967.35ms
step:335/1750 train_loss:3.9058 train_time:1289477ms step_avg:3967.62ms
step:336/1750 train_loss:3.9214 train_time:1293528ms step_avg:3967.88ms
step:337/1750 train_loss:5.5569 train_time:1297589ms step_avg:3968.16ms
step:338/1750 train_loss:3.7948 train_time:1301644ms step_avg:3968.43ms
step:339/1750 train_loss:3.8032 train_time:1305704ms step_avg:3968.71ms
step:340/1750 train_loss:3.9356 train_time:1309761ms step_avg:3968.97ms
step:341/1750 train_loss:3.9421 train_time:1313820ms step_avg:3969.24ms
step:342/1750 train_loss:3.9404 train_time:1317872ms step_avg:3969.49ms
step:343/1750 train_loss:3.8204 train_time:1321934ms step_avg:3969.77ms
step:344/1750 train_loss:3.8323 train_time:1325992ms step_avg:3970.04ms
step:345/1750 train_loss:3.9982 train_time:1330052ms step_avg:3970.30ms
step:346/1750 train_loss:3.7260 train_time:1334109ms step_avg:3970.56ms
step:347/1750 train_loss:3.9970 train_time:1338168ms step_avg:3970.83ms
step:348/1750 train_loss:4.0635 train_time:1342230ms step_avg:3971.09ms
step:349/1750 train_loss:3.5886 train_time:1346287ms step_avg:3971.35ms
step:350/1750 train_loss:3.8084 train_time:1350348ms step_avg:3971.61ms
step:351/1750 train_loss:3.8013 train_time:1354406ms step_avg:3971.87ms
step:352/1750 train_loss:3.6904 train_time:1358460ms step_avg:3972.10ms
step:353/1750 train_loss:4.0473 train_time:1362520ms step_avg:3972.36ms
step:354/1750 train_loss:3.9186 train_time:1366575ms step_avg:3972.60ms
step:355/1750 train_loss:3.9633 train_time:1370639ms step_avg:3972.87ms
step:356/1750 train_loss:3.7543 train_time:1374692ms step_avg:3973.10ms
step:357/1750 train_loss:3.8679 train_time:1378749ms step_avg:3973.34ms
step:358/1750 train_loss:3.6855 train_time:1382807ms step_avg:3973.58ms
step:359/1750 train_loss:3.9036 train_time:1386869ms step_avg:3973.84ms
step:360/1750 train_loss:3.8064 train_time:1390933ms step_avg:3974.10ms
step:361/1750 train_loss:3.8576 train_time:1394991ms step_avg:3974.33ms
step:362/1750 train_loss:3.8552 train_time:1399050ms step_avg:3974.57ms
step:363/1750 train_loss:4.0061 train_time:1403104ms step_avg:3974.80ms
step:364/1750 train_loss:3.3965 train_time:1407163ms step_avg:3975.04ms
step:365/1750 train_loss:4.1019 train_time:1411214ms step_avg:3975.25ms
step:366/1750 train_loss:3.7389 train_time:1415277ms step_avg:3975.50ms
step:367/1750 train_loss:4.0259 train_time:1419329ms step_avg:3975.71ms
step:368/1750 train_loss:3.8532 train_time:1423389ms step_avg:3975.95ms
step:369/1750 train_loss:3.7676 train_time:1427444ms step_avg:3976.17ms
step:370/1750 train_loss:3.8122 train_time:1431499ms step_avg:3976.38ms
step:371/1750 train_loss:3.9400 train_time:1435550ms step_avg:3976.59ms
step:372/1750 train_loss:3.9746 train_time:1439606ms step_avg:3976.81ms
step:373/1750 train_loss:3.9207 train_time:1443656ms step_avg:3977.01ms
step:374/1750 train_loss:4.0256 train_time:1447710ms step_avg:3977.22ms
step:375/1750 train_loss:3.9832 train_time:1451762ms step_avg:3977.43ms
step:375/1750 val_loss:3.8525 train_time:1451763ms step_avg:3977.43ms
step:376/1750 train_loss:3.8233 train_time:1455824ms step_avg:3977.66ms
step:377/1750 train_loss:3.7451 train_time:1459883ms step_avg:3977.88ms
step:378/1750 train_loss:4.1916 train_time:1463943ms step_avg:3978.11ms
step:379/1750 train_loss:3.8409 train_time:1467996ms step_avg:3978.31ms
step:380/1750 train_loss:3.8411 train_time:1472054ms step_avg:3978.53ms
step:381/1750 train_loss:3.8376 train_time:1476110ms step_avg:3978.73ms
step:382/1750 train_loss:3.7991 train_time:1480165ms step_avg:3978.94ms
step:383/1750 train_loss:3.8458 train_time:1484237ms step_avg:3979.19ms
step:384/1750 train_loss:4.0041 train_time:1488288ms step_avg:3979.38ms
step:385/1750 train_loss:3.7348 train_time:1492347ms step_avg:3979.59ms
step:386/1750 train_loss:3.8015 train_time:1496402ms step_avg:3979.79ms
step:387/1750 train_loss:3.8538 train_time:1500456ms step_avg:3979.99ms
step:388/1750 train_loss:4.1197 train_time:1504520ms step_avg:3980.21ms
step:389/1750 train_loss:3.7349 train_time:1508577ms step_avg:3980.41ms
step:390/1750 train_loss:3.8717 train_time:1512695ms step_avg:3980.78ms
step:391/1750 train_loss:3.7879 train_time:1516834ms step_avg:3981.19ms
step:392/1750 train_loss:4.1000 train_time:1520959ms step_avg:3981.57ms
step:393/1750 train_loss:3.9773 train_time:1525084ms step_avg:3981.94ms
step:394/1750 train_loss:3.7149 train_time:1529211ms step_avg:3982.32ms
step:395/1750 train_loss:3.8707 train_time:1533331ms step_avg:3982.68ms
step:396/1750 train_loss:3.6602 train_time:1537453ms step_avg:3983.04ms
step:397/1750 train_loss:3.5506 train_time:1541586ms step_avg:3983.43ms
step:398/1750 train_loss:3.8352 train_time:1545716ms step_avg:3983.80ms
step:399/1750 train_loss:3.8193 train_time:1549843ms step_avg:3984.17ms
step:400/1750 train_loss:3.7057 train_time:1553960ms step_avg:3984.51ms
step:401/1750 train_loss:3.9049 train_time:1558084ms step_avg:3984.87ms
step:402/1750 train_loss:3.7253 train_time:1562210ms step_avg:3985.23ms
step:403/1750 train_loss:4.0402 train_time:1566332ms step_avg:3985.58ms
step:404/1750 train_loss:3.9318 train_time:1570464ms step_avg:3985.95ms
step:405/1750 train_loss:3.8186 train_time:1574590ms step_avg:3986.30ms
step:406/1750 train_loss:3.8965 train_time:1578707ms step_avg:3986.64ms
step:407/1750 train_loss:3.9263 train_time:1582820ms step_avg:3986.95ms
step:408/1750 train_loss:3.8100 train_time:1586947ms step_avg:3987.30ms
step:409/1750 train_loss:3.9145 train_time:1591067ms step_avg:3987.64ms
step:410/1750 train_loss:3.8896 train_time:1595182ms step_avg:3987.96ms
step:411/1750 train_loss:3.7629 train_time:1599300ms step_avg:3988.28ms
step:412/1750 train_loss:3.7735 train_time:1603415ms step_avg:3988.59ms
step:413/1750 train_loss:3.8163 train_time:1607531ms step_avg:3988.91ms
step:414/1750 train_loss:4.0424 train_time:1611657ms step_avg:3989.25ms
step:415/1750 train_loss:3.8413 train_time:1615782ms step_avg:3989.59ms
step:416/1750 train_loss:3.7843 train_time:1619902ms step_avg:3989.91ms
step:417/1750 train_loss:3.8439 train_time:1624026ms step_avg:3990.23ms
step:418/1750 train_loss:3.6551 train_time:1628142ms step_avg:3990.54ms
step:419/1750 train_loss:4.0733 train_time:1632266ms step_avg:3990.87ms
step:420/1750 train_loss:4.0161 train_time:1636386ms step_avg:3991.19ms
step:421/1750 train_loss:3.7925 train_time:1640505ms step_avg:3991.50ms
step:422/1750 train_loss:3.8895 train_time:1644620ms step_avg:3991.80ms
step:423/1750 train_loss:3.5602 train_time:1648735ms step_avg:3992.09ms
step:424/1750 train_loss:3.8682 train_time:1652865ms step_avg:3992.43ms
step:425/1750 train_loss:3.7600 train_time:1656987ms step_avg:3992.74ms
step:426/1750 train_loss:3.8551 train_time:1661118ms step_avg:3993.07ms
step:427/1750 train_loss:3.8638 train_time:1665238ms step_avg:3993.38ms
step:428/1750 train_loss:3.6856 train_time:1669357ms step_avg:3993.68ms
step:429/1750 train_loss:3.8821 train_time:1673476ms step_avg:3993.98ms
step:430/1750 train_loss:3.7711 train_time:1677598ms step_avg:3994.28ms
step:431/1750 train_loss:3.7224 train_time:1681731ms step_avg:3994.61ms
step:432/1750 train_loss:3.7419 train_time:1685866ms step_avg:3994.94ms
step:433/1750 train_loss:3.8725 train_time:1689981ms step_avg:3995.23ms
step:434/1750 train_loss:3.9317 train_time:1694102ms step_avg:3995.52ms
step:435/1750 train_loss:3.9553 train_time:1698226ms step_avg:3995.82ms
step:436/1750 train_loss:3.8119 train_time:1702348ms step_avg:3996.12ms
step:437/1750 train_loss:3.8316 train_time:1706464ms step_avg:3996.40ms
step:438/1750 train_loss:3.8578 train_time:1710578ms step_avg:3996.68ms
step:439/1750 train_loss:3.9433 train_time:1714702ms step_avg:3996.97ms
step:440/1750 train_loss:3.7045 train_time:1718825ms step_avg:3997.27ms
step:441/1750 train_loss:3.9376 train_time:1722950ms step_avg:3997.56ms
step:442/1750 train_loss:3.6278 train_time:1727075ms step_avg:3997.86ms
step:443/1750 train_loss:3.6902 train_time:1731198ms step_avg:3998.15ms
step:444/1750 train_loss:3.8810 train_time:1735322ms step_avg:3998.44ms
step:445/1750 train_loss:4.0050 train_time:1739443ms step_avg:3998.72ms
step:446/1750 train_loss:3.6252 train_time:1743556ms step_avg:3998.98ms
step:447/1750 train_loss:3.7759 train_time:1747671ms step_avg:3999.25ms
step:448/1750 train_loss:3.7903 train_time:1751794ms step_avg:3999.53ms
step:449/1750 train_loss:3.6624 train_time:1755921ms step_avg:3999.82ms
step:450/1750 train_loss:3.5440 train_time:1760053ms step_avg:4000.12ms
step:451/1750 train_loss:3.7444 train_time:1764191ms step_avg:4000.43ms
step:452/1750 train_loss:4.3208 train_time:1768317ms step_avg:4000.72ms
step:453/1750 train_loss:3.7575 train_time:1772439ms step_avg:4000.99ms
step:454/1750 train_loss:4.0562 train_time:1776560ms step_avg:4001.26ms
step:455/1750 train_loss:3.6896 train_time:1780692ms step_avg:4001.56ms
step:456/1750 train_loss:3.7997 train_time:1784812ms step_avg:4001.82ms
step:457/1750 train_loss:3.7914 train_time:1788926ms step_avg:4002.07ms
step:458/1750 train_loss:3.7391 train_time:1793041ms step_avg:4002.32ms
step:459/1750 train_loss:3.8114 train_time:1797153ms step_avg:4002.57ms
step:460/1750 train_loss:3.8453 train_time:1801278ms step_avg:4002.84ms
step:461/1750 train_loss:3.8533 train_time:1805396ms step_avg:4003.09ms
step:462/1750 train_loss:3.7039 train_time:1809526ms step_avg:4003.38ms
step:463/1750 train_loss:3.5892 train_time:1813640ms step_avg:4003.62ms
step:464/1750 train_loss:3.8055 train_time:1817760ms step_avg:4003.88ms
step:465/1750 train_loss:3.7287 train_time:1821865ms step_avg:4004.10ms
step:466/1750 train_loss:3.7948 train_time:1825977ms step_avg:4004.33ms
step:467/1750 train_loss:3.8364 train_time:1830099ms step_avg:4004.59ms
step:468/1750 train_loss:3.8302 train_time:1834228ms step_avg:4004.87ms
step:469/1750 train_loss:3.8178 train_time:1838335ms step_avg:4005.09ms
step:470/1750 train_loss:3.7246 train_time:1842449ms step_avg:4005.32ms
step:471/1750 train_loss:3.6823 train_time:1846560ms step_avg:4005.55ms
step:472/1750 train_loss:3.7927 train_time:1850698ms step_avg:4005.84ms
step:473/1750 train_loss:3.7429 train_time:1854803ms step_avg:4006.05ms
step:474/1750 train_loss:3.7815 train_time:1858914ms step_avg:4006.28ms
step:475/1750 train_loss:3.8759 train_time:1863023ms step_avg:4006.50ms
step:476/1750 train_loss:4.3238 train_time:1867144ms step_avg:4006.75ms
step:477/1750 train_loss:3.7057 train_time:1871255ms step_avg:4006.97ms
step:478/1750 train_loss:3.7867 train_time:1875376ms step_avg:4007.21ms
step:479/1750 train_loss:3.5089 train_time:1879492ms step_avg:4007.44ms
step:480/1750 train_loss:3.7389 train_time:1883613ms step_avg:4007.69ms
step:481/1750 train_loss:3.7987 train_time:1887737ms step_avg:4007.93ms
step:482/1750 train_loss:3.8379 train_time:1891864ms step_avg:4008.19ms
step:483/1750 train_loss:3.7665 train_time:1895982ms step_avg:4008.42ms
step:484/1750 train_loss:3.8438 train_time:1900102ms step_avg:4008.65ms
step:485/1750 train_loss:3.8108 train_time:1904227ms step_avg:4008.90ms
step:486/1750 train_loss:3.6795 train_time:1908346ms step_avg:4009.13ms
step:487/1750 train_loss:3.8044 train_time:1912466ms step_avg:4009.36ms
step:488/1750 train_loss:3.7019 train_time:1916584ms step_avg:4009.59ms
step:489/1750 train_loss:3.7385 train_time:1920692ms step_avg:4009.80ms
step:490/1750 train_loss:3.8944 train_time:1924806ms step_avg:4010.01ms
step:491/1750 train_loss:3.7971 train_time:1928919ms step_avg:4010.23ms
step:492/1750 train_loss:3.6923 train_time:1933039ms step_avg:4010.46ms
step:493/1750 train_loss:3.8270 train_time:1937147ms step_avg:4010.66ms
step:494/1750 train_loss:3.5902 train_time:1941260ms step_avg:4010.87ms
step:495/1750 train_loss:3.6096 train_time:1945381ms step_avg:4011.10ms
step:496/1750 train_loss:3.8052 train_time:1949513ms step_avg:4011.34ms
step:497/1750 train_loss:3.7424 train_time:1953636ms step_avg:4011.57ms
step:498/1750 train_loss:3.7565 train_time:1957751ms step_avg:4011.78ms
step:499/1750 train_loss:3.6966 train_time:1961870ms step_avg:4012.00ms
step:500/1750 train_loss:4.0554 train_time:1965982ms step_avg:4012.21ms
step:500/1750 val_loss:3.7315 train_time:1965982ms step_avg:4012.21ms
step:501/1750 train_loss:3.7578 train_time:1970111ms step_avg:4012.45ms
step:502/1750 train_loss:3.6334 train_time:1974229ms step_avg:4012.66ms
step:503/1750 train_loss:3.6755 train_time:1978360ms step_avg:4012.90ms
step:504/1750 train_loss:3.6635 train_time:1982483ms step_avg:4013.12ms
step:505/1750 train_loss:4.0744 train_time:1986596ms step_avg:4013.32ms
step:506/1750 train_loss:3.7149 train_time:1990721ms step_avg:4013.55ms
step:507/1750 train_loss:3.6322 train_time:1994839ms step_avg:4013.76ms
step:508/1750 train_loss:3.9282 train_time:1998966ms step_avg:4013.99ms
step:509/1750 train_loss:3.7674 train_time:2003084ms step_avg:4014.20ms
step:510/1750 train_loss:3.6575 train_time:2007192ms step_avg:4014.38ms
step:511/1750 train_loss:3.8620 train_time:2011318ms step_avg:4014.61ms
step:512/1750 train_loss:3.4927 train_time:2015444ms step_avg:4014.83ms
step:513/1750 train_loss:3.7404 train_time:2019560ms step_avg:4015.03ms
step:514/1750 train_loss:3.9519 train_time:2023671ms step_avg:4015.22ms
step:515/1750 train_loss:3.5946 train_time:2027810ms step_avg:4015.47ms
step:516/1750 train_loss:3.7855 train_time:2031944ms step_avg:4015.70ms
step:517/1750 train_loss:3.5649 train_time:2036079ms step_avg:4015.94ms
step:518/1750 train_loss:3.9279 train_time:2040198ms step_avg:4016.14ms
step:519/1750 train_loss:3.4953 train_time:2044310ms step_avg:4016.33ms
step:520/1750 train_loss:3.5988 train_time:2048502ms step_avg:4016.67ms
step:521/1750 train_loss:3.6437 train_time:2052676ms step_avg:4016.98ms
step:522/1750 train_loss:3.5609 train_time:2056853ms step_avg:4017.29ms
step:523/1750 train_loss:3.7145 train_time:2061036ms step_avg:4017.61ms
step:524/1750 train_loss:3.7085 train_time:2065225ms step_avg:4017.95ms
step:525/1750 train_loss:3.7483 train_time:2069406ms step_avg:4018.26ms
step:526/1750 train_loss:3.5848 train_time:2073595ms step_avg:4018.59ms
step:527/1750 train_loss:4.1448 train_time:2077767ms step_avg:4018.89ms
step:528/1750 train_loss:3.7568 train_time:2081948ms step_avg:4019.20ms
step:529/1750 train_loss:3.5855 train_time:2086118ms step_avg:4019.50ms
step:530/1750 train_loss:3.6799 train_time:2090279ms step_avg:4019.77ms
step:531/1750 train_loss:3.8700 train_time:2094479ms step_avg:4020.11ms
step:532/1750 train_loss:3.7642 train_time:2098658ms step_avg:4020.42ms
step:533/1750 train_loss:3.7545 train_time:2102824ms step_avg:4020.70ms
step:534/1750 train_loss:3.9090 train_time:2107016ms step_avg:4021.02ms
step:535/1750 train_loss:3.8393 train_time:2111192ms step_avg:4021.32ms
step:536/1750 train_loss:3.6030 train_time:2115388ms step_avg:4021.65ms
step:537/1750 train_loss:3.5967 train_time:2119567ms step_avg:4021.95ms
step:538/1750 train_loss:3.6766 train_time:2123751ms step_avg:4022.26ms
step:539/1750 train_loss:3.5428 train_time:2127934ms step_avg:4022.56ms
step:540/1750 train_loss:3.6796 train_time:2132116ms step_avg:4022.86ms
step:541/1750 train_loss:3.6111 train_time:2136295ms step_avg:4023.16ms
step:542/1750 train_loss:3.5639 train_time:2140482ms step_avg:4023.46ms
step:543/1750 train_loss:3.7433 train_time:2144662ms step_avg:4023.76ms
step:544/1750 train_loss:3.7250 train_time:2148833ms step_avg:4024.03ms
step:545/1750 train_loss:3.7564 train_time:2153009ms step_avg:4024.32ms
step:546/1750 train_loss:3.7042 train_time:2157199ms step_avg:4024.62ms
step:547/1750 train_loss:3.5565 train_time:2161376ms step_avg:4024.91ms
step:548/1750 train_loss:3.7122 train_time:2165562ms step_avg:4025.21ms
step:549/1750 train_loss:3.3544 train_time:2169767ms step_avg:4025.54ms
step:550/1750 train_loss:3.7267 train_time:2173928ms step_avg:4025.79ms
step:551/1750 train_loss:3.6996 train_time:2178119ms step_avg:4026.10ms
step:552/1750 train_loss:3.7554 train_time:2182280ms step_avg:4026.35ms
step:553/1750 train_loss:3.7521 train_time:2186461ms step_avg:4026.63ms
step:554/1750 train_loss:3.7640 train_time:2190652ms step_avg:4026.93ms
step:555/1750 train_loss:3.7029 train_time:2194830ms step_avg:4027.21ms
step:556/1750 train_loss:3.6317 train_time:2199010ms step_avg:4027.49ms
step:557/1750 train_loss:3.7640 train_time:2203188ms step_avg:4027.77ms
step:558/1750 train_loss:3.6373 train_time:2207362ms step_avg:4028.03ms
step:559/1750 train_loss:3.8373 train_time:2211534ms step_avg:4028.30ms
step:560/1750 train_loss:3.6194 train_time:2215708ms step_avg:4028.56ms
step:561/1750 train_loss:3.7406 train_time:2219868ms step_avg:4028.80ms
step:562/1750 train_loss:3.7758 train_time:2224038ms step_avg:4029.05ms
step:563/1750 train_loss:3.5711 train_time:2228208ms step_avg:4029.31ms
step:564/1750 train_loss:3.6808 train_time:2232379ms step_avg:4029.56ms
step:565/1750 train_loss:3.6890 train_time:2236554ms step_avg:4029.83ms
step:566/1750 train_loss:3.6096 train_time:2240725ms step_avg:4030.08ms
step:567/1750 train_loss:3.5089 train_time:2244924ms step_avg:4030.38ms
step:568/1750 train_loss:3.6889 train_time:2249100ms step_avg:4030.64ms
step:569/1750 train_loss:3.6276 train_time:2253269ms step_avg:4030.89ms
step:570/1750 train_loss:3.7252 train_time:2257433ms step_avg:4031.13ms
step:571/1750 train_loss:3.6951 train_time:2261619ms step_avg:4031.41ms
step:572/1750 train_loss:3.6679 train_time:2265808ms step_avg:4031.69ms
step:573/1750 train_loss:3.4184 train_time:2269977ms step_avg:4031.93ms
step:574/1750 train_loss:3.6512 train_time:2274170ms step_avg:4032.22ms
step:575/1750 train_loss:3.7414 train_time:2278378ms step_avg:4032.53ms
step:576/1750 train_loss:3.6030 train_time:2282562ms step_avg:4032.80ms
step:577/1750 train_loss:3.7056 train_time:2286734ms step_avg:4033.04ms
step:578/1750 train_loss:3.7706 train_time:2290905ms step_avg:4033.28ms
step:579/1750 train_loss:3.7858 train_time:2295073ms step_avg:4033.52ms
step:580/1750 train_loss:4.1214 train_time:2299241ms step_avg:4033.76ms
step:581/1750 train_loss:3.6952 train_time:2303422ms step_avg:4034.01ms
step:582/1750 train_loss:3.7340 train_time:2307608ms step_avg:4034.28ms
step:583/1750 train_loss:3.6309 train_time:2311787ms step_avg:4034.53ms
step:584/1750 train_loss:3.7456 train_time:2315959ms step_avg:4034.77ms
step:585/1750 train_loss:3.6551 train_time:2320123ms step_avg:4035.00ms
step:586/1750 train_loss:3.4558 train_time:2324301ms step_avg:4035.24ms
step:587/1750 train_loss:3.6268 train_time:2328479ms step_avg:4035.49ms
step:588/1750 train_loss:3.8592 train_time:2332652ms step_avg:4035.73ms
step:589/1750 train_loss:3.6544 train_time:2336833ms step_avg:4035.98ms
step:590/1750 train_loss:3.5365 train_time:2341003ms step_avg:4036.21ms
step:591/1750 train_loss:3.6218 train_time:2345188ms step_avg:4036.47ms
step:592/1750 train_loss:3.7240 train_time:2349353ms step_avg:4036.69ms
step:593/1750 train_loss:3.5073 train_time:2353538ms step_avg:4036.94ms
step:594/1750 train_loss:3.7516 train_time:2357719ms step_avg:4037.19ms
step:595/1750 train_loss:3.4399 train_time:2361896ms step_avg:4037.43ms
step:596/1750 train_loss:3.8877 train_time:2366081ms step_avg:4037.68ms
step:597/1750 train_loss:3.7728 train_time:2370262ms step_avg:4037.93ms
step:598/1750 train_loss:3.5858 train_time:2374455ms step_avg:4038.19ms
step:599/1750 train_loss:3.6485 train_time:2378619ms step_avg:4038.40ms
step:600/1750 train_loss:3.6155 train_time:2382780ms step_avg:4038.61ms
step:601/1750 train_loss:3.7059 train_time:2386961ms step_avg:4038.85ms
step:602/1750 train_loss:3.6531 train_time:2391138ms step_avg:4039.08ms
step:603/1750 train_loss:3.7169 train_time:2395291ms step_avg:4039.28ms
step:604/1750 train_loss:4.1303 train_time:2399474ms step_avg:4039.52ms
step:605/1750 train_loss:3.5027 train_time:2403642ms step_avg:4039.73ms
step:606/1750 train_loss:3.6309 train_time:2407831ms step_avg:4039.99ms
step:607/1750 train_loss:3.5783 train_time:2412004ms step_avg:4040.21ms
step:608/1750 train_loss:3.8205 train_time:2416190ms step_avg:4040.45ms
step:609/1750 train_loss:3.6462 train_time:2420356ms step_avg:4040.66ms
step:610/1750 train_loss:3.7502 train_time:2424527ms step_avg:4040.88ms
step:611/1750 train_loss:3.5449 train_time:2428684ms step_avg:4041.07ms
step:612/1750 train_loss:3.8085 train_time:2432851ms step_avg:4041.28ms
step:613/1750 train_loss:3.6414 train_time:2437019ms step_avg:4041.49ms
step:614/1750 train_loss:3.7468 train_time:2441214ms step_avg:4041.75ms
step:615/1750 train_loss:3.6111 train_time:2445390ms step_avg:4041.97ms
step:616/1750 train_loss:3.8466 train_time:2449554ms step_avg:4042.17ms
step:617/1750 train_loss:3.5800 train_time:2453717ms step_avg:4042.37ms
step:618/1750 train_loss:3.5941 train_time:2457882ms step_avg:4042.57ms
step:619/1750 train_loss:3.7216 train_time:2462052ms step_avg:4042.78ms
step:620/1750 train_loss:3.7411 train_time:2466225ms step_avg:4042.99ms
step:621/1750 train_loss:3.5835 train_time:2470412ms step_avg:4043.23ms
step:622/1750 train_loss:3.6324 train_time:2474588ms step_avg:4043.44ms
step:623/1750 train_loss:3.6183 train_time:2478765ms step_avg:4043.66ms
step:624/1750 train_loss:3.6846 train_time:2482931ms step_avg:4043.86ms
step:625/1750 train_loss:3.5845 train_time:2487122ms step_avg:4044.10ms
step:625/1750 val_loss:3.6501 train_time:2487126ms step_avg:4044.11ms
step:626/1750 train_loss:3.6055 train_time:2491290ms step_avg:4044.30ms
step:627/1750 train_loss:3.6300 train_time:2495487ms step_avg:4044.55ms
step:628/1750 train_loss:3.6561 train_time:2499650ms step_avg:4044.74ms
step:629/1750 train_loss:3.9083 train_time:2503823ms step_avg:4044.95ms
step:630/1750 train_loss:3.6456 train_time:2507987ms step_avg:4045.14ms
step:631/1750 train_loss:3.6016 train_time:2512146ms step_avg:4045.32ms
step:632/1750 train_loss:3.5145 train_time:2516311ms step_avg:4045.52ms
step:633/1750 train_loss:3.6754 train_time:2520464ms step_avg:4045.69ms
step:634/1750 train_loss:3.8027 train_time:2524618ms step_avg:4045.86ms
step:635/1750 train_loss:3.8763 train_time:2528791ms step_avg:4046.07ms
step:636/1750 train_loss:3.7310 train_time:2532950ms step_avg:4046.25ms
step:637/1750 train_loss:3.5808 train_time:2537120ms step_avg:4046.44ms
step:638/1750 train_loss:3.4564 train_time:2541282ms step_avg:4046.63ms
step:639/1750 train_loss:3.6214 train_time:2545452ms step_avg:4046.82ms
step:640/1750 train_loss:3.7973 train_time:2549611ms step_avg:4047.00ms
step:641/1750 train_loss:3.6605 train_time:2553772ms step_avg:4047.18ms
step:642/1750 train_loss:3.7915 train_time:2557946ms step_avg:4047.38ms
step:643/1750 train_loss:4.0291 train_time:2562094ms step_avg:4047.54ms
step:644/1750 train_loss:3.6552 train_time:2566266ms step_avg:4047.74ms
step:645/1750 train_loss:3.8955 train_time:2570432ms step_avg:4047.92ms
step:646/1750 train_loss:3.7222 train_time:2574604ms step_avg:4048.12ms
step:647/1750 train_loss:3.6540 train_time:2578757ms step_avg:4048.28ms
step:648/1750 train_loss:3.7389 train_time:2582912ms step_avg:4048.45ms
step:649/1750 train_loss:3.7013 train_time:2587098ms step_avg:4048.67ms
step:650/1750 train_loss:3.7570 train_time:2591329ms step_avg:4048.95ms
step:651/1750 train_loss:3.6466 train_time:2595569ms step_avg:4049.25ms
step:652/1750 train_loss:3.7586 train_time:2599792ms step_avg:4049.52ms
step:653/1750 train_loss:3.6987 train_time:2604014ms step_avg:4049.79ms
step:654/1750 train_loss:3.7346 train_time:2608233ms step_avg:4050.05ms
step:655/1750 train_loss:3.4698 train_time:2612458ms step_avg:4050.32ms
step:656/1750 train_loss:3.9292 train_time:2616690ms step_avg:4050.60ms
step:657/1750 train_loss:3.6330 train_time:2620903ms step_avg:4050.85ms
step:658/1750 train_loss:3.6549 train_time:2625109ms step_avg:4051.09ms
step:659/1750 train_loss:3.5550 train_time:2629335ms step_avg:4051.36ms
step:660/1750 train_loss:3.6164 train_time:2633550ms step_avg:4051.61ms
step:661/1750 train_loss:3.7080 train_time:2637760ms step_avg:4051.86ms
step:662/1750 train_loss:3.5801 train_time:2641990ms step_avg:4052.13ms
step:663/1750 train_loss:3.5183 train_time:2646206ms step_avg:4052.38ms
step:664/1750 train_loss:3.6941 train_time:2650394ms step_avg:4052.59ms
step:665/1750 train_loss:3.5782 train_time:2654625ms step_avg:4052.86ms
step:666/1750 train_loss:3.4125 train_time:2658841ms step_avg:4053.11ms
step:667/1750 train_loss:3.5081 train_time:2663073ms step_avg:4053.38ms
step:668/1750 train_loss:3.6635 train_time:2667293ms step_avg:4053.64ms
step:669/1750 train_loss:3.5886 train_time:2671513ms step_avg:4053.89ms
step:670/1750 train_loss:3.5165 train_time:2675741ms step_avg:4054.15ms
step:671/1750 train_loss:3.7858 train_time:2679970ms step_avg:4054.42ms
step:672/1750 train_loss:3.7166 train_time:2684195ms step_avg:4054.68ms
step:673/1750 train_loss:3.9677 train_time:2688424ms step_avg:4054.94ms
step:674/1750 train_loss:3.5715 train_time:2692634ms step_avg:4055.17ms
step:675/1750 train_loss:3.6027 train_time:2696870ms step_avg:4055.44ms
step:676/1750 train_loss:3.5583 train_time:2701097ms step_avg:4055.70ms
step:677/1750 train_loss:3.9484 train_time:2705341ms step_avg:4055.98ms
step:678/1750 train_loss:3.6086 train_time:2709563ms step_avg:4056.23ms
step:679/1750 train_loss:3.5944 train_time:2713782ms step_avg:4056.47ms
step:680/1750 train_loss:3.8033 train_time:2718001ms step_avg:4056.72ms
step:681/1750 train_loss:3.7934 train_time:2722252ms step_avg:4057.01ms
step:682/1750 train_loss:3.6291 train_time:2726475ms step_avg:4057.25ms
step:683/1750 train_loss:3.5525 train_time:2730678ms step_avg:4057.47ms
step:684/1750 train_loss:3.4129 train_time:2734903ms step_avg:4057.72ms
step:685/1750 train_loss:3.6717 train_time:2739146ms step_avg:4057.99ms
step:686/1750 train_loss:3.5345 train_time:2743393ms step_avg:4058.27ms
step:687/1750 train_loss:3.5820 train_time:2747622ms step_avg:4058.53ms
step:688/1750 train_loss:3.6204 train_time:2751839ms step_avg:4058.76ms
step:689/1750 train_loss:3.5131 train_time:2756062ms step_avg:4059.00ms
step:690/1750 train_loss:3.7817 train_time:2760311ms step_avg:4059.28ms
step:691/1750 train_loss:3.6896 train_time:2764569ms step_avg:4059.57ms
step:692/1750 train_loss:3.4669 train_time:2768780ms step_avg:4059.79ms
step:693/1750 train_loss:3.7327 train_time:2772988ms step_avg:4060.01ms
step:694/1750 train_loss:3.5445 train_time:2777185ms step_avg:4060.21ms
step:695/1750 train_loss:3.4012 train_time:2781421ms step_avg:4060.47ms
step:696/1750 train_loss:3.6758 train_time:2785630ms step_avg:4060.69ms
step:697/1750 train_loss:3.6809 train_time:2789825ms step_avg:4060.88ms
step:698/1750 train_loss:3.5599 train_time:2794045ms step_avg:4061.11ms
step:699/1750 train_loss:3.6232 train_time:2798274ms step_avg:4061.36ms
step:700/1750 train_loss:4.0559 train_time:2802487ms step_avg:4061.58ms
step:701/1750 train_loss:3.5792 train_time:2806733ms step_avg:4061.84ms
step:702/1750 train_loss:3.7152 train_time:2810957ms step_avg:4062.08ms
step:703/1750 train_loss:3.6862 train_time:2815169ms step_avg:4062.29ms
step:704/1750 train_loss:3.7794 train_time:2819383ms step_avg:4062.51ms
step:705/1750 train_loss:3.7163 train_time:2823624ms step_avg:4062.77ms
step:706/1750 train_loss:3.4377 train_time:2827857ms step_avg:4063.01ms
step:707/1750 train_loss:3.6245 train_time:2832112ms step_avg:4063.29ms
step:708/1750 train_loss:3.6936 train_time:2836307ms step_avg:4063.48ms
step:709/1750 train_loss:3.5647 train_time:2840533ms step_avg:4063.71ms
step:710/1750 train_loss:3.7569 train_time:2844765ms step_avg:4063.95ms
step:711/1750 train_loss:3.4137 train_time:2848993ms step_avg:4064.18ms
step:712/1750 train_loss:3.6341 train_time:2853229ms step_avg:4064.43ms
step:713/1750 train_loss:3.5918 train_time:2857471ms step_avg:4064.68ms
step:714/1750 train_loss:3.5061 train_time:2861690ms step_avg:4064.90ms
step:715/1750 train_loss:3.7093 train_time:2865913ms step_avg:4065.12ms
step:716/1750 train_loss:3.3098 train_time:2870131ms step_avg:4065.34ms
step:717/1750 train_loss:3.5617 train_time:2874361ms step_avg:4065.57ms
step:718/1750 train_loss:3.6220 train_time:2878572ms step_avg:4065.78ms
step:719/1750 train_loss:3.4841 train_time:2882798ms step_avg:4066.01ms
step:720/1750 train_loss:3.5308 train_time:2886988ms step_avg:4066.18ms
step:721/1750 train_loss:3.6190 train_time:2891202ms step_avg:4066.39ms
step:722/1750 train_loss:3.4324 train_time:2895430ms step_avg:4066.62ms
step:723/1750 train_loss:3.5613 train_time:2899670ms step_avg:4066.86ms
step:724/1750 train_loss:3.6761 train_time:2903885ms step_avg:4067.07ms
step:725/1750 train_loss:3.6946 train_time:2908109ms step_avg:4067.29ms
step:726/1750 train_loss:3.7547 train_time:2912343ms step_avg:4067.52ms
step:727/1750 train_loss:3.6204 train_time:2916583ms step_avg:4067.76ms
step:728/1750 train_loss:3.5253 train_time:2920818ms step_avg:4067.99ms
step:729/1750 train_loss:3.6924 train_time:2925036ms step_avg:4068.20ms
step:730/1750 train_loss:3.5273 train_time:2929249ms step_avg:4068.40ms
step:731/1750 train_loss:3.8386 train_time:2933502ms step_avg:4068.66ms
step:732/1750 train_loss:3.9836 train_time:2937730ms step_avg:4068.88ms
step:733/1750 train_loss:3.6458 train_time:2941936ms step_avg:4069.07ms
step:734/1750 train_loss:3.9149 train_time:2946154ms step_avg:4069.27ms
step:735/1750 train_loss:3.4461 train_time:2950352ms step_avg:4069.45ms
step:736/1750 train_loss:3.6156 train_time:2954588ms step_avg:4069.68ms
step:737/1750 train_loss:3.7104 train_time:2958803ms step_avg:4069.88ms
step:738/1750 train_loss:4.0871 train_time:2963073ms step_avg:4070.15ms
step:739/1750 train_loss:3.6570 train_time:2967294ms step_avg:4070.36ms
step:740/1750 train_loss:3.5046 train_time:2971513ms step_avg:4070.57ms
step:741/1750 train_loss:3.7073 train_time:2975721ms step_avg:4070.75ms
step:742/1750 train_loss:3.6470 train_time:2979967ms step_avg:4070.99ms
step:743/1750 train_loss:3.5568 train_time:2984197ms step_avg:4071.21ms
step:744/1750 train_loss:3.3653 train_time:2988426ms step_avg:4071.43ms
step:745/1750 train_loss:3.6069 train_time:2992641ms step_avg:4071.62ms
step:746/1750 train_loss:3.6110 train_time:2996893ms step_avg:4071.87ms
step:747/1750 train_loss:3.5930 train_time:3001101ms step_avg:4072.05ms
step:748/1750 train_loss:3.5836 train_time:3005325ms step_avg:4072.26ms
step:749/1750 train_loss:3.5476 train_time:3009527ms step_avg:4072.43ms
step:750/1750 train_loss:3.4145 train_time:3013775ms step_avg:4072.67ms
step:750/1750 val_loss:3.5928 train_time:3013775ms step_avg:4072.67ms
step:751/1750 train_loss:3.4333 train_time:3018000ms step_avg:4072.87ms
step:752/1750 train_loss:3.7301 train_time:3022254ms step_avg:4073.12ms
step:753/1750 train_loss:3.4723 train_time:3026509ms step_avg:4073.36ms
step:754/1750 train_loss:3.5647 train_time:3030707ms step_avg:4073.53ms
step:755/1750 train_loss:3.5285 train_time:3034941ms step_avg:4073.75ms
step:756/1750 train_loss:4.0700 train_time:3039167ms step_avg:4073.95ms
step:757/1750 train_loss:3.6815 train_time:3043370ms step_avg:4074.12ms
step:758/1750 train_loss:3.5230 train_time:3047631ms step_avg:4074.37ms
step:759/1750 train_loss:3.6191 train_time:3051886ms step_avg:4074.61ms
step:760/1750 train_loss:3.5828 train_time:3056083ms step_avg:4074.78ms
step:761/1750 train_loss:3.4600 train_time:3060305ms step_avg:4074.97ms
step:762/1750 train_loss:3.4424 train_time:3064536ms step_avg:4075.18ms
step:763/1750 train_loss:3.6050 train_time:3068757ms step_avg:4075.37ms
step:764/1750 train_loss:3.4818 train_time:3072965ms step_avg:4075.55ms
step:765/1750 train_loss:3.4179 train_time:3077195ms step_avg:4075.75ms
step:766/1750 train_loss:3.5355 train_time:3081426ms step_avg:4075.96ms
step:767/1750 train_loss:3.5303 train_time:3085659ms step_avg:4076.17ms
step:768/1750 train_loss:3.7718 train_time:3089923ms step_avg:4076.42ms
step:769/1750 train_loss:3.6077 train_time:3094151ms step_avg:4076.62ms
step:770/1750 train_loss:3.5662 train_time:3098388ms step_avg:4076.83ms
step:771/1750 train_loss:3.5860 train_time:3102634ms step_avg:4077.05ms
step:772/1750 train_loss:3.4679 train_time:3106848ms step_avg:4077.23ms
step:773/1750 train_loss:3.7978 train_time:3111070ms step_avg:4077.42ms
step:774/1750 train_loss:3.7088 train_time:3115285ms step_avg:4077.60ms
step:775/1750 train_loss:3.6293 train_time:3119534ms step_avg:4077.82ms
step:776/1750 train_loss:3.5729 train_time:3123752ms step_avg:4078.00ms
step:777/1750 train_loss:3.5359 train_time:3127956ms step_avg:4078.17ms
step:778/1750 train_loss:3.5219 train_time:3132172ms step_avg:4078.35ms
step:779/1750 train_loss:3.5537 train_time:3136442ms step_avg:4078.60ms
step:780/1750 train_loss:3.4560 train_time:3140666ms step_avg:4078.79ms
step:781/1750 train_loss:3.4459 train_time:3144918ms step_avg:4079.01ms
step:782/1750 train_loss:3.5557 train_time:3149215ms step_avg:4079.29ms
step:783/1750 train_loss:3.6512 train_time:3153490ms step_avg:4079.55ms
step:784/1750 train_loss:3.8048 train_time:3157757ms step_avg:4079.79ms
step:785/1750 train_loss:3.5062 train_time:3162015ms step_avg:4080.02ms
step:786/1750 train_loss:3.5570 train_time:3166283ms step_avg:4080.26ms
step:787/1750 train_loss:3.6060 train_time:3170548ms step_avg:4080.50ms
step:788/1750 train_loss:3.6492 train_time:3174826ms step_avg:4080.75ms
step:789/1750 train_loss:3.6040 train_time:3179077ms step_avg:4080.97ms
step:790/1750 train_loss:3.9517 train_time:3183344ms step_avg:4081.21ms
step:791/1750 train_loss:3.5910 train_time:3187599ms step_avg:4081.43ms
step:792/1750 train_loss:3.4988 train_time:3191837ms step_avg:4081.63ms
step:793/1750 train_loss:4.2635 train_time:3196101ms step_avg:4081.87ms
step:794/1750 train_loss:3.6540 train_time:3200391ms step_avg:4082.13ms
step:795/1750 train_loss:3.4652 train_time:3204656ms step_avg:4082.36ms
step:796/1750 train_loss:3.5589 train_time:3208950ms step_avg:4082.63ms
step:797/1750 train_loss:3.6320 train_time:3213303ms step_avg:4082.98ms
step:798/1750 train_loss:3.5315 train_time:3217592ms step_avg:4083.24ms
step:799/1750 train_loss:3.5431 train_time:3221855ms step_avg:4083.47ms
step:800/1750 train_loss:3.4756 train_time:3226152ms step_avg:4083.74ms
step:801/1750 train_loss:3.5285 train_time:3230450ms step_avg:4084.01ms
step:802/1750 train_loss:3.5791 train_time:3234725ms step_avg:4084.25ms
step:803/1750 train_loss:5.1919 train_time:3238997ms step_avg:4084.49ms
step:804/1750 train_loss:3.6027 train_time:3243276ms step_avg:4084.73ms
step:805/1750 train_loss:3.3569 train_time:3247518ms step_avg:4084.93ms
step:806/1750 train_loss:3.5745 train_time:3251764ms step_avg:4085.13ms
step:807/1750 train_loss:3.6482 train_time:3256077ms step_avg:4085.42ms
step:808/1750 train_loss:3.6452 train_time:3260334ms step_avg:4085.63ms
step:809/1750 train_loss:3.4986 train_time:3264574ms step_avg:4085.82ms
step:810/1750 train_loss:3.5114 train_time:3268843ms step_avg:4086.05ms
step:811/1750 train_loss:3.2326 train_time:3273095ms step_avg:4086.26ms
step:812/1750 train_loss:3.5873 train_time:3277370ms step_avg:4086.50ms
step:813/1750 train_loss:3.3666 train_time:3281640ms step_avg:4086.72ms
step:814/1750 train_loss:3.6384 train_time:3285907ms step_avg:4086.95ms
step:815/1750 train_loss:3.6050 train_time:3290152ms step_avg:4087.14ms
step:816/1750 train_loss:3.6854 train_time:3294403ms step_avg:4087.35ms
step:817/1750 train_loss:3.4267 train_time:3298671ms step_avg:4087.57ms
step:818/1750 train_loss:3.5433 train_time:3302949ms step_avg:4087.81ms
step:819/1750 train_loss:3.5101 train_time:3307188ms step_avg:4087.99ms
step:820/1750 train_loss:3.4765 train_time:3311438ms step_avg:4088.20ms
step:821/1750 train_loss:3.4357 train_time:3315699ms step_avg:4088.41ms
step:822/1750 train_loss:3.5827 train_time:3319982ms step_avg:4088.65ms
step:823/1750 train_loss:3.5189 train_time:3324230ms step_avg:4088.84ms
step:824/1750 train_loss:3.5754 train_time:3328481ms step_avg:4089.04ms
step:825/1750 train_loss:3.6788 train_time:3332736ms step_avg:4089.25ms
step:826/1750 train_loss:3.5609 train_time:3336986ms step_avg:4089.44ms
step:827/1750 train_loss:3.4560 train_time:3341257ms step_avg:4089.67ms
step:828/1750 train_loss:3.7171 train_time:3345586ms step_avg:4089.96ms
step:829/1750 train_loss:3.4250 train_time:3349843ms step_avg:4090.16ms
step:830/1750 train_loss:3.4134 train_time:3354128ms step_avg:4090.40ms
step:831/1750 train_loss:3.6994 train_time:3358387ms step_avg:4090.61ms
step:832/1750 train_loss:3.5950 train_time:3362653ms step_avg:4090.82ms
step:833/1750 train_loss:4.4954 train_time:3366924ms step_avg:4091.04ms
step:834/1750 train_loss:3.4383 train_time:3371220ms step_avg:4091.29ms
step:835/1750 train_loss:3.4316 train_time:3375461ms step_avg:4091.47ms
step:836/1750 train_loss:3.4743 train_time:3379766ms step_avg:4091.73ms
step:837/1750 train_loss:3.5175 train_time:3384043ms step_avg:4091.95ms
step:838/1750 train_loss:3.6096 train_time:3388313ms step_avg:4092.17ms
step:839/1750 train_loss:3.8956 train_time:3392576ms step_avg:4092.37ms
step:840/1750 train_loss:3.6351 train_time:3396820ms step_avg:4092.55ms
step:841/1750 train_loss:3.5123 train_time:3401072ms step_avg:4092.75ms
step:842/1750 train_loss:3.6630 train_time:3405326ms step_avg:4092.94ms
step:843/1750 train_loss:3.6713 train_time:3409584ms step_avg:4093.14ms
step:844/1750 train_loss:3.6605 train_time:3413842ms step_avg:4093.34ms
step:845/1750 train_loss:3.6659 train_time:3418089ms step_avg:4093.52ms
step:846/1750 train_loss:3.3795 train_time:3422359ms step_avg:4093.73ms
step:847/1750 train_loss:3.4259 train_time:3426587ms step_avg:4093.89ms
step:848/1750 train_loss:3.6818 train_time:3430886ms step_avg:4094.14ms
step:849/1750 train_loss:3.6541 train_time:3435159ms step_avg:4094.35ms
step:850/1750 train_loss:3.4888 train_time:3439410ms step_avg:4094.54ms
step:851/1750 train_loss:3.7276 train_time:3443682ms step_avg:4094.75ms
step:852/1750 train_loss:3.5941 train_time:3447960ms step_avg:4094.96ms
step:853/1750 train_loss:3.6057 train_time:3452252ms step_avg:4095.20ms
step:854/1750 train_loss:3.6977 train_time:3456523ms step_avg:4095.41ms
step:855/1750 train_loss:3.5524 train_time:3460778ms step_avg:4095.60ms
step:856/1750 train_loss:3.5113 train_time:3465025ms step_avg:4095.77ms
step:857/1750 train_loss:3.6043 train_time:3469257ms step_avg:4095.93ms
step:858/1750 train_loss:3.5897 train_time:3473510ms step_avg:4096.12ms
step:859/1750 train_loss:3.4993 train_time:3477753ms step_avg:4096.29ms
step:860/1750 train_loss:3.5280 train_time:3482087ms step_avg:4096.57ms
step:861/1750 train_loss:3.5228 train_time:3486359ms step_avg:4096.78ms
step:862/1750 train_loss:3.4983 train_time:3490609ms step_avg:4096.96ms
step:863/1750 train_loss:3.4100 train_time:3494885ms step_avg:4097.17ms
step:864/1750 train_loss:3.0345 train_time:3499166ms step_avg:4097.38ms
step:865/1750 train_loss:3.5060 train_time:3503432ms step_avg:4097.58ms
step:866/1750 train_loss:3.7220 train_time:3507709ms step_avg:4097.79ms
step:867/1750 train_loss:3.4330 train_time:3511954ms step_avg:4097.96ms
step:868/1750 train_loss:3.5790 train_time:3516283ms step_avg:4098.23ms
step:869/1750 train_loss:3.5266 train_time:3520526ms step_avg:4098.40ms
step:870/1750 train_loss:3.5841 train_time:3524758ms step_avg:4098.56ms
step:871/1750 train_loss:3.5598 train_time:3529010ms step_avg:4098.73ms
step:872/1750 train_loss:3.4150 train_time:3533287ms step_avg:4098.94ms
step:873/1750 train_loss:3.4501 train_time:3537545ms step_avg:4099.13ms
step:874/1750 train_loss:3.3052 train_time:3541807ms step_avg:4099.31ms
step:875/1750 train_loss:3.5010 train_time:3546058ms step_avg:4099.49ms
step:875/1750 val_loss:3.5482 train_time:3546059ms step_avg:4099.49ms
step:876/1750 train_loss:3.2645 train_time:3550334ms step_avg:4099.69ms
step:877/1750 train_loss:3.3980 train_time:3554591ms step_avg:4099.87ms
step:878/1750 train_loss:3.5072 train_time:3558869ms step_avg:4100.08ms
step:879/1750 train_loss:3.6380 train_time:3563143ms step_avg:4100.28ms
step:880/1750 train_loss:3.5344 train_time:3567418ms step_avg:4100.48ms
step:881/1750 train_loss:3.4663 train_time:3571693ms step_avg:4100.68ms
step:882/1750 train_loss:3.7707 train_time:3575950ms step_avg:4100.86ms
step:883/1750 train_loss:3.3851 train_time:3580201ms step_avg:4101.03ms
step:884/1750 train_loss:3.4273 train_time:3584456ms step_avg:4101.21ms
step:885/1750 train_loss:3.5164 train_time:3588711ms step_avg:4101.38ms
step:886/1750 train_loss:3.4281 train_time:3592984ms step_avg:4101.58ms
step:887/1750 train_loss:3.3886 train_time:3597248ms step_avg:4101.77ms
step:888/1750 train_loss:3.6192 train_time:3601522ms step_avg:4101.96ms
step:889/1750 train_loss:3.5981 train_time:3605774ms step_avg:4102.13ms
step:890/1750 train_loss:3.4848 train_time:3610078ms step_avg:4102.36ms
step:891/1750 train_loss:3.4901 train_time:3614309ms step_avg:4102.51ms
step:892/1750 train_loss:3.5809 train_time:3618562ms step_avg:4102.68ms
step:893/1750 train_loss:3.7462 train_time:3622819ms step_avg:4102.85ms
step:894/1750 train_loss:3.7012 train_time:3627074ms step_avg:4103.03ms
step:895/1750 train_loss:3.4419 train_time:3631338ms step_avg:4103.21ms
step:896/1750 train_loss:3.4191 train_time:3635592ms step_avg:4103.38ms
step:897/1750 train_loss:3.8254 train_time:3639863ms step_avg:4103.57ms
step:898/1750 train_loss:3.5594 train_time:3644108ms step_avg:4103.72ms
step:899/1750 train_loss:3.5988 train_time:3648372ms step_avg:4103.91ms
step:900/1750 train_loss:3.5763 train_time:3652664ms step_avg:4104.12ms
step:901/1750 train_loss:3.5295 train_time:3656942ms step_avg:4104.31ms
step:902/1750 train_loss:3.3459 train_time:3661185ms step_avg:4104.47ms
step:903/1750 train_loss:3.4745 train_time:3665455ms step_avg:4104.65ms
step:904/1750 train_loss:5.3846 train_time:3669707ms step_avg:4104.82ms
step:905/1750 train_loss:3.5196 train_time:3673971ms step_avg:4105.00ms
step:906/1750 train_loss:3.5941 train_time:3678238ms step_avg:4105.18ms
step:907/1750 train_loss:3.6461 train_time:3682481ms step_avg:4105.33ms
step:908/1750 train_loss:3.5213 train_time:3686755ms step_avg:4105.52ms
step:909/1750 train_loss:3.5245 train_time:3691088ms step_avg:4105.77ms
step:910/1750 train_loss:3.4600 train_time:3695379ms step_avg:4105.98ms
step:911/1750 train_loss:3.5255 train_time:3699673ms step_avg:4106.19ms
step:912/1750 train_loss:3.7726 train_time:3704035ms step_avg:4106.47ms
step:913/1750 train_loss:3.4946 train_time:3708353ms step_avg:4106.70ms
step:914/1750 train_loss:3.6687 train_time:3712663ms step_avg:4106.93ms
step:915/1750 train_loss:3.7087 train_time:3716953ms step_avg:4107.13ms
step:916/1750 train_loss:3.5532 train_time:3721246ms step_avg:4107.34ms
step:917/1750 train_loss:3.7066 train_time:3725583ms step_avg:4107.59ms
step:918/1750 train_loss:3.6515 train_time:3729889ms step_avg:4107.81ms
step:919/1750 train_loss:3.6180 train_time:3734217ms step_avg:4108.05ms
step:920/1750 train_loss:3.7323 train_time:3738531ms step_avg:4108.28ms
step:921/1750 train_loss:3.5283 train_time:3742868ms step_avg:4108.53ms
step:922/1750 train_loss:3.8176 train_time:3747181ms step_avg:4108.75ms
step:923/1750 train_loss:3.6982 train_time:3751469ms step_avg:4108.95ms
step:924/1750 train_loss:3.3168 train_time:3755775ms step_avg:4109.16ms
step:925/1750 train_loss:3.4520 train_time:3760035ms step_avg:4109.33ms
step:926/1750 train_loss:3.4128 train_time:3764335ms step_avg:4109.54ms
step:927/1750 train_loss:3.7777 train_time:3768662ms step_avg:4109.77ms
step:928/1750 train_loss:3.5386 train_time:3772952ms step_avg:4109.97ms
step:929/1750 train_loss:3.5393 train_time:3777252ms step_avg:4110.18ms
step:930/1750 train_loss:3.5206 train_time:3781568ms step_avg:4110.40ms
step:931/1750 train_loss:4.3604 train_time:3785893ms step_avg:4110.63ms
step:932/1750 train_loss:3.5030 train_time:3790198ms step_avg:4110.84ms
step:933/1750 train_loss:3.5733 train_time:3794482ms step_avg:4111.03ms
step:934/1750 train_loss:3.9693 train_time:3798808ms step_avg:4111.26ms
step:935/1750 train_loss:3.7475 train_time:3803081ms step_avg:4111.44ms
step:936/1750 train_loss:3.9976 train_time:3807421ms step_avg:4111.69ms
step:937/1750 train_loss:3.6592 train_time:3811711ms step_avg:4111.88ms
step:938/1750 train_loss:3.6137 train_time:3816016ms step_avg:4112.09ms
step:939/1750 train_loss:3.6132 train_time:3820340ms step_avg:4112.31ms
step:940/1750 train_loss:3.4544 train_time:3824629ms step_avg:4112.50ms
step:941/1750 train_loss:3.1997 train_time:3828970ms step_avg:4112.75ms
step:942/1750 train_loss:3.5394 train_time:3833267ms step_avg:4112.95ms
step:943/1750 train_loss:3.4988 train_time:3837569ms step_avg:4113.15ms
step:944/1750 train_loss:3.3534 train_time:3841846ms step_avg:4113.33ms
step:945/1750 train_loss:3.4156 train_time:3846145ms step_avg:4113.52ms
step:946/1750 train_loss:3.4713 train_time:3850494ms step_avg:4113.78ms
step:947/1750 train_loss:3.4855 train_time:3854828ms step_avg:4114.01ms
step:948/1750 train_loss:3.5311 train_time:3859174ms step_avg:4114.26ms
step:949/1750 train_loss:3.4780 train_time:3863468ms step_avg:4114.45ms
step:950/1750 train_loss:3.4294 train_time:3867799ms step_avg:4114.68ms
step:951/1750 train_loss:3.5616 train_time:3872132ms step_avg:4114.91ms
step:952/1750 train_loss:3.4629 train_time:3876416ms step_avg:4115.09ms
step:953/1750 train_loss:3.7227 train_time:3880727ms step_avg:4115.30ms
step:954/1750 train_loss:3.6124 train_time:3885027ms step_avg:4115.49ms
step:955/1750 train_loss:3.2374 train_time:3889338ms step_avg:4115.70ms
step:956/1750 train_loss:3.2563 train_time:3893626ms step_avg:4115.88ms
step:957/1750 train_loss:3.4600 train_time:3897924ms step_avg:4116.08ms
step:958/1750 train_loss:3.5425 train_time:3902188ms step_avg:4116.23ms
step:959/1750 train_loss:3.2815 train_time:3906517ms step_avg:4116.46ms
step:960/1750 train_loss:3.5184 train_time:3910803ms step_avg:4116.63ms
step:961/1750 train_loss:3.7168 train_time:3915108ms step_avg:4116.83ms
step:962/1750 train_loss:3.4744 train_time:3919467ms step_avg:4117.09ms
step:963/1750 train_loss:3.3870 train_time:3923787ms step_avg:4117.30ms
step:964/1750 train_loss:3.3582 train_time:3928075ms step_avg:4117.48ms
step:965/1750 train_loss:3.5796 train_time:3932357ms step_avg:4117.65ms
step:966/1750 train_loss:3.6220 train_time:3936685ms step_avg:4117.87ms
step:967/1750 train_loss:3.5917 train_time:3940988ms step_avg:4118.07ms
step:968/1750 train_loss:3.6095 train_time:3945255ms step_avg:4118.22ms
step:969/1750 train_loss:3.5036 train_time:3949558ms step_avg:4118.41ms
step:970/1750 train_loss:3.4831 train_time:3953866ms step_avg:4118.61ms
step:971/1750 train_loss:3.3092 train_time:3958168ms step_avg:4118.80ms
step:972/1750 train_loss:3.4712 train_time:3962490ms step_avg:4119.01ms
step:973/1750 train_loss:3.2152 train_time:3966776ms step_avg:4119.19ms
step:974/1750 train_loss:3.5323 train_time:3971056ms step_avg:4119.35ms
step:975/1750 train_loss:3.2662 train_time:3975368ms step_avg:4119.55ms
step:976/1750 train_loss:3.5693 train_time:3979673ms step_avg:4119.74ms
step:977/1750 train_loss:3.7447 train_time:3983960ms step_avg:4119.92ms
step:978/1750 train_loss:3.6421 train_time:3988251ms step_avg:4120.09ms
step:979/1750 train_loss:3.4642 train_time:3992564ms step_avg:4120.29ms
step:980/1750 train_loss:3.5245 train_time:3996835ms step_avg:4120.45ms
step:981/1750 train_loss:3.4675 train_time:4001112ms step_avg:4120.61ms
step:982/1750 train_loss:3.4019 train_time:4005409ms step_avg:4120.79ms
step:983/1750 train_loss:3.4452 train_time:4009701ms step_avg:4120.97ms
step:984/1750 train_loss:3.5692 train_time:4013961ms step_avg:4121.11ms
step:985/1750 train_loss:3.6528 train_time:4018212ms step_avg:4121.24ms
step:986/1750 train_loss:3.4127 train_time:4022507ms step_avg:4121.42ms
step:987/1750 train_loss:3.5275 train_time:4026779ms step_avg:4121.57ms
step:988/1750 train_loss:3.5118 train_time:4031107ms step_avg:4121.79ms
step:989/1750 train_loss:3.5514 train_time:4035447ms step_avg:4122.01ms
step:990/1750 train_loss:3.4681 train_time:4039700ms step_avg:4122.14ms
step:991/1750 train_loss:3.4990 train_time:4044009ms step_avg:4122.33ms
step:992/1750 train_loss:3.4864 train_time:4048290ms step_avg:4122.49ms
step:993/1750 train_loss:3.3092 train_time:4052614ms step_avg:4122.70ms
step:994/1750 train_loss:3.5886 train_time:4056928ms step_avg:4122.89ms
step:995/1750 train_loss:3.4125 train_time:4061270ms step_avg:4123.12ms
step:996/1750 train_loss:3.4127 train_time:4065569ms step_avg:4123.30ms
step:997/1750 train_loss:3.5228 train_time:4069851ms step_avg:4123.46ms
step:998/1750 train_loss:3.5509 train_time:4074130ms step_avg:4123.61ms
step:999/1750 train_loss:3.7477 train_time:4078404ms step_avg:4123.77ms
step:1000/1750 train_loss:3.5269 train_time:4082663ms step_avg:4123.90ms
step:1000/1750 val_loss:3.5071 train_time:4082664ms step_avg:4123.90ms
step:1001/1750 train_loss:3.7410 train_time:4086941ms step_avg:4124.06ms
step:1002/1750 train_loss:3.6205 train_time:4091222ms step_avg:4124.22ms
step:1003/1750 train_loss:3.4958 train_time:4095523ms step_avg:4124.39ms
step:1004/1750 train_loss:3.4959 train_time:4099802ms step_avg:4124.55ms
step:1005/1750 train_loss:3.5501 train_time:4104121ms step_avg:4124.74ms
step:1006/1750 train_loss:3.5141 train_time:4108434ms step_avg:4124.93ms
step:1007/1750 train_loss:3.5614 train_time:4112787ms step_avg:4125.16ms
step:1008/1750 train_loss:3.4478 train_time:4117092ms step_avg:4125.34ms
step:1009/1750 train_loss:3.4659 train_time:4121361ms step_avg:4125.49ms
step:1010/1750 train_loss:3.5621 train_time:4125664ms step_avg:4125.66ms
step:1011/1750 train_loss:3.9934 train_time:4129994ms step_avg:4125.87ms
step:1012/1750 train_loss:3.6384 train_time:4134290ms step_avg:4126.04ms
step:1013/1750 train_loss:3.4279 train_time:4138602ms step_avg:4126.22ms
step:1014/1750 train_loss:3.3730 train_time:4142894ms step_avg:4126.39ms
step:1015/1750 train_loss:3.4206 train_time:4147244ms step_avg:4126.61ms
step:1016/1750 train_loss:3.4290 train_time:4151521ms step_avg:4126.76ms
step:1017/1750 train_loss:3.6755 train_time:4155812ms step_avg:4126.92ms
step:1018/1750 train_loss:3.3742 train_time:4160118ms step_avg:4127.10ms
step:1019/1750 train_loss:3.5063 train_time:4164428ms step_avg:4127.28ms
step:1020/1750 train_loss:3.4828 train_time:4168723ms step_avg:4127.45ms
step:1021/1750 train_loss:3.5650 train_time:4173038ms step_avg:4127.63ms
step:1022/1750 train_loss:3.4805 train_time:4177352ms step_avg:4127.82ms
step:1023/1750 train_loss:3.8176 train_time:4181669ms step_avg:4128.01ms
step:1024/1750 train_loss:3.5932 train_time:4185947ms step_avg:4128.15ms
step:1025/1750 train_loss:3.4194 train_time:4190209ms step_avg:4128.28ms
step:1026/1750 train_loss:3.6884 train_time:4194521ms step_avg:4128.47ms
step:1027/1750 train_loss:3.4575 train_time:4198852ms step_avg:4128.66ms
step:1028/1750 train_loss:3.5189 train_time:4203159ms step_avg:4128.84ms
step:1029/1750 train_loss:3.4456 train_time:4207450ms step_avg:4129.00ms
step:1030/1750 train_loss:3.2854 train_time:4211744ms step_avg:4129.16ms
step:1031/1750 train_loss:3.6676 train_time:4216100ms step_avg:4129.38ms
step:1032/1750 train_loss:3.9201 train_time:4220441ms step_avg:4129.59ms
step:1033/1750 train_loss:3.4745 train_time:4224717ms step_avg:4129.73ms
step:1034/1750 train_loss:3.4424 train_time:4228978ms step_avg:4129.86ms
step:1035/1750 train_loss:3.5072 train_time:4233262ms step_avg:4130.01ms
step:1036/1750 train_loss:3.7298 train_time:4237588ms step_avg:4130.20ms
step:1037/1750 train_loss:3.3466 train_time:4241905ms step_avg:4130.38ms
step:1038/1750 train_loss:3.2580 train_time:4246215ms step_avg:4130.56ms
step:1039/1750 train_loss:3.6420 train_time:4250553ms step_avg:4130.76ms
step:1040/1750 train_loss:3.3245 train_time:4254924ms step_avg:4130.99ms
step:1041/1750 train_loss:3.3227 train_time:4259232ms step_avg:4131.17ms
step:1042/1750 train_loss:3.5527 train_time:4263564ms step_avg:4131.36ms
step:1043/1750 train_loss:3.1842 train_time:4267885ms step_avg:4131.54ms
step:1044/1750 train_loss:3.4370 train_time:4272204ms step_avg:4131.73ms
step:1045/1750 train_loss:3.5331 train_time:4276505ms step_avg:4131.89ms
step:1046/1750 train_loss:3.7549 train_time:4280833ms step_avg:4132.08ms
step:1047/1750 train_loss:3.3435 train_time:4285206ms step_avg:4132.31ms
step:1048/1750 train_loss:3.7788 train_time:4289572ms step_avg:4132.54ms
step:1049/1750 train_loss:3.3527 train_time:4293889ms step_avg:4132.71ms
step:1050/1750 train_loss:3.4081 train_time:4298197ms step_avg:4132.88ms
step:1051/1750 train_loss:3.3758 train_time:4302540ms step_avg:4133.08ms
step:1052/1750 train_loss:3.4073 train_time:4306873ms step_avg:4133.28ms
step:1053/1750 train_loss:3.4351 train_time:4311229ms step_avg:4133.49ms
step:1054/1750 train_loss:3.4987 train_time:4315534ms step_avg:4133.65ms
step:1055/1750 train_loss:3.5127 train_time:4319873ms step_avg:4133.85ms
step:1056/1750 train_loss:3.5316 train_time:4324185ms step_avg:4134.02ms
step:1057/1750 train_loss:3.4735 train_time:4328480ms step_avg:4134.17ms
step:1058/1750 train_loss:3.3973 train_time:4332820ms step_avg:4134.37ms
step:1059/1750 train_loss:3.5637 train_time:4337108ms step_avg:4134.52ms
step:1060/1750 train_loss:3.5444 train_time:4341430ms step_avg:4134.69ms
step:1061/1750 train_loss:3.5342 train_time:4345857ms step_avg:4134.97ms
step:1062/1750 train_loss:3.6877 train_time:4350208ms step_avg:4135.18ms
step:1063/1750 train_loss:3.3981 train_time:4354524ms step_avg:4135.35ms
step:1064/1750 train_loss:3.3367 train_time:4358846ms step_avg:4135.53ms
step:1065/1750 train_loss:3.4132 train_time:4363160ms step_avg:4135.70ms
step:1066/1750 train_loss:3.5532 train_time:4367484ms step_avg:4135.88ms
step:1067/1750 train_loss:3.5331 train_time:4371777ms step_avg:4136.02ms
step:1068/1750 train_loss:3.4981 train_time:4376141ms step_avg:4136.24ms
step:1069/1750 train_loss:3.4003 train_time:4380476ms step_avg:4136.43ms
step:1070/1750 train_loss:3.5061 train_time:4384840ms step_avg:4136.64ms
step:1071/1750 train_loss:3.7657 train_time:4389178ms step_avg:4136.83ms
step:1072/1750 train_loss:3.4794 train_time:4393488ms step_avg:4136.99ms
step:1073/1750 train_loss:3.2011 train_time:4397796ms step_avg:4137.16ms
step:1074/1750 train_loss:3.4987 train_time:4402159ms step_avg:4137.37ms
step:1075/1750 train_loss:3.6015 train_time:4406461ms step_avg:4137.52ms
step:1076/1750 train_loss:3.5082 train_time:4410757ms step_avg:4137.67ms
step:1077/1750 train_loss:3.6033 train_time:4415094ms step_avg:4137.86ms
step:1078/1750 train_loss:3.5859 train_time:4419420ms step_avg:4138.03ms
step:1079/1750 train_loss:3.4619 train_time:4423712ms step_avg:4138.18ms
step:1080/1750 train_loss:3.5041 train_time:4428011ms step_avg:4138.33ms
step:1081/1750 train_loss:3.5543 train_time:4432374ms step_avg:4138.54ms
step:1082/1750 train_loss:3.5521 train_time:4436696ms step_avg:4138.71ms
step:1083/1750 train_loss:3.4154 train_time:4441056ms step_avg:4138.92ms
step:1084/1750 train_loss:3.4800 train_time:4445371ms step_avg:4139.08ms
step:1085/1750 train_loss:3.2039 train_time:4449716ms step_avg:4139.27ms
step:1086/1750 train_loss:3.4524 train_time:4454029ms step_avg:4139.43ms
step:1087/1750 train_loss:3.6695 train_time:4458417ms step_avg:4139.66ms
step:1088/1750 train_loss:3.4711 train_time:4462756ms step_avg:4139.85ms
step:1089/1750 train_loss:3.4233 train_time:4467096ms step_avg:4140.03ms
step:1090/1750 train_loss:3.4145 train_time:4471433ms step_avg:4140.22ms
step:1091/1750 train_loss:3.5455 train_time:4475799ms step_avg:4140.42ms
step:1092/1750 train_loss:3.4137 train_time:4480170ms step_avg:4140.64ms
step:1093/1750 train_loss:3.5154 train_time:4484513ms step_avg:4140.82ms
step:1094/1750 train_loss:3.3696 train_time:4488855ms step_avg:4141.01ms
step:1095/1750 train_loss:3.3212 train_time:4493181ms step_avg:4141.18ms
step:1096/1750 train_loss:3.6876 train_time:4497500ms step_avg:4141.34ms
step:1097/1750 train_loss:3.4452 train_time:4501847ms step_avg:4141.53ms
step:1098/1750 train_loss:3.6334 train_time:4506138ms step_avg:4141.67ms
step:1099/1750 train_loss:3.4931 train_time:4510499ms step_avg:4141.87ms
step:1100/1750 train_loss:3.6172 train_time:4514831ms step_avg:4142.05ms
step:1101/1750 train_loss:3.6071 train_time:4519182ms step_avg:4142.24ms
step:1102/1750 train_loss:3.4513 train_time:4523488ms step_avg:4142.39ms
step:1103/1750 train_loss:3.6070 train_time:4527749ms step_avg:4142.50ms
step:1104/1750 train_loss:3.4955 train_time:4532108ms step_avg:4142.69ms
step:1105/1750 train_loss:3.4317 train_time:4536455ms step_avg:4142.88ms
step:1106/1750 train_loss:3.5963 train_time:4540781ms step_avg:4143.05ms
step:1107/1750 train_loss:3.5854 train_time:4545081ms step_avg:4143.19ms
step:1108/1750 train_loss:3.4376 train_time:4549433ms step_avg:4143.38ms
step:1109/1750 train_loss:3.7583 train_time:4553783ms step_avg:4143.57ms
step:1110/1750 train_loss:3.3300 train_time:4558109ms step_avg:4143.74ms
step:1111/1750 train_loss:3.5560 train_time:4562452ms step_avg:4143.92ms
step:1112/1750 train_loss:3.6273 train_time:4566758ms step_avg:4144.06ms
step:1113/1750 train_loss:3.3385 train_time:4571107ms step_avg:4144.25ms
step:1114/1750 train_loss:3.6703 train_time:4575422ms step_avg:4144.40ms
step:1115/1750 train_loss:3.3341 train_time:4579716ms step_avg:4144.54ms
step:1116/1750 train_loss:3.5662 train_time:4584034ms step_avg:4144.70ms
step:1117/1750 train_loss:3.6509 train_time:4588379ms step_avg:4144.88ms
step:1118/1750 train_loss:3.2760 train_time:4592702ms step_avg:4145.04ms
step:1119/1750 train_loss:3.8304 train_time:4597008ms step_avg:4145.18ms
step:1120/1750 train_loss:3.3726 train_time:4601419ms step_avg:4145.42ms
step:1121/1750 train_loss:3.1968 train_time:4605792ms step_avg:4145.63ms
step:1122/1750 train_loss:3.4052 train_time:4610083ms step_avg:4145.76ms
step:1123/1750 train_loss:3.4867 train_time:4614411ms step_avg:4145.92ms
step:1124/1750 train_loss:3.4983 train_time:4618718ms step_avg:4146.07ms
step:1125/1750 train_loss:3.5032 train_time:4623004ms step_avg:4146.19ms
step:1125/1750 val_loss:3.4805 train_time:4623007ms step_avg:4146.19ms
step:1126/1750 train_loss:3.6340 train_time:4627318ms step_avg:4146.34ms
step:1127/1750 train_loss:3.4142 train_time:4631623ms step_avg:4146.48ms
step:1128/1750 train_loss:3.6645 train_time:4635955ms step_avg:4146.65ms
step:1129/1750 train_loss:3.2466 train_time:4640317ms step_avg:4146.84ms
step:1130/1750 train_loss:3.3492 train_time:4644671ms step_avg:4147.03ms
step:1131/1750 train_loss:3.4354 train_time:4648991ms step_avg:4147.18ms
step:1132/1750 train_loss:3.4655 train_time:4653334ms step_avg:4147.36ms
step:1133/1750 train_loss:3.6070 train_time:4657667ms step_avg:4147.52ms
step:1134/1750 train_loss:3.6971 train_time:4661984ms step_avg:4147.67ms
step:1135/1750 train_loss:3.4637 train_time:4666285ms step_avg:4147.81ms
step:1136/1750 train_loss:3.4461 train_time:4670602ms step_avg:4147.96ms
step:1137/1750 train_loss:3.6054 train_time:4674925ms step_avg:4148.11ms
step:1138/1750 train_loss:3.5582 train_time:4679278ms step_avg:4148.30ms
step:1139/1750 train_loss:3.4892 train_time:4683637ms step_avg:4148.48ms
step:1140/1750 train_loss:3.6424 train_time:4687927ms step_avg:4148.61ms
step:1141/1750 train_loss:3.5187 train_time:4692256ms step_avg:4148.77ms
step:1142/1750 train_loss:3.4743 train_time:4696629ms step_avg:4148.97ms
step:1143/1750 train_loss:3.4617 train_time:4700939ms step_avg:4149.11ms
step:1144/1750 train_loss:3.7136 train_time:4705262ms step_avg:4149.26ms
step:1145/1750 train_loss:3.7206 train_time:4709695ms step_avg:4149.51ms
step:1146/1750 train_loss:3.5422 train_time:4714032ms step_avg:4149.68ms
step:1147/1750 train_loss:3.4850 train_time:4718344ms step_avg:4149.82ms
step:1148/1750 train_loss:3.5151 train_time:4722660ms step_avg:4149.96ms
step:1149/1750 train_loss:3.5791 train_time:4726967ms step_avg:4150.10ms
step:1150/1750 train_loss:3.4456 train_time:4731282ms step_avg:4150.25ms
step:1151/1750 train_loss:3.4160 train_time:4735653ms step_avg:4150.44ms
step:1152/1750 train_loss:3.5975 train_time:4739948ms step_avg:4150.57ms
step:1153/1750 train_loss:3.4351 train_time:4744279ms step_avg:4150.72ms
step:1154/1750 train_loss:3.5070 train_time:4748635ms step_avg:4150.90ms
step:1155/1750 train_loss:3.3044 train_time:4752952ms step_avg:4151.05ms
step:1156/1750 train_loss:3.8189 train_time:4757309ms step_avg:4151.23ms
step:1157/1750 train_loss:3.5962 train_time:4761637ms step_avg:4151.38ms
step:1158/1750 train_loss:3.5008 train_time:4765973ms step_avg:4151.54ms
step:1159/1750 train_loss:3.4741 train_time:4770361ms step_avg:4151.75ms
step:1160/1750 train_loss:3.6218 train_time:4774704ms step_avg:4151.92ms
step:1161/1750 train_loss:3.5039 train_time:4779036ms step_avg:4152.07ms
step:1162/1750 train_loss:3.6110 train_time:4783355ms step_avg:4152.22ms
step:1163/1750 train_loss:3.4820 train_time:4787683ms step_avg:4152.37ms
step:1164/1750 train_loss:3.5568 train_time:4791991ms step_avg:4152.51ms
step:1165/1750 train_loss:3.3957 train_time:4796324ms step_avg:4152.66ms
step:1166/1750 train_loss:3.5395 train_time:4800618ms step_avg:4152.78ms
step:1167/1750 train_loss:3.6034 train_time:4804967ms step_avg:4152.95ms
step:1168/1750 train_loss:3.3790 train_time:4809306ms step_avg:4153.11ms
step:1169/1750 train_loss:3.6051 train_time:4813669ms step_avg:4153.30ms
step:1170/1750 train_loss:3.5532 train_time:4818007ms step_avg:4153.45ms
step:1171/1750 train_loss:3.3870 train_time:4822400ms step_avg:4153.66ms
step:1172/1750 train_loss:3.6833 train_time:4826729ms step_avg:4153.81ms
step:1173/1750 train_loss:3.4555 train_time:4831068ms step_avg:4153.97ms
step:1174/1750 train_loss:3.5643 train_time:4835404ms step_avg:4154.13ms
step:1175/1750 train_loss:3.1927 train_time:4839756ms step_avg:4154.30ms
step:1176/1750 train_loss:3.4647 train_time:4844088ms step_avg:4154.45ms
step:1177/1750 train_loss:3.4081 train_time:4848538ms step_avg:4154.70ms
step:1178/1750 train_loss:3.5174 train_time:4852903ms step_avg:4154.88ms
step:1179/1750 train_loss:3.4659 train_time:4857304ms step_avg:4155.09ms
step:1180/1750 train_loss:3.5545 train_time:4861704ms step_avg:4155.30ms
step:1181/1750 train_loss:3.3882 train_time:4866069ms step_avg:4155.48ms
step:1182/1750 train_loss:3.3341 train_time:4870380ms step_avg:4155.61ms
step:1183/1750 train_loss:3.3791 train_time:4874780ms step_avg:4155.82ms
step:1184/1750 train_loss:3.2791 train_time:4879203ms step_avg:4156.05ms
step:1185/1750 train_loss:3.4532 train_time:4883526ms step_avg:4156.19ms
step:1186/1750 train_loss:3.5271 train_time:4887901ms step_avg:4156.38ms
step:1187/1750 train_loss:3.4642 train_time:4892286ms step_avg:4156.57ms
step:1188/1750 train_loss:3.3645 train_time:4896667ms step_avg:4156.76ms
step:1189/1750 train_loss:3.4752 train_time:4901021ms step_avg:4156.93ms
step:1190/1750 train_loss:3.5741 train_time:4905485ms step_avg:4157.19ms
step:1191/1750 train_loss:3.4694 train_time:4909853ms step_avg:4157.37ms
step:1192/1750 train_loss:3.4387 train_time:4914223ms step_avg:4157.55ms
step:1193/1750 train_loss:3.6773 train_time:4918558ms step_avg:4157.70ms
step:1194/1750 train_loss:3.7625 train_time:4922891ms step_avg:4157.85ms
step:1195/1750 train_loss:3.3565 train_time:4927259ms step_avg:4158.02ms
step:1196/1750 train_loss:4.2273 train_time:4931591ms step_avg:4158.17ms
step:1197/1750 train_loss:3.2733 train_time:4935936ms step_avg:4158.33ms
step:1198/1750 train_loss:3.2955 train_time:4940275ms step_avg:4158.48ms
step:1199/1750 train_loss:3.1738 train_time:4944636ms step_avg:4158.65ms
step:1200/1750 train_loss:3.5199 train_time:4948998ms step_avg:4158.82ms
step:1201/1750 train_loss:3.2989 train_time:4953431ms step_avg:4159.05ms
step:1202/1750 train_loss:3.5672 train_time:4957808ms step_avg:4159.23ms
step:1203/1750 train_loss:3.4574 train_time:4962168ms step_avg:4159.40ms
step:1204/1750 train_loss:3.3324 train_time:4966487ms step_avg:4159.54ms
step:1205/1750 train_loss:3.1405 train_time:4970862ms step_avg:4159.72ms
step:1206/1750 train_loss:3.4915 train_time:4975356ms step_avg:4160.00ms
step:1207/1750 train_loss:3.6586 train_time:4979735ms step_avg:4160.18ms
step:1208/1750 train_loss:3.6929 train_time:4984094ms step_avg:4160.35ms
step:1209/1750 train_loss:3.5930 train_time:4988442ms step_avg:4160.50ms
step:1210/1750 train_loss:3.3155 train_time:4992793ms step_avg:4160.66ms
step:1211/1750 train_loss:3.3604 train_time:4997130ms step_avg:4160.81ms
step:1212/1750 train_loss:3.3397 train_time:5001522ms step_avg:4161.00ms
step:1213/1750 train_loss:3.3909 train_time:5005936ms step_avg:4161.21ms
step:1214/1750 train_loss:3.4692 train_time:5010324ms step_avg:4161.40ms
step:1215/1750 train_loss:3.2018 train_time:5014686ms step_avg:4161.56ms
step:1216/1750 train_loss:3.3407 train_time:5019052ms step_avg:4161.73ms
step:1217/1750 train_loss:3.4550 train_time:5023431ms step_avg:4161.91ms
step:1218/1750 train_loss:3.2812 train_time:5027812ms step_avg:4162.10ms
step:1219/1750 train_loss:3.6110 train_time:5032107ms step_avg:4162.21ms
step:1220/1750 train_loss:3.4064 train_time:5036446ms step_avg:4162.35ms
step:1221/1750 train_loss:3.3558 train_time:5040762ms step_avg:4162.48ms
step:1222/1750 train_loss:3.3760 train_time:5045049ms step_avg:4162.58ms
step:1223/1750 train_loss:3.5351 train_time:5049413ms step_avg:4162.75ms
step:1224/1750 train_loss:3.5495 train_time:5053746ms step_avg:4162.89ms
step:1225/1750 train_loss:3.5631 train_time:5058107ms step_avg:4163.05ms
step:1226/1750 train_loss:3.8126 train_time:5062450ms step_avg:4163.20ms
step:1227/1750 train_loss:3.6845 train_time:5066808ms step_avg:4163.36ms
step:1228/1750 train_loss:3.4628 train_time:5071260ms step_avg:4163.60ms
step:1229/1750 train_loss:3.4777 train_time:5075570ms step_avg:4163.72ms
step:1230/1750 train_loss:3.6926 train_time:5079940ms step_avg:4163.89ms
step:1231/1750 train_loss:3.4206 train_time:5084324ms step_avg:4164.07ms
step:1232/1750 train_loss:3.2537 train_time:5088652ms step_avg:4164.20ms
step:1233/1750 train_loss:2.9162 train_time:5093090ms step_avg:4164.42ms
step:1234/1750 train_loss:3.5289 train_time:5097455ms step_avg:4164.59ms
step:1235/1750 train_loss:3.2636 train_time:5101855ms step_avg:4164.78ms
step:1236/1750 train_loss:3.3445 train_time:5106222ms step_avg:4164.94ms
step:1237/1750 train_loss:3.2236 train_time:5110569ms step_avg:4165.09ms
step:1238/1750 train_loss:3.1069 train_time:5114918ms step_avg:4165.24ms
step:1239/1750 train_loss:3.4157 train_time:5119253ms step_avg:4165.38ms
step:1240/1750 train_loss:3.3740 train_time:5123590ms step_avg:4165.52ms
step:1241/1750 train_loss:3.4793 train_time:5128021ms step_avg:4165.74ms
step:1242/1750 train_loss:3.3409 train_time:5132376ms step_avg:4165.89ms
step:1243/1750 train_loss:3.5600 train_time:5136749ms step_avg:4166.06ms
step:1244/1750 train_loss:3.3169 train_time:5141137ms step_avg:4166.24ms
step:1245/1750 train_loss:3.5297 train_time:5145501ms step_avg:4166.40ms
step:1246/1750 train_loss:3.6564 train_time:5149874ms step_avg:4166.56ms
step:1247/1750 train_loss:3.3527 train_time:5154233ms step_avg:4166.72ms
step:1248/1750 train_loss:3.2856 train_time:5158568ms step_avg:4166.86ms
step:1249/1750 train_loss:3.5363 train_time:5162892ms step_avg:4166.98ms
step:1250/1750 train_loss:3.3672 train_time:5167252ms step_avg:4167.14ms
step:1250/1750 val_loss:3.4256 train_time:5167252ms step_avg:4167.14ms
step:1251/1750 train_loss:3.5006 train_time:5171611ms step_avg:4167.29ms
step:1252/1750 train_loss:3.2946 train_time:5175913ms step_avg:4167.40ms
step:1253/1750 train_loss:3.5275 train_time:5180265ms step_avg:4167.55ms
step:1254/1750 train_loss:3.4184 train_time:5184649ms step_avg:4167.72ms
step:1255/1750 train_loss:3.1731 train_time:5189046ms step_avg:4167.91ms
step:1256/1750 train_loss:3.4098 train_time:5193359ms step_avg:4168.03ms
step:1257/1750 train_loss:3.5254 train_time:5197653ms step_avg:4168.13ms
step:1258/1750 train_loss:3.5038 train_time:5202105ms step_avg:4168.35ms
step:1259/1750 train_loss:3.2643 train_time:5206442ms step_avg:4168.49ms
step:1260/1750 train_loss:3.5403 train_time:5210804ms step_avg:4168.64ms
step:1261/1750 train_loss:3.3609 train_time:5215176ms step_avg:4168.81ms
step:1262/1750 train_loss:3.5001 train_time:5219631ms step_avg:4169.03ms
step:1263/1750 train_loss:3.3305 train_time:5223950ms step_avg:4169.15ms
step:1264/1750 train_loss:3.5473 train_time:5228280ms step_avg:4169.28ms
step:1265/1750 train_loss:3.5472 train_time:5232724ms step_avg:4169.50ms
step:1266/1750 train_loss:3.3003 train_time:5237120ms step_avg:4169.68ms
step:1267/1750 train_loss:3.5440 train_time:5241462ms step_avg:4169.82ms
step:1268/1750 train_loss:3.3852 train_time:5245794ms step_avg:4169.95ms
step:1269/1750 train_loss:3.2848 train_time:5250183ms step_avg:4170.12ms
step:1270/1750 train_loss:3.4991 train_time:5254540ms step_avg:4170.27ms
step:1271/1750 train_loss:3.4176 train_time:5258917ms step_avg:4170.43ms
step:1272/1750 train_loss:3.3343 train_time:5263243ms step_avg:4170.56ms
step:1273/1750 train_loss:3.3510 train_time:5267657ms step_avg:4170.75ms
step:1274/1750 train_loss:3.6842 train_time:5271965ms step_avg:4170.86ms
step:1275/1750 train_loss:3.4150 train_time:5276285ms step_avg:4170.98ms
step:1276/1750 train_loss:3.5273 train_time:5280615ms step_avg:4171.10ms
step:1277/1750 train_loss:3.4782 train_time:5284929ms step_avg:4171.21ms
step:1278/1750 train_loss:3.4432 train_time:5289301ms step_avg:4171.37ms
step:1279/1750 train_loss:3.2901 train_time:5293634ms step_avg:4171.50ms
step:1280/1750 train_loss:3.3592 train_time:5297969ms step_avg:4171.63ms
step:1281/1750 train_loss:3.4259 train_time:5302338ms step_avg:4171.78ms
step:1282/1750 train_loss:3.3672 train_time:5306669ms step_avg:4171.91ms
step:1283/1750 train_loss:3.4783 train_time:5311121ms step_avg:4172.13ms
step:1284/1750 train_loss:3.3696 train_time:5315520ms step_avg:4172.31ms
step:1285/1750 train_loss:3.5847 train_time:5319835ms step_avg:4172.42ms
step:1286/1750 train_loss:3.3091 train_time:5324142ms step_avg:4172.52ms
step:1287/1750 train_loss:3.3544 train_time:5328513ms step_avg:4172.68ms
step:1288/1750 train_loss:3.2988 train_time:5332854ms step_avg:4172.81ms
step:1289/1750 train_loss:3.4621 train_time:5337209ms step_avg:4172.95ms
step:1290/1750 train_loss:3.5511 train_time:5341576ms step_avg:4173.11ms
step:1291/1750 train_loss:3.3536 train_time:5345910ms step_avg:4173.23ms
step:1292/1750 train_loss:3.3600 train_time:5350259ms step_avg:4173.37ms
step:1293/1750 train_loss:3.3982 train_time:5354674ms step_avg:4173.56ms
step:1294/1750 train_loss:3.2993 train_time:5359118ms step_avg:4173.77ms
step:1295/1750 train_loss:3.5163 train_time:5363481ms step_avg:4173.92ms
step:1296/1750 train_loss:3.8270 train_time:5367858ms step_avg:4174.07ms
step:1297/1750 train_loss:3.5784 train_time:5372187ms step_avg:4174.19ms
step:1298/1750 train_loss:3.3940 train_time:5376551ms step_avg:4174.34ms
step:1299/1750 train_loss:3.5251 train_time:5380961ms step_avg:4174.52ms
step:1300/1750 train_loss:3.4207 train_time:5385410ms step_avg:4174.74ms
step:1301/1750 train_loss:3.4138 train_time:5389760ms step_avg:4174.87ms
step:1302/1750 train_loss:3.5058 train_time:5394142ms step_avg:4175.03ms
step:1303/1750 train_loss:3.3714 train_time:5398510ms step_avg:4175.18ms
step:1304/1750 train_loss:3.5391 train_time:5402894ms step_avg:4175.34ms
step:1305/1750 train_loss:3.2668 train_time:5407258ms step_avg:4175.49ms
step:1306/1750 train_loss:3.1601 train_time:5411651ms step_avg:4175.66ms
step:1307/1750 train_loss:3.5255 train_time:5416110ms step_avg:4175.88ms
step:1308/1750 train_loss:3.3540 train_time:5420499ms step_avg:4176.04ms
step:1309/1750 train_loss:3.7780 train_time:5424871ms step_avg:4176.19ms
step:1310/1750 train_loss:3.3919 train_time:5429293ms step_avg:4176.38ms
step:1311/1750 train_loss:3.4928 train_time:5433660ms step_avg:4176.53ms
step:1312/1750 train_loss:3.4931 train_time:5438078ms step_avg:4176.71ms
step:1313/1750 train_loss:3.7380 train_time:5442464ms step_avg:4176.87ms
step:1314/1750 train_loss:3.3747 train_time:5446842ms step_avg:4177.03ms
step:1315/1750 train_loss:3.3024 train_time:5451209ms step_avg:4177.17ms
step:1316/1750 train_loss:3.2810 train_time:5455601ms step_avg:4177.34ms
step:1317/1750 train_loss:3.2329 train_time:5459952ms step_avg:4177.47ms
step:1318/1750 train_loss:3.4000 train_time:5464343ms step_avg:4177.63ms
step:1319/1750 train_loss:3.4384 train_time:5468734ms step_avg:4177.80ms
step:1320/1750 train_loss:3.7455 train_time:5473116ms step_avg:4177.95ms
step:1321/1750 train_loss:3.4062 train_time:5477446ms step_avg:4178.07ms
step:1322/1750 train_loss:3.3583 train_time:5481875ms step_avg:4178.26ms
step:1323/1750 train_loss:3.4290 train_time:5486260ms step_avg:4178.42ms
step:1324/1750 train_loss:3.7275 train_time:5490644ms step_avg:4178.57ms
step:1325/1750 train_loss:3.3750 train_time:5495098ms step_avg:4178.78ms
step:1326/1750 train_loss:3.2385 train_time:5499449ms step_avg:4178.91ms
step:1327/1750 train_loss:3.3077 train_time:5503800ms step_avg:4179.04ms
step:1328/1750 train_loss:3.4422 train_time:5508169ms step_avg:4179.19ms
step:1329/1750 train_loss:3.3785 train_time:5512616ms step_avg:4179.39ms
step:1330/1750 train_loss:3.3627 train_time:5517036ms step_avg:4179.57ms
step:1331/1750 train_loss:3.3450 train_time:5521409ms step_avg:4179.72ms
step:1332/1750 train_loss:3.3727 train_time:5525848ms step_avg:4179.92ms
step:1333/1750 train_loss:3.8370 train_time:5530251ms step_avg:4180.08ms
step:1334/1750 train_loss:3.3007 train_time:5534704ms step_avg:4180.29ms
step:1335/1750 train_loss:3.8516 train_time:5539035ms step_avg:4180.40ms
step:1336/1750 train_loss:3.2813 train_time:5543461ms step_avg:4180.59ms
step:1337/1750 train_loss:3.3216 train_time:5547834ms step_avg:4180.73ms
step:1338/1750 train_loss:3.4410 train_time:5552200ms step_avg:4180.87ms
step:1339/1750 train_loss:3.2457 train_time:5556549ms step_avg:4181.00ms
step:1340/1750 train_loss:3.4092 train_time:5560962ms step_avg:4181.17ms
step:1341/1750 train_loss:3.3240 train_time:5565361ms step_avg:4181.34ms
step:1342/1750 train_loss:3.4591 train_time:5569730ms step_avg:4181.48ms
step:1343/1750 train_loss:3.5322 train_time:5574137ms step_avg:4181.65ms
step:1344/1750 train_loss:3.5345 train_time:5578536ms step_avg:4181.81ms
step:1345/1750 train_loss:3.3893 train_time:5582873ms step_avg:4181.93ms
step:1346/1750 train_loss:3.5583 train_time:5587277ms step_avg:4182.09ms
step:1347/1750 train_loss:3.6617 train_time:5591628ms step_avg:4182.22ms
step:1348/1750 train_loss:3.4381 train_time:5595973ms step_avg:4182.34ms
step:1349/1750 train_loss:3.2487 train_time:5600361ms step_avg:4182.50ms
step:1350/1750 train_loss:3.5366 train_time:5604747ms step_avg:4182.65ms
step:1351/1750 train_loss:3.3951 train_time:5609160ms step_avg:4182.82ms
step:1352/1750 train_loss:3.4906 train_time:5613550ms step_avg:4182.97ms
step:1353/1750 train_loss:3.7497 train_time:5617938ms step_avg:4183.13ms
step:1354/1750 train_loss:3.5238 train_time:5622275ms step_avg:4183.24ms
step:1355/1750 train_loss:3.4410 train_time:5626662ms step_avg:4183.39ms
step:1356/1750 train_loss:3.5365 train_time:5631062ms step_avg:4183.55ms
step:1357/1750 train_loss:3.2120 train_time:5635511ms step_avg:4183.75ms
step:1358/1750 train_loss:3.3976 train_time:5639890ms step_avg:4183.89ms
step:1359/1750 train_loss:3.3530 train_time:5644274ms step_avg:4184.04ms
step:1360/1750 train_loss:3.3657 train_time:5648602ms step_avg:4184.15ms
step:1361/1750 train_loss:3.3582 train_time:5653039ms step_avg:4184.34ms
step:1362/1750 train_loss:3.4729 train_time:5657449ms step_avg:4184.50ms
step:1363/1750 train_loss:3.4069 train_time:5661815ms step_avg:4184.64ms
step:1364/1750 train_loss:3.4629 train_time:5666198ms step_avg:4184.78ms
step:1365/1750 train_loss:3.5063 train_time:5670641ms step_avg:4184.98ms
step:1366/1750 train_loss:3.2022 train_time:5675004ms step_avg:4185.11ms
step:1367/1750 train_loss:3.6016 train_time:5679452ms step_avg:4185.30ms
step:1368/1750 train_loss:3.3156 train_time:5683854ms step_avg:4185.46ms
step:1369/1750 train_loss:3.4085 train_time:5688151ms step_avg:4185.54ms
step:1370/1750 train_loss:3.3144 train_time:5692533ms step_avg:4185.69ms
step:1371/1750 train_loss:3.4274 train_time:5696926ms step_avg:4185.84ms
step:1372/1750 train_loss:3.4629 train_time:5701318ms step_avg:4185.99ms
step:1373/1750 train_loss:3.1496 train_time:5705834ms step_avg:4186.23ms
step:1374/1750 train_loss:3.3967 train_time:5710201ms step_avg:4186.36ms
step:1375/1750 train_loss:3.4542 train_time:5714568ms step_avg:4186.50ms
step:1375/1750 val_loss:3.3781 train_time:5714569ms step_avg:4186.50ms
step:1376/1750 train_loss:3.4657 train_time:5718999ms step_avg:4186.68ms
step:1377/1750 train_loss:3.4022 train_time:5723354ms step_avg:4186.80ms
step:1378/1750 train_loss:3.8260 train_time:5727821ms step_avg:4187.00ms
step:1379/1750 train_loss:3.4054 train_time:5732160ms step_avg:4187.11ms
step:1380/1750 train_loss:3.4674 train_time:5736517ms step_avg:4187.24ms
step:1381/1750 train_loss:3.3821 train_time:5740935ms step_avg:4187.41ms
step:1382/1750 train_loss:3.4563 train_time:5745313ms step_avg:4187.55ms
step:1383/1750 train_loss:3.3398 train_time:5749713ms step_avg:4187.70ms
step:1384/1750 train_loss:3.4099 train_time:5754038ms step_avg:4187.80ms
step:1385/1750 train_loss:3.9206 train_time:5758541ms step_avg:4188.03ms
step:1386/1750 train_loss:3.3941 train_time:5762941ms step_avg:4188.18ms
step:1387/1750 train_loss:3.3461 train_time:5767296ms step_avg:4188.31ms
step:1388/1750 train_loss:3.2992 train_time:5771776ms step_avg:4188.52ms
step:1389/1750 train_loss:3.6117 train_time:5776122ms step_avg:4188.63ms
step:1390/1750 train_loss:3.4679 train_time:5780483ms step_avg:4188.76ms
step:1391/1750 train_loss:3.4334 train_time:5784865ms step_avg:4188.90ms
step:1392/1750 train_loss:3.3329 train_time:5789198ms step_avg:4189.00ms
step:1393/1750 train_loss:3.5835 train_time:5793535ms step_avg:4189.11ms
step:1394/1750 train_loss:3.3772 train_time:5797890ms step_avg:4189.23ms
step:1395/1750 train_loss:3.6862 train_time:5802255ms step_avg:4189.35ms
step:1396/1750 train_loss:3.2994 train_time:5806637ms step_avg:4189.49ms
step:1397/1750 train_loss:3.6200 train_time:5811008ms step_avg:4189.62ms
step:1398/1750 train_loss:3.5276 train_time:5815389ms step_avg:4189.76ms
step:1399/1750 train_loss:3.4260 train_time:5819736ms step_avg:4189.87ms
step:1400/1750 train_loss:3.3483 train_time:5824100ms step_avg:4190.00ms
step:1401/1750 train_loss:3.2644 train_time:5828425ms step_avg:4190.10ms
step:1402/1750 train_loss:3.3986 train_time:5832791ms step_avg:4190.22ms
step:1403/1750 train_loss:3.4022 train_time:5837125ms step_avg:4190.33ms
step:1404/1750 train_loss:3.3475 train_time:5841567ms step_avg:4190.51ms
step:1405/1750 train_loss:3.3247 train_time:5845877ms step_avg:4190.59ms
step:1406/1750 train_loss:3.5816 train_time:5850264ms step_avg:4190.73ms
step:1407/1750 train_loss:3.3582 train_time:5854685ms step_avg:4190.90ms
step:1408/1750 train_loss:3.3566 train_time:5859028ms step_avg:4191.01ms
step:1409/1750 train_loss:3.0474 train_time:5863419ms step_avg:4191.15ms
step:1410/1750 train_loss:3.4349 train_time:5867793ms step_avg:4191.28ms
step:1411/1750 train_loss:3.5263 train_time:5872139ms step_avg:4191.39ms
step:1412/1750 train_loss:3.4393 train_time:5876445ms step_avg:4191.47ms
step:1413/1750 train_loss:3.3453 train_time:5880860ms step_avg:4191.63ms
step:1414/1750 train_loss:3.2381 train_time:5885248ms step_avg:4191.77ms
step:1415/1750 train_loss:3.3479 train_time:5889589ms step_avg:4191.88ms
step:1416/1750 train_loss:3.3141 train_time:5893936ms step_avg:4191.99ms
step:1417/1750 train_loss:3.3409 train_time:5898333ms step_avg:4192.13ms
step:1418/1750 train_loss:3.3775 train_time:5902687ms step_avg:4192.25ms
step:1419/1750 train_loss:3.2939 train_time:5907065ms step_avg:4192.38ms
step:1420/1750 train_loss:3.3426 train_time:5911528ms step_avg:4192.57ms
step:1421/1750 train_loss:3.2264 train_time:5915899ms step_avg:4192.70ms
step:1422/1750 train_loss:3.5667 train_time:5920238ms step_avg:4192.80ms
step:1423/1750 train_loss:3.3172 train_time:5924601ms step_avg:4192.92ms
step:1424/1750 train_loss:3.2662 train_time:5929015ms step_avg:4193.08ms
step:1425/1750 train_loss:3.3272 train_time:5933415ms step_avg:4193.23ms
step:1426/1750 train_loss:3.3589 train_time:5937823ms step_avg:4193.38ms
step:1427/1750 train_loss:3.3936 train_time:5942165ms step_avg:4193.48ms
step:1428/1750 train_loss:3.4530 train_time:5946612ms step_avg:4193.66ms
step:1429/1750 train_loss:3.2769 train_time:5951148ms step_avg:4193.90ms
step:1430/1750 train_loss:3.2385 train_time:5955512ms step_avg:4194.02ms
step:1431/1750 train_loss:3.2689 train_time:5959891ms step_avg:4194.15ms
step:1432/1750 train_loss:3.3981 train_time:5964292ms step_avg:4194.30ms
step:1433/1750 train_loss:3.4366 train_time:5968631ms step_avg:4194.40ms
step:1434/1750 train_loss:3.1178 train_time:5973032ms step_avg:4194.55ms
step:1435/1750 train_loss:3.2495 train_time:5977456ms step_avg:4194.71ms
step:1436/1750 train_loss:3.0629 train_time:5981897ms step_avg:4194.88ms
step:1437/1750 train_loss:3.3843 train_time:5986327ms step_avg:4195.04ms
step:1438/1750 train_loss:3.3258 train_time:5990831ms step_avg:4195.26ms
step:1439/1750 train_loss:3.3476 train_time:5995179ms step_avg:4195.37ms
step:1440/1750 train_loss:3.1304 train_time:5999564ms step_avg:4195.50ms
step:1441/1750 train_loss:3.2882 train_time:6003962ms step_avg:4195.64ms
step:1442/1750 train_loss:3.2720 train_time:6008351ms step_avg:4195.78ms
step:1443/1750 train_loss:3.2825 train_time:6012762ms step_avg:4195.93ms
step:1444/1750 train_loss:3.0019 train_time:6017155ms step_avg:4196.06ms
step:1445/1750 train_loss:3.4340 train_time:6021603ms step_avg:4196.24ms
step:1446/1750 train_loss:2.9779 train_time:6026047ms step_avg:4196.41ms
step:1447/1750 train_loss:3.3200 train_time:6030544ms step_avg:4196.62ms
step:1448/1750 train_loss:3.3845 train_time:6034937ms step_avg:4196.76ms
step:1449/1750 train_loss:3.4478 train_time:6039329ms step_avg:4196.89ms
step:1450/1750 train_loss:3.8626 train_time:6043795ms step_avg:4197.08ms
step:1451/1750 train_loss:3.4306 train_time:6048213ms step_avg:4197.23ms
step:1452/1750 train_loss:3.2466 train_time:6052611ms step_avg:4197.37ms
step:1453/1750 train_loss:3.3793 train_time:6057022ms step_avg:4197.52ms
step:1454/1750 train_loss:3.5871 train_time:6061451ms step_avg:4197.68ms
step:1455/1750 train_loss:3.4564 train_time:6065852ms step_avg:4197.82ms
step:1456/1750 train_loss:3.1068 train_time:6070252ms step_avg:4197.96ms
step:1457/1750 train_loss:3.3360 train_time:6074638ms step_avg:4198.09ms
step:1458/1750 train_loss:3.3419 train_time:6079002ms step_avg:4198.21ms
step:1459/1750 train_loss:3.6614 train_time:6083519ms step_avg:4198.43ms
step:1460/1750 train_loss:3.4146 train_time:6087889ms step_avg:4198.54ms
step:1461/1750 train_loss:3.4084 train_time:6092277ms step_avg:4198.67ms
step:1462/1750 train_loss:3.3704 train_time:6096674ms step_avg:4198.81ms
step:1463/1750 train_loss:3.8692 train_time:6101147ms step_avg:4199.00ms
step:1464/1750 train_loss:3.0745 train_time:6105558ms step_avg:4199.15ms
step:1465/1750 train_loss:3.3958 train_time:6109946ms step_avg:4199.28ms
step:1466/1750 train_loss:3.3647 train_time:6114332ms step_avg:4199.40ms
step:1467/1750 train_loss:3.1940 train_time:6118753ms step_avg:4199.56ms
step:1468/1750 train_loss:3.4633 train_time:6123194ms step_avg:4199.72ms
step:1469/1750 train_loss:3.0142 train_time:6127544ms step_avg:4199.82ms
step:1470/1750 train_loss:3.7070 train_time:6131927ms step_avg:4199.95ms
step:1471/1750 train_loss:3.2341 train_time:6136337ms step_avg:4200.09ms
step:1472/1750 train_loss:3.4421 train_time:6140724ms step_avg:4200.22ms
step:1473/1750 train_loss:3.5072 train_time:6145114ms step_avg:4200.35ms
step:1474/1750 train_loss:2.9251 train_time:6149514ms step_avg:4200.49ms
step:1475/1750 train_loss:3.2148 train_time:6153934ms step_avg:4200.64ms
step:1476/1750 train_loss:3.3462 train_time:6158417ms step_avg:4200.83ms
step:1477/1750 train_loss:3.4189 train_time:6162837ms step_avg:4200.98ms
step:1478/1750 train_loss:3.3971 train_time:6167281ms step_avg:4201.15ms
step:1479/1750 train_loss:3.1583 train_time:6171709ms step_avg:4201.30ms
step:1480/1750 train_loss:3.4164 train_time:6176118ms step_avg:4201.44ms
step:1481/1750 train_loss:3.2241 train_time:6180521ms step_avg:4201.58ms
step:1482/1750 train_loss:3.2807 train_time:6184945ms step_avg:4201.73ms
step:1483/1750 train_loss:3.4833 train_time:6189340ms step_avg:4201.86ms
step:1484/1750 train_loss:3.4016 train_time:6193751ms step_avg:4202.00ms
step:1485/1750 train_loss:3.2556 train_time:6198188ms step_avg:4202.16ms
step:1486/1750 train_loss:3.4247 train_time:6202646ms step_avg:4202.33ms
step:1487/1750 train_loss:3.3547 train_time:6207157ms step_avg:4202.54ms
step:1488/1750 train_loss:3.4092 train_time:6211492ms step_avg:4202.63ms
step:1489/1750 train_loss:3.2151 train_time:6215926ms step_avg:4202.79ms
step:1490/1750 train_loss:3.3404 train_time:6220357ms step_avg:4202.94ms
step:1491/1750 train_loss:3.3528 train_time:6224729ms step_avg:4203.06ms
step:1492/1750 train_loss:3.2578 train_time:6229214ms step_avg:4203.25ms
step:1493/1750 train_loss:3.3367 train_time:6233627ms step_avg:4203.39ms
step:1494/1750 train_loss:3.2588 train_time:6238039ms step_avg:4203.53ms
step:1495/1750 train_loss:3.2117 train_time:6242473ms step_avg:4203.69ms
step:1496/1750 train_loss:3.2795 train_time:6246901ms step_avg:4203.84ms
step:1497/1750 train_loss:3.2837 train_time:6251270ms step_avg:4203.95ms
step:1498/1750 train_loss:3.2426 train_time:6255680ms step_avg:4204.09ms
step:1499/1750 train_loss:3.2890 train_time:6260097ms step_avg:4204.23ms
step:1500/1750 train_loss:3.3706 train_time:6264463ms step_avg:4204.34ms
step:1500/1750 val_loss:3.3415 train_time:6264464ms step_avg:4204.34ms
step:1501/1750 train_loss:3.3777 train_time:6268841ms step_avg:4204.45ms
step:1502/1750 train_loss:3.2943 train_time:6273238ms step_avg:4204.58ms
step:1503/1750 train_loss:3.0155 train_time:6277665ms step_avg:4204.73ms
step:1504/1750 train_loss:3.5393 train_time:6282111ms step_avg:4204.89ms
step:1505/1750 train_loss:3.0058 train_time:6286612ms step_avg:4205.09ms
step:1506/1750 train_loss:3.2438 train_time:6291124ms step_avg:4205.30ms
step:1507/1750 train_loss:3.3621 train_time:6295540ms step_avg:4205.44ms
step:1508/1750 train_loss:3.2891 train_time:6299989ms step_avg:4205.60ms
step:1509/1750 train_loss:3.2055 train_time:6304383ms step_avg:4205.73ms
step:1510/1750 train_loss:3.5355 train_time:6308823ms step_avg:4205.88ms
step:1511/1750 train_loss:3.2551 train_time:6313340ms step_avg:4206.09ms
step:1512/1750 train_loss:3.3387 train_time:6317834ms step_avg:4206.28ms
step:1513/1750 train_loss:3.2498 train_time:6322215ms step_avg:4206.40ms
step:1514/1750 train_loss:3.4843 train_time:6326599ms step_avg:4206.52ms
step:1515/1750 train_loss:3.4151 train_time:6330969ms step_avg:4206.62ms
step:1516/1750 train_loss:3.2889 train_time:6335358ms step_avg:4206.75ms
step:1517/1750 train_loss:3.3871 train_time:6339761ms step_avg:4206.87ms
step:1518/1750 train_loss:3.4283 train_time:6344175ms step_avg:4207.01ms
step:1519/1750 train_loss:3.2305 train_time:6348544ms step_avg:4207.12ms
step:1520/1750 train_loss:3.4237 train_time:6352994ms step_avg:4207.28ms
step:1521/1750 train_loss:3.4219 train_time:6357364ms step_avg:4207.39ms
step:1522/1750 train_loss:3.2901 train_time:6361832ms step_avg:4207.56ms
step:1523/1750 train_loss:3.0329 train_time:6366213ms step_avg:4207.68ms
step:1524/1750 train_loss:3.0162 train_time:6370684ms step_avg:4207.85ms
step:1525/1750 train_loss:3.2516 train_time:6375158ms step_avg:4208.02ms
step:1526/1750 train_loss:3.6394 train_time:6379530ms step_avg:4208.13ms
step:1527/1750 train_loss:3.2347 train_time:6383961ms step_avg:4208.28ms
step:1528/1750 train_loss:3.2467 train_time:6388384ms step_avg:4208.42ms
step:1529/1750 train_loss:3.2691 train_time:6392760ms step_avg:4208.53ms
step:1530/1750 train_loss:3.0597 train_time:6397235ms step_avg:4208.71ms
step:1531/1750 train_loss:3.4002 train_time:6401623ms step_avg:4208.83ms
step:1532/1750 train_loss:3.3351 train_time:6406070ms step_avg:4208.98ms
step:1533/1750 train_loss:3.3742 train_time:6410499ms step_avg:4209.13ms
step:1534/1750 train_loss:3.2418 train_time:6414837ms step_avg:4209.21ms
step:1535/1750 train_loss:3.1869 train_time:6419211ms step_avg:4209.32ms
step:1536/1750 train_loss:3.1863 train_time:6423613ms step_avg:4209.44ms
step:1537/1750 train_loss:3.2947 train_time:6428024ms step_avg:4209.58ms
step:1538/1750 train_loss:3.3506 train_time:6432502ms step_avg:4209.75ms
step:1539/1750 train_loss:3.2709 train_time:6436967ms step_avg:4209.92ms
step:1540/1750 train_loss:3.2615 train_time:6441408ms step_avg:4210.07ms
step:1541/1750 train_loss:3.4348 train_time:6445848ms step_avg:4210.22ms
step:1542/1750 train_loss:3.3796 train_time:6450218ms step_avg:4210.33ms
step:1543/1750 train_loss:3.1241 train_time:6454664ms step_avg:4210.48ms
step:1544/1750 train_loss:3.4141 train_time:6459152ms step_avg:4210.66ms
step:1545/1750 train_loss:3.1978 train_time:6463574ms step_avg:4210.80ms
step:1546/1750 train_loss:2.8102 train_time:6467967ms step_avg:4210.92ms
step:1547/1750 train_loss:3.5868 train_time:6472450ms step_avg:4211.09ms
step:1548/1750 train_loss:3.2229 train_time:6476885ms step_avg:4211.24ms
step:1549/1750 train_loss:3.4237 train_time:6481240ms step_avg:4211.33ms
step:1550/1750 train_loss:3.0496 train_time:6485701ms step_avg:4211.49ms
step:1551/1750 train_loss:3.3863 train_time:6490137ms step_avg:4211.64ms
step:1552/1750 train_loss:3.2506 train_time:6494541ms step_avg:4211.76ms
step:1553/1750 train_loss:3.3800 train_time:6499010ms step_avg:4211.93ms
step:1554/1750 train_loss:3.3687 train_time:6503430ms step_avg:4212.07ms
step:1555/1750 train_loss:3.4047 train_time:6507875ms step_avg:4212.22ms
step:1556/1750 train_loss:3.5160 train_time:6512253ms step_avg:4212.32ms
step:1557/1750 train_loss:3.3800 train_time:6516746ms step_avg:4212.51ms
step:1558/1750 train_loss:3.3528 train_time:6521150ms step_avg:4212.63ms
step:1559/1750 train_loss:3.1827 train_time:6525567ms step_avg:4212.76ms
step:1560/1750 train_loss:3.3374 train_time:6530059ms step_avg:4212.94ms
step:1561/1750 train_loss:3.4135 train_time:6534504ms step_avg:4213.09ms
step:1562/1750 train_loss:3.2998 train_time:6538890ms step_avg:4213.20ms
step:1563/1750 train_loss:3.2029 train_time:6543294ms step_avg:4213.32ms
step:1564/1750 train_loss:3.3104 train_time:6547778ms step_avg:4213.50ms
step:1565/1750 train_loss:3.2784 train_time:6552262ms step_avg:4213.67ms
step:1566/1750 train_loss:3.2570 train_time:6556667ms step_avg:4213.80ms
step:1567/1750 train_loss:3.2159 train_time:6561060ms step_avg:4213.91ms
step:1568/1750 train_loss:3.2324 train_time:6565456ms step_avg:4214.03ms
step:1569/1750 train_loss:3.1357 train_time:6569850ms step_avg:4214.14ms
step:1570/1750 train_loss:3.5148 train_time:6574336ms step_avg:4214.32ms
step:1571/1750 train_loss:3.5977 train_time:6578751ms step_avg:4214.45ms
step:1572/1750 train_loss:3.3829 train_time:6583135ms step_avg:4214.56ms
step:1573/1750 train_loss:3.2910 train_time:6587537ms step_avg:4214.67ms
step:1574/1750 train_loss:3.4217 train_time:6591971ms step_avg:4214.82ms
step:1575/1750 train_loss:3.2168 train_time:6596360ms step_avg:4214.93ms
step:1576/1750 train_loss:3.3518 train_time:6600794ms step_avg:4215.07ms
step:1577/1750 train_loss:3.6452 train_time:6605256ms step_avg:4215.22ms
step:1578/1750 train_loss:3.1853 train_time:6609633ms step_avg:4215.33ms
step:1579/1750 train_loss:3.2705 train_time:6614020ms step_avg:4215.44ms
step:1580/1750 train_loss:3.4856 train_time:6618502ms step_avg:4215.61ms
step:1581/1750 train_loss:3.2956 train_time:6622892ms step_avg:4215.72ms
step:1582/1750 train_loss:3.2815 train_time:6627414ms step_avg:4215.91ms
step:1583/1750 train_loss:3.4712 train_time:6631914ms step_avg:4216.09ms
step:1584/1750 train_loss:3.3916 train_time:6636376ms step_avg:4216.25ms
step:1585/1750 train_loss:3.2794 train_time:6640751ms step_avg:4216.35ms
step:1586/1750 train_loss:3.4021 train_time:6645254ms step_avg:4216.53ms
step:1587/1750 train_loss:3.1883 train_time:6649668ms step_avg:4216.66ms
step:1588/1750 train_loss:3.2091 train_time:6654083ms step_avg:4216.78ms
step:1589/1750 train_loss:3.2691 train_time:6658583ms step_avg:4216.96ms
step:1590/1750 train_loss:3.1544 train_time:6663001ms step_avg:4217.09ms
step:1591/1750 train_loss:3.2149 train_time:6667361ms step_avg:4217.18ms
step:1592/1750 train_loss:3.4248 train_time:6671803ms step_avg:4217.32ms
step:1593/1750 train_loss:3.3112 train_time:6676253ms step_avg:4217.47ms
step:1594/1750 train_loss:3.0795 train_time:6680676ms step_avg:4217.60ms
step:1595/1750 train_loss:3.4347 train_time:6685088ms step_avg:4217.72ms
step:1596/1750 train_loss:3.3082 train_time:6689573ms step_avg:4217.89ms
step:1597/1750 train_loss:3.2991 train_time:6693988ms step_avg:4218.01ms
step:1598/1750 train_loss:3.4639 train_time:6698487ms step_avg:4218.19ms
step:1599/1750 train_loss:3.3951 train_time:6702906ms step_avg:4218.32ms
step:1600/1750 train_loss:3.2673 train_time:6707378ms step_avg:4218.48ms
step:1601/1750 train_loss:2.8543 train_time:6711933ms step_avg:4218.69ms
step:1602/1750 train_loss:3.3123 train_time:6716354ms step_avg:4218.82ms
step:1603/1750 train_loss:3.4072 train_time:6720813ms step_avg:4218.97ms
step:1604/1750 train_loss:3.2275 train_time:6725208ms step_avg:4219.08ms
step:1605/1750 train_loss:3.2199 train_time:6729682ms step_avg:4219.24ms
step:1606/1750 train_loss:3.3077 train_time:6734072ms step_avg:4219.34ms
step:1607/1750 train_loss:3.3098 train_time:6738535ms step_avg:4219.50ms
step:1608/1750 train_loss:3.4136 train_time:6742965ms step_avg:4219.63ms
step:1609/1750 train_loss:3.3460 train_time:6747428ms step_avg:4219.78ms
step:1610/1750 train_loss:3.5783 train_time:6751842ms step_avg:4219.90ms
step:1611/1750 train_loss:3.4145 train_time:6756377ms step_avg:4220.10ms
step:1612/1750 train_loss:3.2883 train_time:6760759ms step_avg:4220.20ms
step:1613/1750 train_loss:3.4096 train_time:6765271ms step_avg:4220.38ms
step:1614/1750 train_loss:3.2181 train_time:6769684ms step_avg:4220.50ms
step:1615/1750 train_loss:3.3040 train_time:6774218ms step_avg:4220.70ms
step:1616/1750 train_loss:3.0903 train_time:6778821ms step_avg:4220.93ms
step:1617/1750 train_loss:3.0473 train_time:6783346ms step_avg:4221.12ms
step:1618/1750 train_loss:3.1563 train_time:6787810ms step_avg:4221.27ms
step:1619/1750 train_loss:3.3295 train_time:6792281ms step_avg:4221.43ms
step:1620/1750 train_loss:3.6660 train_time:6796838ms step_avg:4221.64ms
step:1621/1750 train_loss:3.3324 train_time:6801319ms step_avg:4221.80ms
step:1622/1750 train_loss:3.3233 train_time:6805708ms step_avg:4221.90ms
step:1623/1750 train_loss:3.2921 train_time:6810129ms step_avg:4222.03ms
step:1624/1750 train_loss:3.4656 train_time:6814596ms step_avg:4222.18ms
step:1625/1750 train_loss:3.1892 train_time:6819008ms step_avg:4222.30ms
step:1625/1750 val_loss:3.3040 train_time:6819009ms step_avg:4222.30ms
step:1626/1750 train_loss:3.4118 train_time:6823529ms step_avg:4222.48ms
step:1627/1750 train_loss:3.4212 train_time:6827940ms step_avg:4222.60ms
step:1628/1750 train_loss:3.4642 train_time:6832380ms step_avg:4222.73ms
step:1629/1750 train_loss:3.3101 train_time:6836806ms step_avg:4222.86ms
step:1630/1750 train_loss:3.4607 train_time:6841281ms step_avg:4223.01ms
step:1631/1750 train_loss:3.7204 train_time:6845680ms step_avg:4223.12ms
step:1632/1750 train_loss:3.3239 train_time:6850022ms step_avg:4223.20ms
step:1633/1750 train_loss:3.3096 train_time:6854452ms step_avg:4223.32ms
step:1634/1750 train_loss:3.3471 train_time:6858855ms step_avg:4223.43ms
step:1635/1750 train_loss:3.1355 train_time:6863327ms step_avg:4223.59ms
step:1636/1750 train_loss:3.3154 train_time:6867793ms step_avg:4223.73ms
step:1637/1750 train_loss:3.4596 train_time:6872264ms step_avg:4223.89ms
step:1638/1750 train_loss:3.3141 train_time:6876673ms step_avg:4224.00ms
step:1639/1750 train_loss:3.3869 train_time:6881127ms step_avg:4224.14ms
step:1640/1750 train_loss:3.1136 train_time:6885521ms step_avg:4224.25ms
step:1641/1750 train_loss:3.3856 train_time:6889962ms step_avg:4224.38ms
step:1642/1750 train_loss:3.7954 train_time:6894461ms step_avg:4224.55ms
step:1643/1750 train_loss:3.3050 train_time:6898864ms step_avg:4224.66ms
step:1644/1750 train_loss:3.4399 train_time:6903487ms step_avg:4224.90ms
step:1645/1750 train_loss:3.2826 train_time:6907909ms step_avg:4225.02ms
step:1646/1750 train_loss:3.4374 train_time:6912337ms step_avg:4225.14ms
step:1647/1750 train_loss:3.7804 train_time:6916788ms step_avg:4225.28ms
step:1648/1750 train_loss:3.4288 train_time:6921292ms step_avg:4225.45ms
step:1649/1750 train_loss:3.2930 train_time:6925670ms step_avg:4225.55ms
step:1650/1750 train_loss:3.2057 train_time:6930016ms step_avg:4225.62ms
step:1651/1750 train_loss:3.4784 train_time:6934450ms step_avg:4225.75ms
step:1652/1750 train_loss:3.4356 train_time:6938892ms step_avg:4225.88ms
step:1653/1750 train_loss:3.5223 train_time:6943405ms step_avg:4226.05ms
step:1654/1750 train_loss:3.3924 train_time:6947782ms step_avg:4226.15ms
step:1655/1750 train_loss:3.3176 train_time:6952210ms step_avg:4226.27ms
step:1656/1750 train_loss:3.2771 train_time:6956664ms step_avg:4226.41ms
step:1657/1750 train_loss:3.6235 train_time:6961131ms step_avg:4226.55ms
step:1658/1750 train_loss:3.4812 train_time:6965575ms step_avg:4226.68ms
step:1659/1750 train_loss:3.3351 train_time:6970018ms step_avg:4226.81ms
step:1660/1750 train_loss:3.4898 train_time:6974453ms step_avg:4226.94ms
step:1661/1750 train_loss:2.8738 train_time:6978952ms step_avg:4227.11ms
step:1662/1750 train_loss:3.1500 train_time:6983427ms step_avg:4227.26ms
step:1663/1750 train_loss:3.2340 train_time:6987772ms step_avg:4227.33ms
step:1664/1750 train_loss:3.2921 train_time:6992219ms step_avg:4227.46ms
step:1665/1750 train_loss:3.2719 train_time:6996621ms step_avg:4227.57ms
step:1666/1750 train_loss:3.3770 train_time:7001084ms step_avg:4227.71ms
step:1667/1750 train_loss:3.1843 train_time:7005548ms step_avg:4227.85ms
step:1668/1750 train_loss:3.5992 train_time:7010054ms step_avg:4228.02ms
step:1669/1750 train_loss:3.2746 train_time:7014455ms step_avg:4228.12ms
step:1670/1750 train_loss:3.1726 train_time:7018892ms step_avg:4228.25ms
step:1671/1750 train_loss:3.0424 train_time:7023299ms step_avg:4228.36ms
step:1672/1750 train_loss:3.1752 train_time:7027832ms step_avg:4228.54ms
step:1673/1750 train_loss:3.2774 train_time:7032288ms step_avg:4228.68ms
step:1674/1750 train_loss:3.3515 train_time:7036767ms step_avg:4228.83ms
step:1675/1750 train_loss:2.9960 train_time:7041147ms step_avg:4228.92ms
step:1676/1750 train_loss:3.3346 train_time:7045545ms step_avg:4229.02ms
step:1677/1750 train_loss:3.2537 train_time:7050110ms step_avg:4229.22ms
step:1678/1750 train_loss:3.3955 train_time:7054566ms step_avg:4229.36ms
step:1679/1750 train_loss:3.2193 train_time:7058992ms step_avg:4229.47ms
step:1680/1750 train_loss:3.3933 train_time:7063439ms step_avg:4229.60ms
step:1681/1750 train_loss:3.3387 train_time:7067907ms step_avg:4229.75ms
step:1682/1750 train_loss:3.4920 train_time:7072361ms step_avg:4229.88ms
step:1683/1750 train_loss:3.2330 train_time:7076820ms step_avg:4230.02ms
step:1684/1750 train_loss:3.2027 train_time:7081211ms step_avg:4230.11ms
step:1685/1750 train_loss:3.1887 train_time:7085663ms step_avg:4230.25ms
step:1686/1750 train_loss:3.3565 train_time:7090062ms step_avg:4230.35ms
step:1687/1750 train_loss:3.2740 train_time:7094536ms step_avg:4230.49ms
step:1688/1750 train_loss:3.3381 train_time:7099064ms step_avg:4230.67ms
step:1689/1750 train_loss:3.2637 train_time:7103543ms step_avg:4230.82ms
step:1690/1750 train_loss:3.1953 train_time:7108017ms step_avg:4230.96ms
step:1691/1750 train_loss:3.2454 train_time:7112543ms step_avg:4231.14ms
step:1692/1750 train_loss:3.3571 train_time:7116975ms step_avg:4231.26ms
step:1693/1750 train_loss:3.2897 train_time:7121585ms step_avg:4231.48ms
step:1694/1750 train_loss:3.1644 train_time:7126144ms step_avg:4231.68ms
step:1695/1750 train_loss:3.3930 train_time:7130607ms step_avg:4231.81ms
step:1696/1750 train_loss:3.2645 train_time:7135029ms step_avg:4231.93ms
step:1697/1750 train_loss:3.3692 train_time:7139407ms step_avg:4232.01ms
step:1698/1750 train_loss:3.2924 train_time:7143878ms step_avg:4232.16ms
step:1699/1750 train_loss:3.2790 train_time:7148386ms step_avg:4232.32ms
step:1700/1750 train_loss:3.3031 train_time:7152909ms step_avg:4232.49ms
step:1701/1750 train_loss:3.5211 train_time:7157392ms step_avg:4232.64ms
step:1702/1750 train_loss:3.4862 train_time:7161852ms step_avg:4232.77ms
step:1703/1750 train_loss:3.0719 train_time:7166270ms step_avg:4232.88ms
step:1704/1750 train_loss:3.4396 train_time:7170644ms step_avg:4232.97ms
step:1705/1750 train_loss:3.3961 train_time:7175136ms step_avg:4233.12ms
step:1706/1750 train_loss:3.2292 train_time:7179532ms step_avg:4233.21ms
step:1707/1750 train_loss:3.2685 train_time:7183968ms step_avg:4233.33ms
step:1708/1750 train_loss:3.2335 train_time:7188429ms step_avg:4233.47ms
step:1709/1750 train_loss:3.3244 train_time:7192798ms step_avg:4233.55ms
step:1710/1750 train_loss:3.1753 train_time:7197240ms step_avg:4233.67ms
step:1711/1750 train_loss:3.3357 train_time:7201655ms step_avg:4233.78ms
step:1712/1750 train_loss:3.4057 train_time:7206089ms step_avg:4233.89ms
step:1713/1750 train_loss:3.0926 train_time:7210564ms step_avg:4234.04ms
step:1714/1750 train_loss:3.2896 train_time:7214977ms step_avg:4234.14ms
step:1715/1750 train_loss:3.2917 train_time:7219540ms step_avg:4234.33ms
step:1716/1750 train_loss:3.2915 train_time:7224012ms step_avg:4234.47ms
step:1717/1750 train_loss:3.3006 train_time:7228483ms step_avg:4234.61ms
step:1718/1750 train_loss:3.3316 train_time:7232894ms step_avg:4234.72ms
step:1719/1750 train_loss:3.2315 train_time:7237322ms step_avg:4234.83ms
step:1720/1750 train_loss:3.4787 train_time:7241827ms step_avg:4234.99ms
step:1721/1750 train_loss:3.0866 train_time:7246296ms step_avg:4235.12ms
step:1722/1750 train_loss:3.3530 train_time:7250688ms step_avg:4235.22ms
step:1723/1750 train_loss:3.2726 train_time:7255107ms step_avg:4235.32ms
step:1724/1750 train_loss:3.2394 train_time:7259575ms step_avg:4235.46ms
step:1725/1750 train_loss:3.1955 train_time:7264075ms step_avg:4235.61ms
step:1726/1750 train_loss:3.1880 train_time:7268647ms step_avg:4235.81ms
step:1727/1750 train_loss:3.3849 train_time:7273038ms step_avg:4235.90ms
step:1728/1750 train_loss:3.3716 train_time:7277498ms step_avg:4236.03ms
step:1729/1750 train_loss:3.2809 train_time:7282000ms step_avg:4236.18ms
step:1730/1750 train_loss:3.3887 train_time:7286470ms step_avg:4236.32ms
step:1731/1750 train_loss:3.2128 train_time:7290862ms step_avg:4236.41ms
step:1732/1750 train_loss:3.1450 train_time:7295345ms step_avg:4236.55ms
step:1733/1750 train_loss:3.1667 train_time:7299793ms step_avg:4236.68ms
step:1734/1750 train_loss:3.3826 train_time:7304278ms step_avg:4236.82ms
step:1735/1750 train_loss:3.1558 train_time:7308710ms step_avg:4236.93ms
step:1736/1750 train_loss:3.5720 train_time:7313227ms step_avg:4237.10ms
step:1737/1750 train_loss:3.1654 train_time:7317660ms step_avg:4237.21ms
step:1738/1750 train_loss:3.2717 train_time:7322060ms step_avg:4237.30ms
step:1739/1750 train_loss:3.3232 train_time:7326494ms step_avg:4237.42ms
step:1740/1750 train_loss:4.5101 train_time:7330984ms step_avg:4237.56ms
step:1741/1750 train_loss:3.1658 train_time:7335522ms step_avg:4237.74ms
step:1742/1750 train_loss:3.2740 train_time:7339930ms step_avg:4237.84ms
step:1743/1750 train_loss:3.2631 train_time:7344440ms step_avg:4237.99ms
step:1744/1750 train_loss:3.0602 train_time:7348904ms step_avg:4238.12ms
step:1745/1750 train_loss:3.3559 train_time:7353349ms step_avg:4238.24ms
step:1746/1750 train_loss:3.1515 train_time:7357825ms step_avg:4238.38ms
step:1747/1750 train_loss:3.2814 train_time:7362241ms step_avg:4238.48ms
step:1748/1750 train_loss:3.3111 train_time:7366794ms step_avg:4238.66ms
step:1749/1750 train_loss:3.4520 train_time:7371287ms step_avg:4238.81ms
step:1750/1750 train_loss:3.1679 train_time:7375715ms step_avg:4238.92ms
step:1750/1750 val_loss:3.2817 train_time:7375716ms step_avg:4238.92ms
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment