Skip to content

Instantly share code, notes, and snippets.

@lapp0
Created December 24, 2024 02:45
Show Gist options
  • Save lapp0/5fa56d60f1a6a2c256b61ce01b0834c2 to your computer and use it in GitHub Desktop.
Save lapp0/5fa56d60f1a6a2c256b61ce01b0834c2 to your computer and use it in GitHub Desktop.
import os
import sys
# Snapshot this script's own source as early as possible so the exact code of
# the run can later be written at the top of the log file.
with open(sys.argv[0]) as source_file:
    code = source_file.read()
import uuid
import time
import contextlib
from dataclasses import dataclass
import math
from pathlib import Path
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributed as dist
import torch._inductor.config as config
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.attention.flex_attention import flex_attention, create_block_mask
# -----------------------------------------------------------------------------
# Muon optimizer
@torch.compile
def zeropower_via_newtonschulz5(G, steps):
    """
    Approximately orthogonalize the 2D matrix G with a quintic Newton-Schulz iteration.

    The coefficients (a, b, c) are chosen to maximize the slope of the quintic at zero,
    trading exact convergence for fewer steps. As noted by the original authors, the result
    is therefore not UV^T (with USV^T = G the SVD) but roughly US'V^T, where the diagonal of
    S' lies loosely around 1 (~ Uniform(0.5, 1.5)); empirically this does not hurt training.

    Arguments:
        G: a 2D tensor; it is cast to bfloat16 for the iteration.
        steps: number of Newton-Schulz steps to run.
    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert len(G.shape) == 2
    a, b, c = (3.4445, -4.7750, 2.0315)
    # Work on the wide orientation so that the Gram matrix X @ X.T is the smaller one.
    is_tall = G.size(0) > G.size(1)
    X = G.bfloat16()
    if is_tall:
        X = X.T
    # The Frobenius norm bounds the spectral norm, so this keeps the spectral norm at most 1.
    X = X / (X.norm() + 1e-7)
    for _ in range(steps):
        gram = X @ X.T
        poly = b * gram + c * gram @ gram  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = a * X + poly @ X
    return X.T if is_tall else X
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    This implementation is distributed: within every run of `world_size` consecutive parameters,
    each rank orthogonalizes one update and the results are exchanged with `dist.all_gather`.
    `WORLD_SIZE` and `RANK` are read from the environment at construction time.

    Some warnings:
    - This optimizer assumes that all parameters passed in are 2D.
    - It should not be used for the embedding layer, the final fully connected layer, or any {0,1}-D
    parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
    - We believe it is unlikely to work well for training with small batch size.
    - We believe it may not work well for finetuning pretrained models, but we haven't tested this.
    - We have not yet tried this optimizer for training scenarios larger than NanoGPT (124M).

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """
    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        # One param group per distinct element count, so that every parameter in a group fits
        # the same flat bfloat16 all_gather buffers (one buffer per rank).
        sizes = {p.numel() for p in params}
        param_groups = [
            {
                'params': [p for p in params if p.numel() == size],
                'update_buffer': [
                    torch.empty(size, device='cuda', dtype=torch.bfloat16)
                    for _ in range(self.world_size)
                ],
            }
            for size in sizes
        ]
        super().__init__(param_groups, defaults)
    def step(self):
        """Apply one Muon update to every parameter; requires every parameter to have a gradient."""
        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            nesterov = group['nesterov']
            ns_steps = group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            # parameters are consumed in chunks of world_size, one parameter per rank
            assert len(params) % self.world_size == 0
            handle = None
            params_world = None
            def update_prev():
                # Apply the updates gathered for the previous chunk. `handle` and `params_world`
                # are read from the enclosing scope at call time, i.e. they refer to whatever
                # chunk was launched last.
                if params_world is None:
                    return
                assert handle is not None
                handle.wait()
                for p_world, g_world in zip(params_world, update_buffers):
                    # step size is lr * sqrt(max(1, rows / cols))
                    p_world.data.add_(
                        g_world.view_as(p_world),
                        alpha=-lr * max(1, p_world.size(0) / p_world.size(1)) ** 0.5,
                    )
            for base_i in range(len(params))[::self.world_size]:
                # this rank is responsible for one parameter of the current chunk
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                # note: the Nesterov branch modifies p.grad in place
                g = g.lerp_(buf, momentum) if nesterov else buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # The previous chunk must be applied before its buffers are overwritten by the
                # next gather; the orthogonalization above was issued while that gather was
                # still in flight.
                update_prev()
                handle = dist.all_gather(update_buffers, g, async_op=True)
                params_world = params[base_i : base_i + self.world_size]
            # flush the last chunk of this group
            update_prev()
# -----------------------------------------------------------------------------
# PyTorch nn.Module definitions
def norm(x):
return F.rms_norm(x, (x.size(-1),))
class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every call."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # The stored weight keeps its own dtype; only the copy used in the matmul is cast.
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)
class Rotary(torch.nn.Module):
    """Rotary position embedding applied over dimension 1 (sequence) of a 4D tensor.

    The input is split in two halves along its last dimension and each (x1, x2) pair is
    rotated by an angle ``position * inv_freq``. The cos/sin tables are cached and rebuilt
    only when the sequence length changes.

    Arguments:
        dim: size of the last dimension of the tensors that will be rotated (the head dim).
        base: base of the geometric frequency progression.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        self.register_buffer('inv_freq', (1 / base) ** (torch.arange(0, dim, 2) / dim))
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if seq_len != self.seq_len_cached:
            # Build the angle table in float32. Casting the module to bfloat16 also casts the
            # `inv_freq` buffer, and position * frequency products computed in bfloat16 are
            # badly quantized for long sequences (bf16 has only 8 bits of mantissa).
            t = torch.arange(seq_len, device=x.device, dtype=torch.float32)
            freqs = torch.outer(t, self.inv_freq.float())
            self.seq_len_cached = seq_len
            self.cos_cached = freqs.cos()
            self.sin_cached = freqs.sin()
        # broadcast over the batch and head dimensions
        cos, sin = self.cos_cached[None, :, None, :], self.sin_cached[None, :, None, :]
        # apply_rotary_emb(x, cos, sin)
        x1, x2 = x.chunk(2, dim=3)
        y1 = x1 * cos + x2 * sin
        y2 = x1 * (-sin) + x2 * cos
        # the float32 tables promote the product; cast back to the input dtype
        return torch.cat((y1, y2), 3).type_as(x)
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with QK-norm, rotary embeddings and a value-embedding mix.

    Attention itself is computed by FlexAttention; which positions may attend to each other is
    decided entirely by the ``block_mask`` handed to ``forward`` (the class name is historical).
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        head_dim = dim // num_heads
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        # learnable weights mixing the projected values with the token value embedding
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        self.rotary = Rotary(head_dim)
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_()  # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        batch, seq_len = x.size(0), x.size(1)
        assert batch == 1, "Must use batch size = 1 for FlexAttention"
        per_head = (batch, seq_len, self.num_heads, -1)
        q = self.c_q(x).view(per_head)
        k = self.c_k(x).view(per_head)
        v = self.c_v(x).view(per_head)
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v)  # @KoszarskyB & @Grad62304977
        # QK norm (@Grad62304977) followed by rotary position embedding
        q = self.rotary(norm(q))
        k = self.rotary(norm(k))
        kernel_options = {
            "BLOCK_M": 64, "BLOCK_N": 64,  # forward
            "BLOCK_M1": 32, "BLOCK_N1": 64, "BLOCK_M2": 64, "BLOCK_N2": 32,  # backwards
        }
        # flex_attention is called with the head dimension moved in front of the sequence dimension
        attn_out = flex_attention(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            block_mask=block_mask,
            enable_gqa=True,
            kernel_options=kernel_options,
        )
        # re-assemble all head outputs side by side
        attn_out = attn_out.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(attn_out)
class MLP(nn.Module):
    """Feed-forward block: expand to 4x width, squared ReLU, project back."""

    def __init__(self, dim):
        super().__init__()
        hidden_dim = 4 * dim
        self.c_fc = CastedLinear(dim, hidden_dim)
        self.c_proj = CastedLinear(hidden_dim, dim)
        self.c_proj.weight.data.zero_()  # zero init suggested by @Grad62304977

    def forward(self, x):
        # ReLU^2 activation: https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU;
        # suggested by @SKYLINEZ007 and @Grad62304977
        hidden = F.relu(self.c_fc(x)).square()
        return self.c_proj(hidden)
class Block(nn.Module):
    """One transformer layer: learned mix with the initial embedding, then attention and MLP residuals."""

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # weights of (current stream, initial embedding x0); starts as the identity on the stream
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        stream_weight, x0_weight = self.lambdas[0], self.lambdas[1]
        x = stream_weight * x + x0_weight * x0
        x = x + self.attn(norm(x), vi, block_mask)
        return x + self.mlp(norm(x))
class ValueEmbedding(nn.Module):
    """Per-layer token value embeddings, shared in mirrored (U-net) fashion.

    One embedding table is created per encoder layer (``config.num_layers // 2``). ``forward``
    returns the table outputs followed by the same outputs in reverse order, so that layer
    ``i`` and layer ``num_layers - 1 - i`` receive the same value embedding. For the default
    12-layer configuration this yields the 6 tables that were previously hard-coded.
    """

    def __init__(self, config: "ModelConfig"):
        super().__init__()
        # Derive the table count from the model depth instead of hard-coding 6, which only
        # matched num_layers == 12 (deeper models indexed past the end of the returned list).
        num_tables = config.num_layers // 2
        self.embed = nn.ModuleList([
            nn.Embedding(config.vocab_size, config.model_dim)
            for _ in range(num_tables)
        ])

    def forward(self, inputs) -> "list[torch.Tensor]":
        """Return ``2 * len(self.embed)`` tensors: one per table, then the same tensors mirrored."""
        ve = [emb(inputs) for emb in self.embed]
        ve += reversed(ve)
        return ve
# -----------------------------------------------------------------------------
# The main ESM Bert model
class BERT(nn.Module):
def __init__(self, config: "ModelConfig"):
super().__init__()
self.mask_id = 32
self.bos_id = 0
self.num_layers = config.num_layers
# U-net design by @brendanh0gan
self.num_encoder_layers = config.num_layers // 2 # Half of the layers for encoder
self.num_decoder_layers = config.num_layers - self.num_encoder_layers # Remaining for decoder
# Add learnable skip connection weights for decoder layers
self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))
self.embed = nn.Embedding(config.vocab_size, config.model_dim)
self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
# token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
# U-net structure on token value embeddings by @leloykun
self.value_embeds = ValueEmbedding(config)
self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
self.lm_head.weight.data.zero_() # @Grad62304977
def encoder_pass(self, input_seq: torch.Tensor, sliding_window_size: torch.Tensor):
docs = (input_seq == self.bos_id).cumsum(0)
def doc_mask_mod(b, h, q_idx, kv_idx):
bidirectional_sliding_window_mask = torch.abs(q_idx - kv_idx) < sliding_window_size
doc_mask = docs[q_idx] == docs[kv_idx]
return bidirectional_sliding_window_mask & doc_mask
S = len(input_seq)
block_mask = create_block_mask(
doc_mask_mod, None, None, S, S,
)
x = self.embed(input_seq[None])
x = norm(x) # @Grad62304977
x0 = x
ve = self.value_embeds(input_seq)
ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]
# Store outputs for U-Net skip connections
skip_connections = []
# Encoder pass - process only the first half of the blocks
for i in range(self.num_encoder_layers):
x = self.blocks[i](x, ve_enc[i], x0, block_mask)
skip_connections.append(x)
# Decoder pass - process the remaining blocks with weighted skip connections
for i in range(self.num_decoder_layers):
x = x + self.skip_weights[i] * skip_connections.pop()
# U-net structure on token value embeddings by @leloykun
x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)
x = norm(x)
logits = self.lm_head(x)
logits = 30 * torch.tanh(logits / 30) # @Grad62304977
logits = logits.float()
return logits
def forward(self, seq, sliding_window_size: torch.Tensor):
# MLM mask/replace constants from https://www.biorxiv.org/content/10.1101/2022.07.20.500902v3.full.pdf
pct_masked = 0.12
pct_replaced = 0.015
pct_kept = 0.015
# set pct_masked% to <mask>
mlm_mask = self.get_frac_mask(seq, pct_masked, torch.ones_like(seq, dtype=torch.bool))
input_seq = seq.clone().masked_fill(mlm_mask, self.mask_id)
# substitute pct_replaced% with token id between 4 and 30 (inclusive)
sub_mask = self.get_frac_mask(seq, pct_replaced, ~mlm_mask)
input_seq[sub_mask] = torch.randint(4, 31, (sub_mask.sum(),), dtype=seq.dtype, device=seq.device)
# retain pct_kept%
keep_mask = self.get_frac_mask(seq, pct_kept, ~(sub_mask | mlm_mask))
mlm_loss_mask = mlm_mask | sub_mask | keep_mask
logits = self.encoder_pass(input_seq, sliding_window_size)
return F.cross_entropy(
logits.view(-1, logits.size(-1)),
seq.masked_fill(~mlm_loss_mask, -100).to(dtype=torch.int64).view(-1),
ignore_index=-100
)
def get_frac_mask(self, seq: torch.Tensor, pct: float, include=None):
docs = (seq == self.bos_id).cumsum(0)
valid_tokens_mask = (seq >= 4) & (seq <= 30)
if include is not None:
valid_tokens_mask &= include
random_values = torch.rand_like(docs, dtype=torch.float) * valid_tokens_mask
# Map each token to its doc index, count tokens per doc, and compute how many to mask
_, inv_docs = torch.unique(docs, return_inverse=True)
doc_counts = torch.bincount(inv_docs) # total tokens in each doc
num_to_mask = (doc_counts.float() * pct).ceil().to(torch.int64)
# Rank tokens globally by random value and select num_to_mask
sorted_indices = torch.argsort(random_values, descending=True)
ranks = torch.empty_like(sorted_indices, dtype=torch.int64)
ranks[sorted_indices] = torch.arange(len(seq), device=seq.device)
return ranks < num_to_mask[inv_docs]
# -----------------------------------------------------------------------------
# Our own simple Distributed Data Loader
def _peek_data_shard(file: Path):
# only reads the header, returns header data
# header is 256 int32
header = torch.from_file(f"{file}", False, 256, dtype=torch.int32)
assert header[0] == 20240520, "magic number mismatch in the data .bin file"
assert header[1] == 1, "unsupported version"
return int(header[2]) # number of tokens (claimed)
def _load_data_shard(path: Path, num_tokens):
with path.open("rb", buffering=0) as f:
tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True)
f.seek(256 * 4)
nbytes = f.readinto(tokens.numpy())
assert nbytes == 2 * num_tokens, "number of tokens read does not match header?"
return tokens
class DistributedDataLoader:
    """Hands every rank its own contiguous window of tokens from a rotating set of shards.

    Shards matching ``filename_pattern`` (relative to the current working directory) are
    visited in sorted order, wrapping around after the last one. Within a shard, rank ``r``
    starts at offset ``r * seq_len`` and all ranks advance by ``seq_len * num_processes``
    tokens per call.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len
        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"
        # only the headers are read here; every shard must hold at least one full global batch
        self.files_num_tokens = [_peek_data_shard(shard) for shard in self.files]
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)
        self.reset()

    def reset(self):
        """Restart from the beginning of the first shard."""
        self.current_shard = -1
        self.advance()

    def advance(self):
        """Move on to the next shard (cyclically) and load its tokens."""
        shard_idx = (self.current_shard + 1) % len(self.files)
        self.current_shard = shard_idx
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[shard_idx], self.files_num_tokens[shard_idx])

    def next_batch(self):
        """Return this rank's next window of ``seq_len + 1`` tokens as an int32 CUDA tensor."""
        stride = self.seq_len * self.num_processes
        start = self.current_position
        window = self.tokens[start:start + self.seq_len + 1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        seq = window.to(device="cuda", dtype=torch.int32, non_blocking=True)
        # advance current position and load next shard if necessary
        self.current_position += stride
        if self.current_position + stride + 1 >= len(self.tokens):
            self.advance()
        return seq
# -----------------------------------------------------------------------------
# int main
@dataclass
class Hyperparameters:
    """Data, optimization and evaluation settings of a training run."""

    # --- data ---
    # glob pattern of the training shards
    input_bin: str = 'data/omgprot50/omgprot50_train_*.bin'
    # glob pattern of the validation shards
    input_val_bin: str = 'data/omgprot50/omgprot50_val_*.bin'

    # --- optimization ---
    # batch size, in sequences, across all devices
    batch_size: int = 16
    # sequence length, in tokens
    sequence_length: int = 32 * 1024
    # number of iterations to run
    num_iterations: int = 3000
    # number of iterations of linear warmup (see also cooldown_iters)
    warmup_iters: int = 0
    # number of iterations of linear cooldown; together with warmup_iters this gives a
    # triangular or trapezoidal schedule
    cooldown_iters: int = 2000
    weight_decay: float = 0

    # --- evaluation and logging ---
    # evaluate the validation loss every this many steps; 0 for only at the end
    val_loss_every: int = 25
    # number of validation tokens; keep this fixed for consistent comparisons
    val_tokens: int = 10485760
@dataclass
class ModelConfig:
    """Architecture settings of the model."""

    # Reference points given by the original author:
    # - vocabulary of 33 tokens:
    #   https://huggingface.co/Synthyra/ESMplusplus_large/blob/main/modeling_esm_plusplus.py#L868-L874
    # - for protein language models, the number of layers is typically more important than
    #   the width of the hidden dimension
    # - ESM2-8M has 6 layers, 20 heads, 320 hidden dim: https://huggingface.co/facebook/esm2_t6_8M_UR50D/blob/main/config.json
    # - ESM2-35M has 12 layers, 20 heads, 480 hidden dim: https://huggingface.co/facebook/esm2_t12_35M_UR50D/blob/main/config.json
    # - ESM2-150M has 30 layers, 20 heads, 640 hidden dim: https://huggingface.co/facebook/esm2_t30_150M_UR50D/blob/main/config.json
    # - ESM2-650M has 33 layers, 20 heads, 1280 hidden dim: https://huggingface.co/facebook/esm2_t33_650M_UR50D/blob/main/config.json
    vocab_size: int = 33
    num_layers: int = 12
    # 768 / 6 gives a head dim of 128, as suggested by @Grad62304977
    num_heads: int = 6
    model_dim: int = 768
# Module-level settings objects with default values: `model_config` is used to build the
# model, `args` is read by the data loading, scheduling and training code below.
model_config = ModelConfig()
args = Hyperparameters()
def get_param_count(model):
    """Return the total number of elements across all parameters of ``model``."""
    return sum(param.numel() for _, param in model.named_parameters())
# set up DDP (distributed data parallel). torchrun sets these env variables
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
assert torch.cuda.is_available()
# each process drives the GPU whose index equals its local rank
device = torch.device(f'cuda:{ddp_local_rank}')
torch.cuda.set_device(device)
print(f'using device: {device}')
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
master_process = (ddp_rank == 0) # this process will do logging, checkpointing etc.
# begin logging
# `logfile` stays None on every rank except the master; print0 only touches it on the master
logfile = None
if master_process:
    # a fresh random id names the log file of this run
    run_id = uuid.uuid4()
    Path('logs').mkdir(exist_ok=True)
    # logdir = Path('logs') / f'{run_id}'
    # logdir.mkdir()
    logfile = Path('logs') / f'{run_id}.txt'
    print(logfile.stem)
    # create the log file
    with logfile.open('w') as f:
        # begin the log by printing this file (the Python code)
        print(code, file=f)
        print('=' * 100, file=f)
def print0(s, logonly=False):
    """Log ``s`` from the master process only.

    The message is always appended to the run's log file; it is also echoed to stdout
    unless ``logonly`` is set. Non-master processes do nothing.
    """
    if not master_process:
        return
    with logfile.open('a') as log:
        if not logonly:
            print(s)
        print(s, file=log)
# log information about the hardware/software environment this is running on
# and print the full `nvidia-smi` to file
print0(f'Running python {sys.version}')
print0(f'Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:')
import subprocess
result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
print0(f'{result.stdout}', logonly=True)
print0('='*100, logonly=True)
# calculate the number of steps to take in the val loop.
# (every rank consumes sequence_length tokens per validation step)
assert args.val_tokens % (args.sequence_length * ddp_world_size) == 0
val_steps = args.val_tokens // (args.sequence_length * ddp_world_size)
# calculate the steps of gradient accumulation required to attain the desired global batch size.
# (each rank processes one sequence per forward/backward pass)
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size
# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('='*100, logonly=True)
# fetch the first training sequence up front; the loop fetches the next one after each backward pass
seq_train = train_loader.next_batch()
model = BERT(model_config)
model = model.cuda().bfloat16()
# cast the CastedLinear modules back to float32 after the model-wide bfloat16 cast;
# they cast their weight to the input dtype on the fly in forward()
for m in model.modules():
    if isinstance(m, CastedLinear):
        m.float()
config.coordinate_descent_tuning = True # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module # always contains the "raw" unwrapped model
# init the optimizer(s)
# - Adam for the token and value embeddings
# - Adam (smaller lr) for the output head
# - Muon for the 2D weight matrices inside the blocks
# - Adam for the remaining (<2D) block parameters and the U-net skip weights
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.1, betas=(0.8, 0.95), fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.001, betas=(0.8, 0.95), fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.01, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.01, betas=(0.8, 0.95), fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """Return the learning-rate multiplier for iteration ``it`` (trapezoidal schedule).

    The multiplier ramps up linearly during the first ``args.warmup_iters`` iterations,
    stays at 1.0, and then decays linearly to 0 over the last ``args.cooldown_iters``
    iterations. With ``cooldown_iters <= 0`` there is no cooldown phase and the multiplier
    stays at 1.0 up to and including ``it == args.num_iterations``.

    Raises:
        AssertionError: if ``it`` exceeds ``args.num_iterations``.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it+1) / args.warmup_iters
    # 2) constant lr for a while (or forever when no cooldown is configured; without this
    #    guard the scheduler's final step divides by zero when cooldown_iters == 0)
    if args.cooldown_iters <= 0 or it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # 3) linear cooldown
    return (args.num_iterations - it) / args.cooldown_iters
# one LambdaLR per optimizer, all following the same multiplier schedule
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers]
# The attention window lives in a CUDA tensor that is updated in place; `sw_prev` mirrors its
# value on the host so that the tensor is only written when the size actually changes.
sliding_window_size = torch.tensor(1024 - 128, dtype=torch.int32, device="cuda")
sw_prev = 1024 - 128
# Start training loop
training_time_ms = 0
# start the clock
torch.cuda.synchronize()
t0 = time.perf_counter()
# begin training
for step in range(args.num_iterations + 1):
    last_step = (step == args.num_iterations)
    # This effectively ignores timing first 10 steps, which are slower for weird reasons.
    # Alternately, and slightly more correctly in terms of benchmarking, we could do 10
    # steps with dummy data first, and then re-initialize the model and reset the loader.
    if step == 10:
        training_time_ms = 0
        t0 = time.perf_counter()
    timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val
    # Linearly increase the sliding window size over training in chunks of 128 from 1024 -> 2048. By @fernbear.bsky.social
    # (the interpolation starts at 1023, so step 0 yields 896 = 1024 - 128, the initial value above)
    frac_done = step / args.num_iterations # training progress
    sw_size = int(((1 - frac_done) * 1023 + frac_done * 2048) // 128) * 128
    if sw_size != sw_prev:
        sliding_window_size.copy_(sw_size, non_blocking=True)
        sw_prev = sw_size
    # once in a while evaluate the validation dataset
    if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)):
        # stop the clock
        torch.cuda.synchronize()
        training_time_ms += 1000 * (time.perf_counter() - t0)
        # run validation batches
        model.eval()
        val_loader.reset()
        val_loss = 0.0
        for _ in range(val_steps):
            with torch.no_grad():
                seq_val = val_loader.next_batch()
                # the model's forward applies its own random masking and returns the loss
                val_loss += model(seq_val, sliding_window_size)
        # average over ranks, then over validation steps
        dist.all_reduce(val_loss, op=dist.ReduceOp.AVG)
        val_loss /= val_steps
        # log val loss to console and to logfile
        print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms perplexity:{(math.e**val_loss):.4f} param_count:{get_param_count(model):,}')
        # start the clock again
        torch.cuda.synchronize()
        t0 = time.perf_counter()
    # uncomment if you want to save any checkpoints
    #save_every = 1000
    #if master_process and (last_step or (save_every > 0 and step % save_every == 0)):
    #    # stop the clock
    #    torch.cuda.synchronize()
    #    training_time_ms += 1000 * (time.perf_counter() - t0)
    #    # save the state of the training process
    #    log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers])
    #    torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step))
    #    # start the clock again
    #    torch.cuda.synchronize()
    #    t0 = time.perf_counter()
    # bit confusing: we want to make sure to eval on 0th iteration
    # but also after the very last iteration. so we loop for step <= num_iterations
    # instead of just < num_iterations (one extra due to <=), only to do
    # the validation/sampling one last time, and then we break right here as we're done.
    if last_step:
        break
    # --------------- TRAINING SECTION BEGIN -----------------
    model.train()
    for i in range(1, train_accumulation_steps + 1):
        with contextlib.ExitStack() as stack:
            if i < train_accumulation_steps: # there's no need to sync gradients every accumulation step
                stack.enter_context(model.no_sync())
            #if step >= 5:
            #    stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
            model(seq_train, sliding_window_size).backward()
            # fetch the sequence for the next forward pass
            seq_train = train_loader.next_batch()
    # gradients were summed over the accumulation steps; turn them into a mean
    if train_accumulation_steps != 1:
        for p in model.parameters():
            p.grad /= train_accumulation_steps
    # momentum warmup for Muon (0.85 -> 0.95 over the first 300 steps)
    frac = min(step/300, 1)
    for group in optimizer3.param_groups:
        group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
    # step the optimizers and schedulers
    for opt, sched in zip(optimizers, schedulers):
        opt.step()
        sched.step()
    # null the gradients
    model.zero_grad(set_to_none=True)
    # --------------- TRAINING SECTION END -------------------
    # everything that follows now is just diagnostics, prints, logging, etc.
    approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
    print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")
print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")
# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
====================================================================================================
Running python 3.11.10 | packaged by conda-forge | (main, Oct 16 2024, 01:27:36) [GCC 13.3.0]
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4
nvidia-smi:
Tue Dec 24 01:42:58 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120 Driver Version: 550.120 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 4090 On | 00000000:42:00.0 Off | Off |
| 39% 32C P2 44W / 450W | 1756MiB / 24564MiB | 33% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA GeForce RTX 4090 On | 00000000:81:00.0 Off | Off |
| 41% 35C P2 43W / 450W | 591MiB / 24564MiB | 10% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA GeForce RTX 4090 On | 00000000:82:00.0 Off | Off |
| 40% 31C P2 52W / 450W | 591MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA GeForce RTX 4090 On | 00000000:C1:00.0 Off | Off |
| 40% 35C P2 35W / 450W | 591MiB / 24564MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+
====================================================================================================
Training DataLoader: total number of tokens: 10000000000 across 100 files
Validation DataLoader: total number of tokens: 100000000 across 1 files
====================================================================================================
step:0/3000 val_loss:3.4965 train_time:0ms step_avg:nanms perplexity:33.0000 param_count:85,137,462
step:1/3000 train_time:48133ms step_avg:nanms
step:2/3000 train_time:49679ms step_avg:nanms
step:3/3000 train_time:50628ms step_avg:nanms
step:4/3000 train_time:51613ms step_avg:nanms
step:5/3000 train_time:52610ms step_avg:nanms
step:6/3000 train_time:53612ms step_avg:nanms
step:7/3000 train_time:54595ms step_avg:nanms
step:8/3000 train_time:55582ms step_avg:nanms
step:9/3000 train_time:56571ms step_avg:nanms
step:10/3000 train_time:57565ms step_avg:nanms
step:11/3000 train_time:981ms step_avg:nanms
step:12/3000 train_time:1974ms step_avg:nanms
step:13/3000 train_time:2962ms step_avg:987.20ms
step:14/3000 train_time:3946ms step_avg:986.38ms
step:15/3000 train_time:4939ms step_avg:987.76ms
step:16/3000 train_time:5918ms step_avg:986.30ms
step:17/3000 train_time:6906ms step_avg:986.62ms
step:18/3000 train_time:7894ms step_avg:986.80ms
step:19/3000 train_time:8890ms step_avg:987.82ms
step:20/3000 train_time:9882ms step_avg:988.19ms
step:21/3000 train_time:10870ms step_avg:988.15ms
step:22/3000 train_time:11864ms step_avg:988.65ms
step:23/3000 train_time:12848ms step_avg:988.34ms
step:24/3000 train_time:13826ms step_avg:987.55ms
step:25/3000 train_time:14819ms step_avg:987.92ms
step:25/3000 val_loss:2.6738 train_time:14859ms step_avg:990.60ms perplexity:14.4954 param_count:85,137,462
step:26/3000 train_time:15809ms step_avg:988.04ms
step:27/3000 train_time:16797ms step_avg:988.05ms
step:28/3000 train_time:17797ms step_avg:988.72ms
step:29/3000 train_time:18801ms step_avg:989.52ms
step:30/3000 train_time:19787ms step_avg:989.37ms
step:31/3000 train_time:20782ms step_avg:989.64ms
step:32/3000 train_time:21776ms step_avg:989.81ms
step:33/3000 train_time:22757ms step_avg:989.43ms
step:34/3000 train_time:23752ms step_avg:989.66ms
step:35/3000 train_time:24750ms step_avg:990.01ms
step:36/3000 train_time:25736ms step_avg:989.84ms
step:37/3000 train_time:26739ms step_avg:990.34ms
step:38/3000 train_time:27740ms step_avg:990.72ms
step:39/3000 train_time:28732ms step_avg:990.77ms
step:40/3000 train_time:29722ms step_avg:990.74ms
step:41/3000 train_time:30711ms step_avg:990.69ms
step:42/3000 train_time:31699ms step_avg:990.59ms
step:43/3000 train_time:32680ms step_avg:990.31ms
step:44/3000 train_time:33681ms step_avg:990.61ms
step:45/3000 train_time:34674ms step_avg:990.69ms
step:46/3000 train_time:35661ms step_avg:990.59ms
step:47/3000 train_time:36653ms step_avg:990.63ms
step:48/3000 train_time:37655ms step_avg:990.92ms
step:49/3000 train_time:38645ms step_avg:990.89ms
step:50/3000 train_time:39643ms step_avg:991.06ms
step:50/3000 val_loss:2.6546 train_time:39682ms step_avg:992.05ms perplexity:14.2189 param_count:85,137,462
step:51/3000 train_time:40619ms step_avg:990.71ms
step:52/3000 train_time:41614ms step_avg:990.81ms
step:53/3000 train_time:42604ms step_avg:990.79ms
step:54/3000 train_time:43597ms step_avg:990.85ms
step:55/3000 train_time:44588ms step_avg:990.83ms
step:56/3000 train_time:45574ms step_avg:990.74ms
step:57/3000 train_time:46569ms step_avg:990.84ms
step:58/3000 train_time:47553ms step_avg:990.69ms
step:59/3000 train_time:48555ms step_avg:990.93ms
step:60/3000 train_time:49553ms step_avg:991.07ms
step:61/3000 train_time:50554ms step_avg:991.25ms
step:62/3000 train_time:51544ms step_avg:991.23ms
step:63/3000 train_time:52535ms step_avg:991.23ms
step:64/3000 train_time:53535ms step_avg:991.40ms
step:65/3000 train_time:54524ms step_avg:991.34ms
step:66/3000 train_time:55540ms step_avg:991.79ms
step:67/3000 train_time:56533ms step_avg:991.82ms
step:68/3000 train_time:57513ms step_avg:991.60ms
step:69/3000 train_time:58506ms step_avg:991.63ms
step:70/3000 train_time:59495ms step_avg:991.58ms
step:71/3000 train_time:60498ms step_avg:991.78ms
step:72/3000 train_time:61485ms step_avg:991.69ms
step:73/3000 train_time:62470ms step_avg:991.58ms
step:74/3000 train_time:63467ms step_avg:991.67ms
step:75/3000 train_time:64471ms step_avg:991.86ms
step:75/3000 val_loss:2.6359 train_time:64512ms step_avg:992.49ms perplexity:13.9554 param_count:85,137,462
step:76/3000 train_time:65459ms step_avg:991.80ms
step:77/3000 train_time:66447ms step_avg:991.75ms
step:78/3000 train_time:67433ms step_avg:991.66ms
step:79/3000 train_time:68434ms step_avg:991.80ms
step:80/3000 train_time:69425ms step_avg:991.78ms
step:81/3000 train_time:70419ms step_avg:991.81ms
step:82/3000 train_time:71401ms step_avg:991.68ms
step:83/3000 train_time:72401ms step_avg:991.79ms
step:84/3000 train_time:73385ms step_avg:991.68ms
step:85/3000 train_time:74378ms step_avg:991.71ms
step:86/3000 train_time:75357ms step_avg:991.54ms
step:87/3000 train_time:76351ms step_avg:991.58ms
step:88/3000 train_time:77343ms step_avg:991.58ms
step:89/3000 train_time:78336ms step_avg:991.59ms
step:90/3000 train_time:79329ms step_avg:991.61ms
step:91/3000 train_time:80320ms step_avg:991.60ms
step:92/3000 train_time:81315ms step_avg:991.64ms
step:93/3000 train_time:82297ms step_avg:991.53ms
step:94/3000 train_time:83291ms step_avg:991.56ms
step:95/3000 train_time:84286ms step_avg:991.60ms
step:96/3000 train_time:85272ms step_avg:991.54ms
step:97/3000 train_time:86257ms step_avg:991.46ms
step:98/3000 train_time:87244ms step_avg:991.40ms
step:99/3000 train_time:88237ms step_avg:991.43ms
step:100/3000 train_time:89229ms step_avg:991.43ms
step:100/3000 val_loss:2.6370 train_time:89268ms step_avg:991.87ms perplexity:13.9708 param_count:85,137,462
step:101/3000 train_time:90221ms step_avg:991.44ms
step:102/3000 train_time:91214ms step_avg:991.46ms
step:103/3000 train_time:92227ms step_avg:991.68ms
step:104/3000 train_time:93214ms step_avg:991.63ms
step:105/3000 train_time:94192ms step_avg:991.49ms
step:106/3000 train_time:95184ms step_avg:991.49ms
step:107/3000 train_time:96175ms step_avg:991.50ms
step:108/3000 train_time:97160ms step_avg:991.43ms
step:109/3000 train_time:98159ms step_avg:991.51ms
step:110/3000 train_time:99156ms step_avg:991.56ms
step:111/3000 train_time:100137ms step_avg:991.45ms
step:112/3000 train_time:101131ms step_avg:991.48ms
step:113/3000 train_time:102119ms step_avg:991.44ms
step:114/3000 train_time:103111ms step_avg:991.45ms
step:115/3000 train_time:104104ms step_avg:991.47ms
step:116/3000 train_time:105110ms step_avg:991.60ms
step:117/3000 train_time:106098ms step_avg:991.57ms
step:118/3000 train_time:107086ms step_avg:991.53ms
step:119/3000 train_time:108077ms step_avg:991.53ms
step:120/3000 train_time:109075ms step_avg:991.60ms
step:121/3000 train_time:110065ms step_avg:991.58ms
step:122/3000 train_time:111056ms step_avg:991.57ms
step:123/3000 train_time:112046ms step_avg:991.56ms
step:124/3000 train_time:113029ms step_avg:991.48ms
step:125/3000 train_time:114020ms step_avg:991.48ms
step:125/3000 val_loss:2.6425 train_time:114059ms step_avg:991.82ms perplexity:14.0477 param_count:85,137,462
step:126/3000 train_time:115005ms step_avg:991.42ms
step:127/3000 train_time:115992ms step_avg:991.38ms
step:128/3000 train_time:116990ms step_avg:991.44ms
step:129/3000 train_time:117979ms step_avg:991.42ms
step:130/3000 train_time:118975ms step_avg:991.46ms
step:131/3000 train_time:119966ms step_avg:991.46ms
step:132/3000 train_time:120964ms step_avg:991.51ms
step:133/3000 train_time:121959ms step_avg:991.54ms
step:134/3000 train_time:122942ms step_avg:991.47ms
step:135/3000 train_time:123932ms step_avg:991.46ms
step:136/3000 train_time:124919ms step_avg:991.42ms
step:137/3000 train_time:125908ms step_avg:991.40ms
step:138/3000 train_time:126906ms step_avg:991.45ms
step:139/3000 train_time:127899ms step_avg:991.46ms
step:140/3000 train_time:128890ms step_avg:991.46ms
step:141/3000 train_time:129891ms step_avg:991.53ms
step:142/3000 train_time:130879ms step_avg:991.51ms
step:143/3000 train_time:131877ms step_avg:991.56ms
step:144/3000 train_time:132872ms step_avg:991.58ms
step:145/3000 train_time:133867ms step_avg:991.61ms
step:146/3000 train_time:134864ms step_avg:991.65ms
step:147/3000 train_time:135856ms step_avg:991.65ms
step:148/3000 train_time:136853ms step_avg:991.69ms
step:149/3000 train_time:137862ms step_avg:991.81ms
step:150/3000 train_time:138859ms step_avg:991.85ms
step:150/3000 val_loss:2.6375 train_time:138898ms step_avg:992.13ms perplexity:13.9780 param_count:85,137,462
step:151/3000 train_time:139847ms step_avg:991.82ms
step:152/3000 train_time:140832ms step_avg:991.77ms
step:153/3000 train_time:141824ms step_avg:991.78ms
step:154/3000 train_time:142807ms step_avg:991.71ms
step:155/3000 train_time:143796ms step_avg:991.70ms
step:156/3000 train_time:144781ms step_avg:991.65ms
step:157/3000 train_time:145778ms step_avg:991.68ms
step:158/3000 train_time:146768ms step_avg:991.67ms
step:159/3000 train_time:147750ms step_avg:991.61ms
step:160/3000 train_time:148740ms step_avg:991.60ms
step:161/3000 train_time:149734ms step_avg:991.61ms
step:162/3000 train_time:150724ms step_avg:991.60ms
step:163/3000 train_time:151712ms step_avg:991.58ms
step:164/3000 train_time:152711ms step_avg:991.63ms
step:165/3000 train_time:153692ms step_avg:991.56ms
step:166/3000 train_time:154692ms step_avg:991.61ms
step:167/3000 train_time:155689ms step_avg:991.65ms
step:168/3000 train_time:156675ms step_avg:991.61ms
step:169/3000 train_time:157696ms step_avg:991.80ms
step:170/3000 train_time:158685ms step_avg:991.78ms
step:171/3000 train_time:159684ms step_avg:991.83ms
step:172/3000 train_time:160664ms step_avg:991.75ms
step:173/3000 train_time:161656ms step_avg:991.75ms
step:174/3000 train_time:162649ms step_avg:991.76ms
step:175/3000 train_time:163643ms step_avg:991.78ms
step:175/3000 val_loss:2.6236 train_time:163684ms step_avg:992.02ms perplexity:13.7847 param_count:85,137,462
step:176/3000 train_time:164632ms step_avg:991.76ms
step:177/3000 train_time:165633ms step_avg:991.81ms
step:178/3000 train_time:166621ms step_avg:991.79ms
step:179/3000 train_time:167603ms step_avg:991.73ms
step:180/3000 train_time:168593ms step_avg:991.72ms
step:181/3000 train_time:169582ms step_avg:991.70ms
step:182/3000 train_time:170567ms step_avg:991.67ms
step:183/3000 train_time:171562ms step_avg:991.69ms
step:184/3000 train_time:172560ms step_avg:991.72ms
step:185/3000 train_time:173551ms step_avg:991.72ms
step:186/3000 train_time:174543ms step_avg:991.72ms
step:187/3000 train_time:175526ms step_avg:991.67ms
step:188/3000 train_time:176520ms step_avg:991.69ms
step:189/3000 train_time:177533ms step_avg:991.80ms
step:190/3000 train_time:178531ms step_avg:991.84ms
step:191/3000 train_time:179517ms step_avg:991.81ms
step:192/3000 train_time:180511ms step_avg:991.82ms
step:193/3000 train_time:181499ms step_avg:991.80ms
step:194/3000 train_time:182482ms step_avg:991.75ms
step:195/3000 train_time:183464ms step_avg:991.70ms
step:196/3000 train_time:184465ms step_avg:991.75ms
step:197/3000 train_time:185456ms step_avg:991.74ms
step:198/3000 train_time:186456ms step_avg:991.79ms
step:199/3000 train_time:187461ms step_avg:991.86ms
step:200/3000 train_time:188452ms step_avg:991.85ms
step:200/3000 val_loss:2.6330 train_time:188493ms step_avg:992.07ms perplexity:13.9151 param_count:85,137,462
step:201/3000 train_time:189438ms step_avg:991.82ms
step:202/3000 train_time:190433ms step_avg:991.84ms
step:203/3000 train_time:191420ms step_avg:991.81ms
step:204/3000 train_time:192423ms step_avg:991.87ms
step:205/3000 train_time:193413ms step_avg:991.86ms
step:206/3000 train_time:194399ms step_avg:991.83ms
step:207/3000 train_time:195383ms step_avg:991.79ms
step:208/3000 train_time:196370ms step_avg:991.77ms
step:209/3000 train_time:197358ms step_avg:991.75ms
step:210/3000 train_time:198359ms step_avg:991.80ms
step:211/3000 train_time:199362ms step_avg:991.85ms
step:212/3000 train_time:200356ms step_avg:991.86ms
step:213/3000 train_time:201346ms step_avg:991.85ms
step:214/3000 train_time:202339ms step_avg:991.86ms
step:215/3000 train_time:203334ms step_avg:991.87ms
step:216/3000 train_time:204329ms step_avg:991.89ms
step:217/3000 train_time:205324ms step_avg:991.90ms
step:218/3000 train_time:206322ms step_avg:991.93ms
step:219/3000 train_time:207310ms step_avg:991.92ms
step:220/3000 train_time:208296ms step_avg:991.89ms
step:221/3000 train_time:209285ms step_avg:991.87ms
step:222/3000 train_time:210276ms step_avg:991.87ms
step:223/3000 train_time:211273ms step_avg:991.89ms
step:224/3000 train_time:212265ms step_avg:991.89ms
step:225/3000 train_time:213250ms step_avg:991.86ms
step:225/3000 val_loss:2.6215 train_time:213290ms step_avg:992.04ms perplexity:13.7567 param_count:85,137,462
step:226/3000 train_time:214236ms step_avg:991.83ms
step:227/3000 train_time:215229ms step_avg:991.84ms
step:228/3000 train_time:216215ms step_avg:991.81ms
step:229/3000 train_time:217204ms step_avg:991.80ms
step:230/3000 train_time:218188ms step_avg:991.76ms
step:231/3000 train_time:219187ms step_avg:991.80ms
step:232/3000 train_time:220171ms step_avg:991.76ms
step:233/3000 train_time:221172ms step_avg:991.80ms
step:234/3000 train_time:222168ms step_avg:991.82ms
step:235/3000 train_time:223156ms step_avg:991.80ms
step:236/3000 train_time:224148ms step_avg:991.80ms
step:237/3000 train_time:225133ms step_avg:991.78ms
step:238/3000 train_time:226131ms step_avg:991.80ms
step:239/3000 train_time:227131ms step_avg:991.84ms
step:240/3000 train_time:228121ms step_avg:991.83ms
step:241/3000 train_time:229121ms step_avg:991.87ms
step:242/3000 train_time:230113ms step_avg:991.87ms
step:243/3000 train_time:231102ms step_avg:991.85ms
step:244/3000 train_time:232094ms step_avg:991.85ms
step:245/3000 train_time:233085ms step_avg:991.85ms
step:246/3000 train_time:234074ms step_avg:991.84ms
step:247/3000 train_time:235057ms step_avg:991.80ms
step:248/3000 train_time:236046ms step_avg:991.79ms
step:249/3000 train_time:237042ms step_avg:991.81ms
step:250/3000 train_time:238035ms step_avg:991.81ms
step:250/3000 val_loss:2.6061 train_time:238074ms step_avg:991.98ms perplexity:13.5458 param_count:85,137,462
step:251/3000 train_time:239024ms step_avg:991.80ms
step:252/3000 train_time:240016ms step_avg:991.80ms
step:253/3000 train_time:241019ms step_avg:991.85ms
step:254/3000 train_time:242017ms step_avg:991.87ms
step:255/3000 train_time:243003ms step_avg:991.85ms
step:256/3000 train_time:244006ms step_avg:991.89ms
step:257/3000 train_time:245007ms step_avg:991.93ms
step:258/3000 train_time:245999ms step_avg:991.93ms
step:259/3000 train_time:246997ms step_avg:991.96ms
step:260/3000 train_time:247990ms step_avg:991.96ms
step:261/3000 train_time:248969ms step_avg:991.91ms
step:262/3000 train_time:249963ms step_avg:991.92ms
step:263/3000 train_time:250949ms step_avg:991.89ms
step:264/3000 train_time:251939ms step_avg:991.89ms
step:265/3000 train_time:252934ms step_avg:991.90ms
step:266/3000 train_time:253931ms step_avg:991.92ms
step:267/3000 train_time:254916ms step_avg:991.89ms
step:268/3000 train_time:255913ms step_avg:991.91ms
step:269/3000 train_time:256917ms step_avg:991.96ms
step:270/3000 train_time:257912ms step_avg:991.97ms
step:271/3000 train_time:258905ms step_avg:991.97ms
step:272/3000 train_time:259893ms step_avg:991.96ms
step:273/3000 train_time:260887ms step_avg:991.97ms
step:274/3000 train_time:261876ms step_avg:991.95ms
step:275/3000 train_time:262879ms step_avg:992.00ms
step:275/3000 val_loss:2.6002 train_time:262920ms step_avg:992.15ms perplexity:13.4664 param_count:85,137,462
step:276/3000 train_time:263865ms step_avg:991.97ms
step:277/3000 train_time:264852ms step_avg:991.95ms
step:278/3000 train_time:265845ms step_avg:991.96ms
step:279/3000 train_time:266834ms step_avg:991.95ms
step:280/3000 train_time:267828ms step_avg:991.95ms
step:281/3000 train_time:268820ms step_avg:991.95ms
step:282/3000 train_time:269810ms step_avg:991.95ms
step:283/3000 train_time:270802ms step_avg:991.95ms
step:284/3000 train_time:271803ms step_avg:991.98ms
step:285/3000 train_time:272792ms step_avg:991.97ms
step:286/3000 train_time:273786ms step_avg:991.98ms
step:287/3000 train_time:274778ms step_avg:991.98ms
step:288/3000 train_time:275769ms step_avg:991.97ms
step:289/3000 train_time:276749ms step_avg:991.93ms
step:290/3000 train_time:277748ms step_avg:991.96ms
step:291/3000 train_time:278735ms step_avg:991.94ms
step:292/3000 train_time:279719ms step_avg:991.91ms
step:293/3000 train_time:280703ms step_avg:991.88ms
step:294/3000 train_time:281693ms step_avg:991.88ms
step:295/3000 train_time:282691ms step_avg:991.90ms
step:296/3000 train_time:283687ms step_avg:991.91ms
step:297/3000 train_time:284675ms step_avg:991.90ms
step:298/3000 train_time:285663ms step_avg:991.89ms
step:299/3000 train_time:286653ms step_avg:991.88ms
step:300/3000 train_time:287634ms step_avg:991.84ms
step:300/3000 val_loss:2.6102 train_time:287675ms step_avg:991.98ms perplexity:13.6020 param_count:85,137,462
step:301/3000 train_time:288622ms step_avg:991.83ms
step:302/3000 train_time:289610ms step_avg:991.81ms
step:303/3000 train_time:290593ms step_avg:991.78ms
step:304/3000 train_time:291580ms step_avg:991.77ms
step:305/3000 train_time:292565ms step_avg:991.75ms
step:306/3000 train_time:293568ms step_avg:991.78ms
step:307/3000 train_time:294559ms step_avg:991.78ms
step:308/3000 train_time:295545ms step_avg:991.76ms
step:309/3000 train_time:296549ms step_avg:991.80ms
step:310/3000 train_time:297546ms step_avg:991.82ms
step:311/3000 train_time:298539ms step_avg:991.82ms
step:312/3000 train_time:299532ms step_avg:991.83ms
step:313/3000 train_time:300518ms step_avg:991.81ms
step:314/3000 train_time:301513ms step_avg:991.82ms
step:315/3000 train_time:302500ms step_avg:991.80ms
step:316/3000 train_time:303484ms step_avg:991.78ms
step:317/3000 train_time:304480ms step_avg:991.79ms
step:318/3000 train_time:305470ms step_avg:991.79ms
step:319/3000 train_time:306462ms step_avg:991.79ms
step:320/3000 train_time:307451ms step_avg:991.78ms
step:321/3000 train_time:308431ms step_avg:991.74ms
step:322/3000 train_time:309420ms step_avg:991.73ms
step:323/3000 train_time:310403ms step_avg:991.70ms
step:324/3000 train_time:311393ms step_avg:991.70ms
step:325/3000 train_time:312400ms step_avg:991.75ms
step:325/3000 val_loss:2.5998 train_time:312441ms step_avg:991.88ms perplexity:13.4607 param_count:85,137,462
step:326/3000 train_time:313388ms step_avg:991.73ms
step:327/3000 train_time:314387ms step_avg:991.76ms
step:328/3000 train_time:315379ms step_avg:991.76ms
step:329/3000 train_time:316368ms step_avg:991.75ms
step:330/3000 train_time:317354ms step_avg:991.73ms
step:331/3000 train_time:318341ms step_avg:991.72ms
step:332/3000 train_time:319325ms step_avg:991.69ms
step:333/3000 train_time:320320ms step_avg:991.70ms
step:334/3000 train_time:321307ms step_avg:991.69ms
step:335/3000 train_time:322308ms step_avg:991.72ms
step:336/3000 train_time:323310ms step_avg:991.75ms
step:337/3000 train_time:324307ms step_avg:991.77ms
step:338/3000 train_time:325291ms step_avg:991.74ms
step:339/3000 train_time:326279ms step_avg:991.73ms
step:340/3000 train_time:327268ms step_avg:991.72ms
step:341/3000 train_time:328255ms step_avg:991.71ms
step:342/3000 train_time:329244ms step_avg:991.70ms
step:343/3000 train_time:330233ms step_avg:991.69ms
step:344/3000 train_time:331230ms step_avg:991.71ms
step:345/3000 train_time:332224ms step_avg:991.71ms
step:346/3000 train_time:333218ms step_avg:991.72ms
step:347/3000 train_time:334221ms step_avg:991.75ms
step:348/3000 train_time:335215ms step_avg:991.76ms
step:349/3000 train_time:336210ms step_avg:991.77ms
step:350/3000 train_time:337194ms step_avg:991.75ms
step:350/3000 val_loss:2.5841 train_time:337234ms step_avg:991.87ms perplexity:13.2511 param_count:85,137,462
step:351/3000 train_time:338181ms step_avg:991.73ms
step:352/3000 train_time:339180ms step_avg:991.75ms
step:353/3000 train_time:340168ms step_avg:991.74ms
step:354/3000 train_time:341168ms step_avg:991.77ms
step:355/3000 train_time:342153ms step_avg:991.75ms
step:356/3000 train_time:343139ms step_avg:991.73ms
step:357/3000 train_time:344132ms step_avg:991.73ms
step:358/3000 train_time:345122ms step_avg:991.73ms
step:359/3000 train_time:346110ms step_avg:991.72ms
step:360/3000 train_time:347108ms step_avg:991.74ms
step:361/3000 train_time:348096ms step_avg:991.73ms
step:362/3000 train_time:349084ms step_avg:991.72ms
step:363/3000 train_time:350079ms step_avg:991.73ms
step:364/3000 train_time:351070ms step_avg:991.72ms
step:365/3000 train_time:352067ms step_avg:991.74ms
step:366/3000 train_time:353062ms step_avg:991.75ms
step:367/3000 train_time:354053ms step_avg:991.74ms
step:368/3000 train_time:355037ms step_avg:991.72ms
step:369/3000 train_time:356030ms step_avg:991.73ms
step:370/3000 train_time:357011ms step_avg:991.70ms
step:371/3000 train_time:358010ms step_avg:991.72ms
step:372/3000 train_time:359004ms step_avg:991.72ms
step:373/3000 train_time:359990ms step_avg:991.71ms
step:374/3000 train_time:360985ms step_avg:991.72ms
step:375/3000 train_time:361977ms step_avg:991.72ms
step:375/3000 val_loss:2.5964 train_time:362018ms step_avg:991.83ms perplexity:13.4160 param_count:85,137,462
step:376/3000 train_time:362965ms step_avg:991.71ms
step:377/3000 train_time:363955ms step_avg:991.70ms
step:378/3000 train_time:364938ms step_avg:991.68ms
step:379/3000 train_time:365955ms step_avg:991.75ms
step:380/3000 train_time:366940ms step_avg:991.73ms
step:381/3000 train_time:367939ms step_avg:991.75ms
step:382/3000 train_time:368939ms step_avg:991.77ms
step:383/3000 train_time:369929ms step_avg:991.77ms
step:384/3000 train_time:370918ms step_avg:991.76ms
step:385/3000 train_time:371914ms step_avg:991.77ms
step:386/3000 train_time:372909ms step_avg:991.78ms
step:387/3000 train_time:373887ms step_avg:991.74ms
step:388/3000 train_time:374872ms step_avg:991.73ms
step:389/3000 train_time:375861ms step_avg:991.72ms
step:390/3000 train_time:376850ms step_avg:991.71ms
step:391/3000 train_time:377847ms step_avg:991.72ms
step:392/3000 train_time:378832ms step_avg:991.71ms
step:393/3000 train_time:379816ms step_avg:991.69ms
step:394/3000 train_time:380814ms step_avg:991.70ms
step:395/3000 train_time:381808ms step_avg:991.71ms
step:396/3000 train_time:382811ms step_avg:991.74ms
step:397/3000 train_time:383797ms step_avg:991.72ms
step:398/3000 train_time:384795ms step_avg:991.74ms
step:399/3000 train_time:385780ms step_avg:991.72ms
step:400/3000 train_time:386780ms step_avg:991.74ms
step:400/3000 val_loss:2.5936 train_time:386821ms step_avg:991.85ms perplexity:13.3785 param_count:85,137,462
step:401/3000 train_time:387764ms step_avg:991.72ms
step:402/3000 train_time:388758ms step_avg:991.73ms
step:403/3000 train_time:389753ms step_avg:991.74ms
step:404/3000 train_time:390749ms step_avg:991.75ms
step:405/3000 train_time:391727ms step_avg:991.71ms
step:406/3000 train_time:392726ms step_avg:991.73ms
step:407/3000 train_time:393719ms step_avg:991.74ms
step:408/3000 train_time:394709ms step_avg:991.73ms
step:409/3000 train_time:395692ms step_avg:991.71ms
step:410/3000 train_time:396682ms step_avg:991.71ms
step:411/3000 train_time:397684ms step_avg:991.73ms
step:412/3000 train_time:398683ms step_avg:991.75ms
step:413/3000 train_time:399692ms step_avg:991.79ms
step:414/3000 train_time:400678ms step_avg:991.78ms
step:415/3000 train_time:401674ms step_avg:991.79ms
step:416/3000 train_time:402683ms step_avg:991.83ms
step:417/3000 train_time:403677ms step_avg:991.84ms
step:418/3000 train_time:404665ms step_avg:991.83ms
step:419/3000 train_time:405649ms step_avg:991.81ms
step:420/3000 train_time:406643ms step_avg:991.81ms
step:421/3000 train_time:407639ms step_avg:991.82ms
step:422/3000 train_time:408636ms step_avg:991.84ms
step:423/3000 train_time:409633ms step_avg:991.85ms
step:424/3000 train_time:410627ms step_avg:991.85ms
step:425/3000 train_time:411621ms step_avg:991.86ms
step:425/3000 val_loss:2.5910 train_time:411660ms step_avg:991.95ms perplexity:13.3427 param_count:85,137,462
step:426/3000 train_time:412603ms step_avg:991.83ms
step:427/3000 train_time:413595ms step_avg:991.84ms
step:428/3000 train_time:414581ms step_avg:991.82ms
step:429/3000 train_time:415578ms step_avg:991.83ms
step:430/3000 train_time:416574ms step_avg:991.84ms
step:431/3000 train_time:417571ms step_avg:991.85ms
step:432/3000 train_time:418557ms step_avg:991.84ms
step:433/3000 train_time:419553ms step_avg:991.85ms
step:434/3000 train_time:420540ms step_avg:991.84ms
step:435/3000 train_time:421540ms step_avg:991.86ms
step:436/3000 train_time:422538ms step_avg:991.87ms
step:437/3000 train_time:423526ms step_avg:991.86ms
step:438/3000 train_time:424532ms step_avg:991.90ms
step:439/3000 train_time:425535ms step_avg:991.92ms
step:440/3000 train_time:426533ms step_avg:991.94ms
step:441/3000 train_time:427521ms step_avg:991.93ms
step:442/3000 train_time:428511ms step_avg:991.92ms
step:443/3000 train_time:429511ms step_avg:991.94ms
step:444/3000 train_time:430493ms step_avg:991.92ms
step:445/3000 train_time:431485ms step_avg:991.92ms
step:446/3000 train_time:432481ms step_avg:991.93ms
step:447/3000 train_time:433471ms step_avg:991.92ms
step:448/3000 train_time:434468ms step_avg:991.94ms
step:449/3000 train_time:435463ms step_avg:991.94ms
step:450/3000 train_time:436465ms step_avg:991.97ms
step:450/3000 val_loss:2.5977 train_time:436506ms step_avg:992.06ms perplexity:13.4325 param_count:85,137,462
step:451/3000 train_time:437468ms step_avg:991.99ms
step:452/3000 train_time:438468ms step_avg:992.01ms
step:453/3000 train_time:439453ms step_avg:991.99ms
step:454/3000 train_time:440442ms step_avg:991.99ms
step:455/3000 train_time:441441ms step_avg:992.00ms
step:456/3000 train_time:442433ms step_avg:992.00ms
step:457/3000 train_time:443439ms step_avg:992.03ms
step:458/3000 train_time:444428ms step_avg:992.03ms
step:459/3000 train_time:445416ms step_avg:992.02ms
step:460/3000 train_time:446407ms step_avg:992.02ms
step:461/3000 train_time:447396ms step_avg:992.01ms
step:462/3000 train_time:448392ms step_avg:992.02ms
step:463/3000 train_time:449383ms step_avg:992.02ms
step:464/3000 train_time:450365ms step_avg:991.99ms
step:465/3000 train_time:451350ms step_avg:991.98ms
step:466/3000 train_time:452351ms step_avg:992.00ms
step:467/3000 train_time:453341ms step_avg:991.99ms
step:468/3000 train_time:454336ms step_avg:992.00ms
step:469/3000 train_time:455322ms step_avg:991.99ms
step:470/3000 train_time:456315ms step_avg:991.99ms
step:471/3000 train_time:457299ms step_avg:991.97ms
step:472/3000 train_time:458282ms step_avg:991.95ms
step:473/3000 train_time:459278ms step_avg:991.96ms
step:474/3000 train_time:460275ms step_avg:991.97ms
step:475/3000 train_time:461265ms step_avg:991.97ms
step:475/3000 val_loss:2.5901 train_time:461304ms step_avg:992.05ms perplexity:13.3305 param_count:85,137,462
step:476/3000 train_time:462248ms step_avg:991.95ms
step:477/3000 train_time:463232ms step_avg:991.93ms
step:478/3000 train_time:464231ms step_avg:991.95ms
step:479/3000 train_time:465221ms step_avg:991.94ms
step:480/3000 train_time:466211ms step_avg:991.94ms
step:481/3000 train_time:467203ms step_avg:991.94ms
step:482/3000 train_time:468196ms step_avg:991.94ms
step:483/3000 train_time:469183ms step_avg:991.93ms
step:484/3000 train_time:470197ms step_avg:991.98ms
step:485/3000 train_time:471178ms step_avg:991.95ms
step:486/3000 train_time:472162ms step_avg:991.94ms
step:487/3000 train_time:473151ms step_avg:991.93ms
step:488/3000 train_time:474137ms step_avg:991.92ms
step:489/3000 train_time:475128ms step_avg:991.92ms
step:490/3000 train_time:476127ms step_avg:991.93ms
step:491/3000 train_time:477125ms step_avg:991.94ms
step:492/3000 train_time:478115ms step_avg:991.94ms
step:493/3000 train_time:479102ms step_avg:991.93ms
step:494/3000 train_time:480092ms step_avg:991.93ms
step:495/3000 train_time:481087ms step_avg:991.93ms
step:496/3000 train_time:482069ms step_avg:991.91ms
step:497/3000 train_time:483059ms step_avg:991.91ms
step:498/3000 train_time:484045ms step_avg:991.89ms
step:499/3000 train_time:485037ms step_avg:991.90ms
step:500/3000 train_time:486038ms step_avg:991.91ms
step:500/3000 val_loss:2.5974 train_time:486080ms step_avg:992.00ms perplexity:13.4283 param_count:85,137,462
step:501/3000 train_time:487012ms step_avg:991.88ms
step:502/3000 train_time:487999ms step_avg:991.87ms
step:503/3000 train_time:488988ms step_avg:991.86ms
step:504/3000 train_time:489981ms step_avg:991.86ms
step:505/3000 train_time:490982ms step_avg:991.88ms
step:506/3000 train_time:491987ms step_avg:991.91ms
step:507/3000 train_time:492973ms step_avg:991.90ms
step:508/3000 train_time:493963ms step_avg:991.89ms
step:509/3000 train_time:494955ms step_avg:991.89ms
step:510/3000 train_time:495938ms step_avg:991.88ms
step:511/3000 train_time:496932ms step_avg:991.88ms
step:512/3000 train_time:497938ms step_avg:991.91ms
step:513/3000 train_time:498949ms step_avg:991.95ms
step:514/3000 train_time:499949ms step_avg:991.96ms
step:515/3000 train_time:500941ms step_avg:991.96ms
step:516/3000 train_time:501941ms step_avg:991.98ms
step:517/3000 train_time:502938ms step_avg:991.99ms
step:518/3000 train_time:503933ms step_avg:991.99ms
step:519/3000 train_time:504922ms step_avg:991.99ms
step:520/3000 train_time:505917ms step_avg:991.99ms
step:521/3000 train_time:506912ms step_avg:992.00ms
step:522/3000 train_time:507908ms step_avg:992.01ms
step:523/3000 train_time:508899ms step_avg:992.01ms
step:524/3000 train_time:509896ms step_avg:992.02ms
step:525/3000 train_time:510887ms step_avg:992.01ms
step:525/3000 val_loss:2.5882 train_time:510927ms step_avg:992.09ms perplexity:13.3057 param_count:85,137,462
step:526/3000 train_time:511875ms step_avg:992.01ms
step:527/3000 train_time:512862ms step_avg:992.00ms
step:528/3000 train_time:513861ms step_avg:992.01ms
step:529/3000 train_time:514866ms step_avg:992.03ms
step:530/3000 train_time:515863ms step_avg:992.04ms
step:531/3000 train_time:516854ms step_avg:992.04ms
step:532/3000 train_time:517847ms step_avg:992.04ms
step:533/3000 train_time:518839ms step_avg:992.04ms
step:534/3000 train_time:519824ms step_avg:992.03ms
step:535/3000 train_time:520818ms step_avg:992.03ms
step:536/3000 train_time:521798ms step_avg:992.01ms
step:537/3000 train_time:522783ms step_avg:992.00ms
step:538/3000 train_time:523763ms step_avg:991.97ms
step:539/3000 train_time:524762ms step_avg:991.99ms
step:540/3000 train_time:525754ms step_avg:991.99ms
step:541/3000 train_time:526753ms step_avg:992.00ms
step:542/3000 train_time:527744ms step_avg:992.00ms
step:543/3000 train_time:528748ms step_avg:992.02ms
step:544/3000 train_time:529733ms step_avg:992.01ms
step:545/3000 train_time:530724ms step_avg:992.01ms
step:546/3000 train_time:531722ms step_avg:992.02ms
step:547/3000 train_time:532714ms step_avg:992.02ms
step:548/3000 train_time:533702ms step_avg:992.01ms
step:549/3000 train_time:534699ms step_avg:992.02ms
step:550/3000 train_time:535687ms step_avg:992.01ms
step:550/3000 val_loss:2.5817 train_time:535728ms step_avg:992.09ms perplexity:13.2193 param_count:85,137,462
step:551/3000 train_time:536674ms step_avg:992.00ms
step:552/3000 train_time:537664ms step_avg:992.00ms
step:553/3000 train_time:538657ms step_avg:992.00ms
step:554/3000 train_time:539658ms step_avg:992.02ms
step:555/3000 train_time:540635ms step_avg:991.99ms
step:556/3000 train_time:541654ms step_avg:992.04ms
step:557/3000 train_time:542655ms step_avg:992.06ms
step:558/3000 train_time:543652ms step_avg:992.07ms
step:559/3000 train_time:544662ms step_avg:992.10ms
step:560/3000 train_time:545669ms step_avg:992.12ms
step:561/3000 train_time:546670ms step_avg:992.14ms
step:562/3000 train_time:547662ms step_avg:992.14ms
step:563/3000 train_time:548653ms step_avg:992.14ms
step:564/3000 train_time:549654ms step_avg:992.16ms
step:565/3000 train_time:550635ms step_avg:992.14ms
step:566/3000 train_time:551635ms step_avg:992.15ms
step:567/3000 train_time:552646ms step_avg:992.18ms
step:568/3000 train_time:553645ms step_avg:992.20ms
step:569/3000 train_time:554640ms step_avg:992.20ms
step:570/3000 train_time:555635ms step_avg:992.21ms
step:571/3000 train_time:556624ms step_avg:992.20ms
step:572/3000 train_time:557621ms step_avg:992.21ms
step:573/3000 train_time:558617ms step_avg:992.21ms
step:574/3000 train_time:559613ms step_avg:992.22ms
step:575/3000 train_time:560606ms step_avg:992.22ms
step:575/3000 val_loss:2.5812 train_time:560648ms step_avg:992.30ms perplexity:13.2133 param_count:85,137,462
step:576/3000 train_time:561589ms step_avg:992.21ms
step:577/3000 train_time:562595ms step_avg:992.23ms
step:578/3000 train_time:563590ms step_avg:992.24ms
step:579/3000 train_time:564588ms step_avg:992.25ms
step:580/3000 train_time:565589ms step_avg:992.26ms
step:581/3000 train_time:566596ms step_avg:992.29ms
step:582/3000 train_time:567635ms step_avg:992.37ms
step:583/3000 train_time:568631ms step_avg:992.37ms
step:584/3000 train_time:569618ms step_avg:992.37ms
step:585/3000 train_time:570613ms step_avg:992.37ms
step:586/3000 train_time:571607ms step_avg:992.37ms
step:587/3000 train_time:572602ms step_avg:992.38ms
step:588/3000 train_time:573598ms step_avg:992.38ms
step:589/3000 train_time:574597ms step_avg:992.40ms
step:590/3000 train_time:575589ms step_avg:992.39ms
step:591/3000 train_time:576600ms step_avg:992.43ms
step:592/3000 train_time:577602ms step_avg:992.44ms
step:593/3000 train_time:578595ms step_avg:992.44ms
step:594/3000 train_time:579588ms step_avg:992.44ms
step:595/3000 train_time:580572ms step_avg:992.43ms
step:596/3000 train_time:581565ms step_avg:992.43ms
step:597/3000 train_time:582548ms step_avg:992.42ms
step:598/3000 train_time:583548ms step_avg:992.43ms
step:599/3000 train_time:584552ms step_avg:992.45ms
step:600/3000 train_time:585553ms step_avg:992.46ms
step:600/3000 val_loss:2.5913 train_time:585593ms step_avg:992.53ms perplexity:13.3468 param_count:85,137,462
step:601/3000 train_time:586533ms step_avg:992.44ms
step:602/3000 train_time:587533ms step_avg:992.45ms
step:603/3000 train_time:588540ms step_avg:992.48ms
step:604/3000 train_time:589532ms step_avg:992.48ms
step:605/3000 train_time:590528ms step_avg:992.48ms
step:606/3000 train_time:591541ms step_avg:992.52ms
step:607/3000 train_time:592528ms step_avg:992.51ms
step:608/3000 train_time:593524ms step_avg:992.52ms
step:609/3000 train_time:594528ms step_avg:992.53ms
step:610/3000 train_time:595519ms step_avg:992.53ms
step:611/3000 train_time:596510ms step_avg:992.53ms
step:612/3000 train_time:597505ms step_avg:992.53ms
step:613/3000 train_time:598487ms step_avg:992.52ms
step:614/3000 train_time:599483ms step_avg:992.52ms
step:615/3000 train_time:600473ms step_avg:992.52ms
step:616/3000 train_time:601462ms step_avg:992.51ms
step:617/3000 train_time:602461ms step_avg:992.52ms
step:618/3000 train_time:603453ms step_avg:992.52ms
step:619/3000 train_time:604449ms step_avg:992.53ms
step:620/3000 train_time:605433ms step_avg:992.51ms
step:621/3000 train_time:606424ms step_avg:992.51ms
step:622/3000 train_time:607417ms step_avg:992.51ms
step:623/3000 train_time:608406ms step_avg:992.51ms
step:624/3000 train_time:609402ms step_avg:992.51ms
step:625/3000 train_time:610385ms step_avg:992.50ms
step:625/3000 val_loss:2.5798 train_time:610427ms step_avg:992.56ms perplexity:13.1942 param_count:85,137,462
step:626/3000 train_time:611376ms step_avg:992.49ms
step:627/3000 train_time:612367ms step_avg:992.49ms
step:628/3000 train_time:613355ms step_avg:992.48ms
step:629/3000 train_time:614351ms step_avg:992.49ms
step:630/3000 train_time:615343ms step_avg:992.49ms
step:631/3000 train_time:616330ms step_avg:992.48ms
step:632/3000 train_time:617320ms step_avg:992.48ms
step:633/3000 train_time:618330ms step_avg:992.50ms
step:634/3000 train_time:619322ms step_avg:992.50ms
step:635/3000 train_time:620313ms step_avg:992.50ms
step:636/3000 train_time:621310ms step_avg:992.51ms
step:637/3000 train_time:622300ms step_avg:992.50ms
step:638/3000 train_time:623283ms step_avg:992.49ms
step:639/3000 train_time:624277ms step_avg:992.49ms
step:640/3000 train_time:625271ms step_avg:992.49ms
step:641/3000 train_time:626267ms step_avg:992.50ms
step:642/3000 train_time:627262ms step_avg:992.50ms
step:643/3000 train_time:628247ms step_avg:992.49ms
step:644/3000 train_time:629232ms step_avg:992.48ms
step:645/3000 train_time:630237ms step_avg:992.50ms
step:646/3000 train_time:631230ms step_avg:992.50ms
step:647/3000 train_time:632225ms step_avg:992.50ms
step:648/3000 train_time:633226ms step_avg:992.52ms
step:649/3000 train_time:634222ms step_avg:992.52ms
step:650/3000 train_time:635216ms step_avg:992.52ms
step:650/3000 val_loss:2.5856 train_time:635255ms step_avg:992.59ms perplexity:13.2709 param_count:85,137,462
step:651/3000 train_time:636205ms step_avg:992.52ms
step:652/3000 train_time:637192ms step_avg:992.51ms
step:653/3000 train_time:638185ms step_avg:992.51ms
step:654/3000 train_time:639189ms step_avg:992.53ms
step:655/3000 train_time:640180ms step_avg:992.53ms
step:656/3000 train_time:641166ms step_avg:992.52ms
step:657/3000 train_time:642162ms step_avg:992.52ms
step:658/3000 train_time:643159ms step_avg:992.53ms
step:659/3000 train_time:644197ms step_avg:992.60ms
step:660/3000 train_time:645196ms step_avg:992.61ms
step:661/3000 train_time:646182ms step_avg:992.60ms
step:662/3000 train_time:647178ms step_avg:992.60ms
step:663/3000 train_time:648167ms step_avg:992.60ms
step:664/3000 train_time:649162ms step_avg:992.60ms
step:665/3000 train_time:650160ms step_avg:992.61ms
step:666/3000 train_time:651150ms step_avg:992.61ms
step:667/3000 train_time:652162ms step_avg:992.64ms
step:668/3000 train_time:653153ms step_avg:992.63ms
step:669/3000 train_time:654142ms step_avg:992.63ms
step:670/3000 train_time:655133ms step_avg:992.63ms
step:671/3000 train_time:656120ms step_avg:992.62ms
step:672/3000 train_time:657103ms step_avg:992.60ms
step:673/3000 train_time:658093ms step_avg:992.60ms
step:674/3000 train_time:659095ms step_avg:992.61ms
step:675/3000 train_time:660086ms step_avg:992.61ms
step:675/3000 val_loss:2.5814 train_time:660127ms step_avg:992.67ms perplexity:13.2159 param_count:85,137,462
step:676/3000 train_time:661084ms step_avg:992.62ms
step:677/3000 train_time:662071ms step_avg:992.61ms
step:678/3000 train_time:663055ms step_avg:992.60ms
step:679/3000 train_time:664055ms step_avg:992.61ms
step:680/3000 train_time:665059ms step_avg:992.62ms
step:681/3000 train_time:666043ms step_avg:992.61ms
step:682/3000 train_time:667031ms step_avg:992.61ms
step:683/3000 train_time:668027ms step_avg:992.61ms
step:684/3000 train_time:669022ms step_avg:992.61ms
step:685/3000 train_time:670009ms step_avg:992.61ms
step:686/3000 train_time:671002ms step_avg:992.61ms
step:687/3000 train_time:671988ms step_avg:992.60ms
step:688/3000 train_time:672980ms step_avg:992.60ms
step:689/3000 train_time:673989ms step_avg:992.62ms
step:690/3000 train_time:674974ms step_avg:992.61ms
step:691/3000 train_time:675963ms step_avg:992.60ms
step:692/3000 train_time:676957ms step_avg:992.61ms
step:693/3000 train_time:677945ms step_avg:992.60ms
step:694/3000 train_time:678934ms step_avg:992.59ms
step:695/3000 train_time:679931ms step_avg:992.60ms
step:696/3000 train_time:680910ms step_avg:992.58ms
step:697/3000 train_time:681900ms step_avg:992.58ms
step:698/3000 train_time:682888ms step_avg:992.57ms
step:699/3000 train_time:683876ms step_avg:992.56ms
step:700/3000 train_time:684873ms step_avg:992.57ms
step:700/3000 val_loss:2.5794 train_time:684913ms step_avg:992.63ms perplexity:13.1894 param_count:85,137,462
step:701/3000 train_time:685867ms step_avg:992.57ms
step:702/3000 train_time:686855ms step_avg:992.56ms
step:703/3000 train_time:687850ms step_avg:992.57ms
step:704/3000 train_time:688845ms step_avg:992.57ms
step:705/3000 train_time:689835ms step_avg:992.57ms
step:706/3000 train_time:690832ms step_avg:992.57ms
step:707/3000 train_time:691827ms step_avg:992.58ms
step:708/3000 train_time:692815ms step_avg:992.57ms
step:709/3000 train_time:693804ms step_avg:992.57ms
step:710/3000 train_time:694798ms step_avg:992.57ms
step:711/3000 train_time:695793ms step_avg:992.57ms
step:712/3000 train_time:696793ms step_avg:992.58ms
step:713/3000 train_time:697789ms step_avg:992.59ms
step:714/3000 train_time:698783ms step_avg:992.59ms
step:715/3000 train_time:699768ms step_avg:992.58ms
step:716/3000 train_time:700760ms step_avg:992.58ms
step:717/3000 train_time:701766ms step_avg:992.60ms
step:718/3000 train_time:702774ms step_avg:992.62ms
step:719/3000 train_time:703758ms step_avg:992.61ms
step:720/3000 train_time:704758ms step_avg:992.62ms
step:721/3000 train_time:705758ms step_avg:992.63ms
step:722/3000 train_time:706786ms step_avg:992.68ms
step:723/3000 train_time:707781ms step_avg:992.68ms
step:724/3000 train_time:708795ms step_avg:992.71ms
step:725/3000 train_time:709784ms step_avg:992.70ms
step:725/3000 val_loss:2.5799 train_time:709824ms step_avg:992.76ms perplexity:13.1955 param_count:85,137,462
step:726/3000 train_time:710778ms step_avg:992.71ms
step:727/3000 train_time:711769ms step_avg:992.70ms
step:728/3000 train_time:712772ms step_avg:992.72ms
step:729/3000 train_time:713773ms step_avg:992.73ms
step:730/3000 train_time:714766ms step_avg:992.73ms
step:731/3000 train_time:715753ms step_avg:992.72ms
step:732/3000 train_time:716748ms step_avg:992.73ms
step:733/3000 train_time:717746ms step_avg:992.73ms
step:734/3000 train_time:718738ms step_avg:992.73ms
step:735/3000 train_time:719732ms step_avg:992.73ms
step:736/3000 train_time:720731ms step_avg:992.74ms
step:737/3000 train_time:721720ms step_avg:992.74ms
step:738/3000 train_time:722717ms step_avg:992.74ms
step:739/3000 train_time:723708ms step_avg:992.74ms
step:740/3000 train_time:724696ms step_avg:992.73ms
step:741/3000 train_time:725691ms step_avg:992.74ms
step:742/3000 train_time:726686ms step_avg:992.74ms
step:743/3000 train_time:727674ms step_avg:992.73ms
step:744/3000 train_time:728665ms step_avg:992.73ms
step:745/3000 train_time:729649ms step_avg:992.72ms
step:746/3000 train_time:730647ms step_avg:992.73ms
step:747/3000 train_time:731633ms step_avg:992.72ms
step:748/3000 train_time:732626ms step_avg:992.72ms
step:749/3000 train_time:733626ms step_avg:992.73ms
step:750/3000 train_time:734610ms step_avg:992.72ms
step:750/3000 val_loss:2.5809 train_time:734650ms step_avg:992.77ms perplexity:13.2095 param_count:85,137,462
step:751/3000 train_time:735594ms step_avg:992.70ms
step:752/3000 train_time:736589ms step_avg:992.71ms
step:753/3000 train_time:737577ms step_avg:992.70ms
step:754/3000 train_time:738567ms step_avg:992.70ms
step:755/3000 train_time:739566ms step_avg:992.71ms
step:756/3000 train_time:740571ms step_avg:992.72ms
step:757/3000 train_time:741569ms step_avg:992.73ms
step:758/3000 train_time:742568ms step_avg:992.74ms
step:759/3000 train_time:743560ms step_avg:992.74ms
step:760/3000 train_time:744553ms step_avg:992.74ms
step:761/3000 train_time:745592ms step_avg:992.80ms
step:762/3000 train_time:746598ms step_avg:992.82ms
step:763/3000 train_time:747593ms step_avg:992.82ms
step:764/3000 train_time:748584ms step_avg:992.82ms
step:765/3000 train_time:749568ms step_avg:992.80ms
step:766/3000 train_time:750563ms step_avg:992.81ms
step:767/3000 train_time:751551ms step_avg:992.80ms
step:768/3000 train_time:752552ms step_avg:992.81ms
step:769/3000 train_time:753557ms step_avg:992.83ms
step:770/3000 train_time:754567ms step_avg:992.85ms
step:771/3000 train_time:755557ms step_avg:992.85ms
step:772/3000 train_time:756552ms step_avg:992.85ms
step:773/3000 train_time:757527ms step_avg:992.83ms
step:774/3000 train_time:758516ms step_avg:992.82ms
step:775/3000 train_time:759507ms step_avg:992.82ms
step:775/3000 val_loss:2.5791 train_time:759547ms step_avg:992.87ms perplexity:13.1848 param_count:85,137,462
step:776/3000 train_time:760496ms step_avg:992.81ms
step:777/3000 train_time:761494ms step_avg:992.82ms
step:778/3000 train_time:762485ms step_avg:992.82ms
step:779/3000 train_time:763472ms step_avg:992.81ms
step:780/3000 train_time:764458ms step_avg:992.80ms
step:781/3000 train_time:765462ms step_avg:992.82ms
step:782/3000 train_time:766454ms step_avg:992.82ms
step:783/3000 train_time:767440ms step_avg:992.81ms
step:784/3000 train_time:768436ms step_avg:992.81ms
step:785/3000 train_time:769431ms step_avg:992.81ms
step:786/3000 train_time:770416ms step_avg:992.80ms
step:787/3000 train_time:771412ms step_avg:992.81ms
step:788/3000 train_time:772402ms step_avg:992.80ms
step:789/3000 train_time:773405ms step_avg:992.82ms
step:790/3000 train_time:774399ms step_avg:992.82ms
step:791/3000 train_time:775386ms step_avg:992.81ms
step:792/3000 train_time:776384ms step_avg:992.82ms
step:793/3000 train_time:777375ms step_avg:992.82ms
step:794/3000 train_time:778363ms step_avg:992.81ms
step:795/3000 train_time:779353ms step_avg:992.81ms
step:796/3000 train_time:780351ms step_avg:992.81ms
step:797/3000 train_time:781346ms step_avg:992.82ms
step:798/3000 train_time:782332ms step_avg:992.81ms
step:799/3000 train_time:783325ms step_avg:992.81ms
step:800/3000 train_time:784322ms step_avg:992.81ms
step:800/3000 val_loss:2.5709 train_time:784363ms step_avg:992.86ms perplexity:13.0775 param_count:85,137,462
step:801/3000 train_time:785313ms step_avg:992.81ms
step:802/3000 train_time:786306ms step_avg:992.81ms
step:803/3000 train_time:787293ms step_avg:992.80ms
step:804/3000 train_time:788275ms step_avg:992.79ms
step:805/3000 train_time:789271ms step_avg:992.79ms
step:806/3000 train_time:790263ms step_avg:992.79ms
step:807/3000 train_time:791255ms step_avg:992.79ms
step:808/3000 train_time:792247ms step_avg:992.79ms
step:809/3000 train_time:793239ms step_avg:992.79ms
step:810/3000 train_time:794235ms step_avg:992.79ms
step:811/3000 train_time:795217ms step_avg:992.78ms
step:812/3000 train_time:796209ms step_avg:992.78ms
step:813/3000 train_time:797202ms step_avg:992.78ms
step:814/3000 train_time:798187ms step_avg:992.77ms
step:815/3000 train_time:799186ms step_avg:992.78ms
step:816/3000 train_time:800177ms step_avg:992.78ms
step:817/3000 train_time:801182ms step_avg:992.79ms
step:818/3000 train_time:802174ms step_avg:992.79ms
step:819/3000 train_time:803169ms step_avg:992.79ms
step:820/3000 train_time:804164ms step_avg:992.79ms
step:821/3000 train_time:805175ms step_avg:992.82ms
step:822/3000 train_time:806176ms step_avg:992.83ms
step:823/3000 train_time:807169ms step_avg:992.83ms
step:824/3000 train_time:808164ms step_avg:992.83ms
step:825/3000 train_time:809164ms step_avg:992.84ms
step:825/3000 val_loss:2.5760 train_time:809205ms step_avg:992.89ms perplexity:13.1446 param_count:85,137,462
step:826/3000 train_time:810150ms step_avg:992.83ms
step:827/3000 train_time:811142ms step_avg:992.83ms
step:828/3000 train_time:812127ms step_avg:992.82ms
step:829/3000 train_time:813125ms step_avg:992.83ms
step:830/3000 train_time:814120ms step_avg:992.83ms
step:831/3000 train_time:815109ms step_avg:992.82ms
step:832/3000 train_time:816107ms step_avg:992.83ms
step:833/3000 train_time:817109ms step_avg:992.84ms
step:834/3000 train_time:818105ms step_avg:992.85ms
step:835/3000 train_time:819088ms step_avg:992.83ms
step:836/3000 train_time:820087ms step_avg:992.84ms
step:837/3000 train_time:821075ms step_avg:992.84ms
step:838/3000 train_time:822064ms step_avg:992.83ms
step:839/3000 train_time:823058ms step_avg:992.83ms
step:840/3000 train_time:824053ms step_avg:992.83ms
step:841/3000 train_time:825050ms step_avg:992.84ms
step:842/3000 train_time:826042ms step_avg:992.84ms
step:843/3000 train_time:827023ms step_avg:992.83ms
step:844/3000 train_time:828018ms step_avg:992.83ms
step:845/3000 train_time:829005ms step_avg:992.82ms
step:846/3000 train_time:830000ms step_avg:992.82ms
step:847/3000 train_time:830998ms step_avg:992.83ms
step:848/3000 train_time:832005ms step_avg:992.85ms
step:849/3000 train_time:833001ms step_avg:992.85ms
step:850/3000 train_time:834003ms step_avg:992.86ms
step:850/3000 val_loss:2.5737 train_time:834045ms step_avg:992.91ms perplexity:13.1140 param_count:85,137,462
step:851/3000 train_time:835001ms step_avg:992.87ms
step:852/3000 train_time:835996ms step_avg:992.87ms
step:853/3000 train_time:836982ms step_avg:992.86ms
step:854/3000 train_time:837980ms step_avg:992.87ms
step:855/3000 train_time:838970ms step_avg:992.86ms
step:856/3000 train_time:839968ms step_avg:992.87ms
step:857/3000 train_time:840967ms step_avg:992.88ms
step:858/3000 train_time:841965ms step_avg:992.88ms
step:859/3000 train_time:842959ms step_avg:992.88ms
step:860/3000 train_time:843954ms step_avg:992.89ms
step:861/3000 train_time:844937ms step_avg:992.87ms
step:862/3000 train_time:845928ms step_avg:992.87ms
step:863/3000 train_time:846924ms step_avg:992.88ms
step:864/3000 train_time:847915ms step_avg:992.87ms
step:865/3000 train_time:848901ms step_avg:992.87ms
step:866/3000 train_time:849892ms step_avg:992.87ms
step:867/3000 train_time:850883ms step_avg:992.86ms
step:868/3000 train_time:851875ms step_avg:992.86ms
step:869/3000 train_time:852878ms step_avg:992.87ms
step:870/3000 train_time:853864ms step_avg:992.86ms
step:871/3000 train_time:854856ms step_avg:992.86ms
step:872/3000 train_time:855853ms step_avg:992.87ms
step:873/3000 train_time:856853ms step_avg:992.88ms
step:874/3000 train_time:857841ms step_avg:992.87ms
step:875/3000 train_time:858839ms step_avg:992.88ms
step:875/3000 val_loss:2.5850 train_time:858879ms step_avg:992.92ms perplexity:13.2632 param_count:85,137,462
step:876/3000 train_time:859829ms step_avg:992.87ms
step:877/3000 train_time:860840ms step_avg:992.90ms
step:878/3000 train_time:861831ms step_avg:992.89ms
step:879/3000 train_time:862823ms step_avg:992.89ms
step:880/3000 train_time:863817ms step_avg:992.89ms
step:881/3000 train_time:864816ms step_avg:992.90ms
step:882/3000 train_time:865810ms step_avg:992.90ms
step:883/3000 train_time:866799ms step_avg:992.90ms
step:884/3000 train_time:867784ms step_avg:992.89ms
step:885/3000 train_time:868782ms step_avg:992.89ms
step:886/3000 train_time:869772ms step_avg:992.89ms
step:887/3000 train_time:870790ms step_avg:992.92ms
step:888/3000 train_time:871779ms step_avg:992.91ms
step:889/3000 train_time:872780ms step_avg:992.92ms
step:890/3000 train_time:873780ms step_avg:992.93ms
step:891/3000 train_time:874771ms step_avg:992.93ms
step:892/3000 train_time:875769ms step_avg:992.94ms
step:893/3000 train_time:876759ms step_avg:992.93ms
step:894/3000 train_time:877759ms step_avg:992.94ms
step:895/3000 train_time:878782ms step_avg:992.97ms
step:896/3000 train_time:879774ms step_avg:992.97ms
step:897/3000 train_time:880766ms step_avg:992.97ms
step:898/3000 train_time:881777ms step_avg:992.99ms
step:899/3000 train_time:882771ms step_avg:992.99ms
step:900/3000 train_time:883758ms step_avg:992.99ms
step:900/3000 val_loss:2.5623 train_time:883798ms step_avg:993.03ms perplexity:12.9661 param_count:85,137,462
step:901/3000 train_time:884743ms step_avg:992.98ms
step:902/3000 train_time:885733ms step_avg:992.97ms
step:903/3000 train_time:886728ms step_avg:992.98ms
step:904/3000 train_time:887735ms step_avg:992.99ms
step:905/3000 train_time:888730ms step_avg:992.99ms
step:906/3000 train_time:889741ms step_avg:993.01ms
step:907/3000 train_time:890745ms step_avg:993.03ms
step:908/3000 train_time:891734ms step_avg:993.02ms
step:909/3000 train_time:892724ms step_avg:993.02ms
step:910/3000 train_time:893720ms step_avg:993.02ms
step:911/3000 train_time:894712ms step_avg:993.02ms
step:912/3000 train_time:895713ms step_avg:993.03ms
step:913/3000 train_time:896697ms step_avg:993.02ms
step:914/3000 train_time:897692ms step_avg:993.02ms
step:915/3000 train_time:898683ms step_avg:993.02ms
step:916/3000 train_time:899675ms step_avg:993.02ms
step:917/3000 train_time:900677ms step_avg:993.03ms
step:918/3000 train_time:901669ms step_avg:993.03ms
step:919/3000 train_time:902666ms step_avg:993.03ms
step:920/3000 train_time:903673ms step_avg:993.05ms
step:921/3000 train_time:904674ms step_avg:993.06ms
step:922/3000 train_time:905668ms step_avg:993.06ms
step:923/3000 train_time:906663ms step_avg:993.06ms
step:924/3000 train_time:907654ms step_avg:993.06ms
step:925/3000 train_time:908647ms step_avg:993.06ms
step:925/3000 val_loss:2.5735 train_time:908686ms step_avg:993.10ms perplexity:13.1112 param_count:85,137,462
step:926/3000 train_time:909649ms step_avg:993.07ms
step:927/3000 train_time:910638ms step_avg:993.06ms
step:928/3000 train_time:911637ms step_avg:993.07ms
step:929/3000 train_time:912654ms step_avg:993.10ms
step:930/3000 train_time:913647ms step_avg:993.09ms
step:931/3000 train_time:914651ms step_avg:993.11ms
step:932/3000 train_time:915648ms step_avg:993.11ms
step:933/3000 train_time:916645ms step_avg:993.11ms
step:934/3000 train_time:917640ms step_avg:993.12ms
step:935/3000 train_time:918633ms step_avg:993.12ms
step:936/3000 train_time:919635ms step_avg:993.13ms
step:937/3000 train_time:920643ms step_avg:993.14ms
step:938/3000 train_time:921636ms step_avg:993.14ms
step:939/3000 train_time:922628ms step_avg:993.14ms
step:940/3000 train_time:923625ms step_avg:993.14ms
step:941/3000 train_time:924626ms step_avg:993.15ms
step:942/3000 train_time:925618ms step_avg:993.15ms
step:943/3000 train_time:926617ms step_avg:993.16ms
step:944/3000 train_time:927619ms step_avg:993.17ms
step:945/3000 train_time:928605ms step_avg:993.16ms
step:946/3000 train_time:929600ms step_avg:993.16ms
step:947/3000 train_time:930601ms step_avg:993.17ms
step:948/3000 train_time:931591ms step_avg:993.17ms
step:949/3000 train_time:932584ms step_avg:993.17ms
step:950/3000 train_time:933588ms step_avg:993.18ms
step:950/3000 val_loss:2.5750 train_time:933627ms step_avg:993.22ms perplexity:13.1318 param_count:85,137,462
step:951/3000 train_time:934571ms step_avg:993.17ms
step:952/3000 train_time:935571ms step_avg:993.17ms
step:953/3000 train_time:936570ms step_avg:993.18ms
step:954/3000 train_time:937567ms step_avg:993.19ms
step:955/3000 train_time:938566ms step_avg:993.19ms
step:956/3000 train_time:939568ms step_avg:993.20ms
step:957/3000 train_time:940570ms step_avg:993.21ms
step:958/3000 train_time:941570ms step_avg:993.22ms
step:959/3000 train_time:942570ms step_avg:993.22ms
step:960/3000 train_time:943562ms step_avg:993.22ms
step:961/3000 train_time:944553ms step_avg:993.22ms
step:962/3000 train_time:945543ms step_avg:993.22ms
step:963/3000 train_time:946564ms step_avg:993.25ms
step:964/3000 train_time:947558ms step_avg:993.25ms
step:965/3000 train_time:948552ms step_avg:993.25ms
step:966/3000 train_time:949540ms step_avg:993.24ms
step:967/3000 train_time:950525ms step_avg:993.23ms
step:968/3000 train_time:951510ms step_avg:993.23ms
step:969/3000 train_time:952502ms step_avg:993.22ms
step:970/3000 train_time:953506ms step_avg:993.24ms
step:971/3000 train_time:954502ms step_avg:993.24ms
step:972/3000 train_time:955502ms step_avg:993.25ms
step:973/3000 train_time:956503ms step_avg:993.25ms
step:974/3000 train_time:957507ms step_avg:993.26ms
step:975/3000 train_time:958496ms step_avg:993.26ms
step:975/3000 val_loss:2.5759 train_time:958536ms step_avg:993.30ms perplexity:13.1427 param_count:85,137,462
step:976/3000 train_time:959468ms step_avg:993.24ms
step:977/3000 train_time:960460ms step_avg:993.24ms
step:978/3000 train_time:961451ms step_avg:993.23ms
step:979/3000 train_time:962446ms step_avg:993.24ms
step:980/3000 train_time:963438ms step_avg:993.23ms
step:981/3000 train_time:964429ms step_avg:993.23ms
step:982/3000 train_time:965418ms step_avg:993.23ms
step:983/3000 train_time:966411ms step_avg:993.23ms
step:984/3000 train_time:967403ms step_avg:993.23ms
step:985/3000 train_time:968394ms step_avg:993.22ms
step:986/3000 train_time:969381ms step_avg:993.22ms
step:987/3000 train_time:970376ms step_avg:993.22ms
step:988/3000 train_time:971377ms step_avg:993.23ms
step:989/3000 train_time:972382ms step_avg:993.24ms
step:990/3000 train_time:973385ms step_avg:993.25ms
step:991/3000 train_time:974417ms step_avg:993.29ms
step:992/3000 train_time:975436ms step_avg:993.32ms
step:993/3000 train_time:976436ms step_avg:993.32ms
step:994/3000 train_time:977428ms step_avg:993.32ms
step:995/3000 train_time:978417ms step_avg:993.32ms
step:996/3000 train_time:979405ms step_avg:993.31ms
step:997/3000 train_time:980400ms step_avg:993.31ms
step:998/3000 train_time:981397ms step_avg:993.32ms
step:999/3000 train_time:982394ms step_avg:993.32ms
step:1000/3000 train_time:983377ms step_avg:993.31ms
step:1000/3000 val_loss:2.5747 train_time:983417ms step_avg:993.35ms perplexity:13.1275 param_count:85,137,462
step:1001/3000 train_time:984364ms step_avg:993.30ms
step:1002/3000 train_time:985396ms step_avg:993.34ms
step:1003/3000 train_time:986391ms step_avg:993.34ms
step:1004/3000 train_time:987396ms step_avg:993.36ms
step:1005/3000 train_time:988386ms step_avg:993.35ms
step:1006/3000 train_time:989384ms step_avg:993.36ms
step:1007/3000 train_time:990377ms step_avg:993.36ms
step:1008/3000 train_time:991372ms step_avg:993.36ms
step:1009/3000 train_time:992386ms step_avg:993.38ms
step:1010/3000 train_time:993384ms step_avg:993.38ms
step:1011/3000 train_time:994395ms step_avg:993.40ms
step:1012/3000 train_time:995395ms step_avg:993.41ms
step:1013/3000 train_time:996401ms step_avg:993.42ms
step:1014/3000 train_time:997405ms step_avg:993.43ms
step:1015/3000 train_time:998406ms step_avg:993.44ms
step:1016/3000 train_time:999405ms step_avg:993.44ms
step:1017/3000 train_time:1000409ms step_avg:993.45ms
step:1018/3000 train_time:1001400ms step_avg:993.45ms
step:1019/3000 train_time:1002398ms step_avg:993.46ms
step:1020/3000 train_time:1003412ms step_avg:993.48ms
step:1021/3000 train_time:1004402ms step_avg:993.47ms
step:1022/3000 train_time:1005398ms step_avg:993.48ms
step:1023/3000 train_time:1006400ms step_avg:993.48ms
step:1024/3000 train_time:1007397ms step_avg:993.49ms
step:1025/3000 train_time:1008403ms step_avg:993.50ms
step:1025/3000 val_loss:2.5699 train_time:1008443ms step_avg:993.54ms perplexity:13.0645 param_count:85,137,462
step:1026/3000 train_time:1009389ms step_avg:993.49ms
step:1027/3000 train_time:1010372ms step_avg:993.48ms
step:1028/3000 train_time:1011370ms step_avg:993.49ms
step:1029/3000 train_time:1012369ms step_avg:993.49ms
step:1030/3000 train_time:1013353ms step_avg:993.48ms
step:1031/3000 train_time:1014349ms step_avg:993.49ms
step:1032/3000 train_time:1015348ms step_avg:993.49ms
step:1033/3000 train_time:1016353ms step_avg:993.50ms
step:1034/3000 train_time:1017345ms step_avg:993.50ms
step:1035/3000 train_time:1018351ms step_avg:993.51ms
step:1036/3000 train_time:1019343ms step_avg:993.51ms
step:1037/3000 train_time:1020356ms step_avg:993.53ms
step:1038/3000 train_time:1021359ms step_avg:993.54ms
step:1039/3000 train_time:1022366ms step_avg:993.55ms
step:1040/3000 train_time:1023372ms step_avg:993.57ms
step:1041/3000 train_time:1024365ms step_avg:993.56ms
step:1042/3000 train_time:1025356ms step_avg:993.56ms
step:1043/3000 train_time:1026351ms step_avg:993.56ms
step:1044/3000 train_time:1027347ms step_avg:993.57ms
step:1045/3000 train_time:1028330ms step_avg:993.56ms
step:1046/3000 train_time:1029327ms step_avg:993.56ms
step:1047/3000 train_time:1030328ms step_avg:993.57ms
step:1048/3000 train_time:1031320ms step_avg:993.56ms
step:1049/3000 train_time:1032319ms step_avg:993.57ms
step:1050/3000 train_time:1033314ms step_avg:993.57ms
step:1050/3000 val_loss:2.5814 train_time:1033355ms step_avg:993.61ms perplexity:13.2162 param_count:85,137,462
step:1051/3000 train_time:1034296ms step_avg:993.56ms
step:1052/3000 train_time:1035295ms step_avg:993.57ms
step:1053/3000 train_time:1036296ms step_avg:993.57ms
step:1054/3000 train_time:1037290ms step_avg:993.57ms
step:1055/3000 train_time:1038278ms step_avg:993.57ms
step:1056/3000 train_time:1039275ms step_avg:993.57ms
step:1057/3000 train_time:1040270ms step_avg:993.57ms
step:1058/3000 train_time:1041251ms step_avg:993.56ms
step:1059/3000 train_time:1042248ms step_avg:993.56ms
step:1060/3000 train_time:1043241ms step_avg:993.56ms
step:1061/3000 train_time:1044236ms step_avg:993.56ms
step:1062/3000 train_time:1045228ms step_avg:993.56ms
step:1063/3000 train_time:1046227ms step_avg:993.57ms
step:1064/3000 train_time:1047225ms step_avg:993.57ms
step:1065/3000 train_time:1048228ms step_avg:993.58ms
step:1066/3000 train_time:1049225ms step_avg:993.58ms
step:1067/3000 train_time:1050223ms step_avg:993.59ms
step:1068/3000 train_time:1051209ms step_avg:993.58ms
step:1069/3000 train_time:1052221ms step_avg:993.60ms
step:1070/3000 train_time:1053213ms step_avg:993.60ms
step:1071/3000 train_time:1054199ms step_avg:993.59ms
step:1072/3000 train_time:1055196ms step_avg:993.59ms
step:1073/3000 train_time:1056208ms step_avg:993.61ms
step:1074/3000 train_time:1057206ms step_avg:993.61ms
step:1075/3000 train_time:1058201ms step_avg:993.62ms
step:1075/3000 val_loss:2.5737 train_time:1058242ms step_avg:993.65ms perplexity:13.1145 param_count:85,137,462
step:1076/3000 train_time:1059209ms step_avg:993.63ms
step:1077/3000 train_time:1060198ms step_avg:993.63ms
step:1078/3000 train_time:1061215ms step_avg:993.65ms
step:1079/3000 train_time:1062206ms step_avg:993.64ms
step:1080/3000 train_time:1063204ms step_avg:993.65ms
step:1081/3000 train_time:1064201ms step_avg:993.65ms
step:1082/3000 train_time:1065193ms step_avg:993.65ms
step:1083/3000 train_time:1066190ms step_avg:993.65ms
step:1084/3000 train_time:1067186ms step_avg:993.66ms
step:1085/3000 train_time:1068188ms step_avg:993.66ms
step:1086/3000 train_time:1069177ms step_avg:993.66ms
step:1087/3000 train_time:1070170ms step_avg:993.66ms
step:1088/3000 train_time:1071160ms step_avg:993.66ms
step:1089/3000 train_time:1072165ms step_avg:993.67ms
step:1090/3000 train_time:1073157ms step_avg:993.66ms
step:1091/3000 train_time:1074159ms step_avg:993.67ms
step:1092/3000 train_time:1075171ms step_avg:993.69ms
step:1093/3000 train_time:1076166ms step_avg:993.69ms
step:1094/3000 train_time:1077155ms step_avg:993.69ms
step:1095/3000 train_time:1078156ms step_avg:993.69ms
step:1096/3000 train_time:1079172ms step_avg:993.71ms
step:1097/3000 train_time:1080159ms step_avg:993.71ms
step:1098/3000 train_time:1081162ms step_avg:993.72ms
step:1099/3000 train_time:1082156ms step_avg:993.72ms
step:1100/3000 train_time:1083150ms step_avg:993.72ms
step:1100/3000 val_loss:2.5800 train_time:1083190ms step_avg:993.75ms perplexity:13.1975 param_count:85,137,462
step:1101/3000 train_time:1084147ms step_avg:993.72ms
step:1102/3000 train_time:1085135ms step_avg:993.71ms
step:1103/3000 train_time:1086125ms step_avg:993.71ms
step:1104/3000 train_time:1087123ms step_avg:993.71ms
step:1105/3000 train_time:1088113ms step_avg:993.71ms
step:1106/3000 train_time:1089095ms step_avg:993.70ms
step:1107/3000 train_time:1090089ms step_avg:993.70ms
step:1108/3000 train_time:1091088ms step_avg:993.71ms
step:1109/3000 train_time:1092077ms step_avg:993.70ms
step:1110/3000 train_time:1093073ms step_avg:993.70ms
step:1111/3000 train_time:1094068ms step_avg:993.70ms
step:1112/3000 train_time:1095056ms step_avg:993.70ms
step:1113/3000 train_time:1096046ms step_avg:993.70ms
step:1114/3000 train_time:1097037ms step_avg:993.69ms
step:1115/3000 train_time:1098027ms step_avg:993.69ms
step:1116/3000 train_time:1099023ms step_avg:993.69ms
step:1117/3000 train_time:1100010ms step_avg:993.69ms
step:1118/3000 train_time:1101003ms step_avg:993.68ms
step:1119/3000 train_time:1101993ms step_avg:993.68ms
step:1120/3000 train_time:1102985ms step_avg:993.68ms
step:1121/3000 train_time:1103980ms step_avg:993.68ms
step:1122/3000 train_time:1104981ms step_avg:993.69ms
step:1123/3000 train_time:1105978ms step_avg:993.69ms
step:1124/3000 train_time:1106975ms step_avg:993.69ms
step:1125/3000 train_time:1107968ms step_avg:993.69ms
step:1125/3000 val_loss:2.5759 train_time:1108008ms step_avg:993.73ms perplexity:13.1430 param_count:85,137,462
step:1126/3000 train_time:1108963ms step_avg:993.69ms
step:1127/3000 train_time:1109961ms step_avg:993.70ms
step:1128/3000 train_time:1110961ms step_avg:993.70ms
step:1129/3000 train_time:1111956ms step_avg:993.71ms
step:1130/3000 train_time:1112947ms step_avg:993.70ms
step:1131/3000 train_time:1113941ms step_avg:993.70ms
step:1132/3000 train_time:1114928ms step_avg:993.70ms
step:1133/3000 train_time:1115912ms step_avg:993.69ms
step:1134/3000 train_time:1116907ms step_avg:993.69ms
step:1135/3000 train_time:1117893ms step_avg:993.68ms
step:1136/3000 train_time:1118876ms step_avg:993.67ms
step:1137/3000 train_time:1119871ms step_avg:993.67ms
step:1138/3000 train_time:1120867ms step_avg:993.68ms
step:1139/3000 train_time:1121872ms step_avg:993.69ms
step:1140/3000 train_time:1122861ms step_avg:993.68ms
step:1141/3000 train_time:1123866ms step_avg:993.69ms
step:1142/3000 train_time:1124857ms step_avg:993.69ms
step:1143/3000 train_time:1125842ms step_avg:993.68ms
step:1144/3000 train_time:1126836ms step_avg:993.68ms
step:1145/3000 train_time:1127822ms step_avg:993.68ms
step:1146/3000 train_time:1128820ms step_avg:993.68ms
step:1147/3000 train_time:1129814ms step_avg:993.68ms
step:1148/3000 train_time:1130808ms step_avg:993.68ms
step:1149/3000 train_time:1131801ms step_avg:993.68ms
step:1150/3000 train_time:1132798ms step_avg:993.68ms
step:1150/3000 val_loss:2.5636 train_time:1132839ms step_avg:993.72ms perplexity:12.9831 param_count:85,137,462
step:1151/3000 train_time:1133781ms step_avg:993.67ms
step:1152/3000 train_time:1134765ms step_avg:993.66ms
step:1153/3000 train_time:1135752ms step_avg:993.66ms
step:1154/3000 train_time:1136751ms step_avg:993.66ms
step:1155/3000 train_time:1137744ms step_avg:993.66ms
step:1156/3000 train_time:1138734ms step_avg:993.66ms
step:1157/3000 train_time:1139743ms step_avg:993.67ms
step:1158/3000 train_time:1140726ms step_avg:993.66ms
step:1159/3000 train_time:1141727ms step_avg:993.67ms
step:1160/3000 train_time:1142717ms step_avg:993.67ms
step:1161/3000 train_time:1143713ms step_avg:993.67ms
step:1162/3000 train_time:1144707ms step_avg:993.67ms
step:1163/3000 train_time:1145698ms step_avg:993.67ms
step:1164/3000 train_time:1146691ms step_avg:993.67ms
step:1165/3000 train_time:1147688ms step_avg:993.67ms
step:1166/3000 train_time:1148695ms step_avg:993.68ms
step:1167/3000 train_time:1149713ms step_avg:993.70ms
step:1168/3000 train_time:1150704ms step_avg:993.70ms
step:1169/3000 train_time:1151697ms step_avg:993.70ms
step:1170/3000 train_time:1152693ms step_avg:993.70ms
step:1171/3000 train_time:1153702ms step_avg:993.71ms
step:1172/3000 train_time:1154700ms step_avg:993.72ms
step:1173/3000 train_time:1155702ms step_avg:993.72ms
step:1174/3000 train_time:1156699ms step_avg:993.73ms
step:1175/3000 train_time:1157709ms step_avg:993.74ms
step:1175/3000 val_loss:2.5677 train_time:1157749ms step_avg:993.78ms perplexity:13.0352 param_count:85,137,462
step:1176/3000 train_time:1158698ms step_avg:993.74ms
step:1177/3000 train_time:1159687ms step_avg:993.73ms
step:1178/3000 train_time:1160681ms step_avg:993.73ms
step:1179/3000 train_time:1161682ms step_avg:993.74ms
step:1180/3000 train_time:1162681ms step_avg:993.74ms
step:1181/3000 train_time:1163673ms step_avg:993.74ms
step:1182/3000 train_time:1164666ms step_avg:993.74ms
step:1183/3000 train_time:1165655ms step_avg:993.74ms
step:1184/3000 train_time:1166677ms step_avg:993.76ms
step:1185/3000 train_time:1167675ms step_avg:993.77ms
step:1186/3000 train_time:1168666ms step_avg:993.76ms
step:1187/3000 train_time:1169666ms step_avg:993.77ms
step:1188/3000 train_time:1170667ms step_avg:993.77ms
step:1189/3000 train_time:1171656ms step_avg:993.77ms
step:1190/3000 train_time:1172655ms step_avg:993.78ms
step:1191/3000 train_time:1173652ms step_avg:993.78ms
step:1192/3000 train_time:1174645ms step_avg:993.78ms
step:1193/3000 train_time:1175642ms step_avg:993.78ms
step:1194/3000 train_time:1176635ms step_avg:993.78ms
step:1195/3000 train_time:1177645ms step_avg:993.79ms
step:1196/3000 train_time:1178650ms step_avg:993.80ms
step:1197/3000 train_time:1179654ms step_avg:993.81ms
step:1198/3000 train_time:1180637ms step_avg:993.80ms
step:1199/3000 train_time:1181630ms step_avg:993.80ms
step:1200/3000 train_time:1182638ms step_avg:993.81ms
step:1200/3000 val_loss:2.5756 train_time:1182678ms step_avg:993.85ms perplexity:13.1396 param_count:85,137,462
step:1201/3000 train_time:1183623ms step_avg:993.81ms
step:1202/3000 train_time:1184615ms step_avg:993.80ms
step:1203/3000 train_time:1185617ms step_avg:993.81ms
step:1204/3000 train_time:1186612ms step_avg:993.81ms
step:1205/3000 train_time:1187599ms step_avg:993.81ms
step:1206/3000 train_time:1188596ms step_avg:993.81ms
step:1207/3000 train_time:1189589ms step_avg:993.81ms
step:1208/3000 train_time:1190579ms step_avg:993.81ms
step:1209/3000 train_time:1191574ms step_avg:993.81ms
step:1210/3000 train_time:1192564ms step_avg:993.80ms
step:1211/3000 train_time:1193554ms step_avg:993.80ms
step:1212/3000 train_time:1194552ms step_avg:993.80ms
step:1213/3000 train_time:1195544ms step_avg:993.80ms
step:1214/3000 train_time:1196537ms step_avg:993.80ms
step:1215/3000 train_time:1197532ms step_avg:993.80ms
step:1216/3000 train_time:1198531ms step_avg:993.81ms
step:1217/3000 train_time:1199523ms step_avg:993.81ms
step:1218/3000 train_time:1200518ms step_avg:993.81ms
step:1219/3000 train_time:1201529ms step_avg:993.82ms
step:1220/3000 train_time:1202518ms step_avg:993.82ms
step:1221/3000 train_time:1203519ms step_avg:993.82ms
step:1222/3000 train_time:1204516ms step_avg:993.82ms
step:1223/3000 train_time:1205506ms step_avg:993.82ms
step:1224/3000 train_time:1206511ms step_avg:993.83ms
step:1225/3000 train_time:1207507ms step_avg:993.83ms
step:1225/3000 val_loss:2.5578 train_time:1207549ms step_avg:993.87ms perplexity:12.9080 param_count:85,137,462
step:1226/3000 train_time:1208505ms step_avg:993.84ms
step:1227/3000 train_time:1209503ms step_avg:993.84ms
step:1228/3000 train_time:1210489ms step_avg:993.83ms
step:1229/3000 train_time:1211484ms step_avg:993.83ms
step:1230/3000 train_time:1212485ms step_avg:993.84ms
step:1231/3000 train_time:1213479ms step_avg:993.84ms
step:1232/3000 train_time:1214469ms step_avg:993.84ms
step:1233/3000 train_time:1215463ms step_avg:993.84ms
step:1234/3000 train_time:1216464ms step_avg:993.84ms
step:1235/3000 train_time:1217467ms step_avg:993.85ms
step:1236/3000 train_time:1218459ms step_avg:993.85ms
step:1237/3000 train_time:1219452ms step_avg:993.85ms
step:1238/3000 train_time:1220456ms step_avg:993.86ms
step:1239/3000 train_time:1221453ms step_avg:993.86ms
step:1240/3000 train_time:1222456ms step_avg:993.87ms
step:1241/3000 train_time:1223458ms step_avg:993.87ms
step:1242/3000 train_time:1224448ms step_avg:993.87ms
step:1243/3000 train_time:1225439ms step_avg:993.87ms
step:1244/3000 train_time:1226425ms step_avg:993.86ms
step:1245/3000 train_time:1227413ms step_avg:993.86ms
step:1246/3000 train_time:1228403ms step_avg:993.85ms
step:1247/3000 train_time:1229397ms step_avg:993.85ms
step:1248/3000 train_time:1230387ms step_avg:993.85ms
step:1249/3000 train_time:1231380ms step_avg:993.85ms
step:1250/3000 train_time:1232373ms step_avg:993.85ms
step:1250/3000 val_loss:2.5574 train_time:1232413ms step_avg:993.88ms perplexity:12.9021 param_count:85,137,462
step:1251/3000 train_time:1233365ms step_avg:993.85ms
step:1252/3000 train_time:1234367ms step_avg:993.85ms
step:1253/3000 train_time:1235363ms step_avg:993.86ms
step:1254/3000 train_time:1236356ms step_avg:993.86ms
step:1255/3000 train_time:1237344ms step_avg:993.85ms
step:1256/3000 train_time:1238335ms step_avg:993.85ms
step:1257/3000 train_time:1239322ms step_avg:993.84ms
step:1258/3000 train_time:1240320ms step_avg:993.85ms
step:1259/3000 train_time:1241314ms step_avg:993.85ms
step:1260/3000 train_time:1242330ms step_avg:993.86ms
step:1261/3000 train_time:1243348ms step_avg:993.88ms
step:1262/3000 train_time:1244345ms step_avg:993.89ms
step:1263/3000 train_time:1245342ms step_avg:993.89ms
step:1264/3000 train_time:1246338ms step_avg:993.89ms
step:1265/3000 train_time:1247336ms step_avg:993.89ms
step:1266/3000 train_time:1248344ms step_avg:993.90ms
step:1267/3000 train_time:1249335ms step_avg:993.90ms
step:1268/3000 train_time:1250326ms step_avg:993.90ms
step:1269/3000 train_time:1251316ms step_avg:993.90ms
step:1270/3000 train_time:1252304ms step_avg:993.89ms
step:1271/3000 train_time:1253300ms step_avg:993.89ms
step:1272/3000 train_time:1254289ms step_avg:993.89ms
step:1273/3000 train_time:1255276ms step_avg:993.88ms
step:1274/3000 train_time:1256277ms step_avg:993.89ms
step:1275/3000 train_time:1257278ms step_avg:993.90ms
step:1275/3000 val_loss:2.5658 train_time:1257318ms step_avg:993.93ms perplexity:13.0105 param_count:85,137,462
step:1276/3000 train_time:1258263ms step_avg:993.89ms
step:1277/3000 train_time:1259255ms step_avg:993.89ms
step:1278/3000 train_time:1260251ms step_avg:993.89ms
step:1279/3000 train_time:1261240ms step_avg:993.88ms
step:1280/3000 train_time:1262228ms step_avg:993.88ms
step:1281/3000 train_time:1263230ms step_avg:993.89ms
step:1282/3000 train_time:1264220ms step_avg:993.88ms
step:1283/3000 train_time:1265207ms step_avg:993.88ms
step:1284/3000 train_time:1266204ms step_avg:993.88ms
step:1285/3000 train_time:1267204ms step_avg:993.89ms
step:1286/3000 train_time:1268211ms step_avg:993.90ms
step:1287/3000 train_time:1269212ms step_avg:993.90ms
step:1288/3000 train_time:1270226ms step_avg:993.92ms
step:1289/3000 train_time:1271219ms step_avg:993.92ms
step:1290/3000 train_time:1272230ms step_avg:993.93ms
step:1291/3000 train_time:1273228ms step_avg:993.93ms
step:1292/3000 train_time:1274228ms step_avg:993.94ms
step:1293/3000 train_time:1275216ms step_avg:993.93ms
step:1294/3000 train_time:1276216ms step_avg:993.94ms
step:1295/3000 train_time:1277213ms step_avg:993.94ms
step:1296/3000 train_time:1278200ms step_avg:993.94ms
step:1297/3000 train_time:1279197ms step_avg:993.94ms
step:1298/3000 train_time:1280195ms step_avg:993.94ms
step:1299/3000 train_time:1281184ms step_avg:993.94ms
step:1300/3000 train_time:1282173ms step_avg:993.93ms
step:1300/3000 val_loss:2.5583 train_time:1282213ms step_avg:993.96ms perplexity:12.9137 param_count:85,137,462
step:1301/3000 train_time:1283155ms step_avg:993.92ms
step:1302/3000 train_time:1284148ms step_avg:993.92ms
step:1303/3000 train_time:1285143ms step_avg:993.92ms
step:1304/3000 train_time:1286126ms step_avg:993.91ms
step:1305/3000 train_time:1287112ms step_avg:993.91ms
step:1306/3000 train_time:1288126ms step_avg:993.92ms
step:1307/3000 train_time:1289134ms step_avg:993.94ms
step:1308/3000 train_time:1290125ms step_avg:993.93ms
step:1309/3000 train_time:1291131ms step_avg:993.94ms
step:1310/3000 train_time:1292130ms step_avg:993.95ms
step:1311/3000 train_time:1293128ms step_avg:993.95ms
step:1312/3000 train_time:1294127ms step_avg:993.95ms
step:1313/3000 train_time:1295124ms step_avg:993.96ms
step:1314/3000 train_time:1296137ms step_avg:993.97ms
step:1315/3000 train_time:1297129ms step_avg:993.97ms
step:1316/3000 train_time:1298129ms step_avg:993.97ms
step:1317/3000 train_time:1299125ms step_avg:993.97ms
step:1318/3000 train_time:1300130ms step_avg:993.98ms
step:1319/3000 train_time:1301134ms step_avg:993.99ms
step:1320/3000 train_time:1302137ms step_avg:994.00ms
step:1321/3000 train_time:1303134ms step_avg:994.00ms
step:1322/3000 train_time:1304132ms step_avg:994.00ms
step:1323/3000 train_time:1305137ms step_avg:994.01ms
step:1324/3000 train_time:1306130ms step_avg:994.01ms
step:1325/3000 train_time:1307126ms step_avg:994.01ms
step:1325/3000 val_loss:2.5675 train_time:1307166ms step_avg:994.04ms perplexity:13.0337 param_count:85,137,462
step:1326/3000 train_time:1308106ms step_avg:994.00ms
step:1327/3000 train_time:1309094ms step_avg:994.00ms
step:1328/3000 train_time:1310085ms step_avg:993.99ms
step:1329/3000 train_time:1311105ms step_avg:994.01ms
step:1330/3000 train_time:1312095ms step_avg:994.01ms
step:1331/3000 train_time:1313106ms step_avg:994.02ms
step:1332/3000 train_time:1314104ms step_avg:994.03ms
step:1333/3000 train_time:1315098ms step_avg:994.03ms
step:1334/3000 train_time:1316089ms step_avg:994.02ms
step:1335/3000 train_time:1317096ms step_avg:994.03ms
step:1336/3000 train_time:1318095ms step_avg:994.04ms
step:1337/3000 train_time:1319098ms step_avg:994.05ms
step:1338/3000 train_time:1320096ms step_avg:994.05ms
step:1339/3000 train_time:1321093ms step_avg:994.05ms
step:1340/3000 train_time:1322085ms step_avg:994.05ms
step:1341/3000 train_time:1323082ms step_avg:994.05ms
step:1342/3000 train_time:1324089ms step_avg:994.06ms
step:1343/3000 train_time:1325083ms step_avg:994.06ms
step:1344/3000 train_time:1326091ms step_avg:994.07ms
step:1345/3000 train_time:1327098ms step_avg:994.08ms
step:1346/3000 train_time:1328100ms step_avg:994.09ms
step:1347/3000 train_time:1329097ms step_avg:994.09ms
step:1348/3000 train_time:1330094ms step_avg:994.09ms
step:1349/3000 train_time:1331082ms step_avg:994.09ms
step:1350/3000 train_time:1332073ms step_avg:994.08ms
step:1350/3000 val_loss:2.5595 train_time:1332112ms step_avg:994.11ms perplexity:12.9299 param_count:85,137,462
step:1351/3000 train_time:1333082ms step_avg:994.10ms
step:1352/3000 train_time:1334074ms step_avg:994.09ms
step:1353/3000 train_time:1335081ms step_avg:994.10ms
step:1354/3000 train_time:1336071ms step_avg:994.10ms
step:1355/3000 train_time:1337068ms step_avg:994.10ms
step:1356/3000 train_time:1338055ms step_avg:994.10ms
step:1357/3000 train_time:1339054ms step_avg:994.10ms
step:1358/3000 train_time:1340050ms step_avg:994.10ms
step:1359/3000 train_time:1341056ms step_avg:994.11ms
step:1360/3000 train_time:1342046ms step_avg:994.11ms
step:1361/3000 train_time:1343041ms step_avg:994.11ms
step:1362/3000 train_time:1344033ms step_avg:994.11ms
step:1363/3000 train_time:1345040ms step_avg:994.12ms
step:1364/3000 train_time:1346041ms step_avg:994.12ms
step:1365/3000 train_time:1347037ms step_avg:994.12ms
step:1366/3000 train_time:1348032ms step_avg:994.12ms
step:1367/3000 train_time:1349023ms step_avg:994.12ms
step:1368/3000 train_time:1350021ms step_avg:994.12ms
step:1369/3000 train_time:1351014ms step_avg:994.12ms
step:1370/3000 train_time:1352013ms step_avg:994.13ms
step:1371/3000 train_time:1353010ms step_avg:994.13ms
step:1372/3000 train_time:1354025ms step_avg:994.14ms
step:1373/3000 train_time:1355009ms step_avg:994.14ms
step:1374/3000 train_time:1356003ms step_avg:994.14ms
step:1375/3000 train_time:1356995ms step_avg:994.14ms
step:1375/3000 val_loss:2.5532 train_time:1357036ms step_avg:994.17ms perplexity:12.8482 param_count:85,137,462
step:1376/3000 train_time:1357978ms step_avg:994.13ms
step:1377/3000 train_time:1358983ms step_avg:994.14ms
step:1378/3000 train_time:1359977ms step_avg:994.14ms
step:1379/3000 train_time:1360971ms step_avg:994.14ms
step:1380/3000 train_time:1361959ms step_avg:994.13ms
step:1381/3000 train_time:1362952ms step_avg:994.13ms
step:1382/3000 train_time:1363950ms step_avg:994.13ms
step:1383/3000 train_time:1364944ms step_avg:994.13ms
step:1384/3000 train_time:1365927ms step_avg:994.12ms
step:1385/3000 train_time:1366918ms step_avg:994.12ms
step:1386/3000 train_time:1367906ms step_avg:994.12ms
step:1387/3000 train_time:1368900ms step_avg:994.12ms
step:1388/3000 train_time:1369882ms step_avg:994.11ms
step:1389/3000 train_time:1370877ms step_avg:994.11ms
step:1390/3000 train_time:1371885ms step_avg:994.12ms
step:1391/3000 train_time:1372884ms step_avg:994.12ms
step:1392/3000 train_time:1373878ms step_avg:994.12ms
step:1393/3000 train_time:1374872ms step_avg:994.12ms
step:1394/3000 train_time:1375850ms step_avg:994.11ms
step:1395/3000 train_time:1376835ms step_avg:994.10ms
step:1396/3000 train_time:1377827ms step_avg:994.10ms
step:1397/3000 train_time:1378827ms step_avg:994.11ms
step:1398/3000 train_time:1379825ms step_avg:994.11ms
step:1399/3000 train_time:1380821ms step_avg:994.11ms
step:1400/3000 train_time:1381821ms step_avg:994.12ms
step:1400/3000 val_loss:2.5604 train_time:1381861ms step_avg:994.14ms perplexity:12.9408 param_count:85,137,462
step:1401/3000 train_time:1382806ms step_avg:994.11ms
step:1402/3000 train_time:1383797ms step_avg:994.11ms
step:1403/3000 train_time:1384785ms step_avg:994.10ms
step:1404/3000 train_time:1385772ms step_avg:994.10ms
step:1405/3000 train_time:1386763ms step_avg:994.10ms
step:1406/3000 train_time:1387760ms step_avg:994.10ms
step:1407/3000 train_time:1388752ms step_avg:994.10ms
step:1408/3000 train_time:1389754ms step_avg:994.10ms
step:1409/3000 train_time:1390738ms step_avg:994.09ms
step:1410/3000 train_time:1391724ms step_avg:994.09ms
step:1411/3000 train_time:1392704ms step_avg:994.08ms
step:1412/3000 train_time:1393695ms step_avg:994.08ms
step:1413/3000 train_time:1394697ms step_avg:994.08ms
step:1414/3000 train_time:1395679ms step_avg:994.07ms
step:1415/3000 train_time:1396676ms step_avg:994.08ms
step:1416/3000 train_time:1397671ms step_avg:994.08ms
step:1417/3000 train_time:1398658ms step_avg:994.07ms
step:1418/3000 train_time:1399639ms step_avg:994.06ms
step:1419/3000 train_time:1400630ms step_avg:994.06ms
step:1420/3000 train_time:1401625ms step_avg:994.06ms
step:1421/3000 train_time:1402624ms step_avg:994.06ms
step:1422/3000 train_time:1403610ms step_avg:994.06ms
step:1423/3000 train_time:1404605ms step_avg:994.06ms
step:1424/3000 train_time:1405588ms step_avg:994.05ms
step:1425/3000 train_time:1406582ms step_avg:994.05ms
step:1425/3000 val_loss:2.5477 train_time:1406623ms step_avg:994.08ms perplexity:12.7783 param_count:85,137,462
step:1426/3000 train_time:1407577ms step_avg:994.05ms
step:1427/3000 train_time:1408568ms step_avg:994.05ms
step:1428/3000 train_time:1409561ms step_avg:994.05ms
step:1429/3000 train_time:1410546ms step_avg:994.04ms
step:1430/3000 train_time:1411553ms step_avg:994.05ms
step:1431/3000 train_time:1412546ms step_avg:994.05ms
step:1432/3000 train_time:1413534ms step_avg:994.05ms
step:1433/3000 train_time:1414523ms step_avg:994.04ms
step:1434/3000 train_time:1415516ms step_avg:994.04ms
step:1435/3000 train_time:1416514ms step_avg:994.04ms
step:1436/3000 train_time:1417504ms step_avg:994.04ms
step:1437/3000 train_time:1418495ms step_avg:994.04ms
step:1438/3000 train_time:1419506ms step_avg:994.05ms
step:1439/3000 train_time:1420499ms step_avg:994.05ms
step:1440/3000 train_time:1421491ms step_avg:994.05ms
step:1441/3000 train_time:1422480ms step_avg:994.05ms
step:1442/3000 train_time:1423475ms step_avg:994.05ms
step:1443/3000 train_time:1424482ms step_avg:994.06ms
step:1444/3000 train_time:1425509ms step_avg:994.08ms
step:1445/3000 train_time:1426498ms step_avg:994.08ms
step:1446/3000 train_time:1427492ms step_avg:994.08ms
step:1447/3000 train_time:1428474ms step_avg:994.07ms
step:1448/3000 train_time:1429464ms step_avg:994.06ms
step:1449/3000 train_time:1430461ms step_avg:994.07ms
step:1450/3000 train_time:1431452ms step_avg:994.06ms
step:1450/3000 val_loss:2.5569 train_time:1431492ms step_avg:994.09ms perplexity:12.8960 param_count:85,137,462
step:1451/3000 train_time:1432436ms step_avg:994.06ms
step:1452/3000 train_time:1433422ms step_avg:994.05ms
step:1453/3000 train_time:1434417ms step_avg:994.05ms
step:1454/3000 train_time:1435402ms step_avg:994.05ms
step:1455/3000 train_time:1436402ms step_avg:994.05ms
step:1456/3000 train_time:1437413ms step_avg:994.06ms
step:1457/3000 train_time:1438409ms step_avg:994.06ms
step:1458/3000 train_time:1439392ms step_avg:994.06ms
step:1459/3000 train_time:1440407ms step_avg:994.07ms
step:1460/3000 train_time:1441405ms step_avg:994.07ms
step:1461/3000 train_time:1442401ms step_avg:994.07ms
step:1462/3000 train_time:1443405ms step_avg:994.08ms
step:1463/3000 train_time:1444401ms step_avg:994.08ms
step:1464/3000 train_time:1445392ms step_avg:994.08ms
step:1465/3000 train_time:1446382ms step_avg:994.08ms
step:1466/3000 train_time:1447381ms step_avg:994.08ms
step:1467/3000 train_time:1448381ms step_avg:994.08ms
step:1468/3000 train_time:1449368ms step_avg:994.08ms
step:1469/3000 train_time:1450363ms step_avg:994.08ms
step:1470/3000 train_time:1451369ms step_avg:994.09ms
step:1471/3000 train_time:1452353ms step_avg:994.08ms
step:1472/3000 train_time:1453348ms step_avg:994.08ms
step:1473/3000 train_time:1454337ms step_avg:994.08ms
step:1474/3000 train_time:1455338ms step_avg:994.08ms
step:1475/3000 train_time:1456321ms step_avg:994.08ms
step:1475/3000 val_loss:2.5588 train_time:1456363ms step_avg:994.10ms perplexity:12.9201 param_count:85,137,462
step:1476/3000 train_time:1457312ms step_avg:994.07ms
step:1477/3000 train_time:1458307ms step_avg:994.07ms
step:1478/3000 train_time:1459307ms step_avg:994.08ms
step:1479/3000 train_time:1460302ms step_avg:994.08ms
step:1480/3000 train_time:1461303ms step_avg:994.08ms
step:1481/3000 train_time:1462298ms step_avg:994.08ms
step:1482/3000 train_time:1463281ms step_avg:994.08ms
step:1483/3000 train_time:1464276ms step_avg:994.08ms
step:1484/3000 train_time:1465273ms step_avg:994.08ms
step:1485/3000 train_time:1466272ms step_avg:994.08ms
step:1486/3000 train_time:1467268ms step_avg:994.08ms
step:1487/3000 train_time:1468272ms step_avg:994.09ms
step:1488/3000 train_time:1469260ms step_avg:994.09ms
step:1489/3000 train_time:1470253ms step_avg:994.09ms
step:1490/3000 train_time:1471246ms step_avg:994.09ms
step:1491/3000 train_time:1472242ms step_avg:994.09ms
step:1492/3000 train_time:1473241ms step_avg:994.09ms
step:1493/3000 train_time:1474236ms step_avg:994.09ms
step:1494/3000 train_time:1475223ms step_avg:994.09ms
step:1495/3000 train_time:1476216ms step_avg:994.08ms
step:1496/3000 train_time:1477206ms step_avg:994.08ms
step:1497/3000 train_time:1478222ms step_avg:994.10ms
step:1498/3000 train_time:1479210ms step_avg:994.09ms
step:1499/3000 train_time:1480202ms step_avg:994.09ms
step:1500/3000 train_time:1481193ms step_avg:994.09ms
step:1500/3000 val_loss:2.5574 train_time:1481231ms step_avg:994.12ms perplexity:12.9019 param_count:85,137,462
step:1501/3000 train_time:1482201ms step_avg:994.10ms
step:1502/3000 train_time:1483192ms step_avg:994.10ms
step:1503/3000 train_time:1484189ms step_avg:994.10ms
step:1504/3000 train_time:1485192ms step_avg:994.10ms
step:1505/3000 train_time:1486192ms step_avg:994.11ms
step:1506/3000 train_time:1487200ms step_avg:994.12ms
step:1507/3000 train_time:1488197ms step_avg:994.12ms
step:1508/3000 train_time:1489195ms step_avg:994.12ms
step:1509/3000 train_time:1490187ms step_avg:994.12ms
step:1510/3000 train_time:1491183ms step_avg:994.12ms
step:1511/3000 train_time:1492183ms step_avg:994.13ms
step:1512/3000 train_time:1493175ms step_avg:994.12ms
step:1513/3000 train_time:1494172ms step_avg:994.13ms
step:1514/3000 train_time:1495163ms step_avg:994.12ms
step:1515/3000 train_time:1496154ms step_avg:994.12ms
step:1516/3000 train_time:1497145ms step_avg:994.12ms
step:1517/3000 train_time:1498140ms step_avg:994.12ms
step:1518/3000 train_time:1499144ms step_avg:994.13ms
step:1519/3000 train_time:1500132ms step_avg:994.12ms
step:1520/3000 train_time:1501132ms step_avg:994.13ms
step:1521/3000 train_time:1502123ms step_avg:994.13ms
step:1522/3000 train_time:1503106ms step_avg:994.12ms
step:1523/3000 train_time:1504092ms step_avg:994.11ms
step:1524/3000 train_time:1505088ms step_avg:994.11ms
step:1525/3000 train_time:1506089ms step_avg:994.12ms
step:1525/3000 val_loss:2.5578 train_time:1506129ms step_avg:994.14ms perplexity:12.9071 param_count:85,137,462
step:1526/3000 train_time:1507077ms step_avg:994.11ms
step:1527/3000 train_time:1508071ms step_avg:994.11ms
step:1528/3000 train_time:1509051ms step_avg:994.10ms
step:1529/3000 train_time:1510042ms step_avg:994.10ms
step:1530/3000 train_time:1511047ms step_avg:994.11ms
step:1531/3000 train_time:1512033ms step_avg:994.10ms
step:1532/3000 train_time:1513041ms step_avg:994.11ms
step:1533/3000 train_time:1514039ms step_avg:994.12ms
step:1534/3000 train_time:1515030ms step_avg:994.11ms
step:1535/3000 train_time:1516033ms step_avg:994.12ms
step:1536/3000 train_time:1517026ms step_avg:994.12ms
step:1537/3000 train_time:1518025ms step_avg:994.12ms
step:1538/3000 train_time:1519024ms step_avg:994.13ms
step:1539/3000 train_time:1520018ms step_avg:994.13ms
step:1540/3000 train_time:1521004ms step_avg:994.12ms
step:1541/3000 train_time:1522001ms step_avg:994.12ms
step:1542/3000 train_time:1522993ms step_avg:994.12ms
step:1543/3000 train_time:1523992ms step_avg:994.12ms
step:1544/3000 train_time:1524987ms step_avg:994.12ms
step:1545/3000 train_time:1525982ms step_avg:994.13ms
step:1546/3000 train_time:1526995ms step_avg:994.14ms
step:1547/3000 train_time:1527988ms step_avg:994.14ms
step:1548/3000 train_time:1528992ms step_avg:994.14ms
step:1549/3000 train_time:1529983ms step_avg:994.14ms
step:1550/3000 train_time:1530982ms step_avg:994.14ms
step:1550/3000 val_loss:2.5567 train_time:1531023ms step_avg:994.17ms perplexity:12.8931 param_count:85,137,462
step:1551/3000 train_time:1531971ms step_avg:994.14ms
step:1552/3000 train_time:1532970ms step_avg:994.14ms
step:1553/3000 train_time:1533966ms step_avg:994.15ms
step:1554/3000 train_time:1534953ms step_avg:994.14ms
step:1555/3000 train_time:1535940ms step_avg:994.14ms
step:1556/3000 train_time:1536941ms step_avg:994.14ms
step:1557/3000 train_time:1537950ms step_avg:994.15ms
step:1558/3000 train_time:1538960ms step_avg:994.16ms
step:1559/3000 train_time:1539951ms step_avg:994.16ms
step:1560/3000 train_time:1540941ms step_avg:994.16ms
step:1561/3000 train_time:1541940ms step_avg:994.16ms
step:1562/3000 train_time:1542936ms step_avg:994.16ms
step:1563/3000 train_time:1543938ms step_avg:994.16ms
step:1564/3000 train_time:1544934ms step_avg:994.17ms
step:1565/3000 train_time:1545928ms step_avg:994.17ms
step:1566/3000 train_time:1546927ms step_avg:994.17ms
step:1567/3000 train_time:1547922ms step_avg:994.17ms
step:1568/3000 train_time:1548916ms step_avg:994.17ms
step:1569/3000 train_time:1549920ms step_avg:994.18ms
step:1570/3000 train_time:1550911ms step_avg:994.17ms
step:1571/3000 train_time:1551903ms step_avg:994.17ms
step:1572/3000 train_time:1552895ms step_avg:994.17ms
step:1573/3000 train_time:1553891ms step_avg:994.17ms
step:1574/3000 train_time:1554895ms step_avg:994.18ms
step:1575/3000 train_time:1555883ms step_avg:994.17ms
step:1575/3000 val_loss:2.5497 train_time:1555923ms step_avg:994.20ms perplexity:12.8032 param_count:85,137,462
step:1576/3000 train_time:1556879ms step_avg:994.18ms
step:1577/3000 train_time:1557885ms step_avg:994.18ms
step:1578/3000 train_time:1558882ms step_avg:994.18ms
step:1579/3000 train_time:1559891ms step_avg:994.19ms
step:1580/3000 train_time:1560889ms step_avg:994.20ms
step:1581/3000 train_time:1561912ms step_avg:994.21ms
step:1582/3000 train_time:1562908ms step_avg:994.22ms
step:1583/3000 train_time:1563898ms step_avg:994.21ms
step:1584/3000 train_time:1564897ms step_avg:994.22ms
step:1585/3000 train_time:1565883ms step_avg:994.21ms
step:1586/3000 train_time:1566873ms step_avg:994.21ms
step:1587/3000 train_time:1567864ms step_avg:994.21ms
step:1588/3000 train_time:1568861ms step_avg:994.21ms
step:1589/3000 train_time:1569942ms step_avg:994.26ms
step:1590/3000 train_time:1570935ms step_avg:994.26ms
step:1591/3000 train_time:1571931ms step_avg:994.26ms
step:1592/3000 train_time:1572923ms step_avg:994.26ms
step:1593/3000 train_time:1573919ms step_avg:994.26ms
step:1594/3000 train_time:1574918ms step_avg:994.27ms
step:1595/3000 train_time:1575900ms step_avg:994.26ms
step:1596/3000 train_time:1576916ms step_avg:994.27ms
step:1597/3000 train_time:1577906ms step_avg:994.27ms
step:1598/3000 train_time:1578917ms step_avg:994.28ms
step:1599/3000 train_time:1579917ms step_avg:994.28ms
step:1600/3000 train_time:1580913ms step_avg:994.28ms
step:1600/3000 val_loss:2.5546 train_time:1580953ms step_avg:994.31ms perplexity:12.8663 param_count:85,137,462
step:1601/3000 train_time:1581906ms step_avg:994.28ms
step:1602/3000 train_time:1582906ms step_avg:994.29ms
step:1603/3000 train_time:1583899ms step_avg:994.29ms
step:1604/3000 train_time:1584908ms step_avg:994.30ms
step:1605/3000 train_time:1585900ms step_avg:994.29ms
step:1606/3000 train_time:1586887ms step_avg:994.29ms
step:1607/3000 train_time:1587887ms step_avg:994.29ms
step:1608/3000 train_time:1588884ms step_avg:994.30ms
step:1609/3000 train_time:1589870ms step_avg:994.29ms
step:1610/3000 train_time:1590873ms step_avg:994.30ms
step:1611/3000 train_time:1591853ms step_avg:994.29ms
step:1612/3000 train_time:1592851ms step_avg:994.29ms
step:1613/3000 train_time:1593838ms step_avg:994.28ms
step:1614/3000 train_time:1594826ms step_avg:994.28ms
step:1615/3000 train_time:1595813ms step_avg:994.28ms
step:1616/3000 train_time:1596811ms step_avg:994.28ms
step:1617/3000 train_time:1597802ms step_avg:994.28ms
step:1618/3000 train_time:1598800ms step_avg:994.28ms
step:1619/3000 train_time:1599808ms step_avg:994.29ms
step:1620/3000 train_time:1600793ms step_avg:994.28ms
step:1621/3000 train_time:1601789ms step_avg:994.28ms
step:1622/3000 train_time:1602799ms step_avg:994.29ms
step:1623/3000 train_time:1603786ms step_avg:994.29ms
step:1624/3000 train_time:1604778ms step_avg:994.29ms
step:1625/3000 train_time:1605777ms step_avg:994.29ms
step:1625/3000 val_loss:2.5460 train_time:1605817ms step_avg:994.31ms perplexity:12.7554 param_count:85,137,462
step:1626/3000 train_time:1606761ms step_avg:994.28ms
step:1627/3000 train_time:1607772ms step_avg:994.29ms
step:1628/3000 train_time:1608755ms step_avg:994.29ms
step:1629/3000 train_time:1609753ms step_avg:994.29ms
step:1630/3000 train_time:1610745ms step_avg:994.29ms
step:1631/3000 train_time:1611733ms step_avg:994.28ms
step:1632/3000 train_time:1612736ms step_avg:994.29ms
step:1633/3000 train_time:1613728ms step_avg:994.29ms
step:1634/3000 train_time:1614729ms step_avg:994.29ms
step:1635/3000 train_time:1615721ms step_avg:994.29ms
step:1636/3000 train_time:1616723ms step_avg:994.29ms
step:1637/3000 train_time:1617729ms step_avg:994.30ms
step:1638/3000 train_time:1618713ms step_avg:994.30ms
step:1639/3000 train_time:1619710ms step_avg:994.30ms
step:1640/3000 train_time:1620706ms step_avg:994.30ms
step:1641/3000 train_time:1621713ms step_avg:994.31ms
step:1642/3000 train_time:1622712ms step_avg:994.31ms
step:1643/3000 train_time:1623706ms step_avg:994.31ms
step:1644/3000 train_time:1624706ms step_avg:994.31ms
step:1645/3000 train_time:1625699ms step_avg:994.31ms
step:1646/3000 train_time:1626682ms step_avg:994.30ms
step:1647/3000 train_time:1627676ms step_avg:994.30ms
step:1648/3000 train_time:1628674ms step_avg:994.31ms
step:1649/3000 train_time:1629673ms step_avg:994.31ms
step:1650/3000 train_time:1630671ms step_avg:994.31ms
step:1650/3000 val_loss:2.5544 train_time:1630711ms step_avg:994.34ms perplexity:12.8630 param_count:85,137,462
step:1651/3000 train_time:1631659ms step_avg:994.31ms
step:1652/3000 train_time:1632645ms step_avg:994.30ms
step:1653/3000 train_time:1633635ms step_avg:994.30ms
step:1654/3000 train_time:1634621ms step_avg:994.29ms
step:1655/3000 train_time:1635617ms step_avg:994.30ms
step:1656/3000 train_time:1636616ms step_avg:994.30ms
step:1657/3000 train_time:1637611ms step_avg:994.30ms
step:1658/3000 train_time:1638614ms step_avg:994.30ms
step:1659/3000 train_time:1639594ms step_avg:994.30ms
step:1660/3000 train_time:1640590ms step_avg:994.30ms
step:1661/3000 train_time:1641574ms step_avg:994.29ms
step:1662/3000 train_time:1642577ms step_avg:994.30ms
step:1663/3000 train_time:1643563ms step_avg:994.29ms
step:1664/3000 train_time:1644571ms step_avg:994.30ms
step:1665/3000 train_time:1645565ms step_avg:994.30ms
step:1666/3000 train_time:1646567ms step_avg:994.30ms
step:1667/3000 train_time:1647561ms step_avg:994.30ms
step:1668/3000 train_time:1648556ms step_avg:994.30ms
step:1669/3000 train_time:1649563ms step_avg:994.31ms
step:1670/3000 train_time:1650577ms step_avg:994.32ms
step:1671/3000 train_time:1651573ms step_avg:994.32ms
step:1672/3000 train_time:1652569ms step_avg:994.33ms
step:1673/3000 train_time:1653584ms step_avg:994.34ms
step:1674/3000 train_time:1654578ms step_avg:994.34ms
step:1675/3000 train_time:1655573ms step_avg:994.34ms
step:1675/3000 val_loss:2.5435 train_time:1655613ms step_avg:994.36ms perplexity:12.7246 param_count:85,137,462
step:1676/3000 train_time:1656558ms step_avg:994.33ms
step:1677/3000 train_time:1657550ms step_avg:994.33ms
step:1678/3000 train_time:1658545ms step_avg:994.33ms
step:1679/3000 train_time:1659547ms step_avg:994.34ms
step:1680/3000 train_time:1660541ms step_avg:994.34ms
step:1681/3000 train_time:1661530ms step_avg:994.33ms
step:1682/3000 train_time:1662524ms step_avg:994.33ms
step:1683/3000 train_time:1663514ms step_avg:994.33ms
step:1684/3000 train_time:1664509ms step_avg:994.33ms
step:1685/3000 train_time:1665517ms step_avg:994.34ms
step:1686/3000 train_time:1666523ms step_avg:994.35ms
step:1687/3000 train_time:1667515ms step_avg:994.34ms
step:1688/3000 train_time:1668503ms step_avg:994.34ms
step:1689/3000 train_time:1669496ms step_avg:994.34ms
step:1690/3000 train_time:1670510ms step_avg:994.35ms
step:1691/3000 train_time:1671505ms step_avg:994.35ms
step:1692/3000 train_time:1672501ms step_avg:994.35ms
step:1693/3000 train_time:1673501ms step_avg:994.36ms
step:1694/3000 train_time:1674495ms step_avg:994.36ms
step:1695/3000 train_time:1675501ms step_avg:994.36ms
step:1696/3000 train_time:1676487ms step_avg:994.36ms
step:1697/3000 train_time:1677489ms step_avg:994.36ms
step:1698/3000 train_time:1678478ms step_avg:994.36ms
step:1699/3000 train_time:1679471ms step_avg:994.36ms
step:1700/3000 train_time:1680471ms step_avg:994.36ms
step:1700/3000 val_loss:2.5518 train_time:1680512ms step_avg:994.39ms perplexity:12.8297 param_count:85,137,462
step:1701/3000 train_time:1681451ms step_avg:994.35ms
step:1702/3000 train_time:1682442ms step_avg:994.35ms
step:1703/3000 train_time:1683431ms step_avg:994.35ms
step:1704/3000 train_time:1684426ms step_avg:994.35ms
step:1705/3000 train_time:1685428ms step_avg:994.35ms
step:1706/3000 train_time:1686428ms step_avg:994.36ms
step:1707/3000 train_time:1687416ms step_avg:994.35ms
step:1708/3000 train_time:1688414ms step_avg:994.35ms
step:1709/3000 train_time:1689409ms step_avg:994.35ms
step:1710/3000 train_time:1690413ms step_avg:994.36ms
step:1711/3000 train_time:1691425ms step_avg:994.37ms
step:1712/3000 train_time:1692421ms step_avg:994.37ms
step:1713/3000 train_time:1693420ms step_avg:994.37ms
step:1714/3000 train_time:1694426ms step_avg:994.38ms
step:1715/3000 train_time:1695421ms step_avg:994.38ms
step:1716/3000 train_time:1696430ms step_avg:994.39ms
step:1717/3000 train_time:1697422ms step_avg:994.39ms
step:1718/3000 train_time:1698419ms step_avg:994.39ms
step:1719/3000 train_time:1699416ms step_avg:994.39ms
step:1720/3000 train_time:1700428ms step_avg:994.40ms
step:1721/3000 train_time:1701417ms step_avg:994.40ms
step:1722/3000 train_time:1702408ms step_avg:994.40ms
step:1723/3000 train_time:1703404ms step_avg:994.40ms
step:1724/3000 train_time:1704404ms step_avg:994.40ms
step:1725/3000 train_time:1705418ms step_avg:994.41ms
step:1725/3000 val_loss:2.5494 train_time:1705459ms step_avg:994.44ms perplexity:12.7998 param_count:85,137,462
step:1726/3000 train_time:1706412ms step_avg:994.41ms
step:1727/3000 train_time:1707396ms step_avg:994.41ms
step:1728/3000 train_time:1708387ms step_avg:994.40ms
step:1729/3000 train_time:1709391ms step_avg:994.41ms
step:1730/3000 train_time:1710387ms step_avg:994.41ms
step:1731/3000 train_time:1711380ms step_avg:994.41ms
step:1732/3000 train_time:1712373ms step_avg:994.41ms
step:1733/3000 train_time:1713369ms step_avg:994.41ms
step:1734/3000 train_time:1714364ms step_avg:994.41ms
step:1735/3000 train_time:1715350ms step_avg:994.41ms
step:1736/3000 train_time:1716350ms step_avg:994.41ms
step:1737/3000 train_time:1717336ms step_avg:994.40ms
step:1738/3000 train_time:1718329ms step_avg:994.40ms
step:1739/3000 train_time:1719316ms step_avg:994.40ms
step:1740/3000 train_time:1720306ms step_avg:994.40ms
step:1741/3000 train_time:1721306ms step_avg:994.40ms
step:1742/3000 train_time:1722305ms step_avg:994.40ms
step:1743/3000 train_time:1723291ms step_avg:994.40ms
step:1744/3000 train_time:1724290ms step_avg:994.40ms
step:1745/3000 train_time:1725292ms step_avg:994.40ms
step:1746/3000 train_time:1726289ms step_avg:994.41ms
step:1747/3000 train_time:1727285ms step_avg:994.41ms
step:1748/3000 train_time:1728275ms step_avg:994.40ms
step:1749/3000 train_time:1729263ms step_avg:994.40ms
step:1750/3000 train_time:1730268ms step_avg:994.41ms
step:1750/3000 val_loss:2.5522 train_time:1730310ms step_avg:994.43ms perplexity:12.8355 param_count:85,137,462
step:1751/3000 train_time:1731263ms step_avg:994.41ms
step:1752/3000 train_time:1732257ms step_avg:994.41ms
step:1753/3000 train_time:1733256ms step_avg:994.41ms
step:1754/3000 train_time:1734248ms step_avg:994.41ms
step:1755/3000 train_time:1735260ms step_avg:994.42ms
step:1756/3000 train_time:1736271ms step_avg:994.43ms
step:1757/3000 train_time:1737267ms step_avg:994.43ms
step:1758/3000 train_time:1738265ms step_avg:994.43ms
step:1759/3000 train_time:1739255ms step_avg:994.43ms
step:1760/3000 train_time:1740251ms step_avg:994.43ms
step:1761/3000 train_time:1741246ms step_avg:994.43ms
step:1762/3000 train_time:1742251ms step_avg:994.44ms
step:1763/3000 train_time:1743250ms step_avg:994.44ms
step:1764/3000 train_time:1744267ms step_avg:994.45ms
step:1765/3000 train_time:1745259ms step_avg:994.45ms
step:1766/3000 train_time:1746256ms step_avg:994.45ms
step:1767/3000 train_time:1747280ms step_avg:994.47ms
step:1768/3000 train_time:1748268ms step_avg:994.46ms
step:1769/3000 train_time:1749264ms step_avg:994.47ms
step:1770/3000 train_time:1750254ms step_avg:994.46ms
step:1771/3000 train_time:1751242ms step_avg:994.46ms
step:1772/3000 train_time:1752235ms step_avg:994.46ms
step:1773/3000 train_time:1753218ms step_avg:994.45ms
step:1774/3000 train_time:1754209ms step_avg:994.45ms
step:1775/3000 train_time:1755197ms step_avg:994.45ms
step:1775/3000 val_loss:2.5573 train_time:1755236ms step_avg:994.47ms perplexity:12.9005 param_count:85,137,462
step:1776/3000 train_time:1756203ms step_avg:994.45ms
step:1777/3000 train_time:1757192ms step_avg:994.45ms
step:1778/3000 train_time:1758190ms step_avg:994.45ms
step:1779/3000 train_time:1759190ms step_avg:994.45ms
step:1780/3000 train_time:1760196ms step_avg:994.46ms
step:1781/3000 train_time:1761193ms step_avg:994.46ms
step:1782/3000 train_time:1762181ms step_avg:994.46ms
step:1783/3000 train_time:1763178ms step_avg:994.46ms
step:1784/3000 train_time:1764178ms step_avg:994.46ms
step:1785/3000 train_time:1765172ms step_avg:994.46ms
step:1786/3000 train_time:1766172ms step_avg:994.47ms
step:1787/3000 train_time:1767154ms step_avg:994.46ms
step:1788/3000 train_time:1768146ms step_avg:994.46ms
step:1789/3000 train_time:1769141ms step_avg:994.46ms
step:1790/3000 train_time:1770124ms step_avg:994.45ms
step:1791/3000 train_time:1771105ms step_avg:994.44ms
step:1792/3000 train_time:1772098ms step_avg:994.44ms
step:1793/3000 train_time:1773093ms step_avg:994.44ms
step:1794/3000 train_time:1774084ms step_avg:994.44ms
step:1795/3000 train_time:1775074ms step_avg:994.44ms
step:1796/3000 train_time:1776058ms step_avg:994.43ms
step:1797/3000 train_time:1777064ms step_avg:994.44ms
step:1798/3000 train_time:1778059ms step_avg:994.44ms
step:1799/3000 train_time:1779070ms step_avg:994.45ms
step:1800/3000 train_time:1780062ms step_avg:994.45ms
step:1800/3000 val_loss:2.5488 train_time:1780102ms step_avg:994.47ms perplexity:12.7915 param_count:85,137,462
step:1801/3000 train_time:1781070ms step_avg:994.46ms
step:1802/3000 train_time:1782060ms step_avg:994.45ms
step:1803/3000 train_time:1783073ms step_avg:994.46ms
step:1804/3000 train_time:1784080ms step_avg:994.47ms
step:1805/3000 train_time:1785086ms step_avg:994.48ms
step:1806/3000 train_time:1786080ms step_avg:994.48ms
step:1807/3000 train_time:1787074ms step_avg:994.48ms
step:1808/3000 train_time:1788075ms step_avg:994.48ms
step:1809/3000 train_time:1789077ms step_avg:994.48ms
step:1810/3000 train_time:1790066ms step_avg:994.48ms
step:1811/3000 train_time:1791063ms step_avg:994.48ms
step:1812/3000 train_time:1792066ms step_avg:994.49ms
step:1813/3000 train_time:1793057ms step_avg:994.49ms
step:1814/3000 train_time:1794047ms step_avg:994.48ms
step:1815/3000 train_time:1795038ms step_avg:994.48ms
step:1816/3000 train_time:1796031ms step_avg:994.48ms
step:1817/3000 train_time:1797021ms step_avg:994.48ms
step:1818/3000 train_time:1798023ms step_avg:994.48ms
step:1819/3000 train_time:1799011ms step_avg:994.48ms
step:1820/3000 train_time:1800019ms step_avg:994.49ms
step:1821/3000 train_time:1800997ms step_avg:994.48ms
step:1822/3000 train_time:1802017ms step_avg:994.49ms
step:1823/3000 train_time:1803011ms step_avg:994.49ms
step:1824/3000 train_time:1804008ms step_avg:994.49ms
step:1825/3000 train_time:1805020ms step_avg:994.50ms
step:1825/3000 val_loss:2.5542 train_time:1805061ms step_avg:994.52ms perplexity:12.8608 param_count:85,137,462
step:1826/3000 train_time:1805998ms step_avg:994.49ms
step:1827/3000 train_time:1806998ms step_avg:994.50ms
step:1828/3000 train_time:1808000ms step_avg:994.50ms
step:1829/3000 train_time:1809000ms step_avg:994.50ms
step:1830/3000 train_time:1809991ms step_avg:994.50ms
step:1831/3000 train_time:1810984ms step_avg:994.50ms
step:1832/3000 train_time:1811984ms step_avg:994.50ms
step:1833/3000 train_time:1812981ms step_avg:994.50ms
step:1834/3000 train_time:1813981ms step_avg:994.51ms
step:1835/3000 train_time:1814972ms step_avg:994.51ms
step:1836/3000 train_time:1815967ms step_avg:994.51ms
step:1837/3000 train_time:1816965ms step_avg:994.51ms
step:1838/3000 train_time:1817960ms step_avg:994.51ms
step:1839/3000 train_time:1818955ms step_avg:994.51ms
step:1840/3000 train_time:1819942ms step_avg:994.50ms
step:1841/3000 train_time:1820941ms step_avg:994.51ms
step:1842/3000 train_time:1821935ms step_avg:994.51ms
step:1843/3000 train_time:1822922ms step_avg:994.50ms
step:1844/3000 train_time:1823914ms step_avg:994.50ms
step:1845/3000 train_time:1824909ms step_avg:994.50ms
step:1846/3000 train_time:1825910ms step_avg:994.50ms
step:1847/3000 train_time:1826899ms step_avg:994.50ms
step:1848/3000 train_time:1827880ms step_avg:994.49ms
step:1849/3000 train_time:1828866ms step_avg:994.49ms
step:1850/3000 train_time:1829866ms step_avg:994.49ms
step:1850/3000 val_loss:2.5373 train_time:1829908ms step_avg:994.51ms perplexity:12.6459 param_count:85,137,462
step:1851/3000 train_time:1830845ms step_avg:994.48ms
step:1852/3000 train_time:1831858ms step_avg:994.49ms
step:1853/3000 train_time:1832847ms step_avg:994.49ms
step:1854/3000 train_time:1833831ms step_avg:994.49ms
step:1855/3000 train_time:1834829ms step_avg:994.49ms
step:1856/3000 train_time:1835815ms step_avg:994.48ms
step:1857/3000 train_time:1836819ms step_avg:994.49ms
step:1858/3000 train_time:1837812ms step_avg:994.49ms
step:1859/3000 train_time:1838833ms step_avg:994.50ms
step:1860/3000 train_time:1839831ms step_avg:994.50ms
step:1861/3000 train_time:1840828ms step_avg:994.50ms
step:1862/3000 train_time:1841824ms step_avg:994.51ms
step:1863/3000 train_time:1842832ms step_avg:994.51ms
step:1864/3000 train_time:1843819ms step_avg:994.51ms
step:1865/3000 train_time:1844821ms step_avg:994.51ms
step:1866/3000 train_time:1845801ms step_avg:994.51ms
step:1867/3000 train_time:1846789ms step_avg:994.50ms
step:1868/3000 train_time:1847794ms step_avg:994.51ms
step:1869/3000 train_time:1848779ms step_avg:994.50ms
step:1870/3000 train_time:1849772ms step_avg:994.50ms
step:1871/3000 train_time:1850776ms step_avg:994.51ms
step:1872/3000 train_time:1851813ms step_avg:994.53ms
step:1873/3000 train_time:1852797ms step_avg:994.52ms
step:1874/3000 train_time:1853779ms step_avg:994.52ms
step:1875/3000 train_time:1854764ms step_avg:994.51ms
step:1875/3000 val_loss:2.5440 train_time:1854804ms step_avg:994.53ms perplexity:12.7303 param_count:85,137,462
step:1876/3000 train_time:1855755ms step_avg:994.51ms
step:1877/3000 train_time:1856752ms step_avg:994.51ms
step:1878/3000 train_time:1857747ms step_avg:994.51ms
step:1879/3000 train_time:1858746ms step_avg:994.51ms
step:1880/3000 train_time:1859738ms step_avg:994.51ms
step:1881/3000 train_time:1860723ms step_avg:994.51ms
step:1882/3000 train_time:1861713ms step_avg:994.50ms
step:1883/3000 train_time:1862707ms step_avg:994.50ms
step:1884/3000 train_time:1863691ms step_avg:994.50ms
step:1885/3000 train_time:1864694ms step_avg:994.50ms
step:1886/3000 train_time:1865697ms step_avg:994.51ms
step:1887/3000 train_time:1866712ms step_avg:994.52ms
step:1888/3000 train_time:1867734ms step_avg:994.53ms
step:1889/3000 train_time:1868711ms step_avg:994.52ms
step:1890/3000 train_time:1869711ms step_avg:994.53ms
step:1891/3000 train_time:1870708ms step_avg:994.53ms
step:1892/3000 train_time:1871701ms step_avg:994.53ms
step:1893/3000 train_time:1872702ms step_avg:994.53ms
step:1894/3000 train_time:1873697ms step_avg:994.53ms
step:1895/3000 train_time:1874691ms step_avg:994.53ms
step:1896/3000 train_time:1875672ms step_avg:994.52ms
step:1897/3000 train_time:1876669ms step_avg:994.52ms
step:1898/3000 train_time:1877661ms step_avg:994.52ms
step:1899/3000 train_time:1878669ms step_avg:994.53ms
step:1900/3000 train_time:1879670ms step_avg:994.53ms
step:1900/3000 val_loss:2.5574 train_time:1879710ms step_avg:994.56ms perplexity:12.9026 param_count:85,137,462
step:1901/3000 train_time:1880664ms step_avg:994.53ms
step:1902/3000 train_time:1881657ms step_avg:994.53ms
step:1903/3000 train_time:1882655ms step_avg:994.53ms
step:1904/3000 train_time:1883637ms step_avg:994.53ms
step:1905/3000 train_time:1884625ms step_avg:994.53ms
step:1906/3000 train_time:1885638ms step_avg:994.54ms
step:1907/3000 train_time:1886633ms step_avg:994.54ms
step:1908/3000 train_time:1887628ms step_avg:994.54ms
step:1909/3000 train_time:1888634ms step_avg:994.54ms
step:1910/3000 train_time:1889626ms step_avg:994.54ms
step:1911/3000 train_time:1890635ms step_avg:994.55ms
step:1912/3000 train_time:1891622ms step_avg:994.54ms
step:1913/3000 train_time:1892624ms step_avg:994.55ms
step:1914/3000 train_time:1893627ms step_avg:994.55ms
step:1915/3000 train_time:1894620ms step_avg:994.55ms
step:1916/3000 train_time:1895611ms step_avg:994.55ms
step:1917/3000 train_time:1896610ms step_avg:994.55ms
step:1918/3000 train_time:1897610ms step_avg:994.55ms
step:1919/3000 train_time:1898604ms step_avg:994.55ms
step:1920/3000 train_time:1899599ms step_avg:994.55ms
step:1921/3000 train_time:1900601ms step_avg:994.56ms
step:1922/3000 train_time:1901602ms step_avg:994.56ms
step:1923/3000 train_time:1902597ms step_avg:994.56ms
step:1924/3000 train_time:1903590ms step_avg:994.56ms
step:1925/3000 train_time:1904577ms step_avg:994.56ms
step:1925/3000 val_loss:2.5514 train_time:1904618ms step_avg:994.58ms perplexity:12.8248 param_count:85,137,462
step:1926/3000 train_time:1905576ms step_avg:994.56ms
step:1927/3000 train_time:1906583ms step_avg:994.57ms
step:1928/3000 train_time:1907576ms step_avg:994.57ms
step:1929/3000 train_time:1908576ms step_avg:994.57ms
step:1930/3000 train_time:1909576ms step_avg:994.57ms
step:1931/3000 train_time:1910570ms step_avg:994.57ms
step:1932/3000 train_time:1911556ms step_avg:994.57ms
step:1933/3000 train_time:1912556ms step_avg:994.57ms
step:1934/3000 train_time:1913544ms step_avg:994.57ms
step:1935/3000 train_time:1914540ms step_avg:994.57ms
step:1936/3000 train_time:1915521ms step_avg:994.56ms
step:1937/3000 train_time:1916514ms step_avg:994.56ms
step:1938/3000 train_time:1917523ms step_avg:994.57ms
step:1939/3000 train_time:1918519ms step_avg:994.57ms
step:1940/3000 train_time:1919511ms step_avg:994.57ms
step:1941/3000 train_time:1920509ms step_avg:994.57ms
step:1942/3000 train_time:1921499ms step_avg:994.56ms
step:1943/3000 train_time:1922493ms step_avg:994.56ms
step:1944/3000 train_time:1923491ms step_avg:994.57ms
step:1945/3000 train_time:1924476ms step_avg:994.56ms
step:1946/3000 train_time:1925471ms step_avg:994.56ms
step:1947/3000 train_time:1926468ms step_avg:994.56ms
step:1948/3000 train_time:1927471ms step_avg:994.57ms
step:1949/3000 train_time:1928463ms step_avg:994.57ms
step:1950/3000 train_time:1929451ms step_avg:994.56ms
step:1950/3000 val_loss:2.5519 train_time:1929490ms step_avg:994.58ms perplexity:12.8320 param_count:85,137,462
step:1951/3000 train_time:1930433ms step_avg:994.56ms
step:1952/3000 train_time:1931430ms step_avg:994.56ms
step:1953/3000 train_time:1932431ms step_avg:994.56ms
step:1954/3000 train_time:1933425ms step_avg:994.56ms
step:1955/3000 train_time:1934414ms step_avg:994.56ms
step:1956/3000 train_time:1935418ms step_avg:994.56ms
step:1957/3000 train_time:1936409ms step_avg:994.56ms
step:1958/3000 train_time:1937396ms step_avg:994.56ms
step:1959/3000 train_time:1938383ms step_avg:994.55ms
step:1960/3000 train_time:1939390ms step_avg:994.56ms
step:1961/3000 train_time:1940379ms step_avg:994.56ms
step:1962/3000 train_time:1941375ms step_avg:994.56ms
step:1963/3000 train_time:1942370ms step_avg:994.56ms
step:1964/3000 train_time:1943366ms step_avg:994.56ms
step:1965/3000 train_time:1944362ms step_avg:994.56ms
step:1966/3000 train_time:1945364ms step_avg:994.56ms
step:1967/3000 train_time:1946358ms step_avg:994.56ms
step:1968/3000 train_time:1947354ms step_avg:994.56ms
step:1969/3000 train_time:1948355ms step_avg:994.57ms
step:1970/3000 train_time:1949356ms step_avg:994.57ms
step:1971/3000 train_time:1950346ms step_avg:994.57ms
step:1972/3000 train_time:1951355ms step_avg:994.57ms
step:1973/3000 train_time:1952354ms step_avg:994.58ms
step:1974/3000 train_time:1953354ms step_avg:994.58ms
step:1975/3000 train_time:1954344ms step_avg:994.58ms
step:1975/3000 val_loss:2.5504 train_time:1954384ms step_avg:994.60ms perplexity:12.8127 param_count:85,137,462
step:1976/3000 train_time:1955329ms step_avg:994.57ms
step:1977/3000 train_time:1956329ms step_avg:994.57ms
step:1978/3000 train_time:1957317ms step_avg:994.57ms
step:1979/3000 train_time:1958334ms step_avg:994.58ms
step:1980/3000 train_time:1959325ms step_avg:994.58ms
step:1981/3000 train_time:1960335ms step_avg:994.59ms
step:1982/3000 train_time:1961325ms step_avg:994.59ms
step:1983/3000 train_time:1962314ms step_avg:994.58ms
step:1984/3000 train_time:1963303ms step_avg:994.58ms
step:1985/3000 train_time:1964304ms step_avg:994.58ms
step:1986/3000 train_time:1965295ms step_avg:994.58ms
step:1987/3000 train_time:1966301ms step_avg:994.59ms
step:1988/3000 train_time:1967299ms step_avg:994.59ms
step:1989/3000 train_time:1968288ms step_avg:994.59ms
step:1990/3000 train_time:1969290ms step_avg:994.59ms
step:1991/3000 train_time:1970281ms step_avg:994.59ms
step:1992/3000 train_time:1971273ms step_avg:994.59ms
step:1993/3000 train_time:1972279ms step_avg:994.59ms
step:1994/3000 train_time:1973273ms step_avg:994.59ms
step:1995/3000 train_time:1974262ms step_avg:994.59ms
step:1996/3000 train_time:1975267ms step_avg:994.60ms
step:1997/3000 train_time:1976266ms step_avg:994.60ms
step:1998/3000 train_time:1977259ms step_avg:994.60ms
step:1999/3000 train_time:1978252ms step_avg:994.60ms
step:2000/3000 train_time:1979242ms step_avg:994.59ms
step:2000/3000 val_loss:2.5368 train_time:1979282ms step_avg:994.61ms perplexity:12.6391 param_count:85,137,462
step:2001/3000 train_time:1980237ms step_avg:994.59ms
step:2002/3000 train_time:1981227ms step_avg:994.59ms
step:2003/3000 train_time:1982209ms step_avg:994.59ms
step:2004/3000 train_time:1983196ms step_avg:994.58ms
step:2005/3000 train_time:1984188ms step_avg:994.58ms
step:2006/3000 train_time:1985182ms step_avg:994.58ms
step:2007/3000 train_time:1986178ms step_avg:994.58ms
step:2008/3000 train_time:1987208ms step_avg:994.60ms
step:2009/3000 train_time:1988213ms step_avg:994.60ms
step:2010/3000 train_time:1989205ms step_avg:994.60ms
step:2011/3000 train_time:1990217ms step_avg:994.61ms
step:2012/3000 train_time:1991201ms step_avg:994.61ms
step:2013/3000 train_time:1992189ms step_avg:994.60ms
step:2014/3000 train_time:1993173ms step_avg:994.60ms
step:2015/3000 train_time:1994168ms step_avg:994.60ms
step:2016/3000 train_time:1995174ms step_avg:994.60ms
step:2017/3000 train_time:1996180ms step_avg:994.61ms
step:2018/3000 train_time:1997172ms step_avg:994.61ms
step:2019/3000 train_time:1998177ms step_avg:994.61ms
step:2020/3000 train_time:1999177ms step_avg:994.62ms
step:2021/3000 train_time:2000174ms step_avg:994.62ms
step:2022/3000 train_time:2001176ms step_avg:994.62ms
step:2023/3000 train_time:2002174ms step_avg:994.62ms
step:2024/3000 train_time:2003167ms step_avg:994.62ms
step:2025/3000 train_time:2004155ms step_avg:994.62ms
step:2025/3000 val_loss:2.5336 train_time:2004196ms step_avg:994.64ms perplexity:12.5990 param_count:85,137,462
step:2026/3000 train_time:2005134ms step_avg:994.61ms
step:2027/3000 train_time:2006131ms step_avg:994.61ms
step:2028/3000 train_time:2007131ms step_avg:994.61ms
step:2029/3000 train_time:2008124ms step_avg:994.61ms
step:2030/3000 train_time:2009128ms step_avg:994.62ms
step:2031/3000 train_time:2010123ms step_avg:994.62ms
step:2032/3000 train_time:2011112ms step_avg:994.62ms
step:2033/3000 train_time:2012133ms step_avg:994.63ms
step:2034/3000 train_time:2013136ms step_avg:994.63ms
step:2035/3000 train_time:2014137ms step_avg:994.64ms
step:2036/3000 train_time:2015128ms step_avg:994.63ms
step:2037/3000 train_time:2016127ms step_avg:994.64ms
step:2038/3000 train_time:2017117ms step_avg:994.63ms
step:2039/3000 train_time:2018111ms step_avg:994.63ms
step:2040/3000 train_time:2019099ms step_avg:994.63ms
step:2041/3000 train_time:2020102ms step_avg:994.63ms
step:2042/3000 train_time:2021085ms step_avg:994.63ms
step:2043/3000 train_time:2022082ms step_avg:994.63ms
step:2044/3000 train_time:2023087ms step_avg:994.63ms
step:2045/3000 train_time:2024102ms step_avg:994.64ms
step:2046/3000 train_time:2025098ms step_avg:994.65ms
step:2047/3000 train_time:2026088ms step_avg:994.64ms
step:2048/3000 train_time:2027074ms step_avg:994.64ms
step:2049/3000 train_time:2028059ms step_avg:994.63ms
step:2050/3000 train_time:2029055ms step_avg:994.63ms
step:2050/3000 val_loss:2.5394 train_time:2029096ms step_avg:994.66ms perplexity:12.6721 param_count:85,137,462
step:2051/3000 train_time:2030067ms step_avg:994.64ms
step:2052/3000 train_time:2031071ms step_avg:994.65ms
step:2053/3000 train_time:2032071ms step_avg:994.65ms
step:2054/3000 train_time:2033074ms step_avg:994.65ms
step:2055/3000 train_time:2034068ms step_avg:994.65ms
step:2056/3000 train_time:2035071ms step_avg:994.66ms
step:2057/3000 train_time:2036055ms step_avg:994.65ms
step:2058/3000 train_time:2037049ms step_avg:994.65ms
step:2059/3000 train_time:2038052ms step_avg:994.66ms
step:2060/3000 train_time:2039047ms step_avg:994.66ms
step:2061/3000 train_time:2040048ms step_avg:994.66ms
step:2062/3000 train_time:2041068ms step_avg:994.67ms
step:2063/3000 train_time:2042068ms step_avg:994.68ms
step:2064/3000 train_time:2043063ms step_avg:994.68ms
step:2065/3000 train_time:2044062ms step_avg:994.68ms
step:2066/3000 train_time:2045056ms step_avg:994.68ms
step:2067/3000 train_time:2046049ms step_avg:994.68ms
step:2068/3000 train_time:2047043ms step_avg:994.68ms
step:2069/3000 train_time:2048039ms step_avg:994.68ms
step:2070/3000 train_time:2049032ms step_avg:994.68ms
step:2071/3000 train_time:2050045ms step_avg:994.68ms
step:2072/3000 train_time:2051031ms step_avg:994.68ms
step:2073/3000 train_time:2052028ms step_avg:994.68ms
step:2074/3000 train_time:2053032ms step_avg:994.69ms
step:2075/3000 train_time:2054019ms step_avg:994.68ms
step:2075/3000 val_loss:2.5415 train_time:2054060ms step_avg:994.70ms perplexity:12.6981 param_count:85,137,462
step:2076/3000 train_time:2055011ms step_avg:994.68ms
step:2077/3000 train_time:2056010ms step_avg:994.68ms
step:2078/3000 train_time:2057001ms step_avg:994.68ms
step:2079/3000 train_time:2057994ms step_avg:994.68ms
step:2080/3000 train_time:2058999ms step_avg:994.69ms
step:2081/3000 train_time:2060013ms step_avg:994.69ms
step:2082/3000 train_time:2061010ms step_avg:994.70ms
step:2083/3000 train_time:2062008ms step_avg:994.70ms
step:2084/3000 train_time:2063014ms step_avg:994.70ms
step:2085/3000 train_time:2064015ms step_avg:994.71ms
step:2086/3000 train_time:2065013ms step_avg:994.71ms
step:2087/3000 train_time:2066013ms step_avg:994.71ms
step:2088/3000 train_time:2067008ms step_avg:994.71ms
step:2089/3000 train_time:2068007ms step_avg:994.71ms
step:2090/3000 train_time:2069002ms step_avg:994.71ms
step:2091/3000 train_time:2070025ms step_avg:994.73ms
step:2092/3000 train_time:2071008ms step_avg:994.72ms
step:2093/3000 train_time:2072026ms step_avg:994.73ms
step:2094/3000 train_time:2073023ms step_avg:994.73ms
step:2095/3000 train_time:2074019ms step_avg:994.73ms
step:2096/3000 train_time:2075010ms step_avg:994.73ms
step:2097/3000 train_time:2075997ms step_avg:994.73ms
step:2098/3000 train_time:2076993ms step_avg:994.73ms
step:2099/3000 train_time:2077983ms step_avg:994.73ms
step:2100/3000 train_time:2078965ms step_avg:994.72ms
step:2100/3000 val_loss:2.5450 train_time:2079005ms step_avg:994.74ms perplexity:12.7427 param_count:85,137,462
step:2101/3000 train_time:2079960ms step_avg:994.72ms
step:2102/3000 train_time:2080965ms step_avg:994.73ms
step:2103/3000 train_time:2081972ms step_avg:994.73ms
step:2104/3000 train_time:2082973ms step_avg:994.73ms
step:2105/3000 train_time:2083964ms step_avg:994.73ms
step:2106/3000 train_time:2084957ms step_avg:994.73ms
step:2107/3000 train_time:2085968ms step_avg:994.74ms
step:2108/3000 train_time:2086958ms step_avg:994.74ms
step:2109/3000 train_time:2087951ms step_avg:994.74ms
step:2110/3000 train_time:2088943ms step_avg:994.73ms
step:2111/3000 train_time:2089936ms step_avg:994.73ms
step:2112/3000 train_time:2090934ms step_avg:994.74ms
step:2113/3000 train_time:2091921ms step_avg:994.73ms
step:2114/3000 train_time:2092926ms step_avg:994.74ms
step:2115/3000 train_time:2093920ms step_avg:994.74ms
step:2116/3000 train_time:2094927ms step_avg:994.74ms
step:2117/3000 train_time:2095932ms step_avg:994.75ms
step:2118/3000 train_time:2096922ms step_avg:994.74ms
step:2119/3000 train_time:2097912ms step_avg:994.74ms
step:2120/3000 train_time:2098904ms step_avg:994.74ms
step:2121/3000 train_time:2099893ms step_avg:994.74ms
step:2122/3000 train_time:2100889ms step_avg:994.74ms
step:2123/3000 train_time:2101881ms step_avg:994.74ms
step:2124/3000 train_time:2102876ms step_avg:994.74ms
step:2125/3000 train_time:2103870ms step_avg:994.74ms
step:2125/3000 val_loss:2.5367 train_time:2103910ms step_avg:994.76ms perplexity:12.6379 param_count:85,137,462
step:2126/3000 train_time:2104852ms step_avg:994.73ms
step:2127/3000 train_time:2105850ms step_avg:994.73ms
step:2128/3000 train_time:2106850ms step_avg:994.74ms
step:2129/3000 train_time:2107855ms step_avg:994.74ms
step:2130/3000 train_time:2108862ms step_avg:994.75ms
step:2131/3000 train_time:2109890ms step_avg:994.76ms
step:2132/3000 train_time:2110875ms step_avg:994.76ms
step:2133/3000 train_time:2111898ms step_avg:994.77ms
step:2134/3000 train_time:2112890ms step_avg:994.77ms
step:2135/3000 train_time:2113887ms step_avg:994.77ms
step:2136/3000 train_time:2114883ms step_avg:994.77ms
step:2137/3000 train_time:2115874ms step_avg:994.77ms
step:2138/3000 train_time:2116863ms step_avg:994.77ms
step:2139/3000 train_time:2117852ms step_avg:994.76ms
step:2140/3000 train_time:2118845ms step_avg:994.76ms
step:2141/3000 train_time:2119841ms step_avg:994.76ms
step:2142/3000 train_time:2120824ms step_avg:994.76ms
step:2143/3000 train_time:2121817ms step_avg:994.76ms
step:2144/3000 train_time:2122821ms step_avg:994.76ms
step:2145/3000 train_time:2123812ms step_avg:994.76ms
step:2146/3000 train_time:2124806ms step_avg:994.76ms
step:2147/3000 train_time:2125805ms step_avg:994.76ms
step:2148/3000 train_time:2126832ms step_avg:994.78ms
step:2149/3000 train_time:2127824ms step_avg:994.78ms
step:2150/3000 train_time:2128832ms step_avg:994.78ms
step:2150/3000 val_loss:2.5304 train_time:2128872ms step_avg:994.80ms perplexity:12.5583 param_count:85,137,462
step:2151/3000 train_time:2129818ms step_avg:994.78ms
step:2152/3000 train_time:2130812ms step_avg:994.78ms
step:2153/3000 train_time:2131805ms step_avg:994.78ms
step:2154/3000 train_time:2132791ms step_avg:994.77ms
step:2155/3000 train_time:2133783ms step_avg:994.77ms
step:2156/3000 train_time:2134782ms step_avg:994.77ms
step:2157/3000 train_time:2135771ms step_avg:994.77ms
step:2158/3000 train_time:2136768ms step_avg:994.77ms
step:2159/3000 train_time:2137767ms step_avg:994.77ms
step:2160/3000 train_time:2138763ms step_avg:994.77ms
step:2161/3000 train_time:2139772ms step_avg:994.78ms
step:2162/3000 train_time:2140771ms step_avg:994.78ms
step:2163/3000 train_time:2141759ms step_avg:994.78ms
step:2164/3000 train_time:2142749ms step_avg:994.78ms
step:2165/3000 train_time:2143735ms step_avg:994.77ms
step:2166/3000 train_time:2144729ms step_avg:994.77ms
step:2167/3000 train_time:2145738ms step_avg:994.78ms
step:2168/3000 train_time:2146731ms step_avg:994.78ms
step:2169/3000 train_time:2147724ms step_avg:994.78ms
step:2170/3000 train_time:2148715ms step_avg:994.78ms
step:2171/3000 train_time:2149714ms step_avg:994.78ms
step:2172/3000 train_time:2150711ms step_avg:994.78ms
step:2173/3000 train_time:2151698ms step_avg:994.77ms
step:2174/3000 train_time:2152705ms step_avg:994.78ms
step:2175/3000 train_time:2153705ms step_avg:994.78ms
step:2175/3000 val_loss:2.5303 train_time:2153745ms step_avg:994.80ms perplexity:12.5573 param_count:85,137,462
step:2176/3000 train_time:2154690ms step_avg:994.78ms
step:2177/3000 train_time:2155687ms step_avg:994.78ms
step:2178/3000 train_time:2156685ms step_avg:994.78ms
step:2179/3000 train_time:2157681ms step_avg:994.78ms
step:2180/3000 train_time:2158680ms step_avg:994.78ms
step:2181/3000 train_time:2159681ms step_avg:994.79ms
step:2182/3000 train_time:2160678ms step_avg:994.79ms
step:2183/3000 train_time:2161674ms step_avg:994.79ms
step:2184/3000 train_time:2162659ms step_avg:994.78ms
step:2185/3000 train_time:2163681ms step_avg:994.80ms
step:2186/3000 train_time:2164674ms step_avg:994.80ms
step:2187/3000 train_time:2165679ms step_avg:994.80ms
step:2188/3000 train_time:2166678ms step_avg:994.80ms
step:2189/3000 train_time:2167678ms step_avg:994.80ms
step:2190/3000 train_time:2168658ms step_avg:994.80ms
step:2191/3000 train_time:2169641ms step_avg:994.79ms
step:2192/3000 train_time:2170640ms step_avg:994.79ms
step:2193/3000 train_time:2171627ms step_avg:994.79ms
step:2194/3000 train_time:2172614ms step_avg:994.79ms
step:2195/3000 train_time:2173631ms step_avg:994.80ms
step:2196/3000 train_time:2174646ms step_avg:994.81ms
step:2197/3000 train_time:2175638ms step_avg:994.80ms
step:2198/3000 train_time:2176625ms step_avg:994.80ms
step:2199/3000 train_time:2177617ms step_avg:994.80ms
step:2200/3000 train_time:2178632ms step_avg:994.81ms
step:2200/3000 val_loss:2.5300 train_time:2178673ms step_avg:994.83ms perplexity:12.5541 param_count:85,137,462
step:2201/3000 train_time:2179635ms step_avg:994.81ms
step:2202/3000 train_time:2180635ms step_avg:994.82ms
step:2203/3000 train_time:2181632ms step_avg:994.82ms
step:2204/3000 train_time:2182629ms step_avg:994.82ms
step:2205/3000 train_time:2183624ms step_avg:994.82ms
step:2206/3000 train_time:2184616ms step_avg:994.82ms
step:2207/3000 train_time:2185612ms step_avg:994.82ms
step:2208/3000 train_time:2186601ms step_avg:994.81ms
step:2209/3000 train_time:2187600ms step_avg:994.82ms
step:2210/3000 train_time:2188589ms step_avg:994.81ms
step:2211/3000 train_time:2189600ms step_avg:994.82ms
step:2212/3000 train_time:2190589ms step_avg:994.82ms
step:2213/3000 train_time:2191575ms step_avg:994.81ms
step:2214/3000 train_time:2192570ms step_avg:994.81ms
step:2215/3000 train_time:2193567ms step_avg:994.82ms
step:2216/3000 train_time:2194563ms step_avg:994.82ms
step:2217/3000 train_time:2195547ms step_avg:994.81ms
step:2218/3000 train_time:2196537ms step_avg:994.81ms
step:2219/3000 train_time:2197533ms step_avg:994.81ms
step:2220/3000 train_time:2198545ms step_avg:994.82ms
step:2221/3000 train_time:2199543ms step_avg:994.82ms
step:2222/3000 train_time:2200535ms step_avg:994.82ms
step:2223/3000 train_time:2201515ms step_avg:994.81ms
step:2224/3000 train_time:2202505ms step_avg:994.81ms
step:2225/3000 train_time:2203516ms step_avg:994.82ms
step:2225/3000 val_loss:2.5276 train_time:2203556ms step_avg:994.83ms perplexity:12.5232 param_count:85,137,462
step:2226/3000 train_time:2204528ms step_avg:994.82ms
step:2227/3000 train_time:2205529ms step_avg:994.83ms
step:2228/3000 train_time:2206548ms step_avg:994.84ms
step:2229/3000 train_time:2207553ms step_avg:994.84ms
step:2230/3000 train_time:2208552ms step_avg:994.84ms
step:2231/3000 train_time:2209551ms step_avg:994.85ms
step:2232/3000 train_time:2210535ms step_avg:994.84ms
step:2233/3000 train_time:2211529ms step_avg:994.84ms
step:2234/3000 train_time:2212529ms step_avg:994.84ms
step:2235/3000 train_time:2213515ms step_avg:994.84ms
step:2236/3000 train_time:2214502ms step_avg:994.83ms
step:2237/3000 train_time:2215505ms step_avg:994.84ms
step:2238/3000 train_time:2216501ms step_avg:994.84ms
step:2239/3000 train_time:2217512ms step_avg:994.85ms
step:2240/3000 train_time:2218521ms step_avg:994.85ms
step:2241/3000 train_time:2219524ms step_avg:994.86ms
step:2242/3000 train_time:2220519ms step_avg:994.86ms
step:2243/3000 train_time:2221537ms step_avg:994.87ms
step:2244/3000 train_time:2222524ms step_avg:994.86ms
step:2245/3000 train_time:2223518ms step_avg:994.86ms
step:2246/3000 train_time:2224531ms step_avg:994.87ms
step:2247/3000 train_time:2225520ms step_avg:994.87ms
step:2248/3000 train_time:2226514ms step_avg:994.87ms
step:2249/3000 train_time:2227500ms step_avg:994.86ms
step:2250/3000 train_time:2228497ms step_avg:994.86ms
step:2250/3000 val_loss:2.5356 train_time:2228539ms step_avg:994.88ms perplexity:12.6242 param_count:85,137,462
step:2251/3000 train_time:2229479ms step_avg:994.86ms
step:2252/3000 train_time:2230467ms step_avg:994.86ms
step:2253/3000 train_time:2231455ms step_avg:994.85ms
step:2254/3000 train_time:2232451ms step_avg:994.85ms
step:2255/3000 train_time:2233441ms step_avg:994.85ms
step:2256/3000 train_time:2234450ms step_avg:994.86ms
step:2257/3000 train_time:2235456ms step_avg:994.86ms
step:2258/3000 train_time:2236449ms step_avg:994.86ms
step:2259/3000 train_time:2237443ms step_avg:994.86ms
step:2260/3000 train_time:2238468ms step_avg:994.87ms
step:2261/3000 train_time:2239463ms step_avg:994.87ms
step:2262/3000 train_time:2240452ms step_avg:994.87ms
step:2263/3000 train_time:2241435ms step_avg:994.87ms
step:2264/3000 train_time:2242445ms step_avg:994.87ms
step:2265/3000 train_time:2243454ms step_avg:994.88ms
step:2266/3000 train_time:2244460ms step_avg:994.88ms
step:2267/3000 train_time:2245457ms step_avg:994.89ms
step:2268/3000 train_time:2246459ms step_avg:994.89ms
step:2269/3000 train_time:2247449ms step_avg:994.89ms
step:2270/3000 train_time:2248462ms step_avg:994.89ms
step:2271/3000 train_time:2249451ms step_avg:994.89ms
step:2272/3000 train_time:2250442ms step_avg:994.89ms
step:2273/3000 train_time:2251476ms step_avg:994.91ms
step:2274/3000 train_time:2252466ms step_avg:994.91ms
step:2275/3000 train_time:2253458ms step_avg:994.90ms
step:2275/3000 val_loss:2.5314 train_time:2253500ms step_avg:994.92ms perplexity:12.5715 param_count:85,137,462
step:2276/3000 train_time:2254456ms step_avg:994.91ms
step:2277/3000 train_time:2255467ms step_avg:994.91ms
step:2278/3000 train_time:2256460ms step_avg:994.91ms
step:2279/3000 train_time:2257486ms step_avg:994.93ms
step:2280/3000 train_time:2258490ms step_avg:994.93ms
step:2281/3000 train_time:2259475ms step_avg:994.93ms
step:2282/3000 train_time:2260465ms step_avg:994.92ms
step:2283/3000 train_time:2261455ms step_avg:994.92ms
step:2284/3000 train_time:2262453ms step_avg:994.92ms
step:2285/3000 train_time:2263450ms step_avg:994.92ms
step:2286/3000 train_time:2264447ms step_avg:994.92ms
step:2287/3000 train_time:2265463ms step_avg:994.93ms
step:2288/3000 train_time:2266462ms step_avg:994.94ms
step:2289/3000 train_time:2267458ms step_avg:994.94ms
step:2290/3000 train_time:2268452ms step_avg:994.93ms
step:2291/3000 train_time:2269452ms step_avg:994.94ms
step:2292/3000 train_time:2270445ms step_avg:994.94ms
step:2293/3000 train_time:2271433ms step_avg:994.93ms
step:2294/3000 train_time:2272433ms step_avg:994.94ms
step:2295/3000 train_time:2273426ms step_avg:994.93ms
step:2296/3000 train_time:2274437ms step_avg:994.94ms
step:2297/3000 train_time:2275439ms step_avg:994.94ms
step:2298/3000 train_time:2276437ms step_avg:994.95ms
step:2299/3000 train_time:2277422ms step_avg:994.94ms
step:2300/3000 train_time:2278424ms step_avg:994.95ms
step:2300/3000 val_loss:2.5293 train_time:2278465ms step_avg:994.96ms perplexity:12.5443 param_count:85,137,462
step:2301/3000 train_time:2279417ms step_avg:994.94ms
step:2302/3000 train_time:2280406ms step_avg:994.94ms
step:2303/3000 train_time:2281403ms step_avg:994.94ms
step:2304/3000 train_time:2282391ms step_avg:994.94ms
step:2305/3000 train_time:2283391ms step_avg:994.94ms
step:2306/3000 train_time:2284391ms step_avg:994.94ms
step:2307/3000 train_time:2285400ms step_avg:994.95ms
step:2308/3000 train_time:2286397ms step_avg:994.95ms
step:2309/3000 train_time:2287386ms step_avg:994.95ms
step:2310/3000 train_time:2288390ms step_avg:994.95ms
step:2311/3000 train_time:2289392ms step_avg:994.96ms
step:2312/3000 train_time:2290390ms step_avg:994.96ms
step:2313/3000 train_time:2291382ms step_avg:994.96ms
step:2314/3000 train_time:2292386ms step_avg:994.96ms
step:2315/3000 train_time:2293378ms step_avg:994.96ms
step:2316/3000 train_time:2294407ms step_avg:994.97ms
step:2317/3000 train_time:2295405ms step_avg:994.97ms
step:2318/3000 train_time:2296407ms step_avg:994.98ms
step:2319/3000 train_time:2297401ms step_avg:994.98ms
step:2320/3000 train_time:2298404ms step_avg:994.98ms
step:2321/3000 train_time:2299390ms step_avg:994.98ms
step:2322/3000 train_time:2300381ms step_avg:994.97ms
step:2323/3000 train_time:2301400ms step_avg:994.98ms
step:2324/3000 train_time:2302423ms step_avg:995.00ms
step:2325/3000 train_time:2303412ms step_avg:994.99ms
step:2325/3000 val_loss:2.5377 train_time:2303451ms step_avg:995.01ms perplexity:12.6509 param_count:85,137,462
step:2326/3000 train_time:2304409ms step_avg:995.00ms
step:2327/3000 train_time:2305411ms step_avg:995.00ms
step:2328/3000 train_time:2306406ms step_avg:995.00ms
step:2329/3000 train_time:2307433ms step_avg:995.01ms
step:2330/3000 train_time:2308437ms step_avg:995.02ms
step:2331/3000 train_time:2309442ms step_avg:995.02ms
step:2332/3000 train_time:2310428ms step_avg:995.02ms
step:2333/3000 train_time:2311423ms step_avg:995.02ms
step:2334/3000 train_time:2312428ms step_avg:995.02ms
step:2335/3000 train_time:2313432ms step_avg:995.02ms
step:2336/3000 train_time:2314431ms step_avg:995.03ms
step:2337/3000 train_time:2315456ms step_avg:995.04ms
step:2338/3000 train_time:2316479ms step_avg:995.05ms
step:2339/3000 train_time:2317477ms step_avg:995.05ms
step:2340/3000 train_time:2318476ms step_avg:995.05ms
step:2341/3000 train_time:2319470ms step_avg:995.05ms
step:2342/3000 train_time:2320468ms step_avg:995.05ms
step:2343/3000 train_time:2321474ms step_avg:995.06ms
step:2344/3000 train_time:2322473ms step_avg:995.06ms
step:2345/3000 train_time:2323476ms step_avg:995.06ms
step:2346/3000 train_time:2324484ms step_avg:995.07ms
step:2347/3000 train_time:2325471ms step_avg:995.07ms
step:2348/3000 train_time:2326465ms step_avg:995.07ms
step:2349/3000 train_time:2327453ms step_avg:995.06ms
step:2350/3000 train_time:2328446ms step_avg:995.06ms
step:2350/3000 val_loss:2.5314 train_time:2328484ms step_avg:995.08ms perplexity:12.5712 param_count:85,137,462
step:2351/3000 train_time:2329432ms step_avg:995.06ms
step:2352/3000 train_time:2330415ms step_avg:995.05ms
step:2353/3000 train_time:2331416ms step_avg:995.06ms
step:2354/3000 train_time:2332407ms step_avg:995.05ms
step:2355/3000 train_time:2333401ms step_avg:995.05ms
step:2356/3000 train_time:2334404ms step_avg:995.06ms
step:2357/3000 train_time:2335390ms step_avg:995.05ms
step:2358/3000 train_time:2336383ms step_avg:995.05ms
step:2359/3000 train_time:2337384ms step_avg:995.06ms
step:2360/3000 train_time:2338373ms step_avg:995.05ms
step:2361/3000 train_time:2339382ms step_avg:995.06ms
step:2362/3000 train_time:2340375ms step_avg:995.06ms
step:2363/3000 train_time:2341379ms step_avg:995.06ms
step:2364/3000 train_time:2342371ms step_avg:995.06ms
step:2365/3000 train_time:2343395ms step_avg:995.07ms
step:2366/3000 train_time:2344377ms step_avg:995.07ms
step:2367/3000 train_time:2345373ms step_avg:995.07ms
step:2368/3000 train_time:2346375ms step_avg:995.07ms
step:2369/3000 train_time:2347369ms step_avg:995.07ms
step:2370/3000 train_time:2348360ms step_avg:995.07ms
step:2371/3000 train_time:2349362ms step_avg:995.07ms
step:2372/3000 train_time:2350359ms step_avg:995.07ms
step:2373/3000 train_time:2351357ms step_avg:995.07ms
step:2374/3000 train_time:2352363ms step_avg:995.08ms
step:2375/3000 train_time:2353366ms step_avg:995.08ms
step:2375/3000 val_loss:2.5128 train_time:2353407ms step_avg:995.10ms perplexity:12.3400 param_count:85,137,462
step:2376/3000 train_time:2354352ms step_avg:995.08ms
step:2377/3000 train_time:2355346ms step_avg:995.08ms
step:2378/3000 train_time:2356347ms step_avg:995.08ms
step:2379/3000 train_time:2357343ms step_avg:995.08ms
step:2380/3000 train_time:2358320ms step_avg:995.07ms
step:2381/3000 train_time:2359318ms step_avg:995.07ms
step:2382/3000 train_time:2360372ms step_avg:995.10ms
step:2383/3000 train_time:2361365ms step_avg:995.10ms
step:2384/3000 train_time:2362365ms step_avg:995.10ms
step:2385/3000 train_time:2363379ms step_avg:995.11ms
step:2386/3000 train_time:2364385ms step_avg:995.11ms
step:2387/3000 train_time:2365381ms step_avg:995.11ms
step:2388/3000 train_time:2366376ms step_avg:995.11ms
step:2389/3000 train_time:2367366ms step_avg:995.11ms
step:2390/3000 train_time:2368359ms step_avg:995.11ms
step:2391/3000 train_time:2369342ms step_avg:995.10ms
step:2392/3000 train_time:2370341ms step_avg:995.11ms
step:2393/3000 train_time:2371328ms step_avg:995.10ms
step:2394/3000 train_time:2372317ms step_avg:995.10ms
step:2395/3000 train_time:2373321ms step_avg:995.10ms
step:2396/3000 train_time:2374329ms step_avg:995.11ms
step:2397/3000 train_time:2375329ms step_avg:995.11ms
step:2398/3000 train_time:2376341ms step_avg:995.12ms
step:2399/3000 train_time:2377349ms step_avg:995.12ms
step:2400/3000 train_time:2378345ms step_avg:995.12ms
step:2400/3000 val_loss:2.5200 train_time:2378383ms step_avg:995.14ms perplexity:12.4287 param_count:85,137,462
step:2401/3000 train_time:2379332ms step_avg:995.12ms
step:2402/3000 train_time:2380325ms step_avg:995.12ms
step:2403/3000 train_time:2381316ms step_avg:995.12ms
step:2404/3000 train_time:2382350ms step_avg:995.13ms
step:2405/3000 train_time:2383344ms step_avg:995.13ms
step:2406/3000 train_time:2384333ms step_avg:995.13ms
step:2407/3000 train_time:2385345ms step_avg:995.14ms
step:2408/3000 train_time:2386340ms step_avg:995.14ms
step:2409/3000 train_time:2387330ms step_avg:995.14ms
step:2410/3000 train_time:2388328ms step_avg:995.14ms
step:2411/3000 train_time:2389323ms step_avg:995.14ms
step:2412/3000 train_time:2390319ms step_avg:995.14ms
step:2413/3000 train_time:2391306ms step_avg:995.13ms
step:2414/3000 train_time:2392308ms step_avg:995.14ms
step:2415/3000 train_time:2393303ms step_avg:995.14ms
step:2416/3000 train_time:2394287ms step_avg:995.13ms
step:2417/3000 train_time:2395309ms step_avg:995.14ms
step:2418/3000 train_time:2396330ms step_avg:995.15ms
step:2419/3000 train_time:2397323ms step_avg:995.15ms
step:2420/3000 train_time:2398314ms step_avg:995.15ms
step:2421/3000 train_time:2399324ms step_avg:995.16ms
step:2422/3000 train_time:2400316ms step_avg:995.16ms
step:2423/3000 train_time:2401311ms step_avg:995.16ms
step:2424/3000 train_time:2402302ms step_avg:995.15ms
step:2425/3000 train_time:2403317ms step_avg:995.16ms
step:2425/3000 val_loss:2.5146 train_time:2403355ms step_avg:995.18ms perplexity:12.3615 param_count:85,137,462
step:2426/3000 train_time:2404315ms step_avg:995.16ms
step:2427/3000 train_time:2405320ms step_avg:995.17ms
step:2428/3000 train_time:2406314ms step_avg:995.17ms
step:2429/3000 train_time:2407338ms step_avg:995.18ms
step:2430/3000 train_time:2408340ms step_avg:995.18ms
step:2431/3000 train_time:2409334ms step_avg:995.18ms
step:2432/3000 train_time:2410336ms step_avg:995.18ms
step:2433/3000 train_time:2411335ms step_avg:995.19ms
step:2434/3000 train_time:2412326ms step_avg:995.18ms
step:2435/3000 train_time:2413323ms step_avg:995.18ms
step:2436/3000 train_time:2414339ms step_avg:995.19ms
step:2437/3000 train_time:2415329ms step_avg:995.19ms
step:2438/3000 train_time:2416335ms step_avg:995.20ms
step:2439/3000 train_time:2417330ms step_avg:995.20ms
step:2440/3000 train_time:2418324ms step_avg:995.20ms
step:2441/3000 train_time:2419325ms step_avg:995.20ms
step:2442/3000 train_time:2420323ms step_avg:995.20ms
step:2443/3000 train_time:2421309ms step_avg:995.19ms
step:2444/3000 train_time:2422324ms step_avg:995.20ms
step:2445/3000 train_time:2423320ms step_avg:995.20ms
step:2446/3000 train_time:2424338ms step_avg:995.21ms
step:2447/3000 train_time:2425332ms step_avg:995.21ms
step:2448/3000 train_time:2426329ms step_avg:995.21ms
step:2449/3000 train_time:2427340ms step_avg:995.22ms
step:2450/3000 train_time:2428360ms step_avg:995.23ms
step:2450/3000 val_loss:2.5277 train_time:2428400ms step_avg:995.25ms perplexity:12.5252 param_count:85,137,462
step:2451/3000 train_time:2429349ms step_avg:995.23ms
step:2452/3000 train_time:2430350ms step_avg:995.23ms
step:2453/3000 train_time:2431338ms step_avg:995.23ms
step:2454/3000 train_time:2432332ms step_avg:995.23ms
step:2455/3000 train_time:2433345ms step_avg:995.23ms
step:2456/3000 train_time:2434332ms step_avg:995.23ms
step:2457/3000 train_time:2435325ms step_avg:995.23ms
step:2458/3000 train_time:2436321ms step_avg:995.23ms
step:2459/3000 train_time:2437327ms step_avg:995.23ms
step:2460/3000 train_time:2438333ms step_avg:995.24ms
step:2461/3000 train_time:2439329ms step_avg:995.24ms
step:2462/3000 train_time:2440321ms step_avg:995.24ms
step:2463/3000 train_time:2441311ms step_avg:995.23ms
step:2464/3000 train_time:2442312ms step_avg:995.24ms
step:2465/3000 train_time:2443312ms step_avg:995.24ms
step:2466/3000 train_time:2444306ms step_avg:995.24ms
step:2467/3000 train_time:2445311ms step_avg:995.24ms
step:2468/3000 train_time:2446307ms step_avg:995.24ms
step:2469/3000 train_time:2447319ms step_avg:995.25ms
step:2470/3000 train_time:2448324ms step_avg:995.25ms
step:2471/3000 train_time:2449332ms step_avg:995.26ms
step:2472/3000 train_time:2450332ms step_avg:995.26ms
step:2473/3000 train_time:2451330ms step_avg:995.26ms
step:2474/3000 train_time:2452357ms step_avg:995.27ms
step:2475/3000 train_time:2453355ms step_avg:995.28ms
step:2475/3000 val_loss:2.5319 train_time:2453394ms step_avg:995.29ms perplexity:12.5778 param_count:85,137,462
step:2476/3000 train_time:2454348ms step_avg:995.27ms
step:2477/3000 train_time:2455352ms step_avg:995.28ms
step:2478/3000 train_time:2456342ms step_avg:995.28ms
step:2479/3000 train_time:2457340ms step_avg:995.28ms
step:2480/3000 train_time:2458333ms step_avg:995.28ms
step:2481/3000 train_time:2459332ms step_avg:995.28ms
step:2482/3000 train_time:2460322ms step_avg:995.28ms
step:2483/3000 train_time:2461322ms step_avg:995.28ms
step:2484/3000 train_time:2462316ms step_avg:995.28ms
step:2485/3000 train_time:2463319ms step_avg:995.28ms
step:2486/3000 train_time:2464314ms step_avg:995.28ms
step:2487/3000 train_time:2465323ms step_avg:995.29ms
step:2488/3000 train_time:2466323ms step_avg:995.29ms
step:2489/3000 train_time:2467337ms step_avg:995.30ms
step:2490/3000 train_time:2468325ms step_avg:995.29ms
step:2491/3000 train_time:2469338ms step_avg:995.30ms
step:2492/3000 train_time:2470330ms step_avg:995.30ms
step:2493/3000 train_time:2471324ms step_avg:995.30ms
step:2494/3000 train_time:2472336ms step_avg:995.30ms
step:2495/3000 train_time:2473337ms step_avg:995.31ms
step:2496/3000 train_time:2474345ms step_avg:995.31ms
step:2497/3000 train_time:2475360ms step_avg:995.32ms
step:2498/3000 train_time:2476355ms step_avg:995.32ms
step:2499/3000 train_time:2477351ms step_avg:995.32ms
step:2500/3000 train_time:2478352ms step_avg:995.32ms
step:2500/3000 val_loss:2.5206 train_time:2478393ms step_avg:995.34ms perplexity:12.4361 param_count:85,137,462
step:2501/3000 train_time:2479338ms step_avg:995.32ms
step:2502/3000 train_time:2480346ms step_avg:995.32ms
step:2503/3000 train_time:2481343ms step_avg:995.32ms
step:2504/3000 train_time:2482344ms step_avg:995.33ms
step:2505/3000 train_time:2483338ms step_avg:995.33ms
step:2506/3000 train_time:2484331ms step_avg:995.32ms
step:2507/3000 train_time:2485318ms step_avg:995.32ms
step:2508/3000 train_time:2486330ms step_avg:995.33ms
step:2509/3000 train_time:2487335ms step_avg:995.33ms
step:2510/3000 train_time:2488339ms step_avg:995.34ms
step:2511/3000 train_time:2489336ms step_avg:995.34ms
step:2512/3000 train_time:2490340ms step_avg:995.34ms
step:2513/3000 train_time:2491349ms step_avg:995.35ms
step:2514/3000 train_time:2492338ms step_avg:995.34ms
step:2515/3000 train_time:2493334ms step_avg:995.34ms
step:2516/3000 train_time:2494328ms step_avg:995.34ms
step:2517/3000 train_time:2495327ms step_avg:995.34ms
step:2518/3000 train_time:2496332ms step_avg:995.35ms
step:2519/3000 train_time:2497329ms step_avg:995.35ms
step:2520/3000 train_time:2498342ms step_avg:995.36ms
step:2521/3000 train_time:2499328ms step_avg:995.35ms
step:2522/3000 train_time:2500322ms step_avg:995.35ms
step:2523/3000 train_time:2501310ms step_avg:995.35ms
step:2524/3000 train_time:2502314ms step_avg:995.35ms
step:2525/3000 train_time:2503310ms step_avg:995.35ms
step:2525/3000 val_loss:2.5350 train_time:2503351ms step_avg:995.37ms perplexity:12.6161 param_count:85,137,462
step:2526/3000 train_time:2504319ms step_avg:995.36ms
step:2527/3000 train_time:2505319ms step_avg:995.36ms
step:2528/3000 train_time:2506339ms step_avg:995.37ms
step:2529/3000 train_time:2507333ms step_avg:995.37ms
step:2530/3000 train_time:2508324ms step_avg:995.37ms
step:2531/3000 train_time:2509328ms step_avg:995.37ms
step:2532/3000 train_time:2510315ms step_avg:995.37ms
step:2533/3000 train_time:2511315ms step_avg:995.37ms
step:2534/3000 train_time:2512319ms step_avg:995.37ms
step:2535/3000 train_time:2513306ms step_avg:995.37ms
step:2536/3000 train_time:2514329ms step_avg:995.38ms
step:2537/3000 train_time:2515322ms step_avg:995.38ms
step:2538/3000 train_time:2516314ms step_avg:995.38ms
step:2539/3000 train_time:2517307ms step_avg:995.38ms
step:2540/3000 train_time:2518314ms step_avg:995.38ms
step:2541/3000 train_time:2519322ms step_avg:995.39ms
step:2542/3000 train_time:2520349ms step_avg:995.40ms
step:2543/3000 train_time:2521391ms step_avg:995.42ms
step:2544/3000 train_time:2522398ms step_avg:995.42ms
step:2545/3000 train_time:2523386ms step_avg:995.42ms
step:2546/3000 train_time:2524378ms step_avg:995.42ms
step:2547/3000 train_time:2525373ms step_avg:995.42ms
step:2548/3000 train_time:2526374ms step_avg:995.42ms
step:2549/3000 train_time:2527372ms step_avg:995.42ms
step:2550/3000 train_time:2528372ms step_avg:995.42ms
step:2550/3000 val_loss:2.5119 train_time:2528411ms step_avg:995.44ms perplexity:12.3284 param_count:85,137,462
step:2551/3000 train_time:2529351ms step_avg:995.42ms
step:2552/3000 train_time:2530370ms step_avg:995.42ms
step:2553/3000 train_time:2531365ms step_avg:995.42ms
step:2554/3000 train_time:2532354ms step_avg:995.42ms
step:2555/3000 train_time:2533358ms step_avg:995.43ms
step:2556/3000 train_time:2534351ms step_avg:995.42ms
step:2557/3000 train_time:2535348ms step_avg:995.43ms
step:2558/3000 train_time:2536356ms step_avg:995.43ms
step:2559/3000 train_time:2537351ms step_avg:995.43ms
step:2560/3000 train_time:2538339ms step_avg:995.43ms
step:2561/3000 train_time:2539328ms step_avg:995.42ms
step:2562/3000 train_time:2540328ms step_avg:995.43ms
step:2563/3000 train_time:2541319ms step_avg:995.42ms
step:2564/3000 train_time:2542323ms step_avg:995.43ms
step:2565/3000 train_time:2543328ms step_avg:995.43ms
step:2566/3000 train_time:2544350ms step_avg:995.44ms
step:2567/3000 train_time:2545355ms step_avg:995.45ms
step:2568/3000 train_time:2546359ms step_avg:995.45ms
step:2569/3000 train_time:2547362ms step_avg:995.45ms
step:2570/3000 train_time:2548345ms step_avg:995.45ms
step:2571/3000 train_time:2549336ms step_avg:995.45ms
step:2572/3000 train_time:2550336ms step_avg:995.45ms
step:2573/3000 train_time:2551333ms step_avg:995.45ms
step:2574/3000 train_time:2552333ms step_avg:995.45ms
step:2575/3000 train_time:2553320ms step_avg:995.45ms
step:2575/3000 val_loss:2.5182 train_time:2553360ms step_avg:995.46ms perplexity:12.4062 param_count:85,137,462
step:2576/3000 train_time:2554311ms step_avg:995.44ms
step:2577/3000 train_time:2555325ms step_avg:995.45ms
step:2578/3000 train_time:2556321ms step_avg:995.45ms
step:2579/3000 train_time:2557323ms step_avg:995.45ms
step:2580/3000 train_time:2558326ms step_avg:995.46ms
step:2581/3000 train_time:2559326ms step_avg:995.46ms
step:2582/3000 train_time:2560320ms step_avg:995.46ms
step:2583/3000 train_time:2561307ms step_avg:995.46ms
step:2584/3000 train_time:2562293ms step_avg:995.45ms
step:2585/3000 train_time:2563280ms step_avg:995.45ms
step:2586/3000 train_time:2564279ms step_avg:995.45ms
step:2587/3000 train_time:2565280ms step_avg:995.45ms
step:2588/3000 train_time:2566275ms step_avg:995.45ms
step:2589/3000 train_time:2567262ms step_avg:995.45ms
step:2590/3000 train_time:2568268ms step_avg:995.45ms
step:2591/3000 train_time:2569255ms step_avg:995.45ms
step:2592/3000 train_time:2570291ms step_avg:995.47ms
step:2593/3000 train_time:2571282ms step_avg:995.46ms
step:2594/3000 train_time:2572280ms step_avg:995.46ms
step:2595/3000 train_time:2573268ms step_avg:995.46ms
step:2596/3000 train_time:2574255ms step_avg:995.46ms
step:2597/3000 train_time:2575264ms step_avg:995.46ms
step:2598/3000 train_time:2576261ms step_avg:995.46ms
step:2599/3000 train_time:2577267ms step_avg:995.47ms
step:2600/3000 train_time:2578266ms step_avg:995.47ms
step:2600/3000 val_loss:2.5178 train_time:2578307ms step_avg:995.49ms perplexity:12.4014 param_count:85,137,462
step:2601/3000 train_time:2579251ms step_avg:995.47ms
step:2602/3000 train_time:2580252ms step_avg:995.47ms
step:2603/3000 train_time:2581248ms step_avg:995.47ms
step:2604/3000 train_time:2582253ms step_avg:995.47ms
step:2605/3000 train_time:2583249ms step_avg:995.47ms
step:2606/3000 train_time:2584248ms step_avg:995.47ms
step:2607/3000 train_time:2585247ms step_avg:995.47ms
step:2608/3000 train_time:2586252ms step_avg:995.48ms
step:2609/3000 train_time:2587281ms step_avg:995.49ms
step:2610/3000 train_time:2588278ms step_avg:995.49ms
step:2611/3000 train_time:2589271ms step_avg:995.49ms
step:2612/3000 train_time:2590268ms step_avg:995.49ms
step:2613/3000 train_time:2591265ms step_avg:995.49ms
step:2614/3000 train_time:2592258ms step_avg:995.49ms
step:2615/3000 train_time:2593255ms step_avg:995.49ms
step:2616/3000 train_time:2594252ms step_avg:995.49ms
step:2617/3000 train_time:2595238ms step_avg:995.49ms
step:2618/3000 train_time:2596231ms step_avg:995.49ms
step:2619/3000 train_time:2597272ms step_avg:995.50ms
step:2620/3000 train_time:2598276ms step_avg:995.51ms
step:2621/3000 train_time:2599287ms step_avg:995.51ms
step:2622/3000 train_time:2600275ms step_avg:995.51ms
step:2623/3000 train_time:2601277ms step_avg:995.51ms
step:2624/3000 train_time:2602273ms step_avg:995.51ms
step:2625/3000 train_time:2603269ms step_avg:995.51ms
step:2625/3000 val_loss:2.5212 train_time:2603311ms step_avg:995.53ms perplexity:12.4436 param_count:85,137,462
step:2626/3000 train_time:2604252ms step_avg:995.51ms
step:2627/3000 train_time:2605260ms step_avg:995.51ms
step:2628/3000 train_time:2606261ms step_avg:995.52ms
step:2629/3000 train_time:2607260ms step_avg:995.52ms
step:2630/3000 train_time:2608278ms step_avg:995.53ms
step:2631/3000 train_time:2609275ms step_avg:995.53ms
step:2632/3000 train_time:2610308ms step_avg:995.54ms
step:2633/3000 train_time:2611327ms step_avg:995.55ms
step:2634/3000 train_time:2612321ms step_avg:995.55ms
step:2635/3000 train_time:2613324ms step_avg:995.55ms
step:2636/3000 train_time:2614319ms step_avg:995.55ms
step:2637/3000 train_time:2615316ms step_avg:995.55ms
step:2638/3000 train_time:2616310ms step_avg:995.55ms
step:2639/3000 train_time:2617309ms step_avg:995.55ms
step:2640/3000 train_time:2618312ms step_avg:995.56ms
step:2641/3000 train_time:2619316ms step_avg:995.56ms
step:2642/3000 train_time:2620319ms step_avg:995.56ms
step:2643/3000 train_time:2621314ms step_avg:995.56ms
step:2644/3000 train_time:2622327ms step_avg:995.57ms
step:2645/3000 train_time:2623332ms step_avg:995.57ms
step:2646/3000 train_time:2624328ms step_avg:995.57ms
step:2647/3000 train_time:2625343ms step_avg:995.58ms
step:2648/3000 train_time:2626331ms step_avg:995.58ms
step:2649/3000 train_time:2627322ms step_avg:995.57ms
step:2650/3000 train_time:2628333ms step_avg:995.58ms
step:2650/3000 val_loss:2.5217 train_time:2628371ms step_avg:995.60ms perplexity:12.4496 param_count:85,137,462
step:2651/3000 train_time:2629305ms step_avg:995.57ms
step:2652/3000 train_time:2630326ms step_avg:995.58ms
step:2653/3000 train_time:2631331ms step_avg:995.58ms
step:2654/3000 train_time:2632349ms step_avg:995.59ms
step:2655/3000 train_time:2633340ms step_avg:995.59ms
step:2656/3000 train_time:2634331ms step_avg:995.59ms
step:2657/3000 train_time:2635324ms step_avg:995.59ms
step:2658/3000 train_time:2636304ms step_avg:995.58ms
step:2659/3000 train_time:2637320ms step_avg:995.59ms
step:2660/3000 train_time:2638312ms step_avg:995.59ms
step:2661/3000 train_time:2639324ms step_avg:995.60ms
step:2662/3000 train_time:2640324ms step_avg:995.60ms
step:2663/3000 train_time:2641321ms step_avg:995.60ms
step:2664/3000 train_time:2642313ms step_avg:995.60ms
step:2665/3000 train_time:2643313ms step_avg:995.60ms
step:2666/3000 train_time:2644317ms step_avg:995.60ms
step:2667/3000 train_time:2645307ms step_avg:995.60ms
step:2668/3000 train_time:2646299ms step_avg:995.60ms
step:2669/3000 train_time:2647290ms step_avg:995.60ms
step:2670/3000 train_time:2648293ms step_avg:995.60ms
step:2671/3000 train_time:2649288ms step_avg:995.60ms
step:2672/3000 train_time:2650279ms step_avg:995.60ms
step:2673/3000 train_time:2651282ms step_avg:995.60ms
step:2674/3000 train_time:2652288ms step_avg:995.60ms
step:2675/3000 train_time:2653279ms step_avg:995.60ms
step:2675/3000 val_loss:2.5175 train_time:2653321ms step_avg:995.62ms perplexity:12.3979 param_count:85,137,462
step:2676/3000 train_time:2654271ms step_avg:995.60ms
step:2677/3000 train_time:2655272ms step_avg:995.60ms
step:2678/3000 train_time:2656287ms step_avg:995.61ms
step:2679/3000 train_time:2657300ms step_avg:995.62ms
step:2680/3000 train_time:2658286ms step_avg:995.61ms
step:2681/3000 train_time:2659270ms step_avg:995.61ms
step:2682/3000 train_time:2660275ms step_avg:995.61ms
step:2683/3000 train_time:2661268ms step_avg:995.61ms
step:2684/3000 train_time:2662254ms step_avg:995.61ms
step:2685/3000 train_time:2663238ms step_avg:995.60ms
step:2686/3000 train_time:2664255ms step_avg:995.61ms
step:2687/3000 train_time:2665248ms step_avg:995.61ms
step:2688/3000 train_time:2666246ms step_avg:995.61ms
step:2689/3000 train_time:2667252ms step_avg:995.61ms
step:2690/3000 train_time:2668245ms step_avg:995.61ms
step:2691/3000 train_time:2669254ms step_avg:995.62ms
step:2692/3000 train_time:2670245ms step_avg:995.62ms
step:2693/3000 train_time:2671234ms step_avg:995.61ms
step:2694/3000 train_time:2672230ms step_avg:995.61ms
step:2695/3000 train_time:2673255ms step_avg:995.63ms
step:2696/3000 train_time:2674243ms step_avg:995.62ms
step:2697/3000 train_time:2675232ms step_avg:995.62ms
step:2698/3000 train_time:2676238ms step_avg:995.62ms
step:2699/3000 train_time:2677238ms step_avg:995.63ms
step:2700/3000 train_time:2678224ms step_avg:995.62ms
step:2700/3000 val_loss:2.5209 train_time:2678264ms step_avg:995.64ms perplexity:12.4403 param_count:85,137,462
step:2701/3000 train_time:2679207ms step_avg:995.62ms
step:2702/3000 train_time:2680207ms step_avg:995.62ms
step:2703/3000 train_time:2681195ms step_avg:995.62ms
step:2704/3000 train_time:2682191ms step_avg:995.62ms
step:2705/3000 train_time:2683188ms step_avg:995.62ms
step:2706/3000 train_time:2684185ms step_avg:995.62ms
step:2707/3000 train_time:2685192ms step_avg:995.62ms
step:2708/3000 train_time:2686198ms step_avg:995.63ms
step:2709/3000 train_time:2687189ms step_avg:995.62ms
step:2710/3000 train_time:2688188ms step_avg:995.63ms
step:2711/3000 train_time:2689182ms step_avg:995.62ms
step:2712/3000 train_time:2690186ms step_avg:995.63ms
step:2713/3000 train_time:2691202ms step_avg:995.64ms
step:2714/3000 train_time:2692199ms step_avg:995.64ms
step:2715/3000 train_time:2693204ms step_avg:995.64ms
step:2716/3000 train_time:2694197ms step_avg:995.64ms
step:2717/3000 train_time:2695200ms step_avg:995.64ms
step:2718/3000 train_time:2696193ms step_avg:995.64ms
step:2719/3000 train_time:2697190ms step_avg:995.64ms
step:2720/3000 train_time:2698265ms step_avg:995.67ms
step:2721/3000 train_time:2699275ms step_avg:995.68ms
step:2722/3000 train_time:2700272ms step_avg:995.68ms
step:2723/3000 train_time:2701269ms step_avg:995.68ms
step:2724/3000 train_time:2702281ms step_avg:995.68ms
step:2725/3000 train_time:2703281ms step_avg:995.68ms
step:2725/3000 val_loss:2.5099 train_time:2703322ms step_avg:995.70ms perplexity:12.3035 param_count:85,137,462
step:2726/3000 train_time:2704281ms step_avg:995.69ms
step:2727/3000 train_time:2705288ms step_avg:995.69ms
step:2728/3000 train_time:2706299ms step_avg:995.69ms
step:2729/3000 train_time:2707288ms step_avg:995.69ms
step:2730/3000 train_time:2708286ms step_avg:995.69ms
step:2731/3000 train_time:2709279ms step_avg:995.69ms
step:2732/3000 train_time:2710286ms step_avg:995.70ms
step:2733/3000 train_time:2711280ms step_avg:995.70ms
step:2734/3000 train_time:2712273ms step_avg:995.69ms
step:2735/3000 train_time:2713268ms step_avg:995.69ms
step:2736/3000 train_time:2714262ms step_avg:995.69ms
step:2737/3000 train_time:2715261ms step_avg:995.70ms
step:2738/3000 train_time:2716260ms step_avg:995.70ms
step:2739/3000 train_time:2717264ms step_avg:995.70ms
step:2740/3000 train_time:2718259ms step_avg:995.70ms
step:2741/3000 train_time:2719266ms step_avg:995.70ms
step:2742/3000 train_time:2720255ms step_avg:995.70ms
step:2743/3000 train_time:2721246ms step_avg:995.70ms
step:2744/3000 train_time:2722234ms step_avg:995.70ms
step:2745/3000 train_time:2723237ms step_avg:995.70ms
step:2746/3000 train_time:2724293ms step_avg:995.72ms
step:2747/3000 train_time:2725310ms step_avg:995.73ms
step:2748/3000 train_time:2726353ms step_avg:995.75ms
step:2749/3000 train_time:2727349ms step_avg:995.75ms
step:2750/3000 train_time:2728358ms step_avg:995.75ms
step:2750/3000 val_loss:2.5175 train_time:2728399ms step_avg:995.77ms perplexity:12.3977 param_count:85,137,462
step:2751/3000 train_time:2729361ms step_avg:995.75ms
step:2752/3000 train_time:2730361ms step_avg:995.76ms
step:2753/3000 train_time:2731352ms step_avg:995.75ms
step:2754/3000 train_time:2732356ms step_avg:995.76ms
step:2755/3000 train_time:2733341ms step_avg:995.75ms
step:2756/3000 train_time:2734341ms step_avg:995.75ms
step:2757/3000 train_time:2735357ms step_avg:995.76ms
step:2758/3000 train_time:2736350ms step_avg:995.76ms
step:2759/3000 train_time:2737354ms step_avg:995.76ms
step:2760/3000 train_time:2738344ms step_avg:995.76ms
step:2761/3000 train_time:2739333ms step_avg:995.76ms
step:2762/3000 train_time:2740330ms step_avg:995.76ms
step:2763/3000 train_time:2741321ms step_avg:995.76ms
step:2764/3000 train_time:2742317ms step_avg:995.76ms
step:2765/3000 train_time:2743316ms step_avg:995.76ms
step:2766/3000 train_time:2744316ms step_avg:995.76ms
step:2767/3000 train_time:2745305ms step_avg:995.76ms
step:2768/3000 train_time:2746307ms step_avg:995.76ms
step:2769/3000 train_time:2747321ms step_avg:995.77ms
step:2770/3000 train_time:2748305ms step_avg:995.76ms
step:2771/3000 train_time:2749300ms step_avg:995.76ms
step:2772/3000 train_time:2750299ms step_avg:995.76ms
step:2773/3000 train_time:2751294ms step_avg:995.76ms
step:2774/3000 train_time:2752290ms step_avg:995.76ms
step:2775/3000 train_time:2753289ms step_avg:995.76ms
step:2775/3000 val_loss:2.5157 train_time:2753328ms step_avg:995.78ms perplexity:12.3756 param_count:85,137,462
step:2776/3000 train_time:2754282ms step_avg:995.76ms
step:2777/3000 train_time:2755267ms step_avg:995.76ms
step:2778/3000 train_time:2756263ms step_avg:995.76ms
step:2779/3000 train_time:2757253ms step_avg:995.76ms
step:2780/3000 train_time:2758247ms step_avg:995.76ms
step:2781/3000 train_time:2759243ms step_avg:995.76ms
step:2782/3000 train_time:2760227ms step_avg:995.75ms
step:2783/3000 train_time:2761242ms step_avg:995.76ms
step:2784/3000 train_time:2762234ms step_avg:995.76ms
step:2785/3000 train_time:2763222ms step_avg:995.76ms
step:2786/3000 train_time:2764218ms step_avg:995.76ms
step:2787/3000 train_time:2765216ms step_avg:995.76ms
step:2788/3000 train_time:2766224ms step_avg:995.76ms
step:2789/3000 train_time:2767231ms step_avg:995.77ms
step:2790/3000 train_time:2768231ms step_avg:995.77ms
step:2791/3000 train_time:2769225ms step_avg:995.77ms
step:2792/3000 train_time:2770211ms step_avg:995.76ms
step:2793/3000 train_time:2771202ms step_avg:995.76ms
step:2794/3000 train_time:2772223ms step_avg:995.77ms
step:2795/3000 train_time:2773229ms step_avg:995.77ms
step:2796/3000 train_time:2774232ms step_avg:995.78ms
step:2797/3000 train_time:2775228ms step_avg:995.78ms
step:2798/3000 train_time:2776215ms step_avg:995.77ms
step:2799/3000 train_time:2777237ms step_avg:995.78ms
step:2800/3000 train_time:2778240ms step_avg:995.79ms
step:2800/3000 val_loss:2.5176 train_time:2778280ms step_avg:995.80ms perplexity:12.3992 param_count:85,137,462
step:2801/3000 train_time:2779232ms step_avg:995.78ms
step:2802/3000 train_time:2780221ms step_avg:995.78ms
step:2803/3000 train_time:2781219ms step_avg:995.78ms
step:2804/3000 train_time:2782205ms step_avg:995.78ms
step:2805/3000 train_time:2783210ms step_avg:995.78ms
step:2806/3000 train_time:2784216ms step_avg:995.79ms
step:2807/3000 train_time:2785288ms step_avg:995.81ms
step:2808/3000 train_time:2786288ms step_avg:995.81ms
step:2809/3000 train_time:2787276ms step_avg:995.81ms
step:2810/3000 train_time:2788298ms step_avg:995.82ms
step:2811/3000 train_time:2789302ms step_avg:995.82ms
step:2812/3000 train_time:2790300ms step_avg:995.82ms
step:2813/3000 train_time:2791307ms step_avg:995.83ms
step:2814/3000 train_time:2792309ms step_avg:995.83ms
step:2815/3000 train_time:2793307ms step_avg:995.83ms
step:2816/3000 train_time:2794300ms step_avg:995.83ms
step:2817/3000 train_time:2795292ms step_avg:995.83ms
step:2818/3000 train_time:2796291ms step_avg:995.83ms
step:2819/3000 train_time:2797304ms step_avg:995.84ms
step:2820/3000 train_time:2798316ms step_avg:995.84ms
step:2821/3000 train_time:2799313ms step_avg:995.84ms
step:2822/3000 train_time:2800317ms step_avg:995.85ms
step:2823/3000 train_time:2801320ms step_avg:995.85ms
step:2824/3000 train_time:2802314ms step_avg:995.85ms
step:2825/3000 train_time:2803321ms step_avg:995.85ms
step:2825/3000 val_loss:2.5109 train_time:2803362ms step_avg:995.87ms perplexity:12.3159 param_count:85,137,462
step:2826/3000 train_time:2804310ms step_avg:995.85ms
step:2827/3000 train_time:2805339ms step_avg:995.86ms
step:2828/3000 train_time:2806336ms step_avg:995.86ms
step:2829/3000 train_time:2807340ms step_avg:995.86ms
step:2830/3000 train_time:2808343ms step_avg:995.87ms
step:2831/3000 train_time:2809343ms step_avg:995.87ms
step:2832/3000 train_time:2810335ms step_avg:995.87ms
step:2833/3000 train_time:2811333ms step_avg:995.87ms
step:2834/3000 train_time:2812329ms step_avg:995.87ms
step:2835/3000 train_time:2813328ms step_avg:995.87ms
step:2836/3000 train_time:2814312ms step_avg:995.86ms
step:2837/3000 train_time:2815314ms step_avg:995.87ms
step:2838/3000 train_time:2816313ms step_avg:995.87ms
step:2839/3000 train_time:2817335ms step_avg:995.88ms
step:2840/3000 train_time:2818323ms step_avg:995.87ms
step:2841/3000 train_time:2819320ms step_avg:995.87ms
step:2842/3000 train_time:2820321ms step_avg:995.88ms
step:2843/3000 train_time:2821316ms step_avg:995.88ms
step:2844/3000 train_time:2822311ms step_avg:995.88ms
step:2845/3000 train_time:2823295ms step_avg:995.87ms
step:2846/3000 train_time:2824285ms step_avg:995.87ms
step:2847/3000 train_time:2825278ms step_avg:995.87ms
step:2848/3000 train_time:2826275ms step_avg:995.87ms
step:2849/3000 train_time:2827274ms step_avg:995.87ms
step:2850/3000 train_time:2828278ms step_avg:995.87ms
step:2850/3000 val_loss:2.5091 train_time:2828319ms step_avg:995.89ms perplexity:12.2933 param_count:85,137,462
step:2851/3000 train_time:2829279ms step_avg:995.87ms
step:2852/3000 train_time:2830273ms step_avg:995.87ms
step:2853/3000 train_time:2831278ms step_avg:995.88ms
step:2854/3000 train_time:2832273ms step_avg:995.88ms
step:2855/3000 train_time:2833273ms step_avg:995.88ms
step:2856/3000 train_time:2834267ms step_avg:995.88ms
step:2857/3000 train_time:2835256ms step_avg:995.87ms
step:2858/3000 train_time:2836249ms step_avg:995.87ms
step:2859/3000 train_time:2837234ms step_avg:995.87ms
step:2860/3000 train_time:2838225ms step_avg:995.87ms
step:2861/3000 train_time:2839215ms step_avg:995.87ms
step:2862/3000 train_time:2840216ms step_avg:995.87ms
step:2863/3000 train_time:2841220ms step_avg:995.87ms
step:2864/3000 train_time:2842216ms step_avg:995.87ms
step:2865/3000 train_time:2843211ms step_avg:995.87ms
step:2866/3000 train_time:2844212ms step_avg:995.87ms
step:2867/3000 train_time:2845202ms step_avg:995.87ms
step:2868/3000 train_time:2846193ms step_avg:995.87ms
step:2869/3000 train_time:2847182ms step_avg:995.87ms
step:2870/3000 train_time:2848178ms step_avg:995.87ms
step:2871/3000 train_time:2849177ms step_avg:995.87ms
step:2872/3000 train_time:2850181ms step_avg:995.87ms
step:2873/3000 train_time:2851174ms step_avg:995.87ms
step:2874/3000 train_time:2852193ms step_avg:995.88ms
step:2875/3000 train_time:2853183ms step_avg:995.88ms
step:2875/3000 val_loss:2.5137 train_time:2853223ms step_avg:995.89ms perplexity:12.3503 param_count:85,137,462
step:2876/3000 train_time:2854185ms step_avg:995.88ms
step:2877/3000 train_time:2855184ms step_avg:995.88ms
step:2878/3000 train_time:2856193ms step_avg:995.88ms
step:2879/3000 train_time:2857201ms step_avg:995.89ms
step:2880/3000 train_time:2858220ms step_avg:995.90ms
step:2881/3000 train_time:2859210ms step_avg:995.89ms
step:2882/3000 train_time:2860204ms step_avg:995.89ms
step:2883/3000 train_time:2861224ms step_avg:995.90ms
step:2884/3000 train_time:2862234ms step_avg:995.91ms
step:2885/3000 train_time:2863227ms step_avg:995.90ms
step:2886/3000 train_time:2864217ms step_avg:995.90ms
step:2887/3000 train_time:2865235ms step_avg:995.91ms
step:2888/3000 train_time:2866228ms step_avg:995.91ms
step:2889/3000 train_time:2867241ms step_avg:995.92ms
step:2890/3000 train_time:2868242ms step_avg:995.92ms
step:2891/3000 train_time:2869249ms step_avg:995.92ms
step:2892/3000 train_time:2870242ms step_avg:995.92ms
step:2893/3000 train_time:2871241ms step_avg:995.92ms
step:2894/3000 train_time:2872240ms step_avg:995.92ms
step:2895/3000 train_time:2873228ms step_avg:995.92ms
step:2896/3000 train_time:2874223ms step_avg:995.92ms
step:2897/3000 train_time:2875254ms step_avg:995.93ms
step:2898/3000 train_time:2876245ms step_avg:995.93ms
step:2899/3000 train_time:2877240ms step_avg:995.93ms
step:2900/3000 train_time:2878226ms step_avg:995.93ms
step:2900/3000 val_loss:2.5093 train_time:2878265ms step_avg:995.94ms perplexity:12.2958 param_count:85,137,462
step:2901/3000 train_time:2879227ms step_avg:995.93ms
step:2902/3000 train_time:2880234ms step_avg:995.93ms
step:2903/3000 train_time:2881238ms step_avg:995.93ms
step:2904/3000 train_time:2882236ms step_avg:995.93ms
step:2905/3000 train_time:2883236ms step_avg:995.94ms
step:2906/3000 train_time:2884252ms step_avg:995.94ms
step:2907/3000 train_time:2885253ms step_avg:995.95ms
step:2908/3000 train_time:2886254ms step_avg:995.95ms
step:2909/3000 train_time:2887252ms step_avg:995.95ms
step:2910/3000 train_time:2888253ms step_avg:995.95ms
step:2911/3000 train_time:2889253ms step_avg:995.95ms
step:2912/3000 train_time:2890244ms step_avg:995.95ms
step:2913/3000 train_time:2891252ms step_avg:995.95ms
step:2914/3000 train_time:2892241ms step_avg:995.95ms
step:2915/3000 train_time:2893238ms step_avg:995.95ms
step:2916/3000 train_time:2894238ms step_avg:995.95ms
step:2917/3000 train_time:2895236ms step_avg:995.95ms
step:2918/3000 train_time:2896230ms step_avg:995.95ms
step:2919/3000 train_time:2897240ms step_avg:995.96ms
step:2920/3000 train_time:2898233ms step_avg:995.96ms
step:2921/3000 train_time:2899237ms step_avg:995.96ms
step:2922/3000 train_time:2900242ms step_avg:995.96ms
step:2923/3000 train_time:2901244ms step_avg:995.96ms
step:2924/3000 train_time:2902273ms step_avg:995.98ms
step:2925/3000 train_time:2903281ms step_avg:995.98ms
step:2925/3000 val_loss:2.5176 train_time:2903319ms step_avg:995.99ms perplexity:12.3991 param_count:85,137,462
step:2926/3000 train_time:2904261ms step_avg:995.97ms
step:2927/3000 train_time:2905247ms step_avg:995.97ms
step:2928/3000 train_time:2906236ms step_avg:995.97ms
step:2929/3000 train_time:2907232ms step_avg:995.97ms
step:2930/3000 train_time:2908216ms step_avg:995.96ms
step:2931/3000 train_time:2909226ms step_avg:995.97ms
step:2932/3000 train_time:2910220ms step_avg:995.97ms
step:2933/3000 train_time:2911214ms step_avg:995.97ms
step:2934/3000 train_time:2912216ms step_avg:995.97ms
step:2935/3000 train_time:2913208ms step_avg:995.97ms
step:2936/3000 train_time:2914193ms step_avg:995.96ms
step:2937/3000 train_time:2915192ms step_avg:995.97ms
step:2938/3000 train_time:2916184ms step_avg:995.96ms
step:2939/3000 train_time:2917180ms step_avg:995.96ms
step:2940/3000 train_time:2918177ms step_avg:995.96ms
step:2941/3000 train_time:2919193ms step_avg:995.97ms
step:2942/3000 train_time:2920202ms step_avg:995.98ms
step:2943/3000 train_time:2921211ms step_avg:995.98ms
step:2944/3000 train_time:2922200ms step_avg:995.98ms
step:2945/3000 train_time:2923196ms step_avg:995.98ms
step:2946/3000 train_time:2924187ms step_avg:995.98ms
step:2947/3000 train_time:2925180ms step_avg:995.98ms
step:2948/3000 train_time:2926180ms step_avg:995.98ms
step:2949/3000 train_time:2927190ms step_avg:995.98ms
step:2950/3000 train_time:2928205ms step_avg:995.99ms
step:2950/3000 val_loss:2.5181 train_time:2928246ms step_avg:996.00ms perplexity:12.4054 param_count:85,137,462
step:2951/3000 train_time:2929192ms step_avg:995.99ms
step:2952/3000 train_time:2930185ms step_avg:995.98ms
step:2953/3000 train_time:2931178ms step_avg:995.98ms
step:2954/3000 train_time:2932174ms step_avg:995.98ms
step:2955/3000 train_time:2933173ms step_avg:995.98ms
step:2956/3000 train_time:2934165ms step_avg:995.98ms
step:2957/3000 train_time:2935158ms step_avg:995.98ms
step:2958/3000 train_time:2936145ms step_avg:995.98ms
step:2959/3000 train_time:2937145ms step_avg:995.98ms
step:2960/3000 train_time:2938157ms step_avg:995.99ms
step:2961/3000 train_time:2939151ms step_avg:995.98ms
step:2962/3000 train_time:2940146ms step_avg:995.98ms
step:2963/3000 train_time:2941138ms step_avg:995.98ms
step:2964/3000 train_time:2942136ms step_avg:995.98ms
step:2965/3000 train_time:2943130ms step_avg:995.98ms
step:2966/3000 train_time:2944128ms step_avg:995.98ms
step:2967/3000 train_time:2945139ms step_avg:995.99ms
step:2968/3000 train_time:2946154ms step_avg:996.00ms
step:2969/3000 train_time:2947146ms step_avg:995.99ms
step:2970/3000 train_time:2948148ms step_avg:996.00ms
step:2971/3000 train_time:2949178ms step_avg:996.01ms
step:2972/3000 train_time:2950180ms step_avg:996.01ms
step:2973/3000 train_time:2951171ms step_avg:996.01ms
step:2974/3000 train_time:2952163ms step_avg:996.01ms
step:2975/3000 train_time:2953159ms step_avg:996.01ms
step:2975/3000 val_loss:2.5249 train_time:2953198ms step_avg:996.02ms perplexity:12.4901 param_count:85,137,462
step:2976/3000 train_time:2954145ms step_avg:996.00ms
step:2977/3000 train_time:2955247ms step_avg:996.04ms
step:2978/3000 train_time:2956267ms step_avg:996.05ms
step:2979/3000 train_time:2957265ms step_avg:996.05ms
step:2980/3000 train_time:2958250ms step_avg:996.04ms
step:2981/3000 train_time:2959230ms step_avg:996.04ms
step:2982/3000 train_time:2960238ms step_avg:996.04ms
step:2983/3000 train_time:2961236ms step_avg:996.04ms
step:2984/3000 train_time:2962230ms step_avg:996.04ms
step:2985/3000 train_time:2963225ms step_avg:996.04ms
step:2986/3000 train_time:2964231ms step_avg:996.05ms
step:2987/3000 train_time:2965245ms step_avg:996.05ms
step:2988/3000 train_time:2966241ms step_avg:996.05ms
step:2989/3000 train_time:2967246ms step_avg:996.05ms
step:2990/3000 train_time:2968245ms step_avg:996.06ms
step:2991/3000 train_time:2969235ms step_avg:996.05ms
step:2992/3000 train_time:2970243ms step_avg:996.06ms
step:2993/3000 train_time:2971233ms step_avg:996.06ms
step:2994/3000 train_time:2972247ms step_avg:996.06ms
step:2995/3000 train_time:2973262ms step_avg:996.07ms
step:2996/3000 train_time:2974257ms step_avg:996.07ms
step:2997/3000 train_time:2975255ms step_avg:996.07ms
step:2998/3000 train_time:2976250ms step_avg:996.07ms
step:2999/3000 train_time:2977235ms step_avg:996.06ms
step:3000/3000 train_time:2978236ms step_avg:996.07ms
step:3000/3000 val_loss:2.5152 train_time:2978275ms step_avg:996.08ms perplexity:12.3695 param_count:85,137,462
peak memory consumption: 12957 MiB
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment