Created
December 24, 2024 04:35
-
-
Save lapp0/e076d696df147c7df8028cb2069300d4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
# Snapshot this script's own source immediately so the exact code can be written to the run log.
with open(sys.argv[0]) as source_file:
    code = source_file.read()
import uuid | |
import time | |
import contextlib | |
from dataclasses import dataclass | |
import math | |
from pathlib import Path | |
import torch | |
from torch import nn | |
import torch.nn.functional as F | |
import torch.distributed as dist | |
import torch._inductor.config as config | |
from torch.nn.parallel import DistributedDataParallel as DDP | |
from torch.nn.attention.flex_attention import flex_attention, create_block_mask | |
# ----------------------------------------------------------------------------- | |
# Muon optimizer | |
@torch.compile
def zeropower_via_newtonschulz5(G, steps):
    """
    Approximately orthogonalize the 2D matrix G ("zeroth power") via Newton-Schulz iteration.

    A quintic iteration is used whose coefficients were picked to maximize the slope at zero.
    Empirically it pays to keep raising that slope even past the point where the iteration
    still converges to exactly one on the whole interval, because fewer steps are needed.
    As a consequence the result is not UV^T (with USV^T = G the SVD) but roughly US'V^T with
    S' diagonal and S_{ii}' ~ Uniform(0.5, 1.5); this does not hurt model performance
    relative to the exact UV^T.

    Arguments:
        G: 2D tensor to orthogonalize.
        steps: number of Newton-Schulz iterations to run.
    Returns:
        A bfloat16 tensor with the same shape as G.
    """
    assert G.ndim == 2
    coeff_a, coeff_b, coeff_c = 3.4445, -4.7750, 2.0315
    # Iterate on the wide orientation so the Gram matrix X @ X.T is the smaller one.
    is_tall = G.size(0) > G.size(1)
    X = G.bfloat16()
    if is_tall:
        X = X.T
    # Dividing by the Frobenius norm guarantees the spectral norm is at most 1
    X = X / (X.norm() + 1e-7)
    # Newton-Schulz iterations
    for _ in range(steps):
        gram = X @ X.T
        poly = coeff_b * gram + coeff_c * gram @ gram  # adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
        X = coeff_a * X + poly @ X
    # Undo the transposition so the output matches G's shape
    return X.T if is_tall else X
class Muon(torch.optim.Optimizer):
    """
    Muon - MomentUm Orthogonalized by Newton-schulz

    Runs ordinary SGD with momentum and then post-processes every 2D parameter's update by
    (approximately) orthogonalizing it with a Newton-Schulz iteration, which can be run
    stably in bfloat16 on the GPU.

    The work is sharded across ranks: within each group of `world_size` consecutive
    parameters, every rank orthogonalizes the update of one parameter and the results are
    exchanged with an asynchronous all_gather.

    Some warnings:
    - All parameters handed to this optimizer are assumed to be 2D.
    - Do not use it for the embedding layer, the final fully connected layer, or any
      {0,1}-D parameters; optimize those with a standard method (e.g., AdamW).
    - For 4D convolutional filters, flattening their last 3 dimensions works well.
    - It is believed unlikely to work well with small batch sizes.
    - It may not work well for finetuning pretrained models (untested).
    - It has not been tried on training scenarios larger than NanoGPT (124M).
    - WORLD_SIZE and RANK must be set in the environment, the number of parameters of each
      size must be divisible by WORLD_SIZE, and the gather buffers are allocated on 'cuda'.

    Arguments:
        lr: The learning rate used by the internal SGD.
        momentum: The momentum used by the internal SGD.
        nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
        ns_steps: The number of Newton-Schulz iteration steps to use.
    """

    def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
        self.world_size = int(os.environ['WORLD_SIZE'])
        self.rank = int(os.environ['RANK'])
        params = list(params)
        assert all(isinstance(p, torch.Tensor) for p in params)
        defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
        # One param group per distinct element count, so a single set of flat gather
        # buffers (one slot per rank) can be reused for every parameter in the group.
        param_groups = []
        for size in {p.numel() for p in params}:
            same_size_params = [p for p in params if p.numel() == size]
            gather_slots = [
                torch.empty(size, device='cuda', dtype=torch.bfloat16)
                for _ in range(self.world_size)
            ]
            param_groups.append({'params': same_size_params, 'update_buffer': gather_slots})
        super().__init__(param_groups, defaults)

    def step(self):
        for group in self.param_groups:
            lr, momentum = group['lr'], group['momentum']
            nesterov, ns_steps = group['nesterov'], group['ns_steps']
            update_buffers = group['update_buffer']
            # generate weight updates in distributed fashion
            params = group['params']
            assert len(params) % self.world_size == 0
            pending_handle = None
            pending_params = None

            def apply_pending():
                # Wait for the previous all_gather and apply its updates; no-op on the first chunk.
                if pending_params is None:
                    return
                assert pending_handle is not None
                pending_handle.wait()
                for target, flat_update in zip(pending_params, update_buffers):
                    scale = max(1, target.size(0) / target.size(1)) ** 0.5
                    target.data.add_(flat_update.view_as(target), alpha=-lr * scale)

            for base_i in range(0, len(params), self.world_size):
                # This rank owns exactly one parameter of the current chunk.
                p = params[base_i + self.rank]
                g = p.grad
                assert g is not None
                state = self.state[p]
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(g)
                buf = state['momentum_buffer']
                buf.lerp_(g, 1 - momentum)
                if nesterov:
                    g = g.lerp_(buf, momentum)
                else:
                    g = buf
                g = zeropower_via_newtonschulz5(g, steps=ns_steps).flatten()
                # The gather buffers are reused, so the previous chunk must be consumed
                # before the next all_gather is launched into them.
                apply_pending()
                pending_handle = dist.all_gather(update_buffers, g, async_op=True)
                pending_params = params[base_i:base_i + self.world_size]
            # flush the final chunk of this group
            apply_pending()
# ----------------------------------------------------------------------------- | |
# PyTorch nn.Module definitions | |
def norm(x): | |
return F.rms_norm(x, (x.size(-1),)) | |
class CastedLinear(nn.Linear):
    """Bias-free linear layer that casts its weight to the input's dtype on every forward."""

    def __init__(self, in_features, out_features):
        super().__init__(in_features, out_features, bias=False)

    def forward(self, x):
        # The stored weight may have a different dtype than the activations; cast it per call.
        weight = self.weight.to(x.dtype)
        return F.linear(x, weight)
class Rotary(torch.nn.Module):
    """
    Rotary position embedding applied over dim 1 (sequence) of a 4D input.

    The cos/sin tables are cached and only rebuilt when the sequence length changes.
    """

    def __init__(self, dim, base=10000):
        super().__init__()
        exponents = torch.arange(0, dim, 2) / dim
        self.register_buffer('inv_freq', (1 / base) ** exponents)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x):
        seq_len = x.shape[1]
        if self.seq_len_cached != seq_len:
            # (re)build the angle table: one row per position, one column per frequency
            positions = torch.arange(seq_len, device=x.device)
            angles = torch.outer(positions, self.inv_freq)
            self.seq_len_cached = seq_len
            self.cos_cached, self.sin_cached = angles.cos(), angles.sin()
        # broadcast over dims 0 and 2 of x
        cos = self.cos_cached[None, :, None, :]
        sin = self.sin_cached[None, :, None, :]
        # rotate each (first-half, second-half) feature pair by its position-dependent angle
        first_half, second_half = x.chunk(2, dim=3)
        rotated_first = first_half * cos + second_half * sin
        rotated_second = first_half * (-sin) + second_half * cos
        return torch.cat((rotated_first, rotated_second), 3).type_as(x)
class CausalSelfAttention(nn.Module):
    """
    Multi-head self-attention built on FlexAttention, with QK-norm, rotary embeddings and
    a learned mix between the projected values and an external value embedding `vi`.
    Which positions may attend to each other is decided entirely by `block_mask`.
    """

    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0
        self.num_heads = num_heads
        self.c_q = CastedLinear(dim, dim)
        self.c_k = CastedLinear(dim, dim)
        self.c_v = CastedLinear(dim, dim)
        self.lambdas = nn.Parameter(torch.tensor([0.5, 0.5]))
        head_dim = dim // num_heads
        self.rotary = Rotary(head_dim)
        self.c_proj = CastedLinear(dim, dim)
        self.c_proj.weight.data.zero_()  # zero init suggested by @Grad62304977

    def forward(self, x, vi, block_mask):
        B, T = x.shape[:2]  # batch size, sequence length
        assert B == 1, "Must use batch size = 1 for FlexAttention"
        # split the model dimension into (num_heads, head_dim)
        q = self.c_q(x).view(B, T, self.num_heads, -1)
        k = self.c_k(x).view(B, T, self.num_heads, -1)
        v = self.c_v(x).view(B, T, self.num_heads, -1)
        # blend projected values with the token value embedding  # @KoszarskyB & @Grad62304977
        v = self.lambdas[0] * v + self.lambdas[1] * vi.view_as(v)
        # QK norm @Grad62304977, followed by rotary position encoding
        q = self.rotary(norm(q))
        k = self.rotary(norm(k))
        kernel_options = {
            "BLOCK_M": 64, "BLOCK_N": 64,  # forward
            "BLOCK_M1": 32, "BLOCK_N1": 64, "BLOCK_M2": 64, "BLOCK_N2": 32,  # backwards
        }
        y = flex_attention(
            q.transpose(1, 2),
            k.transpose(1, 2),
            v.transpose(1, 2),
            block_mask=block_mask,
            enable_gqa=True,
            kernel_options=kernel_options,
        )
        # re-assemble all head outputs side by side, then project back
        y = y.transpose(1, 2).contiguous().view_as(x)
        return self.c_proj(y)
class MLP(nn.Module):
    """Feed-forward block: expand to 4*dim, apply squared ReLU, project back to dim."""

    def __init__(self, dim):
        super().__init__()
        hidden_dim = 4 * dim
        self.c_fc = CastedLinear(dim, hidden_dim)
        self.c_proj = CastedLinear(hidden_dim, dim)
        self.c_proj.weight.data.zero_()  # zero init suggested by @Grad62304977

    def forward(self, x):
        hidden = self.c_fc(x)
        # ReLU^2: https://arxiv.org/abs/2109.08668v2; ~1-2% better than GELU; suggested by @SKYLINEZ007 and @Grad62304977
        hidden = F.relu(hidden).square()
        return self.c_proj(hidden)
class Block(nn.Module):
    """
    One transformer layer: a learned mix of the running stream `x` with the initial
    embedding `x0`, followed by pre-norm attention and pre-norm MLP residual updates.
    """

    def __init__(self, config):
        super().__init__()
        self.attn = CausalSelfAttention(config.model_dim, config.num_heads)
        self.mlp = MLP(config.model_dim)
        # initialised to [1, 0]: start by passing x through and ignoring x0
        self.lambdas = nn.Parameter(torch.tensor([1., 0.]))

    def forward(self, x, vi, x0, block_mask):
        mix_x, mix_x0 = self.lambdas[0], self.lambdas[1]
        x = mix_x * x + mix_x0 * x0
        attn_out = self.attn(norm(x), vi, block_mask)
        x = x + attn_out
        mlp_out = self.mlp(norm(x))
        return x + mlp_out
class ValueEmbedding(nn.Module):
    """
    Six token-value embedding tables, returned as a mirrored list of twelve tensors
    (tables 0..5 followed by 5..0), so the first and last entries share a table.
    """

    def __init__(self, config: "ModelConfig"):
        super().__init__()
        # NOTE: the table count is hard-coded to 6, i.e. 12 mirrored outputs.
        tables = [nn.Embedding(config.vocab_size, config.model_dim) for _ in range(6)]
        self.embed = nn.ModuleList(tables)

    def forward(self, inputs) -> "list[torch.Tensor]":
        first_half = [table(inputs) for table in self.embed]
        # second half reuses the very same tensors in reverse order
        return first_half + first_half[::-1]
# ----------------------------------------------------------------------------- | |
# The main ESM Bert model | |
class BERT(nn.Module):
    """
    ESM-style masked language model with a U-net layer layout.

    The first half of the blocks ("encoder") store their outputs, which the second half
    ("decoder") adds back in reverse order through learnable skip weights. The input is a
    single flat 1D sequence of token ids in which several documents are packed back to
    back; a new document starts at every `bos_id` token, and attention is restricted to
    tokens of the same document inside a bidirectional sliding window.
    """

    def __init__(self, config: "ModelConfig"):
        super().__init__()
        self.mask_id = 32  # token id written into masked positions
        self.bos_id = 0  # token id that marks the start of a document
        self.num_layers = config.num_layers
        # U-net design by @brendanh0gan
        self.num_encoder_layers = config.num_layers // 2  # Half of the layers for encoder
        self.num_decoder_layers = config.num_layers - self.num_encoder_layers  # Remaining for decoder
        # Add learnable skip connection weights for decoder layers
        self.skip_weights = nn.Parameter(torch.ones(self.num_decoder_layers))
        self.embed = nn.Embedding(config.vocab_size, config.model_dim)
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.num_layers)])
        # token value embeddings by @KoszarskyB - inspired by @Grad62304977's value residual learning
        # U-net structure on token value embeddings by @leloykun
        self.value_embeds = ValueEmbedding(config)
        self.lm_head = CastedLinear(config.model_dim, config.vocab_size)
        self.lm_head.weight.data.zero_()  # @Grad62304977

    def encoder_pass(self, input_seq: torch.Tensor, sliding_window_size: torch.Tensor):
        """
        Run the network on an (already corrupted) 1D token sequence.

        Arguments:
            input_seq: 1D tensor of token ids.
            sliding_window_size: scalar tensor; a query attends to keys whose distance
                is strictly below this value.
        Returns:
            float32 logits of shape (1, len(input_seq), vocab_size), soft-capped to (-30, 30).
        """
        # document index of every position: increases by one at each BOS token
        docs = (input_seq == self.bos_id).cumsum(0)

        def doc_mask_mod(b, h, q_idx, kv_idx):
            bidirectional_sliding_window_mask = torch.abs(q_idx - kv_idx) < sliding_window_size
            doc_mask = docs[q_idx] == docs[kv_idx]
            return bidirectional_sliding_window_mask & doc_mask

        S = len(input_seq)
        block_mask = create_block_mask(
            doc_mask_mod, None, None, S, S,
        )
        x = self.embed(input_seq[None])
        x = norm(x)  # @Grad62304977
        x0 = x
        ve = self.value_embeds(input_seq)
        ve_enc, ve_dec = ve[:self.num_encoder_layers], ve[self.num_encoder_layers:]
        # Store outputs for U-Net skip connections
        skip_connections = []
        # Encoder pass - process only the first half of the blocks
        for i in range(self.num_encoder_layers):
            x = self.blocks[i](x, ve_enc[i], x0, block_mask)
            skip_connections.append(x)
        # Decoder pass - process the remaining blocks with weighted skip connections
        for i in range(self.num_decoder_layers):
            x = x + self.skip_weights[i] * skip_connections.pop()
            # U-net structure on token value embeddings by @leloykun
            x = self.blocks[self.num_encoder_layers + i](x, ve_dec[i], x0, block_mask)
        x = norm(x)
        logits = self.lm_head(x)
        logits = 30 * torch.tanh(logits / 30)  # @Grad62304977
        logits = logits.float()
        return logits

    def forward(self, seq, sliding_window_size: torch.Tensor):
        """
        Corrupt `seq` for masked language modelling and return the cross-entropy loss
        over the corrupted/selected positions only.

        Arguments:
            seq: 1D tensor of clean token ids.
            sliding_window_size: scalar tensor forwarded to `encoder_pass`.
        Returns:
            Scalar loss tensor.
        """
        # MLM mask/replace constants from https://www.biorxiv.org/content/10.1101/2022.07.20.500902v3.full.pdf
        pct_masked = 0.12
        pct_replaced = 0.015
        pct_kept = 0.015
        # set pct_masked% to <mask>
        mlm_mask = self.get_frac_mask(seq, pct_masked, torch.ones_like(seq, dtype=torch.bool))
        input_seq = seq.clone().masked_fill(mlm_mask, self.mask_id)
        # substitute pct_replaced% with token id between 4 and 30 (inclusive)
        sub_mask = self.get_frac_mask(seq, pct_replaced, ~mlm_mask)
        input_seq[sub_mask] = torch.randint(4, 31, (sub_mask.sum(),), dtype=seq.dtype, device=seq.device)
        # retain pct_kept% unchanged, but still score them in the loss
        keep_mask = self.get_frac_mask(seq, pct_kept, ~(sub_mask | mlm_mask))
        mlm_loss_mask = mlm_mask | sub_mask | keep_mask
        logits = self.encoder_pass(input_seq, sliding_window_size)
        # positions outside the loss mask get the ignore label -100
        return F.cross_entropy(
            logits.view(-1, logits.size(-1)),
            seq.masked_fill(~mlm_loss_mask, -100).to(dtype=torch.int64).view(-1),
            ignore_index=-100
        )

    def get_frac_mask(self, seq: torch.Tensor, pct: float, include=None):
        """
        Randomly select, independently inside every document, up to ceil(pct * doc_length)
        maskable tokens.

        A token is maskable when its id lies in [4, 30] and, if `include` is given,
        `include` is True at its position. Documents are delimited by `bos_id` tokens.

        Arguments:
            seq: 1D tensor of token ids.
            pct: fraction of each document's total token count to select.
            include: optional bool tensor shaped like `seq` restricting the candidates.
        Returns:
            Bool tensor shaped like `seq`, True at the selected positions.
        """
        docs = (seq == self.bos_id).cumsum(0)
        valid_tokens_mask = (seq >= 4) & (seq <= 30)
        if include is not None:
            valid_tokens_mask &= include
        # non-maskable tokens get score 0 so they sort behind every maskable token
        random_values = torch.rand_like(docs, dtype=torch.float) * valid_tokens_mask
        # Map each token to its doc index, count tokens per doc, and compute how many to mask
        _, inv_docs = torch.unique(docs, return_inverse=True)
        doc_counts = torch.bincount(inv_docs)  # total tokens in each doc
        num_to_mask = (doc_counts.float() * pct).ceil().to(torch.int64)
        # Rank tokens WITHIN their own document by random score. Comparing a global rank
        # against a per-document quota would select at most max(quota) tokens in the whole
        # sequence. Sort by score first, then stably by document, so each document's
        # tokens end up contiguous and ordered by descending score.
        by_score = torch.argsort(random_values, descending=True)
        order = by_score[torch.argsort(inv_docs[by_score], stable=True)]
        doc_starts = torch.cumsum(doc_counts, 0) - doc_counts  # offset of each doc in `order`
        ranks = torch.empty_like(order)
        ranks[order] = torch.arange(len(seq), device=seq.device) - doc_starts[inv_docs[order]]
        # never select a non-maskable token, even if a document has fewer candidates than its quota
        return (ranks < num_to_mask[inv_docs]) & valid_tokens_mask
# ----------------------------------------------------------------------------- | |
# Our own simple Distributed Data Loader | |
def _peek_data_shard(file: Path): | |
# only reads the header, returns header data | |
# header is 256 int32 | |
header = torch.from_file(f"{file}", False, 256, dtype=torch.int32) | |
assert header[0] == 20240520, "magic number mismatch in the data .bin file" | |
assert header[1] == 1, "unsupported version" | |
return int(header[2]) # number of tokens (claimed) | |
def _load_data_shard(path: Path, num_tokens): | |
with path.open("rb", buffering=0) as f: | |
tokens = torch.empty(num_tokens, dtype=torch.uint16, pin_memory=True) | |
f.seek(256 * 4) | |
nbytes = f.readinto(tokens.numpy()) | |
assert nbytes == 2 * num_tokens, "number of tokens read does not match header?" | |
return tokens | |
class DistributedDataLoader:
    """
    Minimal sharded token loader for multi-process training.

    Shard files matching a glob pattern (relative to the current working directory) are
    visited in sorted order, wrapping around at the end. Within a shard, process `r`
    starts at offset `r * seq_len` and all processes advance by `seq_len * num_processes`
    tokens per batch.
    """

    def __init__(self, filename_pattern, seq_len, process_rank, num_processes):
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.seq_len = seq_len
        # glob files that match the pattern
        self.files = sorted(Path.cwd().glob(filename_pattern))
        assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}"
        # validate every shard header and collect the claimed token counts
        self.files_num_tokens = list(map(_peek_data_shard, self.files))
        assert min(self.files_num_tokens) >= num_processes * seq_len + 1
        self.total_num_tokens = sum(self.files_num_tokens)
        self.reset()

    def reset(self):
        # start over from the first shard
        self.current_shard = -1
        self.advance()

    def advance(self):  # advance to next data shard
        next_shard = (self.current_shard + 1) % len(self.files)
        self.current_shard = next_shard
        self.current_position = self.process_rank * self.seq_len
        self.tokens = _load_data_shard(self.files[next_shard], self.files_num_tokens[next_shard])

    def next_batch(self):
        start = self.current_position
        window = self.tokens[start:start + self.seq_len + 1]
        # host side async is sufficient;
        # no performance improvement was observed when introducing a separate stream.
        seq = window.to(device="cuda", dtype=torch.int32, non_blocking=True)  # inputs
        # advance current position and load next shard if necessary
        stride = self.seq_len * self.num_processes
        self.current_position += stride
        if self.current_position + stride + 1 >= len(self.tokens):
            self.advance()
        return seq
# ----------------------------------------------------------------------------- | |
# int main | |
@dataclass
class Hyperparameters:
    """Data, optimization and evaluation settings for a training run."""

    # --- data ---
    input_bin: str = 'data/omgprot50/omgprot50_train_*.bin'  # glob of training shards
    input_val_bin: str = 'data/omgprot50/omgprot50_val_*.bin'  # glob of validation shards
    # --- optimization ---
    batch_size: int = 16  # global batch size, in sequences, summed over all devices
    sequence_length: int = 32 * 1024  # tokens per sequence
    num_iterations: int = 6000  # total number of training iterations
    warmup_iters: int = 0  # iterations of linear LR warmup
    cooldown_iters: int = 5000  # iterations of linear LR cooldown (triangular or trapezoidal schedule)
    weight_decay: float = 0
    # --- evaluation and logging ---
    val_loss_every: int = 25  # evaluate val loss every N steps; 0 means only at the end
    val_tokens: int = 10 * 1024 * 1024  # validation tokens per evaluation; keep fixed for consistent comparisons
@dataclass
class ModelConfig:
    """
    Architecture settings.

    Reference points (ESM2 family, all with 20 heads):
    - 8M:   6 layers,  320 hidden dim: https://huggingface.co/facebook/esm2_t6_8M_UR50D/blob/main/config.json
    - 35M:  12 layers, 480 hidden dim: https://huggingface.co/facebook/esm2_t12_35M_UR50D/blob/main/config.json
    - 150M: 30 layers, 640 hidden dim: https://huggingface.co/facebook/esm2_t30_150M_UR50D/blob/main/config.json
    - 650M: 33 layers, 1280 hidden dim: https://huggingface.co/facebook/esm2_t33_650M_UR50D/blob/main/config.json
    For PLMs, the number of layers is typically more important than the hidden dimension.
    """

    # 33 tokens: https://huggingface.co/Synthyra/ESMplusplus_large/blob/main/modeling_esm_plusplus.py#L868-L874
    vocab_size: int = 33
    num_layers: int = 12
    num_heads: int = 6  # head dim 128 suggested by @Grad62304977
    model_dim: int = 768
# Default architecture and run settings used by everything below.
model_config, args = ModelConfig(), Hyperparameters()
def get_param_count(model):
    """Return the total number of elements over all parameters of `model`."""
    return sum(param.numel() for _, param in model.named_parameters())
# set up DDP (distributed data parallel). torchrun exports these env variables for every worker
ddp_rank, ddp_local_rank, ddp_world_size = (
    int(os.environ[key]) for key in ('RANK', 'LOCAL_RANK', 'WORLD_SIZE')
)
assert torch.cuda.is_available()
device = torch.device(f'cuda:{ddp_local_rank}')
torch.cuda.set_device(device)
print(f'using device: {device}')
dist.init_process_group(backend='nccl', device_id=device)
dist.barrier()
# only rank 0 does logging, checkpointing etc.
master_process = ddp_rank == 0

# begin logging
logfile = None
if master_process:
    run_id = uuid.uuid4()
    log_root = Path('logs')
    log_root.mkdir(exist_ok=True)
    # logdir = Path('logs') / f'{run_id}'
    # logdir.mkdir()
    logfile = log_root / f'{run_id}.txt'
    print(logfile.stem)
    # create the log file; it starts with this file's own source followed by a separator
    with logfile.open('w') as f:
        print(code, file=f)
        print('=' * 100, file=f)
def print0(s, logonly=False):
    """
    Log `s` on the master process only: append it to the run's logfile and, unless
    `logonly` is set, also print it to stdout. Does nothing on other processes.
    """
    if not master_process:
        return
    with logfile.open('a') as log_handle:
        if not logonly:
            print(s)
        print(s, file=log_handle)
# Record the software/hardware environment; the full `nvidia-smi` output goes to the logfile only.
print0(f'Running python {sys.version}')
print0(f'Running pytorch {torch.version.__version__} compiled for CUDA {torch.version.cuda}\nnvidia-smi:')
import subprocess
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
print0(f'{result.stdout}', logonly=True)
print0('=' * 100, logonly=True)

# number of steps in the validation loop: every step consumes one sequence per process
tokens_per_val_step = args.sequence_length * ddp_world_size
assert args.val_tokens % tokens_per_val_step == 0
val_steps = args.val_tokens // tokens_per_val_step
# gradient accumulation steps needed to reach the desired global batch size
assert args.batch_size % (ddp_world_size) == 0
train_accumulation_steps = args.batch_size // ddp_world_size

# load tokens
train_loader = DistributedDataLoader(args.input_bin, args.sequence_length, ddp_rank, ddp_world_size)
val_loader = DistributedDataLoader(args.input_val_bin, args.sequence_length, ddp_rank, ddp_world_size)
print0(f"Training DataLoader: total number of tokens: {train_loader.total_num_tokens} across {len(train_loader.files)} files")
print0(f"Validation DataLoader: total number of tokens: {val_loader.total_num_tokens} across {len(val_loader.files)} files")
print0('=' * 100, logonly=True)
seq_train = train_loader.next_batch()

# build the model in bfloat16, but keep the CastedLinear weights in float32
model = BERT(model_config)
model = model.cuda().bfloat16()
for module in model.modules():
    if isinstance(module, CastedLinear):
        module.float()
config.coordinate_descent_tuning = True  # suggested by @Chillee
model = torch.compile(model)
# here we wrap model into DDP container
model = DDP(model, device_ids=[ddp_local_rank], broadcast_buffers=False, gradient_as_bucket_view=True)
raw_model = model.module  # always contains the "raw" unwrapped model

# init the optimizer(s): Adam for embeddings, head and scalars; Muon for the 2D block matrices
adam_betas = (0.8, 0.95)
embed_params = [*raw_model.embed.parameters(), *raw_model.value_embeds.parameters()]
optimizer1 = torch.optim.Adam(embed_params, lr=0.01, betas=adam_betas, fused=True)
optimizer2 = torch.optim.Adam([raw_model.lm_head.weight], lr=0.0001, betas=adam_betas, fused=True)
params = list(raw_model.blocks.parameters())
matrix_params = [p for p in params if p.ndim == 2]
scalar_params = [p for p in params if p.ndim < 2] + [raw_model.skip_weights]
optimizer3 = Muon(matrix_params, lr=0.003, momentum=0.95)
optimizer4 = torch.optim.Adam(scalar_params, lr=0.003, betas=adam_betas, fused=True)
optimizers = [optimizer1, optimizer2, optimizer3, optimizer4]
# learning rate decay scheduler (linear warmup and cooldown)
def get_lr(it):
    """
    Return the learning-rate multiplier for iteration `it`.

    The schedule is trapezoidal: linear warmup over `args.warmup_iters` iterations, a
    constant plateau at 1.0, then a linear cooldown over the last `args.cooldown_iters`
    iterations that reaches 0 at `args.num_iterations`. With `cooldown_iters == 0` the
    multiplier stays at 1.0 through the final iteration.
    """
    assert it <= args.num_iterations
    # 1) linear warmup for warmup_iters steps
    if it < args.warmup_iters:
        return (it + 1) / args.warmup_iters
    # 2) constant lr for a while
    if it < args.num_iterations - args.cooldown_iters:
        return 1.0
    # No cooldown phase configured: the scheduler still calls this with
    # it == num_iterations after the last step, which used to divide by zero.
    if args.cooldown_iters <= 0:
        return 1.0
    # 3) linear cooldown
    return (args.num_iterations - it) / args.cooldown_iters
schedulers = [torch.optim.lr_scheduler.LambdaLR(opt, get_lr) for opt in optimizers] | |
sliding_window_size = torch.tensor(1024 - 128, dtype=torch.int32, device="cuda") | |
sw_prev = 1024 - 128 | |
# Start training loop | |
training_time_ms = 0 | |
# start the clock | |
torch.cuda.synchronize() | |
t0 = time.perf_counter() | |
# begin training | |
for step in range(args.num_iterations + 1): | |
last_step = (step == args.num_iterations) | |
# This effectively ignores timing first 10 steps, which are slower for weird reasons. | |
# Alternately, and slightly more correctly in terms of benchmarking, we could do 10 | |
# steps with dummy data first, and then re-initialize the model and reset the loader. | |
if step == 10: | |
training_time_ms = 0 | |
t0 = time.perf_counter() | |
timed_steps = float('nan') if step <= 11 else (step - 10) + 1 # <= 11 to avoid bug in val | |
# Linearly increase the sliding window size over training in chunks of 128 from 1024 -> 2048. By @fernbear.bsky.social | |
frac_done = step / args.num_iterations # training progress | |
sw_size = int(((1 - frac_done) * 1023 + frac_done * 2048) // 128) * 128 | |
if sw_size != sw_prev: | |
sliding_window_size.copy_(sw_size, non_blocking=True) | |
sw_prev = sw_size | |
# once in a while evaluate the validation dataset | |
if (last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0)): | |
# stop the clock | |
torch.cuda.synchronize() | |
training_time_ms += 1000 * (time.perf_counter() - t0) | |
# run validation batches | |
model.eval() | |
val_loader.reset() | |
val_loss = 0.0 | |
for _ in range(val_steps): | |
with torch.no_grad(): | |
seq_val = val_loader.next_batch() | |
val_loss += model(seq_val, sliding_window_size) | |
dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) | |
val_loss /= val_steps | |
# log val loss to console and to logfile | |
print0(f'step:{step}/{args.num_iterations} val_loss:{val_loss:.4f} train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms/(timed_steps-1):.2f}ms perplexity:{(math.e**val_loss):.4f} param_count:{get_param_count(model):,}') | |
# start the clock again | |
torch.cuda.synchronize() | |
t0 = time.perf_counter() | |
# uncomment if you want to save any checkpoints | |
#save_every = 1000 | |
#if master_process and (last_step or (save_every > 0 and step % save_every == 0)): | |
# # stop the clock | |
# torch.cuda.synchronize() | |
# training_time_ms += 1000 * (time.perf_counter() - t0) | |
# # save the state of the training process | |
# log = dict(step=step, code=code, model=raw_model.state_dict(), optimizers=[opt.state_dict() for opt in optimizers]) | |
# torch.save(log, 'logs/%s/state_step%06d.pt' % (run_id, step)) | |
# # start the clock again | |
# torch.cuda.synchronize() | |
# t0 = time.perf_counter() | |
# bit confusing: we want to make sure to eval on 0th iteration | |
# but also after the very last iteration. so we loop for step <= num_iterations | |
# instead of just < num_iterations (one extra due to <=), only to do | |
# the validation/sampling one last time, and then we break right here as we're done. | |
if last_step: | |
break | |
# --------------- TRAINING SECTION BEGIN -----------------
model.train()
# Run `train_accumulation_steps` forward/backward passes, accumulating
# gradients into `.grad` before a single optimizer step.
for i in range(1, train_accumulation_steps + 1):
    with contextlib.ExitStack() as stack:
        if i < train_accumulation_steps:
            # there's no need to sync gradients every accumulation step;
            # only the final micro-step runs outside `no_sync()`
            stack.enter_context(model.no_sync())
        #if step >= 5:
        #    stack.enter_context(torch.compiler.set_stance(skip_guard_eval_unsafe=True))
        model(seq_train, sliding_window_size).backward()
        # fetch the batch that the next forward pass will consume
        seq_train = train_loader.next_batch()
if train_accumulation_steps != 1:
    # Turn the summed gradients into a mean over the micro-steps.
    for p in model.parameters():
        # `.grad` stays None for a parameter that received no gradient
        # (e.g. a frozen one); dividing None would raise a TypeError.
        if p.grad is not None:
            p.grad /= train_accumulation_steps
# momentum warmup for Muon: linear ramp from 0.85 to 0.95 over the first
# 1000 steps, then held at 0.95
frac = min(step/1000, 1)
for group in optimizer3.param_groups:
    group['momentum'] = (1 - frac) * 0.85 + frac * 0.95
# step the optimizers and schedulers
for opt, sched in zip(optimizers, schedulers):
    opt.step()
    sched.step()
# null the gradients
model.zero_grad(set_to_none=True)
# --------------- TRAINING SECTION END -------------------
# everything that follows now is just diagnostics, prints, logging, etc.
# Accumulated training time plus the time since the clock was last (re)started.
# No torch.cuda.synchronize() is issued here, so the figure is an estimate
# (presumably why it is named `approx_time`).
approx_time = training_time_ms + 1000 * (time.perf_counter() - t0)
print0(f"step:{step+1}/{args.num_iterations} train_time:{approx_time:.0f}ms step_avg:{approx_time/timed_steps:.2f}ms")
# Report the peak CUDA memory allocated by this process, converted from bytes
# to MiB. NOTE(review): indentation was lost in this copy; the captured log
# shows no such line between steps, so this presumably runs once after the
# training loop -- confirm.
print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")
# -------------------------------------------------------------------------
# clean up nice
dist.destroy_process_group()
==================================================================================================== | |
Running python 3.11.10 | packaged by conda-forge | (main, Oct 16 2024, 01:27:36) [GCC 13.3.0] | |
Running pytorch 2.6.0.dev20241203+cu124 compiled for CUDA 12.4 | |
nvidia-smi: | |
Tue Dec 24 02:50:13 2024 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 550.120 Driver Version: 550.120 CUDA Version: 12.4 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA GeForce RTX 4090 On | 00000000:42:00.0 Off | Off | | |
| 30% 25C P2 44W / 450W | 1756MiB / 24564MiB | 17% Default | | |
| | | N/A | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA GeForce RTX 4090 On | 00000000:81:00.0 Off | Off | | |
| 31% 27C P2 41W / 450W | 591MiB / 24564MiB | 0% Default | | |
| | | N/A | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA GeForce RTX 4090 On | 00000000:82:00.0 Off | Off | | |
| 31% 26C P2 66W / 450W | 591MiB / 24564MiB | 0% Default | | |
| | | N/A | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA GeForce RTX 4090 On | 00000000:C1:00.0 Off | Off | | |
| 30% 29C P2 51W / 450W | 591MiB / 24564MiB | 0% Default | | |
| | | N/A | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
+-----------------------------------------------------------------------------------------+ | |
==================================================================================================== | |
Training DataLoader: total number of tokens: 27800000000 across 278 files | |
Validation DataLoader: total number of tokens: 100000000 across 1 files | |
==================================================================================================== | |
step:0/6000 val_loss:3.4965 train_time:0ms step_avg:nanms perplexity:33.0000 param_count:85,137,462 | |
step:1/6000 train_time:49132ms step_avg:nanms | |
step:2/6000 train_time:50804ms step_avg:nanms | |
step:3/6000 train_time:51750ms step_avg:nanms | |
step:4/6000 train_time:52736ms step_avg:nanms | |
step:5/6000 train_time:53729ms step_avg:nanms | |
step:6/6000 train_time:54724ms step_avg:nanms | |
step:7/6000 train_time:55709ms step_avg:nanms | |
step:8/6000 train_time:56693ms step_avg:nanms | |
step:9/6000 train_time:57682ms step_avg:nanms | |
step:10/6000 train_time:58682ms step_avg:nanms | |
step:11/6000 train_time:988ms step_avg:nanms | |
step:12/6000 train_time:1978ms step_avg:nanms | |
step:13/6000 train_time:2969ms step_avg:989.65ms | |
step:14/6000 train_time:3956ms step_avg:988.90ms | |
step:15/6000 train_time:4939ms step_avg:987.89ms | |
step:16/6000 train_time:5922ms step_avg:987.05ms | |
step:17/6000 train_time:6911ms step_avg:987.29ms | |
step:18/6000 train_time:7899ms step_avg:987.39ms | |
step:19/6000 train_time:8895ms step_avg:988.31ms | |
step:20/6000 train_time:9889ms step_avg:988.89ms | |
step:21/6000 train_time:10879ms step_avg:989.01ms | |
step:22/6000 train_time:11870ms step_avg:989.19ms | |
step:23/6000 train_time:12859ms step_avg:989.13ms | |
step:24/6000 train_time:13844ms step_avg:988.82ms | |
step:25/6000 train_time:14839ms step_avg:989.24ms | |
step:25/6000 val_loss:2.8944 train_time:14879ms step_avg:991.91ms perplexity:18.0722 param_count:85,137,462 | |
step:26/6000 train_time:15826ms step_avg:989.13ms | |
step:27/6000 train_time:16817ms step_avg:989.26ms | |
step:28/6000 train_time:17813ms step_avg:989.63ms | |
step:29/6000 train_time:18815ms step_avg:990.27ms | |
step:30/6000 train_time:19803ms step_avg:990.16ms | |
step:31/6000 train_time:20799ms step_avg:990.44ms | |
step:32/6000 train_time:21790ms step_avg:990.45ms | |
step:33/6000 train_time:22772ms step_avg:990.10ms | |
step:34/6000 train_time:23768ms step_avg:990.35ms | |
step:35/6000 train_time:24767ms step_avg:990.67ms | |
step:36/6000 train_time:25756ms step_avg:990.60ms | |
step:37/6000 train_time:26757ms step_avg:991.00ms | |
step:38/6000 train_time:27764ms step_avg:991.58ms | |
step:39/6000 train_time:28754ms step_avg:991.51ms | |
step:40/6000 train_time:29751ms step_avg:991.71ms | |
step:41/6000 train_time:30743ms step_avg:991.71ms | |
step:42/6000 train_time:31731ms step_avg:991.58ms | |
step:43/6000 train_time:32719ms step_avg:991.48ms | |
step:44/6000 train_time:33711ms step_avg:991.50ms | |
step:45/6000 train_time:34708ms step_avg:991.67ms | |
step:46/6000 train_time:35698ms step_avg:991.62ms | |
step:47/6000 train_time:36691ms step_avg:991.66ms | |
step:48/6000 train_time:37701ms step_avg:992.13ms | |
step:49/6000 train_time:38694ms step_avg:992.14ms | |
step:50/6000 train_time:39688ms step_avg:992.19ms | |
step:50/6000 val_loss:2.7718 train_time:39729ms step_avg:993.23ms perplexity:15.9871 param_count:85,137,462 | |
step:51/6000 train_time:40663ms step_avg:991.77ms | |
step:52/6000 train_time:41656ms step_avg:991.80ms | |
step:53/6000 train_time:42644ms step_avg:991.72ms | |
step:54/6000 train_time:43638ms step_avg:991.77ms | |
step:55/6000 train_time:44626ms step_avg:991.69ms | |
step:56/6000 train_time:45612ms step_avg:991.56ms | |
step:57/6000 train_time:46605ms step_avg:991.59ms | |
step:58/6000 train_time:47592ms step_avg:991.49ms | |
step:59/6000 train_time:48590ms step_avg:991.64ms | |
step:60/6000 train_time:49588ms step_avg:991.77ms | |
step:61/6000 train_time:50592ms step_avg:992.00ms | |
step:62/6000 train_time:51581ms step_avg:991.94ms | |
step:63/6000 train_time:52575ms step_avg:991.99ms | |
step:64/6000 train_time:53577ms step_avg:992.16ms | |
step:65/6000 train_time:54568ms step_avg:992.15ms | |
step:66/6000 train_time:55582ms step_avg:992.53ms | |
step:67/6000 train_time:56576ms step_avg:992.56ms | |
step:68/6000 train_time:57560ms step_avg:992.42ms | |
step:69/6000 train_time:58553ms step_avg:992.42ms | |
step:70/6000 train_time:59541ms step_avg:992.35ms | |
step:71/6000 train_time:60542ms step_avg:992.49ms | |
step:72/6000 train_time:61532ms step_avg:992.46ms | |
step:73/6000 train_time:62522ms step_avg:992.42ms | |
step:74/6000 train_time:63521ms step_avg:992.51ms | |
step:75/6000 train_time:64523ms step_avg:992.66ms | |
step:75/6000 val_loss:2.7101 train_time:64564ms step_avg:993.29ms perplexity:15.0314 param_count:85,137,462 | |
step:76/6000 train_time:65515ms step_avg:992.65ms | |
step:77/6000 train_time:66507ms step_avg:992.64ms | |
step:78/6000 train_time:67501ms step_avg:992.66ms | |
step:79/6000 train_time:68496ms step_avg:992.70ms | |
step:80/6000 train_time:69492ms step_avg:992.74ms | |
step:81/6000 train_time:70486ms step_avg:992.77ms | |
step:82/6000 train_time:71469ms step_avg:992.63ms | |
step:83/6000 train_time:72469ms step_avg:992.73ms | |
step:84/6000 train_time:73452ms step_avg:992.60ms | |
step:85/6000 train_time:74443ms step_avg:992.58ms | |
step:86/6000 train_time:75420ms step_avg:992.37ms | |
step:87/6000 train_time:76420ms step_avg:992.46ms | |
step:88/6000 train_time:77417ms step_avg:992.53ms | |
step:89/6000 train_time:78414ms step_avg:992.58ms | |
step:90/6000 train_time:79405ms step_avg:992.57ms | |
step:91/6000 train_time:80398ms step_avg:992.57ms | |
step:92/6000 train_time:81396ms step_avg:992.64ms | |
step:93/6000 train_time:82379ms step_avg:992.51ms | |
step:94/6000 train_time:83375ms step_avg:992.56ms | |
step:95/6000 train_time:84371ms step_avg:992.60ms | |
step:96/6000 train_time:85365ms step_avg:992.61ms | |
step:97/6000 train_time:86348ms step_avg:992.51ms | |
step:98/6000 train_time:87334ms step_avg:992.44ms | |
step:99/6000 train_time:88329ms step_avg:992.46ms | |
step:100/6000 train_time:89320ms step_avg:992.44ms | |
step:100/6000 val_loss:2.6657 train_time:89362ms step_avg:992.91ms perplexity:14.3780 param_count:85,137,462 | |
step:101/6000 train_time:90319ms step_avg:992.51ms | |
step:102/6000 train_time:91318ms step_avg:992.59ms | |
step:103/6000 train_time:92330ms step_avg:992.79ms | |
step:104/6000 train_time:93319ms step_avg:992.76ms | |
step:105/6000 train_time:94305ms step_avg:992.68ms | |
step:106/6000 train_time:95302ms step_avg:992.73ms | |
step:107/6000 train_time:96298ms step_avg:992.76ms | |
step:108/6000 train_time:97289ms step_avg:992.74ms | |
step:109/6000 train_time:98292ms step_avg:992.85ms | |
step:110/6000 train_time:99279ms step_avg:992.79ms | |
step:111/6000 train_time:100264ms step_avg:992.71ms | |
step:112/6000 train_time:101258ms step_avg:992.72ms | |
step:113/6000 train_time:102251ms step_avg:992.73ms | |
step:114/6000 train_time:103246ms step_avg:992.75ms | |
step:115/6000 train_time:104251ms step_avg:992.87ms | |
step:116/6000 train_time:105261ms step_avg:993.03ms | |
step:117/6000 train_time:106260ms step_avg:993.08ms | |
step:118/6000 train_time:107255ms step_avg:993.10ms | |
step:119/6000 train_time:108251ms step_avg:993.13ms | |
step:120/6000 train_time:109255ms step_avg:993.23ms | |
step:121/6000 train_time:110256ms step_avg:993.30ms | |
step:122/6000 train_time:111253ms step_avg:993.33ms | |
step:123/6000 train_time:112244ms step_avg:993.31ms | |
step:124/6000 train_time:113228ms step_avg:993.23ms | |
step:125/6000 train_time:114219ms step_avg:993.21ms | |
step:125/6000 val_loss:2.6492 train_time:114259ms step_avg:993.56ms perplexity:14.1429 param_count:85,137,462 | |
step:126/6000 train_time:115204ms step_avg:993.14ms | |
step:127/6000 train_time:116195ms step_avg:993.12ms | |
step:128/6000 train_time:117197ms step_avg:993.19ms | |
step:129/6000 train_time:118189ms step_avg:993.19ms | |
step:130/6000 train_time:119187ms step_avg:993.22ms | |
step:131/6000 train_time:120177ms step_avg:993.20ms | |
step:132/6000 train_time:121180ms step_avg:993.27ms | |
step:133/6000 train_time:122177ms step_avg:993.31ms | |
step:134/6000 train_time:123162ms step_avg:993.24ms | |
step:135/6000 train_time:124158ms step_avg:993.26ms | |
step:136/6000 train_time:125154ms step_avg:993.29ms | |
step:137/6000 train_time:126147ms step_avg:993.28ms | |
step:138/6000 train_time:127141ms step_avg:993.29ms | |
step:139/6000 train_time:128139ms step_avg:993.32ms | |
step:140/6000 train_time:129136ms step_avg:993.36ms | |
step:141/6000 train_time:130135ms step_avg:993.40ms | |
step:142/6000 train_time:131119ms step_avg:993.32ms | |
step:143/6000 train_time:132119ms step_avg:993.38ms | |
step:144/6000 train_time:133121ms step_avg:993.44ms | |
step:145/6000 train_time:134119ms step_avg:993.48ms | |
step:146/6000 train_time:135111ms step_avg:993.46ms | |
step:147/6000 train_time:136105ms step_avg:993.47ms | |
step:148/6000 train_time:137093ms step_avg:993.43ms | |
step:149/6000 train_time:138100ms step_avg:993.53ms | |
step:150/6000 train_time:139094ms step_avg:993.53ms | |
step:150/6000 val_loss:2.6275 train_time:139135ms step_avg:993.82ms perplexity:13.8396 param_count:85,137,462 | |
step:151/6000 train_time:140088ms step_avg:993.53ms | |
step:152/6000 train_time:141080ms step_avg:993.52ms | |
step:153/6000 train_time:142070ms step_avg:993.50ms | |
step:154/6000 train_time:143058ms step_avg:993.46ms | |
step:155/6000 train_time:144050ms step_avg:993.45ms | |
step:156/6000 train_time:145045ms step_avg:993.46ms | |
step:157/6000 train_time:146044ms step_avg:993.49ms | |
step:158/6000 train_time:147031ms step_avg:993.45ms | |
step:159/6000 train_time:148016ms step_avg:993.39ms | |
step:160/6000 train_time:149009ms step_avg:993.39ms | |
step:161/6000 train_time:150011ms step_avg:993.45ms | |
step:162/6000 train_time:151002ms step_avg:993.43ms | |
step:163/6000 train_time:151989ms step_avg:993.40ms | |
step:164/6000 train_time:152987ms step_avg:993.42ms | |
step:165/6000 train_time:153973ms step_avg:993.37ms | |
step:166/6000 train_time:154976ms step_avg:993.44ms | |
step:167/6000 train_time:155973ms step_avg:993.46ms | |
step:168/6000 train_time:156955ms step_avg:993.39ms | |
step:169/6000 train_time:157980ms step_avg:993.58ms | |
step:170/6000 train_time:158971ms step_avg:993.57ms | |
step:171/6000 train_time:159972ms step_avg:993.62ms | |
step:172/6000 train_time:160958ms step_avg:993.57ms | |
step:173/6000 train_time:161945ms step_avg:993.53ms | |
step:174/6000 train_time:162942ms step_avg:993.55ms | |
step:175/6000 train_time:163943ms step_avg:993.59ms | |
step:175/6000 val_loss:2.6247 train_time:163982ms step_avg:993.83ms perplexity:13.8002 param_count:85,137,462 | |
step:176/6000 train_time:164927ms step_avg:993.54ms | |
step:177/6000 train_time:165922ms step_avg:993.54ms | |
step:178/6000 train_time:166916ms step_avg:993.55ms | |
step:179/6000 train_time:167901ms step_avg:993.50ms | |
step:180/6000 train_time:168893ms step_avg:993.49ms | |
step:181/6000 train_time:169882ms step_avg:993.46ms | |
step:182/6000 train_time:170871ms step_avg:993.43ms | |
step:183/6000 train_time:171867ms step_avg:993.45ms | |
step:184/6000 train_time:172867ms step_avg:993.49ms | |
step:185/6000 train_time:173860ms step_avg:993.49ms | |
step:186/6000 train_time:174852ms step_avg:993.48ms | |
step:187/6000 train_time:175836ms step_avg:993.43ms | |
step:188/6000 train_time:176833ms step_avg:993.45ms | |
step:189/6000 train_time:177842ms step_avg:993.53ms | |
step:190/6000 train_time:178836ms step_avg:993.54ms | |
step:191/6000 train_time:179825ms step_avg:993.51ms | |
step:192/6000 train_time:180818ms step_avg:993.51ms | |
step:193/6000 train_time:181812ms step_avg:993.51ms | |
step:194/6000 train_time:182802ms step_avg:993.49ms | |
step:195/6000 train_time:183784ms step_avg:993.43ms | |
step:196/6000 train_time:184784ms step_avg:993.46ms | |
step:197/6000 train_time:185777ms step_avg:993.46ms | |
step:198/6000 train_time:186781ms step_avg:993.51ms | |
step:199/6000 train_time:187787ms step_avg:993.58ms | |
step:200/6000 train_time:188780ms step_avg:993.58ms | |
step:200/6000 val_loss:2.6182 train_time:188820ms step_avg:993.79ms perplexity:13.7111 param_count:85,137,462 | |
step:201/6000 train_time:189764ms step_avg:993.53ms | |
step:202/6000 train_time:190752ms step_avg:993.50ms | |
step:203/6000 train_time:191744ms step_avg:993.49ms | |
step:204/6000 train_time:192741ms step_avg:993.51ms | |
step:205/6000 train_time:193736ms step_avg:993.52ms | |
step:206/6000 train_time:194716ms step_avg:993.45ms | |
step:207/6000 train_time:195702ms step_avg:993.41ms | |
step:208/6000 train_time:196693ms step_avg:993.40ms | |
step:209/6000 train_time:197684ms step_avg:993.39ms | |
step:210/6000 train_time:198690ms step_avg:993.45ms | |
step:211/6000 train_time:199696ms step_avg:993.51ms | |
step:212/6000 train_time:200693ms step_avg:993.53ms | |
step:213/6000 train_time:201687ms step_avg:993.53ms | |
step:214/6000 train_time:202684ms step_avg:993.55ms | |
step:215/6000 train_time:203683ms step_avg:993.58ms | |
step:216/6000 train_time:204680ms step_avg:993.59ms | |
step:217/6000 train_time:205676ms step_avg:993.60ms | |
step:218/6000 train_time:206673ms step_avg:993.62ms | |
step:219/6000 train_time:207666ms step_avg:993.62ms | |
step:220/6000 train_time:208656ms step_avg:993.60ms | |
step:221/6000 train_time:209642ms step_avg:993.57ms | |
step:222/6000 train_time:210638ms step_avg:993.58ms | |
step:223/6000 train_time:211636ms step_avg:993.60ms | |
step:224/6000 train_time:212631ms step_avg:993.60ms | |
step:225/6000 train_time:213615ms step_avg:993.56ms | |
step:225/6000 val_loss:2.6169 train_time:213657ms step_avg:993.75ms perplexity:13.6932 param_count:85,137,462 | |
step:226/6000 train_time:214604ms step_avg:993.54ms | |
step:227/6000 train_time:215600ms step_avg:993.55ms | |
step:228/6000 train_time:216589ms step_avg:993.53ms | |
step:229/6000 train_time:217584ms step_avg:993.53ms | |
step:230/6000 train_time:218576ms step_avg:993.53ms | |
step:231/6000 train_time:219578ms step_avg:993.57ms | |
step:232/6000 train_time:220572ms step_avg:993.57ms | |
step:233/6000 train_time:221568ms step_avg:993.58ms | |
step:234/6000 train_time:222567ms step_avg:993.60ms | |
step:235/6000 train_time:223564ms step_avg:993.62ms | |
step:236/6000 train_time:224552ms step_avg:993.59ms | |
step:237/6000 train_time:225544ms step_avg:993.58ms | |
step:238/6000 train_time:226541ms step_avg:993.60ms | |
step:239/6000 train_time:227539ms step_avg:993.62ms | |
step:240/6000 train_time:228528ms step_avg:993.60ms | |
step:241/6000 train_time:229528ms step_avg:993.63ms | |
step:242/6000 train_time:230528ms step_avg:993.66ms | |
step:243/6000 train_time:231518ms step_avg:993.64ms | |
step:244/6000 train_time:232511ms step_avg:993.64ms | |
step:245/6000 train_time:233506ms step_avg:993.64ms | |
step:246/6000 train_time:234495ms step_avg:993.62ms | |
step:247/6000 train_time:235488ms step_avg:993.62ms | |
step:248/6000 train_time:236480ms step_avg:993.62ms | |
step:249/6000 train_time:237479ms step_avg:993.64ms | |
step:250/6000 train_time:238473ms step_avg:993.64ms | |
step:250/6000 val_loss:2.6120 train_time:238515ms step_avg:993.81ms perplexity:13.6259 param_count:85,137,462 | |
step:251/6000 train_time:239469ms step_avg:993.65ms | |
step:252/6000 train_time:240466ms step_avg:993.66ms | |
step:253/6000 train_time:241471ms step_avg:993.71ms | |
step:254/6000 train_time:242468ms step_avg:993.72ms | |
step:255/6000 train_time:243456ms step_avg:993.70ms | |
step:256/6000 train_time:244456ms step_avg:993.72ms | |
step:257/6000 train_time:245458ms step_avg:993.76ms | |
step:258/6000 train_time:246450ms step_avg:993.75ms | |
step:259/6000 train_time:247452ms step_avg:993.78ms | |
step:260/6000 train_time:248438ms step_avg:993.75ms | |
step:261/6000 train_time:249424ms step_avg:993.72ms | |
step:262/6000 train_time:250415ms step_avg:993.71ms | |
step:263/6000 train_time:251407ms step_avg:993.70ms | |
step:264/6000 train_time:252402ms step_avg:993.71ms | |
step:265/6000 train_time:253390ms step_avg:993.69ms | |
step:266/6000 train_time:254386ms step_avg:993.70ms | |
step:267/6000 train_time:255369ms step_avg:993.65ms | |
step:268/6000 train_time:256368ms step_avg:993.67ms | |
step:269/6000 train_time:257368ms step_avg:993.70ms | |
step:270/6000 train_time:258363ms step_avg:993.70ms | |
step:271/6000 train_time:259363ms step_avg:993.73ms | |
step:272/6000 train_time:260351ms step_avg:993.71ms | |
step:273/6000 train_time:261340ms step_avg:993.69ms | |
step:274/6000 train_time:262334ms step_avg:993.69ms | |
step:275/6000 train_time:263338ms step_avg:993.73ms | |
step:275/6000 val_loss:2.6139 train_time:263379ms step_avg:993.88ms perplexity:13.6515 param_count:85,137,462 | |
step:276/6000 train_time:264324ms step_avg:993.70ms | |
step:277/6000 train_time:265314ms step_avg:993.69ms | |
step:278/6000 train_time:266309ms step_avg:993.69ms | |
step:279/6000 train_time:267303ms step_avg:993.69ms | |
step:280/6000 train_time:268299ms step_avg:993.70ms | |
step:281/6000 train_time:269297ms step_avg:993.71ms | |
step:282/6000 train_time:270288ms step_avg:993.71ms | |
step:283/6000 train_time:271282ms step_avg:993.71ms | |
step:284/6000 train_time:272287ms step_avg:993.75ms | |
step:285/6000 train_time:273284ms step_avg:993.76ms | |
step:286/6000 train_time:274282ms step_avg:993.78ms | |
step:287/6000 train_time:275282ms step_avg:993.80ms | |
step:288/6000 train_time:276278ms step_avg:993.80ms | |
step:289/6000 train_time:277259ms step_avg:993.76ms | |
step:290/6000 train_time:278256ms step_avg:993.77ms | |
step:291/6000 train_time:279240ms step_avg:993.74ms | |
step:292/6000 train_time:280229ms step_avg:993.72ms | |
step:293/6000 train_time:281221ms step_avg:993.72ms | |
step:294/6000 train_time:282221ms step_avg:993.74ms | |
step:295/6000 train_time:283218ms step_avg:993.75ms | |
step:296/6000 train_time:284218ms step_avg:993.77ms | |
step:297/6000 train_time:285208ms step_avg:993.76ms | |
step:298/6000 train_time:286197ms step_avg:993.74ms | |
step:299/6000 train_time:287182ms step_avg:993.71ms | |
step:300/6000 train_time:288168ms step_avg:993.68ms | |
step:300/6000 val_loss:2.6143 train_time:288207ms step_avg:993.82ms perplexity:13.6575 param_count:85,137,462 | |
step:301/6000 train_time:289162ms step_avg:993.68ms | |
step:302/6000 train_time:290157ms step_avg:993.69ms | |
step:303/6000 train_time:291140ms step_avg:993.65ms | |
step:304/6000 train_time:292130ms step_avg:993.64ms | |
step:305/6000 train_time:293121ms step_avg:993.63ms | |
step:306/6000 train_time:294122ms step_avg:993.66ms | |
step:307/6000 train_time:295118ms step_avg:993.66ms | |
step:308/6000 train_time:296114ms step_avg:993.67ms | |
step:309/6000 train_time:297122ms step_avg:993.72ms | |
step:310/6000 train_time:298123ms step_avg:993.74ms | |
step:311/6000 train_time:299125ms step_avg:993.77ms | |
step:312/6000 train_time:300119ms step_avg:993.77ms | |
step:313/6000 train_time:301109ms step_avg:993.76ms | |
step:314/6000 train_time:302105ms step_avg:993.77ms | |
step:315/6000 train_time:303094ms step_avg:993.75ms | |
step:316/6000 train_time:304083ms step_avg:993.73ms | |
step:317/6000 train_time:305079ms step_avg:993.74ms | |
step:318/6000 train_time:306071ms step_avg:993.74ms | |
step:319/6000 train_time:307069ms step_avg:993.75ms | |
step:320/6000 train_time:308064ms step_avg:993.76ms | |
step:321/6000 train_time:309052ms step_avg:993.73ms | |
step:322/6000 train_time:310045ms step_avg:993.73ms | |
step:323/6000 train_time:311035ms step_avg:993.72ms | |
step:324/6000 train_time:312030ms step_avg:993.73ms | |
step:325/6000 train_time:313048ms step_avg:993.80ms | |
step:325/6000 val_loss:2.6193 train_time:313078ms step_avg:993.90ms perplexity:13.7256 param_count:85,137,462 | |
step:326/6000 train_time:314032ms step_avg:993.77ms | |
step:327/6000 train_time:315036ms step_avg:993.81ms | |
step:328/6000 train_time:316032ms step_avg:993.81ms | |
step:329/6000 train_time:317023ms step_avg:993.80ms | |
step:330/6000 train_time:318012ms step_avg:993.79ms | |
step:331/6000 train_time:319003ms step_avg:993.78ms | |
step:332/6000 train_time:319989ms step_avg:993.75ms | |
step:333/6000 train_time:320986ms step_avg:993.76ms | |
step:334/6000 train_time:321976ms step_avg:993.75ms | |
step:335/6000 train_time:322981ms step_avg:993.79ms | |
step:336/6000 train_time:323981ms step_avg:993.81ms | |
step:337/6000 train_time:324981ms step_avg:993.83ms | |
step:338/6000 train_time:325971ms step_avg:993.81ms | |
step:339/6000 train_time:326959ms step_avg:993.80ms | |
step:340/6000 train_time:327957ms step_avg:993.81ms | |
step:341/6000 train_time:328944ms step_avg:993.79ms | |
step:342/6000 train_time:329939ms step_avg:993.79ms | |
step:343/6000 train_time:330933ms step_avg:993.79ms | |
step:344/6000 train_time:331930ms step_avg:993.80ms | |
step:345/6000 train_time:332930ms step_avg:993.82ms | |
step:346/6000 train_time:333919ms step_avg:993.81ms | |
step:347/6000 train_time:334925ms step_avg:993.84ms | |
step:348/6000 train_time:335921ms step_avg:993.85ms | |
step:349/6000 train_time:336915ms step_avg:993.85ms | |
step:350/6000 train_time:337898ms step_avg:993.82ms | |
step:350/6000 val_loss:2.6077 train_time:337940ms step_avg:993.94ms perplexity:13.5671 param_count:85,137,462 | |
step:351/6000 train_time:338888ms step_avg:993.81ms | |
step:352/6000 train_time:339887ms step_avg:993.82ms | |
step:353/6000 train_time:340885ms step_avg:993.83ms | |
step:354/6000 train_time:341884ms step_avg:993.85ms | |
step:355/6000 train_time:342872ms step_avg:993.83ms | |
step:356/6000 train_time:343861ms step_avg:993.82ms | |
step:357/6000 train_time:344847ms step_avg:993.80ms | |
step:358/6000 train_time:345836ms step_avg:993.78ms | |
step:359/6000 train_time:346831ms step_avg:993.79ms | |
step:360/6000 train_time:347829ms step_avg:993.80ms | |
step:361/6000 train_time:348820ms step_avg:993.79ms | |
step:362/6000 train_time:349810ms step_avg:993.78ms | |
step:363/6000 train_time:350807ms step_avg:993.79ms | |
step:364/6000 train_time:351798ms step_avg:993.78ms | |
step:365/6000 train_time:352795ms step_avg:993.79ms | |
step:366/6000 train_time:353792ms step_avg:993.80ms | |
step:367/6000 train_time:354783ms step_avg:993.79ms | |
step:368/6000 train_time:355772ms step_avg:993.78ms | |
step:369/6000 train_time:356766ms step_avg:993.78ms | |
step:370/6000 train_time:357750ms step_avg:993.75ms | |
step:371/6000 train_time:358747ms step_avg:993.76ms | |
step:372/6000 train_time:359740ms step_avg:993.76ms | |
step:373/6000 train_time:360724ms step_avg:993.73ms | |
step:374/6000 train_time:361726ms step_avg:993.75ms | |
step:375/6000 train_time:362723ms step_avg:993.76ms | |
step:375/6000 val_loss:2.6025 train_time:362764ms step_avg:993.88ms perplexity:13.4977 param_count:85,137,462 | |
step:376/6000 train_time:363717ms step_avg:993.76ms | |
step:377/6000 train_time:364706ms step_avg:993.75ms | |
step:378/6000 train_time:365692ms step_avg:993.73ms | |
step:379/6000 train_time:366711ms step_avg:993.80ms | |
step:380/6000 train_time:367703ms step_avg:993.79ms | |
step:381/6000 train_time:368700ms step_avg:993.80ms | |
step:382/6000 train_time:369700ms step_avg:993.82ms | |
step:383/6000 train_time:370692ms step_avg:993.81ms | |
step:384/6000 train_time:371681ms step_avg:993.80ms | |
step:385/6000 train_time:372679ms step_avg:993.81ms | |
step:386/6000 train_time:373676ms step_avg:993.82ms | |
step:387/6000 train_time:374657ms step_avg:993.79ms | |
step:388/6000 train_time:375644ms step_avg:993.77ms | |
step:389/6000 train_time:376633ms step_avg:993.75ms | |
step:390/6000 train_time:377627ms step_avg:993.75ms | |
step:391/6000 train_time:378625ms step_avg:993.77ms | |
step:392/6000 train_time:379611ms step_avg:993.75ms | |
step:393/6000 train_time:380602ms step_avg:993.74ms | |
step:394/6000 train_time:381602ms step_avg:993.75ms | |
step:395/6000 train_time:382595ms step_avg:993.75ms | |
step:396/6000 train_time:383599ms step_avg:993.78ms | |
step:397/6000 train_time:384586ms step_avg:993.76ms | |
step:398/6000 train_time:385581ms step_avg:993.76ms | |
step:399/6000 train_time:386569ms step_avg:993.75ms | |
step:400/6000 train_time:387575ms step_avg:993.78ms | |
step:400/6000 val_loss:2.6019 train_time:387613ms step_avg:993.88ms perplexity:13.4896 param_count:85,137,462 | |
step:401/6000 train_time:388562ms step_avg:993.77ms | |
step:402/6000 train_time:389559ms step_avg:993.77ms | |
step:403/6000 train_time:390556ms step_avg:993.78ms | |
step:404/6000 train_time:391560ms step_avg:993.81ms | |
step:405/6000 train_time:392542ms step_avg:993.78ms | |
step:406/6000 train_time:393546ms step_avg:993.80ms | |
step:407/6000 train_time:394537ms step_avg:993.80ms | |
step:408/6000 train_time:395527ms step_avg:993.79ms | |
step:409/6000 train_time:396522ms step_avg:993.79ms | |
step:410/6000 train_time:397518ms step_avg:993.79ms | |
step:411/6000 train_time:398520ms step_avg:993.81ms | |
step:412/6000 train_time:399517ms step_avg:993.82ms | |
step:413/6000 train_time:400529ms step_avg:993.87ms | |
step:414/6000 train_time:401518ms step_avg:993.86ms | |
step:415/6000 train_time:402517ms step_avg:993.87ms | |
step:416/6000 train_time:403525ms step_avg:993.90ms | |
step:417/6000 train_time:404515ms step_avg:993.90ms | |
step:418/6000 train_time:405511ms step_avg:993.90ms | |
step:419/6000 train_time:406495ms step_avg:993.88ms | |
step:420/6000 train_time:407486ms step_avg:993.87ms | |
step:421/6000 train_time:408490ms step_avg:993.89ms | |
step:422/6000 train_time:409487ms step_avg:993.90ms | |
step:423/6000 train_time:410484ms step_avg:993.91ms | |
step:424/6000 train_time:411478ms step_avg:993.91ms | |
step:425/6000 train_time:412473ms step_avg:993.91ms | |
step:425/6000 val_loss:2.5998 train_time:412512ms step_avg:994.00ms perplexity:13.4604 param_count:85,137,462 | |
step:426/6000 train_time:413458ms step_avg:993.89ms | |
step:427/6000 train_time:414455ms step_avg:993.90ms | |
step:428/6000 train_time:415444ms step_avg:993.88ms | |
step:429/6000 train_time:416442ms step_avg:993.89ms | |
step:430/6000 train_time:417439ms step_avg:993.90ms | |
step:431/6000 train_time:418435ms step_avg:993.91ms | |
step:432/6000 train_time:419424ms step_avg:993.90ms | |
step:433/6000 train_time:420416ms step_avg:993.89ms | |
step:434/6000 train_time:421405ms step_avg:993.88ms | |
step:435/6000 train_time:422397ms step_avg:993.88ms | |
step:436/6000 train_time:423396ms step_avg:993.89ms | |
step:437/6000 train_time:424386ms step_avg:993.88ms | |
step:438/6000 train_time:425386ms step_avg:993.89ms | |
step:439/6000 train_time:426386ms step_avg:993.91ms | |
step:440/6000 train_time:427384ms step_avg:993.92ms | |
step:441/6000 train_time:428370ms step_avg:993.90ms | |
step:442/6000 train_time:429356ms step_avg:993.88ms | |
step:443/6000 train_time:430353ms step_avg:993.89ms | |
step:444/6000 train_time:431338ms step_avg:993.87ms | |
step:445/6000 train_time:432328ms step_avg:993.86ms | |
step:446/6000 train_time:433323ms step_avg:993.86ms | |
step:447/6000 train_time:434314ms step_avg:993.85ms | |
step:448/6000 train_time:435303ms step_avg:993.84ms | |
step:449/6000 train_time:436299ms step_avg:993.85ms | |
step:450/6000 train_time:437304ms step_avg:993.87ms | |
step:450/6000 val_loss:2.5874 train_time:437343ms step_avg:993.96ms perplexity:13.2945 param_count:85,137,462 | |
step:451/6000 train_time:438305ms step_avg:993.89ms | |
step:452/6000 train_time:439306ms step_avg:993.90ms | |
step:453/6000 train_time:440293ms step_avg:993.89ms | |
step:454/6000 train_time:441283ms step_avg:993.88ms | |
step:455/6000 train_time:442276ms step_avg:993.88ms | |
step:456/6000 train_time:443271ms step_avg:993.88ms | |
step:457/6000 train_time:444377ms step_avg:994.13ms | |
step:458/6000 train_time:445368ms step_avg:994.13ms | |
step:459/6000 train_time:446363ms step_avg:994.13ms | |
step:460/6000 train_time:447357ms step_avg:994.13ms | |
step:461/6000 train_time:448344ms step_avg:994.11ms | |
step:462/6000 train_time:449345ms step_avg:994.13ms | |
step:463/6000 train_time:450340ms step_avg:994.13ms | |
step:464/6000 train_time:451327ms step_avg:994.11ms | |
step:465/6000 train_time:452319ms step_avg:994.11ms | |
step:466/6000 train_time:453317ms step_avg:994.12ms | |
step:467/6000 train_time:454311ms step_avg:994.12ms | |
step:468/6000 train_time:455306ms step_avg:994.12ms | |
step:469/6000 train_time:456296ms step_avg:994.11ms | |
step:470/6000 train_time:457294ms step_avg:994.12ms | |
step:471/6000 train_time:458281ms step_avg:994.10ms | |
step:472/6000 train_time:459268ms step_avg:994.09ms | |
step:473/6000 train_time:460262ms step_avg:994.09ms | |
step:474/6000 train_time:461259ms step_avg:994.09ms | |
step:475/6000 train_time:462246ms step_avg:994.08ms | |
step:475/6000 val_loss:2.5989 train_time:462287ms step_avg:994.17ms perplexity:13.4496 param_count:85,137,462 | |
step:476/6000 train_time:463236ms step_avg:994.07ms | |
step:477/6000 train_time:464220ms step_avg:994.05ms | |
step:478/6000 train_time:465221ms step_avg:994.06ms | |
step:479/6000 train_time:466213ms step_avg:994.06ms | |
step:480/6000 train_time:467207ms step_avg:994.06ms | |
step:481/6000 train_time:468202ms step_avg:994.06ms | |
step:482/6000 train_time:469198ms step_avg:994.06ms | |
step:483/6000 train_time:470184ms step_avg:994.05ms | |
step:484/6000 train_time:471197ms step_avg:994.09ms | |
step:485/6000 train_time:472184ms step_avg:994.07ms | |
step:486/6000 train_time:473166ms step_avg:994.05ms | |
step:487/6000 train_time:474155ms step_avg:994.03ms | |
step:488/6000 train_time:475146ms step_avg:994.03ms | |
step:489/6000 train_time:476139ms step_avg:994.03ms | |
step:490/6000 train_time:477133ms step_avg:994.03ms | |
step:491/6000 train_time:478133ms step_avg:994.04ms | |
step:492/6000 train_time:479129ms step_avg:994.04ms | |
step:493/6000 train_time:480118ms step_avg:994.03ms | |
step:494/6000 train_time:481117ms step_avg:994.04ms | |
step:495/6000 train_time:482109ms step_avg:994.04ms | |
step:496/6000 train_time:483096ms step_avg:994.02ms | |
step:497/6000 train_time:484087ms step_avg:994.02ms | |
step:498/6000 train_time:485077ms step_avg:994.01ms | |
step:499/6000 train_time:486074ms step_avg:994.02ms | |
step:500/6000 train_time:487069ms step_avg:994.02ms | |
step:500/6000 val_loss:2.5898 train_time:487111ms step_avg:994.10ms perplexity:13.3265 param_count:85,137,462 | |
step:501/6000 train_time:488041ms step_avg:993.97ms | |
step:502/6000 train_time:489031ms step_avg:993.96ms | |
step:503/6000 train_time:490017ms step_avg:993.95ms | |
step:504/6000 train_time:491011ms step_avg:993.95ms | |
step:505/6000 train_time:492011ms step_avg:993.96ms | |
step:506/6000 train_time:493016ms step_avg:993.98ms | |
step:507/6000 train_time:494009ms step_avg:993.98ms | |
step:508/6000 train_time:495003ms step_avg:993.98ms | |
step:509/6000 train_time:495995ms step_avg:993.98ms | |
step:510/6000 train_time:496985ms step_avg:993.97ms | |
step:511/6000 train_time:497977ms step_avg:993.97ms | |
step:512/6000 train_time:498983ms step_avg:993.99ms | |
step:513/6000 train_time:499993ms step_avg:994.02ms | |
step:514/6000 train_time:500988ms step_avg:994.02ms | |
step:515/6000 train_time:501981ms step_avg:994.02ms | |
step:516/6000 train_time:502978ms step_avg:994.03ms | |
step:517/6000 train_time:503977ms step_avg:994.04ms | |
step:518/6000 train_time:504969ms step_avg:994.03ms | |
step:519/6000 train_time:505959ms step_avg:994.03ms | |
step:520/6000 train_time:506956ms step_avg:994.03ms | |
step:521/6000 train_time:507947ms step_avg:994.03ms | |
step:522/6000 train_time:508943ms step_avg:994.03ms | |
step:523/6000 train_time:509935ms step_avg:994.02ms | |
step:524/6000 train_time:510926ms step_avg:994.02ms | |
step:525/6000 train_time:511922ms step_avg:994.02ms | |
step:525/6000 val_loss:2.5866 train_time:511963ms step_avg:994.10ms perplexity:13.2843 param_count:85,137,462 | |
step:526/6000 train_time:512917ms step_avg:994.03ms | |
step:527/6000 train_time:513911ms step_avg:994.03ms | |
step:528/6000 train_time:514910ms step_avg:994.03ms | |
step:529/6000 train_time:515918ms step_avg:994.06ms | |
step:530/6000 train_time:516915ms step_avg:994.07ms | |
step:531/6000 train_time:517909ms step_avg:994.07ms | |
step:532/6000 train_time:518897ms step_avg:994.06ms | |
step:533/6000 train_time:519891ms step_avg:994.06ms | |
step:534/6000 train_time:520881ms step_avg:994.05ms | |
step:535/6000 train_time:521880ms step_avg:994.06ms | |
step:536/6000 train_time:522865ms step_avg:994.04ms | |
step:537/6000 train_time:523858ms step_avg:994.04ms | |
step:538/6000 train_time:524842ms step_avg:994.02ms | |
step:539/6000 train_time:525837ms step_avg:994.02ms | |
step:540/6000 train_time:526834ms step_avg:994.03ms | |
step:541/6000 train_time:527838ms step_avg:994.05ms | |
step:542/6000 train_time:528833ms step_avg:994.05ms | |
step:543/6000 train_time:529837ms step_avg:994.07ms | |
step:544/6000 train_time:530826ms step_avg:994.06ms | |
step:545/6000 train_time:531817ms step_avg:994.05ms | |
step:546/6000 train_time:532817ms step_avg:994.06ms | |
step:547/6000 train_time:533805ms step_avg:994.05ms | |
step:548/6000 train_time:534800ms step_avg:994.05ms | |
step:549/6000 train_time:535791ms step_avg:994.05ms | |
step:550/6000 train_time:536780ms step_avg:994.04ms | |
step:550/6000 val_loss:2.5959 train_time:536821ms step_avg:994.11ms perplexity:13.4090 param_count:85,137,462 | |
step:551/6000 train_time:537772ms step_avg:994.03ms | |
step:552/6000 train_time:538766ms step_avg:994.03ms | |
step:553/6000 train_time:539765ms step_avg:994.04ms | |
step:554/6000 train_time:540768ms step_avg:994.06ms | |
step:555/6000 train_time:541755ms step_avg:994.05ms | |
step:556/6000 train_time:542767ms step_avg:994.08ms | |
step:557/6000 train_time:543768ms step_avg:994.09ms | |
step:558/6000 train_time:544771ms step_avg:994.11ms | |
step:559/6000 train_time:545783ms step_avg:994.14ms | |
step:560/6000 train_time:546792ms step_avg:994.17ms | |
step:561/6000 train_time:547797ms step_avg:994.19ms | |
step:562/6000 train_time:548792ms step_avg:994.19ms | |
step:563/6000 train_time:549786ms step_avg:994.19ms | |
step:564/6000 train_time:550789ms step_avg:994.20ms | |
step:565/6000 train_time:551775ms step_avg:994.19ms | |
step:566/6000 train_time:552773ms step_avg:994.20ms | |
step:567/6000 train_time:553779ms step_avg:994.22ms | |
step:568/6000 train_time:554781ms step_avg:994.23ms | |
step:569/6000 train_time:555775ms step_avg:994.23ms | |
step:570/6000 train_time:556767ms step_avg:994.23ms | |
step:571/6000 train_time:557762ms step_avg:994.23ms | |
step:572/6000 train_time:558762ms step_avg:994.24ms | |
step:573/6000 train_time:559756ms step_avg:994.24ms | |
step:574/6000 train_time:560757ms step_avg:994.25ms | |
step:575/6000 train_time:561747ms step_avg:994.24ms | |
step:575/6000 val_loss:2.5778 train_time:561786ms step_avg:994.31ms perplexity:13.1681 param_count:85,137,462 | |
step:576/6000 train_time:562730ms step_avg:994.22ms | |
step:577/6000 train_time:563741ms step_avg:994.25ms | |
step:578/6000 train_time:564732ms step_avg:994.25ms | |
step:579/6000 train_time:565730ms step_avg:994.25ms | |
step:580/6000 train_time:566725ms step_avg:994.25ms | |
step:581/6000 train_time:567729ms step_avg:994.27ms | |
step:582/6000 train_time:568718ms step_avg:994.26ms | |
step:583/6000 train_time:569715ms step_avg:994.27ms | |
step:584/6000 train_time:570704ms step_avg:994.26ms | |
step:585/6000 train_time:571702ms step_avg:994.26ms | |
step:586/6000 train_time:572696ms step_avg:994.26ms | |
step:587/6000 train_time:573694ms step_avg:994.27ms | |
step:588/6000 train_time:574691ms step_avg:994.27ms | |
step:589/6000 train_time:575686ms step_avg:994.28ms | |
step:590/6000 train_time:576678ms step_avg:994.27ms | |
step:591/6000 train_time:577687ms step_avg:994.30ms | |
step:592/6000 train_time:578688ms step_avg:994.31ms | |
step:593/6000 train_time:579684ms step_avg:994.31ms | |
step:594/6000 train_time:580676ms step_avg:994.31ms | |
step:595/6000 train_time:581662ms step_avg:994.29ms | |
step:596/6000 train_time:582648ms step_avg:994.28ms | |
step:597/6000 train_time:583634ms step_avg:994.27ms | |
step:598/6000 train_time:584629ms step_avg:994.27ms | |
step:599/6000 train_time:585631ms step_avg:994.28ms | |
step:600/6000 train_time:586631ms step_avg:994.29ms | |
step:600/6000 val_loss:2.5902 train_time:586671ms step_avg:994.36ms perplexity:13.3320 param_count:85,137,462 | |
step:601/6000 train_time:587613ms step_avg:994.27ms | |
step:602/6000 train_time:588609ms step_avg:994.27ms | |
step:603/6000 train_time:589615ms step_avg:994.29ms | |
step:604/6000 train_time:590612ms step_avg:994.30ms | |
step:605/6000 train_time:591607ms step_avg:994.30ms | |
step:606/6000 train_time:592613ms step_avg:994.32ms | |
step:607/6000 train_time:593604ms step_avg:994.31ms | |
step:608/6000 train_time:594598ms step_avg:994.31ms | |
step:609/6000 train_time:595604ms step_avg:994.33ms | |
step:610/6000 train_time:596591ms step_avg:994.32ms | |
step:611/6000 train_time:597582ms step_avg:994.31ms | |
step:612/6000 train_time:598574ms step_avg:994.31ms | |
step:613/6000 train_time:599562ms step_avg:994.30ms | |
step:614/6000 train_time:600554ms step_avg:994.29ms | |
step:615/6000 train_time:601545ms step_avg:994.29ms | |
step:616/6000 train_time:602534ms step_avg:994.28ms | |
step:617/6000 train_time:603534ms step_avg:994.29ms | |
step:618/6000 train_time:604521ms step_avg:994.28ms | |
step:619/6000 train_time:605517ms step_avg:994.28ms | |
step:620/6000 train_time:606508ms step_avg:994.28ms | |
step:621/6000 train_time:607503ms step_avg:994.28ms | |
step:622/6000 train_time:608502ms step_avg:994.28ms | |
step:623/6000 train_time:609492ms step_avg:994.28ms | |
step:624/6000 train_time:610479ms step_avg:994.27ms | |
step:625/6000 train_time:611469ms step_avg:994.26ms | |
step:625/6000 val_loss:2.5817 train_time:611510ms step_avg:994.33ms perplexity:13.2199 param_count:85,137,462 | |
step:626/6000 train_time:612463ms step_avg:994.26ms | |
step:627/6000 train_time:613452ms step_avg:994.25ms | |
step:628/6000 train_time:614445ms step_avg:994.25ms | |
step:629/6000 train_time:615439ms step_avg:994.25ms | |
step:630/6000 train_time:616434ms step_avg:994.25ms | |
step:631/6000 train_time:617421ms step_avg:994.24ms | |
step:632/6000 train_time:618414ms step_avg:994.23ms | |
step:633/6000 train_time:619418ms step_avg:994.25ms | |
step:634/6000 train_time:620418ms step_avg:994.26ms | |
step:635/6000 train_time:621408ms step_avg:994.25ms | |
step:636/6000 train_time:622404ms step_avg:994.26ms | |
step:637/6000 train_time:623398ms step_avg:994.26ms | |
step:638/6000 train_time:624390ms step_avg:994.25ms | |
step:639/6000 train_time:625384ms step_avg:994.25ms | |
step:640/6000 train_time:626378ms step_avg:994.25ms | |
step:641/6000 train_time:627377ms step_avg:994.26ms | |
step:642/6000 train_time:628369ms step_avg:994.25ms | |
step:643/6000 train_time:629359ms step_avg:994.25ms | |
step:644/6000 train_time:630347ms step_avg:994.24ms | |
step:645/6000 train_time:631351ms step_avg:994.25ms | |
step:646/6000 train_time:632344ms step_avg:994.25ms | |
step:647/6000 train_time:633338ms step_avg:994.25ms | |
step:648/6000 train_time:634339ms step_avg:994.26ms | |
step:649/6000 train_time:635334ms step_avg:994.26ms | |
step:650/6000 train_time:636326ms step_avg:994.26ms | |
step:650/6000 val_loss:2.5805 train_time:636367ms step_avg:994.32ms perplexity:13.2032 param_count:85,137,462 | |
step:651/6000 train_time:637316ms step_avg:994.25ms | |
step:652/6000 train_time:638305ms step_avg:994.24ms | |
step:653/6000 train_time:639303ms step_avg:994.25ms | |
step:654/6000 train_time:640304ms step_avg:994.26ms | |
step:655/6000 train_time:641300ms step_avg:994.26ms | |
step:656/6000 train_time:642293ms step_avg:994.26ms | |
step:657/6000 train_time:643286ms step_avg:994.26ms | |
step:658/6000 train_time:644286ms step_avg:994.27ms | |
step:659/6000 train_time:645317ms step_avg:994.32ms | |
step:660/6000 train_time:646315ms step_avg:994.33ms | |
step:661/6000 train_time:647311ms step_avg:994.33ms | |
step:662/6000 train_time:648314ms step_avg:994.35ms | |
step:663/6000 train_time:649307ms step_avg:994.34ms | |
step:664/6000 train_time:650306ms step_avg:994.35ms | |
step:665/6000 train_time:651304ms step_avg:994.36ms | |
step:666/6000 train_time:652290ms step_avg:994.35ms | |
step:667/6000 train_time:653301ms step_avg:994.37ms | |
step:668/6000 train_time:654290ms step_avg:994.36ms | |
step:669/6000 train_time:655279ms step_avg:994.35ms | |
step:670/6000 train_time:656273ms step_avg:994.35ms | |
step:671/6000 train_time:657256ms step_avg:994.34ms | |
step:672/6000 train_time:658241ms step_avg:994.32ms | |
step:673/6000 train_time:659226ms step_avg:994.31ms | |
step:674/6000 train_time:660223ms step_avg:994.31ms | |
step:675/6000 train_time:661210ms step_avg:994.30ms | |
step:675/6000 val_loss:2.5815 train_time:661251ms step_avg:994.36ms perplexity:13.2173 param_count:85,137,462 | |
step:676/6000 train_time:662201ms step_avg:994.30ms | |
step:677/6000 train_time:663185ms step_avg:994.28ms | |
step:678/6000 train_time:664179ms step_avg:994.28ms | |
step:679/6000 train_time:665172ms step_avg:994.28ms | |
step:680/6000 train_time:666174ms step_avg:994.29ms | |
step:681/6000 train_time:667155ms step_avg:994.27ms | |
step:682/6000 train_time:668143ms step_avg:994.26ms | |
step:683/6000 train_time:669140ms step_avg:994.26ms | |
step:684/6000 train_time:670133ms step_avg:994.26ms | |
step:685/6000 train_time:671125ms step_avg:994.26ms | |
step:686/6000 train_time:672117ms step_avg:994.26ms | |
step:687/6000 train_time:673106ms step_avg:994.25ms | |
step:688/6000 train_time:674103ms step_avg:994.25ms | |
step:689/6000 train_time:675109ms step_avg:994.27ms | |
step:690/6000 train_time:676099ms step_avg:994.26ms | |
step:691/6000 train_time:677092ms step_avg:994.26ms | |
step:692/6000 train_time:678090ms step_avg:994.27ms | |
step:693/6000 train_time:679085ms step_avg:994.27ms | |
step:694/6000 train_time:680074ms step_avg:994.26ms | |
step:695/6000 train_time:681070ms step_avg:994.26ms | |
step:696/6000 train_time:682055ms step_avg:994.25ms | |
step:697/6000 train_time:683053ms step_avg:994.25ms | |
step:698/6000 train_time:684045ms step_avg:994.25ms | |
step:699/6000 train_time:685035ms step_avg:994.24ms | |
step:700/6000 train_time:686031ms step_avg:994.25ms | |
step:700/6000 val_loss:2.5840 train_time:686073ms step_avg:994.31ms perplexity:13.2506 param_count:85,137,462 | |
step:701/6000 train_time:687027ms step_avg:994.25ms | |
step:702/6000 train_time:688015ms step_avg:994.24ms | |
step:703/6000 train_time:689015ms step_avg:994.25ms | |
step:704/6000 train_time:690014ms step_avg:994.26ms | |
step:705/6000 train_time:691005ms step_avg:994.25ms | |
step:706/6000 train_time:692005ms step_avg:994.26ms | |
step:707/6000 train_time:692998ms step_avg:994.26ms | |
step:708/6000 train_time:693992ms step_avg:994.26ms | |
step:709/6000 train_time:694987ms step_avg:994.26ms | |
step:710/6000 train_time:695977ms step_avg:994.25ms | |
step:711/6000 train_time:696972ms step_avg:994.25ms | |
step:712/6000 train_time:697970ms step_avg:994.26ms | |
step:713/6000 train_time:698966ms step_avg:994.26ms | |
step:714/6000 train_time:699964ms step_avg:994.27ms | |
step:715/6000 train_time:700953ms step_avg:994.26ms | |
step:716/6000 train_time:701954ms step_avg:994.27ms | |
step:717/6000 train_time:702961ms step_avg:994.29ms | |
step:718/6000 train_time:703969ms step_avg:994.31ms | |
step:719/6000 train_time:704954ms step_avg:994.29ms | |
step:720/6000 train_time:705954ms step_avg:994.30ms | |
step:721/6000 train_time:706952ms step_avg:994.31ms | |
step:722/6000 train_time:707976ms step_avg:994.35ms | |
step:723/6000 train_time:708972ms step_avg:994.35ms | |
step:724/6000 train_time:709979ms step_avg:994.37ms | |
step:725/6000 train_time:710978ms step_avg:994.38ms | |
step:725/6000 val_loss:2.5802 train_time:711019ms step_avg:994.43ms perplexity:13.2000 param_count:85,137,462 | |
step:726/6000 train_time:711974ms step_avg:994.38ms | |
step:727/6000 train_time:712971ms step_avg:994.38ms | |
step:728/6000 train_time:713976ms step_avg:994.40ms | |
step:729/6000 train_time:714976ms step_avg:994.40ms | |
step:730/6000 train_time:715971ms step_avg:994.40ms | |
step:731/6000 train_time:716958ms step_avg:994.39ms | |
step:732/6000 train_time:717953ms step_avg:994.39ms | |
step:733/6000 train_time:718949ms step_avg:994.40ms | |
step:734/6000 train_time:719944ms step_avg:994.40ms | |
step:735/6000 train_time:720945ms step_avg:994.41ms | |
step:736/6000 train_time:721941ms step_avg:994.41ms | |
step:737/6000 train_time:722931ms step_avg:994.40ms | |
step:738/6000 train_time:723921ms step_avg:994.40ms | |
step:739/6000 train_time:724910ms step_avg:994.39ms | |
step:740/6000 train_time:725899ms step_avg:994.38ms | |
step:741/6000 train_time:726893ms step_avg:994.38ms | |
step:742/6000 train_time:727886ms step_avg:994.38ms | |
step:743/6000 train_time:728878ms step_avg:994.38ms | |
step:744/6000 train_time:729872ms step_avg:994.38ms | |
step:745/6000 train_time:730862ms step_avg:994.37ms | |
step:746/6000 train_time:731863ms step_avg:994.38ms | |
step:747/6000 train_time:732847ms step_avg:994.36ms | |
step:748/6000 train_time:733841ms step_avg:994.37ms | |
step:749/6000 train_time:734845ms step_avg:994.38ms | |
step:750/6000 train_time:735833ms step_avg:994.37ms | |
step:750/6000 val_loss:2.5756 train_time:735875ms step_avg:994.43ms perplexity:13.1396 param_count:85,137,462 | |
step:751/6000 train_time:736823ms step_avg:994.36ms | |
step:752/6000 train_time:737820ms step_avg:994.37ms | |
step:753/6000 train_time:738805ms step_avg:994.35ms | |
step:754/6000 train_time:739794ms step_avg:994.35ms | |
step:755/6000 train_time:740793ms step_avg:994.35ms | |
step:756/6000 train_time:741796ms step_avg:994.36ms | |
step:757/6000 train_time:742795ms step_avg:994.37ms | |
step:758/6000 train_time:743792ms step_avg:994.37ms | |
step:759/6000 train_time:744783ms step_avg:994.37ms | |
step:760/6000 train_time:745772ms step_avg:994.36ms | |
step:761/6000 train_time:746804ms step_avg:994.41ms | |
step:762/6000 train_time:747804ms step_avg:994.42ms | |
step:763/6000 train_time:748806ms step_avg:994.43ms | |
step:764/6000 train_time:749796ms step_avg:994.42ms | |
step:765/6000 train_time:750785ms step_avg:994.42ms | |
step:766/6000 train_time:751785ms step_avg:994.42ms | |
step:767/6000 train_time:752772ms step_avg:994.41ms | |
step:768/6000 train_time:753774ms step_avg:994.43ms | |
step:769/6000 train_time:754778ms step_avg:994.44ms | |
step:770/6000 train_time:755784ms step_avg:994.45ms | |
step:771/6000 train_time:756777ms step_avg:994.45ms | |
step:772/6000 train_time:757768ms step_avg:994.45ms | |
step:773/6000 train_time:758752ms step_avg:994.43ms | |
step:774/6000 train_time:759744ms step_avg:994.43ms | |
step:775/6000 train_time:760740ms step_avg:994.43ms | |
step:775/6000 val_loss:2.5869 train_time:760779ms step_avg:994.48ms perplexity:13.2887 param_count:85,137,462 | |
step:776/6000 train_time:761728ms step_avg:994.42ms | |
step:777/6000 train_time:762729ms step_avg:994.43ms | |
step:778/6000 train_time:763724ms step_avg:994.43ms | |
step:779/6000 train_time:764715ms step_avg:994.43ms | |
step:780/6000 train_time:765702ms step_avg:994.42ms | |
step:781/6000 train_time:766707ms step_avg:994.43ms | |
step:782/6000 train_time:767707ms step_avg:994.44ms | |
step:783/6000 train_time:768698ms step_avg:994.43ms | |
step:784/6000 train_time:769693ms step_avg:994.44ms | |
step:785/6000 train_time:770690ms step_avg:994.44ms | |
step:786/6000 train_time:771679ms step_avg:994.43ms | |
step:787/6000 train_time:772676ms step_avg:994.43ms | |
step:788/6000 train_time:773662ms step_avg:994.42ms | |
step:789/6000 train_time:774661ms step_avg:994.43ms | |
step:790/6000 train_time:775654ms step_avg:994.43ms | |
step:791/6000 train_time:776647ms step_avg:994.43ms | |
step:792/6000 train_time:777652ms step_avg:994.44ms | |
step:793/6000 train_time:778643ms step_avg:994.43ms | |
step:794/6000 train_time:779633ms step_avg:994.43ms | |
step:795/6000 train_time:780625ms step_avg:994.43ms | |
step:796/6000 train_time:781626ms step_avg:994.43ms | |
step:797/6000 train_time:782625ms step_avg:994.44ms | |
step:798/6000 train_time:783612ms step_avg:994.43ms | |
step:799/6000 train_time:784608ms step_avg:994.43ms | |
step:800/6000 train_time:785605ms step_avg:994.44ms | |
step:800/6000 val_loss:2.5772 train_time:785646ms step_avg:994.49ms perplexity:13.1603 param_count:85,137,462 | |
step:801/6000 train_time:786600ms step_avg:994.44ms | |
step:802/6000 train_time:787597ms step_avg:994.44ms | |
step:803/6000 train_time:788583ms step_avg:994.43ms | |
step:804/6000 train_time:789568ms step_avg:994.42ms | |
step:805/6000 train_time:790564ms step_avg:994.42ms | |
step:806/6000 train_time:791561ms step_avg:994.42ms | |
step:807/6000 train_time:792552ms step_avg:994.42ms | |
step:808/6000 train_time:793549ms step_avg:994.42ms | |
step:809/6000 train_time:794539ms step_avg:994.42ms | |
step:810/6000 train_time:795539ms step_avg:994.42ms | |
step:811/6000 train_time:796519ms step_avg:994.41ms | |
step:812/6000 train_time:797511ms step_avg:994.40ms | |
step:813/6000 train_time:798502ms step_avg:994.40ms | |
step:814/6000 train_time:799491ms step_avg:994.39ms | |
step:815/6000 train_time:800491ms step_avg:994.40ms | |
step:816/6000 train_time:801483ms step_avg:994.40ms | |
step:817/6000 train_time:802483ms step_avg:994.40ms | |
step:818/6000 train_time:803471ms step_avg:994.40ms | |
step:819/6000 train_time:804467ms step_avg:994.40ms | |
step:820/6000 train_time:805463ms step_avg:994.40ms | |
step:821/6000 train_time:806473ms step_avg:994.42ms | |
step:822/6000 train_time:807475ms step_avg:994.43ms | |
step:823/6000 train_time:808466ms step_avg:994.42ms | |
step:824/6000 train_time:809464ms step_avg:994.43ms | |
step:825/6000 train_time:810461ms step_avg:994.43ms | |
step:825/6000 val_loss:2.5799 train_time:810502ms step_avg:994.48ms perplexity:13.1957 param_count:85,137,462 | |
step:826/6000 train_time:811453ms step_avg:994.43ms | |
step:827/6000 train_time:812445ms step_avg:994.43ms | |
step:828/6000 train_time:813427ms step_avg:994.41ms | |
step:829/6000 train_time:814425ms step_avg:994.41ms | |
step:830/6000 train_time:815421ms step_avg:994.42ms | |
step:831/6000 train_time:816407ms step_avg:994.41ms | |
step:832/6000 train_time:817406ms step_avg:994.41ms | |
step:833/6000 train_time:818407ms step_avg:994.42ms | |
step:834/6000 train_time:819400ms step_avg:994.42ms | |
step:835/6000 train_time:820389ms step_avg:994.41ms | |
step:836/6000 train_time:821391ms step_avg:994.42ms | |
step:837/6000 train_time:822382ms step_avg:994.42ms | |
step:838/6000 train_time:823373ms step_avg:994.41ms | |
step:839/6000 train_time:824370ms step_avg:994.42ms | |
step:840/6000 train_time:825367ms step_avg:994.42ms | |
step:841/6000 train_time:826366ms step_avg:994.42ms | |
step:842/6000 train_time:827357ms step_avg:994.42ms | |
step:843/6000 train_time:828344ms step_avg:994.41ms | |
step:844/6000 train_time:829336ms step_avg:994.41ms | |
step:845/6000 train_time:830321ms step_avg:994.40ms | |
step:846/6000 train_time:831316ms step_avg:994.40ms | |
step:847/6000 train_time:832313ms step_avg:994.40ms | |
step:848/6000 train_time:833314ms step_avg:994.41ms | |
step:849/6000 train_time:834307ms step_avg:994.41ms | |
step:850/6000 train_time:835311ms step_avg:994.42ms | |
step:850/6000 val_loss:2.5752 train_time:835353ms step_avg:994.47ms perplexity:13.1342 param_count:85,137,462 | |
step:851/6000 train_time:836307ms step_avg:994.42ms | |
step:852/6000 train_time:837297ms step_avg:994.41ms | |
step:853/6000 train_time:838286ms step_avg:994.41ms | |
step:854/6000 train_time:839280ms step_avg:994.41ms | |
step:855/6000 train_time:840272ms step_avg:994.40ms | |
step:856/6000 train_time:841269ms step_avg:994.41ms | |
step:857/6000 train_time:842260ms step_avg:994.40ms | |
step:858/6000 train_time:843263ms step_avg:994.41ms | |
step:859/6000 train_time:844260ms step_avg:994.42ms | |
step:860/6000 train_time:845257ms step_avg:994.42ms | |
step:861/6000 train_time:846242ms step_avg:994.41ms | |
step:862/6000 train_time:847236ms step_avg:994.41ms | |
step:863/6000 train_time:848234ms step_avg:994.41ms | |
step:864/6000 train_time:849231ms step_avg:994.42ms | |
step:865/6000 train_time:850213ms step_avg:994.40ms | |
step:866/6000 train_time:851208ms step_avg:994.40ms | |
step:867/6000 train_time:852200ms step_avg:994.40ms | |
step:868/6000 train_time:853198ms step_avg:994.40ms | |
step:869/6000 train_time:854196ms step_avg:994.41ms | |
step:870/6000 train_time:855188ms step_avg:994.40ms | |
step:871/6000 train_time:856181ms step_avg:994.40ms | |
step:872/6000 train_time:857176ms step_avg:994.40ms | |
step:873/6000 train_time:858178ms step_avg:994.41ms | |
step:874/6000 train_time:859174ms step_avg:994.41ms | |
step:875/6000 train_time:860175ms step_avg:994.42ms | |
step:875/6000 val_loss:2.5755 train_time:860216ms step_avg:994.47ms perplexity:13.1375 param_count:85,137,462 | |
step:876/6000 train_time:861169ms step_avg:994.42ms | |
step:877/6000 train_time:862182ms step_avg:994.44ms | |
step:878/6000 train_time:863170ms step_avg:994.44ms | |
step:879/6000 train_time:864166ms step_avg:994.44ms | |
step:880/6000 train_time:865164ms step_avg:994.44ms | |
step:881/6000 train_time:866158ms step_avg:994.44ms | |
step:882/6000 train_time:867152ms step_avg:994.44ms | |
step:883/6000 train_time:868147ms step_avg:994.44ms | |
step:884/6000 train_time:869136ms step_avg:994.44ms | |
step:885/6000 train_time:870135ms step_avg:994.44ms | |
step:886/6000 train_time:871130ms step_avg:994.44ms | |
step:887/6000 train_time:872147ms step_avg:994.47ms | |
step:888/6000 train_time:873137ms step_avg:994.46ms | |
step:889/6000 train_time:874139ms step_avg:994.47ms | |
step:890/6000 train_time:875138ms step_avg:994.47ms | |
step:891/6000 train_time:876135ms step_avg:994.48ms | |
step:892/6000 train_time:877132ms step_avg:994.48ms | |
step:893/6000 train_time:878123ms step_avg:994.48ms | |
step:894/6000 train_time:879121ms step_avg:994.48ms | |
step:895/6000 train_time:880143ms step_avg:994.51ms | |
step:896/6000 train_time:881130ms step_avg:994.50ms | |
step:897/6000 train_time:882124ms step_avg:994.50ms | |
step:898/6000 train_time:883130ms step_avg:994.52ms | |
step:899/6000 train_time:884124ms step_avg:994.51ms | |
step:900/6000 train_time:885109ms step_avg:994.51ms | |
step:900/6000 val_loss:2.5705 train_time:885151ms step_avg:994.55ms perplexity:13.0730 param_count:85,137,462 | |
step:901/6000 train_time:886100ms step_avg:994.50ms | |
step:902/6000 train_time:887092ms step_avg:994.50ms | |
step:903/6000 train_time:888086ms step_avg:994.50ms | |
step:904/6000 train_time:889096ms step_avg:994.51ms | |
step:905/6000 train_time:890092ms step_avg:994.52ms | |
step:906/6000 train_time:891104ms step_avg:994.54ms | |
step:907/6000 train_time:892105ms step_avg:994.54ms | |
step:908/6000 train_time:893097ms step_avg:994.54ms | |
step:909/6000 train_time:894084ms step_avg:994.53ms | |
step:910/6000 train_time:895087ms step_avg:994.54ms | |
step:911/6000 train_time:896080ms step_avg:994.54ms | |
step:912/6000 train_time:897085ms step_avg:994.55ms | |
step:913/6000 train_time:898077ms step_avg:994.55ms | |
step:914/6000 train_time:899074ms step_avg:994.55ms | |
step:915/6000 train_time:900059ms step_avg:994.54ms | |
step:916/6000 train_time:901058ms step_avg:994.55ms | |
step:917/6000 train_time:902056ms step_avg:994.55ms | |
step:918/6000 train_time:903050ms step_avg:994.55ms | |
step:919/6000 train_time:904044ms step_avg:994.55ms | |
step:920/6000 train_time:905054ms step_avg:994.56ms | |
step:921/6000 train_time:906049ms step_avg:994.57ms | |
step:922/6000 train_time:907046ms step_avg:994.57ms | |
step:923/6000 train_time:908038ms step_avg:994.56ms | |
step:924/6000 train_time:909032ms step_avg:994.56ms | |
step:925/6000 train_time:910021ms step_avg:994.56ms | |
step:925/6000 val_loss:2.5792 train_time:910061ms step_avg:994.60ms perplexity:13.1867 param_count:85,137,462 | |
step:926/6000 train_time:911023ms step_avg:994.57ms | |
step:927/6000 train_time:912009ms step_avg:994.56ms | |
step:928/6000 train_time:913008ms step_avg:994.56ms | |
step:929/6000 train_time:914017ms step_avg:994.58ms | |
step:930/6000 train_time:915017ms step_avg:994.58ms | |
step:931/6000 train_time:916019ms step_avg:994.59ms | |
step:932/6000 train_time:917018ms step_avg:994.60ms | |
step:933/6000 train_time:918015ms step_avg:994.60ms | |
step:934/6000 train_time:919014ms step_avg:994.60ms | |
step:935/6000 train_time:920010ms step_avg:994.61ms | |
step:936/6000 train_time:921011ms step_avg:994.61ms | |
step:937/6000 train_time:922010ms step_avg:994.62ms | |
step:938/6000 train_time:923008ms step_avg:994.62ms | |
step:939/6000 train_time:924005ms step_avg:994.62ms | |
step:940/6000 train_time:925001ms step_avg:994.62ms | |
step:941/6000 train_time:925997ms step_avg:994.63ms | |
step:942/6000 train_time:926988ms step_avg:994.62ms | |
step:943/6000 train_time:927992ms step_avg:994.63ms | |
step:944/6000 train_time:928996ms step_avg:994.64ms | |
step:945/6000 train_time:929989ms step_avg:994.64ms | |
step:946/6000 train_time:930986ms step_avg:994.64ms | |
step:947/6000 train_time:931988ms step_avg:994.65ms | |
step:948/6000 train_time:932979ms step_avg:994.65ms | |
step:949/6000 train_time:933973ms step_avg:994.65ms | |
step:950/6000 train_time:934979ms step_avg:994.66ms | |
step:950/6000 val_loss:2.5716 train_time:935020ms step_avg:994.70ms perplexity:13.0870 param_count:85,137,462 | |
step:951/6000 train_time:935963ms step_avg:994.65ms | |
step:952/6000 train_time:936964ms step_avg:994.65ms | |
step:953/6000 train_time:937965ms step_avg:994.66ms | |
step:954/6000 train_time:938963ms step_avg:994.66ms | |
step:955/6000 train_time:939956ms step_avg:994.66ms | |
step:956/6000 train_time:940952ms step_avg:994.66ms | |
step:957/6000 train_time:941960ms step_avg:994.68ms | |
step:958/6000 train_time:942956ms step_avg:994.68ms | |
step:959/6000 train_time:943952ms step_avg:994.68ms | |
step:960/6000 train_time:944946ms step_avg:994.68ms | |
step:961/6000 train_time:945939ms step_avg:994.68ms | |
step:962/6000 train_time:946933ms step_avg:994.68ms | |
step:963/6000 train_time:947951ms step_avg:994.70ms | |
step:964/6000 train_time:948949ms step_avg:994.71ms | |
step:965/6000 train_time:949951ms step_avg:994.71ms | |
step:966/6000 train_time:950946ms step_avg:994.71ms | |
step:967/6000 train_time:951935ms step_avg:994.71ms | |
step:968/6000 train_time:952923ms step_avg:994.70ms | |
step:969/6000 train_time:953916ms step_avg:994.70ms | |
step:970/6000 train_time:954921ms step_avg:994.71ms | |
step:971/6000 train_time:955919ms step_avg:994.71ms | |
step:972/6000 train_time:956923ms step_avg:994.72ms | |
step:973/6000 train_time:957924ms step_avg:994.73ms | |
step:974/6000 train_time:958928ms step_avg:994.74ms | |
step:975/6000 train_time:959920ms step_avg:994.74ms | |
step:975/6000 val_loss:2.5762 train_time:959962ms step_avg:994.78ms perplexity:13.1470 param_count:85,137,462 | |
step:976/6000 train_time:960890ms step_avg:994.71ms | |
step:977/6000 train_time:961886ms step_avg:994.71ms | |
step:978/6000 train_time:962876ms step_avg:994.71ms | |
step:979/6000 train_time:963874ms step_avg:994.71ms | |
step:980/6000 train_time:964868ms step_avg:994.71ms | |
step:981/6000 train_time:965867ms step_avg:994.71ms | |
step:982/6000 train_time:966855ms step_avg:994.71ms | |
step:983/6000 train_time:967844ms step_avg:994.70ms | |
step:984/6000 train_time:968842ms step_avg:994.70ms | |
step:985/6000 train_time:969834ms step_avg:994.70ms | |
step:986/6000 train_time:970823ms step_avg:994.70ms | |
step:987/6000 train_time:971825ms step_avg:994.70ms | |
step:988/6000 train_time:972827ms step_avg:994.71ms | |
step:989/6000 train_time:973835ms step_avg:994.72ms | |
step:990/6000 train_time:974839ms step_avg:994.73ms | |
step:991/6000 train_time:975867ms step_avg:994.77ms | |
step:992/6000 train_time:976881ms step_avg:994.79ms | |
step:993/6000 train_time:977881ms step_avg:994.79ms | |
step:994/6000 train_time:978880ms step_avg:994.80ms | |
step:995/6000 train_time:979871ms step_avg:994.79ms | |
step:996/6000 train_time:980856ms step_avg:994.78ms | |
step:997/6000 train_time:981853ms step_avg:994.79ms | |
step:998/6000 train_time:982847ms step_avg:994.78ms | |
step:999/6000 train_time:983841ms step_avg:994.78ms | |
step:1000/6000 train_time:984826ms step_avg:994.77ms | |
step:1000/6000 val_loss:2.5695 train_time:984867ms step_avg:994.82ms perplexity:13.0597 param_count:85,137,462 | |
step:1001/6000 train_time:985814ms step_avg:994.77ms | |
step:1002/6000 train_time:986842ms step_avg:994.80ms | |
step:1003/6000 train_time:987837ms step_avg:994.80ms | |
step:1004/6000 train_time:988833ms step_avg:994.80ms | |
step:1005/6000 train_time:989820ms step_avg:994.79ms | |
step:1006/6000 train_time:990813ms step_avg:994.79ms | |
step:1007/6000 train_time:991808ms step_avg:994.79ms | |
step:1008/6000 train_time:992797ms step_avg:994.79ms | |
step:1009/6000 train_time:993801ms step_avg:994.80ms | |
step:1010/6000 train_time:994803ms step_avg:994.80ms | |
step:1011/6000 train_time:995809ms step_avg:994.81ms | |
step:1012/6000 train_time:996809ms step_avg:994.82ms | |
step:1013/6000 train_time:997814ms step_avg:994.83ms | |
step:1014/6000 train_time:998818ms step_avg:994.84ms | |
step:1015/6000 train_time:999813ms step_avg:994.84ms | |
step:1016/6000 train_time:1000821ms step_avg:994.85ms | |
step:1017/6000 train_time:1001818ms step_avg:994.85ms | |
step:1018/6000 train_time:1002814ms step_avg:994.86ms | |
step:1019/6000 train_time:1003812ms step_avg:994.86ms | |
step:1020/6000 train_time:1004832ms step_avg:994.88ms | |
step:1021/6000 train_time:1005824ms step_avg:994.88ms | |
step:1022/6000 train_time:1006814ms step_avg:994.88ms | |
step:1023/6000 train_time:1007809ms step_avg:994.88ms | |
step:1024/6000 train_time:1008808ms step_avg:994.88ms | |
step:1025/6000 train_time:1009812ms step_avg:994.89ms | |
step:1025/6000 val_loss:2.5772 train_time:1009853ms step_avg:994.93ms perplexity:13.1604 param_count:85,137,462 | |
step:1026/6000 train_time:1010801ms step_avg:994.88ms | |
step:1027/6000 train_time:1011786ms step_avg:994.87ms | |
step:1028/6000 train_time:1012784ms step_avg:994.88ms | |
step:1029/6000 train_time:1013790ms step_avg:994.89ms | |
step:1030/6000 train_time:1014778ms step_avg:994.88ms | |
step:1031/6000 train_time:1015768ms step_avg:994.88ms | |
step:1032/6000 train_time:1016766ms step_avg:994.88ms | |
step:1033/6000 train_time:1017776ms step_avg:994.89ms | |
step:1034/6000 train_time:1018773ms step_avg:994.90ms | |
step:1035/6000 train_time:1019772ms step_avg:994.90ms | |
step:1036/6000 train_time:1020767ms step_avg:994.90ms | |
step:1037/6000 train_time:1021779ms step_avg:994.92ms | |
step:1038/6000 train_time:1022779ms step_avg:994.92ms | |
step:1039/6000 train_time:1023785ms step_avg:994.93ms | |
step:1040/6000 train_time:1024792ms step_avg:994.94ms | |
step:1041/6000 train_time:1025781ms step_avg:994.94ms | |
step:1042/6000 train_time:1026776ms step_avg:994.94ms | |
step:1043/6000 train_time:1027773ms step_avg:994.94ms | |
step:1044/6000 train_time:1028765ms step_avg:994.94ms | |
step:1045/6000 train_time:1029752ms step_avg:994.93ms | |
step:1046/6000 train_time:1030751ms step_avg:994.93ms | |
step:1047/6000 train_time:1031745ms step_avg:994.93ms | |
step:1048/6000 train_time:1032742ms step_avg:994.93ms | |
step:1049/6000 train_time:1033739ms step_avg:994.94ms | |
step:1050/6000 train_time:1034732ms step_avg:994.93ms | |
step:1050/6000 val_loss:2.5720 train_time:1034772ms step_avg:994.97ms perplexity:13.0915 param_count:85,137,462 | |
step:1051/6000 train_time:1035714ms step_avg:994.92ms | |
step:1052/6000 train_time:1036710ms step_avg:994.92ms | |
step:1053/6000 train_time:1037709ms step_avg:994.93ms | |
step:1054/6000 train_time:1038703ms step_avg:994.93ms | |
step:1055/6000 train_time:1039690ms step_avg:994.92ms | |
step:1056/6000 train_time:1040685ms step_avg:994.92ms | |
step:1057/6000 train_time:1041681ms step_avg:994.92ms | |
step:1058/6000 train_time:1042663ms step_avg:994.91ms | |
step:1059/6000 train_time:1043663ms step_avg:994.91ms | |
step:1060/6000 train_time:1044657ms step_avg:994.91ms | |
step:1061/6000 train_time:1045657ms step_avg:994.92ms | |
step:1062/6000 train_time:1046651ms step_avg:994.92ms | |
step:1063/6000 train_time:1047654ms step_avg:994.92ms | |
step:1064/6000 train_time:1048653ms step_avg:994.93ms | |
step:1065/6000 train_time:1049655ms step_avg:994.93ms | |
step:1066/6000 train_time:1050653ms step_avg:994.94ms | |
step:1067/6000 train_time:1051650ms step_avg:994.94ms | |
step:1068/6000 train_time:1052637ms step_avg:994.93ms | |
step:1069/6000 train_time:1053642ms step_avg:994.94ms | |
step:1070/6000 train_time:1054634ms step_avg:994.94ms | |
step:1071/6000 train_time:1055624ms step_avg:994.93ms | |
step:1072/6000 train_time:1056623ms step_avg:994.94ms | |
step:1073/6000 train_time:1057635ms step_avg:994.95ms | |
step:1074/6000 train_time:1058637ms step_avg:994.96ms | |
step:1075/6000 train_time:1059629ms step_avg:994.96ms | |
step:1075/6000 val_loss:2.5723 train_time:1059671ms step_avg:995.00ms perplexity:13.0965 param_count:85,137,462 | |
step:1076/6000 train_time:1060634ms step_avg:994.97ms | |
step:1077/6000 train_time:1061626ms step_avg:994.96ms | |
step:1078/6000 train_time:1062639ms step_avg:994.98ms | |
step:1079/6000 train_time:1063629ms step_avg:994.98ms | |
step:1080/6000 train_time:1064632ms step_avg:994.98ms | |
step:1081/6000 train_time:1065625ms step_avg:994.98ms | |
step:1082/6000 train_time:1066620ms step_avg:994.98ms | |
step:1083/6000 train_time:1067620ms step_avg:994.99ms | |
step:1084/6000 train_time:1068614ms step_avg:994.98ms | |
step:1085/6000 train_time:1069615ms step_avg:994.99ms | |
step:1086/6000 train_time:1070606ms step_avg:994.99ms | |
step:1087/6000 train_time:1071599ms step_avg:994.99ms | |
step:1088/6000 train_time:1072589ms step_avg:994.98ms | |
step:1089/6000 train_time:1073596ms step_avg:994.99ms | |
step:1090/6000 train_time:1074590ms step_avg:994.99ms | |
step:1091/6000 train_time:1075592ms step_avg:995.00ms | |
step:1092/6000 train_time:1076607ms step_avg:995.02ms | |
step:1093/6000 train_time:1077606ms step_avg:995.02ms | |
step:1094/6000 train_time:1078599ms step_avg:995.02ms | |
step:1095/6000 train_time:1079600ms step_avg:995.02ms | |
step:1096/6000 train_time:1080615ms step_avg:995.04ms | |
step:1097/6000 train_time:1081609ms step_avg:995.04ms | |
step:1098/6000 train_time:1082608ms step_avg:995.04ms | |
step:1099/6000 train_time:1083601ms step_avg:995.04ms | |
step:1100/6000 train_time:1084595ms step_avg:995.04ms | |
step:1100/6000 val_loss:2.5687 train_time:1084636ms step_avg:995.08ms perplexity:13.0494 param_count:85,137,462 | |
step:1101/6000 train_time:1085592ms step_avg:995.04ms | |
step:1102/6000 train_time:1086587ms step_avg:995.04ms | |
step:1103/6000 train_time:1087578ms step_avg:995.04ms | |
step:1104/6000 train_time:1088571ms step_avg:995.04ms | |
step:1105/6000 train_time:1089563ms step_avg:995.03ms | |
step:1106/6000 train_time:1090547ms step_avg:995.02ms | |
step:1107/6000 train_time:1091544ms step_avg:995.03ms | |
step:1108/6000 train_time:1092543ms step_avg:995.03ms | |
step:1109/6000 train_time:1093530ms step_avg:995.02ms | |
step:1110/6000 train_time:1094523ms step_avg:995.02ms | |
step:1111/6000 train_time:1095519ms step_avg:995.02ms | |
step:1112/6000 train_time:1096509ms step_avg:995.02ms | |
step:1113/6000 train_time:1097502ms step_avg:995.02ms | |
step:1114/6000 train_time:1098494ms step_avg:995.01ms | |
step:1115/6000 train_time:1099498ms step_avg:995.02ms | |
step:1116/6000 train_time:1100490ms step_avg:995.02ms | |
step:1117/6000 train_time:1101473ms step_avg:995.01ms | |
step:1118/6000 train_time:1102463ms step_avg:995.00ms | |
step:1119/6000 train_time:1103451ms step_avg:995.00ms | |
step:1120/6000 train_time:1104449ms step_avg:995.00ms | |
step:1121/6000 train_time:1105446ms step_avg:995.00ms | |
step:1122/6000 train_time:1106443ms step_avg:995.00ms | |
step:1123/6000 train_time:1107438ms step_avg:995.00ms | |
step:1124/6000 train_time:1108440ms step_avg:995.01ms | |
step:1125/6000 train_time:1109437ms step_avg:995.01ms | |
step:1125/6000 val_loss:2.5690 train_time:1109476ms step_avg:995.05ms perplexity:13.0521 param_count:85,137,462 | |
step:1126/6000 train_time:1110428ms step_avg:995.01ms | |
step:1127/6000 train_time:1111429ms step_avg:995.01ms | |
step:1128/6000 train_time:1112434ms step_avg:995.02ms | |
step:1129/6000 train_time:1113431ms step_avg:995.02ms | |
step:1130/6000 train_time:1114430ms step_avg:995.03ms | |
step:1131/6000 train_time:1115418ms step_avg:995.02ms | |
step:1132/6000 train_time:1116409ms step_avg:995.02ms | |
step:1133/6000 train_time:1117395ms step_avg:995.01ms | |
step:1134/6000 train_time:1118384ms step_avg:995.00ms | |
step:1135/6000 train_time:1119378ms step_avg:995.00ms | |
step:1136/6000 train_time:1120367ms step_avg:995.00ms | |
step:1137/6000 train_time:1121365ms step_avg:995.00ms | |
step:1138/6000 train_time:1122359ms step_avg:995.00ms | |
step:1139/6000 train_time:1123363ms step_avg:995.01ms | |
step:1140/6000 train_time:1124357ms step_avg:995.01ms | |
step:1141/6000 train_time:1125358ms step_avg:995.01ms | |
step:1142/6000 train_time:1126347ms step_avg:995.01ms | |
step:1143/6000 train_time:1127330ms step_avg:995.00ms | |
step:1144/6000 train_time:1128324ms step_avg:994.99ms | |
step:1145/6000 train_time:1129312ms step_avg:994.99ms | |
step:1146/6000 train_time:1130313ms step_avg:994.99ms | |
step:1147/6000 train_time:1131302ms step_avg:994.99ms | |
step:1148/6000 train_time:1132294ms step_avg:994.99ms | |
step:1149/6000 train_time:1133282ms step_avg:994.98ms | |
step:1150/6000 train_time:1134278ms step_avg:994.98ms | |
step:1150/6000 val_loss:2.5669 train_time:1134319ms step_avg:995.02ms perplexity:13.0251 param_count:85,137,462 | |
step:1151/6000 train_time:1135261ms step_avg:994.97ms | |
step:1152/6000 train_time:1136246ms step_avg:994.96ms | |
step:1153/6000 train_time:1137241ms step_avg:994.96ms | |
step:1154/6000 train_time:1138238ms step_avg:994.96ms | |
step:1155/6000 train_time:1139234ms step_avg:994.96ms | |
step:1156/6000 train_time:1140237ms step_avg:994.97ms | |
step:1157/6000 train_time:1141245ms step_avg:994.98ms | |
step:1158/6000 train_time:1142228ms step_avg:994.97ms | |
step:1159/6000 train_time:1143226ms step_avg:994.97ms | |
step:1160/6000 train_time:1144215ms step_avg:994.97ms | |
step:1161/6000 train_time:1145213ms step_avg:994.97ms | |
step:1162/6000 train_time:1146212ms step_avg:994.98ms | |
step:1163/6000 train_time:1147204ms step_avg:994.97ms | |
step:1164/6000 train_time:1148193ms step_avg:994.97ms | |
step:1165/6000 train_time:1149191ms step_avg:994.97ms | |
step:1166/6000 train_time:1150189ms step_avg:994.97ms | |
step:1167/6000 train_time:1151193ms step_avg:994.98ms | |
step:1168/6000 train_time:1152185ms step_avg:994.98ms | |
step:1169/6000 train_time:1153183ms step_avg:994.98ms | |
step:1170/6000 train_time:1154180ms step_avg:994.98ms | |
step:1171/6000 train_time:1155175ms step_avg:994.98ms | |
step:1172/6000 train_time:1156177ms step_avg:994.99ms | |
step:1173/6000 train_time:1157175ms step_avg:994.99ms | |
step:1174/6000 train_time:1158173ms step_avg:994.99ms | |
step:1175/6000 train_time:1159174ms step_avg:995.00ms | |
step:1175/6000 val_loss:2.5703 train_time:1159215ms step_avg:995.03ms perplexity:13.0701 param_count:85,137,462 | |
step:1176/6000 train_time:1160164ms step_avg:994.99ms | |
step:1177/6000 train_time:1161164ms step_avg:995.00ms | |
step:1178/6000 train_time:1162157ms step_avg:995.00ms | |
step:1179/6000 train_time:1163162ms step_avg:995.01ms | |
step:1180/6000 train_time:1164156ms step_avg:995.01ms | |
step:1181/6000 train_time:1165145ms step_avg:995.00ms | |
step:1182/6000 train_time:1166137ms step_avg:995.00ms | |
step:1183/6000 train_time:1167125ms step_avg:994.99ms | |
step:1184/6000 train_time:1168143ms step_avg:995.01ms | |
step:1185/6000 train_time:1169140ms step_avg:995.01ms | |
step:1186/6000 train_time:1170132ms step_avg:995.01ms | |
step:1187/6000 train_time:1171128ms step_avg:995.01ms | |
step:1188/6000 train_time:1172125ms step_avg:995.01ms | |
step:1189/6000 train_time:1173119ms step_avg:995.01ms | |
step:1190/6000 train_time:1174117ms step_avg:995.01ms | |
step:1191/6000 train_time:1175115ms step_avg:995.02ms | |
step:1192/6000 train_time:1176105ms step_avg:995.01ms | |
step:1193/6000 train_time:1177104ms step_avg:995.02ms | |
step:1194/6000 train_time:1178098ms step_avg:995.02ms | |
step:1195/6000 train_time:1179107ms step_avg:995.03ms | |
step:1196/6000 train_time:1180114ms step_avg:995.04ms | |
step:1197/6000 train_time:1181117ms step_avg:995.04ms | |
step:1198/6000 train_time:1182101ms step_avg:995.03ms | |
step:1199/6000 train_time:1183095ms step_avg:995.03ms | |
step:1200/6000 train_time:1184109ms step_avg:995.05ms | |
step:1200/6000 val_loss:2.5699 train_time:1184150ms step_avg:995.08ms perplexity:13.0644 param_count:85,137,462 | |
step:1201/6000 train_time:1185098ms step_avg:995.04ms | |
step:1202/6000 train_time:1186091ms step_avg:995.04ms | |
step:1203/6000 train_time:1187097ms step_avg:995.05ms | |
step:1204/6000 train_time:1188100ms step_avg:995.06ms | |
step:1205/6000 train_time:1189093ms step_avg:995.06ms | |
step:1206/6000 train_time:1190090ms step_avg:995.06ms | |
step:1207/6000 train_time:1191094ms step_avg:995.07ms | |
step:1208/6000 train_time:1192087ms step_avg:995.06ms | |
step:1209/6000 train_time:1193092ms step_avg:995.07ms | |
step:1210/6000 train_time:1194079ms step_avg:995.07ms | |
step:1211/6000 train_time:1195068ms step_avg:995.06ms | |
step:1212/6000 train_time:1196071ms step_avg:995.07ms | |
step:1213/6000 train_time:1197070ms step_avg:995.07ms | |
step:1214/6000 train_time:1198062ms step_avg:995.07ms | |
step:1215/6000 train_time:1199058ms step_avg:995.07ms | |
step:1216/6000 train_time:1200056ms step_avg:995.07ms | |
step:1217/6000 train_time:1201048ms step_avg:995.07ms | |
step:1218/6000 train_time:1202046ms step_avg:995.07ms | |
step:1219/6000 train_time:1203054ms step_avg:995.08ms | |
step:1220/6000 train_time:1204049ms step_avg:995.08ms | |
step:1221/6000 train_time:1205047ms step_avg:995.08ms | |
step:1222/6000 train_time:1206042ms step_avg:995.08ms | |
step:1223/6000 train_time:1207032ms step_avg:995.08ms | |
step:1224/6000 train_time:1208034ms step_avg:995.09ms | |
step:1225/6000 train_time:1209033ms step_avg:995.09ms | |
step:1225/6000 val_loss:2.5575 train_time:1209073ms step_avg:995.12ms perplexity:12.9031 param_count:85,137,462 | |
step:1226/6000 train_time:1210023ms step_avg:995.09ms | |
step:1227/6000 train_time:1211024ms step_avg:995.09ms | |
step:1228/6000 train_time:1212013ms step_avg:995.08ms | |
step:1229/6000 train_time:1213006ms step_avg:995.08ms | |
step:1230/6000 train_time:1214008ms step_avg:995.09ms | |
step:1231/6000 train_time:1215005ms step_avg:995.09ms | |
step:1232/6000 train_time:1215996ms step_avg:995.09ms | |
step:1233/6000 train_time:1216986ms step_avg:995.08ms | |
step:1234/6000 train_time:1217985ms step_avg:995.09ms | |
step:1235/6000 train_time:1218985ms step_avg:995.09ms | |
step:1236/6000 train_time:1219982ms step_avg:995.09ms | |
step:1237/6000 train_time:1220977ms step_avg:995.09ms | |
step:1238/6000 train_time:1221978ms step_avg:995.10ms | |
step:1239/6000 train_time:1222974ms step_avg:995.10ms | |
step:1240/6000 train_time:1223973ms step_avg:995.10ms | |
step:1241/6000 train_time:1224978ms step_avg:995.11ms | |
step:1242/6000 train_time:1225963ms step_avg:995.10ms | |
step:1243/6000 train_time:1226952ms step_avg:995.09ms | |
step:1244/6000 train_time:1227944ms step_avg:995.09ms | |
step:1245/6000 train_time:1228930ms step_avg:995.08ms | |
step:1246/6000 train_time:1229922ms step_avg:995.08ms | |
step:1247/6000 train_time:1230922ms step_avg:995.09ms | |
step:1248/6000 train_time:1231909ms step_avg:995.08ms | |
step:1249/6000 train_time:1232900ms step_avg:995.08ms | |
step:1250/6000 train_time:1233900ms step_avg:995.08ms | |
step:1250/6000 val_loss:2.5791 train_time:1233942ms step_avg:995.11ms perplexity:13.1859 param_count:85,137,462 | |
step:1251/6000 train_time:1234891ms step_avg:995.08ms | |
step:1252/6000 train_time:1235888ms step_avg:995.08ms | |
step:1253/6000 train_time:1236882ms step_avg:995.08ms | |
step:1254/6000 train_time:1237873ms step_avg:995.08ms | |
step:1255/6000 train_time:1238859ms step_avg:995.07ms | |
step:1256/6000 train_time:1239850ms step_avg:995.06ms | |
step:1257/6000 train_time:1240834ms step_avg:995.06ms | |
step:1258/6000 train_time:1241826ms step_avg:995.05ms | |
step:1259/6000 train_time:1242820ms step_avg:995.05ms | |
step:1260/6000 train_time:1243829ms step_avg:995.06ms | |
step:1261/6000 train_time:1244839ms step_avg:995.07ms | |
step:1262/6000 train_time:1245835ms step_avg:995.08ms | |
step:1263/6000 train_time:1246832ms step_avg:995.08ms | |
step:1264/6000 train_time:1247826ms step_avg:995.08ms | |
step:1265/6000 train_time:1248825ms step_avg:995.08ms | |
step:1266/6000 train_time:1249831ms step_avg:995.09ms | |
step:1267/6000 train_time:1250827ms step_avg:995.09ms | |
step:1268/6000 train_time:1251826ms step_avg:995.09ms | |
step:1269/6000 train_time:1252813ms step_avg:995.09ms | |
step:1270/6000 train_time:1253801ms step_avg:995.08ms | |
step:1271/6000 train_time:1254803ms step_avg:995.09ms | |
step:1272/6000 train_time:1255799ms step_avg:995.09ms | |
step:1273/6000 train_time:1256790ms step_avg:995.08ms | |
step:1274/6000 train_time:1257792ms step_avg:995.09ms | |
step:1275/6000 train_time:1258793ms step_avg:995.09ms | |
step:1275/6000 val_loss:2.5503 train_time:1258833ms step_avg:995.13ms perplexity:12.8113 param_count:85,137,462 | |
step:1276/6000 train_time:1259779ms step_avg:995.09ms | |
step:1277/6000 train_time:1260773ms step_avg:995.08ms | |
step:1278/6000 train_time:1261774ms step_avg:995.09ms | |
step:1279/6000 train_time:1262764ms step_avg:995.09ms | |
step:1280/6000 train_time:1263758ms step_avg:995.09ms | |
step:1281/6000 train_time:1264758ms step_avg:995.09ms | |
step:1282/6000 train_time:1265750ms step_avg:995.09ms | |
step:1283/6000 train_time:1266748ms step_avg:995.09ms | |
step:1284/6000 train_time:1267749ms step_avg:995.09ms | |
step:1285/6000 train_time:1268750ms step_avg:995.10ms | |
step:1286/6000 train_time:1269758ms step_avg:995.11ms | |
step:1287/6000 train_time:1270758ms step_avg:995.11ms | |
step:1288/6000 train_time:1271765ms step_avg:995.12ms | |
step:1289/6000 train_time:1272764ms step_avg:995.12ms | |
step:1290/6000 train_time:1273770ms step_avg:995.13ms | |
step:1291/6000 train_time:1274768ms step_avg:995.14ms | |
step:1292/6000 train_time:1275774ms step_avg:995.14ms | |
step:1293/6000 train_time:1276771ms step_avg:995.14ms | |
step:1294/6000 train_time:1277769ms step_avg:995.15ms | |
step:1295/6000 train_time:1278769ms step_avg:995.15ms | |
step:1296/6000 train_time:1279753ms step_avg:995.14ms | |
step:1297/6000 train_time:1280758ms step_avg:995.15ms | |
step:1298/6000 train_time:1281752ms step_avg:995.15ms | |
step:1299/6000 train_time:1282750ms step_avg:995.15ms | |
step:1300/6000 train_time:1283748ms step_avg:995.15ms | |
step:1300/6000 val_loss:2.5572 train_time:1283786ms step_avg:995.18ms perplexity:12.8991 param_count:85,137,462 | |
step:1301/6000 train_time:1284728ms step_avg:995.14ms | |
step:1302/6000 train_time:1285728ms step_avg:995.15ms | |
step:1303/6000 train_time:1286725ms step_avg:995.15ms | |
step:1304/6000 train_time:1287709ms step_avg:995.14ms | |
step:1305/6000 train_time:1288705ms step_avg:995.14ms | |
step:1306/6000 train_time:1289710ms step_avg:995.15ms | |
step:1307/6000 train_time:1290710ms step_avg:995.15ms | |
step:1308/6000 train_time:1291706ms step_avg:995.15ms | |
step:1309/6000 train_time:1292705ms step_avg:995.15ms | |
step:1310/6000 train_time:1293707ms step_avg:995.16ms | |
step:1311/6000 train_time:1294707ms step_avg:995.16ms | |
step:1312/6000 train_time:1295706ms step_avg:995.17ms | |
step:1313/6000 train_time:1296707ms step_avg:995.17ms | |
step:1314/6000 train_time:1297715ms step_avg:995.18ms | |
step:1315/6000 train_time:1298709ms step_avg:995.18ms | |
step:1316/6000 train_time:1299710ms step_avg:995.18ms | |
step:1317/6000 train_time:1300704ms step_avg:995.18ms | |
step:1318/6000 train_time:1301702ms step_avg:995.18ms | |
step:1319/6000 train_time:1302708ms step_avg:995.19ms | |
step:1320/6000 train_time:1303706ms step_avg:995.20ms | |
step:1321/6000 train_time:1304707ms step_avg:995.20ms | |
step:1322/6000 train_time:1305705ms step_avg:995.20ms | |
step:1323/6000 train_time:1306712ms step_avg:995.21ms | |
step:1324/6000 train_time:1307705ms step_avg:995.21ms | |
step:1325/6000 train_time:1308701ms step_avg:995.21ms | |
step:1325/6000 val_loss:2.5647 train_time:1308742ms step_avg:995.24ms perplexity:12.9967 param_count:85,137,462 | |
step:1326/6000 train_time:1309681ms step_avg:995.20ms | |
step:1327/6000 train_time:1310673ms step_avg:995.20ms | |
step:1328/6000 train_time:1311662ms step_avg:995.19ms | |
step:1329/6000 train_time:1312678ms step_avg:995.21ms | |
step:1330/6000 train_time:1313669ms step_avg:995.20ms | |
step:1331/6000 train_time:1314672ms step_avg:995.21ms | |
step:1332/6000 train_time:1315670ms step_avg:995.21ms | |
step:1333/6000 train_time:1316663ms step_avg:995.21ms | |
step:1334/6000 train_time:1317657ms step_avg:995.21ms | |
step:1335/6000 train_time:1318657ms step_avg:995.21ms | |
step:1336/6000 train_time:1319652ms step_avg:995.21ms | |
step:1337/6000 train_time:1320645ms step_avg:995.21ms | |
step:1338/6000 train_time:1321645ms step_avg:995.21ms | |
step:1339/6000 train_time:1322637ms step_avg:995.21ms | |
step:1340/6000 train_time:1323632ms step_avg:995.21ms | |
step:1341/6000 train_time:1324634ms step_avg:995.22ms | |
step:1342/6000 train_time:1325639ms step_avg:995.22ms | |
step:1343/6000 train_time:1326632ms step_avg:995.22ms | |
step:1344/6000 train_time:1327637ms step_avg:995.23ms | |
step:1345/6000 train_time:1328641ms step_avg:995.24ms | |
step:1346/6000 train_time:1329635ms step_avg:995.24ms | |
step:1347/6000 train_time:1330639ms step_avg:995.24ms | |
step:1348/6000 train_time:1331636ms step_avg:995.24ms | |
step:1349/6000 train_time:1332623ms step_avg:995.24ms | |
step:1350/6000 train_time:1333614ms step_avg:995.23ms | |
step:1350/6000 val_loss:2.5725 train_time:1333654ms step_avg:995.26ms perplexity:13.0980 param_count:85,137,462 | |
step:1351/6000 train_time:1334623ms step_avg:995.24ms | |
step:1352/6000 train_time:1335620ms step_avg:995.25ms | |
step:1353/6000 train_time:1336623ms step_avg:995.25ms | |
step:1354/6000 train_time:1337616ms step_avg:995.25ms | |
step:1355/6000 train_time:1338606ms step_avg:995.25ms | |
step:1356/6000 train_time:1339598ms step_avg:995.24ms | |
step:1357/6000 train_time:1340596ms step_avg:995.25ms | |
step:1358/6000 train_time:1341590ms step_avg:995.24ms | |
step:1359/6000 train_time:1342592ms step_avg:995.25ms | |
step:1360/6000 train_time:1343580ms step_avg:995.24ms | |
step:1361/6000 train_time:1344578ms step_avg:995.25ms | |
step:1362/6000 train_time:1345570ms step_avg:995.24ms | |
step:1363/6000 train_time:1346569ms step_avg:995.25ms | |
step:1364/6000 train_time:1347575ms step_avg:995.25ms | |
step:1365/6000 train_time:1348572ms step_avg:995.26ms | |
step:1366/6000 train_time:1349571ms step_avg:995.26ms | |
step:1367/6000 train_time:1350566ms step_avg:995.26ms | |
step:1368/6000 train_time:1351560ms step_avg:995.26ms | |
step:1369/6000 train_time:1352554ms step_avg:995.26ms | |
step:1370/6000 train_time:1353551ms step_avg:995.26ms | |
step:1371/6000 train_time:1354547ms step_avg:995.26ms | |
step:1372/6000 train_time:1355561ms step_avg:995.27ms | |
step:1373/6000 train_time:1356537ms step_avg:995.26ms | |
step:1374/6000 train_time:1357533ms step_avg:995.26ms | |
step:1375/6000 train_time:1358532ms step_avg:995.26ms | |
step:1375/6000 val_loss:2.5504 train_time:1358573ms step_avg:995.29ms perplexity:12.8125 param_count:85,137,462 | |
step:1376/6000 train_time:1359519ms step_avg:995.26ms | |
step:1377/6000 train_time:1360524ms step_avg:995.26ms | |
step:1378/6000 train_time:1361515ms step_avg:995.26ms | |
step:1379/6000 train_time:1362515ms step_avg:995.26ms | |
step:1380/6000 train_time:1363507ms step_avg:995.26ms | |
step:1381/6000 train_time:1364501ms step_avg:995.26ms | |
step:1382/6000 train_time:1365499ms step_avg:995.26ms | |
step:1383/6000 train_time:1366484ms step_avg:995.25ms | |
step:1384/6000 train_time:1367472ms step_avg:995.25ms | |
step:1385/6000 train_time:1368460ms step_avg:995.24ms | |
step:1386/6000 train_time:1369449ms step_avg:995.24ms | |
step:1387/6000 train_time:1370443ms step_avg:995.24ms | |
step:1388/6000 train_time:1371427ms step_avg:995.23ms | |
step:1389/6000 train_time:1372422ms step_avg:995.23ms | |
step:1390/6000 train_time:1373426ms step_avg:995.24ms | |
step:1391/6000 train_time:1374426ms step_avg:995.24ms | |
step:1392/6000 train_time:1375408ms step_avg:995.23ms | |
step:1393/6000 train_time:1376401ms step_avg:995.23ms | |
step:1394/6000 train_time:1377378ms step_avg:995.22ms | |
step:1395/6000 train_time:1378373ms step_avg:995.21ms | |
step:1396/6000 train_time:1379366ms step_avg:995.21ms | |
step:1397/6000 train_time:1380361ms step_avg:995.21ms | |
step:1398/6000 train_time:1381357ms step_avg:995.21ms | |
step:1399/6000 train_time:1382352ms step_avg:995.21ms | |
step:1400/6000 train_time:1383354ms step_avg:995.22ms | |
step:1400/6000 val_loss:2.5605 train_time:1383394ms step_avg:995.25ms perplexity:12.9425 param_count:85,137,462 | |
step:1401/6000 train_time:1384341ms step_avg:995.21ms | |
step:1402/6000 train_time:1385330ms step_avg:995.21ms | |
step:1403/6000 train_time:1386319ms step_avg:995.20ms | |
step:1404/6000 train_time:1387309ms step_avg:995.20ms | |
step:1405/6000 train_time:1388301ms step_avg:995.20ms | |
step:1406/6000 train_time:1389302ms step_avg:995.20ms | |
step:1407/6000 train_time:1390293ms step_avg:995.20ms | |
step:1408/6000 train_time:1391293ms step_avg:995.20ms | |
step:1409/6000 train_time:1392280ms step_avg:995.20ms | |
step:1410/6000 train_time:1393272ms step_avg:995.19ms | |
step:1411/6000 train_time:1394255ms step_avg:995.19ms | |
step:1412/6000 train_time:1395247ms step_avg:995.18ms | |
step:1413/6000 train_time:1396240ms step_avg:995.18ms | |
step:1414/6000 train_time:1397225ms step_avg:995.17ms | |
step:1415/6000 train_time:1398218ms step_avg:995.17ms | |
step:1416/6000 train_time:1399214ms step_avg:995.17ms | |
step:1417/6000 train_time:1400198ms step_avg:995.17ms | |
step:1418/6000 train_time:1401185ms step_avg:995.16ms | |
step:1419/6000 train_time:1402178ms step_avg:995.16ms | |
step:1420/6000 train_time:1403178ms step_avg:995.16ms | |
step:1421/6000 train_time:1404174ms step_avg:995.16ms | |
step:1422/6000 train_time:1405166ms step_avg:995.16ms | |
step:1423/6000 train_time:1406158ms step_avg:995.16ms | |
step:1424/6000 train_time:1407142ms step_avg:995.15ms | |
step:1425/6000 train_time:1408129ms step_avg:995.14ms | |
step:1425/6000 val_loss:2.5683 train_time:1408171ms step_avg:995.17ms perplexity:13.0438 param_count:85,137,462 | |
step:1426/6000 train_time:1409120ms step_avg:995.14ms | |
step:1427/6000 train_time:1410111ms step_avg:995.14ms | |
step:1428/6000 train_time:1411108ms step_avg:995.14ms | |
step:1429/6000 train_time:1412095ms step_avg:995.13ms | |
step:1430/6000 train_time:1413094ms step_avg:995.14ms | |
step:1431/6000 train_time:1414087ms step_avg:995.14ms | |
step:1432/6000 train_time:1415075ms step_avg:995.13ms | |
step:1433/6000 train_time:1416061ms step_avg:995.12ms | |
step:1434/6000 train_time:1417056ms step_avg:995.12ms | |
step:1435/6000 train_time:1418051ms step_avg:995.12ms | |
step:1436/6000 train_time:1419044ms step_avg:995.12ms | |
step:1437/6000 train_time:1420045ms step_avg:995.13ms | |
step:1438/6000 train_time:1421047ms step_avg:995.13ms | |
step:1439/6000 train_time:1422034ms step_avg:995.12ms | |
step:1440/6000 train_time:1423023ms step_avg:995.12ms | |
step:1441/6000 train_time:1424019ms step_avg:995.12ms | |
step:1442/6000 train_time:1425013ms step_avg:995.12ms | |
step:1443/6000 train_time:1426014ms step_avg:995.13ms | |
step:1444/6000 train_time:1427033ms step_avg:995.14ms | |
step:1445/6000 train_time:1428020ms step_avg:995.14ms | |
step:1446/6000 train_time:1429016ms step_avg:995.14ms | |
step:1447/6000 train_time:1429999ms step_avg:995.13ms | |
step:1448/6000 train_time:1430994ms step_avg:995.13ms | |
step:1449/6000 train_time:1431986ms step_avg:995.13ms | |
step:1450/6000 train_time:1432983ms step_avg:995.13ms | |
step:1450/6000 val_loss:2.5643 train_time:1433022ms step_avg:995.15ms perplexity:12.9920 param_count:85,137,462 | |
step:1451/6000 train_time:1433965ms step_avg:995.12ms | |
step:1452/6000 train_time:1434953ms step_avg:995.11ms | |
step:1453/6000 train_time:1435943ms step_avg:995.11ms | |
step:1454/6000 train_time:1436929ms step_avg:995.10ms | |
step:1455/6000 train_time:1437924ms step_avg:995.10ms | |
step:1456/6000 train_time:1438928ms step_avg:995.11ms | |
step:1457/6000 train_time:1439929ms step_avg:995.11ms | |
step:1458/6000 train_time:1440918ms step_avg:995.11ms | |
step:1459/6000 train_time:1441932ms step_avg:995.12ms | |
step:1460/6000 train_time:1442933ms step_avg:995.13ms | |
step:1461/6000 train_time:1443924ms step_avg:995.12ms | |
step:1462/6000 train_time:1444927ms step_avg:995.13ms | |
step:1463/6000 train_time:1445924ms step_avg:995.13ms | |
step:1464/6000 train_time:1446917ms step_avg:995.13ms | |
step:1465/6000 train_time:1447906ms step_avg:995.12ms | |
step:1466/6000 train_time:1448901ms step_avg:995.12ms | |
step:1467/6000 train_time:1449899ms step_avg:995.13ms | |
step:1468/6000 train_time:1450893ms step_avg:995.13ms | |
step:1469/6000 train_time:1451889ms step_avg:995.13ms | |
step:1470/6000 train_time:1452893ms step_avg:995.13ms | |
step:1471/6000 train_time:1453881ms step_avg:995.13ms | |
step:1472/6000 train_time:1454878ms step_avg:995.13ms | |
step:1473/6000 train_time:1455873ms step_avg:995.13ms | |
step:1474/6000 train_time:1456868ms step_avg:995.13ms | |
step:1475/6000 train_time:1457854ms step_avg:995.12ms | |
step:1475/6000 val_loss:2.5574 train_time:1457897ms step_avg:995.15ms perplexity:12.9026 param_count:85,137,462 | |
step:1476/6000 train_time:1458848ms step_avg:995.12ms | |
step:1477/6000 train_time:1459843ms step_avg:995.12ms | |
step:1478/6000 train_time:1460844ms step_avg:995.13ms | |
step:1479/6000 train_time:1461838ms step_avg:995.12ms | |
step:1480/6000 train_time:1462838ms step_avg:995.13ms | |
step:1481/6000 train_time:1463828ms step_avg:995.12ms | |
step:1482/6000 train_time:1464820ms step_avg:995.12ms | |
step:1483/6000 train_time:1465822ms step_avg:995.13ms | |
step:1484/6000 train_time:1466823ms step_avg:995.13ms | |
step:1485/6000 train_time:1467819ms step_avg:995.13ms | |
step:1486/6000 train_time:1468814ms step_avg:995.13ms | |
step:1487/6000 train_time:1469819ms step_avg:995.14ms | |
step:1488/6000 train_time:1470814ms step_avg:995.14ms | |
step:1489/6000 train_time:1471813ms step_avg:995.14ms | |
step:1490/6000 train_time:1472808ms step_avg:995.14ms | |
step:1491/6000 train_time:1473806ms step_avg:995.14ms | |
step:1492/6000 train_time:1474802ms step_avg:995.14ms | |
step:1493/6000 train_time:1475799ms step_avg:995.14ms | |
step:1494/6000 train_time:1476788ms step_avg:995.14ms | |
step:1495/6000 train_time:1477776ms step_avg:995.14ms | |
step:1496/6000 train_time:1478765ms step_avg:995.13ms | |
step:1497/6000 train_time:1479773ms step_avg:995.14ms | |
step:1498/6000 train_time:1480758ms step_avg:995.13ms | |
step:1499/6000 train_time:1481751ms step_avg:995.13ms | |
step:1500/6000 train_time:1482738ms step_avg:995.13ms | |
step:1500/6000 val_loss:2.5612 train_time:1482779ms step_avg:995.15ms perplexity:12.9513 param_count:85,137,462 | |
step:1501/6000 train_time:1483747ms step_avg:995.14ms | |
step:1502/6000 train_time:1484740ms step_avg:995.13ms | |
step:1503/6000 train_time:1485738ms step_avg:995.14ms | |
step:1504/6000 train_time:1486736ms step_avg:995.14ms | |
step:1505/6000 train_time:1487738ms step_avg:995.14ms | |
step:1506/6000 train_time:1488740ms step_avg:995.15ms | |
step:1507/6000 train_time:1489736ms step_avg:995.15ms | |
step:1508/6000 train_time:1490727ms step_avg:995.15ms | |
step:1509/6000 train_time:1491707ms step_avg:995.13ms | |
step:1510/6000 train_time:1492702ms step_avg:995.13ms | |
step:1511/6000 train_time:1493696ms step_avg:995.13ms | |
step:1512/6000 train_time:1494693ms step_avg:995.14ms | |
step:1513/6000 train_time:1495689ms step_avg:995.14ms | |
step:1514/6000 train_time:1496677ms step_avg:995.13ms | |
step:1515/6000 train_time:1497668ms step_avg:995.13ms | |
step:1516/6000 train_time:1498659ms step_avg:995.13ms | |
step:1517/6000 train_time:1499654ms step_avg:995.13ms | |
step:1518/6000 train_time:1500660ms step_avg:995.13ms | |
step:1519/6000 train_time:1501651ms step_avg:995.13ms | |
step:1520/6000 train_time:1502652ms step_avg:995.13ms | |
step:1521/6000 train_time:1503645ms step_avg:995.13ms | |
step:1522/6000 train_time:1504637ms step_avg:995.13ms | |
step:1523/6000 train_time:1505629ms step_avg:995.13ms | |
step:1524/6000 train_time:1506623ms step_avg:995.13ms | |
step:1525/6000 train_time:1507626ms step_avg:995.13ms | |
step:1525/6000 val_loss:2.5612 train_time:1507666ms step_avg:995.16ms perplexity:12.9513 param_count:85,137,462 | |
step:1526/6000 train_time:1508616ms step_avg:995.13ms | |
step:1527/6000 train_time:1509611ms step_avg:995.13ms | |
step:1528/6000 train_time:1510592ms step_avg:995.12ms | |
step:1529/6000 train_time:1511589ms step_avg:995.12ms | |
step:1530/6000 train_time:1512591ms step_avg:995.13ms | |
step:1531/6000 train_time:1513578ms step_avg:995.12ms | |
step:1532/6000 train_time:1514584ms step_avg:995.13ms | |
step:1533/6000 train_time:1515582ms step_avg:995.13ms | |
step:1534/6000 train_time:1516581ms step_avg:995.13ms | |
step:1535/6000 train_time:1517580ms step_avg:995.13ms | |
step:1536/6000 train_time:1518568ms step_avg:995.13ms | |
step:1537/6000 train_time:1519567ms step_avg:995.13ms | |
step:1538/6000 train_time:1520563ms step_avg:995.13ms | |
step:1539/6000 train_time:1521555ms step_avg:995.13ms | |
step:1540/6000 train_time:1522543ms step_avg:995.13ms | |
step:1541/6000 train_time:1523542ms step_avg:995.13ms | |
step:1542/6000 train_time:1524535ms step_avg:995.13ms | |
step:1543/6000 train_time:1525530ms step_avg:995.13ms | |
step:1544/6000 train_time:1526530ms step_avg:995.13ms | |
step:1545/6000 train_time:1527523ms step_avg:995.13ms | |
step:1546/6000 train_time:1528528ms step_avg:995.14ms | |
step:1547/6000 train_time:1529526ms step_avg:995.14ms | |
step:1548/6000 train_time:1530526ms step_avg:995.14ms | |
step:1549/6000 train_time:1531517ms step_avg:995.14ms | |
step:1550/6000 train_time:1532518ms step_avg:995.14ms | |
step:1550/6000 val_loss:2.5621 train_time:1532559ms step_avg:995.17ms perplexity:12.9624 param_count:85,137,462 | |
step:1551/6000 train_time:1533511ms step_avg:995.14ms | |
step:1552/6000 train_time:1534511ms step_avg:995.14ms | |
step:1553/6000 train_time:1535504ms step_avg:995.14ms | |
step:1554/6000 train_time:1536492ms step_avg:995.14ms | |
step:1555/6000 train_time:1537485ms step_avg:995.14ms | |
step:1556/6000 train_time:1538481ms step_avg:995.14ms | |
step:1557/6000 train_time:1539494ms step_avg:995.15ms | |
step:1558/6000 train_time:1540501ms step_avg:995.16ms | |
step:1559/6000 train_time:1541495ms step_avg:995.15ms | |
step:1560/6000 train_time:1542483ms step_avg:995.15ms | |
step:1561/6000 train_time:1543482ms step_avg:995.15ms | |
step:1562/6000 train_time:1544480ms step_avg:995.15ms | |
step:1563/6000 train_time:1545482ms step_avg:995.16ms | |
step:1564/6000 train_time:1546481ms step_avg:995.16ms | |
step:1565/6000 train_time:1547475ms step_avg:995.16ms | |
step:1566/6000 train_time:1548467ms step_avg:995.16ms | |
step:1567/6000 train_time:1549465ms step_avg:995.16ms | |
step:1568/6000 train_time:1550464ms step_avg:995.16ms | |
step:1569/6000 train_time:1551463ms step_avg:995.17ms | |
step:1570/6000 train_time:1552456ms step_avg:995.16ms | |
step:1571/6000 train_time:1553451ms step_avg:995.16ms | |
step:1572/6000 train_time:1554445ms step_avg:995.16ms | |
step:1573/6000 train_time:1555442ms step_avg:995.16ms | |
step:1574/6000 train_time:1556447ms step_avg:995.17ms | |
step:1575/6000 train_time:1557436ms step_avg:995.17ms | |
step:1575/6000 val_loss:2.5495 train_time:1557477ms step_avg:995.19ms perplexity:12.8010 param_count:85,137,462 | |
step:1576/6000 train_time:1558433ms step_avg:995.17ms | |
step:1577/6000 train_time:1559432ms step_avg:995.17ms | |
step:1578/6000 train_time:1560427ms step_avg:995.17ms | |
step:1579/6000 train_time:1561431ms step_avg:995.18ms | |
step:1580/6000 train_time:1562429ms step_avg:995.18ms | |
step:1581/6000 train_time:1563447ms step_avg:995.19ms | |
step:1582/6000 train_time:1564449ms step_avg:995.20ms | |
step:1583/6000 train_time:1565439ms step_avg:995.19ms | |
step:1584/6000 train_time:1566438ms step_avg:995.20ms | |
step:1585/6000 train_time:1567425ms step_avg:995.19ms | |
step:1586/6000 train_time:1568413ms step_avg:995.19ms | |
step:1587/6000 train_time:1569404ms step_avg:995.18ms | |
step:1588/6000 train_time:1570398ms step_avg:995.18ms | |
step:1589/6000 train_time:1571393ms step_avg:995.18ms | |
step:1590/6000 train_time:1572384ms step_avg:995.18ms | |
step:1591/6000 train_time:1573384ms step_avg:995.18ms | |
step:1592/6000 train_time:1574379ms step_avg:995.18ms | |
step:1593/6000 train_time:1575375ms step_avg:995.18ms | |
step:1594/6000 train_time:1576370ms step_avg:995.18ms | |
step:1595/6000 train_time:1577355ms step_avg:995.18ms | |
step:1596/6000 train_time:1578364ms step_avg:995.19ms | |
step:1597/6000 train_time:1579366ms step_avg:995.19ms | |
step:1598/6000 train_time:1580373ms step_avg:995.20ms | |
step:1599/6000 train_time:1581376ms step_avg:995.20ms | |
step:1600/6000 train_time:1582366ms step_avg:995.20ms | |
step:1600/6000 val_loss:2.5533 train_time:1582409ms step_avg:995.23ms perplexity:12.8489 param_count:85,137,462 | |
step:1601/6000 train_time:1583361ms step_avg:995.20ms | |
step:1602/6000 train_time:1584424ms step_avg:995.24ms | |
step:1603/6000 train_time:1585418ms step_avg:995.24ms | |
step:1604/6000 train_time:1586427ms step_avg:995.25ms | |
step:1605/6000 train_time:1587423ms step_avg:995.25ms | |
step:1606/6000 train_time:1588416ms step_avg:995.25ms | |
step:1607/6000 train_time:1589423ms step_avg:995.26ms | |
step:1608/6000 train_time:1590424ms step_avg:995.26ms | |
step:1609/6000 train_time:1591414ms step_avg:995.26ms | |
step:1610/6000 train_time:1592422ms step_avg:995.26ms | |
step:1611/6000 train_time:1593405ms step_avg:995.26ms | |
step:1612/6000 train_time:1594402ms step_avg:995.26ms | |
step:1613/6000 train_time:1595394ms step_avg:995.26ms | |
step:1614/6000 train_time:1596384ms step_avg:995.25ms | |
step:1615/6000 train_time:1597383ms step_avg:995.25ms | |
step:1616/6000 train_time:1598380ms step_avg:995.26ms | |
step:1617/6000 train_time:1599376ms step_avg:995.26ms | |
step:1618/6000 train_time:1600374ms step_avg:995.26ms | |
step:1619/6000 train_time:1601379ms step_avg:995.26ms | |
step:1620/6000 train_time:1602365ms step_avg:995.26ms | |
step:1621/6000 train_time:1603365ms step_avg:995.26ms | |
step:1622/6000 train_time:1604376ms step_avg:995.27ms | |
step:1623/6000 train_time:1605372ms step_avg:995.27ms | |
step:1624/6000 train_time:1606361ms step_avg:995.27ms | |
step:1625/6000 train_time:1607364ms step_avg:995.27ms | |
step:1625/6000 val_loss:2.5555 train_time:1607405ms step_avg:995.30ms perplexity:12.8779 param_count:85,137,462 | |
step:1626/6000 train_time:1608353ms step_avg:995.27ms | |
step:1627/6000 train_time:1609368ms step_avg:995.28ms | |
step:1628/6000 train_time:1610358ms step_avg:995.28ms | |
step:1629/6000 train_time:1611353ms step_avg:995.28ms | |
step:1630/6000 train_time:1612349ms step_avg:995.28ms | |
step:1631/6000 train_time:1613340ms step_avg:995.27ms | |
step:1632/6000 train_time:1614338ms step_avg:995.28ms | |
step:1633/6000 train_time:1615337ms step_avg:995.28ms | |
step:1634/6000 train_time:1616336ms step_avg:995.28ms | |
step:1635/6000 train_time:1617329ms step_avg:995.28ms | |
step:1636/6000 train_time:1618326ms step_avg:995.28ms | |
step:1637/6000 train_time:1619329ms step_avg:995.28ms | |
step:1638/6000 train_time:1620320ms step_avg:995.28ms | |
step:1639/6000 train_time:1621323ms step_avg:995.29ms | |
step:1640/6000 train_time:1622316ms step_avg:995.29ms | |
step:1641/6000 train_time:1623323ms step_avg:995.29ms | |
step:1642/6000 train_time:1624321ms step_avg:995.29ms | |
step:1643/6000 train_time:1625321ms step_avg:995.30ms | |
step:1644/6000 train_time:1626321ms step_avg:995.30ms | |
step:1645/6000 train_time:1627319ms step_avg:995.30ms | |
step:1646/6000 train_time:1628302ms step_avg:995.29ms | |
step:1647/6000 train_time:1629291ms step_avg:995.29ms | |
step:1648/6000 train_time:1630287ms step_avg:995.29ms | |
step:1649/6000 train_time:1631293ms step_avg:995.30ms | |
step:1650/6000 train_time:1632291ms step_avg:995.30ms | |
step:1650/6000 val_loss:2.5610 train_time:1632331ms step_avg:995.32ms perplexity:12.9488 param_count:85,137,462 | |
step:1651/6000 train_time:1633279ms step_avg:995.30ms | |
step:1652/6000 train_time:1634277ms step_avg:995.30ms | |
step:1653/6000 train_time:1635263ms step_avg:995.29ms | |
step:1654/6000 train_time:1636251ms step_avg:995.29ms | |
step:1655/6000 train_time:1637251ms step_avg:995.29ms | |
step:1656/6000 train_time:1638245ms step_avg:995.29ms | |
step:1657/6000 train_time:1639244ms step_avg:995.29ms | |
step:1658/6000 train_time:1640253ms step_avg:995.30ms | |
step:1659/6000 train_time:1641237ms step_avg:995.29ms | |
step:1660/6000 train_time:1642237ms step_avg:995.29ms | |
step:1661/6000 train_time:1643221ms step_avg:995.29ms | |
step:1662/6000 train_time:1644223ms step_avg:995.29ms | |
step:1663/6000 train_time:1645210ms step_avg:995.29ms | |
step:1664/6000 train_time:1646222ms step_avg:995.30ms | |
step:1665/6000 train_time:1647220ms step_avg:995.30ms | |
step:1666/6000 train_time:1648218ms step_avg:995.30ms | |
step:1667/6000 train_time:1649223ms step_avg:995.31ms | |
step:1668/6000 train_time:1650216ms step_avg:995.30ms | |
step:1669/6000 train_time:1651224ms step_avg:995.31ms | |
step:1670/6000 train_time:1652236ms step_avg:995.32ms | |
step:1671/6000 train_time:1653236ms step_avg:995.33ms | |
step:1672/6000 train_time:1654235ms step_avg:995.33ms | |
step:1673/6000 train_time:1655245ms step_avg:995.34ms | |
step:1674/6000 train_time:1656245ms step_avg:995.34ms | |
step:1675/6000 train_time:1657237ms step_avg:995.34ms | |
step:1675/6000 val_loss:2.5536 train_time:1657275ms step_avg:995.36ms perplexity:12.8538 param_count:85,137,462 | |
step:1676/6000 train_time:1658219ms step_avg:995.33ms | |
step:1677/6000 train_time:1659216ms step_avg:995.33ms | |
step:1678/6000 train_time:1660215ms step_avg:995.33ms | |
step:1679/6000 train_time:1661218ms step_avg:995.34ms | |
step:1680/6000 train_time:1662220ms step_avg:995.34ms | |
step:1681/6000 train_time:1663209ms step_avg:995.34ms | |
step:1682/6000 train_time:1664205ms step_avg:995.34ms | |
step:1683/6000 train_time:1665198ms step_avg:995.34ms | |
step:1684/6000 train_time:1666196ms step_avg:995.34ms | |
step:1685/6000 train_time:1667208ms step_avg:995.35ms | |
step:1686/6000 train_time:1668220ms step_avg:995.36ms | |
step:1687/6000 train_time:1669215ms step_avg:995.36ms | |
step:1688/6000 train_time:1670206ms step_avg:995.36ms | |
step:1689/6000 train_time:1671203ms step_avg:995.36ms | |
step:1690/6000 train_time:1672223ms step_avg:995.37ms | |
step:1691/6000 train_time:1673210ms step_avg:995.37ms | |
step:1692/6000 train_time:1674222ms step_avg:995.38ms | |
step:1693/6000 train_time:1675219ms step_avg:995.38ms | |
step:1694/6000 train_time:1676220ms step_avg:995.38ms | |
step:1695/6000 train_time:1677225ms step_avg:995.39ms | |
step:1696/6000 train_time:1678219ms step_avg:995.39ms | |
step:1697/6000 train_time:1679223ms step_avg:995.39ms | |
step:1698/6000 train_time:1680214ms step_avg:995.39ms | |
step:1699/6000 train_time:1681212ms step_avg:995.39ms | |
step:1700/6000 train_time:1682214ms step_avg:995.39ms | |
step:1700/6000 val_loss:2.5528 train_time:1682254ms step_avg:995.42ms perplexity:12.8424 param_count:85,137,462 | |
step:1701/6000 train_time:1683200ms step_avg:995.39ms | |
step:1702/6000 train_time:1684202ms step_avg:995.39ms | |
step:1703/6000 train_time:1685196ms step_avg:995.39ms | |
step:1704/6000 train_time:1686196ms step_avg:995.39ms | |
step:1705/6000 train_time:1687202ms step_avg:995.40ms | |
step:1706/6000 train_time:1688200ms step_avg:995.40ms | |
step:1707/6000 train_time:1689197ms step_avg:995.40ms | |
step:1708/6000 train_time:1690196ms step_avg:995.40ms | |
step:1709/6000 train_time:1691200ms step_avg:995.41ms | |
step:1710/6000 train_time:1692208ms step_avg:995.42ms | |
step:1711/6000 train_time:1693213ms step_avg:995.42ms | |
step:1712/6000 train_time:1694213ms step_avg:995.42ms | |
step:1713/6000 train_time:1695218ms step_avg:995.43ms | |
step:1714/6000 train_time:1696223ms step_avg:995.44ms | |
step:1715/6000 train_time:1697222ms step_avg:995.44ms | |
step:1716/6000 train_time:1698232ms step_avg:995.45ms | |
step:1717/6000 train_time:1699228ms step_avg:995.45ms | |
step:1718/6000 train_time:1700223ms step_avg:995.45ms | |
step:1719/6000 train_time:1701223ms step_avg:995.45ms | |
step:1720/6000 train_time:1702240ms step_avg:995.46ms | |
step:1721/6000 train_time:1703231ms step_avg:995.46ms | |
step:1722/6000 train_time:1704221ms step_avg:995.46ms | |
step:1723/6000 train_time:1705218ms step_avg:995.46ms | |
step:1724/6000 train_time:1706213ms step_avg:995.46ms | |
step:1725/6000 train_time:1707231ms step_avg:995.47ms | |
step:1725/6000 val_loss:2.5497 train_time:1707267ms step_avg:995.49ms perplexity:12.8036 param_count:85,137,462 | |
step:1726/6000 train_time:1708218ms step_avg:995.46ms | |
step:1727/6000 train_time:1709212ms step_avg:995.46ms | |
step:1728/6000 train_time:1710207ms step_avg:995.46ms | |
step:1729/6000 train_time:1711215ms step_avg:995.47ms | |
step:1730/6000 train_time:1712217ms step_avg:995.48ms | |
step:1731/6000 train_time:1713212ms step_avg:995.47ms | |
step:1732/6000 train_time:1714214ms step_avg:995.48ms | |
step:1733/6000 train_time:1715210ms step_avg:995.48ms | |
step:1734/6000 train_time:1716206ms step_avg:995.48ms | |
step:1735/6000 train_time:1717200ms step_avg:995.48ms | |
step:1736/6000 train_time:1718200ms step_avg:995.48ms | |
step:1737/6000 train_time:1719189ms step_avg:995.48ms | |
step:1738/6000 train_time:1720187ms step_avg:995.48ms | |
step:1739/6000 train_time:1721182ms step_avg:995.48ms | |
step:1740/6000 train_time:1722181ms step_avg:995.48ms | |
step:1741/6000 train_time:1723180ms step_avg:995.48ms | |
step:1742/6000 train_time:1724181ms step_avg:995.49ms | |
step:1743/6000 train_time:1725173ms step_avg:995.48ms | |
step:1744/6000 train_time:1726178ms step_avg:995.49ms | |
step:1745/6000 train_time:1727180ms step_avg:995.49ms | |
step:1746/6000 train_time:1728177ms step_avg:995.49ms | |
step:1747/6000 train_time:1729179ms step_avg:995.50ms | |
step:1748/6000 train_time:1730172ms step_avg:995.50ms | |
step:1749/6000 train_time:1731168ms step_avg:995.50ms | |
step:1750/6000 train_time:1732170ms step_avg:995.50ms | |
step:1750/6000 val_loss:2.5499 train_time:1732212ms step_avg:995.52ms perplexity:12.8054 param_count:85,137,462 | |
step:1751/6000 train_time:1733172ms step_avg:995.50ms | |
step:1752/6000 train_time:1734170ms step_avg:995.51ms | |
step:1753/6000 train_time:1735175ms step_avg:995.51ms | |
step:1754/6000 train_time:1736169ms step_avg:995.51ms | |
step:1755/6000 train_time:1737180ms step_avg:995.52ms | |
step:1756/6000 train_time:1738185ms step_avg:995.52ms | |
step:1757/6000 train_time:1739177ms step_avg:995.52ms | |
step:1758/6000 train_time:1740182ms step_avg:995.53ms | |
step:1759/6000 train_time:1741169ms step_avg:995.52ms | |
step:1760/6000 train_time:1742162ms step_avg:995.52ms | |
step:1761/6000 train_time:1743156ms step_avg:995.52ms | |
step:1762/6000 train_time:1744161ms step_avg:995.53ms | |
step:1763/6000 train_time:1745155ms step_avg:995.52ms | |
step:1764/6000 train_time:1746168ms step_avg:995.53ms | |
step:1765/6000 train_time:1747159ms step_avg:995.53ms | |
step:1766/6000 train_time:1748156ms step_avg:995.53ms | |
step:1767/6000 train_time:1749174ms step_avg:995.55ms | |
step:1768/6000 train_time:1750162ms step_avg:995.54ms | |
step:1769/6000 train_time:1751164ms step_avg:995.55ms | |
step:1770/6000 train_time:1752157ms step_avg:995.54ms | |
step:1771/6000 train_time:1753143ms step_avg:995.54ms | |
step:1772/6000 train_time:1754139ms step_avg:995.54ms | |
step:1773/6000 train_time:1755118ms step_avg:995.53ms | |
step:1774/6000 train_time:1756117ms step_avg:995.53ms | |
step:1775/6000 train_time:1757103ms step_avg:995.53ms | |
step:1775/6000 val_loss:2.5377 train_time:1757144ms step_avg:995.55ms perplexity:12.6504 param_count:85,137,462 | |
step:1776/6000 train_time:1758114ms step_avg:995.53ms | |
step:1777/6000 train_time:1759109ms step_avg:995.53ms | |
step:1778/6000 train_time:1760118ms step_avg:995.54ms | |
step:1779/6000 train_time:1761117ms step_avg:995.54ms | |
step:1780/6000 train_time:1762125ms step_avg:995.55ms | |
step:1781/6000 train_time:1763127ms step_avg:995.55ms | |
step:1782/6000 train_time:1764125ms step_avg:995.56ms | |
step:1783/6000 train_time:1765125ms step_avg:995.56ms | |
step:1784/6000 train_time:1766129ms step_avg:995.56ms | |
step:1785/6000 train_time:1767128ms step_avg:995.57ms | |
step:1786/6000 train_time:1768131ms step_avg:995.57ms | |
step:1787/6000 train_time:1769122ms step_avg:995.57ms | |
step:1788/6000 train_time:1770119ms step_avg:995.57ms | |
step:1789/6000 train_time:1771121ms step_avg:995.57ms | |
step:1790/6000 train_time:1772114ms step_avg:995.57ms | |
step:1791/6000 train_time:1773104ms step_avg:995.57ms | |
step:1792/6000 train_time:1774107ms step_avg:995.57ms | |
step:1793/6000 train_time:1775103ms step_avg:995.57ms | |
step:1794/6000 train_time:1776100ms step_avg:995.57ms | |
step:1795/6000 train_time:1777091ms step_avg:995.57ms | |
step:1796/6000 train_time:1778081ms step_avg:995.57ms | |
step:1797/6000 train_time:1779083ms step_avg:995.57ms | |
step:1798/6000 train_time:1780077ms step_avg:995.57ms | |
step:1799/6000 train_time:1781088ms step_avg:995.58ms | |
step:1800/6000 train_time:1782086ms step_avg:995.58ms | |
step:1800/6000 val_loss:2.5483 train_time:1782126ms step_avg:995.60ms perplexity:12.7853 param_count:85,137,462 | |
step:1801/6000 train_time:1783095ms step_avg:995.59ms | |
step:1802/6000 train_time:1784096ms step_avg:995.59ms | |
step:1803/6000 train_time:1785101ms step_avg:995.59ms | |
step:1804/6000 train_time:1786116ms step_avg:995.61ms | |
step:1805/6000 train_time:1787126ms step_avg:995.61ms | |
step:1806/6000 train_time:1788117ms step_avg:995.61ms | |
step:1807/6000 train_time:1789109ms step_avg:995.61ms | |
step:1808/6000 train_time:1790116ms step_avg:995.62ms | |
step:1809/6000 train_time:1791117ms step_avg:995.62ms | |
step:1810/6000 train_time:1792107ms step_avg:995.61ms | |
step:1811/6000 train_time:1793104ms step_avg:995.62ms | |
step:1812/6000 train_time:1794108ms step_avg:995.62ms | |
step:1813/6000 train_time:1795117ms step_avg:995.63ms | |
step:1814/6000 train_time:1796107ms step_avg:995.62ms | |
step:1815/6000 train_time:1797104ms step_avg:995.63ms | |
step:1816/6000 train_time:1798107ms step_avg:995.63ms | |
step:1817/6000 train_time:1799101ms step_avg:995.63ms | |
step:1818/6000 train_time:1800105ms step_avg:995.63ms | |
step:1819/6000 train_time:1801092ms step_avg:995.63ms | |
step:1820/6000 train_time:1802098ms step_avg:995.63ms | |
step:1821/6000 train_time:1803087ms step_avg:995.63ms | |
step:1822/6000 train_time:1804107ms step_avg:995.64ms | |
step:1823/6000 train_time:1805101ms step_avg:995.64ms | |
step:1824/6000 train_time:1806102ms step_avg:995.65ms | |
step:1825/6000 train_time:1807112ms step_avg:995.65ms | |
step:1825/6000 val_loss:2.5449 train_time:1807152ms step_avg:995.68ms perplexity:12.7419 param_count:85,137,462 | |
step:1826/6000 train_time:1808094ms step_avg:995.65ms | |
step:1827/6000 train_time:1809088ms step_avg:995.65ms | |
step:1828/6000 train_time:1810092ms step_avg:995.65ms | |
step:1829/6000 train_time:1811092ms step_avg:995.65ms | |
step:1830/6000 train_time:1812090ms step_avg:995.65ms | |
step:1831/6000 train_time:1813179ms step_avg:995.71ms | |
step:1832/6000 train_time:1814182ms step_avg:995.71ms | |
step:1833/6000 train_time:1815184ms step_avg:995.71ms | |
step:1834/6000 train_time:1816183ms step_avg:995.71ms | |
step:1835/6000 train_time:1817179ms step_avg:995.71ms | |
step:1836/6000 train_time:1818173ms step_avg:995.71ms | |
step:1837/6000 train_time:1819175ms step_avg:995.72ms | |
step:1838/6000 train_time:1820178ms step_avg:995.72ms | |
step:1839/6000 train_time:1821176ms step_avg:995.72ms | |
step:1840/6000 train_time:1822171ms step_avg:995.72ms | |
step:1841/6000 train_time:1823173ms step_avg:995.73ms | |
step:1842/6000 train_time:1824173ms step_avg:995.73ms | |
step:1843/6000 train_time:1825163ms step_avg:995.72ms | |
step:1844/6000 train_time:1826156ms step_avg:995.72ms | |
step:1845/6000 train_time:1827156ms step_avg:995.73ms | |
step:1846/6000 train_time:1828154ms step_avg:995.73ms | |
step:1847/6000 train_time:1829142ms step_avg:995.72ms | |
step:1848/6000 train_time:1830131ms step_avg:995.72ms | |
step:1849/6000 train_time:1831123ms step_avg:995.72ms | |
step:1850/6000 train_time:1832126ms step_avg:995.72ms | |
step:1850/6000 val_loss:2.5506 train_time:1832167ms step_avg:995.74ms perplexity:12.8152 param_count:85,137,462 | |
step:1851/6000 train_time:1833107ms step_avg:995.71ms | |
step:1852/6000 train_time:1834119ms step_avg:995.72ms | |
step:1853/6000 train_time:1835108ms step_avg:995.72ms | |
step:1854/6000 train_time:1836097ms step_avg:995.71ms | |
step:1855/6000 train_time:1837102ms step_avg:995.72ms | |
step:1856/6000 train_time:1838100ms step_avg:995.72ms | |
step:1857/6000 train_time:1839104ms step_avg:995.73ms | |
step:1858/6000 train_time:1840102ms step_avg:995.73ms | |
step:1859/6000 train_time:1841119ms step_avg:995.74ms | |
step:1860/6000 train_time:1842115ms step_avg:995.74ms | |
step:1861/6000 train_time:1843112ms step_avg:995.74ms | |
step:1862/6000 train_time:1844104ms step_avg:995.74ms | |
step:1863/6000 train_time:1845105ms step_avg:995.74ms | |
step:1864/6000 train_time:1846096ms step_avg:995.74ms | |
step:1865/6000 train_time:1847098ms step_avg:995.74ms | |
step:1866/6000 train_time:1848087ms step_avg:995.74ms | |
step:1867/6000 train_time:1849075ms step_avg:995.73ms | |
step:1868/6000 train_time:1850080ms step_avg:995.74ms | |
step:1869/6000 train_time:1851073ms step_avg:995.74ms | |
step:1870/6000 train_time:1852063ms step_avg:995.73ms | |
step:1871/6000 train_time:1853071ms step_avg:995.74ms | |
step:1872/6000 train_time:1854098ms step_avg:995.76ms | |
step:1873/6000 train_time:1855091ms step_avg:995.75ms | |
step:1874/6000 train_time:1856085ms step_avg:995.75ms | |
step:1875/6000 train_time:1857068ms step_avg:995.75ms | |
step:1875/6000 val_loss:2.5427 train_time:1857109ms step_avg:995.77ms perplexity:12.7136 param_count:85,137,462 | |
step:1876/6000 train_time:1858064ms step_avg:995.75ms | |
step:1877/6000 train_time:1859064ms step_avg:995.75ms | |
step:1878/6000 train_time:1860056ms step_avg:995.75ms | |
step:1879/6000 train_time:1861058ms step_avg:995.75ms | |
step:1880/6000 train_time:1862060ms step_avg:995.75ms | |
step:1881/6000 train_time:1863046ms step_avg:995.75ms | |
step:1882/6000 train_time:1864038ms step_avg:995.75ms | |
step:1883/6000 train_time:1865036ms step_avg:995.75ms | |
step:1884/6000 train_time:1866029ms step_avg:995.75ms | |
step:1885/6000 train_time:1867038ms step_avg:995.75ms | |
step:1886/6000 train_time:1868035ms step_avg:995.75ms | |
step:1887/6000 train_time:1869044ms step_avg:995.76ms | |
step:1888/6000 train_time:1870055ms step_avg:995.77ms | |
step:1889/6000 train_time:1871040ms step_avg:995.76ms | |
step:1890/6000 train_time:1872042ms step_avg:995.77ms | |
step:1891/6000 train_time:1873044ms step_avg:995.77ms | |
step:1892/6000 train_time:1874039ms step_avg:995.77ms | |
step:1893/6000 train_time:1875049ms step_avg:995.78ms | |
step:1894/6000 train_time:1876051ms step_avg:995.78ms | |
step:1895/6000 train_time:1877050ms step_avg:995.78ms | |
step:1896/6000 train_time:1878035ms step_avg:995.78ms | |
step:1897/6000 train_time:1879035ms step_avg:995.78ms | |
step:1898/6000 train_time:1880032ms step_avg:995.78ms | |
step:1899/6000 train_time:1881034ms step_avg:995.78ms | |
step:1900/6000 train_time:1882036ms step_avg:995.79ms | |
step:1900/6000 val_loss:2.5520 train_time:1882074ms step_avg:995.81ms perplexity:12.8333 param_count:85,137,462 | |
step:1901/6000 train_time:1883036ms step_avg:995.79ms | |
step:1902/6000 train_time:1884032ms step_avg:995.79ms | |
step:1903/6000 train_time:1885034ms step_avg:995.79ms | |
step:1904/6000 train_time:1886027ms step_avg:995.79ms | |
step:1905/6000 train_time:1887021ms step_avg:995.79ms | |
step:1906/6000 train_time:1888028ms step_avg:995.80ms | |
step:1907/6000 train_time:1889027ms step_avg:995.80ms | |
step:1908/6000 train_time:1890030ms step_avg:995.80ms | |
step:1909/6000 train_time:1891039ms step_avg:995.81ms | |
step:1910/6000 train_time:1892032ms step_avg:995.81ms | |
step:1911/6000 train_time:1893034ms step_avg:995.81ms | |
step:1912/6000 train_time:1894028ms step_avg:995.81ms | |
step:1913/6000 train_time:1895031ms step_avg:995.81ms | |
step:1914/6000 train_time:1896037ms step_avg:995.82ms | |
step:1915/6000 train_time:1897036ms step_avg:995.82ms | |
step:1916/6000 train_time:1898028ms step_avg:995.82ms | |
step:1917/6000 train_time:1899027ms step_avg:995.82ms | |
step:1918/6000 train_time:1900027ms step_avg:995.82ms | |
step:1919/6000 train_time:1901021ms step_avg:995.82ms | |
step:1920/6000 train_time:1902017ms step_avg:995.82ms | |
step:1921/6000 train_time:1903011ms step_avg:995.82ms | |
step:1922/6000 train_time:1904004ms step_avg:995.82ms | |
step:1923/6000 train_time:1905003ms step_avg:995.82ms | |
step:1924/6000 train_time:1906003ms step_avg:995.82ms | |
step:1925/6000 train_time:1906994ms step_avg:995.82ms | |
step:1925/6000 val_loss:2.5435 train_time:1907033ms step_avg:995.84ms perplexity:12.7247 param_count:85,137,462 | |
step:1926/6000 train_time:1907985ms step_avg:995.82ms | |
step:1927/6000 train_time:1908993ms step_avg:995.82ms | |
step:1928/6000 train_time:1909982ms step_avg:995.82ms | |
step:1929/6000 train_time:1910986ms step_avg:995.82ms | |
step:1930/6000 train_time:1911986ms step_avg:995.83ms | |
step:1931/6000 train_time:1912982ms step_avg:995.83ms | |
step:1932/6000 train_time:1913975ms step_avg:995.82ms | |
step:1933/6000 train_time:1914974ms step_avg:995.83ms | |
step:1934/6000 train_time:1915965ms step_avg:995.82ms | |
step:1935/6000 train_time:1916964ms step_avg:995.83ms | |
step:1936/6000 train_time:1917954ms step_avg:995.82ms | |
step:1937/6000 train_time:1918953ms step_avg:995.82ms | |
step:1938/6000 train_time:1919958ms step_avg:995.83ms | |
step:1939/6000 train_time:1920962ms step_avg:995.83ms | |
step:1940/6000 train_time:1921962ms step_avg:995.84ms | |
step:1941/6000 train_time:1922964ms step_avg:995.84ms | |
step:1942/6000 train_time:1923961ms step_avg:995.84ms | |
step:1943/6000 train_time:1924959ms step_avg:995.84ms | |
step:1944/6000 train_time:1925961ms step_avg:995.84ms | |
step:1945/6000 train_time:1926950ms step_avg:995.84ms | |
step:1946/6000 train_time:1927950ms step_avg:995.84ms | |
step:1947/6000 train_time:1928946ms step_avg:995.84ms | |
step:1948/6000 train_time:1929950ms step_avg:995.85ms | |
step:1949/6000 train_time:1930951ms step_avg:995.85ms | |
step:1950/6000 train_time:1931935ms step_avg:995.84ms | |
step:1950/6000 val_loss:2.5506 train_time:1931975ms step_avg:995.86ms perplexity:12.8148 param_count:85,137,462 | |
step:1951/6000 train_time:1932919ms step_avg:995.84ms | |
step:1952/6000 train_time:1933923ms step_avg:995.84ms | |
step:1953/6000 train_time:1934923ms step_avg:995.84ms | |
step:1954/6000 train_time:1935918ms step_avg:995.84ms | |
step:1955/6000 train_time:1936906ms step_avg:995.84ms | |
step:1956/6000 train_time:1937910ms step_avg:995.84ms | |
step:1957/6000 train_time:1938915ms step_avg:995.85ms | |
step:1958/6000 train_time:1939906ms step_avg:995.84ms | |
step:1959/6000 train_time:1940899ms step_avg:995.84ms | |
step:1960/6000 train_time:1941903ms step_avg:995.85ms | |
step:1961/6000 train_time:1942894ms step_avg:995.85ms | |
step:1962/6000 train_time:1943885ms step_avg:995.84ms | |
step:1963/6000 train_time:1944883ms step_avg:995.84ms | |
step:1964/6000 train_time:1945882ms step_avg:995.85ms | |
step:1965/6000 train_time:1946875ms step_avg:995.84ms | |
step:1966/6000 train_time:1947885ms step_avg:995.85ms | |
step:1967/6000 train_time:1948874ms step_avg:995.85ms | |
step:1968/6000 train_time:1949874ms step_avg:995.85ms | |
step:1969/6000 train_time:1950880ms step_avg:995.86ms | |
step:1970/6000 train_time:1951878ms step_avg:995.86ms | |
step:1971/6000 train_time:1952877ms step_avg:995.86ms | |
step:1972/6000 train_time:1953880ms step_avg:995.86ms | |
step:1973/6000 train_time:1954877ms step_avg:995.86ms | |
step:1974/6000 train_time:1955878ms step_avg:995.86ms | |
step:1975/6000 train_time:1956869ms step_avg:995.86ms | |
step:1975/6000 val_loss:2.5508 train_time:1956909ms step_avg:995.88ms perplexity:12.8178 param_count:85,137,462 | |
step:1976/6000 train_time:1957853ms step_avg:995.86ms | |
step:1977/6000 train_time:1958858ms step_avg:995.86ms | |
step:1978/6000 train_time:1959849ms step_avg:995.86ms | |
step:1979/6000 train_time:1960869ms step_avg:995.87ms | |
step:1980/6000 train_time:1961870ms step_avg:995.87ms | |
step:1981/6000 train_time:1962881ms step_avg:995.88ms | |
step:1982/6000 train_time:1963868ms step_avg:995.88ms | |
step:1983/6000 train_time:1964862ms step_avg:995.88ms | |
step:1984/6000 train_time:1965857ms step_avg:995.87ms | |
step:1985/6000 train_time:1966857ms step_avg:995.88ms | |
step:1986/6000 train_time:1967854ms step_avg:995.88ms | |
step:1987/6000 train_time:1968861ms step_avg:995.88ms | |
step:1988/6000 train_time:1969859ms step_avg:995.88ms | |
step:1989/6000 train_time:1970853ms step_avg:995.88ms | |
step:1990/6000 train_time:1971857ms step_avg:995.89ms | |
step:1991/6000 train_time:1972844ms step_avg:995.88ms | |
step:1992/6000 train_time:1973843ms step_avg:995.88ms | |
step:1993/6000 train_time:1974843ms step_avg:995.89ms | |
step:1994/6000 train_time:1975839ms step_avg:995.89ms | |
step:1995/6000 train_time:1976831ms step_avg:995.88ms | |
step:1996/6000 train_time:1977836ms step_avg:995.89ms | |
step:1997/6000 train_time:1978837ms step_avg:995.89ms | |
step:1998/6000 train_time:1979839ms step_avg:995.89ms | |
step:1999/6000 train_time:1980833ms step_avg:995.89ms | |
step:2000/6000 train_time:1981828ms step_avg:995.89ms | |
step:2000/6000 val_loss:2.5359 train_time:1981868ms step_avg:995.91ms perplexity:12.6272 param_count:85,137,462 | |
step:2001/6000 train_time:1982829ms step_avg:995.90ms | |
step:2002/6000 train_time:1983823ms step_avg:995.89ms | |
step:2003/6000 train_time:1984814ms step_avg:995.89ms | |
step:2004/6000 train_time:1985803ms step_avg:995.89ms | |
step:2005/6000 train_time:1986802ms step_avg:995.89ms | |
step:2006/6000 train_time:1987798ms step_avg:995.89ms | |
step:2007/6000 train_time:1988796ms step_avg:995.89ms | |
step:2008/6000 train_time:1989814ms step_avg:995.90ms | |
step:2009/6000 train_time:1990819ms step_avg:995.91ms | |
step:2010/6000 train_time:1991811ms step_avg:995.91ms | |
step:2011/6000 train_time:1992824ms step_avg:995.91ms | |
step:2012/6000 train_time:1993815ms step_avg:995.91ms | |
step:2013/6000 train_time:1994803ms step_avg:995.91ms | |
step:2014/6000 train_time:1995786ms step_avg:995.90ms | |
step:2015/6000 train_time:1996781ms step_avg:995.90ms | |
step:2016/6000 train_time:1997798ms step_avg:995.91ms | |
step:2017/6000 train_time:1998802ms step_avg:995.92ms | |
step:2018/6000 train_time:1999798ms step_avg:995.92ms | |
step:2019/6000 train_time:2000807ms step_avg:995.92ms | |
step:2020/6000 train_time:2001806ms step_avg:995.92ms | |
step:2021/6000 train_time:2002805ms step_avg:995.93ms | |
step:2022/6000 train_time:2003805ms step_avg:995.93ms | |
step:2023/6000 train_time:2004801ms step_avg:995.93ms | |
step:2024/6000 train_time:2005804ms step_avg:995.93ms | |
step:2025/6000 train_time:2006799ms step_avg:995.93ms | |
step:2025/6000 val_loss:2.5466 train_time:2006836ms step_avg:995.95ms perplexity:12.7639 param_count:85,137,462 | |
step:2026/6000 train_time:2007779ms step_avg:995.92ms | |
step:2027/6000 train_time:2008780ms step_avg:995.92ms | |
step:2028/6000 train_time:2009781ms step_avg:995.93ms | |
step:2029/6000 train_time:2010778ms step_avg:995.93ms | |
step:2030/6000 train_time:2011777ms step_avg:995.93ms | |
step:2031/6000 train_time:2012779ms step_avg:995.93ms | |
step:2032/6000 train_time:2013780ms step_avg:995.93ms | |
step:2033/6000 train_time:2014799ms step_avg:995.95ms | |
step:2034/6000 train_time:2015803ms step_avg:995.95ms | |
step:2035/6000 train_time:2016804ms step_avg:995.95ms | |
step:2036/6000 train_time:2017805ms step_avg:995.95ms | |
step:2037/6000 train_time:2018803ms step_avg:995.96ms | |
step:2038/6000 train_time:2019798ms step_avg:995.96ms | |
step:2039/6000 train_time:2020792ms step_avg:995.95ms | |
step:2040/6000 train_time:2021789ms step_avg:995.95ms | |
step:2041/6000 train_time:2022794ms step_avg:995.96ms | |
step:2042/6000 train_time:2023792ms step_avg:995.96ms | |
step:2043/6000 train_time:2024794ms step_avg:995.96ms | |
step:2044/6000 train_time:2025801ms step_avg:995.97ms | |
step:2045/6000 train_time:2026812ms step_avg:995.98ms | |
step:2046/6000 train_time:2027813ms step_avg:995.98ms | |
step:2047/6000 train_time:2028807ms step_avg:995.98ms | |
step:2048/6000 train_time:2029800ms step_avg:995.98ms | |
step:2049/6000 train_time:2030798ms step_avg:995.98ms | |
step:2050/6000 train_time:2031794ms step_avg:995.98ms | |
step:2050/6000 val_loss:2.5363 train_time:2031833ms step_avg:996.00ms perplexity:12.6328 param_count:85,137,462 | |
step:2051/6000 train_time:2032796ms step_avg:995.98ms | |
step:2052/6000 train_time:2033799ms step_avg:995.98ms | |
step:2053/6000 train_time:2034797ms step_avg:995.98ms | |
step:2054/6000 train_time:2035797ms step_avg:995.99ms | |
step:2055/6000 train_time:2036783ms step_avg:995.98ms | |
step:2056/6000 train_time:2037785ms step_avg:995.98ms | |
step:2057/6000 train_time:2038773ms step_avg:995.98ms | |
step:2058/6000 train_time:2039773ms step_avg:995.98ms | |
step:2059/6000 train_time:2040778ms step_avg:995.99ms | |
step:2060/6000 train_time:2041774ms step_avg:995.99ms | |
step:2061/6000 train_time:2042775ms step_avg:995.99ms | |
step:2062/6000 train_time:2043792ms step_avg:996.00ms | |
step:2063/6000 train_time:2044794ms step_avg:996.00ms | |
step:2064/6000 train_time:2045791ms step_avg:996.00ms | |
step:2065/6000 train_time:2046784ms step_avg:996.00ms | |
step:2066/6000 train_time:2047783ms step_avg:996.00ms | |
step:2067/6000 train_time:2048781ms step_avg:996.00ms | |
step:2068/6000 train_time:2049780ms step_avg:996.01ms | |
step:2069/6000 train_time:2050777ms step_avg:996.01ms | |
step:2070/6000 train_time:2051773ms step_avg:996.01ms | |
step:2071/6000 train_time:2052788ms step_avg:996.02ms | |
step:2072/6000 train_time:2053776ms step_avg:996.01ms | |
step:2073/6000 train_time:2054774ms step_avg:996.01ms | |
step:2074/6000 train_time:2055782ms step_avg:996.02ms | |
step:2075/6000 train_time:2056772ms step_avg:996.02ms | |
step:2075/6000 val_loss:2.5248 train_time:2056812ms step_avg:996.03ms perplexity:12.4890 param_count:85,137,462 | |
step:2076/6000 train_time:2057764ms step_avg:996.01ms | |
step:2077/6000 train_time:2058762ms step_avg:996.01ms | |
step:2078/6000 train_time:2059754ms step_avg:996.01ms | |
step:2079/6000 train_time:2060756ms step_avg:996.02ms | |
step:2080/6000 train_time:2061764ms step_avg:996.02ms | |
step:2081/6000 train_time:2062783ms step_avg:996.03ms | |
step:2082/6000 train_time:2063779ms step_avg:996.03ms | |
step:2083/6000 train_time:2064778ms step_avg:996.03ms | |
step:2084/6000 train_time:2065778ms step_avg:996.04ms | |
step:2085/6000 train_time:2066779ms step_avg:996.04ms | |
step:2086/6000 train_time:2067780ms step_avg:996.04ms | |
step:2087/6000 train_time:2068782ms step_avg:996.04ms | |
step:2088/6000 train_time:2069772ms step_avg:996.04ms | |
step:2089/6000 train_time:2070773ms step_avg:996.04ms | |
step:2090/6000 train_time:2071764ms step_avg:996.04ms | |
step:2091/6000 train_time:2072778ms step_avg:996.05ms | |
step:2092/6000 train_time:2073762ms step_avg:996.04ms | |
step:2093/6000 train_time:2074771ms step_avg:996.05ms | |
step:2094/6000 train_time:2075772ms step_avg:996.05ms | |
step:2095/6000 train_time:2076772ms step_avg:996.05ms | |
step:2096/6000 train_time:2077766ms step_avg:996.05ms | |
step:2097/6000 train_time:2078756ms step_avg:996.05ms | |
step:2098/6000 train_time:2079760ms step_avg:996.05ms | |
step:2099/6000 train_time:2080755ms step_avg:996.05ms | |
step:2100/6000 train_time:2081743ms step_avg:996.05ms | |
step:2100/6000 val_loss:2.5430 train_time:2081782ms step_avg:996.07ms perplexity:12.7177 param_count:85,137,462 | |
step:2101/6000 train_time:2082736ms step_avg:996.05ms | |
step:2102/6000 train_time:2083740ms step_avg:996.05ms | |
step:2103/6000 train_time:2084748ms step_avg:996.06ms | |
step:2104/6000 train_time:2085749ms step_avg:996.06ms | |
step:2105/6000 train_time:2086753ms step_avg:996.06ms | |
step:2106/6000 train_time:2087755ms step_avg:996.07ms | |
step:2107/6000 train_time:2088756ms step_avg:996.07ms | |
step:2108/6000 train_time:2089760ms step_avg:996.07ms | |
step:2109/6000 train_time:2090750ms step_avg:996.07ms | |
step:2110/6000 train_time:2091753ms step_avg:996.07ms | |
step:2111/6000 train_time:2092749ms step_avg:996.07ms | |
step:2112/6000 train_time:2093743ms step_avg:996.07ms | |
step:2113/6000 train_time:2094729ms step_avg:996.07ms | |
step:2114/6000 train_time:2095732ms step_avg:996.07ms | |
step:2115/6000 train_time:2096732ms step_avg:996.07ms | |
step:2116/6000 train_time:2097741ms step_avg:996.08ms | |
step:2117/6000 train_time:2098743ms step_avg:996.08ms | |
step:2118/6000 train_time:2099733ms step_avg:996.08ms | |
step:2119/6000 train_time:2100736ms step_avg:996.08ms | |
step:2120/6000 train_time:2101739ms step_avg:996.08ms | |
step:2121/6000 train_time:2102735ms step_avg:996.08ms | |
step:2122/6000 train_time:2103736ms step_avg:996.09ms | |
step:2123/6000 train_time:2104731ms step_avg:996.09ms | |
step:2124/6000 train_time:2105731ms step_avg:996.09ms | |
step:2125/6000 train_time:2106728ms step_avg:996.09ms | |
step:2125/6000 val_loss:2.5401 train_time:2106767ms step_avg:996.11ms perplexity:12.6814 param_count:85,137,462 | |
step:2126/6000 train_time:2107712ms step_avg:996.08ms | |
step:2127/6000 train_time:2108710ms step_avg:996.08ms | |
step:2128/6000 train_time:2109715ms step_avg:996.09ms | |
step:2129/6000 train_time:2110724ms step_avg:996.09ms | |
step:2130/6000 train_time:2111730ms step_avg:996.10ms | |
step:2131/6000 train_time:2112751ms step_avg:996.11ms | |
step:2132/6000 train_time:2113748ms step_avg:996.11ms | |
step:2133/6000 train_time:2114770ms step_avg:996.12ms | |
step:2134/6000 train_time:2115764ms step_avg:996.12ms | |
step:2135/6000 train_time:2116762ms step_avg:996.12ms | |
step:2136/6000 train_time:2117774ms step_avg:996.13ms | |
step:2137/6000 train_time:2118775ms step_avg:996.13ms | |
step:2138/6000 train_time:2119776ms step_avg:996.14ms | |
step:2139/6000 train_time:2120768ms step_avg:996.13ms | |
step:2140/6000 train_time:2121769ms step_avg:996.14ms | |
step:2141/6000 train_time:2122775ms step_avg:996.14ms | |
step:2142/6000 train_time:2123762ms step_avg:996.14ms | |
step:2143/6000 train_time:2124762ms step_avg:996.14ms | |
step:2144/6000 train_time:2125770ms step_avg:996.14ms | |
step:2145/6000 train_time:2126769ms step_avg:996.14ms | |
step:2146/6000 train_time:2127773ms step_avg:996.15ms | |
step:2147/6000 train_time:2128780ms step_avg:996.15ms | |
step:2148/6000 train_time:2129802ms step_avg:996.17ms | |
step:2149/6000 train_time:2130795ms step_avg:996.16ms | |
step:2150/6000 train_time:2131808ms step_avg:996.17ms | |
step:2150/6000 val_loss:2.5349 train_time:2131848ms step_avg:996.19ms perplexity:12.6146 param_count:85,137,462 | |
step:2151/6000 train_time:2132798ms step_avg:996.17ms | |
step:2152/6000 train_time:2133799ms step_avg:996.17ms | |
step:2153/6000 train_time:2134802ms step_avg:996.17ms | |
step:2154/6000 train_time:2135793ms step_avg:996.17ms | |
step:2155/6000 train_time:2136790ms step_avg:996.17ms | |
step:2156/6000 train_time:2137795ms step_avg:996.18ms | |
step:2157/6000 train_time:2138797ms step_avg:996.18ms | |
step:2158/6000 train_time:2139794ms step_avg:996.18ms | |
step:2159/6000 train_time:2140797ms step_avg:996.18ms | |
step:2160/6000 train_time:2141800ms step_avg:996.19ms | |
step:2161/6000 train_time:2142818ms step_avg:996.20ms | |
step:2162/6000 train_time:2143822ms step_avg:996.20ms | |
step:2163/6000 train_time:2144820ms step_avg:996.20ms | |
step:2164/6000 train_time:2145824ms step_avg:996.20ms | |
step:2165/6000 train_time:2146820ms step_avg:996.20ms | |
step:2166/6000 train_time:2147818ms step_avg:996.20ms | |
step:2167/6000 train_time:2148832ms step_avg:996.21ms | |
step:2168/6000 train_time:2149838ms step_avg:996.22ms | |
step:2169/6000 train_time:2150840ms step_avg:996.22ms | |
step:2170/6000 train_time:2151833ms step_avg:996.22ms | |
step:2171/6000 train_time:2152842ms step_avg:996.23ms | |
step:2172/6000 train_time:2153842ms step_avg:996.23ms | |
step:2173/6000 train_time:2154842ms step_avg:996.23ms | |
step:2174/6000 train_time:2155860ms step_avg:996.24ms | |
step:2175/6000 train_time:2156866ms step_avg:996.24ms | |
step:2175/6000 val_loss:2.5370 train_time:2156909ms step_avg:996.26ms perplexity:12.6421 param_count:85,137,462 | |
step:2176/6000 train_time:2157863ms step_avg:996.24ms | |
step:2177/6000 train_time:2158866ms step_avg:996.25ms | |
step:2178/6000 train_time:2159869ms step_avg:996.25ms | |
step:2179/6000 train_time:2160874ms step_avg:996.25ms | |
step:2180/6000 train_time:2161877ms step_avg:996.26ms | |
step:2181/6000 train_time:2162879ms step_avg:996.26ms | |
step:2182/6000 train_time:2163888ms step_avg:996.27ms | |
step:2183/6000 train_time:2164884ms step_avg:996.27ms | |
step:2184/6000 train_time:2165885ms step_avg:996.27ms | |
step:2185/6000 train_time:2166916ms step_avg:996.28ms | |
step:2186/6000 train_time:2167915ms step_avg:996.28ms | |
step:2187/6000 train_time:2168926ms step_avg:996.29ms | |
step:2188/6000 train_time:2169937ms step_avg:996.30ms | |
step:2189/6000 train_time:2170951ms step_avg:996.31ms | |
step:2190/6000 train_time:2171940ms step_avg:996.30ms | |
step:2191/6000 train_time:2172934ms step_avg:996.30ms | |
step:2192/6000 train_time:2173939ms step_avg:996.31ms | |
step:2193/6000 train_time:2174928ms step_avg:996.30ms | |
step:2194/6000 train_time:2175930ms step_avg:996.31ms | |
step:2195/6000 train_time:2176953ms step_avg:996.32ms | |
step:2196/6000 train_time:2177973ms step_avg:996.33ms | |
step:2197/6000 train_time:2178969ms step_avg:996.33ms | |
step:2198/6000 train_time:2179953ms step_avg:996.32ms | |
step:2199/6000 train_time:2180945ms step_avg:996.32ms | |
step:2200/6000 train_time:2181964ms step_avg:996.33ms | |
step:2200/6000 val_loss:2.5368 train_time:2182005ms step_avg:996.35ms perplexity:12.6394 param_count:85,137,462 | |
step:2201/6000 train_time:2182975ms step_avg:996.34ms | |
step:2202/6000 train_time:2183984ms step_avg:996.34ms | |
step:2203/6000 train_time:2184984ms step_avg:996.34ms | |
step:2204/6000 train_time:2185988ms step_avg:996.35ms | |
step:2205/6000 train_time:2186984ms step_avg:996.35ms | |
step:2206/6000 train_time:2187980ms step_avg:996.35ms | |
step:2207/6000 train_time:2188985ms step_avg:996.35ms | |
step:2208/6000 train_time:2189988ms step_avg:996.35ms | |
step:2209/6000 train_time:2190992ms step_avg:996.36ms | |
step:2210/6000 train_time:2191990ms step_avg:996.36ms | |
step:2211/6000 train_time:2193008ms step_avg:996.37ms | |
step:2212/6000 train_time:2194000ms step_avg:996.37ms | |
step:2213/6000 train_time:2195000ms step_avg:996.37ms | |
step:2214/6000 train_time:2195991ms step_avg:996.37ms | |
step:2215/6000 train_time:2196995ms step_avg:996.37ms | |
step:2216/6000 train_time:2197992ms step_avg:996.37ms | |
step:2217/6000 train_time:2198992ms step_avg:996.37ms | |
step:2218/6000 train_time:2199990ms step_avg:996.37ms | |
step:2219/6000 train_time:2200991ms step_avg:996.37ms | |
step:2220/6000 train_time:2202006ms step_avg:996.38ms | |
step:2221/6000 train_time:2203007ms step_avg:996.38ms | |
step:2222/6000 train_time:2204008ms step_avg:996.39ms | |
step:2223/6000 train_time:2204996ms step_avg:996.38ms | |
step:2224/6000 train_time:2205993ms step_avg:996.38ms | |
step:2225/6000 train_time:2207003ms step_avg:996.39ms | |
step:2225/6000 val_loss:2.5310 train_time:2207045ms step_avg:996.41ms perplexity:12.5655 param_count:85,137,462 | |
step:2226/6000 train_time:2208018ms step_avg:996.40ms | |
step:2227/6000 train_time:2209021ms step_avg:996.40ms | |
step:2228/6000 train_time:2210043ms step_avg:996.41ms | |
step:2229/6000 train_time:2211055ms step_avg:996.42ms | |
step:2230/6000 train_time:2212067ms step_avg:996.43ms | |
step:2231/6000 train_time:2213080ms step_avg:996.43ms | |
step:2232/6000 train_time:2214073ms step_avg:996.43ms | |
step:2233/6000 train_time:2215070ms step_avg:996.43ms | |
step:2234/6000 train_time:2216072ms step_avg:996.44ms | |
step:2235/6000 train_time:2217070ms step_avg:996.44ms | |
step:2236/6000 train_time:2218055ms step_avg:996.43ms | |
step:2237/6000 train_time:2219060ms step_avg:996.43ms | |
step:2238/6000 train_time:2220059ms step_avg:996.44ms | |
step:2239/6000 train_time:2221079ms step_avg:996.45ms | |
step:2240/6000 train_time:2222093ms step_avg:996.45ms | |
step:2241/6000 train_time:2223096ms step_avg:996.46ms | |
step:2242/6000 train_time:2224098ms step_avg:996.46ms | |
step:2243/6000 train_time:2225126ms step_avg:996.47ms | |
step:2244/6000 train_time:2226113ms step_avg:996.47ms | |
step:2245/6000 train_time:2227119ms step_avg:996.47ms | |
step:2246/6000 train_time:2228128ms step_avg:996.48ms | |
step:2247/6000 train_time:2229124ms step_avg:996.48ms | |
step:2248/6000 train_time:2230121ms step_avg:996.48ms | |
step:2249/6000 train_time:2231106ms step_avg:996.47ms | |
step:2250/6000 train_time:2232107ms step_avg:996.48ms | |
step:2250/6000 val_loss:2.5334 train_time:2232151ms step_avg:996.50ms perplexity:12.5968 param_count:85,137,462 | |
step:2251/6000 train_time:2233100ms step_avg:996.47ms | |
step:2252/6000 train_time:2234096ms step_avg:996.47ms | |
step:2253/6000 train_time:2235100ms step_avg:996.48ms | |
step:2254/6000 train_time:2236105ms step_avg:996.48ms | |
step:2255/6000 train_time:2237105ms step_avg:996.48ms | |
step:2256/6000 train_time:2238120ms step_avg:996.49ms | |
step:2257/6000 train_time:2239135ms step_avg:996.50ms | |
step:2258/6000 train_time:2240144ms step_avg:996.51ms | |
step:2259/6000 train_time:2241151ms step_avg:996.51ms | |
step:2260/6000 train_time:2242184ms step_avg:996.53ms | |
step:2261/6000 train_time:2243184ms step_avg:996.53ms | |
step:2262/6000 train_time:2244181ms step_avg:996.53ms | |
step:2263/6000 train_time:2245175ms step_avg:996.53ms | |
step:2264/6000 train_time:2246186ms step_avg:996.53ms | |
step:2265/6000 train_time:2247208ms step_avg:996.54ms | |
step:2266/6000 train_time:2248217ms step_avg:996.55ms | |
step:2267/6000 train_time:2249221ms step_avg:996.55ms | |
step:2268/6000 train_time:2250232ms step_avg:996.56ms | |
step:2269/6000 train_time:2251233ms step_avg:996.56ms | |
step:2270/6000 train_time:2252255ms step_avg:996.57ms | |
step:2271/6000 train_time:2253256ms step_avg:996.58ms | |
step:2272/6000 train_time:2254257ms step_avg:996.58ms | |
step:2273/6000 train_time:2255300ms step_avg:996.60ms | |
step:2274/6000 train_time:2256300ms step_avg:996.60ms | |
step:2275/6000 train_time:2257302ms step_avg:996.60ms | |
step:2275/6000 val_loss:2.5275 train_time:2257347ms step_avg:996.62ms perplexity:12.5225 param_count:85,137,462 | |
step:2276/6000 train_time:2258306ms step_avg:996.60ms | |
step:2277/6000 train_time:2259324ms step_avg:996.61ms | |
step:2278/6000 train_time:2260329ms step_avg:996.62ms | |
step:2279/6000 train_time:2261354ms step_avg:996.63ms | |
step:2280/6000 train_time:2262367ms step_avg:996.64ms | |
step:2281/6000 train_time:2263361ms step_avg:996.64ms | |
step:2282/6000 train_time:2264363ms step_avg:996.64ms | |
step:2283/6000 train_time:2265366ms step_avg:996.64ms | |
step:2284/6000 train_time:2266375ms step_avg:996.65ms | |
step:2285/6000 train_time:2267383ms step_avg:996.65ms | |
step:2286/6000 train_time:2268394ms step_avg:996.66ms | |
step:2287/6000 train_time:2269413ms step_avg:996.67ms | |
step:2288/6000 train_time:2270413ms step_avg:996.67ms | |
step:2289/6000 train_time:2271421ms step_avg:996.67ms | |
step:2290/6000 train_time:2272422ms step_avg:996.68ms | |
step:2291/6000 train_time:2273435ms step_avg:996.68ms | |
step:2292/6000 train_time:2274432ms step_avg:996.68ms | |
step:2293/6000 train_time:2275427ms step_avg:996.68ms | |
step:2294/6000 train_time:2276440ms step_avg:996.69ms | |
step:2295/6000 train_time:2277448ms step_avg:996.70ms | |
step:2296/6000 train_time:2278466ms step_avg:996.70ms | |
step:2297/6000 train_time:2279480ms step_avg:996.71ms | |
step:2298/6000 train_time:2280491ms step_avg:996.72ms | |
step:2299/6000 train_time:2281494ms step_avg:996.72ms | |
step:2300/6000 train_time:2282509ms step_avg:996.73ms | |
step:2300/6000 val_loss:2.5246 train_time:2282554ms step_avg:996.75ms perplexity:12.4860 param_count:85,137,462 | |
step:2301/6000 train_time:2283511ms step_avg:996.73ms | |
step:2302/6000 train_time:2284514ms step_avg:996.73ms | |
step:2303/6000 train_time:2285526ms step_avg:996.74ms | |
step:2304/6000 train_time:2286527ms step_avg:996.74ms | |
step:2305/6000 train_time:2287537ms step_avg:996.75ms | |
step:2306/6000 train_time:2288542ms step_avg:996.75ms | |
step:2307/6000 train_time:2289561ms step_avg:996.76ms | |
step:2308/6000 train_time:2290561ms step_avg:996.76ms | |
step:2309/6000 train_time:2291561ms step_avg:996.76ms | |
step:2310/6000 train_time:2292575ms step_avg:996.77ms | |
step:2311/6000 train_time:2293587ms step_avg:996.78ms | |
step:2312/6000 train_time:2294597ms step_avg:996.78ms | |
step:2313/6000 train_time:2295605ms step_avg:996.79ms | |
step:2314/6000 train_time:2296611ms step_avg:996.79ms | |
step:2315/6000 train_time:2297613ms step_avg:996.80ms | |
step:2316/6000 train_time:2298642ms step_avg:996.81ms | |
step:2317/6000 train_time:2299645ms step_avg:996.81ms | |
step:2318/6000 train_time:2300656ms step_avg:996.82ms | |
step:2319/6000 train_time:2301663ms step_avg:996.82ms | |
step:2320/6000 train_time:2302671ms step_avg:996.83ms | |
step:2321/6000 train_time:2303667ms step_avg:996.83ms | |
step:2322/6000 train_time:2304673ms step_avg:996.83ms | |
step:2323/6000 train_time:2305689ms step_avg:996.84ms | |
step:2324/6000 train_time:2306717ms step_avg:996.85ms | |
step:2325/6000 train_time:2307713ms step_avg:996.85ms | |
step:2325/6000 val_loss:2.5267 train_time:2307758ms step_avg:996.87ms perplexity:12.5125 param_count:85,137,462 | |
step:2326/6000 train_time:2308721ms step_avg:996.86ms | |
step:2327/6000 train_time:2309734ms step_avg:996.86ms | |
step:2328/6000 train_time:2310739ms step_avg:996.87ms | |
step:2329/6000 train_time:2311760ms step_avg:996.88ms | |
step:2330/6000 train_time:2312776ms step_avg:996.89ms | |
step:2331/6000 train_time:2313786ms step_avg:996.89ms | |
step:2332/6000 train_time:2314783ms step_avg:996.89ms | |
step:2333/6000 train_time:2315788ms step_avg:996.90ms | |
step:2334/6000 train_time:2316796ms step_avg:996.90ms | |
step:2335/6000 train_time:2317807ms step_avg:996.91ms | |
step:2336/6000 train_time:2318811ms step_avg:996.91ms | |
step:2337/6000 train_time:2319840ms step_avg:996.92ms | |
step:2338/6000 train_time:2320866ms step_avg:996.94ms | |
step:2339/6000 train_time:2321871ms step_avg:996.94ms | |
step:2340/6000 train_time:2322880ms step_avg:996.94ms | |
step:2341/6000 train_time:2323884ms step_avg:996.95ms | |
step:2342/6000 train_time:2324890ms step_avg:996.95ms | |
step:2343/6000 train_time:2325902ms step_avg:996.96ms | |
step:2344/6000 train_time:2326905ms step_avg:996.96ms | |
step:2345/6000 train_time:2327916ms step_avg:996.97ms | |
step:2346/6000 train_time:2328935ms step_avg:996.98ms | |
step:2347/6000 train_time:2329937ms step_avg:996.98ms | |
step:2348/6000 train_time:2330944ms step_avg:996.98ms | |
step:2349/6000 train_time:2331946ms step_avg:996.98ms | |
step:2350/6000 train_time:2332942ms step_avg:996.98ms | |
step:2350/6000 val_loss:2.5289 train_time:2332986ms step_avg:997.00ms perplexity:12.5399 param_count:85,137,462 | |
step:2351/6000 train_time:2333942ms step_avg:996.98ms | |
step:2352/6000 train_time:2334940ms step_avg:996.99ms | |
step:2353/6000 train_time:2335952ms step_avg:996.99ms | |
step:2354/6000 train_time:2336953ms step_avg:996.99ms | |
step:2355/6000 train_time:2337953ms step_avg:996.99ms | |
step:2356/6000 train_time:2338967ms step_avg:997.00ms | |
step:2357/6000 train_time:2339975ms step_avg:997.01ms | |
step:2358/6000 train_time:2340979ms step_avg:997.01ms | |
step:2359/6000 train_time:2341985ms step_avg:997.01ms | |
step:2360/6000 train_time:2342986ms step_avg:997.02ms | |
step:2361/6000 train_time:2344000ms step_avg:997.02ms | |
step:2362/6000 train_time:2345006ms step_avg:997.03ms | |
step:2363/6000 train_time:2346015ms step_avg:997.03ms | |
step:2364/6000 train_time:2347008ms step_avg:997.03ms | |
step:2365/6000 train_time:2348035ms step_avg:997.04ms | |
step:2366/6000 train_time:2349032ms step_avg:997.04ms | |
step:2367/6000 train_time:2350042ms step_avg:997.05ms | |
step:2368/6000 train_time:2351048ms step_avg:997.05ms | |
step:2369/6000 train_time:2352054ms step_avg:997.06ms | |
step:2370/6000 train_time:2353056ms step_avg:997.06ms | |
step:2371/6000 train_time:2354066ms step_avg:997.06ms | |
step:2372/6000 train_time:2355069ms step_avg:997.07ms | |
step:2373/6000 train_time:2356076ms step_avg:997.07ms | |
step:2374/6000 train_time:2357095ms step_avg:997.08ms | |
step:2375/6000 train_time:2358112ms step_avg:997.09ms | |
step:2375/6000 val_loss:2.5360 train_time:2358157ms step_avg:997.11ms perplexity:12.6289 param_count:85,137,462 | |
step:2376/6000 train_time:2359107ms step_avg:997.09ms | |
step:2377/6000 train_time:2360106ms step_avg:997.09ms | |
step:2378/6000 train_time:2361111ms step_avg:997.09ms | |
step:2379/6000 train_time:2362106ms step_avg:997.09ms | |
step:2380/6000 train_time:2363103ms step_avg:997.09ms | |
step:2381/6000 train_time:2364114ms step_avg:997.10ms | |
step:2382/6000 train_time:2365161ms step_avg:997.12ms | |
step:2383/6000 train_time:2366167ms step_avg:997.12ms | |
step:2384/6000 train_time:2367182ms step_avg:997.13ms | |
step:2385/6000 train_time:2368207ms step_avg:997.14ms | |
step:2386/6000 train_time:2369220ms step_avg:997.15ms | |
step:2387/6000 train_time:2370226ms step_avg:997.15ms | |
step:2388/6000 train_time:2371234ms step_avg:997.15ms | |
step:2389/6000 train_time:2372234ms step_avg:997.16ms | |
step:2390/6000 train_time:2373235ms step_avg:997.16ms | |
step:2391/6000 train_time:2374232ms step_avg:997.16ms | |
step:2392/6000 train_time:2375238ms step_avg:997.16ms | |
step:2393/6000 train_time:2376234ms step_avg:997.16ms | |
step:2394/6000 train_time:2377239ms step_avg:997.16ms | |
step:2395/6000 train_time:2378249ms step_avg:997.17ms | |
step:2396/6000 train_time:2379263ms step_avg:997.18ms | |
step:2397/6000 train_time:2380277ms step_avg:997.18ms | |
step:2398/6000 train_time:2381300ms step_avg:997.19ms | |
step:2399/6000 train_time:2382320ms step_avg:997.20ms | |
step:2400/6000 train_time:2383326ms step_avg:997.21ms | |
step:2400/6000 val_loss:2.5283 train_time:2383372ms step_avg:997.23ms perplexity:12.5323 param_count:85,137,462 | |
step:2401/6000 train_time:2384324ms step_avg:997.21ms | |
step:2402/6000 train_time:2385328ms step_avg:997.21ms | |
step:2403/6000 train_time:2386328ms step_avg:997.21ms | |
step:2404/6000 train_time:2387349ms step_avg:997.22ms | |
step:2405/6000 train_time:2388344ms step_avg:997.22ms | |
step:2406/6000 train_time:2389342ms step_avg:997.22ms | |
step:2407/6000 train_time:2390348ms step_avg:997.22ms | |
step:2408/6000 train_time:2391356ms step_avg:997.23ms | |
step:2409/6000 train_time:2392351ms step_avg:997.23ms | |
step:2410/6000 train_time:2393362ms step_avg:997.23ms | |
step:2411/6000 train_time:2394369ms step_avg:997.24ms | |
step:2412/6000 train_time:2395380ms step_avg:997.24ms | |
step:2413/6000 train_time:2396371ms step_avg:997.24ms | |
step:2414/6000 train_time:2397383ms step_avg:997.25ms | |
step:2415/6000 train_time:2398387ms step_avg:997.25ms | |
step:2416/6000 train_time:2399385ms step_avg:997.25ms | |
step:2417/6000 train_time:2400414ms step_avg:997.26ms | |
step:2418/6000 train_time:2401444ms step_avg:997.28ms | |
step:2419/6000 train_time:2402444ms step_avg:997.28ms | |
step:2420/6000 train_time:2403443ms step_avg:997.28ms | |
step:2421/6000 train_time:2404463ms step_avg:997.29ms | |
step:2422/6000 train_time:2405469ms step_avg:997.29ms | |
step:2423/6000 train_time:2406475ms step_avg:997.30ms | |
step:2424/6000 train_time:2407478ms step_avg:997.30ms | |
step:2425/6000 train_time:2408506ms step_avg:997.31ms | |
step:2425/6000 val_loss:2.5340 train_time:2408552ms step_avg:997.33ms perplexity:12.6041 param_count:85,137,462 | |
step:2426/6000 train_time:2409512ms step_avg:997.31ms | |
step:2427/6000 train_time:2410521ms step_avg:997.32ms | |
step:2428/6000 train_time:2411523ms step_avg:997.32ms | |
step:2429/6000 train_time:2412548ms step_avg:997.33ms | |
step:2430/6000 train_time:2413557ms step_avg:997.34ms | |
step:2431/6000 train_time:2414565ms step_avg:997.34ms | |
step:2432/6000 train_time:2415579ms step_avg:997.35ms | |
step:2433/6000 train_time:2416590ms step_avg:997.35ms | |
step:2434/6000 train_time:2417601ms step_avg:997.36ms | |
step:2435/6000 train_time:2418613ms step_avg:997.37ms | |
step:2436/6000 train_time:2419639ms step_avg:997.38ms | |
step:2437/6000 train_time:2420640ms step_avg:997.38ms | |
step:2438/6000 train_time:2421653ms step_avg:997.39ms | |
step:2439/6000 train_time:2422651ms step_avg:997.39ms | |
step:2440/6000 train_time:2423660ms step_avg:997.39ms | |
step:2441/6000 train_time:2424670ms step_avg:997.40ms | |
step:2442/6000 train_time:2425678ms step_avg:997.40ms | |
step:2443/6000 train_time:2426680ms step_avg:997.40ms | |
step:2444/6000 train_time:2427704ms step_avg:997.41ms | |
step:2445/6000 train_time:2428709ms step_avg:997.42ms | |
step:2446/6000 train_time:2429732ms step_avg:997.43ms | |
step:2447/6000 train_time:2430741ms step_avg:997.43ms | |
step:2448/6000 train_time:2431747ms step_avg:997.44ms | |
step:2449/6000 train_time:2432765ms step_avg:997.44ms | |
step:2450/6000 train_time:2433788ms step_avg:997.45ms | |
step:2450/6000 val_loss:2.5206 train_time:2433833ms step_avg:997.47ms perplexity:12.4356 param_count:85,137,462 | |
step:2451/6000 train_time:2434782ms step_avg:997.45ms | |
step:2452/6000 train_time:2435786ms step_avg:997.46ms | |
step:2453/6000 train_time:2436779ms step_avg:997.45ms | |
step:2454/6000 train_time:2437783ms step_avg:997.46ms | |
step:2455/6000 train_time:2438805ms step_avg:997.47ms | |
step:2456/6000 train_time:2439810ms step_avg:997.47ms | |
step:2457/6000 train_time:2440815ms step_avg:997.47ms | |
step:2458/6000 train_time:2441819ms step_avg:997.47ms | |
step:2459/6000 train_time:2442830ms step_avg:997.48ms | |
step:2460/6000 train_time:2443843ms step_avg:997.49ms | |
step:2461/6000 train_time:2444850ms step_avg:997.49ms | |
step:2462/6000 train_time:2445847ms step_avg:997.49ms | |
step:2463/6000 train_time:2446848ms step_avg:997.49ms | |
step:2464/6000 train_time:2447862ms step_avg:997.50ms | |
step:2465/6000 train_time:2448871ms step_avg:997.50ms | |
step:2466/6000 train_time:2449871ms step_avg:997.50ms | |
step:2467/6000 train_time:2450886ms step_avg:997.51ms | |
step:2468/6000 train_time:2451895ms step_avg:997.52ms | |
step:2469/6000 train_time:2452919ms step_avg:997.53ms | |
step:2470/6000 train_time:2453939ms step_avg:997.54ms | |
step:2471/6000 train_time:2454958ms step_avg:997.54ms | |
step:2472/6000 train_time:2455965ms step_avg:997.55ms | |
step:2473/6000 train_time:2456971ms step_avg:997.55ms | |
step:2474/6000 train_time:2457998ms step_avg:997.56ms | |
step:2475/6000 train_time:2459001ms step_avg:997.57ms | |
step:2475/6000 val_loss:2.5200 train_time:2459047ms step_avg:997.59ms perplexity:12.4281 param_count:85,137,462 | |
step:2476/6000 train_time:2460006ms step_avg:997.57ms | |
step:2477/6000 train_time:2461013ms step_avg:997.57ms | |
step:2478/6000 train_time:2462011ms step_avg:997.57ms | |
step:2479/6000 train_time:2463020ms step_avg:997.58ms | |
step:2480/6000 train_time:2464023ms step_avg:997.58ms | |
step:2481/6000 train_time:2465032ms step_avg:997.58ms | |
step:2482/6000 train_time:2466024ms step_avg:997.58ms | |
step:2483/6000 train_time:2467035ms step_avg:997.59ms | |
step:2484/6000 train_time:2468033ms step_avg:997.59ms | |
step:2485/6000 train_time:2469042ms step_avg:997.59ms | |
step:2486/6000 train_time:2470049ms step_avg:997.60ms | |
step:2487/6000 train_time:2471061ms step_avg:997.60ms | |
step:2488/6000 train_time:2472072ms step_avg:997.61ms | |
step:2489/6000 train_time:2473090ms step_avg:997.62ms | |
step:2490/6000 train_time:2474089ms step_avg:997.62ms | |
step:2491/6000 train_time:2475111ms step_avg:997.63ms | |
step:2492/6000 train_time:2476114ms step_avg:997.63ms | |
step:2493/6000 train_time:2477125ms step_avg:997.63ms | |
step:2494/6000 train_time:2478145ms step_avg:997.64ms | |
step:2495/6000 train_time:2479160ms step_avg:997.65ms | |
step:2496/6000 train_time:2480180ms step_avg:997.66ms | |
step:2497/6000 train_time:2481196ms step_avg:997.67ms | |
step:2498/6000 train_time:2482199ms step_avg:997.67ms | |
step:2499/6000 train_time:2483205ms step_avg:997.67ms | |
step:2500/6000 train_time:2484216ms step_avg:997.68ms | |
step:2500/6000 val_loss:2.5319 train_time:2484262ms step_avg:997.70ms perplexity:12.5778 param_count:85,137,462 | |
step:2501/6000 train_time:2485214ms step_avg:997.68ms | |
step:2502/6000 train_time:2486221ms step_avg:997.68ms | |
step:2503/6000 train_time:2487220ms step_avg:997.68ms | |
step:2504/6000 train_time:2488228ms step_avg:997.69ms | |
step:2505/6000 train_time:2489231ms step_avg:997.69ms | |
step:2506/6000 train_time:2490232ms step_avg:997.69ms | |
step:2507/6000 train_time:2491235ms step_avg:997.69ms | |
step:2508/6000 train_time:2492253ms step_avg:997.70ms | |
step:2509/6000 train_time:2493264ms step_avg:997.70ms | |
step:2510/6000 train_time:2494276ms step_avg:997.71ms | |
step:2511/6000 train_time:2495283ms step_avg:997.71ms | |
step:2512/6000 train_time:2496299ms step_avg:997.72ms | |
step:2513/6000 train_time:2497314ms step_avg:997.73ms | |
step:2514/6000 train_time:2498313ms step_avg:997.73ms | |
step:2515/6000 train_time:2499316ms step_avg:997.73ms | |
step:2516/6000 train_time:2500316ms step_avg:997.73ms | |
step:2517/6000 train_time:2501326ms step_avg:997.74ms | |
step:2518/6000 train_time:2502347ms step_avg:997.75ms | |
step:2519/6000 train_time:2503359ms step_avg:997.75ms | |
step:2520/6000 train_time:2504382ms step_avg:997.76ms | |
step:2521/6000 train_time:2505376ms step_avg:997.76ms | |
step:2522/6000 train_time:2506382ms step_avg:997.76ms | |
step:2523/6000 train_time:2507384ms step_avg:997.77ms | |
step:2524/6000 train_time:2508397ms step_avg:997.77ms | |
step:2525/6000 train_time:2509406ms step_avg:997.78ms | |
step:2525/6000 val_loss:2.5169 train_time:2509453ms step_avg:997.79ms perplexity:12.3896 param_count:85,137,462 | |
step:2526/6000 train_time:2510421ms step_avg:997.78ms | |
step:2527/6000 train_time:2511430ms step_avg:997.79ms | |
step:2528/6000 train_time:2512447ms step_avg:997.79ms | |
step:2529/6000 train_time:2513439ms step_avg:997.79ms | |
step:2530/6000 train_time:2514439ms step_avg:997.79ms | |
step:2531/6000 train_time:2515451ms step_avg:997.80ms | |
step:2532/6000 train_time:2516452ms step_avg:997.80ms | |
step:2533/6000 train_time:2517462ms step_avg:997.80ms | |
step:2534/6000 train_time:2518474ms step_avg:997.81ms | |
step:2535/6000 train_time:2519468ms step_avg:997.81ms | |
step:2536/6000 train_time:2520498ms step_avg:997.82ms | |
step:2537/6000 train_time:2521502ms step_avg:997.82ms | |
step:2538/6000 train_time:2522508ms step_avg:997.83ms | |
step:2539/6000 train_time:2523510ms step_avg:997.83ms | |
step:2540/6000 train_time:2524524ms step_avg:997.84ms | |
step:2541/6000 train_time:2525540ms step_avg:997.84ms | |
step:2542/6000 train_time:2526574ms step_avg:997.86ms | |
step:2543/6000 train_time:2527619ms step_avg:997.88ms | |
step:2544/6000 train_time:2528635ms step_avg:997.88ms | |
step:2545/6000 train_time:2529642ms step_avg:997.89ms | |
step:2546/6000 train_time:2530648ms step_avg:997.89ms | |
step:2547/6000 train_time:2531654ms step_avg:997.89ms | |
step:2548/6000 train_time:2532664ms step_avg:997.90ms | |
step:2549/6000 train_time:2533670ms step_avg:997.90ms | |
step:2550/6000 train_time:2534674ms step_avg:997.90ms | |
step:2550/6000 val_loss:2.5289 train_time:2534720ms step_avg:997.92ms perplexity:12.5393 param_count:85,137,462 | |
step:2551/6000 train_time:2535664ms step_avg:997.90ms | |
step:2552/6000 train_time:2536687ms step_avg:997.91ms | |
step:2553/6000 train_time:2537693ms step_avg:997.91ms | |
step:2554/6000 train_time:2538696ms step_avg:997.92ms | |
step:2555/6000 train_time:2539699ms step_avg:997.92ms | |
step:2556/6000 train_time:2540696ms step_avg:997.92ms | |
step:2557/6000 train_time:2541705ms step_avg:997.92ms | |
step:2558/6000 train_time:2542718ms step_avg:997.93ms | |
step:2559/6000 train_time:2543732ms step_avg:997.93ms | |
step:2560/6000 train_time:2544724ms step_avg:997.93ms | |
step:2561/6000 train_time:2545729ms step_avg:997.93ms | |
step:2562/6000 train_time:2546738ms step_avg:997.94ms | |
step:2563/6000 train_time:2547742ms step_avg:997.94ms | |
step:2564/6000 train_time:2548758ms step_avg:997.95ms | |
step:2565/6000 train_time:2549775ms step_avg:997.95ms | |
step:2566/6000 train_time:2550797ms step_avg:997.96ms | |
step:2567/6000 train_time:2551811ms step_avg:997.97ms | |
step:2568/6000 train_time:2552820ms step_avg:997.97ms | |
step:2569/6000 train_time:2553829ms step_avg:997.98ms | |
step:2570/6000 train_time:2554822ms step_avg:997.98ms | |
step:2571/6000 train_time:2555818ms step_avg:997.98ms | |
step:2572/6000 train_time:2556829ms step_avg:997.98ms | |
step:2573/6000 train_time:2557842ms step_avg:997.99ms | |
step:2574/6000 train_time:2558851ms step_avg:997.99ms | |
step:2575/6000 train_time:2559850ms step_avg:997.99ms | |
step:2575/6000 val_loss:2.5257 train_time:2559893ms step_avg:998.01ms perplexity:12.4999 param_count:85,137,462 | |
step:2576/6000 train_time:2560844ms step_avg:997.99ms | |
step:2577/6000 train_time:2561858ms step_avg:998.00ms | |
step:2578/6000 train_time:2562864ms step_avg:998.00ms | |
step:2579/6000 train_time:2563876ms step_avg:998.01ms | |
step:2580/6000 train_time:2564886ms step_avg:998.01ms | |
step:2581/6000 train_time:2565894ms step_avg:998.01ms | |
step:2582/6000 train_time:2566892ms step_avg:998.01ms | |
step:2583/6000 train_time:2567888ms step_avg:998.01ms | |
step:2584/6000 train_time:2568884ms step_avg:998.01ms | |
step:2585/6000 train_time:2569889ms step_avg:998.02ms | |
step:2586/6000 train_time:2570901ms step_avg:998.02ms | |
step:2587/6000 train_time:2571912ms step_avg:998.03ms | |
step:2588/6000 train_time:2572920ms step_avg:998.03ms | |
step:2589/6000 train_time:2573918ms step_avg:998.03ms | |
step:2590/6000 train_time:2574926ms step_avg:998.03ms | |
step:2591/6000 train_time:2575922ms step_avg:998.03ms | |
step:2592/6000 train_time:2576957ms step_avg:998.05ms | |
step:2593/6000 train_time:2577961ms step_avg:998.05ms | |
step:2594/6000 train_time:2578963ms step_avg:998.05ms | |
step:2595/6000 train_time:2579964ms step_avg:998.05ms | |
step:2596/6000 train_time:2580964ms step_avg:998.05ms | |
step:2597/6000 train_time:2581979ms step_avg:998.06ms | |
step:2598/6000 train_time:2582986ms step_avg:998.06ms | |
step:2599/6000 train_time:2584001ms step_avg:998.07ms | |
step:2600/6000 train_time:2585013ms step_avg:998.07ms | |
step:2600/6000 val_loss:2.5198 train_time:2585058ms step_avg:998.09ms perplexity:12.4263 param_count:85,137,462 | |
step:2601/6000 train_time:2586015ms step_avg:998.08ms | |
step:2602/6000 train_time:2587022ms step_avg:998.08ms | |
step:2603/6000 train_time:2588029ms step_avg:998.08ms | |
step:2604/6000 train_time:2589039ms step_avg:998.09ms | |
step:2605/6000 train_time:2590047ms step_avg:998.09ms | |
step:2606/6000 train_time:2591058ms step_avg:998.10ms | |
step:2607/6000 train_time:2592062ms step_avg:998.10ms | |
step:2608/6000 train_time:2593073ms step_avg:998.10ms | |
step:2609/6000 train_time:2594104ms step_avg:998.12ms | |
step:2610/6000 train_time:2595107ms step_avg:998.12ms | |
step:2611/6000 train_time:2596114ms step_avg:998.12ms | |
step:2612/6000 train_time:2597122ms step_avg:998.13ms | |
step:2613/6000 train_time:2598130ms step_avg:998.13ms | |
step:2614/6000 train_time:2599138ms step_avg:998.13ms | |
step:2615/6000 train_time:2600146ms step_avg:998.14ms | |
step:2616/6000 train_time:2601158ms step_avg:998.14ms | |
step:2617/6000 train_time:2602160ms step_avg:998.14ms | |
step:2618/6000 train_time:2603166ms step_avg:998.15ms | |
step:2619/6000 train_time:2604205ms step_avg:998.16ms | |
step:2620/6000 train_time:2605223ms step_avg:998.17ms | |
step:2621/6000 train_time:2606242ms step_avg:998.18ms | |
step:2622/6000 train_time:2607240ms step_avg:998.18ms | |
step:2623/6000 train_time:2608251ms step_avg:998.18ms | |
step:2624/6000 train_time:2609253ms step_avg:998.18ms | |
step:2625/6000 train_time:2610268ms step_avg:998.19ms | |
step:2625/6000 val_loss:2.5196 train_time:2610314ms step_avg:998.21ms perplexity:12.4242 param_count:85,137,462 | |
step:2626/6000 train_time:2611265ms step_avg:998.19ms | |
step:2627/6000 train_time:2612280ms step_avg:998.20ms | |
step:2628/6000 train_time:2613292ms step_avg:998.20ms | |
step:2629/6000 train_time:2614300ms step_avg:998.21ms | |
step:2630/6000 train_time:2615324ms step_avg:998.22ms | |
step:2631/6000 train_time:2616330ms step_avg:998.22ms | |
step:2632/6000 train_time:2617355ms step_avg:998.23ms | |
step:2633/6000 train_time:2618372ms step_avg:998.24ms | |
step:2634/6000 train_time:2619375ms step_avg:998.24ms | |
step:2635/6000 train_time:2620382ms step_avg:998.24ms | |
step:2636/6000 train_time:2621387ms step_avg:998.24ms | |
step:2637/6000 train_time:2622392ms step_avg:998.25ms | |
step:2638/6000 train_time:2623395ms step_avg:998.25ms | |
step:2639/6000 train_time:2624403ms step_avg:998.25ms | |
step:2640/6000 train_time:2625427ms step_avg:998.26ms | |
step:2641/6000 train_time:2626441ms step_avg:998.27ms | |
step:2642/6000 train_time:2627454ms step_avg:998.27ms | |
step:2643/6000 train_time:2628455ms step_avg:998.27ms | |
step:2644/6000 train_time:2629475ms step_avg:998.28ms | |
step:2645/6000 train_time:2630491ms step_avg:998.29ms | |
step:2646/6000 train_time:2631495ms step_avg:998.29ms | |
step:2647/6000 train_time:2632515ms step_avg:998.30ms | |
step:2648/6000 train_time:2633512ms step_avg:998.30ms | |
step:2649/6000 train_time:2634518ms step_avg:998.30ms | |
step:2650/6000 train_time:2635532ms step_avg:998.31ms | |
step:2650/6000 val_loss:2.5169 train_time:2635579ms step_avg:998.33ms perplexity:12.3901 param_count:85,137,462 | |
step:2651/6000 train_time:2636522ms step_avg:998.30ms | |
step:2652/6000 train_time:2637548ms step_avg:998.31ms | |
step:2653/6000 train_time:2638564ms step_avg:998.32ms | |
step:2654/6000 train_time:2639592ms step_avg:998.33ms | |
step:2655/6000 train_time:2640592ms step_avg:998.33ms | |
step:2656/6000 train_time:2641594ms step_avg:998.33ms | |
step:2657/6000 train_time:2642592ms step_avg:998.33ms | |
step:2658/6000 train_time:2643586ms step_avg:998.33ms | |
step:2659/6000 train_time:2644600ms step_avg:998.34ms | |
step:2660/6000 train_time:2645604ms step_avg:998.34ms | |
step:2661/6000 train_time:2646623ms step_avg:998.35ms | |
step:2662/6000 train_time:2647628ms step_avg:998.35ms | |
step:2663/6000 train_time:2648633ms step_avg:998.35ms | |
step:2664/6000 train_time:2649636ms step_avg:998.36ms | |
step:2665/6000 train_time:2650646ms step_avg:998.36ms | |
step:2666/6000 train_time:2651661ms step_avg:998.37ms | |
step:2667/6000 train_time:2652661ms step_avg:998.37ms | |
step:2668/6000 train_time:2653665ms step_avg:998.37ms | |
step:2669/6000 train_time:2654670ms step_avg:998.37ms | |
step:2670/6000 train_time:2655686ms step_avg:998.38ms | |
step:2671/6000 train_time:2656687ms step_avg:998.38ms | |
step:2672/6000 train_time:2657685ms step_avg:998.38ms | |
step:2673/6000 train_time:2658699ms step_avg:998.38ms | |
step:2674/6000 train_time:2659710ms step_avg:998.39ms | |
step:2675/6000 train_time:2660714ms step_avg:998.39ms | |
step:2675/6000 val_loss:2.5107 train_time:2660755ms step_avg:998.41ms perplexity:12.3141 param_count:85,137,462 | |
step:2676/6000 train_time:2661711ms step_avg:998.39ms | |
step:2677/6000 train_time:2662716ms step_avg:998.39ms | |
step:2678/6000 train_time:2663725ms step_avg:998.40ms | |
step:2679/6000 train_time:2664740ms step_avg:998.40ms | |
step:2680/6000 train_time:2665738ms step_avg:998.40ms | |
step:2681/6000 train_time:2666737ms step_avg:998.40ms | |
step:2682/6000 train_time:2667750ms step_avg:998.41ms | |
step:2683/6000 train_time:2668748ms step_avg:998.41ms | |
step:2684/6000 train_time:2669747ms step_avg:998.41ms | |
step:2685/6000 train_time:2670739ms step_avg:998.41ms | |
step:2686/6000 train_time:2671761ms step_avg:998.42ms | |
step:2687/6000 train_time:2672771ms step_avg:998.42ms | |
step:2688/6000 train_time:2673782ms step_avg:998.42ms | |
step:2689/6000 train_time:2674797ms step_avg:998.43ms | |
step:2690/6000 train_time:2675803ms step_avg:998.43ms | |
step:2691/6000 train_time:2676809ms step_avg:998.44ms | |
step:2692/6000 train_time:2677809ms step_avg:998.44ms | |
step:2693/6000 train_time:2678813ms step_avg:998.44ms | |
step:2694/6000 train_time:2679820ms step_avg:998.44ms | |
step:2695/6000 train_time:2680850ms step_avg:998.45ms | |
step:2696/6000 train_time:2681847ms step_avg:998.45ms | |
step:2697/6000 train_time:2682847ms step_avg:998.45ms | |
step:2698/6000 train_time:2683862ms step_avg:998.46ms | |
step:2699/6000 train_time:2684876ms step_avg:998.47ms | |
step:2700/6000 train_time:2685878ms step_avg:998.47ms | |
step:2700/6000 val_loss:2.5061 train_time:2685925ms step_avg:998.48ms perplexity:12.2565 param_count:85,137,462 | |
step:2701/6000 train_time:2686873ms step_avg:998.47ms | |
step:2702/6000 train_time:2687886ms step_avg:998.47ms | |
step:2703/6000 train_time:2688883ms step_avg:998.47ms | |
step:2704/6000 train_time:2689885ms step_avg:998.47ms | |
step:2705/6000 train_time:2690885ms step_avg:998.47ms | |
step:2706/6000 train_time:2691894ms step_avg:998.48ms | |
step:2707/6000 train_time:2692898ms step_avg:998.48ms | |
step:2708/6000 train_time:2693917ms step_avg:998.49ms | |
step:2709/6000 train_time:2694918ms step_avg:998.49ms | |
step:2710/6000 train_time:2695926ms step_avg:998.49ms | |
step:2711/6000 train_time:2696928ms step_avg:998.49ms | |
step:2712/6000 train_time:2697940ms step_avg:998.50ms | |
step:2713/6000 train_time:2698961ms step_avg:998.51ms | |
step:2714/6000 train_time:2699965ms step_avg:998.51ms | |
step:2715/6000 train_time:2700978ms step_avg:998.51ms | |
step:2716/6000 train_time:2701986ms step_avg:998.52ms | |
step:2717/6000 train_time:2703000ms step_avg:998.52ms | |
step:2718/6000 train_time:2704002ms step_avg:998.52ms | |
step:2719/6000 train_time:2705005ms step_avg:998.53ms | |
step:2720/6000 train_time:2706001ms step_avg:998.52ms | |
step:2721/6000 train_time:2707020ms step_avg:998.53ms | |
step:2722/6000 train_time:2708025ms step_avg:998.53ms | |
step:2723/6000 train_time:2709034ms step_avg:998.54ms | |
step:2724/6000 train_time:2710058ms step_avg:998.55ms | |
step:2725/6000 train_time:2711071ms step_avg:998.55ms | |
step:2725/6000 val_loss:2.5173 train_time:2711117ms step_avg:998.57ms perplexity:12.3951 param_count:85,137,462 | |
step:2726/6000 train_time:2712076ms step_avg:998.56ms | |
step:2727/6000 train_time:2713083ms step_avg:998.56ms | |
step:2728/6000 train_time:2714095ms step_avg:998.56ms | |
step:2729/6000 train_time:2715095ms step_avg:998.56ms | |
step:2730/6000 train_time:2716103ms step_avg:998.57ms | |
step:2731/6000 train_time:2717108ms step_avg:998.57ms | |
step:2732/6000 train_time:2718115ms step_avg:998.57ms | |
step:2733/6000 train_time:2719119ms step_avg:998.57ms | |
step:2734/6000 train_time:2720121ms step_avg:998.58ms | |
step:2735/6000 train_time:2721128ms step_avg:998.58ms | |
step:2736/6000 train_time:2722131ms step_avg:998.58ms | |
step:2737/6000 train_time:2723139ms step_avg:998.58ms | |
step:2738/6000 train_time:2724142ms step_avg:998.59ms | |
step:2739/6000 train_time:2725153ms step_avg:998.59ms | |
step:2740/6000 train_time:2726161ms step_avg:998.59ms | |
step:2741/6000 train_time:2727171ms step_avg:998.60ms | |
step:2742/6000 train_time:2728172ms step_avg:998.60ms | |
step:2743/6000 train_time:2729171ms step_avg:998.60ms | |
step:2744/6000 train_time:2730170ms step_avg:998.60ms | |
step:2745/6000 train_time:2731184ms step_avg:998.60ms | |
step:2746/6000 train_time:2732233ms step_avg:998.62ms | |
step:2747/6000 train_time:2733256ms step_avg:998.63ms | |
step:2748/6000 train_time:2734291ms step_avg:998.65ms | |
step:2749/6000 train_time:2735288ms step_avg:998.64ms | |
step:2750/6000 train_time:2736308ms step_avg:998.65ms | |
step:2750/6000 val_loss:2.5213 train_time:2736354ms step_avg:998.67ms perplexity:12.4442 param_count:85,137,462 | |
step:2751/6000 train_time:2737321ms step_avg:998.66ms | |
step:2752/6000 train_time:2738329ms step_avg:998.66ms | |
step:2753/6000 train_time:2739330ms step_avg:998.66ms | |
step:2754/6000 train_time:2740338ms step_avg:998.67ms | |
step:2755/6000 train_time:2741336ms step_avg:998.67ms | |
step:2756/6000 train_time:2742347ms step_avg:998.67ms | |
step:2757/6000 train_time:2743360ms step_avg:998.67ms | |
step:2758/6000 train_time:2744362ms step_avg:998.68ms | |
step:2759/6000 train_time:2745377ms step_avg:998.68ms | |
step:2760/6000 train_time:2746376ms step_avg:998.68ms | |
step:2761/6000 train_time:2747381ms step_avg:998.68ms | |
step:2762/6000 train_time:2748391ms step_avg:998.69ms | |
step:2763/6000 train_time:2749392ms step_avg:998.69ms | |
step:2764/6000 train_time:2750396ms step_avg:998.69ms | |
step:2765/6000 train_time:2751407ms step_avg:998.70ms | |
step:2766/6000 train_time:2752420ms step_avg:998.70ms | |
step:2767/6000 train_time:2753414ms step_avg:998.70ms | |
step:2768/6000 train_time:2754430ms step_avg:998.71ms | |
step:2769/6000 train_time:2755448ms step_avg:998.71ms | |
step:2770/6000 train_time:2756443ms step_avg:998.71ms | |
step:2771/6000 train_time:2757449ms step_avg:998.71ms | |
step:2772/6000 train_time:2758457ms step_avg:998.72ms | |
step:2773/6000 train_time:2759466ms step_avg:998.72ms | |
step:2774/6000 train_time:2760465ms step_avg:998.72ms | |
step:2775/6000 train_time:2761473ms step_avg:998.72ms | |
step:2775/6000 val_loss:2.5174 train_time:2761519ms step_avg:998.74ms perplexity:12.3962 param_count:85,137,462 | |
step:2776/6000 train_time:2762484ms step_avg:998.73ms | |
step:2777/6000 train_time:2763485ms step_avg:998.73ms | |
step:2778/6000 train_time:2764487ms step_avg:998.73ms | |
step:2779/6000 train_time:2765485ms step_avg:998.73ms | |
step:2780/6000 train_time:2766490ms step_avg:998.73ms | |
step:2781/6000 train_time:2767498ms step_avg:998.74ms | |
step:2782/6000 train_time:2768493ms step_avg:998.73ms | |
step:2783/6000 train_time:2769509ms step_avg:998.74ms | |
step:2784/6000 train_time:2770514ms step_avg:998.74ms | |
step:2785/6000 train_time:2771511ms step_avg:998.74ms | |
step:2786/6000 train_time:2772514ms step_avg:998.74ms | |
step:2787/6000 train_time:2773524ms step_avg:998.75ms | |
step:2788/6000 train_time:2774538ms step_avg:998.75ms | |
step:2789/6000 train_time:2775557ms step_avg:998.76ms | |
step:2790/6000 train_time:2776560ms step_avg:998.76ms | |
step:2791/6000 train_time:2777567ms step_avg:998.77ms | |
step:2792/6000 train_time:2778576ms step_avg:998.77ms | |
step:2793/6000 train_time:2779582ms step_avg:998.77ms | |
step:2794/6000 train_time:2780606ms step_avg:998.78ms | |
step:2795/6000 train_time:2781617ms step_avg:998.79ms | |
step:2796/6000 train_time:2782629ms step_avg:998.79ms | |
step:2797/6000 train_time:2783638ms step_avg:998.79ms | |
step:2798/6000 train_time:2784639ms step_avg:998.79ms | |
step:2799/6000 train_time:2785661ms step_avg:998.80ms | |
step:2800/6000 train_time:2786675ms step_avg:998.81ms | |
step:2800/6000 val_loss:2.4997 train_time:2786720ms step_avg:998.82ms perplexity:12.1789 param_count:85,137,462 | |
step:2801/6000 train_time:2787677ms step_avg:998.81ms | |
step:2802/6000 train_time:2788679ms step_avg:998.81ms | |
step:2803/6000 train_time:2789688ms step_avg:998.81ms | |
step:2804/6000 train_time:2790690ms step_avg:998.82ms | |
step:2805/6000 train_time:2791703ms step_avg:998.82ms | |
step:2806/6000 train_time:2792713ms step_avg:998.82ms | |
step:2807/6000 train_time:2793776ms step_avg:998.85ms | |
step:2808/6000 train_time:2794786ms step_avg:998.85ms | |
step:2809/6000 train_time:2795793ms step_avg:998.85ms | |
step:2810/6000 train_time:2796815ms step_avg:998.86ms | |
step:2811/6000 train_time:2797829ms step_avg:998.87ms | |
step:2812/6000 train_time:2798831ms step_avg:998.87ms | |
step:2813/6000 train_time:2799836ms step_avg:998.87ms | |
step:2814/6000 train_time:2800842ms step_avg:998.87ms | |
step:2815/6000 train_time:2801849ms step_avg:998.88ms | |
step:2816/6000 train_time:2802851ms step_avg:998.88ms | |
step:2817/6000 train_time:2803849ms step_avg:998.88ms | |
step:2818/6000 train_time:2804855ms step_avg:998.88ms | |
step:2819/6000 train_time:2805876ms step_avg:998.89ms | |
step:2820/6000 train_time:2806895ms step_avg:998.90ms | |
step:2821/6000 train_time:2807903ms step_avg:998.90ms | |
step:2822/6000 train_time:2808917ms step_avg:998.90ms | |
step:2823/6000 train_time:2809929ms step_avg:998.91ms | |
step:2824/6000 train_time:2810934ms step_avg:998.91ms | |
step:2825/6000 train_time:2811952ms step_avg:998.92ms | |
step:2825/6000 val_loss:2.5166 train_time:2811997ms step_avg:998.93ms perplexity:12.3862 param_count:85,137,462 | |
step:2826/6000 train_time:2812949ms step_avg:998.92ms | |
step:2827/6000 train_time:2813982ms step_avg:998.93ms | |
step:2828/6000 train_time:2814990ms step_avg:998.93ms | |
step:2829/6000 train_time:2815997ms step_avg:998.93ms | |
step:2830/6000 train_time:2817007ms step_avg:998.94ms | |
step:2831/6000 train_time:2818016ms step_avg:998.94ms | |
step:2832/6000 train_time:2819014ms step_avg:998.94ms | |
step:2833/6000 train_time:2820022ms step_avg:998.95ms | |
step:2834/6000 train_time:2821025ms step_avg:998.95ms | |
step:2835/6000 train_time:2822034ms step_avg:998.95ms | |
step:2836/6000 train_time:2823028ms step_avg:998.95ms | |
step:2837/6000 train_time:2824036ms step_avg:998.95ms | |
step:2838/6000 train_time:2825040ms step_avg:998.95ms | |
step:2839/6000 train_time:2826060ms step_avg:998.96ms | |
step:2840/6000 train_time:2827062ms step_avg:998.96ms | |
step:2841/6000 train_time:2828070ms step_avg:998.97ms | |
step:2842/6000 train_time:2829080ms step_avg:998.97ms | |
step:2843/6000 train_time:2830089ms step_avg:998.97ms | |
step:2844/6000 train_time:2831093ms step_avg:998.97ms | |
step:2845/6000 train_time:2832093ms step_avg:998.97ms | |
step:2846/6000 train_time:2833098ms step_avg:998.98ms | |
step:2847/6000 train_time:2834102ms step_avg:998.98ms | |
step:2848/6000 train_time:2835106ms step_avg:998.98ms | |
step:2849/6000 train_time:2836112ms step_avg:998.98ms | |
step:2850/6000 train_time:2837128ms step_avg:998.99ms | |
step:2850/6000 val_loss:2.5066 train_time:2837174ms step_avg:999.00ms perplexity:12.2626 param_count:85,137,462 | |
step:2851/6000 train_time:2838137ms step_avg:998.99ms | |
step:2852/6000 train_time:2839140ms step_avg:998.99ms | |
step:2853/6000 train_time:2840152ms step_avg:999.00ms | |
step:2854/6000 train_time:2841156ms step_avg:999.00ms | |
step:2855/6000 train_time:2842167ms step_avg:999.00ms | |
step:2856/6000 train_time:2843177ms step_avg:999.01ms | |
step:2857/6000 train_time:2844184ms step_avg:999.01ms | |
step:2858/6000 train_time:2845183ms step_avg:999.01ms | |
step:2859/6000 train_time:2846181ms step_avg:999.01ms | |
step:2860/6000 train_time:2847186ms step_avg:999.01ms | |
step:2861/6000 train_time:2848191ms step_avg:999.01ms | |
step:2862/6000 train_time:2849198ms step_avg:999.02ms | |
step:2863/6000 train_time:2850216ms step_avg:999.02ms | |
step:2864/6000 train_time:2851223ms step_avg:999.03ms | |
step:2865/6000 train_time:2852230ms step_avg:999.03ms | |
step:2866/6000 train_time:2853242ms step_avg:999.03ms | |
step:2867/6000 train_time:2854245ms step_avg:999.04ms | |
step:2868/6000 train_time:2855249ms step_avg:999.04ms | |
step:2869/6000 train_time:2856252ms step_avg:999.04ms | |
step:2870/6000 train_time:2857263ms step_avg:999.04ms | |
step:2871/6000 train_time:2858274ms step_avg:999.05ms | |
step:2872/6000 train_time:2859284ms step_avg:999.05ms | |
step:2873/6000 train_time:2860297ms step_avg:999.06ms | |
step:2874/6000 train_time:2861329ms step_avg:999.07ms | |
step:2875/6000 train_time:2862332ms step_avg:999.07ms | |
step:2875/6000 val_loss:2.5086 train_time:2862378ms step_avg:999.08ms perplexity:12.2872 param_count:85,137,462 | |
step:2876/6000 train_time:2863344ms step_avg:999.07ms | |
step:2877/6000 train_time:2864342ms step_avg:999.07ms | |
step:2878/6000 train_time:2865355ms step_avg:999.08ms | |
step:2879/6000 train_time:2866367ms step_avg:999.08ms | |
step:2880/6000 train_time:2867391ms step_avg:999.09ms | |
step:2881/6000 train_time:2868393ms step_avg:999.09ms | |
step:2882/6000 train_time:2869404ms step_avg:999.10ms | |
step:2883/6000 train_time:2870423ms step_avg:999.10ms | |
step:2884/6000 train_time:2871436ms step_avg:999.11ms | |
step:2885/6000 train_time:2872441ms step_avg:999.11ms | |
step:2886/6000 train_time:2873444ms step_avg:999.11ms | |
step:2887/6000 train_time:2874463ms step_avg:999.12ms | |
step:2888/6000 train_time:2875462ms step_avg:999.12ms | |
step:2889/6000 train_time:2876473ms step_avg:999.12ms | |
step:2890/6000 train_time:2877488ms step_avg:999.13ms | |
step:2891/6000 train_time:2878508ms step_avg:999.13ms | |
step:2892/6000 train_time:2879511ms step_avg:999.14ms | |
step:2893/6000 train_time:2880515ms step_avg:999.14ms | |
step:2894/6000 train_time:2881523ms step_avg:999.14ms | |
step:2895/6000 train_time:2882526ms step_avg:999.14ms | |
step:2896/6000 train_time:2883534ms step_avg:999.15ms | |
step:2897/6000 train_time:2884562ms step_avg:999.16ms | |
step:2898/6000 train_time:2885562ms step_avg:999.16ms | |
step:2899/6000 train_time:2886546ms step_avg:999.15ms | |
step:2900/6000 train_time:2887536ms step_avg:999.15ms | |
step:2900/6000 val_loss:2.5072 train_time:2887579ms step_avg:999.16ms perplexity:12.2708 param_count:85,137,462 | |
step:2901/6000 train_time:2888535ms step_avg:999.15ms | |
step:2902/6000 train_time:2889553ms step_avg:999.15ms | |
step:2903/6000 train_time:2890559ms step_avg:999.16ms | |
step:2904/6000 train_time:2891564ms step_avg:999.16ms | |
step:2905/6000 train_time:2892564ms step_avg:999.16ms | |
step:2906/6000 train_time:2893584ms step_avg:999.17ms | |
step:2907/6000 train_time:2894595ms step_avg:999.17ms | |
step:2908/6000 train_time:2895604ms step_avg:999.17ms | |
step:2909/6000 train_time:2896609ms step_avg:999.18ms | |
step:2910/6000 train_time:2897617ms step_avg:999.18ms | |
step:2911/6000 train_time:2898629ms step_avg:999.18ms | |
step:2912/6000 train_time:2899632ms step_avg:999.18ms | |
step:2913/6000 train_time:2900647ms step_avg:999.19ms | |
step:2914/6000 train_time:2901656ms step_avg:999.19ms | |
step:2915/6000 train_time:2902662ms step_avg:999.20ms | |
step:2916/6000 train_time:2903667ms step_avg:999.20ms | |
step:2917/6000 train_time:2904677ms step_avg:999.20ms | |
step:2918/6000 train_time:2905683ms step_avg:999.20ms | |
step:2919/6000 train_time:2906698ms step_avg:999.21ms | |
step:2920/6000 train_time:2907703ms step_avg:999.21ms | |
step:2921/6000 train_time:2908709ms step_avg:999.21ms | |
step:2922/6000 train_time:2909725ms step_avg:999.22ms | |
step:2923/6000 train_time:2910734ms step_avg:999.22ms | |
step:2924/6000 train_time:2911775ms step_avg:999.24ms | |
step:2925/6000 train_time:2912780ms step_avg:999.24ms | |
step:2925/6000 val_loss:2.5006 train_time:2912826ms step_avg:999.25ms perplexity:12.1894 param_count:85,137,462 | |
step:2926/6000 train_time:2913775ms step_avg:999.24ms | |
step:2927/6000 train_time:2914770ms step_avg:999.24ms | |
step:2928/6000 train_time:2915767ms step_avg:999.23ms | |
step:2929/6000 train_time:2916777ms step_avg:999.24ms | |
step:2930/6000 train_time:2917773ms step_avg:999.24ms | |
step:2931/6000 train_time:2918796ms step_avg:999.25ms | |
step:2932/6000 train_time:2919800ms step_avg:999.25ms | |
step:2933/6000 train_time:2920800ms step_avg:999.25ms | |
step:2934/6000 train_time:2921815ms step_avg:999.25ms | |
step:2935/6000 train_time:2922817ms step_avg:999.25ms | |
step:2936/6000 train_time:2923820ms step_avg:999.25ms | |
step:2937/6000 train_time:2924828ms step_avg:999.26ms | |
step:2938/6000 train_time:2925820ms step_avg:999.26ms | |
step:2939/6000 train_time:2926827ms step_avg:999.26ms | |
step:2940/6000 train_time:2927839ms step_avg:999.26ms | |
step:2941/6000 train_time:2928862ms step_avg:999.27ms | |
step:2942/6000 train_time:2929882ms step_avg:999.28ms | |
step:2943/6000 train_time:2930898ms step_avg:999.28ms | |
step:2944/6000 train_time:2931901ms step_avg:999.28ms | |
step:2945/6000 train_time:2932909ms step_avg:999.29ms | |
step:2946/6000 train_time:2933910ms step_avg:999.29ms | |
step:2947/6000 train_time:2934914ms step_avg:999.29ms | |
step:2948/6000 train_time:2935928ms step_avg:999.29ms | |
step:2949/6000 train_time:2936947ms step_avg:999.30ms | |
step:2950/6000 train_time:2937971ms step_avg:999.31ms | |
step:2950/6000 val_loss:2.5080 train_time:2938018ms step_avg:999.33ms perplexity:12.2801 param_count:85,137,462 | |
step:2951/6000 train_time:2938966ms step_avg:999.31ms | |
step:2952/6000 train_time:2939967ms step_avg:999.31ms | |
step:2953/6000 train_time:2940974ms step_avg:999.31ms | |
step:2954/6000 train_time:2941977ms step_avg:999.31ms | |
step:2955/6000 train_time:2942982ms step_avg:999.31ms | |
step:2956/6000 train_time:2943987ms step_avg:999.32ms | |
step:2957/6000 train_time:2944991ms step_avg:999.32ms | |
step:2958/6000 train_time:2945998ms step_avg:999.32ms | |
step:2959/6000 train_time:2947010ms step_avg:999.33ms | |
step:2960/6000 train_time:2948031ms step_avg:999.33ms | |
step:2961/6000 train_time:2949045ms step_avg:999.34ms | |
step:2962/6000 train_time:2950053ms step_avg:999.34ms | |
step:2963/6000 train_time:2951055ms step_avg:999.34ms | |
step:2964/6000 train_time:2952069ms step_avg:999.35ms | |
step:2965/6000 train_time:2953074ms step_avg:999.35ms | |
step:2966/6000 train_time:2954082ms step_avg:999.35ms | |
step:2967/6000 train_time:2955100ms step_avg:999.36ms | |
step:2968/6000 train_time:2956122ms step_avg:999.36ms | |
step:2969/6000 train_time:2957125ms step_avg:999.37ms | |
step:2970/6000 train_time:2958132ms step_avg:999.37ms | |
step:2971/6000 train_time:2959170ms step_avg:999.38ms | |
step:2972/6000 train_time:2960188ms step_avg:999.39ms | |
step:2973/6000 train_time:2961185ms step_avg:999.39ms | |
step:2974/6000 train_time:2962182ms step_avg:999.39ms | |
step:2975/6000 train_time:2963186ms step_avg:999.39ms | |
step:2975/6000 val_loss:2.5030 train_time:2963232ms step_avg:999.40ms perplexity:12.2196 param_count:85,137,462 | |
step:2976/6000 train_time:2964187ms step_avg:999.39ms | |
step:2977/6000 train_time:2965241ms step_avg:999.41ms | |
step:2978/6000 train_time:2966260ms step_avg:999.41ms | |
step:2979/6000 train_time:2967269ms step_avg:999.42ms | |
step:2980/6000 train_time:2968266ms step_avg:999.42ms | |
step:2981/6000 train_time:2969259ms step_avg:999.41ms | |
step:2982/6000 train_time:2970276ms step_avg:999.42ms | |
step:2983/6000 train_time:2971284ms step_avg:999.42ms | |
step:2984/6000 train_time:2972287ms step_avg:999.42ms | |
step:2985/6000 train_time:2973291ms step_avg:999.43ms | |
step:2986/6000 train_time:2974308ms step_avg:999.43ms | |
step:2987/6000 train_time:2975325ms step_avg:999.44ms | |
step:2988/6000 train_time:2976333ms step_avg:999.44ms | |
step:2989/6000 train_time:2977347ms step_avg:999.45ms | |
step:2990/6000 train_time:2978355ms step_avg:999.45ms | |
step:2991/6000 train_time:2979357ms step_avg:999.45ms | |
step:2992/6000 train_time:2980372ms step_avg:999.45ms | |
step:2993/6000 train_time:2981375ms step_avg:999.46ms | |
step:2994/6000 train_time:2982394ms step_avg:999.46ms | |
step:2995/6000 train_time:2983408ms step_avg:999.47ms | |
step:2996/6000 train_time:2984417ms step_avg:999.47ms | |
step:2997/6000 train_time:2985428ms step_avg:999.47ms | |
step:2998/6000 train_time:2986435ms step_avg:999.48ms | |
step:2999/6000 train_time:2987440ms step_avg:999.48ms | |
step:3000/6000 train_time:2988447ms step_avg:999.48ms | |
step:3000/6000 val_loss:2.5188 train_time:2988494ms step_avg:999.50ms perplexity:12.4139 param_count:85,137,462 | |
step:3001/6000 train_time:2989450ms step_avg:999.48ms | |
step:3002/6000 train_time:2990460ms step_avg:999.49ms | |
step:3003/6000 train_time:2991462ms step_avg:999.49ms | |
step:3004/6000 train_time:2992471ms step_avg:999.49ms | |
step:3005/6000 train_time:2993480ms step_avg:999.49ms | |
step:3006/6000 train_time:2994484ms step_avg:999.49ms | |
step:3007/6000 train_time:2995499ms step_avg:999.50ms | |
step:3008/6000 train_time:2996513ms step_avg:999.50ms | |
step:3009/6000 train_time:2997510ms step_avg:999.50ms | |
step:3010/6000 train_time:2998524ms step_avg:999.51ms | |
step:3011/6000 train_time:2999538ms step_avg:999.51ms | |
step:3012/6000 train_time:3000537ms step_avg:999.51ms | |
step:3013/6000 train_time:3001552ms step_avg:999.52ms | |
step:3014/6000 train_time:3002556ms step_avg:999.52ms | |
step:3015/6000 train_time:3003574ms step_avg:999.53ms | |
step:3016/6000 train_time:3004588ms step_avg:999.53ms | |
step:3017/6000 train_time:3005601ms step_avg:999.53ms | |
step:3018/6000 train_time:3006627ms step_avg:999.54ms | |
step:3019/6000 train_time:3007626ms step_avg:999.54ms | |
step:3020/6000 train_time:3008623ms step_avg:999.54ms | |
step:3021/6000 train_time:3009627ms step_avg:999.54ms | |
step:3022/6000 train_time:3010654ms step_avg:999.55ms | |
step:3023/6000 train_time:3011656ms step_avg:999.55ms | |
step:3024/6000 train_time:3012666ms step_avg:999.56ms | |
step:3025/6000 train_time:3013679ms step_avg:999.56ms | |
step:3025/6000 val_loss:2.5108 train_time:3013726ms step_avg:999.58ms perplexity:12.3145 param_count:85,137,462 | |
step:3026/6000 train_time:3014672ms step_avg:999.56ms | |
step:3027/6000 train_time:3015680ms step_avg:999.56ms | |
step:3028/6000 train_time:3016684ms step_avg:999.56ms | |
step:3029/6000 train_time:3017687ms step_avg:999.57ms | |
step:3030/6000 train_time:3018704ms step_avg:999.57ms | |
step:3031/6000 train_time:3019718ms step_avg:999.58ms | |
step:3032/6000 train_time:3020719ms step_avg:999.58ms | |
step:3033/6000 train_time:3021719ms step_avg:999.58ms | |
step:3034/6000 train_time:3022737ms step_avg:999.58ms | |
step:3035/6000 train_time:3023755ms step_avg:999.59ms | |
step:3036/6000 train_time:3024761ms step_avg:999.59ms | |
step:3037/6000 train_time:3025774ms step_avg:999.59ms | |
step:3038/6000 train_time:3026795ms step_avg:999.60ms | |
step:3039/6000 train_time:3027798ms step_avg:999.60ms | |
step:3040/6000 train_time:3028809ms step_avg:999.61ms | |
step:3041/6000 train_time:3029813ms step_avg:999.61ms | |
step:3042/6000 train_time:3030860ms step_avg:999.62ms | |
step:3043/6000 train_time:3031880ms step_avg:999.63ms | |
step:3044/6000 train_time:3032891ms step_avg:999.63ms | |
step:3045/6000 train_time:3033893ms step_avg:999.64ms | |
step:3046/6000 train_time:3034901ms step_avg:999.64ms | |
step:3047/6000 train_time:3035912ms step_avg:999.64ms | |
step:3048/6000 train_time:3036918ms step_avg:999.64ms | |
step:3049/6000 train_time:3037936ms step_avg:999.65ms | |
step:3050/6000 train_time:3038957ms step_avg:999.66ms | |
step:3050/6000 val_loss:2.4989 train_time:3039002ms step_avg:999.67ms perplexity:12.1686 param_count:85,137,462 | |
step:3051/6000 train_time:3039965ms step_avg:999.66ms | |
step:3052/6000 train_time:3040975ms step_avg:999.66ms | |
step:3053/6000 train_time:3041987ms step_avg:999.67ms | |
step:3054/6000 train_time:3042989ms step_avg:999.67ms | |
step:3055/6000 train_time:3043992ms step_avg:999.67ms | |
step:3056/6000 train_time:3045024ms step_avg:999.68ms | |
step:3057/6000 train_time:3046014ms step_avg:999.68ms | |
step:3058/6000 train_time:3047018ms step_avg:999.68ms | |
step:3059/6000 train_time:3048023ms step_avg:999.68ms | |
step:3060/6000 train_time:3049046ms step_avg:999.69ms | |
step:3061/6000 train_time:3050069ms step_avg:999.69ms | |
step:3062/6000 train_time:3051086ms step_avg:999.70ms | |
step:3063/6000 train_time:3052082ms step_avg:999.70ms | |
step:3064/6000 train_time:3053095ms step_avg:999.70ms | |
step:3065/6000 train_time:3054107ms step_avg:999.71ms | |
step:3066/6000 train_time:3055108ms step_avg:999.71ms | |
step:3067/6000 train_time:3056118ms step_avg:999.71ms | |
step:3068/6000 train_time:3057146ms step_avg:999.72ms | |
step:3069/6000 train_time:3058151ms step_avg:999.72ms | |
step:3070/6000 train_time:3059161ms step_avg:999.73ms | |
step:3071/6000 train_time:3060173ms step_avg:999.73ms | |
step:3072/6000 train_time:3061189ms step_avg:999.74ms | |
step:3073/6000 train_time:3062197ms step_avg:999.74ms | |
step:3074/6000 train_time:3063198ms step_avg:999.74ms | |
step:3075/6000 train_time:3064216ms step_avg:999.74ms | |
step:3075/6000 val_loss:2.5051 train_time:3064262ms step_avg:999.76ms perplexity:12.2448 param_count:85,137,462 | |
step:3076/6000 train_time:3065231ms step_avg:999.75ms | |
step:3077/6000 train_time:3066246ms step_avg:999.75ms | |
step:3078/6000 train_time:3067250ms step_avg:999.76ms | |
step:3079/6000 train_time:3068257ms step_avg:999.76ms | |
step:3080/6000 train_time:3069276ms step_avg:999.76ms | |
step:3081/6000 train_time:3070278ms step_avg:999.76ms | |
step:3082/6000 train_time:3071285ms step_avg:999.77ms | |
step:3083/6000 train_time:3072301ms step_avg:999.77ms | |
step:3084/6000 train_time:3073314ms step_avg:999.78ms | |
step:3085/6000 train_time:3074334ms step_avg:999.78ms | |
step:3086/6000 train_time:3075338ms step_avg:999.78ms | |
step:3087/6000 train_time:3076337ms step_avg:999.78ms | |
step:3088/6000 train_time:3077349ms step_avg:999.79ms | |
step:3089/6000 train_time:3078365ms step_avg:999.79ms | |
step:3090/6000 train_time:3079371ms step_avg:999.80ms | |
step:3091/6000 train_time:3080372ms step_avg:999.80ms | |
step:3092/6000 train_time:3081389ms step_avg:999.80ms | |
step:3093/6000 train_time:3082416ms step_avg:999.81ms | |
step:3094/6000 train_time:3083423ms step_avg:999.81ms | |
step:3095/6000 train_time:3084466ms step_avg:999.83ms | |
step:3096/6000 train_time:3085468ms step_avg:999.83ms | |
step:3097/6000 train_time:3086479ms step_avg:999.83ms | |
step:3098/6000 train_time:3087487ms step_avg:999.83ms | |
step:3099/6000 train_time:3088493ms step_avg:999.84ms | |
step:3100/6000 train_time:3089501ms step_avg:999.84ms | |
step:3100/6000 val_loss:2.5057 train_time:3089547ms step_avg:999.85ms perplexity:12.2526 param_count:85,137,462 | |
step:3101/6000 train_time:3090497ms step_avg:999.84ms | |
step:3102/6000 train_time:3091506ms step_avg:999.84ms | |
step:3103/6000 train_time:3092513ms step_avg:999.84ms | |
step:3104/6000 train_time:3093516ms step_avg:999.84ms | |
step:3105/6000 train_time:3094520ms step_avg:999.85ms | |
step:3106/6000 train_time:3095536ms step_avg:999.85ms | |
step:3107/6000 train_time:3096542ms step_avg:999.85ms | |
step:3108/6000 train_time:3097555ms step_avg:999.86ms | |
step:3109/6000 train_time:3098558ms step_avg:999.86ms | |
step:3110/6000 train_time:3099573ms step_avg:999.86ms | |
step:3111/6000 train_time:3100575ms step_avg:999.86ms | |
step:3112/6000 train_time:3101578ms step_avg:999.86ms | |
step:3113/6000 train_time:3102582ms step_avg:999.87ms | |
step:3114/6000 train_time:3103596ms step_avg:999.87ms | |
step:3115/6000 train_time:3104611ms step_avg:999.87ms | |
step:3116/6000 train_time:3105621ms step_avg:999.88ms | |
step:3117/6000 train_time:3106630ms step_avg:999.88ms | |
step:3118/6000 train_time:3107643ms step_avg:999.89ms | |
step:3119/6000 train_time:3108655ms step_avg:999.89ms | |
step:3120/6000 train_time:3109686ms step_avg:999.90ms | |
step:3121/6000 train_time:3110704ms step_avg:999.90ms | |
step:3122/6000 train_time:3111707ms step_avg:999.91ms | |
step:3123/6000 train_time:3112715ms step_avg:999.91ms | |
step:3124/6000 train_time:3113736ms step_avg:999.92ms | |
step:3125/6000 train_time:3114748ms step_avg:999.92ms | |
step:3125/6000 val_loss:2.5107 train_time:3114793ms step_avg:999.93ms perplexity:12.3136 param_count:85,137,462 | |
step:3126/6000 train_time:3115751ms step_avg:999.92ms | |
step:3127/6000 train_time:3116764ms step_avg:999.92ms | |
step:3128/6000 train_time:3117777ms step_avg:999.93ms | |
step:3129/6000 train_time:3118773ms step_avg:999.93ms | |
step:3130/6000 train_time:3119777ms step_avg:999.93ms | |
step:3131/6000 train_time:3120792ms step_avg:999.93ms | |
step:3132/6000 train_time:3121792ms step_avg:999.93ms | |
step:3133/6000 train_time:3122797ms step_avg:999.93ms | |
step:3134/6000 train_time:3123801ms step_avg:999.94ms | |
step:3135/6000 train_time:3124806ms step_avg:999.94ms | |
step:3136/6000 train_time:3125816ms step_avg:999.94ms | |
step:3137/6000 train_time:3126823ms step_avg:999.94ms | |
step:3138/6000 train_time:3127835ms step_avg:999.95ms | |
step:3139/6000 train_time:3128841ms step_avg:999.95ms | |
step:3140/6000 train_time:3129850ms step_avg:999.95ms | |
step:3141/6000 train_time:3130847ms step_avg:999.95ms | |
step:3142/6000 train_time:3131850ms step_avg:999.95ms | |
step:3143/6000 train_time:3132865ms step_avg:999.96ms | |
step:3144/6000 train_time:3133872ms step_avg:999.96ms | |
step:3145/6000 train_time:3134869ms step_avg:999.96ms | |
step:3146/6000 train_time:3135878ms step_avg:999.96ms | |
step:3147/6000 train_time:3136900ms step_avg:999.97ms | |
step:3148/6000 train_time:3137918ms step_avg:999.97ms | |
step:3149/6000 train_time:3138942ms step_avg:999.98ms | |
step:3150/6000 train_time:3139946ms step_avg:999.98ms | |
step:3150/6000 val_loss:2.5012 train_time:3139992ms step_avg:1000.00ms perplexity:12.1977 param_count:85,137,462 | |
step:3151/6000 train_time:3140946ms step_avg:999.98ms | |
step:3152/6000 train_time:3141969ms step_avg:999.99ms | |
step:3153/6000 train_time:3142969ms step_avg:999.99ms | |
step:3154/6000 train_time:3143974ms step_avg:999.99ms | |
step:3155/6000 train_time:3144983ms step_avg:999.99ms | |
step:3156/6000 train_time:3145979ms step_avg:999.99ms | |
step:3157/6000 train_time:3146984ms step_avg:1000.00ms | |
step:3158/6000 train_time:3148015ms step_avg:1000.00ms | |
step:3159/6000 train_time:3149029ms step_avg:1000.01ms | |
step:3160/6000 train_time:3150041ms step_avg:1000.01ms | |
step:3161/6000 train_time:3151044ms step_avg:1000.01ms | |
step:3162/6000 train_time:3152056ms step_avg:1000.02ms | |
step:3163/6000 train_time:3153080ms step_avg:1000.03ms | |
step:3164/6000 train_time:3154077ms step_avg:1000.02ms | |
step:3165/6000 train_time:3155080ms step_avg:1000.03ms | |
step:3166/6000 train_time:3156090ms step_avg:1000.03ms | |
step:3167/6000 train_time:3157083ms step_avg:1000.03ms | |
step:3168/6000 train_time:3158111ms step_avg:1000.04ms | |
step:3169/6000 train_time:3159109ms step_avg:1000.03ms | |
step:3170/6000 train_time:3160116ms step_avg:1000.04ms | |
step:3171/6000 train_time:3161128ms step_avg:1000.04ms | |
step:3172/6000 train_time:3162145ms step_avg:1000.05ms | |
step:3173/6000 train_time:3163146ms step_avg:1000.05ms | |
step:3174/6000 train_time:3164162ms step_avg:1000.05ms | |
step:3175/6000 train_time:3165172ms step_avg:1000.05ms | |
step:3175/6000 val_loss:2.4893 train_time:3165218ms step_avg:1000.07ms perplexity:12.0523 param_count:85,137,462 | |
step:3176/6000 train_time:3166180ms step_avg:1000.06ms | |
step:3177/6000 train_time:3167191ms step_avg:1000.06ms | |
step:3178/6000 train_time:3168201ms step_avg:1000.06ms | |
step:3179/6000 train_time:3169210ms step_avg:1000.07ms | |
step:3180/6000 train_time:3170222ms step_avg:1000.07ms | |
step:3181/6000 train_time:3171230ms step_avg:1000.07ms | |
step:3182/6000 train_time:3172245ms step_avg:1000.08ms | |
step:3183/6000 train_time:3173268ms step_avg:1000.08ms | |
step:3184/6000 train_time:3174293ms step_avg:1000.09ms | |
step:3185/6000 train_time:3175289ms step_avg:1000.09ms | |
step:3186/6000 train_time:3176312ms step_avg:1000.10ms | |
step:3187/6000 train_time:3177318ms step_avg:1000.10ms | |
step:3188/6000 train_time:3178324ms step_avg:1000.10ms | |
step:3189/6000 train_time:3179344ms step_avg:1000.11ms | |
step:3190/6000 train_time:3180373ms step_avg:1000.12ms | |
step:3191/6000 train_time:3181382ms step_avg:1000.12ms | |
step:3192/6000 train_time:3182394ms step_avg:1000.12ms | |
step:3193/6000 train_time:3183397ms step_avg:1000.12ms | |
step:3194/6000 train_time:3184406ms step_avg:1000.13ms | |
step:3195/6000 train_time:3185417ms step_avg:1000.13ms | |
step:3196/6000 train_time:3186426ms step_avg:1000.13ms | |
step:3197/6000 train_time:3187434ms step_avg:1000.14ms | |
step:3198/6000 train_time:3188433ms step_avg:1000.14ms | |
step:3199/6000 train_time:3189452ms step_avg:1000.14ms | |
step:3200/6000 train_time:3190450ms step_avg:1000.14ms | |
step:3200/6000 val_loss:2.5125 train_time:3190496ms step_avg:1000.16ms perplexity:12.3353 param_count:85,137,462 | |
step:3201/6000 train_time:3191448ms step_avg:1000.14ms | |
step:3202/6000 train_time:3192481ms step_avg:1000.15ms | |
step:3203/6000 train_time:3193499ms step_avg:1000.16ms | |
step:3204/6000 train_time:3194501ms step_avg:1000.16ms | |
step:3205/6000 train_time:3195505ms step_avg:1000.16ms | |
step:3206/6000 train_time:3196509ms step_avg:1000.16ms | |
step:3207/6000 train_time:3197517ms step_avg:1000.16ms | |
step:3208/6000 train_time:3198521ms step_avg:1000.16ms | |
step:3209/6000 train_time:3199539ms step_avg:1000.17ms | |
step:3210/6000 train_time:3200553ms step_avg:1000.17ms | |
step:3211/6000 train_time:3201556ms step_avg:1000.17ms | |
step:3212/6000 train_time:3202558ms step_avg:1000.17ms | |
step:3213/6000 train_time:3203561ms step_avg:1000.18ms | |
step:3214/6000 train_time:3204561ms step_avg:1000.18ms | |
step:3215/6000 train_time:3205577ms step_avg:1000.18ms | |
step:3216/6000 train_time:3206578ms step_avg:1000.18ms | |
step:3217/6000 train_time:3207587ms step_avg:1000.18ms | |
step:3218/6000 train_time:3208587ms step_avg:1000.18ms | |
step:3219/6000 train_time:3209597ms step_avg:1000.19ms | |
step:3220/6000 train_time:3210599ms step_avg:1000.19ms | |
step:3221/6000 train_time:3211618ms step_avg:1000.19ms | |
step:3222/6000 train_time:3212633ms step_avg:1000.20ms | |
step:3223/6000 train_time:3213642ms step_avg:1000.20ms | |
step:3224/6000 train_time:3214643ms step_avg:1000.20ms | |
step:3225/6000 train_time:3215658ms step_avg:1000.20ms | |
step:3225/6000 val_loss:2.5086 train_time:3215699ms step_avg:1000.22ms perplexity:12.2880 param_count:85,137,462 | |
step:3226/6000 train_time:3216651ms step_avg:1000.20ms | |
step:3227/6000 train_time:3217650ms step_avg:1000.20ms | |
step:3228/6000 train_time:3218674ms step_avg:1000.21ms | |
step:3229/6000 train_time:3219681ms step_avg:1000.21ms | |
step:3230/6000 train_time:3220703ms step_avg:1000.22ms | |
step:3231/6000 train_time:3221728ms step_avg:1000.23ms | |
step:3232/6000 train_time:3222737ms step_avg:1000.23ms | |
step:3233/6000 train_time:3223741ms step_avg:1000.23ms | |
step:3234/6000 train_time:3224739ms step_avg:1000.23ms | |
step:3235/6000 train_time:3225746ms step_avg:1000.23ms | |
step:3236/6000 train_time:3226756ms step_avg:1000.23ms | |
step:3237/6000 train_time:3227759ms step_avg:1000.24ms | |
step:3238/6000 train_time:3228770ms step_avg:1000.24ms | |
step:3239/6000 train_time:3229781ms step_avg:1000.24ms | |
step:3240/6000 train_time:3230790ms step_avg:1000.24ms | |
step:3241/6000 train_time:3231796ms step_avg:1000.25ms | |
step:3242/6000 train_time:3232801ms step_avg:1000.25ms | |
step:3243/6000 train_time:3233816ms step_avg:1000.25ms | |
step:3244/6000 train_time:3234832ms step_avg:1000.26ms | |
step:3245/6000 train_time:3235859ms step_avg:1000.27ms | |
step:3246/6000 train_time:3236862ms step_avg:1000.27ms | |
step:3247/6000 train_time:3237886ms step_avg:1000.27ms | |
step:3248/6000 train_time:3238896ms step_avg:1000.28ms | |
step:3249/6000 train_time:3239912ms step_avg:1000.28ms | |
step:3250/6000 train_time:3240936ms step_avg:1000.29ms | |
step:3250/6000 val_loss:2.5062 train_time:3240981ms step_avg:1000.30ms perplexity:12.2583 param_count:85,137,462 | |
step:3251/6000 train_time:3241932ms step_avg:1000.29ms | |
step:3252/6000 train_time:3242940ms step_avg:1000.29ms | |
step:3253/6000 train_time:3243932ms step_avg:1000.29ms | |
step:3254/6000 train_time:3244930ms step_avg:1000.29ms | |
step:3255/6000 train_time:3245941ms step_avg:1000.29ms | |
step:3256/6000 train_time:3246955ms step_avg:1000.29ms | |
step:3257/6000 train_time:3247966ms step_avg:1000.30ms | |
step:3258/6000 train_time:3248982ms step_avg:1000.30ms | |
step:3259/6000 train_time:3249976ms step_avg:1000.30ms | |
step:3260/6000 train_time:3250978ms step_avg:1000.30ms | |
step:3261/6000 train_time:3251983ms step_avg:1000.30ms | |
step:3262/6000 train_time:3252976ms step_avg:1000.30ms | |
step:3263/6000 train_time:3253985ms step_avg:1000.30ms | |
step:3264/6000 train_time:3255000ms step_avg:1000.31ms | |
step:3265/6000 train_time:3256012ms step_avg:1000.31ms | |
step:3266/6000 train_time:3257028ms step_avg:1000.32ms | |
step:3267/6000 train_time:3258045ms step_avg:1000.32ms | |
step:3268/6000 train_time:3259058ms step_avg:1000.32ms | |
step:3269/6000 train_time:3260062ms step_avg:1000.33ms | |
step:3270/6000 train_time:3261069ms step_avg:1000.33ms | |
step:3271/6000 train_time:3262067ms step_avg:1000.33ms | |
step:3272/6000 train_time:3263088ms step_avg:1000.33ms | |
step:3273/6000 train_time:3264106ms step_avg:1000.34ms | |
step:3274/6000 train_time:3265108ms step_avg:1000.34ms | |
step:3275/6000 train_time:3266111ms step_avg:1000.34ms | |
step:3275/6000 val_loss:2.4933 train_time:3266157ms step_avg:1000.35ms perplexity:12.1008 param_count:85,137,462 | |
step:3276/6000 train_time:3267105ms step_avg:1000.34ms | |
step:3277/6000 train_time:3268099ms step_avg:1000.34ms | |
step:3278/6000 train_time:3269111ms step_avg:1000.34ms | |
step:3279/6000 train_time:3270117ms step_avg:1000.34ms | |
step:3280/6000 train_time:3271114ms step_avg:1000.34ms | |
step:3281/6000 train_time:3272109ms step_avg:1000.34ms | |
step:3282/6000 train_time:3273119ms step_avg:1000.34ms | |
step:3283/6000 train_time:3274127ms step_avg:1000.34ms | |
step:3284/6000 train_time:3275122ms step_avg:1000.34ms | |
step:3285/6000 train_time:3276133ms step_avg:1000.35ms | |
step:3286/6000 train_time:3277141ms step_avg:1000.35ms | |
step:3287/6000 train_time:3278156ms step_avg:1000.35ms | |
step:3288/6000 train_time:3279166ms step_avg:1000.36ms | |
step:3289/6000 train_time:3280162ms step_avg:1000.35ms | |
step:3290/6000 train_time:3281162ms step_avg:1000.35ms | |
step:3291/6000 train_time:3282175ms step_avg:1000.36ms | |
step:3292/6000 train_time:3283172ms step_avg:1000.36ms | |
step:3293/6000 train_time:3284193ms step_avg:1000.36ms | |
step:3294/6000 train_time:3285204ms step_avg:1000.37ms | |
step:3295/6000 train_time:3286203ms step_avg:1000.37ms | |
step:3296/6000 train_time:3287205ms step_avg:1000.37ms | |
step:3297/6000 train_time:3288211ms step_avg:1000.37ms | |
step:3298/6000 train_time:3289218ms step_avg:1000.37ms | |
step:3299/6000 train_time:3290219ms step_avg:1000.37ms | |
step:3300/6000 train_time:3291231ms step_avg:1000.37ms | |
step:3300/6000 val_loss:2.5007 train_time:3291277ms step_avg:1000.39ms perplexity:12.1912 param_count:85,137,462 | |
step:3301/6000 train_time:3292234ms step_avg:1000.37ms | |
step:3302/6000 train_time:3293244ms step_avg:1000.38ms | |
step:3303/6000 train_time:3294242ms step_avg:1000.38ms | |
step:3304/6000 train_time:3295255ms step_avg:1000.38ms | |
step:3305/6000 train_time:3296260ms step_avg:1000.38ms | |
step:3306/6000 train_time:3297268ms step_avg:1000.38ms | |
step:3307/6000 train_time:3298271ms step_avg:1000.39ms | |
step:3308/6000 train_time:3299280ms step_avg:1000.39ms | |
step:3309/6000 train_time:3300283ms step_avg:1000.39ms | |
step:3310/6000 train_time:3301295ms step_avg:1000.39ms | |
step:3311/6000 train_time:3302310ms step_avg:1000.40ms | |
step:3312/6000 train_time:3303320ms step_avg:1000.40ms | |
step:3313/6000 train_time:3304338ms step_avg:1000.41ms | |
step:3314/6000 train_time:3305345ms step_avg:1000.41ms | |
step:3315/6000 train_time:3306345ms step_avg:1000.41ms | |
step:3316/6000 train_time:3307353ms step_avg:1000.41ms | |
step:3317/6000 train_time:3308357ms step_avg:1000.41ms | |
step:3318/6000 train_time:3309353ms step_avg:1000.41ms | |
step:3319/6000 train_time:3310366ms step_avg:1000.41ms | |
step:3320/6000 train_time:3311364ms step_avg:1000.41ms | |
step:3321/6000 train_time:3312362ms step_avg:1000.41ms | |
step:3322/6000 train_time:3313374ms step_avg:1000.41ms | |
step:3323/6000 train_time:3314383ms step_avg:1000.42ms | |
step:3324/6000 train_time:3315397ms step_avg:1000.42ms | |
step:3325/6000 train_time:3316407ms step_avg:1000.42ms | |
step:3325/6000 val_loss:2.5191 train_time:3316453ms step_avg:1000.44ms perplexity:12.4171 param_count:85,137,462 | |
step:3326/6000 train_time:3317403ms step_avg:1000.42ms | |
step:3327/6000 train_time:3318414ms step_avg:1000.43ms | |
step:3328/6000 train_time:3319415ms step_avg:1000.43ms | |
step:3329/6000 train_time:3320435ms step_avg:1000.43ms | |
step:3330/6000 train_time:3321431ms step_avg:1000.43ms | |
step:3331/6000 train_time:3322485ms step_avg:1000.45ms | |
step:3332/6000 train_time:3323480ms step_avg:1000.45ms | |
step:3333/6000 train_time:3324487ms step_avg:1000.45ms | |
step:3334/6000 train_time:3325503ms step_avg:1000.45ms | |
step:3335/6000 train_time:3326520ms step_avg:1000.46ms | |
step:3336/6000 train_time:3327528ms step_avg:1000.46ms | |
step:3337/6000 train_time:3328540ms step_avg:1000.46ms | |
step:3338/6000 train_time:3329561ms step_avg:1000.47ms | |
step:3339/6000 train_time:3330558ms step_avg:1000.47ms | |
step:3340/6000 train_time:3331569ms step_avg:1000.47ms | |
step:3341/6000 train_time:3332574ms step_avg:1000.47ms | |
step:3342/6000 train_time:3333570ms step_avg:1000.47ms | |
step:3343/6000 train_time:3334575ms step_avg:1000.47ms | |
step:3344/6000 train_time:3335589ms step_avg:1000.48ms | |
step:3345/6000 train_time:3336589ms step_avg:1000.48ms | |
step:3346/6000 train_time:3337590ms step_avg:1000.48ms | |
step:3347/6000 train_time:3338612ms step_avg:1000.48ms | |
step:3348/6000 train_time:3339614ms step_avg:1000.48ms | |
step:3349/6000 train_time:3340618ms step_avg:1000.48ms | |
step:3350/6000 train_time:3341625ms step_avg:1000.49ms | |
step:3350/6000 val_loss:2.5091 train_time:3341667ms step_avg:1000.50ms perplexity:12.2935 param_count:85,137,462 | |
step:3351/6000 train_time:3342620ms step_avg:1000.48ms | |
step:3352/6000 train_time:3343639ms step_avg:1000.49ms | |
step:3353/6000 train_time:3344634ms step_avg:1000.49ms | |
step:3354/6000 train_time:3345638ms step_avg:1000.49ms | |
step:3355/6000 train_time:3346649ms step_avg:1000.49ms | |
step:3356/6000 train_time:3347665ms step_avg:1000.50ms | |
step:3357/6000 train_time:3348673ms step_avg:1000.50ms | |
step:3358/6000 train_time:3349701ms step_avg:1000.51ms | |
step:3359/6000 train_time:3350710ms step_avg:1000.51ms | |
step:3360/6000 train_time:3351720ms step_avg:1000.51ms | |
step:3361/6000 train_time:3352722ms step_avg:1000.51ms | |
step:3362/6000 train_time:3353735ms step_avg:1000.52ms | |
step:3363/6000 train_time:3354740ms step_avg:1000.52ms | |
step:3364/6000 train_time:3355759ms step_avg:1000.52ms | |
step:3365/6000 train_time:3356770ms step_avg:1000.53ms | |
step:3366/6000 train_time:3357798ms step_avg:1000.54ms | |
step:3367/6000 train_time:3358808ms step_avg:1000.54ms | |
step:3368/6000 train_time:3359835ms step_avg:1000.55ms | |
step:3369/6000 train_time:3360843ms step_avg:1000.55ms | |
step:3370/6000 train_time:3361843ms step_avg:1000.55ms | |
step:3371/6000 train_time:3362853ms step_avg:1000.55ms | |
step:3372/6000 train_time:3363866ms step_avg:1000.55ms | |
step:3373/6000 train_time:3364869ms step_avg:1000.56ms | |
step:3374/6000 train_time:3365880ms step_avg:1000.56ms | |
step:3375/6000 train_time:3366890ms step_avg:1000.56ms | |
step:3375/6000 val_loss:2.4978 train_time:3366937ms step_avg:1000.58ms perplexity:12.1561 param_count:85,137,462 | |
step:3376/6000 train_time:3367884ms step_avg:1000.56ms | |
step:3377/6000 train_time:3368888ms step_avg:1000.56ms | |
step:3378/6000 train_time:3369883ms step_avg:1000.56ms | |
step:3379/6000 train_time:3370888ms step_avg:1000.56ms | |
step:3380/6000 train_time:3371889ms step_avg:1000.56ms | |
step:3381/6000 train_time:3372902ms step_avg:1000.56ms | |
step:3382/6000 train_time:3373933ms step_avg:1000.57ms | |
step:3383/6000 train_time:3374948ms step_avg:1000.58ms | |
step:3384/6000 train_time:3375953ms step_avg:1000.58ms | |
step:3385/6000 train_time:3376975ms step_avg:1000.59ms | |
step:3386/6000 train_time:3377971ms step_avg:1000.58ms | |
step:3387/6000 train_time:3378973ms step_avg:1000.58ms | |
step:3388/6000 train_time:3379974ms step_avg:1000.58ms | |
step:3389/6000 train_time:3380988ms step_avg:1000.59ms | |
step:3390/6000 train_time:3382000ms step_avg:1000.59ms | |
step:3391/6000 train_time:3382989ms step_avg:1000.59ms | |
step:3392/6000 train_time:3383995ms step_avg:1000.59ms | |
step:3393/6000 train_time:3385008ms step_avg:1000.59ms | |
step:3394/6000 train_time:3386006ms step_avg:1000.59ms | |
step:3395/6000 train_time:3387002ms step_avg:1000.59ms | |
step:3396/6000 train_time:3388007ms step_avg:1000.59ms | |
step:3397/6000 train_time:3389008ms step_avg:1000.59ms | |
step:3398/6000 train_time:3390019ms step_avg:1000.60ms | |
step:3399/6000 train_time:3391031ms step_avg:1000.60ms | |
step:3400/6000 train_time:3392034ms step_avg:1000.60ms | |
step:3400/6000 val_loss:2.4908 train_time:3392080ms step_avg:1000.61ms perplexity:12.0709 param_count:85,137,462 | |
step:3401/6000 train_time:3393030ms step_avg:1000.60ms | |
step:3402/6000 train_time:3394034ms step_avg:1000.60ms | |
step:3403/6000 train_time:3395035ms step_avg:1000.60ms | |
step:3404/6000 train_time:3396036ms step_avg:1000.60ms | |
step:3405/6000 train_time:3397056ms step_avg:1000.61ms | |
step:3406/6000 train_time:3398064ms step_avg:1000.61ms | |
step:3407/6000 train_time:3399076ms step_avg:1000.61ms | |
step:3408/6000 train_time:3400081ms step_avg:1000.61ms | |
step:3409/6000 train_time:3401095ms step_avg:1000.62ms | |
step:3410/6000 train_time:3402110ms step_avg:1000.62ms | |
step:3411/6000 train_time:3403123ms step_avg:1000.62ms | |
step:3412/6000 train_time:3404133ms step_avg:1000.63ms | |
step:3413/6000 train_time:3405138ms step_avg:1000.63ms | |
step:3414/6000 train_time:3406154ms step_avg:1000.63ms | |
step:3415/6000 train_time:3407167ms step_avg:1000.64ms | |
step:3416/6000 train_time:3408176ms step_avg:1000.64ms | |
step:3417/6000 train_time:3409176ms step_avg:1000.64ms | |
step:3418/6000 train_time:3410185ms step_avg:1000.64ms | |
step:3419/6000 train_time:3411187ms step_avg:1000.64ms | |
step:3420/6000 train_time:3412195ms step_avg:1000.64ms | |
step:3421/6000 train_time:3413202ms step_avg:1000.65ms | |
step:3422/6000 train_time:3414211ms step_avg:1000.65ms | |
step:3423/6000 train_time:3415219ms step_avg:1000.65ms | |
step:3424/6000 train_time:3416224ms step_avg:1000.65ms | |
step:3425/6000 train_time:3417223ms step_avg:1000.65ms | |
step:3425/6000 val_loss:2.4974 train_time:3417268ms step_avg:1000.66ms perplexity:12.1505 param_count:85,137,462 | |
step:3426/6000 train_time:3418217ms step_avg:1000.65ms | |
step:3427/6000 train_time:3419244ms step_avg:1000.66ms | |
step:3428/6000 train_time:3420246ms step_avg:1000.66ms | |
step:3429/6000 train_time:3421257ms step_avg:1000.66ms | |
step:3430/6000 train_time:3422258ms step_avg:1000.66ms | |
step:3431/6000 train_time:3423257ms step_avg:1000.66ms | |
step:3432/6000 train_time:3424267ms step_avg:1000.66ms | |
step:3433/6000 train_time:3425260ms step_avg:1000.66ms | |
step:3434/6000 train_time:3426263ms step_avg:1000.66ms | |
step:3435/6000 train_time:3427269ms step_avg:1000.66ms | |
step:3436/6000 train_time:3428288ms step_avg:1000.67ms | |
step:3437/6000 train_time:3429297ms step_avg:1000.67ms | |
step:3438/6000 train_time:3430314ms step_avg:1000.68ms | |
step:3439/6000 train_time:3431331ms step_avg:1000.68ms | |
step:3440/6000 train_time:3432319ms step_avg:1000.68ms | |
step:3441/6000 train_time:3433318ms step_avg:1000.68ms | |
step:3442/6000 train_time:3434323ms step_avg:1000.68ms | |
step:3443/6000 train_time:3435333ms step_avg:1000.68ms | |
step:3444/6000 train_time:3436333ms step_avg:1000.68ms | |
step:3445/6000 train_time:3437326ms step_avg:1000.68ms | |
step:3446/6000 train_time:3438342ms step_avg:1000.68ms | |
step:3447/6000 train_time:3439341ms step_avg:1000.68ms | |
step:3448/6000 train_time:3440340ms step_avg:1000.68ms | |
step:3449/6000 train_time:3441348ms step_avg:1000.68ms | |
step:3450/6000 train_time:3442357ms step_avg:1000.69ms | |
step:3450/6000 val_loss:2.4992 train_time:3442399ms step_avg:1000.70ms perplexity:12.1731 param_count:85,137,462 | |
step:3451/6000 train_time:3443345ms step_avg:1000.68ms | |
step:3452/6000 train_time:3444353ms step_avg:1000.68ms | |
step:3453/6000 train_time:3445365ms step_avg:1000.69ms | |
step:3454/6000 train_time:3446371ms step_avg:1000.69ms | |
step:3455/6000 train_time:3447372ms step_avg:1000.69ms | |
step:3456/6000 train_time:3448387ms step_avg:1000.69ms | |
step:3457/6000 train_time:3449384ms step_avg:1000.69ms | |
step:3458/6000 train_time:3450384ms step_avg:1000.69ms | |
step:3459/6000 train_time:3451397ms step_avg:1000.70ms | |
step:3460/6000 train_time:3452405ms step_avg:1000.70ms | |
step:3461/6000 train_time:3453411ms step_avg:1000.70ms | |
step:3462/6000 train_time:3454428ms step_avg:1000.70ms | |
step:3463/6000 train_time:3455457ms step_avg:1000.71ms | |
step:3464/6000 train_time:3456472ms step_avg:1000.72ms | |
step:3465/6000 train_time:3457486ms step_avg:1000.72ms | |
step:3466/6000 train_time:3458491ms step_avg:1000.72ms | |
step:3467/6000 train_time:3459509ms step_avg:1000.73ms | |
step:3468/6000 train_time:3460533ms step_avg:1000.73ms | |
step:3469/6000 train_time:3461540ms step_avg:1000.73ms | |
step:3470/6000 train_time:3462541ms step_avg:1000.73ms | |
step:3471/6000 train_time:3463544ms step_avg:1000.74ms | |
step:3472/6000 train_time:3464574ms step_avg:1000.74ms | |
step:3473/6000 train_time:3465589ms step_avg:1000.75ms | |
step:3474/6000 train_time:3466587ms step_avg:1000.75ms | |
step:3475/6000 train_time:3467582ms step_avg:1000.75ms | |
step:3475/6000 val_loss:2.4987 train_time:3467629ms step_avg:1000.76ms perplexity:12.1667 param_count:85,137,462 | |
step:3476/6000 train_time:3468579ms step_avg:1000.74ms | |
step:3477/6000 train_time:3469574ms step_avg:1000.74ms | |
step:3478/6000 train_time:3470583ms step_avg:1000.74ms | |
step:3479/6000 train_time:3471590ms step_avg:1000.75ms | |
step:3480/6000 train_time:3472583ms step_avg:1000.74ms | |
step:3481/6000 train_time:3473588ms step_avg:1000.75ms | |
step:3482/6000 train_time:3474609ms step_avg:1000.75ms | |
step:3483/6000 train_time:3475616ms step_avg:1000.75ms | |
step:3484/6000 train_time:3476615ms step_avg:1000.75ms | |
step:3485/6000 train_time:3477618ms step_avg:1000.75ms | |
step:3486/6000 train_time:3478612ms step_avg:1000.75ms | |
step:3487/6000 train_time:3479616ms step_avg:1000.75ms | |
step:3488/6000 train_time:3480635ms step_avg:1000.76ms | |
step:3489/6000 train_time:3481637ms step_avg:1000.76ms | |
step:3490/6000 train_time:3482622ms step_avg:1000.75ms | |
step:3491/6000 train_time:3483629ms step_avg:1000.76ms | |
step:3492/6000 train_time:3484638ms step_avg:1000.76ms | |
step:3493/6000 train_time:3485651ms step_avg:1000.76ms | |
step:3494/6000 train_time:3486660ms step_avg:1000.76ms | |
step:3495/6000 train_time:3487666ms step_avg:1000.77ms | |
step:3496/6000 train_time:3488693ms step_avg:1000.77ms | |
step:3497/6000 train_time:3489702ms step_avg:1000.77ms | |
step:3498/6000 train_time:3490714ms step_avg:1000.78ms | |
step:3499/6000 train_time:3491715ms step_avg:1000.78ms | |
step:3500/6000 train_time:3492725ms step_avg:1000.78ms | |
step:3500/6000 val_loss:2.4965 train_time:3492772ms step_avg:1000.79ms perplexity:12.1399 param_count:85,137,462 | |
step:3501/6000 train_time:3493769ms step_avg:1000.79ms | |
step:3502/6000 train_time:3494761ms step_avg:1000.79ms | |
step:3503/6000 train_time:3495789ms step_avg:1000.80ms | |
step:3504/6000 train_time:3496788ms step_avg:1000.80ms | |
step:3505/6000 train_time:3497813ms step_avg:1000.80ms | |
step:3506/6000 train_time:3498815ms step_avg:1000.81ms | |
step:3507/6000 train_time:3499835ms step_avg:1000.81ms | |
step:3508/6000 train_time:3500850ms step_avg:1000.81ms | |
step:3509/6000 train_time:3501845ms step_avg:1000.81ms | |
step:3510/6000 train_time:3502846ms step_avg:1000.81ms | |
step:3511/6000 train_time:3503848ms step_avg:1000.81ms | |
step:3512/6000 train_time:3504864ms step_avg:1000.82ms | |
step:3513/6000 train_time:3505868ms step_avg:1000.82ms | |
step:3514/6000 train_time:3506873ms step_avg:1000.82ms | |
step:3515/6000 train_time:3507873ms step_avg:1000.82ms | |
step:3516/6000 train_time:3508899ms step_avg:1000.83ms | |
step:3517/6000 train_time:3509902ms step_avg:1000.83ms | |
step:3518/6000 train_time:3510919ms step_avg:1000.83ms | |
step:3519/6000 train_time:3511945ms step_avg:1000.84ms | |
step:3520/6000 train_time:3512945ms step_avg:1000.84ms | |
step:3521/6000 train_time:3513952ms step_avg:1000.84ms | |
step:3522/6000 train_time:3514959ms step_avg:1000.84ms | |
step:3523/6000 train_time:3515949ms step_avg:1000.84ms | |
step:3524/6000 train_time:3516959ms step_avg:1000.84ms | |
step:3525/6000 train_time:3517959ms step_avg:1000.84ms | |
step:3525/6000 val_loss:2.5000 train_time:3518005ms step_avg:1000.85ms perplexity:12.1830 param_count:85,137,462 | |
step:3526/6000 train_time:3518968ms step_avg:1000.84ms | |
step:3527/6000 train_time:3519993ms step_avg:1000.85ms | |
step:3528/6000 train_time:3520980ms step_avg:1000.85ms | |
step:3529/6000 train_time:3521984ms step_avg:1000.85ms | |
step:3530/6000 train_time:3522993ms step_avg:1000.85ms | |
step:3531/6000 train_time:3523991ms step_avg:1000.85ms | |
step:3532/6000 train_time:3524996ms step_avg:1000.85ms | |
step:3533/6000 train_time:3526000ms step_avg:1000.85ms | |
step:3534/6000 train_time:3527002ms step_avg:1000.85ms | |
step:3535/6000 train_time:3528011ms step_avg:1000.85ms | |
step:3536/6000 train_time:3529011ms step_avg:1000.85ms | |
step:3537/6000 train_time:3530018ms step_avg:1000.86ms | |
step:3538/6000 train_time:3531029ms step_avg:1000.86ms | |
step:3539/6000 train_time:3532048ms step_avg:1000.86ms | |
step:3540/6000 train_time:3533052ms step_avg:1000.86ms | |
step:3541/6000 train_time:3534047ms step_avg:1000.86ms | |
step:3542/6000 train_time:3535069ms step_avg:1000.87ms | |
step:3543/6000 train_time:3536069ms step_avg:1000.87ms | |
step:3544/6000 train_time:3537070ms step_avg:1000.87ms | |
step:3545/6000 train_time:3538087ms step_avg:1000.87ms | |
step:3546/6000 train_time:3539097ms step_avg:1000.88ms | |
step:3547/6000 train_time:3540098ms step_avg:1000.88ms | |
step:3548/6000 train_time:3541102ms step_avg:1000.88ms | |
step:3549/6000 train_time:3542105ms step_avg:1000.88ms | |
step:3550/6000 train_time:3543115ms step_avg:1000.88ms | |
step:3550/6000 val_loss:2.4981 train_time:3543162ms step_avg:1000.89ms perplexity:12.1593 param_count:85,137,462 | |
step:3551/6000 train_time:3544112ms step_avg:1000.88ms | |
step:3552/6000 train_time:3545114ms step_avg:1000.88ms | |
step:3553/6000 train_time:3546120ms step_avg:1000.88ms | |
step:3554/6000 train_time:3547128ms step_avg:1000.88ms | |
step:3555/6000 train_time:3548127ms step_avg:1000.88ms | |
step:3556/6000 train_time:3549143ms step_avg:1000.89ms | |
step:3557/6000 train_time:3550164ms step_avg:1000.89ms | |
step:3558/6000 train_time:3551173ms step_avg:1000.89ms | |
step:3559/6000 train_time:3552176ms step_avg:1000.89ms | |
step:3560/6000 train_time:3553185ms step_avg:1000.90ms | |
step:3561/6000 train_time:3554188ms step_avg:1000.90ms | |
step:3562/6000 train_time:3555222ms step_avg:1000.91ms | |
step:3563/6000 train_time:3556242ms step_avg:1000.91ms | |
step:3564/6000 train_time:3557252ms step_avg:1000.91ms | |
step:3565/6000 train_time:3558246ms step_avg:1000.91ms | |
step:3566/6000 train_time:3559256ms step_avg:1000.92ms | |
step:3567/6000 train_time:3560268ms step_avg:1000.92ms | |
step:3568/6000 train_time:3561291ms step_avg:1000.93ms | |
step:3569/6000 train_time:3562282ms step_avg:1000.92ms | |
step:3570/6000 train_time:3563290ms step_avg:1000.92ms | |
step:3571/6000 train_time:3564291ms step_avg:1000.92ms | |
step:3572/6000 train_time:3565302ms step_avg:1000.93ms | |
step:3573/6000 train_time:3566305ms step_avg:1000.93ms | |
step:3574/6000 train_time:3567346ms step_avg:1000.94ms | |
step:3575/6000 train_time:3568356ms step_avg:1000.94ms | |
step:3575/6000 val_loss:2.4995 train_time:3568398ms step_avg:1000.95ms perplexity:12.1758 param_count:85,137,462 | |
step:3576/6000 train_time:3569347ms step_avg:1000.94ms | |
step:3577/6000 train_time:3570354ms step_avg:1000.94ms | |
step:3578/6000 train_time:3571363ms step_avg:1000.94ms | |
step:3579/6000 train_time:3572378ms step_avg:1000.95ms | |
step:3580/6000 train_time:3573385ms step_avg:1000.95ms | |
step:3581/6000 train_time:3574402ms step_avg:1000.95ms | |
step:3582/6000 train_time:3575409ms step_avg:1000.95ms | |
step:3583/6000 train_time:3576408ms step_avg:1000.95ms | |
step:3584/6000 train_time:3577427ms step_avg:1000.96ms | |
step:3585/6000 train_time:3578433ms step_avg:1000.96ms | |
step:3586/6000 train_time:3579443ms step_avg:1000.96ms | |
step:3587/6000 train_time:3580439ms step_avg:1000.96ms | |
step:3588/6000 train_time:3581473ms step_avg:1000.97ms | |
step:3589/6000 train_time:3582482ms step_avg:1000.97ms | |
step:3590/6000 train_time:3583495ms step_avg:1000.98ms | |
step:3591/6000 train_time:3584491ms step_avg:1000.97ms | |
step:3592/6000 train_time:3585495ms step_avg:1000.98ms | |
step:3593/6000 train_time:3586510ms step_avg:1000.98ms | |
step:3594/6000 train_time:3587525ms step_avg:1000.98ms | |
step:3595/6000 train_time:3588552ms step_avg:1000.99ms | |
step:3596/6000 train_time:3589548ms step_avg:1000.99ms | |
step:3597/6000 train_time:3590564ms step_avg:1000.99ms | |
step:3598/6000 train_time:3591566ms step_avg:1000.99ms | |
step:3599/6000 train_time:3592565ms step_avg:1000.99ms | |
step:3600/6000 train_time:3593658ms step_avg:1001.02ms | |
step:3600/6000 val_loss:2.4868 train_time:3593703ms step_avg:1001.03ms perplexity:12.0228 param_count:85,137,462 | |
step:3601/6000 train_time:3594659ms step_avg:1001.02ms | |
step:3602/6000 train_time:3595671ms step_avg:1001.02ms | |
step:3603/6000 train_time:3596681ms step_avg:1001.02ms | |
step:3604/6000 train_time:3597699ms step_avg:1001.03ms | |
step:3605/6000 train_time:3598710ms step_avg:1001.03ms | |
step:3606/6000 train_time:3599722ms step_avg:1001.04ms | |
step:3607/6000 train_time:3600734ms step_avg:1001.04ms | |
step:3608/6000 train_time:3601747ms step_avg:1001.04ms | |
step:3609/6000 train_time:3602764ms step_avg:1001.05ms | |
step:3610/6000 train_time:3603789ms step_avg:1001.05ms | |
step:3611/6000 train_time:3604795ms step_avg:1001.05ms | |
step:3612/6000 train_time:3605796ms step_avg:1001.05ms | |
step:3613/6000 train_time:3606796ms step_avg:1001.05ms | |
step:3614/6000 train_time:3607791ms step_avg:1001.05ms | |
step:3615/6000 train_time:3608806ms step_avg:1001.06ms | |
step:3616/6000 train_time:3609818ms step_avg:1001.06ms | |
step:3617/6000 train_time:3610819ms step_avg:1001.06ms | |
step:3618/6000 train_time:3611832ms step_avg:1001.06ms | |
step:3619/6000 train_time:3612843ms step_avg:1001.06ms | |
step:3620/6000 train_time:3613851ms step_avg:1001.07ms | |
step:3621/6000 train_time:3614863ms step_avg:1001.07ms | |
step:3622/6000 train_time:3615856ms step_avg:1001.07ms | |
step:3623/6000 train_time:3616853ms step_avg:1001.07ms | |
step:3624/6000 train_time:3617866ms step_avg:1001.07ms | |
step:3625/6000 train_time:3618868ms step_avg:1001.07ms | |
step:3625/6000 val_loss:2.5018 train_time:3618914ms step_avg:1001.08ms perplexity:12.2040 param_count:85,137,462 | |
step:3626/6000 train_time:3619875ms step_avg:1001.07ms | |
step:3627/6000 train_time:3620884ms step_avg:1001.07ms | |
step:3628/6000 train_time:3621897ms step_avg:1001.08ms | |
step:3629/6000 train_time:3622911ms step_avg:1001.08ms | |
step:3630/6000 train_time:3623911ms step_avg:1001.08ms | |
step:3631/6000 train_time:3624910ms step_avg:1001.08ms | |
step:3632/6000 train_time:3625926ms step_avg:1001.08ms | |
step:3633/6000 train_time:3627021ms step_avg:1001.11ms | |
step:3634/6000 train_time:3628041ms step_avg:1001.12ms | |
step:3635/6000 train_time:3629052ms step_avg:1001.12ms | |
step:3636/6000 train_time:3630060ms step_avg:1001.12ms | |
step:3637/6000 train_time:3631061ms step_avg:1001.12ms | |
step:3638/6000 train_time:3632085ms step_avg:1001.13ms | |
step:3639/6000 train_time:3633087ms step_avg:1001.13ms | |
step:3640/6000 train_time:3634095ms step_avg:1001.13ms | |
step:3641/6000 train_time:3635100ms step_avg:1001.13ms | |
step:3642/6000 train_time:3636103ms step_avg:1001.13ms | |
step:3643/6000 train_time:3637121ms step_avg:1001.13ms | |
step:3644/6000 train_time:3638114ms step_avg:1001.13ms | |
step:3645/6000 train_time:3639130ms step_avg:1001.14ms | |
step:3646/6000 train_time:3640144ms step_avg:1001.14ms | |
step:3647/6000 train_time:3641152ms step_avg:1001.14ms | |
step:3648/6000 train_time:3642164ms step_avg:1001.14ms | |
step:3649/6000 train_time:3643169ms step_avg:1001.15ms | |
step:3650/6000 train_time:3644169ms step_avg:1001.15ms | |
step:3650/6000 val_loss:2.4919 train_time:3644216ms step_avg:1001.16ms perplexity:12.0845 param_count:85,137,462 | |
step:3651/6000 train_time:3645168ms step_avg:1001.14ms | |
step:3652/6000 train_time:3646186ms step_avg:1001.15ms | |
step:3653/6000 train_time:3647205ms step_avg:1001.15ms | |
step:3654/6000 train_time:3648223ms step_avg:1001.16ms | |
step:3655/6000 train_time:3649219ms step_avg:1001.16ms | |
step:3656/6000 train_time:3650226ms step_avg:1001.16ms | |
step:3657/6000 train_time:3651238ms step_avg:1001.16ms | |
step:3658/6000 train_time:3652233ms step_avg:1001.16ms | |
step:3659/6000 train_time:3653240ms step_avg:1001.16ms | |
step:3660/6000 train_time:3654243ms step_avg:1001.16ms | |
step:3661/6000 train_time:3655245ms step_avg:1001.16ms | |
step:3662/6000 train_time:3656260ms step_avg:1001.17ms | |
step:3663/6000 train_time:3657269ms step_avg:1001.17ms | |
step:3664/6000 train_time:3658271ms step_avg:1001.17ms | |
step:3665/6000 train_time:3659289ms step_avg:1001.17ms | |
step:3666/6000 train_time:3660290ms step_avg:1001.17ms | |
step:3667/6000 train_time:3661289ms step_avg:1001.17ms | |
step:3668/6000 train_time:3662299ms step_avg:1001.18ms | |
step:3669/6000 train_time:3663320ms step_avg:1001.18ms | |
step:3670/6000 train_time:3664324ms step_avg:1001.18ms | |
step:3671/6000 train_time:3665320ms step_avg:1001.18ms | |
step:3672/6000 train_time:3666336ms step_avg:1001.18ms | |
step:3673/6000 train_time:3667342ms step_avg:1001.19ms | |
step:3674/6000 train_time:3668341ms step_avg:1001.18ms | |
step:3675/6000 train_time:3669346ms step_avg:1001.19ms | |
step:3675/6000 val_loss:2.4959 train_time:3669392ms step_avg:1001.20ms perplexity:12.1328 param_count:85,137,462 | |
step:3676/6000 train_time:3670347ms step_avg:1001.19ms | |
step:3677/6000 train_time:3671354ms step_avg:1001.19ms | |
step:3678/6000 train_time:3672351ms step_avg:1001.19ms | |
step:3679/6000 train_time:3673375ms step_avg:1001.19ms | |
step:3680/6000 train_time:3674396ms step_avg:1001.20ms | |
step:3681/6000 train_time:3675399ms step_avg:1001.20ms | |
step:3682/6000 train_time:3676409ms step_avg:1001.20ms | |
step:3683/6000 train_time:3677410ms step_avg:1001.20ms | |
step:3684/6000 train_time:3678433ms step_avg:1001.21ms | |
step:3685/6000 train_time:3679433ms step_avg:1001.21ms | |
step:3686/6000 train_time:3680435ms step_avg:1001.21ms | |
step:3687/6000 train_time:3681435ms step_avg:1001.21ms | |
step:3688/6000 train_time:3682443ms step_avg:1001.21ms | |
step:3689/6000 train_time:3683450ms step_avg:1001.21ms | |
step:3690/6000 train_time:3684455ms step_avg:1001.21ms | |
step:3691/6000 train_time:3685461ms step_avg:1001.21ms | |
step:3692/6000 train_time:3686473ms step_avg:1001.21ms | |
step:3693/6000 train_time:3687482ms step_avg:1001.22ms | |
step:3694/6000 train_time:3688497ms step_avg:1001.22ms | |
step:3695/6000 train_time:3689508ms step_avg:1001.22ms | |
step:3696/6000 train_time:3690505ms step_avg:1001.22ms | |
step:3697/6000 train_time:3691511ms step_avg:1001.22ms | |
step:3698/6000 train_time:3692496ms step_avg:1001.22ms | |
step:3699/6000 train_time:3693529ms step_avg:1001.23ms | |
step:3700/6000 train_time:3694540ms step_avg:1001.23ms | |
step:3700/6000 val_loss:2.4941 train_time:3694586ms step_avg:1001.24ms perplexity:12.1104 param_count:85,137,462 | |
step:3701/6000 train_time:3695552ms step_avg:1001.23ms | |
step:3702/6000 train_time:3696569ms step_avg:1001.24ms | |
step:3703/6000 train_time:3697586ms step_avg:1001.24ms | |
step:3704/6000 train_time:3698588ms step_avg:1001.24ms | |
step:3705/6000 train_time:3699593ms step_avg:1001.24ms | |
step:3706/6000 train_time:3700602ms step_avg:1001.25ms | |
step:3707/6000 train_time:3701609ms step_avg:1001.25ms | |
step:3708/6000 train_time:3702614ms step_avg:1001.25ms | |
step:3709/6000 train_time:3703612ms step_avg:1001.25ms | |
step:3710/6000 train_time:3704628ms step_avg:1001.25ms | |
step:3711/6000 train_time:3705637ms step_avg:1001.25ms | |
step:3712/6000 train_time:3706647ms step_avg:1001.26ms | |
step:3713/6000 train_time:3707659ms step_avg:1001.26ms | |
step:3714/6000 train_time:3708663ms step_avg:1001.26ms | |
step:3715/6000 train_time:3709662ms step_avg:1001.26ms | |
step:3716/6000 train_time:3710669ms step_avg:1001.26ms | |
step:3717/6000 train_time:3711680ms step_avg:1001.26ms | |
step:3718/6000 train_time:3712684ms step_avg:1001.26ms | |
step:3719/6000 train_time:3713693ms step_avg:1001.27ms | |
step:3720/6000 train_time:3714700ms step_avg:1001.27ms | |
step:3721/6000 train_time:3715700ms step_avg:1001.27ms | |
step:3722/6000 train_time:3716707ms step_avg:1001.27ms | |
step:3723/6000 train_time:3717705ms step_avg:1001.27ms | |
step:3724/6000 train_time:3718724ms step_avg:1001.27ms | |
step:3725/6000 train_time:3719719ms step_avg:1001.27ms | |
step:3725/6000 val_loss:2.4865 train_time:3719766ms step_avg:1001.28ms perplexity:12.0189 param_count:85,137,462 | |
step:3726/6000 train_time:3720727ms step_avg:1001.27ms | |
step:3727/6000 train_time:3721733ms step_avg:1001.27ms | |
step:3728/6000 train_time:3722747ms step_avg:1001.28ms | |
step:3729/6000 train_time:3723769ms step_avg:1001.28ms | |
step:3730/6000 train_time:3724772ms step_avg:1001.28ms | |
step:3731/6000 train_time:3725782ms step_avg:1001.29ms | |
step:3732/6000 train_time:3726791ms step_avg:1001.29ms | |
step:3733/6000 train_time:3727792ms step_avg:1001.29ms | |
step:3734/6000 train_time:3728805ms step_avg:1001.29ms | |
step:3735/6000 train_time:3729830ms step_avg:1001.30ms | |
step:3736/6000 train_time:3730845ms step_avg:1001.30ms | |
step:3737/6000 train_time:3731844ms step_avg:1001.30ms | |
step:3738/6000 train_time:3732859ms step_avg:1001.30ms | |
step:3739/6000 train_time:3733866ms step_avg:1001.30ms | |
step:3740/6000 train_time:3734875ms step_avg:1001.31ms | |
step:3741/6000 train_time:3735889ms step_avg:1001.31ms | |
step:3742/6000 train_time:3736897ms step_avg:1001.31ms | |
step:3743/6000 train_time:3737910ms step_avg:1001.32ms | |
step:3744/6000 train_time:3738922ms step_avg:1001.32ms | |
step:3745/6000 train_time:3739928ms step_avg:1001.32ms | |
step:3746/6000 train_time:3740930ms step_avg:1001.32ms | |
step:3747/6000 train_time:3741939ms step_avg:1001.32ms | |
step:3748/6000 train_time:3742949ms step_avg:1001.32ms | |
step:3749/6000 train_time:3743947ms step_avg:1001.32ms | |
step:3750/6000 train_time:3744981ms step_avg:1001.33ms | |
step:3750/6000 val_loss:2.4837 train_time:3745028ms step_avg:1001.34ms perplexity:11.9857 param_count:85,137,462 | |
step:3751/6000 train_time:3745973ms step_avg:1001.33ms | |
step:3752/6000 train_time:3746971ms step_avg:1001.33ms | |
step:3753/6000 train_time:3747969ms step_avg:1001.33ms | |
step:3754/6000 train_time:3748975ms step_avg:1001.33ms | |
step:3755/6000 train_time:3749972ms step_avg:1001.33ms | |
step:3756/6000 train_time:3750978ms step_avg:1001.33ms | |
step:3757/6000 train_time:3752008ms step_avg:1001.34ms | |
step:3758/6000 train_time:3753027ms step_avg:1001.34ms | |
step:3759/6000 train_time:3754025ms step_avg:1001.34ms | |
step:3760/6000 train_time:3755029ms step_avg:1001.34ms | |
step:3761/6000 train_time:3756044ms step_avg:1001.34ms | |
step:3762/6000 train_time:3757048ms step_avg:1001.35ms | |
step:3763/6000 train_time:3758059ms step_avg:1001.35ms | |
step:3764/6000 train_time:3759057ms step_avg:1001.35ms | |
step:3765/6000 train_time:3760091ms step_avg:1001.36ms | |
step:3766/6000 train_time:3761089ms step_avg:1001.36ms | |
step:3767/6000 train_time:3762094ms step_avg:1001.36ms | |
step:3768/6000 train_time:3763115ms step_avg:1001.36ms | |
step:3769/6000 train_time:3764138ms step_avg:1001.37ms | |
step:3770/6000 train_time:3765132ms step_avg:1001.36ms | |
step:3771/6000 train_time:3766141ms step_avg:1001.37ms | |
step:3772/6000 train_time:3767145ms step_avg:1001.37ms | |
step:3773/6000 train_time:3768146ms step_avg:1001.37ms | |
step:3774/6000 train_time:3769139ms step_avg:1001.37ms | |
step:3775/6000 train_time:3770147ms step_avg:1001.37ms | |
step:3775/6000 val_loss:2.5027 train_time:3770191ms step_avg:1001.38ms perplexity:12.2149 param_count:85,137,462 | |
step:3776/6000 train_time:3771151ms step_avg:1001.37ms | |
step:3777/6000 train_time:3772170ms step_avg:1001.37ms | |
step:3778/6000 train_time:3773173ms step_avg:1001.37ms | |
step:3779/6000 train_time:3774172ms step_avg:1001.37ms | |
step:3780/6000 train_time:3775186ms step_avg:1001.38ms | |
step:3781/6000 train_time:3776187ms step_avg:1001.38ms | |
step:3782/6000 train_time:3777200ms step_avg:1001.38ms | |
step:3783/6000 train_time:3778198ms step_avg:1001.38ms | |
step:3784/6000 train_time:3779218ms step_avg:1001.38ms | |
step:3785/6000 train_time:3780229ms step_avg:1001.39ms | |
step:3786/6000 train_time:3781236ms step_avg:1001.39ms | |
step:3787/6000 train_time:3782240ms step_avg:1001.39ms | |
step:3788/6000 train_time:3783242ms step_avg:1001.39ms | |
step:3789/6000 train_time:3784244ms step_avg:1001.39ms | |
step:3790/6000 train_time:3785253ms step_avg:1001.39ms | |
step:3791/6000 train_time:3786250ms step_avg:1001.39ms | |
step:3792/6000 train_time:3787256ms step_avg:1001.39ms | |
step:3793/6000 train_time:3788267ms step_avg:1001.39ms | |
step:3794/6000 train_time:3789288ms step_avg:1001.40ms | |
step:3795/6000 train_time:3790298ms step_avg:1001.40ms | |
step:3796/6000 train_time:3791307ms step_avg:1001.40ms | |
step:3797/6000 train_time:3792318ms step_avg:1001.40ms | |
step:3798/6000 train_time:3793329ms step_avg:1001.41ms | |
step:3799/6000 train_time:3794361ms step_avg:1001.41ms | |
step:3800/6000 train_time:3795358ms step_avg:1001.41ms | |
step:3800/6000 val_loss:2.4973 train_time:3795400ms step_avg:1001.42ms perplexity:12.1501 param_count:85,137,462 | |
step:3801/6000 train_time:3796352ms step_avg:1001.41ms | |
step:3802/6000 train_time:3797358ms step_avg:1001.41ms | |
step:3803/6000 train_time:3798361ms step_avg:1001.41ms | |
step:3804/6000 train_time:3799385ms step_avg:1001.42ms | |
step:3805/6000 train_time:3800387ms step_avg:1001.42ms | |
step:3806/6000 train_time:3801392ms step_avg:1001.42ms | |
step:3807/6000 train_time:3802385ms step_avg:1001.42ms | |
step:3808/6000 train_time:3803408ms step_avg:1001.42ms | |
step:3809/6000 train_time:3804410ms step_avg:1001.42ms | |
step:3810/6000 train_time:3805417ms step_avg:1001.43ms | |
step:3811/6000 train_time:3806413ms step_avg:1001.42ms | |
step:3812/6000 train_time:3807416ms step_avg:1001.42ms | |
step:3813/6000 train_time:3808449ms step_avg:1001.43ms | |
step:3814/6000 train_time:3809460ms step_avg:1001.44ms | |
step:3815/6000 train_time:3810468ms step_avg:1001.44ms | |
step:3816/6000 train_time:3811465ms step_avg:1001.44ms | |
step:3817/6000 train_time:3812458ms step_avg:1001.43ms | |
step:3818/6000 train_time:3813476ms step_avg:1001.44ms | |
step:3819/6000 train_time:3814495ms step_avg:1001.44ms | |
step:3820/6000 train_time:3815506ms step_avg:1001.45ms | |
step:3821/6000 train_time:3816528ms step_avg:1001.45ms | |
step:3822/6000 train_time:3817540ms step_avg:1001.45ms | |
step:3823/6000 train_time:3818548ms step_avg:1001.46ms | |
step:3824/6000 train_time:3819568ms step_avg:1001.46ms | |
step:3825/6000 train_time:3820608ms step_avg:1001.47ms | |
step:3825/6000 val_loss:2.4885 train_time:3820653ms step_avg:1001.48ms perplexity:12.0434 param_count:85,137,462 | |
step:3826/6000 train_time:3821605ms step_avg:1001.47ms | |
step:3827/6000 train_time:3822608ms step_avg:1001.47ms | |
step:3828/6000 train_time:3823627ms step_avg:1001.47ms | |
step:3829/6000 train_time:3824639ms step_avg:1001.48ms | |
step:3830/6000 train_time:3825647ms step_avg:1001.48ms | |
step:3831/6000 train_time:3826640ms step_avg:1001.48ms | |
step:3832/6000 train_time:3827670ms step_avg:1001.48ms | |
step:3833/6000 train_time:3828673ms step_avg:1001.48ms | |
step:3834/6000 train_time:3829679ms step_avg:1001.49ms | |
step:3835/6000 train_time:3830693ms step_avg:1001.49ms | |
step:3836/6000 train_time:3831706ms step_avg:1001.49ms | |
step:3837/6000 train_time:3832707ms step_avg:1001.49ms | |
step:3838/6000 train_time:3833705ms step_avg:1001.49ms | |
step:3839/6000 train_time:3834712ms step_avg:1001.49ms | |
step:3840/6000 train_time:3835719ms step_avg:1001.49ms | |
step:3841/6000 train_time:3836776ms step_avg:1001.51ms | |
step:3842/6000 train_time:3837782ms step_avg:1001.51ms | |
step:3843/6000 train_time:3838797ms step_avg:1001.51ms | |
step:3844/6000 train_time:3839808ms step_avg:1001.51ms | |
step:3845/6000 train_time:3840801ms step_avg:1001.51ms | |
step:3846/6000 train_time:3841796ms step_avg:1001.51ms | |
step:3847/6000 train_time:3842807ms step_avg:1001.51ms | |
step:3848/6000 train_time:3843811ms step_avg:1001.51ms | |
step:3849/6000 train_time:3844846ms step_avg:1001.52ms | |
step:3850/6000 train_time:3845867ms step_avg:1001.53ms | |
step:3850/6000 val_loss:2.4970 train_time:3845913ms step_avg:1001.54ms perplexity:12.1463 param_count:85,137,462 | |
step:3851/6000 train_time:3846865ms step_avg:1001.53ms | |
step:3852/6000 train_time:3847873ms step_avg:1001.53ms | |
step:3853/6000 train_time:3848875ms step_avg:1001.53ms | |
step:3854/6000 train_time:3849871ms step_avg:1001.53ms | |
step:3855/6000 train_time:3850891ms step_avg:1001.53ms | |
step:3856/6000 train_time:3851893ms step_avg:1001.53ms | |
step:3857/6000 train_time:3852904ms step_avg:1001.53ms | |
step:3858/6000 train_time:3853912ms step_avg:1001.54ms | |
step:3859/6000 train_time:3854936ms step_avg:1001.54ms | |
step:3860/6000 train_time:3855935ms step_avg:1001.54ms | |
step:3861/6000 train_time:3856939ms step_avg:1001.54ms | |
step:3862/6000 train_time:3857942ms step_avg:1001.54ms | |
step:3863/6000 train_time:3858956ms step_avg:1001.55ms | |
step:3864/6000 train_time:3859992ms step_avg:1001.55ms | |
step:3865/6000 train_time:3860994ms step_avg:1001.55ms | |
step:3866/6000 train_time:3862004ms step_avg:1001.56ms | |
step:3867/6000 train_time:3863025ms step_avg:1001.56ms | |
step:3868/6000 train_time:3864031ms step_avg:1001.56ms | |
step:3869/6000 train_time:3865036ms step_avg:1001.56ms | |
step:3870/6000 train_time:3866036ms step_avg:1001.56ms | |
step:3871/6000 train_time:3867042ms step_avg:1001.56ms | |
step:3872/6000 train_time:3868046ms step_avg:1001.57ms | |
step:3873/6000 train_time:3869053ms step_avg:1001.57ms | |
step:3874/6000 train_time:3870067ms step_avg:1001.57ms | |
step:3875/6000 train_time:3871068ms step_avg:1001.57ms | |
step:3875/6000 val_loss:2.4804 train_time:3871114ms step_avg:1001.58ms perplexity:11.9462 param_count:85,137,462 | |
step:3876/6000 train_time:3872054ms step_avg:1001.57ms | |
step:3877/6000 train_time:3873079ms step_avg:1001.57ms | |
step:3878/6000 train_time:3874132ms step_avg:1001.59ms | |
step:3879/6000 train_time:3875141ms step_avg:1001.59ms | |
step:3880/6000 train_time:3876158ms step_avg:1001.59ms | |
step:3881/6000 train_time:3877179ms step_avg:1001.60ms | |
step:3882/6000 train_time:3878186ms step_avg:1001.60ms | |
step:3883/6000 train_time:3879201ms step_avg:1001.60ms | |
step:3884/6000 train_time:3880201ms step_avg:1001.60ms | |
step:3885/6000 train_time:3881214ms step_avg:1001.60ms | |
step:3886/6000 train_time:3882218ms step_avg:1001.60ms | |
step:3887/6000 train_time:3883219ms step_avg:1001.60ms | |
step:3888/6000 train_time:3884224ms step_avg:1001.60ms | |
step:3889/6000 train_time:3885226ms step_avg:1001.61ms | |
step:3890/6000 train_time:3886249ms step_avg:1001.61ms | |
step:3891/6000 train_time:3887257ms step_avg:1001.61ms | |
step:3892/6000 train_time:3888260ms step_avg:1001.61ms | |
step:3893/6000 train_time:3889269ms step_avg:1001.61ms | |
step:3894/6000 train_time:3890281ms step_avg:1001.62ms | |
step:3895/6000 train_time:3891304ms step_avg:1001.62ms | |
step:3896/6000 train_time:3892329ms step_avg:1001.63ms | |
step:3897/6000 train_time:3893328ms step_avg:1001.63ms | |
step:3898/6000 train_time:3894339ms step_avg:1001.63ms | |
step:3899/6000 train_time:3895353ms step_avg:1001.63ms | |
step:3900/6000 train_time:3896388ms step_avg:1001.64ms | |
step:3900/6000 val_loss:2.4812 train_time:3896434ms step_avg:1001.65ms perplexity:11.9557 param_count:85,137,462 | |
step:3901/6000 train_time:3897400ms step_avg:1001.64ms | |
step:3902/6000 train_time:3898468ms step_avg:1001.66ms | |
step:3903/6000 train_time:3899476ms step_avg:1001.66ms | |
step:3904/6000 train_time:3900497ms step_avg:1001.67ms | |
step:3905/6000 train_time:3901496ms step_avg:1001.67ms | |
step:3906/6000 train_time:3902499ms step_avg:1001.67ms | |
step:3907/6000 train_time:3903496ms step_avg:1001.67ms | |
step:3908/6000 train_time:3904499ms step_avg:1001.67ms | |
step:3909/6000 train_time:3905504ms step_avg:1001.67ms | |
step:3910/6000 train_time:3906502ms step_avg:1001.67ms | |
step:3911/6000 train_time:3907508ms step_avg:1001.67ms | |
step:3912/6000 train_time:3908527ms step_avg:1001.67ms | |
step:3913/6000 train_time:3909532ms step_avg:1001.67ms | |
step:3914/6000 train_time:3910555ms step_avg:1001.68ms | |
step:3915/6000 train_time:3911557ms step_avg:1001.68ms | |
step:3916/6000 train_time:3912553ms step_avg:1001.68ms | |
step:3917/6000 train_time:3913582ms step_avg:1001.68ms | |
step:3918/6000 train_time:3914605ms step_avg:1001.69ms | |
step:3919/6000 train_time:3915614ms step_avg:1001.69ms | |
step:3920/6000 train_time:3916624ms step_avg:1001.69ms | |
step:3921/6000 train_time:3917633ms step_avg:1001.70ms | |
step:3922/6000 train_time:3918628ms step_avg:1001.69ms | |
step:3923/6000 train_time:3919633ms step_avg:1001.70ms | |
step:3924/6000 train_time:3920651ms step_avg:1001.70ms | |
step:3925/6000 train_time:3921658ms step_avg:1001.70ms | |
step:3925/6000 val_loss:2.4879 train_time:3921703ms step_avg:1001.71ms perplexity:12.0365 param_count:85,137,462 | |
step:3926/6000 train_time:3922663ms step_avg:1001.70ms | |
step:3927/6000 train_time:3923665ms step_avg:1001.70ms | |
step:3928/6000 train_time:3924669ms step_avg:1001.70ms | |
step:3929/6000 train_time:3925671ms step_avg:1001.70ms | |
step:3930/6000 train_time:3926680ms step_avg:1001.70ms | |
step:3931/6000 train_time:3927685ms step_avg:1001.70ms | |
step:3932/6000 train_time:3928704ms step_avg:1001.71ms | |
step:3933/6000 train_time:3929717ms step_avg:1001.71ms | |
step:3934/6000 train_time:3930711ms step_avg:1001.71ms | |
step:3935/6000 train_time:3931711ms step_avg:1001.71ms | |
step:3936/6000 train_time:3932708ms step_avg:1001.71ms | |
step:3937/6000 train_time:3933724ms step_avg:1001.71ms | |
step:3938/6000 train_time:3934734ms step_avg:1001.71ms | |
step:3939/6000 train_time:3935736ms step_avg:1001.71ms | |
step:3940/6000 train_time:3936743ms step_avg:1001.72ms | |
step:3941/6000 train_time:3937774ms step_avg:1001.72ms | |
step:3942/6000 train_time:3938794ms step_avg:1001.73ms | |
step:3943/6000 train_time:3939809ms step_avg:1001.73ms | |
step:3944/6000 train_time:3940827ms step_avg:1001.74ms | |
step:3945/6000 train_time:3941829ms step_avg:1001.74ms | |
step:3946/6000 train_time:3942833ms step_avg:1001.74ms | |
step:3947/6000 train_time:3943855ms step_avg:1001.74ms | |
step:3948/6000 train_time:3944862ms step_avg:1001.74ms | |
step:3949/6000 train_time:3945884ms step_avg:1001.75ms | |
step:3950/6000 train_time:3946883ms step_avg:1001.75ms | |
step:3950/6000 val_loss:2.4872 train_time:3946930ms step_avg:1001.76ms perplexity:12.0278 param_count:85,137,462 | |
step:3951/6000 train_time:3947899ms step_avg:1001.75ms | |
step:3952/6000 train_time:3948902ms step_avg:1001.75ms | |
step:3953/6000 train_time:3949912ms step_avg:1001.75ms | |
step:3954/6000 train_time:3950947ms step_avg:1001.76ms | |
step:3955/6000 train_time:3951956ms step_avg:1001.76ms | |
step:3956/6000 train_time:3952969ms step_avg:1001.77ms | |
step:3957/6000 train_time:3953972ms step_avg:1001.77ms | |
step:3958/6000 train_time:3954998ms step_avg:1001.77ms | |
step:3959/6000 train_time:3956012ms step_avg:1001.78ms | |
step:3960/6000 train_time:3957025ms step_avg:1001.78ms | |
step:3961/6000 train_time:3958043ms step_avg:1001.78ms | |
step:3962/6000 train_time:3959046ms step_avg:1001.78ms | |
step:3963/6000 train_time:3960035ms step_avg:1001.78ms | |
step:3964/6000 train_time:3961051ms step_avg:1001.78ms | |
step:3965/6000 train_time:3962070ms step_avg:1001.79ms | |
step:3966/6000 train_time:3963081ms step_avg:1001.79ms | |
step:3967/6000 train_time:3964089ms step_avg:1001.79ms | |
step:3968/6000 train_time:3965098ms step_avg:1001.79ms | |
step:3969/6000 train_time:3966098ms step_avg:1001.79ms | |
step:3970/6000 train_time:3967112ms step_avg:1001.80ms | |
step:3971/6000 train_time:3968143ms step_avg:1001.80ms | |
step:3972/6000 train_time:3969186ms step_avg:1001.81ms | |
step:3973/6000 train_time:3970177ms step_avg:1001.81ms | |
step:3974/6000 train_time:3971192ms step_avg:1001.81ms | |
step:3975/6000 train_time:3972218ms step_avg:1001.82ms | |
step:3975/6000 val_loss:2.4846 train_time:3972264ms step_avg:1001.83ms perplexity:11.9968 param_count:85,137,462 | |
step:3976/6000 train_time:3973213ms step_avg:1001.82ms | |
step:3977/6000 train_time:3974216ms step_avg:1001.82ms | |
step:3978/6000 train_time:3975217ms step_avg:1001.82ms | |
step:3979/6000 train_time:3976221ms step_avg:1001.82ms | |
step:3980/6000 train_time:3977228ms step_avg:1001.82ms | |
step:3981/6000 train_time:3978246ms step_avg:1001.82ms | |
step:3982/6000 train_time:3979249ms step_avg:1001.82ms | |
step:3983/6000 train_time:3980246ms step_avg:1001.82ms | |
step:3984/6000 train_time:3981252ms step_avg:1001.82ms | |
step:3985/6000 train_time:3982254ms step_avg:1001.83ms | |
step:3986/6000 train_time:3983259ms step_avg:1001.83ms | |
step:3987/6000 train_time:3984271ms step_avg:1001.83ms | |
step:3988/6000 train_time:3985300ms step_avg:1001.84ms | |
step:3989/6000 train_time:3986311ms step_avg:1001.84ms | |
step:3990/6000 train_time:3987329ms step_avg:1001.84ms | |
step:3991/6000 train_time:3988332ms step_avg:1001.84ms | |
step:3992/6000 train_time:3989351ms step_avg:1001.85ms | |
step:3993/6000 train_time:3990344ms step_avg:1001.84ms | |
step:3994/6000 train_time:3991365ms step_avg:1001.85ms | |
step:3995/6000 train_time:3992373ms step_avg:1001.85ms | |
step:3996/6000 train_time:3993379ms step_avg:1001.85ms | |
step:3997/6000 train_time:3994395ms step_avg:1001.85ms | |
step:3998/6000 train_time:3995412ms step_avg:1001.86ms | |
step:3999/6000 train_time:3996437ms step_avg:1001.86ms | |
step:4000/6000 train_time:3997443ms step_avg:1001.87ms | |
step:4000/6000 val_loss:2.4887 train_time:3997488ms step_avg:1001.88ms perplexity:12.0450 param_count:85,137,462 | |
step:4001/6000 train_time:3998445ms step_avg:1001.87ms | |
step:4002/6000 train_time:3999452ms step_avg:1001.87ms | |
step:4003/6000 train_time:4000460ms step_avg:1001.87ms | |
step:4004/6000 train_time:4001451ms step_avg:1001.87ms | |
step:4005/6000 train_time:4002454ms step_avg:1001.87ms | |
step:4006/6000 train_time:4003471ms step_avg:1001.87ms | |
step:4007/6000 train_time:4004474ms step_avg:1001.87ms | |
step:4008/6000 train_time:4005483ms step_avg:1001.87ms | |
step:4009/6000 train_time:4006474ms step_avg:1001.87ms | |
step:4010/6000 train_time:4007482ms step_avg:1001.87ms | |
step:4011/6000 train_time:4008489ms step_avg:1001.87ms | |
step:4012/6000 train_time:4009496ms step_avg:1001.87ms | |
step:4013/6000 train_time:4010511ms step_avg:1001.88ms | |
step:4014/6000 train_time:4011513ms step_avg:1001.88ms | |
step:4015/6000 train_time:4012524ms step_avg:1001.88ms | |
step:4016/6000 train_time:4013531ms step_avg:1001.88ms | |
step:4017/6000 train_time:4014532ms step_avg:1001.88ms | |
step:4018/6000 train_time:4015538ms step_avg:1001.88ms | |
step:4019/6000 train_time:4016550ms step_avg:1001.88ms | |
step:4020/6000 train_time:4017579ms step_avg:1001.89ms | |
step:4021/6000 train_time:4018585ms step_avg:1001.89ms | |
step:4022/6000 train_time:4019584ms step_avg:1001.89ms | |
step:4023/6000 train_time:4020610ms step_avg:1001.90ms | |
step:4024/6000 train_time:4021623ms step_avg:1001.90ms | |
step:4025/6000 train_time:4022622ms step_avg:1001.90ms | |
step:4025/6000 val_loss:2.4882 train_time:4022663ms step_avg:1001.91ms perplexity:12.0395 param_count:85,137,462 | |
step:4026/6000 train_time:4023620ms step_avg:1001.90ms | |
step:4027/6000 train_time:4024610ms step_avg:1001.89ms | |
step:4028/6000 train_time:4025649ms step_avg:1001.90ms | |
step:4029/6000 train_time:4026642ms step_avg:1001.90ms | |
step:4030/6000 train_time:4027645ms step_avg:1001.90ms | |
step:4031/6000 train_time:4028658ms step_avg:1001.90ms | |
step:4032/6000 train_time:4029679ms step_avg:1001.91ms | |
step:4033/6000 train_time:4030689ms step_avg:1001.91ms | |
step:4034/6000 train_time:4031695ms step_avg:1001.91ms | |
step:4035/6000 train_time:4032697ms step_avg:1001.91ms | |
step:4036/6000 train_time:4033692ms step_avg:1001.91ms | |
step:4037/6000 train_time:4034722ms step_avg:1001.92ms | |
step:4038/6000 train_time:4035736ms step_avg:1001.92ms | |
step:4039/6000 train_time:4036744ms step_avg:1001.92ms | |
step:4040/6000 train_time:4037806ms step_avg:1001.94ms | |
step:4041/6000 train_time:4038809ms step_avg:1001.94ms | |
step:4042/6000 train_time:4039828ms step_avg:1001.94ms | |
step:4043/6000 train_time:4040837ms step_avg:1001.94ms | |
step:4044/6000 train_time:4041841ms step_avg:1001.94ms | |
step:4045/6000 train_time:4042864ms step_avg:1001.95ms | |
step:4046/6000 train_time:4043872ms step_avg:1001.95ms | |
step:4047/6000 train_time:4044880ms step_avg:1001.95ms | |
step:4048/6000 train_time:4045881ms step_avg:1001.95ms | |
step:4049/6000 train_time:4046893ms step_avg:1001.95ms | |
step:4050/6000 train_time:4047899ms step_avg:1001.96ms | |
step:4050/6000 val_loss:2.4903 train_time:4047945ms step_avg:1001.97ms perplexity:12.0645 param_count:85,137,462 | |
step:4051/6000 train_time:4048886ms step_avg:1001.95ms | |
step:4052/6000 train_time:4049891ms step_avg:1001.95ms | |
step:4053/6000 train_time:4050893ms step_avg:1001.95ms | |
step:4054/6000 train_time:4051889ms step_avg:1001.95ms | |
step:4055/6000 train_time:4052894ms step_avg:1001.95ms | |
step:4056/6000 train_time:4053879ms step_avg:1001.95ms | |
step:4057/6000 train_time:4054882ms step_avg:1001.95ms | |
step:4058/6000 train_time:4055888ms step_avg:1001.95ms | |
step:4059/6000 train_time:4056896ms step_avg:1001.95ms | |
step:4060/6000 train_time:4057923ms step_avg:1001.96ms | |
step:4061/6000 train_time:4058923ms step_avg:1001.96ms | |
step:4062/6000 train_time:4059931ms step_avg:1001.96ms | |
step:4063/6000 train_time:4060937ms step_avg:1001.96ms | |
step:4064/6000 train_time:4061933ms step_avg:1001.96ms | |
step:4065/6000 train_time:4062948ms step_avg:1001.96ms | |
step:4066/6000 train_time:4063953ms step_avg:1001.96ms | |
step:4067/6000 train_time:4064954ms step_avg:1001.96ms | |
step:4068/6000 train_time:4065964ms step_avg:1001.96ms | |
step:4069/6000 train_time:4066969ms step_avg:1001.96ms | |
step:4070/6000 train_time:4067962ms step_avg:1001.96ms | |
step:4071/6000 train_time:4068969ms step_avg:1001.96ms | |
step:4072/6000 train_time:4069972ms step_avg:1001.96ms | |
step:4073/6000 train_time:4070988ms step_avg:1001.97ms | |
step:4074/6000 train_time:4071994ms step_avg:1001.97ms | |
step:4075/6000 train_time:4072995ms step_avg:1001.97ms | |
step:4075/6000 val_loss:2.4943 train_time:4073042ms step_avg:1001.98ms perplexity:12.1129 param_count:85,137,462 | |
step:4076/6000 train_time:4074005ms step_avg:1001.97ms | |
step:4077/6000 train_time:4075017ms step_avg:1001.97ms | |
step:4078/6000 train_time:4076013ms step_avg:1001.97ms | |
step:4079/6000 train_time:4077014ms step_avg:1001.97ms | |
step:4080/6000 train_time:4078048ms step_avg:1001.98ms | |
step:4081/6000 train_time:4079046ms step_avg:1001.98ms | |
step:4082/6000 train_time:4080049ms step_avg:1001.98ms | |
step:4083/6000 train_time:4081049ms step_avg:1001.98ms | |
step:4084/6000 train_time:4082065ms step_avg:1001.98ms | |
step:4085/6000 train_time:4083073ms step_avg:1001.98ms | |
step:4086/6000 train_time:4084125ms step_avg:1001.99ms | |
step:4087/6000 train_time:4085135ms step_avg:1002.00ms | |
step:4088/6000 train_time:4086144ms step_avg:1002.00ms | |
step:4089/6000 train_time:4087151ms step_avg:1002.00ms | |
step:4090/6000 train_time:4088162ms step_avg:1002.00ms | |
step:4091/6000 train_time:4089164ms step_avg:1002.00ms | |
step:4092/6000 train_time:4090185ms step_avg:1002.01ms | |
step:4093/6000 train_time:4091192ms step_avg:1002.01ms | |
step:4094/6000 train_time:4092186ms step_avg:1002.00ms | |
step:4095/6000 train_time:4093196ms step_avg:1002.01ms | |
step:4096/6000 train_time:4094204ms step_avg:1002.01ms | |
step:4097/6000 train_time:4095205ms step_avg:1002.01ms | |
step:4098/6000 train_time:4096216ms step_avg:1002.01ms | |
step:4099/6000 train_time:4097227ms step_avg:1002.01ms | |
step:4100/6000 train_time:4098256ms step_avg:1002.02ms | |
step:4100/6000 val_loss:2.4892 train_time:4098303ms step_avg:1002.03ms perplexity:12.0521 param_count:85,137,462 | |
step:4101/6000 train_time:4099253ms step_avg:1002.02ms | |
step:4102/6000 train_time:4100278ms step_avg:1002.02ms | |
step:4103/6000 train_time:4101285ms step_avg:1002.02ms | |
step:4104/6000 train_time:4102292ms step_avg:1002.03ms | |
step:4105/6000 train_time:4103317ms step_avg:1002.03ms | |
step:4106/6000 train_time:4104333ms step_avg:1002.03ms | |
step:4107/6000 train_time:4105322ms step_avg:1002.03ms | |
step:4108/6000 train_time:4106337ms step_avg:1002.03ms | |
step:4109/6000 train_time:4107353ms step_avg:1002.04ms | |
step:4110/6000 train_time:4108351ms step_avg:1002.04ms | |
step:4111/6000 train_time:4109355ms step_avg:1002.04ms | |
step:4112/6000 train_time:4110366ms step_avg:1002.04ms | |
step:4113/6000 train_time:4111382ms step_avg:1002.04ms | |
step:4114/6000 train_time:4112399ms step_avg:1002.05ms | |
step:4115/6000 train_time:4113399ms step_avg:1002.05ms | |
step:4116/6000 train_time:4114395ms step_avg:1002.04ms | |
step:4117/6000 train_time:4115401ms step_avg:1002.05ms | |
step:4118/6000 train_time:4116416ms step_avg:1002.05ms | |
step:4119/6000 train_time:4117427ms step_avg:1002.05ms | |
step:4120/6000 train_time:4118437ms step_avg:1002.05ms | |
step:4121/6000 train_time:4119441ms step_avg:1002.05ms | |
step:4122/6000 train_time:4120443ms step_avg:1002.05ms | |
step:4123/6000 train_time:4121453ms step_avg:1002.06ms | |
step:4124/6000 train_time:4122473ms step_avg:1002.06ms | |
step:4125/6000 train_time:4123488ms step_avg:1002.06ms | |
step:4125/6000 val_loss:2.4880 train_time:4123535ms step_avg:1002.07ms perplexity:12.0375 param_count:85,137,462 | |
step:4126/6000 train_time:4124488ms step_avg:1002.06ms | |
step:4127/6000 train_time:4125492ms step_avg:1002.06ms | |
step:4128/6000 train_time:4126499ms step_avg:1002.06ms | |
step:4129/6000 train_time:4127497ms step_avg:1002.06ms | |
step:4130/6000 train_time:4128501ms step_avg:1002.06ms | |
step:4131/6000 train_time:4129517ms step_avg:1002.07ms | |
step:4132/6000 train_time:4130524ms step_avg:1002.07ms | |
step:4133/6000 train_time:4131526ms step_avg:1002.07ms | |
step:4134/6000 train_time:4132516ms step_avg:1002.06ms | |
step:4135/6000 train_time:4133529ms step_avg:1002.07ms | |
step:4136/6000 train_time:4134539ms step_avg:1002.07ms | |
step:4137/6000 train_time:4135549ms step_avg:1002.07ms | |
step:4138/6000 train_time:4136565ms step_avg:1002.07ms | |
step:4139/6000 train_time:4137581ms step_avg:1002.08ms | |
step:4140/6000 train_time:4138595ms step_avg:1002.08ms | |
step:4141/6000 train_time:4139596ms step_avg:1002.08ms | |
step:4142/6000 train_time:4140631ms step_avg:1002.09ms | |
step:4143/6000 train_time:4141632ms step_avg:1002.09ms | |
step:4144/6000 train_time:4142640ms step_avg:1002.09ms | |
step:4145/6000 train_time:4143631ms step_avg:1002.09ms | |
step:4146/6000 train_time:4144633ms step_avg:1002.09ms | |
step:4147/6000 train_time:4145658ms step_avg:1002.09ms | |
step:4148/6000 train_time:4146657ms step_avg:1002.09ms | |
step:4149/6000 train_time:4147669ms step_avg:1002.09ms | |
step:4150/6000 train_time:4148679ms step_avg:1002.10ms | |
step:4150/6000 val_loss:2.4884 train_time:4148724ms step_avg:1002.11ms perplexity:12.0417 param_count:85,137,462 | |
step:4151/6000 train_time:4149696ms step_avg:1002.10ms | |
step:4152/6000 train_time:4150701ms step_avg:1002.10ms | |
step:4153/6000 train_time:4151706ms step_avg:1002.10ms | |
step:4154/6000 train_time:4152713ms step_avg:1002.10ms | |
step:4155/6000 train_time:4153710ms step_avg:1002.10ms | |
step:4156/6000 train_time:4154707ms step_avg:1002.10ms | |
step:4157/6000 train_time:4155707ms step_avg:1002.10ms | |
step:4158/6000 train_time:4156715ms step_avg:1002.10ms | |
step:4159/6000 train_time:4157720ms step_avg:1002.10ms | |
step:4160/6000 train_time:4158750ms step_avg:1002.11ms | |
step:4161/6000 train_time:4159758ms step_avg:1002.11ms | |
step:4162/6000 train_time:4160765ms step_avg:1002.11ms | |
step:4163/6000 train_time:4161770ms step_avg:1002.11ms | |
step:4164/6000 train_time:4162781ms step_avg:1002.11ms | |
step:4165/6000 train_time:4163788ms step_avg:1002.12ms | |
step:4166/6000 train_time:4164811ms step_avg:1002.12ms | |
step:4167/6000 train_time:4165821ms step_avg:1002.12ms | |
step:4168/6000 train_time:4166837ms step_avg:1002.13ms | |
step:4169/6000 train_time:4167845ms step_avg:1002.13ms | |
step:4170/6000 train_time:4168855ms step_avg:1002.13ms | |
step:4171/6000 train_time:4169861ms step_avg:1002.13ms | |
step:4172/6000 train_time:4170857ms step_avg:1002.13ms | |
step:4173/6000 train_time:4171870ms step_avg:1002.13ms | |
step:4174/6000 train_time:4172874ms step_avg:1002.13ms | |
step:4175/6000 train_time:4173888ms step_avg:1002.13ms | |
step:4175/6000 val_loss:2.4779 train_time:4173933ms step_avg:1002.14ms perplexity:11.9157 param_count:85,137,462 | |
step:4176/6000 train_time:4174903ms step_avg:1002.14ms | |
step:4177/6000 train_time:4175905ms step_avg:1002.14ms | |
step:4178/6000 train_time:4176905ms step_avg:1002.14ms | |
step:4179/6000 train_time:4177925ms step_avg:1002.14ms | |
step:4180/6000 train_time:4178923ms step_avg:1002.14ms | |
step:4181/6000 train_time:4179943ms step_avg:1002.14ms | |
step:4182/6000 train_time:4180948ms step_avg:1002.14ms | |
step:4183/6000 train_time:4181960ms step_avg:1002.15ms | |
step:4184/6000 train_time:4182974ms step_avg:1002.15ms | |
step:4185/6000 train_time:4183987ms step_avg:1002.15ms | |
step:4186/6000 train_time:4184991ms step_avg:1002.15ms | |
step:4187/6000 train_time:4186013ms step_avg:1002.16ms | |
step:4188/6000 train_time:4187017ms step_avg:1002.16ms | |
step:4189/6000 train_time:4188023ms step_avg:1002.16ms | |
step:4190/6000 train_time:4189020ms step_avg:1002.16ms | |
step:4191/6000 train_time:4190035ms step_avg:1002.16ms | |
step:4192/6000 train_time:4191048ms step_avg:1002.16ms | |
step:4193/6000 train_time:4192054ms step_avg:1002.16ms | |
step:4194/6000 train_time:4193059ms step_avg:1002.17ms | |
step:4195/6000 train_time:4194064ms step_avg:1002.17ms | |
step:4196/6000 train_time:4195073ms step_avg:1002.17ms | |
step:4197/6000 train_time:4196085ms step_avg:1002.17ms | |
step:4198/6000 train_time:4197091ms step_avg:1002.17ms | |
step:4199/6000 train_time:4198099ms step_avg:1002.17ms | |
step:4200/6000 train_time:4199099ms step_avg:1002.17ms | |
step:4200/6000 val_loss:2.4926 train_time:4199146ms step_avg:1002.18ms perplexity:12.0921 param_count:85,137,462 | |
step:4201/6000 train_time:4200105ms step_avg:1002.17ms | |
step:4202/6000 train_time:4201115ms step_avg:1002.17ms | |
step:4203/6000 train_time:4202113ms step_avg:1002.17ms | |
step:4204/6000 train_time:4203140ms step_avg:1002.18ms | |
step:4205/6000 train_time:4204140ms step_avg:1002.18ms | |
step:4206/6000 train_time:4205142ms step_avg:1002.18ms | |
step:4207/6000 train_time:4206157ms step_avg:1002.18ms | |
step:4208/6000 train_time:4207164ms step_avg:1002.18ms | |
step:4209/6000 train_time:4208169ms step_avg:1002.18ms | |
step:4210/6000 train_time:4209227ms step_avg:1002.20ms | |
step:4211/6000 train_time:4210235ms step_avg:1002.20ms | |
step:4212/6000 train_time:4211238ms step_avg:1002.20ms | |
step:4213/6000 train_time:4212250ms step_avg:1002.20ms | |
step:4214/6000 train_time:4213257ms step_avg:1002.20ms | |
step:4215/6000 train_time:4214262ms step_avg:1002.20ms | |
step:4216/6000 train_time:4215262ms step_avg:1002.20ms | |
step:4217/6000 train_time:4216273ms step_avg:1002.20ms | |
step:4218/6000 train_time:4217285ms step_avg:1002.21ms | |
step:4219/6000 train_time:4218292ms step_avg:1002.21ms | |
step:4220/6000 train_time:4219298ms step_avg:1002.21ms | |
step:4221/6000 train_time:4220291ms step_avg:1002.21ms | |
step:4222/6000 train_time:4221298ms step_avg:1002.21ms | |
step:4223/6000 train_time:4222306ms step_avg:1002.21ms | |
step:4224/6000 train_time:4223316ms step_avg:1002.21ms | |
step:4225/6000 train_time:4224327ms step_avg:1002.21ms | |
step:4225/6000 val_loss:2.4844 train_time:4224372ms step_avg:1002.22ms perplexity:11.9935 param_count:85,137,462 | |
step:4226/6000 train_time:4225326ms step_avg:1002.21ms | |
step:4227/6000 train_time:4226316ms step_avg:1002.21ms | |
step:4228/6000 train_time:4227341ms step_avg:1002.21ms | |
step:4229/6000 train_time:4228336ms step_avg:1002.21ms | |
step:4230/6000 train_time:4229346ms step_avg:1002.21ms | |
step:4231/6000 train_time:4230366ms step_avg:1002.22ms | |
step:4232/6000 train_time:4231380ms step_avg:1002.22ms | |
step:4233/6000 train_time:4232405ms step_avg:1002.23ms | |
step:4234/6000 train_time:4233406ms step_avg:1002.23ms | |
step:4235/6000 train_time:4234420ms step_avg:1002.23ms | |
step:4236/6000 train_time:4235430ms step_avg:1002.23ms | |
step:4237/6000 train_time:4236445ms step_avg:1002.23ms | |
step:4238/6000 train_time:4237464ms step_avg:1002.24ms | |
step:4239/6000 train_time:4238467ms step_avg:1002.24ms | |
step:4240/6000 train_time:4239478ms step_avg:1002.24ms | |
step:4241/6000 train_time:4240489ms step_avg:1002.24ms | |
step:4242/6000 train_time:4241505ms step_avg:1002.25ms | |
step:4243/6000 train_time:4242521ms step_avg:1002.25ms | |
step:4244/6000 train_time:4243529ms step_avg:1002.25ms | |
step:4245/6000 train_time:4244545ms step_avg:1002.25ms | |
step:4246/6000 train_time:4245555ms step_avg:1002.26ms | |
step:4247/6000 train_time:4246573ms step_avg:1002.26ms | |
step:4248/6000 train_time:4247584ms step_avg:1002.26ms | |
step:4249/6000 train_time:4248604ms step_avg:1002.27ms | |
step:4250/6000 train_time:4249610ms step_avg:1002.27ms | |
step:4250/6000 val_loss:2.4807 train_time:4249653ms step_avg:1002.28ms perplexity:11.9501 param_count:85,137,462 | |
step:4251/6000 train_time:4250615ms step_avg:1002.27ms | |
step:4252/6000 train_time:4251625ms step_avg:1002.27ms | |
step:4253/6000 train_time:4252627ms step_avg:1002.27ms | |
step:4254/6000 train_time:4253647ms step_avg:1002.27ms | |
step:4255/6000 train_time:4254654ms step_avg:1002.27ms | |
step:4256/6000 train_time:4255651ms step_avg:1002.27ms | |
step:4257/6000 train_time:4256665ms step_avg:1002.28ms | |
step:4258/6000 train_time:4257666ms step_avg:1002.28ms | |
step:4259/6000 train_time:4258672ms step_avg:1002.28ms | |
step:4260/6000 train_time:4259683ms step_avg:1002.28ms | |
step:4261/6000 train_time:4260685ms step_avg:1002.28ms | |
step:4262/6000 train_time:4261700ms step_avg:1002.28ms | |
step:4263/6000 train_time:4262708ms step_avg:1002.28ms | |
step:4264/6000 train_time:4263712ms step_avg:1002.28ms | |
step:4265/6000 train_time:4264730ms step_avg:1002.29ms | |
step:4266/6000 train_time:4265730ms step_avg:1002.29ms | |
step:4267/6000 train_time:4266745ms step_avg:1002.29ms | |
step:4268/6000 train_time:4267753ms step_avg:1002.29ms | |
step:4269/6000 train_time:4268768ms step_avg:1002.29ms | |
step:4270/6000 train_time:4269772ms step_avg:1002.29ms | |
step:4271/6000 train_time:4270785ms step_avg:1002.30ms | |
step:4272/6000 train_time:4271789ms step_avg:1002.30ms | |
step:4273/6000 train_time:4272808ms step_avg:1002.30ms | |
step:4274/6000 train_time:4273819ms step_avg:1002.30ms | |
step:4275/6000 train_time:4274819ms step_avg:1002.30ms | |
step:4275/6000 val_loss:2.4813 train_time:4274864ms step_avg:1002.31ms perplexity:11.9572 param_count:85,137,462 | |
step:4276/6000 train_time:4275817ms step_avg:1002.30ms | |
step:4277/6000 train_time:4276819ms step_avg:1002.30ms | |
step:4278/6000 train_time:4277894ms step_avg:1002.32ms | |
step:4279/6000 train_time:4278893ms step_avg:1002.32ms | |
step:4280/6000 train_time:4279885ms step_avg:1002.31ms | |
step:4281/6000 train_time:4280893ms step_avg:1002.32ms | |
step:4282/6000 train_time:4281899ms step_avg:1002.32ms | |
step:4283/6000 train_time:4282917ms step_avg:1002.32ms | |
step:4284/6000 train_time:4283939ms step_avg:1002.33ms | |
step:4285/6000 train_time:4284950ms step_avg:1002.33ms | |
step:4286/6000 train_time:4285950ms step_avg:1002.33ms | |
step:4287/6000 train_time:4286950ms step_avg:1002.33ms | |
step:4288/6000 train_time:4287950ms step_avg:1002.33ms | |
step:4289/6000 train_time:4288950ms step_avg:1002.33ms | |
step:4290/6000 train_time:4289955ms step_avg:1002.33ms | |
step:4291/6000 train_time:4290940ms step_avg:1002.32ms | |
step:4292/6000 train_time:4291949ms step_avg:1002.32ms | |
step:4293/6000 train_time:4292950ms step_avg:1002.32ms | |
step:4294/6000 train_time:4293955ms step_avg:1002.32ms | |
step:4295/6000 train_time:4294962ms step_avg:1002.32ms | |
step:4296/6000 train_time:4295964ms step_avg:1002.32ms | |
step:4297/6000 train_time:4296958ms step_avg:1002.32ms | |
step:4298/6000 train_time:4297954ms step_avg:1002.32ms | |
step:4299/6000 train_time:4298966ms step_avg:1002.32ms | |
step:4300/6000 train_time:4299959ms step_avg:1002.32ms | |
step:4300/6000 val_loss:2.4888 train_time:4300000ms step_avg:1002.33ms perplexity:12.0463 param_count:85,137,462 | |
step:4301/6000 train_time:4300975ms step_avg:1002.32ms | |
step:4302/6000 train_time:4301978ms step_avg:1002.32ms | |
step:4303/6000 train_time:4302989ms step_avg:1002.33ms | |
step:4304/6000 train_time:4303976ms step_avg:1002.32ms | |
step:4305/6000 train_time:4304994ms step_avg:1002.33ms | |
step:4306/6000 train_time:4306002ms step_avg:1002.33ms | |
step:4307/6000 train_time:4307005ms step_avg:1002.33ms | |
step:4308/6000 train_time:4308012ms step_avg:1002.33ms | |
step:4309/6000 train_time:4309016ms step_avg:1002.33ms | |
step:4310/6000 train_time:4310004ms step_avg:1002.33ms | |
step:4311/6000 train_time:4311013ms step_avg:1002.33ms | |
step:4312/6000 train_time:4312016ms step_avg:1002.33ms | |
step:4313/6000 train_time:4313027ms step_avg:1002.33ms | |
step:4314/6000 train_time:4314034ms step_avg:1002.33ms | |
step:4315/6000 train_time:4315026ms step_avg:1002.33ms | |
step:4316/6000 train_time:4316021ms step_avg:1002.33ms | |
step:4317/6000 train_time:4317035ms step_avg:1002.33ms | |
step:4318/6000 train_time:4318053ms step_avg:1002.33ms | |
step:4319/6000 train_time:4319051ms step_avg:1002.33ms | |
step:4320/6000 train_time:4320062ms step_avg:1002.33ms | |
step:4321/6000 train_time:4321065ms step_avg:1002.33ms | |
step:4322/6000 train_time:4322062ms step_avg:1002.33ms | |
step:4323/6000 train_time:4323082ms step_avg:1002.34ms | |
step:4324/6000 train_time:4324074ms step_avg:1002.34ms | |
step:4325/6000 train_time:4325081ms step_avg:1002.34ms | |
step:4325/6000 val_loss:2.4809 train_time:4325122ms step_avg:1002.35ms perplexity:11.9521 param_count:85,137,462 | |
step:4326/6000 train_time:4326078ms step_avg:1002.33ms | |
step:4327/6000 train_time:4327081ms step_avg:1002.34ms | |
step:4328/6000 train_time:4328108ms step_avg:1002.34ms | |
step:4329/6000 train_time:4329106ms step_avg:1002.34ms | |
step:4330/6000 train_time:4330105ms step_avg:1002.34ms | |
step:4331/6000 train_time:4331126ms step_avg:1002.34ms | |
step:4332/6000 train_time:4332132ms step_avg:1002.34ms | |
step:4333/6000 train_time:4333136ms step_avg:1002.34ms | |
step:4334/6000 train_time:4334144ms step_avg:1002.35ms | |
step:4335/6000 train_time:4335142ms step_avg:1002.35ms | |
step:4336/6000 train_time:4336147ms step_avg:1002.35ms | |
step:4337/6000 train_time:4337150ms step_avg:1002.35ms | |
step:4338/6000 train_time:4338148ms step_avg:1002.34ms | |
step:4339/6000 train_time:4339168ms step_avg:1002.35ms | |
step:4340/6000 train_time:4340166ms step_avg:1002.35ms | |
step:4341/6000 train_time:4341178ms step_avg:1002.35ms | |
step:4342/6000 train_time:4342182ms step_avg:1002.35ms | |
step:4343/6000 train_time:4343208ms step_avg:1002.36ms | |
step:4344/6000 train_time:4344203ms step_avg:1002.35ms | |
step:4345/6000 train_time:4345208ms step_avg:1002.35ms | |
step:4346/6000 train_time:4346212ms step_avg:1002.36ms | |
step:4347/6000 train_time:4347213ms step_avg:1002.35ms | |
step:4348/6000 train_time:4348217ms step_avg:1002.36ms | |
step:4349/6000 train_time:4349231ms step_avg:1002.36ms | |
step:4350/6000 train_time:4350261ms step_avg:1002.36ms | |
step:4350/6000 val_loss:2.4835 train_time:4350302ms step_avg:1002.37ms perplexity:11.9837 param_count:85,137,462 | |
step:4351/6000 train_time:4351259ms step_avg:1002.36ms | |
step:4352/6000 train_time:4352271ms step_avg:1002.37ms | |
step:4353/6000 train_time:4353277ms step_avg:1002.37ms | |
step:4354/6000 train_time:4354294ms step_avg:1002.37ms | |
step:4355/6000 train_time:4355299ms step_avg:1002.37ms | |
step:4356/6000 train_time:4356302ms step_avg:1002.37ms | |
step:4357/6000 train_time:4357309ms step_avg:1002.37ms | |
step:4358/6000 train_time:4358329ms step_avg:1002.38ms | |
step:4359/6000 train_time:4359343ms step_avg:1002.38ms | |
step:4360/6000 train_time:4360348ms step_avg:1002.38ms | |
step:4361/6000 train_time:4361371ms step_avg:1002.38ms | |
step:4362/6000 train_time:4362386ms step_avg:1002.39ms | |
step:4363/6000 train_time:4363404ms step_avg:1002.39ms | |
step:4364/6000 train_time:4364401ms step_avg:1002.39ms | |
step:4365/6000 train_time:4365405ms step_avg:1002.39ms | |
step:4366/6000 train_time:4366423ms step_avg:1002.39ms | |
step:4367/6000 train_time:4367423ms step_avg:1002.39ms | |
step:4368/6000 train_time:4368422ms step_avg:1002.39ms | |
step:4369/6000 train_time:4369419ms step_avg:1002.39ms | |
step:4370/6000 train_time:4370435ms step_avg:1002.39ms | |
step:4371/6000 train_time:4371469ms step_avg:1002.40ms | |
step:4372/6000 train_time:4372479ms step_avg:1002.40ms | |
step:4373/6000 train_time:4373492ms step_avg:1002.40ms | |
step:4374/6000 train_time:4374505ms step_avg:1002.41ms | |
step:4375/6000 train_time:4375516ms step_avg:1002.41ms | |
step:4375/6000 val_loss:2.4766 train_time:4375562ms step_avg:1002.42ms perplexity:11.9006 param_count:85,137,462 | |
step:4376/6000 train_time:4376508ms step_avg:1002.41ms | |
step:4377/6000 train_time:4377515ms step_avg:1002.41ms | |
step:4378/6000 train_time:4378520ms step_avg:1002.41ms | |
step:4379/6000 train_time:4379536ms step_avg:1002.41ms | |
step:4380/6000 train_time:4380547ms step_avg:1002.41ms | |
step:4381/6000 train_time:4381552ms step_avg:1002.41ms | |
step:4382/6000 train_time:4382556ms step_avg:1002.41ms | |
step:4383/6000 train_time:4383558ms step_avg:1002.41ms | |
step:4384/6000 train_time:4384576ms step_avg:1002.42ms | |
step:4385/6000 train_time:4385590ms step_avg:1002.42ms | |
step:4386/6000 train_time:4386586ms step_avg:1002.42ms | |
step:4387/6000 train_time:4387592ms step_avg:1002.42ms | |
step:4388/6000 train_time:4388598ms step_avg:1002.42ms | |
step:4389/6000 train_time:4389601ms step_avg:1002.42ms | |
step:4390/6000 train_time:4390603ms step_avg:1002.42ms | |
step:4391/6000 train_time:4391658ms step_avg:1002.43ms | |
step:4392/6000 train_time:4392660ms step_avg:1002.43ms | |
step:4393/6000 train_time:4393674ms step_avg:1002.44ms | |
step:4394/6000 train_time:4394677ms step_avg:1002.44ms | |
step:4395/6000 train_time:4395678ms step_avg:1002.44ms | |
step:4396/6000 train_time:4396687ms step_avg:1002.44ms | |
step:4397/6000 train_time:4397690ms step_avg:1002.44ms | |
step:4398/6000 train_time:4398692ms step_avg:1002.44ms | |
step:4399/6000 train_time:4399704ms step_avg:1002.44ms | |
step:4400/6000 train_time:4400703ms step_avg:1002.44ms | |
step:4400/6000 val_loss:2.4845 train_time:4400749ms step_avg:1002.45ms perplexity:11.9955 param_count:85,137,462 | |
step:4401/6000 train_time:4401703ms step_avg:1002.44ms | |
step:4402/6000 train_time:4402701ms step_avg:1002.44ms | |
step:4403/6000 train_time:4403708ms step_avg:1002.44ms | |
step:4404/6000 train_time:4404711ms step_avg:1002.44ms | |
step:4405/6000 train_time:4405722ms step_avg:1002.44ms | |
step:4406/6000 train_time:4406731ms step_avg:1002.44ms | |
step:4407/6000 train_time:4407729ms step_avg:1002.44ms | |
step:4408/6000 train_time:4408735ms step_avg:1002.44ms | |
step:4409/6000 train_time:4409730ms step_avg:1002.44ms | |
step:4410/6000 train_time:4410736ms step_avg:1002.44ms | |
step:4411/6000 train_time:4411737ms step_avg:1002.44ms | |
step:4412/6000 train_time:4412745ms step_avg:1002.44ms | |
step:4413/6000 train_time:4413759ms step_avg:1002.44ms | |
step:4414/6000 train_time:4414774ms step_avg:1002.45ms | |
step:4415/6000 train_time:4415788ms step_avg:1002.45ms | |
step:4416/6000 train_time:4416793ms step_avg:1002.45ms | |
step:4417/6000 train_time:4417803ms step_avg:1002.45ms | |
step:4418/6000 train_time:4418804ms step_avg:1002.45ms | |
step:4419/6000 train_time:4419810ms step_avg:1002.45ms | |
step:4420/6000 train_time:4420832ms step_avg:1002.46ms | |
step:4421/6000 train_time:4421830ms step_avg:1002.46ms | |
step:4422/6000 train_time:4422846ms step_avg:1002.46ms | |
step:4423/6000 train_time:4423854ms step_avg:1002.46ms | |
step:4424/6000 train_time:4424851ms step_avg:1002.46ms | |
step:4425/6000 train_time:4425861ms step_avg:1002.46ms | |
step:4425/6000 val_loss:2.4838 train_time:4425905ms step_avg:1002.47ms perplexity:11.9872 param_count:85,137,462 | |
step:4426/6000 train_time:4426860ms step_avg:1002.46ms | |
step:4427/6000 train_time:4427878ms step_avg:1002.46ms | |
step:4428/6000 train_time:4428876ms step_avg:1002.46ms | |
step:4429/6000 train_time:4429867ms step_avg:1002.46ms | |
step:4430/6000 train_time:4430876ms step_avg:1002.46ms | |
step:4431/6000 train_time:4431900ms step_avg:1002.47ms | |
step:4432/6000 train_time:4432899ms step_avg:1002.46ms | |
step:4433/6000 train_time:4433911ms step_avg:1002.47ms | |
step:4434/6000 train_time:4434911ms step_avg:1002.47ms | |
step:4435/6000 train_time:4435914ms step_avg:1002.47ms | |
step:4436/6000 train_time:4436951ms step_avg:1002.47ms | |
step:4437/6000 train_time:4437953ms step_avg:1002.47ms | |
step:4438/6000 train_time:4438960ms step_avg:1002.48ms | |
step:4439/6000 train_time:4439984ms step_avg:1002.48ms | |
step:4440/6000 train_time:4441003ms step_avg:1002.48ms | |
step:4441/6000 train_time:4442011ms step_avg:1002.48ms | |
step:4442/6000 train_time:4443031ms step_avg:1002.49ms | |
step:4443/6000 train_time:4444049ms step_avg:1002.49ms | |
step:4444/6000 train_time:4445058ms step_avg:1002.49ms | |
step:4445/6000 train_time:4446069ms step_avg:1002.50ms | |
step:4446/6000 train_time:4447076ms step_avg:1002.50ms | |
step:4447/6000 train_time:4448086ms step_avg:1002.50ms | |
step:4448/6000 train_time:4449089ms step_avg:1002.50ms | |
step:4449/6000 train_time:4450080ms step_avg:1002.50ms | |
step:4450/6000 train_time:4451090ms step_avg:1002.50ms | |
step:4450/6000 val_loss:2.4724 train_time:4451134ms step_avg:1002.51ms perplexity:11.8514 param_count:85,137,462 | |
step:4451/6000 train_time:4452087ms step_avg:1002.50ms | |
step:4452/6000 train_time:4453092ms step_avg:1002.50ms | |
step:4453/6000 train_time:4454099ms step_avg:1002.50ms | |
step:4454/6000 train_time:4455108ms step_avg:1002.50ms | |
step:4455/6000 train_time:4456106ms step_avg:1002.50ms | |
step:4456/6000 train_time:4457142ms step_avg:1002.51ms | |
step:4457/6000 train_time:4458151ms step_avg:1002.51ms | |
step:4458/6000 train_time:4459160ms step_avg:1002.51ms | |
step:4459/6000 train_time:4460177ms step_avg:1002.51ms | |
step:4460/6000 train_time:4461199ms step_avg:1002.52ms | |
step:4461/6000 train_time:4462194ms step_avg:1002.52ms | |
step:4462/6000 train_time:4463233ms step_avg:1002.52ms | |
step:4463/6000 train_time:4464263ms step_avg:1002.53ms | |
step:4464/6000 train_time:4465274ms step_avg:1002.53ms | |
step:4465/6000 train_time:4466282ms step_avg:1002.53ms | |
step:4466/6000 train_time:4467294ms step_avg:1002.53ms | |
step:4467/6000 train_time:4468318ms step_avg:1002.54ms | |
step:4468/6000 train_time:4469355ms step_avg:1002.55ms | |
step:4469/6000 train_time:4470371ms step_avg:1002.55ms | |
step:4470/6000 train_time:4471383ms step_avg:1002.55ms | |
step:4471/6000 train_time:4472393ms step_avg:1002.55ms | |
step:4472/6000 train_time:4473384ms step_avg:1002.55ms | |
step:4473/6000 train_time:4474396ms step_avg:1002.55ms | |
step:4474/6000 train_time:4475405ms step_avg:1002.55ms | |
step:4475/6000 train_time:4476427ms step_avg:1002.56ms | |
step:4475/6000 val_loss:2.4938 train_time:4476472ms step_avg:1002.57ms perplexity:12.1068 param_count:85,137,462 | |
step:4476/6000 train_time:4477431ms step_avg:1002.56ms | |
step:4477/6000 train_time:4478437ms step_avg:1002.56ms | |
step:4478/6000 train_time:4479432ms step_avg:1002.56ms | |
step:4479/6000 train_time:4480444ms step_avg:1002.56ms | |
step:4480/6000 train_time:4481457ms step_avg:1002.56ms | |
step:4481/6000 train_time:4482453ms step_avg:1002.56ms | |
step:4482/6000 train_time:4483460ms step_avg:1002.56ms | |
step:4483/6000 train_time:4484457ms step_avg:1002.56ms | |
step:4484/6000 train_time:4485453ms step_avg:1002.56ms | |
step:4485/6000 train_time:4486452ms step_avg:1002.56ms | |
step:4486/6000 train_time:4487465ms step_avg:1002.56ms | |
step:4487/6000 train_time:4488488ms step_avg:1002.57ms | |
step:4488/6000 train_time:4489515ms step_avg:1002.57ms | |
step:4489/6000 train_time:4490549ms step_avg:1002.58ms | |
step:4490/6000 train_time:4491559ms step_avg:1002.58ms | |
step:4491/6000 train_time:4492571ms step_avg:1002.58ms | |
step:4492/6000 train_time:4493596ms step_avg:1002.59ms | |
step:4493/6000 train_time:4494602ms step_avg:1002.59ms | |
step:4494/6000 train_time:4495609ms step_avg:1002.59ms | |
step:4495/6000 train_time:4496641ms step_avg:1002.60ms | |
step:4496/6000 train_time:4497657ms step_avg:1002.60ms | |
step:4497/6000 train_time:4498645ms step_avg:1002.60ms | |
step:4498/6000 train_time:4499650ms step_avg:1002.60ms | |
step:4499/6000 train_time:4500668ms step_avg:1002.60ms | |
step:4500/6000 train_time:4501679ms step_avg:1002.60ms | |
step:4500/6000 val_loss:2.4829 train_time:4501726ms step_avg:1002.61ms perplexity:11.9754 param_count:85,137,462 | |
step:4501/6000 train_time:4502669ms step_avg:1002.60ms | |
step:4502/6000 train_time:4503697ms step_avg:1002.60ms | |
step:4503/6000 train_time:4504708ms step_avg:1002.61ms | |
step:4504/6000 train_time:4505717ms step_avg:1002.61ms | |
step:4505/6000 train_time:4506732ms step_avg:1002.61ms | |
step:4506/6000 train_time:4507750ms step_avg:1002.61ms | |
step:4507/6000 train_time:4508757ms step_avg:1002.61ms | |
step:4508/6000 train_time:4509770ms step_avg:1002.62ms | |
step:4509/6000 train_time:4510780ms step_avg:1002.62ms | |
step:4510/6000 train_time:4511787ms step_avg:1002.62ms | |
step:4511/6000 train_time:4512793ms step_avg:1002.62ms | |
step:4512/6000 train_time:4513800ms step_avg:1002.62ms | |
step:4513/6000 train_time:4514807ms step_avg:1002.62ms | |
step:4514/6000 train_time:4515821ms step_avg:1002.62ms | |
step:4515/6000 train_time:4516828ms step_avg:1002.63ms | |
step:4516/6000 train_time:4517839ms step_avg:1002.63ms | |
step:4517/6000 train_time:4518853ms step_avg:1002.63ms | |
step:4518/6000 train_time:4519861ms step_avg:1002.63ms | |
step:4519/6000 train_time:4520867ms step_avg:1002.63ms | |
step:4520/6000 train_time:4521878ms step_avg:1002.63ms | |
step:4521/6000 train_time:4522883ms step_avg:1002.63ms | |
step:4522/6000 train_time:4523902ms step_avg:1002.64ms | |
step:4523/6000 train_time:4524919ms step_avg:1002.64ms | |
step:4524/6000 train_time:4525920ms step_avg:1002.64ms | |
step:4525/6000 train_time:4526936ms step_avg:1002.64ms | |
step:4525/6000 val_loss:2.4760 train_time:4526982ms step_avg:1002.65ms perplexity:11.8936 param_count:85,137,462 | |
step:4526/6000 train_time:4527943ms step_avg:1002.64ms | |
step:4527/6000 train_time:4528958ms step_avg:1002.65ms | |
step:4528/6000 train_time:4529971ms step_avg:1002.65ms | |
step:4529/6000 train_time:4530973ms step_avg:1002.65ms | |
step:4530/6000 train_time:4531989ms step_avg:1002.65ms | |
step:4531/6000 train_time:4533013ms step_avg:1002.66ms | |
step:4532/6000 train_time:4534033ms step_avg:1002.66ms | |
step:4533/6000 train_time:4535055ms step_avg:1002.67ms | |
step:4534/6000 train_time:4536055ms step_avg:1002.66ms | |
step:4535/6000 train_time:4537082ms step_avg:1002.67ms | |
step:4536/6000 train_time:4538082ms step_avg:1002.67ms | |
step:4537/6000 train_time:4539092ms step_avg:1002.67ms | |
step:4538/6000 train_time:4540111ms step_avg:1002.67ms | |
step:4539/6000 train_time:4541111ms step_avg:1002.67ms | |
step:4540/6000 train_time:4542120ms step_avg:1002.68ms | |
step:4541/6000 train_time:4543134ms step_avg:1002.68ms | |
step:4542/6000 train_time:4544139ms step_avg:1002.68ms | |
step:4543/6000 train_time:4545136ms step_avg:1002.68ms | |
step:4544/6000 train_time:4546144ms step_avg:1002.68ms | |
step:4545/6000 train_time:4547151ms step_avg:1002.68ms | |
step:4546/6000 train_time:4548153ms step_avg:1002.68ms | |
step:4547/6000 train_time:4549161ms step_avg:1002.68ms | |
step:4548/6000 train_time:4550177ms step_avg:1002.68ms | |
step:4549/6000 train_time:4551184ms step_avg:1002.68ms | |
step:4550/6000 train_time:4552187ms step_avg:1002.68ms | |
step:4550/6000 val_loss:2.4841 train_time:4552233ms step_avg:1002.69ms perplexity:11.9903 param_count:85,137,462 | |
step:4551/6000 train_time:4553198ms step_avg:1002.69ms | |
step:4552/6000 train_time:4554204ms step_avg:1002.69ms | |
step:4553/6000 train_time:4555228ms step_avg:1002.69ms | |
step:4554/6000 train_time:4556238ms step_avg:1002.69ms | |
step:4555/6000 train_time:4557252ms step_avg:1002.70ms | |
step:4556/6000 train_time:4558274ms step_avg:1002.70ms | |
step:4557/6000 train_time:4559278ms step_avg:1002.70ms | |
step:4558/6000 train_time:4560286ms step_avg:1002.70ms | |
step:4559/6000 train_time:4561297ms step_avg:1002.70ms | |
step:4560/6000 train_time:4562303ms step_avg:1002.70ms | |
step:4561/6000 train_time:4563322ms step_avg:1002.71ms | |
step:4562/6000 train_time:4564326ms step_avg:1002.71ms | |
step:4563/6000 train_time:4565350ms step_avg:1002.71ms | |
step:4564/6000 train_time:4566355ms step_avg:1002.71ms | |
step:4565/6000 train_time:4567362ms step_avg:1002.71ms | |
step:4566/6000 train_time:4568358ms step_avg:1002.71ms | |
step:4567/6000 train_time:4569386ms step_avg:1002.72ms | |
step:4568/6000 train_time:4570390ms step_avg:1002.72ms | |
step:4569/6000 train_time:4571399ms step_avg:1002.72ms | |
step:4570/6000 train_time:4572393ms step_avg:1002.72ms | |
step:4571/6000 train_time:4573421ms step_avg:1002.72ms | |
step:4572/6000 train_time:4574433ms step_avg:1002.73ms | |
step:4573/6000 train_time:4575443ms step_avg:1002.73ms | |
step:4574/6000 train_time:4576439ms step_avg:1002.73ms | |
step:4575/6000 train_time:4577450ms step_avg:1002.73ms | |
step:4575/6000 val_loss:2.4893 train_time:4577497ms step_avg:1002.74ms perplexity:12.0533 param_count:85,137,462 | |
step:4576/6000 train_time:4578457ms step_avg:1002.73ms | |
step:4577/6000 train_time:4579459ms step_avg:1002.73ms | |
step:4578/6000 train_time:4580468ms step_avg:1002.73ms | |
step:4579/6000 train_time:4581465ms step_avg:1002.73ms | |
step:4580/6000 train_time:4582471ms step_avg:1002.73ms | |
step:4581/6000 train_time:4583475ms step_avg:1002.73ms | |
step:4582/6000 train_time:4584480ms step_avg:1002.73ms | |
step:4583/6000 train_time:4585480ms step_avg:1002.73ms | |
step:4584/6000 train_time:4586477ms step_avg:1002.73ms | |
step:4585/6000 train_time:4587491ms step_avg:1002.73ms | |
step:4586/6000 train_time:4588489ms step_avg:1002.73ms | |
step:4587/6000 train_time:4589505ms step_avg:1002.73ms | |
step:4588/6000 train_time:4590512ms step_avg:1002.73ms | |
step:4589/6000 train_time:4591540ms step_avg:1002.74ms | |
step:4590/6000 train_time:4592560ms step_avg:1002.74ms | |
step:4591/6000 train_time:4593563ms step_avg:1002.74ms | |
step:4592/6000 train_time:4594582ms step_avg:1002.75ms | |
step:4593/6000 train_time:4595592ms step_avg:1002.75ms | |
step:4594/6000 train_time:4596598ms step_avg:1002.75ms | |
step:4595/6000 train_time:4597612ms step_avg:1002.75ms | |
step:4596/6000 train_time:4598626ms step_avg:1002.75ms | |
step:4597/6000 train_time:4599634ms step_avg:1002.75ms | |
step:4598/6000 train_time:4600631ms step_avg:1002.75ms | |
step:4599/6000 train_time:4601634ms step_avg:1002.75ms | |
step:4600/6000 train_time:4602656ms step_avg:1002.76ms | |
step:4600/6000 val_loss:2.4848 train_time:4602702ms step_avg:1002.77ms perplexity:11.9982 param_count:85,137,462 | |
step:4601/6000 train_time:4603677ms step_avg:1002.76ms | |
step:4602/6000 train_time:4604678ms step_avg:1002.76ms | |
step:4603/6000 train_time:4605673ms step_avg:1002.76ms | |
step:4604/6000 train_time:4606671ms step_avg:1002.76ms | |
step:4605/6000 train_time:4607671ms step_avg:1002.76ms | |
step:4606/6000 train_time:4608679ms step_avg:1002.76ms | |
step:4607/6000 train_time:4609695ms step_avg:1002.76ms | |
step:4608/6000 train_time:4610699ms step_avg:1002.76ms | |
step:4609/6000 train_time:4611716ms step_avg:1002.76ms | |
step:4610/6000 train_time:4612730ms step_avg:1002.77ms | |
step:4611/6000 train_time:4613740ms step_avg:1002.77ms | |
step:4612/6000 train_time:4614754ms step_avg:1002.77ms | |
step:4613/6000 train_time:4615769ms step_avg:1002.77ms | |
step:4614/6000 train_time:4616764ms step_avg:1002.77ms | |
step:4615/6000 train_time:4617776ms step_avg:1002.77ms | |
step:4616/6000 train_time:4618783ms step_avg:1002.78ms | |
step:4617/6000 train_time:4619771ms step_avg:1002.77ms | |
step:4618/6000 train_time:4620795ms step_avg:1002.78ms | |
step:4619/6000 train_time:4621819ms step_avg:1002.78ms | |
step:4620/6000 train_time:4622835ms step_avg:1002.78ms | |
step:4621/6000 train_time:4623844ms step_avg:1002.79ms | |
step:4622/6000 train_time:4624870ms step_avg:1002.79ms | |
step:4623/6000 train_time:4625879ms step_avg:1002.79ms | |
step:4624/6000 train_time:4626894ms step_avg:1002.79ms | |
step:4625/6000 train_time:4627901ms step_avg:1002.80ms | |
step:4625/6000 val_loss:2.4894 train_time:4627948ms step_avg:1002.81ms perplexity:12.0539 param_count:85,137,462 | |
step:4626/6000 train_time:4628899ms step_avg:1002.79ms | |
step:4627/6000 train_time:4629908ms step_avg:1002.80ms | |
step:4628/6000 train_time:4630925ms step_avg:1002.80ms | |
step:4629/6000 train_time:4631938ms step_avg:1002.80ms | |
step:4630/6000 train_time:4632955ms step_avg:1002.80ms | |
step:4631/6000 train_time:4633965ms step_avg:1002.81ms | |
step:4632/6000 train_time:4634971ms step_avg:1002.81ms | |
step:4633/6000 train_time:4635992ms step_avg:1002.81ms | |
step:4634/6000 train_time:4637007ms step_avg:1002.81ms | |
step:4635/6000 train_time:4638011ms step_avg:1002.81ms | |
step:4636/6000 train_time:4639055ms step_avg:1002.82ms | |
step:4637/6000 train_time:4640049ms step_avg:1002.82ms | |
step:4638/6000 train_time:4641053ms step_avg:1002.82ms | |
step:4639/6000 train_time:4642056ms step_avg:1002.82ms | |
step:4640/6000 train_time:4643067ms step_avg:1002.82ms | |
step:4641/6000 train_time:4644085ms step_avg:1002.83ms | |
step:4642/6000 train_time:4645089ms step_avg:1002.83ms | |
step:4643/6000 train_time:4646110ms step_avg:1002.83ms | |
step:4644/6000 train_time:4647134ms step_avg:1002.83ms | |
step:4645/6000 train_time:4648142ms step_avg:1002.84ms | |
step:4646/6000 train_time:4649154ms step_avg:1002.84ms | |
step:4647/6000 train_time:4650158ms step_avg:1002.84ms | |
step:4648/6000 train_time:4651176ms step_avg:1002.84ms | |
step:4649/6000 train_time:4652181ms step_avg:1002.84ms | |
step:4650/6000 train_time:4653177ms step_avg:1002.84ms | |
step:4650/6000 val_loss:2.4909 train_time:4653224ms step_avg:1002.85ms perplexity:12.0726 param_count:85,137,462 | |
step:4651/6000 train_time:4654179ms step_avg:1002.84ms | |
step:4652/6000 train_time:4655198ms step_avg:1002.84ms | |
step:4653/6000 train_time:4656214ms step_avg:1002.85ms | |
step:4654/6000 train_time:4657227ms step_avg:1002.85ms | |
step:4655/6000 train_time:4658216ms step_avg:1002.85ms | |
step:4656/6000 train_time:4659218ms step_avg:1002.85ms | |
step:4657/6000 train_time:4660236ms step_avg:1002.85ms | |
step:4658/6000 train_time:4661243ms step_avg:1002.85ms | |
step:4659/6000 train_time:4662241ms step_avg:1002.85ms | |
step:4660/6000 train_time:4663246ms step_avg:1002.85ms | |
step:4661/6000 train_time:4664275ms step_avg:1002.85ms | |
step:4662/6000 train_time:4665280ms step_avg:1002.85ms | |
step:4663/6000 train_time:4666290ms step_avg:1002.86ms | |
step:4664/6000 train_time:4667294ms step_avg:1002.86ms | |
step:4665/6000 train_time:4668295ms step_avg:1002.86ms | |
step:4666/6000 train_time:4669304ms step_avg:1002.86ms | |
step:4667/6000 train_time:4670315ms step_avg:1002.86ms | |
step:4668/6000 train_time:4671327ms step_avg:1002.86ms | |
step:4669/6000 train_time:4672339ms step_avg:1002.86ms | |
step:4670/6000 train_time:4673351ms step_avg:1002.87ms | |
step:4671/6000 train_time:4674355ms step_avg:1002.87ms | |
step:4672/6000 train_time:4675367ms step_avg:1002.87ms | |
step:4673/6000 train_time:4676386ms step_avg:1002.87ms | |
step:4674/6000 train_time:4677394ms step_avg:1002.87ms | |
step:4675/6000 train_time:4678410ms step_avg:1002.87ms | |
step:4675/6000 val_loss:2.4764 train_time:4678456ms step_avg:1002.88ms perplexity:11.8978 param_count:85,137,462 | |
step:4676/6000 train_time:4679408ms step_avg:1002.87ms | |
step:4677/6000 train_time:4680417ms step_avg:1002.87ms | |
step:4678/6000 train_time:4681426ms step_avg:1002.88ms | |
step:4679/6000 train_time:4682430ms step_avg:1002.88ms | |
step:4680/6000 train_time:4683428ms step_avg:1002.88ms | |
step:4681/6000 train_time:4684435ms step_avg:1002.88ms | |
step:4682/6000 train_time:4685459ms step_avg:1002.88ms | |
step:4683/6000 train_time:4686464ms step_avg:1002.88ms | |
step:4684/6000 train_time:4687465ms step_avg:1002.88ms | |
step:4685/6000 train_time:4688479ms step_avg:1002.88ms | |
step:4686/6000 train_time:4689486ms step_avg:1002.88ms | |
step:4687/6000 train_time:4690498ms step_avg:1002.89ms | |
step:4688/6000 train_time:4691517ms step_avg:1002.89ms | |
step:4689/6000 train_time:4692520ms step_avg:1002.89ms | |
step:4690/6000 train_time:4693529ms step_avg:1002.89ms | |
step:4691/6000 train_time:4694534ms step_avg:1002.89ms | |
step:4692/6000 train_time:4695553ms step_avg:1002.89ms | |
step:4693/6000 train_time:4696556ms step_avg:1002.89ms | |
step:4694/6000 train_time:4697606ms step_avg:1002.90ms | |
step:4695/6000 train_time:4698617ms step_avg:1002.91ms | |
step:4696/6000 train_time:4699614ms step_avg:1002.91ms | |
step:4697/6000 train_time:4700616ms step_avg:1002.91ms | |
step:4698/6000 train_time:4701627ms step_avg:1002.91ms | |
step:4699/6000 train_time:4702628ms step_avg:1002.91ms | |
step:4700/6000 train_time:4703650ms step_avg:1002.91ms | |
step:4700/6000 val_loss:2.4877 train_time:4703697ms step_avg:1002.92ms perplexity:12.0330 param_count:85,137,462 | |
step:4701/6000 train_time:4704667ms step_avg:1002.91ms | |
step:4702/6000 train_time:4705674ms step_avg:1002.91ms | |
step:4703/6000 train_time:4706672ms step_avg:1002.91ms | |
step:4704/6000 train_time:4707685ms step_avg:1002.92ms | |
step:4705/6000 train_time:4708691ms step_avg:1002.92ms | |
step:4706/6000 train_time:4709704ms step_avg:1002.92ms | |
step:4707/6000 train_time:4710702ms step_avg:1002.92ms | |
step:4708/6000 train_time:4711716ms step_avg:1002.92ms | |
step:4709/6000 train_time:4712730ms step_avg:1002.92ms | |
step:4710/6000 train_time:4713737ms step_avg:1002.92ms | |
step:4711/6000 train_time:4714745ms step_avg:1002.92ms | |
step:4712/6000 train_time:4715770ms step_avg:1002.93ms | |
step:4713/6000 train_time:4716778ms step_avg:1002.93ms | |
step:4714/6000 train_time:4717781ms step_avg:1002.93ms | |
step:4715/6000 train_time:4718794ms step_avg:1002.93ms | |
step:4716/6000 train_time:4719803ms step_avg:1002.93ms | |
step:4717/6000 train_time:4720809ms step_avg:1002.93ms | |
step:4718/6000 train_time:4721812ms step_avg:1002.93ms | |
step:4719/6000 train_time:4722815ms step_avg:1002.93ms | |
step:4720/6000 train_time:4723834ms step_avg:1002.94ms | |
step:4721/6000 train_time:4724848ms step_avg:1002.94ms | |
step:4722/6000 train_time:4725861ms step_avg:1002.94ms | |
step:4723/6000 train_time:4726863ms step_avg:1002.94ms | |
step:4724/6000 train_time:4727879ms step_avg:1002.94ms | |
step:4725/6000 train_time:4728898ms step_avg:1002.95ms | |
step:4725/6000 val_loss:2.4802 train_time:4728943ms step_avg:1002.96ms perplexity:11.9437 param_count:85,137,462 | |
step:4726/6000 train_time:4729883ms step_avg:1002.94ms | |
step:4727/6000 train_time:4730902ms step_avg:1002.95ms | |
step:4728/6000 train_time:4731903ms step_avg:1002.95ms | |
step:4729/6000 train_time:4732899ms step_avg:1002.95ms | |
step:4730/6000 train_time:4733921ms step_avg:1002.95ms | |
step:4731/6000 train_time:4734920ms step_avg:1002.95ms | |
step:4732/6000 train_time:4735933ms step_avg:1002.95ms | |
step:4733/6000 train_time:4736936ms step_avg:1002.95ms | |
step:4734/6000 train_time:4737941ms step_avg:1002.95ms | |
step:4735/6000 train_time:4738958ms step_avg:1002.95ms | |
step:4736/6000 train_time:4739961ms step_avg:1002.95ms | |
step:4737/6000 train_time:4740971ms step_avg:1002.96ms | |
step:4738/6000 train_time:4741984ms step_avg:1002.96ms | |
step:4739/6000 train_time:4742981ms step_avg:1002.96ms | |
step:4740/6000 train_time:4743986ms step_avg:1002.96ms | |
step:4741/6000 train_time:4744987ms step_avg:1002.96ms | |
step:4742/6000 train_time:4746005ms step_avg:1002.96ms | |
step:4743/6000 train_time:4747007ms step_avg:1002.96ms | |
step:4744/6000 train_time:4748009ms step_avg:1002.96ms | |
step:4745/6000 train_time:4749021ms step_avg:1002.96ms | |
step:4746/6000 train_time:4750038ms step_avg:1002.96ms | |
step:4747/6000 train_time:4751065ms step_avg:1002.97ms | |
step:4748/6000 train_time:4752068ms step_avg:1002.97ms | |
step:4749/6000 train_time:4753077ms step_avg:1002.97ms | |
step:4750/6000 train_time:4754090ms step_avg:1002.97ms | |
step:4750/6000 val_loss:2.4777 train_time:4754136ms step_avg:1002.98ms perplexity:11.9133 param_count:85,137,462 | |
step:4751/6000 train_time:4755102ms step_avg:1002.97ms | |
step:4752/6000 train_time:4756101ms step_avg:1002.97ms | |
step:4753/6000 train_time:4757102ms step_avg:1002.97ms | |
step:4754/6000 train_time:4758093ms step_avg:1002.97ms | |
step:4755/6000 train_time:4759106ms step_avg:1002.97ms | |
step:4756/6000 train_time:4760114ms step_avg:1002.97ms | |
step:4757/6000 train_time:4761118ms step_avg:1002.97ms | |
step:4758/6000 train_time:4762125ms step_avg:1002.97ms | |
step:4759/6000 train_time:4763125ms step_avg:1002.97ms | |
step:4760/6000 train_time:4764154ms step_avg:1002.98ms | |
step:4761/6000 train_time:4765171ms step_avg:1002.98ms | |
step:4762/6000 train_time:4766165ms step_avg:1002.98ms | |
step:4763/6000 train_time:4767174ms step_avg:1002.98ms | |
step:4764/6000 train_time:4768186ms step_avg:1002.98ms | |
step:4765/6000 train_time:4769195ms step_avg:1002.99ms | |
step:4766/6000 train_time:4770199ms step_avg:1002.99ms | |
step:4767/6000 train_time:4771234ms step_avg:1002.99ms | |
step:4768/6000 train_time:4772246ms step_avg:1002.99ms | |
step:4769/6000 train_time:4773266ms step_avg:1003.00ms | |
step:4770/6000 train_time:4774282ms step_avg:1003.00ms | |
step:4771/6000 train_time:4775293ms step_avg:1003.00ms | |
step:4772/6000 train_time:4776304ms step_avg:1003.00ms | |
step:4773/6000 train_time:4777317ms step_avg:1003.01ms | |
step:4774/6000 train_time:4778326ms step_avg:1003.01ms | |
step:4775/6000 train_time:4779316ms step_avg:1003.00ms | |
step:4775/6000 val_loss:2.4782 train_time:4779362ms step_avg:1003.01ms perplexity:11.9199 param_count:85,137,462 | |
step:4776/6000 train_time:4780321ms step_avg:1003.00ms | |
step:4777/6000 train_time:4781336ms step_avg:1003.01ms | |
step:4778/6000 train_time:4782335ms step_avg:1003.01ms | |
step:4779/6000 train_time:4783337ms step_avg:1003.01ms | |
step:4780/6000 train_time:4784339ms step_avg:1003.01ms | |
step:4781/6000 train_time:4785345ms step_avg:1003.01ms | |
step:4782/6000 train_time:4786355ms step_avg:1003.01ms | |
step:4783/6000 train_time:4787397ms step_avg:1003.02ms | |
step:4784/6000 train_time:4788411ms step_avg:1003.02ms | |
step:4785/6000 train_time:4789423ms step_avg:1003.02ms | |
step:4786/6000 train_time:4790434ms step_avg:1003.02ms | |
step:4787/6000 train_time:4791438ms step_avg:1003.02ms | |
step:4788/6000 train_time:4792461ms step_avg:1003.03ms | |
step:4789/6000 train_time:4793485ms step_avg:1003.03ms | |
step:4790/6000 train_time:4794491ms step_avg:1003.03ms | |
step:4791/6000 train_time:4795493ms step_avg:1003.03ms | |
step:4792/6000 train_time:4796500ms step_avg:1003.03ms | |
step:4793/6000 train_time:4797514ms step_avg:1003.03ms | |
step:4794/6000 train_time:4798527ms step_avg:1003.04ms | |
step:4795/6000 train_time:4799543ms step_avg:1003.04ms | |
step:4796/6000 train_time:4800549ms step_avg:1003.04ms | |
step:4797/6000 train_time:4801573ms step_avg:1003.04ms | |
step:4798/6000 train_time:4802586ms step_avg:1003.05ms | |
step:4799/6000 train_time:4803600ms step_avg:1003.05ms | |
step:4800/6000 train_time:4804619ms step_avg:1003.05ms | |
step:4800/6000 val_loss:2.4810 train_time:4804665ms step_avg:1003.06ms perplexity:11.9531 param_count:85,137,462 | |
step:4801/6000 train_time:4805631ms step_avg:1003.05ms | |
step:4802/6000 train_time:4806663ms step_avg:1003.06ms | |
step:4803/6000 train_time:4807658ms step_avg:1003.06ms | |
step:4804/6000 train_time:4808680ms step_avg:1003.06ms | |
step:4805/6000 train_time:4809683ms step_avg:1003.06ms | |
step:4806/6000 train_time:4810703ms step_avg:1003.07ms | |
step:4807/6000 train_time:4811722ms step_avg:1003.07ms | |
step:4808/6000 train_time:4812726ms step_avg:1003.07ms | |
step:4809/6000 train_time:4813743ms step_avg:1003.07ms | |
step:4810/6000 train_time:4814752ms step_avg:1003.07ms | |
step:4811/6000 train_time:4815759ms step_avg:1003.07ms | |
step:4812/6000 train_time:4816781ms step_avg:1003.08ms | |
step:4813/6000 train_time:4817806ms step_avg:1003.08ms | |
step:4814/6000 train_time:4818816ms step_avg:1003.08ms | |
step:4815/6000 train_time:4819820ms step_avg:1003.08ms | |
step:4816/6000 train_time:4820835ms step_avg:1003.09ms | |
step:4817/6000 train_time:4821837ms step_avg:1003.09ms | |
step:4818/6000 train_time:4822846ms step_avg:1003.09ms | |
step:4819/6000 train_time:4823860ms step_avg:1003.09ms | |
step:4820/6000 train_time:4824886ms step_avg:1003.09ms | |
step:4821/6000 train_time:4825894ms step_avg:1003.10ms | |
step:4822/6000 train_time:4826901ms step_avg:1003.10ms | |
step:4823/6000 train_time:4827913ms step_avg:1003.10ms | |
step:4824/6000 train_time:4828928ms step_avg:1003.10ms | |
step:4825/6000 train_time:4829932ms step_avg:1003.10ms | |
step:4825/6000 val_loss:2.4766 train_time:4829975ms step_avg:1003.11ms perplexity:11.9003 param_count:85,137,462 | |
step:4826/6000 train_time:4830936ms step_avg:1003.10ms | |
step:4827/6000 train_time:4831932ms step_avg:1003.10ms | |
step:4828/6000 train_time:4832938ms step_avg:1003.10ms | |
step:4829/6000 train_time:4833947ms step_avg:1003.10ms | |
step:4830/6000 train_time:4834954ms step_avg:1003.10ms | |
step:4831/6000 train_time:4835960ms step_avg:1003.10ms | |
step:4832/6000 train_time:4836971ms step_avg:1003.10ms | |
step:4833/6000 train_time:4837971ms step_avg:1003.10ms | |
step:4834/6000 train_time:4838981ms step_avg:1003.11ms | |
step:4835/6000 train_time:4840003ms step_avg:1003.11ms | |
step:4836/6000 train_time:4841022ms step_avg:1003.11ms | |
step:4837/6000 train_time:4842038ms step_avg:1003.12ms | |
step:4838/6000 train_time:4843064ms step_avg:1003.12ms | |
step:4839/6000 train_time:4844080ms step_avg:1003.12ms | |
step:4840/6000 train_time:4845103ms step_avg:1003.13ms | |
step:4841/6000 train_time:4846100ms step_avg:1003.13ms | |
step:4842/6000 train_time:4847115ms step_avg:1003.13ms | |
step:4843/6000 train_time:4848119ms step_avg:1003.13ms | |
step:4844/6000 train_time:4849136ms step_avg:1003.13ms | |
step:4845/6000 train_time:4850142ms step_avg:1003.13ms | |
step:4846/6000 train_time:4851139ms step_avg:1003.13ms | |
step:4847/6000 train_time:4852160ms step_avg:1003.13ms | |
step:4848/6000 train_time:4853169ms step_avg:1003.14ms | |
step:4849/6000 train_time:4854171ms step_avg:1003.14ms | |
step:4850/6000 train_time:4855188ms step_avg:1003.14ms | |
step:4850/6000 val_loss:2.4738 train_time:4855234ms step_avg:1003.15ms perplexity:11.8678 param_count:85,137,462 | |
step:4851/6000 train_time:4856191ms step_avg:1003.14ms | |
step:4852/6000 train_time:4857189ms step_avg:1003.14ms | |
step:4853/6000 train_time:4858201ms step_avg:1003.14ms | |
step:4854/6000 train_time:4859222ms step_avg:1003.14ms | |
step:4855/6000 train_time:4860229ms step_avg:1003.14ms | |
step:4856/6000 train_time:4861240ms step_avg:1003.14ms | |
step:4857/6000 train_time:4862245ms step_avg:1003.15ms | |
step:4858/6000 train_time:4863248ms step_avg:1003.15ms | |
step:4859/6000 train_time:4864253ms step_avg:1003.15ms | |
step:4860/6000 train_time:4865258ms step_avg:1003.15ms | |
step:4861/6000 train_time:4866316ms step_avg:1003.16ms | |
step:4862/6000 train_time:4867339ms step_avg:1003.16ms | |
step:4863/6000 train_time:4868335ms step_avg:1003.16ms | |
step:4864/6000 train_time:4869341ms step_avg:1003.16ms | |
step:4865/6000 train_time:4870344ms step_avg:1003.16ms | |
step:4866/6000 train_time:4871361ms step_avg:1003.16ms | |
step:4867/6000 train_time:4872372ms step_avg:1003.16ms | |
step:4868/6000 train_time:4873368ms step_avg:1003.16ms | |
step:4869/6000 train_time:4874378ms step_avg:1003.16ms | |
step:4870/6000 train_time:4875397ms step_avg:1003.17ms | |
step:4871/6000 train_time:4876400ms step_avg:1003.17ms | |
step:4872/6000 train_time:4877408ms step_avg:1003.17ms | |
step:4873/6000 train_time:4878445ms step_avg:1003.18ms | |
step:4874/6000 train_time:4879445ms step_avg:1003.18ms | |
step:4875/6000 train_time:4880457ms step_avg:1003.18ms | |
step:4875/6000 val_loss:2.4851 train_time:4880504ms step_avg:1003.19ms perplexity:12.0018 param_count:85,137,462 | |
step:4876/6000 train_time:4881465ms step_avg:1003.18ms | |
step:4877/6000 train_time:4882471ms step_avg:1003.18ms | |
step:4878/6000 train_time:4883470ms step_avg:1003.18ms | |
step:4879/6000 train_time:4884482ms step_avg:1003.18ms | |
step:4880/6000 train_time:4885498ms step_avg:1003.18ms | |
step:4881/6000 train_time:4886531ms step_avg:1003.19ms | |
step:4882/6000 train_time:4887542ms step_avg:1003.19ms | |
step:4883/6000 train_time:4888546ms step_avg:1003.19ms | |
step:4884/6000 train_time:4889541ms step_avg:1003.19ms | |
step:4885/6000 train_time:4890576ms step_avg:1003.20ms | |
step:4886/6000 train_time:4891582ms step_avg:1003.20ms | |
step:4887/6000 train_time:4892583ms step_avg:1003.20ms | |
step:4888/6000 train_time:4893580ms step_avg:1003.19ms | |
step:4889/6000 train_time:4894590ms step_avg:1003.20ms | |
step:4890/6000 train_time:4895593ms step_avg:1003.20ms | |
step:4891/6000 train_time:4896615ms step_avg:1003.20ms | |
step:4892/6000 train_time:4897622ms step_avg:1003.20ms | |
step:4893/6000 train_time:4898635ms step_avg:1003.20ms | |
step:4894/6000 train_time:4899653ms step_avg:1003.20ms | |
step:4895/6000 train_time:4900662ms step_avg:1003.21ms | |
step:4896/6000 train_time:4901691ms step_avg:1003.21ms | |
step:4897/6000 train_time:4902732ms step_avg:1003.22ms | |
step:4898/6000 train_time:4903732ms step_avg:1003.22ms | |
step:4899/6000 train_time:4904743ms step_avg:1003.22ms | |
step:4900/6000 train_time:4905756ms step_avg:1003.22ms | |
step:4900/6000 val_loss:2.4776 train_time:4905803ms step_avg:1003.23ms perplexity:11.9127 param_count:85,137,462 | |
step:4901/6000 train_time:4906754ms step_avg:1003.22ms | |
step:4902/6000 train_time:4907756ms step_avg:1003.22ms | |
step:4903/6000 train_time:4908755ms step_avg:1003.22ms | |
step:4904/6000 train_time:4909754ms step_avg:1003.22ms | |
step:4905/6000 train_time:4910768ms step_avg:1003.22ms | |
step:4906/6000 train_time:4911767ms step_avg:1003.22ms | |
step:4907/6000 train_time:4912798ms step_avg:1003.23ms | |
step:4908/6000 train_time:4913805ms step_avg:1003.23ms | |
step:4909/6000 train_time:4914808ms step_avg:1003.23ms | |
step:4910/6000 train_time:4915808ms step_avg:1003.23ms | |
step:4911/6000 train_time:4916814ms step_avg:1003.23ms | |
step:4912/6000 train_time:4917829ms step_avg:1003.23ms | |
step:4913/6000 train_time:4918844ms step_avg:1003.23ms | |
step:4914/6000 train_time:4919860ms step_avg:1003.23ms | |
step:4915/6000 train_time:4920871ms step_avg:1003.24ms | |
step:4916/6000 train_time:4921880ms step_avg:1003.24ms | |
step:4917/6000 train_time:4922885ms step_avg:1003.24ms | |
step:4918/6000 train_time:4923913ms step_avg:1003.24ms | |
step:4919/6000 train_time:4924919ms step_avg:1003.24ms | |
step:4920/6000 train_time:4925938ms step_avg:1003.25ms | |
step:4921/6000 train_time:4926940ms step_avg:1003.25ms | |
step:4922/6000 train_time:4927953ms step_avg:1003.25ms | |
step:4923/6000 train_time:4928947ms step_avg:1003.25ms | |
step:4924/6000 train_time:4929963ms step_avg:1003.25ms | |
step:4925/6000 train_time:4930967ms step_avg:1003.25ms | |
step:4925/6000 val_loss:2.4788 train_time:4931013ms step_avg:1003.26ms perplexity:11.9268 param_count:85,137,462 | |
step:4926/6000 train_time:4931974ms step_avg:1003.25ms | |
step:4927/6000 train_time:4933008ms step_avg:1003.26ms | |
step:4928/6000 train_time:4934010ms step_avg:1003.26ms | |
step:4929/6000 train_time:4935019ms step_avg:1003.26ms | |
step:4930/6000 train_time:4936023ms step_avg:1003.26ms | |
step:4931/6000 train_time:4937046ms step_avg:1003.26ms | |
step:4932/6000 train_time:4938074ms step_avg:1003.27ms | |
step:4933/6000 train_time:4939086ms step_avg:1003.27ms | |
step:4934/6000 train_time:4940092ms step_avg:1003.27ms | |
step:4935/6000 train_time:4941109ms step_avg:1003.27ms | |
step:4936/6000 train_time:4942102ms step_avg:1003.27ms | |
step:4937/6000 train_time:4943105ms step_avg:1003.27ms | |
step:4938/6000 train_time:4944118ms step_avg:1003.27ms | |
step:4939/6000 train_time:4945139ms step_avg:1003.27ms | |
step:4940/6000 train_time:4946167ms step_avg:1003.28ms | |
step:4941/6000 train_time:4947181ms step_avg:1003.28ms | |
step:4942/6000 train_time:4948200ms step_avg:1003.28ms | |
step:4943/6000 train_time:4949208ms step_avg:1003.29ms | |
step:4944/6000 train_time:4950219ms step_avg:1003.29ms | |
step:4945/6000 train_time:4951246ms step_avg:1003.29ms | |
step:4946/6000 train_time:4952244ms step_avg:1003.29ms | |
step:4947/6000 train_time:4953259ms step_avg:1003.29ms | |
step:4948/6000 train_time:4954261ms step_avg:1003.29ms | |
step:4949/6000 train_time:4955269ms step_avg:1003.29ms | |
step:4950/6000 train_time:4956269ms step_avg:1003.29ms | |
step:4950/6000 val_loss:2.4865 train_time:4956316ms step_avg:1003.30ms perplexity:12.0186 param_count:85,137,462 | |
step:4951/6000 train_time:4957267ms step_avg:1003.29ms | |
step:4952/6000 train_time:4958277ms step_avg:1003.29ms | |
step:4953/6000 train_time:4959298ms step_avg:1003.30ms | |
step:4954/6000 train_time:4960307ms step_avg:1003.30ms | |
step:4955/6000 train_time:4961318ms step_avg:1003.30ms | |
step:4956/6000 train_time:4962333ms step_avg:1003.30ms | |
step:4957/6000 train_time:4963338ms step_avg:1003.30ms | |
step:4958/6000 train_time:4964336ms step_avg:1003.30ms | |
step:4959/6000 train_time:4965341ms step_avg:1003.30ms | |
step:4960/6000 train_time:4966369ms step_avg:1003.31ms | |
step:4961/6000 train_time:4967374ms step_avg:1003.31ms | |
step:4962/6000 train_time:4968388ms step_avg:1003.31ms | |
step:4963/6000 train_time:4969391ms step_avg:1003.31ms | |
step:4964/6000 train_time:4970394ms step_avg:1003.31ms | |
step:4965/6000 train_time:4971398ms step_avg:1003.31ms | |
step:4966/6000 train_time:4972407ms step_avg:1003.31ms | |
step:4967/6000 train_time:4973403ms step_avg:1003.31ms | |
step:4968/6000 train_time:4974413ms step_avg:1003.31ms | |
step:4969/6000 train_time:4975422ms step_avg:1003.31ms | |
step:4970/6000 train_time:4976431ms step_avg:1003.31ms | |
step:4971/6000 train_time:4977447ms step_avg:1003.32ms | |
step:4972/6000 train_time:4978439ms step_avg:1003.31ms | |
step:4973/6000 train_time:4979440ms step_avg:1003.31ms | |
step:4974/6000 train_time:4980432ms step_avg:1003.31ms | |
step:4975/6000 train_time:4981426ms step_avg:1003.31ms | |
step:4975/6000 val_loss:2.4818 train_time:4981472ms step_avg:1003.32ms perplexity:11.9630 param_count:85,137,462 | |
step:4976/6000 train_time:4982426ms step_avg:1003.31ms | |
step:4977/6000 train_time:4983426ms step_avg:1003.31ms | |
step:4978/6000 train_time:4984435ms step_avg:1003.31ms | |
step:4979/6000 train_time:4985457ms step_avg:1003.31ms | |
step:4980/6000 train_time:4986474ms step_avg:1003.31ms | |
step:4981/6000 train_time:4987474ms step_avg:1003.31ms | |
step:4982/6000 train_time:4988484ms step_avg:1003.32ms | |
step:4983/6000 train_time:4989489ms step_avg:1003.32ms | |
step:4984/6000 train_time:4990483ms step_avg:1003.31ms | |
step:4985/6000 train_time:4991476ms step_avg:1003.31ms | |
step:4986/6000 train_time:4992501ms step_avg:1003.32ms | |
step:4987/6000 train_time:4993515ms step_avg:1003.32ms | |
step:4988/6000 train_time:4994534ms step_avg:1003.32ms | |
step:4989/6000 train_time:4995531ms step_avg:1003.32ms | |
step:4990/6000 train_time:4996525ms step_avg:1003.32ms | |
step:4991/6000 train_time:4997527ms step_avg:1003.32ms | |
step:4992/6000 train_time:4998528ms step_avg:1003.32ms | |
step:4993/6000 train_time:4999552ms step_avg:1003.32ms | |
step:4994/6000 train_time:5000561ms step_avg:1003.32ms | |
step:4995/6000 train_time:5001557ms step_avg:1003.32ms | |
step:4996/6000 train_time:5002559ms step_avg:1003.32ms | |
step:4997/6000 train_time:5003590ms step_avg:1003.33ms | |
step:4998/6000 train_time:5004593ms step_avg:1003.33ms | |
step:4999/6000 train_time:5005599ms step_avg:1003.33ms | |
step:5000/6000 train_time:5006614ms step_avg:1003.33ms | |
step:5000/6000 val_loss:2.4722 train_time:5006660ms step_avg:1003.34ms perplexity:11.8483 param_count:85,137,462 | |
step:5001/6000 train_time:5007627ms step_avg:1003.33ms | |
step:5002/6000 train_time:5008632ms step_avg:1003.33ms | |
step:5003/6000 train_time:5009639ms step_avg:1003.33ms | |
step:5004/6000 train_time:5010651ms step_avg:1003.33ms | |
step:5005/6000 train_time:5011658ms step_avg:1003.33ms | |
step:5006/6000 train_time:5012663ms step_avg:1003.34ms | |
step:5007/6000 train_time:5013696ms step_avg:1003.34ms | |
step:5008/6000 train_time:5014693ms step_avg:1003.34ms | |
step:5009/6000 train_time:5015699ms step_avg:1003.34ms | |
step:5010/6000 train_time:5016708ms step_avg:1003.34ms | |
step:5011/6000 train_time:5017710ms step_avg:1003.34ms | |
step:5012/6000 train_time:5018710ms step_avg:1003.34ms | |
step:5013/6000 train_time:5019721ms step_avg:1003.34ms | |
step:5014/6000 train_time:5020745ms step_avg:1003.35ms | |
step:5015/6000 train_time:5021758ms step_avg:1003.35ms | |
step:5016/6000 train_time:5022760ms step_avg:1003.35ms | |
step:5017/6000 train_time:5023773ms step_avg:1003.35ms | |
step:5018/6000 train_time:5024802ms step_avg:1003.35ms | |
step:5019/6000 train_time:5025808ms step_avg:1003.36ms | |
step:5020/6000 train_time:5026822ms step_avg:1003.36ms | |
step:5021/6000 train_time:5027833ms step_avg:1003.36ms | |
step:5022/6000 train_time:5028834ms step_avg:1003.36ms | |
step:5023/6000 train_time:5029835ms step_avg:1003.36ms | |
step:5024/6000 train_time:5030843ms step_avg:1003.36ms | |
step:5025/6000 train_time:5031862ms step_avg:1003.36ms | |
step:5025/6000 val_loss:2.4781 train_time:5031908ms step_avg:1003.37ms perplexity:11.9184 param_count:85,137,462 | |
step:5026/6000 train_time:5032878ms step_avg:1003.36ms | |
step:5027/6000 train_time:5033886ms step_avg:1003.37ms | |
step:5028/6000 train_time:5034882ms step_avg:1003.36ms | |
step:5029/6000 train_time:5035885ms step_avg:1003.36ms | |
step:5030/6000 train_time:5036888ms step_avg:1003.36ms | |
step:5031/6000 train_time:5037900ms step_avg:1003.37ms | |
step:5032/6000 train_time:5038895ms step_avg:1003.36ms | |
step:5033/6000 train_time:5039902ms step_avg:1003.36ms | |
step:5034/6000 train_time:5040911ms step_avg:1003.37ms | |
step:5035/6000 train_time:5041909ms step_avg:1003.37ms | |
step:5036/6000 train_time:5042909ms step_avg:1003.36ms | |
step:5037/6000 train_time:5043904ms step_avg:1003.36ms | |
step:5038/6000 train_time:5044922ms step_avg:1003.37ms | |
step:5039/6000 train_time:5045953ms step_avg:1003.37ms | |
step:5040/6000 train_time:5046947ms step_avg:1003.37ms | |
step:5041/6000 train_time:5047955ms step_avg:1003.37ms | |
step:5042/6000 train_time:5048953ms step_avg:1003.37ms | |
step:5043/6000 train_time:5049963ms step_avg:1003.37ms | |
step:5044/6000 train_time:5050964ms step_avg:1003.37ms | |
step:5045/6000 train_time:5051979ms step_avg:1003.37ms | |
step:5046/6000 train_time:5052994ms step_avg:1003.37ms | |
step:5047/6000 train_time:5054013ms step_avg:1003.38ms | |
step:5048/6000 train_time:5055022ms step_avg:1003.38ms | |
step:5049/6000 train_time:5056034ms step_avg:1003.38ms | |
step:5050/6000 train_time:5057029ms step_avg:1003.38ms | |
step:5050/6000 val_loss:2.4763 train_time:5057071ms step_avg:1003.39ms perplexity:11.8977 param_count:85,137,462 | |
step:5051/6000 train_time:5058029ms step_avg:1003.38ms | |
step:5052/6000 train_time:5059034ms step_avg:1003.38ms | |
step:5053/6000 train_time:5060029ms step_avg:1003.38ms | |
step:5054/6000 train_time:5061037ms step_avg:1003.38ms | |
step:5055/6000 train_time:5062037ms step_avg:1003.38ms | |
step:5056/6000 train_time:5063058ms step_avg:1003.38ms | |
step:5057/6000 train_time:5064058ms step_avg:1003.38ms | |
step:5058/6000 train_time:5065059ms step_avg:1003.38ms | |
step:5059/6000 train_time:5066065ms step_avg:1003.38ms | |
step:5060/6000 train_time:5067080ms step_avg:1003.38ms | |
step:5061/6000 train_time:5068090ms step_avg:1003.38ms | |
step:5062/6000 train_time:5069101ms step_avg:1003.38ms | |
step:5063/6000 train_time:5070109ms step_avg:1003.39ms | |
step:5064/6000 train_time:5071127ms step_avg:1003.39ms | |
step:5065/6000 train_time:5072139ms step_avg:1003.39ms | |
step:5066/6000 train_time:5073152ms step_avg:1003.39ms | |
step:5067/6000 train_time:5074162ms step_avg:1003.39ms | |
step:5068/6000 train_time:5075175ms step_avg:1003.40ms | |
step:5069/6000 train_time:5076183ms step_avg:1003.40ms | |
step:5070/6000 train_time:5077197ms step_avg:1003.40ms | |
step:5071/6000 train_time:5078211ms step_avg:1003.40ms | |
step:5072/6000 train_time:5079249ms step_avg:1003.41ms | |
step:5073/6000 train_time:5080246ms step_avg:1003.41ms | |
step:5074/6000 train_time:5081257ms step_avg:1003.41ms | |
step:5075/6000 train_time:5082274ms step_avg:1003.41ms | |
step:5075/6000 val_loss:2.4801 train_time:5082321ms step_avg:1003.42ms perplexity:11.9423 param_count:85,137,462 | |
step:5076/6000 train_time:5083290ms step_avg:1003.41ms | |
step:5077/6000 train_time:5084286ms step_avg:1003.41ms | |
step:5078/6000 train_time:5085294ms step_avg:1003.41ms | |
step:5079/6000 train_time:5086310ms step_avg:1003.41ms | |
step:5080/6000 train_time:5087327ms step_avg:1003.42ms | |
step:5081/6000 train_time:5088329ms step_avg:1003.42ms | |
step:5082/6000 train_time:5089337ms step_avg:1003.42ms | |
step:5083/6000 train_time:5090335ms step_avg:1003.42ms | |
step:5084/6000 train_time:5091341ms step_avg:1003.42ms | |
step:5085/6000 train_time:5092335ms step_avg:1003.42ms | |
step:5086/6000 train_time:5093329ms step_avg:1003.41ms | |
step:5087/6000 train_time:5094335ms step_avg:1003.41ms | |
step:5088/6000 train_time:5095342ms step_avg:1003.42ms | |
step:5089/6000 train_time:5096365ms step_avg:1003.42ms | |
step:5090/6000 train_time:5097363ms step_avg:1003.42ms | |
step:5091/6000 train_time:5098368ms step_avg:1003.42ms | |
step:5092/6000 train_time:5099420ms step_avg:1003.43ms | |
step:5093/6000 train_time:5100426ms step_avg:1003.43ms | |
step:5094/6000 train_time:5101451ms step_avg:1003.43ms | |
step:5095/6000 train_time:5102443ms step_avg:1003.43ms | |
step:5096/6000 train_time:5103432ms step_avg:1003.43ms | |
step:5097/6000 train_time:5104442ms step_avg:1003.43ms | |
step:5098/6000 train_time:5105447ms step_avg:1003.43ms | |
step:5099/6000 train_time:5106450ms step_avg:1003.43ms | |
step:5100/6000 train_time:5107462ms step_avg:1003.43ms | |
step:5100/6000 val_loss:2.4714 train_time:5107508ms step_avg:1003.44ms perplexity:11.8395 param_count:85,137,462 | |
step:5101/6000 train_time:5108477ms step_avg:1003.43ms | |
step:5102/6000 train_time:5109474ms step_avg:1003.43ms | |
step:5103/6000 train_time:5110477ms step_avg:1003.43ms | |
step:5104/6000 train_time:5111477ms step_avg:1003.43ms | |
step:5105/6000 train_time:5112477ms step_avg:1003.43ms | |
step:5106/6000 train_time:5113493ms step_avg:1003.43ms | |
step:5107/6000 train_time:5114493ms step_avg:1003.43ms | |
step:5108/6000 train_time:5115497ms step_avg:1003.43ms | |
step:5109/6000 train_time:5116497ms step_avg:1003.43ms | |
step:5110/6000 train_time:5117507ms step_avg:1003.43ms | |
step:5111/6000 train_time:5118514ms step_avg:1003.43ms | |
step:5112/6000 train_time:5119519ms step_avg:1003.43ms | |
step:5113/6000 train_time:5120559ms step_avg:1003.44ms | |
step:5114/6000 train_time:5121545ms step_avg:1003.44ms | |
step:5115/6000 train_time:5122543ms step_avg:1003.44ms | |
step:5116/6000 train_time:5123555ms step_avg:1003.44ms | |
step:5117/6000 train_time:5124550ms step_avg:1003.44ms | |
step:5118/6000 train_time:5125556ms step_avg:1003.44ms | |
step:5119/6000 train_time:5126580ms step_avg:1003.44ms | |
step:5120/6000 train_time:5127584ms step_avg:1003.44ms | |
step:5121/6000 train_time:5128592ms step_avg:1003.44ms | |
step:5122/6000 train_time:5129593ms step_avg:1003.44ms | |
step:5123/6000 train_time:5130591ms step_avg:1003.44ms | |
step:5124/6000 train_time:5131597ms step_avg:1003.44ms | |
step:5125/6000 train_time:5132606ms step_avg:1003.44ms | |
step:5125/6000 val_loss:2.4806 train_time:5132651ms step_avg:1003.45ms perplexity:11.9481 param_count:85,137,462 | |
step:5126/6000 train_time:5133610ms step_avg:1003.44ms | |
step:5127/6000 train_time:5134618ms step_avg:1003.44ms | |
step:5128/6000 train_time:5135634ms step_avg:1003.45ms | |
step:5129/6000 train_time:5136639ms step_avg:1003.45ms | |
step:5130/6000 train_time:5137653ms step_avg:1003.45ms | |
step:5131/6000 train_time:5138652ms step_avg:1003.45ms | |
step:5132/6000 train_time:5139670ms step_avg:1003.45ms | |
step:5133/6000 train_time:5140667ms step_avg:1003.45ms | |
step:5134/6000 train_time:5141679ms step_avg:1003.45ms | |
step:5135/6000 train_time:5142689ms step_avg:1003.45ms | |
step:5136/6000 train_time:5143710ms step_avg:1003.45ms | |
step:5137/6000 train_time:5144711ms step_avg:1003.45ms |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment