Code auto-generated by Dali for the fused softmax expression below:
auto a = op::uniform(-20.0, 20.0, {2, 5}).astype(dtype);
a.eval();
auto exped = op::exp(a - op::max(a, {-1}, true));
auto fused_softmax = exped / op::sum(exped, {-1}, true);
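For reference, every generated kernel below evaluates this same numerically stable softmax along the last axis. A minimal CPU sketch of the computation (the softmax_rows helper and its row-major rows/cols layout are illustrative assumptions, not part of Dali's API):
#include <algorithm>
#include <cmath>
#include <vector>

// Row-wise softmax over a row-major rows x cols buffer:
// out[r, c] = exp(a[r, c] - max_c a[r, c]) / sum_c exp(a[r, c] - max_c a[r, c])
void softmax_rows(const std::vector<double>& a, std::vector<double>& out, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        const double* row = &a[r * cols];
        double* out_row = &out[r * cols];
        // subtract the row maximum before exponentiating for numerical stability
        double row_max = *std::max_element(row, row + cols);
        double total = 0.0;
        for (int c = 0; c < cols; ++c) {
            out_row[c] = std::exp(row[c] - row_max);
            total += out_row[c];
        }
        for (int c = 0; c < cols; ++c) {
            out_row[c] /= total;
        }
    }
}
// ---- Generated variant 1 (below): grid-stride assignment kernel; the row max and the row sum are both serial per-output-element loops (AxisReduceKernel2). ----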
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2>
void __global__
assign_kernel_11204438187360910603(Destination0 dst0, Source0 src0, int num_el0, Shape<2> shape0, Destination1 dst1, Source1 src1, int num_el1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < num_el0; i += stride) {
auto nd_idx = index_to_dim(i, shape0);
dst0[nd_idx] = src0[nd_idx];
}
__syncthreads();
for (int i = idx; i < num_el1; i += stride) {
auto nd_idx = index_to_dim(i, shape1);
dst1[nd_idx] = src1[nd_idx];
}
__syncthreads();
for (int i = idx; i < num_el2; i += stride) {
auto nd_idx = index_to_dim(i, shape2);
dst2[nd_idx] = src2[nd_idx];
}
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, typename Type, typename C1>
struct AxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE AxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
T res;
Reducer::SetInitValue(res);
int& i1 = query[1];
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) {
Reducer::Reduce(res, arg_1_view_[query]);
}
return res;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d(
C1 arg_1_view) {
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<double, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<double, 2>(array_data[1], offsets[1], sizes[1]);
Shape<2> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<2> shape_5(shapes[5]);
auto temp_0 = make_view<double, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<double, 2>(array_data[3], offsets[3], sizes[3]);
const int NT = 128;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
int max_num_el = num_el0;
auto dest_shape_1 = temp_1.shape();
int num_el1 = dest_shape_1.numel();
max_num_el = max(num_el1, max_num_el);
auto dest_shape_2 = array_0_view.shape();
int num_el2 = dest_shape_2.numel();
max_num_el = max(num_el2, max_num_el);
int grid_size = div_ceil(max_num_el, NT);
// assert(grid_size <= MAX_BLOCKS);
assign_kernel_11204438187360910603<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_1),
num_el0,
dest_shape_0,
temp_1,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::sum, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4)))), shape_3),
num_el1,
dest_shape_1,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_0))), broadcasted_reshapeFT(temp_1, shape_2)),
num_el2,
dest_shape_2
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, int x_bits, typename DType>
inline __device__ void ReduceX2(volatile DType buf[], int tid) {
if (x_bits >= 10) {
if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]);
__syncthreads();
}
if (x_bits >= 9) {
if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]);
__syncthreads();
}
if (x_bits >= 8) {
if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]);
__syncthreads();
}
if (x_bits >= 7) {
if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]);
__syncthreads();
}
if (x_bits >= 6) {
if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]);
__syncthreads();
}
// in-warp optimization: the remaining reduction fits within a single warp
if (x_bits >= 5) {
if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]);
__syncthreads();
}
if (x_bits >= 4) {
if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]);
__syncthreads();
}
if (x_bits >= 3) {
if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]);
__syncthreads();
}
if (x_bits >= 2) {
if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]);
__syncthreads();
}
if (x_bits >= 1) {
if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]);
__syncthreads();
}
}
template<typename Reducer, typename Type, typename C1>
struct WarpAxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE WarpAxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
inline __device__ T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
__shared__ T buffer[256];
query[1] = threadIdx.x;
if (threadIdx.x < arg_1_view_.shape()[1]) {
buffer[threadIdx.x] = arg_1_view_[query];
}
for (unsigned x = 256; x < arg_1_view_.shape()[1]; x += 256) {
const int col = x + threadIdx.x;
if (col < arg_1_view_.shape()[1]) {
query[1] = col;
Reducer::Reduce(buffer[threadIdx.x], arg_1_view_[query]);
}
}
__syncthreads();
// if the number of reduced columns is smaller than the block,
// fill the remaining buffer entries with the reducer's neutral value
if (threadIdx.x >= arg_1_view_.shape()[1]) {
Reducer::SetInitValue(buffer[threadIdx.x]);
}
__syncthreads();
ReduceX2<Reducer, 8>(buffer, threadIdx.x);
return buffer[0];
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE WarpAxisReduceKernel2<Reducer, Type, C1> warp_axis_reduce_kernel_2d(
C1 arg_1_view) {
return WarpAxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
template<typename Reducer, typename Type, typename C1>
struct AxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE AxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
T res;
Reducer::SetInitValue(res);
int& i1 = query[1];
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) {
Reducer::Reduce(res, arg_1_view_[query]);
}
return res;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d(
C1 arg_1_view) {
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2>
void __global__
assign_kernel_16960456299461880923(Destination0 dst0, Source0 src0, Shape<2> shape0, Destination1 dst1, Source1 src1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2) {
int stride = blockDim.x * gridDim.x;
int idx = blockIdx.x;
// one block per output element: guard against blocks past the end of each destination
if (idx < shape0[0] * shape0[1]) {
auto nd_idx_2 = index_to_dim(idx, shape0);
dst0[nd_idx_2] = src0[nd_idx_2];
}
__syncthreads();
if (idx < shape1[0] * shape1[1]) {
auto nd_idx_2 = index_to_dim(idx, shape1);
dst1[nd_idx_2] = src1[nd_idx_2];
}
__syncthreads();
idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx; i < num_el2; i += stride) {
auto nd_idx = index_to_dim(i, shape2);
dst2[nd_idx] = src2[nd_idx];
}
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<double, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<double, 2>(array_data[1], offsets[1], sizes[1]);
Shape<2> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<2> shape_5(shapes[5]);
auto temp_0 = make_view<double, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<double, 2>(array_data[3], offsets[3], sizes[3]);
const int NT = 256;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
int max_num_el = num_el0;
auto dest_shape_1 = temp_1.shape();
int num_el1 = dest_shape_1.numel();
max_num_el = max(num_el1, max_num_el);
auto dest_shape_2 = array_0_view.shape();
int num_el2 = dest_shape_2.numel();
max_num_el = max(num_el2, max_num_el);
int grid_size = max_num_el;
// assert(grid_size <= MAX_BLOCKS);
assign_kernel_16960456299461880923<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_1),
dest_shape_0,
temp_1,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::sum, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_5), shape_4)))), shape_3),
dest_shape_1,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_0))), broadcasted_reshapeFT(temp_1, shape_2)),
num_el2,
dest_shape_2
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, int x_bits, typename DType>
inline __device__ void ReduceX(volatile DType buf[], int tid) {
if (x_bits >= 10) {
if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]);
__syncthreads();
}
if (x_bits >= 9) {
if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]);
__syncthreads();
}
if (x_bits >= 8) {
if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]);
__syncthreads();
}
if (x_bits >= 7) {
if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]);
__syncthreads();
}
if (x_bits >= 6) {
if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]);
__syncthreads();
}
// in-warp optimization: the remaining reduction fits within a single warp
if (x_bits >= 5) {
if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]);
__syncthreads();
}
if (x_bits >= 4) {
if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]);
__syncthreads();
}
if (x_bits >= 3) {
if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]);
__syncthreads();
}
if (x_bits >= 2) {
if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]);
__syncthreads();
}
if (x_bits >= 1) {
if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]);
__syncthreads();
}
}
template<typename Reducer, typename Type, typename C1>
struct WarpAxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE WarpAxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
inline __device__ T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
__shared__ T buffer[256];
query[1] = threadIdx.x;
if (threadIdx.x < arg_1_view_.shape()[1]) {
buffer[threadIdx.x] = arg_1_view_[query];
}
for (unsigned x = blockDim.x; x < arg_1_view_.shape()[1]; x += blockDim.x) {
const int col = x + threadIdx.x;
if (col < arg_1_view_.shape()[1]) {
query[1] = col;
Reducer::Reduce(buffer[threadIdx.x], arg_1_view_[query]);
}
}
__syncthreads();
// if the number of reduced columns is smaller than the block,
// fill the remaining buffer entries with the reducer's neutral value
if (threadIdx.x >= arg_1_view_.shape()[1]) {
Reducer::SetInitValue(buffer[threadIdx.x]);
}
__syncthreads();
ReduceX<Reducer, 8>(buffer, threadIdx.x);
return buffer[0];
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE WarpAxisReduceKernel2<Reducer, Type, C1> warp_axis_reduce_kernel_2d(
C1 arg_1_view) {
return WarpAxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
#include <cooperative_groups.h>
template<typename T, typename C1, int ndim>
inline __device__ T thread_sum(const C1& input, Shape<ndim> query, int start, int stride) {
T sum = 0;
int& i = query[ndim - 1];
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
return sum;
}
template<typename T, int ndim>
inline __device__ int thread_sum(const ArrayView<int, ndim>& input, Shape<ndim> query, int start, int stride) {
int sum = 0;
int4* ptr = (int4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
int4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ float thread_sum(const ArrayView<float, ndim>& input, Shape<ndim> query, int start, int stride) {
float sum = 0;
float4* ptr = (float4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
float4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ double thread_sum(const ArrayView<double, ndim>& input, Shape<ndim> query, int start, int stride) {
double sum = 0;
double2* ptr = (double2*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 2;
int& i = query[ndim - 1];
if (cols_div_word_length * 2 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
double2 in = ptr[i];
sum += in.x + in.y;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template <int tile_sz, typename T>
__device__ T reduce_sum_tile_shfl(cooperative_groups::thread_block_tile<tile_sz> g, T val) {
int lane = g.thread_rank();
// Each iteration halves the number of active lanes:
// each lane adds the value shuffled down from lane + i to its partial sum
for (int i = g.size() / 2; i > 0; i /= 2) {
val += g.shfl_down(val, i);
}
return val; // note: only thread 0 will return full sum
}
template<typename Reducer, typename Type, typename C1>
struct ShflDownWarpAxisSum1 {
C1 arg_;
static const int ndim = C1::ndim - 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE ShflDownWarpAxisSum1(C1 arg) : arg_(arg) {}
inline __device__ T operator[](const Shape<ndim>& input_query) const {
__shared__ T sum;
// initialize from a single thread and synchronize to avoid racing with the atomicAdd below
if (threadIdx.x == 0) sum = 0;
__syncthreads();
Shape<ndim + 1> query = input_query.expand_dims(ndim);
query[ndim] = 0;
T my_sum = thread_sum<T>(arg_, query, threadIdx.x, blockDim.x);
auto tile = cooperative_groups::tiled_partition<16>(
cooperative_groups::this_thread_block());
T tile_sum = reduce_sum_tile_shfl<16>(tile, my_sum);
if (tile.thread_rank() == 0) atomicAdd(&sum, tile_sum);
__syncthreads();
return sum;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE ShflDownWarpAxisSum1<Reducer, Type, C1> shfl_down_warp_axis_sum2d(
C1 arg) {
return ShflDownWarpAxisSum1<Reducer, Type, C1>(arg);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2, typename Destination3, typename Source3>
void __global__
assign_kernel_6585648013932656081(Destination0 dst0, Source0 src0, Shape<2> shape0, Destination1 dst1, Source1 src1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2, Destination3 dst3, Source3 src3, int num_el3, Shape<2> shape3) {
int idx = blockIdx.x;
// one block per output element: guard against blocks past the end of each destination
if (idx < shape0[0] * shape0[1]) {
auto nd_idx_2 = index_to_dim(idx, shape0);
dst0[nd_idx_2] = src0[nd_idx_2];
}
__syncthreads();
if (idx < shape1[0] * shape1[1]) {
auto nd_idx_2 = index_to_dim(idx, shape1);
dst1[nd_idx_2] = src1[nd_idx_2];
}
__syncthreads();
idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < num_el2; i += stride) {
auto nd_idx = index_to_dim(i, shape2);
dst2[nd_idx] = src2[nd_idx];
}
__syncthreads();
idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx; i < num_el3; i += stride) {
auto nd_idx = index_to_dim(i, shape3);
dst3[nd_idx] = src3[nd_idx];
}
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<float, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<float, 2>(array_data[1], offsets[1], sizes[1]);
Shape<1> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<1> shape_5(shapes[5]);
auto temp_0 = make_view<float, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<float, 2>(array_data[3], offsets[3], sizes[3]);
auto temp_2 = make_view<float, 2>(array_data[4], offsets[4], sizes[4]);
const int NT = 256;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
int max_num_el = num_el0;
auto dest_shape_1 = temp_1.shape();
int num_el1 = dest_shape_1.numel();
max_num_el = max(num_el1, max_num_el);
auto dest_shape_2 = temp_2.shape();
int num_el2 = dest_shape_2.numel();
max_num_el = max(num_el2, max_num_el);
auto dest_shape_3 = array_0_view.shape();
int num_el3 = dest_shape_3.numel();
max_num_el = max(num_el3, max_num_el);
int grid_size = max_num_el;
// assert(grid_size <= MAX_BLOCKS);
assign_kernel_6585648013932656081<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, float>(array_1_view), shape_1),
dest_shape_0,
temp_1,
expand_dims_1_2d(shfl_down_warp_axis_sum2d<reducers::sum, float>(element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4)))), shape_3),
dest_shape_1,
temp_2,
element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4))),
num_el2,
dest_shape_2,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, float>(temp_2, broadcasted_reshapeFT(temp_1, shape_2)),
num_el3,
dest_shape_3
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2, typename Destination3, typename Source3>
void __global__
assign_kernel_11773770055001859919(Destination0 dst0, Source0 src0, int num_el0, Shape<2> shape0, Destination1 dst1, Source1 src1, int num_el1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2, Destination3 dst3, Source3 src3, int num_el3, Shape<2> shape3) {
for (int i0 = blockIdx.x; i0 < shape0[0]; i0 += gridDim.x) {
for (int i1 = threadIdx.x; i1 < shape0[1]; i1 += blockDim.x) {
Shape<2> nd_idx({i0, i1});
dst0[nd_idx] = src0[nd_idx];
}
}
__syncthreads();
for (int i0 = blockIdx.x; i0 < shape1[0]; i0 += gridDim.x) {
for (int i1 = threadIdx.x; i1 < shape1[1]; i1 += blockDim.x) {
Shape<2> nd_idx({i0, i1});
dst1[nd_idx] = src1[nd_idx];
}
}
__syncthreads();
for (int i0 = blockIdx.x; i0 < shape2[0]; i0 += gridDim.x) {
for (int i1 = 0; i1 < shape2[1]; i1 += 1) {
Shape<2> nd_idx({i0, i1});
dst2[nd_idx] = src2[nd_idx];
}
}
__syncthreads();
for (int i0 = blockIdx.x; i0 < shape3[0]; i0 += gridDim.x) {
for (int i1 = threadIdx.x; i1 < shape3[1]; i1 += blockDim.x) {
Shape<2> nd_idx({i0, i1});
dst3[nd_idx] = src3[nd_idx];
}
}
}
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, typename Type, typename C1>
struct AxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE AxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
T res;
Reducer::SetInitValue(res);
int& i1 = query[1];
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) {
Reducer::Reduce(res, arg_1_view_[query]);
}
return res;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d(
C1 arg_1_view) {
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
#include <cooperative_groups.h>
template<typename T, typename C1, int ndim>
inline __device__ T thread_sum(const C1& input, Shape<ndim> query, int start, int stride) {
T sum = 0;
int& i = query[ndim - 1];
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
return sum;
}
template<typename T, int ndim>
inline __device__ int thread_sum(const ArrayView<int, ndim>& input, Shape<ndim> query, int start, int stride) {
int sum = 0;
int4* ptr = (int4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
int4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ float thread_sum(const ArrayView<float, ndim>& input, Shape<ndim> query, int start, int stride) {
float sum = 0;
float4* ptr = (float4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
float4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ double thread_sum(const ArrayView<double, ndim>& input, Shape<ndim> query, int start, int stride) {
double sum = 0;
double2* ptr = (double2*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 2;
int& i = query[ndim - 1];
if (cols_div_word_length * 2 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
double2 in = ptr[i];
sum += in.x + in.y;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template <int tile_sz, typename T>
__device__ T reduce_sum_tile_shfl(cooperative_groups::thread_block_tile<tile_sz> g, T val) {
int lane = g.thread_rank();
// Each iteration halves the number of active lanes:
// each lane adds the value shuffled down from lane + i to its partial sum
for (int i = g.size() / 2; i > 0; i /= 2) {
val += g.shfl_down(val, i);
}
return val; // note: only thread 0 will return full sum
}
template<typename Reducer, typename Type, typename C1>
struct ShflDownWarpAxisSum1 {
C1 arg_;
static const int ndim = C1::ndim - 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE ShflDownWarpAxisSum1(C1 arg) : arg_(arg) {}
inline __device__ T operator[](const Shape<ndim>& input_query) const {
__shared__ T sum;
// initialize from a single thread and synchronize to avoid racing with the atomicAdd below
if (threadIdx.x == 0) sum = 0;
__syncthreads();
Shape<ndim + 1> query = input_query.expand_dims(ndim);
query[ndim] = 0;
T my_sum = thread_sum<T>(arg_, query, threadIdx.x, blockDim.x);
auto tile = cooperative_groups::tiled_partition<16>(
cooperative_groups::this_thread_block());
T tile_sum = reduce_sum_tile_shfl<16>(tile, my_sum);
if (tile.thread_rank() == 0) atomicAdd(&sum, tile_sum);
__syncthreads();
return sum;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE ShflDownWarpAxisSum1<Reducer, Type, C1> shfl_down_warp_axis_sum2d(
C1 arg) {
return ShflDownWarpAxisSum1<Reducer, Type, C1>(arg);
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<float, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<float, 2>(array_data[1], offsets[1], sizes[1]);
Shape<2> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<2> shape_5(shapes[5]);
auto temp_0 = make_view<float, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<float, 2>(array_data[3], offsets[3], sizes[3]);
auto temp_2 = make_view<float, 2>(array_data[4], offsets[4], sizes[4]);
const int NT = 256;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
auto dest_shape_1 = temp_2.shape();
int num_el1 = dest_shape_1.numel();
auto dest_shape_2 = temp_1.shape();
int num_el2 = dest_shape_2.numel();
auto dest_shape_3 = array_0_view.shape();
int num_el3 = dest_shape_3.numel();
int max_num_el = dest_shape_0[0];
max_num_el = max(max_num_el, dest_shape_1[0]);
max_num_el = max(max_num_el, dest_shape_2[0]);
max_num_el = max(max_num_el, dest_shape_3[0]);
// assert(grid_size <= MAX_BLOCKS);
int grid_size = max_num_el;
assign_kernel_11773770055001859919<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::maximum, float>(array_1_view), shape_5),
num_el0,
dest_shape_0,
temp_2,
element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4))),
num_el1,
dest_shape_1,
temp_1,
expand_dims_1_2d(shfl_down_warp_axis_sum2d<reducers::sum, float>(temp_2), shape_3),
num_el2,
dest_shape_2,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, float>(temp_2, broadcasted_reshapeFT(temp_1, shape_2)),
num_el3,
dest_shape_3
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
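Each generated translation unit exposes the extern "C" maker symbol above as its entry point. A minimal sketch of how such a compiled module could be loaded and invoked on POSIX (the call_generated_kernel helper is hypothetical; the exact layout of the data/offset/size/stride/scalar/shape arguments is managed by Dali's JIT runtime and is not shown in this gist):
#include <dlfcn.h>

typedef void (*maker_t)(void**, const int*, const int**, const int**,
                        const void**, const int**);

// Load a JIT-compiled shared object and call its generated "maker" entry point.
void call_generated_kernel(const char* so_path,
                           void** array_data, const int* offsets,
                           const int** sizes, const int** strides,
                           const void** scalar_arguments, const int** shapes) {
    void* handle = dlopen(so_path, RTLD_NOW);
    if (handle == nullptr) return;
    maker_t fn = (maker_t)dlsym(handle, "maker");
    if (fn != nullptr) {
        fn(array_data, offsets, sizes, strides, scalar_arguments, shapes);
    }
    dlclose(handle);
}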