Code auto-generated by Dali
auto a = op::uniform(-20.0, 20.0, {2, 5}).astype(dtype);
a.eval();
// numerically stable softmax over the last axis: subtract the row max, exponentiate, normalize
auto exped = op::exp(a - op::max(a, {-1}, true));
auto fused_softmax = exped / op::sum(exped, {-1}, true);
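For comparison, here is a plain single-threaded C++ reference sketch of what the generated kernels below compute (a hypothetical helper, not part of the gist or of Dali): each row's maximum is subtracted before exponentiation, and the exponentials are normalized by their row sum.

// Hypothetical CPU reference for the fused softmax above (row-major, shape rows x cols).
// Not Dali code; only meant to spell out what the generated kernels compute.
#include <vector>
#include <cmath>
#include <algorithm>

std::vector<double> softmax_rows(const std::vector<double>& a, int rows, int cols) {
    std::vector<double> out(a.size());
    for (int r = 0; r < rows; ++r) {
        const double* row = a.data() + r * cols;
        double m = *std::max_element(row, row + cols);    // op::max(a, {-1}, true)
        double denom = 0.0;
        for (int c = 0; c < cols; ++c) {
            out[r * cols + c] = std::exp(row[c] - m);     // op::exp(a - max)
            denom += out[r * cols + c];
        }
        for (int c = 0; c < cols; ++c) {
            out[r * cols + c] /= denom;                   // exped / op::sum(exped, {-1}, true)
        }
    }
    return out;
}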
#include "dali/array/jit/array_view.h" | |
#include "dali/array/jit/reducer_kernels.h" | |
#include "dali/array/functor.h" | |
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2> | |
void __global__ | |
assign_kernel_11204438187360910603(Destination0 dst0, Source0 src0, int num_el0, Shape<2> shape0, Destination1 dst1, Source1 src1, int num_el1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2) { | |
int idx = blockDim.x * blockIdx.x + threadIdx.x; | |
int stride = blockDim.x * gridDim.x; | |
    for (int i = idx; i < num_el0; i += stride) {
        auto nd_idx = index_to_dim(i, shape0);
        dst0[nd_idx] = src0[nd_idx];
    }
    __syncthreads();
    for (int i = idx; i < num_el1; i += stride) {
        auto nd_idx = index_to_dim(i, shape1);
        dst1[nd_idx] = src1[nd_idx];
    }
    __syncthreads();
    for (int i = idx; i < num_el2; i += stride) {
        auto nd_idx = index_to_dim(i, shape2);
        dst2[nd_idx] = src2[nd_idx];
    }
} | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
struct ElementWiseKernel2D2 { | |
C1 arg_1_view_; | |
C2 arg_2_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) | |
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) { | |
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view); | |
} | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
struct ElementWiseKernel2D1 { | |
C1 arg_1_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D1( | |
const C1& arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1( | |
const C1& arg_1_view) { | |
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view); | |
} | |
template<typename C1> | |
struct Broadcasted_reshapeFTKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0], 0}]; | |
} | |
}; | |
template<typename C1> | |
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) { | |
return Broadcasted_reshapeFTKernel<C1>(array, shape); | |
} | |
template<typename C1> | |
struct Expand_dims_1_2dKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0]}]; | |
} | |
}; | |
template<typename C1> | |
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) { | |
return Expand_dims_1_2dKernel<C1>(array, shape); | |
} | |
template<typename Reducer, typename Type, typename C1> | |
struct AxisReduceKernel2 { | |
C1 arg_1_view_; | |
static const int ndim = 1; | |
typedef Type T; | |
XINLINE Shape<ndim> shape() const { | |
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>(); | |
} | |
XINLINE AxisReduceKernel2( | |
C1 arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
XINLINE T operator[](const Shape<1>& input_query) const { | |
Shape<2> query = input_query.expand_dims(2); | |
T res; | |
Reducer::SetInitValue(res); | |
int& i1 = query[1]; | |
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) { | |
Reducer::Reduce(res, arg_1_view_[query]); | |
} | |
return res; | |
} | |
}; | |
template<typename Reducer, typename Type, typename C1> | |
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d( | |
C1 arg_1_view) { | |
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view); | |
} | |
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) { | |
auto array_0_view = make_view<double, 2>(array_data[0], offsets[0], sizes[0]); | |
auto array_1_view = make_view<double, 2>(array_data[1], offsets[1], sizes[1]); | |
Shape<2> shape_0(shapes[0]); | |
Shape<2> shape_1(shapes[1]); | |
Shape<2> shape_2(shapes[2]); | |
Shape<2> shape_3(shapes[3]); | |
Shape<2> shape_4(shapes[4]); | |
Shape<2> shape_5(shapes[5]); | |
auto temp_0 = make_view<double, 2>(array_data[2], offsets[2], sizes[2]); | |
auto temp_1 = make_view<double, 2>(array_data[3], offsets[3], sizes[3]); | |
const int NT = 128; | |
// const int MAX_BLOCKS = 40960; | |
auto dest_shape_0 = temp_0.shape(); | |
int num_el0 = dest_shape_0.numel(); | |
int max_num_el = num_el0; | |
auto dest_shape_1 = temp_1.shape(); | |
int num_el1 = dest_shape_1.numel(); | |
max_num_el = max(num_el1, max_num_el); | |
auto dest_shape_2 = array_0_view.shape(); | |
int num_el2 = dest_shape_2.numel(); | |
max_num_el = max(num_el2, max_num_el); | |
int grid_size = div_ceil(max_num_el, NT); | |
// assert(grid_size <= MAX_BLOCKS); | |
assign_kernel_11204438187360910603<<<grid_size, NT, 0, NULL>>>( | |
temp_0, | |
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_1), | |
num_el0, | |
dest_shape_0, | |
temp_1, | |
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::sum, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4)))), shape_3), | |
num_el1, | |
dest_shape_1, | |
array_0_view, | |
element_wise_kernel2D2<functor::eltdiv, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_0))), broadcasted_reshapeFT(temp_1, shape_2)), | |
num_el2, | |
dest_shape_2 | |
); | |
} | |
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){ | |
run(a, b, c, d, e, f); | |
} |
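Each of these generated files exposes a single extern "C" maker entry point with the signature shown above. A minimal loader sketch, assuming the file has been compiled into a shared object (the path handling and error reporting below are illustrative assumptions, not Dali's actual runtime):

// Hypothetical loader: binds the generated `maker` symbol from a compiled .so.
#include <dlfcn.h>
#include <stdexcept>

using maker_t = void (*)(void**, const int*, const int**, const int**,
                         const void**, const int**);

maker_t load_maker(const char* so_path) {
    void* handle = dlopen(so_path, RTLD_NOW | RTLD_LOCAL);
    if (handle == nullptr) throw std::runtime_error(dlerror());
    void* sym = dlsym(handle, "maker");
    if (sym == nullptr) throw std::runtime_error(dlerror());
    return reinterpret_cast<maker_t>(sym);
}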
#include "dali/array/jit/array_view.h" | |
#include "dali/array/jit/reducer_kernels.h" | |
#include "dali/array/functor.h" | |
template<typename C1> | |
struct Expand_dims_1_2dKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0]}]; | |
} | |
}; | |
template<typename C1> | |
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) { | |
return Expand_dims_1_2dKernel<C1>(array, shape); | |
} | |
template<typename Reducer, int x_bits, typename DType> | |
inline __device__ void ReduceX2(volatile DType buf[], int tid) { | |
if (x_bits >= 10) { | |
if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]); | |
__syncthreads(); | |
} | |
if (x_bits >= 9) { | |
if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]); | |
__syncthreads(); | |
} | |
if (x_bits >= 8) { | |
if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]); | |
__syncthreads(); | |
} | |
if (x_bits >= 7) { | |
if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]); | |
__syncthreads(); | |
} | |
if (x_bits >= 6) { | |
if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]); | |
__syncthreads(); | |
} | |
// in warp optimization | |
if (x_bits >= 5) { | |
if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); | |
__syncthreads(); | |
} | |
if (x_bits >= 4) { | |
if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); | |
__syncthreads(); | |
} | |
if (x_bits >= 3) { | |
if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]); | |
__syncthreads(); | |
} | |
if (x_bits >= 2) { | |
if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]); | |
__syncthreads(); | |
} | |
if (x_bits >= 1) { | |
if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]); | |
__syncthreads(); | |
} | |
} | |
template<typename Reducer, typename Type, typename C1> | |
struct WarpAxisReduceKernel2 { | |
C1 arg_1_view_; | |
static const int ndim = 1; | |
typedef Type T; | |
XINLINE Shape<ndim> shape() const { | |
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>(); | |
} | |
XINLINE WarpAxisReduceKernel2( | |
C1 arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
inline __device__ T operator[](const Shape<1>& input_query) const { | |
Shape<2> query = input_query.expand_dims(2); | |
__shared__ T buffer[256]; | |
query[1] = threadIdx.x; | |
if (threadIdx.x < arg_1_view_.shape()[1]) { | |
buffer[threadIdx.x] = arg_1_view_[query]; | |
} | |
for (unsigned x = 256; x < arg_1_view_.shape()[1]; x += 256) { | |
const int col = x + threadIdx.x; | |
if (col < arg_1_view_.shape()[1]) { | |
query[1] = col; | |
Reducer::Reduce(buffer[threadIdx.x], arg_1_view_[query]); | |
} | |
} | |
__syncthreads(); | |
// if number of rows is smaller than buffer, | |
// fill buffer with neutral value | |
if (threadIdx.x >= arg_1_view_.shape()[1]) { | |
Reducer::SetInitValue(buffer[threadIdx.x]); | |
} | |
__syncthreads(); | |
ReduceX2<Reducer, 8>(buffer, threadIdx.x); | |
return buffer[0]; | |
} | |
}; | |
template<typename Reducer, typename Type, typename C1> | |
XINLINE WarpAxisReduceKernel2<Reducer, Type, C1> warp_axis_reduce_kernel_2d( | |
C1 arg_1_view) { | |
return WarpAxisReduceKernel2<Reducer, Type, C1>(arg_1_view); | |
} | |
template<typename Reducer, typename Type, typename C1> | |
struct AxisReduceKernel2 { | |
C1 arg_1_view_; | |
static const int ndim = 1; | |
typedef Type T; | |
XINLINE Shape<ndim> shape() const { | |
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>(); | |
} | |
XINLINE AxisReduceKernel2( | |
C1 arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
XINLINE T operator[](const Shape<1>& input_query) const { | |
Shape<2> query = input_query.expand_dims(2); | |
T res; | |
Reducer::SetInitValue(res); | |
int& i1 = query[1]; | |
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) { | |
Reducer::Reduce(res, arg_1_view_[query]); | |
} | |
return res; | |
} | |
}; | |
template<typename Reducer, typename Type, typename C1> | |
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d( | |
C1 arg_1_view) { | |
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view); | |
} | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
struct ElementWiseKernel2D1 { | |
C1 arg_1_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D1( | |
const C1& arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1( | |
const C1& arg_1_view) { | |
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view); | |
} | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
struct ElementWiseKernel2D2 { | |
C1 arg_1_view_; | |
C2 arg_2_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) | |
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) { | |
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view); | |
} | |
template<typename C1> | |
struct Broadcasted_reshapeFTKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0], 0}]; | |
} | |
}; | |
template<typename C1> | |
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) { | |
return Broadcasted_reshapeFTKernel<C1>(array, shape); | |
} | |
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2> | |
void __global__ | |
assign_kernel_16960456299461880923(Destination0 dst0, Source0 src0, Shape<2> shape0, Destination1 dst1, Source1 src1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2) { | |
int stride = blockDim.x * gridDim.x; | |
int idx = blockIdx.x; | |
auto nd_idx_2 = index_to_dim(idx, shape0); | |
dst0[nd_idx_2] = src0[nd_idx_2]; | |
__syncthreads(); | |
nd_idx_2 = index_to_dim(idx, shape1); | |
dst1[nd_idx_2] = src1[nd_idx_2]; | |
__syncthreads(); | |
idx = blockDim.x * blockIdx.x + threadIdx.x; | |
    for (int i = idx; i < num_el2; i += stride) {
        auto nd_idx = index_to_dim(i, shape2);
        dst2[nd_idx] = src2[nd_idx];
    }
} | |
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) { | |
auto array_0_view = make_view<double, 2>(array_data[0], offsets[0], sizes[0]); | |
auto array_1_view = make_view<double, 2>(array_data[1], offsets[1], sizes[1]); | |
Shape<2> shape_0(shapes[0]); | |
Shape<2> shape_1(shapes[1]); | |
Shape<2> shape_2(shapes[2]); | |
Shape<2> shape_3(shapes[3]); | |
Shape<2> shape_4(shapes[4]); | |
Shape<2> shape_5(shapes[5]); | |
auto temp_0 = make_view<double, 2>(array_data[2], offsets[2], sizes[2]); | |
auto temp_1 = make_view<double, 2>(array_data[3], offsets[3], sizes[3]); | |
const int NT = 256; | |
// const int MAX_BLOCKS = 40960; | |
auto dest_shape_0 = temp_0.shape(); | |
int num_el0 = dest_shape_0.numel(); | |
int max_num_el = num_el0; | |
auto dest_shape_1 = temp_1.shape(); | |
int num_el1 = dest_shape_1.numel(); | |
max_num_el = max(num_el1, max_num_el); | |
auto dest_shape_2 = array_0_view.shape(); | |
int num_el2 = dest_shape_2.numel(); | |
max_num_el = max(num_el2, max_num_el); | |
int grid_size = max_num_el; | |
// assert(grid_size <= MAX_BLOCKS); | |
assign_kernel_16960456299461880923<<<grid_size, NT, 0, NULL>>>( | |
temp_0, | |
expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_1), | |
dest_shape_0, | |
temp_1, | |
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::sum, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_5), shape_4)))), shape_3), | |
dest_shape_1, | |
array_0_view, | |
element_wise_kernel2D2<functor::eltdiv, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_0))), broadcasted_reshapeFT(temp_1, shape_2)), | |
num_el2, | |
dest_shape_2 | |
); | |
} | |
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){ | |
run(a, b, c, d, e, f); | |
} |
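This second variant assigns one block per output element and reduces each row through a 256-slot shared-memory buffer (WarpAxisReduceKernel2 feeding ReduceX2). A stripped-down standalone illustration of that pattern, written from scratch rather than taken from the generated code, assuming a launch of row_max<<<rows, 256>>>(in, out, rows, cols):

// Standalone sketch (not generated by Dali): one block per row, 256 threads
// stride over the columns, then a shared-memory tree reduction finds the row max.
#include <math.h>

__global__ void row_max(const double* in, double* out, int rows, int cols) {
    __shared__ double buf[256];
    int row = blockIdx.x;
    if (row >= rows) return;
    // each thread scans a strided subset of the row
    double best = -INFINITY;
    for (int col = threadIdx.x; col < cols; col += blockDim.x) {
        best = fmax(best, in[row * cols + col]);
    }
    buf[threadIdx.x] = best;
    __syncthreads();
    // tree reduction over the shared buffer, halving the active threads each step
    for (int offset = 128; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) {
            buf[threadIdx.x] = fmax(buf[threadIdx.x], buf[threadIdx.x + offset]);
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) out[row] = buf[0];
}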
#include "dali/array/jit/array_view.h" | |
#include "dali/array/jit/reducer_kernels.h" | |
#include "dali/array/functor.h" | |
template<typename C1> | |
struct Expand_dims_1_2dKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0]}]; | |
} | |
}; | |
template<typename C1> | |
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) { | |
return Expand_dims_1_2dKernel<C1>(array, shape); | |
} | |
template<typename Reducer, int x_bits, typename DType> | |
inline __device__ void ReduceX(volatile DType buf[], int tid) { | |
if (x_bits >= 10) { | |
if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]); | |
__syncthreads(); | |
} | |
if (x_bits >= 9) { | |
if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]); | |
__syncthreads(); | |
} | |
if (x_bits >= 8) { | |
if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]); | |
__syncthreads(); | |
} | |
if (x_bits >= 7) { | |
if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]); | |
__syncthreads(); | |
} | |
if (x_bits >= 6) { | |
if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]); | |
__syncthreads(); | |
} | |
// in warp optimization | |
if (x_bits >= 5) { | |
if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]); | |
__syncthreads(); | |
} | |
if (x_bits >= 4) { | |
if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]); | |
__syncthreads(); | |
} | |
if (x_bits >= 3) { | |
if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]); | |
__syncthreads(); | |
} | |
if (x_bits >= 2) { | |
if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]); | |
__syncthreads(); | |
} | |
if (x_bits >= 1) { | |
if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]); | |
__syncthreads(); | |
} | |
} | |
template<typename Reducer, typename Type, typename C1> | |
struct WarpAxisReduceKernel2 { | |
C1 arg_1_view_; | |
static const int ndim = 1; | |
typedef Type T; | |
XINLINE Shape<ndim> shape() const { | |
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>(); | |
} | |
XINLINE WarpAxisReduceKernel2( | |
C1 arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
inline __device__ T operator[](const Shape<1>& input_query) const { | |
Shape<2> query = input_query.expand_dims(2); | |
__shared__ T buffer[256]; | |
query[1] = threadIdx.x; | |
if (threadIdx.x < arg_1_view_.shape()[1]) { | |
buffer[threadIdx.x] = arg_1_view_[query]; | |
} | |
for (unsigned x = blockDim.x; x < arg_1_view_.shape()[1]; x += blockDim.x) { | |
const int col = x + threadIdx.x; | |
if (col < arg_1_view_.shape()[1]) { | |
query[1] = col; | |
Reducer::Reduce(buffer[threadIdx.x], arg_1_view_[query]); | |
} | |
} | |
__syncthreads(); | |
// if number of rows is smaller than buffer, | |
// fill buffer with neutral value | |
if (threadIdx.x >= arg_1_view_.shape()[1]) { | |
Reducer::SetInitValue(buffer[threadIdx.x]); | |
} | |
__syncthreads(); | |
ReduceX<Reducer, 8>(buffer, threadIdx.x); | |
return buffer[0]; | |
} | |
}; | |
template<typename Reducer, typename Type, typename C1> | |
XINLINE WarpAxisReduceKernel2<Reducer, Type, C1> warp_axis_reduce_kernel_2d( | |
C1 arg_1_view) { | |
return WarpAxisReduceKernel2<Reducer, Type, C1>(arg_1_view); | |
} | |
#include <cooperative_groups.h> | |
template<typename T, typename C1, int ndim> | |
inline __device__ T thread_sum(const C1& input, Shape<ndim> query, int start, int stride) { | |
T sum = 0; | |
int& i = query[ndim - 1]; | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
return sum; | |
} | |
template<typename T, int ndim> | |
inline __device__ int thread_sum(const ArrayView<int, ndim>& input, Shape<ndim> query, int start, int stride) { | |
int sum = 0; | |
int4* ptr = (int4*) &input[query]; | |
int cols_div_word_length = input.shape()[ndim-1] / 4; | |
int& i = query[ndim - 1]; | |
if (cols_div_word_length * 4 == input.shape()[ndim-1]) { | |
for(i = start; | |
i < cols_div_word_length; | |
i += stride) { | |
int4 in = ptr[i]; | |
sum += in.x + in.y + in.z + in.w; | |
} | |
} else { | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
} | |
return sum; | |
} | |
template<typename T, int ndim> | |
inline __device__ float thread_sum(const ArrayView<float, ndim>& input, Shape<ndim> query, int start, int stride) { | |
float sum = 0; | |
float4* ptr = (float4*) &input[query]; | |
int cols_div_word_length = input.shape()[ndim-1] / 4; | |
int& i = query[ndim - 1]; | |
if (cols_div_word_length * 4 == input.shape()[ndim-1]) { | |
for(i = start; | |
i < cols_div_word_length; | |
i += stride) { | |
float4 in = ptr[i]; | |
sum += in.x + in.y + in.z + in.w; | |
} | |
} else { | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
} | |
return sum; | |
} | |
template<typename T, int ndim> | |
inline __device__ double thread_sum(const ArrayView<double, ndim>& input, Shape<ndim> query, int start, int stride) { | |
double sum = 0; | |
double2* ptr = (double2*) &input[query]; | |
int cols_div_word_length = input.shape()[ndim-1] / 2; | |
int& i = query[ndim - 1]; | |
if (cols_div_word_length * 2 == input.shape()[ndim-1]) { | |
for(i = start; | |
i < cols_div_word_length; | |
i += stride) { | |
double2 in = ptr[i]; | |
sum += in.x + in.y; | |
} | |
} else { | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
} | |
return sum; | |
} | |
template <int tile_sz, typename T> | |
__device__ T reduce_sum_tile_shfl(cooperative_groups::thread_block_tile<tile_sz> g, T val) { | |
int lane = g.thread_rank(); | |
// Each iteration halves the number of active threads | |
// Each thread adds its partial sum[i] to sum[lane+i] | |
for (int i = g.size() / 2; i > 0; i /= 2) { | |
val += g.shfl_down(val, i); | |
} | |
return val; // note: only thread 0 will return full sum | |
} | |
template<typename Reducer, typename Type, typename C1> | |
struct ShflDownWarpAxisSum1 { | |
C1 arg_; | |
static const int ndim = C1::ndim - 1; | |
typedef Type T; | |
XINLINE Shape<ndim> shape() const { | |
return arg_.shape().template axis_reduced_shape<0, ndim>(); | |
} | |
XINLINE ShflDownWarpAxisSum1(C1 arg) : arg_(arg) {} | |
inline __device__ T operator[](const Shape<ndim>& input_query) const { | |
__shared__ T sum; | |
sum = 0; | |
Shape<ndim + 1> query = input_query.expand_dims(ndim); | |
query[ndim] = 0; | |
T my_sum = thread_sum<T>(arg_, query, threadIdx.x, blockDim.x); | |
auto tile = cooperative_groups::tiled_partition<16>( | |
cooperative_groups::this_thread_block()); | |
T tile_sum = reduce_sum_tile_shfl<16>(tile, my_sum); | |
if (tile.thread_rank() == 0) atomicAdd(&sum, tile_sum); | |
__syncthreads(); | |
return sum; | |
} | |
}; | |
template<typename Reducer, typename Type, typename C1> | |
XINLINE ShflDownWarpAxisSum1<Reducer, Type, C1> shfl_down_warp_axis_sum2d( | |
C1 arg) { | |
return ShflDownWarpAxisSum1<Reducer, Type, C1>(arg); | |
} | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
struct ElementWiseKernel2D1 { | |
C1 arg_1_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D1( | |
const C1& arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1( | |
const C1& arg_1_view) { | |
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view); | |
} | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
struct ElementWiseKernel2D2 { | |
C1 arg_1_view_; | |
C2 arg_2_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) | |
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) { | |
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view); | |
} | |
template<typename C1> | |
struct Broadcasted_reshapeFTKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0], 0}]; | |
} | |
}; | |
template<typename C1> | |
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) { | |
return Broadcasted_reshapeFTKernel<C1>(array, shape); | |
} | |
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2, typename Destination3, typename Source3> | |
void __global__ | |
assign_kernel_6585648013932656081(Destination0 dst0, Source0 src0, Shape<2> shape0, Destination1 dst1, Source1 src1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2, Destination3 dst3, Source3 src3, int num_el3, Shape<2> shape3) { | |
int idx = blockIdx.x; | |
auto nd_idx_2 = index_to_dim(idx, shape0); | |
dst0[nd_idx_2] = src0[nd_idx_2]; | |
__syncthreads(); | |
nd_idx_2 = index_to_dim(idx, shape1); | |
dst1[nd_idx_2] = src1[nd_idx_2]; | |
__syncthreads(); | |
idx = blockDim.x * blockIdx.x + threadIdx.x; | |
int stride = blockDim.x * gridDim.x; | |
    for (int i = idx; i < num_el2; i += stride) {
        auto nd_idx = index_to_dim(i, shape2);
        dst2[nd_idx] = src2[nd_idx];
    }
    __syncthreads();
    idx = blockDim.x * blockIdx.x + threadIdx.x;
    for (int i = idx; i < num_el3; i += stride) {
        auto nd_idx = index_to_dim(i, shape3);
        dst3[nd_idx] = src3[nd_idx];
    }
} | |
} | |
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) { | |
auto array_0_view = make_view<float, 2>(array_data[0], offsets[0], sizes[0]); | |
auto array_1_view = make_view<float, 2>(array_data[1], offsets[1], sizes[1]); | |
Shape<1> shape_0(shapes[0]); | |
Shape<2> shape_1(shapes[1]); | |
Shape<2> shape_2(shapes[2]); | |
Shape<2> shape_3(shapes[3]); | |
Shape<2> shape_4(shapes[4]); | |
Shape<1> shape_5(shapes[5]); | |
auto temp_0 = make_view<float, 2>(array_data[2], offsets[2], sizes[2]); | |
auto temp_1 = make_view<float, 2>(array_data[3], offsets[3], sizes[3]); | |
auto temp_2 = make_view<float, 2>(array_data[4], offsets[4], sizes[4]); | |
const int NT = 256; | |
// const int MAX_BLOCKS = 40960; | |
auto dest_shape_0 = temp_0.shape(); | |
int num_el0 = dest_shape_0.numel(); | |
int max_num_el = num_el0; | |
auto dest_shape_1 = temp_1.shape(); | |
int num_el1 = dest_shape_1.numel(); | |
max_num_el = max(num_el1, max_num_el); | |
auto dest_shape_2 = temp_2.shape(); | |
int num_el2 = dest_shape_2.numel(); | |
max_num_el = max(num_el2, max_num_el); | |
auto dest_shape_3 = array_0_view.shape(); | |
int num_el3 = dest_shape_3.numel(); | |
max_num_el = max(num_el3, max_num_el); | |
int grid_size = max_num_el; | |
// assert(grid_size <= MAX_BLOCKS); | |
assign_kernel_6585648013932656081<<<grid_size, NT, 0, NULL>>>( | |
temp_0, | |
expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, float>(array_1_view), shape_1), | |
dest_shape_0, | |
temp_1, | |
expand_dims_1_2d(shfl_down_warp_axis_sum2d<reducers::sum, float>(element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4)))), shape_3), | |
dest_shape_1, | |
temp_2, | |
element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4))), | |
num_el2, | |
dest_shape_2, | |
array_0_view, | |
element_wise_kernel2D2<functor::eltdiv, float>(temp_2, broadcasted_reshapeFT(temp_1, shape_2)), | |
num_el3, | |
dest_shape_3 | |
); | |
} | |
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){ | |
run(a, b, c, d, e, f); | |
} |
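This third variant sums the exponentials with cooperative groups: each 16-thread tile shuffles partial sums down to its first lane, which then atomically accumulates into a block-shared total (reduce_sum_tile_shfl / ShflDownWarpAxisSum1). A minimal self-contained sketch of the same shuffle-reduction idea, using a full 32-lane tile and a global accumulator (an illustration, not the generated code); *out is assumed to be zero-initialized before the launch:

// Standalone sketch (not generated by Dali): grid-stride partial sums, then a
// warp-tile shfl_down reduction; lane 0 of each tile adds its result to *out.
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void block_sum(const float* in, float* out, int n) {
    float partial = 0.f;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
         i += blockDim.x * gridDim.x) {
        partial += in[i];
    }
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(cg::this_thread_block());
    // each iteration halves the number of lanes carrying a partial sum
    for (int offset = tile.size() / 2; offset > 0; offset /= 2) {
        partial += tile.shfl_down(partial, offset);
    }
    if (tile.thread_rank() == 0) atomicAdd(out, partial);  // only lane 0 holds the tile sum
}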
#include "dali/array/jit/array_view.h" | |
#include "dali/array/jit/reducer_kernels.h" | |
#include "dali/array/functor.h" | |
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2, typename Destination3, typename Source3> | |
void __global__ | |
assign_kernel_11773770055001859919(Destination0 dst0, Source0 src0, int num_el0, Shape<2> shape0, Destination1 dst1, Source1 src1, int num_el1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2, Destination3 dst3, Source3 src3, int num_el3, Shape<2> shape3) { | |
for (int i0 = blockIdx.x; i0 < shape0[0]; i0 += gridDim.x) { | |
for (int i1 = threadIdx.x; i1 < shape0[1]; i1 += blockDim.x) { | |
Shape<2> nd_idx({i0, i1}); | |
            dst0[nd_idx] = src0[nd_idx];
} | |
} | |
__syncthreads(); | |
for (int i0 = blockIdx.x; i0 < shape1[0]; i0 += gridDim.x) { | |
for (int i1 = threadIdx.x; i1 < shape1[1]; i1 += blockDim.x) { | |
Shape<2> nd_idx({i0, i1}); | |
            dst1[nd_idx] = src1[nd_idx];
} | |
} | |
__syncthreads(); | |
for (int i0 = blockIdx.x; i0 < shape2[0]; i0 += gridDim.x) { | |
for (int i1 = 0; i1 < shape2[1]; i1 += 1) { | |
Shape<2> nd_idx({i0, i1}); | |
            dst2[nd_idx] = src2[nd_idx];
} | |
} | |
__syncthreads(); | |
for (int i0 = blockIdx.x; i0 < shape3[0]; i0 += gridDim.x) { | |
for (int i1 = threadIdx.x; i1 < shape3[1]; i1 += blockDim.x) { | |
Shape<2> nd_idx({i0, i1}); | |
            dst3[nd_idx] = src3[nd_idx];
} | |
} | |
} | |
template<typename C1> | |
struct Expand_dims_1_2dKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0]}]; | |
} | |
}; | |
template<typename C1> | |
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) { | |
return Expand_dims_1_2dKernel<C1>(array, shape); | |
} | |
template<typename Reducer, typename Type, typename C1> | |
struct AxisReduceKernel2 { | |
C1 arg_1_view_; | |
static const int ndim = 1; | |
typedef Type T; | |
XINLINE Shape<ndim> shape() const { | |
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>(); | |
} | |
XINLINE AxisReduceKernel2( | |
C1 arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
XINLINE T operator[](const Shape<1>& input_query) const { | |
Shape<2> query = input_query.expand_dims(2); | |
T res; | |
Reducer::SetInitValue(res); | |
int& i1 = query[1]; | |
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) { | |
Reducer::Reduce(res, arg_1_view_[query]); | |
} | |
return res; | |
} | |
}; | |
template<typename Reducer, typename Type, typename C1> | |
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d( | |
C1 arg_1_view) { | |
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view); | |
} | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
struct ElementWiseKernel2D1 { | |
C1 arg_1_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D1( | |
const C1& arg_1_view) | |
: arg_1_view_(arg_1_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1> | |
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1( | |
const C1& arg_1_view) { | |
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view); | |
} | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
struct ElementWiseKernel2D2 { | |
C1 arg_1_view_; | |
C2 arg_2_view_; | |
static const int ndim = 2; | |
typedef Type T; | |
XINLINE const Shape<ndim>& shape() const { | |
return arg_1_view_.shape(); | |
} | |
XINLINE ElementWiseKernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) | |
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]); | |
} | |
}; | |
template<template <typename> class Functor, typename Type, | |
typename C1, | |
typename C2> | |
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2( | |
const C1& arg_1_view, | |
const C2& arg_2_view) { | |
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view); | |
} | |
template<typename C1> | |
struct Broadcasted_reshapeFTKernel { | |
const C1 array_; | |
const Shape<2> shape_; | |
static const int ndim = 2; | |
typedef typename C1::T T; | |
XINLINE const Shape<ndim>& shape() const {return shape_;} | |
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape) | |
: array_(array), shape_(shape) {} | |
XINLINE T operator[](const Shape<ndim>& query) const { | |
return array_[{query[0], 0}]; | |
} | |
}; | |
template<typename C1> | |
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) { | |
return Broadcasted_reshapeFTKernel<C1>(array, shape); | |
} | |
#include <cooperative_groups.h> | |
template<typename T, typename C1, int ndim> | |
inline __device__ T thread_sum(const C1& input, Shape<ndim> query, int start, int stride) { | |
T sum = 0; | |
int& i = query[ndim - 1]; | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
return sum; | |
} | |
template<typename T, int ndim> | |
inline __device__ int thread_sum(const ArrayView<int, ndim>& input, Shape<ndim> query, int start, int stride) { | |
int sum = 0; | |
int4* ptr = (int4*) &input[query]; | |
int cols_div_word_length = input.shape()[ndim-1] / 4; | |
int& i = query[ndim - 1]; | |
if (cols_div_word_length * 4 == input.shape()[ndim-1]) { | |
for(i = start; | |
i < cols_div_word_length; | |
i += stride) { | |
int4 in = ptr[i]; | |
sum += in.x + in.y + in.z + in.w; | |
} | |
} else { | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
} | |
return sum; | |
} | |
template<typename T, int ndim> | |
inline __device__ float thread_sum(const ArrayView<float, ndim>& input, Shape<ndim> query, int start, int stride) { | |
float sum = 0; | |
float4* ptr = (float4*) &input[query]; | |
int cols_div_word_length = input.shape()[ndim-1] / 4; | |
int& i = query[ndim - 1]; | |
if (cols_div_word_length * 4 == input.shape()[ndim-1]) { | |
for(i = start; | |
i < cols_div_word_length; | |
i += stride) { | |
float4 in = ptr[i]; | |
sum += in.x + in.y + in.z + in.w; | |
} | |
} else { | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
} | |
return sum; | |
} | |
template<typename T, int ndim> | |
inline __device__ double thread_sum(const ArrayView<double, ndim>& input, Shape<ndim> query, int start, int stride) { | |
double sum = 0; | |
double2* ptr = (double2*) &input[query]; | |
int cols_div_word_length = input.shape()[ndim-1] / 2; | |
int& i = query[ndim - 1]; | |
if (cols_div_word_length * 2 == input.shape()[ndim-1]) { | |
for(i = start; | |
i < cols_div_word_length; | |
i += stride) { | |
double2 in = ptr[i]; | |
sum += in.x + in.y; | |
} | |
} else { | |
for(i = start; | |
i < input.shape()[ndim-1]; | |
i += stride) { | |
sum += input[query]; | |
} | |
} | |
return sum; | |
} | |
template <int tile_sz, typename T> | |
__device__ T reduce_sum_tile_shfl(cooperative_groups::thread_block_tile<tile_sz> g, T val) { | |
int lane = g.thread_rank(); | |
// Each iteration halves the number of active threads | |
// Each thread adds its partial sum[i] to sum[lane+i] | |
for (int i = g.size() / 2; i > 0; i /= 2) { | |
val += g.shfl_down(val, i); | |
} | |
return val; // note: only thread 0 will return full sum | |
} | |
template<typename Reducer, typename Type, typename C1> | |
struct ShflDownWarpAxisSum1 { | |
C1 arg_; | |
static const int ndim = C1::ndim - 1; | |
typedef Type T; | |
XINLINE Shape<ndim> shape() const { | |
return arg_.shape().template axis_reduced_shape<0, ndim>(); | |
} | |
XINLINE ShflDownWarpAxisSum1(C1 arg) : arg_(arg) {} | |
inline __device__ T operator[](const Shape<ndim>& input_query) const { | |
__shared__ T sum; | |
sum = 0; | |
Shape<ndim + 1> query = input_query.expand_dims(ndim); | |
query[ndim] = 0; | |
T my_sum = thread_sum<T>(arg_, query, threadIdx.x, blockDim.x); | |
auto tile = cooperative_groups::tiled_partition<16>( | |
cooperative_groups::this_thread_block()); | |
T tile_sum = reduce_sum_tile_shfl<16>(tile, my_sum); | |
if (tile.thread_rank() == 0) atomicAdd(&sum, tile_sum); | |
__syncthreads(); | |
return sum; | |
} | |
}; | |
template<typename Reducer, typename Type, typename C1> | |
XINLINE ShflDownWarpAxisSum1<Reducer, Type, C1> shfl_down_warp_axis_sum2d( | |
C1 arg) { | |
return ShflDownWarpAxisSum1<Reducer, Type, C1>(arg); | |
} | |
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) { | |
auto array_0_view = make_view<float, 2>(array_data[0], offsets[0], sizes[0]); | |
auto array_1_view = make_view<float, 2>(array_data[1], offsets[1], sizes[1]); | |
Shape<2> shape_0(shapes[0]); | |
Shape<2> shape_1(shapes[1]); | |
Shape<2> shape_2(shapes[2]); | |
Shape<2> shape_3(shapes[3]); | |
Shape<2> shape_4(shapes[4]); | |
Shape<2> shape_5(shapes[5]); | |
auto temp_0 = make_view<float, 2>(array_data[2], offsets[2], sizes[2]); | |
auto temp_1 = make_view<float, 2>(array_data[3], offsets[3], sizes[3]); | |
auto temp_2 = make_view<float, 2>(array_data[4], offsets[4], sizes[4]); | |
const int NT = 256; | |
// const int MAX_BLOCKS = 40960; | |
auto dest_shape_0 = temp_0.shape(); | |
int num_el0 = dest_shape_0.numel(); | |
auto dest_shape_1 = temp_2.shape(); | |
int num_el1 = dest_shape_1.numel(); | |
auto dest_shape_2 = temp_1.shape(); | |
int num_el2 = dest_shape_2.numel(); | |
auto dest_shape_3 = array_0_view.shape(); | |
int num_el3 = dest_shape_3.numel(); | |
int max_num_el = dest_shape_0[0]; | |
max_num_el = max(max_num_el, dest_shape_1[0]); | |
max_num_el = max(max_num_el, dest_shape_2[0]); | |
max_num_el = max(max_num_el, dest_shape_3[0]); | |
// assert(grid_size <= MAX_BLOCKS); | |
int grid_size = max_num_el; | |
assign_kernel_11773770055001859919<<<grid_size, NT, 0, NULL>>>( | |
temp_0, | |
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::maximum, float>(array_1_view), shape_5), | |
num_el0, | |
dest_shape_0, | |
temp_2, | |
element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4))), | |
num_el1, | |
dest_shape_1, | |
temp_1, | |
expand_dims_1_2d(shfl_down_warp_axis_sum2d<reducers::sum, float>(temp_2), shape_3), | |
num_el2, | |
dest_shape_2, | |
array_0_view, | |
element_wise_kernel2D2<functor::eltdiv, float>(temp_2, broadcasted_reshapeFT(temp_1, shape_2)), | |
num_el3, | |
dest_shape_3 | |
); | |
} | |
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){ | |
run(a, b, c, d, e, f); | |
} |