Code auto-generated by Dali for the fused softmax expression below:
auto a = op::uniform(-20.0, 20.0, {2, 5}).astype(dtype);
a.eval();
auto exped = op::exp(a - op::max(a, {-1}, true));
auto fused_softmax = exped / op::sum(exped, {-1}, true);
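For reference, every generated kernel below evaluates this same numerically stable softmax along the last axis. A minimal CPU sketch of the computation (the softmax_rows helper and its row-major rows/cols layout are illustrative assumptions, not part of Dali's API):
#include <algorithm>
#include <cmath>
#include <vector>

// Row-wise softmax over a row-major rows x cols buffer:
// out[r, c] = exp(a[r, c] - max_c a[r, c]) / sum_c exp(a[r, c] - max_c a[r, c])
void softmax_rows(const std::vector<double>& a, std::vector<double>& out, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        const double* row = &a[r * cols];
        double* out_row = &out[r * cols];
        // subtract the row maximum before exponentiating for numerical stability
        double row_max = *std::max_element(row, row + cols);
        double total = 0.0;
        for (int c = 0; c < cols; ++c) {
            out_row[c] = std::exp(row[c] - row_max);
            total += out_row[c];
        }
        for (int c = 0; c < cols; ++c) {
            out_row[c] /= total;
        }
    }
}
// ---- Generated variant 1 (below): grid-stride assignment kernel; the row max and the row sum are both serial per-output-element loops (AxisReduceKernel2). ----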
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2>
void __global__
assign_kernel_11204438187360910603(Destination0 dst0, Source0 src0, int num_el0, Shape<2> shape0, Destination1 dst1, Source1 src1, int num_el1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < num_el0; i += stride) {
auto nd_idx = index_to_dim(i, shape0);
dst0[nd_idx] = src0[nd_idx];
}
__syncthreads();
for (int i = idx; i < num_el1; i += stride) {
auto nd_idx = index_to_dim(i, shape1);
dst1[nd_idx] = src1[nd_idx];
}
__syncthreads();
for (int i = idx; i < num_el2; i += stride) {
auto nd_idx = index_to_dim(i, shape2);
dst2[nd_idx] = src2[nd_idx];
}
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, typename Type, typename C1>
struct AxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE AxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
T res;
Reducer::SetInitValue(res);
int& i1 = query[1];
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) {
Reducer::Reduce(res, arg_1_view_[query]);
}
return res;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d(
C1 arg_1_view) {
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<double, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<double, 2>(array_data[1], offsets[1], sizes[1]);
Shape<2> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<2> shape_5(shapes[5]);
auto temp_0 = make_view<double, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<double, 2>(array_data[3], offsets[3], sizes[3]);
const int NT = 128;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
int max_num_el = num_el0;
auto dest_shape_1 = temp_1.shape();
int num_el1 = dest_shape_1.numel();
max_num_el = max(num_el1, max_num_el);
auto dest_shape_2 = array_0_view.shape();
int num_el2 = dest_shape_2.numel();
max_num_el = max(num_el2, max_num_el);
int grid_size = div_ceil(max_num_el, NT);
// assert(grid_size <= MAX_BLOCKS);
assign_kernel_11204438187360910603<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_1),
num_el0,
dest_shape_0,
temp_1,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::sum, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4)))), shape_3),
num_el1,
dest_shape_1,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_0))), broadcasted_reshapeFT(temp_1, shape_2)),
num_el2,
dest_shape_2
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, int x_bits, typename DType>
inline __device__ void ReduceX2(volatile DType buf[], int tid) {
if (x_bits >= 10) {
if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]);
__syncthreads();
}
if (x_bits >= 9) {
if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]);
__syncthreads();
}
if (x_bits >= 8) {
if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]);
__syncthreads();
}
if (x_bits >= 7) {
if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]);
__syncthreads();
}
if (x_bits >= 6) {
if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]);
__syncthreads();
}
// in-warp optimization: the remaining reduction fits within a single warp
if (x_bits >= 5) {
if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]);
__syncthreads();
}
if (x_bits >= 4) {
if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]);
__syncthreads();
}
if (x_bits >= 3) {
if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]);
__syncthreads();
}
if (x_bits >= 2) {
if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]);
__syncthreads();
}
if (x_bits >= 1) {
if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]);
__syncthreads();
}
}
template<typename Reducer, typename Type, typename C1>
struct WarpAxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE WarpAxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
inline __device__ T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
__shared__ T buffer[256];
query[1] = threadIdx.x;
if (threadIdx.x < arg_1_view_.shape()[1]) {
buffer[threadIdx.x] = arg_1_view_[query];
}
for (unsigned x = 256; x < arg_1_view_.shape()[1]; x += 256) {
const int col = x + threadIdx.x;
if (col < arg_1_view_.shape()[1]) {
query[1] = col;
Reducer::Reduce(buffer[threadIdx.x], arg_1_view_[query]);
}
}
__syncthreads();
// if the number of reduced columns is smaller than the block,
// fill the remaining buffer entries with the reducer's neutral value
if (threadIdx.x >= arg_1_view_.shape()[1]) {
Reducer::SetInitValue(buffer[threadIdx.x]);
}
__syncthreads();
ReduceX2<Reducer, 8>(buffer, threadIdx.x);
return buffer[0];
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE WarpAxisReduceKernel2<Reducer, Type, C1> warp_axis_reduce_kernel_2d(
C1 arg_1_view) {
return WarpAxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
template<typename Reducer, typename Type, typename C1>
struct AxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE AxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
T res;
Reducer::SetInitValue(res);
int& i1 = query[1];
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) {
Reducer::Reduce(res, arg_1_view_[query]);
}
return res;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d(
C1 arg_1_view) {
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2>
void __global__
assign_kernel_16960456299461880923(Destination0 dst0, Source0 src0, Shape<2> shape0, Destination1 dst1, Source1 src1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2) {
int stride = blockDim.x * gridDim.x;
int idx = blockIdx.x;
// one block per output element: guard against blocks past the end of each destination
if (idx < shape0[0] * shape0[1]) {
auto nd_idx_2 = index_to_dim(idx, shape0);
dst0[nd_idx_2] = src0[nd_idx_2];
}
__syncthreads();
if (idx < shape1[0] * shape1[1]) {
auto nd_idx_2 = index_to_dim(idx, shape1);
dst1[nd_idx_2] = src1[nd_idx_2];
}
__syncthreads();
idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx; i < num_el2; i += stride) {
auto nd_idx = index_to_dim(i, shape2);
dst2[nd_idx] = src2[nd_idx];
}
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<double, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<double, 2>(array_data[1], offsets[1], sizes[1]);
Shape<2> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<2> shape_5(shapes[5]);
auto temp_0 = make_view<double, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<double, 2>(array_data[3], offsets[3], sizes[3]);
const int NT = 256;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
int max_num_el = num_el0;
auto dest_shape_1 = temp_1.shape();
int num_el1 = dest_shape_1.numel();
max_num_el = max(num_el1, max_num_el);
auto dest_shape_2 = array_0_view.shape();
int num_el2 = dest_shape_2.numel();
max_num_el = max(num_el2, max_num_el);
int grid_size = max_num_el;
// assert(grid_size <= MAX_BLOCKS);
assign_kernel_16960456299461880923<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_1),
dest_shape_0,
temp_1,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::sum, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, double>(array_1_view), shape_5), shape_4)))), shape_3),
dest_shape_1,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, double>(element_wise_kernel2D1<functor::exp, double>(element_wise_kernel2D2<functor::subtract, double>(array_1_view, broadcasted_reshapeFT(temp_0, shape_0))), broadcasted_reshapeFT(temp_1, shape_2)),
num_el2,
dest_shape_2
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, int x_bits, typename DType>
inline __device__ void ReduceX(volatile DType buf[], int tid) {
if (x_bits >= 10) {
if (tid < 512) Reducer::Reduce(buf[tid] , buf[tid + 512]);
__syncthreads();
}
if (x_bits >= 9) {
if (tid < 256) Reducer::Reduce(buf[tid] , buf[tid + 256]);
__syncthreads();
}
if (x_bits >= 8) {
if (tid < 128) Reducer::Reduce(buf[tid] , buf[tid + 128]);
__syncthreads();
}
if (x_bits >= 7) {
if (tid < 64) Reducer::Reduce(buf[tid] , buf[tid + 64]);
__syncthreads();
}
if (x_bits >= 6) {
if (tid < 32) Reducer::Reduce(buf[tid] , buf[tid + 32]);
__syncthreads();
}
// in-warp optimization: the remaining reduction fits within a single warp
if (x_bits >= 5) {
if (tid < 16) Reducer::Reduce(buf[tid] , buf[tid + 16]);
__syncthreads();
}
if (x_bits >= 4) {
if (tid < 8) Reducer::Reduce(buf[tid] , buf[tid + 8]);
__syncthreads();
}
if (x_bits >= 3) {
if (tid < 4) Reducer::Reduce(buf[tid] , buf[tid + 4]);
__syncthreads();
}
if (x_bits >= 2) {
if (tid < 2) Reducer::Reduce(buf[tid] , buf[tid + 2]);
__syncthreads();
}
if (x_bits >= 1) {
if (tid < 1) Reducer::Reduce(buf[tid] , buf[tid + 1]);
__syncthreads();
}
}
template<typename Reducer, typename Type, typename C1>
struct WarpAxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE WarpAxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
inline __device__ T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
__shared__ T buffer[256];
query[1] = threadIdx.x;
if (threadIdx.x < arg_1_view_.shape()[1]) {
buffer[threadIdx.x] = arg_1_view_[query];
}
for (unsigned x = blockDim.x; x < arg_1_view_.shape()[1]; x += blockDim.x) {
const int col = x + threadIdx.x;
if (col < arg_1_view_.shape()[1]) {
query[1] = col;
Reducer::Reduce(buffer[threadIdx.x], arg_1_view_[query]);
}
}
__syncthreads();
// if the number of reduced columns is smaller than the block,
// fill the remaining buffer entries with the reducer's neutral value
if (threadIdx.x >= arg_1_view_.shape()[1]) {
Reducer::SetInitValue(buffer[threadIdx.x]);
}
__syncthreads();
ReduceX<Reducer, 8>(buffer, threadIdx.x);
return buffer[0];
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE WarpAxisReduceKernel2<Reducer, Type, C1> warp_axis_reduce_kernel_2d(
C1 arg_1_view) {
return WarpAxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
#include <cooperative_groups.h>
template<typename T, typename C1, int ndim>
inline __device__ T thread_sum(const C1& input, Shape<ndim> query, int start, int stride) {
T sum = 0;
int& i = query[ndim - 1];
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
return sum;
}
template<typename T, int ndim>
inline __device__ int thread_sum(const ArrayView<int, ndim>& input, Shape<ndim> query, int start, int stride) {
int sum = 0;
int4* ptr = (int4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
int4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ float thread_sum(const ArrayView<float, ndim>& input, Shape<ndim> query, int start, int stride) {
float sum = 0;
float4* ptr = (float4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
float4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ double thread_sum(const ArrayView<double, ndim>& input, Shape<ndim> query, int start, int stride) {
double sum = 0;
double2* ptr = (double2*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 2;
int& i = query[ndim - 1];
if (cols_div_word_length * 2 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
double2 in = ptr[i];
sum += in.x + in.y;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template <int tile_sz, typename T>
__device__ T reduce_sum_tile_shfl(cooperative_groups::thread_block_tile<tile_sz> g, T val) {
int lane = g.thread_rank();
// Each iteration halves the number of active lanes:
// each lane adds the value shuffled down from lane + i to its partial sum
for (int i = g.size() / 2; i > 0; i /= 2) {
val += g.shfl_down(val, i);
}
return val; // note: only thread 0 will return full sum
}
template<typename Reducer, typename Type, typename C1>
struct ShflDownWarpAxisSum1 {
C1 arg_;
static const int ndim = C1::ndim - 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE ShflDownWarpAxisSum1(C1 arg) : arg_(arg) {}
inline __device__ T operator[](const Shape<ndim>& input_query) const {
__shared__ T sum;
// initialize from a single thread and synchronize to avoid racing with the atomicAdd below
if (threadIdx.x == 0) sum = 0;
__syncthreads();
Shape<ndim + 1> query = input_query.expand_dims(ndim);
query[ndim] = 0;
T my_sum = thread_sum<T>(arg_, query, threadIdx.x, blockDim.x);
auto tile = cooperative_groups::tiled_partition<16>(
cooperative_groups::this_thread_block());
T tile_sum = reduce_sum_tile_shfl<16>(tile, my_sum);
if (tile.thread_rank() == 0) atomicAdd(&sum, tile_sum);
__syncthreads();
return sum;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE ShflDownWarpAxisSum1<Reducer, Type, C1> shfl_down_warp_axis_sum2d(
C1 arg) {
return ShflDownWarpAxisSum1<Reducer, Type, C1>(arg);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2, typename Destination3, typename Source3>
void __global__
assign_kernel_6585648013932656081(Destination0 dst0, Source0 src0, Shape<2> shape0, Destination1 dst1, Source1 src1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2, Destination3 dst3, Source3 src3, int num_el3, Shape<2> shape3) {
int idx = blockIdx.x;
// one block per output element: guard against blocks past the end of each destination
if (idx < shape0[0] * shape0[1]) {
auto nd_idx_2 = index_to_dim(idx, shape0);
dst0[nd_idx_2] = src0[nd_idx_2];
}
__syncthreads();
if (idx < shape1[0] * shape1[1]) {
auto nd_idx_2 = index_to_dim(idx, shape1);
dst1[nd_idx_2] = src1[nd_idx_2];
}
__syncthreads();
idx = blockDim.x * blockIdx.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = idx; i < num_el2; i += stride) {
auto nd_idx = index_to_dim(i, shape2);
dst2[nd_idx] = src2[nd_idx];
}
__syncthreads();
idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx; i < num_el3; i += stride) {
auto nd_idx = index_to_dim(i, shape3);
dst3[nd_idx] = src3[nd_idx];
}
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<float, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<float, 2>(array_data[1], offsets[1], sizes[1]);
Shape<1> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<1> shape_5(shapes[5]);
auto temp_0 = make_view<float, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<float, 2>(array_data[3], offsets[3], sizes[3]);
auto temp_2 = make_view<float, 2>(array_data[4], offsets[4], sizes[4]);
const int NT = 256;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
int max_num_el = num_el0;
auto dest_shape_1 = temp_1.shape();
int num_el1 = dest_shape_1.numel();
max_num_el = max(num_el1, max_num_el);
auto dest_shape_2 = temp_2.shape();
int num_el2 = dest_shape_2.numel();
max_num_el = max(num_el2, max_num_el);
auto dest_shape_3 = array_0_view.shape();
int num_el3 = dest_shape_3.numel();
max_num_el = max(num_el3, max_num_el);
int grid_size = max_num_el;
// assert(grid_size <= MAX_BLOCKS);
assign_kernel_6585648013932656081<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(warp_axis_reduce_kernel_2d<reducers::maximum, float>(array_1_view), shape_1),
dest_shape_0,
temp_1,
expand_dims_1_2d(shfl_down_warp_axis_sum2d<reducers::sum, float>(element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4)))), shape_3),
dest_shape_1,
temp_2,
element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4))),
num_el2,
dest_shape_2,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, float>(temp_2, broadcasted_reshapeFT(temp_1, shape_2)),
num_el3,
dest_shape_3
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
#include "dali/array/jit/array_view.h"
#include "dali/array/jit/reducer_kernels.h"
#include "dali/array/functor.h"
template <typename Destination0, typename Source0, typename Destination1, typename Source1, typename Destination2, typename Source2, typename Destination3, typename Source3>
void __global__
assign_kernel_11773770055001859919(Destination0 dst0, Source0 src0, int num_el0, Shape<2> shape0, Destination1 dst1, Source1 src1, int num_el1, Shape<2> shape1, Destination2 dst2, Source2 src2, int num_el2, Shape<2> shape2, Destination3 dst3, Source3 src3, int num_el3, Shape<2> shape3) {
for (int i0 = blockIdx.x; i0 < shape0[0]; i0 += gridDim.x) {
for (int i1 = threadIdx.x; i1 < shape0[1]; i1 += blockDim.x) {
Shape<2> nd_idx({i0, i1});
dst0[nd_idx] = src0[nd_idx];
}
}
__syncthreads();
for (int i0 = blockIdx.x; i0 < shape1[0]; i0 += gridDim.x) {
for (int i1 = threadIdx.x; i1 < shape1[1]; i1 += blockDim.x) {
Shape<2> nd_idx({i0, i1});
dst1[nd_idx] = src1[nd_idx];
}
}
__syncthreads();
for (int i0 = blockIdx.x; i0 < shape2[0]; i0 += gridDim.x) {
for (int i1 = 0; i1 < shape2[1]; i1 += 1) {
Shape<2> nd_idx({i0, i1});
dst2[nd_idx] = src2[nd_idx];
}
}
__syncthreads();
for (int i0 = blockIdx.x; i0 < shape3[0]; i0 += gridDim.x) {
for (int i1 = threadIdx.x; i1 < shape3[1]; i1 += blockDim.x) {
Shape<2> nd_idx({i0, i1});
dst3[nd_idx] = src3[nd_idx];
}
}
}
template<typename C1>
struct Expand_dims_1_2dKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Expand_dims_1_2dKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0]}];
}
};
template<typename C1>
Expand_dims_1_2dKernel<C1> expand_dims_1_2d(const C1& array, const Shape<2>& shape) {
return Expand_dims_1_2dKernel<C1>(array, shape);
}
template<typename Reducer, typename Type, typename C1>
struct AxisReduceKernel2 {
C1 arg_1_view_;
static const int ndim = 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_1_view_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE AxisReduceKernel2(
C1 arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<1>& input_query) const {
Shape<2> query = input_query.expand_dims(2);
T res;
Reducer::SetInitValue(res);
int& i1 = query[1];
for (i1 = 0; i1 < arg_1_view_.shape()[1]; ++i1) {
Reducer::Reduce(res, arg_1_view_[query]);
}
return res;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE AxisReduceKernel2<Reducer, Type, C1> axis_reduce_kernel_2d(
C1 arg_1_view) {
return AxisReduceKernel2<Reducer, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1>
struct ElementWiseKernel2D1 {
C1 arg_1_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D1(
const C1& arg_1_view)
: arg_1_view_(arg_1_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1>
XINLINE ElementWiseKernel2D1<Functor, Type, C1> element_wise_kernel2D1(
const C1& arg_1_view) {
return ElementWiseKernel2D1<Functor, Type, C1>(arg_1_view);
}
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
struct ElementWiseKernel2D2 {
C1 arg_1_view_;
C2 arg_2_view_;
static const int ndim = 2;
typedef Type T;
XINLINE const Shape<ndim>& shape() const {
return arg_1_view_.shape();
}
XINLINE ElementWiseKernel2D2(
const C1& arg_1_view,
const C2& arg_2_view)
: arg_1_view_(arg_1_view), arg_2_view_(arg_2_view) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return Functor<T>::Map(arg_1_view_[query], arg_2_view_[query]);
}
};
template<template <typename> class Functor, typename Type,
typename C1,
typename C2>
XINLINE ElementWiseKernel2D2<Functor, Type, C1, C2> element_wise_kernel2D2(
const C1& arg_1_view,
const C2& arg_2_view) {
return ElementWiseKernel2D2<Functor, Type, C1, C2>(arg_1_view, arg_2_view);
}
template<typename C1>
struct Broadcasted_reshapeFTKernel {
const C1 array_;
const Shape<2> shape_;
static const int ndim = 2;
typedef typename C1::T T;
XINLINE const Shape<ndim>& shape() const {return shape_;}
XINLINE Broadcasted_reshapeFTKernel(const C1& array, const Shape<2>& shape)
: array_(array), shape_(shape) {}
XINLINE T operator[](const Shape<ndim>& query) const {
return array_[{query[0], 0}];
}
};
template<typename C1>
Broadcasted_reshapeFTKernel<C1> broadcasted_reshapeFT(const C1& array, const Shape<2>& shape) {
return Broadcasted_reshapeFTKernel<C1>(array, shape);
}
#include <cooperative_groups.h>
template<typename T, typename C1, int ndim>
inline __device__ T thread_sum(const C1& input, Shape<ndim> query, int start, int stride) {
T sum = 0;
int& i = query[ndim - 1];
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
return sum;
}
template<typename T, int ndim>
inline __device__ int thread_sum(const ArrayView<int, ndim>& input, Shape<ndim> query, int start, int stride) {
int sum = 0;
int4* ptr = (int4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
int4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ float thread_sum(const ArrayView<float, ndim>& input, Shape<ndim> query, int start, int stride) {
float sum = 0;
float4* ptr = (float4*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 4;
int& i = query[ndim - 1];
if (cols_div_word_length * 4 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
float4 in = ptr[i];
sum += in.x + in.y + in.z + in.w;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template<typename T, int ndim>
inline __device__ double thread_sum(const ArrayView<double, ndim>& input, Shape<ndim> query, int start, int stride) {
double sum = 0;
double2* ptr = (double2*) &input[query];
int cols_div_word_length = input.shape()[ndim-1] / 2;
int& i = query[ndim - 1];
if (cols_div_word_length * 2 == input.shape()[ndim-1]) {
for(i = start;
i < cols_div_word_length;
i += stride) {
double2 in = ptr[i];
sum += in.x + in.y;
}
} else {
for(i = start;
i < input.shape()[ndim-1];
i += stride) {
sum += input[query];
}
}
return sum;
}
template <int tile_sz, typename T>
__device__ T reduce_sum_tile_shfl(cooperative_groups::thread_block_tile<tile_sz> g, T val) {
int lane = g.thread_rank();
// Each iteration halves the number of active lanes:
// each lane adds the value shuffled down from lane + i to its partial sum
for (int i = g.size() / 2; i > 0; i /= 2) {
val += g.shfl_down(val, i);
}
return val; // note: only thread 0 will return full sum
}
template<typename Reducer, typename Type, typename C1>
struct ShflDownWarpAxisSum1 {
C1 arg_;
static const int ndim = C1::ndim - 1;
typedef Type T;
XINLINE Shape<ndim> shape() const {
return arg_.shape().template axis_reduced_shape<0, ndim>();
}
XINLINE ShflDownWarpAxisSum1(C1 arg) : arg_(arg) {}
inline __device__ T operator[](const Shape<ndim>& input_query) const {
__shared__ T sum;
// initialize from a single thread and synchronize to avoid racing with the atomicAdd below
if (threadIdx.x == 0) sum = 0;
__syncthreads();
Shape<ndim + 1> query = input_query.expand_dims(ndim);
query[ndim] = 0;
T my_sum = thread_sum<T>(arg_, query, threadIdx.x, blockDim.x);
auto tile = cooperative_groups::tiled_partition<16>(
cooperative_groups::this_thread_block());
T tile_sum = reduce_sum_tile_shfl<16>(tile, my_sum);
if (tile.thread_rank() == 0) atomicAdd(&sum, tile_sum);
__syncthreads();
return sum;
}
};
template<typename Reducer, typename Type, typename C1>
XINLINE ShflDownWarpAxisSum1<Reducer, Type, C1> shfl_down_warp_axis_sum2d(
C1 arg) {
return ShflDownWarpAxisSum1<Reducer, Type, C1>(arg);
}
void run(void** array_data, const int* offsets, const int** sizes, const int** strides, const void** scalar_arguments, const int** shapes) {
auto array_0_view = make_view<float, 2>(array_data[0], offsets[0], sizes[0]);
auto array_1_view = make_view<float, 2>(array_data[1], offsets[1], sizes[1]);
Shape<2> shape_0(shapes[0]);
Shape<2> shape_1(shapes[1]);
Shape<2> shape_2(shapes[2]);
Shape<2> shape_3(shapes[3]);
Shape<2> shape_4(shapes[4]);
Shape<2> shape_5(shapes[5]);
auto temp_0 = make_view<float, 2>(array_data[2], offsets[2], sizes[2]);
auto temp_1 = make_view<float, 2>(array_data[3], offsets[3], sizes[3]);
auto temp_2 = make_view<float, 2>(array_data[4], offsets[4], sizes[4]);
const int NT = 256;
// const int MAX_BLOCKS = 40960;
auto dest_shape_0 = temp_0.shape();
int num_el0 = dest_shape_0.numel();
auto dest_shape_1 = temp_2.shape();
int num_el1 = dest_shape_1.numel();
auto dest_shape_2 = temp_1.shape();
int num_el2 = dest_shape_2.numel();
auto dest_shape_3 = array_0_view.shape();
int num_el3 = dest_shape_3.numel();
int max_num_el = dest_shape_0[0];
max_num_el = max(max_num_el, dest_shape_1[0]);
max_num_el = max(max_num_el, dest_shape_2[0]);
max_num_el = max(max_num_el, dest_shape_3[0]);
// assert(grid_size <= MAX_BLOCKS);
int grid_size = max_num_el;
assign_kernel_11773770055001859919<<<grid_size, NT, 0, NULL>>>(
temp_0,
expand_dims_1_2d(axis_reduce_kernel_2d<reducers::maximum, float>(array_1_view), shape_5),
num_el0,
dest_shape_0,
temp_2,
element_wise_kernel2D1<functor::exp, float>(element_wise_kernel2D2<functor::subtract, float>(array_1_view, broadcasted_reshapeFT(temp_0, shape_4))),
num_el1,
dest_shape_1,
temp_1,
expand_dims_1_2d(shfl_down_warp_axis_sum2d<reducers::sum, float>(temp_2), shape_3),
num_el2,
dest_shape_2,
array_0_view,
element_wise_kernel2D2<functor::eltdiv, float>(temp_2, broadcasted_reshapeFT(temp_1, shape_2)),
num_el3,
dest_shape_3
);
}
extern "C" void maker (void** a, int const* b, int const** c, int const** d, void const** e, int const** f){
run(a, b, c, d, e, f);
}
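Each generated translation unit exposes the extern "C" maker symbol above as its entry point. A minimal sketch of how such a compiled module could be loaded and invoked on POSIX (the call_generated_kernel helper is hypothetical; the exact layout of the data/offset/size/stride/scalar/shape arguments is managed by Dali's JIT runtime and is not shown in this gist):
#include <dlfcn.h>

typedef void (*maker_t)(void**, const int*, const int**, const int**,
                        const void**, const int**);

// Load a JIT-compiled shared object and call its generated "maker" entry point.
void call_generated_kernel(const char* so_path,
                           void** array_data, const int* offsets,
                           const int** sizes, const int** strides,
                           const void** scalar_arguments, const int** shapes) {
    void* handle = dlopen(so_path, RTLD_NOW);
    if (handle == nullptr) return;
    maker_t fn = (maker_t)dlsym(handle, "maker");
    if (fn != nullptr) {
        fn(array_data, offsets, sizes, strides, scalar_arguments, shapes);
    }
    dlclose(handle);
}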