Sample code to compute convolution output
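The sample below drives a single NEConvolutionLayer through the ARM Compute Library's NEON runtime: it sets up the memory managers, configures the layer for a 6x3x2 input and four 4x3x2 filters, fills the input with ones, loads the weights and biases from w.npy and b.npy via libnpy, runs the convolution, and dumps the result to out.npy so it can be checked against TensorFlow's tf.nn.conv2d.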
#include <iostream>
#include "include/libnpy/npy.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"
#include "utils/Utils.h"

using namespace arm_compute;
using namespace utils;
using namespace std;

class TestCNNExample : public Example
{
public:
    bool do_setup(int argc, char **argv) override
    {
        ARM_COMPUTE_UNUSED(argc);
        ARM_COMPUTE_UNUSED(argv);

        // Create memory manager components.
        // We need 2 memory managers: one for the tensors within the functions (mm_layers)
        // and one for the input and output tensors of the functions (mm_transitions).
        auto lifetime_mgr0  = std::make_shared<BlobLifetimeManager>();   // Lifetime managers
        auto lifetime_mgr1  = std::make_shared<BlobLifetimeManager>();
        auto pool_mgr0      = std::make_shared<PoolManager>();           // Pool managers
        auto pool_mgr1      = std::make_shared<PoolManager>();
        auto mm_layers      = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr0, pool_mgr0);
        auto mm_transitions = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr1, pool_mgr1);

        // The weights and biases tensors should be initialized with the values inferred during training.
        // Pass the memory manager so the layer is allowed to manage its internal memory requirements.
        conv0 = arm_compute::support::cpp14::make_unique<NEConvolutionLayer>(mm_layers);

        /*
         * From https://www.tensorflow.org/api_docs/python/tf/nn/conv2d
         * input  -> (1, 6, 3, 2)  [batch = 1, in_height = 6, in_width = 3, in_channels = 2]
         * filter -> (4, 3, 2, 4)  [filter_height = 4, filter_width = 3, in_channels = 2, out_channels = 4]
         * output -> (1, 6, 3, 4)  [height = 6, width = 3, channels = 4]
         */
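        /*
         * Note on shape ordering (clarification, based on ACL conventions): a TensorShape
         * lists dimensions fastest-first, i.e. (x = width, y = height, z = channels[, batch])
         * for the default NCHW data layout. The TF NHWC input (1, 6, 3, 2) above therefore
         * becomes src_shape(3, 6, 2) below, with the batch of 1 left implicit.
         */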
        // Initialize src tensor
        constexpr unsigned int width_src_image  = 3;
        constexpr unsigned int height_src_image = 6;
        constexpr unsigned int ifm_src_img      = 2;
        const TensorShape src_shape(width_src_image, height_src_image, ifm_src_img);
        src.allocator()->init(TensorInfo(src_shape, 1, DataType::F32));

        // Initialize tensors of conv0
        constexpr unsigned int kernel_x_conv0 = 3;
        constexpr unsigned int kernel_y_conv0 = 4;
        constexpr unsigned int ofm_conv0      = 4;
        // TF stores conv weights as HWIO; ACL with the default NCHW layout expects OIHW,
        // which as a TensorShape (fastest dimension first) is written (W, H, I, O)
        const TensorShape weights_shape_conv0(kernel_x_conv0, kernel_y_conv0, src_shape.z(), ofm_conv0);
        const TensorShape biases_shape_conv0(ofm_conv0);
        const TensorShape out_shape_conv0(src_shape.x(), src_shape.y(), weights_shape_conv0[3]);
        weights0.allocator()->init(TensorInfo(weights_shape_conv0, 1, DataType::F32));
        biases0.allocator()->init(TensorInfo(biases_shape_conv0, 1, DataType::F32));
        out_conv0.allocator()->init(TensorInfo(out_shape_conv0, 1, DataType::F32));
        /* -----------------------End: [Initialize tensors] */

        /* [Configure functions] */
        // PadStrideInfo(unsigned int stride_x, unsigned int stride_y, unsigned int pad_left,
        //               unsigned int pad_right, unsigned int pad_top, unsigned int pad_bottom,
        //               DimensionRoundingType round)
        conv0->configure(&src, &weights0, &biases0, &out_conv0,
                         PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 1 /* pad_left */, 1 /* pad_right */,
                                       1 /* pad_top */, 2 /* pad_bottom */, DimensionRoundingType::FLOOR));
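        // Worked check of the output size with the usual convolution formula:
        //   out_h = (in_h + pad_top + pad_bottom - kernel_h) / stride_y + 1 = (6 + 1 + 2 - 4) / 1 + 1 = 6
        //   out_w = (in_w + pad_left + pad_right - kernel_w) / stride_x + 1 = (3 + 1 + 1 - 3) / 1 + 1 = 3
        // This reproduces TF's SAME padding for these shapes, matching out_shape_conv0 = (3, 6, 4).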
        /* -----------------------End: [Configure functions] */

        /* [Add tensors to memory manager] */
        // We need 2 memory groups for handling the input and output
        // We call allocate() explicitly after manage() in order to avoid overlapping lifetimes
        memory_group0 = arm_compute::support::cpp14::make_unique<MemoryGroup>(mm_transitions);
        memory_group1 = arm_compute::support::cpp14::make_unique<MemoryGroup>(mm_transitions);
        memory_group0->manage(&out_conv0);
        out_conv0.allocator()->allocate();
        /* -----------------------End: [Add tensors to memory manager] */

        /* [Allocate tensors] */
        // Now that the padding requirements are known we can allocate all tensors
        src.allocator()->allocate();
        weights0.allocator()->allocate();
        biases0.allocator()->allocate();
        /* -----------------------End: [Allocate tensors] */

        // Populate the layers manager (validity checks, memory allocations, etc.)
        mm_layers->populate(allocator, 1 /* num_pools */);
        // Populate the transitions manager (validity checks, memory allocations, etc.)
        mm_transitions->populate(allocator, 2 /* num_pools */);
        return true;
    }

    void do_run() override
    {
        // Acquire memory for the memory groups
        memory_group0->acquire();
        memory_group1->acquire();

        // Fill the input tensor with ones
        {
            Window window;
            window.use_tensor_dimensions(src.info()->tensor_shape());
            execute_window_loop(window, [&](const Coordinates &id)
            {
                *reinterpret_cast<float *>(src.ptr_to_element(id)) = 1.0f;
            });
        }

        // Load the weights from w.npy and copy them into weights0
        {
            vector<unsigned long> shape;
            vector<float>         data;
            npy::LoadArrayFromNumpy("./build/examples/w.npy", shape, data);
std::cout << "weights shape contains:"; | |
for (std::vector<unsigned long>::iterator it = shape.begin() ; it != shape.end(); ++it) | |
std::cout << ' ' << *it; | |
std::cout << '\n'; | |
Window window; | |
window.use_tensor_dimensions(weights0.info()->tensor_shape()); | |
const DataLayout data_layout = weights0.info()->data_layout(); | |
const TensorShape tensor_shape = weights0.info()->tensor_shape(); | |
cout << "Num dims " << tensor_shape.num_dimensions() << std::endl; | |
for (int i = 0; i < tensor_shape.num_dimensions(); i++) | |
std::cout << " " << tensor_shape[i]; | |
std::cout << '\n'; | |
std::cout << " WEIGHT WIDTH " << window.x().end(); | |
std::cout << " HT " << window.y().end(); | |
std::cout << " CHANNEL " << window.z().end(); | |
std::cout << " BATCH " << window[3].end() << endl; | |
Iterator out(&weights0, window); | |
int count = 0; | |
execute_window_loop(window, [&](const Coordinates & id) | |
{ | |
std::cout << "( " << id[3] << ", " << id[2] << ", " << id[1] << ", " << id[0] << " ) " ; | |
std::cout << " - " << data[count] << endl; | |
*reinterpret_cast<float *>(out.ptr()) = static_cast<float>( data[count]); | |
count++; | |
}, out); | |
} | |
{ | |
vector<unsigned long> shape; | |
vector<float>data; | |
npy::LoadArrayFromNumpy("./build/examples/b.npy", shape, data); | |
std::cout << "bias shape contains:"; | |
for (std::vector<unsigned long>::iterator it = shape.begin() ; it != shape.end(); ++it) | |
std::cout << ' ' << *it; | |
std::cout << '\n'; | |
int count = 0; | |
Window window; | |
window.use_tensor_dimensions(biases0.info()->tensor_shape()); | |
const DataLayout data_layout = biases0.info()->data_layout(); | |
const TensorShape tensor_shape = biases0.info()->tensor_shape(); | |
cout << "Num dims " << tensor_shape.num_dimensions() << std::endl; | |
for (int i = 0; i < tensor_shape.num_dimensions(); i++) | |
std::cout << " " << tensor_shape[i]; | |
std::cout << '\n'; | |
std::cout << " BIAS WIDTH " << window.x().end(); | |
std::cout << " HT " << window.y().end(); | |
std::cout << " CHANNEL" << window.z().end(); | |
std::cout << " BATCH" << window[3].end() << endl; | |
Iterator out(&biases0, window); | |
execute_window_loop(window, [&](const Coordinates & id) | |
{ | |
std::cout << " B " << data[count] << std::endl; | |
*reinterpret_cast<float *>(out.ptr()) = static_cast<float>( data[count]); | |
count++; | |
}, out); | |
} | |
conv0->run(); | |
{ | |
Window window; | |
window.use_tensor_dimensions(out_conv0.info()->tensor_shape()); | |
const DataLayout data_layout = out_conv0.info()->data_layout(); | |
const TensorShape tensor_shape = out_conv0.info()->tensor_shape(); | |
cout << "max dims " << tensor_shape.num_dimensions() << std::endl; | |
for (int i = 0; i < tensor_shape.num_dimensions(); i++) | |
std::cout << " " << tensor_shape[i]; | |
std::cout << '\n'; | |
std::cout << " OUT WIDTH " << window.x().end(); | |
std::cout << " HT " << window.y().end(); | |
std::cout << " CHANNEL " << window.z().end(); | |
std::cout << " BATCH " << window[3].end() << endl; | |
unsigned long ww = window.x().end(); | |
unsigned long hh = window.y().end(); | |
unsigned long cc = window.z().end(); | |
unsigned long bb = window[3].end(); | |
//const unsigned long shape[] = {ww, hh, cc, bb}; | |
const unsigned long shape[] = {bb, cc, hh, ww}; | |
std::vector <float> data (shape[0] *shape[1] * shape[2] * shape[3]); | |
int count = 0; | |
execute_window_loop(window, [&](const Coordinates & id) | |
{ | |
data[count++] = *reinterpret_cast<float *>(out_conv0.ptr_to_element(id)); | |
}); | |
npy::SaveArrayAsNumpy( "./build/examples/out.npy", false, 4, shape, data); | |
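            // out.npy thus holds the result as (1, 4, 6, 3) in NCHW order. To compare it
            // with TF's NHWC output (1, 6, 3, 4), transpose with axes (0, 2, 3, 1),
            // e.g. np.transpose(out, (0, 2, 3, 1)) in NumPy.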
        }

        // Release memory
        memory_group0->release();
        memory_group1->release();
    }

private:
    // The src tensor should contain the input image
    Tensor src{};
    // Weights, biases and output tensors of the convolution
    Tensor weights0{};
    Tensor biases0{};
    Tensor out_conv0{};
    // NEON allocator
    Allocator allocator{};
    // Memory groups
    std::unique_ptr<MemoryGroup> memory_group0{};
    std::unique_ptr<MemoryGroup> memory_group1{};
    // Layers
    std::unique_ptr<NEConvolutionLayer> conv0{};
};

/** Main program for the CNN test
 *
 * The example implements the following CNN architecture:
 *
 * Input -> conv0
 *
 * @param[in] argc Number of arguments
 * @param[in] argv Arguments
 */
int main(int argc, char **argv)
{
    return utils::run_example<TestCNNExample>(argc, argv);
}
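Note on running the sample: the hard-coded paths assume it is built as an ACL example and launched from the source-tree root, with w.npy and b.npy already present in ./build/examples/. Those files are not part of the gist; one plausible way to produce them (an assumption, not something the gist specifies) is to save the TF filter as float32 transposed from HWIO to OIHW (NumPy axes (3, 2, 0, 1)) and the bias as a flat float32 array of length 4. The saved out.npy is in NCHW order, so transpose it to NHWC before comparing element-wise with tf.nn.conv2d.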