Last active
October 19, 2022 07:31
-
-
Save battlesnake/2cce48284e2230ed05c80272f2419e72 to your computer and use it in GitHub Desktop.
STM32H750 DMA-based memcpy performance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
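Max clocks (480 MHz CPU/SysTick, 240 MHz AXI+AHB1+AHB2)
Using DMA2 (in the D2 domain)
Copying 32-bit transfers, without bursts
Note: the rates in the results below are labelled "kB/s", but the program computes bytes × 480 MHz / cycles / 1048576, so the figures are actually MiB/s.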
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
================================================================================ | |
== D1 AXI SRAM to D1 AXI SRAM | |
================================================================================ | |
Copying 64 bytes from D1 AXI SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
263 cycles until first byte changed | |
723 cycles until last byte changed | |
40 kB/s overall | |
63 kB/s since first byte changed | |
158 kB/s for memcpy via CPU | |
0.2x relative to memcpy | |
Copying 256 bytes from D1 AXI SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
263 cycles until first byte changed | |
2363 cycles until last byte changed | |
49 kB/s overall | |
55 kB/s since first byte changed | |
205 kB/s for memcpy via CPU | |
0.2x relative to memcpy | |
Copying 4096 bytes from D1 AXI SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
263 cycles until first byte changed | |
35163 cycles until last byte changed | |
53 kB/s overall | |
53 kB/s since first byte changed | |
227 kB/s for memcpy via CPU | |
0.2x relative to memcpy | |
Copying 65536 bytes from D1 AXI SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
263 cycles until first byte changed | |
559939 cycles until last byte changed | |
53 kB/s overall | |
53 kB/s since first byte changed | |
228 kB/s for memcpy via CPU | |
0.2x relative to memcpy | |
================================================================================ | |
== D2 AHB SRAM to D2 AHB SRAM | |
================================================================================ | |
Copying 64 bytes from D2 AHB SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
371 cycles until last byte changed | |
78 kB/s overall | |
281 kB/s since first byte changed | |
161 kB/s for memcpy via CPU | |
0.4x relative to memcpy | |
Copying 256 bytes from D2 AHB SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
943 cycles until last byte changed | |
124 kB/s overall | |
173 kB/s since first byte changed | |
207 kB/s for memcpy via CPU | |
0.5x relative to memcpy | |
Copying 4096 bytes from D2 AHB SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
13063 cycles until last byte changed | |
143 kB/s overall | |
146 kB/s since first byte changed | |
227 kB/s for memcpy via CPU | |
0.6x relative to memcpy | |
Copying 65536 bytes from D2 AHB SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
206931 cycles until last byte changed | |
144 kB/s overall | |
145 kB/s since first byte changed | |
228 kB/s for memcpy via CPU | |
0.6x relative to memcpy | |
================================================================================ | |
== D1 AXI SRAM to D2 AHB SRAM | |
================================================================================ | |
Copying 64 bytes from D1 AXI SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
427 cycles until last byte changed | |
68 kB/s overall | |
183 kB/s since first byte changed | |
161 kB/s for memcpy via CPU | |
0.4x relative to memcpy | |
Copying 256 bytes from D1 AXI SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
1215 cycles until last byte changed | |
96 kB/s overall | |
123 kB/s since first byte changed | |
207 kB/s for memcpy via CPU | |
0.4x relative to memcpy | |
Copying 4096 bytes from D1 AXI SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
16335 cycles until last byte changed | |
114 kB/s overall | |
116 kB/s since first byte changed | |
227 kB/s for memcpy via CPU | |
0.5x relative to memcpy | |
Copying 65536 bytes from D1 AXI SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
265 cycles until first byte changed | |
258253 cycles until last byte changed | |
116 kB/s overall | |
116 kB/s since first byte changed | |
228 kB/s for memcpy via CPU | |
0.5x relative to memcpy | |
Copying 131072 bytes from D1 AXI SRAM to D2 AHB SRAM | |
162 cycles to start DMA copy | |
265 cycles until first byte changed | |
516307 cycles until last byte changed | |
116 kB/s overall | |
116 kB/s since first byte changed | |
228 kB/s for memcpy via CPU | |
0.5x relative to memcpy | |
================================================================================ | |
== D2 AHB SRAM to D1 AXI SRAM | |
================================================================================ | |
Copying 64 bytes from D2 AHB SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
267 cycles until first byte changed | |
479 cycles until last byte changed | |
61 kB/s overall | |
138 kB/s since first byte changed | |
158 kB/s for memcpy via CPU | |
0.3x relative to memcpy | |
Copying 256 bytes from D2 AHB SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
265 cycles until first byte changed | |
1341 cycles until last byte changed | |
87 kB/s overall | |
108 kB/s since first byte changed | |
205 kB/s for memcpy via CPU | |
0.4x relative to memcpy | |
Copying 4096 bytes from D2 AHB SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
265 cycles until first byte changed | |
18621 cycles until last byte changed | |
100 kB/s overall | |
102 kB/s since first byte changed | |
227 kB/s for memcpy via CPU | |
0.4x relative to memcpy | |
Copying 65536 bytes from D2 AHB SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
265 cycles until first byte changed | |
295123 cycles until last byte changed | |
101 kB/s overall | |
101 kB/s since first byte changed | |
228 kB/s for memcpy via CPU | |
0.4x relative to memcpy | |
Copying 131072 bytes from D2 AHB SRAM to D1 AXI SRAM | |
162 cycles to start DMA copy | |
265 cycles until first byte changed | |
590013 cycles until last byte changed | |
101 kB/s overall | |
101 kB/s since first byte changed | |
228 kB/s for memcpy via CPU | |
0.4x relative to memcpy |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Describes one memory region that can act as the source or destination of a copy test. */
struct ramdef
{
	const char *name;      /* region name printed in the benchmark report */
	volatile uint8_t *ptr; /* first byte of the region */
	uint32_t size;         /* number of bytes usable through ptr */
};
#define FREQ (480000000ULL) | |
#define MEM_SIZE (0x28000) | |
static volatile uint8_t d1buf[MEM_SIZE] _dma_align _d1ram_bss; | |
static volatile uint8_t d2buf[MEM_SIZE] _dma_align _d2ram_bss; | |
const struct ramdef d1 = { .name = "D1 AXI SRAM", .ptr = d1buf, .size = 0x20000 }; | |
const struct ramdef d1_a = { .name = "D1 AXI SRAM", .ptr = d1buf, .size = 0x10000 }; | |
const struct ramdef d1_b = { .name = "D1 AXI SRAM", .ptr = d1buf + 0x10000, .size = 0x10000 }; | |
const struct ramdef d2 = { .name = "D2 AHB SRAM", .ptr = d2buf, .size = 0x20000 }; | |
const struct ramdef d2_a = { .name = "D2 AHB SRAM", .ptr = d2buf, .size = 0x10000 }; | |
const struct ramdef d2_b = { .name = "D2 AHB SRAM", .ptr = d2buf + 0x10000, .size = 0x10000 }; | |
static void run_test(const struct ramdef *src, const struct ramdef *dst, size_t size) | |
{ | |
if (src->size < size || dst->size < size) { | |
return; | |
} | |
memset(src->ptr, 0x55, size); | |
__DSB(); | |
__ISB(); | |
volatile uint32_t tx = read_cycle_counter(); | |
__DSB(); | |
__ISB(); | |
memset(dst->ptr, 0x00, size); | |
__DSB(); | |
__ISB(); | |
volatile uint32_t ty = read_cycle_counter(); | |
__DSB(); | |
__ISB(); | |
/* Cache maintenance - flush */ | |
dma_pre_transmit(src->ptr, size); | |
dma_pre_transmit(dst->ptr, size); | |
__DSB(); | |
__ISB(); | |
volatile uint32_t t0 = read_cycle_counter(); | |
__DSB(); | |
__ISB(); | |
dma_memcpy_start(0, dst->ptr, src->ptr, size); | |
__DSB(); | |
__ISB(); | |
volatile uint32_t t1 = read_cycle_counter(); | |
__DSB(); | |
__ISB(); | |
do { | |
dma_pre_receive(&dst->ptr[0], 32); | |
} while (dst->ptr[0] != 0x55); | |
__DSB(); | |
__ISB(); | |
volatile uint32_t t2 = read_cycle_counter(); | |
__DSB(); | |
__ISB(); | |
do { | |
dma_pre_receive(&dst->ptr[size - 32], 32); | |
} while (dst->ptr[size - 1] != 0x55); | |
__DSB(); | |
__ISB(); | |
volatile uint32_t t3 = read_cycle_counter(); | |
__DSB(); | |
__ISB(); | |
uint64_t rate_30 = ((uint64_t) size * FREQ / (t3 - t0) / 1048576); | |
uint64_t rate_32 = ((uint64_t) size * FREQ / (t3 - t2) / 1048576); | |
uint64_t rate_yx = ((uint64_t) size * FREQ / (ty - tx) / 1048576); | |
uint32_t speedup = rate_30 * 10 / rate_yx; | |
isr_printf("Copying %u bytes from %s to %s\n", size, src->name, dst->name); | |
isr_printf(" %u cycles to start DMA copy\n", t1 - t0); | |
isr_printf(" %u cycles until first byte changed\n", t2 - t0); | |
isr_printf(" %u cycles until last byte changed\n", t3 - t0); | |
isr_printf(" %llu kB/s overall\n", rate_30); | |
isr_printf(" %llu kB/s since first byte changed\n", rate_32); | |
isr_printf(" %llu kB/s for memcpy via CPU\n", rate_yx); | |
isr_printf(" %u.%ux relative to memcpy\n", speedup / 10, speedup % 10); | |
isr_printf("\n"); | |
} | |
static void run_tests(const struct ramdef *src, const struct ramdef *dst) | |
{ | |
isr_printf("\n"); | |
isr_printf("================================================================================\n"); | |
isr_printf("== %s to %s\n", src->name, dst->name); | |
isr_printf("================================================================================\n"); | |
run_test(src, dst, 64); | |
run_test(src, dst, 256); | |
run_test(src, dst, 4096); | |
run_test(src, dst, 65536); | |
run_test(src, dst, 131072); | |
isr_printf("\n"); | |
} | |
_noreturn | |
void entry_point() | |
{ | |
dma_mem_init(); | |
serial_init(); | |
serial_tx_init(); | |
run_tests(&d1_a, &d1_b); | |
run_tests(&d2_a, &d2_b); | |
run_tests(&d1, &d2); | |
run_tests(&d2, &d1); | |
while (1) ; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment