Skip to content

Instantly share code, notes, and snippets.

@vmiheer
Created December 19, 2024 16:56
Show Gist options
  • Save vmiheer/06c23a25e37e69f3de05c9d031e1512f to your computer and use it in GitHub Desktop.
Save vmiheer/06c23a25e37e69f3de05c9d031e1512f to your computer and use it in GitHub Desktop.
lapis bspmm
#!/usr/bin/env python3
import numpy as np
from pathlib import Path
import torch
import dgl.sparse as dglsp
from copy import deepcopy
from itertools import product, repeat
from more_itertools import take
import argparse
import sys
def is_interactive():
import __main__ as main
return not hasattr(main, "__file__")
def tensor_from_file(file_name: Path, dtype=np.float32, read_size=-1):
if read_size > 0:
a = open(file_name, "rb").read(read_size)
else:
a = open(file_name, "rb").read()
n = np.frombuffer(a, dtype=dtype)
return torch.from_numpy(deepcopy(n))
if is_interactive():
sys.argv = [
"a.py",
"banded_11_r2.coordinates.bin",
"11",
"4",
"2",
]
# sys.argv = [
# "a.py",
# "banded_4_r2.coordinates.bin",
# "4",
# "1",
# "1",
# "/scratch/general/vast/u1290058/LAPIS_Workspace/snl-utah/sandboxes/mseyden/graph-attention/dgl/arxiv.coordinates.bin",
# "169344",
# "1",
# "1",
# ]
if len(sys.argv) != 5:
print(f"Usage: python3 {sys.argv[0]} <dataset_name> <N> <Dh> <Nh>")
exit(1)
N, Dh, Nh = map(int, sys.argv[2:])
A = tensor_from_file(Path(sys.argv[1]), dtype=np.int64)
nnzCount = A.shape[0] // 2
info_file_name = sys.argv[1].split(".")[0] + ".coordinates.info"
Nv, _, Nnz = map(int, open(info_file_name).read().split())
print(f"Nv: {Nv}, N: {N}, Nnz: {Nnz}, nnzCount: {nnzCount}")
print(f"N: {N}, Dh: {Dh}, Nh: {Nh}")
# assert N == Nv
edgeFeatPath = sys.argv[1].split(".")[0] + ".edge.data.bin"
edgeData = tensor_from_file(edgeFeatPath, read_size=(Nnz * Nh * 4)).reshape(Nnz, Nh)
print("EdgeData[0]: ", edgeData[0])
A = dglsp.spmatrix(
A.reshape(Nnz, 2).transpose(1, 0),
# val=torch.tensor([1.0] * Nnz),
val=edgeData,
shape=(N, N),
)
featPath = sys.argv[1].split(".")[0] + ".vert.data.bin"
# Q = tensor_from_file(featPath, read_size=(N * Dh * Nh * 4)).reshape(N, Dh, Nh)
# K = tensor_from_file(featPath, read_size=(N * Dh * Nh * 4)).reshape(N, Dh, Nh)
V = tensor_from_file(featPath, read_size=(N * Dh * Nh * 4)).reshape(N, Dh, Nh)
spmm_out = dglsp.bspmm(A, V)
print(spmm_out.shape)
outPath = sys.argv[1].split(".")[0].split("/")[-1] + ".res"
out = tensor_from_file(outPath, read_size=(N * Dh * Nh * 4)).reshape(N, Dh, Nh)
# sddmm_out = dglsp.bsddmm(A, Q, K.transpose(1, 0)).softmax(dim=1)
# spmm_out = dglsp.bspmm(sddmm_out, V)
## sddmm_out = dglsp.bsddmm(A, Q, K.transpose(1, 0)).to_dense().softmax(dim=1)
## spmm_out = torch.mm(torch.squeeze(sddmm_out), torch.squeeze(V, -1))
## # print(spmm_out)
## sddmm_out = dglsp.bsddmm(A, Q, K.transpose(1, 0)).softmax(dim=1)
## # print(sddmm_out)
## spmm_out = dglsp.bspmm(sddmm_out, V)
## print(sddmm_out)
## exit(1)
## spmm_out = torch.mm(sddmm_out, V)
if not (torch.allclose(out, spmm_out)):
print(spmm_out - out)
print(spmm_out)
print(out)
exit(1)
exit(0)
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#csrv = #sparse_tensor.encoding<{ map = (d0, d1, d2) ->
(d0 : dense, d1 : compressed, d2 : dense) }>
#dense = #sparse_tensor.encoding<{ map = (d0, d1) ->
(d0 : dense, d1 : dense) }>
#densev = #sparse_tensor.encoding<{ map = (d0, d1, d2) ->
(d0 : dense, d1 : dense, d2 : dense) }>
#csr = #sparse_tensor.encoding<{ map = (d0, d1) ->
(d0 : dense, d1 : compressed) }>
#partCsr = #part_tensor.encoding<{
partConst = 1,
sparseAttributes = #csr
}>
#partDensev = #part_tensor.encoding<{
partConst = 1,
sparseAttributes = #densev
}>
#bsddmm_map = {
indexing_maps = [
affine_map<(n1, n2, dh, nh) -> (n1, dh, nh)>, // q (in)
affine_map<(n1, n2, dh, nh) -> (n2, dh, nh)>, // k (in)
affine_map<(n1, n2, dh, nh) -> (n1, n2)>, // A (in)
affine_map<(n1, n2, dh, nh) -> (n1, n2, nh)> // attn (out)
],
iterator_types = ["parallel", "parallel", "reduction", "parallel"],
doc = "attn(n1, n2, nh) = q(n1, dh, nh) * k(n2, dh, nh)"
}
#bspmm_map = {
indexing_maps = [
affine_map<(n1, n2, dh, nh) -> (n1, n2, nh)>, // attn (in)
affine_map<(n1, n2, dh, nh) -> (n2, dh, nh)>, // v (in)
affine_map<(n1, n2, dh, nh) -> (n1, dh, nh)> // out (out)
],
iterator_types = ["parallel", "parallel", "reduction", "parallel"],
doc = "out(n1, dh, nh) = attn(n1, n2, nh) * v(n2, dh, nh)"
}
module {
func.func @pte_local_bspmm(%A: tensor<?x?x?xf32, #csrv>,
%B: tensor<?x?x?xf32, #densev>) -> tensor<?x?x?xf32, #densev>
{
%c0_index = arith.constant 0 : index
%c1_index = arith.constant 1 : index
%c2_index = arith.constant 2 : index
%c3_index = arith.constant 3 : index
%c4_index = arith.constant 4 : index
%c5_index = arith.constant 5 : index
%c6_index = arith.constant 6 : index
%c9_index = arith.constant 9 : index
%c0_f32 = arith.constant 0.0 : f32
%N1 = tensor.dim %A, %c0_index : tensor<?x?x?xf32, #csrv>
%N2 = tensor.dim %A, %c1_index : tensor<?x?x?xf32, #csrv>
%nh = tensor.dim %A, %c2_index : tensor<?x?x?xf32, #csrv>
%dh = tensor.dim %B, %c1_index : tensor<?x?x?xf32, #densev>
%spmm_in0 = tensor.empty (%N1, %dh, %nh) : tensor<?x?x?xf32, #densev>
%spmm_in1 = linalg.fill ins(%c0_f32: f32)
outs(%spmm_in0 : tensor<?x?x?xf32, #densev>)
-> tensor<?x?x?xf32, #densev>
%attn4 = linalg.generic #bspmm_map
ins(%A, %B: tensor<?x?x?xf32, #csrv>, tensor<?x?x?xf32, #densev>)
outs(%spmm_in1: tensor<?x?x?xf32, #densev>) {
^bb0(%q: f32, %k: f32, %attn: f32): // no predecessors
%0 = arith.mulf %q, %k : f32
%1 = arith.addf %0, %attn: f32
linalg.yield %1 : f32
} -> tensor<?x?x?xf32, #densev>
// %attn4 = tensor.cast %attn3: tensor<?x?x?xf32, #csrv> to tensor<?x?x?xf32>
return %attn4 : tensor<?x?x?xf32, #densev>
}
}
// Local Variables:
// rmsbolt-command: "lapis-opt --sparse-compiler-kokkos='pt-backend=mpi'"
// rmsbolt-automatic-recompile: on-save
// End:
#!/usr/bin/env python3
import numpy as np
import sys
if len(sys.argv) == 3 and sys.argv[2] == "-f":
a = np.frombuffer(open(sys.argv[1], "rb").read(), dtype=np.float32)
else:
a = (
np.frombuffer(open(sys.argv[1], "rb").read(), dtype=np.int64)
.reshape(-1, 2)
.transpose(1, 0)
)
print(a)
cmake_minimum_required(VERSION 3.15)
project(dump_partitions)
include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/CPM.cmake)
CPMAddPackage("gh:fmtlib/fmt#11.0.2")
CPMAddPackage("gh:p-ranav/argparse#v3.1")
file(WRITE "${CMAKE_BINARY_DIR}/cleanupxformdata.cmake"
[=[
file(READ "${SOURCE}" TEXT)
string(REGEX REPLACE "\nmodule {\n" "\n" TEXT "${TEXT}")
string(REGEX REPLACE "\n[ ]+module attributes .*" "\n" TEXT "${TEXT}")
file(WRITE "${TARGET}" "${TEXT}")
]=])
option(USE_KOKKOS "Use Kokkos" ON)
find_package(MLIR REQUIRED)
find_package(LLVM REQUIRED)
if(USE_KOKKOS)
find_package(Kokkos REQUIRED)
file(WRITE "${CMAKE_BINARY_DIR}/makeExtern.cmake"
[=[
file(READ "${SOURCE}" TEXT)
string(REGEX REPLACE "void[*] sparse_mha.*{" "extern \"C\" \\0" TEXT "${TEXT}")
string(REGEX REPLACE "void[*] pte_[^{]*{" "extern \"C\" \\0" TEXT "${TEXT}")
file(WRITE "${TARGET}" "${TEXT}")
]=])
endif()
add_library(mmio mmio.c)
function(compile_mlir SRC_FILE)
add_custom_command(
OUTPUT
${CMAKE_CURRENT_BINARY_DIR}/${SRC_FILE}.o
COMMAND
mlir-opt --sparsifier=\"enable-runtime-library=true\"
${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FILE}.mlir --sparsifier -o
${SRC_FILE}.llvm.mlir
COMMAND
mlir-translate -mlir-to-llvmir ${SRC_FILE}.llvm.mlir -o ${SRC_FILE}.llvm
COMMAND
llc --relocation-model=pic ${SRC_FILE}.llvm -o ${SRC_FILE}.s
COMMAND
as -g -o ${SRC_FILE}.o ${SRC_FILE}.s
DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FILE}.mlir
)
add_library(${SRC_FILE} OBJECT IMPORTED)
set_target_properties(${SRC_FILE} PROPERTIES IMPORTED_OBJECTS
${CMAKE_CURRENT_BINARY_DIR}/${SRC_FILE}.o)
endfunction()
function(compile_mlir_to_kokkos SRC_FILE)
add_custom_command(
OUTPUT
${CMAKE_CURRENT_BINARY_DIR}/${SRC_FILE}.cpp
COMMAND
lapis-opt --sparse-compiler-kokkos='pt-backend=mpi parallelization-strategy=dense-any-loop'
${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FILE}.mlir -o ${SRC_FILE}.scf.mlir
COMMAND
lapis-translate -sparse-mlir-to-kokkos ${SRC_FILE}.scf.mlir -o
${SRC_FILE}.cpp.temp
COMMAND
# cp ${SRC_FILE}.cpp.temp ${SRC_FILE}.cpp
${CMAKE_COMMAND} -DSOURCE=${SRC_FILE}.cpp.temp -DTARGET=${SRC_FILE}.cpp
-P makeExtern.cmake
DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FILE}.mlir
)
add_library(${SRC_FILE} ${CMAKE_CURRENT_BINARY_DIR}/${SRC_FILE}.cpp)
target_compile_definitions(${SRC_FILE} PUBLIC USE_KOKKOS=1)
# target_compile_options(${SRC_FILE} PRIVATE -g -ggdb)
target_link_libraries(${SRC_FILE} PRIVATE Kokkos::kokkos fmt::fmt)
endfunction()
set(MLIR_INPUT_NAME "bspmm")
if (USE_KOKKOS)
compile_mlir_to_kokkos(${MLIR_INPUT_NAME})
else()
compile_mlir(${MLIR_INPUT_NAME})
endif()
add_executable(dump_partitions.part_tensor wrapper.cpp)
target_include_directories(dump_partitions.part_tensor
PRIVATE
${MLIR_INCLUDE_DIRS}
${LLVM_INCLUDE_DIRS}
)
target_link_libraries(dump_partitions.part_tensor
PRIVATE
${MLIR_INPUT_NAME}
MLIRExecutionEngine
MLIRSparseTensorUtils
MLIRSparseTensorRuntime
mlir_c_runner_utils
mlir_runner_utils
argparse
mmio
fmt::fmt
)
target_compile_options(dump_partitions.part_tensor PUBLIC -fno-rtti)
set_target_properties(dump_partitions.part_tensor PROPERTIES
CXX_STANDARD 23
CXX_EXTENSIONS OFF
)
{
"version": 6,
"cmakeMinimumRequired": {
"major": 3,
"minor": 23,
"patch": 0
},
"configurePresets": [
{
"name": "default-kokkos",
"displayName": "default",
"description": "Default local preset",
"generator": "Ninja",
"binaryDir": "${sourceDir}/localbuild",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_BUILD_TYPE": "Release",
"USE_KOKKOS": "ON"
}
},
{
"name": "default-llvm",
"displayName": "default",
"description": "Default preset",
"generator": "Ninja",
"binaryDir": "${sourceDir}/build",
"cacheVariables": {
"CMAKE_C_COMPILER": "/usr/bin/gcc-13",
"CMAKE_CXX_COMPILER": "/usr/bin/g++-13",
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_BUILD_TYPE": "Debug",
"USE_KOKKOS": "OFF"
}
}
],
"buildPresets": [
{
"name": "default-llvm",
"displayName": "Default",
"description": "Default preset",
"configurePreset": "default-llvm"
},
{
"name": "default",
"displayName": "Default",
"description": "Default preset",
"configurePreset": "default-kokkos"
},
{
"name": "notchpeak",
"displayName": "notchpeak",
"description": "Default preset for notchpeak",
"configurePreset": "default-kokkos"
}
],
"workflowPresets": [
{
"name": "default-kokkos",
"description": "default",
"steps": [
{
"type": "configure",
"name": "default-kokkos"
},
{
"type": "build",
"name": "default"
}
]
},
{
"name": "default-llvm",
"description": "default",
"steps": [
{
"type": "configure",
"name": "default-llvm"
},
{
"type": "build",
"name": "default-llvm"
}
]
},
{
"name": "notchpeak",
"description": "default workflow for notchpeak",
"steps": [
{
"type": "configure",
"name": "default-kokkos"
},
{
"type": "build",
"name": "notchpeak"
}
]
}
]
}
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "mmio.h"
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_)
{
FILE *f;
MM_typecode matcode;
int M, N, nz;
int i;
double *val;
int *I, *J;
if ((f = fopen(fname, "r")) == NULL)
return -1;
if (mm_read_banner(f, &matcode) != 0)
{
printf("mm_read_unsymetric: Could not process Matrix Market banner ");
printf(" in file [%s]\n", fname);
return -1;
}
if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
mm_is_sparse(matcode)))
{
fprintf(stderr, "Sorry, this application does not support ");
fprintf(stderr, "Market Market type: [%s]\n",
mm_typecode_to_str(matcode));
return -1;
}
/* find out size of sparse matrix: M, N, nz .... */
if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
{
fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
return -1;
}
*M_ = M;
*N_ = N;
*nz_ = nz;
/* reseve memory for matrices */
I = (int *) malloc(nz * sizeof(int));
J = (int *) malloc(nz * sizeof(int));
val = (double *) malloc(nz * sizeof(double));
*val_ = val;
*I_ = I;
*J_ = J;
/* NOTE: when reading in doubles, ANSI C requires the use of the "l" */
/* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
/* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */
for (i=0; i<nz; i++)
{
fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
I[i]--; /* adjust from 1-based to 0-based */
J[i]--;
}
fclose(f);
return 0;
}
int mm_is_valid(MM_typecode matcode)
{
if (!mm_is_matrix(matcode)) return 0;
if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) ||
mm_is_skew(matcode))) return 0;
return 1;
}
int mm_read_banner(FILE *f, MM_typecode *matcode)
{
char line[MM_MAX_LINE_LENGTH];
char banner[MM_MAX_TOKEN_LENGTH];
char mtx[MM_MAX_TOKEN_LENGTH];
char crd[MM_MAX_TOKEN_LENGTH];
char data_type[MM_MAX_TOKEN_LENGTH];
char storage_scheme[MM_MAX_TOKEN_LENGTH];
char *p;
mm_clear_typecode(matcode);
if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
return MM_PREMATURE_EOF;
if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
storage_scheme) != 5)
return MM_PREMATURE_EOF;
for (p=mtx; *p!='\0'; *p=tolower(*p),p++); /* convert to lower case */
for (p=crd; *p!='\0'; *p=tolower(*p),p++);
for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
/* check for banner */
if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
return MM_NO_HEADER;
/* first field should be "mtx" */
if (strcmp(mtx, MM_MTX_STR) != 0)
return MM_UNSUPPORTED_TYPE;
mm_set_matrix(matcode);
/* second field describes whether this is a sparse matrix (in coordinate
storgae) or a dense array */
if (strcmp(crd, MM_SPARSE_STR) == 0)
mm_set_sparse(matcode);
else
if (strcmp(crd, MM_DENSE_STR) == 0)
mm_set_dense(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* third field */
if (strcmp(data_type, MM_REAL_STR) == 0)
mm_set_real(matcode);
else
if (strcmp(data_type, MM_COMPLEX_STR) == 0)
mm_set_complex(matcode);
else
if (strcmp(data_type, MM_PATTERN_STR) == 0)
mm_set_pattern(matcode);
else
if (strcmp(data_type, MM_INT_STR) == 0)
mm_set_integer(matcode);
else
return MM_UNSUPPORTED_TYPE;
/* fourth field */
if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
mm_set_general(matcode);
else
if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
mm_set_symmetric(matcode);
else
if (strcmp(storage_scheme, MM_HERM_STR) == 0)
mm_set_hermitian(matcode);
else
if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
mm_set_skew(matcode);
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
{
if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = *nz = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M,N, nz */
if (sscanf(line, "%d %d %d", M, N, nz) == 3)
return 0;
else
do
{
num_items_read = fscanf(f, "%d %d %d", M, N, nz);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 3);
return 0;
}
int mm_read_mtx_array_size(FILE *f, int *M, int *N)
{
char line[MM_MAX_LINE_LENGTH];
int num_items_read;
/* set return null parameter values, in case we exit with errors */
*M = *N = 0;
/* now continue scanning until you reach the end-of-comments */
do
{
if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL)
return MM_PREMATURE_EOF;
}while (line[0] == '%');
/* line[] is either blank or has M,N, nz */
if (sscanf(line, "%d %d", M, N) == 2)
return 0;
else /* we have a blank line */
do
{
num_items_read = fscanf(f, "%d %d", M, N);
if (num_items_read == EOF) return MM_PREMATURE_EOF;
}
while (num_items_read != 2);
return 0;
}
int mm_write_mtx_array_size(FILE *f, int M, int N)
{
if (fprintf(f, "%d %d\n", M, N) != 2)
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
/*-------------------------------------------------------------------------*/
/******************************************************************/
/* use when I[], J[], and val[]J, and val[] are already allocated */
/******************************************************************/
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
int i;
if (mm_is_complex(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
for (i=0; i<nz; i++)
{
if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
!= 3) return MM_PREMATURE_EOF;
}
}
else if (mm_is_pattern(matcode))
{
for (i=0; i<nz; i++)
if (fscanf(f, "%d %d", &I[i], &J[i])
!= 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
double *real, double *imag, MM_typecode matcode)
{
if (mm_is_complex(matcode))
{
if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
!= 4) return MM_PREMATURE_EOF;
}
else if (mm_is_real(matcode))
{
if (fscanf(f, "%d %d %lg\n", I, J, real)
!= 3) return MM_PREMATURE_EOF;
}
else if (mm_is_pattern(matcode))
{
if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
}
else
return MM_UNSUPPORTED_TYPE;
return 0;
}
/************************************************************************
mm_read_mtx_crd() fills M, N, nz, array of values, and return
type code, e.g. 'MCRS'
if matrix is complex, values[] is of size 2*nz,
(nz pairs of real/imaginary values)
************************************************************************/
int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
double **val, MM_typecode *matcode)
{
int ret_code;
FILE *f;
if (strcmp(fname, "stdin") == 0) f=stdin;
else
if ((f = fopen(fname, "r")) == NULL)
return MM_COULD_NOT_READ_FILE;
if ((ret_code = mm_read_banner(f, matcode)) != 0)
return ret_code;
if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
mm_is_matrix(*matcode)))
return MM_UNSUPPORTED_TYPE;
if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
return ret_code;
*I = (int *) malloc(*nz * sizeof(int));
*J = (int *) malloc(*nz * sizeof(int));
*val = NULL;
if (mm_is_complex(*matcode))
{
*val = (double *) malloc(*nz * 2 * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_real(*matcode))
{
*val = (double *) malloc(*nz * sizeof(double));
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
else if (mm_is_pattern(*matcode))
{
ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val,
*matcode);
if (ret_code != 0) return ret_code;
}
if (f != stdin) fclose(f);
return 0;
}
int mm_write_banner(FILE *f, MM_typecode matcode)
{
char *str = mm_typecode_to_str(matcode);
int ret_code;
ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
free(str);
if (ret_code !=2 )
return MM_COULD_NOT_WRITE_FILE;
else
return 0;
}
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode)
{
FILE *f;
int i;
if (strcmp(fname, "stdout") == 0)
f = stdout;
else
if ((f = fopen(fname, "w")) == NULL)
return MM_COULD_NOT_WRITE_FILE;
/* print banner followed by typecode */
fprintf(f, "%s ", MatrixMarketBanner);
fprintf(f, "%s\n", mm_typecode_to_str(matcode));
/* print matrix sizes and nonzeros */
fprintf(f, "%d %d %d\n", M, N, nz);
/* print values */
if (mm_is_pattern(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d\n", I[i], J[i]);
else
if (mm_is_real(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
else
if (mm_is_complex(matcode))
for (i=0; i<nz; i++)
fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i],
val[2*i+1]);
else
{
if (f != stdout) fclose(f);
return MM_UNSUPPORTED_TYPE;
}
if (f !=stdout) fclose(f);
return 0;
}
/**
* Create a new copy of a string s. mm_strdup() is a common routine, but
* not part of ANSI C, so it is included here. Used by mm_typecode_to_str().
*
*/
char *mm_strdup(const char *s)
{
int len = strlen(s);
char *s2 = (char *) malloc((len+1)*sizeof(char));
return strcpy(s2, s);
}
char *mm_typecode_to_str(MM_typecode matcode)
{
char buffer[MM_MAX_LINE_LENGTH];
char *types[4];
char *mm_strdup(const char *);
int error =0;
/* check for MTX type */
if (mm_is_matrix(matcode))
types[0] = MM_MTX_STR;
else
error=1;
/* check for CRD or ARR matrix */
if (mm_is_sparse(matcode))
types[1] = MM_SPARSE_STR;
else
if (mm_is_dense(matcode))
types[1] = MM_DENSE_STR;
else
return NULL;
/* check for element data type */
if (mm_is_real(matcode))
types[2] = MM_REAL_STR;
else
if (mm_is_complex(matcode))
types[2] = MM_COMPLEX_STR;
else
if (mm_is_pattern(matcode))
types[2] = MM_PATTERN_STR;
else
if (mm_is_integer(matcode))
types[2] = MM_INT_STR;
else
return NULL;
/* check for symmetry type */
if (mm_is_general(matcode))
types[3] = MM_GENERAL_STR;
else
if (mm_is_symmetric(matcode))
types[3] = MM_SYMM_STR;
else
if (mm_is_hermitian(matcode))
types[3] = MM_HERM_STR;
else
if (mm_is_skew(matcode))
types[3] = MM_SKEW_STR;
else
return NULL;
sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
return mm_strdup(buffer);
}
/*
* Matrix Market I/O library for ANSI C
*
* See http://math.nist.gov/MatrixMarket for details.
*
*
*/
#ifndef MM_IO_H
#define MM_IO_H
#define MM_MAX_LINE_LENGTH 1025
#define MatrixMarketBanner "%%MatrixMarket"
#define MM_MAX_TOKEN_LENGTH 64
typedef char MM_typecode[4];
char *mm_typecode_to_str(MM_typecode matcode);
int mm_read_banner(FILE *f, MM_typecode *matcode);
int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz);
int mm_read_mtx_array_size(FILE *f, int *M, int *N);
int mm_write_banner(FILE *f, MM_typecode matcode);
int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
int mm_write_mtx_array_size(FILE *f, int M, int N);
/********************* MM_typecode query fucntions ***************************/
#define mm_is_matrix(typecode) ((typecode)[0]=='M')
#define mm_is_sparse(typecode) ((typecode)[1]=='C')
#define mm_is_coordinate(typecode)((typecode)[1]=='C')
#define mm_is_dense(typecode) ((typecode)[1]=='A')
#define mm_is_array(typecode) ((typecode)[1]=='A')
#define mm_is_complex(typecode) ((typecode)[2]=='C')
#define mm_is_real(typecode) ((typecode)[2]=='R')
#define mm_is_pattern(typecode) ((typecode)[2]=='P')
#define mm_is_integer(typecode) ((typecode)[2]=='I')
#define mm_is_symmetric(typecode)((typecode)[3]=='S')
#define mm_is_general(typecode) ((typecode)[3]=='G')
#define mm_is_skew(typecode) ((typecode)[3]=='K')
#define mm_is_hermitian(typecode)((typecode)[3]=='H')
int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
/********************* MM_typecode modify fucntions ***************************/
#define mm_set_matrix(typecode) ((*typecode)[0]='M')
#define mm_set_coordinate(typecode) ((*typecode)[1]='C')
#define mm_set_array(typecode) ((*typecode)[1]='A')
#define mm_set_dense(typecode) mm_set_array(typecode)
#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
#define mm_set_complex(typecode)((*typecode)[2]='C')
#define mm_set_real(typecode) ((*typecode)[2]='R')
#define mm_set_pattern(typecode)((*typecode)[2]='P')
#define mm_set_integer(typecode)((*typecode)[2]='I')
#define mm_set_symmetric(typecode)((*typecode)[3]='S')
#define mm_set_general(typecode)((*typecode)[3]='G')
#define mm_set_skew(typecode) ((*typecode)[3]='K')
#define mm_set_hermitian(typecode)((*typecode)[3]='H')
#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
(*typecode)[2]=' ',(*typecode)[3]='G')
#define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
/********************* Matrix Market error codes ***************************/
#define MM_COULD_NOT_READ_FILE 11
#define MM_PREMATURE_EOF 12
#define MM_NOT_MTX 13
#define MM_NO_HEADER 14
#define MM_UNSUPPORTED_TYPE 15
#define MM_LINE_TOO_LONG 16
#define MM_COULD_NOT_WRITE_FILE 17
/******************** Matrix Market internal definitions ********************
MM_matrix_typecode: 4-character sequence
ojbect sparse/ data storage
dense type scheme
string position: [0] [1] [2] [3]
Matrix typecode: M(atrix) C(oord) R(eal) G(eneral)
A(array) C(omplex) H(ermitian)
P(attern) S(ymmetric)
I(nteger) K(kew)
***********************************************************************/
#define MM_MTX_STR "matrix"
#define MM_ARRAY_STR "array"
#define MM_DENSE_STR "array"
#define MM_COORDINATE_STR "coordinate"
#define MM_SPARSE_STR "coordinate"
#define MM_COMPLEX_STR "complex"
#define MM_REAL_STR "real"
#define MM_INT_STR "integer"
#define MM_GENERAL_STR "general"
#define MM_SYMM_STR "symmetric"
#define MM_HERM_STR "hermitian"
#define MM_SKEW_STR "skew-symmetric"
#define MM_PATTERN_STR "pattern"
/* high level routines */
int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
double val[], MM_typecode matcode);
int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
MM_typecode matcode);
int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
double **val_, int **I_, int **J_);
#endif
#include <algorithm>
#include <cassert>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <ios>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
#include <span>
#include <string>
#include <vector>
extern "C" {
#include "mmio.h"
}
#include <argparse/argparse.hpp>
#include <fmt/format.h>
#include <fmt/ranges.h>
#include "mlir/ExecutionEngine/SparseTensor/COO.h"
#include "mlir/ExecutionEngine/SparseTensor/Storage.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/FormatVariadic.h"
#if DISTRIBUTED
#include "parttensor_mpi_backend/Storage.h"
#include "parttensor_mpi_backend/parttensor_mpi_backend.h"
#include <mpi.h>
template <class... T>
using PartTensorStorage = mlir::parttensor_mpi::MPITensorStorage<T...>;
using mlir::parttensor_mpi::Product;
using mlir::parttensor_mpi::SplitLoHiPoint;
using mlir::parttensor_mpi::SubtractPoints;
#endif // DISTRIBUTED
using namespace mlir::sparse_tensor;
using std::begin;
using std::end;
using std::make_unique;
using std::span;
using std::unique_ptr;
using std::vector;
using index_t = uint64_t;
void getRss() {
std::ifstream file("/proc/self/status");
std::string line;
#if DISTRIBUTED
const auto rank = _mlir_ciface_mpi_getRank();
fmt::print("{}: ", rank);
#endif
while (std::getline(file, line)) {
if (line.starts_with("VmRSS:")) {
line = line.substr(7);
std::istringstream iss(line);
long rss{};
iss >> rss;
fmt::print("{} GB\n", double(rss) / pow(2, 20));
std::string rest;
iss >> rest;
assert(rest == "kB");
break;
}
}
}
bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON,
float abs_th = FLT_MIN)
// those defaults are arbitrary and could be removed
{
assert(std::numeric_limits<float>::epsilon() <= epsilon);
assert(epsilon < 1.f);
if (a == b)
return true;
auto diff = std::abs(a - b);
auto norm =
std::min((std::abs(a) + std::abs(b)), std::numeric_limits<float>::max());
// or even faster: std::min(std::abs(a + b),
// std::numeric_limits<float>::max()); keeping this commented out until I
// update figures below
return diff < std::max(abs_th, epsilon * norm);
}
// copied from: https://en.cppreference.com/w/cpp/types/numeric_limits/epsilon
template <class T>
std::enable_if_t<not std::numeric_limits<T>::is_integer, bool>
equal_within_ulps(T x, T y, std::size_t n) {
// Since `epsilon()` is the gap size (ULP, unit in the last place)
// of floating-point numbers in interval [1, 2), we can scale it to
// the gap size in interval [2^e, 2^{e+1}), where `e` is the exponent
// of `x` and `y`.
// If `x` and `y` have different gap sizes (which means they have
// different exponents), we take the smaller one. Taking the bigger
// one is also reasonable, I guess.
const T m = std::min(std::fabs(x), std::fabs(y));
// Subnormal numbers have fixed exponent, which is `min_exponent - 1`.
const int exp = m < std::numeric_limits<T>::min()
? std::numeric_limits<T>::min_exponent - 1
: std::ilogb(m);
// We consider `x` and `y` equal if the difference between them is
// within `n` ULPs.
return std::abs(x - y) <=
n * std::ldexp(std::numeric_limits<T>::epsilon(), exp);
}
using index_t = uint64_t;
template <class T, std::size_t dims = 1> struct memref_t {
uint64_t deadbeef;
T *dataptr;
uint64_t offset;
uint64_t sizes[dims];
uint64_t strides[dims];
};
using memref_1d_i64 = memref_t<uint64_t, 1>;
using memref_2d_f32 = memref_t<float, 2>;
using memref_3d_f32 = memref_t<float, 3>;
template <class T> T multiply(span<T> sizes) {
return std::accumulate(begin(sizes), end(sizes), size_t(1),
std::multiplies<T>());
}
template <class T, size_t dims> void print_memref(memref_t<T, dims> out) {
fmt::print("sizes: {}\n", out.sizes);
fmt::print("strides: {}\n", out.strides);
fmt::print("values: {}\n",
span(out.dataptr, multiply(span(out.sizes, dims))));
}
extern "C" {
extern void *part_tensor_softmax(void *A, void *out_file_name);
extern void printSeperator();
extern uint64_t _mlir_ciface_mpi_getRank();
extern void delSparseTensor(void *tensor);
}
void printSeperator() {
printf("---------------------------------------------------------------------"
"-----------\n");
}
auto init_bin_matrix_sizes(const std::string &infile, unsigned &M, unsigned &N,
unsigned &nnz) {
auto base_name = infile.substr(0, infile.find_last_of('.'));
auto info_file = base_name + ".info";
auto file = fopen(info_file.c_str(), "r");
if (!file) {
std::cout << "Not valid file\n";
exit(1);
}
int M_, N_, nnz_;
auto nfieldsRead = fscanf(file, "%d %d %d", &M_, &N_, &nnz_);
M = M_;
N = N_;
nnz = nnz_;
// fmt::print("M: {} N: {} nnz: {}\n", M, N, nnz);
if (nfieldsRead != 3) {
std::cout << "Could not read mm size\n";
exit(1);
}
fclose(file);
}
// std::tuple<int, int, std::unique_ptr<SparseTensorCOO<float>>>
// init_matrix_a(std::string infile) {
template <class FilterTy>
auto init_bin_matrix_a(const std::string &infile, unsigned &M, unsigned &N,
unsigned &nnz, unsigned Nf, FilterTy &&f,
bool overrideValues = false) {
std::mt19937 gen;
gen.seed(0);
std::uniform_real_distribution<> dis(0.0, 1.0);
init_bin_matrix_sizes(infile, M, N, nnz);
auto file = fopen(infile.c_str(), "rb");
std::ifstream edgeData(
(infile.substr(0, infile.find_first_of('.')) + ".edge.data.bin").c_str(),
std::ios::binary);
auto stCooA = std::make_unique<SparseTensorCOO<float>>(
std::vector<index_t>({index_t(M), index_t(N), index_t(Nf)}));
std::vector<index_t> coords(3);
auto statusUpdate = 0;
std::vector<float> vals(Nf);
for (auto i : llvm::seq(0u, nnz)) {
if (statusUpdate++ == 10000000) {
fmt::print(".");
statusUpdate = 0;
}
// The coordinates are 2d but tensor is 3d
auto numRead = fread(coords.data(), sizeof(decltype(coords)::value_type),
std::size(coords) - 1, file);
assert(numRead == std::size(coords) && "Read error");
const auto numBytesToRead = sizeof(decltype(vals)::value_type) * Nf;
if (f(coords[0])) {
edgeData.read(reinterpret_cast<char *>(vals.data()), numBytesToRead);
for (auto j : llvm::seq(0u, Nf)) {
coords.back() = j;
stCooA->add(coords, vals[j]);
}
} else {
edgeData.seekg(numBytesToRead, std::ios_base::cur);
}
}
fmt::print("\n");
assert(!(ferror(file) || feof(file)) && "Input file too short");
auto numRead = fread(coords.data(), sizeof(decltype(coords)::value_type),
std::size(coords), file);
assert(numRead == 0 && "Input file too long");
fclose(file);
return stCooA;
}
auto init_matrix_a(const std::string &infile, unsigned &M, unsigned &N,
unsigned &nnz, bool overrideValues = false) {
std::mt19937 gen;
gen.seed(0);
std::uniform_real_distribution<> dis(0.0, 1.0);
MM_typecode mc = {};
auto file = fopen(infile.c_str(), "r");
auto retCode = mm_read_banner(file, (MM_typecode *)&mc);
if (retCode != 0) {
std::cout << "Not valid file\n";
exit(1);
}
int M_, N_, nnz_;
retCode = mm_read_mtx_crd_size(file, &M_, &N_, &nnz_);
M = M_;
N = N_;
nnz = nnz_;
if (retCode != 0) {
std::cout << "Could not read mm size\n";
exit(1);
}
auto stCooA = std::make_unique<SparseTensorCOO<float>>(
std::vector<index_t>({index_t(M), index_t(N)}));
const bool IsSymmetric = mm_is_symmetric(mc);
const bool IsPattern = mm_is_pattern(mc);
if (IsPattern)
overrideValues = true;
for (auto i : llvm::seq(0, nnz_)) {
index_t r, c;
float v;
if (IsPattern)
fscanf(file, "%ld %ld", &r, &c);
else
fscanf(file, "%ld %ld %f", &r, &c, &v);
v = overrideValues ? dis(gen) : v;
r--;
c--; // mtx is 1 based
stCooA->add({r, c}, v);
if (IsSymmetric && r != c) {
stCooA->add({c, r}, v);
}
}
return stCooA;
}
template <class T>
void init_feats(SparseTensorStorage<index_t, index_t, T> *tensor,
const std::string &infile, unsigned N, unsigned Dh, unsigned Nh,
unsigned beginOffset = 0, bool ValidateFileSizes = false,
bool overrideValues = false) {
std::mt19937 gen;
gen.seed(0);
std::uniform_real_distribution<> dis(0.0, 1.0);
std::ifstream file(
(infile.substr(0, infile.find_first_of('.')) + ".vert.data.bin").c_str(),
std::ios::binary);
if (!file) {
std::cout << "Not valid file\n";
exit(1);
}
std::streampos fileBegin =
file.tellg(); // Get the current position (which is the file size)
file.seekg(0, std::ios::end); // Seek to the end of the file
size_t fileSize = file.tellg() - fileBegin;
file.seekg(beginOffset, std::ios::beg); // Seek back to the base offset
assert(!ValidateFileSizes ||
fileSize == N * Dh * Nh * sizeof(T) && "File size mismatch");
std::vector<T> *vals;
tensor->getValues(&vals);
file.read(reinterpret_cast<char *>(vals->data()), sizeof(T) * vals->size());
// ValidateFileSizes -> file.good()
assert((!ValidateFileSizes || file.good()) && "File read error");
return;
}
template <class T>
auto init_feats(const std::string &infile, unsigned &N, unsigned Dh,
unsigned Nh, unsigned beginOffset = 0,
bool overrideValues = false) {
std::mt19937 gen;
gen.seed(0);
std::uniform_real_distribution<> dis(0.0, 1.0);
std::ifstream file(infile, std::ios::binary);
if (!file) {
std::cout << "Not valid file\n";
exit(1);
}
std::streampos fileBegin =
file.tellg(); // Get the current position (which is the file size)
file.seekg(0, std::ios::end); // Seek to the end of the file
size_t fileSize = file.tellg() - fileBegin;
file.seekg(beginOffset, std::ios::beg); // Seek back to the base offset
assert(fileSize == N * Dh * Nh * sizeof(T) && "File size mismatch");
auto stCooA = std::make_unique<SparseTensorCOO<T>>(
std::vector<index_t>({index_t(N), index_t(Dh), index_t(Nh)}));
for (auto n : llvm::seq(0u, N))
for (auto dh : llvm::seq(0u, Dh))
for (auto nh : llvm::seq(0u, Nh)) {
T val;
file.read(reinterpret_cast<char *>(&val), sizeof(T));
val = overrideValues ? dis(gen) : val;
stCooA->add({n, dh, nh}, val);
assert(file.good() && "File read error");
}
return stCooA;
}
std::unique_ptr<SparseTensorCOO<float>> init_matrix_a(index_t rowSize = 4) {
assert(0 && "Need to init with multiple features!");
auto dims = std::vector<size_t>{rowSize, rowSize};
auto stCooA = std::make_unique<SparseTensorCOO<float>>(dims);
for (auto i : llvm::seq(0ul, rowSize))
for (auto j : llvm::seq(0ul, rowSize))
stCooA->add({i, j}, float(i * rowSize + j));
return stCooA;
}
template <class T>
auto init_random(SparseTensorCOO<T> &vec, size_t N, size_t seed = 0) {
std::mt19937 gen;
gen.seed(seed);
std::uniform_real_distribution<> dis(T(0), T(100));
auto vals = vec.getElements();
for (auto i : llvm::seq(0ul, N))
(vals.at(i)).value = (dis(gen));
}
template <class T>
auto init_random(std::vector<T> &vec, size_t N, size_t seed = 0) {
std::mt19937 gen;
gen.seed(seed);
std::uniform_real_distribution<> dis(T(0), T(100));
for (auto i : llvm::seq(0ul, N))
vec.push_back(dis(gen));
}
template <> auto init_random(std::vector<uint8_t> &vec, size_t N, size_t seed) {
std::mt19937 gen;
gen.seed(seed);
std::bernoulli_distribution dis(.5);
for (auto i : llvm::seq(0ul, N))
vec.push_back(dis(gen));
}
/// Initialize a 2D sparse square tensor with random values
/// @param vec: SparseTensorCOO<float> to be initialized
/// @param N: rows and columns of the 2D tensor
/// @param seed: seed for random number generator
/// @param density: density of the tensor
template <class T>
auto init_random_sparse_2d(SparseTensorCOO<float> &stCoo, size_t N, size_t seed,
double density = 0.5) {
std::mt19937 gen;
gen.seed(seed);
std::bernoulli_distribution dis(density);
for (auto i : llvm::seq(0ul, N * N))
if (dis(gen))
stCoo.add({i / N, i % N}, T{1});
}
template <class T> void write_tensor(span<T> vec, std::string filename) {
std::ofstream(filename, std::ios::binary)
.write(reinterpret_cast<char *>(vec.data()), sizeof(T) * vec.size());
}
template <class T>
void write_vector_to_stcoo3d(const std::vector<T> &vec,
SparseTensorCOO<T> &stCoo) {
auto dims = stCoo.getDimSizes();
auto [N, Dh, Nh] = std::tuple(dims[0], dims[1], dims[2]);
llvm::for_each(llvm::enumerate(vec), [&stCoo, N, Dh, Nh](auto pair) {
auto [index, val] = pair;
auto nh = index % Nh;
auto rest = (index - nh) / Nh;
auto dh = rest % Dh;
auto n = (rest - dh) / Dh;
stCoo.add({n, dh, nh}, val);
});
}
#if DISTRIBUTED
auto get2DPartition(size_t rows, size_t cols, size_t nh, size_t rowParts,
size_t colParts) {
std::vector<index_t> partitionPlan;
auto rowPartitionSize = rows / rowParts;
auto colPartitionSize = cols / colParts;
assert(rows % rowParts == 0 && "rows % rowParts != 0");
assert(cols % colParts == 0 && "cols % colParts != 0");
for (int j = 0; j < cols; j += colPartitionSize) {
for (int i = 0; i < rows; i += rowPartitionSize) {
if (false && _mlir_ciface_mpi_getRank() == 0)
std::cout << "(" << i << "," << j << ") -> (" << i + rowPartitionSize
<< "," << j + colPartitionSize << ") \n";
partitionPlan.push_back(index_t(i));
partitionPlan.push_back(index_t(j));
if (nh)
partitionPlan.push_back(index_t(0));
partitionPlan.push_back(index_t(i + rowPartitionSize));
partitionPlan.push_back(index_t(j + colPartitionSize));
if (nh)
partitionPlan.push_back(index_t(nh));
}
}
return partitionPlan;
}
#endif
#define VEC_TO_MEMREF2D(v, size_0, size_1) \
(void *)0xdeadbeef, v.data(), 0, size_0, size_1, size_1, 1
#define VEC_TO_MEMREF3D(v, size_0, size_1, size_2) \
(void *)0xdeadbeef, v.data(), 0, size_0, size_1, size_2, size_2 *size_1, \
size_2, 1
#define TENSOR_2D_ARG(t) \
void *t##_deadbeef, void *t##_dataptr, uint64_t t##_offset, \
uint64_t t##_sizes_0, uint64_t t##_sizes_1, uint64_t t##_strides_0, \
uint64_t t##_strides_1
#define TENSOR_3D_ARG(t) \
void *t##_deadbeef, void *t##_dataptr, uint64_t t##_offset, \
uint64_t t##_sizes_0, uint64_t t##_sizes_1, uint64_t t##_sizes_2, \
uint64_t t##_strides_0, uint64_t t##_strides_1, uint64_t t##_strides_2
extern "C" void *sparse_mha(void *A, void *Q, void *K, void *V, void *);
extern "C" void *pte_local_bsddmm(void *A, void *Q, void *K);
extern "C" void *pte_local_bspmm(void *A, void *V);
extern "C" void *pte_bsddmm(void *A, void *Q, void *K, index_t n1, index_t n2,
index_t dh, index_t nh);
extern "C" void *pte_local_sparse_mha(void *A, void *Q, void *K, void *V);
extern "C" void *pte_sparse_mha(void *A, void *Q, void *K, void *V, index_t n1,
index_t n2, index_t dh, index_t nh);
extern "C" void lapis_initialize();
extern "C" void lapis_finalize();
void validate_cli(const argparse::ArgumentParser &p) {
if (p.is_used("-n") && p.is_used("-i")) {
fmt::print("Only one of -n OR -i can be specified\n");
exit(1);
}
if (p.is_used("-d") && p.is_used("-i")) {
fmt::print("Only one of -d OR -i can be specified\n");
exit(1);
}
if (!p.is_used("--check") && !p.is_used("--ntimes")) {
fmt::print("Atleast --check or --ntimes should be specified\n");
exit(1);
}
}
int main(int argc, char **argv) {
using namespace mlir::sparse_tensor;
#if DISTRIBUTED
MPI_Init(&argc, &argv);
#endif
#if defined(USE_KOKKOS)
lapis_initialize();
#endif
unsigned N = 4, Nh = 2, Dh = 2, Nnz{}, nTimes = 1;
unsigned Nparts = 1;
unsigned logRank = 99;
// bool PerfOnly = false, LocalOnly = false, CheckCorrectness = false;
bool LocalOnly = false, DistOnly = false, CheckCorrectness = false,
ValidateFileSizes = false;
double density = 0.5;
std::string infile;
argparse::ArgumentParser program(argv[0]);
program.add_argument("-dh").store_into(Dh).help("head size, default = 2");
program.add_argument("-nh").store_into(Nh).help("#head, default = 2");
program.add_argument("-n").store_into(N).help("#nodes, default = 4");
program.add_argument("-i").store_into(infile).help("Input File");
program.add_argument("--ntimes").store_into(nTimes).help("rerun n times = 1");
program.add_argument("-d", "--density")
.store_into(density)
.help("sparsity density, default = 0.5");
program.add_argument("-np", "--nparts")
.store_into(Nparts)
.help("number of parts, default = 1");
program.add_argument("--logrank")
.store_into(logRank)
.help("log when rank == logrank, default = 99");
program.add_argument("--check")
.store_into(CheckCorrectness)
.help("Do correctness check, default = false")
.flag();
program.add_argument("--validate-file-sizes")
.store_into(ValidateFileSizes)
.help("Make sure Nh and Dh are compatible with the input file sizes, "
"default = false")
.flag();
program.add_argument("--dist-only")
.store_into(DistOnly)
.help("Skip local run, default = false")
.flag();
program.add_argument("--local-only")
.store_into(LocalOnly)
.help("Skip distributed run, default = false")
.flag();
try {
program.parse_args(argc, argv);
} catch (const std::exception &err) {
std::cerr << err.what() << std::endl;
std::cerr << program;
return 1;
}
validate_cli(program);
decltype(init_matrix_a(4)) stCooA{}, stCooQ{};
#if DISTRIBUTED
const auto rank = _mlir_ciface_mpi_getRank();
#endif // DISTRIBUTED
if (!infile.empty()) {
decltype(N) M;
if (DistOnly) {
#if DISTRIBUTED
if (!infile.ends_with(".bin")) {
fmt::print("Only binary input supported for distributed run\n");
exit(1);
}
init_bin_matrix_sizes(infile, M, N, Nnz);
auto aPartitionPlan = get2DPartition(N, N, 0, Nparts, 1);
const std::vector<size_t> dims = {N, N};
auto myaPartSpec =
std::span(aPartitionPlan)
.subspan(rank * std::size(dims) * 2, std::size(dims) * 2);
auto [aaLo, aaHi] = SplitLoHiPoint(
llvm::ArrayRef(myaPartSpec.data(), std::size(myaPartSpec)));
// fmt::println("Rank: {} aaHi: {} aaLo: {}", rank, aaHi, aaLo);
stCooA = init_bin_matrix_a(infile, M, N, Nnz, Dh * Nh, [=](auto n1) {
return (aaLo[0] <= n1 && n1 < aaHi[0]);
});
#endif // DISTRIBUTED
} else {
stCooA = infile.ends_with(".bin")
? init_bin_matrix_a(infile, M, N, Nnz, Nh,
[](auto) { return true; })
: init_matrix_a(infile, M, N, Nnz);
}
assert(M == N);
} else {
assert(0);
}
fmt::println(">>> Loaded SparseMat");
fflush(nullptr);
system("date");
assert(N % Nparts == 0 && "Problem size (n) should divisible by nparts");
const size_t TensorSize = N * Dh * Nh, MatSize = N * N;
const std::vector<size_t> dims = {N, N, Nh};
const std::vector<size_t> featDims = {N, Dh, Nh};
const bool AllocateWholeTensors = CheckCorrectness || (!DistOnly);
if (infile.empty()) {
stCooA = make_unique<SparseTensorCOO<float>>(dims);
init_random_sparse_2d<float>(*stCooA, N, 1, density);
}
stCooQ = make_unique<SparseTensorCOO<float>>(featDims);
const LevelType denseLvl =
*mlir::sparse_tensor::buildLevelType(LevelFormat::Dense, {});
const LevelType compressedLvl = *mlir::sparse_tensor::buildLevelType(
LevelFormat::Compressed, false, false);
const LevelType kCSR[] = {denseLvl, compressedLvl};
const LevelType kCSRV[] = {denseLvl, compressedLvl, denseLvl};
const LevelType kDenseV[] = {denseLvl, denseLvl, denseLvl};
const uint64_t src2tgt[] = {0, 1, 2};
auto stA = AllocateWholeTensors
? SparseTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(dims), dims.data(), std::size(dims),
dims.data(), kCSRV, src2tgt, src2tgt, stCooA.get())
: nullptr;
getRss();
if (LocalOnly)
stCooA.reset();
getRss();
fmt::println(">>> Allocated A");
fflush(nullptr);
system("date");
auto stQ =
AllocateWholeTensors
? SparseTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(featDims), featDims.data(), std::size(featDims),
featDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get())
: nullptr;
if (stQ)
init_feats(stQ, infile, N, Dh, Nh);
if (stQ) {
fmt::println(">>> Allocated Q");
fflush(nullptr);
system("date");
}
auto stK =
AllocateWholeTensors
? SparseTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(featDims), featDims.data(), std::size(featDims),
featDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get())
: nullptr;
if (stK)
init_feats(stK, infile, N, Dh, Nh);
if (stK) {
fmt::println(">>> Allocated K");
fflush(nullptr);
system("date");
}
auto stV =
AllocateWholeTensors
? SparseTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(featDims), featDims.data(), std::size(featDims),
featDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get())
: nullptr;
if (stV)
init_feats(stV, infile, N, Dh, Nh);
if (stV) {
fmt::println(">>> Allocated V");
fflush(nullptr);
system("date");
}
stCooQ.reset();
std::vector<std::chrono::milliseconds> localTimes;
auto start = std::chrono::high_resolution_clock::now();
auto gold_out =
AllocateWholeTensors
? static_cast<SparseTensorStorage<index_t, index_t, float> *>(
pte_local_bspmm(stA, stV))
: nullptr;
auto end = std::chrono::high_resolution_clock::now();
localTimes.push_back(
std::chrono::duration_cast<std::chrono::milliseconds>(end - start));
if (!CheckCorrectness && gold_out) {
std::vector<float> *o;
gold_out->getValues(&o);
auto outfile = infile.substr(infile.find_last_of('/') + 1);
outfile =
fmt::format("{}.res", outfile.substr(0, outfile.find_first_of('.')));
write_tensor<float>(*o, outfile.c_str());
delSparseTensor((void *)gold_out);
}
if (AllocateWholeTensors) {
fmt::println(">>> Done running local kernel");
fflush(nullptr);
system("date");
getRss();
}
for (auto i : llvm::seq(0u, !AllocateWholeTensors ? 0u : nTimes)) {
auto start = std::chrono::high_resolution_clock::now();
auto out = static_cast<SparseTensorStorage<index_t, index_t, float> *>(
pte_local_bspmm(stA, stV));
auto end = std::chrono::high_resolution_clock::now();
localTimes.push_back(
std::chrono::duration_cast<std::chrono::milliseconds>(end - start));
delSparseTensor((void *)out);
}
// print average time
if (AllocateWholeTensors)
fmt::print("Local time: {}ms\n",
float(std::accumulate(localTimes.begin(), localTimes.end(),
std::chrono::milliseconds(0))
.count()) /
localTimes.size());
delSparseTensor((void *)stA);
delSparseTensor((void *)stQ);
delSparseTensor((void *)stK);
delSparseTensor((void *)stV);
if (LocalOnly) {
#if defined(USE_KOKKOS)
lapis_finalize();
#endif
#if DISTRIBUTED
MPI_Finalize();
#endif
return 0;
}
#if DISTRIBUTED
auto aPartitionPlan = get2DPartition(N, N, 0, Nparts, 1);
auto ptA = PartTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(aPartitionPlan), aPartitionPlan.data(), std::size(dims),
dims.data(), kCSR, stCooA.get());
stCooA.reset();
fmt::println(">>> {}: allocated ptA", rank);
fflush(nullptr);
system("date");
getRss();
auto qPartitionPlan = get2DPartition(N, Dh, Nh, Nparts, 1);
auto myPartSpec =
std::span(qPartitionPlan)
.subspan(rank * std::size(featDims) * 2, std::size(featDims) * 2);
auto fileOffset = myPartSpec[0] * Product(featDims, 1) * sizeof(float);
auto partFeatDims = std::vector<size_t>(3);
auto [aLo, aHi] =
SplitLoHiPoint(llvm::ArrayRef(myPartSpec.data(), std::size(myPartSpec)));
// fmt::print("Rank: {} aHi: {} aLo: {}\n", rank, aHi, aLo);
SubtractPoints(partFeatDims, aHi, aLo);
// fmt::print("Rank: {} partSpec: {} fileOffset: {} partFeatDims: {}\n", rank,
// myPartSpec, fileOffset, partFeatDims);
auto stPartQ = SparseTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(partFeatDims), partFeatDims.data(), std::size(partFeatDims),
partFeatDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get());
fmt::println(">>> {}: allocated ptQ", rank);
fflush(nullptr);
system("date");
getRss();
auto stPartK = SparseTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(partFeatDims), partFeatDims.data(), std::size(partFeatDims),
partFeatDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get());
auto stPartV = SparseTensorStorage<index_t, index_t, float>::newFromCOO(
std::size(partFeatDims), partFeatDims.data(), std::size(partFeatDims),
partFeatDims.data(), kDenseV, src2tgt, src2tgt, stCooQ.get());
init_feats(stPartQ, infile, partFeatDims[0], partFeatDims[1], partFeatDims[2],
fileOffset);
init_feats(stPartK, infile, partFeatDims[0], partFeatDims[1], partFeatDims[2],
fileOffset);
init_feats(stPartV, infile, partFeatDims[0], partFeatDims[1], partFeatDims[2],
fileOffset);
auto ptQ =
PartTensorStorage<index_t, index_t, float>::newFromSparseTensorStorage(
std::size(qPartitionPlan), qPartitionPlan.data(), std::size(featDims),
featDims.data(), stPartQ);
auto ptK =
PartTensorStorage<index_t, index_t, float>::newFromSparseTensorStorage(
std::size(qPartitionPlan), qPartitionPlan.data(), std::size(featDims),
featDims.data(), stPartK);
auto ptV =
PartTensorStorage<index_t, index_t, float>::newFromSparseTensorStorage(
std::size(qPartitionPlan), qPartitionPlan.data(), std::size(featDims),
featDims.data(), stPartV);
fmt::println(">>> {}: allocated ptV", rank);
fflush(nullptr);
system("date");
getRss();
localTimes.clear();
start = std::chrono::high_resolution_clock::now();
auto out = static_cast<SparseTensorStorage<index_t, index_t, float> *>(
pte_sparse_mha(ptA, ptQ, ptK, ptV, N, N, Dh, Nh));
end = std::chrono::high_resolution_clock::now();
localTimes.push_back(
std::chrono::duration_cast<std::chrono::milliseconds>(end - start));
fmt::println(">>> {}: Done dist exec", rank);
fflush(nullptr);
system("date");
getRss();
if (!CheckCorrectness) {
std::vector<float> *o;
out->getValues(&o);
int comm_size;
// get comm size
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
auto outfile = infile.substr(infile.find_last_of('/') + 1);
outfile = fmt::format("{}.{}.{}.res",
outfile.substr(0, outfile.find_first_of('.')), rank,
comm_size);
write_tensor<float>(*o, outfile.c_str());
delSparseTensor((void *)out);
}
for (auto i : llvm::seq(0u, CheckCorrectness ? 0u : nTimes)) {
auto start = std::chrono::high_resolution_clock::now();
auto out = static_cast<SparseTensorStorage<index_t, index_t, float> *>(
pte_sparse_mha(ptA, ptQ, ptK, ptV, N, N, Dh, Nh));
auto end = std::chrono::high_resolution_clock::now();
localTimes.push_back(
std::chrono::duration_cast<std::chrono::milliseconds>(end - start));
delSparseTensor((void *)out);
}
// print average time
fmt::print("Dist time: {}ms\n",
float(std::accumulate(localTimes.begin(), localTimes.end(),
std::chrono::milliseconds(0))
.count()) /
localTimes.size());
if (CheckCorrectness) {
std::vector<float> *o, *g;
out->getValues(&o);
gold_out->getValues(&g);
size_t partSize = (N * Dh * Nh / Nparts);
auto partBegin = partSize * _mlir_ciface_mpi_getRank();
auto partEnd = partBegin + partSize;
for (auto i : llvm::seq(partBegin, partEnd)) {
auto localI = i - partBegin;
if (g->at(i) != o->at(localI)) {
fmt::print("Mismatch at rank {} index: {} expected: {} got: {}\n", rank,
i, g->at(i), o->at(localI));
return 1;
}
}
}
// fmt::print("Q tensor: {}\n", Q);
// fmt::print("K tensor: {}\n", K);
// fmt::print("V tensor: {}\n", V);
#define PRINT_OUTPUT(out) \
{ \
std::vector<float> *o; \
out->getValues(&o); \
fmt::print(#out " tensor: {}\n", *o); \
write_tensor<float>(*o, #out ".dat"); \
}
// PRINT_OUTPUT(out)
// PRINT_OUTPUT(gold_out)
#undef PRINT_OUTPUT
#if defined(USE_KOKKOS)
lapis_finalize();
#endif
MPI_Finalize();
#endif
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment