Skip to content

Instantly share code, notes, and snippets.

View makslevental's full-sized avatar
💩

Maksim Levental makslevental

💩
View GitHub Profile
/*===- TableGen'erated file -------------------------------------*- C++ -*-===*\
|* *|
|* Target Register Enum Values *|
|* *|
|* Automatically generated file, do not edit! *|
|* *|
\*===----------------------------------------------------------------------===*/
#ifdef GET_REGINFO_ENUM
@makslevental
makslevental / v_pk_add_f32.s
Last active April 25, 2025 04:13
unpack v_pk_add_f32
.section .AMDGPU.config,"",@progbits
.long 47176
.long 11468865
.long 47180
.long 5008
.long 47200
.long 0
.long 4
.long 0
.long 8
from enum import Flag
from pprint import pprint
class SchedGroupMask(Flag):
NONE = 0
ALU = 1 << 0
VALU = 1 << 1
SALU = 1 << 2
MFMA = 1 << 3
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
define amdgpu_kernel void @add_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3) local_unnamed_addr #0 {
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x()
%6 = shl i32 %5, 10
%7 = tail call i32 @llvm.amdgcn.workitem.id.x()
%8 = shl i32 %7, 2
%9 = and i32 %8, 1020
%10 = or disjoint i32 %9, %6
%11 = icmp slt i32 %10, %3
br i1 %11, label %.critedge, label %.critedge2
diff --git a/CMakeLists.txt b/CMakeLists.txt
--- a/CMakeLists.txt (revision 7fb35319d38e57f7341e8434a4f5b2049aec8657)
+++ b/CMakeLists.txt (date 1744728290525)
@@ -318,6 +318,9 @@
endif()
target_link_options(triton PRIVATE ${PYTHON_LDFLAGS})
+ set_target_properties(triton PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/triton/_C)
+ set_target_properties(triton PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/triton/_C)
+ set_target_properties(triton PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/triton/_C)
W20250414 14:53:24.356304 132475353552832 metadata.cpp:186] rocprofiler_iterate_agent_supported_counters returned ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED for agent 1 (gfx1150) :: Agent HW architecture is not supported, no counter metrics found.
W20250414 14:53:24.651769 131105783740352 metadata.cpp:186] rocprofiler_iterate_agent_supported_counters returned ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED for agent 1 (gfx1150) :: Agent HW architecture is not supported, no counter metrics found.
F20250414 14:53:24.970933 131105783740352 agent.cpp:1069] rocprofiler was only able to map 2 rocprofiler agents to HSA agents, expected 3
@ 0x773d72612360 (unknown)
@ 0x773d72613297 (unknown)
@ 0x773d7205d18a (unknown)
@ 0x773d72098ed5 rocprofiler_set_api_table
@ 0x773d71ad5afc rocprofiler_register_library_api_table
@ 0x773d660a2229 (unknown)
@ 0x773d660a3c58 (unknown)
Dispatch_ID GPU_ID Queue_ID PID TID Grid_Size Workgroup_Size LDS_Per_Workgroup Scratch_Per_Workitem Arch_VGPR Accum_VGPR SGPR Wave_Size Kernel_Name Start_Timestamp End_Timestamp Correlation_ID
0 1 1 31938 31938 32 32 0 0 40 0 128 32 smol_matmul.kd 5382059929644 5382059931804 0
# RUN: python3 %s | FileCheck %s
import triton
import triton.language as tl
from triton.backends.compiler import GPUTarget
def print_test_name_and_run(f):
print(f"Test: {f.__name__}")
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
tt.func public @matmul_kernel_persistent(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #ttg.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [32, 32], isTransposed = true}>>
%true = arith.constant true
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x256xf16, #ttg.blocked<{sizePerThread = [8, 1], thre
import re
import subprocess
import sys
from black.trans import defaultdict
test_file = "/home/mlevental/dev_projects/triton/test/TritonGPU/amd/amd-range-analysis.mlir"
cmnd = [
"/home/mlevental/dev_projects/llvm-project/cmake-build-debug/bin/triton-opt",