Yaoyao Ding yaoyaoding

(Beta) Hidet: a dynamo backend focused on inference acceleration

With torch dynamo, we can dispatch a PyTorch model to other awesome deep learning frameworks/compilers for acceleration. Hidet is one such deep learning compiler; it accelerates your model with a bunch of optimizations (e.g., subgraph fusion, rewriting, and kernel tuning). To use hidet, please first install it via

$ pip install hidet

Then you can enable it via torch.compile(model, backend='hidet') as shown in the code snippet below:

import torch
import hidet

	"""Minimal repro: tvm-ffi Object aliasing across multiprocessing fork.

	Why we want each tvm-ffi C-handle to map to a unique Python wrapper
	-------------------------------------------------------------------
	A single C++ Object can currently be reached from Python by many distinct
	wrapper instances — every ``obj.field`` / ``arr[i]`` access mints a fresh
	``CObject`` wrapper around the same chandle. Same handle, but different
	``id(...)``.

	That's harmless when staying in-process (handle equality still works), but

	'''
	Steps to reproduce:

	$ pip install nvidia-cutlass-dsl
	$ python main.py

	'''

	import ctypes
	import ctypes.util

	from typing import List
	import os
	import torch
	import hidet
	from hidet.apps.llm import create_llm
	from hidet.apps.llm.sampler import SamplingParams
	from hidet.apps.llm.nn.attention import DefaultAttnState
	from hidet.apps.llm.tokenizer import Tokenizer
	from hidet.apps.llm.modeling.llama import LlamaForCausalLM

	from typing import List
	from functools import lru_cache
	import torch
	import hidet

	hidet.option.cache_dir('./outs/cache')


	@lru_cache(maxsize=None)
	def build_kernel():

	import atexit

	# clean the cache dir given by the environment variable TRITON_CACHE_DIR
	import os
	import shutil

	if 'TRITON_CACHE_DIR' not in os.environ:
	os.environ['TRITON_CACHE_DIR'] = './triton_cache'
	cache_dir = os.environ['TRITON_CACHE_DIR']
	if os.path.exists(cache_dir):

	from typing import List, Union
	import json
	from collections import defaultdict


	class Model:
	def __init__(self, graph, description="", author="", company="", license="", domain="", source=""):
	self.graphs: List[Graph] = [graph]
	self.description: str = description
	self.author: str = author

	#include <cassert>
	#include <cstdio>
	extern "C" {

	__device__ __forceinline__ void matmul_bt128x128_bsz256_s128x128_block_c_init_warp(float out[64]) {
	int32_t lane_id = (threadIdx.x % 32);
	out[0] = 0.0;
	out[1] = 0.0;
	out[2] = 0.0;
	out[3] = 0.0;

	import tvm
	from tvm import te, tir


	def extract_launch_config(prim_func: tir.PrimFunc):
	"""
	Extract the launch configuration of given prim_func.

	Parameters
	----------