Skip to content

Instantly share code, notes, and snippets.

import torch
# CUDA kernel with inline PTX
kernel_source = """
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
int idx;
asm("mov.u32 %0, %%ctaid.x;" : "=r"(idx));
int tid;
asm("mov.u32 %0, %%tid.x;" : "=r"(tid));
int ntid;
import torch
from torch import nn
from torch.distributed.tensor.placement_types import Replicate, Shard
from torch.testing._internal.distributed.fake_pg import FakeStore
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Replicate
world_size = 4
@Chillee
Chillee / mm_weird.py
Last active April 3, 2025 11:26
Strangely, Matrix Multiplications Run Faster When Given "Predictable" Data! https://www.thonking.ai/p/strangely-matrix-multiplications
import torch
torch.set_default_device('cuda')
from triton.testing import do_bench
from collections import defaultdict
from functools import partial
import random
random.seed(0)
def get_flops(A, B):
ms = do_bench(lambda: torch.mm(A, B))
"""
Augment kernel metadata generated by kernel_metadata metric table in inductor.
For each row in input, use NCU to profile the kernel. The corresponding output row
contains more metadata gathered by NCU.
It can be super slow to run NCU. e.g. for the 10K kernels gathered from Huggingface,
it took almost a whole day to run NCU for each unique kernels. The script thus cache
the ncu output in the file system. If the ncu output is cached, we don't run NCU again.
Example input: https://gist.github.com/shunting314/22995da0da8b66d4cf989cb7f0508399
@fishmingyu
fishmingyu / GAT breakdown
Last active December 8, 2023 10:43
GNN breakdown profiling [PT2.0 compiler]
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
graph_0_cpp_fused_add_exp_index_select_mul_scatter_a... 29.02% 11.966ms 29.02% 11.966ms 11.966ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_rel... 28.60% 11.794ms 28.60% 11.794ms 11.794ms 1
graph_0_cpp_fused_add_clone_exp_index_select_mul_new... 27.49% 11.335ms 27.49% 11.335ms 11.335ms 1
aten::scatter_ 5.92% 2.442ms 5.92% 2.442ms 814.000us 3
@fishmingyu
fishmingyu / gather_scatter_fusion.py
Last active December 8, 2023 10:31
Gather Scatter fusion in PyG by Inductor
import torch
import torch_geometric
from torch_geometric.profile import benchmark
from torch_geometric.testing import (
disableExtensions,
onlyFullTest,
onlyLinux,
withCUDA,
withPackage,
@Chillee
Chillee / 1-pw_op_fusion.py
Last active August 21, 2025 19:00
PT 2.0 Benchmarks
import torch
import torch._inductor.config
import time
torch._inductor.config.triton.cudagraphs = False
torch.set_float32_matmul_precision('high')
def bench(f, name=None, iters=100, warmup=5, display=True, profile=False):
for _ in range(warmup):
f()
@ivan-loh
ivan-loh / compile-install-emacs.sh
Last active March 31, 2024 05:06
Compile and install emacs 29.3 for Ubuntu
#!/bin/bash
# Build GNU Emacs 29.3 from source on Ubuntu.
# NOTE(review): no `sudo make install` step is visible here — the script may be
# truncated; confirm the install step exists in the full file.

# Install the compiler toolchain plus the X11/image/terminal/TLS development
# headers that Emacs' configure step checks for.
sudo apt install build-essential texinfo libx11-dev libxpm-dev libjpeg-dev libpng-dev libgif-dev libtiff-dev libgtk2.0-dev libncurses-dev libgnutls28-dev

# Fetch and unpack the 29.3 release tarball from the GNU mirror.
wget http://ftp.gnu.org/gnu/emacs/emacs-29.3.tar.gz
tar xvzf emacs-29.3.tar.gz
cd emacs-29.3

# --with-mailutils: rely on GNU Mailutils' movemail rather than Emacs' own.
./configure --with-mailutils
make
@Borda
Borda / changelog_parser_PR_authors.py
Last active December 20, 2021 16:36
use Fire CLI
"""
Parse all users from release notes according to PR reference.
Resources:
- https://developer.github.com/v3/pulls/
- http://zetcode.com/python/requests/
- https://developer.github.com/v3/#authentication
"""
import json
@kinu
kinu / how_i_got_into_google.md
Last active January 16, 2025 08:17
Google に入るまでの話

Google に入ったときの話 (Kinuko)

祭っぽいので私も書いてみることにした!お手軽に gist で。

コンテキスト:https://togetter.com/li/1331865

対策とか(特になし)

と書き出したはいいが、私が受けたときは本も情報もあまりなく、かつ競プロ的なものの存在も知らなかったので、とりあえず家にあったアルゴリズムの本を2回くらい読み直した。そして受かった。最初っから情報があまりない方のパターンで申し訳ない 😄