Tom Medema tommedema

Running GLM-5.2 (753B DSA MoE) on 8× A100 80GB with vLLM

TL;DR. GLM-5.2 (glm_moe_dsa — DeepSeek Sparse Attention) does not run on Ampere (A100, sm_80) with stock vLLM: the sparse-MLA attention backend (FLASHMLA_SPARSE) and the lightning-indexer's fp8_mqa_logits (DeepGEMM) are Hopper/Blackwell-only. vLLM PR #38476 (issue #38006) adds a Triton sparse-MLA backend (TRITON_MLA_SPARSE) + a bf16 Triton indexer fallback that run on Ampere. Cherry-picking it onto current main is a Python-only change — no CUDA recompile. Result: GLM-5.2 AWQ-INT4 serves on 8× A100 at ~56 tok/s single-stream and ~625 tok/s aggregate decode (32-way), with coherent output.

This is an independent 8× A100 confirmation of PR #38476 (the author validated on 32× A100), plus a no-recompile install note. Credit to @haosdent for the PR.

Requirements

8× A100 80GB (sm_80). About 410 GiB of VRAM is used at TP=8, so all 8 GPUs are required.

	/**
	* Note that this temporarily works around an issue where Amplify does not support
	* the AWS SDK's correctClockSkew option, by retrying with a manually defined offset
	* if the request time is too skewed.
	* @see https://github.com/aws-amplify/amplify-js/issues/2014
	*/
	const attemptPutWithClockSkewRetry = () =>
	Storage
	.put(
	itemKey,

	const copyUrlToClipboard = (inputHTMLElement) => {
	// the actual URL to be copied
	const copyUrl = window.location.href

	// the pretty url to display (without schema)
	// this should be the default value of the input on instantiation
	const displayUrl = window.location.host + window.location.pathname

	// just prior to copying, temporarily change the input value to the
	// URL to be copied

	functions:
	processAction:
	handler: dist/actionsProcessor.onActionsCreated
	events:
	- stream:
	type: dynamodb
	batchSize: 100
	startingPosition: LATEST
	arn:
	Fn::GetAtt: [DynamoDBTableActions, StreamArn]

	...
	resources:
	Resources:
	DynamoDBTableActions:
	Type: AWS::DynamoDB::Table
	DeletionPolicy: Retain
	Properties:
	StreamSpecification:
	StreamViewType: NEW_IMAGE
	...

	/// <reference lib="dom"/>

	/**
	 * Ambient declaration of the `Analytics` function.
	 *
	 * @param opts - Options object typed as `Analytics.IOptions`.
	 * @returns A value typed as `Analytics.IAnalytics`.
	 */
	declare function Analytics(opts: Analytics.IOptions): Analytics.IAnalytics

	declare namespace Analytics {
	interface IOptions {
	app: string,
	plugins: object[]
	}

	T0 Server 1 sends "task.admincreation", rolls 5
	T4 S2 receives from S1
	T5 S3 receives from S1

	T0 Server 2 sends "task.admincreation", rolls 50
	T6 S1 receives from S2 roll 50
	T3 S3 receives from S2

	T0 Server 3 sends "task.admincreation", rolls 30
	T2 S1 receives from S3

	env JMX_PORT=10000 && bin/kafka-server-start.sh config/server2.properties
	SSH_AGENT_PID=1943
	GPG_AGENT_INFO=/tmp/keyring-ousT1B/gpg:0:1
	TERM=xterm
	SHELL=/bin/bash
	XDG_SESSION_COOKIE=e26ace99362c0a676c41b4ce00000021-1351396305.370669-340869060
	WINDOWID=69215116
	GNOME_KEYRING_CONTROL=/tmp/keyring-ousT1B
	USER=tom
	LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:.tar=01;31:.tgz=01;31:.arj=01;31:.taz=01;31:.lzh=01;31:.lzma=01;31:.tlz=01;31:.txz=01;31:.zip=01;31:.z=01;31:.Z=01;31:.dz=01;31:.gz=01;31:.lz=01;31:.xz=01;31:.bz2=01;31:.bz=01;31:.tbz=01;31:.tbz2=01;31:.tz=01;31:.deb=01;31:.rpm=01;31:.jar=01;31:.war=01;31:.ear=01;31:.sar=01;31:.rar=01;31:.ace=01;31:.zoo=01;31:.cpio=01;31:.7z=01;31:.rz=01;31:.jpg=01;35:.jpeg=01;35:.gif=01;35:.bmp=01;35:.pbm=01;35:.pgm=01;35:.ppm=01;35:.tga=01;35:.xbm=01;35:.xpm=01;35:.tif=01;35:.tiff=01;35:.png=01;35:.svg=01;35:*.

	//find the superadmin user (force reading from master to ensure consistency)
	users.findOne({name: config.admin.name}, {slaveOk: false}, function(err, doc) {

	//when an error occurred
	if (err) {
	var dbErr = new Error('failed to find super admin account');
	dbErr.origErr = err;
	return mediator.emit('db.error', dbErr);
	}

	//FIXME: validate user input somewhere

	function getAuthParams(req) {
	//1. check `authuser` and `authpass` request query parameter
	//2. check `authuser` and `authpass` request body properties
	var user = req.query.authuser \|\| req.body.authuser,
	pass = req.query.authpass \|\| req.body.authpass;
	if (user && pass) return {
	user: user,
	pass: pass