// FSR-Lite v1.3: A highly optimized, single-pass implementation of AMD FidelityFX Super Resolution
// Source: GitHub Gist bunkbail/5ec9c2ba5f2ab949db81c2a493083f93 (last active February 23, 2026)
// FSR-Lite v1.3
//
// A single-pass implementation of AMD FidelityFX Super Resolution.
// Optimized and Tuned by bunkbail
// Based on the mpv port by agyild and original code by AMD
//
// --- ARCHITECTURE OVERVIEW ---
// FSR-Lite merges the two-pass FSR 1.0 pipeline (EASU + RCAS) into a single
// shader while preserving virtually identical visual output.
//
// 1. Faithful EASU Replication:
// - Full 12-tap edge-adaptive Lanczos kernel with batched accumulation.
// - Identical direction/length analysis, kernel shaping, and deringing.
// - Uses the same fast math approximations (APrxLoRcpF1, APrxLoRsqF1).
// - Optional early exit for flat areas (bilinear fallback).
//
// 2. Source-Domain RCAS:
// - Original RCAS samples a 5-tap cross from the upscaled EASU output
// texture, which requires a second pass. FSR-Lite instead uses the 5
// bilinear sub-neighborhoods (lA..lE) already computed for EASU's
// gradient analysis as the RCAS ring. These form a smooth cross at
// ~1 source texel spacing and vary continuously with sub-pixel position.
// - The literal RCAS formula is applied: (e + lobe*(b+d+f+h)) / (1+4*lobe)
// with easu as center and lA,lB,lD,lE as the ring.
// - Adaptive lobe uses the same hitMin/hitMax math and APrxMedRcpF1
// reciprocal as the original, preserving RCAS's self-limiting behavior.
//
// 3. Gamma-Correct Processing:
// - SDR: Operates entirely in gamma space, matching the original FSR.
// - HDR: Applies x^4 linearization for PQ content, matching the original.
//
// --- COPYRIGHT & LICENSE ---
// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//!HOOK LUMA
//!BIND HOOKED
//!DESC FSR-Lite v1.3 (EASU + RCAS)
//!WHEN OUTPUT.w OUTPUT.h * LUMA.w LUMA.h * / 1.0 >
//!WIDTH OUTPUT.w OUTPUT.w LUMA.w 2 * < * LUMA.w 2 * OUTPUT.w LUMA.w 2 * > * + OUTPUT.w OUTPUT.w LUMA.w 2 * = * +
//!HEIGHT OUTPUT.h OUTPUT.h LUMA.h 2 * < * LUMA.h 2 * OUTPUT.h LUMA.h 2 * > * + OUTPUT.h OUTPUT.h LUMA.h 2 * = * +
//!COMPONENTS 1
// ============================================================================
// TUNABLE PARAMETERS
// ============================================================================
// Sharpness (RCAS convention: 0.0 = maximum, higher = less sharp in stops).
// 0.2 matches the original FSR default.
// hook() clamps this value to [0.0, 2.0] and scales the sharpening lobe by
// exp2(-SHARPNESS), so each +1.0 halves the sharpening strength.
#define SHARPNESS 0.2
// Whether the source content has PQ gamma. 0 = SDR, 1 = HDR (PQ).
// When 1, every fetched tap is passed through ToGamma2 (a^4) before filtering
// and the final result is passed through FromGamma2 (fourth root).
#define FSR_PQ 0
// EASU deringing. Clamps output to center 2x2 min/max. 0 or 1.
// The center 2x2 is the f, g, j, k taps of the 12-tap footprint.
#define FSR_DERING 1
// Early exit for flat areas (bilinear fallback). 0 or 1.
// When 1, hook() returns the bilinear center sample (lC) as soon as the squared
// gradient magnitude falls below 1/64, skipping the Lanczos kernel and RCAS.
#define FSR_QUIT_EARLY 1
// RCAS denoise. Lessens sharpening on noisy areas. 0 or 1.
// When 1, the sharpening lobe is scaled by a factor in [0.5, 1.0] derived from
// how far the center deviates from the ring average relative to the local range.
#define FSR_RCAS_DENOISE 0
// RCAS limit. Prevents unnatural over-sharpening.
// Evaluates to 0.1875; the (negative) sharpening lobe is never allowed below
// -FSR_RCAS_LIMIT.
#define FSR_RCAS_LIMIT (0.25 - (1.0 / 16.0))
// ============================================================================
// FAST MATH
// ============================================================================
// Low-precision reciprocal approximation (about 1/a) done purely on the IEEE-754
// bit pattern: the raw bits of `a` are subtracted from a magic constant.
// No refinement step is applied, so the result is only a rough estimate.
float APrxLoRcpF1(float a) {
    uint srcBits = floatBitsToUint(a);
    uint rcpBits = uint(0x7ef07ebb) - srcBits;
    return uintBitsToFloat(rcpBits);
}
// Low-precision reciprocal square root approximation (about 1/sqrt(a)):
// halves the raw bit pattern of `a` and subtracts it from a magic constant.
// No refinement step is applied.
float APrxLoRsqF1(float a) {
    uint halvedBits = floatBitsToUint(a) >> uint(1);
    return uintBitsToFloat(uint(0x5f347d74) - halvedBits);
}
// Medium-precision reciprocal approximation (about 1/a): a bit-trick seed
// followed by one Newton-Raphson refinement, seed * (2 - a * seed).
float APrxMedRcpF1(float a) {
    uint seedBits = uint(0x7ef19fff) - floatBitsToUint(a);
    float seed = uintBitsToFloat(seedBits);
    float correction = -seed * a + 2.0;
    return seed * correction;
}
// Smallest of three scalars.
float AMin3F1(float x, float y, float z) {
    float yz = min(y, z);
    return min(x, yz);
}
// Largest of three scalars.
float AMax3F1(float x, float y, float z) {
    float yz = max(y, z);
    return max(x, yz);
}
// ============================================================================
// PQ TRANSFORMS
// ============================================================================
#if (FSR_PQ == 1)
// Forward transform used on PQ content: raises the value to the 4th power.
float ToGamma2(float a) {
    float p2 = a * a;
    float p3 = p2 * a;
    return p3 * a;
}
// Inverse of ToGamma2: fourth root, with negative inputs clamped to 0 first.
float FromGamma2(float a) {
    float nonNeg = max(a, 0.0);
    return sqrt(sqrt(nonNeg));
}
#endif
// ============================================================================
// EASU TAP (Lanczos-2 approximation)
// ============================================================================
// Accumulates one filter tap into the running weighted sum.
//   aC  - running sum of weight * sample (updated in place)
//   aW  - running sum of weights (updated in place)
//   d2  - squared distance from the tap to the resolve position
//   lob - negative-lobe strength of the window
//   clp - upper clamp applied to d2 (limits the kernel support)
//   c   - sample value of this tap
// The weight is [(0.25*d2 - 1.25)*d2 + 1] * (lob*d2 - 1)^2 evaluated on the
// clamped distance.
void FsrEasuTap(
    inout float aC, inout float aW,
    float d2, float lob, float clp, float c)
{
    float r2 = min(d2, clp);
    float base = (0.25 * r2 - 1.25) * r2 + 1.0;
    float win = lob * r2 - 1.0;
    float tw = base * (win * win);
    aC += c * tw;
    aW += tw;
}
// ============================================================================
// MAIN
// ============================================================================
// Shader entry point: computes one upscaled and sharpened luma sample.
//
// Pipeline:
//   1. Fetch the 12-texel footprint around the output position:
//            b c
//          e f g h
//          i j k l
//            n o
//   2. Build five bilinear sub-neighborhoods (lA..lE) and derive the edge
//      direction and length from them.
//   3. Optionally exit early with the bilinear center sample on flat areas.
//   4. Accumulate the 12 taps through a rotated, stretched kernel (FsrEasuTap),
//      then optionally dering against the center 2x2.
//   5. Sharpen with the RCAS formula, using lA, lB, lD, lE as the ring.
//
// Returns vec4(luma, 0, 0, 1) with luma clamped to [0, 1]. When FSR_PQ == 1
// all filtering happens on ToGamma2-transformed values and every return path
// converts back with FromGamma2.
vec4 hook() {
    // Position in source texel units, shifted so texel centers land on integers.
    // fp is the integer coordinate of tap 'f'; pp is the sub-texel fraction.
    vec2 pp = HOOKED_pos * HOOKED_size - vec2(0.5);
    vec2 fp = floor(pp);
    pp -= fp;
    // 12-tap fetch
    float bL, cL, eL, fL, gL, hL, iL, jL, kL, lL, nL, oL;
#if (defined(HOOKED_gather) && (__VERSION__ >= 400 || (GL_ES && __VERSION__ >= 310)))
    // Four gathers cover the 12 taps; the 'z' slots in the names are unused.
    vec4 bczzL = HOOKED_gather(vec2((fp + vec2(1.0, -1.0)) * HOOKED_pt), 0);
    vec4 ijfeL = HOOKED_gather(vec2((fp + vec2(0.0, 1.0)) * HOOKED_pt), 0);
    vec4 klhgL = HOOKED_gather(vec2((fp + vec2(2.0, 1.0)) * HOOKED_pt), 0);
    vec4 zzonL = HOOKED_gather(vec2((fp + vec2(1.0, 3.0)) * HOOKED_pt), 0);
    bL = bczzL.x; cL = bczzL.y;
    iL = ijfeL.x; jL = ijfeL.y; fL = ijfeL.z; eL = ijfeL.w;
    kL = klhgL.x; lL = klhgL.y; hL = klhgL.z; gL = klhgL.w;
    oL = zzonL.z; nL = zzonL.w;
#else
    // Fallback: one fetch per tap, addressed at texel centers (+0.5).
    bL = HOOKED_tex(vec2((fp + vec2(0.5, -0.5)) * HOOKED_pt)).r;
    cL = HOOKED_tex(vec2((fp + vec2(1.5, -0.5)) * HOOKED_pt)).r;
    eL = HOOKED_tex(vec2((fp + vec2(-0.5, 0.5)) * HOOKED_pt)).r;
    fL = HOOKED_tex(vec2((fp + vec2( 0.5, 0.5)) * HOOKED_pt)).r;
    gL = HOOKED_tex(vec2((fp + vec2( 1.5, 0.5)) * HOOKED_pt)).r;
    hL = HOOKED_tex(vec2((fp + vec2( 2.5, 0.5)) * HOOKED_pt)).r;
    iL = HOOKED_tex(vec2((fp + vec2(-0.5, 1.5)) * HOOKED_pt)).r;
    jL = HOOKED_tex(vec2((fp + vec2( 0.5, 1.5)) * HOOKED_pt)).r;
    kL = HOOKED_tex(vec2((fp + vec2( 1.5, 1.5)) * HOOKED_pt)).r;
    lL = HOOKED_tex(vec2((fp + vec2( 2.5, 1.5)) * HOOKED_pt)).r;
    nL = HOOKED_tex(vec2((fp + vec2(0.5, 2.5)) * HOOKED_pt)).r;
    oL = HOOKED_tex(vec2((fp + vec2(1.5, 2.5)) * HOOKED_pt)).r;
#endif
#if (FSR_PQ == 1)
    // Move every tap into the x^4 domain; undone by FromGamma2 before returning.
    bL = ToGamma2(bL); cL = ToGamma2(cL);
    eL = ToGamma2(eL); fL = ToGamma2(fL);
    gL = ToGamma2(gL); hL = ToGamma2(hL);
    iL = ToGamma2(iL); jL = ToGamma2(jL);
    kL = ToGamma2(kL); lL = ToGamma2(lL);
    nL = ToGamma2(nL); oL = ToGamma2(oL);
#endif
    // Bilinear weights
    vec4 bw;
    bw.x = (1.0 - pp.x) * (1.0 - pp.y);
    bw.y = pp.x * (1.0 - pp.y);
    bw.z = (1.0 - pp.x) * pp.y;
    bw.w = pp.x * pp.y;
    // 5 bilinear sub-neighborhoods (reused for EASU analysis and RCAS ring)
    //      lA
    //   lB lC lD
    //      lE
    float lA = dot(bw, vec4(bL, cL, fL, gL));
    float lB = dot(bw, vec4(eL, fL, iL, jL));
    float lC = dot(bw, vec4(fL, gL, jL, kL));
    float lD = dot(bw, vec4(gL, hL, kL, lL));
    float lE = dot(bw, vec4(jL, kL, nL, oL));
    // Direction and length analysis
    float dc = lD - lC;
    float cb = lC - lB;
    float lenX = max(abs(dc), abs(cb));
    lenX = APrxLoRcpF1(lenX);
    float dirX = lD - lB;
    lenX = clamp(abs(dirX) * lenX, 0.0, 1.0);
    lenX *= lenX;
    float ec = lE - lC;
    float ca = lC - lA;
    float lenY = max(abs(ec), abs(ca));
    lenY = APrxLoRcpF1(lenY);
    float dirY = lE - lA;
    lenY = clamp(abs(dirY) * lenY, 0.0, 1.0);
    lenY *= lenY;
    float len = lenX + lenY;
    vec2 dir = vec2(dirX, dirY);
    // Normalize direction
    vec2 dir2 = dir * dir;
    float dirR = dir2.x + dir2.y;
    // Squared gradient magnitude below 1/64 is treated as "no direction".
    bool zro = dirR < (1.0 / 64.0);
    dirR = APrxLoRsqF1(dirR);
#if (FSR_QUIT_EARLY == 1)
    if (zro) {
        // Flat area: return the bilinear center sample and skip the kernel.
        float flatL = lC;
#if (FSR_PQ == 1)
        // lC was built from ToGamma2-transformed taps, so it must be converted
        // back here exactly like the normal return path; otherwise flat areas
        // would be emitted in the x^4 domain and appear far too dark.
        flatL = FromGamma2(flatL);
#endif
        return vec4(clamp(flatL, 0.0, 1.0), 0.0, 0.0, 1.0);
    }
#else
    // No early exit: substitute a fixed horizontal direction with unit scale.
    dirR = zro ? 1.0 : dirR;
    dir.x = zro ? 1.0 : dir.x;
#endif
    dir *= vec2(dirR);
    // Kernel shape
    len = len * 0.5;
    len *= len;
    float stretch = (dir.x * dir.x + dir.y * dir.y) * APrxLoRcpF1(max(abs(dir.x), abs(dir.y)));
    vec2 len2 = vec2(1.0 + (stretch - 1.0) * len, 1.0 + -0.5 * len);
    float lob = 0.5 + float((1.0 / 4.0 - 0.04) - 0.5) * len;
    float clp = APrxLoRcpF1(lob);
    // 12-tap Lanczos accumulation
    // rs holds the rotation (by dir) combined with the anisotropic scale (len2).
    // Each tap's kernel-space offset is rotate(tapOffset) - rotate(pp); the
    // rotated tap offsets are written out below as sums of rs components.
    vec4 rs = vec4(dir.x, dir.y, -dir.y, dir.x) * len2.xxyy;
    vec2 ppRot = vec2(pp.x * rs.x + pp.y * rs.y, pp.x * rs.z + pp.y * rs.w);
    float Sxy = rs.x + rs.y; float Dxy = rs.x - rs.y;
    float Szw = rs.z + rs.w; float Dzw = rs.z - rs.w;
    float aC = 0.0;
    float aW = 0.0;
    {
        // Taps b(0,-1), c(1,-1), i(-1,1), j(0,1)
        vec4 vX = vec4(-rs.y, Dxy, -Dxy, rs.y) - ppRot.x;
        vec4 vY = vec4(-rs.w, Dzw, -Dzw, rs.w) - ppRot.y;
        vec4 d2 = vX * vX + vY * vY;
        FsrEasuTap(aC, aW, d2.x, lob, clp, bL);
        FsrEasuTap(aC, aW, d2.y, lob, clp, cL);
        FsrEasuTap(aC, aW, d2.z, lob, clp, iL);
        FsrEasuTap(aC, aW, d2.w, lob, clp, jL);
    }
    {
        // Taps f(0,0), e(-1,0), k(1,1), l(2,1)
        vec4 vX = vec4(0.0, -rs.x, Sxy, rs.x + Sxy) - ppRot.x;
        vec4 vY = vec4(0.0, -rs.z, Szw, rs.z + Szw) - ppRot.y;
        vec4 d2 = vX * vX + vY * vY;
        FsrEasuTap(aC, aW, d2.x, lob, clp, fL);
        FsrEasuTap(aC, aW, d2.y, lob, clp, eL);
        FsrEasuTap(aC, aW, d2.z, lob, clp, kL);
        FsrEasuTap(aC, aW, d2.w, lob, clp, lL);
    }
    {
        // Taps h(2,0), g(1,0), o(1,2), n(0,2)
        vec4 vX = vec4(2.0 * rs.x, rs.x, rs.x + 2.0 * rs.y, 2.0 * rs.y) - ppRot.x;
        vec4 vY = vec4(2.0 * rs.z, rs.z, rs.z + 2.0 * rs.w, 2.0 * rs.w) - ppRot.y;
        vec4 d2 = vX * vX + vY * vY;
        FsrEasuTap(aC, aW, d2.x, lob, clp, hL);
        FsrEasuTap(aC, aW, d2.y, lob, clp, gL);
        FsrEasuTap(aC, aW, d2.z, lob, clp, oL);
        FsrEasuTap(aC, aW, d2.w, lob, clp, nL);
    }
    // Normalize by the weight sum; the small epsilon keeps the divisor non-zero.
    float easu = aC / (aW + 1.0e-5);
    // Deringing
#if (FSR_DERING == 1)
    float mn1 = min(AMin3F1(fL, gL, jL), kL);
    float mx1 = max(AMax3F1(fL, gL, jL), kL);
    easu = clamp(easu, mn1, mx1);
#endif
    easu = clamp(easu, 0.0, 1.0);
    // RCAS (source-domain ring: lA, lB, lD, lE)
    float mn_ring = min(AMin3F1(lA, lB, lD), lE);
    float mx_ring = max(AMax3F1(lA, lB, lD), lE);
    // Limiters. Their denominators are exactly zero on an all-black ring
    // (mx_ring == 0) or when the ring minimum is 1.0, which would yield 0/0.
    // In those cases fall back to 0.0, which results in a zero lobe (no
    // sharpening). All other inputs are computed as before.
    float hitMinL = (mx_ring != 0.0) ? min(mn_ring, easu) / (4.0 * mx_ring) : 0.0;
    float hitMaxL = (mn_ring != 1.0) ? (1.0 - max(mx_ring, easu)) / (4.0 * mn_ring - 4.0) : 0.0;
    float lobeL = max(-hitMinL, hitMaxL);
    // Lobe is kept within [-FSR_RCAS_LIMIT, 0] and attenuated by exp2(-SHARPNESS).
    float rcas_lobe = max(float(-FSR_RCAS_LIMIT), min(lobeL, 0.0)) * exp2(-clamp(float(SHARPNESS), 0.0, 2.0));
#if (FSR_RCAS_DENOISE == 1)
    // Scale the lobe by a factor in [0.5, 1.0]: the further the center deviates
    // from the ring average relative to the local range, the less sharpening.
    float nz = 0.25 * (lA + lB + lD + lE) - easu;
    nz = clamp(abs(nz) * APrxMedRcpF1(
        AMax3F1(AMax3F1(lA, lB, easu), lD, lE) -
        AMin3F1(AMin3F1(lA, lB, easu), lD, lE)), 0.0, 1.0);
    nz = -0.5 * nz + 1.0;
    rcas_lobe *= nz;
#endif
    // res = (easu + lobe * ringSum) / (1 + 4 * lobe)
    float rcpL = APrxMedRcpF1(4.0 * rcas_lobe + 1.0);
    float res = (rcas_lobe * (lA + lB + lD + lE) + easu) * rcpL;
#if (FSR_PQ == 1)
    res = FromGamma2(res);
#endif
    return vec4(clamp(res, 0.0, 1.0), 0.0, 0.0, 1.0);
}
// (GitHub page footer, not part of the shader: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")