#idempotent
- Functions are compiled individually
- The instructions in the function body use and produce values in SSA form, i.e. Static Single Assignment.
  - SSA construction is pushed to the frontend - done with the `cranelift_frontend` crate.
  - The `cranelift_frontend` crate contains utilities for translating from programs containing multiple assignments to the same variables into SSA form for Cranelift [IR].
- Cranelift’s IR is typed i.e. All SSA values have a type which determines the size and shape of the value.
- Integer types:
- Floating point types:
- SIMD vector types:
- Memory instructions:
  - has fully general `load` and `store` instructions for accessing memory.
- Uses traditional CFG (control-flow graphs) using basic blocks.
rustc’s cg_clif compiles the following:
../rustc_codegen_cranelift/dist/bin/rustc-clif --emit=llvm-ir src/main.rs
#[no_mangle]
fn test(a: u8, b: u8) -> u8 {
if a > b {
let c = a - b;
c
} else {
let c = a + b;
c
}
}
to this
; compiler options
set opt_level=none
set tls_model=macho
set libcall_call_conv=isa_default
set probestack_size_log2=12
set probestack_strategy=inline
set bb_padding_log2_minus_one=0
set regalloc_checker=0
set regalloc_verbose_logs=0
set enable_alias_analysis=1
set enable_verifier=0
set enable_pcc=0
set is_pic=1
set use_colocated_libcalls=0
set enable_float=1
set enable_nan_canonicalization=0
set enable_pinned_reg=0
set enable_atomics=1
set enable_safepoints=0
set enable_llvm_abi_extensions=1
set unwind_info=1
set preserve_frame_pointers=1
set machine_code_cfg_info=0
set enable_probestack=1
set enable_jump_tables=1
set enable_heap_access_spectre_mitigation=1
set enable_table_access_spectre_mitigation=1
set enable_incremental_compilation_cache_checks=0
target aarch64 has_lse=0 has_pauth=0 sign_return_address_all=0 sign_return_address=0 sign_return_address_with_bkey=0 use_bti=0
function u0:9(i8, i8) -> i8 apple_aarch64 {
; symbol test
; instance Instance { def: Item(DefId(0:4 ~ main[1152]::test)), args: [] }
; abi FnAbi { args: [ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }], ret: ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, c_variadic: false, fixed_count: 2, conv: Rust, can_unwind: false }
; kind loc.idx param pass mode ty
; ssa _0 u8 1b 1, 1 var=0
; ret _0 - Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg _1 = v0 Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg _2 = v1 Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; kind local ty size align (abi,pref)
; ssa _1 u8 1b 1, 1 var=1
; ssa _2 u8 1b 1, 1 var=2
; ssa _3 bool 1b 1, 1 var=3
; ssa _4 u8 1b 1, 1 var=4
; ssa _5 (u8, bool) 2b 1, 8 var=(5, 6)
; ssa _6 u8 1b 1, 1 var=7
; ssa _7 (u8, bool) 2b 1, 8 var=(8, 9)
gv0 = symbol colocated userextname0 ; alloc10
gv1 = symbol colocated userextname2 ; alloc11
sig0 = (i64) apple_aarch64
sig1 = (i64) apple_aarch64
fn0 = u0:11 sig0 ; "_ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E"
fn1 = u0:12 sig1 ; "_ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E"
block0(v0: i8, v1: i8):
v2 -> v0
v5 -> v0
v11 -> v0
v3 -> v1
v6 -> v1
v12 -> v1
nop
; write_cvalue: Var(_1, var1): u8 <- ByVal(v0): u8
; write_cvalue: Var(_2, var2): u8 <- ByVal(v1): u8
jump block1
block1:
nop
; _3 = Gt(copy _1, copy _2)
v4 = icmp.i8 ugt v2, v3
; write_cvalue: Var(_3, var3): bool <- ByVal(v4): bool
;
; switchInt(move _3)
brif v4, block2, block4
block2:
nop
; _5 = SubWithOverflow(copy _1, copy _2)
v7 = isub.i8 v5, v6
v10 -> v7
v8 = icmp ugt v7, v5
; write_cvalue: VarPair(_5, var5, var6): (u8, bool) <- ByValPair(v7, v8): (u8, bool)
;
; assert(!move (_5.1: bool), "attempt to compute `{} - {}`, which would overflow", copy _1, copy _2)
brif v8, block7, block3
block7 cold:
nop
v9 = global_value.i64 gv0
call fn0(v9)
; lib_call _ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E
trap unreachable
block3:
nop
; _4 = move (_5.0: u8)
; write_cvalue: Var(_4, var4): u8 <- ByVal(v10): u8
; _0 = copy _4
; write_cvalue: Var(_0, var0): u8 <- ByVal(v10): u8
;
; goto
return v10
block4:
nop
; _7 = AddWithOverflow(copy _1, copy _2)
v13 = iadd.i8 v11, v12
v16 -> v13
v14 = icmp ult v13, v11
; write_cvalue: VarPair(_7, var8, var9): (u8, bool) <- ByValPair(v13, v14): (u8, bool)
;
; assert(!move (_7.1: bool), "attempt to compute `{} + {}`, which would overflow", copy _1, copy _2)
brif v14, block8, block5
block8 cold:
nop
v15 = global_value.i64 gv1
call fn1(v15)
; lib_call _ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E
trap unreachable
block5:
nop
; _6 = move (_7.0: u8)
; write_cvalue: Var(_6, var7): u8 <- ByVal(v16): u8
; _0 = copy _6
; write_cvalue: Var(_0, var0): u8 <- ByVal(v16): u8
;
; goto
return v16
block6:
v18 = iconst.i8 0
v17 -> v18
nop
;
; return
return v17 ; v17 = 0
}
Notes:
- cranelift-module uses `u0:X` to refer to function `X` within the current module (can be an import) and `u1:Y` to refer to data object `Y`.
- The `v2 -> v0` lines are aliases. So in this case all references to v2 are implicitly replaced with v0 during compilation. They mostly exist to make the SSA lowering in cranelift-frontend easier.
struct FnAbi {
args: [
ArgAbi {
layout: TyAndLayout {
ty: u8,
layout: Layout {
size: Size(1 bytes),
align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
fields: Primitive,
largest_niche: None,
variants: Single { index: 0 },
max_repr_align: None,
unadjusted_abi_align: Align(1 bytes),
}
},
mode: Direct(ArgAttributes {
regular: NoUndef,
arg_ext: None,
pointee_size: Size(0 bytes),
pointee_align: None
})
},
ArgAbi {
layout: TyAndLayout {
ty: u8,
layout: Layout {
size: Size(1 bytes),
align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
fields: Primitive,
largest_niche: None,
variants: Single { index: 0 },
max_repr_align: None,
unadjusted_abi_align: Align(1 bytes),
}
},
mode: Direct(ArgAttributes {
regular: NoUndef,
arg_ext: None,
pointee_size: Size(0 bytes),
pointee_align: None
})
}
],
ret: ArgAbi {
layout: TyAndLayout {
ty: u8,
layout: Layout {
size: Size(1 bytes),
align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
fields: Primitive,
largest_niche: None,
variants: Single { index: 0 },
max_repr_align: None,
unadjusted_abi_align: Align(1 bytes),
}
},
mode: Direct(ArgAttributes {
regular: NoUndef,
arg_ext: None,
pointee_size: Size(0 bytes),
pointee_align: None
})
},
c_variadic: false,
fixed_count: 2,
conv: Rust,
can_unwind: false
}
CLIF -> legalization -> mid-end egraph rewrites if enabled (rules in ISLE) -> lowering to backend-specific VCode (rules in ISLE) -> regalloc -> binary emission
- translates Cranelift IR to machine level IR (with machine op codes)
  - we are actually mapping the Cranelift IR to a particular target architecture
  - For example: Cranelift IR’s `iadd` instruction takes two parameters and produces a result of the same type.
  - we can map this to an equivalent instruction in, say, `x86-64` and call it an encoding.
- Legalization - the goal is to turn IR instructions into instructions that are legal for a target machine or architecture.
- Code-generators focus on optimizations such as -
- GVN (Global value numbering)
- LICM (Loop invariant code motion)
- DCE (Dead code elimination)
For each codegen'd function, three files are emitted.
- The `.unopt.clif` file contains what `cg_clif` emits.
- The `.opt.clif` file contains the result of optimizing by Cranelift, and
- The `.vcode` file contains the IR right before Cranelift emits machine code bytes.
- The Cranelift project includes `clif-util`, a Cranelift code generator utility that can be used for various tasks such as testing, execution, and interpretation.
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.37s
Running `/Users/nihal.pasham/devspace/compiler/wasmtime/target/debug/clif-util -h`
Cranelift code generator utility
Usage: clif-util <COMMAND>
Commands:
test Run Cranelift tests
run Execute clif code and verify with test expressions
interpret Interpret clif code
cat Outputs .clif file
print-cfg Prints out cfg in GraphViz Dot format
compile Compiles Cranelift IR into target language
pass Run specified pass(es) on an input file
bugpoint Reduce size of clif file causing panic during compilation
souper-harvest Harvest candidates for superoptimization from a Wasm or Clif file
help Print this message or the help of the given subcommand(s)
Options:
-h, --help Print help
Example:
cargo run -- test filetests/filetests/isa/aarch64/arithmetic.clif
Terminologies:
- `CLIF` - Cranelift IR
- `VCode` - Virtual Register Code
- `regalloc` - Register Allocation
- `binemit` - Binary Emission
Flow: Translating (high-level IR) CLIF to (low-level IR) VCode is called lowering.
- As VCode is target-dependent, the translation process is also target-specific.
- This is where we consider which machine instructions will eventually be used for a given CLIF opcode.
- There are many ways to achieve the same machine state results for given semantics, but some of these ways are faster than others and/or require fewer code bytes to achieve.
- The problem can be summed up like this:
  - given some CLIF, which VCode can we create
    - to generate the fastest
    - and/or smallest machine code
    - that carries out the desired semantics?
- This is called instruction selection, because we're selecting the VCode instructions from a set of different possible instructions.
-
- A given CLIF node may be lowered into
- 1 to N VCode instructions.
- A given VCode instruction may lead to the generation of
- 1 to M machine instructions.
- There are no rules governing the maximum number of entities mapped.
- For instance, the integer addition CLIF opcode `iadd` on 64-bit inputs maps to a single VCode instruction on AArch64.
- The VCode instruction then causes a single machine instruction to be generated.
- ISLE is a domain specific language (DSL)
- stands for instruction selection or lowering expressions.
- ISLE is a statically-typed term-rewriting language.
- You define rewriting rules that map input terms (`clif` instructions) into output terms (`MachInsts`).
- These rules get compiled down into Rust source that uses a tree of match expressions that is as good or better than what you would have written by hand.
- [The Context trait:](https://github.com/bytecodealliance/wasmtime/blob/main/cranelift/docs/isle-integration.md#gluing-isles-generated-code-into-cranelift) Each ISA-specific, ISLE-generated file is generic over a `Context` trait that has a trait method for each extern helper defined in ISLE.
- `ISLE build process:` The ISLE compiler is set up as a build dependency, and the build script uses it to compile ISLE source into generated Rust code.
- A normal build typically generates an `out` folder within the `target` directory, containing all ISLE-to-Rust source translations.
The ISLE README.md has a pretty good example.
Describes ISLE constructors, external constructors, extractors
A term can be declared as both a constructor and an extractor because:
- A constructor allows you to create the type. If it’s used in an expression (RHS) that creates a new value, it’s a constructor.
- An extractor allows you to pattern-match and de-structure the type. If it’s used in a pattern (LHS) that de-structures or matches an existing value, it’s an extractor. This versatility lets ISLE handle different scenarios more concisely and efficiently, using the same name in contexts where each role is clear.
- The register allocation problem involves mapping intermediate representation (IR) values, often referred to as registers (or Reg in VCode), to machine-specific "containers" such as physical registers or memory locations. This process is known as register allocation (regalloc).
The implementation uses an abstraction that includes the following types:
- `Operand`: virtual registers (or `VReg`s) defined or used in an instruction, along with their constraints.
- `Function`: A trait implemented by the client that provides the allocator with information about function blocks, block instructions, instruction operands, block predecessors, successors, and additional details.
- `MachineEnv`: A struct that holds information about resources on the target machine, including physical registers for each register class, scratch registers, and registers designated as fixed stack slots.
- `Allocation`: An `Allocation` represents the end result of regalloc for an `Operand`.
- `Output`: The output of the register allocator.
  - among other things, holds a `Vec<Allocation>` - the main output that contains the final allocations for all operands across all instructions in the function.
  - `edits` - new instructions to be inserted at specific program points (before or after instructions, mostly stack spills).
- Virtual Registers: Inputs to register allocation are numerous and map to `virtual` values, known as virtual registers. The `V` in VCode stands for these virtual registers. VCode instructions reference values that are virtual registers before register allocation, hence the code is in virtualized register form.
- Output of Register Allocation: The output is a set of instructions where virtual registers have been replaced by physical registers or stack slot references, constrained by the limited number of physical registers available on the machine. Additional metadata may also be produced to assist with this mapping.
- Purpose and Challenges: The main challenge in register allocation is efficiently utilizing the limited physical registers while minimizing the need to spill registers to memory, which can degrade performance. This involves deciding which variables should be kept in registers and which should be moved to memory, a process that typically includes techniques like graph coloring and live range splitting.
- Register allocation may create an arbitrary number of spills, reloads, and register moves[^Regalloc2 impl] around VCode instructions to ensure that their register allocation constraints are met.
- This is why the output of register allocation is a new list of instructions that includes not only the initial instructions filled with the actual registers but also additional spill, reload, and move (VCode) instructions added by regalloc.
- A data structure that represents an equivalence relationship over terms
- It’s composed of equivalence classes or `e-classes`, which are sets of `e-nodes`
- These e-nodes are basically operators in a given language
  - Like `/` or `*` operators
  - Or childless operators like `a` and `2`
- an operator takes as its children not another operator but a whole family of equivalent operators (i.e. children are e-classes themselves)
- We can grow our e-graph by adding equivalence rewrites to it - like so
- Final rewrite doesn’t add any more e-nodes but it does decrease the number of e-classes
- So, we shrank the e-graph but it contains more equivalences
- if you kept adding rewrites, you may get to a point where you are not adding any more information to the e-graph (i.e. we are not adding new e-classes or e-nodes)
- This is called saturation or equality saturation
- the last step is to extract
- extraction is a procedure which allows us to pick out the best represented term (i.e. the best possible equivalent term) according to some cost function from a specific e-class (i.e. the initial term)
- There’s a lot of ways to extract
Idempotent means that performing the same operation multiple times has the same effect as performing it once.