Cranelift

#idempotent

Cranelift IR:

Functions are compiled individually
The instructions in the function body use and produce values in SSA form i.e. Single Static Assignment
- SSA construction pushed to the frontend - done with the cranelift_frontend crate
- The cranelift_frontend crate contains utilities for translating from programs containing multiple assignments to the same variables into SSA form for Cranelift [IR].
Cranelift’s IR is typed i.e. All SSA values have a type which determines the size and shape of the value.
- Integer types:
- Floating point types:
- SIMD vector types:
Memory instructions:
- has fully general load and store instructions for accessing memory.
Uses traditional CFG (control-flow graphs) using basic blocks.

rustc’s cg_clif compiles the following:

../rustc_codegen_cranelift/dist/bin/rustc-clif --emit=llvm-ir src/main.rs

#[no_mangle]
fn test(a: u8, b: u8) -> u8 {
    if a > b {
        let c = a - b;
        c
    } else {
        let c = a + b;
        c
    }
}

to this

; compiler options
set opt_level=none
set tls_model=macho
set libcall_call_conv=isa_default
set probestack_size_log2=12
set probestack_strategy=inline
set bb_padding_log2_minus_one=0
set regalloc_checker=0
set regalloc_verbose_logs=0
set enable_alias_analysis=1
set enable_verifier=0
set enable_pcc=0
set is_pic=1
set use_colocated_libcalls=0
set enable_float=1
set enable_nan_canonicalization=0
set enable_pinned_reg=0
set enable_atomics=1
set enable_safepoints=0
set enable_llvm_abi_extensions=1
set unwind_info=1
set preserve_frame_pointers=1
set machine_code_cfg_info=0
set enable_probestack=1
set enable_jump_tables=1
set enable_heap_access_spectre_mitigation=1
set enable_table_access_spectre_mitigation=1
set enable_incremental_compilation_cache_checks=0
target aarch64 has_lse=0 has_pauth=0 sign_return_address_all=0 sign_return_address=0 sign_return_address_with_bkey=0 use_bti=0


function u0:9(i8, i8) -> i8 apple_aarch64 {
; symbol test
; instance Instance { def: Item(DefId(0:4 ~ main[1152]::test)), args: [] }
; abi FnAbi { args: [ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }], ret: ArgAbi { layout: TyAndLayout { ty: u8, layout: Layout { size: Size(1 bytes), align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) }, abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }), fields: Primitive, largest_niche: None, variants: Single { index: 0 }, max_repr_align: None, unadjusted_abi_align: Align(1 bytes) } }, mode: Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) }, c_variadic: false, fixed_count: 2, conv: Rust, can_unwind: false }

; kind  loc.idx   param    pass mode                            ty
; ssa   _0    u8                                1b 1, 1              var=0
; ret   _0      -          Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg   _1      = v0       Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8
; arg   _2      = v1       Direct(ArgAttributes { regular: NoUndef, arg_ext: None, pointee_size: Size(0 bytes), pointee_align: None }) u8

; kind  local ty                              size align (abi,pref)
; ssa   _1    u8                                1b 1, 1              var=1
; ssa   _2    u8                                1b 1, 1              var=2
; ssa   _3    bool                              1b 1, 1              var=3
; ssa   _4    u8                                1b 1, 1              var=4
; ssa   _5    (u8, bool)                        2b 1, 8              var=(5, 6)
; ssa   _6    u8                                1b 1, 1              var=7
; ssa   _7    (u8, bool)                        2b 1, 8              var=(8, 9)

    gv0 = symbol colocated userextname0 ; alloc10
    gv1 = symbol colocated userextname2 ; alloc11
    sig0 = (i64) apple_aarch64
    sig1 = (i64) apple_aarch64
    fn0 = u0:11 sig0 ; "_ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E"
    fn1 = u0:12 sig1 ; "_ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E"

block0(v0: i8, v1: i8):
    v2 -> v0
    v5 -> v0
    v11 -> v0
    v3 -> v1
    v6 -> v1
    v12 -> v1
    nop 
; write_cvalue: Var(_1, var1): u8 <- ByVal(v0): u8
; write_cvalue: Var(_2, var2): u8 <- ByVal(v1): u8
    jump block1

block1:
    nop 
; _3 = Gt(copy _1, copy _2)
    v4 = icmp.i8 ugt v2, v3
; write_cvalue: Var(_3, var3): bool <- ByVal(v4): bool
; 
; switchInt(move _3)
    brif v4, block2, block4

block2:
    nop 
; _5 = SubWithOverflow(copy _1, copy _2)
    v7 = isub.i8 v5, v6
    v10 -> v7
    v8 = icmp ugt v7, v5
; write_cvalue: VarPair(_5, var5, var6): (u8, bool) <- ByValPair(v7, v8): (u8, bool)
; 
; assert(!move (_5.1: bool), "attempt to compute `{} - {}`, which would overflow", copy _1, copy _2)
    brif v8, block7, block3

block7 cold:
    nop 
    v9 = global_value.i64 gv0
    call fn0(v9)
; lib_call _ZN4core9panicking11panic_const24panic_const_sub_overflow17h69a7109fa301a030E
    trap unreachable

block3:
    nop 
; _4 = move (_5.0: u8)
; write_cvalue: Var(_4, var4): u8 <- ByVal(v10): u8
; _0 = copy _4
; write_cvalue: Var(_0, var0): u8 <- ByVal(v10): u8
; 
; goto
    return v10

block4:
    nop 
; _7 = AddWithOverflow(copy _1, copy _2)
    v13 = iadd.i8 v11, v12
    v16 -> v13
    v14 = icmp ult v13, v11
; write_cvalue: VarPair(_7, var8, var9): (u8, bool) <- ByValPair(v13, v14): (u8, bool)
; 
; assert(!move (_7.1: bool), "attempt to compute `{} + {}`, which would overflow", copy _1, copy _2)
    brif v14, block8, block5

block8 cold:
    nop 
    v15 = global_value.i64 gv1
    call fn1(v15)
; lib_call _ZN4core9panicking11panic_const24panic_const_add_overflow17h1c3192e1cfd46512E
    trap unreachable

block5:
    nop 
; _6 = move (_7.0: u8)
; write_cvalue: Var(_6, var7): u8 <- ByVal(v16): u8
; _0 = copy _6
; write_cvalue: Var(_0, var0): u8 <- ByVal(v16): u8
; 
; goto
    return v16

block6:
    v18 = iconst.i8 0
    v17 -> v18
    nop 
; 
; return
    return v17  ; v17 = 0
}

Notes:

cranelift-module uses u0:X to refer to function X within the current module (can be an import) and u1:Y to refer to data object Y.
The v2 -> v0 are aliases. So in this case all references to v2 are implicitly replaced with v0 during compilation. They mostly exist to make the SSA lowering in cranelift-frontend easier.

struct FnAbi {
    args: [
        ArgAbi {
            layout: TyAndLayout {
                ty: u8,
                layout: Layout {
                    size: Size(1 bytes),
                    align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
                    abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
                    fields: Primitive,
                    largest_niche: None,
                    variants: Single { index: 0 },
                    max_repr_align: None,
                    unadjusted_abi_align: Align(1 bytes),
                }
            },
            mode: Direct(ArgAttributes { 
                regular: NoUndef, 
                arg_ext: None, 
                pointee_size: Size(0 bytes), 
                pointee_align: None 
            })
        },
        ArgAbi {
            layout: TyAndLayout {
                ty: u8,
                layout: Layout {
                    size: Size(1 bytes),
                    align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
                    abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
                    fields: Primitive,
                    largest_niche: None,
                    variants: Single { index: 0 },
                    max_repr_align: None,
                    unadjusted_abi_align: Align(1 bytes),
                }
            },
            mode: Direct(ArgAttributes { 
                regular: NoUndef, 
                arg_ext: None, 
                pointee_size: Size(0 bytes), 
                pointee_align: None 
            })
        }
    ],
    ret: ArgAbi {
        layout: TyAndLayout {
            ty: u8,
            layout: Layout {
                size: Size(1 bytes),
                align: AbiAndPrefAlign { abi: Align(1 bytes), pref: Align(1 bytes) },
                abi: Scalar(Initialized { value: Int(I8, false), valid_range: 0..=255 }),
                fields: Primitive,
                largest_niche: None,
                variants: Single { index: 0 },
                max_repr_align: None,
                unadjusted_abi_align: Align(1 bytes),
            }
        },
        mode: Direct(ArgAttributes { 
            regular: NoUndef, 
            arg_ext: None, 
            pointee_size: Size(0 bytes), 
            pointee_align: None 
        })
    },
    c_variadic: false,
    fixed_count: 2,
    conv: Rust,
    can_unwind: false
}

A High-Level Overview of Cranelift’s Pipeline:

CLIF -> legalization -> mid-end egraph rewrites if enabled (rules in ISLE) -> lowering to backend-specific VCode (rules in ISLE) -> regalloc -> binary emission

Cranelift Code-generator:

translates Cranelift IR to machine level IR (with machine op codes)
- we are actually mapping the Cranelift IR to a particular target architecture
  - For example: Cranelift IR’s iadd instruction takes two parameters and produces a parameter of the same type.
  - we can map this to an equivalent instruction in say x86-64 and call it an encoding.
  - Legalization - the goal is to turn IR instructions into instructions that are legal for a target machine or architecture.
Code-generators focus on optimizations such as -
- GVN (Global value numbering)
- LICM (Loop invariant code motion)
- DCE (Dead code elimination)

For each codegend function three files are emitted.

The .unopt.clif file contains what cg_clif emits.
The .opt.clif file contains the result of optimizing by Cranelift and
The .vcode file contains the ir right before cranelift emits machine code bytes.

Testing Cranelift

The Cranelift project includes clif-util, a Cranelift code generator utility that can be used for various tasks such as testing, execution, and interpretation.

    Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.37s
     Running `/Users/nihal.pasham/devspace/compiler/wasmtime/target/debug/clif-util -h`
Cranelift code generator utility

Usage: clif-util <COMMAND>

Commands:
  test            Run Cranelift tests
  run             Execute clif code and verify with test expressions
  interpret       Interpret clif code
  cat             Outputs .clif file
  print-cfg       Prints out cfg in GraphViz Dot format
  compile         Compiles Cranelift IR into target language
  pass            Run specified pass(es) on an input file
  bugpoint        Reduce size of clif file causing panic during compilation
  souper-harvest  Harvest candidates for superoptimization from a Wasm or Clif file
  help            Print this message or the help of the given subcommand(s)

Options:
  -h, --help  Print help

Example:

cargo run -- test filetests/filetests/isa/aarch64/arithmetic.clif

Craneflit Codegen (post mid-end optimization)

Terminologies:

CLIF - Cranelift IR
VCode - Virtual Register Code
regalloc - Register Allocation
binemit - Binary Emission

Flow: Translating (high-level IR) CLIF to (low-level IR) VCode is called lowering.

As VCode is target-dependent, the translation process is also target-specific.
This is where we consider which machine instructions will eventually be used for a given CLIF opcode.
There are many ways to achieve the same machine state results for given semantics, but some of these ways are faster than others and/or require fewer code bytes to achieve.
The problem can be summed up like this:
- given some CLIF,
- which VCode can we create
- to generate the fastest
- and/or smallest machine code
- that carries out the desired semantics?
This is called instruction selection, because we're selecting the VCode instructions from a set of different possible instructions.
A given CLIF node may be lowered into
- 1 to N VCode instructions.
A given VCode instruction may lead to the generation of
- 1 to M machine instructions.
There are no rules governing the maximum number of entities mapped.
For instance, the integer addition CLIF opcode iadd on 64-bit inputs maps to a single VCode instruction on AArch64.
The VCode instruction then causes a single machine instruction to be generated.

ISLE:

- ISLE is a domain specific language (DSL) - stands for instruction selection or lowering expressions. - ISLE is a statically-typed term-rewriting language. - You define rewriting rules that map input terms (`clif` instructions) into output terms (`MachInsts`). - These rules get compiled down into Rust source that uses a tree of match expressions that is as good or better than what you would have written by hand. - [The Context trait:](https://github.com/bytecodealliance/wasmtime/blob/main/cranelift/docs/isle-integration.md#gluing-isles-generated-code-into-cranelift) Each ISA-specific, ISLE-generated file is generic over a `Context` trait that has a trait method for each extern helper defined in ISLE. - `ISLE build process:` The ISLE compiler is set up as a build dependency, and the build script uses it to compile ISLE source into generated Rust code. - A normal build typically generates an `out` folder within the `target` directory, containing all ISLE-to-Rust source translations.

The ISLE README.md has a pretty good example.

Describes ISLE constructors, external constructors, extractors

A term can be declared as both a constructor and an extractor because:

A constructor allows you to create the type. If it’s used in an expression (RHS) that creates a new value, it’s a constructor.

An extractor allows you to pattern-match and de-structure the type. If it’s used in a pattern (LHS) that de-structures or matches an existing value, it’s an extractor. This versatility lets ISLE handle different scenarios more concisely and efficiently, using the same name in contexts where each role is clear.

Register Allocation:

The register allocation problem involves mapping intermediate representation (IR) values, often referred to as registers (or Reg in VCode), to machine-specific "containers" such as physical registers or memory locations. This process is known as register allocation (regalloc).

Regalloc2 implementation:

The implementation uses an abstraction that includes the following types:

Operand: virtual registers (or VRegs) defined or used in an instruction, along with their constraints.
Function: A trait implemented by the client that provides the allocator with information about function blocks, block instructions, instruction operands, block predecessors, successors, and additional details.
MachinEnv: A struct that holds information about resources on the target machine, including physical registers for each register class, scratch registers, and registers designated as fixed stack slots.
Allocation: An ‘Allocation’ represents the end result of regalloc for an Operand.
Output: The output of the register allocator.
- among other things, holds a Vec<Allocation> ’s - the main output that contains the final allocations for all operands across all instructions in the function.
- edits - new instructions to be inserted at specific program points (before or after instructions, mostly stackspills).

Key Points:

Virtual Registers: Inputs to register allocation are numerous and map to virtual values, known as virtual registers. The V in VCode stands for these virtual registers. VCode instructions reference values that are virtual registers before register allocation, hence the code is in virtualized register form.
Output of Register Allocation: The output is a set of instructions where virtual registers have been replaced by physical registers or stack slot references, constrained by the limited number of physical registers available on the machine. Additional metadata may also be produced to assist with this mapping.
Purpose and Challenges: The main challenge in register allocation is efficiently utilizing the limited physical registers while minimizing the need to spill registers to memory, which can degrade performance. This involves deciding which variables should be kept in registers and which should be moved to memory, a process that typically includes techniques like graph coloring and live range splitting.

Notes:

Register allocation may create an arbitrary number of spills, reloads, and register moves[^Regalloc2 impl] around VCode instructions to ensure that their register allocation constraints are met.
This is why the output of register allocation is a new list of instructions that includes not only the initial instructions filled with the actual registers but also additional spill, reload, and move (VCode) instructions added by regalloc.

E-graphs:

A data structure that represents an equivalence relationship over terms
It’s composed of equivalence classes or e-classes which are sets of e-nodes
These e-nodes are basically operators in a given language
- Like /or * operators
- Or childless operators like a and 2

- an operator takes as its children not another operator but a whole family of equivalent operators (i.e. children are e-classes themselves) - We can grow our e-graph by adding equivalence rewrites to it - like so

- Final rewrite doesn’t add any more e-nodes but it does decrease the number of e-classes - So, we shrank the e-graph but it contains more equivalences

- if you kept adding rewrites, you may get to a point where you are not adding any more information to the e-graph (i.e. we are not adding new e-classes or e-nodes) - This is called saturation or equality saturation

- the last step is to extract - extraction is a procedure which allows us to pick out the best represented term (i.e. the best possible equivalent term) according to some cost function from a specific e-class (i.e. the initial term) - There’s a lot of ways to extract

Idempotent means that performing the same operation multiple times has the same effect as performing it once.

nihalpasham/crane lift_gist.md