Last active
June 3, 2025 10:34
-
-
Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
chain simulation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <math.h> | |
enum { | |
MAX_CHAINS = 64, | |
PARTICLES_PER_CHAIN = 16, | |
SPHERES_PER_CHAIN = 8, | |
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN), | |
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN), | |
CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN, | |
MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN), | |
MAX_ITERATIONS = 4, | |
}; | |
struct chain_cfg { | |
float damping; | |
float stiffness; | |
float stretch_factor; | |
float max_strain; | |
float friction; | |
float collision_radius; | |
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion | |
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation | |
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force | |
} __attribute__((aligned(32))); | |
struct chain_sim { | |
unsigned short free_idx_cnt; | |
unsigned short free_idx[MAX_CHAINS]; | |
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32))); | |
// Chain global forces (world space gravity and wind) | |
float gravity_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_z[MAX_CHAINS] __attribute__((aligned(32))); | |
// Chain transformations (world space) | |
float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); | |
// Particle positions (model space) | |
float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32))); | |
// Sphere positions (model space) | |
float sphere_x[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_y[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_z[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32))); | |
// Rest positions (model space) | |
float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float motion_radius[MAX_PARTICLES] __attribute__((aligned(32))); | |
}; | |
void simulate_chains(struct chain_sim *cs, float dt) { | |
// Common SIMD constants | |
const __m256 dt_vec = _mm256_set1_ps(dt); | |
const __m256 dt2_vec = _mm256_set1_ps(dt * dt); | |
const __m256 one_vec = _mm256_set1_ps(1.0f); | |
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); | |
const __m256 eps_vec = _mm256_set1_ps(1e-6f); | |
const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f); | |
const __m256 zero_vec = _mm256_setzero_ps(); | |
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); | |
const __m256 two_vec = _mm256_set1_ps(2.0f); | |
const __m256 half_vec = _mm256_set1_ps(0.5f); | |
const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f); | |
const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f); | |
const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f); | |
const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f); | |
const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f); | |
const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f); | |
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); | |
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
// Local arrays for PBD solver | |
float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
// Initialize dummy elements | |
px_local[PARTICLES_PER_CHAIN] = 0.0f; | |
py_local[PARTICLES_PER_CHAIN] = 0.0f; | |
pz_local[PARTICLES_PER_CHAIN] = 0.0f; | |
pm_local[PARTICLES_PER_CHAIN] = 0.0f; | |
// Stack arrays for precomputed data | |
float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); | |
// Step 0: Precomputation (SIMD, 8 chains at once) | |
for (int c = 0; c < MAX_CHAINS; c += 8) { | |
// Load chain quaternions | |
__m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]); | |
__m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); | |
__m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]); | |
__m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); | |
// Compute local-space wind | |
{ | |
__m256 vx = _mm256_load_ps(&cs->wind_x[c]); | |
__m256 vy = _mm256_load_ps(&cs->wind_y[c]); | |
__m256 vz = _mm256_load_ps(&cs->wind_z[c]); | |
// Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: result_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// result_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// result_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// result_z = tw*qz + ty*qw + tx*qy - ty*qx | |
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); | |
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); | |
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); | |
_mm256_store_ps(&wind_local_x[c], result_x); | |
_mm256_store_ps(&wind_local_y[c], result_y); | |
_mm256_store_ps(&wind_local_z[c], result_z); | |
} | |
// Compute local-space gravity | |
{ | |
__m256 vx = _mm256_load_ps(&cs->gravity_x[c]); | |
__m256 vy = _mm256_load_ps(&cs->gravity_y[c]); | |
__m256 vz = _mm256_load_ps(&cs->gravity_z[c]); | |
// Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: result_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// result_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// result_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// result_z = tw*qz + ty*qw + tx*qy - ty*qx | |
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); | |
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); | |
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); | |
_mm256_store_ps(&gravity_local_x[c], result_x); | |
_mm256_store_ps(&gravity_local_y[c], result_y); | |
_mm256_store_ps(&gravity_local_z[c], result_z); | |
} | |
// Compute linear velocity | |
{ | |
__m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]); | |
__m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]); | |
__m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]); | |
__m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]); | |
__m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]); | |
__m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]); | |
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); | |
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); | |
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec); | |
_mm256_store_ps(&vel_x[c], vel_x_vec); | |
_mm256_store_ps(&vel_y[c], vel_y_vec); | |
_mm256_store_ps(&vel_z[c], vel_z_vec); | |
} | |
// Compute angular velocity (SIMD with polynomial approximations) | |
{ | |
__m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]); | |
__m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]); | |
__m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]); | |
__m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]); | |
// Quaternion inverse | |
__m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy)); | |
norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq); | |
norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq); | |
__m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec)); | |
__m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq); | |
__m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq); | |
__m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq); | |
__m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq); | |
// Delta quaternion: q_current * q_prev^-1 | |
__m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx)); | |
delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx)); | |
__m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy)); | |
delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy)); | |
__m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz)); | |
delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz)); | |
__m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw)); | |
delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw); | |
delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw); | |
// Angular velocity with polynomial approximations | |
__m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec); | |
// Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))) | |
__m256 x = qw_clamped; | |
__m256 x2 = _mm256_mul_ps(x, x); | |
__m256 x4 = _mm256_mul_ps(x2, x2); | |
__m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec); | |
poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec); | |
poly = _mm256_fmadd_ps(x2, poly, one_vec); | |
poly = _mm256_mul_ps(x, poly); | |
__m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly)); | |
// Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2)) | |
__m256 half_angle = _mm256_mul_ps(angle, half_vec); | |
__m256 ha2 = _mm256_mul_ps(half_angle, half_angle); | |
__m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec); | |
sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec); | |
__m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly); | |
__m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec)); | |
// Sign: Use qw > 0 instead of angle < π | |
__m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ); | |
sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec)); | |
// Angular velocity: (sign * qx * inv_sin_half_angle / dt) | |
__m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec)); | |
__m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale); | |
__m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale); | |
__m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale); | |
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); | |
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); | |
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); | |
} | |
} | |
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
// Load precomputed velocities and inertia parameters | |
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); | |
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); | |
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); | |
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); | |
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); | |
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]); | |
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia); | |
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); | |
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); | |
// Clamp inertia parameters to [0, 1] | |
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
// Linear inertia: v * dt * linear_inertia | |
__m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); | |
// Angular inertia: (ω × r) * dt * angular_inertia | |
__m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec); | |
ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec); | |
ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec); | |
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia | |
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y)); | |
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); | |
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); | |
// Calculate displacement: (ω × (ω × r)) * dt^2 | |
__m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec); | |
__m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec); | |
__m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec); | |
// Apply the centrifugal_inertia factor | |
__m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec); | |
__m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec); | |
__m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec); | |
// Total delta | |
__m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x); | |
__m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y); | |
__m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z); | |
// Update positions | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z)); | |
} | |
// Update previous transformation | |
cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; | |
cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; | |
cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; | |
cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; | |
cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; | |
cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; | |
cs->prev_chain_quat_w[c] = cs->chain_quat_w[c]; | |
} | |
// Step 2: Verlet Integration | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]); | |
__m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]); | |
__m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]); | |
__m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]); | |
__m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]); | |
__m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]); | |
__m256 damping_vec = _mm256_set1_ps(cfg->damping); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]); | |
__m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec); | |
__m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec); | |
__m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec); | |
__m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass); | |
__m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass); | |
__m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec); | |
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec); | |
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec); | |
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec); | |
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec); | |
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x)); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z); | |
_mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x); | |
_mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y); | |
_mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z); | |
} | |
} | |
// Step 3: Distance Constraints | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int r_base = c * CONSTRAINTS_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 stiffness_vec = _mm256_set1_ps(cfg->stiffness); | |
__m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor); | |
__m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain); | |
__m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min); | |
_mm256_store_ps(&px_local[i], px); | |
_mm256_store_ps(&py_local[i], py); | |
_mm256_store_ps(&pz_local[i], pz); | |
_mm256_store_ps(&pm_local[i], pm); | |
} | |
for (int iter = 0; iter < MAX_ITERATIONS; iter++) { | |
// First block (i=0 to 7) | |
{ | |
__m256 p0x = _mm256_load_ps(&px_local[0]); | |
__m256 p0y = _mm256_load_ps(&py_local[0]); | |
__m256 p0z = _mm256_load_ps(&pz_local[0]); | |
__m256 p0m = _mm256_load_ps(&pm_local[0]); | |
__m256 p1x = _mm256_loadu_ps(&px_local[1]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[1]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[1]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[1]); | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]); | |
__m256 dx = _mm256_sub_ps(p1x, p0x); | |
__m256 dy = _mm256_sub_ps(p1y, p0y); | |
__m256 dz = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 diff = _mm256_sub_ps(dist, rest_len_vec); | |
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec); | |
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); | |
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); | |
__m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); | |
__m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom)); | |
__m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec); | |
corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum); | |
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part); | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
__m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
_mm256_store_ps(&px_local[0], _mm256_fmadd_ps(delta_x, p0m, p0x)); | |
_mm256_store_ps(&py_local[0], _mm256_fmadd_ps(delta_y, p0m, p0y)); | |
_mm256_store_ps(&pz_local[0], _mm256_fmadd_ps(delta_z, p0m, p0z)); | |
_mm256_store_ps(&px_local[1], _mm256_fnmadd_ps(delta_x, p1m, p1x)); | |
_mm256_store_ps(&py_local[1], _mm256_fnmadd_ps(delta_y, p1m, p1y)); | |
_mm256_store_ps(&pz_local[1], _mm256_fnmadd_ps(delta_z, p1m, p1z)); | |
} | |
// Second block (i=8 to 14) | |
{ | |
__m256 p0x = _mm256_load_ps(&px_local[8]); | |
__m256 p0y = _mm256_load_ps(&py_local[8]); | |
__m256 p0z = _mm256_load_ps(&pz_local[8]); | |
__m256 p0m = _mm256_load_ps(&pm_local[8]); | |
__m256 p1x = _mm256_loadu_ps(&px_local[9]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[9]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[9]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[9]); | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]); | |
__m256 dx = _mm256_sub_ps(p1x, p0x); | |
__m256 dy = _mm256_sub_ps(p1y, p0y); | |
__m256 dz = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 diff = _mm256_sub_ps(dist, rest_len_vec); | |
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec); | |
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); | |
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); | |
__m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); | |
__m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom)); | |
__m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec); | |
corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum); | |
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part); | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
__attribute__((aligned(32))) float valid_mask_array[8] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, 0.0f }; | |
__m256 valid_mask = _mm256_load_ps(valid_mask_array); | |
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); | |
__m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
_mm256_store_ps(&px_local[8], _mm256_fmadd_ps(delta_x, p0m, p0x)); | |
_mm256_store_ps(&py_local[8], _mm256_fmadd_ps(delta_y, p0m, p0y)); | |
_mm256_store_ps(&pz_local[8], _mm256_fmadd_ps(delta_z, p0m, p0z)); | |
_mm256_storeu_ps(&px_local[9], _mm256_fnmadd_ps(delta_x, p1m, p1x)); | |
_mm256_storeu_ps(&py_local[9], _mm256_fnmadd_ps(delta_y, p1m, p1y)); | |
_mm256_storeu_ps(&pz_local[9], _mm256_fnmadd_ps(delta_z, p1m, p1z)); | |
} | |
} | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&px_local[i]); | |
__m256 py = _mm256_load_ps(&py_local[i]); | |
__m256 pz = _mm256_load_ps(&pz_local[i]); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], px); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], py); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], pz); | |
} | |
} | |
// Step 4: Sphere Collisions with Friction | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int s_base = c * SPHERES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 friction_vec = _mm256_set1_ps(cfg->friction); | |
for (int s = 0; s < SPHERES_PER_CHAIN; s++) { | |
__m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]); | |
__m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]); | |
__m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]); | |
__m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); | |
__m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); | |
__m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(sphere_r_vec, dist); | |
__m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 normal_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 normal_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 normal_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 normal_corr_x = _mm256_mul_ps(normal_x, penetration); | |
__m256 normal_corr_y = _mm256_mul_ps(normal_y, penetration); | |
__m256 normal_corr_z = _mm256_mul_ps(normal_z, penetration); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x); | |
vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal); | |
vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal); | |
__m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x)); | |
__m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y)); | |
__m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z)); | |
__m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq); | |
__m256 inv_tangent_mag = _mm256_rsqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)); | |
__m256 friction_mag = _mm256_mul_ps(penetration, friction_vec); | |
__m256 friction_x = _mm256_mul_ps(_mm256_mul_ps(tangent_x, inv_tangent_mag), friction_mag); | |
__m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag); | |
__m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag); | |
__m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x); | |
__m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y); | |
__m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z); | |
total_corr_x = _mm256_and_ps(total_corr_x, collision_mask); | |
total_corr_y = _mm256_and_ps(total_corr_y, collision_mask); | |
total_corr_z = _mm256_and_ps(total_corr_z, collision_mask); | |
total_corr_x = _mm256_mul_ps(total_corr_x, p_inv_mass); | |
total_corr_y = _mm256_mul_ps(total_corr_y, p_inv_mass); | |
total_corr_z = _mm256_mul_ps(total_corr_z, p_inv_mass); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], new_p_x); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], new_p_y); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], new_p_z); | |
} | |
} | |
} | |
// Step 5: Motion Constraints | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]); | |
__m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]); | |
__m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]); | |
__m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]); | |
__m256 dx = _mm256_sub_ps(px, rx); | |
__m256 dy = _mm256_sub_ps(py, ry); | |
__m256 dz = _mm256_sub_ps(pz, rz); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(dist, radius_vec); | |
__m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 corr_factor = _mm256_mul_ps(penetration, inv_dist); | |
corr_factor = _mm256_and_ps(corr_factor, mask); | |
__m256 delta_x = _mm256_mul_ps(dx, corr_factor); | |
__m256 delta_y = _mm256_mul_ps(dy, corr_factor); | |
__m256 delta_z = _mm256_mul_ps(dz, corr_factor); | |
delta_x = _mm256_mul_ps(delta_x, pm); | |
delta_y = _mm256_mul_ps(delta_y, pm); | |
delta_z = _mm256_mul_ps(delta_z, pm); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z)); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment