Test Program

// Transpose a 4x4 matrix of single precision floats stored // as 4 vector floats using endogenous variables: // transpose1, transpose2, transepose3, transpose4#define TRANSPOSE4_NoVar( a, b, c, d ) \ { \

transpose1 = vec_mergeh( a, c ); \ transpose2 = vec_mergeh( b, d ); \ transpose3 = vec_mergel( a, c ); \ transpose4 = vec_mergel( b, d ); \ a = vec_mergeh( transpose1, transpose2 ); \ b = vec_mergel( transpose1, transpose2 ); \ c = vec_mergeh( transpose3, transpose4 ); \ d = vec_mergel( transpose3, transpose4 ); \

}

//This one is a bit harder to follow, because some registers //had to be reused to avoid running out. This function uses //31 of the 32 available registers! Avoiding register spilling //and explicitly inlining the reciprocal square root function, //which wasn't being inlined properly, more than doubled the //speed of this function.

// Normalize 16 vectors of the type { X, Y, Z, W }void NormalizeUniform16( vector float v[16] ) {

register vector float temp1, temp2, temp3, temp4; register vector float tempA, tempB, tempC, tempD; register vector float transpose1, transpose2; register vector float transpose3, transpose4; register vector float zero, oneHalf, one;
//Initialize our constantsoneHalf = vec_ctf( vec_splat_u32( 1 ), 1 ); one = vec_ctf( vec_splat_u32( 1 ), 0 ); zero = (vector float) vec_splat_u32(0);

//Load in the sixteen vectors into x, y, z, w //These wont be vectors full of x, y, z, and w //until after the transposeregister vector float x1 = v[0]; register vector float y1 = v[1]; register vector float z1 = v[2]; register vector float w1 = v[3]; register vector float x2 = v[4]; register vector float y2 = v[5]; register vector float z2 = v[6]; register vector float w2 = v[7]; register vector float x3 = v[8]; register vector float y3 = v[9]; register vector float z3 = v[10]; register vector float w3 = v[11]; register vector float x4 = v[12]; register vector float y4 = v[13]; register vector float z4 = v[14]; register vector float w4 = v[15];

//Transpose the 4 vectors to create 4 uniform vectors for //X, Y, Z and WTRANSPOSE4_NoVar( x1, y1, z1, w1 ); TRANSPOSE4_NoVar( x2, y2, z2, w2 ); TRANSPOSE4_NoVar( x3, y3, z3, w3 ); TRANSPOSE4_NoVar( x4, y4, z4, w4 );

// length2 = X2 + Y2 + Z2temp1 = vec_madd( x1, x1, zero ); temp2 = vec_madd( x2, x2, zero ); temp3 = vec_madd( x3, x3, zero ); temp4 = vec_madd( x4, x4, zero );

temp1 = vec_madd( y1, y1, temp1 ); temp2 = vec_madd( y2, y2, temp2 ); temp3 = vec_madd( y3, y3, temp3); temp4 = vec_madd( y4, y4, temp4 );

temp1 = vec_madd( z1, z1, temp1 ); temp2 = vec_madd( z2, z2, temp2 ); temp3 = vec_madd( z3, z3, temp3 ); temp4 = vec_madd( z4, z4, temp4 );

//Calculate reciprocal square root // 1 / length = 1 / sqrt( length2) //This wasn't getting inlined properly, //so I inlined it explicitlytempA = vec_rsqrte( temp1 ); tempB = vec_rsqrte( temp2 ); tempC = vec_rsqrte( temp3 ); tempD = vec_rsqrte( temp4 );

//estimate squaredtranspose1 = vec_madd( tempA, tempA, zero ); transpose2 = vec_madd( tempB, tempB, zero ); transpose3 = vec_madd( tempC, tempC, zero ); transpose4 = vec_madd( tempD, tempD, zero );

//1 - value * estimate squared -- the errortranspose1 = vec_nmsub( temp1, transpose1, one ); transpose2 = vec_nmsub( temp2, transpose2, one ); transpose3 = vec_nmsub( temp3, transpose3, one ); transpose4 = vec_nmsub( temp4, transpose4, one );

//estimate / 2temp1 = vec_madd( tempA, oneHalf, zero ); temp2 = vec_madd( tempB, oneHalf, zero ); temp3 = vec_madd( tempC, oneHalf, zero ); temp4 = vec_madd( tempD, oneHalf, zero );

// result = error * estimate/2 + estimatetemp1 = vec_madd( transpose1, temp1, tempA ); temp2 = vec_madd( transpose2, temp2, tempB ); temp3 = vec_madd( transpose3, temp3, tempC ); temp4 = vec_madd( transpose4, temp4, tempD );

//Normalize X, Y and Z by multiplying each by 1 / lengthx1 = vec_madd( x1, temp1, zero ); x2 = vec_madd( x2, temp2, zero ); x3 = vec_madd( x3, temp3, zero ); x4 = vec_madd( x4, temp4, zero );

y1 = vec_madd( y1, temp1, zero ); y2 = vec_madd( y2, temp2, zero ); y3 = vec_madd( y3, temp3, zero ); y4 = vec_madd( y4, temp4, zero );

z1 = vec_madd( z1, temp1, zero ); z2 = vec_madd( z2, temp2, zero ); z3 = vec_madd( z3, temp3, zero ); z4 = vec_madd( z4, temp4, zero );

//Convert X, Y, Z and W vectors back to our //starting four vectors, now normalizedTRANSPOSE4_NoVar(x1, y1, z1, w1); TRANSPOSE4_NoVar(x2, y2, z2, w2); TRANSPOSE4_NoVar(x3, y3, z3, w3); TRANSPOSE4_NoVar(x4, y4, z4, w4);

//write back the resultsv[0] = x1; v[1] = y1; v[2] = z1; v[3] = w1; v[4] = x2; v[5] = y2; v[6] = z2; v[7] = w2; v[8] = x3; v[9] = y3; v[10] = z3; v[11] = w3; v[12] = x4; v[13] = y4; v[14] = z4; v[15] = w4;

}