// Transpose a 4x4 matrix of single precision floats stored
// as 4 vector floats using endogenous variables:
// transpose1, transpose2, transepose3, transpose4
#define TRANSPOSE4_NoVar( a, b, c, d ) \
{ \

transpose1 = vec_mergeh( a, c ); \
transpose2 = vec_mergeh( b, d ); \
transpose3 = vec_mergel( a, c ); \
transpose4 = vec_mergel( b, d ); \
a = vec_mergeh( transpose1, transpose2 ); \
b = vec_mergel( transpose1, transpose2 ); \
c = vec_mergeh( transpose3, transpose4 ); \
d = vec_mergel( transpose3, transpose4 ); \

}

 

//This one is a bit harder to follow, because some registers
//had to be reused to avoid running out. This function uses
//31 of the 32 available registers! Avoiding register spilling
//and explicitly inlining the reciprocal square root function,
//which wasn't being inlined properly, more than doubled the
//speed of this function.

// Normalize 16 vectors of the type { X, Y, Z, W }
void NormalizeUniform16( vector float v[16] )
{

register vector float temp1, temp2, temp3, temp4;
register vector float tempA, tempB, tempC, tempD;
register vector float transpose1, transpose2;
register vector float transpose3, transpose4;
register vector float zero, oneHalf, one;

//Initialize our constants
oneHalf = vec_ctf( vec_splat_u32( 1 ), 1 );
one = vec_ctf( vec_splat_u32( 1 ), 0 );
zero = (vector float) vec_splat_u32(0);

//Load in the sixteen vectors into x, y, z, w
//These wont be vectors full of x, y, z, and w
//until after the transpose
register vector float x1 = v[0];
register vector float y1 = v[1];
register vector float z1 = v[2];
register vector float w1 = v[3];
register vector float x2 = v[4];
register vector float y2 = v[5];
register vector float z2 = v[6];
register vector float w2 = v[7];
register vector float x3 = v[8];
register vector float y3 = v[9];
register vector float z3 = v[10];
register vector float w3 = v[11];
register vector float x4 = v[12];
register vector float y4 = v[13];
register vector float z4 = v[14];
register vector float w4 = v[15];

//Transpose the 4 vectors to create 4 uniform vectors for
//X, Y, Z and W
TRANSPOSE4_NoVar( x1, y1, z1, w1 );
TRANSPOSE4_NoVar( x2, y2, z2, w2 );
TRANSPOSE4_NoVar( x3, y3, z3, w3 );
TRANSPOSE4_NoVar( x4, y4, z4, w4 );

// length2 = X2 + Y2 + Z2
temp1 = vec_madd( x1, x1, zero );
temp2 = vec_madd( x2, x2, zero );
temp3 = vec_madd( x3, x3, zero );
temp4 = vec_madd( x4, x4, zero );

temp1 = vec_madd( y1, y1, temp1 );
temp2 = vec_madd( y2, y2, temp2 );
temp3 = vec_madd( y3, y3, temp3);
temp4 = vec_madd( y4, y4, temp4 );

temp1 = vec_madd( z1, z1, temp1 );
temp2 = vec_madd( z2, z2, temp2 );
temp3 = vec_madd( z3, z3, temp3 );
temp4 = vec_madd( z4, z4, temp4 );

//Calculate reciprocal square root
// 1 / length = 1 / sqrt( length2)
//This wasn't getting inlined properly,
//so I inlined it explicitly
tempA = vec_rsqrte( temp1 );
tempB = vec_rsqrte( temp2 );
tempC = vec_rsqrte( temp3 );
tempD = vec_rsqrte( temp4 );

//estimate squared
transpose1 = vec_madd( tempA, tempA, zero );
transpose2 = vec_madd( tempB, tempB, zero );
transpose3 = vec_madd( tempC, tempC, zero );
transpose4 = vec_madd( tempD, tempD, zero );

//1 - value * estimate squared -- the error
transpose1 = vec_nmsub( temp1, transpose1, one );
transpose2 = vec_nmsub( temp2, transpose2, one );
transpose3 = vec_nmsub( temp3, transpose3, one );
transpose4 = vec_nmsub( temp4, transpose4, one );

//estimate / 2
temp1 = vec_madd( tempA, oneHalf, zero );
temp2 = vec_madd( tempB, oneHalf, zero );
temp3 = vec_madd( tempC, oneHalf, zero );
temp4 = vec_madd( tempD, oneHalf, zero );

// result = error * estimate/2 + estimate
temp1 = vec_madd( transpose1, temp1, tempA );
temp2 = vec_madd( transpose2, temp2, tempB );
temp3 = vec_madd( transpose3, temp3, tempC );
temp4 = vec_madd( transpose4, temp4, tempD );

//Normalize X, Y and Z by multiplying each by 1 / length
x1 = vec_madd( x1, temp1, zero );
x2 = vec_madd( x2, temp2, zero );
x3 = vec_madd( x3, temp3, zero );
x4 = vec_madd( x4, temp4, zero );

y1 = vec_madd( y1, temp1, zero );
y2 = vec_madd( y2, temp2, zero );
y3 = vec_madd( y3, temp3, zero );
y4 = vec_madd( y4, temp4, zero );

z1 = vec_madd( z1, temp1, zero );
z2 = vec_madd( z2, temp2, zero );
z3 = vec_madd( z3, temp3, zero );
z4 = vec_madd( z4, temp4, zero );

//Convert X, Y, Z and W vectors back to our
//starting four vectors, now normalized
TRANSPOSE4_NoVar(x1, y1, z1, w1);
TRANSPOSE4_NoVar(x2, y2, z2, w2);
TRANSPOSE4_NoVar(x3, y3, z3, w3);
TRANSPOSE4_NoVar(x4, y4, z4, w4);

//write back the results
v[0] = x1;
v[1] = y1;
v[2] = z1;
v[3] = w1;
v[4] = x2;
v[5] = y2;
v[6] = z2;
v[7] = w2;
v[8] = x3;
v[9] = y3;
v[10] = z3;
v[11] = w3;
v[12] = x4;
v[13] = y4;
v[14] = z4;
v[15] = w4;

}