//
//
// File rsqrtSqrt.c,
// These simple functions illustrate the use of multiple sets of data as
// well as a fast way of computing a double precision sqrt and reciprocal
// sqrt. Detailed performance analysis is presented in the form of a cycle
// accurate simulation of function execution on a PowerPC 7400 or 7410.
//
// Note: these functions are not IEEE-754 compliant and do not calculate
// the last bit correctly. For correct accuary, refer to libm on MacOS X.
//
// Copyright © 2002 Apple Computer, Inc. All rights reserved. *
//
// Written by A. Sazegari, started on February 2002.
//
//
#include <stdio.h>
#include <math.h>
void frsqrt ( double *arg1, double *arg2, double *arg3 );
void fsqrt ( double *arg1, double *arg2, double *arg3 );
unsigned long Start_StopAmber ( void );
#if ! defined( __MWERKS__)
// Define __frsqrte() if we are not using the Metrowerks compiler,
// for which this is already available. This causes the frsqrte
// instruction to be used to calculate a 5 bit estimate of the
// reciprocal square root of the argument
inline double __frsqrte ( double argument )
{
double result;
asm ( "frsqrte %0, %1" : /*OUT*/ "=f" ( result ) : /*IN*/ "f" ( argument ) );
return result;
}
// fucntion required by the instrumentation program amber.
unsigned long Start_StopAmber ( void )
{
register unsigned long result;
__asm__ volatile ( "mfspr %0, 1023" : "=r" (result) );
return result;
}
#else
// fucntion required by the instrumentation program amber.
unsigned long Start_StopAmber ( void )
{
register long result;
asm{ mfspr result, 1023 }
return result;
}
#endif
//Do three square roots simultaneously
void FastSquareRoot_x3( double *arg1, double *arg2, double *arg3 )
{
register double estimate1, estimate2, estimate3;
register double halfOfArg1, halfOfArg2, halfOfArg3;
int i;
//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
estimate1 = __frsqrte ( *arg1 );
estimate2 = __frsqrte ( *arg2 );
estimate3 = __frsqrte ( *arg3 );
halfOfArg1 = 0.5 * *arg1;
halfOfArg2 = 0.5 * *arg2;
halfOfArg3 = 0.5 * *arg3;
//if you require less precision, you may reduce the number of loop iterations
for ( i = 0; i < 4; i++ )
{
estimate1 = estimate1 * ( 1.5 - halfOfArg1 * estimate1 * estimate1 );
estimate2 = estimate2 * ( 1.5 - halfOfArg2 * estimate2 * estimate2 );
estimate3 = estimate3 * ( 1.5 - halfOfArg3 * estimate3 * estimate3 );
}
*arg1 = estimate1 * *arg1;
*arg2 = estimate2 * *arg2;
*arg3 = estimate3 * *arg3;
}
//Caculate three reciprocal square roots simultaneously: (*arg = (*arg)-0.5)
void FastReciprocalSquareRoot_x3( double *arg1, double *arg2, double *arg3 )
{
register double estimate1, estimate2, estimate3;
int i;
//Calculate a 5 bit starting estimate for the reciprocal sqrt of each
estimate1 = __frsqrte ( *arg1 );
estimate2 = __frsqrte ( *arg2 );
estimate3 = __frsqrte ( *arg3 );
//if you require less precision, you may reduce the number of loop iterations
for ( i = 0; i < 4; i++ )
{
estimate1 = estimate1 + 0.5 * estimate1 * ( 1.0 - *arg1 * estimate1 * estimate1 );
estimate2 = estimate2 + 0.5 * estimate2 * ( 1.0 - *arg2 * estimate2 * estimate2 );
estimate3 = estimate3 + 0.5 * estimate3 * ( 1.0 - *arg3 * estimate3 * estimate3 );
}
*arg1 = estimate1;
*arg2 = estimate2;
*arg3 = estimate3;
}
/*******************************************************************************
* a little study of the pipeline scheduling for sqrt and inverse sqrt. *
*******************************************************************************/
int main ( )
{
double estimate1, estimate2, estimate3, arg1, arg2, arg3, trueValue1, trueValue2, trueValue3;
int i, j;
arg1 = 2.0;
arg2 = 3.0;
arg3 = 7.0;
trueValue1 = 1.0 / sqrt ( arg1 );
trueValue2 = 1.0 / sqrt ( arg2 );
trueValue3 = 1.0 / sqrt ( arg3 );
printf ( "true value of inverse sqrt:\n" );
printf ( "%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\n\n", trueValue1, trueValue1, trueValue2, trueValue2, trueValue3, trueValue3 );
Start_StopAmber ();
frsqrt ( &arg1, &arg2, &arg3 );
Start_StopAmber ();
printf ( "%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\n\n\n", arg1, arg1, arg2, arg2, arg3, arg3 );
arg1 = 2.0;
arg2 = 3.0;
arg3 = 7.0;
trueValue1 = sqrt ( arg1 );
trueValue2 = sqrt ( arg2 );
trueValue3 = sqrt ( arg3 );
printf ( "true value of sqrt:\n" );
printf ( "%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\n\n", trueValue1, trueValue1, trueValue2, trueValue2, trueValue3, trueValue3 );
// Start_StopAmber();
fsqrt ( &arg1, &arg2, &arg3 );
// Start_StopAmber();
printf ( "%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\n", arg1, arg1, arg2, arg2, arg3, arg3 );
return 0;
}
/*
results of the SimG4 cycle accurate simulator (for PPC 7400/7410) used with the following options:
"-sp simg4.result -st 0 -r warmup_l1=1 warmup_l2=1 -sw 80"
0:or R4,R21,R21 |
| 1 | |
IDR............................................................................. | 3 |