//
//
// File rsqrtSqrt.c,
// These simple functions illustrate the use of multiple sets of data as
// well as a fast way of computing a double precision sqrt and reciprocal
// sqrt. Detailed performance analysis is presented in the form of a cycle
// accurate simulation of function execution on a PowerPC 7400 or 7410.
//
// Note: these functions are not IEEE-754 compliant and do not calculate
// the last bit correctly. For full accuracy, refer to libm on Mac OS X.
//

// Copyright © 2002 Apple Computer, Inc. All rights reserved. *
//
// Written by A. Sazegari, started on February 2002.
//
//

#include <stdio.h>
#include <math.h>

// Forward declarations.
// NOTE(review): no definition of frsqrt() or fsqrt() is visible in this file;
// the three-argument routines defined below are named
// FastReciprocalSquareRoot_x3() and FastSquareRoot_x3(). Confirm whether these
// two prototypes are stale or are satisfied by another translation unit.
void frsqrt ( double *arg1, double *arg2, double *arg3 );
void fsqrt ( double *arg1, double *arg2, double *arg3 );
// Defined below for both compiler variants; returns the value read from
// special-purpose register 1023.
unsigned long Start_StopAmber ( void );

#if ! defined( __MWERKS__)

// Define __frsqrte() if we are not using the Metrowerks compiler,
/
/ for which this is already available. This causes the frsqrte
// instruction to be used to calculate a 5 bit estimate of the
// reciprocal square root of the argument
inline double __frsqrte ( double argument )
{

double result;
asm ( "frsqrte %0, %1" : /*OUT*/ "=f" ( result ) : /*IN*/ "f" ( argument ) );
return result;

}

// Function required by the instrumentation program amber.
// Executes a single mfspr from special-purpose register 1023 and hands the
// value that was read back to the caller.
unsigned long Start_StopAmber ( void )
{
    unsigned long sprValue;

    // volatile: the read must be emitted even when the caller discards
    // the returned value.
    __asm__ volatile ( "mfspr %0, 1023" : "=r" ( sprValue ) );

    return sprValue;
}

#else

// Function required by the instrumentation program amber
// (Metrowerks build; uses the asm { } block syntax).
// Moves special-purpose register 1023 into a local with mfspr and returns it.
// NOTE(review): the local is declared 'long' while the function returns
// 'unsigned long' (the non-Metrowerks variant uses an unsigned long local) --
// confirm the implicit conversion on return is intended.
unsigned long Start_StopAmber ( void )
{

register long result;
asm{ mfspr result, 1023 }
return result;

}

#endif

// Compute three square roots simultaneously.
//
// Each of *arg1, *arg2 and *arg3 is replaced in place by x * r, where r is
// an approximation of 1/sqrt(x) obtained from the __frsqrte() starting
// estimate and refined with four Newton-Raphson steps
//     r <- r * (1.5 - (x/2) * r * r).
// The three computations are independent of one another and are kept as
// three separate statement chains.
void FastSquareRoot_x3( double *arg1, double *arg2, double *arg3 )
{
    // 5 bit starting estimates of the reciprocal square roots.
    double rsqrtA = __frsqrte ( *arg1 );
    double rsqrtB = __frsqrte ( *arg2 );
    double rsqrtC = __frsqrte ( *arg3 );

    // x/2 does not change between refinement steps.
    double halfA = 0.5 * *arg1;
    double halfB = 0.5 * *arg2;
    double halfC = 0.5 * *arg3;

    // If you require less precision, you may reduce the number of steps.
    int stepsLeft = 4;

    while ( stepsLeft > 0 )
    {
        rsqrtA = rsqrtA * ( 1.5 - halfA * rsqrtA * rsqrtA );
        rsqrtB = rsqrtB * ( 1.5 - halfB * rsqrtB * rsqrtB );
        rsqrtC = rsqrtC * ( 1.5 - halfC * rsqrtC * rsqrtC );
        stepsLeft--;
    }

    // sqrt(x) = x * (1/sqrt(x)); each input is re-read through its pointer
    // immediately before it is overwritten.
    *arg1 = rsqrtA * *arg1;
    *arg2 = rsqrtB * *arg2;
    *arg3 = rsqrtC * *arg3;
}

// Calculate three reciprocal square roots simultaneously:
// *arg = (*arg)^(-0.5) for each of the three arguments.
//
// Starting from the __frsqrte() estimate r of 1/sqrt(x), four
// Newton-Raphson steps of the form
//     r <- r + 0.5 * r * (1 - x * r * r)
// are applied, and the refined value is stored back over the input.
void FastReciprocalSquareRoot_x3( double *arg1, double *arg2, double *arg3 )
{
    // The inputs are not written until the very end, so read them once.
    const double xA = *arg1;
    const double xB = *arg2;
    const double xC = *arg3;
    double rA, rB, rC;
    int pass;

    // 5 bit starting estimates of the reciprocal square roots.
    rA = __frsqrte ( xA );
    rB = __frsqrte ( xB );
    rC = __frsqrte ( xC );

    // If you require less precision, you may reduce the number of passes.
    for ( pass = 4; pass != 0; pass-- )
    {
        rA = rA + 0.5 * rA * ( 1.0 - xA * rA * rA );
        rB = rB + 0.5 * rB * ( 1.0 - xB * rB * rB );
        rC = rC + 0.5 * rC * ( 1.0 - xC * rC * rC );
    }

    *arg1 = rA;
    *arg2 = rB;
    *arg3 = rC;
}

/*******************************************************************************
* a little study of the pipeline scheduling for sqrt and inverse sqrt. *
*******************************************************************************/

// Return the object representation of a double as an integer.
// Assumes double and unsigned long long are both 64 bits wide.
static unsigned long long DoubleBits ( double value )
{
    union { double asDouble; unsigned long long asBits; } pun;

    pun.asDouble = value;
    return pun.asBits;
}

// Print three doubles on one line, tab separated, each as
// "<%22.15e> = <16 hex digits of the bit pattern>", followed by 'trailer'.
// The bit pattern is split into two unsigned 32-bit halves explicitly;
// handing a double straight to a %x conversion is undefined behaviour.
static void PrintTriple ( double v1, double v2, double v3, const char *trailer )
{
    const unsigned long long b1 = DoubleBits ( v1 );
    const unsigned long long b2 = DoubleBits ( v2 );
    const unsigned long long b3 = DoubleBits ( v3 );

    printf ( "%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x\t%22.15e = %8.8x%8.8x%s",
             v1, (unsigned int) ( b1 >> 32 ), (unsigned int) ( b1 & 0xFFFFFFFFu ),
             v2, (unsigned int) ( b2 >> 32 ), (unsigned int) ( b2 & 0xFFFFFFFFu ),
             v3, (unsigned int) ( b3 >> 32 ), (unsigned int) ( b3 & 0xFFFFFFFFu ),
             trailer );
}

// Driver: prints libm's reciprocal square roots and square roots of
// 2, 3 and 7, each followed by the values produced by the fast routines
// defined in this file. Returns 0.
int main ( void )
{
    double arg1, arg2, arg3;

    // ---- reciprocal square root ----
    arg1 = 2.0;
    arg2 = 3.0;
    arg3 = 7.0;

    printf ( "true value of inverse sqrt:\n" );
    PrintTriple ( 1.0 / sqrt ( arg1 ), 1.0 / sqrt ( arg2 ), 1.0 / sqrt ( arg3 ), "\n\n" );

    // The routine under study is bracketed by Start_StopAmber() calls.
    (void) Start_StopAmber ();

    // Call the routine that is actually defined in this file; frsqrt() is
    // only declared here, never defined.
    FastReciprocalSquareRoot_x3 ( &arg1, &arg2, &arg3 );

    (void) Start_StopAmber ();

    PrintTriple ( arg1, arg2, arg3, "\n\n\n" );

    // ---- square root ----
    arg1 = 2.0;
    arg2 = 3.0;
    arg3 = 7.0;

    printf ( "true value of sqrt:\n" );
    PrintTriple ( sqrt ( arg1 ), sqrt ( arg2 ), sqrt ( arg3 ), "\n\n" );

    // This call is deliberately not bracketed by Start_StopAmber().
    // As above, fsqrt() is only declared in this file, so the defined
    // routine is called instead.
    FastSquareRoot_x3 ( &arg1, &arg2, &arg3 );

    PrintTriple ( arg1, arg2, arg3, "\n" );

    return 0;
}

/*

results of the SimG4 cycle accurate simulator (for PPC 7400/7410) used with the following options:

"-sp simg4.result -st 0 -r warmup_l1=1 warmup_l2=1 -sw 80"

0:or R4,R21,R21         |  1 | IDR............................................................................. | 3
1:or R5,R22,R22         |  1 | IDR............................................................................. | 3
2:or R3,R20,R20         |  1 | IIDR............................................................................ | 4
3:bl 0x3fffdac          |  2 | .IFR............................................................................ | 4
4:mfspr R0,LR           |  4 | ...IDR.......................................................................... | 6
5:bcl+ 20,31,0x4        |  4 | ...IFR.......................................................................... | 6
6:mfspr R12,LR          |  5 | ....IDR......................................................................... | 7
7:mtspr LR,R0           |  5 | ....IIDR........................................................................ | 8
8:lfd F0,0x0(R3)        |  5 | ....IIDER....................................................................... | 9
9:addis R9,R12,0x0      |  6 | .....IIDR....................................................................... | 9
10:lfd F12,0x0(R4)      |  6 | .....IIDER...................................................................... | 10
11:addi R0,R0,0x4       |  6 | .....IIIDR...................................................................... | 10
12:lfd F13,0x0(R5)      |  7 | ......IIDER..................................................................... | 11
13:mtspr CTR,R0         |  8 | .......IIDER.................................................................... | 12
14:lfd F4,0x67c(R9)     |  8 | .......IIDER.................................................................... | 12
15:fmr F1,F0            |  9 | ........IIDEEFR................................................................. | 15
16:addis R9,R12,0x0     |  9 | ........IIDFFFR................................................................. | 15
17:fmr F2,F12           | 10 | .........IIDEEFR................................................................ | 16
18:fmr F3,F13           | 10 | .........IIIDEEFR............................................................... | 17
19:lfd F5,0x684(R9)     | 11 | ..........IIDEFFR............................................................... | 17
20:frsqrte F6,F0        | 11 | ..........IIIIDEEFR............................................................. | 19
21:frsqrte F8,F12       | 12 | ...........IIIIDEEFR............................................................ | 20
22:frsqrte F7,F13       | 12 | ...........IIIIIDEEFR........................................................... | 21
23:fmul F0,F1,F6        | 13 | ............IIIIIIDEEFR......................................................... | 23
24:fmul F13,F2,F8       | 14 | .............IIIIIIDEEFR........................................................ | 24
25:fmul F12,F3,F7       | 15 | ..............IIIIIIDEEFR....................................................... | 25
26:fmul F11,F6,F4       | 16 | ...............IIIIIIIDEEFR..................................................... | 27
27:fmul F10,F8,F4       | 17 | ................IIIIIIIDEEFR.................................................... | 28
28:fmul F9,F7,F4        | 18 | .................IIIIIIIDEEFR................................................... | 29
29:fnmsub F0,F0,F6,F5   | 20 | ...................IIIIIIIDEEFR................................................. | 31
30:fnmsub F13,F13,F8,F5 | 21 | ....................IIIIIIIDEEFR................................................ | 32
31:fnmsub F12,F12,F7,F5 | 22 | .....................IIIIIIIDEEFR............................................... | 33
32:fmadd F6,F11,F0,F6   | 24 | .......................IIIIIIIDEEFR............................................. | 35
33:fmadd F8,F10,F13,F8  | 25 | ........................IIIIIIIDEEFR............................................ | 36
34:fmadd F7,F9,F12,F7   | 26 | .........................IIIIIIIDEEFR........................................... | 37
35:bc+ 16,0,0xffd0      | 28 | ...........................IIIIIFFFFR........................................... | 37
36:fmul F0,F1,F6        | 30 | .............................IIIIIDEEFR......................................... | 39
37:fmul F13,F2,F8       | 30 | .............................IIIIIIDEEFR........................................ | 40
38:fmul F12,F3,F7       | 32 | ...............................IIIIIDEEFR....................................... | 41
39:fmul F11,F6,F4       | 33 | ................................IIIIIIDEEFR..................................... | 43
40:fmul F10,F8,F4       | 34 | .................................IIIIIIDEEFR.................................... | 44
41:fmul F9,F7,F4        | 34 | .................................IIIIIIIDEEFR................................... | 45
42:fnmsub F0,F0,F6,F5   | 36 | ...................................IIIIIIIDEEFR................................. | 47
43:fnmsub F13,F13,F8,F5 | 37 | ....................................IIIIIIIDEEFR................................ | 48
44:fnmsub F12,F12,F7,F5 | 38 | .....................................IIIIIIIDEEFR............................... | 49
45:fmadd F6,F11,F0,F6   | 40 | .......................................IIIIIIIDEEFR............................. | 51
46:fmadd F8,F10,F13,F8  | 41 | ........................................IIIIIIIDEEFR............................ | 52
47:fmadd F7,F9,F12,F7   | 42 | .........................................IIIIIIIDEEFR........................... | 53
48:bc+ 16,0,0xffd0      | 44 | ...........................................IIIIIFFFFR........................... | 53
49:fmul F0,F1,F6        | 45 | ............................................IIIIIIDEEFR......................... | 55
50:fmul F13,F2,F8       | 46 | .............................................IIIIIIDEEFR........................ | 56
51:fmul F12,F3,F7       | 48 | ...............................................IIIIIDEEFR....................... | 57
52:fmul F11,F6,F4       | 49 | ................................................IIIIIIDEEFR..................... | 59
53:fmul F10,F8,F4       | 50 | .................................................IIIIIIDEEFR.................... | 60
54:fmul F9,F7,F4        | 50 | .................................................IIIIIIIDEEFR................... | 61
55:fnmsub F0,F0,F6,F5   | 52 | ...................................................IIIIIIIDEEFR................. | 63
56:fnmsub F13,F13,F8,F5 | 53 | ....................................................IIIIIIIDEEFR................ | 64
57:fnmsub F12,F12,F7,F5 | 54 | .....................................................IIIIIIIDEEFR............... | 65
58:fmadd F6,F11,F0,F6   | 56 | .......................................................IIIIIIIDEEFR............. | 67
59:fmadd F8,F10,F13,F8  | 57 | ........................................................IIIIIIIDEEFR............ | 68
60:fmadd F7,F9,F12,F7   | 58 | .........................................................IIIIIIIDEEFR........... | 69
61:bc+ 16,0,0xffd0      | 60 | ...........................................................IIIIIFFFFR........... | 69
62:fmul F0,F1,F6        | 61 | ............................................................IIIIIIDEEFR......... | 71
63:fmul F13,F2,F8       | 62 | .............................................................IIIIIIDEEFR........ | 72
64:fmul F12,F3,F7       | 64 | ...............................................................IIIIIDEEFR....... | 73
65:fmul F11,F6,F4       | 65 | ................................................................IIIIIIDEEFR..... | 75
66:fmul F10,F8,F4       | 66 | .................................................................IIIIIIDEEFR.... | 76
67:fmul F9,F7,F4        | 66 | .................................................................IIIIIIIDEEFR... | 77
68:fnmsub F0,F0,F6,F5   | 68 | ...................................................................IIIIIIIDEEFR. | 79
69:fnmsub F13,F13,F8,F5 | 69 | ....................................................................IIIIIIIDEEFR | 80
70:fnmsub F12,F12,F7,F5 | 70 | R....................................................................IIIIIIIDEEF | 81
71:fmadd F6,F11,F0,F6   | 72 | EFR....................................................................IIIIIIIDE | 83
72:fmadd F8,F10,F13,F8  | 73 | EEFR....................................................................IIIIIIID | 84
73:fmadd F7,F9,F12,F7   | 74 | DEEFR....................................................................IIIIIII | 85
74:bc+ 16,0,0xffd0      | 76 | FFFFR......................................................................IIIII | 85
75:stfd F6,0x0(R3)      | 77 | IDEEER......................................................................IIII | 86
76:stfd F8,0x0(R4)      | 78 | IIDEEER......................................................................III | 87
77:stfd F7,0x0(R5)      | 80 | IIIDDEER.......................................................................I | 88
78:bclr+ 20,0           | 81 | @@@@@@@@........................................................................ | 81
79:bl 0x3fffda0         | 83 | ..IFFFFR........................................................................ | 88
80:mfspr R3,PMC1        | 85 | ....IDEER....................................................................... | 89