home *** CD-ROM | disk | FTP | other *** search
File List | 1989-10-04 | 10.0 KB | 200 lines |
-
- _OPTIMIZING IN A PARALLEL ENVIRONMENT_
- by Barr E. Bauer
-
- [LISTING ONE]
-
-       program test 1
- * Fills a and b, accumulates per-j distance sums, then reduces them. 2
- * purpose is to test SGI parallelization scheme for loop selection, 3
- * numerically-intensive calculations, and total reduction. See text 4
- * for details. 5
- * Output: grand total and iflag (plus thread count when built -mp). 6
-       parameter (MAXFIRST=250, MAXSECOND=250, MAXTHIRD=10) 7
-       real*8 a(MAXTHIRD,MAXSECOND,MAXFIRST) 8
-       real*8 b(MAXTHIRD,MAXSECOND,MAXFIRST) 9
-       real*8 sub_total(MAXSECOND), partial_total(4) 10
-       real*8 d(MAXTHIRD), c, tmp ! local variables 11
-       real*8 dist(MAXSECOND,MAXFIRST) 12
-       real*8 grand_total ! test for proper operation 13
-       logical parallel ! selects 2-version loops 14
-       integer*4 iflag ! used to show LASTLOCAL value 15
- * sub_total is indexed by j, so it is sized and zeroed by MAXSECOND 16
-       data parallel /.false./ 17
-       data sub_total, iflag /MAXSECOND*0.0, 0/ 18
- * 19
- * outer loop: contains both interior loops 20
- * 21
-  22
- * C$doacross local(k,j,i,tmp,d,c), share(a,b,sub_total,dist), 23
- * C$& lastlocal(iflag) 24
-  25
-       do i = 1, MAXFIRST 26
- * 27
- * first inner loop: fills arrays a and b 28
- * 29
-  30
- * C$doacross local(j,k,c), share(i,a,b) 31
-  32
-          do j = 1, MAXSECOND 33
-             do k = 1, MAXTHIRD 34
-                a(k,j,i) = dsqrt(dfloat(i*j*k)) 35
-                c = 1.0 - a(k,j,i) 36
-                if (c .le. 0.0 .and. i .lt. j*k) then 37
-                   c = -c 38
-                else 39
-                   c = c**2 40
-                endif 41
-                b(k,j,i) = 32*(dcos(c)**5)*dsin(c)- 42
-      1                    32*(dcos(c)**3)*dsin(c)+ 43
-      2                    6*dcos(c)*dsin(c) 44
-             enddo 45
-          enddo 46
- * 47
- * second inner loop: determines distance and starts summation 48
- * 49
-  50
- * c$doacross local(j,k,d,tmp), share(i,a,b,dist,sub_total), 51
- * c$& lastlocal(iflag) 52
-  53
-          do j=1, MAXSECOND 54
-             tmp = 0.0 55
-             do k = 1, MAXTHIRD 56
-                d(k) = a(k,j,i) - b(k,j,i) 57
-             enddo 58
-             do k = 1, MAXTHIRD 59
-                tmp = tmp + d(k)**2 60
-             enddo 61
-             dist(j,i) = dsqrt(tmp) 62
-             if (dist(j,i) .le. 0.1) iflag = iflag + 1 63
-             sub_total(j) = sub_total(j) + dist(j,i) 64
-          enddo 65
-       enddo 66
- * 67
- * the next section is an example of sum reduction optimized to the 68
- * parallel environment and the use of a more efficient 2 loop summation 69
- * 70
- * if -mp option is active, parallel is set to .true. which then 71
- * selects the parallel version 72
- * num_threads is capped at 4, the declared size of partial_total 73
-  74
- C$    parallel = .true. 75
-       grand_total = 0.0 76
-       if (parallel) then ! parallel version 77
- C$       num_threads = min(mp_numthreads(), 4) 78
-          ichunk = (MAXSECOND + (num_threads - 1))/num_threads 79
-  80
- C$doacross local(k,j), 81
- C$& share(num_threads,partial_total,sub_total,ichunk) 82
-  83
-          do k = 1, num_threads ! this loop is parallelized 84
-             partial_total(k) = 0.0 85
-             do j = k*ichunk - ichunk + 1, min(k*ichunk,MAXSECOND) 86
-                partial_total(k) = partial_total(k) + sub_total(j) 87
-             enddo 88
-          enddo 89
-          do j = 1, num_threads ! smaller loop handled as scalar 90
-             grand_total = grand_total + partial_total(j) 91
-          enddo 92
-       else ! the scalar version 93
-          do j = 1, MAXSECOND 94
-             grand_total = grand_total + sub_total(j) 95
-          enddo 96
-       endif 97
-  98
-       if (parallel) then 99
- C$       write (*,10) grand_total, num_threads 100
- C$       write (*,20) iflag 101
-       else 102
-          write (*,30) grand_total 103
-          write (*,40) iflag 104
-       endif 105
-       stop 106
- C$10  format(1x,'grand total = ',g10.3,'threads = ',i4) 107
- C$20  format(1x,'parallel iflag = ',i10) 108
-    30 format(1x,'grand total = ',g10.3) 109
-    40 format(1x,'scalar iflag = ',i10) 110
-       end 111
-
-
-
- [LISTING TWO]
-
-
- (source code)
-
-       subroutine example(a, b, c, n)
- * For each i in 1..n, stores the square of a(i)*b(i) in c(i).
- * a, b : input arrays of length n
- * c    : output array of length n
- * n    : number of elements processed
-       integer*4 n
-       real*4 a(n), b(n), c(n)
-
- (additional code)
-
- * i and x are private to each iteration (declared local below).
- c$doacross local(i, x)
-       do i=1, n
- * index with the loop variable i, not the array bound n
-          x = a(i) * b(i)
-          c(i) = x**2
-       enddo
-
- (additional code)
-
-       return
-       end
-
- (the loop is transformed to)
-
-       subroutine _example_1(
-      1   _local_start, ! index starting value
-      2   _local_ntrip, ! number of loop executions
-      3   _incr, ! index increment
-      4   _my_threadno) ! unique process ID number
- * Runs _local_ntrip passes of the loop body, starting at index
- * _local_start and stepping the index by _incr after each pass.
- * _my_threadno is not referenced in the body shown here.
- * NOTE(review): a, b and c are neither arguments nor declared in this
- * listing; presumably they are the arrays of the original subroutine
- * example -- confirm against the article text.
-
-       integer*4 _local_start, _local_ntrip, _incr, _my_threadno
-
-       integer*4 i ! declared local
-       real*4 x ! declared local
-
-       integer*4 _tmp ! created local
-
- * _tmp only counts trips; i is advanced by hand inside the loop
-       i = _local_start
-       do _tmp = 1, _local_ntrip
-          x = a(i) * b(i)
-          c(i) = x**2
-          i = i + _incr
-       enddo
-       return
-       end
-
-
- Example 1: A typical DO loop
-
-
- * Each pass writes a(i) and reads only x and b(i); no pass reads an
- * element of a written by another pass.
- do i = 1, n
- a(i) = x * b(i)
- enddo
-
-
- Example 2: A DO loop in which the array variable references a
- value that is not current with the index
-
- * arr(i) is computed from arr(i-1), the element written by the
- * previous pass, so each pass depends on the one before it.
- do i = 2, n
- arr(i) = b(i) - arr(i-1)
- enddo
-
- Example 3: An example of load imbalance
-
- * The inner loop runs i times, so passes of the outer loop with a
- * larger i do more work than passes with a smaller i.
- do i = 1, n
- do j = 1, i
- a(j, i) = a(j, i) * xmult
- enddo
- enddo
-
-
- Example 4: Load balancing
-
- * Same update as the triangular i/j loop, with an added outer loop
- * over k: for each k, i takes the values k, k+num_threads, ... up to n.
- * NOTE(review): mp_numthreads() is not defined in this listing;
- * presumably it returns the number of threads -- confirm.
- num_threads = mp_numthreads()
- * the doacross directive applies to the k loop that follows it
- c$doacross local(i, j, k)
- do k = 1, num_threads
- do i = k, n, num_threads
- do j = 1, i
- a(j, i) = a(j, i) * xmult
- enddo
- enddo
- enddo
-