home *** CD-ROM | disk | FTP | other *** search
/ Simtel MSDOS 1992 June / SIMTEL_0692.cdr / msdos / ddjmag / ddj8911.arc / BAUER.LST next >
Encoding:
File List  |  1989-10-04  |  10.0 KB  |  200 lines

  1.  
  2. _OPTIMIZING IN A PARALLEL ENVIRONMENT_
  3. by Barr E. Bauer
  4.  
  5. [LISTING ONE]
  6.  
  7.       program test                                                      1
  8. *                                                                       2
  9. * purpose is to test SGI parallelization scheme for loop selection,     3
  10. * numerically-intensive calculations, and total reduction. See text     4
  11. * for details.                                                          5
  12. *                                                                       6
  13.       parameter (MAXFIRST=250, MAXSECOND=250, MAXTHIRD=10)              7
  14.       real*8 a(MAXTHIRD,MAXSECOND,MAXFIRST)                             8
  15.       real*8 b(MAXTHIRD,MAXSECOND,MAXFIRST)                             9
  16.       real*8 sub_total(MAXFIRST), partial_total(4) ! 4 = max threads    10
  17.       real*8 d(MAXTHIRD), c, tmp    ! local variables                   11
  18.       real*8 dist(MAXSECOND,MAXFIRST)                                   12
  19.       real*8 grand_total      ! test for proper operation               13
  20.       logical parallel        ! selects 2-version loops                 14
  21.       integer*4 iflag         ! used to show LASTLOCAL value            15
  22.                                                                         16
  23.       data parallel /.false./                                           17
  24.       data sub_total, iflag /MAXFIRST*0.0, 0/                           18
  25. *                                                                       19
  26. * outer loop: contains both interior loops                              20
  27. *                                                                       21
  28.                                                                         22
  29. * C$doacross local(k,j,i,tmp,d,c), share(a,b,sub_total,dist),           23
  30. * C$&        lastlocal(iflag)                                           24
  31.                                                                         25
  32.       do i = 1, MAXFIRST                                                26
  33. *                                                                       27
  34. * first inner loop: fills arrays a and b                                28
  35. *                                                                       29
  36.                                                                         30
  37. * C$doacross local(j,k,c), share(i,a,b)                                 31
  38.                                                                         32
  39.         do j = 1, MAXSECOND                                             33
  40.           do k = 1, MAXTHIRD                                            34
  41.             a(k,j,i) = dsqrt(dfloat(i*j*k))                             35
  42.             c = 1.0 - a(k,j,i)                                          36
  43.             if (c .le. 0.0 .and. i .lt. j*k) then                       37
  44.               c = -c                                                    38
  45.             else                                                        39
  46.               c = c**2                                                  40
  47.             endif                                                       41
  48.             b(k,j,i) = 32*(dcos(c)**5)*dsin(c)-                         42
  49.      1                 32*(dcos(c)**3)*dsin(c)+                         43
  50.      2                  6*dcos(c)*dsin(c)                               44
  51.           enddo                                                         45
  52.         enddo                                                           46
  53. *                                                                       47
  54. * second inner loop: determines distance and starts summation           48
  55. *                                                                       49
  56.                                                                         50
  57. * c$doacross local(j,k,d,tmp), share(i,a,b,dist,sub_total),             51
  58. * c$&        lastlocal(iflag)                                           52
  59.                                                                         53
  60.         do j=1, MAXSECOND                                               54
  61.           tmp = 0.0                                                     55
  62.           do k = 1, MAXTHIRD                                            56
  63.             d(k) = a(k,j,i) - b(k,j,i)                                  57
  64.           enddo                                                         58
  65.           do k = 1, MAXTHIRD                                            59
  66.             tmp = tmp + d(k)**2                                         60
  67.           enddo                                                         61
  68.           dist(j,i) = dsqrt(tmp)                                        62
  69.           if (dist(j,i) .le. 0.1) iflag = iflag + 1                     63
  70.           sub_total(j) = sub_total(j) + dist(j,i)                       64
  71.         enddo                                                           65
  72.       enddo                                                             66
  73. *                                                                       67
  74. * the next section is an example of sum reduction optimized to the      68
  75. * parallel environment and the use of a more efficient 2 loop summation 69
  76. *                                                                       70
  77. * if -mp option is active, parallel is set to .true. which then         71
  78. * selects the parallel version                                          72
  79. *                                                                       73
  80.                                                                         74
  81. C$    parallel = .true.                                                 75
  82.       grand_total = 0.0                                                 76
  83.       if (parallel) then                     ! parallel version         77
  84. C$      num_threads = min(mp_numthreads(), 4) ! fit partial_total       78
  85.         ichunk = (MAXFIRST + (num_threads - 1))/num_threads             79
  86.                                                                         80
  87. C$doacross local(k,j),                                                  81
  88. C$&        share(num_threads,partial_total,sub_total,ichunk)            82
  89.                                                                         83
  90.         do k = 1, num_threads ! this loop is parallelized               84
  91.           partial_total(k) = 0.0                                        85
  92.           do j = k*ichunk - ichunk + 1, min(k*ichunk,MAXFIRST)          86
  93.             partial_total(k) = partial_total(k) + sub_total(j)          87
  94.           enddo                                                         88
  95.         enddo                                                           89
  96.         do j = 1, num_threads   ! smaller loop handled as scalar        90
  97.           grand_total = grand_total + partial_total(j)                  91
  98.         enddo                                                           92
  99.       else                                   ! the scalar version       93
  100.         do j = 1, MAXFIRST                                              94
  101.           grand_total = grand_total + sub_total(j)                      95
  102.         enddo                                                           96
  103.       endif                                                             97
  104.                                                                         98
  105.       if (parallel) then                                                99
  106. C$      write (*,10) grand_total, num_threads                           100
  107. C$      write (*,20) iflag                                              101
  108.       else                                                              102
  109.         write (*,30) grand_total                                        103
  110.         write (*,40) iflag                                              104
  111.       endif                                                             105
  112.       stop                                                              106
  113. C$10  format(1x,'grand total = ',g10.3,'threads = ',i4)                 107
  114. C$20  format(1x,'parallel iflag = ',i10)                                108
  115. 30    format(1x,'grand total = ',g10.3)                                 109
  116. 40    format(1x,'scalar iflag = ',i10)                                  110
  117.       end                                                               111
  118.  
  119.  
  120.  
  121. [LISTING TWO]
  122.  
  123.  
  124. (source code)
  125.  
  126.       subroutine example(a, b, c, n)   ! c(i) = (a(i)*b(i))**2, i = 1..n
  127.       integer*4 n                      ! extent of a, b and c
  128.       real*4 a(n), b(n), c(n)
  129.  
  130.       (additional code)
  131.  
  132. c$doacross local(i, x)
  133.       do i=1, n
  134.         x = a(i) * b(i)                ! index by the loop variable i, not n
  135.         c(i) = x**2
  136.       enddo
  137.  
  138.       (additional code)
  139.  
  140.       return
  141.       end
  142.  
  143. (the loop is transformed to)
  144.  
  145.       subroutine _example_1(
  146.      1  _local_start,             ! index starting value
  147.      2  _local_ntrip,             ! number of loop executions
  148.      3  _incr,                    ! index increment
  149.      4  _my_threadno)             ! unique process ID number
  150.  
  151.       integer*4 _local_start, _local_ntrip, _incr, _my_threadno
  152.  
  153.       integer*4  i                ! declared local
  154.       real*4     x                ! declared local
  155.  
  156.       integer*4  _tmp             ! created local
  157.  
  158.       i = _local_start
  159.       do _tmp = 1, _local_ntrip   ! _tmp only counts trips; i is the index
  160.         x = a(i) * b(i)           ! NOTE(review): a, b, c not declared here
  161.         c(i) = x**2
  162.         i = i + _incr
  163.       enddo
  164.       return
  165.       end
  166.  
  167.  
  168. Example 1: A typical DO loop
  169.  
  170.  
  171. do i = 1, n
  172.    a(i) = x * b(i)   ! iteration i reads b(i) and writes only a(i)
  173. enddo
  174.  
  175.  
  176. Example 2: A DO loop in which the array variable references a
  177. value that is not current with the index
  178.  
  179. do i = 2, n
  180.    arr(i) = b(i) - arr(i-1)   ! reads arr(i-1), which the previous iteration wrote
  181. enddo
  182.  
  183. Example 3: An example of load imbalance
  184.  
  185.       do i = 1, n
  186.           do j = 1, i          ! inner trip count equals i, so work grows with i
  187.                a(j, i) = a(j, i) * xmult
  188.           enddo
  189.       enddo
  190.  
  191.  
  192. Example 4: Load balancing
  193.  
  194.       num_threads = mp_numthreads()
  195. c$doacross local(i, j, k)
  196.       do k = 1, num_threads
  197.           do i = k, n, num_threads   ! pass k takes i = k, k+num_threads, ...
  198.                do j = 1, i           ! inner trip count still equals i
  199.                     a(j, i) = a(j, i) * xmult
  200.                enddo
  201.           enddo
  202.       enddo
  203.