home *** CD-ROM | disk | FTP | other *** search
/ Mega CD-ROM 1 / megacd_rom_1.zip / megacd_rom_1 / MAGAZINE / DDJMAG / DDJ8911.ZIP / BAUER.LST next >
File List  |  1989-10-04  |  10KB  |  200 lines

  1.  
  2. _OPTIMIZING IN A PARALLEL ENVIRONMENT_
  3. by Barr E. Bauer
  4.  
  5. [LISTING ONE]
  6.  
  7.       program test                                                      1
  8. *                                                                       2
  9. * purpose is to test SGI parallelization scheme for loop selection,     3
  10. * numerically-intensive calculations, and total reduction. See text     4
  11. * for details.                                                          5
  12. *                                                                       6
  13.       parameter (MAXFIRST=250, MAXSECOND=250, MAXTHIRD=10)              7
  14.       real*8 a(MAXTHIRD,MAXSECOND,MAXFIRST)                             8
  15.       real*8 b(MAXTHIRD,MAXSECOND,MAXFIRST)                             9
  16.       real*8 sub_total(MAXFIRST), partial_total(4)  ! <= 4 threads      10
  17.       real*8 d(MAXTHIRD), c, tmp    ! local variables                   11
  18.       real*8 dist(MAXSECOND,MAXFIRST)                                   12
  19.       real*8 grand_total      ! test for proper operation               13
  20.       logical parallel        ! selects 2-version loops                 14
  21.       integer*4 iflag         ! used to show LASTLOCAL value            15
  22.                                                                         16
  23.       data parallel /.false./                                           17
  24.       data sub_total, iflag /MAXFIRST*0.0, 0/                           18
  25. *                                                                       19
  26. * outer loop: contains both interior loops                              20
  27. *                                                                       21
  28.                                                                         22
  29. * C$doacross local(k,j,i,tmp,d,c), share(a,b,sub_total,dist),           23
  30. * C$&        lastlocal(iflag)                                           24
  31.                                                                         25
  32.       do i = 1, MAXFIRST                                                26
  33. *                                                                       27
  34. * first inner loop: fills arrays a and b                                28
  35. *                                                                       29
  36.                                                                         30
  37. * C$doacross local(j,k,c), share(i,a,b)                                 31
  38.                                                                         32
  39.         do j = 1, MAXSECOND                                             33
  40.           do k = 1, MAXTHIRD                                            34
  41.             a(k,j,i) = dsqrt(dfloat(i*j*k))                             35
  42.             c = 1.0 - a(k,j,i)                                          36
  43.             if (c .le. 0.0 .and. i .lt. j*k) then                       37
  44.               c = -c                                                    38
  45.             else                                                        39
  46.               c = c**2                                                  40
  47.             endif                                                       41
  48.             b(k,j,i) = 32*(dcos(c)**5)*dsin(c)-                         42
  49.      1                 32*(dcos(c)**3)*dsin(c)+                         43
  50.      2                  6*dcos(c)*dsin(c)                               44
  51.           enddo                                                         45
  52.         enddo                                                           46
  53. *                                                                       47
  54. * second inner loop: determines distance and starts summation           48
  55. *                                                                       49
  56.                                                                         50
  57. * c$doacross local(j,k,d,tmp), share(i,a,b,dist,sub_total),             51
  58. * c$&        lastlocal(iflag)                                           52
  59.                                                                         53
              do j=1, MAXSECOND                                               54
  60.           tmp = 0.0                                                     55
  61.           do k = 1, MAXTHIRD                                            56
  62.             d(k) = a(k,j,i) - b(k,j,i)                                  57
  63.           enddo                                                         58
  64.           do k = 1, MAXTHIRD                                            59
  65.             tmp = tmp + d(k)**2                                         60
  66.           enddo                                                         61
  67.           dist(j,i) = dsqrt(tmp)                                        62
  68.           if (dist(j,i) .le. 0.1) iflag = iflag + 1                     63
  69.           sub_total(j) = sub_total(j) + dist(j,i)                       64
  70.         enddo                                                           65
  71.       enddo                                                             66
  72. *                                                                       67
  73. * the next section is an example of sum reduction optimized to the      68
  74. * parallel environment and the use of a more efficient 2 loop summation 69
  75. *                                                                       70
  76. * if -mp option is active, parallel is set to .true. which then         71
  77. * selects the parallel version                                          72
  78. *                                                                       73
  79.                                                                         74
  80. C$    parallel = .true.                                                 75
  81.       grand_total = 0.0                                                 76
  82.       if (parallel) then                     ! parallel version         77
  83. C$      num_threads = mp_numthreads()                                   78
  84.         ichunk = (MAXFIRST + (num_threads - 1))/num_threads             79
  85.                                                                         80
  86. C$doacross local(k,j),                                                  81
  87. C$&        share(num_threads,partial_total,sub_total,ichunk)            82
  88.                                                                         83
  89.         do k = 1, num_threads ! this loop is parallelized               84
  90.           partial_total(k) = 0.0                                        85
  91.           do j = k*ichunk - ichunk + 1, min(k*ichunk,MAXFIRST)          86
  92.             partial_total(k) = partial_total(k) + sub_total(j)          87
  93.           enddo                                                         88
  94.         enddo                                                           89
  95.         do j = 1, num_threads   ! smaller loop handled as scalar        90
  96.           grand_total = grand_total + partial_total(j)                  91
  97.         enddo                                                           92
  98.       else                                   ! the scalar version       93
  99.         do j = 1, MAXFIRST                                              94
  100.           grand_total = grand_total + sub_total(j)                      95
  101.         enddo                                                           96
  102.       endif                                                             97
  103.                                                                         98
  104.       if (parallel) then                                                99
  105. C$      write (*,10) grand_total, num_threads                           100
  106. C$      write (*,20) iflag                                              101
  107.       else                                                              102
  108.         write (*,30) grand_total                                        103
  109.         write (*,40) iflag                                              104
  110.       endif                                                             105
  111.       stop                                                              106
  112. C$10  format(1x,'grand total = ',g10.3,'threads = ',i4)                 107
  113. C$20  format(1x,'parallel iflag = ',i10)                                108
       30    format(1x,'grand total = ',g10.3)                                 109
  114. 40    format(1x,'scalar iflag = ',i10)                                  110
  115.       end                                                               111
  116.  
  117.  
  118.  
  119. [LISTING TWO]
  120.  
  121.  
  122. (source code)
  123.  
  124.       subroutine example(a, b, c, n)
  125.       integer*4 n
  126.       real*4 a(n), b(n), c(n)
  127.  
  128.       (additional code)
  129.  
  130. c$doacross local(i, x)
  131.       do i=1, n
  132.         x = a(i) * b(i)       ! index by the loop variable, not by n
  133.         c(i) = x**2           ! each iteration writes its own element
  134.       enddo
  135.  
  136.       (additional code)
  137.  
  138.       return
  139.       end
  140.  
  141. (the loop is transformed to)
  142.  
  143.       subroutine _example_1(
  144.      1  _local_start,             ! index starting value
            2  _local_ntrip,             ! number of loop executions
  145.      3  _incr,                    ! index increment
  146.      4  _my_threadno)             ! unique process ID number
  147.  
  148.       integer*4 _local_start, _local_ntrip, _incr, _my_threadno
  149.  
  150.       integer*4  i                ! declared local
  151.       real*4     x                ! declared local
  152.  
  153.       integer*4  _tmp             ! created local
  154.  
  155.       i = _local_start
  156.       do _tmp = 1, _local_ntrip
  157.         x = a(i) * b(i)
  158.         c(i) = x**2
  159.         i = i + _incr
  160.       enddo
  161.       return
  162.       end
  163.  
  164.  
  165. Example 1: A typical DO loop
  166.  
  167.  
  168. do i = 1, n
  169.    a(i) = x * b(i)     ! iteration i reads b(i) and writes only a(i)
  170. enddo
  171.  
  172.  
  173. Example 2: A DO loop in which the array variable references a
  174. value that is not current with the index
  175.  
  176. do i = 2, n
  177.    arr(i) = b(i) - arr(i-1)   ! reads arr(i-1), the element assigned by the previous iteration
  178. enddo
  179.  
  180. Example 3: An example of load imbalance
  181.  
  182.       do i = 1, n
  183.           do j = 1, i        ! inner trip count equals i, so it grows with the outer index
  184.                a(j, i) = a(j, i) * xmult
  185.           enddo
  186.       enddo
  187.  
  188.  
  189. Example 4: Load balancing
  190.  
  191.       num_threads = mp_numthreads()
  192. c$doacross local(i, j, k)
  193.       do k = 1, num_threads
  194.           do i = k, n, num_threads   ! for outer index k: i = k, k+num_threads, ... up to n
  195.                do j = 1, i
  196.                     a(j, i) = a(j, i) * xmult
  197.                enddo
  198.           enddo
  199.       enddo
  200.