Actual source code: fsolvebaij.F

  1: !
  2: !  "$Id: fsolvebaij.F,v 1.9 2001/08/07 03:05:24 balay Exp $";
  3: !
  4: !    Fortran kernel for sparse triangular solve in the BAIJ matrix format
  5: ! This ONLY works for factorizations in the NATURAL ORDERING, i.e.
  6: ! with MatSolve_SeqBAIJ_4_NaturalOrdering()
  7: !
 8:  #include include/finclude/petscdef.h
  9: !

 11:       subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b)
 12:       implicit none
 13:       MatScalar        a(0:*)
 14:       PetscScalar      x(0:*),b(0:*)
 15:       integer          n,ai(0:*),aj(0:*),adiag(0:*)

 17:       integer          i,j,jstart,jend,idx,ax,jdx
 18:       PetscScalar      s1,s2,s3,s4
 19:       PetscScalar      x1,x2,x3,x4
 20: !
 21: !     Forward Solve
 22: !

 24:       x(0) = b(0)
 25:       x(1) = b(1)
 26:       x(2) = b(2)
 27:       x(3) = b(3)
 28:       idx  = 0
 29:       do 20 i=1,n-1
 30:          jstart = ai(i)
 31:          jend   = adiag(i) - 1
 32:          ax    = 16*jstart
 33:          idx    = idx + 4
 34:          s1     = b(idx)
 35:          s2     = b(idx+1)
 36:          s3     = b(idx+2)
 37:          s4     = b(idx+3)
 38:          do 30 j=jstart,jend
 39:            jdx   = 4*aj(j)
 40: 
 41:            x1    = x(jdx)
 42:            x2    = x(jdx+1)
 43:            x3    = x(jdx+2)
 44:            x4    = x(jdx+3)
 45:            s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
 46:            s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
 47:            s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
 48:            s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
 49:            ax = ax + 16
 50:  30      continue
 51:          x(idx)   = s1
 52:          x(idx+1) = s2
 53:          x(idx+2) = s3
 54:          x(idx+3) = s4
 55:  20   continue
 56: 
 57: !
 58: !     Backward solve the upper triangular
 59: !
 60:       do 40 i=n-1,0,-1
 61:          jstart  = adiag(i) + 1
 62:          jend    = ai(i+1) - 1
 63:          ax     = 16*jstart
 64:          s1      = x(idx)
 65:          s2      = x(idx+1)
 66:          s3      = x(idx+2)
 67:          s4      = x(idx+3)
 68:          do 50 j=jstart,jend
 69:            jdx   = 4*aj(j)
 70:            x1    = x(jdx)
 71:            x2    = x(jdx+1)
 72:            x3    = x(jdx+2)
 73:            x4    = x(jdx+3)
 74:            s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
 75:            s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
 76:            s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
 77:            s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
 78:            ax = ax + 16
 79:  50      continue
 80:          ax      = 16*adiag(i)
 81:          x(idx)   = a(ax)*s1  +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4
 82:          x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4
 83:          x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4
 84:          x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4
 85:          idx      = idx - 4
 86:  40   continue
 87:       return
 88:       end
 89: 
 90: !
 91: !   version that calls BLAS 2 operation for each row block
 92: !
 93:       subroutine FortranSolveBAIJ4BLAS(n,x,ai,aj,adiag,a,b,w)
 94:       implicit none
 95:       MatScalar        a(0:*),w(0:*)
 96:       PetscScalar      x(0:*),b(0:*)
 97:       integer          n,ai(0:*),aj(0:*),adiag(0:*)

 99:       integer          i,j,jstart,jend,idx,ax,jdx,kdx
100:       MatScalar        s(0:3)
101: !
102: !     Forward Solve
103: !

105:       x(0) = b(0)
106:       x(1) = b(1)
107:       x(2) = b(2)
108:       x(3) = b(3)
109:       idx  = 0
110:       do 20 i=1,n-1
111: !
112: !        Pack required part of vector into work array
113: !
114:          kdx    = 0
115:          jstart = ai(i)
116:          jend   = adiag(i) - 1
117:          if (jend - jstart .ge. 500) then
118:            write(6,*) 'Overflowing vector FortranSolveBAIJ4BLAS()'
119:          endif
120:          do 30 j=jstart,jend
121: 
122:            jdx       = 4*aj(j)
123: 
124:            w(kdx)    = x(jdx)
125:            w(kdx+1)  = x(jdx+1)
126:            w(kdx+2)  = x(jdx+2)
127:            w(kdx+3)  = x(jdx+3)
128:            kdx       = kdx + 4
129:  30      continue

131:          ax      = 16*jstart
132:          idx      = idx + 4
133:          s(0)     = b(idx)
134:          s(1)     = b(idx+1)
135:          s(2)     = b(idx+2)
136:          s(3)     = b(idx+3)
137: !
138: !    s = s - a(ax:)*w
139: !
140:          call dgemv('n',4,4*(jend-jstart+1),-1.d0,a(ax),4,w,1,1.d0,s,1)
141: !         call sgemv('n',4,4*(jend-jstart+1),-1.0,a(ax),4,w,1,1.0,s,1)

143:          x(idx)   = s(0)
144:          x(idx+1) = s(1)
145:          x(idx+2) = s(2)
146:          x(idx+3) = s(3)
147:  20   continue
148: 
149: !
150: !     Backward solve the upper triangular
151: !
152:       do 40 i=n-1,0,-1
153:          jstart    = adiag(i) + 1
154:          jend      = ai(i+1) - 1
155:          ax       = 16*jstart
156:          s(0)      = x(idx)
157:          s(1)      = x(idx+1)
158:          s(2)      = x(idx+2)
159:          s(3)      = x(idx+3)
160: !
161: !   Pack each chunk of vector needed
162: !
163:          kdx = 0
164:          if (jend - jstart .ge. 500) then
165:            write(6,*) 'Overflowing vector FortranSolveBAIJ4BLAS()'
166:          endif
167:          do 50 j=jstart,jend
168:            jdx      = 4*aj(j)
169:            w(kdx)   = x(jdx)
170:            w(kdx+1) = x(jdx+1)
171:            w(kdx+2) = x(jdx+2)
172:            w(kdx+3) = x(jdx+3)
173:            kdx      = kdx + 4
174:  50      continue
175: !         call sgemv('n',4,4*(jend-jstart+1),-1.0,a(ax),4,w,1,1.0,s,1)
176:          call dgemv('n',4,4*(jend-jstart+1),-1.d0,a(ax),4,w,1,1.d0,s,1)

178:          ax      = 16*adiag(i)
179:          x(idx)  = a(ax)*s(0)  +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
180:          x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
181:          x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
182:          x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
183:          idx     = idx - 4
184:  40   continue
185:       return
186:       end
187: 

189: !
190: !   version that does not call BLAS 2 operation for each row block
191: !
192:       subroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w)
193:       implicit none
194:       MatScalar        a(0:*)
195:       PetscScalar      x(0:*),b(0:*),w(0:*)
196:       integer          n,ai(0:*),aj(0:*),adiag(0:*),ii,jj

198:       integer          i,j,jstart,jend,idx,ax,jdx,kdx,nn
199:       PetscScalar      s(0:3)
200: !
201: !     Forward Solve
202: !

204:       x(0) = b(0)
205:       x(1) = b(1)
206:       x(2) = b(2)
207:       x(3) = b(3)
208:       idx  = 0
209:       do 20 i=1,n-1
210: !
211: !        Pack required part of vector into work array
212: !
213:          kdx    = 0
214:          jstart = ai(i)
215:          jend   = adiag(i) - 1
216:          if (jend - jstart .ge. 500) then
217:            write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
218:          endif
219:          do 30 j=jstart,jend
220: 
221:            jdx       = 4*aj(j)
222: 
223:            w(kdx)    = x(jdx)
224:            w(kdx+1)  = x(jdx+1)
225:            w(kdx+2)  = x(jdx+2)
226:            w(kdx+3)  = x(jdx+3)
227:            kdx       = kdx + 4
228:  30      continue

230:          ax       = 16*jstart
231:          idx      = idx + 4
232:          s(0)     = b(idx)
233:          s(1)     = b(idx+1)
234:          s(2)     = b(idx+2)
235:          s(3)     = b(idx+3)
236: !
237: !    s = s - a(ax:)*w
238: !
239:          nn = 4*(jend - jstart + 1) - 1
240:          do 100, ii=0,3
241:            do 110, jj=0,nn
242:              s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
243:  110       continue
244:  100     continue

246:          x(idx)   = s(0)
247:          x(idx+1) = s(1)
248:          x(idx+2) = s(2)
249:          x(idx+3) = s(3)
250:  20   continue
251: 
252: !
253: !     Backward solve the upper triangular
254: !
255:       do 40 i=n-1,0,-1
256:          jstart    = adiag(i) + 1
257:          jend      = ai(i+1) - 1
258:          ax        = 16*jstart
259:          s(0)      = x(idx)
260:          s(1)      = x(idx+1)
261:          s(2)      = x(idx+2)
262:          s(3)      = x(idx+3)
263: !
264: !   Pack each chunk of vector needed
265: !
266:          kdx = 0
267:          if (jend - jstart .ge. 500) then
268:            write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
269:          endif
270:          do 50 j=jstart,jend
271:            jdx      = 4*aj(j)
272:            w(kdx)   = x(jdx)
273:            w(kdx+1) = x(jdx+1)
274:            w(kdx+2) = x(jdx+2)
275:            w(kdx+3) = x(jdx+3)
276:            kdx      = kdx + 4
277:  50      continue
278:          nn = 4*(jend - jstart + 1) - 1
279:          do 200, ii=0,3
280:            do 210, jj=0,nn
281:              s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
282:  210       continue
283:  200     continue

285:          ax      = 16*adiag(i)
286:          x(idx)  = a(ax)*s(0)  +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
287:          x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
288:          x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
289:          x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
290:          idx     = idx - 4
291:  40   continue
292:       return
293:       end
294: