Actual source code: fsolvebaij.F
1: !
2: ! "$Id: fsolvebaij.F,v 1.9 2001/08/07 03:05:24 balay Exp $";
3: !
4: ! Fortran kernel for sparse triangular solve in the BAIJ matrix format
5: ! This ONLY works for factorizations in the NATURAL ORDERING, i.e.
6: ! with MatSolve_SeqBAIJ_4_NaturalOrdering()
7: !
8: #include include/finclude/petscdef.h
9: !
11: subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b)
12: implicit none
13: MatScalar a(0:*)
14: PetscScalar x(0:*),b(0:*)
15: integer n,ai(0:*),aj(0:*),adiag(0:*)
17: integer i,j,jstart,jend,idx,ax,jdx
18: PetscScalar s1,s2,s3,s4
19: PetscScalar x1,x2,x3,x4
20: !
21: ! Forward Solve
22: !
24: x(0) = b(0)
25: x(1) = b(1)
26: x(2) = b(2)
27: x(3) = b(3)
28: idx = 0
29: do 20 i=1,n-1
30: jstart = ai(i)
31: jend = adiag(i) - 1
32: ax = 16*jstart
33: idx = idx + 4
34: s1 = b(idx)
35: s2 = b(idx+1)
36: s3 = b(idx+2)
37: s4 = b(idx+3)
38: do 30 j=jstart,jend
39: jdx = 4*aj(j)
40:
41: x1 = x(jdx)
42: x2 = x(jdx+1)
43: x3 = x(jdx+2)
44: x4 = x(jdx+3)
45: s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
46: s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
47: s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
48: s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
49: ax = ax + 16
50: 30 continue
51: x(idx) = s1
52: x(idx+1) = s2
53: x(idx+2) = s3
54: x(idx+3) = s4
55: 20 continue
56:
57: !
58: ! Backward solve the upper triangular
59: !
60: do 40 i=n-1,0,-1
61: jstart = adiag(i) + 1
62: jend = ai(i+1) - 1
63: ax = 16*jstart
64: s1 = x(idx)
65: s2 = x(idx+1)
66: s3 = x(idx+2)
67: s4 = x(idx+3)
68: do 50 j=jstart,jend
69: jdx = 4*aj(j)
70: x1 = x(jdx)
71: x2 = x(jdx+1)
72: x3 = x(jdx+2)
73: x4 = x(jdx+3)
74: s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
75: s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
76: s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
77: s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
78: ax = ax + 16
79: 50 continue
80: ax = 16*adiag(i)
81: x(idx) = a(ax)*s1 +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4
82: x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4
83: x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4
84: x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4
85: idx = idx - 4
86: 40 continue
87: return
88: end
89:
90: !
91: ! version that calls BLAS 2 operation for each row block
92: !
93: subroutine FortranSolveBAIJ4BLAS(n,x,ai,aj,adiag,a,b,w)
94: implicit none
95: MatScalar a(0:*),w(0:*)
96: PetscScalar x(0:*),b(0:*)
97: integer n,ai(0:*),aj(0:*),adiag(0:*)
99: integer i,j,jstart,jend,idx,ax,jdx,kdx
100: MatScalar s(0:3)
101: !
102: ! Forward Solve
103: !
105: x(0) = b(0)
106: x(1) = b(1)
107: x(2) = b(2)
108: x(3) = b(3)
109: idx = 0
110: do 20 i=1,n-1
111: !
112: ! Pack required part of vector into work array
113: !
114: kdx = 0
115: jstart = ai(i)
116: jend = adiag(i) - 1
117: if (jend - jstart .ge. 500) then
118: write(6,*) 'Overflowing vector FortranSolveBAIJ4BLAS()'
119: endif
120: do 30 j=jstart,jend
121:
122: jdx = 4*aj(j)
123:
124: w(kdx) = x(jdx)
125: w(kdx+1) = x(jdx+1)
126: w(kdx+2) = x(jdx+2)
127: w(kdx+3) = x(jdx+3)
128: kdx = kdx + 4
129: 30 continue
131: ax = 16*jstart
132: idx = idx + 4
133: s(0) = b(idx)
134: s(1) = b(idx+1)
135: s(2) = b(idx+2)
136: s(3) = b(idx+3)
137: !
138: ! s = s - a(ax:)*w
139: !
140: call dgemv('n',4,4*(jend-jstart+1),-1.d0,a(ax),4,w,1,1.d0,s,1)
141: ! call sgemv('n',4,4*(jend-jstart+1),-1.0,a(ax),4,w,1,1.0,s,1)
143: x(idx) = s(0)
144: x(idx+1) = s(1)
145: x(idx+2) = s(2)
146: x(idx+3) = s(3)
147: 20 continue
148:
149: !
150: ! Backward solve the upper triangular
151: !
152: do 40 i=n-1,0,-1
153: jstart = adiag(i) + 1
154: jend = ai(i+1) - 1
155: ax = 16*jstart
156: s(0) = x(idx)
157: s(1) = x(idx+1)
158: s(2) = x(idx+2)
159: s(3) = x(idx+3)
160: !
161: ! Pack each chunk of vector needed
162: !
163: kdx = 0
164: if (jend - jstart .ge. 500) then
165: write(6,*) 'Overflowing vector FortranSolveBAIJ4BLAS()'
166: endif
167: do 50 j=jstart,jend
168: jdx = 4*aj(j)
169: w(kdx) = x(jdx)
170: w(kdx+1) = x(jdx+1)
171: w(kdx+2) = x(jdx+2)
172: w(kdx+3) = x(jdx+3)
173: kdx = kdx + 4
174: 50 continue
175: ! call sgemv('n',4,4*(jend-jstart+1),-1.0,a(ax),4,w,1,1.0,s,1)
176: call dgemv('n',4,4*(jend-jstart+1),-1.d0,a(ax),4,w,1,1.d0,s,1)
178: ax = 16*adiag(i)
179: x(idx) = a(ax)*s(0) +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
180: x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
181: x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
182: x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
183: idx = idx - 4
184: 40 continue
185: return
186: end
187:
189: !
190: ! version that does not call BLAS 2 operation for each row block
191: !
192: subroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w)
193: implicit none
194: MatScalar a(0:*)
195: PetscScalar x(0:*),b(0:*),w(0:*)
196: integer n,ai(0:*),aj(0:*),adiag(0:*),ii,jj
198: integer i,j,jstart,jend,idx,ax,jdx,kdx,nn
199: PetscScalar s(0:3)
200: !
201: ! Forward Solve
202: !
204: x(0) = b(0)
205: x(1) = b(1)
206: x(2) = b(2)
207: x(3) = b(3)
208: idx = 0
209: do 20 i=1,n-1
210: !
211: ! Pack required part of vector into work array
212: !
213: kdx = 0
214: jstart = ai(i)
215: jend = adiag(i) - 1
216: if (jend - jstart .ge. 500) then
217: write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
218: endif
219: do 30 j=jstart,jend
220:
221: jdx = 4*aj(j)
222:
223: w(kdx) = x(jdx)
224: w(kdx+1) = x(jdx+1)
225: w(kdx+2) = x(jdx+2)
226: w(kdx+3) = x(jdx+3)
227: kdx = kdx + 4
228: 30 continue
230: ax = 16*jstart
231: idx = idx + 4
232: s(0) = b(idx)
233: s(1) = b(idx+1)
234: s(2) = b(idx+2)
235: s(3) = b(idx+3)
236: !
237: ! s = s - a(ax:)*w
238: !
239: nn = 4*(jend - jstart + 1) - 1
240: do 100, ii=0,3
241: do 110, jj=0,nn
242: s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
243: 110 continue
244: 100 continue
246: x(idx) = s(0)
247: x(idx+1) = s(1)
248: x(idx+2) = s(2)
249: x(idx+3) = s(3)
250: 20 continue
251:
252: !
253: ! Backward solve the upper triangular
254: !
255: do 40 i=n-1,0,-1
256: jstart = adiag(i) + 1
257: jend = ai(i+1) - 1
258: ax = 16*jstart
259: s(0) = x(idx)
260: s(1) = x(idx+1)
261: s(2) = x(idx+2)
262: s(3) = x(idx+3)
263: !
264: ! Pack each chunk of vector needed
265: !
266: kdx = 0
267: if (jend - jstart .ge. 500) then
268: write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
269: endif
270: do 50 j=jstart,jend
271: jdx = 4*aj(j)
272: w(kdx) = x(jdx)
273: w(kdx+1) = x(jdx+1)
274: w(kdx+2) = x(jdx+2)
275: w(kdx+3) = x(jdx+3)
276: kdx = kdx + 4
277: 50 continue
278: nn = 4*(jend - jstart + 1) - 1
279: do 200, ii=0,3
280: do 210, jj=0,nn
281: s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
282: 210 continue
283: 200 continue
285: ax = 16*adiag(i)
286: x(idx) = a(ax)*s(0) +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
287: x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
288: x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
289: x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
290: idx = idx - 4
291: 40 continue
292: return
293: end
294: