Actual source code: dgefa4.c
#define PETSCMAT_DLL

/*
     Inverts a 4 by 4 matrix using partial pivoting.

     Used by the sparse factorization routines in
     src/mat/impls/baij/seq and src/mat/impls/bdiag/seq

     See also src/inline/ilu.h

     This is a combination of the Linpack routines
     dgefa() and dgedi() specialized for a size of 4.
*/
#include "petsc.h"
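/*
   Illustrative usage sketch (not part of the original file): the kernel
   overwrites the 16 entries of a dense 4 by 4 block with the entries of its
   inverse, in place, and returns a nonzero error code on a zero pivot.
   A hypothetical caller might look like:

      MatScalar      blk[16];     * one contiguous 4x4 block, filled by the caller *
      PetscErrorCode ierr;

      ierr = Kernel_A_gets_inverse_A_4(blk);CHKERRQ(ierr);
      * blk now holds the inverse of the original block *
*/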
PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *a)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  /* gaussian elimination with partial pivoting */

  /* Parameter adjustments */
  a -= 5;
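  /*
     The shift above lets the loops below use the original LINPACK 1-based,
     column-oriented indexing: for 1 <= i,j <= 4, a[i + 4*j] now refers to
     entry (i-1) + 4*(j-1) of the caller's 0-based array.
  */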
  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index: scan the 5-k remaining entries of column k */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1; }
    }
    l         += k - 1;
    ipvt[k-1]  = l;

    if (a[l + k3] == 0.0) {
      SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) {
      aa[ll] *= stmp;
    }

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }
      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }
  ipvt[3] = 4;
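  /* a[20] is the (4,4) diagonal entry under the shifted indexing (offset 15 of the caller's array) */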
  if (a[20] == 0.0) {
    SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
  }

  /*
     Now form the inverse
  */

  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax = &a[k3 + 1];
      ay = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  return(0);
}
#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
          Order Number: 245043-001
          March 1999
          http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
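  /*
     Overview (added for readability): the macro sequence below follows
     Intel's reference code.  The matrix is loaded and rearranged in
     registers, the sixteen cofactors of the adjugate are built up four at
     a time, the determinant is reduced into a single lane, and the
     cofactors are scaled by its reciprocal.  The '-----' separators mark
     the successive stages.
  */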
  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

  /* ----------------------------------------------- */
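  /*
     Load the 16 floats and rearrange them across XMM3, XMM5, XMM4 and XMM6;
     the 0x88/0xDD shuffles perform the in-register transpose used by the
     cofactor expansion.
  */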
  SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

  SSE_COPY_PS(XMM3,XMM0)
  SSE_SHUFFLE(XMM3,XMM5,0x88)

  SSE_SHUFFLE(XMM5,XMM0,0xDD)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

  SSE_COPY_PS(XMM4,XMM0)
  SSE_SHUFFLE(XMM4,XMM6,0x88)

  SSE_SHUFFLE(XMM6,XMM0,0xDD)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM5)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM1,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)

  SSE_SHUFFLE(XMM0,XMM0,0xB1)

  SSE_COPY_PS(XMM2,XMM6)
  SSE_MULT_PS(XMM2,XMM0)
  SSE_ADD_PS(XMM2,XMM1)

  SSE_COPY_PS(XMM7,XMM3)
  SSE_MULT_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM0)
  SSE_SUB_PS(XMM2,XMM1)

  SSE_MULT_PS(XMM0,XMM3)
  SSE_SUB_PS(XMM0,XMM7)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM5)
  SSE_SHUFFLE(XMM7,XMM5,0x4E)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_COPY_PS(XMM0,XMM4)
  SSE_MULT_PS(XMM0,XMM7)
  SSE_ADD_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM4)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM0,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM5)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM0,XMM6)
  SSE_MULT_PS(XMM0,XMM1)
  SSE_ADD_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM6)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_SUB_PS(XMM2,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM6)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
  SSE_SUB_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM5)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_ADD_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_ADD_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM5)
  SSE_SUB_PS(XMM2,XMM1)

  /* ----------------------------------------------- */

  SSE_MULT_PS(XMM4,XMM3)

  SSE_SHUFFLE(XMM4,XMM4,0xB1)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM4)
  SSE_ADD_PS(XMM1,XMM7)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_MULT_PS(XMM6,XMM4)
  SSE_SUB_PS(XMM1,XMM6)

  SSE_MULT_PS(XMM5,XMM4)
  SSE_ADD_PS(XMM5,XMM7)

  /* ----------------------------------------------- */
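  /*
     Final stage: the dot product of the row kept in XMM3 with the matching
     four cofactors (reloaded from FLOAT_0) gives the determinant.
     SSE_RECIP_SS provides an approximate reciprocal that is refined with
     one Newton-Raphson step (x1 = x0*(2 - d*x0)), broadcast to all four
     lanes, and used to scale the cofactors into the inverse.
  */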
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_MULT_PS(XMM3,XMM0)

  SSE_COPY_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM3,0x4E)
  SSE_ADD_PS(XMM4,XMM3)

  SSE_COPY_PS(XMM6,XMM4)
  SSE_SHUFFLE(XMM6,XMM4,0xB1)
  SSE_ADD_SS(XMM6,XMM4)

  SSE_COPY_PS(XMM3,XMM6)
  SSE_RECIP_SS(XMM3,XMM6)
  SSE_COPY_SS(XMM4,XMM3)
  SSE_ADD_SS(XMM4,XMM3)
  SSE_MULT_SS(XMM3,XMM3)
  SSE_MULT_SS(XMM6,XMM3)
  SSE_SUB_SS(XMM4,XMM6)

  SSE_SHUFFLE(XMM4,XMM4,0x00)

  SSE_MULT_PS(XMM0,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

  SSE_MULT_PS(XMM2,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

  SSE_MULT_PS(XMM4,XMM5)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

  /* ----------------------------------------------- */

  SSE_INLINE_END_1;
  SSE_SCOPE_END;

  return(0);
}

#endif