-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLU_Task.c
166 lines (145 loc) · 4.95 KB
/
LU_Task.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<omp.h>
/*
*
* Decomposition Function, the result of this function is the Matrix A decomposed in the LU form with Pivoting.
* A= L-E+U s.t. P*A=L*U
* This function isn't parallelized because there are a lot of critical part and using the task or the critical region
* insert only overhead because the code must be filled with taskwait and critical region.
*/
int LUPDecompose(double *A, int N, double Tol, int *P)
{
int i,z, j, k, imax,flag=1;
double maxA, absA;
for (i = 0; i <= N; i++)
P[i] = i;
for (i = 0; i < N; i++)
{
maxA = 0.0;
imax = i;
for (k = i; k < N; k++)
if ((absA = fabs(A[k * N + i])) > maxA) //Selecting the Pivot of the Column
{
maxA = absA;
imax = k;
}
if (maxA > Tol)
{
flag = 1;
if (imax != i)
{
//Pivoting vector P
j = P[i];
P[i] = P[imax];
P[imax] = j;
//Pivoting Rows of A
for (z = 0; z < N; z++)
{
A[(2 * N * N)] = A[i * N + z];
A[i * N + z] = A[imax * N + z];
A[imax * N + z] = A[(2 * N * N)];
}
P[N]++;
}
//Compute the Matrix A
for (j = i + 1; j < N; j++)
{
A[j * N + i] /= A[i * N + i];
for (k = i + 1; k < N; k++)
{
A[j * N + k] -= A[j * N + i] * A[i * N + k];
}
}
}
else
{
flag = 0; //Error in decomposition
}
}
return flag;
}
/*
*
* This function computes the inversion of the previous matrix A
* The result is the the second part of the matrix A, starting from N*N.
* This version is the task version, the matrix is quite regular, so adding task cause only overhead (see the version with omp for, is faster and don't use task, simpler)
*/
void LUPInvert(double *A, int *P, int N,int nthreads,int chunk)
{
#pragma omp parallel shared(A,P,N,nthreads, chunk)
{
#pragma omp single
{
//is possible parallelize only the external loop because the inner region are critic
#pragma omp task
for (int j = 0; j < N; j++)
{
for (int i = 0; i < N; i++)
{
if (P[i] == j)
A[(N*N)+i*N+j] = 1.0;
else
A[(N*N)+i*N+j] = 0.0;
//it's possible add a pragma omp but need at the end a taskwait and introduce delay (remove the // and the code is slower than the serial)
//#pragma omp task
for (int k = 0; k < i; k++)
{
A[(N*N)+i*N+j] -= A[i*N+k] * A[(N*N)+k*N+j]; // Computation of the inverse
}
//#pragma omp taskwait
}
#pragma omp task
for (int i = N - 1; i >= 0; i--)
{
#pragma omp task
for (int k = i + 1; k < N; k++)
{
A[(N * N) + i * N + j] -= A[i * N + k] * A[(N * N) + k * N + j];
}
//there is a loop carried dependence so wait to inner loop end
#pragma omp taskwait
A[(N*N)+i*N+j] = A[(N*N)+i*N+j] / A[i*N+i];
}
}
}
}
}
int main()
{
int n;
printf("Dim A: ");
scanf("%d",&n);
int chunk=10;
int var;
printf("Insert number of threads: ");
scanf("%d",&var);
omp_set_num_threads(var);
int nthreads = omp_get_num_threads();
double *A=calloc(1+(2*n*n),sizeof(double *)); //contiguous allocation
// loop for fill the matrix
for(int i=0;i<n;i++)
{
for(int j=0;j<n;j++)
{
A[i*n+j]=rand()%10;
}
}
int *P=malloc((n+1)*sizeof(int));
double tol=0.000000000001; // to avoid the method degenerate
double begin,begin1;
double end,end1;
begin=begin1=omp_get_wtime();
int flag = LUPDecompose(A,n,tol,P);
end=end1=omp_get_wtime();
printf("FunctionA: %lf\n",end1-begin1);
if(flag!=0)
{
begin1 = omp_get_wtime();
LUPInvert(A, P, n, nthreads, chunk);
end1 = end = omp_get_wtime();
}
printf("FunctionB: %lf\n",end1-begin1);
printf("LU: %lf\n",end-begin);
}