Appendix 2. Main part of the program code of the developed application

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <openacc.h>
#include <math.h>

#if defined(_WIN32) || defined(_WIN64)
#include <sys/timeb.h>
#define gettime(a) _ftime(a)
/* _ftime has millisecond resolution; scale the difference to microseconds */
#define usec(t1,t2) ((((t2).time-(t1).time)*1000+((t2).millitm-(t1).millitm))*1000)
typedef struct _timeb timestruct;
#else
#include <sys/time.h>
#define gettime(a) gettimeofday(a,NULL)
#define usec(t1,t2) (((t2).tv_sec-(t1).tv_sec)*1000000+((t2).tv_usec-(t1).tv_usec))
typedef struct timeval timestruct;
#endif

/* GPU version: the matrix is reduced in p independent blocks of m rows;
   OpenACC distributes the block loop over gangs and the column updates
   over vector lanes. */
void
work( float *restrict a, float *restrict f, int n, int p, int m )
{
    float k;
    int i, j, t, r, r1, r2;
    #pragma acc parallel present(a[0:n*n],f[0:n])
    #pragma acc loop private(k,t,r1,r2)
    for( j = 1; j <= p; j++ ){
        /* [r1,r2] is the range of columns touched by block j; the first
           and last blocks border only one neighboring block */
        if( (j == 1) || (j == p) ){ t = 3 + m - 2; }
        else { t = 3 + m - 1; }
        if( j == 1 ){ r1 = 0; }
        else { r1 = (j-1)*m - 2; }
        r2 = r1 + t;
        /* forward elimination within block j */
        for( i = 1 + (j-1)*m; i < j*m; i++ ){
            k = a[i*n+i-1] / a[(i-1)*n+i-1];
            #pragma acc loop
            for( r = r1; r <= r2; r++ ){
                a[i*n+r] -= a[(i-1)*n+r] * k;
            }
            f[i] -= f[i-1] * k;
        }
        if( j == 1 ){ t = 0; }
        else { t = 1; }
        /* backward elimination within block j */
        for( i = j*m - 3; i >= (j-1)*m - t; i-- ){
            k = a[i*n+i+1] / a[(i+1)*n+i+1];
            #pragma acc loop
            for( r = r1; r <= r2; r++ ){
                a[i*n+r] -= a[(i+1)*n+r] * k;
            }
            f[i] -= f[i+1] * k;
        }
    }
}

/* Sequential host version of the same block elimination (no OpenACC
   directives). */
void
workhost( float* a, float* f, int n, int p, int m )
{
    float k;
    int i, j, t, r, r1, r2;
    for( j = 1; j <= p; j++ ){
        if( (j == 1) || (j == p) ){ t = 3 + m - 2; }
        else { t = 3 + m - 1; }
        if( j == 1 ){ r1 = 0; }
        else { r1 = (j-1)*m - 2; }
        r2 = r1 + t;
        for( i = 1 + (j-1)*m; i < j*m; i++ ){
            k = a[i*n+i-1] / a[(i-1)*n+i-1];
            for( r = r1; r <= r2; r++ ){
                a[i*n+r] -= a[(i-1)*n+r] * k;
            }
            f[i] -= f[i-1] * k;
        }
        if( j == 1 ){ t = 0; }
        else { t = 1; }
        for( i = j*m - 3; i >= (j-1)*m - t; i-- ){
            k = a[i*n+i+1] / a[(i+1)*n+i+1];
            for( r = r1; r <= r2; r++ ){
                a[i*n+r] -= a[(i+1)*n+r] * k;
            }
            f[i] -= f[i+1] * k;
        }
    }
}

/* Sequential sweep (Thomas) algorithm for the full tridiagonal system. */
void
workhost1( float* a, float* f, float* x, float* alpha, float* beta, int n )
{
    int i;
    alpha[1] = -a[1] / a[0];
    beta[1] = f[0] / a[0];
    for( i = 1; i < n-1; i++ ){
        alpha[i+1] = -a[i*n+i+1] / (a[i*n+i-1]*alpha[i] + a[i*n+i]);
        beta[i+1] = (f[i] - a[i*n+i-1]*beta[i]) / (a[i*n+i-1]*alpha[i] + a[i*n+i]);
    }
    x[n-1] = (f[n-1] - a[(n-1)*n+(n-2)]*beta[n-1])
           / (a[(n-1)*n+(n-2)]*alpha[n-1] + a[(n-1)*n+(n-1)]);
    for( i = n-2; i >= 0; i-- ){
        x[i] = alpha[i+1]*x[i+1] + beta[i+1];
    }
}
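
For reference, workhost1 is the classical sequential sweep (Thomas) algorithm. Writing a_i, b_i, c_i for the sub-, main and super-diagonal entries of row i (a[i*n+i-1], a[i*n+i] and a[i*n+i+1] in the code), the two loops compute

\[ \alpha_1 = -\frac{c_0}{b_0}, \qquad \beta_1 = \frac{f_0}{b_0}, \qquad \alpha_{i+1} = \frac{-c_i}{a_i\alpha_i + b_i}, \qquad \beta_{i+1} = \frac{f_i - a_i\beta_i}{a_i\alpha_i + b_i}, \quad i = 1,\dots,n-2, \]

followed by the back substitution

\[ x_{n-1} = \frac{f_{n-1} - a_{n-1}\beta_{n-1}}{a_{n-1}\alpha_{n-1} + b_{n-1}}, \qquad x_i = \alpha_{i+1}x_{i+1} + \beta_{i+1}, \quad i = n-2,\dots,0. \]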

/* Uniform pseudo-random value in [start, start+1) */
float
myRandom( float start )
{
    return ((float) rand() / (RAND_MAX)) + start;
}

int
main( int argc, char* argv[] )
{
    float *a, *x, *f, *alpha, *beta, *xhost, *ahost, *fhost;
    int n, i, p, m, gangs;
    timestruct t1, t2;
    long long cgpu;

    n = 0; p = 0; gangs = 0;
    if( argc > 1 ){
        n = atoi( argv[1] );
        if( argc > 2 ){
            p = atoi( argv[2] );
            if( argc > 3 ){
                gangs = atoi( argv[3] ); /* parsed but not used in this listing */
            }
        }
    }
    if( n <= 0 ) n = 5000;
    /* The original listing left p uninitialized when no second argument was
       given; a default of 10 blocks (which divides n = 5000) is assumed here. */
    if( p <= 0 ) p = 10;
    m = n / p;
    // Initialize the accelerator
    acc_set_device_type( acc_device_nvidia );
    acc_init( acc_device_nvidia );
    a = (float*) malloc( sizeof(float) * n * n );
    x = (float*) malloc( sizeof(float) * n );
    f = (float*) malloc( sizeof(float) * n );
    /* host copies for the sequential reference run (not exercised in this
       excerpt) */
    ahost = (float*) malloc( sizeof(float) * n * n );
    fhost = (float*) malloc( sizeof(float) * n );
    xhost = (float*) malloc( sizeof(float) * n );
    alpha = (float*) malloc( sizeof(float) * n );
    beta = (float*) malloc( sizeof(float) * n );
    for( i = 0; i < n*n; i++ ){
        a[i] = 0;
        ahost[i] = 0;
    }
    /* fill the three diagonals with random values; the main diagonal is
       offset by 2 so the matrix stays diagonally dominant */
    ahost[0] = a[0] = myRandom(2);
    ahost[1] = a[1] = myRandom(0);
    for( i = 1; i < n-1; i++ ){
        ahost[i*n+i-1] = a[i*n+i-1] = myRandom(0);
        ahost[i*n+i] = a[i*n+i] = myRandom(2);
        ahost[i*n+i+1] = a[i*n+i+1] = myRandom(0);
    }
    ahost[(n-1)*n+(n-2)] = a[(n-1)*n+(n-2)] = myRandom(0);
    ahost[(n-1)*n+(n-1)] = a[(n-1)*n+(n-1)] = myRandom(2);
    fhost[0] = f[0] = myRandom(2);
    for( i = 1; i < n; i++ ){
        fhost[i] = f[i] = myRandom(0);
    }
    gettime( &t1 );
    // Copy the data to the device before and back after the computation
    #pragma acc data copy(a[0:n*n],f[0:n])
    {
        // Computation on the accelerator
        work( a, f, n, p, m );
    }
    gettime( &t2 );
    cgpu = usec(t1,t2);
    printf("matrix %d\n", n);
    printf("p %d\n", p);
    printf( "%13lld microseconds on gpu\n", cgpu );
    free(a); free(x); free(f);
    free(ahost); free(fhost); free(xhost);
    free(alpha); free(beta);
    return 0;
}
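
A plausible way to build and run the first listing with the PGI compiler family it targets (the thesis does not show its build line, so the file and binary names are placeholders):

pgcc -acc -Minfo=accel -o bvp_acc bvp_acc.c
./bvp_acc 5000 10

The remainder of the appendix is a separate CUDA C implementation of the same block method. Its header is not reproduced in the excerpt; the preamble below is a minimal sketch that would let the fragment compile. The two size macros and the routines getAuxiliaryMatrixCoeffs, setX, calculateX and solveTDM are defined elsewhere in the full source, so the values and signatures here are inferred from the call sites and should be treated as assumptions:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

/* Tuning constants: the values are assumptions, not taken from the thesis. */
#define ROWS_PER_MATRIX_BLOCK 64
#define THREADS_PER_BLOCK 128

/* Declarations inferred from the call sites in solve_Block below. */
__global__ void getAuxiliaryMatrixCoeffs(int matrixBlockCount,
    float* subDiagonal, float* mainDiagonal, float* supDiagonal, float* b,
    float* matrix2);
__global__ void setX(float* matrix2, float* x);
__global__ void calculateX(int n,
    float* subDiagonal, float* mainDiagonal, float* supDiagonal, float* b,
    float* x);
void solveTDM(int n, float* subDiagonal, float* mainDiagonal,
    float* supDiagonal, float* b, float* x);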

/* Forward elimination: each thread reduces one block of
   ROWS_PER_MATRIX_BLOCK rows. The matrix is stored as three separate
   diagonals; subDiagonal[i] is the entry of row i+1 below the diagonal
   and, after the update, holds the fill-in coefficient coupling the row
   to the previous block (zero in the first block). */
__global__ void solve_forward(int n,
    float* subDiagonal, float* mainDiagonal, float* supDiagonal,
    float* b)
{
    int blockIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int startRow = blockIndex * ROWS_PER_MATRIX_BLOCK;
    int endRow = startRow + ROWS_PER_MATRIX_BLOCK - 1;
    float k;
    for(int rowIndex = startRow; rowIndex < endRow; ++rowIndex)
    {
        k = subDiagonal[rowIndex] / mainDiagonal[rowIndex];
        if(blockIndex > 0)
        {
            subDiagonal[rowIndex] = -subDiagonal[rowIndex - 1] * k;
        }
        else
        {
            subDiagonal[rowIndex] = 0.0f;
        }
        mainDiagonal[rowIndex + 1] -= supDiagonal[rowIndex] * k;
        b[rowIndex + 1] -= b[rowIndex] * k;
    }
}

/* Backward elimination within each block; supDiagonal is reused for the
   fill-in coefficients coupling the block to its right neighbor. */
__global__ void solve_backward(int n,
    float* subDiagonal, float* mainDiagonal, float* supDiagonal,
    float* b)
{
    int blockIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int startRow = blockIndex * ROWS_PER_MATRIX_BLOCK;
    int endRow = startRow + ROWS_PER_MATRIX_BLOCK - 1;
    startRow = max(startRow, 1);
    float k;
    for(int rowIndex = endRow - 1; rowIndex >= startRow; --rowIndex)
    {
        k = supDiagonal[rowIndex - 1] / mainDiagonal[rowIndex];
        if(blockIndex > 0)
        {
            if(rowIndex > startRow)
            {
                subDiagonal[rowIndex - 2] -= subDiagonal[rowIndex - 1] * k;
            }
            else
            {
                mainDiagonal[rowIndex - 1] -= subDiagonal[rowIndex - 1] * k;
            }
        }
        supDiagonal[rowIndex - 1] = -supDiagonal[rowIndex] * k;
        b[rowIndex - 1] -= b[rowIndex] * k;
    }
}

double solve_Block(int n,
    float* subDiagonal, float* mainDiagonal, float* supDiagonal,
    float* b,
    float* x)
{
    clock_t start, finish;
    double time;
    cudaError_t error;
    if(n % ROWS_PER_MATRIX_BLOCK != 0)
    {
        printf("n cannot be divided evenly by %d (ROWS_PER_MATRIX_BLOCK).\n",
            ROWS_PER_MATRIX_BLOCK);
        exit(-1);
    }
    int matrixBlockCount = n / ROWS_PER_MATRIX_BLOCK;
    if(matrixBlockCount % THREADS_PER_BLOCK != 0)
    {
        printf("matrixBlockCount cannot be divided evenly by %d (THREADS_PER_BLOCK).\n",
            THREADS_PER_BLOCK);
        exit(-1);
    }
    start = clock();
    size_t dataElementSize = sizeof(float);
    float* deviceSubDiagonal = 0;
    float* deviceMainDiagonal = 0;
    float* deviceSupDiagonal = 0;
    float* deviceB = 0;
    float* deviceX = 0;
    float* deviceMatrix2 = 0; /* reduced interface system: 4 arrays of
                                 matrixBlockCount floats */
    // Allocate device memory
    cudaMalloc(&deviceSubDiagonal, dataElementSize * (n - 1));
    cudaMalloc(&deviceMainDiagonal, dataElementSize * n);
    cudaMalloc(&deviceSupDiagonal, dataElementSize * (n - 1));
    cudaMalloc(&deviceB, dataElementSize * n);
    cudaMalloc(&deviceX, dataElementSize * n);
    cudaMalloc(&deviceMatrix2, dataElementSize * matrixBlockCount * 4);

    error = cudaGetLastError();
    if(error == cudaSuccess)
    {
        // Copy the input data to the accelerator
        cudaMemcpy(deviceSubDiagonal, subDiagonal,
            dataElementSize * (n - 1), cudaMemcpyHostToDevice);
        cudaMemcpy(deviceMainDiagonal, mainDiagonal,
            dataElementSize * n, cudaMemcpyHostToDevice);
        cudaMemcpy(deviceSupDiagonal, supDiagonal,
            dataElementSize * (n - 1), cudaMemcpyHostToDevice);
        cudaMemcpy(deviceB, b,
            dataElementSize * n, cudaMemcpyHostToDevice);
        int cudaBlockCount = matrixBlockCount / THREADS_PER_BLOCK;
        printf("Equation count = %d.\n", n);
        printf("CUDA block count = %d.\n", cudaBlockCount);
        printf("Threads per block = %d.\n", THREADS_PER_BLOCK);
        printf("Rows per thread = %d.\n", ROWS_PER_MATRIX_BLOCK);
        printf("\n");
        // Forward sweep of the method
        solve_forward<<<cudaBlockCount, THREADS_PER_BLOCK>>>(
            n,
            deviceSubDiagonal, deviceMainDiagonal, deviceSupDiagonal, deviceB);
        cudaDeviceSynchronize();
        // Backward sweep of the method
        solve_backward<<<cudaBlockCount, THREADS_PER_BLOCK>>>(
            n,
            deviceSubDiagonal, deviceMainDiagonal, deviceSupDiagonal, deviceB);
        cudaDeviceSynchronize();
        // Gather the reduced system that couples the block boundary rows
        getAuxiliaryMatrixCoeffs<<<cudaBlockCount, THREADS_PER_BLOCK>>>(
            matrixBlockCount,
            deviceSubDiagonal, deviceMainDiagonal, deviceSupDiagonal, deviceB,
            deviceMatrix2);
        float* Matrix2[5];
        Matrix2[0] = new float[matrixBlockCount - 1];
        Matrix2[1] = new float[matrixBlockCount];
        Matrix2[2] = new float[matrixBlockCount - 1];
        Matrix2[3] = new float[matrixBlockCount];
        Matrix2[4] = new float[matrixBlockCount];
        cudaDeviceSynchronize();
        // Copy the reduced system back to the host
        cudaMemcpy(Matrix2[0], &deviceMatrix2[0 * matrixBlockCount],
            dataElementSize * (matrixBlockCount - 1), cudaMemcpyDeviceToHost);
        cudaMemcpy(Matrix2[1], &deviceMatrix2[1 * matrixBlockCount],
            dataElementSize * matrixBlockCount, cudaMemcpyDeviceToHost);
        cudaMemcpy(Matrix2[2], &deviceMatrix2[2 * matrixBlockCount],
            dataElementSize * (matrixBlockCount - 1), cudaMemcpyDeviceToHost);
        cudaMemcpy(Matrix2[3], &deviceMatrix2[3 * matrixBlockCount],
            dataElementSize * matrixBlockCount, cudaMemcpyDeviceToHost);
        // Solve the small reduced system sequentially on the host
        solveTDM(matrixBlockCount,
            Matrix2[0], Matrix2[1], Matrix2[2],
            Matrix2[3],
            Matrix2[4]);
        // Upload the interface unknowns and scatter them into x
        cudaMemcpy(deviceMatrix2, Matrix2[4],
            dataElementSize * matrixBlockCount, cudaMemcpyHostToDevice);
        setX<<<cudaBlockCount, THREADS_PER_BLOCK>>>(
            deviceMatrix2,
            deviceX);
        delete[] Matrix2[0];
        delete[] Matrix2[1];
        delete[] Matrix2[2];
        delete[] Matrix2[3];
        delete[] Matrix2[4];
        cudaDeviceSynchronize();
        // Recover the remaining unknowns
        calculateX<<<cudaBlockCount, THREADS_PER_BLOCK>>>(
            n,
            deviceSubDiagonal, deviceMainDiagonal, deviceSupDiagonal, deviceB,
            deviceX);
        cudaDeviceSynchronize();
        cudaMemcpy(x, deviceX,
            dataElementSize * n, cudaMemcpyDeviceToHost);
        error = cudaGetLastError();
    }

    // Free device memory
    if(deviceSubDiagonal != 0) {
        cudaFree(deviceSubDiagonal);
        deviceSubDiagonal = 0;
    }
    if(deviceMainDiagonal != 0) {
        cudaFree(deviceMainDiagonal);
        deviceMainDiagonal = 0;
    }
    if(deviceSupDiagonal != 0) {
        cudaFree(deviceSupDiagonal);
        deviceSupDiagonal = 0;
    }
    if(deviceB != 0) {
        cudaFree(deviceB);
        deviceB = 0;
    }
    if(deviceX != 0) {
        cudaFree(deviceX);
        deviceX = 0;
    }
    if(deviceMatrix2 != 0) {
        cudaFree(deviceMatrix2);
        deviceMatrix2 = 0;
    }
    finish = clock();
    time = ((double)finish - (double)start) / CLOCKS_PER_SEC;
    if(error != cudaSuccess)
    {
        printf("Error occurred (%s).\n", cudaGetErrorString(error));
    }
    return time;
}
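
solveTDM, the sequential solve of the reduced interface system, is not included in this excerpt. A minimal sketch under the signature inferred above, reusing the sweep recurrences of workhost1 from the first listing (an illustration, not the thesis's actual routine):

/* Host-side Thomas algorithm on the reduced tridiagonal system.
   subDiagonal[i-1], mainDiagonal[i] and supDiagonal[i] are the nonzero
   entries of row i, in the same storage convention as the kernels above. */
void solveTDM(int n, float* subDiagonal, float* mainDiagonal,
    float* supDiagonal, float* b, float* x)
{
    float* alpha = new float[n];
    float* beta = new float[n];
    alpha[1] = -supDiagonal[0] / mainDiagonal[0];
    beta[1] = b[0] / mainDiagonal[0];
    for(int i = 1; i < n - 1; ++i)
    {
        float denom = subDiagonal[i - 1] * alpha[i] + mainDiagonal[i];
        alpha[i + 1] = -supDiagonal[i] / denom;
        beta[i + 1] = (b[i] - subDiagonal[i - 1] * beta[i]) / denom;
    }
    x[n - 1] = (b[n - 1] - subDiagonal[n - 2] * beta[n - 1])
             / (subDiagonal[n - 2] * alpha[n - 1] + mainDiagonal[n - 1]);
    for(int i = n - 2; i >= 0; --i)
    {
        x[i] = alpha[i + 1] * x[i + 1] + beta[i + 1];
    }
    delete[] alpha;
    delete[] beta;
}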