Добавил:

hiiamfool Опубликованный материал нарушает ваши авторские права? Сообщите нам.

Вуз:

Санкт-Петербургский государственный электротехнический университет "ЛЭТИ"

Предмет:

Архитектура параллельных вычислительных систем

Файл:

1 / 1302_3_1

.pdf

Скачиваний:

Добавлен:

27.12.2025

Размер:

750.26 Кб

Скачать

☆

<<< < Предыдущая 12 / 32 3 > Следующая >>>

к замедлению работы (например, для 10000x10000 время на 16 потоках выше, чем на 4 и 8).

Это явление, вероятнее всего, связано с архитектурой тестовой системы:

•Предел физических ядер: система, на которой проводились тесты, вероятно, имеет 8 физических ядер. Использование большего числа потоков задействует технологию Hyper-Threading (виртуальные ядра), которая не всегда эффективна для вычислительно интенсивных задач.

•«Бутылочное горлышко» в подсистеме памяти: умножение матриц – операция, требующая интенсивного обмена данными с оперативной памятью. При большом количестве потоков пропускная способность памяти может стать ограничивающим фактором, и ядра процессора начинают простаивать в ожидании данных.

4.2.4. Сравнение методов распараллеливания (for и sections)

Сравнение двух параллельных подходов (для 4 потоков) показывает, что их производительность очень близка:

•Директива #pragma omp for является более гибким и универсальным решением. Она автоматически распределяет итерации цикла, что хорошо подходит для задач с однородными данными, как в нашем случае.

•Директива #pragma omp sections (ручное разделение) показала себя конкурентоспособной и даже немного более быстрой на размере 5000x5000 (15.49 с против 17.06 с у for). Однако она менее гибка, так как требует ручного кодирования распределения нагрузки и жестко привязана к определенному числу потоков.

В целом, для данной задачи оба метода показали высокую эффективность, но подход с директивой for является более предпочтительным с точки зрения простоты и адаптируемости кода.

Вывод

В ходе данной лабораторной работы были реализованы и сравнены последовательный и параллельные (OpenMP) алгоритмы умножения матриц. Ключевым шагом стала оптимизация кода путем изменения порядка циклов на кэш-эффективный (i, k, j), что позволило многократно ускорить вычисления.

Анализ результатов показал, что параллельные алгоритмы значительно превосходят последовательный на больших размерах матриц. Так, для матрицы 10000x10000 было достигнуто ускорение примерно в 2.3 раза. На малых размерах, напротив, выигрыша нет из-за накладных расходов на управление потоками.

Также было установлено, что производительность не масштабируется линейно: оптимальное ускорение достигалось на 4-8 потоках, после чего пропускная способность памяти становилась ограничивающим фактором. Таким образом, работа подтвердила как эффективность распараллеливания с помощью OpenMP, так и критическую важность написания кэш-эффективного кода для достижения высокой производительности.

ПРИЛОЖЕНИЕ А Листинги исходного кода программы

main.cpp

#include "matrix_ops.h"

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

void printMatrix(const Matrix& mat, const string& name) { cout << "--- Matrix " << name << " ---" << endl;

if (mat.empty()) {

cout << "(empty)" << endl; return;

}

// Для матриц больше 10x10 выводится только верхний левый угол int printSize = min((int)mat.size(), 10);

for (int i = 0; i < printSize; ++i) { for (int j = 0; j < printSize; ++j) {

cout << fixed << setprecision(1) << setw(6) << mat[i][j] << " ";

}

if (mat.size() > printSize) { cout << "...";

}

cout << endl;

}

cout << endl;

}

void showVerificationExample() {

cout << "=============== Verification example for 10x10 matrix ==============="

<< endl;

const int exampleSize = 10;

Matrix a = createMatrix(exampleSize);

Matrix b = createMatrix(exampleSize);

Matrix c = createMatrix(exampleSize);

initializeMatrix(a);

initializeMatrix(b);

printMatrix(a, "A (random)"); printMatrix(b, "B (random)");

multiplySerial(a, b, c);

clearProgressLine();

printMatrix(c, "C = A * B (Result)");

cout << "===================================================================" << endl << endl;

}

void runPerformanceTests() {

cout << "=============== Performance measurements ===============" << endl;

vector<int> sizes = {10, 100, 500, 1000, 5000, 10000}; vector<int> threadCounts = {2, 4, 8, 16};

cout << left << setw(12) << "Size"

<<setw(18) << "Serial (s)"

<<setw(22) << "Parallel FOR (s)"

<<setw(28) << "Parallel SECTIONS (s)"

<<"Threads" << endl;

cout << string(90, '-') << endl;

for (int size : sizes) {

Matrix a = createMatrix(size); Matrix b = createMatrix(size); Matrix result = createMatrix(size); initializeMatrix(a); initializeMatrix(b);

cout << "Calculating for size " << size << "x" << size << "..." << endl;

// 1. Последовательная версия

auto startSerial = chrono::high_resolution_clock::now(); multiplySerial(a, b, result);

auto endSerial = chrono::high_resolution_clock::now(); chrono::duration<double> durationSerial = endSerial - startSerial; clearProgressLine();

// 1. Параллельные версии (сохраняются в векторы) vector<double> durationsFor(threadCounts.size()); vector<double> durationsSections(threadCounts.size());

for (size_t i = 0; i < threadCounts.size(); ++i) { int threads = threadCounts[i];

// Замер FOR

auto startFor = chrono::high_resolution_clock::now(); multiplyParallelFor(a, b, result, threads);

auto endFor = chrono::high_resolution_clock::now();

durationsFor[i] = chrono::duration<double>(endFor - startFor).count(); clearProgressLine();

// Замер SECTIONS if (threads == 4) {

auto startSections = chrono::high_resolution_clock::now(); multiplyParallelSections(a, b, result);

auto endSections = chrono::high_resolution_clock::now();

durationsSections[i] = chrono::duration<double>(endSections - startSections).count();

}

cout << "\033[A\r" << string(90, ' ') << "\r";

for (size_t i = 0; i < threadCounts.size(); ++i) { int threads = threadCounts[i];

if (i == 0) {

cout << left << setw(12) << (to_string(size) + "x" + to_string(size))

<< setw(18) << fixed << setprecision(5) << durationSerial.count();

} else {

cout << left << setw(12) << "" << setw(18) << "";

}

cout << setw(22) << fixed << setprecision(5) << durationsFor[i];

if (threads == 4) {

cout << setw(28) << fixed << setprecision(5) << durationsSections[i]; } else {

cout << setw(28) << "-";

}

cout << threads << endl;

}

cout << string(90, '-') << endl;

}

/**
 * @brief Program entry point.
 *
 * Seeds rand() from the current time, prints the 10x10 verification example,
 * then runs the performance measurements.
 *
 * @return 0 on completion.
 */
int main() {
    const unsigned int seed = static_cast<unsigned int>(time(nullptr));
    srand(seed);

    showVerificationExample();
    runPerformanceTests();

    return 0;
}

matrix_ops.h

#pragma once

#include <vector>

/// Draws a one-line text progress bar on std::cout, ending with '\r' so the
/// next call overwrites it. @p percentage is a fraction: 1.0 is shown as 100 %.
void printProgress(double percentage);

/// Overwrites the current console line with spaces and returns the cursor to
/// its start, removing a previously drawn progress bar.
void clearProgressLine();

/// Dense matrix stored as a vector of rows.
using Matrix = std::vector<std::vector<double>>;

/// Returns a size x size matrix with every element set to 0.0.
Matrix createMatrix(int size);

/// Fills @p mat with values produced by rand(), each in the range 0.0 .. 9.9.
void initializeMatrix(Matrix& mat);

/// Single-threaded product: result = a * b. Prints progress while running.
/// NOTE(review): the implementation sizes all loops from a.size(), so the
/// operands appear to be expected square and equally sized — confirm.
void multiplySerial(const Matrix& a, const Matrix& b, Matrix& result);

/// OpenMP "parallel for" product: result = a * b, with @p numThreads passed to
/// the num_threads clause.
void multiplyParallelFor(const Matrix& a, const Matrix& b, Matrix& result, int numThreads);

// The "manual" method is fixed at exactly 4 threads.
// NOTE(review): presumably implemented with OpenMP sections, going by the
// name — the definition is not visible here; confirm.

void multiplyParallelSections(const Matrix& a, const Matrix& b, Matrix& result);

matrix_ops.cpp

#include "matrix_ops.h"

#include <omp.h> #include <iostream> #include <string> #include <iomanip>

/**
 * @brief Redraws a single-line progress bar on std::cout.
 *
 * The bar is 70 cells wide: cells before the marker position are '=', the
 * marker cell is '>', the rest are blank. It is followed by the percentage
 * with one decimal place and a carriage return (no newline), so the next
 * call overwrites the same console line. The stream is flushed.
 *
 * @param percentage Completed fraction; 1.0 is shown as 100.0 %.
 *
 * @note Leaves std::fixed and precision 1 set on std::cout.
 */
void printProgress(double percentage) {
    const int kBarWidth = 70;
    const int marker = static_cast<int>(kBarWidth * percentage);

    // Compose the bar body first, then emit it in one go.
    std::string bar(kBarWidth, ' ');
    for (int cell = 0; cell < kBarWidth; ++cell) {
        if (cell < marker) {
            bar[cell] = '=';
        } else if (cell == marker) {
            bar[cell] = '>';
        }
    }

    std::cout << "[";
    std::cout << bar;
    std::cout << "] " << std::fixed << std::setprecision(1) << (percentage * 100.0) << " %\r";
    std::cout.flush();
}

/**
 * @brief Erases the current console line.
 *
 * Returns the cursor to the start of the line, overwrites 85 columns with
 * spaces, returns the cursor again, and flushes std::cout.
 */
void clearProgressLine() {
    const int kLineWidth = 85;
    const std::string blanks(kLineWidth, ' ');

    std::cout << "\r" << blanks << "\r" << std::flush;
}

/**
 * @brief Allocates a square matrix filled with zeros.
 *
 * @param size Number of rows and of columns.
 * @return A size x size Matrix whose elements are all 0.0.
 */
Matrix createMatrix(int size) {
    const std::vector<double> zeroRow(size, 0.0);
    return Matrix(size, zeroRow);
}

/**
 * @brief Fills every element of a matrix with a pseudo-random value.
 *
 * Each element is set to (rand() % 100) / 10.0, i.e. one of the values
 * 0.0, 0.1, ..., 9.9. Elements are visited row by row, left to right, so
 * for a given rand() seed the contents are reproducible.
 * An empty matrix is left untouched.
 *
 * @param mat Matrix to fill in place. Each row is traversed by its own
 *            length, so rows of differing lengths are handled safely.
 */
void initializeMatrix(Matrix& mat) {
    for (auto& row : mat) {
        for (double& cell : row) {
            cell = (rand() % 100) / 10.0;
        }
    }
}

/**
 * @brief Single-threaded matrix product: result = a * b.
 *
 * Uses the (i, k, j) loop order, so the innermost loop walks one row of
 * @p b and one row of @p result contiguously. The progress bar is redrawn
 * every 10th row and once more at 100 % when the product is complete.
 *
 * @param a      Left operand.
 * @param b      Right operand.
 * @param result Output matrix; each row is zeroed before accumulation, so
 *               previous contents are discarded.
 *
 * @note All loop bounds come from a.size(). NOTE(review): this assumes
 *       @p a, @p b and @p result are all at least a.size() x a.size();
 *       the dimensions are not checked — confirm callers guarantee it.
 */
void multiplySerial(const Matrix& a, const Matrix& b, Matrix& result) {
    const int size = static_cast<int>(a.size());

    for (int i = 0; i < size; ++i) {
        if (i % 10 == 0) {
            printProgress(static_cast<double>(i) / size);
        }

        auto& outRow = result[i];
        for (int j = 0; j < size; ++j) {
            outRow[j] = 0.0;
        }

        for (int k = 0; k < size; ++k) {
            // a[i][k] and the row of b are invariant in the inner loop.
            const double aik = a[i][k];
            const auto& bRow = b[k];
            for (int j = 0; j < size; ++j) {
                outRow[j] += aik * bRow[j];
            }
        }
    }

    printProgress(1.0);
}

void multiplyParallelFor(const Matrix& a, const Matrix& b, Matrix& result, int numThreads) { int size = a.size();

#pragma omp parallel for num_threads(numThreads) for (int i = 0; i < size; ++i) {

if (i % 50 == 0) { #pragma omp critical

{