Parallel implementation of Hoare sorting

The task is to write a parallel implementation of quicksort, but the version I ended up with runs many times slower than the sequential one. I suspect I made a mistake somewhere with the recursion inside the sections. I would be very grateful for your help

#include "stdafx.h"
#include <iostream>
#include <ctime>
#include <omp.h>

using namespace std;

const int n = 1000;
int first, last;

// Quicksort with an attempted OpenMP-sections parallelization.
// NOTE(review): `d` is incremented once per swap inside the partition loop, so
// it counts swaps, not recursion depth; `if (d == 1)` therefore almost never
// activates the parallel region, and the code runs serially while still paying
// the OpenMP setup cost.
void quicksort(int* mas, int first, int last, int d = 0) {
    int mid, count, v;  // NOTE(review): `v` is declared but never used
    int f = first, l = last;
    mid = mas[(f + l) / 2];  // pivot: middle element (Hoare-style partition)
    do {
        while (mas[f] < mid)
            f++;
        while (mas[l] > mid)
            l--;

        if (f <= l) {
            // Swap the out-of-place pair and advance both cursors.
            count = mas[f];
            mas[f] = mas[l];
            mas[l] = count;
            f++;
            l--;
            d++;  // NOTE(review): bumps the "depth" on every swap — see above
        }
    } while (f < l);

    // NOTE(review): when the loop exits with f == l, the ranges [first, l] and
    // [f, last] overlap at one index; sorting overlapping ranges concurrently
    // in the two sections below is a data race.
    #pragma omp parallel num_threads(2) if (d == 1)
    #pragma omp sections
    {
        #pragma omp section
        {
            // NOTE(review): `d` is not passed down, so each recursive call
            // restarts the counter at 0.
            if (first < l)
                quicksort(mas, first, l);
            //cout << "I am proc number: " << omp_get_thread_num() << " " << endl;
        }
        #pragma omp section
        {
            if (f < last)
                quicksort(mas, f, last);
            //cout << "I am proc number: " << omp_get_thread_num() << " "<< endl;
        }
    }

    //cout << "d = " << d;
}


// NOTE(review): `void main()` is non-standard C++; main must return int.
void main() {
    // NOTE(review): the array is allocated but never filled — the
    // initialization loop below is commented out — so the sort reads
    // indeterminate values (undefined behavior).
    int* A = new int[n];

    /*cout<<"Initial massive: ";
    for (int i=0; i<n; i++) {
        A[i]=rand()%100;
        cout<<A[i]<<" ";
    }*/

    first = 0;
    last = n - 1;
    // NOTE(review): clock() returns CPU time summed over all threads of the
    // process, which is unsuitable for timing a multithreaded run; use a
    // wall clock (e.g. omp_get_wtime or std::chrono::steady_clock) instead.
    double start_time = clock();
    quicksort(A, first, last);

    /*cout<<endl<<"Result massive: ";
    for (int i=0; i<n; i++) cout<<A[i]<<" ";*/

    double end_time = clock();
    cout << endl;
    double time = end_time - start_time;
    printf("%f", (double)time / CLOCKS_PER_SEC);
    delete[] A;
    cin.get();
}
 0
Author: AivanF., 2018-04-10

2 answers

Code errors in the question

There are many problems with the given code, both methodological and algorithmic, and, of course, in working with openmp. Here's what caught my eye:

  • The array is too small to evaluate or benefit from parallelization
  • clock() returns the CPU time of the entire process, which makes it unsuitable for measuring the execution time of a multithreaded application.
  • The ranges of the recursive calls may overlap. In single-threaded mode this can be forgiven, but in multithreaded mode it is fatal.
  • Because the "recursion depth" counter (d) is in fact incremented per swap rather than per level, the condition on it almost never holds and the code always runs in a single thread (while still paying the cost of the OpenMP calls).
  • Even if it worked correctly, it would create no more than two threads, unless you call omp_set_nested(1) or pass the OMP_NESTED=true variable to the environment.

How to do

Although parallelization with recursion can be limited by means of sections, it will be more effective to do this with the help of tasks (task):

#include <algorithm>
#include <chrono>
#include <utility>

#include <cstdio>
#include <cstdlib>

#include <omp.h>

using namespace std;

constexpr size_t n = 1024*1024*128;
//const int n = 16;

static size_t minParallelSize = 128*1024;
static unsigned maxParallelDepth = 4;


// Sequential Hoare quicksort of mas[first..last] (both bounds inclusive).
// Partitions around the middle element, then recurses into each subrange
// that still contains more than one element.
void serialQuicksort(int* mas, size_t first, size_t last) {
    const int pivot = mas[(first + last) / 2];
    size_t lo = first;
    size_t hi = last;

    // Hoare partition: walk the cursors inward, exchanging out-of-place
    // pairs until the cursors meet or cross.
    for (;;) {
        while (mas[lo] < pivot) { ++lo; }
        while (mas[hi] > pivot) { --hi; }

        if (lo >= hi)
            break;

        std::swap(mas[lo], mas[hi]);
        ++lo;
        --hi;
    }

    // hi is the split point between the two recursive subranges.
    const size_t part = hi;
    if (first < part)
        serialQuicksort(mas, first, part);
    if (part + 1 < last)
        serialQuicksort(mas, part + 1, last);
}

void quicksort(int* mas, size_t first, size_t last, unsigned d = 0) {
    int mid;
    size_t f = first, l = last;
    mid = mas[(f + l) / 2];
    while (1) {
        while (mas[f] < mid) { ++f; };
        while (mas[l] > mid) { --l; };

        if (f >= l) { break; }

        std::swap (mas[f++], mas[l--]);
    }

    size_t part = l;

    #pragma omp task if (d <maxParallelDepth && (last - first) > minParallelSize)
    {
        if (first < part)
            quicksort(mas, first, part, d+1);
    }
    #pragma omp task if (d <maxParallelDepth && (last - first) > minParallelSize)
    {
        if (part+1 < last)
            quicksort(mas, part+1, last, d+1);
    }
    #pragma omp taskwait
}


// Benchmark driver: sorts identical random data with the parallel and the
// serial quicksort, reports wall-clock timings to stderr, and verifies that
// the parallel result is sorted. Returns EXIT_FAILURE on a sorting error.
int main() {
    int rc = EXIT_SUCCESS;

    int* parallelA = new int[n];
    int*   serialA = new int[n];

    // Fill both arrays with the same values so the timings are comparable.
    for (size_t i=0; i<n; i++) {
        int r = rand();
        parallelA[i] = r;
        serialA[i]   = r;
    }

    // Rough heuristic for the team size; unused (hence the attribute) when
    // compiled without OpenMP.
    [[maybe_unused]] unsigned num_threads = std::min<unsigned> (
                n / minParallelSize / 2,
                1 << (maxParallelDepth-1) ); 

    // steady_clock measures wall time — unlike clock(), it is not inflated by
    // the summed CPU time of the worker threads.
    auto start_time = std::chrono::steady_clock::now();
    #pragma omp parallel num_threads(num_threads)
    {
        // One thread seeds the recursion; the tasks it spawns are picked up
        // by the rest of the team.
        #pragma omp single
        { quicksort(parallelA, 0, n-1); }
    }
    auto end_time = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::milli> time = end_time - start_time;

    fprintf(stderr, "Complete parallel qsort in %-8.3lfms\n", time.count ());

    start_time = std::chrono::steady_clock::now();
    serialQuicksort(serialA, 0, n-1);
    end_time = std::chrono::steady_clock::now();
    time = end_time - start_time;
    fprintf(stderr, "Complete   serial qsort in %-8.3lfms\n", time.count ());

    // Sanity check: the parallel result must be non-decreasing.
    for (size_t i=0; i<n-1; i++) {
        if (parallelA[i]>parallelA[i+1]) {
            rc = EXIT_FAILURE;
            // %zu (not %zd): size_t is unsigned.
            fprintf (stderr, "Array is not sorted A[%zu]=%d (>) A[%zu]=%d\n", 
                     i, parallelA[i], i+1, parallelA[i+1] );
            break;
        }
    }

    delete[] parallelA;
    delete[] serialA;

    return rc;
}

Typical result:

Complete parallel qsort in 6492.260ms
Complete   serial qsort in 13031.032ms

That is, the performance gain is ±2 times on 4 real cores.

Further optimization:

The example given is poorly optimized and requires more fine-tuning. First, you need to select such magic constants as minParallelSize and maxParallelDepth, which limit, respectively, the minimum array size and the maximum recursion depth for which tasks will be created for parallel execution. They are determined purely experimentally.

Next, you need to find the formula for num_threads in main(). The fact is that, apparently (just my guess), switching threads in the kernel is somewhat faster than switching tasks in the OpenMP implementation (at least in the linux/gomp combination). As a result, simply over-provisioning the number of threads gives an improvement of up to 25%. The current formula is purely speculative. Among other things, it should probably depend on the number of available processors.

In addition, calls to non-parallel versions under the above conditions, instead of simple recursion, will give a significant performance boost; as well as the use of other sorting methods for small arrays (pyramid? shell? merge?).

 3
Author: Fat-Zer, 2018-04-16 11:52:19

I think you're just measuring the results wrong. It is not enough to run the code one way, another way, and say what is better. You need to loop both options at least 1000 times, then compare the average results. Here is my slightly modified version (the creation of threads occurs regardless of the depth of recursion):

#include <iostream>
#include <ctime>
#include <omp.h>

using namespace std;

const int total_count = 50000;
const int n = 1000;
int first, last;

// The question's quicksort with the depth condition removed, so a 2-thread
// OpenMP sections region is opened at every level of recursion.
// NOTE(review): unless nested parallelism is enabled, only the outermost
// region actually forks; every deeper region still pays the region setup
// cost, which is consistent with this version showing no speed-up.
void quicksort(int* mas, int first, int last, int d=0) {
    int mid, count, v;  // NOTE(review): `v` is declared but never used
    int f = first, l = last;
    mid = mas[(f + l) / 2];  // pivot: middle element (Hoare-style partition)
    do {
        while (mas[f] < mid)
            f++;
        while (mas[l] > mid)
            l--;

        if (f <= l) {
            // Swap the out-of-place pair and advance both cursors.
            count = mas[f];
            mas[f] = mas[l];
            mas[l] = count;
            f++;
            l--;
            d++;
        }
    } while (f < l);

    // NOTE(review): when the loop exits with f == l, the two recursive ranges
    // [first, l] and [f, last] overlap at one index — a data race when the
    // sections actually run concurrently.
    #pragma omp parallel num_threads(2)
    #pragma omp sections
    {
        #pragma omp section
        {
            if (first < l)
                quicksort(mas, first, l, d);
        }
        #pragma omp section
        {
            if (f < last)
                quicksort(mas, f, last, d);
        }
    }
}

// Benchmark driver: repeats the sort on fresh random data and prints the
// average per-run time.
int main() {
    int* A = new int[n];
    double total_time = 0;

    for (int once = 0; once < total_count; once++) {
        // Refill with random values each iteration (the previous run left
        // the array sorted).
        for (int i=0; i<n; i++) {
            A[i] = rand() % 10000;
        }
        first = 0;
        last = n - 1;
        // NOTE(review): clock() accumulates CPU time across all threads of
        // the process, so averaging it still does not measure wall-clock
        // speed-up — a parallel run can look no faster even when it is.
        double start_time = clock();
        quicksort(A, first, last);
        double end_time = clock();
        total_time += end_time - start_time;
    }

    total_time /= total_count;
    printf("Avg.time: %f", (double)total_time / CLOCKS_PER_SEC);
    delete[] A;
    cin.get();
    return 0;
}

As a result, both options come out to exactly 0.000044 for me. This, of course, is not a win, but it is not a loss either.

 0
Author: AivanF., 2018-04-10 09:52:57