GOST 28147-89 (Simple replacement mode). (C++) - CUDA

The program implements the encryption algorithm GOST 28147-89 in the simple replacement mode. Initially, I wanted to do encryption and decryption on the CPU and on the GPU. There were no problems on the CPU, it works both ways without complaints. I can't guarantee the beauty of the code and optimization, at my level, the most important thing is that the program just works. There were difficulties on the GPU. The code compiles, passes all the stages. But at the output, when encrypted, there are results that are different from the results on the CPU and it is also not possible to decipher this result. I.e. it does not work correctly.

Mb can someone tell me why this might be?

To file "in.txt" for the initial check, I set a series of numbers "12345", so that their size does not exceed the dimension of one block.

The initial algorithm was taken from one article on habra, which is displayed in the search engine among the very first.

Also, the difficulty is that there is no way to install NVIDIA Parallel Nsight and see what is there inside is happening. For I / O files, the Windows encoding is 1251.

#define _CRT_SECURE_NO_WARNINGS
#include <cstdlib>
#include <stdio.h>
#include <conio.h>
#include <iostream>
#include <fstream>
#include <iomanip>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <omp.h>

#define TH 1

unsigned char p[8][16] =   // таблица замен в памяти хоста
{
    { 0xc, 0x4, 0x6, 0x2, 0xa, 0x5, 0xb, 0x9, 0xe, 0x8, 0xd, 0x7, 0x0, 0x3, 0xf, 0x1 },
    { 0x6, 0x8, 0x2, 0x3, 0x9, 0xa, 0x5, 0xc, 0x1, 0xe, 0x4, 0x7, 0xb, 0xd, 0x0, 0xf },
    { 0xb, 0x3, 0x5, 0x8, 0x2, 0xf, 0xa, 0xd, 0xe, 0x1, 0x7, 0x4, 0xc, 0x9, 0x6, 0x0 },
    { 0xc, 0x8, 0x2, 0x1, 0xd, 0x4, 0xf, 0x6, 0x7, 0x0, 0xa, 0x5, 0x3, 0xe, 0x9, 0xb },
    { 0x7, 0xf, 0x5, 0xa, 0x8, 0x1, 0x6, 0xd, 0x0, 0x9, 0x3, 0xe, 0xb, 0x4, 0x2, 0xc },
    { 0x5, 0xd, 0xf, 0x6, 0x9, 0x2, 0xc, 0xa, 0xb, 0x7, 0x8, 0x1, 0x4, 0x3, 0xe, 0x0 },
    { 0x8, 0xe, 0x2, 0x5, 0x6, 0x9, 0x1, 0xc, 0xf, 0x4, 0xb, 0x0, 0xd, 0xa, 0x3, 0x7 },
    { 0x1, 0x7, 0xe, 0xd, 0x0, 0x5, 0x8, 0x3, 0x4, 0xf, 0xa, 0x6, 0x9, 0xc, 0xb, 0x2 }
};

// ключик в памяти хоста
unsigned __int32 key[8] =
{
    0x0123,
    0x4567,
    0x89AB,
    0xCDEF,
    0x0123,
    0x4567,
    0x89AB,
    0xCDEF
};

#pragma once

using std::cout;
using std::cin;
using std::endl;
//---------------------------------------------------------------------------
// взято из хелпа, определяем размер файла
long filesize(FILE *stream)
{
    long curpos, length;
    curpos = ftell(stream);
    fseek(stream, 0L, SEEK_END);
    length = ftell(stream);
    fseek(stream, curpos, SEEK_SET);
    return length;
}

__device__ unsigned char pD[8][16] =  // таблица замен в памяти устройства
{
{ 0xc, 0x4, 0x6, 0x2, 0xa, 0x5, 0xb, 0x9, 0xe, 0x8, 0xd, 0x7, 0x0, 0x3, 0xf, 0x1 },
{ 0x6, 0x8, 0x2, 0x3, 0x9, 0xa, 0x5, 0xc, 0x1, 0xe, 0x4, 0x7, 0xb, 0xd, 0x0, 0xf },
{ 0xb, 0x3, 0x5, 0x8, 0x2, 0xf, 0xa, 0xd, 0xe, 0x1, 0x7, 0x4, 0xc, 0x9, 0x6, 0x0 },
{ 0xc, 0x8, 0x2, 0x1, 0xd, 0x4, 0xf, 0x6, 0x7, 0x0, 0xa, 0x5, 0x3, 0xe, 0x9, 0xb },
{ 0x7, 0xf, 0x5, 0xa, 0x8, 0x1, 0x6, 0xd, 0x0, 0x9, 0x3, 0xe, 0xb, 0x4, 0x2, 0xc },
{ 0x5, 0xd, 0xf, 0x6, 0x9, 0x2, 0xc, 0xa, 0xb, 0x7, 0x8, 0x1, 0x4, 0x3, 0xe, 0x0 },
{ 0x8, 0xe, 0x2, 0x5, 0x6, 0x9, 0x1, 0xc, 0xf, 0x4, 0xb, 0x0, 0xd, 0xa, 0x3, 0x7 },
{ 0x1, 0x7, 0xe, 0xd, 0x0, 0x5, 0x8, 0x3, 0x4, 0xf, 0xa, 0x6, 0x9, 0xc, 0xb, 0x2 }
};

__device__ unsigned __int32 keyD[8] =   // ключ в памяти устройства
{
0x0123,
0x4567,
0x89AB,
0xCDEF,
0x0123,
0x4567,
0x89AB,
0xCDEF
};

__global__ void rpz_DEVICE(unsigned __int32* n1_d, unsigned __int32* n2_d, int* rezhim_d, int* block_D)
{

    int i = blockIdx.x * blockDim.x + threadIdx.x; 
    unsigned __int32 SUM232 = 0;
    if (i < *block_D)
    {
        // 32 цикла простой замены
        // ключ считываем в требуемом ГОСТом порядке
        int c = 0;
        for (int k = 0; k<32; k++)
        {
            if (*rezhim_d == 1) { if (k == 24) c = 7; }
            else { if (k == 8) c = 7; }

            // суммируем в сумматоре СМ1
            SUM232 = keyD[c] + n1_d[i];

            // заменяем по таблице замен
            unsigned char first_byte = 0, second_byte = 0, zam_symbol = 0;
            int n = 7;
            for (int q = 3; q >= 0; q--)
            {
                zam_symbol = *((unsigned char *)&SUM232 + q);
                first_byte = (zam_symbol & 0xF0) >> 4;
                second_byte = (zam_symbol & 0x0F);
                first_byte = pD[n][first_byte];
                n--;
                second_byte = pD[n][second_byte];
                n--;
                zam_symbol = (first_byte << 4) | second_byte;
                *((unsigned char *)&SUM232 + q) = zam_symbol;
            }

            SUM232 = (SUM232 << 11) | (SUM232 >> 21); // циклический сдвиг на 11
            SUM232 = n2_d[i] ^ SUM232; // складываем в сумматоре СМ2

            if (k<31)
            {
                n2_d[i] = n1_d[i];
                n1_d[i] = SUM232;
            }
            if (*rezhim_d == 1)
            {
                if (k<24)
                {
                    c++;
                    if (c>7) c = 0;
                }
                else
                {
                    c--;
                    if (c<0) c = 7;
                }
            }
            else
            {
                if (k<8)
                {
                    c++;
                    if (c>7) c = 0;
                }
                else
                {
                    c--;
                    if (c<0) c = 7;
                }
            }
        }
        n2_d[i] = SUM232;
    }
}

// функция, реализующая работу ГОСТ 28147-89 в режиме простой замены
void rpz_HOST(int block_H, unsigned __int32 *n1, unsigned __int32 *n2, int rezh)
{

    unsigned __int32 SUM232 = 0;
    for (int i = 0; i<block_H; i++)
    {
        // 32 цикла простой замены
        // ключ считываем в требуемом ГОСТом порядке
        int c = 0;
        for (int k = 0; k<32; k++)
        {
            if (rezh == 1) { if (k == 24) c = 7; }
            else { if (k == 8) c = 7; }

            // суммируем в сумматоре СМ1
            SUM232 = key[c] + n1[i];

            // заменяем по таблице замен
            unsigned char first_byte = 0, second_byte = 0, zam_symbol = 0;
            int n = 7;
            for (int q = 3; q >= 0; q--)
            {
                zam_symbol = *((unsigned char *)&SUM232 + q);
                first_byte = (zam_symbol & 0xF0) >> 4;
                second_byte = (zam_symbol & 0x0F);
                first_byte = p[n][first_byte];
                n--;
                second_byte = p[n][second_byte];
                n--;
                zam_symbol = (first_byte << 4) | second_byte;
                *((unsigned char *)&SUM232 + q) = zam_symbol;
            }

            SUM232 = (SUM232 << 11) | (SUM232 >> 21); // циклический сдвиг на 11
            SUM232 = n2[i] ^ SUM232; // складываем в сумматоре СМ2

            if (k<31)
            {
                n2[i] = n1[i];
                n1[i] = SUM232;
            }
            if (rezh == 1)
            {
                if (k<24)
                {
                    c++;
                    if (c>7) c = 0;
                }
                else
                {
                    c--;
                    if (c<0) c = 7;
                }
            }
            else
            {
                if (k<8)
                {
                    c++;
                    if (c>7) c = 0;
                }
                else
                {
                    c--;
                    if (c<0) c = 7;
                }
            }
        }
        n2[i] = SUM232;
    }
}

//---------------------------------------------------------------------------
int main()
{
    // выбираем шифрование или расшифрование
    int HorD = 0;
    int rezhim = 0;
    setlocale(LC_ALL, "Russian");
    //Переменная которая будет хранить время выполнения программы на устройстве
    double start_execution_time = 0.0;
    double end_execution_time = 0.0;
    do
    {
        printf("Выберите режим работы:\nНа хосте      - 1\nНа устройстве - 2\n");
        scanf("%d", &HorD);
    } while ((HorD != 1) && (HorD != 2)); // повторяем до тех пор, пока не будет введено 1 или 

    do
    {
        printf("Выберите режим работы:\nШифрование    - 1\nРасшифрование - 2\n");
        scanf("%d", &rezhim);
    } while ((rezhim != 1) && (rezhim != 2)); // повторяем до тех пор, пока не будет введено 1 или 
    FILE *f_begin, *f_end; // потоки для исходного и конечного файлов
    char open_str[8] = "in.txt";
    char save_str[9] = "out.txt";
    char N_H[4]; // 32-разрядный накопитель,
    unsigned __int32 *n1, *n2; // накопители N1, N2, и сумматор
    // открываем файлы
    f_begin = fopen(open_str, "rb"); // Открывает двоичный файл для чтения
    f_end = fopen(save_str, "ab"); //дописывает информацию в двоичныйы файл для записи
    // определим количество блоков
    float blokoff;
    blokoff = 8 * filesize(f_begin);
    blokoff = blokoff / 64;
    int block_H = blokoff;
    if (blokoff - block_H>0) block_H++;
    int sh;
    if (filesize(f_begin) >= 4) sh = 4; else sh = filesize(f_begin);
    int sh1 = 0;
    int flag = 0;
    n1 = new unsigned __int32[block_H];
    n2 = new unsigned __int32[block_H];
    // начнем считывание и преобразование блоков
    // присутствуют проверки на полноту блоков, чтобы считать только нужное количество бит
    for (int i = 0; i<block_H; i++)
    {
        // записываем в накопитель N1
        for (int q = 0; q<4; q++) *((unsigned char *)&N_H + q) = 0x00;
        if ((sh1 + sh)<filesize(f_begin))
        {
            fread(N_H, sh, 1, f_begin);
            sh1 += sh;
        }
        else
        {
            sh = filesize(f_begin) - sh1;
            fread(N_H, sh, 1, f_begin);
            flag = 1;
        }
        n1[i] = *((unsigned __int32 *)&N_H);
        cout << endl;
        cout << "i = " << i << endl;
        cout << "n1[i] = " << n1[i] << endl;

        // записываем в накопитель N2
        for (int q = 0; q<4; q++) *((unsigned char *)&N_H + q) = 0x00;
        if ((sh1 + sh)<filesize(f_begin))
        {
            fread(N_H, sh, 1, f_begin);
            sh1 += sh;
        }
        else
        {
            if (flag == 0)
            {
                sh = filesize(f_begin) - sh1;
                fread(N_H, sh, 1, f_begin);
            }
        }
        n2[i] = *((unsigned __int32 *)&N_H);
        cout << "n2[i] = " << n2[i] << endl;
    }
    start_execution_time = omp_get_wtime(); // Начальная отсечка времени
    if (HorD == 1)
    {
        rpz_HOST(block_H, n1, n2, rezhim); // запускаем РПЗ
    }
    else
    {
        int blocks, thread;
        thread = TH;
        blocks = (block_H*block_H) / TH + 1;

        int* block_D;
        int *rezhim_d;
        unsigned __int32 *n1_d, *n2_d;  
        unsigned __int32 *n1_h, *n2_h;  
        int NumBytes = block_H*sizeof(unsigned __int32);

        cudaMallocHost((void**) &n1_h,      NumBytes    );
        cudaMallocHost((void**) &n2_h,      NumBytes    );

        cudaMalloc((void**) &n1_d,      NumBytes    );
        cudaMalloc((void**) &n2_d,      NumBytes    );
        cudaMalloc((void**) &rezhim_d,  sizeof(int) );
        cudaMalloc((void**) &block_D,   sizeof(int) );
        cudaMemcpy(n1_d, &n1, NumBytes, cudaMemcpyHostToDevice);
        cudaMemcpy(n2_d, &n2, NumBytes, cudaMemcpyHostToDevice);
        cudaMemcpy(block_D,     &block_H,   sizeof(int),    cudaMemcpyHostToDevice);
        cudaMemcpy(rezhim_d,    &rezhim,    sizeof(int),    cudaMemcpyHostToDevice);
        rpz_DEVICE << <blocks, thread >> >( n1_d,   n2_d,   rezhim_d,   block_D );
        cudaMemcpy(n1_h, n1_d, NumBytes, cudaMemcpyDeviceToHost);
        cudaMemcpy(n2_h, n2_d, NumBytes, cudaMemcpyDeviceToHost);
        for (int i = 0; i<block_H; i++)
        {
            n1[i] = n1_h[i];
            n2[i] = n2_h[i];

        }
    }
    end_execution_time = omp_get_wtime(); //Конечная отсечка времени
    printf("Время выполнения на программы:  %lf \n", end_execution_time - start_execution_time);
    for (int i = 0; i<block_H; i++)
    {
        cout << endl;
        cout << "i rez  = " << i << endl;
        cout << "n1[i] rez = " << n1[i] << endl;
        cout << "n2[i] rez = " << n2[i] << endl;

        // вывод результата в файл

        char sym_rez;
        for (int q = 0; q <= 3; q++)
        {
            sym_rez = *((unsigned char *)&n1[i] + q);
            fprintf(f_end, "%c", sym_rez);
        }
        for (int q = 0; q <= 3; q++)
        {
            sym_rez = *((unsigned char *)&n2[i] + q);
            fprintf(f_end, "%c", sym_rez);
        }
    }
    fclose(f_begin);
    fclose(f_end);
    system("pause");
    return 0;
}
//---------------------------------------------------------------------------
Author: Студент, 2018-07-04