Интегрирование унаследованного кода

Если вы высоко оптимизировали код CUDA^® для определенных подфункций, которые вы хотите включить в свой сгенерированный код, GPU Coder™ расширяет coder.ceval функциональность, чтобы помочь вам достигнуть этой цели.

Внешняя функция CUDA должна использовать __device__ спецификатор, чтобы выполнить функцию на устройстве графического процессора. Эти функции устройства отличаются от глобальных функций (ядра), в которых они могут только быть названы от другого устройства или глобальных функций. Поэтому coder.ceval вызовы функций устройства должны быть из цикла, который сопоставлен с ядром.

Примечание

Генерация кода перестала работать если цикл, содержащий coder.ceval вызовы не могут быть сопоставлены с ядром. Смотрите тему поиска и устранения неисправностей в документации GPU Coder, чтобы проверять на проблемы, предотвращающие создание ядра и их предложенные обходные решения. Если ваша секция кода MATLAB^® содержит неподдерживаемые функции, то необходимо удалить coder.ceval вызовы от таких разделов.

`coder.ceval` для GPU Coder

coder.ceval('-gpudevicefcn', 'devicefun_name',devicefun_arguments) подмножество coder.ceval функция от MATLAB Coder™, который позволяет вам вызывать __device__ функции из ядер. '-gpudevicefcn' указывает к coder.ceval то, что целевая функция находится на устройстве графического процессора. devicefun_name имя __device__ функция и devicefun_arguments список, разделенный запятыми входных параметров в порядке что devicefun_name требует.

Для генерации кода необходимо задать тип, размер и тип данных сложности аргументов перед вызовом coder.ceval.

Эта функция является функцией генерации кода и вызывает ошибки, когда используется в противном случае.

Пример унаследованного кода

Пример несоизмеримости стерео измеряет расстояние между двумя соответствующими точками в левом и правом изображении стереопары. stereoDisparity_cuda_sample вызовы функции точки входа __usad4_wrap функция внешнего устройства при помощи coder.ceval функция.

%% modified algorithm for stereo disparity block matching
% In this implementation instead of finding shifted image ,indices are mapped 
% accordingly to save memory and some processing RGBA column major packed 
% data is used as input for compatibility with CUDA intrinsics. Convolution
% is performed using separable filters (Horizontal and then Vertical)

function [out_disp] = stereoDisparity_cuda_sample(img0,img1)
coder.cinclude('cuda_intrinsic.h');

% gpu code generation pragma
coder.gpu.kernelfun;

%% Stereo disparity Parameters
% WIN_RAD is the radius of the window to be operated,min_disparity is the 
% minimum disparity level the search continues for, max_disparity is the maximum
% disparity level the search continues for.
WIN_RAD = 8;
min_disparity = -16;
max_disparity = 0;

%% Image dimensions for loop control
% The number of channels packed are 4 (RGBA) so as nChannels are 4
[imgHeight,imgWidth]=size(img0);
nChannels = 4;
imgHeight = imgHeight/nChannels;

%% To store the raw differences
diff_img = zeros([imgHeight+2*WIN_RAD,imgWidth+2*WIN_RAD],'int32');

%To store the minimum cost
min_cost = zeros([imgHeight,imgWidth],'int32');
min_cost(:,:) = 99999999;

% Store the final disparity
out_disp = zeros([imgHeight,imgWidth],'int16');

%% Filters for aggregating the differences
% filter_h is the horizontal filter used in separable convolution
% filter_v is the vertical filter used in separable convolution which
% operates on the output of the row convolution
filt_h = ones([1 17],'int32');
filt_v = ones([17 1],'int32');

%% Main Loop that runs for all the disparity levels. This loop is currently
% expected to run on CPU.
for d=min_disparity:max_disparity
    
    % Find the difference matrix for the current disparity level. Expect
    % this to generate a Kernel function.
    coder.gpu.kernel;
    for colIdx=1:imgWidth+2*WIN_RAD
        coder.gpu.kernel;
        for rowIdx=1:imgHeight+2*WIN_RAD
            % Row index calculation
            ind_h = rowIdx - WIN_RAD;
            
            % Column indices calculation for left image
            ind_w1 = colIdx - WIN_RAD;
            
            % Row indices calculation for right image
            ind_w2 = colIdx + d - WIN_RAD;
            
            % Border clamping for row Indices
            if ind_h <= 0
                ind_h = 1;
            end
            if ind_h > imgHeight
                ind_h = imgHeight;
            end
            
            % Border clamping for column indices for left image
            if ind_w1 <= 0
                ind_w1 = 1;
            end
            if ind_w1 > imgWidth
                ind_w1 = imgWidth;
            end
            
            % Border clamping for column indices for right image
            if ind_w2 <= 0
                ind_w2 = 1;
            end
            if ind_w2 > imgWidth
                ind_w2 = imgWidth;
            end
            
            % In this step, Sum of absolute Differences is performed
            % across Four channels. This piece of code is suitable 
            % for replacement with SAD intrinsics
            tDiff = int32(0);
            tDiff = coder.ceval('-gpudevicefcn', '__usad4_wrap', 
                    coder.rref(img0((ind_h-1)*(nChannels)+1,ind_w1)), 
                    coder.rref(img1((ind_h-1)*(nChannels)+1,ind_w2)));
            
            %Store the SAD cost into a matrix
            diff_img(rowIdx,colIdx) = tDiff;
        end
    end
    
    % Aggregating the differences using separable convolution. Expect this 
    % to generate two Kernel using shared memory.The first kernel is the 
    % convolution with the horizontal kernel and second kernel operates on
    % its output the column wise convolution.
    cost_v = conv2(diff_img,filt_h,'valid');
    cost = conv2(cost_v,filt_v,'valid');
    
    % This part updates the min_cost matrix with by comparing the values
    % with current disparity level. Expect to generate a Kernel for this.
    for ll=1:imgWidth
        for kk=1:imgHeight
            % load the cost
            temp_cost = int32(cost(kk,ll));
            
            % compare against the minimum cost available and store the
            % disparity value
            if min_cost(kk,ll) > temp_cost
                min_cost(kk,ll) = temp_cost;
                out_disp(kk,ll) = abs(d) + 8;
            end
            
        end
    end
    
end
end

Определение для __usad4_wrap написан во внешнем файле cuda_intrinsic.h. Файл расположен в той же папке как функция точки входа.

__device__ unsigned int __usad4(unsigned int A, unsigned int B, unsigned int C=0)
{
    unsigned int result;
#if (__CUDA_ARCH__ >= 300) // Kepler (SM 3.x) supports a 4 vector SAD SIMD
    asm("vabsdiff4.u32.u32.u32.add" " %0, %1, %2, %3;": "=r"(result):"r"(A),
    "r"(B), "r"(C));
#else // SM 2.0            // Fermi  (SM 2.x) supports only 1 SAD SIMD, 
                           // so there are 4 instructions
    asm("vabsdiff.u32.u32.u32.add" " %0, %1.b0, %2.b0, %3;": 
         "=r"(result):"r"(A), "r"(B), "r"(C));
    asm("vabsdiff.u32.u32.u32.add" " %0, %1.b1, %2.b1, %3;": 
         "=r"(result):"r"(A), "r"(B), "r"(result));
    asm("vabsdiff.u32.u32.u32.add" " %0, %1.b2, %2.b2, %3;": 
         "=r"(result):"r"(A), "r"(B), "r"(result));
    asm("vabsdiff.u32.u32.u32.add" " %0, %1.b3, %2.b3, %3;": 
         "=r"(result):"r"(A), "r"(B), "r"(result));
#endif
    return result;
}

__device__ unsigned int packBytes(const uint8_T *inBytes)
{
    unsigned int packed = inBytes[0] | (inBytes[1] << 8) | 
                    (inBytes[2] << 16) | (inBytes[3] << 24);
    return packed;
}

__device__ unsigned int __usad4_wrap(const uint8_T *A, const uint8_T *B)
{
    unsigned int x = packBytes(A);
    unsigned int y = packBytes(B);

    return __usad4(x, y);
}

Сгенерируйте код CUDA

Сгенерируйте код CUDA путем создания объекта настройки кода. Задайте местоположение пользовательских файлов C путем установки свойств пользовательского кода (CustomInclude) на объектах настройки. Следующее является скриптом генерации примера кода, который указывает на местоположение cuda_intrinsic.h файл.

cfg = coder.gpuConfig('mex');
cfg.CustomInclude = pwd;

codegen -config cfg -args {imgRGB0, imgRGB1} stereoDisparity_cuda_sample_intrinsic;

Сгенерированный код

GPU Coder создает четыре ядра. Следующее является отрывком сгенерированного кода CUDA.

e_stereoDisparity_cuda_sample_i<<<dim3(704U, 1U, 1U), dim3(512U, 1U, 1U)>>>
                    (gpu_img1, gpu_img0, d, gpu_diff_img);*/
/*  Aggregating the differences using separable convolution.*/ 
/*  Expect this to generate two Kernel using shared memory.*/
/*  The first kernel is the convolution with the horizontal kernel and*/
/*  second kernel operates on its output the column wise convolution. */
f_stereoDisparity_cuda_sample_i<<<dim3(704U, 1U, 1U), dim3(512U, 1U, 1U)>>>
                    (gpu_diff_img, gpu_a);
g_stereoDisparity_cuda_sample_i<<<dim3(18U, 20U, 1U), dim3(32U, 32U, 1U)>>>
                    (gpu_a, gpu_cost_v);
h_stereoDisparity_cuda_sample_i<<<dim3(17U, 20U, 1U), dim3(32U, 32U, 1U)>>>
                    (gpu_a, gpu_cost_v);
/*  This part updates the min_cost matrix with by comparing the values */
/*  with current disparity level. Expect to generate a Kernel for this. */
i_stereoDisparity_cuda_sample_i<<<dim3(667U, 1U, 1U), dim3(512U, 1U, 1U)>>>
                    (d, gpu_cost, gpu_out_disp, gpu_min_cost);

e_stereoDisparity_cuda_sample_i ядро является тем, которое вызывает __usad4_wrap функция устройства. Следующее является отрывком e_stereoDisparity_cuda_sample_i код ядра.

static __global__ __launch_bounds__(512, 1) void e_stereoDisparity_cuda_sample_i
  (const uint8_T *img1, const uint8_T *img0, int32_T d, int32_T *diff_img)
{
  ...
    /*  In this step, Sum of absolute Differences is performed */
    /*  across Four channels. This piece of code is suitable */
    /*  for replacement with SAD intrinsics */
    temp_cost = __usad4_wrap(&img0[((ind_h - 1) << 2) + 2132 * (ind_w1 - 1)],
      &img1[((ind_h - 1) << 2) + 2132 * (temp_cost - 1)]);

    /* Store the SAD cost into a matrix */
    diff_img[rowIdx + 549 * colIdx] = temp_cost;
  }
}

Документация