typedef struct __attribute__((packed)) { uint rows; uint cols; char transposed; } cl_GPUMat; #define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)] #define matrixGetIJ(mat, idx, i, j) \ { \ if(!mat.transposed) { \ i = idx / mat.cols; \ j = idx % mat.cols; \ } else { \ i = idx / mat.rows; \ j = idx % mat.rows; \ } \ } void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) { uint idx = get_global_id(0); if(idx >= matOut.rows * matOut.cols) return; uint i, j; matrixGetIJ(matOut, idx, i, j); float sum = 0; for(unsigned int k = 0; k < matA.cols; k++) { sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j); } matrixAt(matOut, matOut_values, i, j) = sum; } void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) { uint idx = get_global_id(0); if(idx >= matA.rows * matA.cols) return; matA_values[idx] += matB_values[idx]; } void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) { uint idx = get_global_id(0); if(idx >= mat.rows * mat.cols) return; mat_values[idx] = 1 / (1 + exp(-mat_values[idx])); } void mat_dSigmoid(cl_GPUMat mat, __global float *mat_values) { uint idx = get_global_id(0); if(idx >= mat.rows * mat.cols) return; float v = mat_values[idx]; mat_values[idx] = v * (1 - v); } void mat_copy(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) { uint idx = get_global_id(0); if(idx >= mat.rows * mat.cols) return; other_values[idx] = mat_values[idx]; } void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) { uint idx = get_global_id(0); if(idx >= mat.rows * mat.cols) return; other_values[idx] *= mat_values[idx]; } void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) { uint idx = get_global_id(0); if(idx >= mat.rows * mat.cols) return; mat_values[idx] *= scalar; } __kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) { // FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations! for(unsigned int b = 0; b < batchSize; b++) { __global float *batchInput_values = input_values + b * input.rows * input.cols; __global float *batchOut_values = out_values + b * out.rows * out.cols; mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values); mat_add(out, batchOut_values, bias, bias_values); mat_sigmoid(out, batchOut_values); } } __kernel void linear_backprop_1(unsigned int batchSize, float learnRate, cl_GPUMat weights, __global float *weights_values, cl_GPUMat outputs, __global float *outputs_values, cl_GPUMat inputErrors, __global float *inputErrors_values, cl_GPUMat outputGradients, __global float *outputGradients_values) { for(unsigned int b = 0; b < batchSize; b++) { __global float *batchOut_values = outputs_values + b * outputs.rows * outputs.cols; __global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols; __global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols; mat_copy(outputs, batchOut_values, outputGradients, batchOutGradients_values); mat_dSigmoid(outputGradients, batchOutGradients_values); // dsig(yhat) mat_multiply_elements(outputGradients, batchOutGradients_values, inputErrors, batchInErrors_values); // (yhat - y) . dsig(yhat) mat_multiply_scalar(outputGradients, batchOutGradients_values, learnRate); } } __kernel void linear_backprop_2(unsigned int batchSize, cl_GPUMat weights, __global float *weights_values, cl_GPUMat inputs, __global float *inputs_values, cl_GPUMat inputErrors, __global float *inputErrors_values, char updateErrors, cl_GPUMat outputErrors, __global float *outputErrors_values, cl_GPUMat outputWeightsErrors, __global float *outputWeightsErrors_values, cl_GPUMat outputGradients, __global float *outputGradients_values) { for(unsigned int b = 0; b < batchSize; b++) { __global float *batchInput_values = inputs_values + b * inputs.rows * inputs.cols; __global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols; __global float *batchOutErrors_values = outputErrors_values + b * outputErrors.rows * outputErrors.cols; __global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols; __global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols; cl_GPUMat inputsT = inputs; inputsT.transposed = true; mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values); // clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]); // clm_matrixAddMatrix(linear->bias, gradient); if(updateErrors) { cl_GPUMat weightsT = weights; weightsT.transposed = true; mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values); } } }