nn-testing/src/mat.cl
2024-02-22 17:42:19 +01:00

144 lines
5.5 KiB
Common Lisp

typedef struct __attribute__((packed)) {
uint rows;
uint cols;
char transposed;
} cl_GPUMat;
#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
#define matrixGetIJ(mat, idx, i, j) \
{ \
if(!mat.transposed) { \
i = idx / mat.cols; \
j = idx % mat.cols; \
} else { \
i = idx % mat.rows; \
j = idx / mat.rows; \
} \
}
void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
uint idx = get_global_id(0);
if(idx >= matOut.rows * matOut.cols) return;
// TODO: might not work with transposed matOut
uint i = idx / matOut.cols;
uint j = idx % matOut.cols;
float sum = 0;
for(unsigned int k = 0; k < matA.cols; k++) {
sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
}
matOut_values[idx] = sum;
}
void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
uint idx = get_global_id(0);
if(idx >= matA.rows * matA.cols) return;
matA_values[idx] += matB_values[idx];
}
void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
}
void mat_dSigmoid(cl_GPUMat mat, __global float *mat_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
float v = mat_values[idx];
mat_values[idx] = v * (1 - v);
}
void mat_copy(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
other_values[idx] = mat_values[idx];
}
void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
other_values[idx] *= mat_values[idx];
}
void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
mat_values[idx] *= scalar;
}
cl_GPUMat clm_matrixTranspose(cl_GPUMat mat) {
cl_GPUMat tr = {0};
tr.cols = mat.rows;
tr.rows = mat.cols;
tr.transposed = !mat.transposed;
return tr;
}
__kernel void linear_forward(unsigned int batchSize,
cl_GPUMat input, __global float *input_values,
cl_GPUMat weights, __global float *weights_values,
cl_GPUMat bias, __global float *bias_values,
cl_GPUMat out, __global float *out_values) {
// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
for(unsigned int b = 0; b < batchSize; b++) {
__global float *batchInput_values = input_values + b * input.rows * input.cols;
__global float *batchOut_values = out_values + b * out.rows * out.cols;
mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values);
mat_add(out, batchOut_values, bias, bias_values);
mat_sigmoid(out, batchOut_values);
}
}
__kernel void linear_backprop_1(unsigned int batchSize, float learnRate,
cl_GPUMat weights, __global float *weights_values,
cl_GPUMat outputs, __global float *outputs_values,
cl_GPUMat inputErrors, __global float *inputErrors_values,
cl_GPUMat outputGradients, __global float *outputGradients_values) {
for(unsigned int b = 0; b < batchSize; b++) {
__global float *batchOut_values = outputs_values + b * outputs.rows * outputs.cols;
__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
mat_copy(outputs, batchOut_values, outputGradients, batchOutGradients_values);
mat_dSigmoid(outputGradients, batchOutGradients_values); // dsig(yhat)
mat_multiply_elements(outputGradients, batchOutGradients_values, inputErrors, batchInErrors_values); // (yhat - y) . dsig(yhat)
mat_multiply_scalar(outputGradients, batchOutGradients_values, learnRate);
}
}
__kernel void linear_backprop_2(unsigned int batchSize,
cl_GPUMat weights, __global float *weights_values,
cl_GPUMat inputs, __global float *inputs_values,
cl_GPUMat inputErrors, __global float *inputErrors_values,
char updateErrors,
cl_GPUMat outputErrors, __global float *outputErrors_values,
cl_GPUMat outputWeightsErrors, __global float *outputWeightsErrors_values,
cl_GPUMat outputGradients, __global float *outputGradients_values) {
for(unsigned int b = 0; b < batchSize; b++) {
__global float *batchInput_values = inputs_values + b * inputs.rows * inputs.cols;
__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
__global float *batchOutErrors_values = outputErrors_values + b * outputErrors.rows * outputErrors.cols;
__global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols;
__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
cl_GPUMat inputsT = clm_matrixTranspose(inputs);
mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
if(updateErrors) {
cl_GPUMat weightsT = clm_matrixTranspose(weightsT);
// mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
}
}
}