Fix OpenCL implementation

MrLetsplay 2024-03-06 22:45:01 +01:00
parent e7ec45ba0a
commit 09fea04022
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
5 changed files with 105 additions and 52 deletions

View File

@@ -253,11 +253,20 @@ void clm_freeLinear(clm_Linear linear) {
 	clm_freeMatrix(linear.bias.matrix);
 }
 
+void clm_vectorPrint(clm_Vector vec) {
+	printf("[");
+	for(unsigned int i = 0; i < vec.length; i++) {
+		printf("%f", vec.values[i]);
+		if(i != vec.length - 1) printf(" ");
+	}
+	printf("]\n");
+}
+
 void clm_matrixPrint(clm_Matrix mat) {
 	printf("[\n");
 	for(unsigned int i = 0; i < mat.rows; i++) {
 		for(unsigned int j = 0; j < mat.cols; j++) {
-			printf("%7.3f", matrixAt(mat, i, j));
+			printf("%f ", matrixAt(mat, i, j));
 		}
 		printf("\n");
 	}
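Note: the new clm_vectorPrint mirrors clm_matrixPrint for vectors, printing space-separated values in brackets. A minimal usage sketch (the header name is an assumption, not part of the commit):

    #include "clm.h" // assumed header; declares clm_Vector and clm_vectorPrint

    int main(void) {
        float vals[3] = {1.0f, 2.5f, -0.5f};
        clm_Vector v = {.values = vals, .length = 3};
        clm_vectorPrint(v); // prints: [1.000000 2.500000 -0.500000]
        return 0;
    }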

View File

@@ -81,6 +81,7 @@ bool clm_vectorIsInvalid(clm_Vector vec);
 clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs);
 clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize);
 
+void clm_vectorPrint(clm_Vector vec);
 void clm_matrixPrint(clm_Matrix mat);
 void clm_matrixPrintShape(clm_Matrix mat);
 void clm_freeMatrix(clm_Matrix mat);

View File

@@ -49,7 +49,7 @@ int clm_gpuInit(unsigned int mode) {
 		return 1;
 	}
 
-	printf("%s", buffer);
+	// printf("%s", buffer);
 
 	context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
 	if(!context) {
@@ -132,13 +132,16 @@ static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *m
 	cl_mem mem = allocGPUMat(gpuMat, numMats, flags, nativeBuf);
 
 	for(unsigned int i = 0; i < numMats; i++) {
-		err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
+		err = clEnqueueWriteBuffer(queue, mem, CL_FALSE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
 		if(err != CL_SUCCESS) {
 			printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
 			return NULL;
 		}
 	}
 
+	clFlush(queue);
+	clFinish(queue);
+
 	return mem;
 }
@@ -153,20 +156,22 @@ static cl_mem writeNativeMatrix(clm_NativeMatrix matrix) {
 	return mem;
 }
 
-static cl_mem writeNativeMatrixArray(clm_NativeMatrixArray array) {
+static cl_mem writeNativeMatrixArray(clm_NativeMatrixArray array, unsigned int n) {
 	clm_Matrix mat = array.matrixes[0];
 	cl_mem mem = array.native->mem;
 
-	// TODO: don't do blocking writes, instead wait once at the end
 	size_t matLength = sizeof(float) * mat.rows * mat.cols;
 
 	for(unsigned int i = 0; i < array.length; i++) {
-		cl_int err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
+		cl_int err = clEnqueueWriteBuffer(queue, mem, CL_FALSE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
 		if(err != CL_SUCCESS) {
 			printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
 			return NULL;
 		}
 	}
 
+	clFlush(queue);
+	clFinish(queue);
+
 	return mem;
 }
@@ -175,12 +180,15 @@ static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, c
 	cl_mem mem = nativeBuf->mem;
 
 	for(unsigned int i = 0; i < numMats; i++) {
-		err = clEnqueueReadBuffer(queue, mem, CL_TRUE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL);
+		err = clEnqueueReadBuffer(queue, mem, CL_FALSE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL);
 		if(err != CL_SUCCESS) {
 			printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
 			return;
 		}
 	}
 
+	clFlush(queue);
+	clFinish(queue);
 }
 
 static void readNativeMatrix(clm_NativeMatrix matrix) {
@@ -191,18 +199,21 @@ static void readNativeMatrix(clm_NativeMatrix matrix) {
 	}
 }
 
-static void readNativeMatrixArray(clm_NativeMatrixArray array) {
+static void readNativeMatrixArray(clm_NativeMatrixArray array, unsigned int n) {
 	clm_Matrix mat = array.matrixes[0];
 	size_t matLength = sizeof(float) * mat.rows * mat.cols;
 
 	// TODO: don't do blocking reads, instead wait once at the end
-	for(unsigned int i = 0; i < array.length; i++) {
-		cl_int err = clEnqueueReadBuffer(queue, array.native->mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
+	for(unsigned int i = 0; i < n; i++) {
+		cl_int err = clEnqueueReadBuffer(queue, array.native->mem, CL_FALSE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
 		if(err != CL_SUCCESS) {
 			printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
 			return;
 		}
 	}
 
+	clFlush(queue);
+	clFinish(queue);
 }
 
 static void clm_nativeAllocMatrix(clm_NativeMatrix *matrix, cl_mem_flags flags) {
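Note: all four transfer helpers above now share one shape: enqueue each per-matrix read or write with the blocking flag set to CL_FALSE, then synchronize once with clFlush + clFinish after the loop instead of blocking on every call. A standalone sketch of that pattern (function name and parameters are illustrative, not from this file):

    #include <CL/cl.h>

    // Queue n non-blocking writes, then wait once for all of them.
    // With CL_FALSE, each host buffer must stay valid until clFinish returns.
    static cl_int writeBatched(cl_command_queue queue, cl_mem mem,
                               float **host, size_t floatsPerMat, unsigned int n) {
        for(unsigned int i = 0; i < n; i++) {
            cl_int err = clEnqueueWriteBuffer(queue, mem, CL_FALSE,
                                              i * floatsPerMat * sizeof(float),
                                              floatsPerMat * sizeof(float),
                                              host[i], 0, NULL, NULL);
            if(err != CL_SUCCESS) return err; // report instead of blocking
        }
        clFlush(queue);         // submit all queued commands to the device
        return clFinish(queue); // block once until every transfer completed
    }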
@@ -281,13 +292,7 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	clFlush(queue);
 	clFinish(queue);
 
-	/*err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
-	if(err != CL_SUCCESS) {
-		printf("Failed to read from buffer\n");
-		return;
-	}*/
-
-	readNativeMatrixArray(linear->output);
+	readNativeMatrixArray(linear->output, batchSize);
 }
 
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors) {
@@ -305,8 +310,8 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
 	cl_mem matWeights_values = writeNativeMatrix(linear->weights);
-	cl_mem matInputErrors_values = writeNativeMatrixArray(linear->error);
-	cl_mem matOutput_values = writeNativeMatrixArray(linear->output);
+	cl_mem matInputErrors_values = writeNativeMatrixArray(linear->error, batchSize);
+	cl_mem matOutput_values = writeNativeMatrixArray(linear->output, batchSize);
 	if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) {
 		printf("Failed to write GPU mats\n");
 		return;
@@ -333,13 +338,17 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	err |= clSetKernelArg(kernelLinearBackprop1, 8, sizeof(matOutputGradients), &matOutputGradients);
 	err |= clSetKernelArg(kernelLinearBackprop1, 9, sizeof(matOutputGradients_values), &matOutputGradients_values);
 	if(err != CL_SUCCESS) {
-		printf("Failed to set kernel args (2): %d\n", err);
+		printf("Failed to set kernel args (2): %s\n", clm_clErrorToString(err));
 		return;
 	}
 
 	size_t step1WorkSize = matOutputGradients.rows * matOutputGradients.cols;
 	size_t step1Global = ceil((float) step1WorkSize / kernelLinearBackprop1Local) * kernelLinearBackprop1Local;
-	clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL);
+	err = clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to enqueue: %s\n", clm_clErrorToString(err));
+		return;
+	}
 
 	clFlush(queue);
 	clFinish(queue);
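Note: both kernel launches in this function now check the clEnqueueNDRangeKernel return value and report it through clm_clErrorToString (the repo's existing helper). If more call sites grow this shape, a small macro could centralize it; a sketch (the CL_CHECK name is hypothetical, not in the commit):

    #define CL_CHECK(call)                                                   \
        do {                                                                 \
            cl_int _err = (call);                                            \
            if(_err != CL_SUCCESS) {                                         \
                printf("%s failed: %s\n", #call, clm_clErrorToString(_err)); \
                return;                                                      \
            }                                                                \
        } while(0)

    // Usage inside a void function:
    // CL_CHECK(clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL));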
@@ -354,9 +363,11 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	err |= clSetKernelArg(kernelLinearBackprop2, 6, sizeof(matInputErrors_values), &matInputErrors_values);
 	err |= clSetKernelArg(kernelLinearBackprop2, 7, sizeof(cl_char), &updateErrors);
 
-	err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors);
 	if(updateErrors) {
+		err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors);
 		err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), &matOutputErrors_values);
+	} else {
+		err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), NULL);
 	}
 	err |= clSetKernelArg(kernelLinearBackprop2, 10, sizeof(matOutputWeightsErrors), &matOutputWeightsErrors);
@@ -364,23 +375,23 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	err |= clSetKernelArg(kernelLinearBackprop2, 12, sizeof(matOutputGradients), &matOutputGradients);
 	err |= clSetKernelArg(kernelLinearBackprop2, 13, sizeof(matOutputGradients_values), &matOutputGradients_values);
 	if(err != CL_SUCCESS) {
-		printf("Failed to set kernel args (3): %d\n", err);
+		printf("Failed to set kernel args (3): %s\n", clm_clErrorToString(err));
 		return;
 	}
 
 	size_t step2WorkSize = matOutputWeightsErrors.rows * matOutputWeightsErrors.cols;
-	size_t step2Global = ceil((float) step1WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local;
-	clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL);
+	size_t step2Global = ceil((float) step2WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local;
+	err = clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to enqueue: %s\n", clm_clErrorToString(err));
+		return;
+	}
 
 	clFlush(queue);
 	clFinish(queue);
 
-	clm_matrixPrint(linear->weightsError.matrixes[0]);
-
-	readNativeMatrixArray(linear->weightsError);
-	readNativeMatrixArray(linear->gradient);
-
-	clm_matrixPrint(linear->weightsError.matrixes[0]);
+	readNativeMatrixArray(linear->weightsError, batchSize);
+	readNativeMatrixArray(linear->gradient, batchSize);
 
 	if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputErrors);
 }
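Note: one OpenCL detail the else branch above relies on: for an argument declared as a __global pointer, clSetKernelArg accepts NULL as arg_value, which binds a null pointer inside the kernel. Argument 9 is therefore always set even when no output-error buffer exists, and the kernel must guard against dereferencing it (here via the updateErrors flag passed as argument 7). Kernel-side view, as a sketch with hypothetical names:

    // A __global pointer bound with clSetKernelArg(..., NULL)
    // arrives in the kernel as a null pointer.
    __kernel void example(char updateErrors, __global float *outputErrors_values) {
        if(updateErrors) {
            // only touch the buffer when the host actually bound one
            outputErrors_values[get_global_id(0)] = 0.0f;
        }
    }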

View File

@@ -37,9 +37,12 @@ float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels
 	// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
 	// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
 	float *pred = predict(nn, images[idx]);
+
+	// if(idx < 100) clm_vectorPrint((clm_Vector){.values = pred, .length = labels[0].length});
+
 	unsigned int predDigit = 0;
 	float max = -1;
-	for(unsigned int j = 0; j < 10; j++) {
+	for(unsigned int j = 0; j < labels[0].length; j++) {
 		// printf("%.2f ", pred[j]);
 		if(pred[j] > max || max < 0) {
 			max = pred[j];
@@ -50,15 +53,17 @@ float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels
 	unsigned int actDigit = 0;
 	float maxA = -1;
-	for(unsigned int j = 0; j < 10; j++) {
+	for(unsigned int j = 0; j < labels[0].length; j++) {
 		// printf("%.2f ", pred[j]);
 		if(labels[idx].values[j] > maxA || maxA < 0) {
 			maxA = labels[idx].values[j];
 			actDigit = j;
 		}
 	}
 
-	// if(idx < 100) printf("Actual: %u\n", actDigit);
-	// printf("\n");
+	// if(idx < 100) {
+	// 	printf("Actual: %u\n", actDigit);
+	// 	printf("\n");
+	// }
 
 	if(predDigit == actDigit) correct++;
 }
@@ -233,7 +238,7 @@ int main(int argc, const char *argv[]) {
 	srand(1);
 
-	unsigned int
+	/*unsigned int
 		i = 784,
 		h = 30,
 		o = 10;
@@ -241,13 +246,40 @@ int main(int argc, const char *argv[]) {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);*/
+
+	float v_00[2] = {0, 0};
+	float v_01[2] = {0, 1};
+	float v_10[2] = {1, 0};
+	float v_11[2] = {1, 1};
+
+	images = calloc(4, sizeof(clm_Vector));
+	images[0] = (clm_Vector){.values = v_00, .length = 2};
+	images[1] = (clm_Vector){.values = v_01, .length = 2};
+	images[2] = (clm_Vector){.values = v_10, .length = 2};
+	images[3] = (clm_Vector){.values = v_11, .length = 2};
+
+	labels = calloc(4, sizeof(clm_Vector));
+	labels[0] = (clm_Vector){.values = v_10, .length = 2};
+	labels[1] = (clm_Vector){.values = v_01, .length = 2};
+	labels[2] = (clm_Vector){.values = v_01, .length = 2};
+	labels[3] = (clm_Vector){.values = v_01, .length = 2};
+
+	imageCount = 4;
+
+	unsigned int
+		i = 2,
+		o = 2;
+
+	clm_Linear layers[] = {
+		clm_linearCreateRandom(i, o)};
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.5, 4);
 
 	for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
 		clm_linearInit(&nn.layers[i]);
 	}
 
-	for(unsigned int epoch = 0; epoch < 10; epoch++) {
+	for(unsigned int epoch = 0; epoch < 1000; epoch++) {
 		printf("Epoch %u\n", epoch);
 		/*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 			if(idx % 1000 == 0) {
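Note: for orientation, the hard-coded four-sample set above reads as a tiny one-hot classification task; under the argmax used in eval(), the target class is 1 exactly when at least one input is 1 (logical OR). That reading is inferred from the vectors, not stated by the commit:

    /* input (a, b) -> label   -> argmax (target class)
     * (0, 0)       -> {1, 0}  -> 0
     * (0, 1)       -> {0, 1}  -> 1
     * (1, 0)       -> {0, 1}  -> 1
     * (1, 1)       -> {0, 1}  -> 1    => class = a OR b
     */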

View File

@@ -6,17 +6,6 @@ typedef struct __attribute__((packed)) {
 
 #define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 
-#define matrixGetIJ(mat, idx, i, j) \
-	{ \
-		if(!mat.transposed) { \
-			i = idx / mat.cols; \
-			j = idx % mat.cols; \
-		} else { \
-			i = idx % mat.rows; \
-			j = idx / mat.rows; \
-		} \
-	}
-
 void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
@@ -66,7 +55,7 @@ void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat
 	uint idx = get_global_id(0);
 	if(idx >= mat.rows * mat.cols) return;
 
-	other_values[idx] *= mat_values[idx];
+	mat_values[idx] *= other_values[idx];
 }
 
 void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) {
@@ -84,6 +73,16 @@ cl_GPUMat clm_matrixTranspose(cl_GPUMat mat) {
 	return tr;
 }
 
+void clm_matrixPrint(cl_GPUMat mat, __global float *ptr) {
+	printf("THEMATRIX: [");
+	for(unsigned int i = 0; i < mat.rows; i++) {
+		for(unsigned int j = 0; j < mat.cols; j++) {
+			printf("%7.6f", matrixAt(mat, ptr, i, j));
+		}
+	}
+	printf("]");
+}
+
 __kernel void linear_forward(unsigned int batchSize,
                              cl_GPUMat input, __global float *input_values,
                              cl_GPUMat weights, __global float *weights_values,
@@ -93,6 +92,7 @@ __kernel void linear_forward(unsigned int batchSize,
 	for(unsigned int b = 0; b < batchSize; b++) {
 		__global float *batchInput_values = input_values + b * input.rows * input.cols;
 		__global float *batchOut_values = out_values + b * out.rows * out.cols;
+
 		mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values);
 		mat_add(out, batchOut_values, bias, bias_values);
 		mat_sigmoid(out, batchOut_values);
@@ -136,8 +136,8 @@ __kernel void linear_backprop_2(unsigned int batchSize,
 		mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
 
 		if(updateErrors) {
-			cl_GPUMat weightsT = clm_matrixTranspose(weightsT);
-			// mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
+			cl_GPUMat weightsT = clm_matrixTranspose(weights);
+			mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
 		}
 	}
 }
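Note: the two functional fixes in this file are easy to miss. mat_multiply_elements now accumulates into mat_values (the destination) instead of scaling other_values, and linear_backprop_2 transposes weights rather than the weightsT variable it was in the middle of defining (reading a variable in its own initializer is undefined behavior in C and OpenCL C). A host-side analogue of the first fix, plain C for illustration only (names are hypothetical):

    #include <stdio.h>

    // dst[i] *= src[i] is the fixed direction; the old code did src[i] *= dst[i],
    // mutating the multiplier and leaving the destination unchanged.
    static void multiply_elements(float *dst, const float *src, unsigned int n) {
        for(unsigned int i = 0; i < n; i++) dst[i] *= src[i];
    }

    int main(void) {
        float gradient[3] = {0.5f, 1.0f, 2.0f};
        float deriv[3] = {0.25f, 0.5f, 0.75f};
        multiply_elements(gradient, deriv, 3);
        for(int i = 0; i < 3; i++) printf("%f\n", gradient[i]); // 0.125000 0.500000 1.500000
        return 0;
    }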