From 09fea040228be1aacb856897dad2bf1d23d027e7 Mon Sep 17 00:00:00 2001 From: MrLetsplay Date: Wed, 6 Mar 2024 22:45:01 +0100 Subject: [PATCH] Fix OpenCL implementation --- src/clm.c | 11 +++++++- src/clm.h | 1 + src/clm_opencl.c | 71 ++++++++++++++++++++++++++++-------------------- src/cltest.c | 46 ++++++++++++++++++++++++++----- src/mat.cl | 28 +++++++++---------- 5 files changed, 105 insertions(+), 52 deletions(-) diff --git a/src/clm.c b/src/clm.c index 9baceda..7bbe230 100644 --- a/src/clm.c +++ b/src/clm.c @@ -253,11 +253,20 @@ void clm_freeLinear(clm_Linear linear) { clm_freeMatrix(linear.bias.matrix); } +void clm_vectorPrint(clm_Vector vec) { + printf("["); + for(unsigned int i = 0; i < vec.length; i++) { + printf("%f", vec.values[i]); + if(i != vec.length - 1) printf(" "); + } + printf("]\n"); +} + void clm_matrixPrint(clm_Matrix mat) { printf("[\n"); for(unsigned int i = 0; i < mat.rows; i++) { for(unsigned int j = 0; j < mat.cols; j++) { - printf("%7.3f", matrixAt(mat, i, j)); + printf("%f ", matrixAt(mat, i, j)); } printf("\n"); } diff --git a/src/clm.h b/src/clm.h index 53bdbe9..8c166a1 100644 --- a/src/clm.h +++ b/src/clm.h @@ -81,6 +81,7 @@ bool clm_vectorIsInvalid(clm_Vector vec); clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs); clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize); +void clm_vectorPrint(clm_Vector vec); void clm_matrixPrint(clm_Matrix mat); void clm_matrixPrintShape(clm_Matrix mat); void clm_freeMatrix(clm_Matrix mat); diff --git a/src/clm_opencl.c b/src/clm_opencl.c index e028b25..8a961c1 100644 --- a/src/clm_opencl.c +++ b/src/clm_opencl.c @@ -49,7 +49,7 @@ int clm_gpuInit(unsigned int mode) { return 1; } - printf("%s", buffer); + // printf("%s", buffer); context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err); if(!context) { @@ -132,13 +132,16 @@ static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *m cl_mem mem = allocGPUMat(gpuMat, numMats, flags, nativeBuf); for(unsigned int i = 0; i < numMats; i++) { - err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL); + err = clEnqueueWriteBuffer(queue, mem, CL_FALSE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Failed to enqueue write: %s\n", clm_clErrorToString(err)); return NULL; } } + clFlush(queue); + clFinish(queue); + return mem; } @@ -153,20 +156,22 @@ static cl_mem writeNativeMatrix(clm_NativeMatrix matrix) { return mem; } -static cl_mem writeNativeMatrixArray(clm_NativeMatrixArray array) { +static cl_mem writeNativeMatrixArray(clm_NativeMatrixArray array, unsigned int n) { clm_Matrix mat = array.matrixes[0]; cl_mem mem = array.native->mem; - // TODO: don't do blocking writes, instead wait once at the end size_t matLength = sizeof(float) * mat.rows * mat.cols; for(unsigned int i = 0; i < array.length; i++) { - cl_int err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL); + cl_int err = clEnqueueWriteBuffer(queue, mem, CL_FALSE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Failed to enqueue write: %s\n", clm_clErrorToString(err)); return NULL; } } + clFlush(queue); + clFinish(queue); + return mem; } @@ -175,12 +180,15 @@ static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, c cl_mem mem = nativeBuf->mem; for(unsigned int i = 0; i < numMats; i++) { - err = clEnqueueReadBuffer(queue, mem, CL_TRUE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL); + err = clEnqueueReadBuffer(queue, mem, CL_FALSE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Failed to enqueue read: %s\n", clm_clErrorToString(err)); return; } } + + clFlush(queue); + clFinish(queue); } static void readNativeMatrix(clm_NativeMatrix matrix) { @@ -191,18 +199,21 @@ static void readNativeMatrix(clm_NativeMatrix matrix) { } } -static void readNativeMatrixArray(clm_NativeMatrixArray array) { +static void readNativeMatrixArray(clm_NativeMatrixArray array, unsigned int n) { clm_Matrix mat = array.matrixes[0]; size_t matLength = sizeof(float) * mat.rows * mat.cols; // TODO: don't do blocking reads, instead wait once at the end - for(unsigned int i = 0; i < array.length; i++) { - cl_int err = clEnqueueReadBuffer(queue, array.native->mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL); + for(unsigned int i = 0; i < n; i++) { + cl_int err = clEnqueueReadBuffer(queue, array.native->mem, CL_FALSE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Failed to enqueue read: %s\n", clm_clErrorToString(err)); return; } } + + clFlush(queue); + clFinish(queue); } static void clm_nativeAllocMatrix(clm_NativeMatrix *matrix, cl_mem_flags flags) { @@ -281,13 +292,7 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i clFlush(queue); clFinish(queue); - /*err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to read from buffer\n"); - return; - }*/ - - readNativeMatrixArray(linear->output); + readNativeMatrixArray(linear->output, batchSize); } void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors) { @@ -305,8 +310,8 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput); cl_mem matWeights_values = writeNativeMatrix(linear->weights); - cl_mem matInputErrors_values = writeNativeMatrixArray(linear->error); - cl_mem matOutput_values = writeNativeMatrixArray(linear->output); + cl_mem matInputErrors_values = writeNativeMatrixArray(linear->error, batchSize); + cl_mem matOutput_values = writeNativeMatrixArray(linear->output, batchSize); if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) { printf("Failed to write GPU mats\n"); return; @@ -333,13 +338,17 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS err |= clSetKernelArg(kernelLinearBackprop1, 8, sizeof(matOutputGradients), &matOutputGradients); err |= clSetKernelArg(kernelLinearBackprop1, 9, sizeof(matOutputGradients_values), &matOutputGradients_values); if(err != CL_SUCCESS) { - printf("Failed to set kernel args (2): %d\n", err); + printf("Failed to set kernel args (2): %s\n", clm_clErrorToString(err)); return; } size_t step1WorkSize = matOutputGradients.rows * matOutputGradients.cols; size_t step1Global = ceil((float) step1WorkSize / kernelLinearBackprop1Local) * kernelLinearBackprop1Local; - clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL); + err = clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL); + if(err != CL_SUCCESS) { + printf("Failed to enqueue: %s\n", clm_clErrorToString(err)); + return; + } clFlush(queue); clFinish(queue); @@ -354,9 +363,11 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS err |= clSetKernelArg(kernelLinearBackprop2, 6, sizeof(matInputErrors_values), &matInputErrors_values); err |= clSetKernelArg(kernelLinearBackprop2, 7, sizeof(cl_char), &updateErrors); + err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors); if(updateErrors) { - err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors); err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), &matOutputErrors_values); + } else { + err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), NULL); } err |= clSetKernelArg(kernelLinearBackprop2, 10, sizeof(matOutputWeightsErrors), &matOutputWeightsErrors); @@ -364,23 +375,23 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS err |= clSetKernelArg(kernelLinearBackprop2, 12, sizeof(matOutputGradients), &matOutputGradients); err |= clSetKernelArg(kernelLinearBackprop2, 13, sizeof(matOutputGradients_values), &matOutputGradients_values); if(err != CL_SUCCESS) { - printf("Failed to set kernel args (3): %d\n", err); + printf("Failed to set kernel args (3): %s\n", clm_clErrorToString(err)); return; } size_t step2WorkSize = matOutputWeightsErrors.rows * matOutputWeightsErrors.cols; - size_t step2Global = ceil((float) step1WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local; - clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL); + size_t step2Global = ceil((float) step2WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local; + err = clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL); + if(err != CL_SUCCESS) { + printf("Failed to enqueue: %s\n", clm_clErrorToString(err)); + return; + } clFlush(queue); clFinish(queue); - clm_matrixPrint(linear->weightsError.matrixes[0]); - - readNativeMatrixArray(linear->weightsError); - readNativeMatrixArray(linear->gradient); - - clm_matrixPrint(linear->weightsError.matrixes[0]); + readNativeMatrixArray(linear->weightsError, batchSize); + readNativeMatrixArray(linear->gradient, batchSize); if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputErrors); } diff --git a/src/cltest.c b/src/cltest.c index bc97c78..82ab4db 100644 --- a/src/cltest.c +++ b/src/cltest.c @@ -37,9 +37,12 @@ float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels // printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0], // train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]); float *pred = predict(nn, images[idx]); + + // if(idx < 100) clm_vectorPrint((clm_Vector){.values = pred, .length = labels[0].length}); + unsigned int predDigit = 0; float max = -1; - for(unsigned int j = 0; j < 10; j++) { + for(unsigned int j = 0; j < labels[0].length; j++) { // printf("%.2f ", pred[j]); if(pred[j] > max || max < 0) { max = pred[j]; @@ -50,15 +53,17 @@ float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels unsigned int actDigit = 0; float maxA = -1; - for(unsigned int j = 0; j < 10; j++) { + for(unsigned int j = 0; j < labels[0].length; j++) { // printf("%.2f ", pred[j]); if(labels[idx].values[j] > maxA || maxA < 0) { maxA = labels[idx].values[j]; actDigit = j; } } - // if(idx < 100) printf("Actual: %u\n", actDigit); - // printf("\n"); + // if(idx < 100) { + // printf("Actual: %u\n", actDigit); + // printf("\n"); + // } if(predDigit == actDigit) correct++; } @@ -233,7 +238,7 @@ int main(int argc, const char *argv[]) { srand(1); - unsigned int + /*unsigned int i = 784, h = 30, o = 10; @@ -241,13 +246,40 @@ int main(int argc, const char *argv[]) { clm_Linear layers[] = { clm_linearCreateRandom(i, h), clm_linearCreateRandom(h, o)}; - clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000); + clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);*/ + + float v_00[2] = {0, 0}; + float v_01[2] = {0, 1}; + float v_10[2] = {1, 0}; + float v_11[2] = {1, 1}; + + images = calloc(4, sizeof(clm_Vector)); + images[0] = (clm_Vector){.values = v_00, .length = 2}; + images[1] = (clm_Vector){.values = v_01, .length = 2}; + images[2] = (clm_Vector){.values = v_10, .length = 2}; + images[3] = (clm_Vector){.values = v_11, .length = 2}; + + labels = calloc(4, sizeof(clm_Vector)); + labels[0] = (clm_Vector){.values = v_10, .length = 2}; + labels[1] = (clm_Vector){.values = v_01, .length = 2}; + labels[2] = (clm_Vector){.values = v_01, .length = 2}; + labels[3] = (clm_Vector){.values = v_01, .length = 2}; + + imageCount = 4; + + unsigned int + i = 2, + o = 2; + + clm_Linear layers[] = { + clm_linearCreateRandom(i, o)}; + clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.5, 4); for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) { clm_linearInit(&nn.layers[i]); } - for(unsigned int epoch = 0; epoch < 10; epoch++) { + for(unsigned int epoch = 0; epoch < 1000; epoch++) { printf("Epoch %u\n", epoch); /*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample if(idx % 1000 == 0) { diff --git a/src/mat.cl b/src/mat.cl index f078e43..dd0cc15 100644 --- a/src/mat.cl +++ b/src/mat.cl @@ -6,17 +6,6 @@ typedef struct __attribute__((packed)) { #define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)] -#define matrixGetIJ(mat, idx, i, j) \ - { \ - if(!mat.transposed) { \ - i = idx / mat.cols; \ - j = idx % mat.cols; \ - } else { \ - i = idx % mat.rows; \ - j = idx / mat.rows; \ - } \ - } - void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) { uint idx = get_global_id(0); if(idx >= matOut.rows * matOut.cols) return; @@ -66,7 +55,7 @@ void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat uint idx = get_global_id(0); if(idx >= mat.rows * mat.cols) return; - other_values[idx] *= mat_values[idx]; + mat_values[idx] *= other_values[idx]; } void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) { @@ -84,6 +73,16 @@ cl_GPUMat clm_matrixTranspose(cl_GPUMat mat) { return tr; } +void clm_matrixPrint(cl_GPUMat mat, __global float *ptr) { + printf("THEMATRIX: ["); + for(unsigned int i = 0; i < mat.rows; i++) { + for(unsigned int j = 0; j < mat.cols; j++) { + printf("%7.6f", matrixAt(mat, ptr, i, j)); + } + } + printf("]"); +} + __kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, @@ -93,6 +92,7 @@ __kernel void linear_forward(unsigned int batchSize, for(unsigned int b = 0; b < batchSize; b++) { __global float *batchInput_values = input_values + b * input.rows * input.cols; __global float *batchOut_values = out_values + b * out.rows * out.cols; + mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values); mat_add(out, batchOut_values, bias, bias_values); mat_sigmoid(out, batchOut_values); @@ -136,8 +136,8 @@ __kernel void linear_backprop_2(unsigned int batchSize, mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values); if(updateErrors) { - cl_GPUMat weightsT = clm_matrixTranspose(weightsT); - // mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values); + cl_GPUMat weightsT = clm_matrixTranspose(weights); + mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values); } } }