From d55f7c69baa5699fbf27e4a09bb55052e37ca093 Mon Sep 17 00:00:00 2001
From: MrLetsplay
Date: Thu, 22 Feb 2024 17:42:19 +0100
Subject: [PATCH] More OpenCL debugging

---
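Notes (below the --- line, so not part of the commit message):

This commit makes clm_linearForward() and clm_linearBackprop() drain the
command queue after reading their results back, so the host never touches
the output matrices while a readback may still be in flight. The pattern
being relied on looks roughly like the sketch below (queue, gpuBuf and
host are placeholder names, not identifiers from this repo; clFinish()
alone would suffice, since it implies a flush and then waits):

	#include <CL/cl.h>

	static void read_results(cl_command_queue queue, cl_mem gpuBuf, float *host, size_t n) {
		/* CL_FALSE = non-blocking read: the call can return before the copy is done */
		clEnqueueReadBuffer(queue, gpuBuf, CL_FALSE, 0, n * sizeof(float), host, 0, NULL, NULL);
		clFlush(queue);  /* submit everything queued to the device (does not wait) */
		clFinish(queue); /* block until all submitted commands complete; host[] is now valid */
	}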
 src/clm_opencl.c |   6 +++
 src/cltest.c     | 104 +++++++++++++++++++----------------------------
 src/mat.cl       |  21 ++++++----
 3 files changed, 62 insertions(+), 69 deletions(-)

diff --git a/src/clm_opencl.c b/src/clm_opencl.c
index b5024fc..42790b9 100644
--- a/src/clm_opencl.c
+++ b/src/clm_opencl.c
@@ -230,6 +230,9 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	}*/
 
 	readGPUMats(matOut, batchSize, outputs, linear->nativeOutput);
+
+	clFlush(queue);
+	clFinish(queue);
 }
 
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
@@ -320,4 +323,7 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	readGPUMats(matOutputGradients, batchSize, outputGradients, linear->nativeGradient);
 	readGPUMats(matOutputWeightsErrors, batchSize, outputWeightsErrors, linear->nativeWeightsError);
 	if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputError);
+
+	clFlush(queue);
+	clFinish(queue);
 }
diff --git a/src/cltest.c b/src/cltest.c
index dd40f46..5bf5527 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -31,6 +31,41 @@ float *predict(clm_NN nn, clm_Vector input) {
 	return xM.values;
 }
 
+float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels) {
+	unsigned int correct = 0;
+	for(unsigned int idx = 0; idx < count; idx++) { // Each train sample
+		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
+		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
+		float *pred = predict(nn, images[idx]);
+		unsigned int predDigit = 0;
+		float max = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(pred[j] > max || max < 0) {
+				max = pred[j];
+				predDigit = j;
+			}
+		}
+		// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
+
+		unsigned int actDigit = 0;
+		float maxA = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(labels[idx].values[j] > maxA || maxA < 0) {
+				maxA = labels[idx].values[j];
+				actDigit = j;
+			}
+		}
+		// if(idx < 100) printf("Actual: %u\n", actDigit);
+		// printf("\n");
+
+		if(predDigit == actDigit) correct++;
+	}
+
+	return (float) correct / count * 100;
+}
+
 void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *expectedOutputs) {
 	clm_Matrix *batchInputs = calloc(nn.batchSize, sizeof(clm_Matrix));
 	clm_Matrix *batchOutputs = calloc(nn.batchSize, sizeof(clm_Matrix));
@@ -73,31 +108,6 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *
 				clm_matrixAddMatrix(layer->bias, layer->gradient[b]);
 			}
 		}
-
-		/*for(int i = nn.numLayers - 1; i >= 0; i--) {
-			clm_Linear layer = nn.layers[i];
-			clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
-			clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
-			clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
-			clm_Matrix error = layer.error;
-
-			if(i == nn.numLayers - 1) {
-				clm_matrixSubtractMatrix(clm_matrixCopy(batchOutputs[0], error), outputsOfThisLayer[0]); // yhat - y
-			} else {
-				clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
-				clm_matrixMultiplyMatrix(weightsT, prevError, error);
-			}
-
-			clm_Matrix gradient = clm_matrixDSigmoid(outputsOfThisLayer[0]); // dsig(yhat)
-			clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
-			clm_matrixMultiplyScalar(gradient, nn.learnRate);
-
-			clm_Matrix inputT = clm_matrixTranspose(inputsToThisLayer[0]);
-			clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
-
-			clm_matrixAddMatrix(layer.weights, layer.weightsError);
-			clm_matrixAddMatrix(layer.bias, gradient);
-		}*/
 	}
 
 	free(batchInputs);
@@ -216,7 +226,7 @@ int main(int argc, const char *argv[]) {
 	unsigned int imageCount;
 	loadImages(&images, &imageCount);
 
-	imageCount = 60000;
+	imageCount = 600;
 
 	printf("%f\n", images[0].values[0]);
 
@@ -230,13 +240,13 @@
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 500);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);
 
 	for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
 		clm_linearInit(&nn.layers[i]);
 	}
 
-	for(unsigned int epoch = 0; epoch < 1; epoch++) {
+	for(unsigned int epoch = 0; epoch < 10; epoch++) {
 		printf("Epoch %u\n", epoch);
 		/*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 			if(idx % 1000 == 0) {
@@ -245,45 +255,15 @@ int main(int argc, const char *argv[]) {
 			}
 		}*/
 		train(nn, imageCount, images, labels);
+
+		printf("Score: %.2f\n", eval(nn, imageCount, images, labels));
 		printf("\n");
 	}
 
 	printf("Train done\n");
 
-	unsigned int correct = 0;
-	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
-		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
-		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
-		float *pred = predict(nn, images[idx]);
-		unsigned int predDigit = 0;
-		float max = -1;
-		for(unsigned int j = 0; j < 10; j++) {
-			// printf("%.2f ", pred[j]);
-			if(pred[j] > max || max < 0) {
-				max = pred[j];
-				predDigit = j;
-			}
-		}
-		// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
-
-		unsigned int actDigit = 0;
-		float maxA = -1;
-		for(unsigned int j = 0; j < 10; j++) {
-			// printf("%.2f ", pred[j]);
-			if(labels[idx].values[j] > maxA || maxA < 0) {
-				maxA = labels[idx].values[j];
-				actDigit = j;
-			}
-		}
-		// if(idx < 100) printf("Actual: %u\n", actDigit);
-		// printf("\n");
-
-		if(predDigit == actDigit) correct++;
-	}
-
-	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
-
-	printf("\n");
+	float score = eval(nn, imageCount, images, labels);
+	printf("Correct: %.2f\n", score);
 
 	clm_gpuDestroy();
 }
diff --git a/src/mat.cl b/src/mat.cl
index fbc8b87..f078e43 100644
--- a/src/mat.cl
+++ b/src/mat.cl
@@ -21,8 +21,9 @@ void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, _
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
 
-	uint i, j;
-	matrixGetIJ(matOut, idx, i, j);
+	// TODO: might not work with transposed matOut
+	uint i = idx / matOut.cols;
+	uint j = idx % matOut.cols;
 
 	float sum = 0;
 	for(unsigned int k = 0; k < matA.cols; k++) {
@@ -75,6 +76,14 @@ void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar
 	mat_values[idx] *= scalar;
 }
 
+cl_GPUMat clm_matrixTranspose(cl_GPUMat mat) {
+	cl_GPUMat tr = {0};
+	tr.cols = mat.rows;
+	tr.rows = mat.cols;
+	tr.transposed = !mat.transposed;
+	return tr;
+}
+
 __kernel void linear_forward(unsigned int batchSize,
 	cl_GPUMat input, __global float *input_values,
 	cl_GPUMat weights, __global float *weights_values,
@@ -123,14 +132,12 @@ __kernel void linear_backprop_2(unsigned int batchSize,
 		__global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols;
 		__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
 
-		cl_GPUMat inputsT = inputs;
-		inputsT.transposed = true;
+		cl_GPUMat inputsT = clm_matrixTranspose(inputs);
 		mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
 
 		if(updateErrors) {
-			cl_GPUMat weightsT = weights;
-			weightsT.transposed = true;
-			mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
+			cl_GPUMat weightsT = clm_matrixTranspose(weights);
+			// mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
 		}
 	}
 }
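--
Note on the transpose handling above: mat_multiply() now derives i and j as
i = idx / matOut.cols and j = idx % matOut.cols, which assumes matOut is a
plain row-major matrix; the TODO marks that this ignores the transposed flag.
If values are addressed through a helper that honors the flag introduced by
clm_matrixTranspose(), the linear index has to flip roughly as in this sketch
(assumed semantics of cl_GPUMat's fields, not code from mat.cl):

	uint matrixIndex(uint rows, uint cols, bool transposed, uint i, uint j) {
		// a transposed view reuses the original storage, so element (i, j)
		// lives where (j, i) lived in the untransposed row-major layout
		return transposed ? j * rows + i : i * cols + j;
	}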