From 8240f7c01b95a6640018d75a97a13d525ac47d5c Mon Sep 17 00:00:00 2001
From: MrLetsplay
Date: Tue, 31 Oct 2023 14:30:09 +0100
Subject: [PATCH] Basic batch implementation (WIP)

---
 src/clm.c            | 19 +++++++--
 src/clm.h            |  4 +-
 src/clm_gpu.h        |  2 +-
 src/clm_gpu_cpu.c    | 19 ++++-----
 src/clm_gpu_opencl.c | 57 +++++++++++++++++----------
 src/cltest.c         | 94 +++++++++++++++++++++++++-------------------
 src/mat.cl           | 17 ++++----
 7 files changed, 129 insertions(+), 83 deletions(-)

diff --git a/src/clm.c b/src/clm.c
index 5221e53..0f2a4a8 100644
--- a/src/clm.c
+++ b/src/clm.c
@@ -52,7 +52,7 @@ clm_Matrix clm_matrixCopyALLOC(clm_Matrix mat) {
 
 clm_Matrix clm_matrixCopy(clm_Matrix mat, clm_Matrix out) {
 	if(mat.cols != out.cols || mat.rows != out.rows) {
-		printf("Failed to copy matrix\n");
+		printf("Failed to copy matrix (got %dx%d and %dx%d)\n", mat.rows, mat.cols, out.rows, out.cols);
 		return INVALID_MATRIX;
 	}
 
@@ -86,7 +86,7 @@ clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) {
 
 clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
 	if(mat.cols != other.cols || mat.rows != other.rows) {
-		printf("Failed to sub matrices\n");
+		printf("Failed to sub matrices (got %dx%d and %dx%d)\n", mat.rows, mat.cols, other.rows, other.cols);
 		return INVALID_MATRIX;
 	}
 
@@ -210,12 +210,25 @@ clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
 
 	linear.weights = clm_createMatrixRandom(outputs, inputs);
 	linear.bias = clm_createMatrixRandom(outputs, 1);
-	linear.output = clm_createMatrix(outputs, 1);
+	// linear.output = clm_createMatrix(outputs, 1);
 	linear.error = clm_createMatrix(outputs, 1);
 	linear.weightsError = clm_createMatrix(outputs, inputs);
 	return linear;
 }
 
+clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize) {
+	clm_NN nn = {.numLayers = numLayers, .layers = layers, .learnRate = learnRate, .batchSize = batchSize};
+
+	for(unsigned int i = 0; i < numLayers; i++) {
+		layers[i].output = calloc(batchSize, sizeof(clm_Matrix));
+		for(unsigned int j = 0; j < batchSize; j++) {
+			layers[i].output[j] = clm_createMatrix(layers[i].weights.rows, 1);
+		}
+	}
+
+	return nn;
+}
+
 void clm_freeVector(clm_Vector vector) {
 	free(vector.values);
 }
diff --git a/src/clm.h b/src/clm.h
index 2897d52..bc0d2a1 100644
--- a/src/clm.h
+++ b/src/clm.h
@@ -22,7 +22,7 @@
 typedef struct {
 	clm_Matrix weights;
 	clm_Matrix bias;
-	clm_Matrix output;
+	clm_Matrix *output;
 	clm_Matrix error;
 	clm_Matrix weightsError;
 	clm_NativeBuf *nativeWeights;
@@ -34,6 +34,7 @@ typedef struct {
 	clm_Linear *layers;
 	unsigned int numLayers;
 	float learnRate;
+	unsigned int batchSize;
 } clm_NN;
 
 extern const clm_Matrix INVALID_MATRIX;
@@ -64,6 +65,7 @@ clm_Vector clm_vectorCreate(unsigned int length);
 bool clm_vectorIsInvalid(clm_Vector vec);
 
 clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs);
+clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize);
 
 void clm_matrixPrint(clm_Matrix mat);
 void clm_matrixPrintShape(clm_Matrix mat);
diff --git a/src/clm_gpu.h b/src/clm_gpu.h
index 2548383..ed04b01 100644
--- a/src/clm_gpu.h
+++ b/src/clm_gpu.h
@@ -6,6 +6,6 @@
 int clm_gpuInit();
 void clm_gpuDestroy();
 
-void clm_linearForward(clm_Linear *linear, clm_Matrix input);
+void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 
 #endif
diff --git a/src/clm_gpu_cpu.c b/src/clm_gpu_cpu.c
index 7ab5caf..6fda836 100644
--- a/src/clm_gpu_cpu.c
+++ b/src/clm_gpu_cpu.c
@@ -10,15 +10,16 @@ int clm_gpuInit() {
 
 void clm_gpuDestroy() {}
 
-void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
-	clm_Matrix newX = clm_matrixMultiplyMatrix(linear->weights, input, linear->output);
+void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
+	for(unsigned int b = 0; b < batchSize; b++) {
+		clm_matrixMultiplyMatrix(linear->weights, inputs[b], outputs[b]);
 
-	if(clm_matrixIsInvalid(newX)) {
-		printf("Forward pass failed\n");
-		return;
+		if(clm_matrixIsInvalid(outputs[b])) {
+			printf("Forward pass failed\n");
+			return;
+		}
+
+		clm_matrixAddMatrix(outputs[b], linear->bias);
+		clm_matrixSigmoid(outputs[b]);
 	}
-
-	clm_matrixAddMatrix(newX, linear->bias);
-	clm_matrixSigmoid(newX);
-	linear->output = newX;
 }
diff --git a/src/clm_gpu_opencl.c b/src/clm_gpu_opencl.c
index fd64052..a999b2e 100644
--- a/src/clm_gpu_opencl.c
+++ b/src/clm_gpu_opencl.c
@@ -88,11 +88,11 @@ int clm_gpuInit() {
 void clm_gpuDestroy() {
 }
 
-static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
+static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nativeBuf) {
 	cl_int err;
 
 	if(!nativeBuf->mem) {
-		cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mat.rows * mat.cols, NULL, &err);
+		cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
 		if(!mat_values) {
 			printf("Failed to alloc buffer: %d\n", err);
 			return NULL;
@@ -105,7 +105,7 @@ static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
 
 static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
 	cl_int err;
-	cl_mem mem = allocGPUMat(gpuMat, nativeBuf);
+	cl_mem mem = allocGPUMat(gpuMat, 1, nativeBuf);
 
 	err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
@@ -116,49 +116,66 @@ static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativ
 	return mem;
 }
 
+static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
+	cl_int err;
+	cl_mem mem = allocGPUMat(gpuMat, numMats, nativeBuf);
+
+	for(unsigned int i = 0; i < numMats; i++) {
+		err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
+		if(err != CL_SUCCESS) {
+			printf("Failed to enqueue write: %d\n", err);
+			return NULL;
+		}
+	}
+
+	return mem;
+}
+
 clm_NativeBuf nativeInput;
 
 // TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
-void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
+void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
+	if(batchSize == 0) return;
+
 	if(!linear->nativeWeights) {
 		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
 		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
 		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
 	}
 
-	cl_GPUMat matInput = gpuMat(input);
+	cl_GPUMat matInput = gpuMat(inputs[0]);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matBias = gpuMat(linear->bias);
-	cl_GPUMat matOut = gpuMat(linear->output);
+	cl_GPUMat matOut = gpuMat(outputs[0]);
 
 	size_t workSize = matOut.rows * matOut.cols;
 
 	cl_int err;
 
-	cl_mem matInput_values = writeGPUMat(matInput, input, &nativeInput);
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, &nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
 	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
 	if(!matInput_values || !matWeights_values || !matBias_values) {
-		linear->output = INVALID_MATRIX;
+		// linear->output = INVALID_MATRIX;
 		return;
 	}
 
-	cl_mem matOut_values = allocGPUMat(matOut, linear->nativeOutput);
+	cl_mem matOut_values = allocGPUMat(matOut, batchSize, linear->nativeOutput);
 	if(!matOut_values) {
-		printf("Failed to alloc out: %d\n", err);
-		linear->output = INVALID_MATRIX;
+		// linear->output = INVALID_MATRIX;
 		return;
 	}
 
 	err = 0;
-	err |= clSetKernelArg(kernel, 0, sizeof(matInput), &matInput);
-	err |= clSetKernelArg(kernel, 1, sizeof(matInput_values), &matInput_values);
-	err |= clSetKernelArg(kernel, 2, sizeof(matWeights), &matWeights);
-	err |= clSetKernelArg(kernel, 3, sizeof(matWeights_values), &matWeights_values);
-	err |= clSetKernelArg(kernel, 4, sizeof(matBias), &matBias);
-	err |= clSetKernelArg(kernel, 5, sizeof(matBias_values), &matBias_values);
-	err |= clSetKernelArg(kernel, 6, sizeof(matOut), &matOut);
-	err |= clSetKernelArg(kernel, 7, sizeof(matOut_values), &matOut_values);
+	err |= clSetKernelArg(kernel, 0, sizeof(cl_uint), &batchSize);
+	err |= clSetKernelArg(kernel, 1, sizeof(matInput), &matInput);
+	err |= clSetKernelArg(kernel, 2, sizeof(matInput_values), &matInput_values);
+	err |= clSetKernelArg(kernel, 3, sizeof(matWeights), &matWeights);
+	err |= clSetKernelArg(kernel, 4, sizeof(matWeights_values), &matWeights_values);
+	err |= clSetKernelArg(kernel, 5, sizeof(matBias), &matBias);
+	err |= clSetKernelArg(kernel, 6, sizeof(matBias_values), &matBias_values);
+	err |= clSetKernelArg(kernel, 7, sizeof(matOut), &matOut);
+	err |= clSetKernelArg(kernel, 8, sizeof(matOut_values), &matOut_values);
 	if(err != CL_SUCCESS) {
 		printf("Failed to set kernel args: %d\n", err);
 		return;
@@ -184,7 +201,7 @@ void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
 
 	clFinish(queue);
 
-	err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output.values, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to read from buffer\n");
 		return;
diff --git a/src/cltest.c b/src/cltest.c
index 69eccb9..f138812 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -1,5 +1,5 @@
-#include
 #include
+#include
 #include
 #include
 
@@ -18,54 +18,69 @@ float train_data_y[4][1] = {
 	{1},
 	{0}};
 
-float *predict(clm_NN nn, float *x, unsigned int length) {
-	clm_Matrix xM = clm_matrixWrapArray(x, length);
+float *predict(clm_NN nn, clm_Vector input) {
+	clm_Matrix xM = clm_matrixWrapArray(input.values, input.length);
 
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_linearForward(&nn.layers[i], xM);
-		xM = nn.layers[i].output;
+		clm_linearForward(&nn.layers[i], 1, &xM, &nn.layers[i].output[0]);
+		xM = nn.layers[i].output[0];
 	}
 
 	return xM.values;
 }
 
-void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
-	clm_Matrix xM = clm_matrixWrapArray(x, xL);
-	clm_Matrix yM = clm_matrixWrapArray(y, yL);
+void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *expectedOutputs) {
+	clm_Matrix *batchInputs = calloc(nn.batchSize, sizeof(clm_Matrix));
+	clm_Matrix *batchOutputs = calloc(nn.batchSize, sizeof(clm_Matrix));
 
-	// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
+	for(unsigned int b = 0; b < ceil((float) numElements / nn.batchSize); b++) {
+		unsigned int batchSize = numElements - b * nn.batchSize;
+		if(batchSize > nn.batchSize) batchSize = nn.batchSize;
 
-	// Forward pass
-	clm_Matrix currentX = xM;
-	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_linearForward(&nn.layers[i], currentX);
-		currentX = nn.layers[i].output;
-	}
+		// printf("Batch %d (size %d)\n", b, batchSize);
 
-	for(int i = nn.numLayers - 1; i >= 0; i--) {
-		clm_Linear layer = nn.layers[i];
-		clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
-		clm_Matrix outputOfThisLayer = nn.layers[i].output;
-		clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
-		clm_Matrix error = layer.error;
-
-		if(i == nn.numLayers - 1) {
-			clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
-		} else {
-			clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
-			clm_matrixMultiplyMatrix(weightsT, prevError, error);
+		for(unsigned int i = 0; i < batchSize; i++) {
+			clm_Vector input = inputs[b * nn.batchSize + i];
+			clm_Vector output = expectedOutputs[b * nn.batchSize + i];
+			batchInputs[i] = clm_matrixWrapArray(input.values, input.length);
+			batchOutputs[i] = clm_matrixWrapArray(output.values, output.length);
 		}
 
-		clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
-		clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
-		clm_matrixMultiplyScalar(gradient, nn.learnRate);
+		// Forward pass
+		clm_Matrix *currentXs = batchInputs;
+		for(unsigned int i = 0; i < nn.numLayers; i++) {
+			clm_linearForward(&nn.layers[i], batchSize, currentXs, nn.layers[i].output);
+			currentXs = nn.layers[i].output;
+		}
 
-		clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
-		clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
+		for(int i = nn.numLayers - 1; i >= 0; i--) {
+			clm_Linear layer = nn.layers[i];
+			clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
+			clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
+			clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
+			clm_Matrix error = layer.error;
 
-		clm_matrixAddMatrix(layer.weights, layer.weightsError);
-		clm_matrixAddMatrix(layer.bias, gradient);
+			if(i == nn.numLayers - 1) {
+				clm_matrixSubtractMatrix(clm_matrixCopy(batchOutputs[0], error), outputsOfThisLayer[0]); // yhat - y
+			} else {
+				clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
+				clm_matrixMultiplyMatrix(weightsT, prevError, error);
+			}
+
+			clm_Matrix gradient = clm_matrixDSigmoid(outputsOfThisLayer[0]); // dsig(yhat)
+			clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
+			clm_matrixMultiplyScalar(gradient, nn.learnRate);
+
+			clm_Matrix inputT = clm_matrixTranspose(inputsToThisLayer[0]);
+			clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
+
+			clm_matrixAddMatrix(layer.weights, layer.weightsError);
+			clm_matrixAddMatrix(layer.bias, gradient);
+		}
 	}
+
+	free(batchInputs);
+	free(batchOutputs);
 }
 
 void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
@@ -175,18 +190,17 @@ int main() {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
 
-	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 100);
 
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
 		printf("Epoch %u\n", epoch);
-		for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
+		/*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 			if(idx % 1000 == 0) {
 				printf("\r%.2f%%", idx / (float) imageCount * 100);
 				fflush(stdout);
 			}
-
-			train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
-		}
+		}*/
+		train(nn, imageCount, images, labels);
 		printf("\n");
 	}
@@ -196,7 +210,7 @@ int main() {
 	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
 		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
-		float *pred = predict(nn, images[idx].values, images[idx].length);
+		float *pred = predict(nn, images[idx]);
 		unsigned int predDigit = 0;
 		float max = -1;
 		for(unsigned int j = 0; j < 10; j++) {
diff --git a/src/mat.cl b/src/mat.cl
index 0fa9eee..2f6d460 100644
--- a/src/mat.cl
+++ b/src/mat.cl
@@ -34,14 +34,13 @@ void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
 	mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
 }
 
-// clm_Matrix input;
-// clm_Matrix weights;
-// clm_Matrix bias;
-// clm_Matrix output;
-
-__kernel void linear_forward(cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
+__kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
 	// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
-	mat_multiply(weights, weights_values, input, input_values, out, out_values);
-	mat_add(out, out_values, bias, bias_values);
-	mat_sigmoid(out, out_values);
+	for(unsigned int b = 0; b < batchSize; b++) {
+		__global float *batchInput_values = input_values + b * input.rows * input.cols;
+		__global float *batchOut_values = out_values + b * out.rows * out.cols;
+		mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values);
+		mat_add(out, batchOut_values, bias, bias_values);
+		mat_sigmoid(out, batchOut_values);
+	}
 }
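
Note on the readback in clm_linearForward (src/clm_gpu_opencl.c): the hunk above still copies only the first batch element back into linear->output[0]. A minimal sketch of a full batched readback, assuming the contiguous one-block-per-element buffer layout that writeGPUMats and the kernel already use; this is not part of this patch:

	/* Sketch only (not in this patch): copy each batch element out of the
	   single contiguous output buffer, one rows*cols block per element. */
	for(unsigned int b = 0; b < batchSize; b++) {
		err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE,
		                          b * sizeof(float) * workSize, /* byte offset of element b */
		                          sizeof(float) * workSize,     /* one output matrix */
		                          outputs[b].values, 0, NULL, NULL);
		if(err != CL_SUCCESS) {
			printf("Failed to read from buffer\n");
			return;
		}
	}

All identifiers used here (queue, matOut_values, workSize, outputs, batchSize, err) are the ones already in scope in that function.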