Basic batch implementation (WIP)

MrLetsplay 2023-10-31 14:30:09 +01:00
parent 4d93fa76cc
commit 8240f7c01b
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
7 changed files with 129 additions and 83 deletions

View File

@@ -52,7 +52,7 @@ clm_Matrix clm_matrixCopyALLOC(clm_Matrix mat) {
 clm_Matrix clm_matrixCopy(clm_Matrix mat, clm_Matrix out) {
     if(mat.cols != out.cols || mat.rows != out.rows) {
-        printf("Failed to copy matrix\n");
+        printf("Failed to copy matrix (got %dx%d and %dx%d)\n", mat.rows, mat.cols, out.rows, out.cols);
         return INVALID_MATRIX;
     }
@@ -86,7 +86,7 @@ clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) {
 clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
     if(mat.cols != other.cols || mat.rows != other.rows) {
-        printf("Failed to sub matrices\n");
+        printf("Failed to sub matrices (got %dx%d and %dx%d)\n", mat.rows, mat.cols, other.rows, other.cols);
         return INVALID_MATRIX;
     }
@@ -210,12 +210,25 @@ clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
     linear.weights = clm_createMatrixRandom(outputs, inputs);
     linear.bias = clm_createMatrixRandom(outputs, 1);
-    linear.output = clm_createMatrix(outputs, 1);
+    // linear.output = clm_createMatrix(outputs, 1);
     linear.error = clm_createMatrix(outputs, 1);
     linear.weightsError = clm_createMatrix(outputs, inputs);
     return linear;
 }
 
+clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize) {
+    clm_NN nn = {.numLayers = numLayers, .layers = layers, .learnRate = learnRate, .batchSize = batchSize};
+    for(unsigned int i = 0; i < numLayers; i++) {
+        layers[i].output = calloc(batchSize, sizeof(clm_Matrix));
+        for(unsigned int j = 0; j < batchSize; j++) {
+            layers[i].output[j] = clm_createMatrix(layers[i].weights.rows, 1);
+        }
+    }
+    return nn;
+}
+
 void clm_freeVector(clm_Vector vector) {
     free(vector.values);
 }
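Note that networks must now be constructed through clm_nnCreate so each layer gets its batchSize output matrices allocated; building a clm_NN struct literal directly (as main() previously did) would leave output unallocated. A minimal sketch of the intended call, mirroring the updated cltest.c further down (the sizes i, h, o and the batch size of 100 are taken from that file):

    clm_Linear layers[] = {
        clm_linearCreateRandom(i, h),
        clm_linearCreateRandom(h, o)};
    // 2 layers, learn rate 0.01, batch size 100
    clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 100);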

View File

@@ -22,7 +22,7 @@ typedef struct {
 typedef struct {
     clm_Matrix weights;
     clm_Matrix bias;
-    clm_Matrix output;
+    clm_Matrix *output;
     clm_Matrix error;
     clm_Matrix weightsError;
     clm_NativeBuf *nativeWeights;
@@ -34,6 +34,7 @@ typedef struct {
     clm_Linear *layers;
     unsigned int numLayers;
     float learnRate;
+    unsigned int batchSize;
 } clm_NN;
 
 extern const clm_Matrix INVALID_MATRIX;
@@ -64,6 +65,7 @@ clm_Vector clm_vectorCreate(unsigned int length);
 bool clm_vectorIsInvalid(clm_Vector vec);
 clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs);
+clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize);
 void clm_matrixPrint(clm_Matrix mat);
 void clm_matrixPrintShape(clm_Matrix mat);
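Since output is now a clm_Matrix * with one entry per batch element, callers index it per sample. A sketch, assuming the allocation done by clm_nnCreate above:

    // output of layer i for batch element b (valid for b < nn.batchSize)
    clm_Matrix out_b = nn.layers[i].output[b];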

View File

@@ -6,6 +6,6 @@
 int clm_gpuInit();
 void clm_gpuDestroy();
 
-void clm_linearForward(clm_Linear *linear, clm_Matrix input);
+void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 
 #endif
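The forward-pass entry point is now batched: callers pass arrays of input and output matrices plus a count. Callers with a single sample can pass a batch of one, which is exactly what the updated predict() in cltest.c does:

    clm_Matrix xM = clm_matrixWrapArray(input.values, input.length);
    clm_linearForward(&nn.layers[i], 1, &xM, &nn.layers[i].output[0]);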

View File

@@ -10,15 +10,16 @@ int clm_gpuInit() {
 void clm_gpuDestroy() {}
 
-void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
-    clm_Matrix newX = clm_matrixMultiplyMatrix(linear->weights, input, linear->output);
-    if(clm_matrixIsInvalid(newX)) {
-        printf("Forward pass failed\n");
-        return;
+void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
+    for(unsigned int b = 0; b < batchSize; b++) {
+        clm_matrixMultiplyMatrix(linear->weights, inputs[b], outputs[b]);
+        if(clm_matrixIsInvalid(outputs[b])) {
+            printf("Forward pass failed\n");
+            return;
+        }
+        clm_matrixAddMatrix(outputs[b], linear->bias);
+        clm_matrixSigmoid(outputs[b]);
     }
-    clm_matrixAddMatrix(newX, linear->bias);
-    clm_matrixSigmoid(newX);
-    linear->output = newX;
 }

View File

@@ -88,11 +88,11 @@ int clm_gpuInit() {
 void clm_gpuDestroy() {
 }
 
-static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
+static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nativeBuf) {
     cl_int err;
     if(!nativeBuf->mem) {
-        cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mat.rows * mat.cols, NULL, &err);
+        cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
         if(!mat_values) {
             printf("Failed to alloc buffer: %d\n", err);
             return NULL;
@@ -105,7 +105,7 @@ static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
 static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
     cl_int err;
-    cl_mem mem = allocGPUMat(gpuMat, nativeBuf);
+    cl_mem mem = allocGPUMat(gpuMat, 1, nativeBuf);
     err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
     if(err != CL_SUCCESS) {
@@ -116,49 +116,66 @@ static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
     return mem;
 }
 
+static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
+    cl_int err;
+    cl_mem mem = allocGPUMat(gpuMat, numMats, nativeBuf);
+    for(unsigned int i = 0; i < numMats; i++) {
+        err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
+        if(err != CL_SUCCESS) {
+            printf("Failed to enqueue write: %d\n", err);
+            return NULL;
+        }
+    }
+    return mem;
+}
+
 clm_NativeBuf nativeInput;
 
 // TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
-void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
+void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
+    if(batchSize == 0) return;
+
     if(!linear->nativeWeights) {
         linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
         linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
         linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
     }
 
-    cl_GPUMat matInput = gpuMat(input);
+    cl_GPUMat matInput = gpuMat(inputs[0]);
     cl_GPUMat matWeights = gpuMat(linear->weights);
     cl_GPUMat matBias = gpuMat(linear->bias);
-    cl_GPUMat matOut = gpuMat(linear->output);
+    cl_GPUMat matOut = gpuMat(outputs[0]);
 
     size_t workSize = matOut.rows * matOut.cols;
 
     cl_int err;
 
-    cl_mem matInput_values = writeGPUMat(matInput, input, &nativeInput);
+    cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, &nativeInput);
     cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
     cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
     if(!matInput_values || !matWeights_values || !matBias_values) {
-        linear->output = INVALID_MATRIX;
+        // linear->output = INVALID_MATRIX;
         return;
     }
 
-    cl_mem matOut_values = allocGPUMat(matOut, linear->nativeOutput);
+    cl_mem matOut_values = allocGPUMat(matOut, batchSize, linear->nativeOutput);
     if(!matOut_values) {
-        printf("Failed to alloc out: %d\n", err);
-        linear->output = INVALID_MATRIX;
+        // linear->output = INVALID_MATRIX;
         return;
     }
 
     err = 0;
-    err |= clSetKernelArg(kernel, 0, sizeof(matInput), &matInput);
-    err |= clSetKernelArg(kernel, 1, sizeof(matInput_values), &matInput_values);
-    err |= clSetKernelArg(kernel, 2, sizeof(matWeights), &matWeights);
-    err |= clSetKernelArg(kernel, 3, sizeof(matWeights_values), &matWeights_values);
-    err |= clSetKernelArg(kernel, 4, sizeof(matBias), &matBias);
-    err |= clSetKernelArg(kernel, 5, sizeof(matBias_values), &matBias_values);
-    err |= clSetKernelArg(kernel, 6, sizeof(matOut), &matOut);
-    err |= clSetKernelArg(kernel, 7, sizeof(matOut_values), &matOut_values);
+    err |= clSetKernelArg(kernel, 0, sizeof(cl_uint), &batchSize);
+    err |= clSetKernelArg(kernel, 1, sizeof(matInput), &matInput);
+    err |= clSetKernelArg(kernel, 2, sizeof(matInput_values), &matInput_values);
+    err |= clSetKernelArg(kernel, 3, sizeof(matWeights), &matWeights);
+    err |= clSetKernelArg(kernel, 4, sizeof(matWeights_values), &matWeights_values);
+    err |= clSetKernelArg(kernel, 5, sizeof(matBias), &matBias);
+    err |= clSetKernelArg(kernel, 6, sizeof(matBias_values), &matBias_values);
+    err |= clSetKernelArg(kernel, 7, sizeof(matOut), &matOut);
+    err |= clSetKernelArg(kernel, 8, sizeof(matOut_values), &matOut_values);
     if(err != CL_SUCCESS) {
         printf("Failed to set kernel args: %d\n", err);
         return;
@@ -184,7 +201,7 @@ void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
     clFinish(queue);
 
-    err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output.values, 0, NULL, NULL);
+    err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
     if(err != CL_SUCCESS) {
         printf("Failed to read from buffer\n");
         return;
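writeGPUMats packs the whole batch into a single cl_mem buffer, placing matrix i at byte offset i * sizeof(float) * rows * cols; the kernel in mat.cl below recovers each element with the same arithmetic. A sketch of the layout computation (variable names illustrative):

    // byte offset of batch element i inside the packed buffer,
    // as passed to clEnqueueWriteBuffer in writeGPUMats above
    size_t elemBytes = sizeof(float) * gpuMat.rows * gpuMat.cols;
    size_t offset = (size_t) i * elemBytes;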

View File

@@ -1,5 +1,5 @@
-#include <dlfcn.h>
 #include <inttypes.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -18,54 +18,69 @@ float train_data_y[4][1] = {
     {1},
     {0}};
 
-float *predict(clm_NN nn, float *x, unsigned int length) {
-    clm_Matrix xM = clm_matrixWrapArray(x, length);
+float *predict(clm_NN nn, clm_Vector input) {
+    clm_Matrix xM = clm_matrixWrapArray(input.values, input.length);
     for(unsigned int i = 0; i < nn.numLayers; i++) {
-        clm_linearForward(&nn.layers[i], xM);
-        xM = nn.layers[i].output;
+        clm_linearForward(&nn.layers[i], 1, &xM, &nn.layers[i].output[0]);
+        xM = nn.layers[i].output[0];
     }
     return xM.values;
 }
 
-void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
-    clm_Matrix xM = clm_matrixWrapArray(x, xL);
-    clm_Matrix yM = clm_matrixWrapArray(y, yL);
-
-    // TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
-
-    // Forward pass
-    clm_Matrix currentX = xM;
-    for(unsigned int i = 0; i < nn.numLayers; i++) {
-        clm_linearForward(&nn.layers[i], currentX);
-        currentX = nn.layers[i].output;
-    }
-
-    for(int i = nn.numLayers - 1; i >= 0; i--) {
-        clm_Linear layer = nn.layers[i];
-        clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
-        clm_Matrix outputOfThisLayer = nn.layers[i].output;
-        clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
-        clm_Matrix error = layer.error;
-
-        if(i == nn.numLayers - 1) {
-            clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
-        } else {
-            clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
-            clm_matrixMultiplyMatrix(weightsT, prevError, error);
-        }
-
-        clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
-        clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
-        clm_matrixMultiplyScalar(gradient, nn.learnRate);
-
-        clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
-        clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
-
-        clm_matrixAddMatrix(layer.weights, layer.weightsError);
-        clm_matrixAddMatrix(layer.bias, gradient);
+void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *expectedOutputs) {
+    clm_Matrix *batchInputs = calloc(nn.batchSize, sizeof(clm_Matrix));
+    clm_Matrix *batchOutputs = calloc(nn.batchSize, sizeof(clm_Matrix));
+
+    for(unsigned int b = 0; b < ceil((float) numElements / nn.batchSize); b++) {
+        unsigned int batchSize = numElements - b * nn.batchSize;
+        if(batchSize > nn.batchSize) batchSize = nn.batchSize;
+
+        // printf("Batch %d (size %d)\n", b, batchSize);
+
+        for(unsigned int i = 0; i < batchSize; i++) {
+            clm_Vector input = inputs[b * nn.batchSize + i];
+            clm_Vector output = expectedOutputs[b * nn.batchSize + i];
+            batchInputs[i] = clm_matrixWrapArray(input.values, input.length);
+            batchOutputs[i] = clm_matrixWrapArray(output.values, output.length);
+        }
+
+        // Forward pass
+        clm_Matrix *currentXs = batchInputs;
+        for(unsigned int i = 0; i < nn.numLayers; i++) {
+            clm_linearForward(&nn.layers[i], batchSize, currentXs, nn.layers[i].output);
+            currentXs = nn.layers[i].output;
+        }
+
+        for(int i = nn.numLayers - 1; i >= 0; i--) {
+            clm_Linear layer = nn.layers[i];
+            clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
+            clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
+            clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
+            clm_Matrix error = layer.error;
+
+            if(i == nn.numLayers - 1) {
+                clm_matrixSubtractMatrix(clm_matrixCopy(batchOutputs[0], error), outputsOfThisLayer[0]); // yhat - y
+            } else {
+                clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
+                clm_matrixMultiplyMatrix(weightsT, prevError, error);
+            }
+
+            clm_Matrix gradient = clm_matrixDSigmoid(outputsOfThisLayer[0]); // dsig(yhat)
+            clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
+            clm_matrixMultiplyScalar(gradient, nn.learnRate);
+
+            clm_Matrix inputT = clm_matrixTranspose(inputsToThisLayer[0]);
+            clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
+
+            clm_matrixAddMatrix(layer.weights, layer.weightsError);
+            clm_matrixAddMatrix(layer.bias, gradient);
+        }
     }
+
+    free(batchInputs);
+    free(batchOutputs);
 }
 
 void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
@@ -175,18 +190,17 @@ int main() {
     clm_Linear layers[] = {
         clm_linearCreateRandom(i, h),
         clm_linearCreateRandom(h, o)};
-    clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
+    clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 100);
 
     for(unsigned int epoch = 0; epoch < 1; epoch++) {
         printf("Epoch %u\n", epoch);
-        for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
+        /*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
             if(idx % 1000 == 0) {
                 printf("\r%.2f%%", idx / (float) imageCount * 100);
                 fflush(stdout);
             }
-            train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
-        }
+        }*/
+        train(nn, imageCount, images, labels);
         printf("\n");
     }
@@ -196,7 +210,7 @@ int main() {
     for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
         // printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
        // train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
-        float *pred = predict(nn, images[idx].values, images[idx].length);
+        float *pred = predict(nn, images[idx]);
         unsigned int predDigit = 0;
         float max = -1;
         for(unsigned int j = 0; j < 10; j++) {
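With this change the per-sample loop in main() is gone: train() consumes the whole dataset and slices it into nn.batchSize-sized chunks internally, while inference stays per-sample. The new call pattern, as used above:

    train(nn, imageCount, images, labels);   // whole dataset, batched internally
    float *pred = predict(nn, images[idx]);  // single sample, batch of one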

View File

@@ -34,14 +34,13 @@ void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
     mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
 }
 
-// clm_Matrix input;
-// clm_Matrix weights;
-// clm_Matrix bias;
-// clm_Matrix output;
-__kernel void linear_forward(cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
+__kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
     // FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
-    mat_multiply(weights, weights_values, input, input_values, out, out_values);
-    mat_add(out, out_values, bias, bias_values);
-    mat_sigmoid(out, out_values);
+    for(unsigned int b = 0; b < batchSize; b++) {
+        __global float *batchInput_values = input_values + b * input.rows * input.cols;
+        __global float *batchOut_values = out_values + b * out.rows * out.cols;
+        mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values);
+        mat_add(out, batchOut_values, bias, bias_values);
+        mat_sigmoid(out, batchOut_values);
+    }
 }
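Each work-item still covers one output cell of a single matrix (the host enqueues workSize = out.rows * out.cols items) and walks the batch serially via the loop above. A hypothetical alternative, not part of this commit, would fold the batch into the global work size instead of looping:

    // hypothetical host-side sizing: one work-item per output cell per batch element
    size_t globalSize = (size_t) batchSize * matOut.rows * matOut.cols;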