Improve some code, Add memory leak

MrLetsplay 2024-01-24 12:16:17 +01:00
parent 8240f7c01b
commit 2f39db3f52
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
11 changed files with 553 additions and 80 deletions


@ -2,10 +2,14 @@ SRC=src
BUILD=build
CFLAGS=-Wall -g
CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c
CLM_SOURCES_CPU=$(CLM_SOURCES) $(SRC)/clm_gpu_cpu.c
CLM_SOURCES_OPENCL=$(CLM_SOURCES) $(SRC)/clm_gpu_opencl.c
.PHONY: all
all:
mkdir -p $(BUILD)
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(CLM_SOURCES_OPENCL)
.PHONY: run
run: all
@ -14,7 +18,7 @@ run: all
.PHONY: cl
cl:
mkdir -p $(BUILD)
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(CLM_SOURCES_OPENCL)
.PHONY: cl_run
cl_run: cl


@ -3,6 +3,7 @@
#include <math.h>
#include "clm.h"
#include "util.h"
#include <CL/cl.h>
#include <stdbool.h>
@ -15,16 +16,6 @@ typedef struct __attribute__((packed)) {
cl_char transposed;
} cl_GPUMat;
char *loadFile(const char *path) {
FILE *file = fopen(path, "r");
fseek(file, 0, SEEK_END);
size_t length = ftell(file);
fseek(file, 0, SEEK_SET);
char *buffer = calloc(1, length + 1);
fread(buffer, length, 1, file);
return buffer;
}
int main() {
// Connect to a compute device
int useGPU = true;


@ -10,7 +10,7 @@ const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
printf("CREATING MATRIX\n");
clm_Matrix mat;
clm_Matrix mat = {0};
mat.rows = rows;
mat.cols = cols;
mat.values = calloc(rows * cols, sizeof(float));
@ -98,7 +98,7 @@ clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
}
clm_Matrix clm_matrixTranspose(clm_Matrix mat) {
clm_Matrix tr;
clm_Matrix tr = {0};
tr.cols = mat.rows;
tr.rows = mat.cols;
tr.values = mat.values;
@ -183,7 +183,7 @@ clm_Matrix clm_matrixFromArray(float *array, unsigned int length) {
}
clm_Matrix clm_matrixWrapArray(float *array, unsigned int length) {
clm_Matrix mat;
clm_Matrix mat = {0};
mat.rows = length;
mat.cols = 1;
mat.values = array;
@ -195,7 +195,7 @@ bool clm_matrixIsInvalid(clm_Matrix mat) {
}
clm_Vector clm_vectorCreate(unsigned int length) {
clm_Vector vector;
clm_Vector vector = {0};
vector.length = length;
vector.values = calloc(length, sizeof(float));
return vector;
@ -206,13 +206,13 @@ bool clm_vectorIsInvalid(clm_Vector vec) {
}
clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
clm_Linear linear;
clm_Linear linear = {0};
linear.weights = clm_createMatrixRandom(outputs, inputs);
linear.bias = clm_createMatrixRandom(outputs, 1);
// linear.output = clm_createMatrix(outputs, 1);
linear.error = clm_createMatrix(outputs, 1);
linear.weightsError = clm_createMatrix(outputs, inputs);
// linear.error = clm_createMatrix(outputs, 1);
// linear.weightsError = clm_createMatrix(outputs, inputs);
return linear;
}
@ -221,8 +221,15 @@ clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate,
for(unsigned int i = 0; i < numLayers; i++) {
layers[i].output = calloc(batchSize, sizeof(clm_Matrix));
layers[i].error = calloc(batchSize, sizeof(clm_Matrix));
layers[i].weightsError = calloc(batchSize, sizeof(clm_Matrix));
layers[i].gradient = calloc(batchSize, sizeof(clm_Matrix));
for(unsigned int j = 0; j < batchSize; j++) {
layers[i].output[j] = clm_createMatrix(layers[i].weights.rows, 1);
layers[i].error[j] = clm_createMatrix(layers[i].weights.rows, 1);
layers[i].weightsError[j] = clm_createMatrix(layers[i].weights.rows, layers[i].weights.cols);
layers[i].gradient[j] = clm_createMatrix(layers[i].weights.rows, 1);
}
}
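
The commit title already owns up to this: the per-batch matrices allocated here (and the arrays holding them) are never freed again anywhere in this diff. A minimal cleanup sketch, assuming the field names shown in this diff, that batchSize is the same value passed to clm_nnCreate, and that each matrix still owns the buffer calloc'd for it by clm_createMatrix; the helper name is hypothetical and not part of the codebase:

#include "clm.h"
#include <stdlib.h>

/* Hypothetical helper (not part of this commit): frees the per-batch buffers
 * allocated in clm_nnCreate above. */
void clm_nnFreeBatchBuffers(clm_NN nn) {
    for(unsigned int i = 0; i < nn.numLayers; i++) {
        clm_Linear *l = &nn.layers[i];
        for(unsigned int j = 0; j < nn.batchSize; j++) {
            free(l->output[j].values);       /* calloc'd by clm_createMatrix */
            free(l->error[j].values);
            free(l->weightsError[j].values);
            free(l->gradient[j].values);
        }
        free(l->output);
        free(l->error);
        free(l->weightsError);
        free(l->gradient);
    }
}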


@ -23,11 +23,16 @@ typedef struct {
clm_Matrix weights;
clm_Matrix bias;
clm_Matrix *output;
clm_Matrix error;
clm_Matrix weightsError;
clm_Matrix *error;
clm_Matrix *weightsError;
clm_Matrix *gradient;
clm_NativeBuf *nativeWeights;
clm_NativeBuf *nativeBias;
clm_NativeBuf *nativeOutput;
clm_NativeBuf *nativeInputError;
clm_NativeBuf *nativeOutputError;
clm_NativeBuf *nativeWeightsError;
clm_NativeBuf *nativeGradient;
} clm_Linear;
typedef struct {


@ -7,5 +7,6 @@ int clm_gpuInit();
void clm_gpuDestroy();
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
#endif


@ -1,3 +1,4 @@
#include "clm.h"
#include "clm_gpu.h"
#include <stdio.h>
@ -23,3 +24,30 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
clm_matrixSigmoid(outputs[b]);
}
}
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
for(unsigned int b = 0; b < batchSize; b++) {
// clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
// clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
// clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
// clm_Matrix error = linear.error;
// GPU step 1
clm_matrixDSigmoid(clm_matrixCopy(outputs[b], outputGradients[b])); // dsig(yhat)
clm_matrixMultiplyMatrixElements(outputGradients[b], inputErrors[b]); // (yhat - y) . dsig(yhat)
clm_matrixMultiplyScalar(outputGradients[b], learnRate);
// GPU Step 2
clm_Matrix inputT = clm_matrixTranspose(inputs[b]);
clm_matrixMultiplyMatrix(outputGradients[b], inputT, outputWeightsErrors[b]);
// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
// clm_matrixAddMatrix(linear->bias, gradient);
if(updateErrors) {
// GPU Step 2 extended (opt)
clm_Matrix weightsT = clm_matrixTranspose(linear->weights);
clm_matrixMultiplyMatrix(weightsT, inputErrors[b], outputErrors[b]);
}
}
}
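
Reading the calls and comments above, this is a sketch of the math computed for each batch element b, with \sigma'(\hat{y}) = \hat{y}(1 - \hat{y}) because \hat{y} is already a sigmoid output:

    g_b = \eta \, \sigma'(\hat{y}_b) \odot e_b
    \Delta W_b = g_b \, x_b^{T}
    e_b^{\mathrm{prev}} = W^{T} e_b    (only when updateErrors is set)

Here \eta is learnRate, e_b is the incoming inputErrors[b], x_b the layer input, and W the layer weights; the caller adds \Delta W_b to the weights and g_b to the bias, as the clm_matrixAddMatrix calls in the training loop later in this diff do.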


@ -1,7 +1,8 @@
#include "clm_gpu.h"
#include <math.h>
#define CL_TARGET_OPENCL_VERSION 200
#include "clm_util.h"
#include <CL/cl_platform.h>
#include <CL/cl.h>
@ -12,14 +13,18 @@
static cl_context context;
static cl_device_id deviceID;
static cl_command_queue queue;
static cl_kernel kernel;
static cl_kernel kernelLinearForward;
static cl_kernel kernelLinearBackprop1;
static cl_kernel kernelLinearBackprop2;
static size_t kernelLinearForwardLocal;
static size_t kernelLinearBackprop1Local;
static size_t kernelLinearBackprop2Local;
struct clm_NativeBuf {
cl_mem mem;
};
typedef struct __attribute__((packed)) {
cl_uint rows;
cl_uint cols;
cl_char transposed;
@ -28,16 +33,6 @@ typedef struct __attribute__((packed)) {
#define gpuMat(mat) \
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
char *loadFile(const char *path) {
FILE *file = fopen(path, "r");
fseek(file, 0, SEEK_END);
size_t length = ftell(file);
fseek(file, 0, SEEK_SET);
char *buffer = calloc(1, length + 1);
fread(buffer, length, 1, file);
return buffer;
}
int clm_gpuInit() {
// Connect to a compute device
int useGPU = true;
@ -47,7 +42,12 @@ int clm_gpuInit() {
return 1;
}
char *buffer = loadFile("src/mat.cl");
char *buffer = clm_loadFile("src/mat.cl");
if(!buffer) {
printf("Failed to load mat kernel\n");
return 1;
}
printf("%s", buffer);
context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
@ -76,9 +76,19 @@ int clm_gpuInit() {
return 1;
}
kernel = clCreateKernel(program, "linear_forward", &err);
if(!kernel) {
printf("Failed to create kernel\n");
kernelLinearForward = clCreateKernel(program, "linear_forward", &err);
kernelLinearBackprop1 = clCreateKernel(program, "linear_backprop_1", &err);
kernelLinearBackprop2 = clCreateKernel(program, "linear_backprop_2", &err);
if(!kernelLinearForward || !kernelLinearBackprop1 || !kernelLinearBackprop2) {
printf("Failed to create kernels: %d\n", err);
return 1;
}
err = clGetKernelWorkGroupInfo(kernelLinearForward, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearForwardLocal), &kernelLinearForwardLocal, NULL);
err |= clGetKernelWorkGroupInfo(kernelLinearBackprop1, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearBackprop1Local), &kernelLinearBackprop1Local, NULL);
err |= clGetKernelWorkGroupInfo(kernelLinearBackprop2, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearBackprop2Local), &kernelLinearBackprop2Local, NULL);
if(err != CL_SUCCESS) {
printf("Failed to get work group sizes\n");
return 1;
}
@ -88,13 +98,13 @@ int clm_gpuInit() {
void clm_gpuDestroy() {
}
static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nativeBuf) {
static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
cl_int err;
if(!nativeBuf->mem) {
cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
cl_mem mat_values = clCreateBuffer(context, flags, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
if(!mat_values) {
printf("Failed to alloc buffer: %d\n", err);
printf("Failed to alloc buffer: %s\n", clm_clErrorToString(err));
return NULL;
}
nativeBuf->mem = mat_values;
@ -103,27 +113,27 @@ static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nati
return nativeBuf->mem;
}
static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
cl_int err;
cl_mem mem = allocGPUMat(gpuMat, 1, nativeBuf);
cl_mem mem = allocGPUMat(gpuMat, 1, flags, nativeBuf);
err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue write: %d\n", err);
printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
return NULL;
}
return mem;
}
static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
cl_int err;
cl_mem mem = allocGPUMat(gpuMat, numMats, nativeBuf);
cl_mem mem = allocGPUMat(gpuMat, numMats, flags, nativeBuf);
for(unsigned int i = 0; i < numMats; i++) {
err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue write: %d\n", err);
printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
return NULL;
}
}
@ -131,6 +141,19 @@ static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *m
return mem;
}
static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
cl_int err;
cl_mem mem = nativeBuf->mem;
for(unsigned int i = 0; i < numMats; i++) {
err = clEnqueueReadBuffer(queue, mem, CL_TRUE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
return;
}
}
}
clm_NativeBuf nativeInput;
// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
@ -152,32 +175,33 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
cl_int err;
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, &nativeInput);
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
// TODO: make sure to always alloc nn.batchSize, not batchSize
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
cl_mem matBias_values = writeGPUMat(matBias, linear->bias, CL_MEM_READ_ONLY, linear->nativeBias);
if(!matInput_values || !matWeights_values || !matBias_values) {
// linear->output = INVALID_MATRIX;
return;
}
cl_mem matOut_values = allocGPUMat(matOut, batchSize, linear->nativeOutput);
cl_mem matOut_values = allocGPUMat(matOut, batchSize, CL_MEM_READ_WRITE, linear->nativeOutput);
if(!matOut_values) {
// linear->output = INVALID_MATRIX;
return;
}
err = 0;
err |= clSetKernelArg(kernel, 0, sizeof(cl_uint), &batchSize);
err |= clSetKernelArg(kernel, 1, sizeof(matInput), &matInput);
err |= clSetKernelArg(kernel, 2, sizeof(matInput_values), &matInput_values);
err |= clSetKernelArg(kernel, 3, sizeof(matWeights), &matWeights);
err |= clSetKernelArg(kernel, 4, sizeof(matWeights_values), &matWeights_values);
err |= clSetKernelArg(kernel, 5, sizeof(matBias), &matBias);
err |= clSetKernelArg(kernel, 6, sizeof(matBias_values), &matBias_values);
err |= clSetKernelArg(kernel, 7, sizeof(matOut), &matOut);
err |= clSetKernelArg(kernel, 8, sizeof(matOut_values), &matOut_values);
err |= clSetKernelArg(kernelLinearForward, 0, sizeof(cl_uint), &batchSize);
err |= clSetKernelArg(kernelLinearForward, 1, sizeof(matInput), &matInput);
err |= clSetKernelArg(kernelLinearForward, 2, sizeof(matInput_values), &matInput_values);
err |= clSetKernelArg(kernelLinearForward, 3, sizeof(matWeights), &matWeights);
err |= clSetKernelArg(kernelLinearForward, 4, sizeof(matWeights_values), &matWeights_values);
err |= clSetKernelArg(kernelLinearForward, 5, sizeof(matBias), &matBias);
err |= clSetKernelArg(kernelLinearForward, 6, sizeof(matBias_values), &matBias_values);
err |= clSetKernelArg(kernelLinearForward, 7, sizeof(matOut), &matOut);
err |= clSetKernelArg(kernelLinearForward, 8, sizeof(matOut_values), &matOut_values);
if(err != CL_SUCCESS) {
printf("Failed to set kernel args: %d\n", err);
printf("Failed to set kernel args (1): %d\n", err);
return;
}
@ -185,25 +209,121 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
printf("INFO: %s\n", info);*/
size_t local;
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if(err != CL_SUCCESS) {
printf("Failed to get work group size\n");
return;
}
size_t global = ceil((float) workSize / local) * local;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
size_t global = ceil((float) workSize / kernelLinearForwardLocal) * kernelLinearForwardLocal;
err = clEnqueueNDRangeKernel(queue, kernelLinearForward, 1, NULL, &global, &kernelLinearForwardLocal, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue: %d\n", err);
return;
}
clFlush(queue);
clFinish(queue);
err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
/*err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to read from buffer\n");
return;
}
}*/
readGPUMats(matOut, batchSize, outputs, linear->nativeOutput);
}
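
Note on the launch size: the global work size is workSize rounded up to the next multiple of the cached work-group size, and surplus work items fall through the idx-range guards (if(idx >= mat.rows * mat.cols) return;) in the mat.cl helpers. Illustrative numbers only, not taken from this code: with workSize = 300 and kernelLinearForwardLocal = 64, global = ceil(300 / 64.0) * 64 = 5 * 64 = 320, so the last 20 work items do nothing.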
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
if(batchSize == 0) return;
if(!linear->nativeInputError) {
linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
}
cl_GPUMat matInput = gpuMat(inputs[0]);
cl_GPUMat matWeights = gpuMat(linear->weights);
cl_GPUMat matOutput = gpuMat(outputs[0]);
cl_GPUMat matInputErrors = gpuMat(inputErrors[0]);
cl_GPUMat matOutputErrors = !updateErrors ? (cl_GPUMat){0} : (cl_GPUMat) gpuMat(outputErrors[0]);
cl_GPUMat matOutputWeightsErrors = gpuMat(outputWeightsErrors[0]);
cl_GPUMat matOutputGradients = gpuMat(outputGradients[0]);
cl_int err;
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
cl_mem matInputErrors_values = writeGPUMats(matInputErrors, batchSize, inputErrors, CL_MEM_READ_ONLY, linear->nativeInputError);
cl_mem matOutput_values = writeGPUMats(matOutput, batchSize, outputs, CL_MEM_READ_WRITE, linear->nativeOutput);
if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) {
printf("Failed to write GPU mats\n");
return;
}
// cl_mem matOutput_values = allocGPUMat(matOutput, batchSize, CL_MEM_READ_ONLY, linear->nativeOutput);
cl_mem matOutputErrors_values = !updateErrors ? NULL : allocGPUMat(matOutputErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeOutputError);
cl_mem matOutputWeightsErrors_values = allocGPUMat(matOutputWeightsErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeWeightsError);
cl_mem matOutputGradients_values = allocGPUMat(matOutputGradients, batchSize, CL_MEM_READ_WRITE, linear->nativeGradient);
if(!matOutputWeightsErrors_values || !matOutputGradients_values) {
printf("Failed to alloc GPU mats\n");
return;
}
err = 0;
err |= clSetKernelArg(kernelLinearBackprop1, 0, sizeof(cl_uint), &batchSize);
err |= clSetKernelArg(kernelLinearBackprop1, 1, sizeof(cl_float), &learnRate);
err |= clSetKernelArg(kernelLinearBackprop1, 2, sizeof(matWeights), &matWeights);
err |= clSetKernelArg(kernelLinearBackprop1, 3, sizeof(matWeights_values), &matWeights_values);
err |= clSetKernelArg(kernelLinearBackprop1, 4, sizeof(matOutput), &matOutput);
err |= clSetKernelArg(kernelLinearBackprop1, 5, sizeof(matOutput_values), &matOutput_values);
err |= clSetKernelArg(kernelLinearBackprop1, 6, sizeof(matInputErrors), &matInputErrors);
err |= clSetKernelArg(kernelLinearBackprop1, 7, sizeof(matInputErrors_values), &matInputErrors_values);
err |= clSetKernelArg(kernelLinearBackprop1, 8, sizeof(matOutputGradients), &matOutputGradients);
err |= clSetKernelArg(kernelLinearBackprop1, 9, sizeof(matOutputGradients_values), &matOutputGradients_values);
if(err != CL_SUCCESS) {
printf("Failed to set kernel args (2): %d\n", err);
return;
}
size_t step1WorkSize = matOutputGradients.rows * matOutputGradients.cols;
size_t step1Global = ceil((float) step1WorkSize / kernelLinearBackprop1Local) * kernelLinearBackprop1Local;
clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL);
clFlush(queue);
clFinish(queue);
err = 0;
err |= clSetKernelArg(kernelLinearBackprop2, 0, sizeof(cl_uint), &batchSize);
err |= clSetKernelArg(kernelLinearBackprop2, 1, sizeof(matWeights), &matWeights);
err |= clSetKernelArg(kernelLinearBackprop2, 2, sizeof(matWeights_values), &matWeights_values);
err |= clSetKernelArg(kernelLinearBackprop2, 3, sizeof(matInput), &matInput);
err |= clSetKernelArg(kernelLinearBackprop2, 4, sizeof(matInput_values), &matInput_values);
err |= clSetKernelArg(kernelLinearBackprop2, 5, sizeof(matInputErrors), &matInputErrors);
err |= clSetKernelArg(kernelLinearBackprop2, 6, sizeof(matInputErrors_values), &matInputErrors_values);
err |= clSetKernelArg(kernelLinearBackprop2, 7, sizeof(cl_char), &updateErrors);
if(updateErrors) {
err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors);
err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), &matOutputErrors_values);
}
err |= clSetKernelArg(kernelLinearBackprop2, 10, sizeof(matOutputWeightsErrors), &matOutputWeightsErrors);
err |= clSetKernelArg(kernelLinearBackprop2, 11, sizeof(matOutputWeightsErrors_values), &matOutputWeightsErrors_values);
err |= clSetKernelArg(kernelLinearBackprop2, 12, sizeof(matOutputGradients), &matOutputGradients);
err |= clSetKernelArg(kernelLinearBackprop2, 13, sizeof(matOutputGradients_values), &matOutputGradients_values);
if(err != CL_SUCCESS) {
printf("Failed to set kernel args (3): %d\n", err);
return;
}
size_t step2WorkSize = matOutputWeightsErrors.rows * matOutputWeightsErrors.cols;
size_t step2Global = ceil((float) step2WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local;
clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL);
clFlush(queue);
clFinish(queue);
readGPUMats(matOutputGradients, batchSize, outputGradients, linear->nativeGradient);
readGPUMats(matOutputWeightsErrors, batchSize, outputWeightsErrors, linear->nativeWeightsError);
if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputError);
}

src/clm_util.c (new file, 195 lines)

@ -0,0 +1,195 @@
#include "clm_util.h"
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char *clm_loadFile(const char *path) {
FILE *file = fopen(path, "r");
if(!file) {
printf("fopen failed: %s\n", strerror(errno));
goto error;
}
if(fseek(file, 0, SEEK_END) < 0) {
printf("fseek failed: %s\n", strerror(errno));
goto cleanup_file;
}
long length = ftell(file);
if(length < 0) {
printf("ftell failed: %s\n", strerror(errno));
goto cleanup_file;
}
if(fseek(file, 0, SEEK_SET) < 0) {
printf("fseek failed: %s\n", strerror(errno));
goto cleanup_file;
}
char *buffer = calloc(1, length + 1);
if(!buffer) {
printf("calloc failed: %s\n", strerror(errno));
goto cleanup_file;
}
size_t nRead = fread(buffer, length, 1, file);
if(nRead != 1) {
printf("fread failed: (nRead = %zu, err %d, eof %d)\n", nRead, ferror(file), feof(file));
goto cleanup_buffer;
}
fclose(file);
return buffer;
cleanup_buffer:
free(buffer);
cleanup_file:
fclose(file);
error:
return NULL;
}
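
The returned buffer is heap-allocated and NUL-terminated; the caller owns it and is responsible for freeing it. A short usage sketch (everything outside clm_loadFile and clm_util.h is illustrative only):

#include "clm_util.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    char *src = clm_loadFile("src/mat.cl"); // NUL-terminated file contents, or NULL
    if(!src) return 1;                      // clm_loadFile already printed the reason
    printf("loaded %zu bytes\n", strlen(src));
    free(src);                              // caller owns the buffer
    return 0;
}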
const char *clm_clErrorToString(cl_int error) {
// Source: https://stackoverflow.com/questions/24326432/convenient-way-to-show-opencl-error-codes
switch(error) {
// run-time and JIT compiler errors
case 0:
return "CL_SUCCESS";
case -1:
return "CL_DEVICE_NOT_FOUND";
case -2:
return "CL_DEVICE_NOT_AVAILABLE";
case -3:
return "CL_COMPILER_NOT_AVAILABLE";
case -4:
return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
case -5:
return "CL_OUT_OF_RESOURCES";
case -6:
return "CL_OUT_OF_HOST_MEMORY";
case -7:
return "CL_PROFILING_INFO_NOT_AVAILABLE";
case -8:
return "CL_MEM_COPY_OVERLAP";
case -9:
return "CL_IMAGE_FORMAT_MISMATCH";
case -10:
return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
case -11:
return "CL_BUILD_PROGRAM_FAILURE";
case -12:
return "CL_MAP_FAILURE";
case -13:
return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
case -14:
return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
case -15:
return "CL_COMPILE_PROGRAM_FAILURE";
case -16:
return "CL_LINKER_NOT_AVAILABLE";
case -17:
return "CL_LINK_PROGRAM_FAILURE";
case -18:
return "CL_DEVICE_PARTITION_FAILED";
case -19:
return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
// compile-time errors
case -30:
return "CL_INVALID_VALUE";
case -31:
return "CL_INVALID_DEVICE_TYPE";
case -32:
return "CL_INVALID_PLATFORM";
case -33:
return "CL_INVALID_DEVICE";
case -34:
return "CL_INVALID_CONTEXT";
case -35:
return "CL_INVALID_QUEUE_PROPERTIES";
case -36:
return "CL_INVALID_COMMAND_QUEUE";
case -37:
return "CL_INVALID_HOST_PTR";
case -38:
return "CL_INVALID_MEM_OBJECT";
case -39:
return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
case -40:
return "CL_INVALID_IMAGE_SIZE";
case -41:
return "CL_INVALID_SAMPLER";
case -42:
return "CL_INVALID_BINARY";
case -43:
return "CL_INVALID_BUILD_OPTIONS";
case -44:
return "CL_INVALID_PROGRAM";
case -45:
return "CL_INVALID_PROGRAM_EXECUTABLE";
case -46:
return "CL_INVALID_KERNEL_NAME";
case -47:
return "CL_INVALID_KERNEL_DEFINITION";
case -48:
return "CL_INVALID_KERNEL";
case -49:
return "CL_INVALID_ARG_INDEX";
case -50:
return "CL_INVALID_ARG_VALUE";
case -51:
return "CL_INVALID_ARG_SIZE";
case -52:
return "CL_INVALID_KERNEL_ARGS";
case -53:
return "CL_INVALID_WORK_DIMENSION";
case -54:
return "CL_INVALID_WORK_GROUP_SIZE";
case -55:
return "CL_INVALID_WORK_ITEM_SIZE";
case -56:
return "CL_INVALID_GLOBAL_OFFSET";
case -57:
return "CL_INVALID_EVENT_WAIT_LIST";
case -58:
return "CL_INVALID_EVENT";
case -59:
return "CL_INVALID_OPERATION";
case -60:
return "CL_INVALID_GL_OBJECT";
case -61:
return "CL_INVALID_BUFFER_SIZE";
case -62:
return "CL_INVALID_MIP_LEVEL";
case -63:
return "CL_INVALID_GLOBAL_WORK_SIZE";
case -64:
return "CL_INVALID_PROPERTY";
case -65:
return "CL_INVALID_IMAGE_DESCRIPTOR";
case -66:
return "CL_INVALID_COMPILER_OPTIONS";
case -67:
return "CL_INVALID_LINKER_OPTIONS";
case -68:
return "CL_INVALID_DEVICE_PARTITION_COUNT";
// extension errors
case -1000:
return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
case -1001:
return "CL_PLATFORM_NOT_FOUND_KHR";
case -1002:
return "CL_INVALID_D3D10_DEVICE_KHR";
case -1003:
return "CL_INVALID_D3D10_RESOURCE_KHR";
case -1004:
return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
case -1005:
return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
default:
return "Unknown OpenCL error";
}
}

src/clm_util.h (new file, 11 lines)

@ -0,0 +1,11 @@
#ifndef _UTIL_H_
#define _UTIL_H_
#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl_platform.h>
char *clm_loadFile(const char *path);
const char *clm_clErrorToString(cl_int error);
#endif


@ -37,7 +37,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
unsigned int batchSize = numElements - b * nn.batchSize;
if(batchSize > nn.batchSize) batchSize = nn.batchSize;
// printf("Batch %d (size %d)\n", b, batchSize);
printf("Batch %d (size %d)\n", b, batchSize);
for(unsigned int i = 0; i < batchSize; i++) {
clm_Vector input = inputs[b * nn.batchSize + i];
@ -53,7 +53,26 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
currentXs = nn.layers[i].output;
}
clm_Linear lastLayer = nn.layers[nn.numLayers - 1];
for(unsigned int b = 0; b < batchSize; b++) {
// Error of last layer = y - yhat
clm_matrixCopy(batchOutputs[b], lastLayer.error[b]); // lastLayer.error = y
clm_matrixSubtractMatrix(lastLayer.error[b], lastLayer.output[b]); // lastLayer.error -= yhat
}
for(int i = nn.numLayers - 1; i >= 0; i--) {
clm_Linear layer = nn.layers[i];
clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
clm_Matrix *outputsOfThisLayer = layer.output;
clm_linearBackprop(&layer, nn.learnRate, batchSize, inputsToThisLayer, outputsOfThisLayer, layer.error, i > 0, i == 0 ? NULL : nn.layers[i - 1].error, layer.weightsError, layer.gradient);
for(unsigned int b = 0; b < batchSize; b++) {
clm_matrixAddMatrix(layer.weights, layer.weightsError[b]);
clm_matrixAddMatrix(layer.bias, layer.gradient[b]);
}
}
/*for(int i = nn.numLayers - 1; i >= 0; i--) {
clm_Linear layer = nn.layers[i];
clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
@ -76,7 +95,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
clm_matrixAddMatrix(layer.weights, layer.weightsError);
clm_matrixAddMatrix(layer.bias, gradient);
}
}*/
}
free(batchInputs);
@ -190,7 +209,7 @@ int main() {
clm_Linear layers[] = {
clm_linearCreateRandom(i, h),
clm_linearCreateRandom(h, o)};
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 100);
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 1000);
for(unsigned int epoch = 0; epoch < 1; epoch++) {
printf("Epoch %u\n", epoch);
@ -220,7 +239,7 @@ int main() {
predDigit = j;
}
}
if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
unsigned int actDigit = 0;
float maxA = -1;
@ -231,7 +250,7 @@ int main() {
actDigit = j;
}
}
if(idx < 100) printf("Actual: %u\n", actDigit);
// if(idx < 100) printf("Actual: %u\n", actDigit);
// printf("\n");
if(predDigit == actDigit) correct++;


@ -6,12 +6,23 @@ typedef struct __attribute__((packed)) {
#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
#define matrixGetIJ(mat, idx, i, j) \
{ \
if(!mat.transposed) { \
i = idx / mat.cols; \
j = idx % mat.cols; \
} else { \
i = idx / mat.rows; \
j = idx % mat.rows; \
} \
}
void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
uint idx = get_global_id(0);
if(idx >= matOut.rows * matOut.cols) return;
uint i = idx / matOut.cols;
uint j = idx % matOut.cols;
uint i, j;
matrixGetIJ(matOut, idx, i, j);
float sum = 0;
for(unsigned int k = 0; k < matA.cols; k++) {
@ -34,7 +45,40 @@ void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
}
__kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
void mat_dSigmoid(cl_GPUMat mat, __global float *mat_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
float v = mat_values[idx];
mat_values[idx] = v * (1 - v);
}
void mat_copy(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
other_values[idx] = mat_values[idx];
}
void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
other_values[idx] *= mat_values[idx];
}
void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
mat_values[idx] *= scalar;
}
__kernel void linear_forward(unsigned int batchSize,
cl_GPUMat input, __global float *input_values,
cl_GPUMat weights, __global float *weights_values,
cl_GPUMat bias, __global float *bias_values,
cl_GPUMat out, __global float *out_values) {
// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
for(unsigned int b = 0; b < batchSize; b++) {
__global float *batchInput_values = input_values + b * input.rows * input.cols;
@ -44,3 +88,51 @@ __kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global f
mat_sigmoid(out, batchOut_values);
}
}
__kernel void linear_backprop_1(unsigned int batchSize, float learnRate,
cl_GPUMat weights, __global float *weights_values,
cl_GPUMat outputs, __global float *outputs_values,
cl_GPUMat inputErrors, __global float *inputErrors_values,
cl_GPUMat outputGradients, __global float *outputGradients_values) {
for(unsigned int b = 0; b < batchSize; b++) {
__global float *batchOut_values = outputs_values + b * outputs.rows * outputs.cols;
__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
mat_copy(outputs, batchOut_values, outputGradients, batchOutGradients_values);
mat_dSigmoid(outputGradients, batchOutGradients_values); // dsig(yhat)
mat_multiply_elements(outputGradients, batchOutGradients_values, inputErrors, batchInErrors_values); // (yhat - y) . dsig(yhat)
mat_multiply_scalar(outputGradients, batchOutGradients_values, learnRate);
}
}
__kernel void linear_backprop_2(unsigned int batchSize,
cl_GPUMat weights, __global float *weights_values,
cl_GPUMat inputs, __global float *inputs_values,
cl_GPUMat inputErrors, __global float *inputErrors_values,
char updateErrors,
cl_GPUMat outputErrors, __global float *outputErrors_values,
cl_GPUMat outputWeightsErrors, __global float *outputWeightsErrors_values,
cl_GPUMat outputGradients, __global float *outputGradients_values) {
for(unsigned int b = 0; b < batchSize; b++) {
__global float *batchInput_values = inputs_values + b * inputs.rows * inputs.cols;
__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
__global float *batchOutErrors_values = outputErrors_values + b * outputErrors.rows * outputErrors.cols;
__global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols;
__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
cl_GPUMat inputsT = inputs;
inputsT.transposed = true;
mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
// clm_matrixAddMatrix(linear->bias, gradient);
if(updateErrors) {
cl_GPUMat weightsT = weights;
weightsT.transposed = true;
mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
}
}
}
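
For clarity, a small worked example of the transposed indexing behind matrixAt (values chosen purely for illustration):

// Original 2x3 matrix, row-major storage: values = { a, b, c, d, e, f }
// A transposed view of it has rows = 3, cols = 2, transposed = true and shares values.
// matrixAt(viewT, values, 2, 1) takes the transposed branch: c * rows + r = 1 * 3 + 2 = 5,
// so it reads values[5] = f, which is element (1, 2) of the original matrix, as expected.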