Fix OpenCL bugs (WIP)

MrLetsplay 2024-02-21 23:45:11 +01:00
parent 4c70af2496
commit 707a9bf754
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
9 changed files with 38 additions and 31 deletions

======== file 1 of 9 ========

@@ -5,14 +5,14 @@ CFLAGS=-Wall -g
 CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c $(SRC)/clm_funcs.c

 .PHONY: all
-all: libclm_cpu libclm_opencl cltest
+all: libclm libclm_cpu libclm_opencl cltest

 .PHONY: run_cpu
-run: all
+run_cpu: all
 	cd $(BUILD) && ./cltest cpu

 .PHONY: run_opencl
-run: all
+run_opencl: all
 	cd $(BUILD) && ./cltest opencl

 .PHONY: cltest
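
Note on the target rename: with both recipes previously attached to the same run: target, GNU Make warns about an overriding recipe and only the last definition survives, so "make run" could never reach the CPU test. Splitting the targets into run_cpu and run_opencl matches the .PHONY declarations that were already in place.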

======== file 2 of 9 ========

@@ -31,6 +31,7 @@ typedef struct {
 	clm_NativeBuf *nativeWeights;
 	clm_NativeBuf *nativeBias;
+	clm_NativeBuf *nativeInput;
 	clm_NativeBuf *nativeOutput;
 	clm_NativeBuf *nativeInputError;
 	clm_NativeBuf *nativeOutputError;

======== file 3 of 9 ========

@@ -11,6 +11,8 @@ int clm_gpuInit(unsigned int mode) {
 void clm_gpuDestroy() {}

+void clm_linearInit(clm_Linear *linear) {}
+
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	for(unsigned int b = 0; b < batchSize; b++) {
 		clm_matrixMultiplyMatrix(linear->weights, inputs[b], outputs[b]);
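
The CPU backend keeps clm_linearInit as an empty stub: it does its matrix math directly on host memory, so there is no per-layer native buffer state to set up. Only the OpenCL backend (below) allocates anything in this call.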

======== file 4 of 9 ========

@@ -5,12 +5,14 @@
 typedef int (*clm_gpuInitFunc)(unsigned int);
 typedef void (*clm_gpuDestroyFunc)();
+typedef void (*clm_linearInitFunc)(clm_Linear *linear);
 typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);

 static void *lib;
 static clm_gpuInitFunc initFunc;
 static clm_gpuDestroyFunc destroyFunc;
+static clm_linearInitFunc linearInitFunc;
 static clm_linearForwardFunc linearForwardFunc;
 static clm_linearBackpropFunc linearBackpropFunc;

@@ -33,6 +35,7 @@ int clm_gpuInit(unsigned int mode) {
 	initFunc = (clm_gpuInitFunc) dlsym(lib, "clm_gpuInit");
 	destroyFunc = (clm_gpuDestroyFunc) dlsym(lib, "clm_gpuDestroy");
+	linearInitFunc = (clm_linearInitFunc) dlsym(lib, "clm_linearInit");
 	linearForwardFunc = (clm_linearForwardFunc) dlsym(lib, "clm_linearForward");
 	linearBackpropFunc = (clm_linearBackpropFunc) dlsym(lib, "clm_linearBackprop");

@@ -49,6 +52,10 @@ void clm_gpuDestroy() {
 	dlclose(lib);
 }

+void clm_linearInit(clm_Linear *linear) {
+	linearInitFunc(linear);
+}
+
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	linearForwardFunc(linear, batchSize, inputs, outputs);
 }
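
One more symbol is now resolved via dlsym, and none of the lookups are checked: if the backend library on disk predates clm_linearInit, linearInitFunc silently becomes NULL and the first clm_linearInit call crashes. A minimal guard, sketched here with only standard dlfcn.h calls (not part of this commit):

	#include <dlfcn.h>
	#include <stdio.h>

	/* Resolve a symbol, reporting a missing one instead of crashing later. */
	static void *resolveOrWarn(void *lib, const char *name) {
		dlerror(); /* clear any stale error state */
		void *sym = dlsym(lib, name);
		const char *err = dlerror();
		if(err) {
			fprintf(stderr, "Error: Failed to resolve %s: %s\n", name, err);
			return NULL;
		}
		return sym;
	}

Each assignment in clm_gpuInit could then go through resolveOrWarn and return nonzero on NULL, mirroring how the existing code reports clGetDeviceIDs failures.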

======== file 5 of 9 ========

@@ -9,6 +9,7 @@
 int clm_gpuInit(unsigned int mode);
 void clm_gpuDestroy();
+void clm_linearInit(clm_Linear *linear);
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);

======== file 6 of 9 ========

@@ -1,3 +1,4 @@
+#include "clm.h"
 #include "clm_gpu.h"

 #include <math.h>

@@ -38,7 +39,7 @@ int clm_gpuInit(unsigned int mode) {
 	int useGPU = true;
 	cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
 	if(err != CL_SUCCESS) {
-		printf("Error: Failed to create a device group!\n");
+		printf("Error: Failed to create a device group\n");
 		return 1;
 	}

@@ -154,18 +155,21 @@ static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, c
 	}
 }

-clm_NativeBuf nativeInput;
+void clm_linearInit(clm_Linear *linear) {
+	linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeInput = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
+}

 // TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	if(batchSize == 0) return;

-	if(!linear->nativeWeights) {
-		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
-	}
-
 	cl_GPUMat matInput = gpuMat(inputs[0]);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matBias = gpuMat(linear->bias);

@@ -176,7 +180,7 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	cl_int err;

 	// TODO: make sure to always alloc nn.batchSize, not batchSize
-	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
 	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, CL_MEM_READ_ONLY, linear->nativeBias);
 	if(!matInput_values || !matWeights_values || !matBias_values) {

@@ -231,16 +235,6 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
 	if(batchSize == 0) return;

-	if(!linear->nativeInputError) {
-		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
-	}
-
 	cl_GPUMat matInput = gpuMat(inputs[0]);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matOutput = gpuMat(outputs[0]);

@@ -251,7 +245,7 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	cl_int err;

-	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
 	cl_mem matInputErrors_values = writeGPUMats(matInputErrors, batchSize, inputErrors, CL_MEM_READ_ONLY, linear->nativeInputError);
 	cl_mem matOutput_values = writeGPUMats(matOutput, batchSize, outputs, CL_MEM_READ_WRITE, linear->nativeOutput);
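
Two things stand out in this file. First, the shared file-global nativeInput was the likely bug: every layer cached its device-side input in the same clm_NativeBuf, so consecutive layers kept clobbering each other's buffer; moving it into clm_Linear gives each layer its own. Second, clm_linearInit now callocs eight clm_NativeBuf structs per layer and nothing in the diff frees them. A matching teardown might look like the sketch below; clm_linearDestroy is a hypothetical name, and whether the cl_mem handles inside clm_NativeBuf also need clReleaseMemObject depends on code not shown here.

	/* Hypothetical counterpart to clm_linearInit (not part of this commit).
	   Any cl_mem held inside clm_NativeBuf would need clReleaseMemObject
	   before the struct itself is freed. */
	void clm_linearDestroy(clm_Linear *linear) {
		free(linear->nativeWeights);
		free(linear->nativeBias);
		free(linear->nativeInput);
		free(linear->nativeOutput);
		free(linear->nativeInputError);
		free(linear->nativeOutputError);
		free(linear->nativeWeightsError);
		free(linear->nativeGradient);
	}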

======== file 7 of 9 ========

@@ -8,7 +8,7 @@
 char *clm_loadFile(const char *path) {
 	FILE *file = fopen(path, "r");
 	if(!file) {
-		printf("fopen failed: %s\n", strerror(errno));
+		printf("fopen failed for file %s: %s\n", path, strerror(errno));
 		goto error;
 	}
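
Including the path in the message is a clear improvement. As with the device-group message earlier, these diagnostics still go to stdout; fprintf(stderr, ...) would keep them out of piped program output. That is a style point, not something this commit changes.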

======== file 8 of 9 ========

@@ -230,7 +230,11 @@ int main(int argc, const char *argv[]) {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, imageCount / 2);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 500);
+
+	for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
+		clm_linearInit(&nn.layers[i]);
+	}

 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
 		printf("Epoch %u\n", epoch);

======== file 9 of 9 ========

@@ -12,8 +12,8 @@ typedef struct __attribute__((packed)) {
 		i = idx / mat.cols; \
 		j = idx % mat.cols; \
 	} else { \
-		i = idx / mat.rows; \
-		j = idx % mat.rows; \
+		i = idx % mat.rows; \
+		j = idx / mat.rows; \
 	} \
 }

@@ -28,7 +28,8 @@ void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, _
 	for(unsigned int k = 0; k < matA.cols; k++) {
 		sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
 	}
-	matrixAt(matOut, matOut_values, i, j) = sum;
+
+	matOut_values[idx] = sum;
 }

 void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
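
The macro fix and the mat_multiply change belong together. Assuming a transposed matrix keeps its underlying row-major buffer, so that logical element (i, j) lives at flat index j * mat.rows + i (this storage rule is inferred from the kernel, not shown in the diff), the corrected decomposition is the exact inverse of that mapping. Writing matOut_values[idx] directly is then equivalent to matrixAt(matOut, matOut_values, i, j) without re-deriving the index. A small host-side check of the round-trip:

	#include <assert.h>
	#include <stdio.h>

	int main(void) {
		unsigned int rows = 3, cols = 5; /* deliberately non-square */
		for(unsigned int idx = 0; idx < rows * cols; idx++) {
			/* fixed decomposition from the commit */
			unsigned int i = idx % rows;
			unsigned int j = idx / rows;
			assert(j * rows + i == idx); /* recompose: always holds */

			/* old decomposition (i = idx / rows, j = idx % rows) recomposes
			   to (idx % rows) * rows + idx / rows, which only matches by
			   accident, e.g. at idx == 0, whenever rows != cols */
		}
		printf("transposed index round-trip OK\n");
		return 0;
	}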
@@ -126,9 +127,6 @@ __kernel void linear_backprop_2(unsigned int batchSize,
 	inputsT.transposed = true;
 	mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);

-	// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
-	// clm_matrixAddMatrix(linear->bias, gradient);
-
 	if(updateErrors) {
 		cl_GPUMat weightsT = weights;
 		weightsT.transposed = true;