Fix OpenCL bugs (WIP)

parent 4c70af2496
commit 707a9bf754
Makefile (6 changed lines)

@@ -5,14 +5,14 @@ CFLAGS=-Wall -g
 CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c $(SRC)/clm_funcs.c
 
 .PHONY: all
-all: libclm_cpu libclm_opencl cltest
+all: libclm libclm_cpu libclm_opencl cltest
 
 .PHONY: run_cpu
-run: all
+run_cpu: all
 	cd $(BUILD) && ./cltest cpu
 
 .PHONY: run_opencl
-run: all
+run_opencl: all
 	cd $(BUILD) && ./cltest opencl
 
 .PHONY: cltest
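Both run targets used to be named `run`, so GNU make warned about an overriding recipe and only the second definition survived; the rename splits them per backend. Usage after this change:

    make run_cpu      # builds all, then: cd $(BUILD) && ./cltest cpu
    make run_opencl   # builds all, then: cd $(BUILD) && ./cltest opencl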
@@ -31,6 +31,7 @@ typedef struct {
 	clm_NativeBuf *nativeWeights;
 	clm_NativeBuf *nativeBias;
+	clm_NativeBuf *nativeInput;
 	clm_NativeBuf *nativeOutput;
 	clm_NativeBuf *nativeInputError;
 	clm_NativeBuf *nativeOutputError;
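The new nativeInput field makes the input staging buffer per-layer state; the OpenCL backend below deletes the global `clm_NativeBuf nativeInput;` it replaces. A sketch of the aliasing the shared global allowed (assuming writeGPUMats caches its cl_mem inside the clm_NativeBuf it is handed, which the call sites below suggest):

    /* every layer used to stage inputs through one global cache:          */
    /* clm_linearForward(&layer0, ...)  -> writeGPUMats(..., &nativeInput) */
    /* clm_linearForward(&layer1, ...)  -> same buffer, other dimensions   */
    /* per-layer linear->nativeInput removes that sharing                  */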
@@ -11,6 +11,8 @@ int clm_gpuInit(unsigned int mode) {
 
 void clm_gpuDestroy() {}
 
+void clm_linearInit(clm_Linear *linear) {}
+
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	for(unsigned int b = 0; b < batchSize; b++) {
 		clm_matrixMultiplyMatrix(linear->weights, inputs[b], outputs[b]);
@@ -5,12 +5,14 @@
 
 typedef int (*clm_gpuInitFunc)(unsigned int);
 typedef void (*clm_gpuDestroyFunc)();
+typedef void (*clm_linearInitFunc)(clm_Linear *linear);
 typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
 
 static void *lib;
 static clm_gpuInitFunc initFunc;
 static clm_gpuDestroyFunc destroyFunc;
+static clm_linearInitFunc linearInitFunc;
 static clm_linearForwardFunc linearForwardFunc;
 static clm_linearBackpropFunc linearBackpropFunc;
 
@@ -33,6 +35,7 @@ int clm_gpuInit(unsigned int mode) {
 
 	initFunc = (clm_gpuInitFunc) dlsym(lib, "clm_gpuInit");
 	destroyFunc = (clm_gpuDestroyFunc) dlsym(lib, "clm_gpuDestroy");
+	linearInitFunc = (clm_linearInitFunc) dlsym(lib, "clm_linearInit");
 	linearForwardFunc = (clm_linearForwardFunc) dlsym(lib, "clm_linearForward");
 	linearBackpropFunc = (clm_linearBackpropFunc) dlsym(lib, "clm_linearBackprop");
 
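None of these dlsym lookups are checked, so loading an older backend library that predates clm_linearInit leaves linearInitFunc NULL and the new wrapper below crashes on first use. A minimal guard, sketched (not part of this commit; fprintf/dlerror per the usual POSIX pattern):

    linearInitFunc = (clm_linearInitFunc) dlsym(lib, "clm_linearInit");
    if(!linearInitFunc) {
    	fprintf(stderr, "dlsym(\"clm_linearInit\") failed: %s\n", dlerror());
    	dlclose(lib);
    	return 1;
    }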
@@ -49,6 +52,10 @@ void clm_gpuDestroy() {
 	dlclose(lib);
 }
 
+void clm_linearInit(clm_Linear *linear) {
+	linearInitFunc(linear);
+}
+
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	linearForwardFunc(linear, batchSize, inputs, outputs);
 }
@@ -9,6 +9,7 @@
 int clm_gpuInit(unsigned int mode);
 void clm_gpuDestroy();
 
+void clm_linearInit(clm_Linear *linear);
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
 
@@ -1,3 +1,4 @@
 #include "clm.h"
+#include "clm_gpu.h"
 #include <math.h>
 
@@ -38,7 +39,7 @@ int clm_gpuInit(unsigned int mode) {
 	int useGPU = true;
 	cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
 	if(err != CL_SUCCESS) {
-		printf("Error: Failed to create a device group!\n");
+		printf("Error: Failed to create a device group\n");
 		return 1;
 	}
 
@@ -154,18 +155,21 @@ static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, c
 	}
 }
 
-clm_NativeBuf nativeInput;
+void clm_linearInit(clm_Linear *linear) {
+	linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeInput = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
+}
 
 // TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	if(batchSize == 0) return;
 
-	if(!linear->nativeWeights) {
-		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
-	}
-
 	cl_GPUMat matInput = gpuMat(inputs[0]);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matBias = gpuMat(linear->bias);
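Using calloc rather than malloc is load-bearing here: it zeroes each clm_NativeBuf, so any cl_mem handle writeGPUMat/writeGPUMats lazily creates inside starts out NULL. Nothing in the diff frees these eight allocations, though; a matching destroy function, purely as a hypothetical sketch (clm_linearDestroy does not exist in this commit, and any cached cl_mem would additionally need clReleaseMemObject):

    void clm_linearDestroy(clm_Linear *linear) {
    	free(linear->nativeWeights);
    	free(linear->nativeBias);
    	free(linear->nativeInput);
    	free(linear->nativeOutput);
    	free(linear->nativeInputError);
    	free(linear->nativeOutputError);
    	free(linear->nativeWeightsError);
    	free(linear->nativeGradient);
    }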
@@ -176,7 +180,7 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	cl_int err;
 
 	// TODO: make sure to always alloc nn.batchSize, not batchSize
-	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
 	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, CL_MEM_READ_ONLY, linear->nativeBias);
 	if(!matInput_values || !matWeights_values || !matBias_values) {
@@ -231,16 +235,6 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
 	if(batchSize == 0) return;
 
-	if(!linear->nativeInputError) {
-		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
-	}
-
 	cl_GPUMat matInput = gpuMat(inputs[0]);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matOutput = gpuMat(outputs[0]);
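Deleting the lazy allocation makes clm_linearInit a hard precondition: calling clm_linearBackprop on a layer that never went through it now passes NULL clm_NativeBuf pointers into writeGPUMats. A cheap guard, as a sketch (not in the commit):

    if(!linear->nativeInput) {
    	fprintf(stderr, "clm_linearBackprop: layer not initialized (call clm_linearInit first)\n");
    	return;
    }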
@@ -251,7 +245,7 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 
 	cl_int err;
 
-	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
 	cl_mem matInputErrors_values = writeGPUMats(matInputErrors, batchSize, inputErrors, CL_MEM_READ_ONLY, linear->nativeInputError);
 	cl_mem matOutput_values = writeGPUMats(matOutput, batchSize, outputs, CL_MEM_READ_WRITE, linear->nativeOutput);
@@ -8,7 +8,7 @@
 char *clm_loadFile(const char *path) {
 	FILE *file = fopen(path, "r");
 	if(!file) {
-		printf("fopen failed: %s\n", strerror(errno));
+		printf("fopen failed for file %s: %s\n", path, strerror(errno));
 		goto error;
 	}
 
@@ -230,7 +230,11 @@ int main(int argc, const char *argv[]) {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, imageCount / 2);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 500);
+
+	for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
+		clm_linearInit(&nn.layers[i]);
+	}
 
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
 		printf("Epoch %u\n", epoch);
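The last clm_nnCreate argument drops from imageCount / 2 to a hard-coded 500. Read next to the forward-pass TODO ("make sure to always alloc nn.batchSize, not batchSize"), this looks like pinning buffer capacity to a constant, though clm_nnCreate's signature is not in the diff, so that reading is a guess. A less magic variant, purely as a sketch:

    /* hypothetical: cap the batch size instead of hard-coding it */
    unsigned int batchCap = imageCount / 2 < 500 ? imageCount / 2 : 500;
    clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, batchCap);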
src/mat.cl (10 changed lines)
@@ -12,8 +12,8 @@ typedef struct __attribute__((packed)) {
 		i = idx / mat.cols; \
 		j = idx % mat.cols; \
 	} else { \
-		i = idx / mat.rows; \
-		j = idx % mat.rows; \
+		i = idx % mat.rows; \
+		j = idx / mat.rows; \
 	} \
 }
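Why swapping the operators fixes transposed indexing: if matrixAt resolves a transposed element (i, j) to values[j * mat.rows + i] (an assumption, since matrixAt's definition is outside this diff), the macro must invert that formula to recover (i, j) from the flat work-item index. The old code inverted the non-transposed formula instead, which even walks out of bounds for non-square matrices:

    /* illustration: transposed mat with rows = 2, cols = 3, idx = 4           */
    /* old: i = 4 / 2 = 2, j = 4 % 2 = 0  -> i == 2 is out of range (rows = 2) */
    /* new: i = 4 % 2 = 0, j = 4 / 2 = 2  -> valid, and j * rows + i == idx    */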
@@ -28,7 +28,8 @@ void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, _
 	for(unsigned int k = 0; k < matA.cols; k++) {
 		sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
 	}
-	matrixAt(matOut, matOut_values, i, j) = sum;
+
+	matOut_values[idx] = sum;
 }
 
 void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
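This store is a companion to the index fix above: each work item owns exactly one flat output index, so writing there directly guarantees a one-to-one write instead of routing (i, j) back through matrixAt and trusting the macro to be a perfect inverse:

    /* before: matrixAt(matOut, matOut_values, i, j) = sum;                  */
    /*         correct only if the (i, j) <-> idx mapping round-trips        */
    /* after:  one work item, one element, no re-encoding through the macro  */
    matOut_values[idx] = sum;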
@@ -126,9 +127,6 @@ __kernel void linear_backprop_2(unsigned int batchSize,
 	inputsT.transposed = true;
 	mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
 
-	// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
-	// clm_matrixAddMatrix(linear->bias, gradient);
-
 	if(updateErrors) {
 		cl_GPUMat weightsT = weights;
 		weightsT.transposed = true;