diff --git a/Makefile b/Makefile
index a40a3e2..9d5d5dc 100644
--- a/Makefile
+++ b/Makefile
@@ -5,14 +5,14 @@ CFLAGS=-Wall -g
 CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c $(SRC)/clm_funcs.c
 
 .PHONY: all
-all: libclm_cpu libclm_opencl cltest
+all: libclm libclm_cpu libclm_opencl cltest
 
 .PHONY: run_cpu
-run: all
+run_cpu: all
 	cd $(BUILD) && ./cltest cpu
 
 .PHONY: run_opencl
-run: all
+run_opencl: all
 	cd $(BUILD) && ./cltest opencl
 
 .PHONY: cltest
diff --git a/src/clm.h b/src/clm.h
index fbb181b..d409ff1 100644
--- a/src/clm.h
+++ b/src/clm.h
@@ -31,6 +31,7 @@ typedef struct {
 	clm_NativeBuf *nativeWeights;
 	clm_NativeBuf *nativeBias;
+	clm_NativeBuf *nativeInput;
 	clm_NativeBuf *nativeOutput;
 	clm_NativeBuf *nativeInputError;
 	clm_NativeBuf *nativeOutputError;
diff --git a/src/clm_cpu.c b/src/clm_cpu.c
index 2d4c5b4..c5f9037 100644
--- a/src/clm_cpu.c
+++ b/src/clm_cpu.c
@@ -11,6 +11,8 @@ int clm_gpuInit(unsigned int mode) {
 
 void clm_gpuDestroy() {}
 
+void clm_linearInit(clm_Linear *linear) {}
+
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	for(unsigned int b = 0; b < batchSize; b++) {
 		clm_matrixMultiplyMatrix(linear->weights, inputs[b], outputs[b]);
diff --git a/src/clm_funcs.c b/src/clm_funcs.c
index 3405b9a..00d5c2a 100644
--- a/src/clm_funcs.c
+++ b/src/clm_funcs.c
@@ -5,12 +5,14 @@
 
 typedef int (*clm_gpuInitFunc)(unsigned int);
 typedef void (*clm_gpuDestroyFunc)();
+typedef void (*clm_linearInitFunc)(clm_Linear *linear);
 typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
 
 static void *lib;
 static clm_gpuInitFunc initFunc;
 static clm_gpuDestroyFunc destroyFunc;
+static clm_linearInitFunc linearInitFunc;
 static clm_linearForwardFunc linearForwardFunc;
 static clm_linearBackpropFunc linearBackpropFunc;
@@ -33,6 +35,7 @@ int clm_gpuInit(unsigned int mode) {
 
 	initFunc = (clm_gpuInitFunc) dlsym(lib, "clm_gpuInit");
 	destroyFunc = (clm_gpuDestroyFunc) dlsym(lib, "clm_gpuDestroy");
+	linearInitFunc = (clm_linearInitFunc) dlsym(lib, "clm_linearInit");
 	linearForwardFunc = (clm_linearForwardFunc) dlsym(lib, "clm_linearForward");
 	linearBackpropFunc = (clm_linearBackpropFunc) dlsym(lib, "clm_linearBackprop");
 
@@ -49,6 +52,10 @@ void clm_gpuDestroy() {
 	dlclose(lib);
 }
 
+void clm_linearInit(clm_Linear *linear) {
+	linearInitFunc(linear);
+}
+
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	linearForwardFunc(linear, batchSize, inputs, outputs);
 }
diff --git a/src/clm_gpu.h b/src/clm_gpu.h
index 9f65027..0f384dc 100644
--- a/src/clm_gpu.h
+++ b/src/clm_gpu.h
@@ -9,6 +9,7 @@
 int clm_gpuInit(unsigned int mode);
 void clm_gpuDestroy();
 
+void clm_linearInit(clm_Linear *linear);
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
diff --git a/src/clm_opencl.c b/src/clm_opencl.c
index 79df0c8..b5024fc 100644
--- a/src/clm_opencl.c
+++ b/src/clm_opencl.c
@@ -1,3 +1,4 @@
+#include "clm.h"
 #include "clm_gpu.h"
 
 #include
@@ -38,7 +39,7 @@ int clm_gpuInit(unsigned int mode) {
 	int useGPU = true;
 	cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
 	if(err != CL_SUCCESS) {
-		printf("Error: Failed to create a device group!\n");
+		printf("Error: Failed to create a device group\n");
 		return 1;
 	}
 
@@ -154,18 +155,21 @@ static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, c
 	}
 }
 
-clm_NativeBuf nativeInput;
+void clm_linearInit(clm_Linear *linear) {
+	linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeInput = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
+	linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
+}
 
 // TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
 	if(batchSize == 0) return;
 
-	if(!linear->nativeWeights) {
-		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
-	}
-
 	cl_GPUMat matInput = gpuMat(inputs[0]);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matBias = gpuMat(linear->bias);
@@ -176,7 +180,7 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	cl_int err;
 
 	// TODO: make sure to always alloc nn.batchSize, not batchSize
-	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
 	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, CL_MEM_READ_ONLY, linear->nativeBias);
 	if(!matInput_values || !matWeights_values || !matBias_values) {
@@ -231,16 +235,6 @@
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
 	if(batchSize == 0) return;
 
-	if(!linear->nativeInputError) {
-		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
-		linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
-	}
-
 	cl_GPUMat matInput = gpuMat(inputs[0]);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matOutput = gpuMat(outputs[0]);
@@ -251,7 +245,7 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 
 	cl_int err;
 
-	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
 	cl_mem matInputErrors_values = writeGPUMats(matInputErrors, batchSize, inputErrors, CL_MEM_READ_ONLY, linear->nativeInputError);
 	cl_mem matOutput_values = writeGPUMats(matOutput, batchSize, outputs, CL_MEM_READ_WRITE, linear->nativeOutput);
diff --git a/src/clm_util.c b/src/clm_util.c
index 915c523..0b2ad82 100644
--- a/src/clm_util.c
+++ b/src/clm_util.c
@@ -8,7 +8,7 @@ char *clm_loadFile(const char *path) {
 
 	FILE *file = fopen(path, "r");
 	if(!file) {
-		printf("fopen failed: %s\n", strerror(errno));
+		printf("fopen failed for file %s: %s\n", path, strerror(errno));
 		goto error;
 	}
diff --git a/src/cltest.c b/src/cltest.c
index bd1444e..dd40f46 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -230,7 +230,11 @@ int main(int argc, const char *argv[]) {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, imageCount / 2);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 500);
+
+	for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
+		clm_linearInit(&nn.layers[i]);
+	}
 
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
 		printf("Epoch %u\n", epoch);
diff --git a/src/mat.cl b/src/mat.cl
index 34cbfd7..fbc8b87 100644
--- a/src/mat.cl
+++ b/src/mat.cl
@@ -12,8 +12,8 @@ typedef struct __attribute__((packed)) {
 		i = idx / mat.cols; \
 		j = idx % mat.cols; \
 	} else { \
-		i = idx / mat.rows; \
-		j = idx % mat.rows; \
+		i = idx % mat.rows; \
+		j = idx / mat.rows; \
 	} \
 }
@@ -28,7 +28,8 @@ void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, _
 
 	for(unsigned int k = 0; k < matA.cols; k++) {
 		sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
 	}
-	matrixAt(matOut, matOut_values, i, j) = sum;
+
+	matOut_values[idx] = sum;
 }
@@ -126,9 +127,6 @@ __kernel void linear_backprop_2(unsigned int batchSize,
 	inputsT.transposed = true;
 	mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
 
-	// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
-	// clm_matrixAddMatrix(linear->bias, gradient);
-
 	if(updateErrors) {
 		cl_GPUMat weightsT = weights;
 		weightsT.transposed = true;