diff --git a/Makefile b/Makefile
index 61f49af..44af914 100644
--- a/Makefile
+++ b/Makefile
@@ -2,10 +2,14 @@
 SRC=src
 BUILD=build
 CFLAGS=-Wall -g
+CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c
+CLM_SOURCES_CPU=$(CLM_SOURCES) $(SRC)/clm_gpu_cpu.c
+CLM_SOURCES_OPENCL=$(CLM_SOURCES) $(SRC)/clm_gpu_opencl.c
+
 .PHONY: all
 all:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(CLM_SOURCES_OPENCL)
 
 .PHONY: run
 run: all
@@ -14,7 +18,7 @@ run: all
 .PHONY: cl
 cl:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(CLM_SOURCES_OPENCL)
 
 .PHONY: cl_run
 cl_run: cl
diff --git a/src/cl.c b/src/cl.c
index 987daa0..6e0c937 100644
--- a/src/cl.c
+++ b/src/cl.c
@@ -3,6 +3,7 @@
 #include 
 
 #include "clm.h"
+#include "util.h"
 
 #include 
 #include 
@@ -15,16 +16,6 @@ typedef struct __attribute__((packed)) {
 	cl_char transposed;
 } cl_GPUMat;
 
-char *loadFile(const char *path) {
-	FILE *file = fopen(path, "r");
-	fseek(file, 0, SEEK_END);
-	size_t length = ftell(file);
-	fseek(file, 0, SEEK_SET);
-	char *buffer = calloc(1, length + 1);
-	fread(buffer, length, 1, file);
-	return buffer;
-}
-
 int main() {
 	// Connect to a compute device
 	int useGPU = true;
diff --git a/src/clm.c b/src/clm.c
index 0f2a4a8..95b428e 100644
--- a/src/clm.c
+++ b/src/clm.c
@@ -10,7 +10,7 @@ const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
 
 clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
 	printf("CREATING MATRIX\n");
-	clm_Matrix mat;
+	clm_Matrix mat = {0};
 	mat.rows = rows;
 	mat.cols = cols;
 	mat.values = calloc(rows * cols, sizeof(float));
@@ -98,7 +98,7 @@ clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
 }
 
 clm_Matrix clm_matrixTranspose(clm_Matrix mat) {
-	clm_Matrix tr;
+	clm_Matrix tr = {0};
 	tr.cols = mat.rows;
 	tr.rows = mat.cols;
 	tr.values = mat.values;
@@ -183,7 +183,7 @@ clm_Matrix clm_matrixFromArray(float *array, unsigned int length) {
 }
 
 clm_Matrix clm_matrixWrapArray(float *array, unsigned int length) {
-	clm_Matrix mat;
+	clm_Matrix mat = {0};
 	mat.rows = length;
 	mat.cols = 1;
 	mat.values = array;
@@ -195,7 +195,7 @@ bool clm_matrixIsInvalid(clm_Matrix mat) {
 }
 
 clm_Vector clm_vectorCreate(unsigned int length) {
-	clm_Vector vector;
+	clm_Vector vector = {0};
 	vector.length = length;
 	vector.values = calloc(length, sizeof(float));
 	return vector;
@@ -206,13 +206,13 @@ bool clm_vectorIsInvalid(clm_Vector vec) {
 }
 
 clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
-	clm_Linear linear;
+	clm_Linear linear = {0};
 	linear.weights = clm_createMatrixRandom(outputs, inputs);
 	linear.bias = clm_createMatrixRandom(outputs, 1);
 
 	// linear.output = clm_createMatrix(outputs, 1);
-	linear.error = clm_createMatrix(outputs, 1);
-	linear.weightsError = clm_createMatrix(outputs, inputs);
+	// linear.error = clm_createMatrix(outputs, 1);
+	// linear.weightsError = clm_createMatrix(outputs, inputs);
 
 	return linear;
 }
@@ -221,8 +221,15 @@ clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate,
 
 	for(unsigned int i = 0; i < numLayers; i++) {
 		layers[i].output = calloc(batchSize, sizeof(clm_Matrix));
+		layers[i].error = calloc(batchSize, sizeof(clm_Matrix));
+		layers[i].weightsError = calloc(batchSize, sizeof(clm_Matrix));
+		layers[i].gradient = calloc(batchSize, sizeof(clm_Matrix));
+
 		for(unsigned int j = 0; j < batchSize; j++) {
 			layers[i].output[j] = clm_createMatrix(layers[i].weights.rows, 1);
+			layers[i].error[j] = clm_createMatrix(layers[i].weights.rows, 1);
+			layers[i].weightsError[j] = clm_createMatrix(layers[i].weights.rows, layers[i].weights.cols);
+			layers[i].gradient[j] = clm_createMatrix(layers[i].weights.rows, 1);
 		}
 	}
diff --git a/src/clm.h b/src/clm.h
index bc0d2a1..aac9246 100644
--- a/src/clm.h
+++ b/src/clm.h
@@ -23,11 +23,16 @@ typedef struct {
 	clm_Matrix weights;
 	clm_Matrix bias;
 	clm_Matrix *output;
-	clm_Matrix error;
-	clm_Matrix weightsError;
+	clm_Matrix *error;
+	clm_Matrix *weightsError;
+	clm_Matrix *gradient;
 	clm_NativeBuf *nativeWeights;
 	clm_NativeBuf *nativeBias;
 	clm_NativeBuf *nativeOutput;
+	clm_NativeBuf *nativeInputError;
+	clm_NativeBuf *nativeOutputError;
+	clm_NativeBuf *nativeWeightsError;
+	clm_NativeBuf *nativeGradient;
 } clm_Linear;
 
 typedef struct {
diff --git a/src/clm_gpu.h b/src/clm_gpu.h
index ed04b01..7d0cbfb 100644
--- a/src/clm_gpu.h
+++ b/src/clm_gpu.h
@@ -7,5 +7,6 @@ int clm_gpuInit();
 void clm_gpuDestroy();
 
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
+void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
 
 #endif
diff --git a/src/clm_gpu_cpu.c b/src/clm_gpu_cpu.c
index 6fda836..0865a26 100644
--- a/src/clm_gpu_cpu.c
+++ b/src/clm_gpu_cpu.c
@@ -1,3 +1,4 @@
+#include "clm.h"
 #include "clm_gpu.h"
 
 #include 
@@ -23,3 +24,30 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 		clm_matrixSigmoid(outputs[b]);
 	}
 }
+
+void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
+	for(unsigned int b = 0; b < batchSize; b++) {
+		// clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
+		// clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
+		// clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
+		// clm_Matrix error = linear.error;
+
+		// GPU step 1
+		clm_matrixDSigmoid(clm_matrixCopy(outputs[b], outputGradients[b])); // dsig(yhat)
+		clm_matrixMultiplyMatrixElements(outputGradients[b], inputErrors[b]); // (yhat - y) . dsig(yhat)
+		clm_matrixMultiplyScalar(outputGradients[b], learnRate);
+
+		// GPU Step 2
+		clm_Matrix inputT = clm_matrixTranspose(inputs[b]);
+		clm_matrixMultiplyMatrix(outputGradients[b], inputT, outputWeightsErrors[b]);
+
+		// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
+		// clm_matrixAddMatrix(linear->bias, gradient);
+
+		if(updateErrors) {
+			// GPU Step 2 extended (opt)
+			clm_Matrix weightsT = clm_matrixTranspose(linear->weights);
+			clm_matrixMultiplyMatrix(weightsT, inputErrors[b], outputErrors[b]);
+		}
+	}
+}
diff --git a/src/clm_gpu_opencl.c b/src/clm_gpu_opencl.c
index a999b2e..36f39c1 100644
--- a/src/clm_gpu_opencl.c
+++ b/src/clm_gpu_opencl.c
@@ -1,7 +1,8 @@
 #include "clm_gpu.h"
 
 #include 
 
-#define CL_TARGET_OPENCL_VERSION 200
+#include "clm_util.h"
+
 #include 
 #include 
@@ -12,14 +13,18 @@
 
 static cl_context context;
 static cl_device_id deviceID;
 static cl_command_queue queue;
-static cl_kernel kernel;
+static cl_kernel kernelLinearForward;
+static cl_kernel kernelLinearBackprop1;
+static cl_kernel kernelLinearBackprop2;
+static size_t kernelLinearForwardLocal;
+static size_t kernelLinearBackprop1Local;
+static size_t kernelLinearBackprop2Local;
 
 struct clm_NativeBuf {
 	cl_mem mem;
 };
 
 typedef struct __attribute__((packed)) {
-	cl_uint rows;
 	cl_uint cols;
 	cl_char transposed;
@@ -28,16 +33,6 @@ typedef struct __attribute__((packed)) {
 #define gpuMat(mat) \
 	{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
 
-char *loadFile(const char *path) {
-	FILE *file = fopen(path, "r");
-	fseek(file, 0, SEEK_END);
-	size_t length = ftell(file);
-	fseek(file, 0, SEEK_SET);
-	char *buffer = calloc(1, length + 1);
-	fread(buffer, length, 1, file);
-	return buffer;
-}
-
 int clm_gpuInit() {
 	// Connect to a compute device
 	int useGPU = true;
@@ -47,7 +42,12 @@ int clm_gpuInit() {
 		return 1;
 	}
 
-	char *buffer = loadFile("src/mat.cl");
+	char *buffer = clm_loadFile("src/mat.cl");
+	if(!buffer) {
+		printf("Failed to load mat kernel\n");
+		return 1;
+	}
+
 	printf("%s", buffer);
 
 	context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
@@ -76,9 +76,19 @@ int clm_gpuInit() {
 		return 1;
 	}
 
-	kernel = clCreateKernel(program, "linear_forward", &err);
-	if(!kernel) {
-		printf("Failed to create kernel\n");
+	kernelLinearForward = clCreateKernel(program, "linear_forward", &err);
+	kernelLinearBackprop1 = clCreateKernel(program, "linear_backprop_1", &err);
+	kernelLinearBackprop2 = clCreateKernel(program, "linear_backprop_2", &err);
+	if(!kernelLinearForward || !kernelLinearBackprop1 || !kernelLinearBackprop2) {
+		printf("Failed to create kernels: %d\n", err);
+		return 1;
+	}
+
+	err = clGetKernelWorkGroupInfo(kernelLinearForward, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearForwardLocal), &kernelLinearForwardLocal, NULL);
+	err |= clGetKernelWorkGroupInfo(kernelLinearBackprop1, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearBackprop1Local), &kernelLinearBackprop1Local, NULL);
+	err |= clGetKernelWorkGroupInfo(kernelLinearBackprop2, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearBackprop2Local), &kernelLinearBackprop2Local, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to get work group sizes\n");
 		return 1;
 	}
 
@@ -88,13 +98,13 @@
 void clm_gpuDestroy() {
 }
 
-static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nativeBuf) {
+static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
 	cl_int err;
 
 	if(!nativeBuf->mem) {
-		cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
+		cl_mem mat_values = clCreateBuffer(context, flags, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
 		if(!mat_values) {
-			printf("Failed to alloc buffer: %d\n", err);
+			printf("Failed to alloc buffer: %s\n", clm_clErrorToString(err));
 			return NULL;
 		}
 		nativeBuf->mem = mat_values;
@@ -103,27 +113,27 @@ static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nati
 	return nativeBuf->mem;
 }
 
-static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
+static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
 	cl_int err;
 
-	cl_mem mem = allocGPUMat(gpuMat, 1, nativeBuf);
+	cl_mem mem = allocGPUMat(gpuMat, 1, flags, nativeBuf);
 
 	err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
-		printf("Failed to enqueue write: %d\n", err);
+		printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
 		return NULL;
 	}
 
 	return mem;
 }
 
-static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
+static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
 	cl_int err;
 
-	cl_mem mem = allocGPUMat(gpuMat, numMats, nativeBuf);
+	cl_mem mem = allocGPUMat(gpuMat, numMats, flags, nativeBuf);
 
 	for(unsigned int i = 0; i < numMats; i++) {
 		err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
 		if(err != CL_SUCCESS) {
-			printf("Failed to enqueue write: %d\n", err);
+			printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
 			return NULL;
 		}
 	}
@@ -131,6 +141,19 @@ static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *m
 	return mem;
 }
 
+static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
+	cl_int err;
+	cl_mem mem = nativeBuf->mem;
+
+	for(unsigned int i = 0; i < numMats; i++) {
+		err = clEnqueueReadBuffer(queue, mem, CL_TRUE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL);
+		if(err != CL_SUCCESS) {
+			printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
+			return;
+		}
+	}
+}
+
 clm_NativeBuf nativeInput;
 
 // TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
@@ -152,32 +175,33 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 
 	cl_int err;
 
-	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, &nativeInput);
-	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
-	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
+	// TODO: make sure to always alloc nn.batchSize, not batchSize
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
+	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, CL_MEM_READ_ONLY, linear->nativeBias);
 	if(!matInput_values || !matWeights_values || !matBias_values) {
 		// linear->output = INVALID_MATRIX;
 		return;
 	}
 
-	cl_mem matOut_values = allocGPUMat(matOut, batchSize, linear->nativeOutput);
+	cl_mem matOut_values = allocGPUMat(matOut, batchSize, CL_MEM_READ_WRITE, linear->nativeOutput);
 	if(!matOut_values) {
 		// linear->output = INVALID_MATRIX;
 		return;
 	}
 
 	err = 0;
-	err |= clSetKernelArg(kernel, 0, sizeof(cl_uint), &batchSize);
-	err |= clSetKernelArg(kernel, 1, sizeof(matInput), &matInput);
-	err |= clSetKernelArg(kernel, 2, sizeof(matInput_values), &matInput_values);
-	err |= clSetKernelArg(kernel, 3, sizeof(matWeights), &matWeights);
-	err |= clSetKernelArg(kernel, 4, sizeof(matWeights_values), &matWeights_values);
-	err |= clSetKernelArg(kernel, 5, sizeof(matBias), &matBias);
-	err |= clSetKernelArg(kernel, 6, sizeof(matBias_values), &matBias_values);
-	err |= clSetKernelArg(kernel, 7, sizeof(matOut), &matOut);
-	err |= clSetKernelArg(kernel, 8, sizeof(matOut_values), &matOut_values);
+	err |= clSetKernelArg(kernelLinearForward, 0, sizeof(cl_uint), &batchSize);
+	err |= clSetKernelArg(kernelLinearForward, 1, sizeof(matInput), &matInput);
+	err |= clSetKernelArg(kernelLinearForward, 2, sizeof(matInput_values), &matInput_values);
+	err |= clSetKernelArg(kernelLinearForward, 3, sizeof(matWeights), &matWeights);
+	err |= clSetKernelArg(kernelLinearForward, 4, sizeof(matWeights_values), &matWeights_values);
+	err |= clSetKernelArg(kernelLinearForward, 5, sizeof(matBias), &matBias);
+	err |= clSetKernelArg(kernelLinearForward, 6, sizeof(matBias_values), &matBias_values);
+	err |= clSetKernelArg(kernelLinearForward, 7, sizeof(matOut), &matOut);
+	err |= clSetKernelArg(kernelLinearForward, 8, sizeof(matOut_values), &matOut_values);
 	if(err != CL_SUCCESS) {
-		printf("Failed to set kernel args: %d\n", err);
+		printf("Failed to set kernel args (1): %d\n", err);
 		return;
 	}
@@ -185,25 +209,121 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
 	printf("INFO: %s\n", info);*/
 
-	size_t local;
-	err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
-	if(err != CL_SUCCESS) {
-		printf("Failed to get work group size\n");
-		return;
-	}
-
-	size_t global = ceil((float) workSize / local) * local;
-	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+	size_t global = ceil((float) workSize / kernelLinearForwardLocal) * kernelLinearForwardLocal;
+	err = clEnqueueNDRangeKernel(queue, kernelLinearForward, 1, NULL, &global, &kernelLinearForwardLocal, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to enqueue: %d\n", err);
 		return;
 	}
 
+	clFlush(queue);
 	clFinish(queue);
 
-	err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
+	/*err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to read from buffer\n");
 		return;
-	}
+	}*/
+
+	readGPUMats(matOut, batchSize, outputs, linear->nativeOutput);
+}
+
+void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
+	if(batchSize == 0) return;
+
+	if(!linear->nativeInputError) {
+		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
+	}
+
+	cl_GPUMat matInput = gpuMat(inputs[0]);
+	cl_GPUMat matWeights = gpuMat(linear->weights);
+	cl_GPUMat matOutput = gpuMat(outputs[0]);
+	cl_GPUMat matInputErrors = gpuMat(inputErrors[0]);
+	cl_GPUMat matOutputErrors = !updateErrors ? (cl_GPUMat){0} : (cl_GPUMat) gpuMat(outputErrors[0]);
+	cl_GPUMat matOutputWeightsErrors = gpuMat(outputWeightsErrors[0]);
+	cl_GPUMat matOutputGradients = gpuMat(outputGradients[0]);
+
+	cl_int err;
+
+	cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
+	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
+	cl_mem matInputErrors_values = writeGPUMats(matInputErrors, batchSize, inputErrors, CL_MEM_READ_ONLY, linear->nativeInputError);
+	cl_mem matOutput_values = writeGPUMats(matOutput, batchSize, outputs, CL_MEM_READ_WRITE, linear->nativeOutput);
+	if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) {
+		printf("Failed to write GPU mats\n");
+		return;
+	}
+
+	// cl_mem matOutput_values = allocGPUMat(matOutput, batchSize, CL_MEM_READ_ONLY, linear->nativeOutput);
+	cl_mem matOutputErrors_values = !updateErrors ? NULL : allocGPUMat(matOutputErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeOutputError);
+	cl_mem matOutputWeightsErrors_values = allocGPUMat(matOutputWeightsErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeWeightsError);
+	cl_mem matOutputGradients_values = allocGPUMat(matOutputGradients, batchSize, CL_MEM_READ_WRITE, linear->nativeGradient);
+	if(!matOutputWeightsErrors_values || !matOutputGradients_values) {
+		printf("Failed to alloc GPU mats\n");
+		return;
+	}
+
+	err = 0;
+	err |= clSetKernelArg(kernelLinearBackprop1, 0, sizeof(cl_uint), &batchSize);
+	err |= clSetKernelArg(kernelLinearBackprop1, 1, sizeof(cl_float), &learnRate);
+	err |= clSetKernelArg(kernelLinearBackprop1, 2, sizeof(matWeights), &matWeights);
+	err |= clSetKernelArg(kernelLinearBackprop1, 3, sizeof(matWeights_values), &matWeights_values);
+	err |= clSetKernelArg(kernelLinearBackprop1, 4, sizeof(matOutput), &matOutput);
+	err |= clSetKernelArg(kernelLinearBackprop1, 5, sizeof(matOutput_values), &matOutput_values);
+	err |= clSetKernelArg(kernelLinearBackprop1, 6, sizeof(matInputErrors), &matInputErrors);
+	err |= clSetKernelArg(kernelLinearBackprop1, 7, sizeof(matInputErrors_values), &matInputErrors_values);
+	err |= clSetKernelArg(kernelLinearBackprop1, 8, sizeof(matOutputGradients), &matOutputGradients);
+	err |= clSetKernelArg(kernelLinearBackprop1, 9, sizeof(matOutputGradients_values), &matOutputGradients_values);
+	if(err != CL_SUCCESS) {
+		printf("Failed to set kernel args (2): %d\n", err);
+		return;
+	}
+
+	size_t step1WorkSize = matOutputGradients.rows * matOutputGradients.cols;
+	size_t step1Global = ceil((float) step1WorkSize / kernelLinearBackprop1Local) * kernelLinearBackprop1Local;
+	clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL);
+
+	clFlush(queue);
+	clFinish(queue);
+
+	err = 0;
+	err |= clSetKernelArg(kernelLinearBackprop2, 0, sizeof(cl_uint), &batchSize);
+	err |= clSetKernelArg(kernelLinearBackprop2, 1, sizeof(matWeights), &matWeights);
+	err |= clSetKernelArg(kernelLinearBackprop2, 2, sizeof(matWeights_values), &matWeights_values);
+	err |= clSetKernelArg(kernelLinearBackprop2, 3, sizeof(matInput), &matInput);
+	err |= clSetKernelArg(kernelLinearBackprop2, 4, sizeof(matInput_values), &matInput_values);
+	err |= clSetKernelArg(kernelLinearBackprop2, 5, sizeof(matInputErrors), &matInputErrors);
+	err |= clSetKernelArg(kernelLinearBackprop2, 6, sizeof(matInputErrors_values), &matInputErrors_values);
+	err |= clSetKernelArg(kernelLinearBackprop2, 7, sizeof(cl_char), &updateErrors);
+
+	if(updateErrors) {
+		err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors);
+		err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), &matOutputErrors_values);
+	}
+
+	err |= clSetKernelArg(kernelLinearBackprop2, 10, sizeof(matOutputWeightsErrors), &matOutputWeightsErrors);
+	err |= clSetKernelArg(kernelLinearBackprop2, 11, sizeof(matOutputWeightsErrors_values), &matOutputWeightsErrors_values);
+	err |= clSetKernelArg(kernelLinearBackprop2, 12, sizeof(matOutputGradients), &matOutputGradients);
+	err |= clSetKernelArg(kernelLinearBackprop2, 13, sizeof(matOutputGradients_values), &matOutputGradients_values);
+	if(err != CL_SUCCESS) {
+		printf("Failed to set kernel args (3): %d\n", err);
+		return;
+	}
+
+	size_t step2WorkSize = matOutputWeightsErrors.rows * matOutputWeightsErrors.cols;
+	size_t step2Global = ceil((float) step2WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local;
+	clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL);
+
+	clFlush(queue);
+	clFinish(queue);
+
+	readGPUMats(matOutputGradients, batchSize, outputGradients, linear->nativeGradient);
+	readGPUMats(matOutputWeightsErrors, batchSize, outputWeightsErrors, linear->nativeWeightsError);
+	if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputError);
+}
diff --git a/src/clm_util.c b/src/clm_util.c
new file mode 100644
index 0000000..915c523
--- /dev/null
+++ b/src/clm_util.c
@@ -0,0 +1,195 @@
+#include "clm_util.h"
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+char *clm_loadFile(const char *path) {
+	FILE *file = fopen(path, "r");
+	if(!file) {
+		printf("fopen failed: %s\n", strerror(errno));
+		goto error;
+	}
+
+	if(fseek(file, 0, SEEK_END) < 0) {
+		printf("fseek failed: %s\n", strerror(errno));
+		goto cleanup_file;
+	}
+
+	long length = ftell(file);
+	if(length < 0) {
+		printf("ftell failed: %s\n", strerror(errno));
+		goto cleanup_file;
+	}
+
+	if(fseek(file, 0, SEEK_SET) < 0) {
+		printf("fseek failed: %s\n", strerror(errno));
+		goto cleanup_file;
+	}
+
+	char *buffer = calloc(1, length + 1);
+	if(!buffer) {
+		printf("calloc failed: %s\n", strerror(errno));
+		goto cleanup_file;
+	}
+
+	size_t nRead = fread(buffer, length, 1, file);
+	if(nRead != 1) {
+		printf("fread failed: (nRead = %zu, err %d, eof %d)\n", nRead, ferror(file), feof(file));
+		goto cleanup_buffer;
+	}
+
+	fclose(file);
+	return buffer;
+
+cleanup_buffer:
+	free(buffer);
+cleanup_file:
+	fclose(file);
+error:
+	return NULL;
+}
+
+const char *clm_clErrorToString(cl_int error) {
+	// Source: https://stackoverflow.com/questions/24326432/convenient-way-to-show-opencl-error-codes
+	switch(error) {
+		// run-time and JIT compiler errors
+		case 0:
+			return "CL_SUCCESS";
+		case -1:
+			return "CL_DEVICE_NOT_FOUND";
+		case -2:
+			return "CL_DEVICE_NOT_AVAILABLE";
+		case -3:
+			return "CL_COMPILER_NOT_AVAILABLE";
+		case -4:
+			return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+		case -5:
+			return "CL_OUT_OF_RESOURCES";
+		case -6:
+			return "CL_OUT_OF_HOST_MEMORY";
+		case -7:
+			return "CL_PROFILING_INFO_NOT_AVAILABLE";
+		case -8:
+			return "CL_MEM_COPY_OVERLAP";
+		case -9:
+			return "CL_IMAGE_FORMAT_MISMATCH";
+		case -10:
+			return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
+		case -11:
+			return "CL_BUILD_PROGRAM_FAILURE";
+		case -12:
+			return "CL_MAP_FAILURE";
+		case -13:
+			return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
+		case -14:
+			return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
+		case -15:
+			return "CL_COMPILE_PROGRAM_FAILURE";
+		case -16:
+			return "CL_LINKER_NOT_AVAILABLE";
+		case -17:
+			return "CL_LINK_PROGRAM_FAILURE";
+		case -18:
+			return "CL_DEVICE_PARTITION_FAILED";
+		case -19:
+			return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
+
+		// compile-time errors
+		case -30:
+			return "CL_INVALID_VALUE";
+		case -31:
+			return "CL_INVALID_DEVICE_TYPE";
+		case -32:
+			return "CL_INVALID_PLATFORM";
+		case -33:
+			return "CL_INVALID_DEVICE";
+		case -34:
+			return "CL_INVALID_CONTEXT";
+		case -35:
+			return "CL_INVALID_QUEUE_PROPERTIES";
+		case -36:
+			return "CL_INVALID_COMMAND_QUEUE";
+		case -37:
+			return "CL_INVALID_HOST_PTR";
+		case -38:
+			return "CL_INVALID_MEM_OBJECT";
+		case -39:
+			return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+		case -40:
+			return "CL_INVALID_IMAGE_SIZE";
+		case -41:
+			return "CL_INVALID_SAMPLER";
+		case -42:
+			return "CL_INVALID_BINARY";
+		case -43:
+			return "CL_INVALID_BUILD_OPTIONS";
+		case -44:
+			return "CL_INVALID_PROGRAM";
+		case -45:
+			return "CL_INVALID_PROGRAM_EXECUTABLE";
+		case -46:
+			return "CL_INVALID_KERNEL_NAME";
+		case -47:
+			return "CL_INVALID_KERNEL_DEFINITION";
+		case -48:
+			return "CL_INVALID_KERNEL";
+		case -49:
+			return "CL_INVALID_ARG_INDEX";
+		case -50:
+			return "CL_INVALID_ARG_VALUE";
+		case -51:
+			return "CL_INVALID_ARG_SIZE";
+		case -52:
+			return "CL_INVALID_KERNEL_ARGS";
+		case -53:
+			return "CL_INVALID_WORK_DIMENSION";
+		case -54:
+			return "CL_INVALID_WORK_GROUP_SIZE";
+		case -55:
+			return "CL_INVALID_WORK_ITEM_SIZE";
+		case -56:
+			return "CL_INVALID_GLOBAL_OFFSET";
+		case -57:
+			return "CL_INVALID_EVENT_WAIT_LIST";
+		case -58:
+			return "CL_INVALID_EVENT";
+		case -59:
+			return "CL_INVALID_OPERATION";
+		case -60:
+			return "CL_INVALID_GL_OBJECT";
+		case -61:
+			return "CL_INVALID_BUFFER_SIZE";
+		case -62:
+			return "CL_INVALID_MIP_LEVEL";
+		case -63:
+			return "CL_INVALID_GLOBAL_WORK_SIZE";
+		case -64:
+			return "CL_INVALID_PROPERTY";
+		case -65:
+			return "CL_INVALID_IMAGE_DESCRIPTOR";
+		case -66:
+			return "CL_INVALID_COMPILER_OPTIONS";
+		case -67:
+			return "CL_INVALID_LINKER_OPTIONS";
+		case -68:
+			return "CL_INVALID_DEVICE_PARTITION_COUNT";
+
+		// extension errors
+		case -1000:
+			return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
+		case -1001:
+			return "CL_PLATFORM_NOT_FOUND_KHR";
+		case -1002:
+			return "CL_INVALID_D3D10_DEVICE_KHR";
+		case -1003:
+			return "CL_INVALID_D3D10_RESOURCE_KHR";
+		case -1004:
+			return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
+		case -1005:
+			return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
+		default:
+			return "Unknown OpenCL error";
+	}
+}
diff --git a/src/clm_util.h b/src/clm_util.h
new file mode 100644
index 0000000..3e095df
--- /dev/null
+++ b/src/clm_util.h
@@ -0,0 +1,11 @@
+#ifndef _UTIL_H_
+#define _UTIL_H_
+
+#define CL_TARGET_OPENCL_VERSION 200
+#include <CL/cl.h>
+
+char *clm_loadFile(const char *path);
+
+const char *clm_clErrorToString(cl_int error);
+
+#endif
diff --git a/src/cltest.c b/src/cltest.c
index f138812..ac80a68 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -37,7 +37,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
 		unsigned int batchSize = numElements - b * nn.batchSize;
 		if(batchSize > nn.batchSize) batchSize = nn.batchSize;
 
-		// printf("Batch %d (size %d)\n", b, batchSize);
+		printf("Batch %d (size %d)\n", b, batchSize);
 
 		for(unsigned int i = 0; i < batchSize; i++) {
 			clm_Vector input = inputs[b * nn.batchSize + i];
@@ -53,7 +53,26 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
 			currentXs = nn.layers[i].output;
 		}
 
+		clm_Linear lastLayer = nn.layers[nn.numLayers - 1];
+		for(unsigned int b = 0; b < batchSize; b++) {
+			// Error of last layer = y - yhat
+			clm_matrixCopy(batchOutputs[b], lastLayer.error[b]); // lastLayer.error = y
+			clm_matrixSubtractMatrix(lastLayer.error[b], lastLayer.output[b]); // lastLayer.error -= yhat
+		}
+		for(int i = nn.numLayers - 1; i >= 0; i--) {
+			clm_Linear layer = nn.layers[i];
+			clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
+			clm_Matrix *outputsOfThisLayer = layer.output;
+			clm_linearBackprop(&layer, nn.learnRate, batchSize, inputsToThisLayer, outputsOfThisLayer, layer.error, i > 0, i == 0 ? NULL : nn.layers[i - 1].error, layer.weightsError, layer.gradient);
+
+			for(unsigned int b = 0; b < batchSize; b++) {
+				clm_matrixAddMatrix(layer.weights, layer.weightsError[b]);
+				clm_matrixAddMatrix(layer.bias, layer.gradient[b]);
+			}
+		}
+
+		/*for(int i = nn.numLayers - 1; i >= 0; i--) {
 			clm_Linear layer = nn.layers[i];
 			clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
 			clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
@@ -76,7 +95,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
 
 			clm_matrixAddMatrix(layer.weights, layer.weightsError);
 			clm_matrixAddMatrix(layer.bias, gradient);
-		}
+		}*/
 	}
 
 	free(batchInputs);
@@ -190,7 +209,7 @@ int main() {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 100);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 1000);
 
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
 		printf("Epoch %u\n", epoch);
@@ -220,7 +239,7 @@ int main() {
 				predDigit = j;
 			}
 		}
-		if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
+		// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
 
 		unsigned int actDigit = 0;
 		float maxA = -1;
@@ -231,7 +250,7 @@ int main() {
 				actDigit = j;
 			}
 		}
-		if(idx < 100) printf("Actual: %u\n", actDigit);
+		// if(idx < 100) printf("Actual: %u\n", actDigit);
 		// printf("\n");
 
 		if(predDigit == actDigit) correct++;
diff --git a/src/mat.cl b/src/mat.cl
index 2f6d460..34cbfd7 100644
--- a/src/mat.cl
+++ b/src/mat.cl
@@ -6,12 +6,23 @@ typedef struct __attribute__((packed)) {
 
 #define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 
+#define matrixGetIJ(mat, idx, i, j) \
+	{ \
+		if(!mat.transposed) { \
+			i = idx / mat.cols; \
+			j = idx % mat.cols; \
+		} else { \
+			i = idx / mat.rows; \
+			j = idx % mat.rows; \
+		} \
+	}
+
 void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
 
-	uint i = idx / matOut.cols;
-	uint j = idx % matOut.cols;
+	uint i, j;
+	matrixGetIJ(matOut, idx, i, j);
 
 	float sum = 0;
 	for(unsigned int k = 0; k < matA.cols; k++) {
@@ -34,7 +45,40 @@ void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
 	mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
 }
 
-__kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
+void mat_dSigmoid(cl_GPUMat mat, __global float *mat_values) {
+	uint idx = get_global_id(0);
+	if(idx >= mat.rows * mat.cols) return;
+
+	float v = mat_values[idx];
+	mat_values[idx] = v * (1 - v);
+}
+
+void mat_copy(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
+	uint idx = get_global_id(0);
+	if(idx >= mat.rows * mat.cols) return;
+
+	other_values[idx] = mat_values[idx];
+}
+
+void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
+	uint idx = get_global_id(0);
+	if(idx >= mat.rows * mat.cols) return;
+
+	other_values[idx] *= mat_values[idx];
+}
+
+void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) {
+	uint idx = get_global_id(0);
+	if(idx >= mat.rows * mat.cols) return;
+
+	mat_values[idx] *= scalar;
+}
+
+__kernel void linear_forward(unsigned int batchSize,
+                             cl_GPUMat input, __global float *input_values,
+                             cl_GPUMat weights, __global float *weights_values,
+                             cl_GPUMat bias, __global float *bias_values,
+                             cl_GPUMat out, __global float *out_values) {
 	// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
 	for(unsigned int b = 0; b < batchSize; b++) {
 		__global float *batchInput_values = input_values + b * input.rows * input.cols;
@@ -44,3 +88,51 @@ __kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global f
 		mat_sigmoid(out, batchOut_values);
 	}
 }
+
+__kernel void linear_backprop_1(unsigned int batchSize, float learnRate,
+                                cl_GPUMat weights, __global float *weights_values,
+                                cl_GPUMat outputs, __global float *outputs_values,
+                                cl_GPUMat inputErrors, __global float *inputErrors_values,
+                                cl_GPUMat outputGradients, __global float *outputGradients_values) {
+
+	for(unsigned int b = 0; b < batchSize; b++) {
+		__global float *batchOut_values = outputs_values + b * outputs.rows * outputs.cols;
+		__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
+		__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
+
+		mat_copy(outputs, batchOut_values, outputGradients, batchOutGradients_values);
+		mat_dSigmoid(outputGradients, batchOutGradients_values); // dsig(yhat)
+		mat_multiply_elements(outputGradients, batchOutGradients_values, inputErrors, batchInErrors_values); // (yhat - y) . dsig(yhat)
+		mat_multiply_scalar(outputGradients, batchOutGradients_values, learnRate);
+	}
+}
+
+__kernel void linear_backprop_2(unsigned int batchSize,
+                                cl_GPUMat weights, __global float *weights_values,
+                                cl_GPUMat inputs, __global float *inputs_values,
+                                cl_GPUMat inputErrors, __global float *inputErrors_values,
+                                char updateErrors,
+                                cl_GPUMat outputErrors, __global float *outputErrors_values,
+                                cl_GPUMat outputWeightsErrors, __global float *outputWeightsErrors_values,
+                                cl_GPUMat outputGradients, __global float *outputGradients_values) {
+	for(unsigned int b = 0; b < batchSize; b++) {
+		__global float *batchInput_values = inputs_values + b * inputs.rows * inputs.cols;
+		__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
+		__global float *batchOutErrors_values = outputErrors_values + b * outputErrors.rows * outputErrors.cols;
+		__global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols;
+		__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
+
+		cl_GPUMat inputsT = inputs;
+		inputsT.transposed = true;
+		mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
+
+		// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
+		// clm_matrixAddMatrix(linear->bias, gradient);
+
+		if(updateErrors) {
+			cl_GPUMat weightsT = weights;
+			weightsT.transposed = true;
+			mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
+		}
+	}
+}