Improve some code, Add memory leak
This commit is contained in:
parent
8240f7c01b
commit
2f39db3f52
8
Makefile
8
Makefile
@ -2,10 +2,14 @@ SRC=src
|
||||
BUILD=build
|
||||
CFLAGS=-Wall -g
|
||||
|
||||
CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c
|
||||
CLM_SOURCES_CPU=$(CLM_SOURCES) $(SRC)/clm_gpu_cpu.c
|
||||
CLM_SOURCES_OPENCL=$(CLM_SOURCES) $(SRC)/clm_gpu_opencl.c
|
||||
|
||||
.PHONY: all
|
||||
all:
|
||||
mkdir -p $(BUILD)
|
||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
|
||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(CLM_SOURCES_OPENCL)
|
||||
|
||||
.PHONY: run
|
||||
run: all
|
||||
@ -14,7 +18,7 @@ run: all
|
||||
.PHONY: cl
|
||||
cl:
|
||||
mkdir -p $(BUILD)
|
||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
|
||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(CLM_SOURCES_OPENCL)
|
||||
|
||||
.PHONY: cl_run
|
||||
cl_run: cl
|
||||
|
11
src/cl.c
11
src/cl.c
@ -3,6 +3,7 @@
|
||||
#include <math.h>
|
||||
|
||||
#include "clm.h"
|
||||
#include "util.h"
|
||||
|
||||
#include <CL/cl.h>
|
||||
#include <stdbool.h>
|
||||
@ -15,16 +16,6 @@ typedef struct __attribute__((packed)) {
|
||||
cl_char transposed;
|
||||
} cl_GPUMat;
|
||||
|
||||
char *loadFile(const char *path) {
|
||||
FILE *file = fopen(path, "r");
|
||||
fseek(file, 0, SEEK_END);
|
||||
size_t length = ftell(file);
|
||||
fseek(file, 0, SEEK_SET);
|
||||
char *buffer = calloc(1, length + 1);
|
||||
fread(buffer, length, 1, file);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
int main() {
|
||||
// Connect to a compute device
|
||||
int useGPU = true;
|
||||
|
21
src/clm.c
21
src/clm.c
@ -10,7 +10,7 @@ const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
|
||||
|
||||
clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
|
||||
printf("CREATING MATRIX\n");
|
||||
clm_Matrix mat;
|
||||
clm_Matrix mat = {0};
|
||||
mat.rows = rows;
|
||||
mat.cols = cols;
|
||||
mat.values = calloc(rows * cols, sizeof(float));
|
||||
@ -98,7 +98,7 @@ clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
|
||||
}
|
||||
|
||||
clm_Matrix clm_matrixTranspose(clm_Matrix mat) {
|
||||
clm_Matrix tr;
|
||||
clm_Matrix tr = {0};
|
||||
tr.cols = mat.rows;
|
||||
tr.rows = mat.cols;
|
||||
tr.values = mat.values;
|
||||
@ -183,7 +183,7 @@ clm_Matrix clm_matrixFromArray(float *array, unsigned int length) {
|
||||
}
|
||||
|
||||
clm_Matrix clm_matrixWrapArray(float *array, unsigned int length) {
|
||||
clm_Matrix mat;
|
||||
clm_Matrix mat = {0};
|
||||
mat.rows = length;
|
||||
mat.cols = 1;
|
||||
mat.values = array;
|
||||
@ -195,7 +195,7 @@ bool clm_matrixIsInvalid(clm_Matrix mat) {
|
||||
}
|
||||
|
||||
clm_Vector clm_vectorCreate(unsigned int length) {
|
||||
clm_Vector vector;
|
||||
clm_Vector vector = {0};
|
||||
vector.length = length;
|
||||
vector.values = calloc(length, sizeof(float));
|
||||
return vector;
|
||||
@ -206,13 +206,13 @@ bool clm_vectorIsInvalid(clm_Vector vec) {
|
||||
}
|
||||
|
||||
clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
|
||||
clm_Linear linear;
|
||||
clm_Linear linear = {0};
|
||||
linear.weights = clm_createMatrixRandom(outputs, inputs);
|
||||
linear.bias = clm_createMatrixRandom(outputs, 1);
|
||||
|
||||
// linear.output = clm_createMatrix(outputs, 1);
|
||||
linear.error = clm_createMatrix(outputs, 1);
|
||||
linear.weightsError = clm_createMatrix(outputs, inputs);
|
||||
// linear.error = clm_createMatrix(outputs, 1);
|
||||
// linear.weightsError = clm_createMatrix(outputs, inputs);
|
||||
return linear;
|
||||
}
|
||||
|
||||
@ -221,8 +221,15 @@ clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate,
|
||||
|
||||
for(unsigned int i = 0; i < numLayers; i++) {
|
||||
layers[i].output = calloc(batchSize, sizeof(clm_Matrix));
|
||||
layers[i].error = calloc(batchSize, sizeof(clm_Matrix));
|
||||
layers[i].weightsError = calloc(batchSize, sizeof(clm_Matrix));
|
||||
layers[i].gradient = calloc(batchSize, sizeof(clm_Matrix));
|
||||
|
||||
for(unsigned int j = 0; j < batchSize; j++) {
|
||||
layers[i].output[j] = clm_createMatrix(layers[i].weights.rows, 1);
|
||||
layers[i].error[j] = clm_createMatrix(layers[i].weights.rows, 1);
|
||||
layers[i].weightsError[j] = clm_createMatrix(layers[i].weights.rows, layers[i].weights.cols);
|
||||
layers[i].gradient[j] = clm_createMatrix(layers[i].weights.rows, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -23,11 +23,16 @@ typedef struct {
|
||||
clm_Matrix weights;
|
||||
clm_Matrix bias;
|
||||
clm_Matrix *output;
|
||||
clm_Matrix error;
|
||||
clm_Matrix weightsError;
|
||||
clm_Matrix *error;
|
||||
clm_Matrix *weightsError;
|
||||
clm_Matrix *gradient;
|
||||
clm_NativeBuf *nativeWeights;
|
||||
clm_NativeBuf *nativeBias;
|
||||
clm_NativeBuf *nativeOutput;
|
||||
clm_NativeBuf *nativeInputError;
|
||||
clm_NativeBuf *nativeOutputError;
|
||||
clm_NativeBuf *nativeWeightsError;
|
||||
clm_NativeBuf *nativeGradient;
|
||||
} clm_Linear;
|
||||
|
||||
typedef struct {
|
||||
|
@ -7,5 +7,6 @@ int clm_gpuInit();
|
||||
void clm_gpuDestroy();
|
||||
|
||||
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
|
||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
|
||||
|
||||
#endif
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "clm.h"
|
||||
#include "clm_gpu.h"
|
||||
|
||||
#include <stdio.h>
|
||||
@ -23,3 +24,30 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
|
||||
clm_matrixSigmoid(outputs[b]);
|
||||
}
|
||||
}
|
||||
|
||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
|
||||
for(unsigned int b = 0; b < batchSize; b++) {
|
||||
// clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
|
||||
// clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
|
||||
// clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
|
||||
// clm_Matrix error = linear.error;
|
||||
|
||||
// GPU step 1
|
||||
clm_matrixDSigmoid(clm_matrixCopy(outputs[b], outputGradients[b])); // dsig(yhat)
|
||||
clm_matrixMultiplyMatrixElements(outputGradients[b], inputErrors[b]); // (yhat - y) . dsig(yhat)
|
||||
clm_matrixMultiplyScalar(outputGradients[b], learnRate);
|
||||
|
||||
// GPU Step 2
|
||||
clm_Matrix inputT = clm_matrixTranspose(inputs[b]);
|
||||
clm_matrixMultiplyMatrix(outputGradients[b], inputT, outputWeightsErrors[b]);
|
||||
|
||||
// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
|
||||
// clm_matrixAddMatrix(linear->bias, gradient);
|
||||
|
||||
if(updateErrors) {
|
||||
// GPU Step 2 extended (opt)
|
||||
clm_Matrix weightsT = clm_matrixTranspose(linear->weights);
|
||||
clm_matrixMultiplyMatrix(weightsT, inputErrors[b], outputErrors[b]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,8 @@
|
||||
#include "clm_gpu.h"
|
||||
#include <math.h>
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 200
|
||||
#include "clm_util.h"
|
||||
|
||||
#include <CL/cl_platform.h>
|
||||
|
||||
#include <CL/cl.h>
|
||||
@ -12,14 +13,18 @@
|
||||
static cl_context context;
|
||||
static cl_device_id deviceID;
|
||||
static cl_command_queue queue;
|
||||
static cl_kernel kernel;
|
||||
static cl_kernel kernelLinearForward;
|
||||
static cl_kernel kernelLinearBackprop1;
|
||||
static cl_kernel kernelLinearBackprop2;
|
||||
static size_t kernelLinearForwardLocal;
|
||||
static size_t kernelLinearBackprop1Local;
|
||||
static size_t kernelLinearBackprop2Local;
|
||||
|
||||
struct clm_NativeBuf {
|
||||
cl_mem mem;
|
||||
};
|
||||
|
||||
typedef struct __attribute__((packed)) {
|
||||
|
||||
cl_uint rows;
|
||||
cl_uint cols;
|
||||
cl_char transposed;
|
||||
@ -28,16 +33,6 @@ typedef struct __attribute__((packed)) {
|
||||
#define gpuMat(mat) \
|
||||
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
|
||||
|
||||
char *loadFile(const char *path) {
|
||||
FILE *file = fopen(path, "r");
|
||||
fseek(file, 0, SEEK_END);
|
||||
size_t length = ftell(file);
|
||||
fseek(file, 0, SEEK_SET);
|
||||
char *buffer = calloc(1, length + 1);
|
||||
fread(buffer, length, 1, file);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
int clm_gpuInit() {
|
||||
// Connect to a compute device
|
||||
int useGPU = true;
|
||||
@ -47,7 +42,12 @@ int clm_gpuInit() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
char *buffer = loadFile("src/mat.cl");
|
||||
char *buffer = clm_loadFile("src/mat.cl");
|
||||
if(!buffer) {
|
||||
printf("Failed to load mat kernel\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("%s", buffer);
|
||||
|
||||
context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
|
||||
@ -76,9 +76,19 @@ int clm_gpuInit() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
kernel = clCreateKernel(program, "linear_forward", &err);
|
||||
if(!kernel) {
|
||||
printf("Failed to create kernel\n");
|
||||
kernelLinearForward = clCreateKernel(program, "linear_forward", &err);
|
||||
kernelLinearBackprop1 = clCreateKernel(program, "linear_backprop_1", &err);
|
||||
kernelLinearBackprop2 = clCreateKernel(program, "linear_backprop_2", &err);
|
||||
if(!kernelLinearForward || !kernelLinearBackprop1 || !kernelLinearBackprop2) {
|
||||
printf("Failed to create kernels: %d\n", err);
|
||||
return 1;
|
||||
}
|
||||
|
||||
err = clGetKernelWorkGroupInfo(kernelLinearForward, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearForwardLocal), &kernelLinearForwardLocal, NULL);
|
||||
err |= clGetKernelWorkGroupInfo(kernelLinearBackprop1, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearBackprop1Local), &kernelLinearBackprop1Local, NULL);
|
||||
err |= clGetKernelWorkGroupInfo(kernelLinearBackprop2, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(kernelLinearBackprop2Local), &kernelLinearBackprop2Local, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to get work group sizes\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -88,13 +98,13 @@ int clm_gpuInit() {
|
||||
void clm_gpuDestroy() {
|
||||
}
|
||||
|
||||
static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nativeBuf) {
|
||||
static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
|
||||
cl_int err;
|
||||
|
||||
if(!nativeBuf->mem) {
|
||||
cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
|
||||
cl_mem mat_values = clCreateBuffer(context, flags, count * sizeof(float) * mat.rows * mat.cols, NULL, &err);
|
||||
if(!mat_values) {
|
||||
printf("Failed to alloc buffer: %d\n", err);
|
||||
printf("Failed to alloc buffer: %s\n", clm_clErrorToString(err));
|
||||
return NULL;
|
||||
}
|
||||
nativeBuf->mem = mat_values;
|
||||
@ -103,27 +113,27 @@ static cl_mem allocGPUMat(cl_GPUMat mat, unsigned int count, clm_NativeBuf *nati
|
||||
return nativeBuf->mem;
|
||||
}
|
||||
|
||||
static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
|
||||
static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
|
||||
cl_int err;
|
||||
cl_mem mem = allocGPUMat(gpuMat, 1, nativeBuf);
|
||||
cl_mem mem = allocGPUMat(gpuMat, 1, flags, nativeBuf);
|
||||
|
||||
err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue write: %d\n", err);
|
||||
printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return mem;
|
||||
}
|
||||
|
||||
static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
|
||||
static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *mats, cl_mem_flags flags, clm_NativeBuf *nativeBuf) {
|
||||
cl_int err;
|
||||
cl_mem mem = allocGPUMat(gpuMat, numMats, nativeBuf);
|
||||
cl_mem mem = allocGPUMat(gpuMat, numMats, flags, nativeBuf);
|
||||
|
||||
for(unsigned int i = 0; i < numMats; i++) {
|
||||
err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * sizeof(float) * gpuMat.rows * gpuMat.cols, sizeof(float) * gpuMat.rows * gpuMat.cols, mats[i].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue write: %d\n", err);
|
||||
printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -131,6 +141,19 @@ static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *m
|
||||
return mem;
|
||||
}
|
||||
|
||||
static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
|
||||
cl_int err;
|
||||
cl_mem mem = nativeBuf->mem;
|
||||
|
||||
for(unsigned int i = 0; i < numMats; i++) {
|
||||
err = clEnqueueReadBuffer(queue, mem, CL_TRUE, i * sizeof(float) * mat.rows * mat.cols, sizeof(float) * mat.rows * mat.cols, mats[i].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
clm_NativeBuf nativeInput;
|
||||
|
||||
// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
|
||||
@ -152,32 +175,33 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
|
||||
|
||||
cl_int err;
|
||||
|
||||
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, &nativeInput);
|
||||
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
|
||||
cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
|
||||
// TODO: make sure to always alloc nn.batchSize, not batchSize
|
||||
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
|
||||
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
|
||||
cl_mem matBias_values = writeGPUMat(matBias, linear->bias, CL_MEM_READ_ONLY, linear->nativeBias);
|
||||
if(!matInput_values || !matWeights_values || !matBias_values) {
|
||||
// linear->output = INVALID_MATRIX;
|
||||
return;
|
||||
}
|
||||
|
||||
cl_mem matOut_values = allocGPUMat(matOut, batchSize, linear->nativeOutput);
|
||||
cl_mem matOut_values = allocGPUMat(matOut, batchSize, CL_MEM_READ_WRITE, linear->nativeOutput);
|
||||
if(!matOut_values) {
|
||||
// linear->output = INVALID_MATRIX;
|
||||
return;
|
||||
}
|
||||
|
||||
err = 0;
|
||||
err |= clSetKernelArg(kernel, 0, sizeof(cl_uint), &batchSize);
|
||||
err |= clSetKernelArg(kernel, 1, sizeof(matInput), &matInput);
|
||||
err |= clSetKernelArg(kernel, 2, sizeof(matInput_values), &matInput_values);
|
||||
err |= clSetKernelArg(kernel, 3, sizeof(matWeights), &matWeights);
|
||||
err |= clSetKernelArg(kernel, 4, sizeof(matWeights_values), &matWeights_values);
|
||||
err |= clSetKernelArg(kernel, 5, sizeof(matBias), &matBias);
|
||||
err |= clSetKernelArg(kernel, 6, sizeof(matBias_values), &matBias_values);
|
||||
err |= clSetKernelArg(kernel, 7, sizeof(matOut), &matOut);
|
||||
err |= clSetKernelArg(kernel, 8, sizeof(matOut_values), &matOut_values);
|
||||
err |= clSetKernelArg(kernelLinearForward, 0, sizeof(cl_uint), &batchSize);
|
||||
err |= clSetKernelArg(kernelLinearForward, 1, sizeof(matInput), &matInput);
|
||||
err |= clSetKernelArg(kernelLinearForward, 2, sizeof(matInput_values), &matInput_values);
|
||||
err |= clSetKernelArg(kernelLinearForward, 3, sizeof(matWeights), &matWeights);
|
||||
err |= clSetKernelArg(kernelLinearForward, 4, sizeof(matWeights_values), &matWeights_values);
|
||||
err |= clSetKernelArg(kernelLinearForward, 5, sizeof(matBias), &matBias);
|
||||
err |= clSetKernelArg(kernelLinearForward, 6, sizeof(matBias_values), &matBias_values);
|
||||
err |= clSetKernelArg(kernelLinearForward, 7, sizeof(matOut), &matOut);
|
||||
err |= clSetKernelArg(kernelLinearForward, 8, sizeof(matOut_values), &matOut_values);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to set kernel args: %d\n", err);
|
||||
printf("Failed to set kernel args (1): %d\n", err);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -185,25 +209,121 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
|
||||
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
|
||||
printf("INFO: %s\n", info);*/
|
||||
|
||||
size_t local;
|
||||
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to get work group size\n");
|
||||
return;
|
||||
}
|
||||
|
||||
size_t global = ceil((float) workSize / local) * local;
|
||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
||||
size_t global = ceil((float) workSize / kernelLinearForwardLocal) * kernelLinearForwardLocal;
|
||||
err = clEnqueueNDRangeKernel(queue, kernelLinearForward, 1, NULL, &global, &kernelLinearForwardLocal, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to enqueue: %d\n", err);
|
||||
return;
|
||||
}
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
|
||||
err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
|
||||
/*err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output[0].values, 0, NULL, NULL);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to read from buffer\n");
|
||||
return;
|
||||
}
|
||||
}*/
|
||||
|
||||
readGPUMats(matOut, batchSize, outputs, linear->nativeOutput);
|
||||
}
|
||||
|
||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
|
||||
if(batchSize == 0) return;
|
||||
|
||||
if(!linear->nativeInputError) {
|
||||
linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
|
||||
linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
|
||||
linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
|
||||
linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
|
||||
linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
|
||||
linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
|
||||
linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
|
||||
}
|
||||
|
||||
cl_GPUMat matInput = gpuMat(inputs[0]);
|
||||
cl_GPUMat matWeights = gpuMat(linear->weights);
|
||||
cl_GPUMat matOutput = gpuMat(outputs[0]);
|
||||
cl_GPUMat matInputErrors = gpuMat(inputErrors[0]);
|
||||
cl_GPUMat matOutputErrors = !updateErrors ? (cl_GPUMat){0} : (cl_GPUMat) gpuMat(outputErrors[0]);
|
||||
cl_GPUMat matOutputWeightsErrors = gpuMat(outputWeightsErrors[0]);
|
||||
cl_GPUMat matOutputGradients = gpuMat(outputGradients[0]);
|
||||
|
||||
cl_int err;
|
||||
|
||||
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, &nativeInput);
|
||||
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
|
||||
cl_mem matInputErrors_values = writeGPUMats(matInputErrors, batchSize, inputErrors, CL_MEM_READ_ONLY, linear->nativeInputError);
|
||||
cl_mem matOutput_values = writeGPUMats(matOutput, batchSize, outputs, CL_MEM_READ_WRITE, linear->nativeOutput);
|
||||
if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) {
|
||||
printf("Failed to write GPU mats\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// cl_mem matOutput_values = allocGPUMat(matOutput, batchSize, CL_MEM_READ_ONLY, linear->nativeOutput);
|
||||
cl_mem matOutputErrors_values = !updateErrors ? NULL : allocGPUMat(matOutputErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeOutputError);
|
||||
cl_mem matOutputWeightsErrors_values = allocGPUMat(matOutputWeightsErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeWeightsError);
|
||||
cl_mem matOutputGradients_values = allocGPUMat(matOutputGradients, batchSize, CL_MEM_READ_WRITE, linear->nativeGradient);
|
||||
if(!matOutputWeightsErrors_values || !matOutputGradients_values) {
|
||||
printf("Failed to alloc GPU mats\n");
|
||||
return;
|
||||
}
|
||||
|
||||
err = 0;
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 0, sizeof(cl_uint), &batchSize);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 1, sizeof(cl_float), &learnRate);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 2, sizeof(matWeights), &matWeights);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 3, sizeof(matWeights_values), &matWeights_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 4, sizeof(matOutput), &matOutput);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 5, sizeof(matOutput_values), &matOutput_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 6, sizeof(matInputErrors), &matInputErrors);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 7, sizeof(matInputErrors_values), &matInputErrors_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 8, sizeof(matOutputGradients), &matOutputGradients);
|
||||
err |= clSetKernelArg(kernelLinearBackprop1, 9, sizeof(matOutputGradients_values), &matOutputGradients_values);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to set kernel args (2): %d\n", err);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t step1WorkSize = matOutputGradients.rows * matOutputGradients.cols;
|
||||
size_t step1Global = ceil((float) step1WorkSize / kernelLinearBackprop1Local) * kernelLinearBackprop1Local;
|
||||
clEnqueueNDRangeKernel(queue, kernelLinearBackprop1, 1, NULL, &step1Global, &kernelLinearBackprop1Local, 0, NULL, NULL);
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
|
||||
err = 0;
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 0, sizeof(cl_uint), &batchSize);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 1, sizeof(matWeights), &matWeights);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 2, sizeof(matWeights_values), &matWeights_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 3, sizeof(matInput), &matInput);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 4, sizeof(matInput_values), &matInput_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 5, sizeof(matInputErrors), &matInputErrors);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 6, sizeof(matInputErrors_values), &matInputErrors_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 7, sizeof(cl_char), &updateErrors);
|
||||
|
||||
if(updateErrors) {
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 8, sizeof(matOutputErrors), &matOutputErrors);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 9, sizeof(matOutputErrors_values), &matOutputErrors_values);
|
||||
}
|
||||
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 10, sizeof(matOutputWeightsErrors), &matOutputWeightsErrors);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 11, sizeof(matOutputWeightsErrors_values), &matOutputWeightsErrors_values);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 12, sizeof(matOutputGradients), &matOutputGradients);
|
||||
err |= clSetKernelArg(kernelLinearBackprop2, 13, sizeof(matOutputGradients_values), &matOutputGradients_values);
|
||||
if(err != CL_SUCCESS) {
|
||||
printf("Failed to set kernel args (3): %d\n", err);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t step2WorkSize = matOutputWeightsErrors.rows * matOutputWeightsErrors.cols;
|
||||
size_t step2Global = ceil((float) step1WorkSize / kernelLinearBackprop2Local) * kernelLinearBackprop2Local;
|
||||
clEnqueueNDRangeKernel(queue, kernelLinearBackprop2, 1, NULL, &step2Global, &kernelLinearBackprop2Local, 0, NULL, NULL);
|
||||
|
||||
clFlush(queue);
|
||||
clFinish(queue);
|
||||
|
||||
readGPUMats(matOutputGradients, batchSize, outputGradients, linear->nativeGradient);
|
||||
readGPUMats(matOutputWeightsErrors, batchSize, outputWeightsErrors, linear->nativeWeightsError);
|
||||
if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputError);
|
||||
}
|
||||
|
195
src/clm_util.c
Normal file
195
src/clm_util.c
Normal file
@ -0,0 +1,195 @@
|
||||
#include "clm_util.h"
|
||||
|
||||
#include <errno.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
char *clm_loadFile(const char *path) {
|
||||
FILE *file = fopen(path, "r");
|
||||
if(!file) {
|
||||
printf("fopen failed: %s\n", strerror(errno));
|
||||
goto error;
|
||||
}
|
||||
|
||||
if(fseek(file, 0, SEEK_END) < 0) {
|
||||
printf("fseek failed: %s\n", strerror(errno));
|
||||
goto cleanup_file;
|
||||
}
|
||||
|
||||
long length = ftell(file);
|
||||
if(length < 0) {
|
||||
printf("ftell failed: %s\n", strerror(errno));
|
||||
goto cleanup_file;
|
||||
}
|
||||
|
||||
if(fseek(file, 0, SEEK_SET) < 0) {
|
||||
printf("fseek failed: %s\n", strerror(errno));
|
||||
goto cleanup_file;
|
||||
}
|
||||
|
||||
char *buffer = calloc(1, length + 1);
|
||||
if(!buffer) {
|
||||
printf("calloc failed: %s\n", strerror(errno));
|
||||
goto cleanup_file;
|
||||
}
|
||||
|
||||
size_t nRead = fread(buffer, length, 1, file);
|
||||
if(nRead != 1) {
|
||||
printf("fread failed: (nRead = %zu, err %d, eof %d)\n", nRead, ferror(file), feof(file));
|
||||
goto cleanup_buffer;
|
||||
}
|
||||
|
||||
fclose(file);
|
||||
return buffer;
|
||||
|
||||
cleanup_buffer:
|
||||
free(buffer);
|
||||
cleanup_file:
|
||||
fclose(file);
|
||||
error:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char *clm_clErrorToString(cl_int error) {
|
||||
// Source: https://stackoverflow.com/questions/24326432/convenient-way-to-show-opencl-error-codes
|
||||
switch(error) {
|
||||
// run-time and JIT compiler errors
|
||||
case 0:
|
||||
return "CL_SUCCESS";
|
||||
case -1:
|
||||
return "CL_DEVICE_NOT_FOUND";
|
||||
case -2:
|
||||
return "CL_DEVICE_NOT_AVAILABLE";
|
||||
case -3:
|
||||
return "CL_COMPILER_NOT_AVAILABLE";
|
||||
case -4:
|
||||
return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||
case -5:
|
||||
return "CL_OUT_OF_RESOURCES";
|
||||
case -6:
|
||||
return "CL_OUT_OF_HOST_MEMORY";
|
||||
case -7:
|
||||
return "CL_PROFILING_INFO_NOT_AVAILABLE";
|
||||
case -8:
|
||||
return "CL_MEM_COPY_OVERLAP";
|
||||
case -9:
|
||||
return "CL_IMAGE_FORMAT_MISMATCH";
|
||||
case -10:
|
||||
return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
|
||||
case -11:
|
||||
return "CL_BUILD_PROGRAM_FAILURE";
|
||||
case -12:
|
||||
return "CL_MAP_FAILURE";
|
||||
case -13:
|
||||
return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
|
||||
case -14:
|
||||
return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
|
||||
case -15:
|
||||
return "CL_COMPILE_PROGRAM_FAILURE";
|
||||
case -16:
|
||||
return "CL_LINKER_NOT_AVAILABLE";
|
||||
case -17:
|
||||
return "CL_LINK_PROGRAM_FAILURE";
|
||||
case -18:
|
||||
return "CL_DEVICE_PARTITION_FAILED";
|
||||
case -19:
|
||||
return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
|
||||
|
||||
// compile-time errors
|
||||
case -30:
|
||||
return "CL_INVALID_VALUE";
|
||||
case -31:
|
||||
return "CL_INVALID_DEVICE_TYPE";
|
||||
case -32:
|
||||
return "CL_INVALID_PLATFORM";
|
||||
case -33:
|
||||
return "CL_INVALID_DEVICE";
|
||||
case -34:
|
||||
return "CL_INVALID_CONTEXT";
|
||||
case -35:
|
||||
return "CL_INVALID_QUEUE_PROPERTIES";
|
||||
case -36:
|
||||
return "CL_INVALID_COMMAND_QUEUE";
|
||||
case -37:
|
||||
return "CL_INVALID_HOST_PTR";
|
||||
case -38:
|
||||
return "CL_INVALID_MEM_OBJECT";
|
||||
case -39:
|
||||
return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
|
||||
case -40:
|
||||
return "CL_INVALID_IMAGE_SIZE";
|
||||
case -41:
|
||||
return "CL_INVALID_SAMPLER";
|
||||
case -42:
|
||||
return "CL_INVALID_BINARY";
|
||||
case -43:
|
||||
return "CL_INVALID_BUILD_OPTIONS";
|
||||
case -44:
|
||||
return "CL_INVALID_PROGRAM";
|
||||
case -45:
|
||||
return "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||
case -46:
|
||||
return "CL_INVALID_KERNEL_NAME";
|
||||
case -47:
|
||||
return "CL_INVALID_KERNEL_DEFINITION";
|
||||
case -48:
|
||||
return "CL_INVALID_KERNEL";
|
||||
case -49:
|
||||
return "CL_INVALID_ARG_INDEX";
|
||||
case -50:
|
||||
return "CL_INVALID_ARG_VALUE";
|
||||
case -51:
|
||||
return "CL_INVALID_ARG_SIZE";
|
||||
case -52:
|
||||
return "CL_INVALID_KERNEL_ARGS";
|
||||
case -53:
|
||||
return "CL_INVALID_WORK_DIMENSION";
|
||||
case -54:
|
||||
return "CL_INVALID_WORK_GROUP_SIZE";
|
||||
case -55:
|
||||
return "CL_INVALID_WORK_ITEM_SIZE";
|
||||
case -56:
|
||||
return "CL_INVALID_GLOBAL_OFFSET";
|
||||
case -57:
|
||||
return "CL_INVALID_EVENT_WAIT_LIST";
|
||||
case -58:
|
||||
return "CL_INVALID_EVENT";
|
||||
case -59:
|
||||
return "CL_INVALID_OPERATION";
|
||||
case -60:
|
||||
return "CL_INVALID_GL_OBJECT";
|
||||
case -61:
|
||||
return "CL_INVALID_BUFFER_SIZE";
|
||||
case -62:
|
||||
return "CL_INVALID_MIP_LEVEL";
|
||||
case -63:
|
||||
return "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||
case -64:
|
||||
return "CL_INVALID_PROPERTY";
|
||||
case -65:
|
||||
return "CL_INVALID_IMAGE_DESCRIPTOR";
|
||||
case -66:
|
||||
return "CL_INVALID_COMPILER_OPTIONS";
|
||||
case -67:
|
||||
return "CL_INVALID_LINKER_OPTIONS";
|
||||
case -68:
|
||||
return "CL_INVALID_DEVICE_PARTITION_COUNT";
|
||||
|
||||
// extension errors
|
||||
case -1000:
|
||||
return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
|
||||
case -1001:
|
||||
return "CL_PLATFORM_NOT_FOUND_KHR";
|
||||
case -1002:
|
||||
return "CL_INVALID_D3D10_DEVICE_KHR";
|
||||
case -1003:
|
||||
return "CL_INVALID_D3D10_RESOURCE_KHR";
|
||||
case -1004:
|
||||
return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
|
||||
case -1005:
|
||||
return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
|
||||
default:
|
||||
return "Unknown OpenCL error";
|
||||
}
|
||||
}
|
11
src/clm_util.h
Normal file
11
src/clm_util.h
Normal file
@ -0,0 +1,11 @@
|
||||
#ifndef _UTIL_H_
|
||||
#define _UTIL_H_
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 200
|
||||
#include <CL/cl_platform.h>
|
||||
|
||||
char *clm_loadFile(const char *path);
|
||||
|
||||
const char *clm_clErrorToString(cl_int error);
|
||||
|
||||
#endif
|
29
src/cltest.c
29
src/cltest.c
@ -37,7 +37,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
|
||||
unsigned int batchSize = numElements - b * nn.batchSize;
|
||||
if(batchSize > nn.batchSize) batchSize = nn.batchSize;
|
||||
|
||||
// printf("Batch %d (size %d)\n", b, batchSize);
|
||||
printf("Batch %d (size %d)\n", b, batchSize);
|
||||
|
||||
for(unsigned int i = 0; i < batchSize; i++) {
|
||||
clm_Vector input = inputs[b * nn.batchSize + i];
|
||||
@ -53,7 +53,26 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
|
||||
currentXs = nn.layers[i].output;
|
||||
}
|
||||
|
||||
clm_Linear lastLayer = nn.layers[nn.numLayers - 1];
|
||||
for(unsigned int b = 0; b < batchSize; b++) {
|
||||
// Error of last layer = y - yhat
|
||||
clm_matrixCopy(batchOutputs[b], lastLayer.error[b]); // lastLayer.error = y
|
||||
clm_matrixSubtractMatrix(lastLayer.error[b], lastLayer.output[b]); // lastLayer.error -= yhat
|
||||
}
|
||||
|
||||
for(int i = nn.numLayers - 1; i >= 0; i--) {
|
||||
clm_Linear layer = nn.layers[i];
|
||||
clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
|
||||
clm_Matrix *outputsOfThisLayer = layer.output;
|
||||
clm_linearBackprop(&layer, nn.learnRate, batchSize, inputsToThisLayer, outputsOfThisLayer, layer.error, i > 0, i == 0 ? NULL : nn.layers[i - 1].error, layer.weightsError, layer.gradient);
|
||||
|
||||
for(unsigned int b = 0; b < batchSize; b++) {
|
||||
clm_matrixAddMatrix(layer.weights, layer.weightsError[b]);
|
||||
clm_matrixAddMatrix(layer.bias, layer.gradient[b]);
|
||||
}
|
||||
}
|
||||
|
||||
/*for(int i = nn.numLayers - 1; i >= 0; i--) {
|
||||
clm_Linear layer = nn.layers[i];
|
||||
clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
|
||||
clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
|
||||
@ -76,7 +95,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
|
||||
|
||||
clm_matrixAddMatrix(layer.weights, layer.weightsError);
|
||||
clm_matrixAddMatrix(layer.bias, gradient);
|
||||
}
|
||||
}*/
|
||||
}
|
||||
|
||||
free(batchInputs);
|
||||
@ -190,7 +209,7 @@ int main() {
|
||||
clm_Linear layers[] = {
|
||||
clm_linearCreateRandom(i, h),
|
||||
clm_linearCreateRandom(h, o)};
|
||||
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 100);
|
||||
clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 1000);
|
||||
|
||||
for(unsigned int epoch = 0; epoch < 1; epoch++) {
|
||||
printf("Epoch %u\n", epoch);
|
||||
@ -220,7 +239,7 @@ int main() {
|
||||
predDigit = j;
|
||||
}
|
||||
}
|
||||
if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
|
||||
// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
|
||||
|
||||
unsigned int actDigit = 0;
|
||||
float maxA = -1;
|
||||
@ -231,7 +250,7 @@ int main() {
|
||||
actDigit = j;
|
||||
}
|
||||
}
|
||||
if(idx < 100) printf("Actual: %u\n", actDigit);
|
||||
// if(idx < 100) printf("Actual: %u\n", actDigit);
|
||||
// printf("\n");
|
||||
|
||||
if(predDigit == actDigit) correct++;
|
||||
|
98
src/mat.cl
98
src/mat.cl
@ -6,12 +6,23 @@ typedef struct __attribute__((packed)) {
|
||||
|
||||
#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
|
||||
|
||||
#define matrixGetIJ(mat, idx, i, j) \
|
||||
{ \
|
||||
if(!mat.transposed) { \
|
||||
i = idx / mat.cols; \
|
||||
j = idx % mat.cols; \
|
||||
} else { \
|
||||
i = idx / mat.rows; \
|
||||
j = idx % mat.rows; \
|
||||
} \
|
||||
}
|
||||
|
||||
void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
|
||||
uint idx = get_global_id(0);
|
||||
if(idx >= matOut.rows * matOut.cols) return;
|
||||
|
||||
uint i = idx / matOut.cols;
|
||||
uint j = idx % matOut.cols;
|
||||
uint i, j;
|
||||
matrixGetIJ(matOut, idx, i, j);
|
||||
|
||||
float sum = 0;
|
||||
for(unsigned int k = 0; k < matA.cols; k++) {
|
||||
@ -34,7 +45,40 @@ void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
|
||||
mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
|
||||
}
|
||||
|
||||
__kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
|
||||
void mat_dSigmoid(cl_GPUMat mat, __global float *mat_values) {
|
||||
uint idx = get_global_id(0);
|
||||
if(idx >= mat.rows * mat.cols) return;
|
||||
|
||||
float v = mat_values[idx];
|
||||
mat_values[idx] = v * (1 - v);
|
||||
}
|
||||
|
||||
void mat_copy(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
|
||||
uint idx = get_global_id(0);
|
||||
if(idx >= mat.rows * mat.cols) return;
|
||||
|
||||
other_values[idx] = mat_values[idx];
|
||||
}
|
||||
|
||||
void mat_multiply_elements(cl_GPUMat mat, __global float *mat_values, cl_GPUMat other, __global float *other_values) {
|
||||
uint idx = get_global_id(0);
|
||||
if(idx >= mat.rows * mat.cols) return;
|
||||
|
||||
other_values[idx] *= mat_values[idx];
|
||||
}
|
||||
|
||||
void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar) {
|
||||
uint idx = get_global_id(0);
|
||||
if(idx >= mat.rows * mat.cols) return;
|
||||
|
||||
mat_values[idx] *= scalar;
|
||||
}
|
||||
|
||||
__kernel void linear_forward(unsigned int batchSize,
|
||||
cl_GPUMat input, __global float *input_values,
|
||||
cl_GPUMat weights, __global float *weights_values,
|
||||
cl_GPUMat bias, __global float *bias_values,
|
||||
cl_GPUMat out, __global float *out_values) {
|
||||
// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
|
||||
for(unsigned int b = 0; b < batchSize; b++) {
|
||||
__global float *batchInput_values = input_values + b * input.rows * input.cols;
|
||||
@ -44,3 +88,51 @@ __kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global f
|
||||
mat_sigmoid(out, batchOut_values);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void linear_backprop_1(unsigned int batchSize, float learnRate,
|
||||
cl_GPUMat weights, __global float *weights_values,
|
||||
cl_GPUMat outputs, __global float *outputs_values,
|
||||
cl_GPUMat inputErrors, __global float *inputErrors_values,
|
||||
cl_GPUMat outputGradients, __global float *outputGradients_values) {
|
||||
|
||||
for(unsigned int b = 0; b < batchSize; b++) {
|
||||
__global float *batchOut_values = outputs_values + b * outputs.rows * outputs.cols;
|
||||
__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
|
||||
__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
|
||||
|
||||
mat_copy(outputs, batchOut_values, outputGradients, batchOutGradients_values);
|
||||
mat_dSigmoid(outputGradients, batchOutGradients_values); // dsig(yhat)
|
||||
mat_multiply_elements(outputGradients, batchOutGradients_values, inputErrors, batchInErrors_values); // (yhat - y) . dsig(yhat)
|
||||
mat_multiply_scalar(outputGradients, batchOutGradients_values, learnRate);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void linear_backprop_2(unsigned int batchSize,
|
||||
cl_GPUMat weights, __global float *weights_values,
|
||||
cl_GPUMat inputs, __global float *inputs_values,
|
||||
cl_GPUMat inputErrors, __global float *inputErrors_values,
|
||||
char updateErrors,
|
||||
cl_GPUMat outputErrors, __global float *outputErrors_values,
|
||||
cl_GPUMat outputWeightsErrors, __global float *outputWeightsErrors_values,
|
||||
cl_GPUMat outputGradients, __global float *outputGradients_values) {
|
||||
for(unsigned int b = 0; b < batchSize; b++) {
|
||||
__global float *batchInput_values = inputs_values + b * inputs.rows * inputs.cols;
|
||||
__global float *batchInErrors_values = inputErrors_values + b * inputErrors.rows * inputErrors.cols;
|
||||
__global float *batchOutErrors_values = outputErrors_values + b * outputErrors.rows * outputErrors.cols;
|
||||
__global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols;
|
||||
__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
|
||||
|
||||
cl_GPUMat inputsT = inputs;
|
||||
inputsT.transposed = true;
|
||||
mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
|
||||
|
||||
// clm_matrixAddMatrix(linear->weights, outputWeightsErrors[b]);
|
||||
// clm_matrixAddMatrix(linear->bias, gradient);
|
||||
|
||||
if(updateErrors) {
|
||||
cl_GPUMat weightsT = weights;
|
||||
weightsT.transposed = true;
|
||||
mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user