Major code refactor (WIP)
This commit is contained in:
parent
d55f7c69ba
commit
9547d86251
37
src/clm.c
37
src/clm.c
@ -206,8 +206,8 @@ bool clm_vectorIsInvalid(clm_Vector vec) {
|
|||||||
|
|
||||||
clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
|
clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
|
||||||
clm_Linear linear = {0};
|
clm_Linear linear = {0};
|
||||||
linear.weights = clm_createMatrixRandom(outputs, inputs);
|
linear.weights.matrix = clm_createMatrixRandom(outputs, inputs);
|
||||||
linear.bias = clm_createMatrixRandom(outputs, 1);
|
linear.bias.matrix = clm_createMatrixRandom(outputs, 1);
|
||||||
|
|
||||||
// linear.output = clm_createMatrix(outputs, 1);
|
// linear.output = clm_createMatrix(outputs, 1);
|
||||||
// linear.error = clm_createMatrix(outputs, 1);
|
// linear.error = clm_createMatrix(outputs, 1);
|
||||||
@ -215,33 +215,42 @@ clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
|
|||||||
return linear;
|
return linear;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static clm_NativeMatrixArray clm_createMatrixArray(unsigned int length, unsigned int rows, unsigned int cols) {
|
||||||
|
clm_NativeMatrixArray array = {0};
|
||||||
|
array.length = length;
|
||||||
|
array.matrixes = calloc(length, sizeof(clm_Matrix));
|
||||||
|
|
||||||
|
for(unsigned int i = 0; i < length; i++) {
|
||||||
|
array.matrixes[i] = clm_createMatrix(rows, cols);
|
||||||
|
}
|
||||||
|
|
||||||
|
return array;
|
||||||
|
}
|
||||||
|
|
||||||
clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize) {
|
clm_NN clm_nnCreate(unsigned int numLayers, clm_Linear *layers, float learnRate, unsigned int batchSize) {
|
||||||
clm_NN nn = {.numLayers = numLayers, .layers = layers, .learnRate = learnRate, .batchSize = batchSize};
|
clm_NN nn = {.numLayers = numLayers, .layers = layers, .learnRate = learnRate, .batchSize = batchSize};
|
||||||
|
|
||||||
for(unsigned int i = 0; i < numLayers; i++) {
|
for(unsigned int i = 0; i < numLayers; i++) {
|
||||||
layers[i].output = calloc(batchSize, sizeof(clm_Matrix));
|
clm_Matrix weights = layers[i].weights.matrix;
|
||||||
layers[i].error = calloc(batchSize, sizeof(clm_Matrix));
|
|
||||||
layers[i].weightsError = calloc(batchSize, sizeof(clm_Matrix));
|
|
||||||
layers[i].gradient = calloc(batchSize, sizeof(clm_Matrix));
|
|
||||||
|
|
||||||
for(unsigned int j = 0; j < batchSize; j++) {
|
layers[i].output = clm_createMatrixArray(batchSize, weights.rows, 1);
|
||||||
layers[i].output[j] = clm_createMatrix(layers[i].weights.rows, 1);
|
layers[i].error = clm_createMatrixArray(batchSize, weights.rows, 1);
|
||||||
layers[i].error[j] = clm_createMatrix(layers[i].weights.rows, 1);
|
layers[i].weightsError = clm_createMatrixArray(batchSize, weights.rows, weights.cols);
|
||||||
layers[i].weightsError[j] = clm_createMatrix(layers[i].weights.rows, layers[i].weights.cols);
|
layers[i].gradient = clm_createMatrixArray(batchSize, weights.rows, 1);
|
||||||
layers[i].gradient[j] = clm_createMatrix(layers[i].weights.rows, 1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return nn;
|
return nn;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: clm_freeNN
|
||||||
|
|
||||||
void clm_freeVector(clm_Vector vector) {
|
void clm_freeVector(clm_Vector vector) {
|
||||||
free(vector.values);
|
free(vector.values);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clm_freeLinear(clm_Linear linear) {
|
void clm_freeLinear(clm_Linear linear) {
|
||||||
clm_freeMatrix(linear.weights);
|
clm_freeMatrix(linear.weights.matrix);
|
||||||
clm_freeMatrix(linear.bias);
|
clm_freeMatrix(linear.bias.matrix);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clm_matrixPrint(clm_Matrix mat) {
|
void clm_matrixPrint(clm_Matrix mat) {
|
||||||
|
31
src/clm.h
31
src/clm.h
@ -22,21 +22,26 @@ typedef struct {
|
|||||||
} clm_Vector;
|
} clm_Vector;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
clm_Matrix weights;
|
clm_Matrix matrix;
|
||||||
clm_Matrix bias;
|
clm_NativeBuf *native;
|
||||||
clm_Matrix *output;
|
} clm_NativeMatrix;
|
||||||
clm_Matrix *error;
|
|
||||||
clm_Matrix *weightsError;
|
typedef struct {
|
||||||
clm_Matrix *gradient;
|
clm_Matrix *matrixes;
|
||||||
|
clm_NativeBuf *native;
|
||||||
|
unsigned int length;
|
||||||
|
} clm_NativeMatrixArray;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
clm_NativeMatrix weights;
|
||||||
|
clm_NativeMatrix bias;
|
||||||
|
clm_NativeMatrixArray output;
|
||||||
|
clm_NativeMatrixArray error;
|
||||||
|
clm_NativeMatrixArray weightsError;
|
||||||
|
clm_NativeMatrixArray gradient;
|
||||||
|
|
||||||
clm_NativeBuf *nativeWeights;
|
|
||||||
clm_NativeBuf *nativeBias;
|
|
||||||
clm_NativeBuf *nativeInput;
|
clm_NativeBuf *nativeInput;
|
||||||
clm_NativeBuf *nativeOutput;
|
clm_NativeBuf *nativeOutputErrors;
|
||||||
clm_NativeBuf *nativeInputError;
|
|
||||||
clm_NativeBuf *nativeOutputError;
|
|
||||||
clm_NativeBuf *nativeWeightsError;
|
|
||||||
clm_NativeBuf *nativeGradient;
|
|
||||||
} clm_Linear;
|
} clm_Linear;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -13,21 +13,27 @@ void clm_gpuDestroy() {}
|
|||||||
|
|
||||||
void clm_linearInit(clm_Linear *linear) {}
|
void clm_linearInit(clm_Linear *linear) {}
|
||||||
|
|
||||||
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
|
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs) {
|
||||||
|
clm_Matrix *outputs = linear->output.matrixes;
|
||||||
for(unsigned int b = 0; b < batchSize; b++) {
|
for(unsigned int b = 0; b < batchSize; b++) {
|
||||||
clm_matrixMultiplyMatrix(linear->weights, inputs[b], outputs[b]);
|
clm_matrixMultiplyMatrix(linear->weights.matrix, inputs[b], outputs[b]);
|
||||||
|
|
||||||
if(clm_matrixIsInvalid(outputs[b])) {
|
if(clm_matrixIsInvalid(outputs[b])) {
|
||||||
printf("Forward pass failed\n");
|
printf("Forward pass failed\n");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
clm_matrixAddMatrix(outputs[b], linear->bias);
|
clm_matrixAddMatrix(outputs[b], linear->bias.matrix);
|
||||||
clm_matrixSigmoid(outputs[b]);
|
clm_matrixSigmoid(outputs[b]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
|
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors) {
|
||||||
|
clm_Matrix *outputs = linear->output.matrixes;
|
||||||
|
clm_Matrix *outputGradients = linear->gradient.matrixes;
|
||||||
|
clm_Matrix *outputWeightsErrors = linear->weightsError.matrixes;
|
||||||
|
clm_Matrix *inputErrors = linear->error.matrixes;
|
||||||
|
|
||||||
for(unsigned int b = 0; b < batchSize; b++) {
|
for(unsigned int b = 0; b < batchSize; b++) {
|
||||||
// clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
|
// clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
|
||||||
// clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
|
// clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
|
||||||
@ -48,7 +54,7 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
|
|||||||
|
|
||||||
if(updateErrors) {
|
if(updateErrors) {
|
||||||
// GPU Step 2 extended (opt)
|
// GPU Step 2 extended (opt)
|
||||||
clm_Matrix weightsT = clm_matrixTranspose(linear->weights);
|
clm_Matrix weightsT = clm_matrixTranspose(linear->weights.matrix);
|
||||||
clm_matrixMultiplyMatrix(weightsT, inputErrors[b], outputErrors[b]);
|
clm_matrixMultiplyMatrix(weightsT, inputErrors[b], outputErrors[b]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,8 +6,8 @@
|
|||||||
typedef int (*clm_gpuInitFunc)(unsigned int);
|
typedef int (*clm_gpuInitFunc)(unsigned int);
|
||||||
typedef void (*clm_gpuDestroyFunc)();
|
typedef void (*clm_gpuDestroyFunc)();
|
||||||
typedef void (*clm_linearInitFunc)(clm_Linear *linear);
|
typedef void (*clm_linearInitFunc)(clm_Linear *linear);
|
||||||
typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
|
typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs);
|
||||||
typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
|
typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors);
|
||||||
|
|
||||||
static void *lib;
|
static void *lib;
|
||||||
static clm_gpuInitFunc initFunc;
|
static clm_gpuInitFunc initFunc;
|
||||||
@ -56,10 +56,10 @@ void clm_linearInit(clm_Linear *linear) {
|
|||||||
linearInitFunc(linear);
|
linearInitFunc(linear);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
|
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs) {
|
||||||
linearForwardFunc(linear, batchSize, inputs, outputs);
|
linearForwardFunc(linear, batchSize, inputs);
|
||||||
}
|
}
|
||||||
|
|
||||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
|
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors) {
|
||||||
linearBackpropFunc(linear, learnRate, batchSize, inputs, outputs, inputErrors, updateErrors, outputErrors, outputWeightsErrors, outputGradients);
|
linearBackpropFunc(linear, learnRate, batchSize, inputs, updateErrors, outputErrors);
|
||||||
}
|
}
|
||||||
|
@ -10,7 +10,7 @@ int clm_gpuInit(unsigned int mode);
|
|||||||
void clm_gpuDestroy();
|
void clm_gpuDestroy();
|
||||||
|
|
||||||
void clm_linearInit(clm_Linear *linear);
|
void clm_linearInit(clm_Linear *linear);
|
||||||
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
|
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs);
|
||||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
|
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
124
src/clm_opencl.c
124
src/clm_opencl.c
@ -32,7 +32,7 @@ typedef struct __attribute__((packed)) {
|
|||||||
} cl_GPUMat;
|
} cl_GPUMat;
|
||||||
|
|
||||||
#define gpuMat(mat) \
|
#define gpuMat(mat) \
|
||||||
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
|
((cl_GPUMat){.rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed})
|
||||||
|
|
||||||
int clm_gpuInit(unsigned int mode) {
|
int clm_gpuInit(unsigned int mode) {
|
||||||
// Connect to a compute device
|
// Connect to a compute device
|
||||||
@ -142,6 +142,33 @@ static cl_mem writeGPUMats(cl_GPUMat gpuMat, unsigned int numMats, clm_Matrix *m
|
|||||||
return mem;
|
return mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copies the host values of `matrix` into its native buffer using a blocking
// clEnqueueWriteBuffer of rows * cols floats.
//
// Returns the buffer's cl_mem on success. Returns NULL (after printing a
// message) when the matrix has no native buffer or the write cannot be
// enqueued; callers already treat a NULL result as failure.
static cl_mem writeNativeMatrix(clm_NativeMatrix matrix) {
	if(!matrix.native) {
		// The native handle may still be NULL here; do not dereference it.
		printf("Failed to enqueue write: matrix has no native buffer\n");
		return NULL;
	}

	cl_mem mem = matrix.native->mem;
	size_t numBytes = sizeof(float) * matrix.matrix.rows * matrix.matrix.cols;

	cl_int err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, numBytes, matrix.matrix.values, 0, NULL, NULL);
	if(err != CL_SUCCESS) {
		printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
		return NULL;
	}

	return mem;
}
|
||||||
|
|
||||||
|
// Copies every host matrix of `array` into the array's single native buffer.
// Matrix i is written at byte offset i * (rows * cols * sizeof(float)), with
// the dimensions taken from the first matrix, so all matrices are assumed to
// share that shape. Each write is blocking.
//
// Returns the buffer's cl_mem on success (including an empty array, where
// nothing is written). Returns NULL after printing a message if the array
// has no native buffer, has no host matrices, or a write fails.
static cl_mem writeNativeMatrixArray(clm_NativeMatrixArray array) {
	if(!array.native) {
		printf("Failed to enqueue write: matrix array has no native buffer\n");
		return NULL;
	}

	cl_mem mem = array.native->mem;

	// Nothing to upload; matrixes[0] must not be read for an empty array.
	if(array.length == 0) return mem;

	if(!array.matrixes) {
		printf("Failed to enqueue write: matrix array has no host matrices\n");
		return NULL;
	}

	clm_Matrix first = array.matrixes[0];
	size_t matLength = sizeof(float) * first.rows * first.cols;

	for(unsigned int i = 0; i < array.length; i++) {
		cl_int err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
		if(err != CL_SUCCESS) {
			printf("Failed to enqueue write: %s\n", clm_clErrorToString(err));
			return NULL;
		}
	}

	return mem;
}
|
||||||
|
|
||||||
static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
|
static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, clm_NativeBuf *nativeBuf) {
|
||||||
cl_int err;
|
cl_int err;
|
||||||
cl_mem mem = nativeBuf->mem;
|
cl_mem mem = nativeBuf->mem;
|
||||||
@ -155,25 +182,51 @@ static void readGPUMats(cl_GPUMat mat, unsigned int numMats, clm_Matrix *mats, c
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copies rows * cols floats from the matrix's native buffer back into its
// host values using a blocking clEnqueueReadBuffer.
//
// On failure (no native buffer, or the read cannot be enqueued) a message is
// printed and the host values are left as they were.
static void readNativeMatrix(clm_NativeMatrix matrix) {
	if(!matrix.native) {
		// The native handle may still be NULL here; do not dereference it.
		printf("Failed to enqueue read: matrix has no native buffer\n");
		return;
	}

	size_t numBytes = sizeof(float) * matrix.matrix.rows * matrix.matrix.cols;

	cl_int err = clEnqueueReadBuffer(queue, matrix.native->mem, CL_TRUE, 0, numBytes, matrix.matrix.values, 0, NULL, NULL);
	if(err != CL_SUCCESS) {
		printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
		return;
	}
}
|
||||||
|
|
||||||
|
// Copies every matrix of `array` from the array's native buffer back into the
// host matrices. This is the inverse of writeNativeMatrixArray and uses the
// same layout: matrix i lives at byte offset i * (rows * cols * sizeof(float)),
// with the dimensions taken from the first matrix. Each read is blocking.
//
// Previously this function had an empty loop, so results computed on the
// device were never copied back to the host.
//
// On failure a message is printed and the remaining matrices are left as
// they were.
static void readNativeMatrixArray(clm_NativeMatrixArray array) {
	// Nothing to download; matrixes[0] must not be read for an empty array.
	if(array.length == 0) return;

	if(!array.native || !array.matrixes) {
		printf("Failed to enqueue read: matrix array has no native buffer or host matrices\n");
		return;
	}

	clm_Matrix first = array.matrixes[0];
	size_t matLength = sizeof(float) * first.rows * first.cols;

	for(unsigned int i = 0; i < array.length; i++) {
		cl_int err = clEnqueueReadBuffer(queue, array.native->mem, CL_TRUE, i * matLength, matLength, array.matrixes[i].values, 0, NULL, NULL);
		if(err != CL_SUCCESS) {
			printf("Failed to enqueue read: %s\n", clm_clErrorToString(err));
			return;
		}
	}
}
|
||||||
|
|
||||||
|
// Allocates the native buffer handle for a single matrix and asks
// allocGPUMat to allocate backing storage for one matrix of its shape.
//
// If the handle itself cannot be allocated, a message is printed and
// matrix->native stays NULL. The return value of allocGPUMat is not
// inspected here.
// NOTE(review): presumably allocGPUMat stores the cl_mem in native->mem and
// reports its own failures - confirm against its definition.
static void clm_nativeAllocMatrix(clm_NativeMatrix *matrix, cl_mem_flags flags) {
	matrix->native = calloc(1, sizeof *matrix->native);
	if(!matrix->native) {
		// Do not hand a NULL handle to allocGPUMat.
		printf("Failed to allocate native buffer handle\n");
		return;
	}

	allocGPUMat(gpuMat(matrix->matrix), 1, flags, matrix->native);
}
|
||||||
|
|
||||||
|
// Allocates the native buffer handle for a matrix array and asks allocGPUMat
// to allocate backing storage for `length` matrices shaped like the first
// element.
//
// If the handle cannot be allocated, a message is printed and
// matrixArray->native stays NULL. If the array has no elements yet, there is
// no shape to allocate for: the zeroed handle is kept and allocGPUMat is not
// called. The return value of allocGPUMat is not inspected here.
// NOTE(review): presumably allocGPUMat stores the cl_mem in native->mem and
// reports its own failures - confirm against its definition.
static void clm_nativeAllocMatrixArray(clm_NativeMatrixArray *matrixArray, cl_mem_flags flags) {
	matrixArray->native = calloc(1, sizeof *matrixArray->native);
	if(!matrixArray->native) {
		// Do not hand a NULL handle to allocGPUMat.
		printf("Failed to allocate native buffer handle\n");
		return;
	}

	if(matrixArray->length == 0 || !matrixArray->matrixes) {
		// matrixes[0] does not exist, so its dimensions cannot be read.
		printf("Failed to allocate native buffer: matrix array is empty\n");
		return;
	}

	allocGPUMat(gpuMat(matrixArray->matrixes[0]), matrixArray->length, flags, matrixArray->native);
}
|
||||||
|
|
||||||
void clm_linearInit(clm_Linear *linear) {
|
void clm_linearInit(clm_Linear *linear) {
|
||||||
linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
|
clm_nativeAllocMatrix(&linear->weights, CL_MEM_READ_ONLY);
|
||||||
linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
|
clm_nativeAllocMatrix(&linear->bias, CL_MEM_READ_ONLY);
|
||||||
|
clm_nativeAllocMatrixArray(&linear->output, CL_MEM_READ_WRITE);
|
||||||
|
clm_nativeAllocMatrixArray(&linear->error, CL_MEM_READ_WRITE);
|
||||||
|
clm_nativeAllocMatrixArray(&linear->weightsError, CL_MEM_READ_WRITE);
|
||||||
|
clm_nativeAllocMatrixArray(&linear->gradient, CL_MEM_READ_WRITE);
|
||||||
|
|
||||||
|
// TODO: alloc mem
|
||||||
linear->nativeInput = calloc(1, sizeof(clm_NativeBuf));
|
linear->nativeInput = calloc(1, sizeof(clm_NativeBuf));
|
||||||
linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
|
linear->nativeOutputErrors = calloc(1, sizeof(clm_NativeBuf));
|
||||||
linear->nativeInputError = calloc(1, sizeof(clm_NativeBuf));
|
|
||||||
linear->nativeOutputError = calloc(1, sizeof(clm_NativeBuf));
|
|
||||||
linear->nativeWeightsError = calloc(1, sizeof(clm_NativeBuf));
|
|
||||||
linear->nativeGradient = calloc(1, sizeof(clm_NativeBuf));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
|
// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
|
||||||
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
|
void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs) {
|
||||||
if(batchSize == 0) return;
|
if(batchSize == 0) return;
|
||||||
|
|
||||||
cl_GPUMat matInput = gpuMat(inputs[0]);
|
cl_GPUMat matInput = gpuMat(inputs[0]);
|
||||||
cl_GPUMat matWeights = gpuMat(linear->weights);
|
cl_GPUMat matWeights = gpuMat(linear->weights.matrix);
|
||||||
cl_GPUMat matBias = gpuMat(linear->bias);
|
cl_GPUMat matBias = gpuMat(linear->bias.matrix);
|
||||||
cl_GPUMat matOut = gpuMat(outputs[0]);
|
cl_GPUMat matOut = gpuMat(linear->output.matrixes[0]);
|
||||||
|
|
||||||
size_t workSize = matOut.rows * matOut.cols;
|
size_t workSize = matOut.rows * matOut.cols;
|
||||||
|
|
||||||
@ -181,18 +234,14 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
|
|||||||
|
|
||||||
// TODO: make sure to always alloc nn.batchSize, not batchSize
|
// TODO: make sure to always alloc nn.batchSize, not batchSize
|
||||||
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
|
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
|
||||||
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
|
cl_mem matWeights_values = writeNativeMatrix(linear->weights);
|
||||||
cl_mem matBias_values = writeGPUMat(matBias, linear->bias, CL_MEM_READ_ONLY, linear->nativeBias);
|
cl_mem matBias_values = writeNativeMatrix(linear->bias);
|
||||||
if(!matInput_values || !matWeights_values || !matBias_values) {
|
if(!matInput_values || !matWeights_values || !matBias_values) {
|
||||||
// linear->output = INVALID_MATRIX;
|
// linear->output = INVALID_MATRIX;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
cl_mem matOut_values = allocGPUMat(matOut, batchSize, CL_MEM_READ_WRITE, linear->nativeOutput);
|
cl_mem matOut_values = linear->output.native->mem;
|
||||||
if(!matOut_values) {
|
|
||||||
// linear->output = INVALID_MATRIX;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
err = 0;
|
err = 0;
|
||||||
err |= clSetKernelArg(kernelLinearForward, 0, sizeof(cl_uint), &batchSize);
|
err |= clSetKernelArg(kernelLinearForward, 0, sizeof(cl_uint), &batchSize);
|
||||||
@ -229,38 +278,35 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
|
|||||||
return;
|
return;
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
readGPUMats(matOut, batchSize, outputs, linear->nativeOutput);
|
readNativeMatrixArray(linear->output);
|
||||||
|
|
||||||
clFlush(queue);
|
|
||||||
clFinish(queue);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
|
void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, bool updateErrors, clm_Matrix *outputErrors) {
|
||||||
if(batchSize == 0) return;
|
if(batchSize == 0) return;
|
||||||
|
|
||||||
cl_GPUMat matInput = gpuMat(inputs[0]);
|
cl_GPUMat matInput = gpuMat(inputs[0]);
|
||||||
cl_GPUMat matWeights = gpuMat(linear->weights);
|
cl_GPUMat matWeights = gpuMat(linear->weights.matrix);
|
||||||
cl_GPUMat matOutput = gpuMat(outputs[0]);
|
cl_GPUMat matOutput = gpuMat(linear->output.matrixes[0]);
|
||||||
cl_GPUMat matInputErrors = gpuMat(inputErrors[0]);
|
cl_GPUMat matInputErrors = gpuMat(linear->error.matrixes[0]);
|
||||||
cl_GPUMat matOutputErrors = !updateErrors ? (cl_GPUMat){0} : (cl_GPUMat) gpuMat(outputErrors[0]);
|
cl_GPUMat matOutputErrors = !updateErrors ? (cl_GPUMat){0} : (cl_GPUMat) gpuMat(outputErrors[0]);
|
||||||
cl_GPUMat matOutputWeightsErrors = gpuMat(outputWeightsErrors[0]);
|
cl_GPUMat matOutputWeightsErrors = gpuMat(linear->weightsError.matrixes[0]);
|
||||||
cl_GPUMat matOutputGradients = gpuMat(outputGradients[0]);
|
cl_GPUMat matOutputGradients = gpuMat(linear->gradient.matrixes[0]);
|
||||||
|
|
||||||
cl_int err;
|
cl_int err;
|
||||||
|
|
||||||
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
|
cl_mem matInput_values = writeGPUMats(matInput, batchSize, inputs, CL_MEM_READ_ONLY, linear->nativeInput);
|
||||||
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, CL_MEM_READ_ONLY, linear->nativeWeights);
|
cl_mem matWeights_values = writeNativeMatrix(linear->weights);
|
||||||
cl_mem matInputErrors_values = writeGPUMats(matInputErrors, batchSize, inputErrors, CL_MEM_READ_ONLY, linear->nativeInputError);
|
cl_mem matInputErrors_values = writeNativeMatrixArray(linear->error);
|
||||||
cl_mem matOutput_values = writeGPUMats(matOutput, batchSize, outputs, CL_MEM_READ_WRITE, linear->nativeOutput);
|
cl_mem matOutput_values = writeNativeMatrixArray(linear->output);
|
||||||
if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) {
|
if(!matInput_values || !matWeights_values || !matInputErrors_values || !matOutput_values) {
|
||||||
printf("Failed to write GPU mats\n");
|
printf("Failed to write GPU mats\n");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// cl_mem matOutput_values = allocGPUMat(matOutput, batchSize, CL_MEM_READ_ONLY, linear->nativeOutput);
|
// cl_mem matOutput_values = allocGPUMat(matOutput, batchSize, CL_MEM_READ_ONLY, linear->nativeOutput);
|
||||||
cl_mem matOutputErrors_values = !updateErrors ? NULL : allocGPUMat(matOutputErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeOutputError);
|
cl_mem matOutputErrors_values = !updateErrors ? NULL : allocGPUMat(matOutputErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeOutputErrors);
|
||||||
cl_mem matOutputWeightsErrors_values = allocGPUMat(matOutputWeightsErrors, batchSize, CL_MEM_READ_WRITE, linear->nativeWeightsError);
|
cl_mem matOutputWeightsErrors_values = linear->weightsError.native->mem;
|
||||||
cl_mem matOutputGradients_values = allocGPUMat(matOutputGradients, batchSize, CL_MEM_READ_WRITE, linear->nativeGradient);
|
cl_mem matOutputGradients_values = linear->gradient.native->mem;
|
||||||
if(!matOutputWeightsErrors_values || !matOutputGradients_values) {
|
if(!matOutputWeightsErrors_values || !matOutputGradients_values) {
|
||||||
printf("Failed to alloc GPU mats\n");
|
printf("Failed to alloc GPU mats\n");
|
||||||
return;
|
return;
|
||||||
@ -320,10 +366,8 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
|
|||||||
clFlush(queue);
|
clFlush(queue);
|
||||||
clFinish(queue);
|
clFinish(queue);
|
||||||
|
|
||||||
readGPUMats(matOutputGradients, batchSize, outputGradients, linear->nativeGradient);
|
readNativeMatrixArray(linear->weightsError);
|
||||||
readGPUMats(matOutputWeightsErrors, batchSize, outputWeightsErrors, linear->nativeWeightsError);
|
readNativeMatrixArray(linear->gradient);
|
||||||
if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputError);
|
|
||||||
|
|
||||||
clFlush(queue);
|
if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputErrors);
|
||||||
clFinish(queue);
|
|
||||||
}
|
}
|
||||||
|
23
src/cltest.c
23
src/cltest.c
@ -24,8 +24,8 @@ float *predict(clm_NN nn, clm_Vector input) {
|
|||||||
clm_Matrix xM = clm_matrixWrapArray(input.values, input.length);
|
clm_Matrix xM = clm_matrixWrapArray(input.values, input.length);
|
||||||
|
|
||||||
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
||||||
clm_linearForward(&nn.layers[i], 1, &xM, &nn.layers[i].output[0]);
|
clm_linearForward(&nn.layers[i], 1, &xM);
|
||||||
xM = nn.layers[i].output[0];
|
xM = nn.layers[i].output.matrixes[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
return xM.values;
|
return xM.values;
|
||||||
@ -84,28 +84,29 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Forward pass
|
// Forward pass
|
||||||
|
printf("Forward pass\n");
|
||||||
clm_Matrix *currentXs = batchInputs;
|
clm_Matrix *currentXs = batchInputs;
|
||||||
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
||||||
clm_linearForward(&nn.layers[i], batchSize, currentXs, nn.layers[i].output);
|
clm_linearForward(&nn.layers[i], batchSize, currentXs);
|
||||||
currentXs = nn.layers[i].output;
|
currentXs = nn.layers[i].output.matrixes;
|
||||||
}
|
}
|
||||||
|
|
||||||
clm_Linear *lastLayer = &nn.layers[nn.numLayers - 1];
|
clm_Linear *lastLayer = &nn.layers[nn.numLayers - 1];
|
||||||
for(unsigned int b = 0; b < batchSize; b++) {
|
for(unsigned int b = 0; b < batchSize; b++) {
|
||||||
// Error of last layer = y - yhat
|
// Error of last layer = y - yhat
|
||||||
clm_matrixCopy(batchOutputs[b], lastLayer->error[b]); // lastLayer.error = y
|
clm_matrixCopy(batchOutputs[b], lastLayer->error.matrixes[b]); // lastLayer.error = y
|
||||||
clm_matrixSubtractMatrix(lastLayer->error[b], lastLayer->output[b]); // lastLayer.error -= yhat
|
clm_matrixSubtractMatrix(lastLayer->error.matrixes[b], lastLayer->output.matrixes[b]); // lastLayer.error -= yhat
|
||||||
}
|
}
|
||||||
|
|
||||||
|
printf("Backprop\n");
|
||||||
for(int i = nn.numLayers - 1; i >= 0; i--) {
|
for(int i = nn.numLayers - 1; i >= 0; i--) {
|
||||||
clm_Linear *layer = &nn.layers[i];
|
clm_Linear *layer = &nn.layers[i];
|
||||||
clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
|
clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output.matrixes;
|
||||||
clm_Matrix *outputsOfThisLayer = layer->output;
|
clm_linearBackprop(layer, nn.learnRate, batchSize, inputsToThisLayer, i > 0, i == 0 ? NULL : nn.layers[i - 1].error.matrixes);
|
||||||
clm_linearBackprop(layer, nn.learnRate, batchSize, inputsToThisLayer, outputsOfThisLayer, layer->error, i > 0, i == 0 ? NULL : nn.layers[i - 1].error, layer->weightsError, layer->gradient);
|
|
||||||
|
|
||||||
for(unsigned int b = 0; b < batchSize; b++) {
|
for(unsigned int b = 0; b < batchSize; b++) {
|
||||||
clm_matrixAddMatrix(layer->weights, layer->weightsError[b]);
|
clm_matrixAddMatrix(layer->weights.matrix, layer->weightsError.matrixes[b]);
|
||||||
clm_matrixAddMatrix(layer->bias, layer->gradient[b]);
|
clm_matrixAddMatrix(layer->bias.matrix, layer->gradient.matrixes[b]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user