Don't dynamically allocate memory for training

This commit is contained in:
MrLetsplay 2023-10-29 01:15:22 +02:00
parent 9c8e023420
commit 544bafdd5e
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
8 changed files with 304 additions and 114 deletions

29
.clang-format Normal file
View File

@ -0,0 +1,29 @@
Language: Cpp
BasedOnStyle: LLVM
IndentWidth: 4
UseTab: Always
TabWidth: 4
ColumnLimit: 0
IndentCaseLabels: true
AllowShortIfStatementsOnASingleLine: true
FixNamespaceComments: false
SpaceBeforeParens: Never
SpaceAfterCStyleCast: true
SeparateDefinitionBlocks: Always
PackConstructorInitializers: Never
IncludeBlocks: Preserve
SpaceBeforeInheritanceColon: false
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignTrailingComments: false
AlignOperands: false
AlignEscapedNewlines: false
AlignConsecutiveMacros: false
AllowShortCaseLabelsOnASingleLine: false
SpaceBeforeCtorInitializerColon: false
SpaceBeforeAssignmentOperators: true
AllowShortLoopsOnASingleLine: true
AlignAfterOpenBracket: DontAlign
LambdaBodyIndentation: Signature
LineEnding: LF
ContinuationIndentWidth: 4

View File

@ -8,6 +8,7 @@
"string": "c", "string": "c",
"string_view": "c", "string_view": "c",
"stdint.h": "c", "stdint.h": "c",
"inttypes.h": "c" "inttypes.h": "c",
"cl.h": "c"
} }
} }

View File

@ -10,3 +10,12 @@ all:
.PHONY: run .PHONY: run
run: all run: all
$(BUILD)/cltest $(BUILD)/cltest
# Build the standalone OpenCL test program from $(SRC)/cl.c.
# Libraries are listed after the source file: the linker resolves symbols in
# command-line order, so -l flags placed before the file that needs them can
# be discarded and leave undefined references.
.PHONY: cl
cl:
	mkdir -p $(BUILD)
	gcc $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c -lOpenCL -lm

# Build, then run, the standalone OpenCL test program.
.PHONY: cl_run
cl_run: cl
	$(BUILD)/cl

128
src/cl.c Normal file
View File

@ -0,0 +1,128 @@
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Packed struct that currently has no members.
// NOTE(review): src/mat.cl declares a type with the same name, so this is
// presumably a placeholder for data shared with the kernel — confirm.
typedef struct __attribute__ ((packed)) {
} cl_GPUMat;
/**
 * Reads the whole file at `path` into a freshly allocated, NUL-terminated buffer.
 *
 * @param path Path of the file to read.
 * @return Heap buffer holding the file's bytes followed by a '\0'. The caller
 *         owns the buffer and must release it with free(). Returns NULL if the
 *         file cannot be opened, its size cannot be determined, the allocation
 *         fails, or the contents cannot be read completely.
 */
char *loadFile(const char *path) {
	// Binary mode so that the ftell() offset is an exact byte count.
	FILE *file = fopen(path, "rb");
	if(!file) return NULL;

	char *buffer = NULL;
	long length = 0;

	if(fseek(file, 0, SEEK_END) != 0) goto done;
	length = ftell(file);
	if(length < 0) goto done; // ftell reports failure as -1
	if(fseek(file, 0, SEEK_SET) != 0) goto done;

	// One extra zeroed byte acts as the string terminator.
	buffer = calloc((size_t) length + 1, 1);
	if(!buffer) goto done;

	if(fread(buffer, 1, (size_t) length, file) != (size_t) length) {
		free(buffer);
		buffer = NULL;
	}

done:
	fclose(file);
	return buffer;
}
int main() {
// Connect to a compute device
int useGPU = true;
cl_device_id deviceID;
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
if (err != CL_SUCCESS) {
printf("Error: Failed to create a device group!\n");
return 1;
}
char *buffer = loadFile("src/mat.cl");
printf("%s", buffer);
cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
if(!context) {
printf("Failed to create context\n");
return 1;
}
cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
if(!queue) {
printf("Failed to create command queue\n");
return 1;
}
size_t length = strlen(buffer);
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
if(!program) {
printf("Failed to create program\n");
return 1;
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to build program\n");
// clGetProgramBuildInfo...
return 1;
}
cl_kernel kernel = clCreateKernel(program, "do_stuff", &err);
if(!kernel) {
printf("Failed to create kernel\n");
return 1;
}
unsigned int inputSize = 256000000;
float *inputData = calloc(inputSize, sizeof(float));
for(unsigned int i = 0; i < inputSize; i++) {
inputData[i] = i;
}
cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err);
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err);
if(!input || !output) {
printf("Failed to allocate input/output buffer\n");
return 1;
}
err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to write to buffer\n");
return 1;
}
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(input), &input);
err |= clSetKernelArg(kernel, 1, sizeof(output), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
if(err != CL_SUCCESS) {
printf("Failed to set kernel args\n");
return 1;
}
/*char *info = calloc(1, 1024);
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
printf("INFO: %s\n", info);*/
size_t local;
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if(err != CL_SUCCESS) {
printf("Failed to get work group size\n");
return 1;
}
printf("Group size is %zu\n", local);
size_t global = inputSize;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue\n");
return 1;
}
clFinish(queue);
float *outputData = calloc(inputSize, sizeof(float));
err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to read from buffer\n");
return 1;
}
for(unsigned int i = 0; i < inputSize; i++) {
if(i % 1000 != 0) continue;
printf("%f: %f\n", inputData[i], outputData[i]);
}
}

View File

@ -1,18 +1,20 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "clm.h" #include "clm.h"
const clm_Matrix INVALID_MATRIX = { .rows = 0, .cols = 0, .values = NULL }; const clm_Matrix INVALID_MATRIX = {.rows = 0, .cols = 0, .values = NULL};
const clm_Vector INVALID_VECTOR = { .length = 0, .values = NULL }; const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) { clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
printf("CREATING MATRIX\n");
clm_Matrix mat; clm_Matrix mat;
mat.rows = rows; mat.rows = rows;
mat.cols = cols; mat.cols = cols;
mat.values = calloc(rows * cols, sizeof(float)); mat.values = calloc(rows * cols, sizeof(float));
mat.transposed = false;
return mat; return mat;
} }
@ -28,6 +30,14 @@ clm_Matrix clm_createMatrixRandom(unsigned int rows, unsigned int cols) {
return mat; return mat;
} }
// Sets each of the rows * cols elements of `mat` to 0 in place.
// Returns `mat`; the struct is passed by value, so the returned copy refers
// to the same `values` storage that was just cleared.
clm_Matrix clm_matrixZero(clm_Matrix mat) {
	const unsigned int count = mat.rows * mat.cols;
	unsigned int idx = 0;
	while(idx < count) {
		mat.values[idx] = 0;
		idx++;
	}

	return mat;
}
void clm_freeMatrix(clm_Matrix mat) { void clm_freeMatrix(clm_Matrix mat) {
free(mat.values); free(mat.values);
} }
@ -50,7 +60,7 @@ clm_Matrix clm_matrixAddScalar(clm_Matrix mat, float scalar) {
clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) { clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) {
if(mat.cols != other.cols || mat.rows != other.rows) { if(mat.cols != other.cols || mat.rows != other.rows) {
printf("Failed to add matrices\n"); printf("Failed to add matrices (got %dx%d and %dx%d)\n", mat.cols, mat.rows, other.cols, other.rows);
return INVALID_MATRIX; return INVALID_MATRIX;
} }
@ -58,14 +68,12 @@ clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) {
mat.values[i] += other.values[i]; mat.values[i] += other.values[i];
} }
//clm_freeMatrix(other);
return mat; return mat;
} }
clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) { clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
if(mat.cols != other.cols || mat.rows != other.rows) { if(mat.cols != other.cols || mat.rows != other.rows) {
printf("Failed to add matrices\n"); printf("Failed to sub matrices\n");
return INVALID_MATRIX; return INVALID_MATRIX;
} }
@ -73,45 +81,46 @@ clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
mat.values[i] -= other.values[i]; mat.values[i] -= other.values[i];
} }
//clm_freeMatrix(other);
return mat; return mat;
} }
clm_Matrix clm_matrixTranspose(clm_Matrix mat) { clm_Matrix clm_matrixTranspose(clm_Matrix mat) {
clm_Matrix tr = clm_createMatrix(mat.cols, mat.rows); clm_Matrix tr;
for(unsigned int i = 0; i < tr.rows; i++) { tr.cols = mat.rows;
for(unsigned int j = 0; j < tr.cols; j++) { tr.rows = mat.cols;
tr.values[i * tr.cols + j] = mat.values[j * mat.cols + i]; tr.values = mat.values;
} tr.transposed = !mat.transposed;
}
//clm_freeMatrix(mat);
return tr; return tr;
} }
clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b) { clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out) {
if(a.cols != b.rows) { if(a.cols != b.rows) {
printf("Cannot multiply matrices\n"); printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
return INVALID_MATRIX; return INVALID_MATRIX;
} }
clm_Matrix r = clm_createMatrix(a.rows, b.cols); if(out.rows != a.rows || out.cols != b.cols) {
for(unsigned int i = 0; i < r.rows; i++) { printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
for(unsigned int j = 0; j < r.cols; j++) { return INVALID_MATRIX;
}
for(unsigned int i = 0; i < out.rows; i++) {
for(unsigned int j = 0; j < out.cols; j++) {
float sum = 0; float sum = 0;
for(unsigned int k = 0; k < a.cols; k++) { for(unsigned int k = 0; k < a.cols; k++) {
sum += a.values[i * a.cols + k] * b.values[k * b.cols + j]; sum += a.values[i * a.cols + k] * b.values[k * b.cols + j];
} }
r.values[i * r.cols + j] = sum; out.values[i * out.cols + j] = sum;
} }
} }
//clm_freeMatrix(a); return out;
//clm_freeMatrix(b); }
return r; clm_Matrix clm_matrixMultiplyMatrixALLOC(clm_Matrix a, clm_Matrix b) {
clm_Matrix out = clm_createMatrix(a.rows, b.cols);
clm_matrixMultiplyMatrix(a, b, out);
return out;
} }
clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other) { clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other) {
@ -138,7 +147,7 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) {
clm_Matrix clm_matrixSigmoid(clm_Matrix mat) { clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
for(unsigned int i = 0; i < mat.rows; i++) { for(unsigned int i = 0; i < mat.rows; i++) {
for(unsigned int j = 0; j < mat.cols; j++) { for(unsigned int j = 0; j < mat.cols; j++) {
mat.values[i * mat.cols + j] = 1 / (1 + exp(-mat.values[i * mat.cols + j])); matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j)));
} }
} }
@ -148,8 +157,8 @@ clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
clm_Matrix clm_matrixDSigmoid(clm_Matrix mat) { clm_Matrix clm_matrixDSigmoid(clm_Matrix mat) {
for(unsigned int i = 0; i < mat.rows; i++) { for(unsigned int i = 0; i < mat.rows; i++) {
for(unsigned int j = 0; j < mat.cols; j++) { for(unsigned int j = 0; j < mat.cols; j++) {
float v = mat.values[i * mat.cols + j]; float v = matrixAt(mat, i, j);
mat.values[i * mat.cols + j] = v * (1 - v); matrixAt(mat, i, j) = v * (1 - v);
} }
} }
@ -162,6 +171,14 @@ clm_Matrix clm_matrixFromArray(float *array, unsigned int length) {
return matrix; return matrix;
} }
clm_Matrix clm_matrixWrapArray(float *array, unsigned int length) {
clm_Matrix mat;
mat.rows = length;
mat.cols = 1;
mat.values = array;
return mat;
}
bool clm_matrixIsInvalid(clm_Matrix mat) { bool clm_matrixIsInvalid(clm_Matrix mat) {
return mat.values == NULL; return mat.values == NULL;
} }
@ -181,6 +198,10 @@ clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
clm_Linear linear; clm_Linear linear;
linear.weights = clm_createMatrixRandom(outputs, inputs); linear.weights = clm_createMatrixRandom(outputs, inputs);
linear.bias = clm_createMatrixRandom(outputs, 1); linear.bias = clm_createMatrixRandom(outputs, 1);
linear.output = clm_createMatrix(outputs, 1);
linear.error = clm_createMatrix(outputs, 1);
linear.weightsError = clm_createMatrix(outputs, inputs);
return linear; return linear;
} }
@ -197,7 +218,7 @@ void clm_matrixPrint(clm_Matrix mat) {
printf("[\n"); printf("[\n");
for(unsigned int i = 0; i < mat.rows; i++) { for(unsigned int i = 0; i < mat.rows; i++) {
for(unsigned int j = 0; j < mat.cols; j++) { for(unsigned int j = 0; j < mat.cols; j++) {
printf("%7.3f", mat.values[i * mat.cols + j]); printf("%7.3f", matrixAt(mat, i, j));
} }
printf("\n"); printf("\n");
} }

View File

@ -3,10 +3,13 @@
#include <stdbool.h> #include <stdbool.h>
// Element (r, c) of `mat` as an lvalue, honouring the `transposed` flag:
// a non-transposed matrix is indexed as r * cols + c, a transposed one as
// c * rows + r. Every argument and the whole expansion are parenthesized so
// expressions such as matrixAt(m, i + 1, j) index the intended element.
// `mat`, `r` and `c` may be evaluated more than once, so pass expressions
// without side effects.
#define matrixAt(mat, r, c) ((mat).values[(!(mat).transposed ? (r) * (mat).cols + (c) : (c) * (mat).rows + (r))])
typedef struct { typedef struct {
float *values; float *values;
unsigned int rows; unsigned int rows;
unsigned int cols; unsigned int cols;
bool transposed;
} clm_Matrix; } clm_Matrix;
typedef struct { typedef struct {
@ -17,6 +20,9 @@ typedef struct {
typedef struct { typedef struct {
clm_Matrix weights; clm_Matrix weights;
clm_Matrix bias; clm_Matrix bias;
clm_Matrix output;
clm_Matrix error;
clm_Matrix weightsError;
} clm_Linear; } clm_Linear;
typedef struct { typedef struct {
@ -29,17 +35,20 @@ extern const clm_Matrix INVALID_MATRIX;
extern const clm_Vector INVALID_VECTOR; extern const clm_Vector INVALID_VECTOR;
clm_Matrix clm_createMatrixRandom(unsigned int rows, unsigned int cols); clm_Matrix clm_createMatrixRandom(unsigned int rows, unsigned int cols);
clm_Matrix clm_matrixZero(clm_Matrix mat);
clm_Matrix clm_matrixAddScalar(clm_Matrix mat, float scalar); clm_Matrix clm_matrixAddScalar(clm_Matrix mat, float scalar);
clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other); clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other);
clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other); clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other);
clm_Matrix clm_matrixTranspose(clm_Matrix mat); clm_Matrix clm_matrixTranspose(clm_Matrix mat);
clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b); clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out);
clm_Matrix clm_matrixMultiplyMatrixALLOC(clm_Matrix a, clm_Matrix b);
clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other); clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other);
clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar); clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar);
clm_Matrix clm_matrixSigmoid(clm_Matrix mat); clm_Matrix clm_matrixSigmoid(clm_Matrix mat);
clm_Matrix clm_matrixDSigmoid(clm_Matrix mat); clm_Matrix clm_matrixDSigmoid(clm_Matrix mat);
clm_Matrix clm_matrixFromArray(float *array, unsigned int length); clm_Matrix clm_matrixFromArray(float *array, unsigned int length);
clm_Matrix clm_matrixWrapArray(float *array, unsigned int length);
clm_Matrix clm_matrixCopy(clm_Matrix matrix); clm_Matrix clm_matrixCopy(clm_Matrix matrix);
bool clm_matrixIsInvalid(clm_Matrix mat); bool clm_matrixIsInvalid(clm_Matrix mat);

View File

@ -1,6 +1,7 @@
#include <dlfcn.h>
#include <inttypes.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <inttypes.h>
#include "clm.h" #include "clm.h"
@ -8,22 +9,20 @@ float train_data_x[4][2] = {
{0, 0}, {0, 0},
{0, 1}, {0, 1},
{1, 0}, {1, 0},
{1, 1} {1, 1}};
};
float train_data_y[4][1] = { float train_data_y[4][1] = {
{0}, {0},
{1}, {1},
{1}, {1},
{0} {0}};
};
float *predict(clm_NN nn, float *x, unsigned int length) { float *predict(clm_NN nn, float *x, unsigned int length) {
clm_Matrix xM = clm_matrixFromArray(x, length); clm_Matrix xM = clm_matrixWrapArray(x, length);
for(unsigned int i = 0; i < nn.numLayers; i++) { for(unsigned int i = 0; i < nn.numLayers; i++) {
clm_Linear layer = nn.layers[i]; clm_Linear layer = nn.layers[i];
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM); clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
if(clm_matrixIsInvalid(newX)) { if(clm_matrixIsInvalid(newX)) {
printf("Failed to predict\n"); printf("Failed to predict\n");
@ -32,7 +31,6 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
clm_matrixAddMatrix(newX, layer.bias); clm_matrixAddMatrix(newX, layer.bias);
clm_matrixSigmoid(newX); clm_matrixSigmoid(newX);
clm_freeMatrix(xM);
xM = newX; xM = newX;
} }
@ -40,17 +38,16 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
} }
void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) { void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
clm_Matrix xM = clm_matrixFromArray(x, xL); clm_Matrix xM = clm_matrixWrapArray(x, xL);
clm_Matrix yM = clm_matrixFromArray(y, yL); clm_Matrix yM = clm_matrixWrapArray(y, yL);
// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached) // TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
// Forward pass // Forward pass
clm_Matrix *outputs = calloc(nn.numLayers + 1 /* 1 for input */, sizeof(clm_Matrix)); clm_Matrix currentX = xM;
outputs[0] = xM;
for(unsigned int i = 0; i < nn.numLayers; i++) { for(unsigned int i = 0; i < nn.numLayers; i++) {
clm_Linear layer = nn.layers[i]; clm_Linear layer = nn.layers[i];
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM); clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
if(clm_matrixIsInvalid(newX)) { if(clm_matrixIsInvalid(newX)) {
printf("Forward pass failed\n"); printf("Forward pass failed\n");
return; return;
@ -58,59 +55,34 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
clm_matrixAddMatrix(newX, layer.bias); clm_matrixAddMatrix(newX, layer.bias);
clm_matrixSigmoid(newX); clm_matrixSigmoid(newX);
xM = newX; currentX = newX;
outputs[i + 1] = xM;
} }
clm_Matrix dError = clm_matrixSubtractMatrix(yM, outputs[nn.numLayers]); // yhat - y for(int i = nn.numLayers - 1; i >= 0; i--) {
clm_Matrix lastGradient = clm_matrixDSigmoid(clm_matrixCopy(outputs[nn.numLayers])); // dsig(yhat)
clm_matrixMultiplyMatrixElements(lastGradient, dError); // (yhat - y) . dsig(yhat)
clm_matrixMultiplyScalar(lastGradient, nn.learnRate);
clm_Matrix lastInputT = clm_matrixTranspose(outputs[nn.numLayers - 1]);
clm_Matrix lastDeltaW = clm_matrixMultiplyMatrix(lastGradient, lastInputT);
clm_freeMatrix(lastInputT);
clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].weights, lastDeltaW);
clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].bias, lastGradient);
clm_freeMatrix(lastDeltaW);
clm_freeMatrix(lastGradient);
for(int i = nn.numLayers - 2; i >= 0; i--) {
clm_Linear layer = nn.layers[i]; clm_Linear layer = nn.layers[i];
clm_Matrix inputToThisLayer = outputs[i]; clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
clm_Matrix outputOfThisLayer = outputs[i + 1]; clm_Matrix outputOfThisLayer = nn.layers[i].output;
clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
clm_Matrix error = layer.error;
clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights); if(i == nn.numLayers - 1) {
clm_Matrix newDError = clm_matrixMultiplyMatrix(weightsT, dError); clm_matrixZero(error); // Zero the error matrix
clm_freeMatrix(weightsT); clm_matrixSubtractMatrix(clm_matrixAddMatrix(error, yM), outputOfThisLayer); // yhat - y
clm_freeMatrix(dError); } else {
dError = newDError; clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
clm_matrixMultiplyMatrix(weightsT, prevError, error);
}
clm_Matrix gradient = clm_matrixDSigmoid(clm_matrixCopy(outputOfThisLayer)); clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
clm_matrixMultiplyMatrixElements(gradient, dError); clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
clm_matrixMultiplyScalar(gradient, nn.learnRate); clm_matrixMultiplyScalar(gradient, nn.learnRate);
clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer); clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
clm_Matrix deltaW = clm_matrixMultiplyMatrix(gradient, inputT); clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
clm_freeMatrix(inputT);
clm_matrixAddMatrix(layer.weights, deltaW); clm_matrixAddMatrix(layer.weights, layer.weightsError);
clm_matrixAddMatrix(layer.bias, gradient); clm_matrixAddMatrix(layer.bias, gradient);
clm_freeMatrix(deltaW);
clm_freeMatrix(gradient);
} }
clm_freeMatrix(dError);
for(unsigned int i = 0; i <= nn.numLayers; i++) {
clm_freeMatrix(outputs[i]);
}
free(outputs);
} }
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) { void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
@ -128,7 +100,8 @@ void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
unsigned char lengthBytes[4]; unsigned char lengthBytes[4];
fread(lengthBytes, sizeof(lengthBytes), 1, file); fread(lengthBytes, sizeof(lengthBytes), 1, file);
uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3]; uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
(lengthBytes[2] << 8) | lengthBytes[3];
printf("%" PRId32 "\n", length); printf("%" PRId32 "\n", length);
clm_Vector *vectors = calloc(length, sizeof(clm_Vector)); clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
@ -190,7 +163,13 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
*imageCountOut = length; *imageCountOut = length;
} }
// Function-pointer type with the same parameters and return type as calloc.
typedef void *(*callocFunc)(size_t, size_t);
// Target of the calloc wrapper defined in this file. main assigns it the
// result of dlsym(RTLD_NEXT, "calloc"); until then it is NULL.
callocFunc oldCalloc;
int main() { int main() {
oldCalloc = dlsym(RTLD_NEXT, "calloc");
clm_Vector *labels = NULL; clm_Vector *labels = NULL;
unsigned int labelCount; unsigned int labelCount;
loadLabels(&labels, &labelCount); loadLabels(&labels, &labelCount);
@ -211,50 +190,48 @@ int main() {
h = 30, h = 30,
o = 10; o = 10;
clm_Linear layer1; clm_Linear layer1 = clm_linearCreateRandom(i, h);
layer1.weights = clm_createMatrixRandom(h, i); clm_Linear layer2 = clm_linearCreateRandom(h, o);
layer1.bias = clm_createMatrixRandom(h, 1);
clm_Linear layer2;
layer2.weights = clm_createMatrixRandom(o, h);
layer2.bias = clm_createMatrixRandom(o, 1);
clm_Linear layers[] = {layer1, layer2}; clm_Linear layers[] = {layer1, layer2};
clm_NN nn = { layers, sizeof(layers) / sizeof(clm_Linear), 0.01 }; clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
for(unsigned int epoch = 0; epoch < 10; epoch++) { for(unsigned int epoch = 0; epoch < 10; epoch++) {
printf("Epoch %u\n", epoch); printf("Epoch %u\n", epoch);
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
if(idx % 1000 == 0) { if(idx % 1000 == 0) {
printf("%.2f%%\n", idx / (float) imageCount * 100); printf("\r%.2f%%", idx / (float) imageCount * 100);
fflush(stdout);
} }
//printf("%u\n", idx); // printf("%u\n", idx);
//train(nn, train_data_x[idx], 2, train_data_y[idx], 1); // train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
/*for(unsigned int f = 0; f < images[idx].length; f++) { /*for(unsigned int f = 0; f < images[idx].length; f++) {
printf("%.2f ", images[idx].values[f]); printf("%.2f ", images[idx].values[f]);
} }
printf("\n"); printf("\n");
for(unsigned int f = 0; f < labels[idx].length; f++) { for(unsigned int f = 0; f < labels[idx].length; f++) {
printf("%.2f ", labels[idx].values[f]); printf("%.2f ", labels[idx].values[f]);
} }
printf("\n");*/ printf("\n");*/
//printf("%.2f\n", labels.values[idx]); // printf("%.2f\n", labels.values[idx]);
train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length); train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
//train(nn, test, 784, target, 10); // train(nn, test, 784, target, 10);
//predict(nn, test, 784); // predict(nn, test, 784);
} }
printf("\n");
} }
printf("Train done\n");
unsigned int correct = 0; unsigned int correct = 0;
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
//printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0], train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]); // printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
float *pred = predict(nn, images[idx].values, images[idx].length); float *pred = predict(nn, images[idx].values, images[idx].length);
unsigned int predDigit = 0; unsigned int predDigit = 0;
float max = -1; float max = -1;
for(unsigned int j = 0; j < 10; j++) { for(unsigned int j = 0; j < 10; j++) {
//printf("%.2f ", pred[j]); // printf("%.2f ", pred[j]);
if(pred[j] > max || max < 0) { if(pred[j] > max || max < 0) {
max = pred[j]; max = pred[j];
predDigit = j; predDigit = j;
@ -265,14 +242,14 @@ int main() {
unsigned int actDigit = 0; unsigned int actDigit = 0;
float maxA = -1; float maxA = -1;
for(unsigned int j = 0; j < 10; j++) { for(unsigned int j = 0; j < 10; j++) {
//printf("%.2f ", pred[j]); // printf("%.2f ", pred[j]);
if(labels[idx].values[j] > maxA || maxA < 0) { if(labels[idx].values[j] > maxA || maxA < 0) {
maxA = labels[idx].values[j]; maxA = labels[idx].values[j];
actDigit = j; actDigit = j;
} }
} }
if(idx < 100) printf("Actual: %u\n", actDigit); if(idx < 100) printf("Actual: %u\n", actDigit);
//printf("\n"); // printf("\n");
if(predDigit == actDigit) correct++; if(predDigit == actDigit) correct++;
} }
@ -281,3 +258,8 @@ int main() {
printf("\n"); printf("\n");
} }
// Replacement for calloc that forwards every request to `oldCalloc`.
// Insert tracing here (e.g. a printf) to observe allocations.
// NOTE(review): `oldCalloc` is only assigned inside main; a calloc call made
// before that assignment would go through a NULL function pointer — confirm
// nothing allocates that early.
void *calloc(size_t nmemb, size_t size) {
	void *block = oldCalloc(nmemb, size);
	return block;
}

11
src/mat.cl Normal file
View File

@ -0,0 +1,11 @@
// Packed struct that currently has no members.
// NOTE(review): src/cl.c declares a type with the same name, so this is
// presumably a placeholder for data shared with the host — confirm.
typedef struct __attribute__ ((packed)) {
} cl_GPUMat;
// Helper that prints a fixed greeting via printf; takes no arguments and
// returns nothing.
void amogus(void) {
	printf("HEllo world!");
}
// Kernel entry point. Despite its name it does no arithmetic yet: the three
// buffer arguments are unused and each work-item only calls amogus().
__kernel void mat_multiply(
	__global float *nnWeights,
	__global float *inputs,
	__global float *c) {
	amogus();
}