Don't dynamically allocate memory for training
parent 9c8e023420
commit 544bafdd5e
.clang-format (new file, 29 lines)
@@ -0,0 +1,29 @@
+Language: Cpp
+BasedOnStyle: LLVM
+IndentWidth: 4
+UseTab: Always
+TabWidth: 4
+ColumnLimit: 0
+IndentCaseLabels: true
+AllowShortIfStatementsOnASingleLine: true
+FixNamespaceComments: false
+SpaceBeforeParens: Never
+SpaceAfterCStyleCast: true
+SeparateDefinitionBlocks: Always
+PackConstructorInitializers: Never
+IncludeBlocks: Preserve
+SpaceBeforeInheritanceColon: false
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+AlignTrailingComments: false
+AlignOperands: false
+AlignEscapedNewlines: false
+AlignConsecutiveMacros: false
+AllowShortCaseLabelsOnASingleLine: false
+SpaceBeforeCtorInitializerColon: false
+SpaceBeforeAssignmentOperators: true
+AllowShortLoopsOnASingleLine: true
+AlignAfterOpenBracket: DontAlign
+LambdaBodyIndentation: Signature
+LineEnding: LF
+ContinuationIndentWidth: 4
.vscode/settings.json (vendored, 3 changes)
@@ -8,6 +8,7 @@
         "string": "c",
         "string_view": "c",
         "stdint.h": "c",
-        "inttypes.h": "c"
+        "inttypes.h": "c",
+        "cl.h": "c"
     }
 }
Makefile (9 changes)
@@ -10,3 +10,12 @@ all:
 .PHONY: run
 run: all
 	$(BUILD)/cltest
+
+.PHONY: cl
+cl:
+	mkdir -p $(BUILD)
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c
+
+.PHONY: cl_run
+cl_run: cl
+	$(BUILD)/cl
src/cl.c (new file, 128 lines)
@@ -0,0 +1,128 @@
+#define CL_TARGET_OPENCL_VERSION 300
+
+#include <CL/cl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+
+typedef struct __attribute__ ((packed)) {
+
+} cl_GPUMat;
+
+char *loadFile(const char *path) {
+    FILE *file = fopen(path, "r");
+    fseek(file, 0, SEEK_END);
+    size_t length = ftell(file);
+    fseek(file, 0, SEEK_SET);
+    char *buffer = calloc(1, length + 1);
+    fread(buffer, length, 1, file);
+    return buffer;
+}
+
+int main() {
+    // Connect to a compute device
+    int useGPU = true;
+    cl_device_id deviceID;
+    cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
+    if (err != CL_SUCCESS) {
+        printf("Error: Failed to create a device group!\n");
+        return 1;
+    }
+
+    char *buffer = loadFile("src/mat.cl");
+    printf("%s", buffer);
+
+    cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
+    if(!context) {
+        printf("Failed to create context\n");
+        return 1;
+    }
+
+    cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
+    if(!queue) {
+        printf("Failed to create command queue\n");
+        return 1;
+    }
+
+    size_t length = strlen(buffer);
+    cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
+    if(!program) {
+        printf("Failed to create program\n");
+        return 1;
+    }
+
+    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+    if(err != CL_SUCCESS) {
+        printf("Failed to build program\n");
+        // clGetProgramBuildInfo...
+        return 1;
+    }
+
+    cl_kernel kernel = clCreateKernel(program, "do_stuff", &err);
+    if(!kernel) {
+        printf("Failed to create kernel\n");
+        return 1;
+    }
+
+    unsigned int inputSize = 256000000;
+    float *inputData = calloc(inputSize, sizeof(float));
+    for(unsigned int i = 0; i < inputSize; i++) {
+        inputData[i] = i;
+    }
+
+    cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err);
+    cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err);
+    if(!input || !output) {
+        printf("Failed to allocate input/output buffer\n");
+        return 1;
+    }
+
+    err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL);
+    if(err != CL_SUCCESS) {
+        printf("Failed to write to buffer\n");
+        return 1;
+    }
+
+    err = 0;
+    err = clSetKernelArg(kernel, 0, sizeof(input), &input);
+    err |= clSetKernelArg(kernel, 1, sizeof(output), &output);
+    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
+    if(err != CL_SUCCESS) {
+        printf("Failed to set kernel args\n");
+        return 1;
+    }
+
+    /*char *info = calloc(1, 1024);
+    clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
+    printf("INFO: %s\n", info);*/
+
+    size_t local;
+    err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
+    if(err != CL_SUCCESS) {
+        printf("Failed to get work group size\n");
+        return 1;
+    }
+
+    printf("Group size is %zu\n", local);
+
+    size_t global = inputSize;
+    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+    if(err != CL_SUCCESS) {
+        printf("Failed to enqueue\n");
+        return 1;
+    }
+
+    clFinish(queue);
+
+    float *outputData = calloc(inputSize, sizeof(float));
+    err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL);
+    if(err != CL_SUCCESS) {
+        printf("Failed to read from buffer\n");
+        return 1;
+    }
+
+    for(unsigned int i = 0; i < inputSize; i++) {
+        if(i % 1000 != 0) continue;
+        printf("%f: %f\n", inputData[i], outputData[i]);
+    }
+}
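The host program above leaves a clGetProgramBuildInfo placeholder where the compile log of src/mat.cl would be reported, and as written it calls calloc and free without including <stdlib.h>. A minimal sketch of fetching the build log after a failed clBuildProgram, using only standard OpenCL API calls; the helper name and variables are illustrative, not part of the commit:

/* Sketch only: dump the OpenCL build log after clBuildProgram fails.
 * Names here are illustrative; only the API calls are standard OpenCL. */
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>

static void printBuildLog(cl_program program, cl_device_id device) {
    size_t logSize = 0;
    // First query the log size, then fetch the log itself.
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char *log = malloc(logSize + 1);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    log[logSize] = '\0';
    printf("Build log:\n%s\n", log);
    free(log);
}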
src/clm.c (87 changes)
@@ -1,18 +1,20 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "clm.h"
 
-const clm_Matrix INVALID_MATRIX = { .rows = 0, .cols = 0, .values = NULL };
-const clm_Vector INVALID_VECTOR = { .length = 0, .values = NULL };
+const clm_Matrix INVALID_MATRIX = {.rows = 0, .cols = 0, .values = NULL};
+const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
 
 clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
+    printf("CREATING MATRIX\n");
     clm_Matrix mat;
     mat.rows = rows;
     mat.cols = cols;
     mat.values = calloc(rows * cols, sizeof(float));
+    mat.transposed = false;
     return mat;
 }
@@ -28,6 +30,14 @@ clm_Matrix clm_createMatrixRandom(unsigned int rows, unsigned int cols) {
     return mat;
 }
 
+clm_Matrix clm_matrixZero(clm_Matrix mat) {
+    for(unsigned int i = 0; i < mat.rows * mat.cols; i++) {
+        mat.values[i] = 0;
+    }
+
+    return mat;
+}
+
 void clm_freeMatrix(clm_Matrix mat) {
     free(mat.values);
 }
@@ -50,7 +60,7 @@ clm_Matrix clm_matrixAddScalar(clm_Matrix mat, float scalar) {
 
 clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) {
     if(mat.cols != other.cols || mat.rows != other.rows) {
-        printf("Failed to add matrices\n");
+        printf("Failed to add matrices (got %dx%d and %dx%d)\n", mat.cols, mat.rows, other.cols, other.rows);
         return INVALID_MATRIX;
     }
 
@@ -58,14 +68,12 @@ clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) {
         mat.values[i] += other.values[i];
     }
 
-    //clm_freeMatrix(other);
-
     return mat;
 }
 
 clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
     if(mat.cols != other.cols || mat.rows != other.rows) {
-        printf("Failed to add matrices\n");
+        printf("Failed to sub matrices\n");
         return INVALID_MATRIX;
     }
 
@@ -73,45 +81,46 @@ clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) {
         mat.values[i] -= other.values[i];
     }
 
-    //clm_freeMatrix(other);
-
     return mat;
 }
 
 clm_Matrix clm_matrixTranspose(clm_Matrix mat) {
-    clm_Matrix tr = clm_createMatrix(mat.cols, mat.rows);
-    for(unsigned int i = 0; i < tr.rows; i++) {
-        for(unsigned int j = 0; j < tr.cols; j++) {
-            tr.values[i * tr.cols + j] = mat.values[j * mat.cols + i];
-        }
-    }
-
-    //clm_freeMatrix(mat);
-
+    clm_Matrix tr;
+    tr.cols = mat.rows;
+    tr.rows = mat.cols;
+    tr.values = mat.values;
+    tr.transposed = !mat.transposed;
     return tr;
 }
 
-clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b) {
+clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out) {
     if(a.cols != b.rows) {
-        printf("Cannot multiply matrices\n");
+        printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
         return INVALID_MATRIX;
     }
 
-    clm_Matrix r = clm_createMatrix(a.rows, b.cols);
-    for(unsigned int i = 0; i < r.rows; i++) {
-        for(unsigned int j = 0; j < r.cols; j++) {
+    if(out.rows != a.rows || out.cols != b.cols) {
+        printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
+        return INVALID_MATRIX;
+    }
+
+    for(unsigned int i = 0; i < out.rows; i++) {
+        for(unsigned int j = 0; j < out.cols; j++) {
             float sum = 0;
             for(unsigned int k = 0; k < a.cols; k++) {
                 sum += a.values[i * a.cols + k] * b.values[k * b.cols + j];
             }
-            r.values[i * r.cols + j] = sum;
+            out.values[i * out.cols + j] = sum;
         }
     }
 
-    //clm_freeMatrix(a);
-    //clm_freeMatrix(b);
-
-    return r;
+    return out;
+}
+
+clm_Matrix clm_matrixMultiplyMatrixALLOC(clm_Matrix a, clm_Matrix b) {
+    clm_Matrix out = clm_createMatrix(a.rows, b.cols);
+    clm_matrixMultiplyMatrix(a, b, out);
+    return out;
 }
 
 clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other) {
@@ -138,7 +147,7 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) {
 clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
     for(unsigned int i = 0; i < mat.rows; i++) {
         for(unsigned int j = 0; j < mat.cols; j++) {
-            mat.values[i * mat.cols + j] = 1 / (1 + exp(-mat.values[i * mat.cols + j]));
+            matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j)));
         }
     }
 
@@ -148,8 +157,8 @@ clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
 clm_Matrix clm_matrixDSigmoid(clm_Matrix mat) {
     for(unsigned int i = 0; i < mat.rows; i++) {
         for(unsigned int j = 0; j < mat.cols; j++) {
-            float v = mat.values[i * mat.cols + j];
-            mat.values[i * mat.cols + j] = v * (1 - v);
+            float v = matrixAt(mat, i, j);
+            matrixAt(mat, i, j) = v * (1 - v);
         }
     }
 
@@ -162,6 +171,14 @@ clm_Matrix clm_matrixFromArray(float *array, unsigned int length) {
     return matrix;
 }
 
+clm_Matrix clm_matrixWrapArray(float *array, unsigned int length) {
+    clm_Matrix mat;
+    mat.rows = length;
+    mat.cols = 1;
+    mat.values = array;
+    return mat;
+}
+
 bool clm_matrixIsInvalid(clm_Matrix mat) {
     return mat.values == NULL;
 }
@@ -181,6 +198,10 @@ clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) {
     clm_Linear linear;
     linear.weights = clm_createMatrixRandom(outputs, inputs);
     linear.bias = clm_createMatrixRandom(outputs, 1);
+
+    linear.output = clm_createMatrix(outputs, 1);
+    linear.error = clm_createMatrix(outputs, 1);
+    linear.weightsError = clm_createMatrix(outputs, inputs);
     return linear;
 }
 
@@ -197,7 +218,7 @@ void clm_matrixPrint(clm_Matrix mat) {
     printf("[\n");
     for(unsigned int i = 0; i < mat.rows; i++) {
        for(unsigned int j = 0; j < mat.cols; j++) {
-            printf("%7.3f", mat.values[i * mat.cols + j]);
+            printf("%7.3f", matrixAt(mat, i, j));
        }
        printf("\n");
     }
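The reworked clm_matrixMultiplyMatrix writes into a caller-supplied out matrix instead of allocating a fresh result, and clm_matrixMultiplyMatrixALLOC keeps the old allocate-and-return behaviour for callers that still want it. One detail worth checking: clm_matrixWrapArray, as shown, does not set the new transposed field that matrixAt later reads. A minimal usage sketch of the no-allocation path, with illustrative sizes and names:

/* Sketch: reuse one pre-allocated output matrix across many multiplies,
 * so the hot loop performs no heap allocations. Sizes and names are illustrative. */
clm_Matrix w = clm_createMatrixRandom(30, 784); /* weights, allocated once */
clm_Matrix x = clm_createMatrix(784, 1);        /* input column vector */
clm_Matrix y = clm_createMatrix(30, 1);         /* output buffer, allocated once */

for(unsigned int step = 0; step < 1000; step++) {
    /* ...fill x with the next sample... */
    clm_matrixMultiplyMatrix(w, x, y); /* writes into y, no calloc inside the loop */
}

clm_freeMatrix(w);
clm_freeMatrix(x);
clm_freeMatrix(y);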
src/clm.h (11 changes)
@@ -3,10 +3,13 @@
 
 #include <stdbool.h>
 
+#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
+
 typedef struct {
     float *values;
     unsigned int rows;
     unsigned int cols;
+    bool transposed;
 } clm_Matrix;
 
 typedef struct {
@@ -17,6 +20,9 @@ typedef struct {
 typedef struct {
     clm_Matrix weights;
     clm_Matrix bias;
+    clm_Matrix output;
+    clm_Matrix error;
+    clm_Matrix weightsError;
 } clm_Linear;
 
 typedef struct {
@@ -29,17 +35,20 @@ extern const clm_Matrix INVALID_MATRIX;
 extern const clm_Vector INVALID_VECTOR;
 
 clm_Matrix clm_createMatrixRandom(unsigned int rows, unsigned int cols);
+clm_Matrix clm_matrixZero(clm_Matrix mat);
 clm_Matrix clm_matrixAddScalar(clm_Matrix mat, float scalar);
 clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other);
 clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other);
 clm_Matrix clm_matrixTranspose(clm_Matrix mat);
-clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b);
+clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out);
+clm_Matrix clm_matrixMultiplyMatrixALLOC(clm_Matrix a, clm_Matrix b);
 clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other);
 clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar);
 clm_Matrix clm_matrixSigmoid(clm_Matrix mat);
 clm_Matrix clm_matrixDSigmoid(clm_Matrix mat);
 
 clm_Matrix clm_matrixFromArray(float *array, unsigned int length);
+clm_Matrix clm_matrixWrapArray(float *array, unsigned int length);
 clm_Matrix clm_matrixCopy(clm_Matrix matrix);
 
 bool clm_matrixIsInvalid(clm_Matrix mat);
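With the new transposed flag, clm_matrixTranspose becomes an O(1) view that shares values with the original matrix, and matrixAt picks the matching stride. Note the macro expands r and c without parentheses, so compound index expressions such as i + 1 would misindex. A small sketch of how the shared view behaves (values are illustrative):

/* Sketch: transposing is now allocation-free; both views share storage,
 * and element access goes through matrixAt, which switches the stride. */
clm_Matrix m = clm_createMatrix(2, 3);
matrixAt(m, 1, 2) = 5.0f;              /* row 1, col 2 of the 2x3 view */

clm_Matrix t = clm_matrixTranspose(m); /* 3x2 view over the same values */
float v = matrixAt(t, 2, 1);           /* same element as above, 5.0f */

clm_freeMatrix(m);                     /* free the shared buffer only once */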
src/cltest.c (140 changes)
@@ -1,6 +1,7 @@
+#include <dlfcn.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <inttypes.h>
 
 #include "clm.h"
 
@@ -8,22 +9,20 @@ float train_data_x[4][2] = {
     {0, 0},
     {0, 1},
     {1, 0},
-    {1, 1}
-};
+    {1, 1}};
 
 float train_data_y[4][1] = {
     {0},
     {1},
     {1},
-    {0}
-};
+    {0}};
 
 float *predict(clm_NN nn, float *x, unsigned int length) {
-    clm_Matrix xM = clm_matrixFromArray(x, length);
+    clm_Matrix xM = clm_matrixWrapArray(x, length);
 
     for(unsigned int i = 0; i < nn.numLayers; i++) {
         clm_Linear layer = nn.layers[i];
-        clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM);
+        clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
 
         if(clm_matrixIsInvalid(newX)) {
             printf("Failed to predict\n");
@@ -32,7 +31,6 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
 
         clm_matrixAddMatrix(newX, layer.bias);
         clm_matrixSigmoid(newX);
-        clm_freeMatrix(xM);
         xM = newX;
     }
 
@@ -40,17 +38,16 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
 }
 
 void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
-    clm_Matrix xM = clm_matrixFromArray(x, xL);
-    clm_Matrix yM = clm_matrixFromArray(y, yL);
+    clm_Matrix xM = clm_matrixWrapArray(x, xL);
+    clm_Matrix yM = clm_matrixWrapArray(y, yL);
 
+    // TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
+
     // Forward pass
-    clm_Matrix *outputs = calloc(nn.numLayers + 1 /* 1 for input */, sizeof(clm_Matrix));
-    outputs[0] = xM;
+    clm_Matrix currentX = xM;
     for(unsigned int i = 0; i < nn.numLayers; i++) {
         clm_Linear layer = nn.layers[i];
-        clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM);
+        clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
         if(clm_matrixIsInvalid(newX)) {
             printf("Forward pass failed\n");
             return;
@@ -58,59 +55,34 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
 
         clm_matrixAddMatrix(newX, layer.bias);
         clm_matrixSigmoid(newX);
-        xM = newX;
-        outputs[i + 1] = xM;
+        currentX = newX;
     }
 
-    clm_Matrix dError = clm_matrixSubtractMatrix(yM, outputs[nn.numLayers]); // yhat - y
-
-    clm_Matrix lastGradient = clm_matrixDSigmoid(clm_matrixCopy(outputs[nn.numLayers])); // dsig(yhat)
-    clm_matrixMultiplyMatrixElements(lastGradient, dError); // (yhat - y) . dsig(yhat)
-    clm_matrixMultiplyScalar(lastGradient, nn.learnRate);
-
-    clm_Matrix lastInputT = clm_matrixTranspose(outputs[nn.numLayers - 1]);
-    clm_Matrix lastDeltaW = clm_matrixMultiplyMatrix(lastGradient, lastInputT);
-    clm_freeMatrix(lastInputT);
-
-    clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].weights, lastDeltaW);
-    clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].bias, lastGradient);
-
-    clm_freeMatrix(lastDeltaW);
-    clm_freeMatrix(lastGradient);
-
-    for(int i = nn.numLayers - 2; i >= 0; i--) {
+    for(int i = nn.numLayers - 1; i >= 0; i--) {
         clm_Linear layer = nn.layers[i];
-        clm_Matrix inputToThisLayer = outputs[i];
-        clm_Matrix outputOfThisLayer = outputs[i + 1];
+        clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
+        clm_Matrix outputOfThisLayer = nn.layers[i].output;
+        clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
+        clm_Matrix error = layer.error;
 
-        clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
-        clm_Matrix newDError = clm_matrixMultiplyMatrix(weightsT, dError);
-        clm_freeMatrix(weightsT);
-        clm_freeMatrix(dError);
-        dError = newDError;
+        if(i == nn.numLayers - 1) {
+            clm_matrixZero(error); // Zero the error matrix
+            clm_matrixSubtractMatrix(clm_matrixAddMatrix(error, yM), outputOfThisLayer); // yhat - y
+        } else {
+            clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
+            clm_matrixMultiplyMatrix(weightsT, prevError, error);
+        }
 
-        clm_Matrix gradient = clm_matrixDSigmoid(clm_matrixCopy(outputOfThisLayer));
-        clm_matrixMultiplyMatrixElements(gradient, dError);
+        clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
+        clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
         clm_matrixMultiplyScalar(gradient, nn.learnRate);
 
         clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
-        clm_Matrix deltaW = clm_matrixMultiplyMatrix(gradient, inputT);
-        clm_freeMatrix(inputT);
+        clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
 
-        clm_matrixAddMatrix(layer.weights, deltaW);
+        clm_matrixAddMatrix(layer.weights, layer.weightsError);
         clm_matrixAddMatrix(layer.bias, gradient);
-
-        clm_freeMatrix(deltaW);
-        clm_freeMatrix(gradient);
     }
-
-    clm_freeMatrix(dError);
-
-    for(unsigned int i = 0; i <= nn.numLayers; i++) {
-        clm_freeMatrix(outputs[i]);
-    }
-
-    free(outputs);
 }
 
@@ -128,7 +100,8 @@ void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
     unsigned char lengthBytes[4];
     fread(lengthBytes, sizeof(lengthBytes), 1, file);
 
-    uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3];
+    uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
+                      (lengthBytes[2] << 8) | lengthBytes[3];
     printf("%" PRId32 "\n", length);
 
     clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
@@ -190,7 +163,13 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
     *imageCountOut = length;
 }
 
+typedef void *(*callocFunc)(size_t, size_t);
+
+callocFunc oldCalloc;
+
 int main() {
+    oldCalloc = dlsym(RTLD_NEXT, "calloc");
+
     clm_Vector *labels = NULL;
     unsigned int labelCount;
     loadLabels(&labels, &labelCount);
@@ -211,50 +190,48 @@ int main() {
         h = 30,
         o = 10;
 
-    clm_Linear layer1;
-    layer1.weights = clm_createMatrixRandom(h, i);
-    layer1.bias = clm_createMatrixRandom(h, 1);
-
-    clm_Linear layer2;
-    layer2.weights = clm_createMatrixRandom(o, h);
-    layer2.bias = clm_createMatrixRandom(o, 1);
-
+    clm_Linear layer1 = clm_linearCreateRandom(i, h);
+    clm_Linear layer2 = clm_linearCreateRandom(h, o);
     clm_Linear layers[] = {layer1, layer2};
-    clm_NN nn = { layers, sizeof(layers) / sizeof(clm_Linear), 0.01 };
+    clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
 
-    for(unsigned int epoch = 0; epoch < 10; epoch++) {
+    for(unsigned int epoch = 0; epoch < 10; epoch++) {
         printf("Epoch %u\n", epoch);
 
         for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
             if(idx % 1000 == 0) {
-                printf("%.2f%%\n", idx / (float) imageCount * 100);
+                printf("\r%.2f%%", idx / (float) imageCount * 100);
+                fflush(stdout);
             }
-            //printf("%u\n", idx);
-            //train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
+            // printf("%u\n", idx);
+            // train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
             /*for(unsigned int f = 0; f < images[idx].length; f++) {
-                printf("%.2f ", images[idx].values[f]);
+                printf("%.2f ", images[idx].values[f]);
             }
             printf("\n");
             for(unsigned int f = 0; f < labels[idx].length; f++) {
-                printf("%.2f ", labels[idx].values[f]);
+                printf("%.2f ", labels[idx].values[f]);
            }
            printf("\n");*/
-            //printf("%.2f\n", labels.values[idx]);
+            // printf("%.2f\n", labels.values[idx]);
 
             train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
-            //train(nn, test, 784, target, 10);
-            //predict(nn, test, 784);
+            // train(nn, test, 784, target, 10);
+            // predict(nn, test, 784);
         }
+        printf("\n");
     }
 
     printf("Train done\n");
 
     unsigned int correct = 0;
     for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
-        //printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0], train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
+        // printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
+        //        train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
         float *pred = predict(nn, images[idx].values, images[idx].length);
         unsigned int predDigit = 0;
         float max = -1;
         for(unsigned int j = 0; j < 10; j++) {
-            //printf("%.2f ", pred[j]);
+            // printf("%.2f ", pred[j]);
             if(pred[j] > max || max < 0) {
                 max = pred[j];
                 predDigit = j;
@@ -265,14 +242,14 @@ int main() {
         unsigned int actDigit = 0;
         float maxA = -1;
         for(unsigned int j = 0; j < 10; j++) {
-            //printf("%.2f ", pred[j]);
+            // printf("%.2f ", pred[j]);
             if(labels[idx].values[j] > maxA || maxA < 0) {
                 maxA = labels[idx].values[j];
                 actDigit = j;
             }
         }
         if(idx < 100) printf("Actual: %u\n", actDigit);
-        //printf("\n");
+        // printf("\n");
 
         if(predDigit == actDigit) correct++;
     }
@@ -281,3 +258,8 @@ int main() {
 
     printf("\n");
 }
+
+void *calloc(size_t nmemb, size_t size) {
+    // printf("CALLOC\n");
+    return oldCalloc(nmemb, size);
+}
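The calloc wrapper at the end of cltest.c forwards to the real allocator resolved via dlsym(RTLD_NEXT, "calloc"), presumably to catch stray allocations during training. RTLD_NEXT is a GNU extension, so _GNU_SOURCE has to be defined before <dlfcn.h> is included, and dlsym itself can allocate on some libc implementations, which is one reason to resolve the real calloc once up front in main as the commit does. A sketch of using the same hook to count allocations per training step; the counter and printout are illustrative additions, not part of the commit:

/* Sketch: count heap allocations observed while training, using the same
 * RTLD_NEXT interposition as the commit. Assumes glibc/Linux. */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stddef.h>
#include <stdio.h>

typedef void *(*callocFunc)(size_t, size_t);

static callocFunc oldCalloc;
static unsigned long callocCalls;

void *calloc(size_t nmemb, size_t size) {
    callocCalls++;                      /* every calloc in the process lands here */
    return oldCalloc(nmemb, size);
}

int main(void) {
    oldCalloc = dlsym(RTLD_NEXT, "calloc"); /* resolve the real calloc once, up front */

    unsigned long before = callocCalls;
    /* ...run one training step here... */
    printf("calloc calls during step: %lu\n", callocCalls - before);
}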
src/mat.cl (new file, 11 lines)
@@ -0,0 +1,11 @@
+typedef struct __attribute__ ((packed)) {
+
+} cl_GPUMat;
+
+void amogus() {
+    printf("HEllo world!");
+}
+
+__kernel void mat_multiply(__global float *nnWeights, __global float *inputs, __global float *c) {
+    amogus();
+}