diff --git a/Makefile b/Makefile
index 7945949..61f49af 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ CFLAGS=-Wall -g
 .PHONY: all
 all:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
 
 .PHONY: run
 run: all
@@ -14,7 +14,7 @@ run: all
 .PHONY: cl
 cl:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
 
 .PHONY: cl_run
 cl_run: cl
diff --git a/src/cl.c b/src/cl.c
index a2c98cb..987daa0 100644
--- a/src/cl.c
+++ b/src/cl.c
@@ -1,6 +1,6 @@
+#define CL_TARGET_OPENCL_VERSION 200
 #include <stdio.h>
 #include <CL/cl.h>
-#define CL_TARGET_OPENCL_VERSION 300
 
 #include "clm.h"
 
@@ -70,9 +70,9 @@ int main() {
 		return 1;
 	}
 
-	clm_Matrix a = clm_createMatrixRandom(3, 4);
-	clm_Matrix b = clm_createMatrixRandom(4, 5);
-	clm_Matrix out = clm_createMatrixRandom(3, 5);
+	clm_Matrix a = clm_createMatrixRandom(5, 10);
+	clm_Matrix b = clm_createMatrixRandom(10, 3);
+	clm_Matrix out = clm_createMatrixRandom(5, 3);
 
 	cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
 	cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
diff --git a/src/clm.c b/src/clm.c
index bdacf87..5221e53 100644
--- a/src/clm.c
+++ b/src/clm.c
@@ -158,10 +158,8 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) {
 }
 
 clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
-	for(unsigned int i = 0; i < mat.rows; i++) {
-		for(unsigned int j = 0; j < mat.cols; j++) {
-			matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j)));
-		}
+	for(unsigned int i = 0; i < mat.rows * mat.cols; i++) {
+		mat.values[i] = 1 / (1 + exp(-mat.values[i]));
 	}
 
 	return mat;
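A note on the src/cl.c hunk above: `CL_TARGET_OPENCL_VERSION` only takes effect when it is defined before `CL/cl.h` is included, which is presumably why the define moves from below the includes to line 1 (and drops from 300 to 200 to match the 2.0-era API in use). The required ordering, in short:

```c
#define CL_TARGET_OPENCL_VERSION 200 /* must come first...            */
#include <CL/cl.h>                   /* ...so the header can honor it */
```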
diff --git a/src/clm.h b/src/clm.h
index d452aa1..2897d52 100644
--- a/src/clm.h
+++ b/src/clm.h
@@ -5,6 +5,8 @@
 #define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 
+typedef struct clm_NativeBuf clm_NativeBuf;
+
 typedef struct {
 	float *values;
 	unsigned int rows;
@@ -23,6 +25,9 @@ typedef struct {
 	clm_Matrix output;
 	clm_Matrix error;
 	clm_Matrix weightsError;
+	clm_NativeBuf *nativeWeights;
+	clm_NativeBuf *nativeBias;
+	clm_NativeBuf *nativeOutput;
 } clm_Linear;
 
 typedef struct {
diff --git a/src/clm_gpu.h b/src/clm_gpu.h
new file mode 100644
index 0000000..2548383
--- /dev/null
+++ b/src/clm_gpu.h
@@ -0,0 +1,11 @@
+#ifndef _CLM_GPU_H_
+#define _CLM_GPU_H_
+
+#include "clm.h"
+
+int clm_gpuInit();
+void clm_gpuDestroy();
+
+void clm_linearForward(clm_Linear *linear, clm_Matrix input);
+
+#endif
diff --git a/src/clm_gpu_cpu.c b/src/clm_gpu_cpu.c
new file mode 100644
index 0000000..7ab5caf
--- /dev/null
+++ b/src/clm_gpu_cpu.c
@@ -0,0 +1,24 @@
+#include "clm_gpu.h"
+
+#include <stdio.h>
+
+struct clm_NativeBuf {};
+
+int clm_gpuInit() {
+	return 0;
+}
+
+void clm_gpuDestroy() {}
+
+void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
+	clm_Matrix newX = clm_matrixMultiplyMatrix(linear->weights, input, linear->output);
+
+	if(clm_matrixIsInvalid(newX)) {
+		printf("Forward pass failed\n");
+		return;
+	}
+
+	clm_matrixAddMatrix(newX, linear->bias);
+	clm_matrixSigmoid(newX);
+	linear->output = newX;
+}
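clm_gpu.h puts the math backend behind a three-function interface: clm_gpu_cpu.c above is the plain-C reference implementation, clm_gpu_opencl.c below is the GPU one, and the Makefile picks a backend by linking exactly one of the two. One gap in the OpenCL file that follows: when clBuildProgram fails it prints only "Failed to build program" and leaves a `// clGetProgramBuildInfo...` placeholder. A minimal sketch of what that placeholder could become, using the standard size-then-fill pattern on the `program` and `deviceID` variables already in scope there:

```c
// On CL_BUILD_PROGRAM_FAILURE, fetch and print the compiler's build log.
size_t logSize;
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
char *log = malloc(logSize);
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
printf("Build log:\n%s\n", log);
free(log);
```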
diff --git a/src/clm_gpu_opencl.c b/src/clm_gpu_opencl.c
new file mode 100644
index 0000000..fd64052
--- /dev/null
+++ b/src/clm_gpu_opencl.c
@@ -0,0 +1,192 @@
+#include "clm_gpu.h"
+#include <stdio.h>
+
+#define CL_TARGET_OPENCL_VERSION 200
+#include <CL/cl.h>
+
+#include <math.h>
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+static cl_context context;
+static cl_device_id deviceID;
+static cl_command_queue queue;
+static cl_kernel kernel;
+
+struct clm_NativeBuf {
+	cl_mem mem;
+};
+
+typedef struct __attribute__((packed)) {
+	cl_uint rows;
+	cl_uint cols;
+	cl_char transposed;
+} cl_GPUMat;
+
+#define gpuMat(mat) \
+	{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
+
+char *loadFile(const char *path) {
+	FILE *file = fopen(path, "r");
+	fseek(file, 0, SEEK_END);
+	size_t length = ftell(file);
+	fseek(file, 0, SEEK_SET);
+	char *buffer = calloc(1, length + 1);
+	fread(buffer, length, 1, file);
+	return buffer;
+}
+
+int clm_gpuInit() {
+	// Connect to a compute device
+	int useGPU = true;
+	cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Error: Failed to create a device group!\n");
+		return 1;
+	}
+
+	char *buffer = loadFile("src/mat.cl");
+	printf("%s", buffer);
+
+	context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
+	if(!context) {
+		printf("Failed to create context\n");
+		return 1;
+	}
+
+	queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
+	if(!queue) {
+		printf("Failed to create command queue\n");
+		return 1;
+	}
+
+	size_t length = strlen(buffer);
+	cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
+	if(!program) {
+		printf("Failed to create program\n");
+		return 1;
+	}
+
+	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to build program\n");
+		// clGetProgramBuildInfo...
+		return 1;
+	}
+
+	kernel = clCreateKernel(program, "linear_forward", &err);
+	if(!kernel) {
+		printf("Failed to create kernel\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+void clm_gpuDestroy() {
+}
+
+static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
+	cl_int err;
+
+	if(!nativeBuf->mem) {
+		cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mat.rows * mat.cols, NULL, &err);
+		if(!mat_values) {
+			printf("Failed to alloc buffer: %d\n", err);
+			return NULL;
+		}
+		nativeBuf->mem = mat_values;
+	}
+
+	return nativeBuf->mem;
+}
+
+static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
+	cl_int err;
+	cl_mem mem = allocGPUMat(gpuMat, nativeBuf);
+
+	err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to enqueue write: %d\n", err);
+		return NULL;
+	}
+
+	return mem;
+}
+
+clm_NativeBuf nativeInput;
+
+// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
+void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
+	if(!linear->nativeWeights) {
+		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
+	}
+
+	cl_GPUMat matInput = gpuMat(input);
+	cl_GPUMat matWeights = gpuMat(linear->weights);
+	cl_GPUMat matBias = gpuMat(linear->bias);
+	cl_GPUMat matOut = gpuMat(linear->output);
+
+	size_t workSize = matOut.rows * matOut.cols;
+
+	cl_int err;
+
+	cl_mem matInput_values = writeGPUMat(matInput, input, &nativeInput);
+	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
+	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
+	if(!matInput_values || !matWeights_values || !matBias_values) {
+		linear->output = INVALID_MATRIX;
+		return;
+	}
+
+	cl_mem matOut_values = allocGPUMat(matOut, linear->nativeOutput);
+	if(!matOut_values) {
+		printf("Failed to alloc out\n");
+		linear->output = INVALID_MATRIX;
+		return;
+	}
+
+	err = 0;
+	err |= clSetKernelArg(kernel, 0, sizeof(matInput), &matInput);
+	err |= clSetKernelArg(kernel, 1, sizeof(matInput_values), &matInput_values);
+	err |= clSetKernelArg(kernel, 2, sizeof(matWeights), &matWeights);
+	err |= clSetKernelArg(kernel, 3, sizeof(matWeights_values), &matWeights_values);
+	err |= clSetKernelArg(kernel, 4, sizeof(matBias), &matBias);
+	err |= clSetKernelArg(kernel, 5, sizeof(matBias_values), &matBias_values);
+	err |= clSetKernelArg(kernel, 6, sizeof(matOut), &matOut);
+	err |= clSetKernelArg(kernel, 7, sizeof(matOut_values), &matOut_values);
+	if(err != CL_SUCCESS) {
+		printf("Failed to set kernel args: %d\n", err);
+		return;
+	}
+
+	/*char *info = calloc(1, 1024);
+	clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
+	printf("INFO: %s\n", info);*/
+
+	size_t local;
+	err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to get work group size\n");
+		return;
+	}
+
+	size_t global = ceil((float) workSize / local) * local;
+	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to enqueue: %d\n", err);
+		return;
+	}
+
+	clFinish(queue);
+
+	err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output.values, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to read from buffer\n");
+		return;
+	}
+}
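The launch logic above rounds the logical work size up to a whole number of work-groups, which is why every kernel in mat.cl starts with an `if(idx >= rows * cols) return;` guard: the padding work-items must do nothing. A worked example with this project's 30-unit hidden layer, assuming clGetKernelWorkGroupInfo reports a work-group size of 256 (the real value is device-dependent):

```c
// workSize = matOut.rows * matOut.cols = 30 * 1 = 30
// global   = ceil(30 / 256.0) * 256    = 1 * 256  = 256
// => 256 work-items run; items 30..255 fail the bounds check and exit early
size_t global = ceil((float) workSize / local) * local;
```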
diff --git a/src/cltest.bak2.c b/src/cltest.bak2.c
new file mode 100644
index 0000000..4ce80c6
--- /dev/null
+++ b/src/cltest.bak2.c
@@ -0,0 +1,264 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <inttypes.h>
+
+#include "clm.h"
+
+float train_data_x[4][2] = {
+	{0, 0},
+	{0, 1},
+	{1, 0},
+	{1, 1}};
+
+float train_data_y[4][1] = {
+	{0},
+	{1},
+	{1},
+	{0}};
+
+float *predict(clm_NN nn, float *x, unsigned int length) {
+	clm_Matrix xM = clm_matrixWrapArray(x, length);
+
+	for(unsigned int i = 0; i < nn.numLayers; i++) {
+		clm_Linear layer = nn.layers[i];
+		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
+
+		if(clm_matrixIsInvalid(newX)) {
+			printf("Failed to predict\n");
+			return NULL;
+		}
+
+		clm_matrixAddMatrix(newX, layer.bias);
+		clm_matrixSigmoid(newX);
+		xM = newX;
+	}
+
+	return xM.values;
+}
+
+void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
+	clm_Matrix xM = clm_matrixWrapArray(x, xL);
+	clm_Matrix yM = clm_matrixWrapArray(y, yL);
+
+	// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
+
+	// Forward pass
+	clm_Matrix currentX = xM;
+	for(unsigned int i = 0; i < nn.numLayers; i++) {
+		clm_Linear layer = nn.layers[i];
+		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
+		if(clm_matrixIsInvalid(newX)) {
+			printf("Forward pass failed\n");
+			return;
+		}
+
+		clm_matrixAddMatrix(newX, layer.bias);
+		clm_matrixSigmoid(newX);
+		currentX = newX;
+	}
+
+	for(int i = nn.numLayers - 1; i >= 0; i--) {
+		clm_Linear layer = nn.layers[i];
+		clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
+		clm_Matrix outputOfThisLayer = nn.layers[i].output;
+		clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
+		clm_Matrix error = layer.error;
+
+		if(i == nn.numLayers - 1) {
+			clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
+		} else {
+			clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
+			clm_matrixMultiplyMatrix(weightsT, prevError, error);
+		}
+
+		clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
+		clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
+		clm_matrixMultiplyScalar(gradient, nn.learnRate);
+
+		clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
+		clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
+
+		clm_matrixAddMatrix(layer.weights, layer.weightsError);
+		clm_matrixAddMatrix(layer.bias, gradient);
+	}
+}
+
+void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
+	FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
+	if(!file) {
+		perror("Failed to open labels\n");
+		return;
+	}
+
+	unsigned char magicBytes[4];
+	fread(magicBytes, sizeof(magicBytes), 1, file);
+	printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
+
+	unsigned char lengthBytes[4];
+	fread(lengthBytes, sizeof(lengthBytes), 1, file);
+	uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
+					  (lengthBytes[2] << 8) | lengthBytes[3];
+	printf("%" PRId32 "\n", length);
+
+	clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
+
+	for(unsigned int i = 0; i < length; i++) {
+		unsigned char label;
+		fread(&label, sizeof(unsigned char), 1, file);
+
+		clm_Vector vector = clm_vectorCreate(10);
+		for(unsigned int j = 0; j < 10; j++) {
+			vector.values[j] = label == j ? 1 : 0;
+		}
+		vectors[i] = vector;
+	}
+
+	*labelsOut = vectors;
+	*labelsCountOut = length;
+}
+
+void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
+	FILE *file = fopen("data/train-images.idx3-ubyte", "r");
+	if(!file) {
+		perror("Failed to open images\n");
+		return;
+	}
+
+	unsigned char magicBytes[4];
+	fread(magicBytes, sizeof(magicBytes), 1, file);
+	printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
+
+	unsigned char lengthBytes[4];
+	fread(lengthBytes, sizeof(lengthBytes), 1, file);
+	uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3];
+	printf("%" PRId32 "\n", length);
+
+	unsigned char rowsBytes[4];
+	fread(rowsBytes, sizeof(rowsBytes), 1, file);
+	uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3];
+	printf("%" PRId32 "\n", rows);
+
+	unsigned char colsBytes[4];
+	fread(colsBytes, sizeof(colsBytes), 1, file);
+	uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3];
+	printf("%" PRId32 "\n", cols);
+
+	clm_Vector *images = calloc(length, sizeof(clm_Vector));
+	for(unsigned int i = 0; i < length; i++) {
+		clm_Vector vec = clm_vectorCreate(cols * rows);
+		unsigned char image[cols * rows];
+		fread(image, sizeof(image), 1, file);
+		for(unsigned int j = 0; j < cols * rows; j++) {
+			vec.values[j] = (float) image[j];
+		}
+		images[i] = vec;
+	}
+
+	*imagesOut = images;
+	*imageCountOut = length;
+}
+
+typedef void *(*callocFunc)(size_t, size_t);
+
+callocFunc oldCalloc;
+
+int main() {
+	oldCalloc = dlsym(RTLD_NEXT, "calloc");
+
+	clm_Vector *labels = NULL;
+	unsigned int labelCount;
+	loadLabels(&labels, &labelCount);
+	printf("LENGTH: %u\n", labelCount);
+
+	clm_Vector *images = NULL;
+	unsigned int imageCount;
+	loadImages(&images, &imageCount);
+
+	imageCount = 60000;
+
+	printf("%f\n", images[0].values[0]);
+
+	srand(1);
+
+	unsigned int
+		i = 784,
+		h = 30,
+		o = 10;
+
+	clm_Linear layer1 = clm_linearCreateRandom(i, h);
+	clm_Linear layer2 = clm_linearCreateRandom(h, o);
+	clm_Linear layers[] = {layer1, layer2};
+	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
+
+	for(unsigned int epoch = 0; epoch < 1; epoch++) {
+		printf("Epoch %u\n", epoch);
+		for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
+			if(idx % 1000 == 0) {
+				printf("\r%.2f%%", idx / (float) imageCount * 100);
+				fflush(stdout);
+			}
+			// printf("%u\n", idx);
+			// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
+			/*for(unsigned int f = 0; f < images[idx].length; f++) {
+				printf("%.2f ", images[idx].values[f]);
+			}
+			printf("\n");
+			for(unsigned int f = 0; f < labels[idx].length; f++) {
+				printf("%.2f ", labels[idx].values[f]);
+			}
+			printf("\n");*/
+			// printf("%.2f\n", labels.values[idx]);
+
+			train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
+			// train(nn, test, 784, target, 10);
+			// predict(nn, test, 784);
+		}
+		printf("\n");
+	}
+
+	printf("Train done\n");
+
+	unsigned int correct = 0;
+	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
+		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
+		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
+		float *pred = predict(nn, images[idx].values, images[idx].length);
+		unsigned int predDigit = 0;
+		float max = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(pred[j] > max || max < 0) {
+				max = pred[j];
+				predDigit = j;
+			}
+		}
+		if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
+
+		unsigned int actDigit = 0;
+		float maxA = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(labels[idx].values[j] > maxA || maxA < 0) {
+				maxA = labels[idx].values[j];
+				actDigit = j;
+			}
+		}
+		if(idx < 100) printf("Actual: %u\n", actDigit);
+		// printf("\n");
+
+		if(predDigit == actDigit) correct++;
+	}
+
+	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
+
+	printf("\n");
+}
+
+void *calloc(size_t nmemb, size_t size) {
+	// printf("CALLOC\n");
+	return oldCalloc(nmemb, size);
+}
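src/cltest.bak2.c is a verbatim snapshot of cltest.c before the refactor that follows; the one piece of machinery the new version drops entirely is the calloc interposer resolved through dlsym(RTLD_NEXT, "calloc"), presumably left over from allocation debugging. For reference, a minimal self-contained version of that idiom, assuming glibc (RTLD_NEXT is only exposed under _GNU_SOURCE, and glibc before 2.34 also needs -ldl at link time):

```c
#define _GNU_SOURCE // exposes RTLD_NEXT in <dlfcn.h> on glibc
#include <dlfcn.h>
#include <stdlib.h>

typedef void *(*callocFunc)(size_t, size_t);
static callocFunc oldCalloc;

// This definition shadows libc's calloc for the whole program; oldCalloc
// must already be resolved before it runs, since dlsym itself may allocate.
void *calloc(size_t nmemb, size_t size) {
	return oldCalloc(nmemb, size);
}

int main(void) {
	oldCalloc = (callocFunc) dlsym(RTLD_NEXT, "calloc");
	void *p = calloc(4, 8); // routed through the interposer above
	free(p);
	return 0;
}
```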
diff --git a/src/cltest.c b/src/cltest.c
index 4ce80c6..69eccb9 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -4,6 +4,7 @@
 #include <inttypes.h>
 
 #include "clm.h"
+#include "clm_gpu.h"
 
 float train_data_x[4][2] = {
 	{0, 0},
@@ -21,17 +22,8 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
 	clm_Matrix xM = clm_matrixWrapArray(x, length);
 
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_Linear layer = nn.layers[i];
-		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
-
-		if(clm_matrixIsInvalid(newX)) {
-			printf("Failed to predict\n");
-			return NULL;
-		}
-
-		clm_matrixAddMatrix(newX, layer.bias);
-		clm_matrixSigmoid(newX);
-		xM = newX;
+		clm_linearForward(&nn.layers[i], xM);
+		xM = nn.layers[i].output;
 	}
 
 	return xM.values;
@@ -46,16 +38,8 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
 	// Forward pass
 	clm_Matrix currentX = xM;
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_Linear layer = nn.layers[i];
-		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
-		if(clm_matrixIsInvalid(newX)) {
-			printf("Forward pass failed\n");
-			return;
-		}
-
-		clm_matrixAddMatrix(newX, layer.bias);
-		clm_matrixSigmoid(newX);
-		currentX = newX;
+		clm_linearForward(&nn.layers[i], currentX);
+		currentX = nn.layers[i].output;
 	}
 
 	for(int i = nn.numLayers - 1; i >= 0; i--) {
@@ -162,12 +146,11 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
 	*imageCountOut = length;
 }
 
-typedef void *(*callocFunc)(size_t, size_t);
-
-callocFunc oldCalloc;
-
 int main() {
-	oldCalloc = dlsym(RTLD_NEXT, "calloc");
+	if(clm_gpuInit() != 0) {
+		printf("Failed to init GPU\n");
+		return 1;
+	}
 
 	clm_Vector *labels = NULL;
 	unsigned int labelCount;
@@ -189,9 +172,9 @@ int main() {
 		h = 30,
 		o = 10;
 
-	clm_Linear layer1 = clm_linearCreateRandom(i, h);
-	clm_Linear layer2 = clm_linearCreateRandom(h, o);
-	clm_Linear layers[] = {layer1, layer2};
+	clm_Linear layers[] = {
+		clm_linearCreateRandom(i, h),
+		clm_linearCreateRandom(h, o)};
 	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
 
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
@@ -201,21 +184,8 @@ int main() {
 			printf("\r%.2f%%", idx / (float) imageCount * 100);
 			fflush(stdout);
 		}
-		// printf("%u\n", idx);
-		// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
-		/*for(unsigned int f = 0; f < images[idx].length; f++) {
-			printf("%.2f ", images[idx].values[f]);
-		}
-		printf("\n");
-		for(unsigned int f = 0; f < labels[idx].length; f++) {
-			printf("%.2f ", labels[idx].values[f]);
-		}
-		printf("\n");*/
-		// printf("%.2f\n", labels.values[idx]);
 
 		train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
-		// train(nn, test, 784, target, 10);
-		// predict(nn, test, 784);
 	}
 	printf("\n");
 }
@@ -256,9 +226,6 @@ int main() {
 	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
 
 	printf("\n");
-}
 
-void *calloc(size_t nmemb, size_t size) {
-	// printf("CALLOC\n");
-	return oldCalloc(nmemb, size);
+	clm_gpuDestroy();
 }
diff --git a/src/mat.cl b/src/mat.cl
index f9d936e..0fa9eee 100644
--- a/src/mat.cl
+++ b/src/mat.cl
@@ -4,30 +4,44 @@ typedef struct __attribute__((packed)) {
 	char transposed;
 } cl_GPUMat;
 
-__kernel void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
-	/*if(a.cols != b.rows) {
-		printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
-		return INVALID_MATRIX;
-	}
-
-	if(out.rows != a.rows || out.cols != b.cols) {
-		printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
-		return INVALID_MATRIX;
-	}*/
+#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 
+void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
 
 	uint i = idx / matOut.cols;
 	uint j = idx % matOut.cols;
 
-	// for(unsigned int i = 0; i < out.rows; i++) {
-	//	for(unsigned int j = 0; j < out.cols; j++) {
 	float sum = 0;
 	for(unsigned int k = 0; k < matA.cols; k++) {
-		sum += matA_values[i * matA.cols + k] * matB_values[k * matB.cols + j];
+		sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
 	}
-	matOut_values[i * matOut.cols + j] = sum;
-	//}
-	//}
+	matrixAt(matOut, matOut_values, i, j) = sum;
+}
+
+void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
+	uint idx = get_global_id(0);
+	if(idx >= matA.rows * matA.cols) return;
+
+	matA_values[idx] += matB_values[idx];
+}
+
+void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
+	uint idx = get_global_id(0);
+	if(idx >= mat.rows * mat.cols) return;
+
+	mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
+}
+
+// clm_Matrix input;
+// clm_Matrix weights;
+// clm_Matrix bias;
+// clm_Matrix output;
+
+__kernel void linear_forward(cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
+	// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
+	mat_multiply(weights, weights_values, input, input_values, out, out_values);
+	mat_add(out, out_values, bias, bias_values);
+	mat_sigmoid(out, out_values);
 }
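Both clm.h and mat.cl now share the same convention: a transposed matrix is a view, and matrixAt swaps the stride instead of moving data, so element (r, c) of a transposed matrix lives at index c * rows + r. A tiny standalone check of that mapping (Mat is a stand-in for clm_Matrix with the same fields):

```c
#include <stdio.h>

typedef struct {
	float *values;
	unsigned int rows;
	unsigned int cols;
	char transposed;
} Mat; // stand-in for clm_Matrix

#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]

int main(void) {
	float data[6] = {0, 1, 2, 3, 4, 5}; // 2x3 matrix A, stored row-major
	Mat a = {data, 2, 3, 0};
	Mat aT = {data, 3, 2, 1}; // A^T as a 3x2 view over the same buffer
	printf("%.0f\n", matrixAt(a, 1, 2));  // A(1,2)   -> data[1*3+2] = 5
	printf("%.0f\n", matrixAt(aT, 2, 1)); // A^T(2,1) -> data[1*3+2] = 5
	return 0;
}
```

This also makes the FIXME in linear_forward concrete: for a transposed `out`, the element mat_multiply writes through matrixAt(matOut, ..., i, j) is generally not element `idx`, while mat_add and mat_sigmoid index `out_values[idx]` directly, so the same work-item would touch different elements across the three stages with no synchronization in between.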