More GPU stuff

2023-10-29 23:01:46 +01:00 · 2023-10-29 23:01:46 +01:00 · 4d93fa76cc
commit 4d93fa76cc
parent c91ca11346
10 changed files with 547 additions and 72 deletions
--- a/4
+++ b/4
@ -5,7 +5,7 @@ CFLAGS=-Wall -g
 .PHONY: all
 all:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
 .PHONY: run
 run: all
@ -14,7 +14,7 @@ run: all
 .PHONY: cl
 cl:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
 .PHONY: cl_run
 cl_run: cl
--- a/src/cl.c
+++ b/src/cl.c
@ -1,6 +1,6 @@
 #define CL_TARGET_OPENCL_VERSION 200
 #include <CL/cl_platform.h>
 #include <math.h>
 #define CL_TARGET_OPENCL_VERSION 300
 #include "clm.h"
@ -70,9 +70,9 @@ int main() {
 		return 1;
 	}
-	clm_Matrix a = clm_createMatrixRandom(3, 4);
+	clm_Matrix a = clm_createMatrixRandom(5, 10);
-	clm_Matrix b = clm_createMatrixRandom(4, 5);
+	clm_Matrix b = clm_createMatrixRandom(10, 3);
-	clm_Matrix out = clm_createMatrixRandom(3, 5);
+	clm_Matrix out = clm_createMatrixRandom(5, 3);
 	cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
 	cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
--- a/src/clm.c
+++ b/src/clm.c
@ -158,10 +158,8 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) {
 }
 clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
-	for(unsigned int i = 0; i < mat.rows; i++) {
+	for(unsigned int i = 0; i < mat.rows * mat.cols; i++) {
-		for(unsigned int j = 0; j < mat.cols; j++) {
+		mat.values[i] = 1 / (1 + exp(-mat.values[i]));
 			matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j)));
 		}
 	}
 	return mat;
--- a/src/clm.h
+++ b/src/clm.h
@ -5,6 +5,8 @@
 #define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 typedef struct clm_NativeBuf clm_NativeBuf;
 typedef struct {
 	float *values;
 	unsigned int rows;
@ -23,6 +25,9 @@ typedef struct {
 	clm_Matrix output;
 	clm_Matrix error;
 	clm_Matrix weightsError;
 	clm_NativeBuf *nativeWeights;
 	clm_NativeBuf *nativeBias;
 	clm_NativeBuf *nativeOutput;
 } clm_Linear;
 typedef struct {
--- a/src/clm_gpu.h
+++ b/src/clm_gpu.h
@ -0,0 +1,11 @@
 #ifndef _CLM_GPU_H_
 #define _CLM_GPU_H_
 #include "clm.h"
 int clm_gpuInit();
 void clm_gpuDestroy();
 void clm_linearForward(clm_Linear *linear, clm_Matrix input);
 #endif
--- a/src/clm_gpu_cpu.c
+++ b/src/clm_gpu_cpu.c
@ -0,0 +1,24 @@
 #include "clm_gpu.h"
 #include <stdio.h>
 struct clm_NativeBuf {};
 int clm_gpuInit() {
 	return 0;
 }
 void clm_gpuDestroy() {}
 void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
 	clm_Matrix newX = clm_matrixMultiplyMatrix(linear->weights, input, linear->output);
 	if(clm_matrixIsInvalid(newX)) {
 		printf("Forward pass failed\n");
 		return;
 	}
 	clm_matrixAddMatrix(newX, linear->bias);
 	clm_matrixSigmoid(newX);
 	linear->output = newX;
 }
--- a/src/clm_gpu_opencl.c
+++ b/src/clm_gpu_opencl.c
@ -0,0 +1,192 @@
 #include "clm_gpu.h"
 #include <math.h>
 #define CL_TARGET_OPENCL_VERSION 200
 #include <CL/cl_platform.h>
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
 static cl_context context;
 static cl_device_id deviceID;
 static cl_command_queue queue;
 static cl_kernel kernel;
 struct clm_NativeBuf {
 	cl_mem mem;
 };
 typedef struct __attribute__((packed)) {
 	cl_uint rows;
 	cl_uint cols;
 	cl_char transposed;
 } cl_GPUMat;
 #define gpuMat(mat) \
 	{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
 char *loadFile(const char *path) {
 	FILE *file = fopen(path, "r");
 	fseek(file, 0, SEEK_END);
 	size_t length = ftell(file);
 	fseek(file, 0, SEEK_SET);
 	char *buffer = calloc(1, length + 1);
 	fread(buffer, length, 1, file);
 	return buffer;
 }
 int clm_gpuInit() {
 	// Connect to a compute device
 	int useGPU = true;
 	cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Error: Failed to create a device group!\n");
 		return 1;
 	}
 	char *buffer = loadFile("src/mat.cl");
 	printf("%s", buffer);
 	context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
 	if(!context) {
 		printf("Failed to create context\n");
 		return 1;
 	}
 	queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
 	if(!queue) {
 		printf("Failed to create command queue\n");
 		return 1;
 	}
 	size_t length = strlen(buffer);
 	cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
 	if(!program) {
 		printf("Failed to create program\n");
 		return 1;
 	}
 	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to build program\n");
 		// clGetProgramBuildInfo...
 		return 1;
 	}
 	kernel = clCreateKernel(program, "linear_forward", &err);
 	if(!kernel) {
 		printf("Failed to create kernel\n");
 		return 1;
 	}
 	return 0;
 }
 void clm_gpuDestroy() {
 }
 static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
 	cl_int err;
 	if(!nativeBuf->mem) {
 		cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mat.rows * mat.cols, NULL, &err);
 		if(!mat_values) {
 			printf("Failed to alloc buffer: %d\n", err);
 			return NULL;
 		}
 		nativeBuf->mem = mat_values;
 	}
 	return nativeBuf->mem;
 }
 static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
 	cl_int err;
 	cl_mem mem = allocGPUMat(gpuMat, nativeBuf);
 	err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to enqueue write: %d\n", err);
 		return NULL;
 	}
 	return mem;
 }
 clm_NativeBuf nativeInput;
 // TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
 void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
 	if(!linear->nativeWeights) {
 		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
 		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
 		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
 	}
 	cl_GPUMat matInput = gpuMat(input);
 	cl_GPUMat matWeights = gpuMat(linear->weights);
 	cl_GPUMat matBias = gpuMat(linear->bias);
 	cl_GPUMat matOut = gpuMat(linear->output);
 	size_t workSize = matOut.rows * matOut.cols;
 	cl_int err;
 	cl_mem matInput_values = writeGPUMat(matInput, input, &nativeInput);
 	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
 	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
 	if(!matInput_values || !matWeights_values || !matBias_values) {
 		linear->output = INVALID_MATRIX;
 		return;
 	}
 	cl_mem matOut_values = allocGPUMat(matOut, linear->nativeOutput);
 	if(!matOut_values) {
 		printf("Failed to alloc out: %d\n", err);
 		linear->output = INVALID_MATRIX;
 		return;
 	}
 	err = 0;
 	err |= clSetKernelArg(kernel, 0, sizeof(matInput), &matInput);
 	err |= clSetKernelArg(kernel, 1, sizeof(matInput_values), &matInput_values);
 	err |= clSetKernelArg(kernel, 2, sizeof(matWeights), &matWeights);
 	err |= clSetKernelArg(kernel, 3, sizeof(matWeights_values), &matWeights_values);
 	err |= clSetKernelArg(kernel, 4, sizeof(matBias), &matBias);
 	err |= clSetKernelArg(kernel, 5, sizeof(matBias_values), &matBias_values);
 	err |= clSetKernelArg(kernel, 6, sizeof(matOut), &matOut);
 	err |= clSetKernelArg(kernel, 7, sizeof(matOut_values), &matOut_values);
 	if(err != CL_SUCCESS) {
 		printf("Failed to set kernel args: %d\n", err);
 		return;
 	}
 	/*char *info = calloc(1, 1024);
 	clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
 	printf("INFO: %s\n", info);*/
 	size_t local;
 	err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to get work group size\n");
 		return;
 	}
 	size_t global = ceil((float) workSize / local) * local;
 	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to enqueue: %d\n", err);
 		return;
 	}
 	clFinish(queue);
 	err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output.values, 0, NULL, NULL);
 	if(err != CL_SUCCESS) {
 		printf("Failed to read from buffer\n");
 		return;
 	}
 }
--- a/src/cltest.bak2.c
+++ b/src/cltest.bak2.c
@ -0,0 +1,264 @@
 #include <dlfcn.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include "clm.h"
 float train_data_x[4][2] = {
 	{0, 0},
 	{0, 1},
 	{1, 0},
 	{1, 1}};
 float train_data_y[4][1] = {
 	{0},
 	{1},
 	{1},
 	{0}};
 float *predict(clm_NN nn, float *x, unsigned int length) {
 	clm_Matrix xM = clm_matrixWrapArray(x, length);
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
 		clm_Linear layer = nn.layers[i];
 		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
 		if(clm_matrixIsInvalid(newX)) {
 			printf("Failed to predict\n");
 			return NULL;
 		}
 		clm_matrixAddMatrix(newX, layer.bias);
 		clm_matrixSigmoid(newX);
 		xM = newX;
 	}
 	return xM.values;
 }
 void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
 	clm_Matrix xM = clm_matrixWrapArray(x, xL);
 	clm_Matrix yM = clm_matrixWrapArray(y, yL);
 	// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
 	// Forward pass
 	clm_Matrix currentX = xM;
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
 		clm_Linear layer = nn.layers[i];
 		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
 		if(clm_matrixIsInvalid(newX)) {
 			printf("Forward pass failed\n");
 			return;
 		}
 		clm_matrixAddMatrix(newX, layer.bias);
 		clm_matrixSigmoid(newX);
 		currentX = newX;
 	}
 	for(int i = nn.numLayers - 1; i >= 0; i--) {
 		clm_Linear layer = nn.layers[i];
 		clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
 		clm_Matrix outputOfThisLayer = nn.layers[i].output;
 		clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
 		clm_Matrix error = layer.error;
 		if(i == nn.numLayers - 1) {
 			clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
 		} else {
 			clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
 			clm_matrixMultiplyMatrix(weightsT, prevError, error);
 		}
 		clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
 		clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
 		clm_matrixMultiplyScalar(gradient, nn.learnRate);
 		clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
 		clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
 		clm_matrixAddMatrix(layer.weights, layer.weightsError);
 		clm_matrixAddMatrix(layer.bias, gradient);
 	}
 }
 void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
 	FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
 	if(!file) {
 		perror("Failed to open labels\n");
 		return;
 	}
 	unsigned char magicBytes[4];
 	fread(magicBytes, sizeof(magicBytes), 1, file);
 	printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
 	unsigned char lengthBytes[4];
 	fread(lengthBytes, sizeof(lengthBytes), 1, file);
 	uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
 		(lengthBytes[2] << 8) | lengthBytes[3];
 	printf("%" PRId32 "\n", length);
 	clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
 	for(unsigned int i = 0; i < length; i++) {
 		unsigned char label;
 		fread(&label, sizeof(unsigned char), 1, file);
 		clm_Vector vector = clm_vectorCreate(10);
 		for(unsigned int j = 0; j < 10; j++) {
 			vector.values[j] = label == j ? 1 : 0;
 		}
 		vectors[i] = vector;
 	}
 	*labelsOut = vectors;
 	*labelsCountOut = length;
 }
 void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
 	FILE *file = fopen("data/train-images.idx3-ubyte", "r");
 	if(!file) {
 		perror("Failed to open images\n");
 		return;
 	}
 	unsigned char magicBytes[4];
 	fread(magicBytes, sizeof(magicBytes), 1, file);
 	printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
 	unsigned char lengthBytes[4];
 	fread(lengthBytes, sizeof(lengthBytes), 1, file);
 	uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3];
 	printf("%" PRId32 "\n", length);
 	unsigned char rowsBytes[4];
 	fread(rowsBytes, sizeof(rowsBytes), 1, file);
 	uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3];
 	printf("%" PRId32 "\n", rows);
 	unsigned char colsBytes[4];
 	fread(colsBytes, sizeof(colsBytes), 1, file);
 	uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3];
 	printf("%" PRId32 "\n", cols);
 	clm_Vector *images = calloc(length, sizeof(clm_Vector));
 	for(unsigned int i = 0; i < length; i++) {
 		clm_Vector vec = clm_vectorCreate(cols * rows);
 		unsigned char image[cols * rows];
 		fread(image, sizeof(image), 1, file);
 		for(unsigned int j = 0; j < cols * rows; j++) {
 			vec.values[j] = (float) image[j];
 		}
 		images[i] = vec;
 	}
 	*imagesOut = images;
 	*imageCountOut = length;
 }
 typedef void *(*callocFunc)(size_t, size_t);
 callocFunc oldCalloc;
 int main() {
 	oldCalloc = dlsym(RTLD_NEXT, "calloc");
 	clm_Vector *labels = NULL;
 	unsigned int labelCount;
 	loadLabels(&labels, &labelCount);
 	printf("LENGTH: %u\n", labelCount);
 	clm_Vector *images = NULL;
 	unsigned int imageCount;
 	loadImages(&images, &imageCount);
 	imageCount = 60000;
 	printf("%f\n", images[0].values[0]);
 	srand(1);
 	unsigned int
 		i = 784,
 		h = 30,
 		o = 10;
 	clm_Linear layer1 = clm_linearCreateRandom(i, h);
 	clm_Linear layer2 = clm_linearCreateRandom(h, o);
 	clm_Linear layers[] = {layer1, layer2};
 	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
 		printf("Epoch %u\n", epoch);
 		for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 			if(idx % 1000 == 0) {
 				printf("\r%.2f%%", idx / (float) imageCount * 100);
 				fflush(stdout);
 			}
 			// printf("%u\n", idx);
 			// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
 			/*for(unsigned int f = 0; f < images[idx].length; f++) {
 					printf("%.2f ", images[idx].values[f]);
 			}
 			printf("\n");
 			for(unsigned int f = 0; f < labels[idx].length; f++) {
 					printf("%.2f ", labels[idx].values[f]);
 			}
 			printf("\n");*/
 			// printf("%.2f\n", labels.values[idx]);
 			train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
 			// train(nn, test, 784, target, 10);
 			// predict(nn, test, 784);
 		}
 		printf("\n");
 	}
 	printf("Train done\n");
 	unsigned int correct = 0;
 	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
 		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
 		float *pred = predict(nn, images[idx].values, images[idx].length);
 		unsigned int predDigit = 0;
 		float max = -1;
 		for(unsigned int j = 0; j < 10; j++) {
 			// printf("%.2f ", pred[j]);
 			if(pred[j] > max || max < 0) {
 				max = pred[j];
 				predDigit = j;
 			}
 		}
 		if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
 		unsigned int actDigit = 0;
 		float maxA = -1;
 		for(unsigned int j = 0; j < 10; j++) {
 			// printf("%.2f ", pred[j]);
 			if(labels[idx].values[j] > maxA || maxA < 0) {
 				maxA = labels[idx].values[j];
 				actDigit = j;
 			}
 		}
 		if(idx < 100) printf("Actual: %u\n", actDigit);
 		// printf("\n");
 		if(predDigit == actDigit) correct++;
 	}
 	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
 	printf("\n");
 }
 void *calloc(size_t nmemb, size_t size) {
 	// printf("CALLOC\n");
 	return oldCalloc(nmemb, size);
 }
--- a/src/cltest.c
+++ b/src/cltest.c
@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include "clm.h"
 #include "clm_gpu.h"
 float train_data_x[4][2] = {
 	{0, 0},
@ -21,17 +22,8 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
 	clm_Matrix xM = clm_matrixWrapArray(x, length);
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_Linear layer = nn.layers[i];
+		clm_linearForward(&nn.layers[i], xM);
-		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
+		xM = nn.layers[i].output;
 		if(clm_matrixIsInvalid(newX)) {
 			printf("Failed to predict\n");
 			return NULL;
 		}
 		clm_matrixAddMatrix(newX, layer.bias);
 		clm_matrixSigmoid(newX);
 		xM = newX;
 	}
 	return xM.values;
@ -46,16 +38,8 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
 	// Forward pass
 	clm_Matrix currentX = xM;
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_Linear layer = nn.layers[i];
+		clm_linearForward(&nn.layers[i], currentX);
-		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
+		currentX = nn.layers[i].output;
 		if(clm_matrixIsInvalid(newX)) {
 			printf("Forward pass failed\n");
 			return;
 		}
 		clm_matrixAddMatrix(newX, layer.bias);
 		clm_matrixSigmoid(newX);
 		currentX = newX;
 	}
 	for(int i = nn.numLayers - 1; i >= 0; i--) {
@ -162,12 +146,11 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
 	*imageCountOut = length;
 }
 typedef void *(*callocFunc)(size_t, size_t);
 callocFunc oldCalloc;
 int main() {
-	oldCalloc = dlsym(RTLD_NEXT, "calloc");
+	if(clm_gpuInit() != 0) {
 		printf("Failed to init GPU\n");
 		return 1;
 	}
 	clm_Vector *labels = NULL;
 	unsigned int labelCount;
@ -189,9 +172,9 @@ int main() {
 		h = 30,
 		o = 10;
-	clm_Linear layer1 = clm_linearCreateRandom(i, h);
+	clm_Linear layers[] = {
-	clm_Linear layer2 = clm_linearCreateRandom(h, o);
+		clm_linearCreateRandom(i, h),
-	clm_Linear layers[] = {layer1, layer2};
+		clm_linearCreateRandom(h, o)};
 	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
@ -201,21 +184,8 @@ int main() {
 				printf("\r%.2f%%", idx / (float) imageCount * 100);
 				fflush(stdout);
 			}
 			// printf("%u\n", idx);
 			// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
 			/*for(unsigned int f = 0; f < images[idx].length; f++) {
 					printf("%.2f ", images[idx].values[f]);
 			}
 			printf("\n");
 			for(unsigned int f = 0; f < labels[idx].length; f++) {
 					printf("%.2f ", labels[idx].values[f]);
 			}
 			printf("\n");*/
 			// printf("%.2f\n", labels.values[idx]);
 			train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
 			// train(nn, test, 784, target, 10);
 			// predict(nn, test, 784);
 		}
 		printf("\n");
 	}
@ -256,9 +226,6 @@ int main() {
 	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
 	printf("\n");
 }
-void *calloc(size_t nmemb, size_t size) {
+	clm_gpuDestroy();
 	// printf("CALLOC\n");
 	return oldCalloc(nmemb, size);
 }
--- a/src/mat.cl
+++ b/src/mat.cl
@ -4,30 +4,44 @@ typedef struct __attribute__((packed)) {
 	char transposed;
 } cl_GPUMat;
-__kernel void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
+#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 	/*if(a.cols != b.rows) {
 		printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
 		return INVALID_MATRIX;
 	}
 	if(out.rows != a.rows || out.cols != b.cols) {
 		printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
 		return INVALID_MATRIX;
 	}*/
 void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
 	uint i = idx / matOut.cols;
 	uint j = idx % matOut.cols;
 	// for(unsigned int i = 0; i < out.rows; i++) {
 	// for(unsigned int j = 0; j < out.cols; j++) {
 	float sum = 0;
 	for(unsigned int k = 0; k < matA.cols; k++) {
-		sum += matA_values[i * matA.cols + k] * matB_values[k * matB.cols + j];
+		sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
 	}
-	matOut_values[i * matOut.cols + j] = sum;
+	matrixAt(matOut, matOut_values, i, j) = sum;
-	//}
+}
-	//}
+
 void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
 	uint idx = get_global_id(0);
 	if(idx >= matA.rows * matA.cols) return;
 	matA_values[idx] += matB_values[idx];
 }
 void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
 	uint idx = get_global_id(0);
 	if(idx >= mat.rows * mat.cols) return;
 	mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
 }
 // clm_Matrix input;
 // clm_Matrix weights;
 // clm_Matrix bias;
 // clm_Matrix output;
 __kernel void linear_forward(cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
 	// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
 	mat_multiply(weights, weights_values, input, input_values, out, out_values);
 	mat_add(out, out_values, bias, bias_values);
 	mat_sigmoid(out, out_values);
 }