diff --git a/src/clm_opencl.c b/src/clm_opencl.c
index b5024fc..42790b9 100644
--- a/src/clm_opencl.c
+++ b/src/clm_opencl.c
@@ -230,6 +230,9 @@ void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *i
 	}*/
 
 	readGPUMats(matOut, batchSize, outputs, linear->nativeOutput);
+
+	clFlush(queue);
+	clFinish(queue);
 }
 
 void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
@@ -320,4 +323,7 @@ void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchS
 	readGPUMats(matOutputGradients, batchSize, outputGradients, linear->nativeGradient);
 	readGPUMats(matOutputWeightsErrors, batchSize, outputWeightsErrors, linear->nativeWeightsError);
 	if(updateErrors) readGPUMats(matOutputErrors, batchSize, outputErrors, linear->nativeOutputError);
+
+	clFlush(queue);
+	clFinish(queue);
 }
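A note on the synchronization added above: clFinish() already performs an implicit flush, so the clFlush() directly before it is redundant (though harmless). If the reads issued by readGPUMats are non-blocking, waiting on an event for the last transfer would pin down exactly what has to complete instead of draining the whole queue. A minimal sketch, assuming readGPUMats wraps clEnqueueReadBuffer; buf, size and host_ptr are placeholders, not names from this codebase:

    cl_event readDone;
    cl_int err = clEnqueueReadBuffer(queue, buf, CL_FALSE /* non-blocking */,
                                     0, size, host_ptr, 0, NULL, &readDone);
    /* ... enqueue the remaining reads ... */
    err = clWaitForEvents(1, &readDone); /* returns once this transfer has landed */
    clReleaseEvent(readDone);

Alternatively, passing CL_TRUE as the blocking flag makes each read synchronous and removes the need for clFinish() here altogether.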
diff --git a/src/cltest.c b/src/cltest.c
index dd40f46..5bf5527 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -31,6 +31,41 @@ float *predict(clm_NN nn, clm_Vector input) {
 	return xM.values;
 }
 
+float eval(clm_NN nn, unsigned int count, clm_Vector *images, clm_Vector *labels) {
+	unsigned int correct = 0;
+	for(unsigned int idx = 0; idx < count; idx++) { // Each train sample
+		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
+		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
+		float *pred = predict(nn, images[idx]);
+		unsigned int predDigit = 0;
+		float max = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(pred[j] > max || max < 0) {
+				max = pred[j];
+				predDigit = j;
+			}
+		}
+		// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
+
+		unsigned int actDigit = 0;
+		float maxA = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(labels[idx].values[j] > maxA || maxA < 0) {
+				maxA = labels[idx].values[j];
+				actDigit = j;
+			}
+		}
+		// if(idx < 100) printf("Actual: %u\n", actDigit);
+		// printf("\n");
+
+		if(predDigit == actDigit) correct++;
+	}
+
+	return (float) correct / count * 100;
+}
+
 void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *expectedOutputs) {
 	clm_Matrix *batchInputs = calloc(nn.batchSize, sizeof(clm_Matrix));
 	clm_Matrix *batchOutputs = calloc(nn.batchSize, sizeof(clm_Matrix));
@@ -73,31 +108,6 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
 				clm_matrixAddMatrix(layer->bias, layer->gradient[b]);
 			}
 		}
-
-		/*for(int i = nn.numLayers - 1; i >= 0; i--) {
-			clm_Linear layer = nn.layers[i];
-			clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
-			clm_Matrix *outputsOfThisLayer = nn.layers[i].output;
-			clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
-			clm_Matrix error = layer.error;
-
-			if(i == nn.numLayers - 1) {
-				clm_matrixSubtractMatrix(clm_matrixCopy(batchOutputs[0], error), outputsOfThisLayer[0]); // yhat - y
-			} else {
-				clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
-				clm_matrixMultiplyMatrix(weightsT, prevError, error);
-			}
-
-			clm_Matrix gradient = clm_matrixDSigmoid(outputsOfThisLayer[0]); // dsig(yhat)
-			clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
-			clm_matrixMultiplyScalar(gradient, nn.learnRate);
-
-			clm_Matrix inputT = clm_matrixTranspose(inputsToThisLayer[0]);
-			clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
-
-			clm_matrixAddMatrix(layer.weights, layer.weightsError);
-			clm_matrixAddMatrix(layer.bias, gradient);
-		}*/
 	}
 
 	free(batchInputs);
@@ -216,7 +226,7 @@ int main(int argc, const char *argv[]) {
 	unsigned int imageCount;
 	loadImages(&images, &imageCount);
 
-	imageCount = 60000;
+	imageCount = 600;
 
 	printf("%f\n", images[0].values[0]);
 
@@ -230,13 +240,13 @@ int main(int argc, const char *argv[]) {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 500);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 10000);
 
 	for(unsigned int i = 0; i < sizeof(layers) / sizeof(clm_Linear); i++) {
 		clm_linearInit(&nn.layers[i]);
 	}
 
-	for(unsigned int epoch = 0; epoch < 1; epoch++) {
+	for(unsigned int epoch = 0; epoch < 10; epoch++) {
 		printf("Epoch %u\n", epoch);
 		/*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
 			if(idx % 1000 == 0) {
@@ -245,45 +255,15 @@ int main(int argc, const char *argv[]) {
 			}
 		}*/
 		train(nn, imageCount, images, labels);
+
+		printf("Score: %.2f\n", eval(nn, imageCount, images, labels));
 
 		printf("\n");
 	}
 
 	printf("Train done\n");
-	unsigned int correct = 0;
-	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
-		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
-		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
-		float *pred = predict(nn, images[idx]);
-		unsigned int predDigit = 0;
-		float max = -1;
-		for(unsigned int j = 0; j < 10; j++) {
-			// printf("%.2f ", pred[j]);
-			if(pred[j] > max || max < 0) {
-				max = pred[j];
-				predDigit = j;
-			}
-		}
-		// if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
-
-		unsigned int actDigit = 0;
-		float maxA = -1;
-		for(unsigned int j = 0; j < 10; j++) {
-			// printf("%.2f ", pred[j]);
-			if(labels[idx].values[j] > maxA || maxA < 0) {
-				maxA = labels[idx].values[j];
-				actDigit = j;
-			}
-		}
-		// if(idx < 100) printf("Actual: %u\n", actDigit);
-		// printf("\n");
-
-		if(predDigit == actDigit) correct++;
-	}
-
-	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
-
-	printf("\n");
+	float score = eval(nn, imageCount, images, labels);
+	printf("Correct: %.2f\n", score);
 
 	clm_gpuDestroy();
 }
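The two selection loops in the new eval() are the same argmax scan written twice. They could share a helper; a sketch only, since argmax() does not exist in this codebase:

    /* Hypothetical helper: index of the largest of n floats. */
    static unsigned int argmax(const float *values, unsigned int n) {
        unsigned int best = 0;
        for(unsigned int j = 1; j < n; j++) {
            if(values[j] > values[best]) best = j;
        }
        return best;
    }

With it, eval() reduces to comparing argmax(pred, 10) against argmax(labels[idx].values, 10). Starting the scan at index 0 also makes the max < 0 special case unnecessary, even when every value is negative.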
diff --git a/src/mat.cl b/src/mat.cl
index fbc8b87..f078e43 100644
--- a/src/mat.cl
+++ b/src/mat.cl
@@ -21,8 +21,9 @@ void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, _
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
 
-	uint i, j;
-	matrixGetIJ(matOut, idx, i, j);
+	// TODO: might not work with transposed matOut
+	uint i = idx / matOut.cols;
+	uint j = idx % matOut.cols;
 
 	float sum = 0;
 	for(unsigned int k = 0; k < matA.cols; k++) {
@@ -75,6 +76,14 @@ void mat_multiply_scalar(cl_GPUMat mat, __global float *mat_values, float scalar
 	mat_values[idx] *= scalar;
 }
 
+cl_GPUMat clm_matrixTranspose(cl_GPUMat mat) {
+	cl_GPUMat tr = {0};
+	tr.cols = mat.rows;
+	tr.rows = mat.cols;
+	tr.transposed = !mat.transposed;
+	return tr;
+}
+
 __kernel void linear_forward(unsigned int batchSize,
                              cl_GPUMat input, __global float *input_values,
                              cl_GPUMat weights, __global float *weights_values,
@@ -123,14 +132,12 @@ __kernel void linear_backprop_2(unsigned int batchSize,
 		__global float *batchOutWeightsErrors_values = outputWeightsErrors_values + b * outputWeightsErrors.rows * outputWeightsErrors.cols;
 		__global float *batchOutGradients_values = outputGradients_values + b * outputGradients.rows * outputGradients.cols;
 
-		cl_GPUMat inputsT = inputs;
-		inputsT.transposed = true;
+		cl_GPUMat inputsT = clm_matrixTranspose(inputs);
 		mat_multiply(outputGradients, batchOutGradients_values, inputsT, batchInput_values, outputWeightsErrors, batchOutWeightsErrors_values);
 
 		if(updateErrors) {
-			cl_GPUMat weightsT = weights;
-			weightsT.transposed = true;
-			mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
+			cl_GPUMat weightsT = clm_matrixTranspose(weights);
+			// mat_multiply(weightsT, weights_values, inputErrors, batchInErrors_values, outputErrors, batchOutErrors_values);
 		}
 	}
 }
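On the TODO in mat_multiply: the new index computation assumes matOut is stored row-major with transposed == false. If an output matrix can carry transposed == true, the flat index has to be mapped through the storage layout. A sketch of one possible mapping, assuming the transposed flag means the buffer keeps its pre-transpose row-major shape (that convention is not visible in this diff):

    uint i, j;
    if(matOut.transposed) {
        /* Storage has the pre-transpose shape: matOut.cols rows of matOut.rows values,
         * so logical (i, j) lives at offset j * matOut.rows + i. */
        i = idx % matOut.rows;
        j = idx / matOut.rows;
    } else {
        i = idx / matOut.cols;
        j = idx % matOut.cols;
    }

Whichever convention applies, the write side of mat_multiply has to use the same offset formula, or transposed outputs will be scattered.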