#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

#include "clm.h"

// Four two-element input rows covering every combination of 0 and 1.
float train_data_x[4][2] = {
	{0.0f, 0.0f},
	{0.0f, 1.0f},
	{1.0f, 0.0f},
	{1.0f, 1.0f},
};

// One single-element target row per entry of the four-row input table.
float train_data_y[4][1] = {
	{0.0f},
	{1.0f},
	{1.0f},
	{0.0f},
};

/**
 * Runs a forward pass of the network on a single input sample.
 *
 * For every layer the current activation is multiplied with the layer's
 * weights, the bias is added and clm_matrixSigmoid is applied; the result is
 * the input of the next layer.
 *
 * @param nn     Network whose layers are evaluated in order.
 * @param x      Input values.
 * @param length Number of elements in x.
 * @return The value buffer of the final activation matrix, or NULL if a
 *         matrix multiplication produced an invalid matrix. The buffer is
 *         not released by this function.
 */
float *predict(clm_NN nn, float *x, unsigned int length) {
	clm_Matrix xM = clm_matrixFromArray(x, length);

	for(unsigned int i = 0; i < nn.numLayers; i++) {
		clm_Linear layer = nn.layers[i];
		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM);

		if(clm_matrixIsInvalid(newX)) {
			printf("Failed to predict\n");
			// Release the activation computed so far instead of leaking it
			clm_freeMatrix(xM);
			return NULL;
		}

		clm_matrixAddMatrix(newX, layer.bias);
		clm_matrixSigmoid(newX);

		// The previous activation is no longer needed once the next one exists
		clm_freeMatrix(xM);
		xM = newX;
	}

	// NOTE(review): only the raw value buffer escapes; presumably the caller is
	// expected to release it - confirm how clm allocates matrix values.
	return xM.values;
}

/**
 * Performs one training step (forward pass followed by weight and bias
 * updates for every layer) on a single sample.
 *
 * The layer matrices referenced by nn are updated in place. All temporary
 * matrices created here are released before returning, including on the
 * error paths.
 *
 * @param nn Network to train; must have at least one layer.
 * @param x  Input values of the sample.
 * @param xL Number of elements in x.
 * @param y  Target values of the sample.
 * @param yL Number of elements in y.
 */
void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
	if(nn.numLayers == 0) {
		// The update code below indexes layers[numLayers - 1] and outputs[numLayers - 1]
		printf("Cannot train a network without layers\n");
		return;
	}

	clm_Matrix xM = clm_matrixFromArray(x, xL);
	clm_Matrix yM = clm_matrixFromArray(y, yL);

	// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)

	// Forward pass: outputs[0] is the input, outputs[i + 1] the activation of layer i
	clm_Matrix *outputs = calloc(nn.numLayers + 1 /* 1 for input */, sizeof(clm_Matrix));
	if(!outputs) {
		printf("Failed to allocate layer outputs\n");
		clm_freeMatrix(xM);
		clm_freeMatrix(yM);
		return;
	}

	outputs[0] = xM;
	for(unsigned int i = 0; i < nn.numLayers; i++) {
		clm_Linear layer = nn.layers[i];
		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, outputs[i]);
		if(clm_matrixIsInvalid(newX)) {
			printf("Forward pass failed\n");

			// Release everything produced so far: outputs[0..i] and the target matrix
			for(unsigned int j = 0; j <= i; j++) {
				clm_freeMatrix(outputs[j]);
			}
			free(outputs);
			clm_freeMatrix(yM);
			return;
		}

		clm_matrixAddMatrix(newX, layer.bias);
		clm_matrixSigmoid(newX);
		outputs[i + 1] = newX;
	}

	// Error between target and network output.
	// NOTE(review): presumably y - yhat given the argument order (the deltas below are
	// *added* to the weights), and presumably computed in place so that dError takes over
	// yM's storage - yM is never freed separately. Confirm against clm_matrixSubtractMatrix.
	clm_Matrix dError = clm_matrixSubtractMatrix(yM, outputs[nn.numLayers]);

	// Last layer: gradient = learnRate * (error . dsig(yhat))
	clm_Matrix lastGradient = clm_matrixDSigmoid(clm_matrixCopy(outputs[nn.numLayers]));
	clm_matrixMultiplyMatrixElements(lastGradient, dError);
	clm_matrixMultiplyScalar(lastGradient, nn.learnRate);

	clm_Matrix lastInputT = clm_matrixTranspose(outputs[nn.numLayers - 1]);
	clm_Matrix lastDeltaW = clm_matrixMultiplyMatrix(lastGradient, lastInputT);
	clm_freeMatrix(lastInputT);

	clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].weights, lastDeltaW);
	clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].bias, lastGradient);

	clm_freeMatrix(lastDeltaW);
	clm_freeMatrix(lastGradient);

	// Remaining layers, back to front. The index is signed so the loop simply does not
	// run for a single-layer network.
	for(int i = (int) nn.numLayers - 2; i >= 0; i--) {
		clm_Linear layer = nn.layers[i];
		clm_Matrix inputToThisLayer = outputs[i];
		clm_Matrix outputOfThisLayer = outputs[i + 1];

		// Propagate the error through the following layer's weights.
		// NOTE(review): layers[i + 1].weights has already been updated above, so the
		// error is propagated through the new weights rather than the ones used in the
		// forward pass - confirm this is intended.
		clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
		clm_Matrix newDError = clm_matrixMultiplyMatrix(weightsT, dError);
		clm_freeMatrix(weightsT);
		clm_freeMatrix(dError);
		dError = newDError;

		clm_Matrix gradient = clm_matrixDSigmoid(clm_matrixCopy(outputOfThisLayer));
		clm_matrixMultiplyMatrixElements(gradient, dError);
		clm_matrixMultiplyScalar(gradient, nn.learnRate);

		clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
		clm_Matrix deltaW = clm_matrixMultiplyMatrix(gradient, inputT);
		clm_freeMatrix(inputT);

		clm_matrixAddMatrix(layer.weights, deltaW);
		clm_matrixAddMatrix(layer.bias, gradient);

		clm_freeMatrix(deltaW);
		clm_freeMatrix(gradient);
	}

	clm_freeMatrix(dError);

	for(unsigned int i = 0; i <= nn.numLayers; i++) {
		clm_freeMatrix(outputs[i]);
	}

	free(outputs);
}

/**
 * Reads one big-endian unsigned 32-bit integer from file.
 *
 * @param file Stream to read from.
 * @param out  Receives the decoded value on success.
 * @return 1 on success, 0 if four bytes could not be read.
 */
static int readUint32BE(FILE *file, uint32_t *out) {
	unsigned char bytes[4];
	if(fread(bytes, sizeof(bytes), 1, file) != 1) {
		return 0;
	}

	// Widen before shifting: a byte promoted to int would overflow for bytes[0] >= 0x80
	*out = ((uint32_t) bytes[0] << 24) | ((uint32_t) bytes[1] << 16) | ((uint32_t) bytes[2] << 8) | (uint32_t) bytes[3];
	return 1;
}

/**
 * Loads the labels from "data/train-labels.idx1-ubyte" as 10-element one-hot
 * vectors (element j is 1 when the label byte equals j, otherwise 0).
 *
 * The header's magic number and label count are printed to stdout.
 *
 * @param labelsOut      Receives the allocated array of label vectors, or
 *                       NULL if nothing could be loaded.
 * @param labelsCountOut Receives the number of vectors in *labelsOut. If the
 *                       file ends early this is the number of labels that
 *                       were read completely.
 */
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
	// Give the caller well-defined results on every failure path
	*labelsOut = NULL;
	*labelsCountOut = 0;

	// Binary mode: the file holds raw bytes, not text
	FILE *file = fopen("data/train-labels.idx1-ubyte", "rb");
	if(!file) {
		perror("Failed to open labels");
		return;
	}

	uint32_t magic = 0;
	uint32_t length = 0;
	if(!readUint32BE(file, &magic) || !readUint32BE(file, &length)) {
		printf("Failed to read labels header\n");
		fclose(file);
		return;
	}

	printf("%" PRIu32 "\n", magic);
	printf("%" PRIu32 "\n", length);

	if(length == 0) {
		fclose(file);
		return;
	}

	clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
	if(!vectors) {
		printf("Failed to allocate labels\n");
		fclose(file);
		return;
	}

	unsigned int loaded = 0;
	for(; loaded < length; loaded++) {
		// Read the label before creating its vector so a short file leaves nothing half-built
		unsigned char label;
		if(fread(&label, sizeof(label), 1, file) != 1) {
			printf("Labels file is truncated: read %u of %" PRIu32 " labels\n", loaded, length);
			break;
		}

		clm_Vector vector = clm_vectorCreate(10);
		for(unsigned int j = 0; j < 10; j++) {
			vector.values[j] = label == j ? 1 : 0;
		}
		vectors[loaded] = vector;
	}

	fclose(file);

	*labelsOut = vectors;
	*labelsCountOut = loaded;
}

/**
 * Loads the images from "data/train-images.idx3-ubyte", one vector of
 * rows * cols floats per image. Pixel bytes are converted to float unchanged
 * (no scaling is applied).
 *
 * The header's magic number, image count, row count and column count are
 * printed to stdout.
 *
 * @param imagesOut     Receives the allocated array of image vectors, or NULL
 *                      if nothing could be loaded.
 * @param imageCountOut Receives the number of vectors in *imagesOut. If the
 *                      file ends early this is the number of images that were
 *                      read completely.
 */
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
	// Give the caller well-defined results on every failure path
	*imagesOut = NULL;
	*imageCountOut = 0;

	// Binary mode: the file holds raw bytes, not text
	FILE *file = fopen("data/train-images.idx3-ubyte", "rb");
	if(!file) {
		perror("Failed to open images");
		return;
	}

	uint32_t magic = 0;
	uint32_t length = 0;
	uint32_t rows = 0;
	uint32_t cols = 0;
	if(!readUint32BE(file, &magic) || !readUint32BE(file, &length)
		|| !readUint32BE(file, &rows) || !readUint32BE(file, &cols)) {
		printf("Failed to read images header\n");
		fclose(file);
		return;
	}

	printf("%" PRIu32 "\n", magic);
	printf("%" PRIu32 "\n", length);
	printf("%" PRIu32 "\n", rows);
	printf("%" PRIu32 "\n", cols);

	// The pixel count sizes the read buffer and every vector, so reject
	// empty images and products that do not fit in 32 bits
	if(rows == 0 || cols == 0 || rows > UINT32_MAX / cols) {
		printf("Invalid image dimensions\n");
		fclose(file);
		return;
	}
	uint32_t pixels = rows * cols;

	if(length == 0) {
		fclose(file);
		return;
	}

	clm_Vector *images = calloc(length, sizeof(clm_Vector));
	// One heap buffer reused for every image instead of a file-sized VLA on the stack
	unsigned char *pixelBuffer = malloc(pixels);
	if(!images || !pixelBuffer) {
		printf("Failed to allocate images\n");
		free(pixelBuffer);
		free(images);
		fclose(file);
		return;
	}

	unsigned int loaded = 0;
	for(; loaded < length; loaded++) {
		// Read the pixels before creating the vector so a short file leaves nothing half-built
		if(fread(pixelBuffer, pixels, 1, file) != 1) {
			printf("Images file is truncated: read %u of %" PRIu32 " images\n", loaded, length);
			break;
		}

		clm_Vector vec = clm_vectorCreate(pixels);
		for(uint32_t j = 0; j < pixels; j++) {
			vec.values[j] = (float) pixelBuffer[j];
		}
		images[loaded] = vec;
	}

	free(pixelBuffer);
	fclose(file);

	*imagesOut = images;
	*imageCountOut = loaded;
}

/**
 * Loads the label and image files, trains a two-layer network
 * (784 -> 30 -> 10) for 10 epochs and reports how many training samples the
 * trained network classifies correctly.
 *
 * @return EXIT_FAILURE if no training data could be loaded, EXIT_SUCCESS otherwise.
 */
int main(void) {
	clm_Vector *labels = NULL;
	unsigned int labelCount = 0;
	loadLabels(&labels, &labelCount);
	printf("LENGTH: %u\n", labelCount);

	clm_Vector *images = NULL;
	unsigned int imageCount = 0;
	loadImages(&images, &imageCount);

	if(!labels || !images || labelCount == 0 || imageCount == 0) {
		printf("Failed to load training data\n");
		return EXIT_FAILURE;
	}

	// Only indices that have both an image and a label are usable samples;
	// use the counts the loaders reported instead of a hard-coded size
	unsigned int sampleCount = imageCount < labelCount ? imageCount : labelCount;

	printf("%f\n", images[0].values[0]);

	// Fixed seed so the random initial weights are reproducible
	srand(1);

	unsigned int
		inputSize = 784,
		hiddenSize = 30,
		outputSize = 10;

	clm_Linear layer1;
	layer1.weights = clm_createMatrixRandom(hiddenSize, inputSize);
	layer1.bias = clm_createMatrixRandom(hiddenSize, 1);

	clm_Linear layer2;
	layer2.weights = clm_createMatrixRandom(outputSize, hiddenSize);
	layer2.bias = clm_createMatrixRandom(outputSize, 1);

	clm_Linear layers[] = {layer1, layer2};
	clm_NN nn = { layers, sizeof(layers) / sizeof(clm_Linear), 0.01 };

	for(unsigned int epoch = 0; epoch < 10; epoch++) {
		printf("Epoch %u\n", epoch);

		for(unsigned int idx = 0; idx < sampleCount; idx++) { // Each train sample
			if(idx % 1000 == 0) {
				printf("%.2f%%\n", idx / (float) sampleCount * 100);
			}

			train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
		}
	}

	unsigned int correct = 0;
	for(unsigned int idx = 0; idx < sampleCount; idx++) { // Each train sample
		float *pred = predict(nn, images[idx].values, images[idx].length);
		if(!pred) {
			// predict() already reported the failure; the sample counts as not correct
			continue;
		}

		// Index of the largest network output
		unsigned int predDigit = 0;
		float max = -1;
		for(unsigned int j = 0; j < outputSize; j++) {
			if(pred[j] > max || max < 0) {
				max = pred[j];
				predDigit = j;
			}
		}
		if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);

		// Index of the largest label element
		unsigned int actDigit = 0;
		float maxA = -1;
		for(unsigned int j = 0; j < outputSize; j++) {
			if(labels[idx].values[j] > maxA || maxA < 0) {
				maxA = labels[idx].values[j];
				actDigit = j;
			}
		}
		if(idx < 100) printf("Actual: %u\n", actDigit);

		if(predDigit == actDigit) correct++;

		// NOTE(review): pred is the value buffer handed out by predict() and is never
		// released here - confirm how clm allocates matrix values and free it accordingly.
	}

	printf("Correct: %u -> %.2f", correct, (float) correct / sampleCount * 100);

	printf("\n");

	return EXIT_SUCCESS;
}