#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "clm.h"
#include "clm_gpu.h"

/* The four combinations of two binary inputs, one {a, b} pair per row. */
float train_data_x[4][2] = {
	{0.0f, 0.0f},
	{0.0f, 1.0f},
	{1.0f, 0.0f},
	{1.0f, 1.0f},
};

/*
 * One target value per row: 0, 1, 1, 0. Row k equals the XOR of the two
 * values in train_data_x[k].
 */
float train_data_y[4][1] = {
	{0.0f},
	{1.0f},
	{1.0f},
	{0.0f},
};

/*
 * Feeds one input vector through every layer of the network, front to back.
 *
 * Each layer is evaluated with a batch size of 1 and writes its result into
 * its own output[0] matrix; that matrix is then used as the input of the
 * next layer.
 *
 * Returns the `values` pointer of the last layer's output[0] matrix, or
 * input.values unchanged when the network has no layers. The pointer is not
 * a copy, so presumably a later forward pass through the same layers
 * overwrites what it points at.
 */
float *predict(clm_NN nn, clm_Vector input) {
	clm_Matrix current = clm_matrixWrapArray(input.values, input.length);

	unsigned int layerIdx = 0;
	while(layerIdx < nn.numLayers) {
		clm_Linear *layer = &nn.layers[layerIdx];

		// Batch of one: a single input matrix in, slot 0 of the layer's outputs out
		clm_linearForward(layer, 1, &current, &layer->output[0]);
		current = layer->output[0];

		layerIdx++;
	}

	return current.values;
}

/*
 * Runs one pass over the training samples, processing them in mini-batches
 * of at most nn.batchSize samples (the final batch may be smaller).
 *
 * For every batch:
 *   1. the samples are wrapped as matrices (no copy of the sample data),
 *   2. a forward pass runs through all layers,
 *   3. the last layer's per-sample error is set to (expected - actual),
 *   4. the layers are visited back to front: clm_linearBackprop is called for
 *      the layer, after which the layer's per-sample weightsError and
 *      gradient matrices are added onto its weights and bias.
 *
 * Parameters:
 *   nn              - network to train; its layers are updated in place.
 *   numElements     - number of entries in `inputs` / `expectedOutputs`.
 *   inputs          - one input vector per sample.
 *   expectedOutputs - one target vector per sample, same order as `inputs`.
 *
 * Returns without training (after printing to stderr) when the network has
 * no layers, nn.batchSize is 0, or the batch buffers cannot be allocated.
 */
void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *expectedOutputs) {
	// Without layers there is no "last layer" to take the error from, and a
	// batch size of 0 would divide by zero when counting batches.
	if(nn.numLayers == 0 || nn.batchSize == 0) {
		fprintf(stderr, "train: network has no layers or a batch size of 0\n");
		return;
	}

	clm_Matrix *batchInputs = calloc(nn.batchSize, sizeof *batchInputs);
	clm_Matrix *batchOutputs = calloc(nn.batchSize, sizeof *batchOutputs);
	if(!batchInputs || !batchOutputs) {
		fprintf(stderr, "train: failed to allocate batch buffers\n");
		free(batchInputs);
		free(batchOutputs);
		return;
	}

	// Integer ceiling division: exact for every count (a float division loses
	// precision for large values) and computed once instead of per iteration.
	unsigned int numBatches = numElements / nn.batchSize + (numElements % nn.batchSize != 0 ? 1 : 0);

	for(unsigned int batch = 0; batch < numBatches; batch++) {
		unsigned int firstSample = batch * nn.batchSize;

		// The last batch only holds whatever samples are left over
		unsigned int batchSize = numElements - firstSample;
		if(batchSize > nn.batchSize) batchSize = nn.batchSize;

		printf("Batch %u (size %u)\n", batch, batchSize);

		for(unsigned int s = 0; s < batchSize; s++) {
			clm_Vector input = inputs[firstSample + s];
			clm_Vector output = expectedOutputs[firstSample + s];
			batchInputs[s] = clm_matrixWrapArray(input.values, input.length);
			batchOutputs[s] = clm_matrixWrapArray(output.values, output.length);
		}

		// Forward pass: each layer's outputs become the next layer's inputs
		clm_Matrix *currentXs = batchInputs;
		for(unsigned int i = 0; i < nn.numLayers; i++) {
			clm_linearForward(&nn.layers[i], batchSize, currentXs, nn.layers[i].output);
			currentXs = nn.layers[i].output;
		}

		clm_Linear *lastLayer = &nn.layers[nn.numLayers - 1];
		for(unsigned int s = 0; s < batchSize; s++) {
			// Error of last layer = y - yhat
			clm_matrixCopy(batchOutputs[s], lastLayer->error[s]); // lastLayer.error = y
			clm_matrixSubtractMatrix(lastLayer->error[s], lastLayer->output[s]); // lastLayer.error -= yhat
		}

		// Backward pass, last layer first. Counting down with `i-- > 0` keeps
		// the index unsigned without wrapping past zero.
		for(unsigned int i = nn.numLayers; i-- > 0;) {
			clm_Linear *layer = &nn.layers[i];
			clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output;
			clm_Matrix *outputsOfThisLayer = layer->output;

			// The first layer has no predecessor, so no error buffer is passed
			// on for it (flag is false and the pointer is NULL).
			clm_linearBackprop(layer, nn.learnRate, batchSize, inputsToThisLayer, outputsOfThisLayer, layer->error, i > 0, i == 0 ? NULL : nn.layers[i - 1].error, layer->weightsError, layer->gradient);

			// Apply the update of every sample in the batch
			for(unsigned int s = 0; s < batchSize; s++) {
				clm_matrixAddMatrix(layer->weights, layer->weightsError[s]);
				clm_matrixAddMatrix(layer->bias, layer->gradient[s]);
			}
		}
	}

	free(batchInputs);
	free(batchOutputs);
}

/*
 * Reads the IDX label file data/train-labels.idx1-ubyte and converts every
 * label byte into a 10-element one-hot vector (element `label` is 1, all
 * others are 0).
 *
 * File layout as read here: a big-endian 32-bit magic number, a big-endian
 * 32-bit label count, then one unsigned byte per label. Both header values
 * are printed to stdout.
 *
 * Parameters:
 *   labelsOut      - receives a calloc'ed array of vectors owned by the
 *                    caller, or NULL on failure.
 *   labelsCountOut - receives the number of vectors stored in *labelsOut,
 *                    or 0 on failure.
 *
 * If the file ends before the announced number of labels has been read, the
 * labels read so far are returned and the count reflects only those.
 */
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
	// Give the caller a defined result on every failure path
	*labelsOut = NULL;
	*labelsCountOut = 0;

	// "rb": the file is binary; text mode may translate bytes on some platforms
	FILE *file = fopen("data/train-labels.idx1-ubyte", "rb");
	if(!file) {
		// perror appends ": <reason>\n" itself, so no newline in the prefix
		perror("Failed to open labels");
		return;
	}

	unsigned char header[8];
	if(fread(header, sizeof(header), 1, file) != 1) {
		fprintf(stderr, "Failed to read labels header\n");
		fclose(file);
		return;
	}

	// Widen each byte to uint32_t before shifting: shifting the promoted
	// (signed) int left by 24 is undefined for byte values >= 0x80.
	uint32_t magic = ((uint32_t) header[0] << 24) | ((uint32_t) header[1] << 16) |
		((uint32_t) header[2] << 8) | (uint32_t) header[3];
	uint32_t length = ((uint32_t) header[4] << 24) | ((uint32_t) header[5] << 16) |
		((uint32_t) header[6] << 8) | (uint32_t) header[7];
	printf("%" PRIu32 "\n", magic);
	printf("%" PRIu32 "\n", length);

	// 0x00000801 (2049) identifies an IDX file of unsigned bytes with one
	// dimension; anything else means the count above cannot be trusted.
	const uint32_t expectedMagic = 0x00000801;
	if(magic != expectedMagic) {
		fprintf(stderr, "Unexpected labels magic number %" PRIu32 "\n", magic);
		fclose(file);
		return;
	}

	if(length == 0) {
		fclose(file);
		return;
	}

	clm_Vector *vectors = calloc(length, sizeof *vectors);
	if(!vectors) {
		fprintf(stderr, "Failed to allocate %" PRIu32 " label vectors\n", length);
		fclose(file);
		return;
	}

	unsigned int loaded = 0;
	for(; loaded < length; loaded++) {
		// Read first, create the vector afterwards, so a short read leaves
		// nothing half-built behind.
		unsigned char label;
		if(fread(&label, sizeof(label), 1, file) != 1) {
			fprintf(stderr, "Labels file truncated after %u of %" PRIu32 " labels\n", loaded, length);
			break;
		}

		clm_Vector vector = clm_vectorCreate(10);
		for(unsigned int j = 0; j < 10; j++) {
			vector.values[j] = label == j ? 1 : 0;
		}
		vectors[loaded] = vector;
	}

	fclose(file);

	*labelsOut = vectors;
	*labelsCountOut = loaded;
}

/*
 * Reads the IDX image file data/train-images.idx3-ubyte and converts every
 * image into a vector of rows * cols floats, one float per pixel byte
 * (values are copied unscaled).
 *
 * File layout as read here: four big-endian 32-bit integers (magic number,
 * image count, rows, columns) followed by rows * cols unsigned bytes per
 * image. The four header values are printed to stdout, one per line.
 *
 * Parameters:
 *   imagesOut     - receives a calloc'ed array of vectors owned by the
 *                   caller, or NULL on failure.
 *   imageCountOut - receives the number of vectors stored in *imagesOut,
 *                   or 0 on failure.
 *
 * If the file ends before the announced number of images has been read, the
 * images read so far are returned and the count reflects only those.
 */
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
	// Give the caller a defined result on every failure path
	*imagesOut = NULL;
	*imageCountOut = 0;

	// "rb": the file is binary; text mode may translate bytes on some platforms
	FILE *file = fopen("data/train-images.idx3-ubyte", "rb");
	if(!file) {
		// perror appends ": <reason>\n" itself, so no newline in the prefix
		perror("Failed to open images");
		return;
	}

	unsigned char header[16];
	if(fread(header, sizeof(header), 1, file) != 1) {
		fprintf(stderr, "Failed to read images header\n");
		fclose(file);
		return;
	}

	// Decode magic, count, rows, cols. Each byte is widened to uint32_t
	// before shifting: shifting the promoted (signed) int left by 24 is
	// undefined for byte values >= 0x80.
	uint32_t fields[4];
	for(unsigned int f = 0; f < 4; f++) {
		const unsigned char *bytes = header + 4 * f;
		fields[f] = ((uint32_t) bytes[0] << 24) | ((uint32_t) bytes[1] << 16) |
			((uint32_t) bytes[2] << 8) | (uint32_t) bytes[3];
		printf("%" PRIu32 "\n", fields[f]);
	}
	uint32_t magic = fields[0];
	uint32_t length = fields[1];
	uint32_t rows = fields[2];
	uint32_t cols = fields[3];

	// 0x00000803 (2051) identifies an IDX file of unsigned bytes with three
	// dimensions; anything else means the sizes above cannot be trusted.
	const uint32_t expectedMagic = 0x00000803;
	if(magic != expectedMagic) {
		fprintf(stderr, "Unexpected images magic number %" PRIu32 "\n", magic);
		fclose(file);
		return;
	}

	// Reject empty images and sizes whose product does not fit in 32 bits
	if(rows == 0 || cols == 0 || cols > UINT32_MAX / rows) {
		fprintf(stderr, "Invalid image size %" PRIu32 "x%" PRIu32 "\n", rows, cols);
		fclose(file);
		return;
	}
	uint32_t pixelCount = rows * cols;

	if(length == 0) {
		fclose(file);
		return;
	}

	// One heap buffer reused for every image, instead of a stack array whose
	// size is dictated by the file contents.
	clm_Vector *images = calloc(length, sizeof *images);
	unsigned char *pixels = malloc(pixelCount);
	if(!images || !pixels) {
		fprintf(stderr, "Failed to allocate memory for %" PRIu32 " images\n", length);
		free(images);
		free(pixels);
		fclose(file);
		return;
	}

	unsigned int loaded = 0;
	for(; loaded < length; loaded++) {
		// Read first, create the vector afterwards, so a short read leaves
		// nothing half-built behind.
		if(fread(pixels, 1, pixelCount, file) != pixelCount) {
			fprintf(stderr, "Images file truncated after %u of %" PRIu32 " images\n", loaded, length);
			break;
		}

		clm_Vector vec = clm_vectorCreate(pixelCount);
		for(uint32_t j = 0; j < pixelCount; j++) {
			vec.values[j] = (float) pixels[j];
		}
		images[loaded] = vec;
	}

	free(pixels);
	fclose(file);

	*imagesOut = images;
	*imageCountOut = loaded;
}

/*
 * Returns the index of the largest of the first `count` values. When several
 * values share the maximum, the lowest index wins. `count` must be >= 1.
 */
static unsigned int indexOfMax(const float *values, unsigned int count) {
	unsigned int best = 0;
	for(unsigned int j = 1; j < count; j++) {
		if(values[j] > values[best]) best = j;
	}
	return best;
}

/*
 * Program entry point: initialises the GPU backend, loads the label and image
 * files, trains a 784 -> 30 -> 10 network for one epoch and then reports how
 * many of the training samples the network classifies correctly.
 *
 * Returns 0 on success and 1 when the GPU backend cannot be initialised or
 * no training data could be loaded.
 */
int main(void) {
	if(clm_gpuInit() != 0) {
		printf("Failed to init GPU\n");
		return 1;
	}

	// Counts start at 0 so they hold a defined value even if a loader bails
	// out without writing to them.
	clm_Vector *labels = NULL;
	unsigned int labelCount = 0;
	loadLabels(&labels, &labelCount);
	printf("LENGTH: %u\n", labelCount);

	clm_Vector *images = NULL;
	unsigned int imageCount = 0;
	loadImages(&images, &imageCount);

	if(!labels || !images || labelCount == 0 || imageCount == 0) {
		fprintf(stderr, "Failed to load training data\n");
		clm_gpuDestroy();
		return 1;
	}

	// Only use samples that have both an image and a label, rather than
	// assuming a fixed data-set size.
	unsigned int sampleCount = imageCount < labelCount ? imageCount : labelCount;

	printf("%f\n", images[0].values[0]);

	// Fixed seed so runs are repeatable
	srand(1);

	unsigned int
		inputSize = 784,
		hiddenSize = 30,
		outputSize = 10;

	clm_Linear layers[] = {
		clm_linearCreateRandom(inputSize, hiddenSize),
		clm_linearCreateRandom(hiddenSize, outputSize)};
	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(layers[0]), layers, 0.01, 1000);

	const unsigned int numEpochs = 1;
	for(unsigned int epoch = 0; epoch < numEpochs; epoch++) {
		printf("Epoch %u\n", epoch);
		train(nn, sampleCount, images, labels);
		printf("\n");
	}

	printf("Train done\n");

	// Evaluate on the same samples that were used for training
	unsigned int correct = 0;
	for(unsigned int idx = 0; idx < sampleCount; idx++) {
		float *pred = predict(nn, images[idx]);

		// The predicted digit is the strongest output; the actual digit is
		// the position of the 1 in the one-hot label.
		unsigned int predDigit = indexOfMax(pred, outputSize);
		unsigned int actDigit = indexOfMax(labels[idx].values, outputSize);

		if(predDigit == actDigit) correct++;
	}

	printf("Correct: %u -> %.2f\n", correct, (float) correct / sampleCount * 100);

	clm_gpuDestroy();
	return 0;
}