#include #include #include #include #include "clm.h" #include "clm_gpu.h" float train_data_x[4][2] = { {0, 0}, {0, 1}, {1, 0}, {1, 1}}; float train_data_y[4][1] = { {0}, {1}, {1}, {0}}; float *predict(clm_NN nn, clm_Vector input) { clm_Matrix xM = clm_matrixWrapArray(input.values, input.length); for(unsigned int i = 0; i < nn.numLayers; i++) { clm_linearForward(&nn.layers[i], 1, &xM, &nn.layers[i].output[0]); xM = nn.layers[i].output[0]; } return xM.values; } void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *expectedOutputs) { clm_Matrix *batchInputs = calloc(nn.batchSize, sizeof(clm_Matrix)); clm_Matrix *batchOutputs = calloc(nn.batchSize, sizeof(clm_Matrix)); for(unsigned int b = 0; b < ceil((float) numElements / nn.batchSize); b++) { unsigned int batchSize = numElements - b * nn.batchSize; if(batchSize > nn.batchSize) batchSize = nn.batchSize; printf("Batch %d (size %d)\n", b, batchSize); for(unsigned int i = 0; i < batchSize; i++) { clm_Vector input = inputs[b * nn.batchSize + i]; clm_Vector output = expectedOutputs[b * nn.batchSize + i]; batchInputs[i] = clm_matrixWrapArray(input.values, input.length); batchOutputs[i] = clm_matrixWrapArray(output.values, output.length); } // Forward pass clm_Matrix *currentXs = batchInputs; for(unsigned int i = 0; i < nn.numLayers; i++) { clm_linearForward(&nn.layers[i], batchSize, currentXs, nn.layers[i].output); currentXs = nn.layers[i].output; } clm_Linear *lastLayer = &nn.layers[nn.numLayers - 1]; for(unsigned int b = 0; b < batchSize; b++) { // Error of last layer = y - yhat clm_matrixCopy(batchOutputs[b], lastLayer->error[b]); // lastLayer.error = y clm_matrixSubtractMatrix(lastLayer->error[b], lastLayer->output[b]); // lastLayer.error -= yhat } for(int i = nn.numLayers - 1; i >= 0; i--) { clm_Linear *layer = &nn.layers[i]; clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output; clm_Matrix *outputsOfThisLayer = layer->output; clm_linearBackprop(layer, nn.learnRate, batchSize, inputsToThisLayer, outputsOfThisLayer, layer->error, i > 0, i == 0 ? NULL : nn.layers[i - 1].error, layer->weightsError, layer->gradient); for(unsigned int b = 0; b < batchSize; b++) { clm_matrixAddMatrix(layer->weights, layer->weightsError[b]); clm_matrixAddMatrix(layer->bias, layer->gradient[b]); } } /*for(int i = nn.numLayers - 1; i >= 0; i--) { clm_Linear layer = nn.layers[i]; clm_Matrix *inputsToThisLayer = i == 0 ? batchInputs : nn.layers[i - 1].output; clm_Matrix *outputsOfThisLayer = nn.layers[i].output; clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error; clm_Matrix error = layer.error; if(i == nn.numLayers - 1) { clm_matrixSubtractMatrix(clm_matrixCopy(batchOutputs[0], error), outputsOfThisLayer[0]); // yhat - y } else { clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights); clm_matrixMultiplyMatrix(weightsT, prevError, error); } clm_Matrix gradient = clm_matrixDSigmoid(outputsOfThisLayer[0]); // dsig(yhat) clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat) clm_matrixMultiplyScalar(gradient, nn.learnRate); clm_Matrix inputT = clm_matrixTranspose(inputsToThisLayer[0]); clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError); clm_matrixAddMatrix(layer.weights, layer.weightsError); clm_matrixAddMatrix(layer.bias, gradient); }*/ } free(batchInputs); free(batchOutputs); } void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) { FILE *file = fopen("data/train-labels.idx1-ubyte", "r"); if(!file) { perror("Failed to open labels\n"); return; } unsigned char magicBytes[4]; fread(magicBytes, sizeof(magicBytes), 1, file); printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]); unsigned char lengthBytes[4]; fread(lengthBytes, sizeof(lengthBytes), 1, file); uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3]; printf("%" PRId32 "\n", length); clm_Vector *vectors = calloc(length, sizeof(clm_Vector)); for(unsigned int i = 0; i < length; i++) { unsigned char label; fread(&label, sizeof(unsigned char), 1, file); clm_Vector vector = clm_vectorCreate(10); for(unsigned int j = 0; j < 10; j++) { vector.values[j] = label == j ? 1 : 0; } vectors[i] = vector; } *labelsOut = vectors; *labelsCountOut = length; } void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) { FILE *file = fopen("data/train-images.idx3-ubyte", "r"); if(!file) { perror("Failed to open images\n"); return; } unsigned char magicBytes[4]; fread(magicBytes, sizeof(magicBytes), 1, file); printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]); unsigned char lengthBytes[4]; fread(lengthBytes, sizeof(lengthBytes), 1, file); uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3]; printf("%" PRId32 "\n", length); unsigned char rowsBytes[4]; fread(rowsBytes, sizeof(rowsBytes), 1, file); uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3]; printf("%" PRId32 "\n", rows); unsigned char colsBytes[4]; fread(colsBytes, sizeof(colsBytes), 1, file); uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3]; printf("%" PRId32 "\n", cols); clm_Vector *images = calloc(length, sizeof(clm_Vector)); for(unsigned int i = 0; i < length; i++) { clm_Vector vec = clm_vectorCreate(cols * rows); unsigned char image[cols * rows]; fread(image, sizeof(image), 1, file); for(unsigned int j = 0; j < cols * rows; j++) { vec.values[j] = (float) image[j]; } images[i] = vec; } *imagesOut = images; *imageCountOut = length; } int main() { if(clm_gpuInit() != 0) { printf("Failed to init GPU\n"); return 1; } clm_Vector *labels = NULL; unsigned int labelCount; loadLabels(&labels, &labelCount); printf("LENGTH: %u\n", labelCount); clm_Vector *images = NULL; unsigned int imageCount; loadImages(&images, &imageCount); imageCount = 60000; printf("%f\n", images[0].values[0]); srand(1); unsigned int i = 784, h = 30, o = 10; clm_Linear layers[] = { clm_linearCreateRandom(i, h), clm_linearCreateRandom(h, o)}; clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 1000); for(unsigned int epoch = 0; epoch < 1; epoch++) { printf("Epoch %u\n", epoch); /*for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample if(idx % 1000 == 0) { printf("\r%.2f%%", idx / (float) imageCount * 100); fflush(stdout); } }*/ train(nn, imageCount, images, labels); printf("\n"); } printf("Train done\n"); unsigned int correct = 0; for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample // printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0], // train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]); float *pred = predict(nn, images[idx]); unsigned int predDigit = 0; float max = -1; for(unsigned int j = 0; j < 10; j++) { // printf("%.2f ", pred[j]); if(pred[j] > max || max < 0) { max = pred[j]; predDigit = j; } } // if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max); unsigned int actDigit = 0; float maxA = -1; for(unsigned int j = 0; j < 10; j++) { // printf("%.2f ", pred[j]); if(labels[idx].values[j] > maxA || maxA < 0) { maxA = labels[idx].values[j]; actDigit = j; } } // if(idx < 100) printf("Actual: %u\n", actDigit); // printf("\n"); if(predDigit == actDigit) correct++; } printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100); printf("\n"); clm_gpuDestroy(); }