diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..b7c077b --- /dev/null +++ b/.clang-format @@ -0,0 +1,29 @@ +Language: Cpp +BasedOnStyle: LLVM +IndentWidth: 4 +UseTab: Always +TabWidth: 4 +ColumnLimit: 0 +IndentCaseLabels: true +AllowShortIfStatementsOnASingleLine: true +FixNamespaceComments: false +SpaceBeforeParens: Never +SpaceAfterCStyleCast: true +SeparateDefinitionBlocks: Always +PackConstructorInitializers: Never +IncludeBlocks: Preserve +SpaceBeforeInheritanceColon: false +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignTrailingComments: false +AlignOperands: false +AlignEscapedNewlines: false +AlignConsecutiveMacros: false +AllowShortCaseLabelsOnASingleLine: false +SpaceBeforeCtorInitializerColon: false +SpaceBeforeAssignmentOperators: true +AllowShortLoopsOnASingleLine: true +AlignAfterOpenBracket: DontAlign +LambdaBodyIndentation: Signature +LineEnding: LF +ContinuationIndentWidth: 4 diff --git a/.vscode/settings.json b/.vscode/settings.json index 28c23cb..030229e 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,6 +8,7 @@ "string": "c", "string_view": "c", "stdint.h": "c", - "inttypes.h": "c" + "inttypes.h": "c", + "cl.h": "c" } } diff --git a/Makefile b/Makefile index 179731b..15c72a4 100644 --- a/Makefile +++ b/Makefile @@ -10,3 +10,12 @@ all: .PHONY: run run: all $(BUILD)/cltest + +.PHONY: cl +cl: + mkdir -p $(BUILD) + gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c + +.PHONY: cl_run +cl_run: cl + $(BUILD)/cl diff --git a/src/cl.c b/src/cl.c new file mode 100644 index 0000000..925c9ba --- /dev/null +++ b/src/cl.c @@ -0,0 +1,128 @@ +#define CL_TARGET_OPENCL_VERSION 300 + +#include +#include +#include +#include + +typedef struct __attribute__ ((packed)) { + +} cl_GPUMat; + +char *loadFile(const char *path) { + FILE *file = fopen(path, "r"); + fseek(file, 0, SEEK_END); + size_t length = ftell(file); + fseek(file, 0, SEEK_SET); + char *buffer = calloc(1, length + 1); + fread(buffer, length, 1, file); + return buffer; +} + +int main() { + // Connect to a compute device + int useGPU = true; + cl_device_id deviceID; + cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL); + if (err != CL_SUCCESS) { + printf("Error: Failed to create a device group!\n"); + return 1; + } + + char *buffer = loadFile("src/mat.cl"); + printf("%s", buffer); + + cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err); + if(!context) { + printf("Failed to create context\n"); + return 1; + } + + cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err); + if(!queue) { + printf("Failed to create command queue\n"); + return 1; + } + + size_t length = strlen(buffer); + cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err); + if(!program) { + printf("Failed to create program\n"); + return 1; + } + + err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); + if(err != CL_SUCCESS) { + printf("Failed to build program\n"); + // clGetProgramBuildInfo... + return 1; + } + + cl_kernel kernel = clCreateKernel(program, "do_stuff", &err); + if(!kernel) { + printf("Failed to create kernel\n"); + return 1; + } + + unsigned int inputSize = 256000000; + float *inputData = calloc(inputSize, sizeof(float)); + for(unsigned int i = 0; i < inputSize; i++) { + inputData[i] = i; + } + + cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err); + cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err); + if(!input || !output) { + printf("Failed to allocate input/output buffer\n"); + return 1; + } + + err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL); + if(err != CL_SUCCESS) { + printf("Failed to write to buffer\n"); + return 1; + } + + err = 0; + err = clSetKernelArg(kernel, 0, sizeof(input), &input); + err |= clSetKernelArg(kernel, 1, sizeof(output), &output); + err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize); + if(err != CL_SUCCESS) { + printf("Failed to set kernel args\n"); + return 1; + } + + /*char *info = calloc(1, 1024); + clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL); + printf("INFO: %s\n", info);*/ + + size_t local; + err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); + if(err != CL_SUCCESS) { + printf("Failed to get work group size\n"); + return 1; + } + + printf("Group size is %zu\n", local); + + size_t global = inputSize; + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); + if(err != CL_SUCCESS) { + printf("Failed to enqueue\n"); + return 1; + } + + clFinish(queue); + + float *outputData = calloc(inputSize, sizeof(float)); + err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL); + if(err != CL_SUCCESS) { + printf("Failed to read from buffer\n"); + return 1; + } + + for(unsigned int i = 0; i < inputSize; i++) { + if(i % 1000 != 0) continue; + printf("%f: %f\n", inputData[i], outputData[i]); + } +} diff --git a/src/clm.c b/src/clm.c index 1771ce6..e7c753a 100644 --- a/src/clm.c +++ b/src/clm.c @@ -1,18 +1,20 @@ -#include -#include -#include #include +#include +#include +#include #include "clm.h" -const clm_Matrix INVALID_MATRIX = { .rows = 0, .cols = 0, .values = NULL }; -const clm_Vector INVALID_VECTOR = { .length = 0, .values = NULL }; +const clm_Matrix INVALID_MATRIX = {.rows = 0, .cols = 0, .values = NULL}; +const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL}; clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) { + printf("CREATING MATRIX\n"); clm_Matrix mat; mat.rows = rows; mat.cols = cols; mat.values = calloc(rows * cols, sizeof(float)); + mat.transposed = false; return mat; } @@ -28,6 +30,14 @@ clm_Matrix clm_createMatrixRandom(unsigned int rows, unsigned int cols) { return mat; } +clm_Matrix clm_matrixZero(clm_Matrix mat) { + for(unsigned int i = 0; i < mat.rows * mat.cols; i++) { + mat.values[i] = 0; + } + + return mat; +} + void clm_freeMatrix(clm_Matrix mat) { free(mat.values); } @@ -50,7 +60,7 @@ clm_Matrix clm_matrixAddScalar(clm_Matrix mat, float scalar) { clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) { if(mat.cols != other.cols || mat.rows != other.rows) { - printf("Failed to add matrices\n"); + printf("Failed to add matrices (got %dx%d and %dx%d)\n", mat.cols, mat.rows, other.cols, other.rows); return INVALID_MATRIX; } @@ -58,14 +68,12 @@ clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other) { mat.values[i] += other.values[i]; } - //clm_freeMatrix(other); - return mat; } clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) { if(mat.cols != other.cols || mat.rows != other.rows) { - printf("Failed to add matrices\n"); + printf("Failed to sub matrices\n"); return INVALID_MATRIX; } @@ -73,45 +81,46 @@ clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other) { mat.values[i] -= other.values[i]; } - //clm_freeMatrix(other); - return mat; } clm_Matrix clm_matrixTranspose(clm_Matrix mat) { - clm_Matrix tr = clm_createMatrix(mat.cols, mat.rows); - for(unsigned int i = 0; i < tr.rows; i++) { - for(unsigned int j = 0; j < tr.cols; j++) { - tr.values[i * tr.cols + j] = mat.values[j * mat.cols + i]; - } - } - - //clm_freeMatrix(mat); - + clm_Matrix tr; + tr.cols = mat.rows; + tr.rows = mat.cols; + tr.values = mat.values; + tr.transposed = !mat.transposed; return tr; } -clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b) { +clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out) { if(a.cols != b.rows) { - printf("Cannot multiply matrices\n"); + printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols); return INVALID_MATRIX; } - clm_Matrix r = clm_createMatrix(a.rows, b.cols); - for(unsigned int i = 0; i < r.rows; i++) { - for(unsigned int j = 0; j < r.cols; j++) { + if(out.rows != a.rows || out.cols != b.cols) { + printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols); + return INVALID_MATRIX; + } + + for(unsigned int i = 0; i < out.rows; i++) { + for(unsigned int j = 0; j < out.cols; j++) { float sum = 0; for(unsigned int k = 0; k < a.cols; k++) { sum += a.values[i * a.cols + k] * b.values[k * b.cols + j]; } - r.values[i * r.cols + j] = sum; + out.values[i * out.cols + j] = sum; } } - //clm_freeMatrix(a); - //clm_freeMatrix(b); + return out; +} - return r; +clm_Matrix clm_matrixMultiplyMatrixALLOC(clm_Matrix a, clm_Matrix b) { + clm_Matrix out = clm_createMatrix(a.rows, b.cols); + clm_matrixMultiplyMatrix(a, b, out); + return out; } clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other) { @@ -138,7 +147,7 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) { clm_Matrix clm_matrixSigmoid(clm_Matrix mat) { for(unsigned int i = 0; i < mat.rows; i++) { for(unsigned int j = 0; j < mat.cols; j++) { - mat.values[i * mat.cols + j] = 1 / (1 + exp(-mat.values[i * mat.cols + j])); + matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j))); } } @@ -148,8 +157,8 @@ clm_Matrix clm_matrixSigmoid(clm_Matrix mat) { clm_Matrix clm_matrixDSigmoid(clm_Matrix mat) { for(unsigned int i = 0; i < mat.rows; i++) { for(unsigned int j = 0; j < mat.cols; j++) { - float v = mat.values[i * mat.cols + j]; - mat.values[i * mat.cols + j] = v * (1 - v); + float v = matrixAt(mat, i, j); + matrixAt(mat, i, j) = v * (1 - v); } } @@ -162,6 +171,14 @@ clm_Matrix clm_matrixFromArray(float *array, unsigned int length) { return matrix; } +clm_Matrix clm_matrixWrapArray(float *array, unsigned int length) { + clm_Matrix mat; + mat.rows = length; + mat.cols = 1; + mat.values = array; + return mat; +} + bool clm_matrixIsInvalid(clm_Matrix mat) { return mat.values == NULL; } @@ -181,6 +198,10 @@ clm_Linear clm_linearCreateRandom(unsigned int inputs, unsigned int outputs) { clm_Linear linear; linear.weights = clm_createMatrixRandom(outputs, inputs); linear.bias = clm_createMatrixRandom(outputs, 1); + + linear.output = clm_createMatrix(outputs, 1); + linear.error = clm_createMatrix(outputs, 1); + linear.weightsError = clm_createMatrix(outputs, inputs); return linear; } @@ -197,7 +218,7 @@ void clm_matrixPrint(clm_Matrix mat) { printf("[\n"); for(unsigned int i = 0; i < mat.rows; i++) { for(unsigned int j = 0; j < mat.cols; j++) { - printf("%7.3f", mat.values[i * mat.cols + j]); + printf("%7.3f", matrixAt(mat, i, j)); } printf("\n"); } diff --git a/src/clm.h b/src/clm.h index 9c71566..af20484 100644 --- a/src/clm.h +++ b/src/clm.h @@ -3,10 +3,13 @@ #include +#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)] + typedef struct { float *values; unsigned int rows; unsigned int cols; + bool transposed; } clm_Matrix; typedef struct { @@ -17,6 +20,9 @@ typedef struct { typedef struct { clm_Matrix weights; clm_Matrix bias; + clm_Matrix output; + clm_Matrix error; + clm_Matrix weightsError; } clm_Linear; typedef struct { @@ -29,17 +35,20 @@ extern const clm_Matrix INVALID_MATRIX; extern const clm_Vector INVALID_VECTOR; clm_Matrix clm_createMatrixRandom(unsigned int rows, unsigned int cols); +clm_Matrix clm_matrixZero(clm_Matrix mat); clm_Matrix clm_matrixAddScalar(clm_Matrix mat, float scalar); clm_Matrix clm_matrixAddMatrix(clm_Matrix mat, clm_Matrix other); clm_Matrix clm_matrixSubtractMatrix(clm_Matrix mat, clm_Matrix other); clm_Matrix clm_matrixTranspose(clm_Matrix mat); -clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b); +clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out); +clm_Matrix clm_matrixMultiplyMatrixALLOC(clm_Matrix a, clm_Matrix b); clm_Matrix clm_matrixMultiplyMatrixElements(clm_Matrix mat, clm_Matrix other); clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar); clm_Matrix clm_matrixSigmoid(clm_Matrix mat); clm_Matrix clm_matrixDSigmoid(clm_Matrix mat); clm_Matrix clm_matrixFromArray(float *array, unsigned int length); +clm_Matrix clm_matrixWrapArray(float *array, unsigned int length); clm_Matrix clm_matrixCopy(clm_Matrix matrix); bool clm_matrixIsInvalid(clm_Matrix mat); diff --git a/src/cltest.c b/src/cltest.c index 0e1ddc6..da9ca0d 100644 --- a/src/cltest.c +++ b/src/cltest.c @@ -1,6 +1,7 @@ +#include +#include #include #include -#include #include "clm.h" @@ -8,22 +9,20 @@ float train_data_x[4][2] = { {0, 0}, {0, 1}, {1, 0}, - {1, 1} -}; + {1, 1}}; float train_data_y[4][1] = { {0}, {1}, {1}, - {0} -}; + {0}}; float *predict(clm_NN nn, float *x, unsigned int length) { - clm_Matrix xM = clm_matrixFromArray(x, length); + clm_Matrix xM = clm_matrixWrapArray(x, length); for(unsigned int i = 0; i < nn.numLayers; i++) { clm_Linear layer = nn.layers[i]; - clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM); + clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output); if(clm_matrixIsInvalid(newX)) { printf("Failed to predict\n"); @@ -32,7 +31,6 @@ float *predict(clm_NN nn, float *x, unsigned int length) { clm_matrixAddMatrix(newX, layer.bias); clm_matrixSigmoid(newX); - clm_freeMatrix(xM); xM = newX; } @@ -40,17 +38,16 @@ float *predict(clm_NN nn, float *x, unsigned int length) { } void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) { - clm_Matrix xM = clm_matrixFromArray(x, xL); - clm_Matrix yM = clm_matrixFromArray(y, yL); + clm_Matrix xM = clm_matrixWrapArray(x, xL); + clm_Matrix yM = clm_matrixWrapArray(y, yL); // TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached) // Forward pass - clm_Matrix *outputs = calloc(nn.numLayers + 1 /* 1 for input */, sizeof(clm_Matrix)); - outputs[0] = xM; + clm_Matrix currentX = xM; for(unsigned int i = 0; i < nn.numLayers; i++) { clm_Linear layer = nn.layers[i]; - clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM); + clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output); if(clm_matrixIsInvalid(newX)) { printf("Forward pass failed\n"); return; @@ -58,59 +55,34 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) { clm_matrixAddMatrix(newX, layer.bias); clm_matrixSigmoid(newX); - xM = newX; - outputs[i + 1] = xM; + currentX = newX; } - clm_Matrix dError = clm_matrixSubtractMatrix(yM, outputs[nn.numLayers]); // yhat - y - - clm_Matrix lastGradient = clm_matrixDSigmoid(clm_matrixCopy(outputs[nn.numLayers])); // dsig(yhat) - clm_matrixMultiplyMatrixElements(lastGradient, dError); // (yhat - y) . dsig(yhat) - clm_matrixMultiplyScalar(lastGradient, nn.learnRate); - - clm_Matrix lastInputT = clm_matrixTranspose(outputs[nn.numLayers - 1]); - clm_Matrix lastDeltaW = clm_matrixMultiplyMatrix(lastGradient, lastInputT); - clm_freeMatrix(lastInputT); - - clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].weights, lastDeltaW); - clm_matrixAddMatrix(nn.layers[nn.numLayers - 1].bias, lastGradient); - - clm_freeMatrix(lastDeltaW); - clm_freeMatrix(lastGradient); - - for(int i = nn.numLayers - 2; i >= 0; i--) { + for(int i = nn.numLayers - 1; i >= 0; i--) { clm_Linear layer = nn.layers[i]; - clm_Matrix inputToThisLayer = outputs[i]; - clm_Matrix outputOfThisLayer = outputs[i + 1]; + clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output; + clm_Matrix outputOfThisLayer = nn.layers[i].output; + clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error; + clm_Matrix error = layer.error; - clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights); - clm_Matrix newDError = clm_matrixMultiplyMatrix(weightsT, dError); - clm_freeMatrix(weightsT); - clm_freeMatrix(dError); - dError = newDError; + if(i == nn.numLayers - 1) { + clm_matrixZero(error); // Zero the error matrix + clm_matrixSubtractMatrix(clm_matrixAddMatrix(error, yM), outputOfThisLayer); // yhat - y + } else { + clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights); + clm_matrixMultiplyMatrix(weightsT, prevError, error); + } - clm_Matrix gradient = clm_matrixDSigmoid(clm_matrixCopy(outputOfThisLayer)); - clm_matrixMultiplyMatrixElements(gradient, dError); + clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat) + clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat) clm_matrixMultiplyScalar(gradient, nn.learnRate); clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer); - clm_Matrix deltaW = clm_matrixMultiplyMatrix(gradient, inputT); - clm_freeMatrix(inputT); + clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError); - clm_matrixAddMatrix(layer.weights, deltaW); + clm_matrixAddMatrix(layer.weights, layer.weightsError); clm_matrixAddMatrix(layer.bias, gradient); - - clm_freeMatrix(deltaW); - clm_freeMatrix(gradient); } - - clm_freeMatrix(dError); - - for(unsigned int i = 0; i <= nn.numLayers; i++) { - clm_freeMatrix(outputs[i]); - } - - free(outputs); } void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) { @@ -128,7 +100,8 @@ void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) { unsigned char lengthBytes[4]; fread(lengthBytes, sizeof(lengthBytes), 1, file); - uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3]; + uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | + (lengthBytes[2] << 8) | lengthBytes[3]; printf("%" PRId32 "\n", length); clm_Vector *vectors = calloc(length, sizeof(clm_Vector)); @@ -190,7 +163,13 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) { *imageCountOut = length; } +typedef void *(*callocFunc)(size_t, size_t); + +callocFunc oldCalloc; + int main() { + oldCalloc = dlsym(RTLD_NEXT, "calloc"); + clm_Vector *labels = NULL; unsigned int labelCount; loadLabels(&labels, &labelCount); @@ -211,50 +190,48 @@ int main() { h = 30, o = 10; - clm_Linear layer1; - layer1.weights = clm_createMatrixRandom(h, i); - layer1.bias = clm_createMatrixRandom(h, 1); - - clm_Linear layer2; - layer2.weights = clm_createMatrixRandom(o, h); - layer2.bias = clm_createMatrixRandom(o, 1); - + clm_Linear layer1 = clm_linearCreateRandom(i, h); + clm_Linear layer2 = clm_linearCreateRandom(h, o); clm_Linear layers[] = {layer1, layer2}; - clm_NN nn = { layers, sizeof(layers) / sizeof(clm_Linear), 0.01 }; + clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01}; - for(unsigned int epoch = 0; epoch < 10; epoch++) { + for(unsigned int epoch = 0; epoch < 10; epoch++) { printf("Epoch %u\n", epoch); - for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample if(idx % 1000 == 0) { - printf("%.2f%%\n", idx / (float) imageCount * 100); + printf("\r%.2f%%", idx / (float) imageCount * 100); + fflush(stdout); } - //printf("%u\n", idx); - //train(nn, train_data_x[idx], 2, train_data_y[idx], 1); + // printf("%u\n", idx); + // train(nn, train_data_x[idx], 2, train_data_y[idx], 1); /*for(unsigned int f = 0; f < images[idx].length; f++) { - printf("%.2f ", images[idx].values[f]); + printf("%.2f ", images[idx].values[f]); } printf("\n"); for(unsigned int f = 0; f < labels[idx].length; f++) { - printf("%.2f ", labels[idx].values[f]); + printf("%.2f ", labels[idx].values[f]); } printf("\n");*/ - //printf("%.2f\n", labels.values[idx]); + // printf("%.2f\n", labels.values[idx]); train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length); - //train(nn, test, 784, target, 10); - //predict(nn, test, 784); + // train(nn, test, 784, target, 10); + // predict(nn, test, 784); } + printf("\n"); } + printf("Train done\n"); + unsigned int correct = 0; for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample - //printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0], train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]); + // printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0], + // train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]); float *pred = predict(nn, images[idx].values, images[idx].length); unsigned int predDigit = 0; float max = -1; for(unsigned int j = 0; j < 10; j++) { - //printf("%.2f ", pred[j]); + // printf("%.2f ", pred[j]); if(pred[j] > max || max < 0) { max = pred[j]; predDigit = j; @@ -265,14 +242,14 @@ int main() { unsigned int actDigit = 0; float maxA = -1; for(unsigned int j = 0; j < 10; j++) { - //printf("%.2f ", pred[j]); + // printf("%.2f ", pred[j]); if(labels[idx].values[j] > maxA || maxA < 0) { maxA = labels[idx].values[j]; actDigit = j; } } if(idx < 100) printf("Actual: %u\n", actDigit); - //printf("\n"); + // printf("\n"); if(predDigit == actDigit) correct++; } @@ -281,3 +258,8 @@ int main() { printf("\n"); } + +void *calloc(size_t nmemb, size_t size) { + // printf("CALLOC\n"); + return oldCalloc(nmemb, size); +} diff --git a/src/mat.cl b/src/mat.cl new file mode 100644 index 0000000..babdf5b --- /dev/null +++ b/src/mat.cl @@ -0,0 +1,11 @@ +typedef struct __attribute__ ((packed)) { + +} cl_GPUMat; + +void amogus() { + printf("HEllo world!"); +} + +__kernel void mat_multiply(__global float *nnWeights, __global float *inputs, __global float *c) { + amogus(); +}