diff --git a/Makefile b/Makefile
index 44af914..a40a3e2 100644
--- a/Makefile
+++ b/Makefile
@@ -2,24 +2,41 @@
 SRC=src
 BUILD=build
 CFLAGS=-Wall -g
-CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c
-CLM_SOURCES_CPU=$(CLM_SOURCES) $(SRC)/clm_gpu_cpu.c
-CLM_SOURCES_OPENCL=$(CLM_SOURCES) $(SRC)/clm_gpu_opencl.c
+# Sources of the front-end library (libclm.so); each backend is built as its own shared object.
+CLM_SOURCES=$(SRC)/clm.c $(SRC)/clm_util.c $(SRC)/clm_funcs.c
 
 .PHONY: all
-all:
-	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(CLM_SOURCES_OPENCL)
+all: libclm_cpu libclm_opencl cltest
 
-.PHONY: run
-run: all
-	$(BUILD)/cltest
+# Run the test driver from inside $(BUILD) with the CPU backend.
+.PHONY: run_cpu
+run_cpu: all
+	cd $(BUILD) && ./cltest cpu
 
-.PHONY: cl
-cl:
+# Run the test driver from inside $(BUILD) with the OpenCL backend.
+.PHONY: run_opencl
+run_opencl: all
+	cd $(BUILD) && ./cltest opencl
+
+# Libraries are listed after the sources that use them: the linker processes inputs in command-line order.
+.PHONY: cltest
+cltest: libclm
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(CLM_SOURCES_OPENCL)
+	gcc $(CFLAGS) -Wl,-rpath -Wl,'$$ORIGIN' -L$(BUILD) -o $(BUILD)/cltest $(SRC)/cltest.c -lclm -lOpenCL -lm
 
-.PHONY: cl_run
-cl_run: cl
-	$(BUILD)/cl
+# Front-end library. It calls dlopen(), hence -ldl; the $$ORIGIN rpath presumably lets it find the backends next to it.
+.PHONY: libclm
+libclm:
+	mkdir -p $(BUILD)
+	gcc -shared -fpic $(CFLAGS) -Wl,-rpath -Wl,'$$ORIGIN' -o $(BUILD)/libclm.so $(CLM_SOURCES) -lOpenCL -lm -ldl
+
+# The backends link against libclm (-lclm), so libclm has to be built first.
+.PHONY: libclm_cpu
+libclm_cpu: libclm
+	mkdir -p $(BUILD)
+	gcc -shared -fpic $(CFLAGS) -L$(BUILD) -o $(BUILD)/libclm_cpu.so $(SRC)/clm_cpu.c -lclm -lOpenCL -lm
+
+.PHONY: libclm_opencl
+libclm_opencl: libclm
+	mkdir -p $(BUILD)
+	gcc -shared -fpic $(CFLAGS) -L$(BUILD) -o $(BUILD)/libclm_opencl.so $(SRC)/clm_opencl.c -lclm -lOpenCL -lm
diff --git a/src/cl.c b/src/cl.c
deleted file mode 100644
index 6e0c937..0000000
--- a/src/cl.c
+++ /dev/null
@@ -1,141 +0,0 @@
-#define CL_TARGET_OPENCL_VERSION 200
-#include
-#include
-
-#include "clm.h"
-#include "util.h"
-
-#include
-#include
-#include
-#include
-
-typedef struct __attribute__((packed)) {
-	cl_uint rows;
-	cl_uint cols;
-	cl_char transposed;
-} cl_GPUMat;
-
-int main() {
-	// Connect to a compute device
-	int useGPU = true;
-	cl_device_id deviceID;
-	cl_int err = clGetDeviceIDs(NULL, useGPU ?
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL); - if(err != CL_SUCCESS) { - printf("Error: Failed to create a device group!\n"); - return 1; - } - - char *buffer = loadFile("src/mat.cl"); - printf("%s", buffer); - - cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err); - if(!context) { - printf("Failed to create context\n"); - return 1; - } - - cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err); - if(!queue) { - printf("Failed to create command queue\n"); - return 1; - } - - size_t length = strlen(buffer); - cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err); - if(!program) { - printf("Failed to create program\n"); - return 1; - } - - err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to build program\n"); - // clGetProgramBuildInfo... - return 1; - } - - cl_kernel kernel = clCreateKernel(program, "mat_multiply", &err); - if(!kernel) { - printf("Failed to create kernel\n"); - return 1; - } - - clm_Matrix a = clm_createMatrixRandom(5, 10); - clm_Matrix b = clm_createMatrixRandom(10, 3); - clm_Matrix out = clm_createMatrixRandom(5, 3); - - cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed}; - cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed}; - cl_GPUMat matOut = {.rows = out.rows, .cols = out.cols, .transposed = out.transposed}; - - size_t inputSize = out.rows * out.cols; - - clm_matrixMultiplyMatrix(a, b, out); - - clm_matrixPrint(out); - clm_matrixZero(out); - - cl_mem matA_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * a.rows * a.cols, NULL, &err); - cl_mem matB_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * b.rows * b.cols, NULL, &err); - cl_mem matOut_values = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * out.rows * out.cols, NULL, &err); - if(!matA_values || !matB_values || 
!matOut_values) { - printf("Failed to allocate input/output buffer\n"); - return 1; - } - - err = clEnqueueWriteBuffer(queue, matA_values, CL_TRUE, 0, sizeof(float) * a.rows * a.cols, a.values, 0, NULL, NULL); - err |= clEnqueueWriteBuffer(queue, matB_values, CL_TRUE, 0, sizeof(float) * b.rows * b.cols, b.values, 0, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to write to buffer\n"); - return 1; - } - - err = 0; - err |= clSetKernelArg(kernel, 0, sizeof(matA), &matA); - err |= clSetKernelArg(kernel, 1, sizeof(matA_values), &matA_values); - err |= clSetKernelArg(kernel, 2, sizeof(matB), &matB); - err |= clSetKernelArg(kernel, 3, sizeof(matB_values), &matB_values); - err |= clSetKernelArg(kernel, 4, sizeof(matOut), &matOut); - err |= clSetKernelArg(kernel, 5, sizeof(matOut_values), &matOut_values); - // err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize); - if(err != CL_SUCCESS) { - printf("Failed to set kernel args\n"); - return 1; - } - - /*char *info = calloc(1, 1024); - clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL); - printf("INFO: %s\n", info);*/ - - size_t local; - err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); - if(err != CL_SUCCESS) { - printf("Failed to get work group size\n"); - return 1; - } - - printf("Group size is %zu\n", local); - - size_t global = ceil((float) inputSize / local) * local; - err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to enqueue: %d\n", err); - return 1; - } - - clFinish(queue); - - err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * inputSize, out.values, 0, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to read from buffer\n"); - return 1; - } - - clm_matrixPrint(out); - - /*for(unsigned int i = 0; i < inputSize; i++) { - if(i % 1000 != 0) continue; - printf("%f: %f\n", inputData[i], outputData[i]); - }*/ -} 
diff --git a/src/clm.c b/src/clm.c
index 95b428e..732313f 100644
--- a/src/clm.c
+++ b/src/clm.c
@@ -9,7 +9,6 @@ const clm_Matrix INVALID_MATRIX = {.rows = 0, .cols = 0, .values = NULL};
 const clm_Vector INVALID_VECTOR = {.length = 0, .values = NULL};
 
 clm_Matrix clm_createMatrix(unsigned int rows, unsigned int cols) {
-	printf("CREATING MATRIX\n");
 	clm_Matrix mat = {0};
 	mat.rows = rows;
 	mat.cols = cols;
diff --git a/src/clm.h b/src/clm.h
index aac9246..fbb181b 100644
--- a/src/clm.h
+++ b/src/clm.h
@@ -1,6 +1,8 @@
 #ifndef _CLM_H_
 #define _CLM_H_
 
+#define CL_TARGET_OPENCL_VERSION 200
+#include
 #include
 
 #define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
@@ -26,6 +28,7 @@ typedef struct {
 	clm_Matrix *error;
 	clm_Matrix *weightsError;
 	clm_Matrix *gradient;
+
 	clm_NativeBuf *nativeWeights;
 	clm_NativeBuf *nativeBias;
 	clm_NativeBuf *nativeOutput;
diff --git a/src/clm_gpu_cpu.c b/src/clm_cpu.c
similarity index 98%
rename from src/clm_gpu_cpu.c
rename to src/clm_cpu.c
index 0865a26..2d4c5b4 100644
--- a/src/clm_gpu_cpu.c
+++ b/src/clm_cpu.c
@@ -5,7 +5,7 @@
 
 struct clm_NativeBuf {};
 
-int clm_gpuInit() {
+int clm_gpuInit(unsigned int mode) {
 	return 0;
 }
 
diff --git a/src/clm_funcs.c b/src/clm_funcs.c
new file mode 100644
index 0000000..3405b9a
--- /dev/null
+++ b/src/clm_funcs.c
@@ -0,0 +1,89 @@
+#include "clm_gpu.h"
+
+#include <dlfcn.h>
+#include <stdio.h>
+
+// Signatures of the entry points that are looked up in a backend library
+typedef int (*clm_gpuInitFunc)(unsigned int);
+typedef void (*clm_gpuDestroyFunc)();
+typedef void (*clm_linearForwardFunc)(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs);
+typedef void (*clm_linearBackpropFunc)(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients);
+
+// Handle of the loaded backend library and its resolved entry points (NULL while nothing is loaded)
+static void *lib;
+static clm_gpuInitFunc initFunc;
+static clm_gpuDestroyFunc destroyFunc;
+static clm_linearForwardFunc linearForwardFunc;
+static clm_linearBackpropFunc linearBackpropFunc;
+
+// Closes the backend library (if one is open) and forgets all resolved entry points
+static void clm_unloadBackend() {
+	if(lib) dlclose(lib);
+	lib = NULL;
+	initFunc = NULL;
+	destroyFunc = NULL;
+	linearForwardFunc = NULL;
+	linearBackpropFunc = NULL;
+}
+
+// Resolves one symbol from the loaded backend and reports a failed lookup right away,
+// because a later dl* call may replace or clear the message returned by dlerror()
+static void *clm_loadSymbol(const char *name) {
+	void *symbol = dlsym(lib, name);
+	if(!symbol) {
+		const char *error = dlerror();
+		printf("Failed to load function %s from library: %s\n", name, error ? error : "unknown error");
+	}
+	return symbol;
+}
+
+// Loads the backend library selected by mode (CLM_MODE_CPU is checked before CLM_MODE_OPENCL),
+// resolves its entry points and forwards to the backend's own clm_gpuInit.
+// Returns 1 if the mode is invalid or the library/its functions can't be loaded, otherwise the backend's result
+int clm_gpuInit(unsigned int mode) {
+	const char *libName = NULL;
+	if(mode & CLM_MODE_CPU) {
+		libName = "libclm_cpu.so";
+	} else if(mode & CLM_MODE_OPENCL) {
+		libName = "libclm_opencl.so";
+	} else {
+		printf("No valid mode supplied. Must set either CLM_MODE_CPU or CLM_MODE_OPENCL\n");
+		return 1;
+	}
+
+	lib = dlopen(libName, RTLD_NOW);
+	if(!lib) {
+		printf("Failed to load lib: %s\n", dlerror());
+		return 1;
+	}
+
+	initFunc = (clm_gpuInitFunc) clm_loadSymbol("clm_gpuInit");
+	destroyFunc = (clm_gpuDestroyFunc) clm_loadSymbol("clm_gpuDestroy");
+	linearForwardFunc = (clm_linearForwardFunc) clm_loadSymbol("clm_linearForward");
+	linearBackpropFunc = (clm_linearBackpropFunc) clm_loadSymbol("clm_linearBackprop");
+
+	if(!initFunc || !destroyFunc || !linearForwardFunc || !linearBackpropFunc) {
+		// Don't keep a half-resolved backend (and its library handle) around
+		clm_unloadBackend();
+		return 1;
+	}
+
+	return initFunc(mode);
+}
+
+// Shuts down the loaded backend and unloads its library; does nothing when no backend is loaded
+void clm_gpuDestroy() {
+	if(!lib) return;
+	destroyFunc();
+	clm_unloadBackend();
+}
+
+// Forwards to the loaded backend's clm_linearForward; clm_gpuInit must have succeeded before
+void clm_linearForward(clm_Linear *linear, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs) {
+	linearForwardFunc(linear, batchSize, inputs, outputs);
+}
+
+// Forwards to the loaded backend's clm_linearBackprop; clm_gpuInit must have succeeded before
+void clm_linearBackprop(clm_Linear *linear, float learnRate, unsigned int batchSize, clm_Matrix *inputs, clm_Matrix *outputs, clm_Matrix *inputErrors, bool updateErrors, clm_Matrix *outputErrors, clm_Matrix *outputWeightsErrors, clm_Matrix *outputGradients) {
+	linearBackpropFunc(linear, learnRate, batchSize, inputs, outputs, inputErrors, updateErrors, outputErrors, outputWeightsErrors, outputGradients);
+}
diff --git a/src/clm_gpu.h b/src/clm_gpu.h
index 7d0cbfb..9f65027 100644
--- a/src/clm_gpu.h
+++ b/src/clm_gpu.h
@@ -3,7 +3,10 @@
 
 #include "clm.h"
 
-int clm_gpuInit();
+#define CLM_MODE_CPU 1
+#define CLM_MODE_OPENCL 2
+
+int clm_gpuInit(unsigned int mode);
 void clm_gpuDestroy();
 
 void clm_linearForward(clm_Linear *linear, unsigned int batchSize,
clm_Matrix *inputs, clm_Matrix *outputs); diff --git a/src/clm_gpu_opencl.c b/src/clm_opencl.c similarity index 99% rename from src/clm_gpu_opencl.c rename to src/clm_opencl.c index 36f39c1..79df0c8 100644 --- a/src/clm_gpu_opencl.c +++ b/src/clm_opencl.c @@ -33,7 +33,7 @@ typedef struct __attribute__((packed)) { #define gpuMat(mat) \ { .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed } -int clm_gpuInit() { +int clm_gpuInit(unsigned int mode) { // Connect to a compute device int useGPU = true; cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL); diff --git a/src/cltest.bak.c b/src/cltest.bak.c deleted file mode 100644 index 91b7588..0000000 --- a/src/cltest.bak.c +++ /dev/null @@ -1,119 +0,0 @@ -#define CL_TARGET_OPENCL_VERSION 300 - -#include -#include - -int main() -{ - // Connect to a compute device - // - int gpu = 1; - cl_device_id deviceID; - cl_int err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL); - if (err != CL_SUCCESS) { - printf("Error: Failed to create a device group!\n"); - return 1; - } - - FILE *file = fopen("src/test.cl", "r"); - fseek(file, 0, SEEK_END); - size_t length = ftell(file); - fseek(file, 0, SEEK_SET); - char *buffer = calloc(1, length + 1); - fread(buffer, length, 1, file); - - printf("%s", buffer); - - cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err); - if(!context) { - printf("Failed to create context\n"); - return 1; - } - - cl_command_queue queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err); - if(!queue) { - printf("Failed to create command queue\n"); - return 1; - } - - cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err); - if(!program) { - printf("Failed to create program\n"); - return 1; - } - - err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to build program\n"); - // 
clGetProgramBuildInfo... - return 1; - } - - cl_kernel kernel = clCreateKernel(program, "do_stuff", &err); - if(!kernel) { - printf("Failed to create kernel\n"); - return 1; - } - - unsigned int inputSize = 256000000; - float *inputData = calloc(inputSize, sizeof(float)); - for(unsigned int i = 0; i < inputSize; i++) { - inputData[i] = i; - } - - cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err); - cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err); - if(!input || !output) { - printf("Failed to allocate input/output buffer\n"); - return 1; - } - - err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to write to buffer\n"); - return 1; - } - - err = 0; - err = clSetKernelArg(kernel, 0, sizeof(input), &input); - err |= clSetKernelArg(kernel, 1, sizeof(output), &output); - err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize); - if(err != CL_SUCCESS) { - printf("Failed to set kernel args\n"); - return 1; - } - - /*char *info = calloc(1, 1024); - clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL); - printf("INFO: %s\n", info);*/ - - size_t local; - err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); - if(err != CL_SUCCESS) { - printf("Failed to get work group size\n"); - return 1; - } - - printf("Group size is %zu\n", local); - - size_t global = inputSize; - err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to enqueue\n"); - return 1; - } - - clFinish(queue); - - float *outputData = calloc(inputSize, sizeof(float)); - err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL); - if(err != CL_SUCCESS) { - printf("Failed to read from buffer\n"); - return 1; - } - - 
for(unsigned int i = 0; i < inputSize; i++) { - if(i % 1000 != 0) continue; - printf("%f: %f\n", inputData[i], outputData[i]); - } -} diff --git a/src/cltest.bak2.c b/src/cltest.bak2.c deleted file mode 100644 index 4ce80c6..0000000 --- a/src/cltest.bak2.c +++ /dev/null @@ -1,264 +0,0 @@ -#include -#include -#include -#include - -#include "clm.h" - -float train_data_x[4][2] = { - {0, 0}, - {0, 1}, - {1, 0}, - {1, 1}}; - -float train_data_y[4][1] = { - {0}, - {1}, - {1}, - {0}}; - -float *predict(clm_NN nn, float *x, unsigned int length) { - clm_Matrix xM = clm_matrixWrapArray(x, length); - - for(unsigned int i = 0; i < nn.numLayers; i++) { - clm_Linear layer = nn.layers[i]; - clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output); - - if(clm_matrixIsInvalid(newX)) { - printf("Failed to predict\n"); - return NULL; - } - - clm_matrixAddMatrix(newX, layer.bias); - clm_matrixSigmoid(newX); - xM = newX; - } - - return xM.values; -} - -void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) { - clm_Matrix xM = clm_matrixWrapArray(x, xL); - clm_Matrix yM = clm_matrixWrapArray(y, yL); - - // TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached) - - // Forward pass - clm_Matrix currentX = xM; - for(unsigned int i = 0; i < nn.numLayers; i++) { - clm_Linear layer = nn.layers[i]; - clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output); - if(clm_matrixIsInvalid(newX)) { - printf("Forward pass failed\n"); - return; - } - - clm_matrixAddMatrix(newX, layer.bias); - clm_matrixSigmoid(newX); - currentX = newX; - } - - for(int i = nn.numLayers - 1; i >= 0; i--) { - clm_Linear layer = nn.layers[i]; - clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output; - clm_Matrix outputOfThisLayer = nn.layers[i].output; - clm_Matrix prevError = i == nn.numLayers - 1 ? 
INVALID_MATRIX : nn.layers[i + 1].error; - clm_Matrix error = layer.error; - - if(i == nn.numLayers - 1) { - clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y - } else { - clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights); - clm_matrixMultiplyMatrix(weightsT, prevError, error); - } - - clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat) - clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat) - clm_matrixMultiplyScalar(gradient, nn.learnRate); - - clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer); - clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError); - - clm_matrixAddMatrix(layer.weights, layer.weightsError); - clm_matrixAddMatrix(layer.bias, gradient); - } -} - -void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) { - FILE *file = fopen("data/train-labels.idx1-ubyte", "r"); - if(!file) { - perror("Failed to open labels\n"); - return; - } - - unsigned char magicBytes[4]; - fread(magicBytes, sizeof(magicBytes), 1, file); - - printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]); - - unsigned char lengthBytes[4]; - fread(lengthBytes, sizeof(lengthBytes), 1, file); - - uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | - (lengthBytes[2] << 8) | lengthBytes[3]; - printf("%" PRId32 "\n", length); - - clm_Vector *vectors = calloc(length, sizeof(clm_Vector)); - - for(unsigned int i = 0; i < length; i++) { - unsigned char label; - fread(&label, sizeof(unsigned char), 1, file); - - clm_Vector vector = clm_vectorCreate(10); - for(unsigned int j = 0; j < 10; j++) { - vector.values[j] = label == j ? 
1 : 0; - } - vectors[i] = vector; - } - - *labelsOut = vectors; - *labelsCountOut = length; -} - -void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) { - FILE *file = fopen("data/train-images.idx3-ubyte", "r"); - if(!file) { - perror("Failed to open images\n"); - return; - } - - unsigned char magicBytes[4]; - fread(magicBytes, sizeof(magicBytes), 1, file); - - printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]); - - unsigned char lengthBytes[4]; - fread(lengthBytes, sizeof(lengthBytes), 1, file); - uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3]; - printf("%" PRId32 "\n", length); - - unsigned char rowsBytes[4]; - fread(rowsBytes, sizeof(rowsBytes), 1, file); - uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3]; - printf("%" PRId32 "\n", rows); - - unsigned char colsBytes[4]; - fread(colsBytes, sizeof(colsBytes), 1, file); - uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3]; - printf("%" PRId32 "\n", cols); - - clm_Vector *images = calloc(length, sizeof(clm_Vector)); - for(unsigned int i = 0; i < length; i++) { - clm_Vector vec = clm_vectorCreate(cols * rows); - unsigned char image[cols * rows]; - fread(image, sizeof(image), 1, file); - for(unsigned int j = 0; j < cols * rows; j++) { - vec.values[j] = (float) image[j]; - } - images[i] = vec; - } - - *imagesOut = images; - *imageCountOut = length; -} - -typedef void *(*callocFunc)(size_t, size_t); - -callocFunc oldCalloc; - -int main() { - oldCalloc = dlsym(RTLD_NEXT, "calloc"); - - clm_Vector *labels = NULL; - unsigned int labelCount; - loadLabels(&labels, &labelCount); - printf("LENGTH: %u\n", labelCount); - - clm_Vector *images = NULL; - unsigned int imageCount; - loadImages(&images, &imageCount); - - imageCount = 60000; - - printf("%f\n", images[0].values[0]); - - srand(1); - - unsigned int - i = 
784, - h = 30, - o = 10; - - clm_Linear layer1 = clm_linearCreateRandom(i, h); - clm_Linear layer2 = clm_linearCreateRandom(h, o); - clm_Linear layers[] = {layer1, layer2}; - clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01}; - - for(unsigned int epoch = 0; epoch < 1; epoch++) { - printf("Epoch %u\n", epoch); - for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample - if(idx % 1000 == 0) { - printf("\r%.2f%%", idx / (float) imageCount * 100); - fflush(stdout); - } - // printf("%u\n", idx); - // train(nn, train_data_x[idx], 2, train_data_y[idx], 1); - /*for(unsigned int f = 0; f < images[idx].length; f++) { - printf("%.2f ", images[idx].values[f]); - } - printf("\n"); - for(unsigned int f = 0; f < labels[idx].length; f++) { - printf("%.2f ", labels[idx].values[f]); - } - printf("\n");*/ - // printf("%.2f\n", labels.values[idx]); - - train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length); - // train(nn, test, 784, target, 10); - // predict(nn, test, 784); - } - printf("\n"); - } - - printf("Train done\n"); - - unsigned int correct = 0; - for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample - // printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0], - // train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]); - float *pred = predict(nn, images[idx].values, images[idx].length); - unsigned int predDigit = 0; - float max = -1; - for(unsigned int j = 0; j < 10; j++) { - // printf("%.2f ", pred[j]); - if(pred[j] > max || max < 0) { - max = pred[j]; - predDigit = j; - } - } - if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max); - - unsigned int actDigit = 0; - float maxA = -1; - for(unsigned int j = 0; j < 10; j++) { - // printf("%.2f ", pred[j]); - if(labels[idx].values[j] > maxA || maxA < 0) { - maxA = labels[idx].values[j]; - actDigit = j; - } - } - if(idx < 100) printf("Actual: %u\n", actDigit); - // printf("\n"); - - if(predDigit == actDigit) correct++; - } - - 
printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
-
-	printf("\n");
-}
-
-void *calloc(size_t nmemb, size_t size) {
-	// printf("CALLOC\n");
-	return oldCalloc(nmemb, size);
-}
diff --git a/src/cltest.c b/src/cltest.c
index 99bfe75..bd1444e 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -1,7 +1,9 @@
+#include
 #include
 #include
 #include
 #include
+#include
 
 #include "clm.h"
 #include "clm_gpu.h"
@@ -105,7 +107,7 @@ void train(clm_NN nn, unsigned int numElements, clm_Vector *inputs, clm_Vector *
 void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
 	FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
 	if(!file) {
-		perror("Failed to open labels\n");
+		printf("Failed to open labels: %s\n", strerror(errno));
 		return;
 	}
 
@@ -141,7 +143,7 @@ void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
 void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
 	FILE *file = fopen("data/train-images.idx3-ubyte", "r");
 	if(!file) {
-		perror("Failed to open images\n");
+		printf("Failed to open images: %s\n", strerror(errno));
 		return;
 	}
 
@@ -180,8 +182,24 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
 	*imageCountOut = length;
 }
 
-int main() {
-	if(clm_gpuInit() != 0) {
+int main(int argc, const char *argv[]) {
+	if(argc < 2) {
+		printf("Usage: %s [cpu | opencl]\n", argv[0]);
+		return 1;
+	}
+
+	// The first command line argument selects the backend that clm_gpuInit loads
+	unsigned int mode = 0;
+	if(strcmp(argv[1], "cpu") == 0) {
+		mode = CLM_MODE_CPU;
+	} else if(strcmp(argv[1], "opencl") == 0) {
+		mode = CLM_MODE_OPENCL;
+	} else {
+		printf("Invalid mode\n");
+		return 1;
+	}
+
+	if(clm_gpuInit(mode) != 0) {
 		printf("Failed to init GPU\n");
 		return 1;
 	}
@@ -190,6 +208,11 @@ int main() {
-	unsigned int labelCount;
+	// loadLabels returns without writing the count when the file can't be opened, so start from 0
+	unsigned int labelCount = 0;
 	loadLabels(&labels, &labelCount);
 	printf("LENGTH: %u\n", labelCount);
+	if(labelCount == 0) {
+		printf("Loaded no labels\n");
+		return 1;
+	}
 
 	clm_Vector *images = NULL;
-	unsigned int imageCount;
+	unsigned int imageCount = 0;
@@ -209,7 +232,7 @@ int main() {
 	clm_Linear layers[] = {
 		clm_linearCreateRandom(i, h),
 		clm_linearCreateRandom(h, o)};
-	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, 1000);
+	clm_NN nn = clm_nnCreate(sizeof(layers) / sizeof(clm_Linear), layers, 0.01, imageCount / 2);
 
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
 		printf("Epoch %u\n", epoch);
diff --git a/src/test.cl b/src/test.cl
deleted file mode 100644
index 82c4361..0000000
--- a/src/test.cl
+++ /dev/null
@@ -1,5 +0,0 @@
-__kernel void do_stuff(__global float *input, __global float *output, unsigned int count) {
-	int i = get_global_id(0);
-	//printf("Task %d\n", i);
-	output[i] = input[i] * input[i];
-}