diff --git a/Makefile b/Makefile
index 7945949..61f49af 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ CFLAGS=-Wall -g
 .PHONY: all
 all:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
 
 .PHONY: run
 run: all
@@ -14,7 +14,7 @@ run: all
 .PHONY: cl
 cl:
 	mkdir -p $(BUILD)
-	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c
+	gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
 
 .PHONY: cl_run
 cl_run: cl
diff --git a/src/cl.c b/src/cl.c
index a2c98cb..987daa0 100644
--- a/src/cl.c
+++ b/src/cl.c
@@ -1,6 +1,6 @@
+#define CL_TARGET_OPENCL_VERSION 200
 #include <stdio.h>
 #include <CL/cl.h>
-#define CL_TARGET_OPENCL_VERSION 300
 
 #include "clm.h"
 
@@ -70,9 +70,9 @@ int main() {
 		return 1;
 	}
 
-	clm_Matrix a = clm_createMatrixRandom(3, 4);
-	clm_Matrix b = clm_createMatrixRandom(4, 5);
-	clm_Matrix out = clm_createMatrixRandom(3, 5);
+	clm_Matrix a = clm_createMatrixRandom(5, 10);
+	clm_Matrix b = clm_createMatrixRandom(10, 3);
+	clm_Matrix out = clm_createMatrixRandom(5, 3);
 
 	cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
 	cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
diff --git a/src/clm.c b/src/clm.c
index bdacf87..5221e53 100644
--- a/src/clm.c
+++ b/src/clm.c
@@ -158,10 +158,8 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) {
 }
 
 clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
-	for(unsigned int i = 0; i < mat.rows; i++) {
-		for(unsigned int j = 0; j < mat.cols; j++) {
-			matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j)));
-		}
+	for(unsigned int i = 0; i < mat.rows * mat.cols; i++) {
+		mat.values[i] = 1 / (1 + exp(-mat.values[i]));
 	}
 
 	return mat;
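A note on the src/cl.c hunk above: `CL_TARGET_OPENCL_VERSION` only takes effect when it is defined before `CL/cl.h` is included, which is presumably why the define moves from below the includes to line 1 (and drops from 300 to 200 to match the 2.0-era API in use). The required ordering, in short:

```c
#define CL_TARGET_OPENCL_VERSION 200 /* must come first...            */
#include <CL/cl.h>                   /* ...so the header can honor it */
```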
diff --git a/src/clm.h b/src/clm.h
index d452aa1..2897d52 100644
--- a/src/clm.h
+++ b/src/clm.h
@@ -5,6 +5,8 @@
 #define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 
+typedef struct clm_NativeBuf clm_NativeBuf;
+
 typedef struct {
 	float *values;
 	unsigned int rows;
@@ -23,6 +25,9 @@ typedef struct {
 	clm_Matrix output;
 	clm_Matrix error;
 	clm_Matrix weightsError;
+	clm_NativeBuf *nativeWeights;
+	clm_NativeBuf *nativeBias;
+	clm_NativeBuf *nativeOutput;
 } clm_Linear;
 
 typedef struct {
diff --git a/src/clm_gpu.h b/src/clm_gpu.h
new file mode 100644
index 0000000..2548383
--- /dev/null
+++ b/src/clm_gpu.h
@@ -0,0 +1,11 @@
+#ifndef _CLM_GPU_H_
+#define _CLM_GPU_H_
+
+#include "clm.h"
+
+int clm_gpuInit();
+void clm_gpuDestroy();
+
+void clm_linearForward(clm_Linear *linear, clm_Matrix input);
+
+#endif
diff --git a/src/clm_gpu_cpu.c b/src/clm_gpu_cpu.c
new file mode 100644
index 0000000..7ab5caf
--- /dev/null
+++ b/src/clm_gpu_cpu.c
@@ -0,0 +1,24 @@
+#include "clm_gpu.h"
+
+#include <stdio.h>
+
+struct clm_NativeBuf {};
+
+int clm_gpuInit() {
+	return 0;
+}
+
+void clm_gpuDestroy() {}
+
+void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
+	clm_Matrix newX = clm_matrixMultiplyMatrix(linear->weights, input, linear->output);
+
+	if(clm_matrixIsInvalid(newX)) {
+		printf("Forward pass failed\n");
+		return;
+	}
+
+	clm_matrixAddMatrix(newX, linear->bias);
+	clm_matrixSigmoid(newX);
+	linear->output = newX;
+}
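clm_gpu.h puts the math backend behind a three-function interface: clm_gpu_cpu.c above is the plain-C reference implementation, clm_gpu_opencl.c below is the GPU one, and the Makefile picks a backend by linking exactly one of the two. One gap in the OpenCL file that follows: when clBuildProgram fails it prints only "Failed to build program" and leaves a `// clGetProgramBuildInfo...` placeholder. A minimal sketch of what that placeholder could become, using the standard size-then-fill pattern on the `program` and `deviceID` variables already in scope there:

```c
// On CL_BUILD_PROGRAM_FAILURE, fetch and print the compiler's build log.
size_t logSize;
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
char *log = malloc(logSize);
clGetProgramBuildInfo(program, deviceID, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
printf("Build log:\n%s\n", log);
free(log);
```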
diff --git a/src/clm_gpu_opencl.c b/src/clm_gpu_opencl.c
new file mode 100644
index 0000000..fd64052
--- /dev/null
+++ b/src/clm_gpu_opencl.c
@@ -0,0 +1,192 @@
+#include "clm_gpu.h"
+#include <stdio.h>
+
+#define CL_TARGET_OPENCL_VERSION 200
+#include <CL/cl.h>
+
+#include <math.h>
+
+#include <stdbool.h>
+#include <stdlib.h>
+
+static cl_context context;
+static cl_device_id deviceID;
+static cl_command_queue queue;
+static cl_kernel kernel;
+
+struct clm_NativeBuf {
+	cl_mem mem;
+};
+
+typedef struct __attribute__((packed)) {
+	cl_uint rows;
+	cl_uint cols;
+	cl_char transposed;
+} cl_GPUMat;
+
+#define gpuMat(mat) \
+	{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
+
+char *loadFile(const char *path) {
+	FILE *file = fopen(path, "r");
+	fseek(file, 0, SEEK_END);
+	size_t length = ftell(file);
+	fseek(file, 0, SEEK_SET);
+	char *buffer = calloc(1, length + 1);
+	fread(buffer, length, 1, file);
+	return buffer;
+}
+
+int clm_gpuInit() {
+	// Connect to a compute device
+	int useGPU = true;
+	cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Error: Failed to create a device group!\n");
+		return 1;
+	}
+
+	char *buffer = loadFile("src/mat.cl");
+	printf("%s", buffer);
+
+	context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
+	if(!context) {
+		printf("Failed to create context\n");
+		return 1;
+	}
+
+	queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
+	if(!queue) {
+		printf("Failed to create command queue\n");
+		return 1;
+	}
+
+	size_t length = strlen(buffer);
+	cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
+	if(!program) {
+		printf("Failed to create program\n");
+		return 1;
+	}
+
+	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to build program\n");
+		// clGetProgramBuildInfo...
+		return 1;
+	}
+
+	kernel = clCreateKernel(program, "linear_forward", &err);
+	if(!kernel) {
+		printf("Failed to create kernel\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+void clm_gpuDestroy() {
+}
+
+static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
+	cl_int err;
+
+	if(!nativeBuf->mem) {
+		cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mat.rows * mat.cols, NULL, &err);
+		if(!mat_values) {
+			printf("Failed to alloc buffer: %d\n", err);
+			return NULL;
+		}
+		nativeBuf->mem = mat_values;
+	}
+
+	return nativeBuf->mem;
+}
+
+static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
+	cl_int err;
+	cl_mem mem = allocGPUMat(gpuMat, nativeBuf);
+
+	err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to enqueue write: %d\n", err);
+		return NULL;
+	}
+
+	return mem;
+}
+
+clm_NativeBuf nativeInput;
+
+// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
+void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
+	if(!linear->nativeWeights) {
+		linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
+		linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
+	}
+
+	cl_GPUMat matInput = gpuMat(input);
+	cl_GPUMat matWeights = gpuMat(linear->weights);
+	cl_GPUMat matBias = gpuMat(linear->bias);
+	cl_GPUMat matOut = gpuMat(linear->output);
+
+	size_t workSize = matOut.rows * matOut.cols;
+
+	cl_int err;
+
+	cl_mem matInput_values = writeGPUMat(matInput, input, &nativeInput);
+	cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
+	cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
+	if(!matInput_values || !matWeights_values || !matBias_values) {
+		linear->output = INVALID_MATRIX;
+		return;
+	}
+
+	cl_mem matOut_values = allocGPUMat(matOut, linear->nativeOutput);
+	if(!matOut_values) {
+		printf("Failed to alloc out\n");
+		linear->output = INVALID_MATRIX;
+		return;
+	}
+
+	err = 0;
+	err |= clSetKernelArg(kernel, 0, sizeof(matInput), &matInput);
+	err |= clSetKernelArg(kernel, 1, sizeof(matInput_values), &matInput_values);
+	err |= clSetKernelArg(kernel, 2, sizeof(matWeights), &matWeights);
+	err |= clSetKernelArg(kernel, 3, sizeof(matWeights_values), &matWeights_values);
+	err |= clSetKernelArg(kernel, 4, sizeof(matBias), &matBias);
+	err |= clSetKernelArg(kernel, 5, sizeof(matBias_values), &matBias_values);
+	err |= clSetKernelArg(kernel, 6, sizeof(matOut), &matOut);
+	err |= clSetKernelArg(kernel, 7, sizeof(matOut_values), &matOut_values);
+	if(err != CL_SUCCESS) {
+		printf("Failed to set kernel args: %d\n", err);
+		return;
+	}
+
+	/*char *info = calloc(1, 1024);
+	clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
+	printf("INFO: %s\n", info);*/
+
+	size_t local;
+	err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to get work group size\n");
+		return;
+	}
+
+	size_t global = ceil((float) workSize / local) * local;
+	err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to enqueue: %d\n", err);
+		return;
+	}
+
+	clFinish(queue);
+
+	err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output.values, 0, NULL, NULL);
+	if(err != CL_SUCCESS) {
+		printf("Failed to read from buffer\n");
+		return;
+	}
+}
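The launch logic above rounds the logical work size up to a whole number of work-groups, which is why every kernel in mat.cl starts with an `if(idx >= rows * cols) return;` guard: the padding work-items must do nothing. A worked example with this project's 30-unit hidden layer, assuming clGetKernelWorkGroupInfo reports a work-group size of 256 (the real value is device-dependent):

```c
// workSize = matOut.rows * matOut.cols = 30 * 1 = 30
// global   = ceil(30 / 256.0) * 256    = 1 * 256  = 256
// => 256 work-items run; items 30..255 fail the bounds check and exit early
size_t global = ceil((float) workSize / local) * local;
```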
diff --git a/src/cltest.bak2.c b/src/cltest.bak2.c
new file mode 100644
index 0000000..4ce80c6
--- /dev/null
+++ b/src/cltest.bak2.c
@@ -0,0 +1,264 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <inttypes.h>
+
+#include "clm.h"
+
+float train_data_x[4][2] = {
+	{0, 0},
+	{0, 1},
+	{1, 0},
+	{1, 1}};
+
+float train_data_y[4][1] = {
+	{0},
+	{1},
+	{1},
+	{0}};
+
+float *predict(clm_NN nn, float *x, unsigned int length) {
+	clm_Matrix xM = clm_matrixWrapArray(x, length);
+
+	for(unsigned int i = 0; i < nn.numLayers; i++) {
+		clm_Linear layer = nn.layers[i];
+		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
+
+		if(clm_matrixIsInvalid(newX)) {
+			printf("Failed to predict\n");
+			return NULL;
+		}
+
+		clm_matrixAddMatrix(newX, layer.bias);
+		clm_matrixSigmoid(newX);
+		xM = newX;
+	}
+
+	return xM.values;
+}
+
+void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
+	clm_Matrix xM = clm_matrixWrapArray(x, xL);
+	clm_Matrix yM = clm_matrixWrapArray(y, yL);
+
+	// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
+
+	// Forward pass
+	clm_Matrix currentX = xM;
+	for(unsigned int i = 0; i < nn.numLayers; i++) {
+		clm_Linear layer = nn.layers[i];
+		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
+		if(clm_matrixIsInvalid(newX)) {
+			printf("Forward pass failed\n");
+			return;
+		}
+
+		clm_matrixAddMatrix(newX, layer.bias);
+		clm_matrixSigmoid(newX);
+		currentX = newX;
+	}
+
+	for(int i = nn.numLayers - 1; i >= 0; i--) {
+		clm_Linear layer = nn.layers[i];
+		clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
+		clm_Matrix outputOfThisLayer = nn.layers[i].output;
+		clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
+		clm_Matrix error = layer.error;
+
+		if(i == nn.numLayers - 1) {
+			clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
+		} else {
+			clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
+			clm_matrixMultiplyMatrix(weightsT, prevError, error);
+		}
+
+		clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
+		clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
+		clm_matrixMultiplyScalar(gradient, nn.learnRate);
+
+		clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
+		clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
+
+		clm_matrixAddMatrix(layer.weights, layer.weightsError);
+		clm_matrixAddMatrix(layer.bias, gradient);
+	}
+}
+
+void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
+	FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
+	if(!file) {
+		perror("Failed to open labels\n");
+		return;
+	}
+
+	unsigned char magicBytes[4];
+	fread(magicBytes, sizeof(magicBytes), 1, file);
+	printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
+
+	unsigned char lengthBytes[4];
+	fread(lengthBytes, sizeof(lengthBytes), 1, file);
+	uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
+					  (lengthBytes[2] << 8) | lengthBytes[3];
+	printf("%" PRId32 "\n", length);
+
+	clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
+
+	for(unsigned int i = 0; i < length; i++) {
+		unsigned char label;
+		fread(&label, sizeof(unsigned char), 1, file);
+
+		clm_Vector vector = clm_vectorCreate(10);
+		for(unsigned int j = 0; j < 10; j++) {
+			vector.values[j] = label == j ? 1 : 0;
+		}
+		vectors[i] = vector;
+	}
+
+	*labelsOut = vectors;
+	*labelsCountOut = length;
+}
+
+void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
+	FILE *file = fopen("data/train-images.idx3-ubyte", "r");
+	if(!file) {
+		perror("Failed to open images\n");
+		return;
+	}
+
+	unsigned char magicBytes[4];
+	fread(magicBytes, sizeof(magicBytes), 1, file);
+	printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
+
+	unsigned char lengthBytes[4];
+	fread(lengthBytes, sizeof(lengthBytes), 1, file);
+	uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3];
+	printf("%" PRId32 "\n", length);
+
+	unsigned char rowsBytes[4];
+	fread(rowsBytes, sizeof(rowsBytes), 1, file);
+	uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3];
+	printf("%" PRId32 "\n", rows);
+
+	unsigned char colsBytes[4];
+	fread(colsBytes, sizeof(colsBytes), 1, file);
+	uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3];
+	printf("%" PRId32 "\n", cols);
+
+	clm_Vector *images = calloc(length, sizeof(clm_Vector));
+	for(unsigned int i = 0; i < length; i++) {
+		clm_Vector vec = clm_vectorCreate(cols * rows);
+		unsigned char image[cols * rows];
+		fread(image, sizeof(image), 1, file);
+		for(unsigned int j = 0; j < cols * rows; j++) {
+			vec.values[j] = (float) image[j];
+		}
+		images[i] = vec;
+	}
+
+	*imagesOut = images;
+	*imageCountOut = length;
+}
+
+typedef void *(*callocFunc)(size_t, size_t);
+
+callocFunc oldCalloc;
+
+int main() {
+	oldCalloc = dlsym(RTLD_NEXT, "calloc");
+
+	clm_Vector *labels = NULL;
+	unsigned int labelCount;
+	loadLabels(&labels, &labelCount);
+	printf("LENGTH: %u\n", labelCount);
+
+	clm_Vector *images = NULL;
+	unsigned int imageCount;
+	loadImages(&images, &imageCount);
+
+	imageCount = 60000;
+
+	printf("%f\n", images[0].values[0]);
+
+	srand(1);
+
+	unsigned int
+		i = 784,
+		h = 30,
+		o = 10;
+
+	clm_Linear layer1 = clm_linearCreateRandom(i, h);
+	clm_Linear layer2 = clm_linearCreateRandom(h, o);
+	clm_Linear layers[] = {layer1, layer2};
+	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
+
+	for(unsigned int epoch = 0; epoch < 1; epoch++) {
+		printf("Epoch %u\n", epoch);
+		for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
+			if(idx % 1000 == 0) {
+				printf("\r%.2f%%", idx / (float) imageCount * 100);
+				fflush(stdout);
+			}
+			// printf("%u\n", idx);
+			// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
+			/*for(unsigned int f = 0; f < images[idx].length; f++) {
+				printf("%.2f ", images[idx].values[f]);
+			}
+			printf("\n");
+			for(unsigned int f = 0; f < labels[idx].length; f++) {
+				printf("%.2f ", labels[idx].values[f]);
+			}
+			printf("\n");*/
+			// printf("%.2f\n", labels.values[idx]);
+
+			train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
+			// train(nn, test, 784, target, 10);
+			// predict(nn, test, 784);
+		}
+		printf("\n");
+	}
+
+	printf("Train done\n");
+
+	unsigned int correct = 0;
+	for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
+		// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
+		// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
+		float *pred = predict(nn, images[idx].values, images[idx].length);
+		unsigned int predDigit = 0;
+		float max = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(pred[j] > max || max < 0) {
+				max = pred[j];
+				predDigit = j;
+			}
+		}
+		if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
+
+		unsigned int actDigit = 0;
+		float maxA = -1;
+		for(unsigned int j = 0; j < 10; j++) {
+			// printf("%.2f ", pred[j]);
+			if(labels[idx].values[j] > maxA || maxA < 0) {
+				maxA = labels[idx].values[j];
+				actDigit = j;
+			}
+		}
+		if(idx < 100) printf("Actual: %u\n", actDigit);
+		// printf("\n");
+
+		if(predDigit == actDigit) correct++;
+	}
+
+	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
+
+	printf("\n");
+}
+
+void *calloc(size_t nmemb, size_t size) {
+	// printf("CALLOC\n");
+	return oldCalloc(nmemb, size);
+}
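src/cltest.bak2.c is a verbatim snapshot of cltest.c before the refactor that follows; the one piece of machinery the new version drops entirely is the calloc interposer resolved through dlsym(RTLD_NEXT, "calloc"), presumably left over from allocation debugging. For reference, a minimal self-contained version of that idiom, assuming glibc (RTLD_NEXT is only exposed under _GNU_SOURCE, and glibc before 2.34 also needs -ldl at link time):

```c
#define _GNU_SOURCE // exposes RTLD_NEXT in <dlfcn.h> on glibc
#include <dlfcn.h>
#include <stdlib.h>

typedef void *(*callocFunc)(size_t, size_t);
static callocFunc oldCalloc;

// This definition shadows libc's calloc for the whole program; oldCalloc
// must already be resolved before it runs, since dlsym itself may allocate.
void *calloc(size_t nmemb, size_t size) {
	return oldCalloc(nmemb, size);
}

int main(void) {
	oldCalloc = (callocFunc) dlsym(RTLD_NEXT, "calloc");
	void *p = calloc(4, 8); // routed through the interposer above
	free(p);
	return 0;
}
```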
diff --git a/src/cltest.c b/src/cltest.c
index 4ce80c6..69eccb9 100644
--- a/src/cltest.c
+++ b/src/cltest.c
@@ -4,6 +4,7 @@
 #include <inttypes.h>
 
 #include "clm.h"
+#include "clm_gpu.h"
 
 float train_data_x[4][2] = {
 	{0, 0},
@@ -21,17 +22,8 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
 	clm_Matrix xM = clm_matrixWrapArray(x, length);
 
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_Linear layer = nn.layers[i];
-		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
-
-		if(clm_matrixIsInvalid(newX)) {
-			printf("Failed to predict\n");
-			return NULL;
-		}
-
-		clm_matrixAddMatrix(newX, layer.bias);
-		clm_matrixSigmoid(newX);
-		xM = newX;
+		clm_linearForward(&nn.layers[i], xM);
+		xM = nn.layers[i].output;
 	}
 
 	return xM.values;
@@ -46,16 +38,8 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
 	// Forward pass
 	clm_Matrix currentX = xM;
 	for(unsigned int i = 0; i < nn.numLayers; i++) {
-		clm_Linear layer = nn.layers[i];
-		clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
-		if(clm_matrixIsInvalid(newX)) {
-			printf("Forward pass failed\n");
-			return;
-		}
-
-		clm_matrixAddMatrix(newX, layer.bias);
-		clm_matrixSigmoid(newX);
-		currentX = newX;
+		clm_linearForward(&nn.layers[i], currentX);
+		currentX = nn.layers[i].output;
 	}
 
 	for(int i = nn.numLayers - 1; i >= 0; i--) {
@@ -162,12 +146,11 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
 	*imageCountOut = length;
 }
 
-typedef void *(*callocFunc)(size_t, size_t);
-
-callocFunc oldCalloc;
-
 int main() {
-	oldCalloc = dlsym(RTLD_NEXT, "calloc");
+	if(clm_gpuInit() != 0) {
+		printf("Failed to init GPU\n");
+		return 1;
+	}
 
 	clm_Vector *labels = NULL;
 	unsigned int labelCount;
@@ -189,9 +172,9 @@ int main() {
 		h = 30,
 		o = 10;
 
-	clm_Linear layer1 = clm_linearCreateRandom(i, h);
-	clm_Linear layer2 = clm_linearCreateRandom(h, o);
-	clm_Linear layers[] = {layer1, layer2};
+	clm_Linear layers[] = {
+		clm_linearCreateRandom(i, h),
+		clm_linearCreateRandom(h, o)};
 	clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
 
 	for(unsigned int epoch = 0; epoch < 1; epoch++) {
@@ -201,21 +184,8 @@ int main() {
 			printf("\r%.2f%%", idx / (float) imageCount * 100);
 			fflush(stdout);
 		}
-		// printf("%u\n", idx);
-		// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
-		/*for(unsigned int f = 0; f < images[idx].length; f++) {
-			printf("%.2f ", images[idx].values[f]);
-		}
-		printf("\n");
-		for(unsigned int f = 0; f < labels[idx].length; f++) {
-			printf("%.2f ", labels[idx].values[f]);
-		}
-		printf("\n");*/
-		// printf("%.2f\n", labels.values[idx]);
 
 		train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
-		// train(nn, test, 784, target, 10);
-		// predict(nn, test, 784);
 	}
 	printf("\n");
 }
@@ -256,9 +226,6 @@ int main() {
 	printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
 
 	printf("\n");
-}
 
-void *calloc(size_t nmemb, size_t size) {
-	// printf("CALLOC\n");
-	return oldCalloc(nmemb, size);
+	clm_gpuDestroy();
 }
diff --git a/src/mat.cl b/src/mat.cl
index f9d936e..0fa9eee 100644
--- a/src/mat.cl
+++ b/src/mat.cl
@@ -4,30 +4,44 @@ typedef struct __attribute__((packed)) {
 	char transposed;
 } cl_GPUMat;
 
-__kernel void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
-	/*if(a.cols != b.rows) {
-		printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
-		return INVALID_MATRIX;
-	}
-
-	if(out.rows != a.rows || out.cols != b.cols) {
-		printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
-		return INVALID_MATRIX;
-	}*/
+#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
 
+void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
 	uint idx = get_global_id(0);
 	if(idx >= matOut.rows * matOut.cols) return;
 
 	uint i = idx / matOut.cols;
 	uint j = idx % matOut.cols;
 
-	// for(unsigned int i = 0; i < out.rows; i++) {
-	//	for(unsigned int j = 0; j < out.cols; j++) {
 	float sum = 0;
 	for(unsigned int k = 0; k < matA.cols; k++) {
-		sum += matA_values[i * matA.cols + k] * matB_values[k * matB.cols + j];
+		sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
 	}
-	matOut_values[i * matOut.cols + j] = sum;
-	//}
-	//}
+	matrixAt(matOut, matOut_values, i, j) = sum;
+}
+
+void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
+	uint idx = get_global_id(0);
+	if(idx >= matA.rows * matA.cols) return;
+
+	matA_values[idx] += matB_values[idx];
+}
+
+void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
+	uint idx = get_global_id(0);
+	if(idx >= mat.rows * mat.cols) return;
+
+	mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
+}
+
+// clm_Matrix input;
+// clm_Matrix weights;
+// clm_Matrix bias;
+// clm_Matrix output;
+
+__kernel void linear_forward(cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
+	// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
+	mat_multiply(weights, weights_values, input, input_values, out, out_values);
+	mat_add(out, out_values, bias, bias_values);
+	mat_sigmoid(out, out_values);
 }
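Both clm.h and mat.cl now share the same convention: a transposed matrix is a view, and matrixAt swaps the stride instead of moving data, so element (r, c) of a transposed matrix lives at index c * rows + r. A tiny standalone check of that mapping (Mat is a stand-in for clm_Matrix with the same fields):

```c
#include <stdio.h>

typedef struct {
	float *values;
	unsigned int rows;
	unsigned int cols;
	char transposed;
} Mat; // stand-in for clm_Matrix

#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]

int main(void) {
	float data[6] = {0, 1, 2, 3, 4, 5}; // 2x3 matrix A, stored row-major
	Mat a = {data, 2, 3, 0};
	Mat aT = {data, 3, 2, 1}; // A^T as a 3x2 view over the same buffer
	printf("%.0f\n", matrixAt(a, 1, 2));  // A(1,2)   -> data[1*3+2] = 5
	printf("%.0f\n", matrixAt(aT, 2, 1)); // A^T(2,1) -> data[1*3+2] = 5
	return 0;
}
```

This also makes the FIXME in linear_forward concrete: for a transposed `out`, the element mat_multiply writes through matrixAt(matOut, ..., i, j) is generally not element `idx`, while mat_add and mat_sigmoid index `out_values[idx]` directly, so the same work-item would touch different elements across the three stages with no synchronization in between.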