diff --git a/Makefile b/Makefile index 15c72a4..7945949 100644 --- a/Makefile +++ b/Makefile @@ -14,7 +14,7 @@ run: all .PHONY: cl cl: mkdir -p $(BUILD) - gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c + gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c .PHONY: cl_run cl_run: cl diff --git a/src/cl.c b/src/cl.c index 925c9ba..a2c98cb 100644 --- a/src/cl.c +++ b/src/cl.c @@ -1,12 +1,18 @@ +#include +#include #define CL_TARGET_OPENCL_VERSION 300 +#include "clm.h" + #include +#include #include #include -#include - -typedef struct __attribute__ ((packed)) { +typedef struct __attribute__((packed)) { + cl_uint rows; + cl_uint cols; + cl_char transposed; } cl_GPUMat; char *loadFile(const char *path) { @@ -24,7 +30,7 @@ int main() { int useGPU = true; cl_device_id deviceID; cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL); - if (err != CL_SUCCESS) { + if(err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return 1; } @@ -58,35 +64,50 @@ int main() { return 1; } - cl_kernel kernel = clCreateKernel(program, "do_stuff", &err); + cl_kernel kernel = clCreateKernel(program, "mat_multiply", &err); if(!kernel) { printf("Failed to create kernel\n"); return 1; } - unsigned int inputSize = 256000000; - float *inputData = calloc(inputSize, sizeof(float)); - for(unsigned int i = 0; i < inputSize; i++) { - inputData[i] = i; - } + clm_Matrix a = clm_createMatrixRandom(3, 4); + clm_Matrix b = clm_createMatrixRandom(4, 5); + clm_Matrix out = clm_createMatrixRandom(3, 5); - cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err); - cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err); - if(!input || !output) { + cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed}; + cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed}; + cl_GPUMat matOut = {.rows = out.rows, .cols = out.cols, .transposed = out.transposed}; + + size_t inputSize = out.rows * out.cols; + + clm_matrixMultiplyMatrix(a, b, out); + + clm_matrixPrint(out); + clm_matrixZero(out); + + cl_mem matA_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * a.rows * a.cols, NULL, &err); + cl_mem matB_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * b.rows * b.cols, NULL, &err); + cl_mem matOut_values = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * out.rows * out.cols, NULL, &err); + if(!matA_values || !matB_values || !matOut_values) { printf("Failed to allocate input/output buffer\n"); return 1; } - err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL); + err = clEnqueueWriteBuffer(queue, matA_values, CL_TRUE, 0, sizeof(float) * a.rows * a.cols, a.values, 0, NULL, NULL); + err |= clEnqueueWriteBuffer(queue, matB_values, CL_TRUE, 0, sizeof(float) * b.rows * b.cols, b.values, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Failed to write to buffer\n"); return 1; } err = 0; - err = clSetKernelArg(kernel, 0, sizeof(input), &input); - err |= clSetKernelArg(kernel, 1, sizeof(output), &output); - err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize); + err |= clSetKernelArg(kernel, 0, sizeof(matA), &matA); + err |= clSetKernelArg(kernel, 1, sizeof(matA_values), &matA_values); + err |= clSetKernelArg(kernel, 2, sizeof(matB), &matB); + err |= clSetKernelArg(kernel, 3, sizeof(matB_values), &matB_values); + err |= clSetKernelArg(kernel, 4, sizeof(matOut), &matOut); + err |= clSetKernelArg(kernel, 5, sizeof(matOut_values), &matOut_values); + // err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize); if(err != CL_SUCCESS) { printf("Failed to set kernel args\n"); return 1; @@ -105,24 +126,25 @@ int main() { printf("Group size is %zu\n", local); - size_t global = inputSize; + size_t global = ceil((float) inputSize / local) * local; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if(err != CL_SUCCESS) { - printf("Failed to enqueue\n"); + printf("Failed to enqueue: %d\n", err); return 1; } clFinish(queue); - float *outputData = calloc(inputSize, sizeof(float)); - err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL); + err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * inputSize, out.values, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("Failed to read from buffer\n"); return 1; } - for(unsigned int i = 0; i < inputSize; i++) { + clm_matrixPrint(out); + + /*for(unsigned int i = 0; i < inputSize; i++) { if(i % 1000 != 0) continue; printf("%f: %f\n", inputData[i], outputData[i]); - } + }*/ } diff --git a/src/clm.c b/src/clm.c index 851d7fd..bdacf87 100644 --- a/src/clm.c +++ b/src/clm.c @@ -121,9 +121,9 @@ clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out) for(unsigned int j = 0; j < out.cols; j++) { float sum = 0; for(unsigned int k = 0; k < a.cols; k++) { - sum += a.values[i * a.cols + k] * b.values[k * b.cols + j]; + sum += matrixAt(a, i, k) * matrixAt(b, k, j); } - out.values[i * out.cols + j] = sum; + matrixAt(out, i, j) = sum; } } diff --git a/src/mat.cl b/src/mat.cl index babdf5b..f9d936e 100644 --- a/src/mat.cl +++ b/src/mat.cl @@ -1,11 +1,33 @@ -typedef struct __attribute__ ((packed)) { - +typedef struct __attribute__((packed)) { + uint rows; + uint cols; + char transposed; } cl_GPUMat; -void amogus() { - printf("HEllo world!"); -} +__kernel void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) { + /*if(a.cols != b.rows) { + printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols); + return INVALID_MATRIX; + } -__kernel void mat_multiply(__global float *nnWeights, __global float *inputs, __global float *c) { - amogus(); + if(out.rows != a.rows || out.cols != b.cols) { + printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols); + return INVALID_MATRIX; + }*/ + + uint idx = get_global_id(0); + if(idx >= matOut.rows * matOut.cols) return; + + uint i = idx / matOut.cols; + uint j = idx % matOut.cols; + + // for(unsigned int i = 0; i < out.rows; i++) { + // for(unsigned int j = 0; j < out.cols; j++) { + float sum = 0; + for(unsigned int k = 0; k < matA.cols; k++) { + sum += matA_values[i * matA.cols + k] * matB_values[k * matB.cols + j]; + } + matOut_values[i * matOut.cols + j] = sum; + //} + //} }