Basic GPU math
This commit is contained in:
parent
b6d62e9ad8
commit
c91ca11346
2
Makefile
2
Makefile
@ -14,7 +14,7 @@ run: all
|
|||||||
.PHONY: cl
|
.PHONY: cl
|
||||||
cl:
|
cl:
|
||||||
mkdir -p $(BUILD)
|
mkdir -p $(BUILD)
|
||||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c
|
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c
|
||||||
|
|
||||||
.PHONY: cl_run
|
.PHONY: cl_run
|
||||||
cl_run: cl
|
cl_run: cl
|
||||||
|
68
src/cl.c
68
src/cl.c
@ -1,12 +1,18 @@
|
|||||||
|
#include <CL/cl_platform.h>
|
||||||
|
#include <math.h>
|
||||||
#define CL_TARGET_OPENCL_VERSION 300
|
#define CL_TARGET_OPENCL_VERSION 300
|
||||||
|
|
||||||
|
#include "clm.h"
|
||||||
|
|
||||||
#include <CL/cl.h>
|
#include <CL/cl.h>
|
||||||
|
#include <stdbool.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <stdbool.h>
|
|
||||||
|
|
||||||
typedef struct __attribute__ ((packed)) {
|
|
||||||
|
|
||||||
|
typedef struct __attribute__((packed)) {
|
||||||
|
cl_uint rows;
|
||||||
|
cl_uint cols;
|
||||||
|
cl_char transposed;
|
||||||
} cl_GPUMat;
|
} cl_GPUMat;
|
||||||
|
|
||||||
char *loadFile(const char *path) {
|
char *loadFile(const char *path) {
|
||||||
@ -24,7 +30,7 @@ int main() {
|
|||||||
int useGPU = true;
|
int useGPU = true;
|
||||||
cl_device_id deviceID;
|
cl_device_id deviceID;
|
||||||
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
|
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
|
||||||
if (err != CL_SUCCESS) {
|
if(err != CL_SUCCESS) {
|
||||||
printf("Error: Failed to create a device group!\n");
|
printf("Error: Failed to create a device group!\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -58,35 +64,50 @@ int main() {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
cl_kernel kernel = clCreateKernel(program, "do_stuff", &err);
|
cl_kernel kernel = clCreateKernel(program, "mat_multiply", &err);
|
||||||
if(!kernel) {
|
if(!kernel) {
|
||||||
printf("Failed to create kernel\n");
|
printf("Failed to create kernel\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int inputSize = 256000000;
|
clm_Matrix a = clm_createMatrixRandom(3, 4);
|
||||||
float *inputData = calloc(inputSize, sizeof(float));
|
clm_Matrix b = clm_createMatrixRandom(4, 5);
|
||||||
for(unsigned int i = 0; i < inputSize; i++) {
|
clm_Matrix out = clm_createMatrixRandom(3, 5);
|
||||||
inputData[i] = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * inputSize, NULL, &err);
|
cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
|
||||||
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * inputSize, NULL, &err);
|
cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
|
||||||
if(!input || !output) {
|
cl_GPUMat matOut = {.rows = out.rows, .cols = out.cols, .transposed = out.transposed};
|
||||||
|
|
||||||
|
size_t inputSize = out.rows * out.cols;
|
||||||
|
|
||||||
|
clm_matrixMultiplyMatrix(a, b, out);
|
||||||
|
|
||||||
|
clm_matrixPrint(out);
|
||||||
|
clm_matrixZero(out);
|
||||||
|
|
||||||
|
cl_mem matA_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * a.rows * a.cols, NULL, &err);
|
||||||
|
cl_mem matB_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * b.rows * b.cols, NULL, &err);
|
||||||
|
cl_mem matOut_values = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * out.rows * out.cols, NULL, &err);
|
||||||
|
if(!matA_values || !matB_values || !matOut_values) {
|
||||||
printf("Failed to allocate input/output buffer\n");
|
printf("Failed to allocate input/output buffer\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, sizeof(float) * inputSize, inputData, 0, NULL, NULL);
|
err = clEnqueueWriteBuffer(queue, matA_values, CL_TRUE, 0, sizeof(float) * a.rows * a.cols, a.values, 0, NULL, NULL);
|
||||||
|
err |= clEnqueueWriteBuffer(queue, matB_values, CL_TRUE, 0, sizeof(float) * b.rows * b.cols, b.values, 0, NULL, NULL);
|
||||||
if(err != CL_SUCCESS) {
|
if(err != CL_SUCCESS) {
|
||||||
printf("Failed to write to buffer\n");
|
printf("Failed to write to buffer\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
err = 0;
|
err = 0;
|
||||||
err = clSetKernelArg(kernel, 0, sizeof(input), &input);
|
err |= clSetKernelArg(kernel, 0, sizeof(matA), &matA);
|
||||||
err |= clSetKernelArg(kernel, 1, sizeof(output), &output);
|
err |= clSetKernelArg(kernel, 1, sizeof(matA_values), &matA_values);
|
||||||
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
|
err |= clSetKernelArg(kernel, 2, sizeof(matB), &matB);
|
||||||
|
err |= clSetKernelArg(kernel, 3, sizeof(matB_values), &matB_values);
|
||||||
|
err |= clSetKernelArg(kernel, 4, sizeof(matOut), &matOut);
|
||||||
|
err |= clSetKernelArg(kernel, 5, sizeof(matOut_values), &matOut_values);
|
||||||
|
// err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &inputSize);
|
||||||
if(err != CL_SUCCESS) {
|
if(err != CL_SUCCESS) {
|
||||||
printf("Failed to set kernel args\n");
|
printf("Failed to set kernel args\n");
|
||||||
return 1;
|
return 1;
|
||||||
@ -105,24 +126,25 @@ int main() {
|
|||||||
|
|
||||||
printf("Group size is %zu\n", local);
|
printf("Group size is %zu\n", local);
|
||||||
|
|
||||||
size_t global = inputSize;
|
size_t global = ceil((float) inputSize / local) * local;
|
||||||
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
||||||
if(err != CL_SUCCESS) {
|
if(err != CL_SUCCESS) {
|
||||||
printf("Failed to enqueue\n");
|
printf("Failed to enqueue: %d\n", err);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
clFinish(queue);
|
clFinish(queue);
|
||||||
|
|
||||||
float *outputData = calloc(inputSize, sizeof(float));
|
err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * inputSize, out.values, 0, NULL, NULL);
|
||||||
err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(float) * inputSize, outputData, 0, NULL, NULL);
|
|
||||||
if(err != CL_SUCCESS) {
|
if(err != CL_SUCCESS) {
|
||||||
printf("Failed to read from buffer\n");
|
printf("Failed to read from buffer\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int i = 0; i < inputSize; i++) {
|
clm_matrixPrint(out);
|
||||||
|
|
||||||
|
/*for(unsigned int i = 0; i < inputSize; i++) {
|
||||||
if(i % 1000 != 0) continue;
|
if(i % 1000 != 0) continue;
|
||||||
printf("%f: %f\n", inputData[i], outputData[i]);
|
printf("%f: %f\n", inputData[i], outputData[i]);
|
||||||
}
|
}*/
|
||||||
}
|
}
|
||||||
|
@ -121,9 +121,9 @@ clm_Matrix clm_matrixMultiplyMatrix(clm_Matrix a, clm_Matrix b, clm_Matrix out)
|
|||||||
for(unsigned int j = 0; j < out.cols; j++) {
|
for(unsigned int j = 0; j < out.cols; j++) {
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
for(unsigned int k = 0; k < a.cols; k++) {
|
for(unsigned int k = 0; k < a.cols; k++) {
|
||||||
sum += a.values[i * a.cols + k] * b.values[k * b.cols + j];
|
sum += matrixAt(a, i, k) * matrixAt(b, k, j);
|
||||||
}
|
}
|
||||||
out.values[i * out.cols + j] = sum;
|
matrixAt(out, i, j) = sum;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
36
src/mat.cl
36
src/mat.cl
@ -1,11 +1,33 @@
|
|||||||
typedef struct __attribute__ ((packed)) {
|
typedef struct __attribute__((packed)) {
|
||||||
|
uint rows;
|
||||||
|
uint cols;
|
||||||
|
char transposed;
|
||||||
} cl_GPUMat;
|
} cl_GPUMat;
|
||||||
|
|
||||||
void amogus() {
|
__kernel void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
|
||||||
printf("HEllo world!");
|
/*if(a.cols != b.rows) {
|
||||||
}
|
printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
|
||||||
|
return INVALID_MATRIX;
|
||||||
|
}
|
||||||
|
|
||||||
__kernel void mat_multiply(__global float *nnWeights, __global float *inputs, __global float *c) {
|
if(out.rows != a.rows || out.cols != b.cols) {
|
||||||
amogus();
|
printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
|
||||||
|
return INVALID_MATRIX;
|
||||||
|
}*/
|
||||||
|
|
||||||
|
uint idx = get_global_id(0);
|
||||||
|
if(idx >= matOut.rows * matOut.cols) return;
|
||||||
|
|
||||||
|
uint i = idx / matOut.cols;
|
||||||
|
uint j = idx % matOut.cols;
|
||||||
|
|
||||||
|
// for(unsigned int i = 0; i < out.rows; i++) {
|
||||||
|
// for(unsigned int j = 0; j < out.cols; j++) {
|
||||||
|
float sum = 0;
|
||||||
|
for(unsigned int k = 0; k < matA.cols; k++) {
|
||||||
|
sum += matA_values[i * matA.cols + k] * matB_values[k * matB.cols + j];
|
||||||
|
}
|
||||||
|
matOut_values[i * matOut.cols + j] = sum;
|
||||||
|
//}
|
||||||
|
//}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user