More GPU stuff

This commit is contained in:
MrLetsplay 2023-10-29 23:01:46 +01:00
parent c91ca11346
commit 4d93fa76cc
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
10 changed files with 547 additions and 72 deletions

View File

@ -5,7 +5,7 @@ CFLAGS=-Wall -g
.PHONY: all
all:
mkdir -p $(BUILD)
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
.PHONY: run
run: all
@ -14,7 +14,7 @@ run: all
.PHONY: cl
cl:
mkdir -p $(BUILD)
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
.PHONY: cl_run
cl_run: cl

View File

@ -1,6 +1,6 @@
#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl_platform.h>
#include <math.h>
#define CL_TARGET_OPENCL_VERSION 300
#include "clm.h"
@ -70,9 +70,9 @@ int main() {
return 1;
}
clm_Matrix a = clm_createMatrixRandom(3, 4);
clm_Matrix b = clm_createMatrixRandom(4, 5);
clm_Matrix out = clm_createMatrixRandom(3, 5);
clm_Matrix a = clm_createMatrixRandom(5, 10);
clm_Matrix b = clm_createMatrixRandom(10, 3);
clm_Matrix out = clm_createMatrixRandom(5, 3);
cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};

View File

@ -158,10 +158,8 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) {
}
clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
for(unsigned int i = 0; i < mat.rows; i++) {
for(unsigned int j = 0; j < mat.cols; j++) {
matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j)));
}
for(unsigned int i = 0; i < mat.rows * mat.cols; i++) {
mat.values[i] = 1 / (1 + exp(-mat.values[i]));
}
return mat;

View File

@ -5,6 +5,8 @@
#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
typedef struct clm_NativeBuf clm_NativeBuf;
typedef struct {
float *values;
unsigned int rows;
@ -23,6 +25,9 @@ typedef struct {
clm_Matrix output;
clm_Matrix error;
clm_Matrix weightsError;
clm_NativeBuf *nativeWeights;
clm_NativeBuf *nativeBias;
clm_NativeBuf *nativeOutput;
} clm_Linear;
typedef struct {

11
src/clm_gpu.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef _CLM_GPU_H_
#define _CLM_GPU_H_
#include "clm.h"
int clm_gpuInit();
void clm_gpuDestroy();
void clm_linearForward(clm_Linear *linear, clm_Matrix input);
#endif

24
src/clm_gpu_cpu.c Normal file
View File

@ -0,0 +1,24 @@
#include "clm_gpu.h"
#include <stdio.h>
struct clm_NativeBuf {};
int clm_gpuInit() {
return 0;
}
void clm_gpuDestroy() {}
void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
clm_Matrix newX = clm_matrixMultiplyMatrix(linear->weights, input, linear->output);
if(clm_matrixIsInvalid(newX)) {
printf("Forward pass failed\n");
return;
}
clm_matrixAddMatrix(newX, linear->bias);
clm_matrixSigmoid(newX);
linear->output = newX;
}

192
src/clm_gpu_opencl.c Normal file
View File

@ -0,0 +1,192 @@
#include "clm_gpu.h"
#include <math.h>
#define CL_TARGET_OPENCL_VERSION 200
#include <CL/cl_platform.h>
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
static cl_context context;
static cl_device_id deviceID;
static cl_command_queue queue;
static cl_kernel kernel;
struct clm_NativeBuf {
cl_mem mem;
};
typedef struct __attribute__((packed)) {
cl_uint rows;
cl_uint cols;
cl_char transposed;
} cl_GPUMat;
#define gpuMat(mat) \
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
char *loadFile(const char *path) {
FILE *file = fopen(path, "r");
fseek(file, 0, SEEK_END);
size_t length = ftell(file);
fseek(file, 0, SEEK_SET);
char *buffer = calloc(1, length + 1);
fread(buffer, length, 1, file);
return buffer;
}
int clm_gpuInit() {
// Connect to a compute device
int useGPU = true;
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
if(err != CL_SUCCESS) {
printf("Error: Failed to create a device group!\n");
return 1;
}
char *buffer = loadFile("src/mat.cl");
printf("%s", buffer);
context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
if(!context) {
printf("Failed to create context\n");
return 1;
}
queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
if(!queue) {
printf("Failed to create command queue\n");
return 1;
}
size_t length = strlen(buffer);
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
if(!program) {
printf("Failed to create program\n");
return 1;
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to build program\n");
// clGetProgramBuildInfo...
return 1;
}
kernel = clCreateKernel(program, "linear_forward", &err);
if(!kernel) {
printf("Failed to create kernel\n");
return 1;
}
return 0;
}
void clm_gpuDestroy() {
}
static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
cl_int err;
if(!nativeBuf->mem) {
cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mat.rows * mat.cols, NULL, &err);
if(!mat_values) {
printf("Failed to alloc buffer: %d\n", err);
return NULL;
}
nativeBuf->mem = mat_values;
}
return nativeBuf->mem;
}
static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
cl_int err;
cl_mem mem = allocGPUMat(gpuMat, nativeBuf);
err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue write: %d\n", err);
return NULL;
}
return mem;
}
clm_NativeBuf nativeInput;
// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
if(!linear->nativeWeights) {
linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
}
cl_GPUMat matInput = gpuMat(input);
cl_GPUMat matWeights = gpuMat(linear->weights);
cl_GPUMat matBias = gpuMat(linear->bias);
cl_GPUMat matOut = gpuMat(linear->output);
size_t workSize = matOut.rows * matOut.cols;
cl_int err;
cl_mem matInput_values = writeGPUMat(matInput, input, &nativeInput);
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
if(!matInput_values || !matWeights_values || !matBias_values) {
linear->output = INVALID_MATRIX;
return;
}
cl_mem matOut_values = allocGPUMat(matOut, linear->nativeOutput);
if(!matOut_values) {
printf("Failed to alloc out: %d\n", err);
linear->output = INVALID_MATRIX;
return;
}
err = 0;
err |= clSetKernelArg(kernel, 0, sizeof(matInput), &matInput);
err |= clSetKernelArg(kernel, 1, sizeof(matInput_values), &matInput_values);
err |= clSetKernelArg(kernel, 2, sizeof(matWeights), &matWeights);
err |= clSetKernelArg(kernel, 3, sizeof(matWeights_values), &matWeights_values);
err |= clSetKernelArg(kernel, 4, sizeof(matBias), &matBias);
err |= clSetKernelArg(kernel, 5, sizeof(matBias_values), &matBias_values);
err |= clSetKernelArg(kernel, 6, sizeof(matOut), &matOut);
err |= clSetKernelArg(kernel, 7, sizeof(matOut_values), &matOut_values);
if(err != CL_SUCCESS) {
printf("Failed to set kernel args: %d\n", err);
return;
}
/*char *info = calloc(1, 1024);
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
printf("INFO: %s\n", info);*/
size_t local;
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if(err != CL_SUCCESS) {
printf("Failed to get work group size\n");
return;
}
size_t global = ceil((float) workSize / local) * local;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to enqueue: %d\n", err);
return;
}
clFinish(queue);
err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output.values, 0, NULL, NULL);
if(err != CL_SUCCESS) {
printf("Failed to read from buffer\n");
return;
}
}

264
src/cltest.bak2.c Normal file
View File

@ -0,0 +1,264 @@
#include <dlfcn.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include "clm.h"
float train_data_x[4][2] = {
{0, 0},
{0, 1},
{1, 0},
{1, 1}};
float train_data_y[4][1] = {
{0},
{1},
{1},
{0}};
float *predict(clm_NN nn, float *x, unsigned int length) {
clm_Matrix xM = clm_matrixWrapArray(x, length);
for(unsigned int i = 0; i < nn.numLayers; i++) {
clm_Linear layer = nn.layers[i];
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
if(clm_matrixIsInvalid(newX)) {
printf("Failed to predict\n");
return NULL;
}
clm_matrixAddMatrix(newX, layer.bias);
clm_matrixSigmoid(newX);
xM = newX;
}
return xM.values;
}
void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
clm_Matrix xM = clm_matrixWrapArray(x, xL);
clm_Matrix yM = clm_matrixWrapArray(y, yL);
// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
// Forward pass
clm_Matrix currentX = xM;
for(unsigned int i = 0; i < nn.numLayers; i++) {
clm_Linear layer = nn.layers[i];
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
if(clm_matrixIsInvalid(newX)) {
printf("Forward pass failed\n");
return;
}
clm_matrixAddMatrix(newX, layer.bias);
clm_matrixSigmoid(newX);
currentX = newX;
}
for(int i = nn.numLayers - 1; i >= 0; i--) {
clm_Linear layer = nn.layers[i];
clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
clm_Matrix outputOfThisLayer = nn.layers[i].output;
clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
clm_Matrix error = layer.error;
if(i == nn.numLayers - 1) {
clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
} else {
clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
clm_matrixMultiplyMatrix(weightsT, prevError, error);
}
clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
clm_matrixMultiplyScalar(gradient, nn.learnRate);
clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
clm_matrixAddMatrix(layer.weights, layer.weightsError);
clm_matrixAddMatrix(layer.bias, gradient);
}
}
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
if(!file) {
perror("Failed to open labels\n");
return;
}
unsigned char magicBytes[4];
fread(magicBytes, sizeof(magicBytes), 1, file);
printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
unsigned char lengthBytes[4];
fread(lengthBytes, sizeof(lengthBytes), 1, file);
uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
(lengthBytes[2] << 8) | lengthBytes[3];
printf("%" PRId32 "\n", length);
clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
for(unsigned int i = 0; i < length; i++) {
unsigned char label;
fread(&label, sizeof(unsigned char), 1, file);
clm_Vector vector = clm_vectorCreate(10);
for(unsigned int j = 0; j < 10; j++) {
vector.values[j] = label == j ? 1 : 0;
}
vectors[i] = vector;
}
*labelsOut = vectors;
*labelsCountOut = length;
}
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
FILE *file = fopen("data/train-images.idx3-ubyte", "r");
if(!file) {
perror("Failed to open images\n");
return;
}
unsigned char magicBytes[4];
fread(magicBytes, sizeof(magicBytes), 1, file);
printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
unsigned char lengthBytes[4];
fread(lengthBytes, sizeof(lengthBytes), 1, file);
uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3];
printf("%" PRId32 "\n", length);
unsigned char rowsBytes[4];
fread(rowsBytes, sizeof(rowsBytes), 1, file);
uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3];
printf("%" PRId32 "\n", rows);
unsigned char colsBytes[4];
fread(colsBytes, sizeof(colsBytes), 1, file);
uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3];
printf("%" PRId32 "\n", cols);
clm_Vector *images = calloc(length, sizeof(clm_Vector));
for(unsigned int i = 0; i < length; i++) {
clm_Vector vec = clm_vectorCreate(cols * rows);
unsigned char image[cols * rows];
fread(image, sizeof(image), 1, file);
for(unsigned int j = 0; j < cols * rows; j++) {
vec.values[j] = (float) image[j];
}
images[i] = vec;
}
*imagesOut = images;
*imageCountOut = length;
}
typedef void *(*callocFunc)(size_t, size_t);
callocFunc oldCalloc;
int main() {
oldCalloc = dlsym(RTLD_NEXT, "calloc");
clm_Vector *labels = NULL;
unsigned int labelCount;
loadLabels(&labels, &labelCount);
printf("LENGTH: %u\n", labelCount);
clm_Vector *images = NULL;
unsigned int imageCount;
loadImages(&images, &imageCount);
imageCount = 60000;
printf("%f\n", images[0].values[0]);
srand(1);
unsigned int
i = 784,
h = 30,
o = 10;
clm_Linear layer1 = clm_linearCreateRandom(i, h);
clm_Linear layer2 = clm_linearCreateRandom(h, o);
clm_Linear layers[] = {layer1, layer2};
clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
for(unsigned int epoch = 0; epoch < 1; epoch++) {
printf("Epoch %u\n", epoch);
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
if(idx % 1000 == 0) {
printf("\r%.2f%%", idx / (float) imageCount * 100);
fflush(stdout);
}
// printf("%u\n", idx);
// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
/*for(unsigned int f = 0; f < images[idx].length; f++) {
printf("%.2f ", images[idx].values[f]);
}
printf("\n");
for(unsigned int f = 0; f < labels[idx].length; f++) {
printf("%.2f ", labels[idx].values[f]);
}
printf("\n");*/
// printf("%.2f\n", labels.values[idx]);
train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
// train(nn, test, 784, target, 10);
// predict(nn, test, 784);
}
printf("\n");
}
printf("Train done\n");
unsigned int correct = 0;
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
float *pred = predict(nn, images[idx].values, images[idx].length);
unsigned int predDigit = 0;
float max = -1;
for(unsigned int j = 0; j < 10; j++) {
// printf("%.2f ", pred[j]);
if(pred[j] > max || max < 0) {
max = pred[j];
predDigit = j;
}
}
if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
unsigned int actDigit = 0;
float maxA = -1;
for(unsigned int j = 0; j < 10; j++) {
// printf("%.2f ", pred[j]);
if(labels[idx].values[j] > maxA || maxA < 0) {
maxA = labels[idx].values[j];
actDigit = j;
}
}
if(idx < 100) printf("Actual: %u\n", actDigit);
// printf("\n");
if(predDigit == actDigit) correct++;
}
printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
printf("\n");
}
void *calloc(size_t nmemb, size_t size) {
// printf("CALLOC\n");
return oldCalloc(nmemb, size);
}

View File

@ -4,6 +4,7 @@
#include <stdlib.h>
#include "clm.h"
#include "clm_gpu.h"
float train_data_x[4][2] = {
{0, 0},
@ -21,17 +22,8 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
clm_Matrix xM = clm_matrixWrapArray(x, length);
for(unsigned int i = 0; i < nn.numLayers; i++) {
clm_Linear layer = nn.layers[i];
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
if(clm_matrixIsInvalid(newX)) {
printf("Failed to predict\n");
return NULL;
}
clm_matrixAddMatrix(newX, layer.bias);
clm_matrixSigmoid(newX);
xM = newX;
clm_linearForward(&nn.layers[i], xM);
xM = nn.layers[i].output;
}
return xM.values;
@ -46,16 +38,8 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
// Forward pass
clm_Matrix currentX = xM;
for(unsigned int i = 0; i < nn.numLayers; i++) {
clm_Linear layer = nn.layers[i];
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
if(clm_matrixIsInvalid(newX)) {
printf("Forward pass failed\n");
return;
}
clm_matrixAddMatrix(newX, layer.bias);
clm_matrixSigmoid(newX);
currentX = newX;
clm_linearForward(&nn.layers[i], currentX);
currentX = nn.layers[i].output;
}
for(int i = nn.numLayers - 1; i >= 0; i--) {
@ -162,12 +146,11 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
*imageCountOut = length;
}
typedef void *(*callocFunc)(size_t, size_t);
callocFunc oldCalloc;
int main() {
oldCalloc = dlsym(RTLD_NEXT, "calloc");
if(clm_gpuInit() != 0) {
printf("Failed to init GPU\n");
return 1;
}
clm_Vector *labels = NULL;
unsigned int labelCount;
@ -189,9 +172,9 @@ int main() {
h = 30,
o = 10;
clm_Linear layer1 = clm_linearCreateRandom(i, h);
clm_Linear layer2 = clm_linearCreateRandom(h, o);
clm_Linear layers[] = {layer1, layer2};
clm_Linear layers[] = {
clm_linearCreateRandom(i, h),
clm_linearCreateRandom(h, o)};
clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
for(unsigned int epoch = 0; epoch < 1; epoch++) {
@ -201,21 +184,8 @@ int main() {
printf("\r%.2f%%", idx / (float) imageCount * 100);
fflush(stdout);
}
// printf("%u\n", idx);
// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
/*for(unsigned int f = 0; f < images[idx].length; f++) {
printf("%.2f ", images[idx].values[f]);
}
printf("\n");
for(unsigned int f = 0; f < labels[idx].length; f++) {
printf("%.2f ", labels[idx].values[f]);
}
printf("\n");*/
// printf("%.2f\n", labels.values[idx]);
train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
// train(nn, test, 784, target, 10);
// predict(nn, test, 784);
}
printf("\n");
}
@ -256,9 +226,6 @@ int main() {
printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
printf("\n");
}
void *calloc(size_t nmemb, size_t size) {
// printf("CALLOC\n");
return oldCalloc(nmemb, size);
clm_gpuDestroy();
}

View File

@ -4,30 +4,44 @@ typedef struct __attribute__((packed)) {
char transposed;
} cl_GPUMat;
__kernel void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
/*if(a.cols != b.rows) {
printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
return INVALID_MATRIX;
}
if(out.rows != a.rows || out.cols != b.cols) {
printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
return INVALID_MATRIX;
}*/
#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
uint idx = get_global_id(0);
if(idx >= matOut.rows * matOut.cols) return;
uint i = idx / matOut.cols;
uint j = idx % matOut.cols;
// for(unsigned int i = 0; i < out.rows; i++) {
// for(unsigned int j = 0; j < out.cols; j++) {
float sum = 0;
for(unsigned int k = 0; k < matA.cols; k++) {
sum += matA_values[i * matA.cols + k] * matB_values[k * matB.cols + j];
sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
}
matOut_values[i * matOut.cols + j] = sum;
//}
//}
matrixAt(matOut, matOut_values, i, j) = sum;
}
void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
uint idx = get_global_id(0);
if(idx >= matA.rows * matA.cols) return;
matA_values[idx] += matB_values[idx];
}
void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
uint idx = get_global_id(0);
if(idx >= mat.rows * mat.cols) return;
mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
}
// clm_Matrix input;
// clm_Matrix weights;
// clm_Matrix bias;
// clm_Matrix output;
__kernel void linear_forward(cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
mat_multiply(weights, weights_values, input, input_values, out, out_values);
mat_add(out, out_values, bias, bias_values);
mat_sigmoid(out, out_values);
}