More GPU stuff
This commit is contained in:
parent
c91ca11346
commit
4d93fa76cc
4
Makefile
4
Makefile
@ -5,7 +5,7 @@ CFLAGS=-Wall -g
|
|||||||
.PHONY: all
|
.PHONY: all
|
||||||
all:
|
all:
|
||||||
mkdir -p $(BUILD)
|
mkdir -p $(BUILD)
|
||||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c
|
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cltest $(SRC)/cltest.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
|
||||||
|
|
||||||
.PHONY: run
|
.PHONY: run
|
||||||
run: all
|
run: all
|
||||||
@ -14,7 +14,7 @@ run: all
|
|||||||
.PHONY: cl
|
.PHONY: cl
|
||||||
cl:
|
cl:
|
||||||
mkdir -p $(BUILD)
|
mkdir -p $(BUILD)
|
||||||
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c
|
gcc -lOpenCL -lm $(CFLAGS) -o $(BUILD)/cl $(SRC)/cl.c $(SRC)/clm.c $(SRC)/clm_gpu_opencl.c
|
||||||
|
|
||||||
.PHONY: cl_run
|
.PHONY: cl_run
|
||||||
cl_run: cl
|
cl_run: cl
|
||||||
|
8
src/cl.c
8
src/cl.c
@ -1,6 +1,6 @@
|
|||||||
|
#define CL_TARGET_OPENCL_VERSION 200
|
||||||
#include <CL/cl_platform.h>
|
#include <CL/cl_platform.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#define CL_TARGET_OPENCL_VERSION 300
|
|
||||||
|
|
||||||
#include "clm.h"
|
#include "clm.h"
|
||||||
|
|
||||||
@ -70,9 +70,9 @@ int main() {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
clm_Matrix a = clm_createMatrixRandom(3, 4);
|
clm_Matrix a = clm_createMatrixRandom(5, 10);
|
||||||
clm_Matrix b = clm_createMatrixRandom(4, 5);
|
clm_Matrix b = clm_createMatrixRandom(10, 3);
|
||||||
clm_Matrix out = clm_createMatrixRandom(3, 5);
|
clm_Matrix out = clm_createMatrixRandom(5, 3);
|
||||||
|
|
||||||
cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
|
cl_GPUMat matA = {.rows = a.rows, .cols = a.cols, .transposed = a.transposed};
|
||||||
cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
|
cl_GPUMat matB = {.rows = b.rows, .cols = b.cols, .transposed = b.transposed};
|
||||||
|
@ -158,10 +158,8 @@ clm_Matrix clm_matrixMultiplyScalar(clm_Matrix mat, float scalar) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
|
clm_Matrix clm_matrixSigmoid(clm_Matrix mat) {
|
||||||
for(unsigned int i = 0; i < mat.rows; i++) {
|
for(unsigned int i = 0; i < mat.rows * mat.cols; i++) {
|
||||||
for(unsigned int j = 0; j < mat.cols; j++) {
|
mat.values[i] = 1 / (1 + exp(-mat.values[i]));
|
||||||
matrixAt(mat, i, j) = 1 / (1 + exp(-matrixAt(mat, i, j)));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return mat;
|
return mat;
|
||||||
|
@ -5,6 +5,8 @@
|
|||||||
|
|
||||||
#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
|
#define matrixAt(mat, r, c) mat.values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
|
||||||
|
|
||||||
|
typedef struct clm_NativeBuf clm_NativeBuf;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
float *values;
|
float *values;
|
||||||
unsigned int rows;
|
unsigned int rows;
|
||||||
@ -23,6 +25,9 @@ typedef struct {
|
|||||||
clm_Matrix output;
|
clm_Matrix output;
|
||||||
clm_Matrix error;
|
clm_Matrix error;
|
||||||
clm_Matrix weightsError;
|
clm_Matrix weightsError;
|
||||||
|
clm_NativeBuf *nativeWeights;
|
||||||
|
clm_NativeBuf *nativeBias;
|
||||||
|
clm_NativeBuf *nativeOutput;
|
||||||
} clm_Linear;
|
} clm_Linear;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
11
src/clm_gpu.h
Normal file
11
src/clm_gpu.h
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
#ifndef _CLM_GPU_H_
|
||||||
|
#define _CLM_GPU_H_
|
||||||
|
|
||||||
|
#include "clm.h"
|
||||||
|
|
||||||
|
int clm_gpuInit();
|
||||||
|
void clm_gpuDestroy();
|
||||||
|
|
||||||
|
void clm_linearForward(clm_Linear *linear, clm_Matrix input);
|
||||||
|
|
||||||
|
#endif
|
24
src/clm_gpu_cpu.c
Normal file
24
src/clm_gpu_cpu.c
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#include "clm_gpu.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
struct clm_NativeBuf {};
|
||||||
|
|
||||||
|
int clm_gpuInit() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void clm_gpuDestroy() {}
|
||||||
|
|
||||||
|
void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
|
||||||
|
clm_Matrix newX = clm_matrixMultiplyMatrix(linear->weights, input, linear->output);
|
||||||
|
|
||||||
|
if(clm_matrixIsInvalid(newX)) {
|
||||||
|
printf("Forward pass failed\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
clm_matrixAddMatrix(newX, linear->bias);
|
||||||
|
clm_matrixSigmoid(newX);
|
||||||
|
linear->output = newX;
|
||||||
|
}
|
192
src/clm_gpu_opencl.c
Normal file
192
src/clm_gpu_opencl.c
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
#include "clm_gpu.h"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#define CL_TARGET_OPENCL_VERSION 200
|
||||||
|
#include <CL/cl_platform.h>
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
static cl_context context;
|
||||||
|
static cl_device_id deviceID;
|
||||||
|
static cl_command_queue queue;
|
||||||
|
static cl_kernel kernel;
|
||||||
|
|
||||||
|
struct clm_NativeBuf {
|
||||||
|
cl_mem mem;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct __attribute__((packed)) {
|
||||||
|
|
||||||
|
cl_uint rows;
|
||||||
|
cl_uint cols;
|
||||||
|
cl_char transposed;
|
||||||
|
} cl_GPUMat;
|
||||||
|
|
||||||
|
#define gpuMat(mat) \
|
||||||
|
{ .rows = mat.rows, .cols = mat.cols, .transposed = mat.transposed }
|
||||||
|
|
||||||
|
char *loadFile(const char *path) {
|
||||||
|
FILE *file = fopen(path, "r");
|
||||||
|
fseek(file, 0, SEEK_END);
|
||||||
|
size_t length = ftell(file);
|
||||||
|
fseek(file, 0, SEEK_SET);
|
||||||
|
char *buffer = calloc(1, length + 1);
|
||||||
|
fread(buffer, length, 1, file);
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
int clm_gpuInit() {
|
||||||
|
// Connect to a compute device
|
||||||
|
int useGPU = true;
|
||||||
|
cl_int err = clGetDeviceIDs(NULL, useGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &deviceID, NULL);
|
||||||
|
if(err != CL_SUCCESS) {
|
||||||
|
printf("Error: Failed to create a device group!\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *buffer = loadFile("src/mat.cl");
|
||||||
|
printf("%s", buffer);
|
||||||
|
|
||||||
|
context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &err);
|
||||||
|
if(!context) {
|
||||||
|
printf("Failed to create context\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
queue = clCreateCommandQueueWithProperties(context, deviceID, NULL, &err);
|
||||||
|
if(!queue) {
|
||||||
|
printf("Failed to create command queue\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t length = strlen(buffer);
|
||||||
|
cl_program program = clCreateProgramWithSource(context, 1, (const char **) &buffer, &length, &err);
|
||||||
|
if(!program) {
|
||||||
|
printf("Failed to create program\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
||||||
|
if(err != CL_SUCCESS) {
|
||||||
|
printf("Failed to build program\n");
|
||||||
|
// clGetProgramBuildInfo...
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
kernel = clCreateKernel(program, "linear_forward", &err);
|
||||||
|
if(!kernel) {
|
||||||
|
printf("Failed to create kernel\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void clm_gpuDestroy() {
|
||||||
|
}
|
||||||
|
|
||||||
|
static cl_mem allocGPUMat(cl_GPUMat mat, clm_NativeBuf *nativeBuf) {
|
||||||
|
cl_int err;
|
||||||
|
|
||||||
|
if(!nativeBuf->mem) {
|
||||||
|
cl_mem mat_values = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mat.rows * mat.cols, NULL, &err);
|
||||||
|
if(!mat_values) {
|
||||||
|
printf("Failed to alloc buffer: %d\n", err);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
nativeBuf->mem = mat_values;
|
||||||
|
}
|
||||||
|
|
||||||
|
return nativeBuf->mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
static cl_mem writeGPUMat(cl_GPUMat gpuMat, clm_Matrix mat, clm_NativeBuf *nativeBuf) {
|
||||||
|
cl_int err;
|
||||||
|
cl_mem mem = allocGPUMat(gpuMat, nativeBuf);
|
||||||
|
|
||||||
|
err = clEnqueueWriteBuffer(queue, mem, CL_TRUE, 0, sizeof(float) * mat.rows * mat.cols, mat.values, 0, NULL, NULL);
|
||||||
|
if(err != CL_SUCCESS) {
|
||||||
|
printf("Failed to enqueue write: %d\n", err);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
clm_NativeBuf nativeInput;
|
||||||
|
|
||||||
|
// TODO: allow writing multiple inputs at once to improve throughput (no need to rewrite weights/bias each time)
|
||||||
|
void clm_linearForward(clm_Linear *linear, clm_Matrix input) {
|
||||||
|
if(!linear->nativeWeights) {
|
||||||
|
linear->nativeWeights = calloc(1, sizeof(clm_NativeBuf));
|
||||||
|
linear->nativeBias = calloc(1, sizeof(clm_NativeBuf));
|
||||||
|
linear->nativeOutput = calloc(1, sizeof(clm_NativeBuf));
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_GPUMat matInput = gpuMat(input);
|
||||||
|
cl_GPUMat matWeights = gpuMat(linear->weights);
|
||||||
|
cl_GPUMat matBias = gpuMat(linear->bias);
|
||||||
|
cl_GPUMat matOut = gpuMat(linear->output);
|
||||||
|
|
||||||
|
size_t workSize = matOut.rows * matOut.cols;
|
||||||
|
|
||||||
|
cl_int err;
|
||||||
|
|
||||||
|
cl_mem matInput_values = writeGPUMat(matInput, input, &nativeInput);
|
||||||
|
cl_mem matWeights_values = writeGPUMat(matWeights, linear->weights, linear->nativeWeights);
|
||||||
|
cl_mem matBias_values = writeGPUMat(matBias, linear->bias, linear->nativeBias);
|
||||||
|
if(!matInput_values || !matWeights_values || !matBias_values) {
|
||||||
|
linear->output = INVALID_MATRIX;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_mem matOut_values = allocGPUMat(matOut, linear->nativeOutput);
|
||||||
|
if(!matOut_values) {
|
||||||
|
printf("Failed to alloc out: %d\n", err);
|
||||||
|
linear->output = INVALID_MATRIX;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
err = 0;
|
||||||
|
err |= clSetKernelArg(kernel, 0, sizeof(matInput), &matInput);
|
||||||
|
err |= clSetKernelArg(kernel, 1, sizeof(matInput_values), &matInput_values);
|
||||||
|
err |= clSetKernelArg(kernel, 2, sizeof(matWeights), &matWeights);
|
||||||
|
err |= clSetKernelArg(kernel, 3, sizeof(matWeights_values), &matWeights_values);
|
||||||
|
err |= clSetKernelArg(kernel, 4, sizeof(matBias), &matBias);
|
||||||
|
err |= clSetKernelArg(kernel, 5, sizeof(matBias_values), &matBias_values);
|
||||||
|
err |= clSetKernelArg(kernel, 6, sizeof(matOut), &matOut);
|
||||||
|
err |= clSetKernelArg(kernel, 7, sizeof(matOut_values), &matOut_values);
|
||||||
|
if(err != CL_SUCCESS) {
|
||||||
|
printf("Failed to set kernel args: %d\n", err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*char *info = calloc(1, 1024);
|
||||||
|
clGetProgramInfo(program, CL_PROGRAM_STRING_DEBUG_INFO, 1024, info, NULL);
|
||||||
|
printf("INFO: %s\n", info);*/
|
||||||
|
|
||||||
|
size_t local;
|
||||||
|
err = clGetKernelWorkGroupInfo(kernel, deviceID, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
|
||||||
|
if(err != CL_SUCCESS) {
|
||||||
|
printf("Failed to get work group size\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t global = ceil((float) workSize / local) * local;
|
||||||
|
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
|
||||||
|
if(err != CL_SUCCESS) {
|
||||||
|
printf("Failed to enqueue: %d\n", err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
clFinish(queue);
|
||||||
|
|
||||||
|
err = clEnqueueReadBuffer(queue, matOut_values, CL_TRUE, 0, sizeof(float) * workSize, linear->output.values, 0, NULL, NULL);
|
||||||
|
if(err != CL_SUCCESS) {
|
||||||
|
printf("Failed to read from buffer\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
264
src/cltest.bak2.c
Normal file
264
src/cltest.bak2.c
Normal file
@ -0,0 +1,264 @@
|
|||||||
|
#include <dlfcn.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include "clm.h"
|
||||||
|
|
||||||
|
float train_data_x[4][2] = {
|
||||||
|
{0, 0},
|
||||||
|
{0, 1},
|
||||||
|
{1, 0},
|
||||||
|
{1, 1}};
|
||||||
|
|
||||||
|
float train_data_y[4][1] = {
|
||||||
|
{0},
|
||||||
|
{1},
|
||||||
|
{1},
|
||||||
|
{0}};
|
||||||
|
|
||||||
|
float *predict(clm_NN nn, float *x, unsigned int length) {
|
||||||
|
clm_Matrix xM = clm_matrixWrapArray(x, length);
|
||||||
|
|
||||||
|
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
||||||
|
clm_Linear layer = nn.layers[i];
|
||||||
|
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
|
||||||
|
|
||||||
|
if(clm_matrixIsInvalid(newX)) {
|
||||||
|
printf("Failed to predict\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
clm_matrixAddMatrix(newX, layer.bias);
|
||||||
|
clm_matrixSigmoid(newX);
|
||||||
|
xM = newX;
|
||||||
|
}
|
||||||
|
|
||||||
|
return xM.values;
|
||||||
|
}
|
||||||
|
|
||||||
|
void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
|
||||||
|
clm_Matrix xM = clm_matrixWrapArray(x, xL);
|
||||||
|
clm_Matrix yM = clm_matrixWrapArray(y, yL);
|
||||||
|
|
||||||
|
// TODO: potential compute/memory tradeoff? (recalculate matrices every time <-> keep everything cached)
|
||||||
|
|
||||||
|
// Forward pass
|
||||||
|
clm_Matrix currentX = xM;
|
||||||
|
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
||||||
|
clm_Linear layer = nn.layers[i];
|
||||||
|
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
|
||||||
|
if(clm_matrixIsInvalid(newX)) {
|
||||||
|
printf("Forward pass failed\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
clm_matrixAddMatrix(newX, layer.bias);
|
||||||
|
clm_matrixSigmoid(newX);
|
||||||
|
currentX = newX;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int i = nn.numLayers - 1; i >= 0; i--) {
|
||||||
|
clm_Linear layer = nn.layers[i];
|
||||||
|
clm_Matrix inputToThisLayer = i == 0 ? xM : nn.layers[i - 1].output;
|
||||||
|
clm_Matrix outputOfThisLayer = nn.layers[i].output;
|
||||||
|
clm_Matrix prevError = i == nn.numLayers - 1 ? INVALID_MATRIX : nn.layers[i + 1].error;
|
||||||
|
clm_Matrix error = layer.error;
|
||||||
|
|
||||||
|
if(i == nn.numLayers - 1) {
|
||||||
|
clm_matrixSubtractMatrix(clm_matrixCopy(yM, error), outputOfThisLayer); // yhat - y
|
||||||
|
} else {
|
||||||
|
clm_Matrix weightsT = clm_matrixTranspose(nn.layers[i + 1].weights);
|
||||||
|
clm_matrixMultiplyMatrix(weightsT, prevError, error);
|
||||||
|
}
|
||||||
|
|
||||||
|
clm_Matrix gradient = clm_matrixDSigmoid(outputOfThisLayer); // dsig(yhat)
|
||||||
|
clm_matrixMultiplyMatrixElements(gradient, error); // (yhat - y) . dsig(yhat)
|
||||||
|
clm_matrixMultiplyScalar(gradient, nn.learnRate);
|
||||||
|
|
||||||
|
clm_Matrix inputT = clm_matrixTranspose(inputToThisLayer);
|
||||||
|
clm_matrixMultiplyMatrix(gradient, inputT, layer.weightsError);
|
||||||
|
|
||||||
|
clm_matrixAddMatrix(layer.weights, layer.weightsError);
|
||||||
|
clm_matrixAddMatrix(layer.bias, gradient);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void loadLabels(clm_Vector **labelsOut, unsigned int *labelsCountOut) {
|
||||||
|
FILE *file = fopen("data/train-labels.idx1-ubyte", "r");
|
||||||
|
if(!file) {
|
||||||
|
perror("Failed to open labels\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned char magicBytes[4];
|
||||||
|
fread(magicBytes, sizeof(magicBytes), 1, file);
|
||||||
|
|
||||||
|
printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
|
||||||
|
|
||||||
|
unsigned char lengthBytes[4];
|
||||||
|
fread(lengthBytes, sizeof(lengthBytes), 1, file);
|
||||||
|
|
||||||
|
uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) |
|
||||||
|
(lengthBytes[2] << 8) | lengthBytes[3];
|
||||||
|
printf("%" PRId32 "\n", length);
|
||||||
|
|
||||||
|
clm_Vector *vectors = calloc(length, sizeof(clm_Vector));
|
||||||
|
|
||||||
|
for(unsigned int i = 0; i < length; i++) {
|
||||||
|
unsigned char label;
|
||||||
|
fread(&label, sizeof(unsigned char), 1, file);
|
||||||
|
|
||||||
|
clm_Vector vector = clm_vectorCreate(10);
|
||||||
|
for(unsigned int j = 0; j < 10; j++) {
|
||||||
|
vector.values[j] = label == j ? 1 : 0;
|
||||||
|
}
|
||||||
|
vectors[i] = vector;
|
||||||
|
}
|
||||||
|
|
||||||
|
*labelsOut = vectors;
|
||||||
|
*labelsCountOut = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
|
||||||
|
FILE *file = fopen("data/train-images.idx3-ubyte", "r");
|
||||||
|
if(!file) {
|
||||||
|
perror("Failed to open images\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned char magicBytes[4];
|
||||||
|
fread(magicBytes, sizeof(magicBytes), 1, file);
|
||||||
|
|
||||||
|
printf("%d\n", (magicBytes[0] << 24) | (magicBytes[1] << 16) | (magicBytes[2] << 8) | magicBytes[3]);
|
||||||
|
|
||||||
|
unsigned char lengthBytes[4];
|
||||||
|
fread(lengthBytes, sizeof(lengthBytes), 1, file);
|
||||||
|
uint32_t length = (lengthBytes[0] << 24) | (lengthBytes[1] << 16) | (lengthBytes[2] << 8) | lengthBytes[3];
|
||||||
|
printf("%" PRId32 "\n", length);
|
||||||
|
|
||||||
|
unsigned char rowsBytes[4];
|
||||||
|
fread(rowsBytes, sizeof(rowsBytes), 1, file);
|
||||||
|
uint32_t rows = (rowsBytes[0] << 24) | (rowsBytes[1] << 16) | (rowsBytes[2] << 8) | rowsBytes[3];
|
||||||
|
printf("%" PRId32 "\n", rows);
|
||||||
|
|
||||||
|
unsigned char colsBytes[4];
|
||||||
|
fread(colsBytes, sizeof(colsBytes), 1, file);
|
||||||
|
uint32_t cols = (colsBytes[0] << 24) | (colsBytes[1] << 16) | (colsBytes[2] << 8) | colsBytes[3];
|
||||||
|
printf("%" PRId32 "\n", cols);
|
||||||
|
|
||||||
|
clm_Vector *images = calloc(length, sizeof(clm_Vector));
|
||||||
|
for(unsigned int i = 0; i < length; i++) {
|
||||||
|
clm_Vector vec = clm_vectorCreate(cols * rows);
|
||||||
|
unsigned char image[cols * rows];
|
||||||
|
fread(image, sizeof(image), 1, file);
|
||||||
|
for(unsigned int j = 0; j < cols * rows; j++) {
|
||||||
|
vec.values[j] = (float) image[j];
|
||||||
|
}
|
||||||
|
images[i] = vec;
|
||||||
|
}
|
||||||
|
|
||||||
|
*imagesOut = images;
|
||||||
|
*imageCountOut = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef void *(*callocFunc)(size_t, size_t);
|
||||||
|
|
||||||
|
callocFunc oldCalloc;
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
oldCalloc = dlsym(RTLD_NEXT, "calloc");
|
||||||
|
|
||||||
|
clm_Vector *labels = NULL;
|
||||||
|
unsigned int labelCount;
|
||||||
|
loadLabels(&labels, &labelCount);
|
||||||
|
printf("LENGTH: %u\n", labelCount);
|
||||||
|
|
||||||
|
clm_Vector *images = NULL;
|
||||||
|
unsigned int imageCount;
|
||||||
|
loadImages(&images, &imageCount);
|
||||||
|
|
||||||
|
imageCount = 60000;
|
||||||
|
|
||||||
|
printf("%f\n", images[0].values[0]);
|
||||||
|
|
||||||
|
srand(1);
|
||||||
|
|
||||||
|
unsigned int
|
||||||
|
i = 784,
|
||||||
|
h = 30,
|
||||||
|
o = 10;
|
||||||
|
|
||||||
|
clm_Linear layer1 = clm_linearCreateRandom(i, h);
|
||||||
|
clm_Linear layer2 = clm_linearCreateRandom(h, o);
|
||||||
|
clm_Linear layers[] = {layer1, layer2};
|
||||||
|
clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
|
||||||
|
|
||||||
|
for(unsigned int epoch = 0; epoch < 1; epoch++) {
|
||||||
|
printf("Epoch %u\n", epoch);
|
||||||
|
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
|
||||||
|
if(idx % 1000 == 0) {
|
||||||
|
printf("\r%.2f%%", idx / (float) imageCount * 100);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
// printf("%u\n", idx);
|
||||||
|
// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
|
||||||
|
/*for(unsigned int f = 0; f < images[idx].length; f++) {
|
||||||
|
printf("%.2f ", images[idx].values[f]);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
for(unsigned int f = 0; f < labels[idx].length; f++) {
|
||||||
|
printf("%.2f ", labels[idx].values[f]);
|
||||||
|
}
|
||||||
|
printf("\n");*/
|
||||||
|
// printf("%.2f\n", labels.values[idx]);
|
||||||
|
|
||||||
|
train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
|
||||||
|
// train(nn, test, 784, target, 10);
|
||||||
|
// predict(nn, test, 784);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Train done\n");
|
||||||
|
|
||||||
|
unsigned int correct = 0;
|
||||||
|
for(unsigned int idx = 0; idx < imageCount; idx++) { // Each train sample
|
||||||
|
// printf("pred(%.2f, %.2f) = %.2f\n", train_data_x[idx][0],
|
||||||
|
// train_data_x[idx][1], predict(nn, train_data_x[idx], 2)[0]);
|
||||||
|
float *pred = predict(nn, images[idx].values, images[idx].length);
|
||||||
|
unsigned int predDigit = 0;
|
||||||
|
float max = -1;
|
||||||
|
for(unsigned int j = 0; j < 10; j++) {
|
||||||
|
// printf("%.2f ", pred[j]);
|
||||||
|
if(pred[j] > max || max < 0) {
|
||||||
|
max = pred[j];
|
||||||
|
predDigit = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(idx < 100) printf("%u (confidence: %.2f)\n", predDigit, max);
|
||||||
|
|
||||||
|
unsigned int actDigit = 0;
|
||||||
|
float maxA = -1;
|
||||||
|
for(unsigned int j = 0; j < 10; j++) {
|
||||||
|
// printf("%.2f ", pred[j]);
|
||||||
|
if(labels[idx].values[j] > maxA || maxA < 0) {
|
||||||
|
maxA = labels[idx].values[j];
|
||||||
|
actDigit = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(idx < 100) printf("Actual: %u\n", actDigit);
|
||||||
|
// printf("\n");
|
||||||
|
|
||||||
|
if(predDigit == actDigit) correct++;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
void *calloc(size_t nmemb, size_t size) {
|
||||||
|
// printf("CALLOC\n");
|
||||||
|
return oldCalloc(nmemb, size);
|
||||||
|
}
|
59
src/cltest.c
59
src/cltest.c
@ -4,6 +4,7 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include "clm.h"
|
#include "clm.h"
|
||||||
|
#include "clm_gpu.h"
|
||||||
|
|
||||||
float train_data_x[4][2] = {
|
float train_data_x[4][2] = {
|
||||||
{0, 0},
|
{0, 0},
|
||||||
@ -21,17 +22,8 @@ float *predict(clm_NN nn, float *x, unsigned int length) {
|
|||||||
clm_Matrix xM = clm_matrixWrapArray(x, length);
|
clm_Matrix xM = clm_matrixWrapArray(x, length);
|
||||||
|
|
||||||
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
||||||
clm_Linear layer = nn.layers[i];
|
clm_linearForward(&nn.layers[i], xM);
|
||||||
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, xM, layer.output);
|
xM = nn.layers[i].output;
|
||||||
|
|
||||||
if(clm_matrixIsInvalid(newX)) {
|
|
||||||
printf("Failed to predict\n");
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
clm_matrixAddMatrix(newX, layer.bias);
|
|
||||||
clm_matrixSigmoid(newX);
|
|
||||||
xM = newX;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return xM.values;
|
return xM.values;
|
||||||
@ -46,16 +38,8 @@ void train(clm_NN nn, float *x, unsigned int xL, float *y, unsigned int yL) {
|
|||||||
// Forward pass
|
// Forward pass
|
||||||
clm_Matrix currentX = xM;
|
clm_Matrix currentX = xM;
|
||||||
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
for(unsigned int i = 0; i < nn.numLayers; i++) {
|
||||||
clm_Linear layer = nn.layers[i];
|
clm_linearForward(&nn.layers[i], currentX);
|
||||||
clm_Matrix newX = clm_matrixMultiplyMatrix(layer.weights, currentX, layer.output);
|
currentX = nn.layers[i].output;
|
||||||
if(clm_matrixIsInvalid(newX)) {
|
|
||||||
printf("Forward pass failed\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
clm_matrixAddMatrix(newX, layer.bias);
|
|
||||||
clm_matrixSigmoid(newX);
|
|
||||||
currentX = newX;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int i = nn.numLayers - 1; i >= 0; i--) {
|
for(int i = nn.numLayers - 1; i >= 0; i--) {
|
||||||
@ -162,12 +146,11 @@ void loadImages(clm_Vector **imagesOut, unsigned int *imageCountOut) {
|
|||||||
*imageCountOut = length;
|
*imageCountOut = length;
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef void *(*callocFunc)(size_t, size_t);
|
|
||||||
|
|
||||||
callocFunc oldCalloc;
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
oldCalloc = dlsym(RTLD_NEXT, "calloc");
|
if(clm_gpuInit() != 0) {
|
||||||
|
printf("Failed to init GPU\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
clm_Vector *labels = NULL;
|
clm_Vector *labels = NULL;
|
||||||
unsigned int labelCount;
|
unsigned int labelCount;
|
||||||
@ -189,9 +172,9 @@ int main() {
|
|||||||
h = 30,
|
h = 30,
|
||||||
o = 10;
|
o = 10;
|
||||||
|
|
||||||
clm_Linear layer1 = clm_linearCreateRandom(i, h);
|
clm_Linear layers[] = {
|
||||||
clm_Linear layer2 = clm_linearCreateRandom(h, o);
|
clm_linearCreateRandom(i, h),
|
||||||
clm_Linear layers[] = {layer1, layer2};
|
clm_linearCreateRandom(h, o)};
|
||||||
clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
|
clm_NN nn = {layers, sizeof(layers) / sizeof(clm_Linear), 0.01};
|
||||||
|
|
||||||
for(unsigned int epoch = 0; epoch < 1; epoch++) {
|
for(unsigned int epoch = 0; epoch < 1; epoch++) {
|
||||||
@ -201,21 +184,8 @@ int main() {
|
|||||||
printf("\r%.2f%%", idx / (float) imageCount * 100);
|
printf("\r%.2f%%", idx / (float) imageCount * 100);
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
// printf("%u\n", idx);
|
|
||||||
// train(nn, train_data_x[idx], 2, train_data_y[idx], 1);
|
|
||||||
/*for(unsigned int f = 0; f < images[idx].length; f++) {
|
|
||||||
printf("%.2f ", images[idx].values[f]);
|
|
||||||
}
|
|
||||||
printf("\n");
|
|
||||||
for(unsigned int f = 0; f < labels[idx].length; f++) {
|
|
||||||
printf("%.2f ", labels[idx].values[f]);
|
|
||||||
}
|
|
||||||
printf("\n");*/
|
|
||||||
// printf("%.2f\n", labels.values[idx]);
|
|
||||||
|
|
||||||
train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
|
train(nn, images[idx].values, images[idx].length, labels[idx].values, labels[idx].length);
|
||||||
// train(nn, test, 784, target, 10);
|
|
||||||
// predict(nn, test, 784);
|
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
@ -256,9 +226,6 @@ int main() {
|
|||||||
printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
|
printf("Correct: %u -> %.2f", correct, (float) correct / imageCount * 100);
|
||||||
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
|
||||||
|
|
||||||
void *calloc(size_t nmemb, size_t size) {
|
clm_gpuDestroy();
|
||||||
// printf("CALLOC\n");
|
|
||||||
return oldCalloc(nmemb, size);
|
|
||||||
}
|
}
|
||||||
|
46
src/mat.cl
46
src/mat.cl
@ -4,30 +4,44 @@ typedef struct __attribute__((packed)) {
|
|||||||
char transposed;
|
char transposed;
|
||||||
} cl_GPUMat;
|
} cl_GPUMat;
|
||||||
|
|
||||||
__kernel void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
|
#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]
|
||||||
/*if(a.cols != b.rows) {
|
|
||||||
printf("Cannot multiply matrices (got %dx%d and %dx%d)\n", a.rows, a.cols, b.rows, b.cols);
|
|
||||||
return INVALID_MATRIX;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(out.rows != a.rows || out.cols != b.cols) {
|
|
||||||
printf("Cannot multiply matrices: output invalid shape (expected %dx%d, got %dx%d)\n", a.rows, b.cols, out.rows, out.cols);
|
|
||||||
return INVALID_MATRIX;
|
|
||||||
}*/
|
|
||||||
|
|
||||||
|
void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
|
||||||
uint idx = get_global_id(0);
|
uint idx = get_global_id(0);
|
||||||
if(idx >= matOut.rows * matOut.cols) return;
|
if(idx >= matOut.rows * matOut.cols) return;
|
||||||
|
|
||||||
uint i = idx / matOut.cols;
|
uint i = idx / matOut.cols;
|
||||||
uint j = idx % matOut.cols;
|
uint j = idx % matOut.cols;
|
||||||
|
|
||||||
// for(unsigned int i = 0; i < out.rows; i++) {
|
|
||||||
// for(unsigned int j = 0; j < out.cols; j++) {
|
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
for(unsigned int k = 0; k < matA.cols; k++) {
|
for(unsigned int k = 0; k < matA.cols; k++) {
|
||||||
sum += matA_values[i * matA.cols + k] * matB_values[k * matB.cols + j];
|
sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
|
||||||
}
|
}
|
||||||
matOut_values[i * matOut.cols + j] = sum;
|
matrixAt(matOut, matOut_values, i, j) = sum;
|
||||||
//}
|
}
|
||||||
//}
|
|
||||||
|
void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
|
||||||
|
uint idx = get_global_id(0);
|
||||||
|
if(idx >= matA.rows * matA.cols) return;
|
||||||
|
|
||||||
|
matA_values[idx] += matB_values[idx];
|
||||||
|
}
|
||||||
|
|
||||||
|
void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
|
||||||
|
uint idx = get_global_id(0);
|
||||||
|
if(idx >= mat.rows * mat.cols) return;
|
||||||
|
|
||||||
|
mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));
|
||||||
|
}
|
||||||
|
|
||||||
|
// clm_Matrix input;
|
||||||
|
// clm_Matrix weights;
|
||||||
|
// clm_Matrix bias;
|
||||||
|
// clm_Matrix output;
|
||||||
|
|
||||||
|
__kernel void linear_forward(cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
|
||||||
|
// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!
|
||||||
|
mat_multiply(weights, weights_values, input, input_values, out, out_values);
|
||||||
|
mat_add(out, out_values, bias, bias_values);
|
||||||
|
mat_sigmoid(out, out_values);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user