nn-testing/src/mat.cl

// Descriptor for a matrix whose elements live in a separate flat float buffer.
// The struct is packed, so no padding is inserted between the members.
// NOTE(review): presumably mirrors a host-side struct passed by value as a kernel
// argument - confirm the host definition uses the same packing and field order.
typedef struct __attribute__((packed)) {
	uint rows; // number of rows as used by matrixAt (r ranges over rows)
	uint cols; // number of columns as used by matrixAt (c ranges over cols)
	char transposed; // zero: element (r, c) is at r * cols + c; non-zero: at c * rows + r (see matrixAt)
} cl_GPUMat;

// matrixAt(mat, mat_values, r, c): lvalue for element (r, c) of `mat` inside the flat buffer `mat_values`.
//   mat.transposed == 0 -> index r * mat.cols + c
//   mat.transposed != 0 -> index c * mat.rows + r
// Every argument and the whole expansion are parenthesized so that expression arguments
// such as `i + 1` index correctly. `mat` is evaluated more than once, so do not pass
// expressions with side effects.
#define matrixAt(mat, mat_values, r, c) ((mat_values)[(!(mat).transposed ? (r) * (mat).cols + (c) : (c) * (mat).rows + (r))])

// Computes one element of matOut = matA * matB per work-item.
//
// The work-item id (get_global_id(0)) selects which output element is produced;
// ids at or beyond matOut.rows * matOut.cols return without touching memory.
// The inner dimension is taken from matA.cols; no check is made that it matches
// matB.rows or that the output dimensions match the operands.
//
// The (i, j) pair is derived from the id according to matOut's layout, so the result
// is always stored at flat index idx of matOut_values. Callers that follow up with
// flat-indexed element-wise steps on the same buffer therefore see each work-item
// read back only the element it wrote itself, for transposed and non-transposed outputs.
void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {
	uint idx = get_global_id(0);
	if(idx >= matOut.rows * matOut.cols) return;

	// Invert matrixAt's index formula for matOut:
	//   not transposed: idx == i * cols + j
	//   transposed:     idx == j * rows + i
	// The divisor is non-zero here because rows * cols > idx >= 0.
	uint i = matOut.transposed ? idx % matOut.rows : idx / matOut.cols;
	uint j = matOut.transposed ? idx / matOut.rows : idx % matOut.cols;

	float sum = 0;
	for(unsigned int k = 0; k < matA.cols; k++) {
		sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);
	}

	// Same location as matrixAt(matOut, matOut_values, i, j) by construction of (i, j).
	matOut_values[idx] = sum;
}

// Adds matB to matA in place, one element per work-item: matA(r, c) += matB(r, c).
//
// The work-item id (get_global_id(0)) is the flat index into matA_values; ids at or
// beyond matA.rows * matA.cols return without touching memory.
// The id is decoded into (r, c) using matA's layout and matB is read through matrixAt,
// so the two operands may use different `transposed` settings. When both use the same
// layout and dimensions this reads matB_values[idx], exactly like a flat element-wise add.
// matB is assumed to have the same rows/cols as matA; this is not checked.
void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {
	uint idx = get_global_id(0);
	if(idx >= matA.rows * matA.cols) return;

	// Invert matrixAt's index formula for matA; the divisor is non-zero because
	// rows * cols > idx >= 0.
	uint r = matA.transposed ? idx % matA.rows : idx / matA.cols;
	uint c = matA.transposed ? idx / matA.rows : idx % matA.cols;

	matA_values[idx] += matrixAt(matB, matB_values, r, c);
}

// Replaces one element of the buffer with its logistic sigmoid, 1 / (1 + e^-x).
// The element is chosen by the work-item id (get_global_id(0)) used as a flat index;
// ids at or beyond mat.rows * mat.cols leave the buffer untouched.
void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {
	const uint gid = get_global_id(0);
	const uint elementCount = mat.rows * mat.cols;

	if(gid < elementCount) {
		const float x = mat_values[gid];
		mat_values[gid] = 1.0f / (1.0f + exp(-x));
	}
}

// Forward pass of a linear layer followed by a sigmoid, applied to every sample of a batch.
// For each b in [0, batchSize) the work-item runs, in order:
//   mat_multiply(weights, sample b of input) -> sample b of out
//   mat_add(sample b of out, bias)
//   mat_sigmoid(sample b of out)
// Sample b of a buffer starts b * rows * cols floats after the buffer's base pointer.
__kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {
	// NOTE/FIXME: there is no synchronization between the three steps, so they are only
	// consistent if mat_multiply stores its result at the same flat index (the work-item id)
	// that mat_add and mat_sigmoid update afterwards. Verify that mat_multiply guarantees
	// this when `out` is transposed.
	const uint inputStride = input.rows * input.cols;
	const uint outStride = out.rows * out.cols;

	unsigned int sample = 0;
	while(sample < batchSize) {
		__global float *sampleInput_values = input_values + sample * inputStride;
		__global float *sampleOut_values = out_values + sample * outStride;

		mat_multiply(weights, weights_values, input, sampleInput_values, out, sampleOut_values);
		mat_add(out, sampleOut_values, bias, bias_values);
		mat_sigmoid(out, sampleOut_values);

		sample++;
	}
}
Basic GPU math 2023-10-29 21:34:23 +01:00			`typedef struct __attribute__((packed)) {`
			`uint rows;`
			`uint cols;`
			`char transposed;`
Don't dynamically allocate memory for training 2023-10-29 01:15:22 +02:00			`} cl_GPUMat;`

More GPU stuff 2023-10-29 23:01:46 +01:00			`#define matrixAt(mat, mat_values, r, c) mat_values[(!mat.transposed ? r * mat.cols + c : c * mat.rows + r)]`
Basic GPU math 2023-10-29 21:34:23 +01:00
More GPU stuff 2023-10-29 23:01:46 +01:00			`void mat_multiply(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values, cl_GPUMat matOut, __global float *matOut_values) {`
Basic GPU math 2023-10-29 21:34:23 +01:00			`uint idx = get_global_id(0);`
			`if(idx >= matOut.rows * matOut.cols) return;`

			`uint i = idx / matOut.cols;`
			`uint j = idx % matOut.cols;`
Don't dynamically allocate memory for training 2023-10-29 01:15:22 +02:00
Basic GPU math 2023-10-29 21:34:23 +01:00			`float sum = 0;`
			`for(unsigned int k = 0; k < matA.cols; k++) {`
More GPU stuff 2023-10-29 23:01:46 +01:00			`sum += matrixAt(matA, matA_values, i, k) * matrixAt(matB, matB_values, k, j);`
Basic GPU math 2023-10-29 21:34:23 +01:00			`}`
More GPU stuff 2023-10-29 23:01:46 +01:00			`matrixAt(matOut, matOut_values, i, j) = sum;`
			`}`

			`void mat_add(cl_GPUMat matA, __global float *matA_values, cl_GPUMat matB, __global float *matB_values) {`
			`uint idx = get_global_id(0);`
			`if(idx >= matA.rows * matA.cols) return;`

			`matA_values[idx] += matB_values[idx];`
			`}`

			`void mat_sigmoid(cl_GPUMat mat, __global float *mat_values) {`
			`uint idx = get_global_id(0);`
			`if(idx >= mat.rows * mat.cols) return;`

			`mat_values[idx] = 1 / (1 + exp(-mat_values[idx]));`
			`}`

Basic batch implementation (WIP) 2023-10-31 14:30:09 +01:00			`__kernel void linear_forward(unsigned int batchSize, cl_GPUMat input, __global float *input_values, cl_GPUMat weights, __global float *weights_values, cl_GPUMat bias, __global float *bias_values, cl_GPUMat out, __global float *out_values) {`
More GPU stuff 2023-10-29 23:01:46 +01:00			`// FIXME mat_multiply possibly doesn't index at idx when out is transposed, which is important to ensure it always accesses the same index for all operations!`
Basic batch implementation (WIP) 2023-10-31 14:30:09 +01:00			`for(unsigned int b = 0; b < batchSize; b++) {`
			`__global float *batchInput_values = input_values + b * input.rows * input.cols;`
			`__global float *batchOut_values = out_values + b * out.rows * out.cols;`
			`mat_multiply(weights, weights_values, input, batchInput_values, out, batchOut_values);`
			`mat_add(out, batchOut_values, bias, bias_values);`
			`mat_sigmoid(out, batchOut_values);`
			`}`
Don't dynamically allocate memory for training 2023-10-29 01:15:22 +02:00			`}`