#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define REPEAT	1

/*
 * Allocate uninitialized storage for a size x size matrix of floats.
 *
 * Never returns NULL: if the byte count does not fit in size_t, or if
 * malloc() fails, a diagnostic is written to stderr and the process
 * exits with status 1. The caller releases the buffer with free().
 */
float *matrix_alloc(size_t size)
{
	/*
	 * Refuse sizes whose byte count would wrap around. Without this
	 * check the multiplication below could silently produce a small
	 * number and malloc() would return a buffer much smaller than the
	 * matrix the caller is about to index.
	 */
	if (size != 0 && size > SIZE_MAX / sizeof(float) / size) {
		fprintf(stderr, "matrix_alloc(): matrix size too large\n");
		exit(1);
	}

	float *a = malloc(sizeof(*a) * size * size);

	if (a == NULL) {
		perror("matrix_alloc()");
		exit(1);
	}

	return a;
}

/*
 * Fill a size x size row-major matrix with pseudo-random values.
 *
 * The values come from a linear congruential generator whose state is
 * masked to 48 bits on every step. The state lives in a function-local
 * static, so each call continues the sequence where the previous call
 * stopped. Every element is the upper 32 bits of the 48-bit state
 * divided by (float)0xffffffff.
 */
void matrix_random(size_t size, float *a)
{
	static uint64_t state = 0x92a834c7df2e983d;
	float *out = a;

	for (size_t row = 0; row < size; row++) {
		for (size_t col = 0; col < size; col++) {
			/* Advance the generator, keeping only the low 48 bits. */
			state = (state * 25214903917 + 11) & 0xffffffffffff;

			uint64_t top = state >> 16;

			*out++ = (float)top / (float)0xffffffff;
		}
	}
}

/*
 * Return the sum of all elements of a size x size row-major matrix.
 *
 * Elements are added in memory order into a single-precision
 * accumulator. A size of 0 yields 0.
 */
float matrix_sum(size_t size, const float *a)
{
	const float *cursor = a;
	float total = 0.0f;

	for (size_t row = 0; row < size; row++) {
		const float *row_end = cursor + size;

		while (cursor != row_end) {
			total += *cursor++;
		}
	}

	return total;
}

/* Edge length, in elements, of the square tiles handled by the block_* helpers. */
#define BLOCK	16

/*
 * Set every element of one BLOCK x BLOCK tile to zero.
 *
 * wx must point to at least BLOCK * BLOCK floats.
 */
void block_zero(float *wx)
{
	float *end = wx + BLOCK * BLOCK;

	for (float *p = wx; p != end; p++) {
		*p = 0.0f;
	}
}

/*
 * Accumulate the product of two BLOCK x BLOCK tiles into wx.
 *
 * wa is read row by row. wbt is also read row by row, i.e. it is
 * indexed as [column][k], so it is treated as the right-hand tile
 * stored transposed:
 *
 *     wx[r][c] += sum over k of wa[r][k] * wbt[c][k]
 *
 * Each dot product is built up in a local float and added to wx once
 * per output element.
 */
void block_mul_accumulate(float *wx, const float *wa, const float *wbt)
{
	for (size_t r = 0; r < BLOCK; r++) {
		const float *a_row = wa + r * BLOCK;
		float *out_row = wx + r * BLOCK;

		for (size_t c = 0; c < BLOCK; c++) {
			const float *bt_row = wbt + c * BLOCK;
			float dot = 0.0f;

			for (size_t k = 0; k < BLOCK; k++) {
				dot += a_row[k] * bt_row[k];
			}

			out_row[c] += dot;
		}
	}
}



/*
 * Compute x = a * b for size x size row-major matrices, working on
 * BLOCK x BLOCK tiles.
 *
 * Two scratch matrices are obtained from matrix_alloc() on every call
 * and freed before returning:
 *   wa  - a, repacked so that each tile is contiguous in memory
 *   wbt - the transpose of b, repacked the same way
 *
 * Each output tile is then built in a stack buffer by summing
 * block_mul_accumulate() over the shared tile index and copied into x.
 *
 * Only size / BLOCK whole tiles are processed per dimension, so size is
 * expected to be a multiple of BLOCK; any leftover rows and columns of
 * x are not written.
 */
void matrix_multiply(size_t size, float *x, const float *a, const float *b)
{
	const size_t tile_len = BLOCK * BLOCK;
	const size_t tiles = size / BLOCK;

	float *wa = matrix_alloc(size);
	float *wbt = matrix_alloc(size);
	float acc[BLOCK * BLOCK];

	/* Pass 1: repack a and the transpose of b into tile-contiguous form. */
	for (size_t ti = 0; ti < tiles; ti++) {
		for (size_t tj = 0; tj < tiles; tj++) {
			const size_t tile_start = (ti * tiles + tj) * tile_len;
			float *a_tile = wa + tile_start;
			float *bt_tile = wbt + tile_start;

			for (size_t r = 0; r < BLOCK; r++) {
				const size_t row = ti * BLOCK + r;

				for (size_t c = 0; c < BLOCK; c++) {
					const size_t col = tj * BLOCK + c;

					a_tile[r * BLOCK + c] = a[row * size + col];
					/* Swapped indices: this is where b gets transposed. */
					bt_tile[r * BLOCK + c] = b[col * size + row];
				}
			}
		}
	}

	/* Pass 2: form each output tile and scatter it back into x. */
	for (size_t ti = 0; ti < tiles; ti++) {
		/* Tiles (ti, 0..tiles-1) of wa are laid out back to back. */
		const float *a_strip = wa + ti * tiles * tile_len;

		for (size_t tj = 0; tj < tiles; tj++) {
			/* Likewise tiles (tj, 0..tiles-1) of the transposed b. */
			const float *bt_strip = wbt + tj * tiles * tile_len;

			block_zero(acc);
			for (size_t tk = 0; tk < tiles; tk++) {
				block_mul_accumulate(acc,
						     a_strip + tk * tile_len,
						     bt_strip + tk * tile_len);
			}

			/* Top-left element of output tile (ti, tj) inside x. */
			float *dst = x + ti * BLOCK * size + tj * BLOCK;

			for (size_t r = 0; r < BLOCK; r++) {
				for (size_t c = 0; c < BLOCK; c++) {
					dst[r * size + c] = acc[r * BLOCK + c];
				}
			}
		}
	}

	free(wa);
	free(wbt);
}

/*
 * Benchmark driver: multiply two pseudo-random size x size matrices
 * REPEAT times and print the sum of the product's elements.
 *
 * argv[1] is the matrix size. It is parsed with strtol() in base 0
 * (decimal, 0x hex or leading-0 octal) and must be a positive multiple
 * of BLOCK small enough for the matrix byte count to fit in size_t.
 *
 * Returns 0 on success and 1 on a usage or argument error.
 */
int main(int argc, char **argv)
{
	if (argc != 2) {
		printf("Usage: %s [size]\n", argv[0]);
		return 1;
	}

	/*
	 * Parse into a signed long first. Storing strtol()'s result
	 * directly in a size_t would turn a negative argument into a huge
	 * positive size that could still pass the multiple-of-BLOCK test.
	 * An out-of-range argument saturates to LONG_MAX, which is caught
	 * by the checks below.
	 */
	char *end = NULL;
	long parsed = strtol(argv[1], &end, 0);

	if (end == argv[1] || *end != '\0' || parsed <= 0) {
		printf("Invalid size '%s'\n", argv[1]);
		return 1;
	}

	size_t size = (size_t)parsed;

	if ((size % BLOCK) != 0) {
		printf("Size %zu should be a multiple of %d\n", size, BLOCK);
		return 1;
	}

	/* size * size * sizeof(float) must be representable in size_t. */
	if (size > SIZE_MAX / sizeof(float) / size) {
		printf("Size %zu is too large\n", size);
		return 1;
	}

	float *a = matrix_alloc(size);
	float *b = matrix_alloc(size);
	float *x = matrix_alloc(size);

	matrix_random(size, a);
	matrix_random(size, b);

	for (int r = 0; r < REPEAT; r++) {
		matrix_multiply(size, x, a, b);
	}

	printf("%g\n", matrix_sum(size, x));

	free(a);
	free(b);
	free(x);

	return 0;
}
