#include #include #include #include #include #include #include #include #include "matrix.h" #include "util.h" #include "instrumentation.h" void matrix_init(struct matrix * m, size_t nrows, size_t ncolumns) { m->nrows = nrows; m->ncolumns = ncolumns; size_t size = nrows*ncolumns*sizeof(*m->data); m->data = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, 0, 0); } void matrix_fini(struct matrix * m) { munmap(m->data, m->nrows*m->ncolumns*sizeof(*m->data)); } struct matrix * matrix_create(size_t nrows, size_t ncolumns) { struct matrix * m = malloc(sizeof(*m)); matrix_init(m, nrows, ncolumns); return m; } void matrix_destroy(struct matrix * m) { matrix_fini(m); free(m); } void matrix_populate(struct matrix * m) { uint32_t r, c; double n = m->nrows*m->ncolumns; for (r = 0; r < m->nrows; r++) { for (c = 0; c < m->ncolumns; c++) *rm_pos(m,r,c) = 100*(r*m->ncolumns+c)/n; } } struct matrix * matrix_convert(const struct matrix * m, double (*elem)(const struct matrix * m, size_t r, size_t c), double * (*pos)(const struct matrix * m, size_t r, size_t c)) { matrix * result = matrix_create(m->nrows, m->ncolumns); uint32_t r, c; for (r = 0; r < m->nrows; r++) { for (c = 0; c < m->ncolumns; c++) *pos(result,r,c) = elem(m,r,c); } return result; } void matrix_show(const struct matrix * m, FILE * fp) { uint32_t r, c; for (r = 0; r < m->nrows; r++) { for (c = 0; c < m->ncolumns; c++) { fprintf(fp, "%lf\t", m->data[r*m->ncolumns+c]); } fprintf(fp, "\n"); } } bool matrix_equals(const struct matrix * a, const struct matrix * b) { if (a->nrows != b->nrows) return false; if (a->ncolumns != b->ncolumns) return false; uint32_t r, c; double epsilon = 10e-6; for (r = 0; r < a->nrows; r++) { for (c = 0; c < a->ncolumns; c++) if (fabs(a->data[r*a->ncolumns+c] - b->data[r*a->ncolumns+c]) > epsilon) { printf("%lf - %lf\n", a->data[r*a->ncolumns+c], b->data[r*a->ncolumns+c]); return false; } } return true; } struct matrix * matrix_multiply_rm(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); memset(c->data, 0, c->nrows*c->ncolumns*sizeof(*c->data)); #ifdef PRINT_TIMINGS printf("---rm(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif #pragma omp parallel for schedule(static,8) for (size_t i = 0; i < c->nrows; i++) { for (size_t j = 0; j < c->ncolumns; j++) { for (size_t k = 0; k < a->ncolumns; k++) c->data[i*c->ncolumns+j] += a->data[i*a->ncolumns+k] * b->data[k*b->ncolumns+j]; } } #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_trm(const struct matrix * a, const struct matrix * b, size_t bsize) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); memset(c->data, 0, c->nrows*c->ncolumns*sizeof(*c->data)); #ifdef PRINT_TIMINGS printf("---trm(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif size_t i, j, jj, k, kk; for (jj = 0; jj < c->nrows; jj += bsize) { for (kk = 0; kk < a->ncolumns; kk += bsize) { for (i = 0; i < c->nrows; i++) { for (j = jj; j < min(jj+bsize, c->ncolumns); j++) { for (k = kk; k < min(kk+bsize, a->ncolumns); k++) c->data[i*c->ncolumns+j] += a->data[i*a->ncolumns+k] * b->data[k*b->ncolumns+j]; } } } } #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_cm(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); memset(c->data, 0, c->nrows*c->ncolumns*sizeof(*c->data)); #ifdef PRINT_TIMINGS printf("---cm(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif size_t i, j, k; for (i = 0; i < c->nrows; i++) { #pragma omp parallel for for (j = 0; j < c->ncolumns; j++) { for (k = 0; k < a->ncolumns; k++) c->data[i*c->nrows+j] += a->data[i*a->nrows+k] * b->data[k*b->nrows+j]; } } #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_hm(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); memset(c->data, 0, c->nrows*c->ncolumns*sizeof(*c->data)); uint32_t m = min(nlz(c->nrows-1), nlz(c->ncolumns-1)); #ifdef PRINT_TIMINGS printf("---hm(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif #pragma omp parallel for schedule(static,8) for (size_t i = 0; i < c->nrows; i++) { for (size_t j = 0; j < c->ncolumns; j++) { size_t dc = 0, sc = 0, rowc; for (int32_t l = 31-m; l >= 0; l--) { size_t il = 4*((i >> l) & 1); size_t jl = 2*((j >> l) & 1); rowc = (8*sc) | il | jl; dc = (dc << 2) | ((0x361E9CB4 >> rowc) & 3); sc = (0x8FE65831 >> rowc) & 3; } for (size_t k = 0; k < a->ncolumns; k++) { size_t da = 0, db = 0; size_t sa = 0, sb = 0; size_t rowa, rowb; { for (int32_t l = 31-m; l >= 0; l--) { size_t il = 4*((i >> l) & 1); size_t jl = 2*((j >> l) & 1); size_t kl = (k >> l) & 1; rowa = (8*sa) | il | (2*kl); da = (da << 2) | ((0x361E9CB4 >> rowa) & 3); sa = (0x8FE65831 >> rowa) & 3; rowb = (8*sb) | (4*kl) | jl; db = (db << 2) | ((0x361E9CB4 >> rowb) & 3); sb = (0x8FE65831 >> rowb) & 3; } } c->data[dc] += a->data[da] * b->data[db]; } } } #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_hm3(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); memset(c->data, 0, c->nrows*c->ncolumns*sizeof(*c->data)); uint32_t m = min(nlz(c->nrows-1), nlz(c->ncolumns-1)); #ifdef PRINT_TIMINGS printf("---hm3(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif const uint8_t luc1[32] = { 0, 2, 1, 2, 3, 1, 2, 1, 0, 2, 3, 3, 1, 0, 2, 1, 2, 3, 3, 3, 1, 0, 0, 0, 2, 3, 1, 2, 3, 1, 0, 0 }; const uint8_t luc2[32] = { 1, 0, 0, 2, 3, 1, 0, 0, 0, 0, 2, 3, 1, 2, 1, 0, 2, 3, 1, 0, 2, 3, 3, 3, 3, 3, 3, 1, 0, 0, 2, 1 }; #define C1(x) (luc1[(x)]) #define C2(x) (luc2[(x)]) #pragma omp parallel for schedule(static,8) for ( uint32_t i=0; i < c->nrows; i++) { #define COUNTLOOP(a) for ( uint8_t ll=31-m-a; ll; ll-- ) #define COUNTFROM 7 uint32_t imapc = 0, jmapc = 0, kmapc = 0; uint32_t iwork = i>>(COUNTFROM-1); COUNTLOOP(COUNTFROM) { imapc |= iwork & 4; iwork >>= 1; imapc <<= 1; } for ( uint32_t j = 0, jwork; j < c->ncolumns; j++) { if ( !(j%(2<>COUNTFROM; COUNTLOOP(COUNTFROM) { jmapc |= 2 & jwork; jwork >>= 1; jmapc <<= 1; } } uint64_t da,db,dc; uint8_t rowc, rowa, rowb; rowc ^= rowc, dc ^= dc; uint32_t imap = imapc, jmap = jmapc; COUNTLOOP(COUNTFROM) { jmap >>= 1, imap >>= 1, dc <<= 2; rowc |= (imap&4) | (jmap&2); dc |= C1(rowc) & 3; rowc = C2(rowc)<<3; } #define CL(a,b) do { \ rowc |= ((i>>a) & 4) | ((j>>b) & 2); \ dc <<= 2; \ dc |= C1(rowc);\ rowc = C2(rowc);\ rowc <<= 3; \ } while(0) // Approaching bottom, C CL(5,6); CL(4,5); CL(3,4); CL(2,3); CL(1,2); // l=2 dc <<= 2; rowc |= (i & 4) | ((j>>1) & 2); dc |= C1(rowc); rowc = C2(rowc) << 3; // l=1 dc <<= 2; rowc |= ((i<<1)&4) | (j&2); dc |= C1(rowc); rowc = C2(rowc) << 3; // l=0 rowc |= ((i<<2) & 4) | ((j<<1) & 2); dc = (dc << 2) | C1(rowc); for (uint32_t kk = 0, kwork; kk < a->ncolumns; kk+=(2<>(COUNTFROM); COUNTLOOP(COUNTFROM) { kmapc |= 2 & kwork; kwork >>= 1; kmapc <<= 1; } for (uint32_t k = kk; k < kk+(2<>= 1, jmap >>= 1; rowb |= (4&kmap) | (2&jmap); kmap >>= 1; rowa |= (4&imap) | (2&kmap); da = (da<<2) | C1(rowa), db = (db<<2) | C1(rowb); rowa = C2(rowa)<<3, rowb = C2(rowb)<<3; } #define AL(a,b) do { \ rowa |= 2 & (k>>a); \ da <<= 2; \ rowa |= 4 & (i>>b); \ da |= C1(rowa); \ rowa = C2(rowa)<<3; \ } while (0) #define BL(a,b) do { \ rowb |= 2 & (j>>a); \ db <<= 2; \ rowb |= 4 & (k>>b); \ db |= C1(rowb); \ rowb = C2(rowb)<<3; \ } while ( 0 ) // Approaching bottom, A, B AL(6,5); BL(6,5); AL(5,4); BL(5,4); AL(4,3); BL(4,3); AL(3,2); BL(3,2); AL(2,1); BL(2,1); // A, l=2 rowa |= 2 & (k >> 1); da <<= 2; rowa |= 4 & i; da |= C1(rowa); rowa = C2(rowa)<<3; // B, l=2 rowb |= (2 & (j>>1)); db <<= 2; rowb |= 4 & k; db |= C1(rowb); rowb = C2(rowb); rowb <<= 3; imap = i<<1; // A, l=1 rowa |= (imap&4) | (k&2); da <<= 2; da |= C1(rowa); rowa = C2(rowa)<<3; kmap = k<<1; // B, l=1 db <<= 2; rowb |= (kmap&4) | (j&2); db |= C1(rowb); rowb = C2(rowb); rowb <<= 3; // A, l=0 imap<<=1; rowa |= (imap&4) | (kmap&2); da <<= 2; da |= C1(rowa); // B, l=1 db <<= 2; rowb |= ((k<<2)&4) | ((j<<1)&2); db |= C1(rowb); c->data[dc] += a->data[da] * b->data[db]; } } } } #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_hm2(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); memset(c->data, 0, c->nrows*c->ncolumns*sizeof(*c->data)); uint32_t m = min(nlz(c->nrows-1), nlz(c->ncolumns-1)); #ifdef PRINT_TIMINGS printf("---hm2(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif static const uint8_t lut[16] = { 0x4,0x1,0xF,0x2,0x0,0xB,0x5,0x6,0xA,0x7,0x9,0xC,0xE,0xD,0x3,0x8 }; #pragma omp parallel for schedule(static,8) for (size_t i = 0; i < c->nrows; i++) { for (size_t j = 0; j < c->ncolumns; j++) { uint64_t ils[31-m], jls[31-m]; size_t dc = 0, sc = 0; for (int32_t l = 31-m; l >= 0; l--) { ils[l] = ((i >> l) & 1) << 1; jls[l] = (j >> l) & 1; sc = lut[(sc & 0xCul) | ils[l] | jls[l]]; dc = (dc << 2) | (sc & 3); } for (size_t k = 0; k < a->ncolumns; k++) { size_t da = 0, db = 0; size_t sa = 0, sb = 0; { for (int64_t l = 31-m; l >= 0; l--) { uint64_t kl = (k >> l) & 1; sa = lut[(sa & 0xCul) | ils[l] | kl]; da = (da << 2) | (sa & 3); sb = lut[(sb & 0xCul) | (kl << 1) | jls[l]]; db = (db << 2) | (sb & 3); } } c->data[dc] += a->data[da] * b->data[db]; } } } #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_zm(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); memset(c->data, 0, c->nrows*c->ncolumns*sizeof(*c->data)); #ifdef PRINT_TIMINGS printf("---zm(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif for (size_t i = 0; i < c->nrows; i++) { #pragma omp parallel for schedule(static,8) for (size_t j = 0; j < c->ncolumns; j++) { uint64_t di = i; di = (di | (di << 16)) & 0x0000FFFF0000FFFFul; di = (di | (di << 8)) & 0x00FF00FF00FF00FFul; di = (di | (di << 4)) & 0x0F0F0F0F0F0F0F0Ful; di = (di | (di << 2)) & 0x3333333333333333ul; di = (di | (di << 1)) & 0x5555555555555555ul; uint64_t dj = j; dj = (dj | (dj << 16)) & 0x0000FFFF0000FFFFul; dj = (dj | (dj << 8)) & 0x00FF00FF00FF00FFul; dj = (dj | (dj << 4)) & 0x0F0F0F0F0F0F0F0Ful; dj = (dj | (dj << 2)) & 0x3333333333333333ul; dj = (dj | (dj << 1)) & 0x5555555555555555ul; uint64_t zc = dj << 1 | di; for (size_t k = 0; k < a->ncolumns; k++) { uint64_t dk = k; dk = (dk | (dk << 16)) & 0x0000FFFF0000FFFFul; dk = (dk | (dk << 8)) & 0x00FF00FF00FF00FFul; dk = (dk | (dk << 4)) & 0x0F0F0F0F0F0F0F0Ful; dk = (dk | (dk << 2)) & 0x3333333333333333ul; dk = (dk | (dk << 1)) & 0x5555555555555555ul; uint64_t za = dk << 1 | di; uint64_t zb = dj << 1 | dk; c->data[zc] += a->data[za] * b->data[zb]; } } } #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_atlas(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); #ifdef PRINT_TIMINGS printf("---atlas(%ld,%ld,%ld)---\n", a->nrows, a->nrows, sizeof(a->data[0])); start_logging(); #endif cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a->nrows, b->ncolumns, b->nrows, 1.0, a->data, a->ncolumns, b->data, b->ncolumns, 0.0, c->data, c->ncolumns); #ifdef PRINT_TIMINGS stop_logging(); printf("---"); #endif return c; } struct matrix * matrix_multiply_verify(const struct matrix * a, const struct matrix * b) { DEBUG_ASSERT(a->ncolumns == b->nrows); matrix * c = matrix_create(a->nrows, b->ncolumns); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, a->nrows, b->ncolumns, b->nrows, 1.0, a->data, a->ncolumns, b->data, b->ncolumns, 0.0, c->data, c->ncolumns); return c; }