#include #include #include #include // include SSE intrinsics #if defined(_MSC_VER) #include #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #include #endif #define CLOCK_RATE_GHZ 2.26e9 static __inline__ uint64_t RDTSC() { uint32_t hi, lo; __asm__ volatile ( "rdtsc" : "=a"(lo), "=d"(hi) ); return (((uint64_t)hi) << 32) | ((uint64_t)lo); } static int sum_naive(int n, int *a) { int sum = 0; for (int i = 0; i < n; i++) { sum += a[i]; } return sum; } static int sum_unrolled(int n, int *a) { int sum = 0; // unrolled loop for (int i = 0; i < n / 4 * 4; i += 4) { sum += a[i+0]; sum += a[i+1]; sum += a[i+2]; sum += a[i+3]; } // tail case for (int i = n / 4 * 4; i < n; i++) { sum += a[i]; } return sum; } static int sum_vectorized(int n, int *a) { // WRITE YOUR VECTORIZED CODE HERE } static int sum_vectorized_unrolled(int n, int *a) { // UNROLL YOUR VECTORIZED CODE HERE } void benchmark(int n, int *a, int(*computeSum)(int, int*), char *name) { // warm up cache int sum = computeSum(n, a); // measure uint64_t beginCycle = RDTSC(); sum += computeSum(n, a); uint64_t cycles = RDTSC() - beginCycle; double microseconds = cycles/CLOCK_RATE_GHZ*1e6; // print results printf("%20s: ", name); if (sum == 2 * sum_naive(n, a)) { printf("%.2f microseconds\n", microseconds); } else { printf("ERROR!\n"); } } int main(int argc, char **argv) { const int n = 7777; // initialize the array with random values srand48(time(NULL)); int a[n] __attribute__((aligned(16))); for (int i = 0; i < n; i++) { a[i] = lrand48(); } // benchmark series of codes benchmark(n, a, sum_naive, "naive"); benchmark(n, a, sum_unrolled, "unrolled"); benchmark(n, a, sum_vectorized, "vectorized"); benchmark(n, a, sum_vectorized_unrolled, "vectorized unrolled"); return 0; }