参考自:《深入理解计算机系统》第5章
优化方式:
代码移动(code motion):识别要执行多次(例如在循环里)但是计算结果不会改变的计算。
减少过程调用:减少循环中的判断。
消除不必要的存储器引用:在临时变量中存放结果,消除每次循环迭代中从存储器中读出并将更新值写回的需要。
循环展开:通过每次迭代计算的元素的数量,减少循环的迭代次数。
提高并行性:利用单元的流水线能力。
重新结合变换:减小计算中关键路径上操作的数量,通过更好地利用功能单元的流水线能力得到更好地性能
vec.h
typedef int data_t;//typedef double data_t;/* Create abstract data type for vector */typedef struct{ long int len; data_t *data;}vec_rec, *vec_ptr;#define IDENT 0#define OP +vec_ptr new_vec(long int len);void free_vec(vec_ptr v);int get_vec_element(vec_ptr v, long int index, data_t *dest);long int vec_length(vec_ptr v);void combine1(vec_ptr v, data_t *dest);void combine2(vec_ptr v, data_t *dest);void combine3(vec_ptr v, data_t *dest);void combine4(vec_ptr v, data_t *dest);void combine5(vec_ptr v, data_t *dest);void combine6(vec_ptr v, data_t *dest);void combine7(vec_ptr v, data_t *dest);
vec.cpp
#include "vec.h"#include/* Create vector of specified length */vec_ptr new_vec(long int len){ /* Allocate header structure */ vec_ptr result = (vec_ptr)malloc(sizeof(vec_rec)); if (!result) { return NULL; /* Couldn't allocate storage */ } result->len = len; /* Allocate array */ if (len > 0) { data_t *data = (data_t *)calloc(len, sizeof(data_t)); if (!data) { free((void *)result); return NULL; } result->data = data; } else { result->data = NULL; } return result;}void free_vec(vec_ptr v){ if (v) { if (v->data) { free((void*)v->data); } free(v); }}/* * Retrieve vector element and store at dest. * Return 0 (out of bounds) or 1 (successful) */int get_vec_element(vec_ptr v, long int index, data_t *dest){ if (index < 0 || index >= v->len) { return 0; } *dest = v->data[index]; return 1;}/* Return length of vector */long int vec_length(vec_ptr v){ return v->len;}/* Implementation with maximum use of data abstraction */void combine1(vec_ptr v, data_t *dest){ long int i; *dest = IDENT; for (i = 0; i < vec_length(v); i++) { data_t val; get_vec_element(v, i, &val); *dest = *dest OP val; }}// 代码移动(code motion)/* Move call to vec_length out of loop */void combine2(vec_ptr v, data_t *dest){ long int i; long int length = vec_length(v); *dest = IDENT; for (i = 0; i < length; i++) { data_t val; get_vec_element(v, i, &val); *dest = *dest OP val; }}data_t *get_vec_start(vec_ptr v){ return v->data;}// 减少过程调用/* Direct access to vector data */void combine3(vec_ptr v, data_t *dest){ long int i; long int length = vec_length(v); data_t *data = get_vec_start(v); *dest = IDENT; for (i = 0; i < length; i++) { *dest = *dest OP data[i]; }}// 消除不必要的存储器引用/* Accumulate result in local variable */void combine4(vec_ptr v, data_t *dest){ long int i; long int length = vec_length(v); data_t *data = get_vec_start(v); data_t acc = IDENT; for (i = 0; i < length; i++) { acc = acc OP data[i]; } *dest = acc;}// 循环展开(两次循环展开)/* Unroll loop by 2 */void combine5(vec_ptr v, data_t *dest){ long int i; long int length = vec_length(v); long int limit = length - 1; data_t *data = get_vec_start(v); data_t acc = IDENT; /* Combine 2 elements at a time */ for (i = 0; i < limit; i += 2) { acc = (acc OP data[i]) OP data[i + 1]; } /* Finish any remaining elements */ for (; i < length; i++) { acc = acc OP data[i]; } *dest = acc;}// 提高并行性(两路并行)/* Unroll loop by 2, 2-way parallelism */void combine6(vec_ptr v, data_t *dest){ long int i; long int length = vec_length(v); long int limit = length - 1; data_t *data = get_vec_start(v); data_t acc0 = IDENT; data_t acc1 = IDENT; /* Combine 2 elements at a time */ for (i = 0; i < limit; i += 2) { acc0 = acc0 OP data[i]; acc1 = acc1 OP data[i + 1]; } /* Finish any remaining elements */ for (; i < length; i++) { acc0 = acc0 OP data[i]; } *dest = acc0 OP acc1;}// 重新结合变换/* Change associativity of combining operation */void combine7(vec_ptr v, data_t *dest){ long int i; long int length = vec_length(v); long int limit = length - 1; data_t *data = get_vec_start(v); data_t acc = IDENT; /* Combine2 elements at a time */ for (i = 0; i < limit; i += 2) { acc = acc OP(data[i] OP data[i + 1]); } /* Finish any remaining elements */ for (; i < length; i++) { acc = acc OP data[i]; } *dest = acc;}
main.cpp
#include#include #include "vec.h"int main(){ vec_ptr pt = new_vec(10000); for (int i = 0; i < pt->len; i++) { *(pt->data + i) = i + 1; } data_t dest; clock_t start, end; start = clock(); for (int i = 0; i < 1000; i++) { combine1(pt, &dest); } end = clock(); printf("1--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine2(pt, &dest); } end = clock(); printf("2--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine3(pt, &dest); } end = clock(); printf("3--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine4(pt, &dest); } end = clock(); printf("4--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine5(pt, &dest); } end = clock(); printf("5--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine6(pt, &dest); } end = clock(); printf("6--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); start = clock(); for (int i = 0; i < 1000; i++) { combine7(pt, &dest); } end = clock(); printf("7--time:%f----%d\n", (double)(end - start) / CLK_TCK, dest); free_vec(pt); return 0;}
运行结果