PowerPoint Presentation
Lecture 12
Optimization
Copyright By PowCoder代写 加微信 powcoder
COMP1411: Introduction to Computer Systems
Dr. Xianjin XIA
Department of Computing,
The Hong Kong Polytechnic University
Spring 2022
These slides are only intended to use internally. Do not publish it anywhere without permission.
Acknowledgement: These slides are based on the textbook (Computer Systems: A Programmer’s Perspective) and its slides.
Levels of optimization for a single program
Executable
Executable
Introducing several program optimization approaches
Code motion
Share common subexpressions
Reducing procedure calls
Reducing memory accesses
Introducing popular concepts of computing
Code motion
Reduce frequency with which computation performed
If it will always produce same result
Especially moving code out of loop
int ni = n*i;
for (j = 0; j < n; j++)
a[ni+j] = b[j];
void set_row(double *a, double *b,
long i, long n)
for (j = 0; j < n; j++)
a[n*i+j] = b[j];
Compiler-Generated Code Motion
testq %rcx, %rcx # Test n
jle .L4 # If 0, goto done
movq %rcx, %rax # rax = n
imulq %rdx, %rax # rax *= i, %rdx:i
leaq (%rdi,%rax,8), %rdx # rowp = a + n*i*8
movl $0, %r8d # j = 0
.L3: # loop:
movq (%rsi,%r8,8), %rax # t = b[j]
movq %rax, (%rdx) # *rowp = t
addq $1, %r8 # j++
addq $8, %rdx # rowp++
cmpq %r8, %rcx # Compare n:j
jg .L3 # If >, goto loop
.L4: # done:
long ni = n*i;
double *rowp = a+ni;
for (j = 0; j < n; j++)
*rowp++ = b[j];
void set_row(double *a, double *b,
long i, long n)
for (j = 0; j < n; j++)
a[n*i+j] = b[j];
Share Common Subexpressions
Reuse portions of expressions
/* Sum neighbors of i,j */
up = val[(i-1)*n + j ];
down = val[(i+1)*n + j ];
left = val[i*n + j-1];
right = val[i*n + j+1];
sum = up + down + left + right;
long inj = i*n + j;
up = val[inj - n];
down = val[inj + n];
left = val[inj - 1];
right = val[inj + 1];
sum = up + down + left + right;
3 multiplications: i*n, (i–1)*n, (i+1)*n
1 multiplication: i*n
leaq 1(%rsi), %rax # i+1
leaq -1(%rsi), %r8 # i-1
imulq %rcx, %rsi # i*n
imulq %rcx, %rax # (i+1)*n
imulq %rcx, %r8 # (i-1)*n
addq %rdx, %rsi # i*n+j
addq %rdx, %rax # (i+1)*n+j
addq %rdx, %r8 # (i-1)*n+j
imulq %rcx, %rsi # i*n
addq %rdx, %rsi # i*n+j
movq %rsi, %rax # i*n+j
subq %rcx, %rax # i*n+j-n
leaq (%rsi,%rcx), %rcx # i*n+j+n
Procedure calls
Procedure to Convert String to Lower Case
void lower(char *s)
for (i = 0; i < strlen(s); i++)
if (s[i] >= ‘A’ && s[i] <= 'Z')
s[i] -= ('A' - 'a');
lower1 0 20000 40000 60000 80000 100000 120000 140000 160000 180000 200000 220000 240000 260000 280000 300000 320000 340000 360000 380000 400000 420000 440000 460000 480000 500000 0 0.286912 1.1470389999999999 2.5802670000000001 4.5866410000000073 7.1661459999999941 10.318951999999999 14.044786999999999 18.344017000000001 23.216484999999999 28.673535999999999 34.707457000000012 41.304167 48.505589000000001 56.283847000000002 64.623097999999899 73.541931000000005 83.023829999999975 93.129919999999998 103.7657419999999 114.97881099999999 126.765697 139.12814400000019 152.06679399999999 165.5780470000002 179.704319 String length
CPU seconds
Procedure calls
Overall performance, string of length N
N calls to strlen
Require NxN times
Overall O(N2) performance
/* My version of strlen */
size_t strlen(const char *s)
size_t length = 0;
while (*s != '\0') {
return length;
Procedure calls
Improving performance
Move call to strlen outside of loop
Since result does not change from one iteration to another
void lower(char *s)
int len = strlen(s);
for (i = 0; i < len; i++)
if (s[i] >= ‘A’ && s[i] <= 'Z')
s[i] -= ('A' - 'a');
Procedure calls
lower1 0 20000 40000 60000 80000 100000 120000 140000 160000 180000 200000 220000 240000 260000 280000 300000 320000 340000 360000 380000 400000 420000 440000 460000 480000 500000 0 0.286912 1.1470389999999999 2.5802670000000001 4.5866410000000073 7.1661459999999941 10.318951999999999 14.044786999999999 18.344017000000001 23.216484999999999 28.673535999999999 34.707457000000012 41.304167 48.505589000000001 56.283847000000002 64.623097999999899 73.541931000000005 83.023829999999975 93.129919999999998 103.7657419999999 114.97881099999999 126.765697 139.12814400000019 152.06679399999999 165.5780470000002 179.704319 lower2 0 20000 40000 60000 80000 100000 120000 140000 160000 180000 200000 220000 240000 260000 280000 300000 320000 340000 360000 380000 400000 420000 440000 460000 480000 500000 0 2.9000000000000098E-5 5.7000000000000098E-5 8.6000000000000098E-5 1.15E-4 1.4300000000000001E-4 1.7200000000000001E-4 2.0000000000000001E-4 2.2900000000000001E-4 2.5700000000000001E-4 2.8600000000000001E-4 3.1500000000000001E-4 3.4299999999999999E-4 3.7200000000000102E-4 4.0099999999999999E-4 4.3000000000000102E-4 4.58000000000001E-4 4.87000000000001E-4 5.1599999999999997E-4 5.45000000000001E-4 5.7300000000000103E-4 6.02E-4 6.3100000000000103E-4 6.5900000000000095E-4 6.88000000000001E-4 7.1700000000000095E-4 String length
CPU seconds
Memory matters
/* Sum rows is of n X n matrix a
and store in vector b */
void sum_rows1(double *a, double *b, long n) {
long i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
b[i] += a[i*n + j];
double A[9] =
{ 0, 1, 2,
4, 8, 16,}
32, 64, 128 };
double B[3] = A+3;
sum_rows1(A, B, 3);
i = 0: [3, 8, 16]
init: [4, 8, 16]
i = 1: [3, 28, 16]
i = 2: [3, 28, 224]
Value of B:
Memory matters
# sum_rows1 inner loop
addsd (%rcx), %xmm0 # FP add
addq $8, %rcx
movsd %xmm0, (%rsi,%r8,8) # FP store
/* Sum rows is of n X n matrix a
and store in vector b */
void sum_rows1(double *a, double *b, long n) {
long i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++)
b[i] += a[i*n + j];
In each inner loop iteration, there is a memory write time consuming
Memory matters
# sum_rows2 inner loop
addsd (%rcx), %xmm0 # FP Add
addq $8, %rcx
decq %rax
/* Sum rows is of n X n matrix a
and store in vector b */
void sum_rows2(double *a, double *b, long n) {
long i, j;
for (i = 0; i < n; i++) {
double val = 0;
for (j = 0; j < n; j++)
val += a[i*n + j];
b[i] = val;
Removing memory writes in the inner loop
Embedded Computers
Computers are becoming smaller and smaller …
Internet-of-Things
Things are getting smarter
Internet-of-Things
The vision: connecting everything
Internet-of-Things
Application domains
Big Data and Cloud
Knowledge management
Turning data into wisdom, learn from data
Artificial Intelligence
Learning and making decisions from big data
Deep Neural Networks
Edge Computing
/docProps/thumbnail.jpeg
程序代写 CS代考 加微信: powcoder QQ: 1823890830 Email: powcoder@163.com