guest@blog.cmj.tw: ~/posts $

Memory Access


最扎實的時候 就是念研究所階段了

一開始有人問了一個問題

朋友問了一個有趣的問題 `# pragma omp parallel for` 加後會跑平行化
但跑出來的結果比不加還慢

情境一

在 Single Thread 的簡單記憶體存取問題:A[i][j] 跟 A[j][i] 的差異

#include <stdio.h>
#include <time.h>

#define WIDTH	100000
#define HEIGHT	20

void case1(void) {
    int A[WIDTH][HEIGHT] = {1};
    int sum = 0;
    clock_t start, end;

    start = clock();
    for (int i = 0; i < WIDTH; ++i) {
        for (int j = 0; j< HEIGHT; ++j) {
            sum += A[i][j];
        }
    }
    end = clock();
    printf("case A[i][j] = %d\n", end - start);

    start = clock();
    for (int i = 0; i < HEIGHT; ++i) {
        for (int j = 0; j< WIDTH; ++j) {
            sum += A[j][i];
        }
    }
    end = clock();
    printf("case A[j][i] = %d\n", end - start);

    return ;
}

int main ()
{
    case1();
    return 0;
}

透過 gcc -fopenmp test.c 編譯執行之後會得到結果是

case A[i][j] = 4567
case A[j][i] = 7244
case B[i][j] = 4138
case B[j][i] = 3626

在記憶體存取範圍 (二維陣列) 是長方形的情況下,跳躍長邊的情況效率會比較慢。實際上記憶體是個 1 維的連續空間,在 case 2 的第二個情況,代表陣列擁有最短的寬跟最長的高,在存取上會優先存取。 從組合語言上來看,存取陣列空間的時候都透過 mov eax,DWORD PTR [rbp+rax*4-OFFSET] 。其中 rbp 跟 rax 的值決定要存取哪些記憶體空間。在較慢的 case 中,每次 I/O 存取的記憶體空間,較快的例子在 rax 的變動每次都只會增加 1:

# case 1/1
11b0:	e8 7b fe ff ff       	call   1030 <clock@plt>
11b5:	48 89 85 e0 ed 85 ff 	mov    QWORD PTR [rbp-0x7a1220],rax
11bc:	c7 85 d0 ed 85 ff 00 	mov    DWORD PTR [rbp-0x7a1230],0x0
11c3:	00 00 00
11c6:	eb 53                	jmp    121b <case1+0xb2>
11c8:	c7 85 d4 ed 85 ff 00 	mov    DWORD PTR [rbp-0x7a122c],0x0
11cf:	00 00 00
11d2:	eb 37                	jmp    120b <case1+0xa2>
11d4:	8b 85 d4 ed 85 ff    	mov    eax,DWORD PTR [rbp-0x7a122c]
11da:	48 63 c8             	movsxd rcx,eax
11dd:	8b 85 d0 ed 85 ff    	mov    eax,DWORD PTR [rbp-0x7a1230]
11e3:	48 63 d0             	movsxd rdx,eax
11e6:	48 89 d0             	mov    rax,rdx
11e9:	48 c1 e0 02          	shl    rax,0x2
11ed:	48 01 d0             	add    rax,rdx
11f0:	48 c1 e0 02          	shl    rax,0x2
11f4:	48 01 c8             	add    rax,rcx
11f7:	8b 84 85 f0 ed 85 ff 	mov    eax,DWORD PTR [rbp+rax*4-0x7a1210]
11fe:	01 85 cc ed 85 ff    	add    DWORD PTR [rbp-0x7a1234],eax
1204:	83 85 d4 ed 85 ff 01 	add    DWORD PTR [rbp-0x7a122c],0x1
120b:	83 bd d4 ed 85 ff 13 	cmp    DWORD PTR [rbp-0x7a122c],0x13
1212:	7e c0                	jle    11d4 <case1+0x6b>
1214:	83 85 d0 ed 85 ff 01 	add    DWORD PTR [rbp-0x7a1230],0x1
121b:	81 bd d0 ed 85 ff 9f 	cmp    DWORD PTR [rbp-0x7a1230],0x1869f
1222:	86 01 00
1225:	7e a1                	jle    11c8 <case1+0x5f>
1227:	e8 04 fe ff ff       	call   1030 <clock@plt>

# case 1/2
1255:	e8 d6 fd ff ff       	call   1030 <clock@plt>
125a:	48 89 85 e0 ed 85 ff 	mov    QWORD PTR [rbp-0x7a1220],rax
1261:	c7 85 d8 ed 85 ff 00 	mov    DWORD PTR [rbp-0x7a1228],0x0
1268:	00 00 00
126b:	eb 56                	jmp    12c3 <case1+0x15a>
126d:	c7 85 dc ed 85 ff 00 	mov    DWORD PTR [rbp-0x7a1224],0x0
1274:	00 00 00
1277:	eb 37                	jmp    12b0 <case1+0x147>
1279:	8b 85 d8 ed 85 ff    	mov    eax,DWORD PTR [rbp-0x7a1228]
127f:	48 63 c8             	movsxd rcx,eax
1282:	8b 85 dc ed 85 ff    	mov    eax,DWORD PTR [rbp-0x7a1224]
1288:	48 63 d0             	movsxd rdx,eax
128b:	48 89 d0             	mov    rax,rdx
128e:	48 c1 e0 02          	shl    rax,0x2
1292:	48 01 d0             	add    rax,rdx
1295:	48 c1 e0 02          	shl    rax,0x2
1299:	48 01 c8             	add    rax,rcx
129c:	8b 84 85 f0 ed 85 ff 	mov    eax,DWORD PTR [rbp+rax*4-0x7a1210]
12a3:	01 85 cc ed 85 ff    	add    DWORD PTR [rbp-0x7a1234],eax
12a9:	83 85 dc ed 85 ff 01 	add    DWORD PTR [rbp-0x7a1224],0x1
12b0:	81 bd dc ed 85 ff 9f 	cmp    DWORD PTR [rbp-0x7a1224],0x1869f
12b7:	86 01 00
12ba:	7e bd                	jle    1279 <case1+0x110>
12bc:	83 85 d8 ed 85 ff 01 	add    DWORD PTR [rbp-0x7a1228],0x1
12c3:	83 bd d8 ed 85 ff 13 	cmp    DWORD PTR [rbp-0x7a1228],0x13
12ca:	7e a1                	jle    126d <case1+0x104>
12cc:	e8 5f fd ff ff       	call   1030 <clock@plt>

情境二

導入 openmp 之後,在一樣的 source code 條件下會得到不一樣的結果:

case A[i][j] = 4388
case A[j][i] = 9413
case A[i][j] with openmp parallel = 844398
case A[j][i] with openmp parallel = 817167
case A[i][j] with openmp for = 38927
case A[j][i] with openmp for = 44711
case A[i][j] with openmp parallel for = 104988
case A[j][i] with openmp parallel for = 190774

原因是在 openmp 的例子中因為需要把資料寫回到一個 shared variable sum 導致平行化的效率不好。 在不寫入到 sum 變數之後使用 openmp 效果變得更快:

case A[i][j] = 2170
case A[j][i] = 2752
case A[i][j] with openmp parallel = 25289
case A[j][i] with openmp parallel = 49070
case A[i][j] with openmp for = 20128
case A[j][i] with openmp for = 21250
case A[i][j] with openmp parallel for = 623
case A[j][i] with openmp parallel for = 2019

在單獨使用 parallel 會把接下來的程式平行化處理、使用 for 則是會將 for-loop 中的 index 平行化處理 。因此同時使用 parallel + for 會得到最好的效果,也就是透過 multiple threading 將 for-loop 的程式 ,透過 multi-threading 的方式依序執行。

#pragma omp for
#pragma omp parallel
#pragma omp parallel for