Perf#
Debian 安装 perf#
1
| sudo apt install linux-perf
|
收集分支预测事件#
1
| perf stat -e branch-instructions,branch-misses <your_program>
|
- branch-instructions:分支指令的总数
- branch-misses:分支预测失败的次数
1
| hit_rate = 1 - branch-misses / branch-instructions //命中率
|
以下是一个包含多个复杂分支跳转的 C 语言程序示例。这个程序包含了多层嵌套的条件判断、循环和函数调用,以演示复杂的分支逻辑
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
| #include <stdio.h>
#include <stdlib.h>
#include <time.h>
// 模拟复杂条件的函数
int complex_condition(int x) {
if (x % 2 == 0) {
if (x % 3 == 0) {
return 1; // 可被2和3整除
} else {
if (x % 5 == 0) {
return 2; // 可被2和5整除
} else {
return 3; // 可被2整除,但不能被3或5整除
}
}
} else {
if (x % 7 == 0) {
return 4; // 可被7整除
} else {
return 5; // 不能被2或7整除
}
}
}
// 递归函数,模拟复杂的分支跳转
int recursive_function(int depth) {
if (depth <= 0) {
return 0;
} else {
if (depth % 2 == 0) {
return depth * recursive_function(depth - 1);
} else {
return depth + recursive_function(depth - 1);
}
}
}
// 主函数,包含多重循环和分支判断
int main() {
srand(time(NULL)); // 用当前时间作为随机数种子
// 模拟分支跳转
for (int i = 0; i < 20; i++) {
int rand_num = rand() % 100 + 1; // 生成1到100之间的随机数
int result = complex_condition(rand_num);
switch (result) {
case 1:
printf("Number %d is divisible by 2 and 3.\n", rand_num);
break;
case 2:
printf("Number %d is divisible by 2 and 5.\n", rand_num);
break;
case 3:
printf("Number %d is divisible by 2, but not by 3 or 5.\n", rand_num);
break;
case 4:
printf("Number %d is divisible by 7.\n", rand_num);
break;
case 5:
printf("Number %d is not divisible by 2 or 7.\n", rand_num);
break;
default:
printf("Unexpected result for number %d.\n", rand_num);
break;
}
}
// 递归调用,模拟更复杂的分支
int depth = rand() % 10 + 1;
printf("\nRecursive function result for depth %d: %d\n", depth, recursive_function(depth));
return 0;
}
|
编译之后,使用 perf 进行分支预测事件统计得到结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
| hcy@debian:/tmp$ sudo perf stat -e branch-instructions,branch-misses ./a.exe
Number 43 is not divisible by 2 or 7.
Number 59 is not divisible by 2 or 7.
Number 17 is not divisible by 2 or 7.
Number 11 is not divisible by 2 or 7.
Number 38 is divisible by 2, but not by 3 or 5.
Number 97 is not divisible by 2 or 7.
Number 51 is not divisible by 2 or 7.
Number 3 is not divisible by 2 or 7.
Number 77 is divisible by 7.
Number 32 is divisible by 2, but not by 3 or 5.
Number 43 is not divisible by 2 or 7.
Number 32 is divisible by 2, but not by 3 or 5.
Number 24 is divisible by 2 and 3.
Number 72 is divisible by 2 and 3.
Number 82 is divisible by 2, but not by 3 or 5.
Number 41 is not divisible by 2 or 7.
Number 56 is divisible by 2, but not by 3 or 5.
Number 1 is not divisible by 2 or 7.
Number 55 is not divisible by 2 or 7.
Number 45 is not divisible by 2 or 7.
Recursive function result for depth 7: 157
Performance counter stats for './a.exe':
221,836 branch-instructions
7,323 branch-misses # 3.30% of all branches
0.000678774 seconds time elapsed
0.000720000 seconds user
0.000000000 seconds sys
|
收集缓存事件#
统计 cache-references
和 cache-misses
事件:
1
| perf stat -e cache-references,cache-misses ./your_program
|
计算命中率:
1
| hit_rate = 1 - cache-misses / cache-references //命中率
|
以下是一个设计较复杂的 C 语言程序,它模拟了一个具有多个分支和数组访问模式的程序,以便更好地展示缓存命中的情况。这个程序通过对二维数组的访问、嵌套循环等操作,模拟了常见的缓存命中与未命中场景。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
| #include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 1000 // 定义二维数组的大小
// 按行顺序访问二维数组
void access_by_rows(int arr[N][N]) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
arr[i][j] = arr[i][j] * 2; // 访问并更新数组元素
}
}
}
// 按列顺序访问二维数组
void access_by_columns(int arr[N][N]) {
for (int j = 0; j < N; j++) {
for (int i = 0; i < N; i++) {
arr[i][j] = arr[i][j] * 2; // 访问并更新数组元素
}
}
}
// 创建一个大数组并执行访问模式
void test_cache_behavior() {
int arr[N][N];
// 初始化数组
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
arr[i][j] = rand() % 100; // 给数组赋随机值
}
}
// 测试按行访问
printf("Testing row-major access pattern...\n");
access_by_rows(arr);
// 测试按列访问
printf("Testing column-major access pattern...\n");
access_by_columns(arr);
}
int main() {
srand(time(NULL)); // 使用时间作为随机数种子
// 调用函数测试缓存行为
test_cache_behavior();
return 0;
}
|
编译之后,使用 perf 进行 cache 命中统计得到结果:
1
2
3
4
5
6
7
8
9
10
11
12
13
| hcy@debian:/tmp$ sudo perf stat -e cache-references,cache-misses ./a.exe
Testing row-major access pattern...
Testing column-major access pattern...
Performance counter stats for './a.exe':
619,887 cache-references
123,863 cache-misses # 19.982 % of all cache refs
0.018228285 seconds time elapsed
0.018274000 seconds user
0.000000000 seconds sys
|
与 SPEC CPU 2017 的结合#
在 bash 语法里面," – " (即“空格”“双减号”“空格”)是一个分隔符号,用于明确告诉 Bash 后面的参数不再是选项(比如以 - 开头的参数),例如:
1
| grep -- -pattern filename.txt
|
这里 " – " 告诉 grep,后面的 “-pattern” 是要搜索的模式,而不是一个选项
SPEC CPU 2017 的启功命令通常很长,例如:
1
| runcpu --config test-gcc-1.cfg --tune all --reportable --nopower --runmode speed --tune base:peak --size test:train:refspeed intspeed
|
与分隔符结合,使用 perf 对 SPEC CPU 2017 进行性能测试例子, “\” 表示换行继续:
1
2
| perf stat -o $output_root/perf-stat-gcc-$1 -- \
runcpu $runcpu_args -a onlyrun -c gcc-$1.cfg --output_root $output_root/gcc-$1 548.exchange2_r
|
Perf 绘制火焰图#