Исправление LINUX-ORG-RU (текущая версия):
Я вот тут поигрался: даже на уродских структурах с уродским обращением к данным крошечная выгода есть, если принудительно устанавливать адрес, кратный слову.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>
/*
 * Element type of the benchmark array.
 *
 * Members alternate between narrow and wide integer types (8, 32, 8, 64 bits).
 * NOTE(review): the order looks deliberately unfavourable for packing (the
 * accompanying post calls the structure "ugly"); keep it as-is, since
 * reordering members would change the layout being measured.
 */
typedef struct
{
    uint8_t  z;  /* written with a constant, then read back to fill c */
    uint32_t c;  /* receives the value of z */
    uint8_t  b;  /* receives the low 8 bits of 33 + index */
    uint64_t a;  /* written with a constant */
} a_t;
/* Number of a_t elements to allocate and fill (one hundred million). */
const uint64_t len = UINT64_C(100000000);
/*
 * Benchmark driver: allocate an array of `len` a_t elements and write every
 * member of every element once.
 *
 * The allocator is selected at compile time:
 *   -DA  malloc()
 *   -DB  posix_memalign() with an alignment of sizeof(void *); the strerror()
 *        text for its return code is printed first
 * If both are defined, the posix_memalign() result is the one that is used.
 *
 * Prints the address of the array on stdout, fills it, frees it and returns 0.
 * Returns EXIT_FAILURE (message on stderr) if the byte count does not fit in
 * size_t, if the allocation fails, or if neither A nor B was defined.
 *
 * NOTE(review): malloc() already returns storage aligned for any object type,
 * so an alignment of sizeof(void *) may not be stricter than variant A --
 * compare the printed addresses before attributing timing differences to
 * alignment.
 */
int main(int argc, char *argv[])
{
    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    a_t *a = NULL;

    /* Reject element counts whose byte size would wrap around in size_t. */
    if (len > SIZE_MAX / sizeof *a) {
        fprintf(stderr, "requested array size does not fit in size_t\n");
        return EXIT_FAILURE;
    }
    const size_t bytes = (size_t)len * sizeof *a;

#if defined(B)
    /* posix_memalign() wants a void **; go through a real void * object
     * instead of casting the address of an a_t *. */
    void *mem = NULL;
    int status = posix_memalign(&mem, sizeof(void *), bytes);
    printf("%s\n", strerror(status));
    if (status == 0) {
        a = mem;
    }
#elif defined(A)
    a = malloc(bytes);
#else
    (void)bytes;
#endif

    /* Explicit check instead of assert(): it also works with NDEBUG and
     * covers a build where no allocator was selected. */
    if (a == NULL) {
        fprintf(stderr,
                "no memory allocated (allocation failed, or built without -DA / -DB)\n");
        return EXIT_FAILURE;
    }

    /* %p requires a void * argument. */
    printf("%p\n", (void *)a);

    /* Index has the same type as len, so the comparison is not mixed-sign. */
    for (uint64_t i = 0; i < len; ++i) {
        a[i].a = 222;
        a[i].z = 222;
        a[i].b = (uint8_t)(33 + i); /* intentionally keeps only the low 8 bits */
        a[i].c = a[i].z;
    }

    free(a);
    return 0;
}
dron@gnu:~$ gcc mm.c -O0 -DA -o A; gcc mm.c -O0 -DB -o B ; sudo perf_5.2 stat ./A ; sync ;sudo perf_5.2 stat ./B ; valgrind --tool=cachegrind ./A ; valgrind --tool=cachegrind ./B
0x7fb04aa70010
Performance counter stats for './A':
3 210,79 msec task-clock # 0,990 CPUs utilized
308 context-switches # 0,096 K/sec
1 cpu-migrations # 0,000 K/sec
582 925 page-faults # 0,182 M/sec
9 020 202 803 cycles # 2,809 GHz (16,71%)
165 194 442 stalled-cycles-frontend # 1,83% frontend cycles idle (16,42%)
5 693 495 923 stalled-cycles-backend # 63,12% backend cycles idle (16,90%)
8 012 261 189 instructions # 0,89 insn per cycle
# 0,71 stalled cycles per insn (16,67%)
477 223 061 branches # 148,631 M/sec (16,78%)
8 223 474 branch-misses # 1,72% of all branches (16,53%)
3,241665624 seconds time elapsed
2,064511000 seconds user
1,142496000 seconds sys
Success
0x7f6b51f5f010
Performance counter stats for './B':
3 077,60 msec task-clock # 1,000 CPUs utilized
11 context-switches # 0,004 K/sec
0 cpu-migrations # 0,000 K/sec
582 417 page-faults # 0,189 M/sec
8 647 040 320 cycles # 2,810 GHz (16,65%)
145 962 049 stalled-cycles-frontend # 1,69% frontend cycles idle (16,76%)
5 443 927 280 stalled-cycles-backend # 62,96% backend cycles idle (16,68%)
7 460 307 769 instructions # 0,86 insn per cycle
# 0,73 stalled cycles per insn (16,64%)
462 399 195 branches # 150,247 M/sec (16,64%)
7 762 108 branch-misses # 1,68% of all branches (16,64%)
3,078232745 seconds time elapsed
2,061414000 seconds user
1,016697000 seconds sys
==25582== Cachegrind, a cache and branch-prediction profiler
==25582== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25582== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25582== Command: ./A
==25582==
--25582-- warning: L3 cache found, using its data for the LL simulation.
0x58c38010
==25582==
==25582== I refs: 6,000,291,908
==25582== I1 misses: 1,131
==25582== LLi misses: 1,117
==25582== I1 miss rate: 0.00%
==25582== LLi miss rate: 0.00%
==25582==
==25582== D refs: 1,800,093,249 (1,400,071,355 rd + 400,021,894 wr)
==25582== D1 misses: 37,503,997 ( 3,154 rd + 37,500,843 wr)
==25582== LLd misses: 37,503,272 ( 2,500 rd + 37,500,772 wr)
==25582== D1 miss rate: 2.1% ( 0.0% + 9.4% )
==25582== LLd miss rate: 2.1% ( 0.0% + 9.4% )
==25582==
==25582== LL refs: 37,505,128 ( 4,285 rd + 37,500,843 wr)
==25582== LL misses: 37,504,389 ( 3,617 rd + 37,500,772 wr)
==25582== LL miss rate: 0.5% ( 0.0% + 9.4% )
==25605== Cachegrind, a cache and branch-prediction profiler
==25605== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25605== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25605== Command: ./B
==25605==
--25605-- warning: L3 cache found, using its data for the LL simulation.
Success
0x58c38010
==25605==
==25605== I refs: 5,500,295,421
==25605== I1 misses: 1,193
==25605== LLi misses: 1,175
==25605== I1 miss rate: 0.00%
==25605== LLi miss rate: 0.00%
==25605==
==25605== D refs: 1,800,094,731 (1,400,072,372 rd + 400,022,359 wr)
==25605== D1 misses: 37,504,029 ( 3,187 rd + 37,500,842 wr)
==25605== LLd misses: 37,503,282 ( 2,509 rd + 37,500,773 wr)
==25605== D1 miss rate: 2.1% ( 0.0% + 9.4% )
==25605== LLd miss rate: 2.1% ( 0.0% + 9.4% )
==25605==
==25605== LL refs: 37,505,222 ( 4,380 rd + 37,500,842 wr)
==25605== LL misses: 37,504,457 ( 3,684 rd + 37,500,773 wr)
==25605== LL miss rate: 0.5% ( 0.0% + 9.4% )
dron@gnu:~$
Исходная версия LINUX-ORG-RU:
Я вот тут поигрался: даже на уродских структурах с уродским обращением к данным крошечная выгода есть, если принудительно устанавливать размер, кратный слову.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>
/*
 * Element type of the benchmark array.
 *
 * Members alternate between narrow and wide integer types (8, 32, 8, 64 bits).
 * NOTE(review): the order looks deliberately unfavourable for packing (the
 * accompanying post calls the structure "ugly"); keep it as-is, since
 * reordering members would change the layout being measured.
 */
typedef struct
{
    uint8_t  z;  /* written with a constant, then read back to fill c */
    uint32_t c;  /* receives the value of z */
    uint8_t  b;  /* receives the low 8 bits of 33 + index */
    uint64_t a;  /* written with a constant */
} a_t;
/* Number of a_t elements to allocate and fill (one hundred million). */
const uint64_t len = UINT64_C(100000000);
/*
 * Benchmark driver: allocate an array of `len` a_t elements and write every
 * member of every element once.
 *
 * The allocator is selected at compile time:
 *   -DA  malloc()
 *   -DB  posix_memalign() with an alignment of sizeof(void *); the strerror()
 *        text for its return code is printed first
 * If both are defined, the posix_memalign() result is the one that is used.
 *
 * Prints the address of the array on stdout, fills it, frees it and returns 0.
 * Returns EXIT_FAILURE (message on stderr) if the byte count does not fit in
 * size_t, if the allocation fails, or if neither A nor B was defined.
 *
 * NOTE(review): malloc() already returns storage aligned for any object type,
 * so an alignment of sizeof(void *) may not be stricter than variant A --
 * compare the printed addresses before attributing timing differences to
 * alignment.
 */
int main(int argc, char *argv[])
{
    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    a_t *a = NULL;

    /* Reject element counts whose byte size would wrap around in size_t. */
    if (len > SIZE_MAX / sizeof *a) {
        fprintf(stderr, "requested array size does not fit in size_t\n");
        return EXIT_FAILURE;
    }
    const size_t bytes = (size_t)len * sizeof *a;

#if defined(B)
    /* posix_memalign() wants a void **; go through a real void * object
     * instead of casting the address of an a_t *. */
    void *mem = NULL;
    int status = posix_memalign(&mem, sizeof(void *), bytes);
    printf("%s\n", strerror(status));
    if (status == 0) {
        a = mem;
    }
#elif defined(A)
    a = malloc(bytes);
#else
    (void)bytes;
#endif

    /* Explicit check instead of assert(): it also works with NDEBUG and
     * covers a build where no allocator was selected. */
    if (a == NULL) {
        fprintf(stderr,
                "no memory allocated (allocation failed, or built without -DA / -DB)\n");
        return EXIT_FAILURE;
    }

    /* %p requires a void * argument. */
    printf("%p\n", (void *)a);

    /* Index has the same type as len, so the comparison is not mixed-sign. */
    for (uint64_t i = 0; i < len; ++i) {
        a[i].a = 222;
        a[i].z = 222;
        a[i].b = (uint8_t)(33 + i); /* intentionally keeps only the low 8 bits */
        a[i].c = a[i].z;
    }

    free(a);
    return 0;
}
dron@gnu:~$ gcc mm.c -O0 -DA -o A; gcc mm.c -O0 -DB -o B ; sudo perf_5.2 stat ./A ; sync ;sudo perf_5.2 stat ./B ; valgrind --tool=cachegrind ./A ; valgrind --tool=cachegrind ./B
0x7fb04aa70010
Performance counter stats for './A':
3 210,79 msec task-clock # 0,990 CPUs utilized
308 context-switches # 0,096 K/sec
1 cpu-migrations # 0,000 K/sec
582 925 page-faults # 0,182 M/sec
9 020 202 803 cycles # 2,809 GHz (16,71%)
165 194 442 stalled-cycles-frontend # 1,83% frontend cycles idle (16,42%)
5 693 495 923 stalled-cycles-backend # 63,12% backend cycles idle (16,90%)
8 012 261 189 instructions # 0,89 insn per cycle
# 0,71 stalled cycles per insn (16,67%)
477 223 061 branches # 148,631 M/sec (16,78%)
8 223 474 branch-misses # 1,72% of all branches (16,53%)
3,241665624 seconds time elapsed
2,064511000 seconds user
1,142496000 seconds sys
Success
0x7f6b51f5f010
Performance counter stats for './B':
3 077,60 msec task-clock # 1,000 CPUs utilized
11 context-switches # 0,004 K/sec
0 cpu-migrations # 0,000 K/sec
582 417 page-faults # 0,189 M/sec
8 647 040 320 cycles # 2,810 GHz (16,65%)
145 962 049 stalled-cycles-frontend # 1,69% frontend cycles idle (16,76%)
5 443 927 280 stalled-cycles-backend # 62,96% backend cycles idle (16,68%)
7 460 307 769 instructions # 0,86 insn per cycle
# 0,73 stalled cycles per insn (16,64%)
462 399 195 branches # 150,247 M/sec (16,64%)
7 762 108 branch-misses # 1,68% of all branches (16,64%)
3,078232745 seconds time elapsed
2,061414000 seconds user
1,016697000 seconds sys
==25582== Cachegrind, a cache and branch-prediction profiler
==25582== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25582== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25582== Command: ./A
==25582==
--25582-- warning: L3 cache found, using its data for the LL simulation.
0x58c38010
==25582==
==25582== I refs: 6,000,291,908
==25582== I1 misses: 1,131
==25582== LLi misses: 1,117
==25582== I1 miss rate: 0.00%
==25582== LLi miss rate: 0.00%
==25582==
==25582== D refs: 1,800,093,249 (1,400,071,355 rd + 400,021,894 wr)
==25582== D1 misses: 37,503,997 ( 3,154 rd + 37,500,843 wr)
==25582== LLd misses: 37,503,272 ( 2,500 rd + 37,500,772 wr)
==25582== D1 miss rate: 2.1% ( 0.0% + 9.4% )
==25582== LLd miss rate: 2.1% ( 0.0% + 9.4% )
==25582==
==25582== LL refs: 37,505,128 ( 4,285 rd + 37,500,843 wr)
==25582== LL misses: 37,504,389 ( 3,617 rd + 37,500,772 wr)
==25582== LL miss rate: 0.5% ( 0.0% + 9.4% )
==25605== Cachegrind, a cache and branch-prediction profiler
==25605== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25605== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25605== Command: ./B
==25605==
--25605-- warning: L3 cache found, using its data for the LL simulation.
Success
0x58c38010
==25605==
==25605== I refs: 5,500,295,421
==25605== I1 misses: 1,193
==25605== LLi misses: 1,175
==25605== I1 miss rate: 0.00%
==25605== LLi miss rate: 0.00%
==25605==
==25605== D refs: 1,800,094,731 (1,400,072,372 rd + 400,022,359 wr)
==25605== D1 misses: 37,504,029 ( 3,187 rd + 37,500,842 wr)
==25605== LLd misses: 37,503,282 ( 2,509 rd + 37,500,773 wr)
==25605== D1 miss rate: 2.1% ( 0.0% + 9.4% )
==25605== LLd miss rate: 2.1% ( 0.0% + 9.4% )
==25605==
==25605== LL refs: 37,505,222 ( 4,380 rd + 37,500,842 wr)
==25605== LL misses: 37,504,457 ( 3,684 rd + 37,500,773 wr)
==25605== LL miss rate: 0.5% ( 0.0% + 9.4% )
dron@gnu:~$