История изменений

Исправление LINUX-ORG-RU, 04.11.19 03:32 (текущая версия) :

Я вот тут поигрался: даже на уродских структурах с уродским обращением к данным писечная выгода есть, если принудительно устанавливать адрес, кратный слову.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>

/*
 * Element type of the benchmark array.
 *
 * Narrow (8-bit) and wide (32/64-bit) members alternate, so the compiler
 * presumably has to insert padding between them to satisfy alignment.
 * NOTE(review): the exact size and layout depend on the target ABI
 * (likely 24 bytes on a typical 64-bit target) -- confirm with sizeof.
 */
typedef struct
{
  uint8_t   z;  /* main() stores the constant 222 here */
  uint32_t  c;  /* main() copies the value of z into this field */
  uint8_t   b;  /* main() stores 33 + index, truncated to 8 bits */
  uint64_t  a;  /* main() stores the constant 222 here */
}a_t;


/* Number of a_t elements that main() allocates and writes (10^8). */
const uint64_t len = UINT64_C(100000000);

/*
 * Benchmark driver: allocates an array of `len` a_t elements and writes
 * every field of every element once.
 *
 * The allocator is selected at compile time:
 *   -DA  plain malloc()
 *   -DB  posix_memalign() with an alignment of sizeof(void *)
 *
 * Prints the posix_memalign() status text (variant B only) and the address
 * of the buffer.  Returns 0 on success and EXIT_FAILURE when no buffer
 * could be obtained (allocation failed, or neither A nor B was defined).
 *
 * The buffer is not freed explicitly: the process exits right after the
 * loop, and releasing it here would add work to the measured run.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    a_t *a = NULL;

#if defined(A)
    a = malloc(sizeof *a * len);
#endif
#if defined(B)
    /* posix_memalign() wants a void **; go through a real void * instead
     * of casting the address of an a_t *. */
    void *mem = NULL;
    int status = posix_memalign(&mem, sizeof(void *), sizeof *a * len);
    printf("%s\n", strerror(status));
    if (status == 0)
    {
        a = mem;
    }
#endif

    /* Explicit check instead of assert(): it must survive NDEBUG builds and
     * also covers a build where neither A nor B was defined. */
    if (a == NULL)
    {
        fprintf(stderr, "no buffer allocated (build with -DA or -DB)\n");
        return EXIT_FAILURE;
    }

    printf("%p\n", (void *)a);

    /* The index has the same type as len, so the comparison is not mixed
     * signed/unsigned and cannot overflow for large element counts. */
    for (uint64_t i = 0; i < len; ++i)
    {
        a[i].a = 222;
        a[i].z = 222;
        a[i].b = (uint8_t)(33 + i);   /* intentionally wraps modulo 256 */
        a[i].c = a[i].z;
    }
    return 0;
}

dron@gnu:~$ gcc mm.c -O0 -DA -o A; gcc mm.c -O0 -DB -o B ; sudo perf_5.2 stat ./A ; sync ;sudo perf_5.2 stat ./B ;  valgrind --tool=cachegrind ./A  ; valgrind --tool=cachegrind ./B
0x7fb04aa70010

 Performance counter stats for './A':

          3 210,79 msec task-clock                #    0,990 CPUs utilized          
               308      context-switches          #    0,096 K/sec                  
                 1      cpu-migrations            #    0,000 K/sec                  
           582 925      page-faults               #    0,182 M/sec                  
     9 020 202 803      cycles                    #    2,809 GHz                      (16,71%)
       165 194 442      stalled-cycles-frontend   #    1,83% frontend cycles idle     (16,42%)
     5 693 495 923      stalled-cycles-backend    #   63,12% backend cycles idle      (16,90%)
     8 012 261 189      instructions              #    0,89  insn per cycle         
                                                  #    0,71  stalled cycles per insn  (16,67%)
       477 223 061      branches                  #  148,631 M/sec                    (16,78%)
         8 223 474      branch-misses             #    1,72% of all branches          (16,53%)

       3,241665624 seconds time elapsed

       2,064511000 seconds user
       1,142496000 seconds sys


Success
0x7f6b51f5f010

 Performance counter stats for './B':

          3 077,60 msec task-clock                #    1,000 CPUs utilized          
                11      context-switches          #    0,004 K/sec                  
                 0      cpu-migrations            #    0,000 K/sec                  
           582 417      page-faults               #    0,189 M/sec                  
     8 647 040 320      cycles                    #    2,810 GHz                      (16,65%)
       145 962 049      stalled-cycles-frontend   #    1,69% frontend cycles idle     (16,76%)
     5 443 927 280      stalled-cycles-backend    #   62,96% backend cycles idle      (16,68%)
     7 460 307 769      instructions              #    0,86  insn per cycle         
                                                  #    0,73  stalled cycles per insn  (16,64%)
       462 399 195      branches                  #  150,247 M/sec                    (16,64%)
         7 762 108      branch-misses             #    1,68% of all branches          (16,64%)

       3,078232745 seconds time elapsed

       2,061414000 seconds user
       1,016697000 seconds sys


==25582== Cachegrind, a cache and branch-prediction profiler
==25582== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25582== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25582== Command: ./A
==25582== 
--25582-- warning: L3 cache found, using its data for the LL simulation.
0x58c38010
==25582== 
==25582== I   refs:      6,000,291,908
==25582== I1  misses:            1,131
==25582== LLi misses:            1,117
==25582== I1  miss rate:          0.00%
==25582== LLi miss rate:          0.00%
==25582== 
==25582== D   refs:      1,800,093,249  (1,400,071,355 rd   + 400,021,894 wr)
==25582== D1  misses:       37,503,997  (        3,154 rd   +  37,500,843 wr)
==25582== LLd misses:       37,503,272  (        2,500 rd   +  37,500,772 wr)
==25582== D1  miss rate:           2.1% (          0.0%     +         9.4%  )
==25582== LLd miss rate:           2.1% (          0.0%     +         9.4%  )
==25582== 
==25582== LL refs:          37,505,128  (        4,285 rd   +  37,500,843 wr)
==25582== LL misses:        37,504,389  (        3,617 rd   +  37,500,772 wr)
==25582== LL miss rate:            0.5% (          0.0%     +         9.4%  )
==25605== Cachegrind, a cache and branch-prediction profiler
==25605== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25605== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25605== Command: ./B
==25605== 
--25605-- warning: L3 cache found, using its data for the LL simulation.
Success
0x58c38010
==25605== 
==25605== I   refs:      5,500,295,421
==25605== I1  misses:            1,193
==25605== LLi misses:            1,175
==25605== I1  miss rate:          0.00%
==25605== LLi miss rate:          0.00%
==25605== 
==25605== D   refs:      1,800,094,731  (1,400,072,372 rd   + 400,022,359 wr)
==25605== D1  misses:       37,504,029  (        3,187 rd   +  37,500,842 wr)
==25605== LLd misses:       37,503,282  (        2,509 rd   +  37,500,773 wr)
==25605== D1  miss rate:           2.1% (          0.0%     +         9.4%  )
==25605== LLd miss rate:           2.1% (          0.0%     +         9.4%  )
==25605== 
==25605== LL refs:          37,505,222  (        4,380 rd   +  37,500,842 wr)
==25605== LL misses:        37,504,457  (        3,684 rd   +  37,500,773 wr)
==25605== LL miss rate:            0.5% (          0.0%     +         9.4%  )
dron@gnu:~$

Исходная версия LINUX-ORG-RU, 04.11.19 03:31:

Я вот тут поигрался: даже на уродских структурах с уродским обращением к данным писечная выгода есть, если принудительно устанавливать размер, кратный слову.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <string.h>

/*
 * Element type of the benchmark array.
 *
 * Narrow (8-bit) and wide (32/64-bit) members alternate, so the compiler
 * presumably has to insert padding between them to satisfy alignment.
 * NOTE(review): the exact size and layout depend on the target ABI
 * (likely 24 bytes on a typical 64-bit target) -- confirm with sizeof.
 */
typedef struct
{
  uint8_t   z;  /* main() stores the constant 222 here */
  uint32_t  c;  /* main() copies the value of z into this field */
  uint8_t   b;  /* main() stores 33 + index, truncated to 8 bits */
  uint64_t  a;  /* main() stores the constant 222 here */
}a_t;


/* Number of a_t elements that main() allocates and writes (10^8). */
const uint64_t len = UINT64_C(100000000);

/*
 * Benchmark driver: allocates an array of `len` a_t elements and writes
 * every field of every element once.
 *
 * The allocator is selected at compile time:
 *   -DA  plain malloc()
 *   -DB  posix_memalign() with an alignment of sizeof(void *)
 *
 * Prints the posix_memalign() status text (variant B only) and the address
 * of the buffer.  Returns 0 on success and EXIT_FAILURE when no buffer
 * could be obtained (allocation failed, or neither A nor B was defined).
 *
 * The buffer is not freed explicitly: the process exits right after the
 * loop, and releasing it here would add work to the measured run.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    a_t *a = NULL;

#if defined(A)
    a = malloc(sizeof *a * len);
#endif
#if defined(B)
    /* posix_memalign() wants a void **; go through a real void * instead
     * of casting the address of an a_t *. */
    void *mem = NULL;
    int status = posix_memalign(&mem, sizeof(void *), sizeof *a * len);
    printf("%s\n", strerror(status));
    if (status == 0)
    {
        a = mem;
    }
#endif

    /* Explicit check instead of assert(): it must survive NDEBUG builds and
     * also covers a build where neither A nor B was defined. */
    if (a == NULL)
    {
        fprintf(stderr, "no buffer allocated (build with -DA or -DB)\n");
        return EXIT_FAILURE;
    }

    printf("%p\n", (void *)a);

    /* The index has the same type as len, so the comparison is not mixed
     * signed/unsigned and cannot overflow for large element counts. */
    for (uint64_t i = 0; i < len; ++i)
    {
        a[i].a = 222;
        a[i].z = 222;
        a[i].b = (uint8_t)(33 + i);   /* intentionally wraps modulo 256 */
        a[i].c = a[i].z;
    }
    return 0;
}

dron@gnu:~$ gcc mm.c -O0 -DA -o A; gcc mm.c -O0 -DB -o B ; sudo perf_5.2 stat ./A ; sync ;sudo perf_5.2 stat ./B ;  valgrind --tool=cachegrind ./A  ; valgrind --tool=cachegrind ./B
0x7fb04aa70010

 Performance counter stats for './A':

          3 210,79 msec task-clock                #    0,990 CPUs utilized          
               308      context-switches          #    0,096 K/sec                  
                 1      cpu-migrations            #    0,000 K/sec                  
           582 925      page-faults               #    0,182 M/sec                  
     9 020 202 803      cycles                    #    2,809 GHz                      (16,71%)
       165 194 442      stalled-cycles-frontend   #    1,83% frontend cycles idle     (16,42%)
     5 693 495 923      stalled-cycles-backend    #   63,12% backend cycles idle      (16,90%)
     8 012 261 189      instructions              #    0,89  insn per cycle         
                                                  #    0,71  stalled cycles per insn  (16,67%)
       477 223 061      branches                  #  148,631 M/sec                    (16,78%)
         8 223 474      branch-misses             #    1,72% of all branches          (16,53%)

       3,241665624 seconds time elapsed

       2,064511000 seconds user
       1,142496000 seconds sys


Success
0x7f6b51f5f010

 Performance counter stats for './B':

          3 077,60 msec task-clock                #    1,000 CPUs utilized          
                11      context-switches          #    0,004 K/sec                  
                 0      cpu-migrations            #    0,000 K/sec                  
           582 417      page-faults               #    0,189 M/sec                  
     8 647 040 320      cycles                    #    2,810 GHz                      (16,65%)
       145 962 049      stalled-cycles-frontend   #    1,69% frontend cycles idle     (16,76%)
     5 443 927 280      stalled-cycles-backend    #   62,96% backend cycles idle      (16,68%)
     7 460 307 769      instructions              #    0,86  insn per cycle         
                                                  #    0,73  stalled cycles per insn  (16,64%)
       462 399 195      branches                  #  150,247 M/sec                    (16,64%)
         7 762 108      branch-misses             #    1,68% of all branches          (16,64%)

       3,078232745 seconds time elapsed

       2,061414000 seconds user
       1,016697000 seconds sys


==25582== Cachegrind, a cache and branch-prediction profiler
==25582== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25582== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25582== Command: ./A
==25582== 
--25582-- warning: L3 cache found, using its data for the LL simulation.
0x58c38010
==25582== 
==25582== I   refs:      6,000,291,908
==25582== I1  misses:            1,131
==25582== LLi misses:            1,117
==25582== I1  miss rate:          0.00%
==25582== LLi miss rate:          0.00%
==25582== 
==25582== D   refs:      1,800,093,249  (1,400,071,355 rd   + 400,021,894 wr)
==25582== D1  misses:       37,503,997  (        3,154 rd   +  37,500,843 wr)
==25582== LLd misses:       37,503,272  (        2,500 rd   +  37,500,772 wr)
==25582== D1  miss rate:           2.1% (          0.0%     +         9.4%  )
==25582== LLd miss rate:           2.1% (          0.0%     +         9.4%  )
==25582== 
==25582== LL refs:          37,505,128  (        4,285 rd   +  37,500,843 wr)
==25582== LL misses:        37,504,389  (        3,617 rd   +  37,500,772 wr)
==25582== LL miss rate:            0.5% (          0.0%     +         9.4%  )
==25605== Cachegrind, a cache and branch-prediction profiler
==25605== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al.
==25605== Using Valgrind-3.15.0 and LibVEX; rerun with -h for copyright info
==25605== Command: ./B
==25605== 
--25605-- warning: L3 cache found, using its data for the LL simulation.
Success
0x58c38010
==25605== 
==25605== I   refs:      5,500,295,421
==25605== I1  misses:            1,193
==25605== LLi misses:            1,175
==25605== I1  miss rate:          0.00%
==25605== LLi miss rate:          0.00%
==25605== 
==25605== D   refs:      1,800,094,731  (1,400,072,372 rd   + 400,022,359 wr)
==25605== D1  misses:       37,504,029  (        3,187 rd   +  37,500,842 wr)
==25605== LLd misses:       37,503,282  (        2,509 rd   +  37,500,773 wr)
==25605== D1  miss rate:           2.1% (          0.0%     +         9.4%  )
==25605== LLd miss rate:           2.1% (          0.0%     +         9.4%  )
==25605== 
==25605== LL refs:          37,505,222  (        4,380 rd   +  37,500,842 wr)
==25605== LL misses:        37,504,457  (        3,684 rd   +  37,500,773 wr)
==25605== LL miss rate:            0.5% (          0.0%     +         9.4%  )
dron@gnu:~$