История изменений
Исправление Novell-ch (текущая версия):
помогло
/* Pack the three per-row offsets (column 3 of cmatrix) into one 64-bit word:
 * row 0 at bit 32, row 1 at bit 16, row 2 at bit 0.  Multiplication by a
 * power of two is used instead of `<<` because left-shifting a negative
 * value is undefined behaviour in C; for non-negative offsets the result is
 * the same. */
int64_t c = (int64_t) convert->cmatrix[0][3] * ((int64_t) 1 << 32)
    + (int64_t) convert->cmatrix[1][3] * ((int64_t) 1 << 16)
    + (int64_t) convert->cmatrix[2][3];
Performance counter stats for process id '17523':
18862.102342 task-clock # 0.272 CPUs utilized
30,875 context-switches # 0.002 M/sec
4,253 cpu-migrations # 0.225 K/sec
1,858 page-faults # 0.099 K/sec
56,563,008,115 cycles # 2.999 GHz
13,637,059,458 stalled-cycles-frontend # 24.11% frontend cycles idle
<not supported> stalled-cycles-backend
149,019,018,585 instructions # 2.63 insns per cycle
# 0.09 stalled cycles per insn
8,822,857,677 branches # 467.756 M/sec
32,225,169 branch-misses # 0.37% of all branches
<not supported> L1-dcache-loads:HG
860,977,224 L1-dcache-load-misses:HG # 0.00% of all L1-dcache hits
368,779,426 LLC-loads:HG # 19.551 M/sec
<not supported> LLC-load-misses:HG
69.386963059 seconds time elapsed
Финальный результат такой; может, примут в апстрим. Завтра потестирую на 32 битах.
/* Table state flag: starts at 0; videoconvert_convert_matrix8() fills the
 * tables below while it is < 2 and then sets it to 2. */
static int op = 0;

/* Lookup tables indexed by an 8-bit input value (256 entries each), one per
 * input byte read by videoconvert_convert_matrix8(). */
static int64_t t_r3[256];
static int64_t t_g3[256];
static int64_t t_b3[256];
/*
 * Apply the colour matrix in convert->cmatrix to one run of 4-byte pixels,
 * in place.
 *
 * convert: supplies cmatrix (rows 0..2, columns 0..3 are read) and width
 *          (the number of 4-byte pixels processed).
 * p:       pixel bytes; bytes 1, 2 and 3 of every pixel are read and
 *          overwritten, byte 0 is left untouched.
 *
 * For output byte k (k = 0..2, stored at pixel byte k + 1) the result is the
 * low 8 bits of
 *   (cmatrix[k][0]*in1 + cmatrix[k][1]*in2 + cmatrix[k][2]*in3
 *    + cmatrix[k][3]) >> 8
 * All three outputs are computed with a single 64-bit addition chain: each
 * table entry packs the three per-output products into 16-bit-spaced fields
 * (output 0 at bit 32, output 1 at bit 16, output 2 at bit 0).
 *
 * The tables live in the file-level arrays t_r3/t_g3/t_b3 and are rebuilt
 * whenever the 3x3 coefficients differ from the ones they were built from,
 * so a changed matrix (or a second converter) no longer reuses stale tables.
 *
 * NOTE(review): the packed fields are exact only while every per-output sum
 * stays within 0..65535; outside that range carries/borrows spill into the
 * neighbouring field and the stored byte wraps instead of saturating.
 * NOTE(review): the shared static tables are not protected against
 * concurrent callers - confirm this is only used from one thread.
 */
static void
videoconvert_convert_matrix8 (VideoConvert * convert, gpointer p)
{
  /* Coefficients the tables were last built from; coef[in][out]. */
  static int t_coef[3][3];
  int coef[3][3];
  int in, out;
  int stale = (op < 2);
  guint8 *pixels = p;
  int64_t c;
  int t;

  /* Snapshot the 3x3 part of the matrix (transposed: indexed by input
   * channel first) and detect whether the cached tables still match. */
  for (in = 0; in < 3; in++) {
    for (out = 0; out < 3; out++) {
      coef[in][out] = convert->cmatrix[out][in];
      if (coef[in][out] != t_coef[in][out])
        stale = 1;
    }
  }

  if (stale) {
    int i;

    for (i = 0; i < 256; i++) {
      int64_t packed[3];

      for (in = 0; in < 3; in++) {
        packed[in] = 0;
        for (out = 0; out < 3; out++) {
          /* "* 65536" rather than "<< 16": shifting a negative value left
           * is undefined behaviour.  The product is widened first so it
           * cannot overflow int. */
          packed[in] = packed[in] * 65536 + (int64_t) coef[in][out] * i;
        }
      }
      t_r3[i] = packed[0];
      t_g3[i] = packed[1];
      t_b3[i] = packed[2];
    }

    for (in = 0; in < 3; in++) {
      for (out = 0; out < 3; out++)
        t_coef[in][out] = coef[in][out];
    }
    op = 2;
  }

  /* Per-output offsets, packed into the same three fields as the tables. */
  c = (int64_t) convert->cmatrix[0][3] * ((int64_t) 1 << 32)
      + (int64_t) convert->cmatrix[1][3] * ((int64_t) 1 << 16)
      + (int64_t) convert->cmatrix[2][3];

  for (t = 0; t < convert->width * 4; t += 4) {
    int64_t x3 = t_r3[pixels[t + 1]] + t_g3[pixels[t + 2]]
        + t_b3[pixels[t + 3]] + c;

    /* Shift as unsigned: right-shifting a negative signed value is
     * implementation-defined, while the selected 8 bits are the same. */
    pixels[t + 1] = (guint8) ((uint64_t) x3 >> 40);
    pixels[t + 2] = (guint8) ((uint64_t) x3 >> 24);
    pixels[t + 3] = (guint8) ((uint64_t) x3 >> 8);
  }
}
Исходная версия Novell-ch:
помогло
/* Per-row offsets from column 3 of cmatrix, packed into one 64-bit word:
 * row 2 in the low bits, row 1 shifted up by 16, row 0 shifted up by 32. */
int64_t c = (int64_t) convert->cmatrix[2][3]
    + ((int64_t) convert->cmatrix[1][3] << 16)
    + ((int64_t) convert->cmatrix[0][3] << 32);
Performance counter stats for process id '17523':
18862.102342 task-clock # 0.272 CPUs utilized
30,875 context-switches # 0.002 M/sec
4,253 cpu-migrations # 0.225 K/sec
1,858 page-faults # 0.099 K/sec
56,563,008,115 cycles # 2.999 GHz
13,637,059,458 stalled-cycles-frontend # 24.11% frontend cycles idle
<not supported> stalled-cycles-backend
149,019,018,585 instructions # 2.63 insns per cycle
# 0.09 stalled cycles per insn
8,822,857,677 branches # 467.756 M/sec
32,225,169 branch-misses # 0.37% of all branches
<not supported> L1-dcache-loads:HG
860,977,224 L1-dcache-load-misses:HG # 0.00% of all L1-dcache hits
368,779,426 LLC-loads:HG # 19.551 M/sec
<not supported> LLC-load-misses:HG
69.386963059 seconds time elapsed
/* Set to 2 by videoconvert_convert_matrix8() once the tables are filled;
 * while it is below 2 the tables are (re)computed on entry. */
static int op = 0;

/* 256-entry lookup tables, one per input byte of a pixel; filled by
 * videoconvert_convert_matrix8(). */
static int64_t t_r3[256];
static int64_t t_g3[256];
static int64_t t_b3[256];
Финальный результат такой; может, примут в апстрим. Завтра потестирую на 32 битах.
/*
 * Transform one run of 4-byte pixels in place using convert->cmatrix.
 *
 * convert: provides cmatrix (rows 0..2, columns 0..3 are read) and width,
 *          the number of 4-byte pixels to process.
 * p:       pixel bytes; bytes 1..3 of each pixel are replaced, byte 0 is
 *          not touched.
 *
 * On first use (file-level flag `op` below 2) the tables t_r3/t_g3/t_b3 are
 * filled: entry v holds the three products coefficient * v for one input
 * byte, packed 16 bits apart.  They are not rebuilt afterwards.  Per pixel,
 * three table lookups plus the packed offsets are summed and the three
 * result bytes are taken from bit positions 40, 24 and 8 of the sum.
 */
static void
videoconvert_convert_matrix8 (VideoConvert * convert, gpointer p)
{
  guint8 *base = p;
  int64_t offsets;
  int off;

  if (op < 2) {
    int coef_r[3], coef_g[3], coef_b[3];
    int row, v;

    /* Column n of the matrix holds the weights of input byte n + 1. */
    for (row = 0; row < 3; row++) {
      coef_r[row] = convert->cmatrix[row][0];
      coef_g[row] = convert->cmatrix[row][1];
      coef_b[row] = convert->cmatrix[row][2];
    }

    for (v = 0; v < 256; v++) {
      int64_t acc_r = 0;
      int64_t acc_g = 0;
      int64_t acc_b = 0;

      /* Row 0 ends up in the highest field, row 2 in the lowest. */
      for (row = 0; row < 3; row++) {
        acc_r = (acc_r << 16) + coef_r[row] * v;
        acc_g = (acc_g << 16) + coef_g[row] * v;
        acc_b = (acc_b << 16) + coef_b[row] * v;
      }

      t_r3[v] = acc_r;
      t_g3[v] = acc_g;
      t_b3[v] = acc_b;
    }

    op = 2;
  }

  /* Column 3 offsets, packed into the same layout as the table entries. */
  offsets = ((int64_t) convert->cmatrix[0][3] << 32)
      + ((int64_t) convert->cmatrix[1][3] << 16)
      + ((int64_t) convert->cmatrix[2][3] << 0);

  for (off = 0; off < convert->width * 4; off += 4) {
    guint8 *px = base + off;
    uint8_t in_r = px[1];
    uint8_t in_g = px[2];
    uint8_t in_b = px[3];
    int64_t sum = t_r3[in_r] + t_g3[in_g] + t_b3[in_b] + offsets;

    px[1] = sum >> 40;
    px[2] = sum >> 24;
    px[3] = sum >> 8;
  }
}