home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Large Pack of OldSkool DOS MOD Trackers
/
goattracker_2.68.zip
/
src
/
resid-fp
/
convolve-sse.cpp
next >
Wrap
C/C++ Source or Header
|
2009-01-03
|
3KB
|
77 lines
// ---------------------------------------------------------------------------
// This file is part of reSID, a MOS6581 SID emulator engine.
// Copyright (C) 2004 Dag Lem <resid@nimrod.no>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// ---------------------------------------------------------------------------
#include "sid.h"
#if (RESID_USE_SSE==1)
#include <stdint.h>
#include <xmmintrin.h>
/*
 * Dot product of two float arrays, vectorised with SSE.
 *
 * a, b: input arrays; each must be readable for at least n floats.
 *       Neither pointer needs to be 16-byte aligned.
 * n:    number of element pairs to multiply and accumulate.
 *       n <= 0 yields 0 without touching either array.
 *
 * Returns the sum of a[i] * b[i] for i in [0, n).
 */
float convolve_sse(const float *a, const float *b, int n)
{
    if (n <= 0) {
        return 0.f;
    }

    float out = 0.f;

    /* Scalar prologue: step until a sits on a 16-byte boundary so the
     * vector loops can always use aligned loads for a. If a is not even
     * float-aligned the boundary is never reached and the whole input is
     * consumed here, one element at a time. */
    while (n > 0 && ((uintptr_t) a & 0xf) != 0) {
        out += (*a++) * (*b++);
        --n;
    }

    __m128 out4 = _mm_setzero_ps();
    const int n4 = n / 4;

    /* With a aligned, b can use aligned loads exactly when its own address
     * is a multiple of 16. Testing the address directly avoids subtracting
     * pointers that may belong to different arrays. */
    if (((uintptr_t) b & 0xf) == 0) {
        for (int i = 0; i < n4; i++) {
            out4 = _mm_add_ps(out4, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)));
            a += 4;
            b += 4;
        }
    } else {
        /* XXX The original author measured loadu as at least 4x slower
         * than load. Alternatives considered: keeping pre-shifted copies
         * of b for every alignment of a (4x the memory), or issuing aligned
         * loads and shuffling the halves on each iteration. Initial
         * results showed only very small improvements. */
        for (int i = 0; i < n4; i++) {
            out4 = _mm_add_ps(out4, _mm_mul_ps(_mm_load_ps(a), _mm_loadu_ps(b)));
            a += 4;
            b += 4;
        }
    }

    /* Horizontal sum of the four lanes: fold the high pair onto the low
     * pair, then fold lane 1 onto lane 0. */
    out4 = _mm_add_ps(_mm_movehl_ps(out4, out4), out4);
    out4 = _mm_add_ss(_mm_shuffle_ps(out4, out4, 1), out4);

    float out_tmp;
    _mm_store_ss(&out_tmp, out4);
    out += out_tmp;

    /* Scalar epilogue for the 0..3 elements left after the vector loop. */
    for (int rest = n & 3; rest > 0; --rest) {
        out += (*a++) * (*b++);
    }

    return out;
}
#endif