Below is my innermost loop that's run several thousand times, with input sizes of 20 - 1000 or more. This piece of code takes up 99 - 99.5% of execution time. Is there anything I can do to help squeeze any more performance out of this?
I'm not looking to move this code to something like using tree codes (Barnes-Hut), but towards optimizing the actual calculations happening inside, since the same calculations occur in the Barnes-Hut algorithm.
Any help is appreciated!
Edit: I'm running in Windows 7 64-bit with Visual Studio 2008 edition on a Core 2 Duo T5850 (2.16 GHz)
typedef double real;
struct Particle
{
Vector pos, vel, acc, jerk;
Vector oldPos, oldVel, oldAcc, oldJerk;
real mass;
};
class Vector
{
private:
real vec[3];
public:
// Operators defined here
};
real Gravity::interact(Particle *p, size_t numParticles)
{
PROFILE_FUNC();
real tau_q = 1e300;
for (size_t i = 0; i < numParticles; i++)
{
p[i].jerk = 0;
p[i].acc = 0;
}
for (size_t i = 0; i < numParticles; i++)
{
for (size_t j = i+1; j < numParticles; j++)
{
Vector r = p[j].pos - p[i].pos;
Vector v = p[j].vel - p[i].vel;
real r2 = lengthsq(r);
real v2 = lengthsq(v);
// Calculate inverse of |r|^3
real r3i = Constants::G * pow(r2, -1.5);
// da = r / |r|^3
// dj = (v / |r|^3 - 3 * (r . v) * r / |r|^5
Vector da = r * r3i;
Vector dj = (v - r * (3 * dot(r, v) / r2)) * r3i;
// Calculate new acceleration and jerk
p[i].acc += da * p[j].mass;
p[i].jerk += dj * p[j].mass;
p[j].acc -= da * p[i].mass;
p[j].jerk -= dj * p[i].mass;
// Collision estimation
// Metric 1) tau = |r|^2 / |a(j) - a(i)|
// Metric 2) tau = |r|^4 / |v|^4
real mij = p[i].mass + p[j].mass;
real tau_est_q1 = r2 / (lengthsq(da) * mij * mij);
real tau_est_q2 = (r2*r2) / (v2*v2);
if (tau_est_q1 < tau_q)
tau_q = tau_est_q1;
if (tau_est_q2 < tau_q)
tau_q = tau_est_q2;
}
}
return sqrt(sqrt(tau_q));
}