My other answer is rather long and quite different from this one, so here's something else.
Rather than just filter out multiples of the first two primes, or encode all relevant primes in one byte each, this program filters out multiples of all of the primes that fit in eight bits, specifically 2 through 211. So instead of passing 33% of numbers, this passes about 10% on to the division operator.
The primes are kept in three SSE registers, and their moduli with the running counter are kept in another three. If the modulus of any prime with the counter is equal to zero, the counter cannot be prime. Also, if any modulus is equal to one, then (counter+2) cannot be prime, etc, up to (counter+30). Even numbers are ignored, and offsets like +3, +6, and +5 are skipped. Vector processing allows updating sixteen byte-sized variables at once.
After throwing a kitchen sink full o' micro-optimizations at it (but nothing more platform specific than an inline directive), I got a 1.78x performance boost (24 s vs 13.4 s on my laptop). If using a bignum library (even a very fast one), the advantage is greater. See below for a more readable, pre-optimization version.
/*
* factorize_sse.cpp
* Filter out multiples of the first 47 primes while factorizing a number.
*
* Created by David Krauss on 10/14/10.
*
*/
#include <cmath>
#include <sstream>
#include <iostream>
#include <xmmintrin.h>
using namespace std;
inline void remove_factor( unsigned long &n, unsigned long factor ) __attribute__((always_inline));
void remove_factor( unsigned long &n, unsigned long factor ) {
while ( n % factor == 0 ) {
n /= factor;
cout << factor << " ";
}
}
int main( int argc, char *argv[] ) {
unsigned long n;
if ( argc != 2
|| ( istringstream( argv[1] ) >> n >> ws ).rdstate() != ios::eofbit ) {
cerr << "Usage: " << argv[0] << " <number>\n";
return 1;
}
int primes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47,
53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127,
131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211 };
for ( int *p = primes; p < primes + sizeof primes/sizeof *primes; ++ p ) {
remove_factor( n, * p );
}
//int histo[8] = {}, total = 0;
enum { bias = 15 - 128 };
__m128i const prime1 = _mm_set_epi8( 21, 21, 21, 22, 22, 26, 26, 17, 19, 23, 29, 31, 37, 41, 43, 47 ),
prime2 = _mm_set_epi8( 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127 ),
prime3 = _mm_set_epi8( 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211 ),
vbias = _mm_set1_epi8( bias ),
v3 = _mm_set1_epi8( 3+bias ), v5 = _mm_set1_epi8( 5+bias ), v6 = _mm_set1_epi8( 6+bias ), v8 = _mm_set1_epi8( 8+bias ),
v9 = _mm_set1_epi8( 9+bias ), v11 = _mm_set1_epi8( 11+bias ), v14 = _mm_set1_epi8( 14+bias ), v15 = _mm_set1_epi8( 15+bias );
__m128i mod1 = _mm_add_epi8( _mm_set_epi8( 3, 10, 17, 5, 16, 6, 19, 8, 9, 11, 14, 15, 18, 20, 21, 23 ), vbias ),
mod2 = _mm_add_epi8( _mm_set_epi8( 26, 29, 30, 33, 35, 36, 39, 41, 44, 48, 50, 51, 53, 54, 56, 63 ), vbias ),
mod3 = _mm_add_epi8( _mm_set_epi8( 65, 68, 69, 74, 75, 78, 81, 83, 86, 89, 90, 95, 96, 98, 99, 105 ), vbias );
for ( unsigned long factor = 1, limit = sqrtl( n ); factor <= limit + 30; factor += 30 ) {
if ( n == 1 ) goto done;
// up to 2^32, distribution of number candidates produced (0 up to 7) is
// 0.010841 0.0785208 0.222928 0.31905 0.246109 0.101023 0.0200728 0.00145546
unsigned candidates[8], *cand_pen = candidates;
* cand_pen = 6;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v3 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v3 ), _mm_cmpeq_epi8( mod3, v3 ) ) ) );
* cand_pen = 10;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v5 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v5 ), _mm_cmpeq_epi8( mod3, v5 ) ) ) );
* cand_pen = 12;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v6 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v6 ), _mm_cmpeq_epi8( mod3, v6 ) ) ) );
* cand_pen = 16;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v8 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v8 ), _mm_cmpeq_epi8( mod3, v8 ) ) ) );
* cand_pen = 18;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v9 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v9 ), _mm_cmpeq_epi8( mod3, v9 ) ) ) );
* cand_pen = 22;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v11 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v11 ), _mm_cmpeq_epi8( mod3, v11 ) ) ) );
* cand_pen = 28;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v14 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v14 ), _mm_cmpeq_epi8( mod3, v14 ) ) ) );
* cand_pen = 30;
cand_pen += !( _mm_movemask_epi8( _mm_cmpeq_epi8( mod1, v15 ) ) | _mm_movemask_epi8( _mm_or_si128( _mm_cmpeq_epi8( mod2, v15 ), _mm_cmpeq_epi8( mod3, v15 ) ) ) );
/*++ total;
++ histo[ cand_pen - candidates ];
cout << "( ";
while ( cand_pen != candidates ) cout << factor + * -- cand_pen << " ";
cout << ")" << endl; */
mod1 = _mm_sub_epi8( mod1, _mm_set1_epi8( 15 ) ); // update residuals
__m128i mask1 = _mm_cmplt_epi8( mod1, _mm_set1_epi8( 1+bias ) );
mask1 = _mm_and_si128( mask1, prime1 ); // residual goes to zero or negative?
mod1 = _mm_add_epi8( mask1, mod1 ); // combine reset into zero or negative
mod2 = _mm_sub_epi8( mod2, _mm_set1_epi8( 15 ) );
__m128i mask2 = _mm_cmplt_epi8( mod2, _mm_set1_epi8( 1+bias ) );
mask2 = _mm_and_si128( mask2, prime2 );
mod2 = _mm_add_epi8( mask2, mod2 );
mod3 = _mm_sub_epi8( mod3, _mm_set1_epi8( 15 ) );
__m128i mask3 = _mm_cmplt_epi8( mod3, _mm_set1_epi8( 1+bias ) );
mask3 = _mm_and_si128( mask3, prime3 );
mod3 = _mm_add_epi8( mask3, mod3 );
if ( cand_pen - candidates == 0 ) continue;
remove_factor( n, factor + candidates[ 0 ] );
if ( cand_pen - candidates == 1 ) continue;
remove_factor( n, factor + candidates[ 1 ] );
if ( cand_pen - candidates == 2 ) continue;
remove_factor( n, factor + candidates[ 2 ] );
if ( cand_pen - candidates == 3 ) continue;
remove_factor( n, factor + candidates[ 3 ] );
if ( cand_pen - candidates == 4 ) continue;
remove_factor( n, factor + candidates[ 4 ] );
if ( cand_pen - candidates == 5 ) continue;
remove_factor( n, factor + candidates[ 5 ] );
if ( cand_pen - candidates == 6 ) continue;
remove_factor( n, factor + candidates[ 6 ] );
}
cout << n;
done:
/*cout << endl;
for ( int hx = 0; hx < 8; ++ hx ) cout << (float) histo[hx] / total << " ";*/
cout << endl;
}
.
dkrauss$ /usr/local/bin/g++ main.cpp -o factorize_sse -O3 --profile-use
dkrauss$ time ./factorize_sse 18446743721522234449
4294967231 4294967279
real 0m13.437s
user 0m13.393s
sys 0m0.011s
Below is the first draft of the above. Optimizations included
- Make the cyclic counter merge unconditional (avoid a branch).
- Obtain ILP by unrolling loop by a factor of 15, increasing stride to 30.
- Inspired by your optimization.
- 30 seems to be a sweet spot, as it removes multiples of 2, 3, and 5 for free.
- Primes between 5 and 15 may have several multiples in one stride, so put several copies at varied phase in the vector.
- Factor out
remove_factor
.
- Change conditional, unpredictable
remove_factor
calls to non-branching array writes.
- Fully unroll final loop with
remove_factor
call and make sure the function is always inlined.
- Eliminate the final unrolled iteration as there is always a multiple of 7 among the candidates.
- Add another vector containing all the remaining primes that are small enough.
- Make more space by adding a bias to the counters, and add yet another vector. Now there are only six more primes that could possibly be filtered without bumping up to 16 bits, and I've also run out of registers: the loop needs 3 prime vectors, 3 modulus vectors, 8 constants to search for, and one constant each to increment by and to do the range check. That makes 16.
- The gain is minimal (but positive) in this application, but the original purpose of this technique was to filter primes for the sieve in the other answer. So stay tuned…
Readable version:
/*
* factorize_sse.cpp
* Filter out multiples of the first 17 primes while factorizing a number.
*
* Created by David Krauss on 10/14/10.
*
*/
#include <cmath>
#include <sstream>
#include <iostream>
#include <xmmintrin.h>
using namespace std;
int main( int argc, char *argv[] ) {
unsigned long n;
if ( argc != 2
|| ( istringstream( argv[1] ) >> n >> ws ).rdstate() != ios::eofbit ) {
cerr << "Usage: " << argv[0] << " <number>\n";
return 1;
}
int primes[] = { 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59 };
for ( int *p = primes; p < primes + sizeof primes/sizeof *primes; ++ p ) {
while ( n % * p == 0 ) {
n /= * p;
cout << * p << " ";
}
}
if ( n != 1 ) {
__m128i mod = _mm_set_epi8( 1, 2, 3, 5, 6, 8, 9, 11, 14, 15, 18, 20, 21, 23, 26, 29 );
__m128i const prime = _mm_set_epi8( 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59 ),
one = _mm_set1_epi8( 1 );
for ( unsigned long factor = 1, limit = sqrtl( n ); factor < limit; ) {
factor += 2;
__m128i mask = _mm_cmpeq_epi8( mod, one ); // residual going to zero?
mod = _mm_sub_epi8( mod, one ); // update other residuals
if ( _mm_movemask_epi8( mask ) ) {
mask = _mm_and_si128( mask, prime ); // reset cycle if going to zero
mod = _mm_or_si128( mask, mod ); // combine reset into zeroed position
} else while ( n % factor == 0 ) {
n /= factor;
cout << factor << " ";
if ( n == 1 ) goto done;
}
}
cout << n;
}
done:
cout << endl;
}