Sunday, August 18, 2013

Prime sieve calculation

Introduction

Calculating Prime sieve.
All the implementation are in C++11

sieve of Erathostenes

This implements optimized version of classical Erathostenes sieve algorithm.
void sieve_of_erathostenes(hash* sieve) {
        sieve->set();
        sieve->reset(0);
        sieve->reset(1);
        unsigned long L = ceil(sqrt(N));
        unsigned long i=2;
        while(i <= L) {
                for(unsigned long p =i*i; p <= N; p += i)
                        sieve->reset(p);
                do i++; while(sieve->test(i) == false);
        }
}

sieve of Atkin

This implements Atkin sieve.
void sieve_of_atkin(hash* sieve) {
        sieve->reset();
        unsigned long long S = ceil(sqrt(N));
        unsigned long long p = 0;
        for(unsigned long long x =1; x <= S; ++x) {
                p += ((x << 1)-1); //x*x
                unsigned long long q = 0;
                for(unsigned long long y =1; y <= S; ++y) {
                        q += ((y << 1)-1); // y*y
                        unsigned long long v = (p << 2) + q; //4*p+q
                        if(v <= N && (v%12 == 1 || v%12 == 5)){
                                sieve->flip(v);
                        }
                        v -= p;//v = 3*p + q;
                        if(v <= N && (v%12 == 7)) {
                                sieve->flip(v);
                        }
                        if(x > y) {
                                v -= (q << 1);//v = 3*p - q;
                                if(v <= N && (v%12 == 11)) {
                                        sieve->flip(v);
                                }
                        }
                }
        }
        for (unsigned long long v = 5; v <= S; ++v)
                if (sieve->test(v) == true)
                        for (unsigned long long s = v*v, k = v*v; k <= N; k += s) sieve->reset(k);
        sieve->set(2);
        sieve->set(3);
}

main

#include <iostream>
#include <algorithm>
#include <iterator>
#include <bitset>
#include <string>
#include <cmath>
using namespace std;
#ifdef DEBUG#define NV(v)  #v"=" << v
#define TRACE(v)  std::cout << v << std::endl;
const unsigned long N = 100;
#else
#define NV(v) ""
#define TRACE(v);
const unsigned long N = 1000000000;
#endif
typedef bitset<N+1> hash;
/--- Add functions here ---/
int main() {
        hash* sieve = new hash();
#ifdef ERATHOSTENES
        sieve_of_erathostenes(sieve);
#else
        sieve_of_atkin(sieve);
#endif
        cout << sieve->count() << endl;
        delete sieve;
}
Both Erathostenes and Atkin gives almost same time complexity. If we carefully examine Erathostenes first loop there is scope to optimize outer loop.


Wheel Sieve

In the set of N natural number there will be 50% number divisible by 2, 33% number by 3 and so on. If we some how cross-out the multiple of 2, 3, 5 and 7 that means 96% of numbers have been excluded as non-prime. That is what wheel sieve does. Let's see implementation -
// g++ -std=c++11 s-w-sieve.cpp
#include <iostream>
#include <algorithm>
#include <iterator>
#include <bitset>
#include <string>
#include <vector>
#include <cmath>
using namespace std;
#define NV(v)  #v"=" << v 
#define TRACE(v)  std::cout << v << std::endl; 
typedef unsigned long long ULONGLONG;
typedef unsigned long ULONG;
const ULONGLONG N   = 1000000000UL;
typedef struct _element__ {
                unsigned long dist;
                unsigned long rp;
                unsigned long pos;
                unsigned long inv;
}E;
E _E0 = {};
/*
return true if X,Y relative Prime
false otherwise  
*/
bool RP(ULONG x, ULONG y) {
        if (x == 0)
                return y == 1;
        if (y == 0)
                return x == 1;
        ULONG cf2 = ctz(x | y);
        x >>= ctz(x);
        for (;;) {
                y >>= ctz(y);
                if (x == y)
                        break;
                if (x > y)
                        std::swap(x, y);
                if (x == 1)
                        break;
                y -= x;
        }
        return (x << cf2) == 1;
}
/*
Setup Wheel data-structure of K primes
*/
void Wheel(vector<E>& W, const ULONG& m) {
        E e0 = {false, 1, 0, -1};
        W[0] = e0;
        ULONG rp = 0, x = 1;
        while(x < m) {
                ULONG d = 1;
                W[x].rp = true;
                ++rp;
                while(RP(x+d, m) == false) ++d;
                W[x].inv = 0;
                W[x].pos = rp;
                W[rp].inv = x;
                W[x++].dist = d--;
                while(d > 0 && x < m) {
                        E ex = {false, d, 0, 0};
                        W[x] = ex;
                        ++x, --d;
                }
        }
        // phi(m) = W[m-1].pos
        W[W[m-1].pos].inv = 0;
}
void wheel_sieve(bitset<N+1>& sieve) {
        const ULONG S = floor(sqrt(N));
        const ULONG T = floor(sqrt(S));
        const ULONG v = floor((pow(N, 0.3333333333) + S)/2);
        ULONG k = 0, m = 1;
        vector<ULONG> P;
        sieve.set();
        sieve[0] = false;
        sieve[1] = false;
        ULONG p=2;
        // Get prime list below sqrt N and
        // value of k and m for wheel init
        while(p <= T) {
                P.push_back(p);
                if(m <= v && m*p <= v) {
                        m *= p;
                        ++k;
                }
                for(ULONG q =p*p; q <= S; q += p) sieve[q] = false;
                while(sieve[++p] == false);
        }

        do {
                if(sieve[p] == true) {
                        P.push_back(p);
                        if(m <= v && m*p <= v) {
                                m *= p;
                                ++k;
                        }
                }
        } while(++p <= S);
        // Construct wheel for k of size m 
        vector<E>* W = new vector<E>(m, _E0);
        Wheel(*W, m);
        // Good to go with Algorithm 
        sieve.reset();
        for(ULONG i=0; i < k; i++) sieve[P[i]] = true;
        // Till here quite fast
        // Main loop
        for(p = P[k]; p <= S; ++p)
                if(sieve[p] == true)
                        for(ULONG i = p; i <= N/p; i+=(*W)[i%m].dist)                    
                             sieve[p*i] = false;
        delete W;
}

int main() {
        bitset<N+1>* sieve = new bitset<N+1>();
        wheel_sieve(*sieve);
        ULONGLONG Primes = sieve->count();
        TRACE("Number of " << NV(Primes) <<" in " << NV(N));
        delete sieve;
        return 0;
}

Multi-threaded

Now there are few optimization over previous implementation and here we tried to break the outer main loop in slots because their execution is completely independent and there is no overlap. That gives us opportunity for parallel-execution of outer loop of main loop in slots. Note that Main Loop is still very heavy.
Now the prime sieve execution is quite fast and reasonable. I am trying all the algorithms with N = 1000000000, More than this will cause ULONG overflow.
//g++  -std=c++11 p-s-w-sieve.cpp -pthread
#include <iostream>
#include <algorithm>
#include <iterator>
#include <bitset>
#include <limits>
#include <climits>
#include <string>
#include <vector>
#include <cmath>
#include <thread>
using namespace std;
#define NV(v)  #v"=" << v 
#define TRACE(v)  std::cout << v << std::endl; 
typedef unsigned long long ULONGLONG;
typedef unsigned long ULONG;
typedef unsigned short USHORT;
const ULONGLONG N   = 1000000000UL;
typedef std::bitset<N+1> Bitset;
typedef std::array<USHORT, 6542> uarray;
//const ULONGLONG N   = ULONG_MAX-1;
//typedef std::bitset<ULONG_MAX> Bitset;
typedef struct _element__ {
                bool rp;
                unsigned long dist;
                unsigned long pos;
                long inv;
}E;
E _E0 = {};
static const int Mod37BitPosition[] = // map a bit value mod 37 to its position
{
          32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13, 4,
            7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9, 5,
                  20, 8, 19, 18
};
#define ctz(v) Mod37BitPosition[((-(v)) & (v)) % 37];
// First 10 prime enough to overflow ULONG
// P is list all all prime below sqrt(N)
// Fill it 
constexpr uarray P = {};

bool RP(ULONG x, ULONG y) {
        if (x == 0)
                return y == 1;
        if (y == 0)
                return x == 1;
        ULONG cf2 = ctz(x | y);
        x >>= ctz(x);
        for (;;) {
                y >>= ctz(y);
                if (x == y)
                        break;
                if (x > y)
                        std::swap(x, y);
                if (x == 1)
                        break;
                y -= x;
        }
        return (x << cf2) == 1;
}
void Wheel(vector<E>& W, const ULONG& m) {
        E e0 = {false, 1, 0, -1};
        W[0] = e0;
        ULONG rp = 0, x = 1;
        while(x < m) {
                ULONG d = 1;
                W[x].rp = true;
                ++rp;
                while(RP(x+d, m) == false) ++d;
                W[x].inv = 0;
                W[x].pos = rp;
                W[rp].inv = x;
                W[x++].dist = d--;
                while(d > 0 && x < m) {
                        E ex = {false, d, 0, 0};
                        W[x] = ex;
                        ++x, --d;
                }
        }
        // phi(m) = W[m-1].pos
        W[W[m-1].pos].inv = 0;
}


 USHORT setK(const uarray& P, const ULONG& S, ULONG& m) {
        ULONG V = (ULONG)floor((pow(N, 0.3333333333) + S)/2);
        USHORT k = 0;
        m = 1;
        // First 10 prime enough to overflow ULONG
        //while(m*P[k] <= V && k < 25) m *= P[k++];
        while(m*P[k] <= V) m *= P[k++];
        return k;
}
void do_main_loop(const ULONG& start,
                const ULONG& end,
                const ULONG& M,
                const uarray& Primes,
                const vector<E>& W,
                Bitset& sieve)
{
        //TRACE("task:" << this_thread::get_id() << " started " << NV(start) << " " << NV(end));
        for(USHORT j = (USHORT)(start); j < end; ++j)
                for(ULONG i = Primes[j]; i <= N/Primes[j]; i+=W[i%M].dist)
                        sieve[Primes[j]*i] = false;
        //TRACE("task:" << this_thread::get_id() << " end");
}

void wheel_sieve(Bitset& sieve) {
        constexpr const ULONG S = floor(sqrt(N));
        ULONG ntask = 4;
        ULONG width = 0;
        USHORT K = 0;
        ULONG  M = 1;
        sieve.reset();
        K = setK(P, S, M);
        // Construct wheel for k of size m 
        // Replace with precomputed array
        vector<E>* __W__ = new vector<E>(M, _E0);
        vector<E>& W = ref(*__W__);
        std::vector<std::thread> tasks;
        Wheel(W, M);

        //TRACE("running wheel");
        for(ULONG i = P[P.size()-1]; i <= N; i +=W[i%M].dist) 
                   sieve[i] = true;


        // Main loop
        //TRACE("main loop");

        width = ceil((P.size()-K)/ntask);
        if((P.size()-K)%ntask != 0)
                do_main_loop(width*ntask+K, P.size(), M, cref(P), cref(W), ref(sieve));

        tasks.clear();
        for(ULONG start=K; start+width <= (width*ntask+K); start+=width)
                tasks.push_back(thread(do_main_loop, start, start+width, M, cref(P), cref(W), ref(sieve)));

        for(auto& task : tasks) task.join();

        delete __W__;
}

int main() {
        Bitset* sieve = new Bitset();
        wheel_sieve(ref(*sieve));
        ULONG Primes = sieve->count()+P.size()-1;
        TRACE("Number of " << NV(Primes) <<" in " << NV(N));
        delete sieve;
        return 0;
}

Multi-threaded with big integer (GMP Libraries)

To-Do

Bench marking

To-Do