You can save a bit on memory by relying on the fact that the pointer will use less than 64 bits. First, define a compare&set
function (this ASM works in GCC & ICC):
inline bool CAS_ (volatile uint64_t* mem, uint64_t old_val, uint64_t new_val)
{
unsigned long old_high = old_val >> 32, old_low = old_val;
unsigned long new_high = new_val >> 32, new_low = new_val;
char res = 0;
asm volatile("lock; cmpxchg8b (%6);"
"setz %7; "
: "=a" (old_low), // 0
"=d" (old_high) // 1
: "0" (old_low), // 2
"1" (old_high), // 3
"b" (new_low), // 4
"c" (new_high), // 5
"r" (mem), // 6
"m" (res) // 7
: "cc", "memory");
return res;
}
You'll then need to build a tagged-pointer type. I'm assuming a 40-bit pointer with a cacheline-width of 128-bytes (like Nehalem). Aligning to the cache-line will give enormous speed improvements by reducing false-sharing, contention, etc.; this has the obvious trade-off of using a lot more memory, in some situations.
template <typename pointer_type, typename tag_type, int PtrAlign=7, int PtrWidth=40>
struct tagged_pointer
{
static const unsigned int PtrMask = (1 << (PtrWidth - PtrAlign)) - 1;
static const unsigned int TagMask = ~ PtrMask;
typedef unsigned long raw_value_type;
raw_value_type raw_m_;
tagged_pointer () : raw_m_(0) {}
tagged_pointer (pointer_type ptr) { this->pack(ptr, 0); }
tagged_pointer (pointer_type ptr, tag_type tag) { this->pack(ptr, tag); }
void pack (pointer_type ptr, tag_type tag)
{
this->raw_m_ = 0;
this->raw_m_ |= ((ptr >> PtrAlign) & PtrMask);
this->raw_m_ |= ((tag << (PtrWidth - PtrAlign)) & TagMask);
}
pointer_type ptr () const
{
raw_value_type p = (this->raw_m_ & PtrMask) << PtrAlign;
return *reinterpret_cast<pointer_type*>(&p);
}
tag_type tag () const
{
raw_value_type t = (this->raw_m_ & TagMask) >> (PtrWidth - PtrAlign_;
return *reinterpret_cast<tag_type*>(&t);
}
};
I haven't had a chance to debug this code, so you'll need to do that, but this is the general idea.