Revision as of 19:06, 12 August 2009 edit 122.167.85.239 (talk) There are little-/big-endian machines other than those made by motorola or intel. ← Previous edit		Revision as of 19:16, 12 August 2009 edit undo Oli Filth (talk \| contribs) Extended confirmed users, Pending changes reviewers, Rollbackers 18,040 edits rm large code snippets (inappropriate), and add notability tag Next edit →
Line 1: {{notability}} {{cleanup\|date=April 2009}} Line 16 ⟶ 17: There are also two versions[http://murmurhash.googlepages.com/MurmurHash2_64.cpp] available which generate 64-bit hash values, suitable for differentiating among tens of thousands of items without false positives caused by the [[birthday paradox]]. The first, MurmurHash64A, is MurmurHash2 scaled up directly to 64 bits, for maximum performance on 64-bit processors. The second, MurmurHash64B, is designed for 32-bit processors, so it essentially runs a pair of MurmurHash2 algorithms in parallel and combines the results. Both of these generate different hash values from each other and MurmurHash2, and have the same byte order and alignment requirements, except that MurmurHash64A alignment is on 8-byte, not 4-byte, boundaries. ==Usage and availability== ~~===MurmurHash2===~~ ~~//-----------------------------------------------------------------------------~~ ~~// MurmurHash2, by Austin Appleby~~ ~~// Note - This code makes a few assumptions about how your machine behaves -~~ ~~// 1. We can read a 4-byte value from any address without crashing~~ ~~// 2. sizeof(int) == 4~~ ~~// And it has a few limitations -~~ ~~// 1. It will not work incrementally.~~ ~~// 2. 
It will not produce the same results on little-endian and big-endian~~ ~~// machines.~~ ~~unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed )~~ { ~~// 'm' and 'r' are mixing constants generated offline.~~ ~~// They're not really 'magic', they just happen to work well.~~ ~~const unsigned int m = 0x5bd1e995;~~ ~~const int r = 24;~~ ~~// Initialize the hash to a 'random' value~~ ~~unsigned int h = seed ^ len;~~ ~~// Mix 4 bytes at a time into the hash~~ ~~const unsigned char * data = (const unsigned char )key;~~ ~~while(len >= 4)~~ { ~~unsigned int k = (unsigned int )data;~~ ~~k = m;~~ ~~k ^= k >> r;~~ ~~k = m;~~ ~~h = m;~~ ~~h ^= k;~~ ~~data += 4;~~ ~~len -= 4;~~ } ~~// Handle the last few bytes of the input array~~ ~~switch(len)~~ { ~~case 3: h ^= data[2] << 16;~~ ~~case 2: h ^= data[1] << 8;~~ ~~case 1: h ^= data[0];~~ ~~h = m;~~ }; ~~// Do a few final mixes of the hash to ensure the last few~~ ~~// bytes are well-incorporated.~~ ~~h ^= h >> 13;~~ ~~h = m;~~ ~~h ^= h >> 15;~~ ~~return h;~~ } ~~===MurmurHash64A/B===~~ ~~typedef unsigned __int64 uint64_t;~~ ~~// 64-bit hash for 64-bit platforms~~ ~~uint64_t MurmurHash64A ( const void * key, int len, unsigned int seed )~~ { ~~const uint64_t m = 0xc6a4a7935bd1e995;~~ ~~const int r = 47;~~ ~~uint64_t h = seed ^ (len * m);~~ ~~const uint64_t * data = (const uint64_t )key;~~ ~~const uint64_t end = data + (len/8);~~ ~~while(data != end)~~ { ~~uint64_t k = data++;~~ ~~k = m;~~ ~~k ^= k >> r;~~ ~~k = m;~~ ~~h ^= k;~~ ~~h = m;~~ } ~~const unsigned char * data2 = (const unsigned char)data;~~ ~~switch(len & 7)~~ { ~~case 7: h ^= uint64_t(data2[6]) << 48;~~ ~~case 6: h ^= uint64_t(data2[5]) << 40;~~ ~~case 5: h ^= uint64_t(data2[4]) << 32;~~ ~~case 4: h ^= uint64_t(data2[3]) << 24;~~ ~~case 3: h ^= uint64_t(data2[2]) << 16;~~ ~~case 2: h ^= uint64_t(data2[1]) << 8;~~ ~~case 1: h ^= uint64_t(data2[0]);~~ ~~h = m;~~ }; ~~h ^= h >> r;~~ ~~h = m;~~ ~~h ^= h >> r;~~ ~~return h;~~ } ~~// 64-bit hash for 32-bit 
platforms~~ ~~uint64_t MurmurHash64B ( const void key, int len, unsigned int seed )~~ { ~~const unsigned int m = 0x5bd1e995;~~ ~~const int r = 24;~~ ~~unsigned int h1 = seed ^ len;~~ ~~unsigned int h2 = 0;~~ ~~const unsigned int * data = (const unsigned int )key;~~ ~~while(len >= 8)~~ { ~~unsigned int k1 = data++;~~ ~~k1 = m; k1 ^= k1 >> r; k1 = m;~~ ~~h1 = m; h1 ^= k1;~~ ~~len -= 4;~~ ~~unsigned int k2 = data++;~~ ~~k2 = m; k2 ^= k2 >> r; k2 = m;~~ ~~h2 = m; h2 ^= k2;~~ ~~len -= 4;~~ } ~~if(len >= 4)~~ { ~~unsigned int k1 = data++;~~ ~~k1 = m; k1 ^= k1 >> r; k1 = m;~~ ~~h1 = m; h1 ^= k1;~~ ~~len -= 4;~~ } ~~switch(len)~~ { ~~case 3: h2 ^= ((unsigned char)data)[2] << 16;~~ ~~case 2: h2 ^= ((unsigned char)data)[1] << 8;~~ ~~case 1: h2 ^= ((unsigned char)data)[0];~~ ~~h2 = m;~~ }; ~~h1 ^= h2 >> 18; h1 = m;~~ ~~h2 ^= h1 >> 22; h2 = m;~~ ~~h1 ^= h2 >> 17; h1 = m;~~ ~~h2 ^= h1 >> 19; h2 *= m;~~ ~~uint64_t h = h1;~~ ~~h = (h << 32) \| h2;~~ ~~return h;~~ } ~~==Usage and availability.==~~ Murmur was originally expressed in C++, and has been ported to a number of popular languages, including Python[http://pypi.python.org/pypi/Murmur/0.1.3], C#[http://landman-code.blogspot.com/], Perl[http://search.cpan.org/~tmaesaka/Digest-MurmurHash-0.10/lib/Digest/MurmurHash.pm], Java[http://www.getopt.org/murmur/MurmurHash.java][http://hadoop.apache.org/hbase/docs/r0.19.1/api/org/apache/hadoop/hbase/util/MurmurHash.html] and Delphi[https://forums.embarcadero.com/thread.jspa?messageID=99863&#99863].

MurmurHash: Difference between revisions