001/* 002 * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain. 003 * This java port was authored by Yonik Seeley and also placed into the public domain. 004 * The author hereby disclaims copyright to this source code. 005 */ 006 007package org.tribuo.util; 008 009/** 010 * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain. 011 * This java port was authored by Yonik Seeley and also placed into the public domain. 012 * The author hereby disclaims copyright to this source code. 013 * <p> 014 * This produces exactly the same hash values as the final C++ 015 * version of MurmurHash3 and is thus suitable for producing the same hash values across 016 * platforms. 017 * <p> 018 * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids. 019 * murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash. 020 * <p> 021 * Note - The x86 and x64 versions do _not_ produce the same results, as the 022 * algorithms are optimized for their respective platforms. 023 * <p> 024 * See http://github.com/yonik/java_util for future updates to this file. 025 */ 026public final class MurmurHash3 { 027 028 /** 128 bits of state */ 029 public static final class LongPair { 030 public long val1; 031 public long val2; 032 } 033 034 public static final int fmix32(int h) { 035 h ^= h >>> 16; 036 h *= 0x85ebca6b; 037 h ^= h >>> 13; 038 h *= 0xc2b2ae35; 039 h ^= h >>> 16; 040 return h; 041 } 042 043 public static final long fmix64(long k) { 044 k ^= k >>> 33; 045 k *= 0xff51afd7ed558ccdL; 046 k ^= k >>> 33; 047 k *= 0xc4ceb9fe1a85ec53L; 048 k ^= k >>> 33; 049 return k; 050 } 051 052 /** 053 * Gets a long from a byte buffer in little endian byte order. 054 * @param buf The buffer to operate on. 055 * @param offset The current offset into the buffer. 056 * @return A long. 057 */ 058 public static final long getLongLittleEndian(byte[] buf, int offset) { 059 return ((long)buf[offset+7] << 56) // no mask needed 060 | ((buf[offset+6] & 0xffL) << 48) 061 | ((buf[offset+5] & 0xffL) << 40) 062 | ((buf[offset+4] & 0xffL) << 32) 063 | ((buf[offset+3] & 0xffL) << 24) 064 | ((buf[offset+2] & 0xffL) << 16) 065 | ((buf[offset+1] & 0xffL) << 8) 066 | ((buf[offset ] & 0xffL)); // no shift needed 067 } 068 069 070 /** 071 * Returns the MurmurHash3_x86_32 hash. 072 * @param data The data to hash. 073 * @param offset The offset into the data. 074 * @param len The length of the data to hash. 075 * @param seed The initial seed of the hash. 076 * @return The murmurhash3_x86_32 hash. 077 */ 078 public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) { 079 080 final int c1 = 0xcc9e2d51; 081 final int c2 = 0x1b873593; 082 083 int h1 = seed; 084 int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block 085 086 for (int i=offset; i<roundedEnd; i+=4) { 087 // little endian load order 088 int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24); 089 k1 *= c1; 090 k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 091 k1 *= c2; 092 093 h1 ^= k1; 094 h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 095 h1 = h1*5+0xe6546b64; 096 } 097 098 // tail 099 int k1 = 0; 100 101 switch(len & 0x03) { 102 case 3: 103 k1 = (data[roundedEnd + 2] & 0xff) << 16; 104 // fallthrough 105 case 2: 106 k1 |= (data[roundedEnd + 1] & 0xff) << 8; 107 // fallthrough 108 case 1: 109 k1 |= (data[roundedEnd] & 0xff); 110 k1 *= c1; 111 k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 112 k1 *= c2; 113 h1 ^= k1; 114 } 115 116 // finalization 117 h1 ^= len; 118 119 // fmix(h1); 120 h1 ^= h1 >>> 16; 121 h1 *= 0x85ebca6b; 122 h1 ^= h1 >>> 13; 123 h1 *= 0xc2b2ae35; 124 h1 ^= h1 >>> 16; 125 126 return h1; 127 } 128 129 130 /** 131 * Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding 132 * the string to a temporary buffer. This is more than 2x faster than hashing the result 133 * of String.getBytes(). 134 * @param data The data to hash. 135 * @param offset The offset into the data. 136 * @param len The length of the data to hash. 137 * @param seed The initial seed of the hash. 138 * @return The murmurhash3_x86_32 hash. 139 */ 140 public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) { 141 142 final int c1 = 0xcc9e2d51; 143 final int c2 = 0x1b873593; 144 145 int h1 = seed; 146 147 int pos = offset; 148 int end = offset + len; 149 int k1 = 0; 150 int k2 = 0; 151 int shift = 0; 152 int bits = 0; 153 int nBytes = 0; // length in UTF8 bytes 154 155 156 while (pos < end) { 157 int code = data.charAt(pos++); 158 if (code < 0x80) { 159 k2 = code; 160 bits = 8; 161 162 /*** 163 // optimized ascii implementation (currently slower!!! code size?) 164 if (shift == 24) { 165 k1 = k1 | (code << 24); 166 k1 *= c1; 167 k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 168 k1 *= c2; 169 h1 ^= k1; 170 h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 171 h1 = h1*5+0xe6546b64; 172 shift = 0; 173 nBytes += 4; 174 k1 = 0; 175 } else { 176 k1 |= code << shift; 177 shift += 8; 178 } 179 continue; 180 ***/ 181 182 } 183 else if (code < 0x800) { 184 k2 = (0xC0 | (code >> 6)) 185 | ((0x80 | (code & 0x3F)) << 8); 186 bits = 16; 187 } 188 else if (code < 0xD800 || code > 0xDFFF || pos>=end) { 189 // we check for pos>=end to encode an unpaired surrogate as 3 bytes. 190 k2 = (0xE0 | (code >> 12)) 191 | ((0x80 | ((code >> 6) & 0x3F)) << 8) 192 | ((0x80 | (code & 0x3F)) << 16); 193 bits = 24; 194 } else { 195 // surrogate pair 196 // int utf32 = pos < end ? (int) data.charAt(pos++) : 0; 197 int utf32 = (int) data.charAt(pos++); 198 utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); 199 k2 = (0xff & (0xF0 | (utf32 >> 18))) 200 | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8 201 | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16 202 | (0x80 | (utf32 & 0x3F)) << 24; 203 bits = 32; 204 } 205 206 207 k1 |= k2 << shift; 208 209 // int used_bits = 32 - shift; // how many bits of k2 were used in k1. 210 // int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift 211 212 shift += bits; 213 if (shift >= 32) { 214 // mix after we have a complete word 215 216 k1 *= c1; 217 k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 218 k1 *= c2; 219 220 h1 ^= k1; 221 h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); 222 h1 = h1*5+0xe6546b64; 223 224 shift -= 32; 225 // unfortunately, java won't let you shift 32 bits off, so we need to check for 0 226 if (shift != 0) { 227 k1 = k2 >>> (bits-shift); // bits used == bits - newshift 228 } else { 229 k1 = 0; 230 } 231 nBytes += 4; 232 } 233 234 } // inner 235 236 // handle tail 237 if (shift > 0) { 238 nBytes += shift >> 3; 239 k1 *= c1; 240 k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); 241 k1 *= c2; 242 h1 ^= k1; 243 } 244 245 // finalization 246 h1 ^= nBytes; 247 248 // fmix(h1); 249 h1 ^= h1 >>> 16; 250 h1 *= 0x85ebca6b; 251 h1 ^= h1 >>> 13; 252 h1 *= 0xc2b2ae35; 253 h1 ^= h1 >>> 16; 254 255 return h1; 256 } 257 258 259 /** 260 * Returns the MurmurHash3_x64_128 hash, placing the result in "out". 261 * @param key The data to hash. 262 * @param offset The offset into the data. 263 * @param len The length of the data to hash. 264 * @param seed The initial state of the hash. 265 * @param out The output value (as it's 128 bits). 266 */ 267 public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) { 268 // The original algorithm does have a 32 bit unsigned seed. 269 // We have to mask to match the behavior of the unsigned types and prevent sign extension. 270 long h1 = seed & 0x00000000FFFFFFFFL; 271 long h2 = seed & 0x00000000FFFFFFFFL; 272 273 final long c1 = 0x87c37b91114253d5L; 274 final long c2 = 0x4cf5ad432745937fL; 275 276 int roundedEnd = offset + (len & 0xFFFFFFF0); // round down to 16 byte block 277 for (int i=offset; i<roundedEnd; i+=16) { 278 long k1 = getLongLittleEndian(key, i); 279 long k2 = getLongLittleEndian(key, i+8); 280 k1 *= c1; k1 = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1; 281 h1 = Long.rotateLeft(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 282 k2 *= c2; k2 = Long.rotateLeft(k2,33); k2 *= c1; h2 ^= k2; 283 h2 = Long.rotateLeft(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 284 } 285 286 long k1 = 0; 287 long k2 = 0; 288 289 switch (len & 15) { 290 case 15: k2 = (key[roundedEnd+14] & 0xffL) << 48; 291 case 14: k2 |= (key[roundedEnd+13] & 0xffL) << 40; 292 case 13: k2 |= (key[roundedEnd+12] & 0xffL) << 32; 293 case 12: k2 |= (key[roundedEnd+11] & 0xffL) << 24; 294 case 11: k2 |= (key[roundedEnd+10] & 0xffL) << 16; 295 case 10: k2 |= (key[roundedEnd+ 9] & 0xffL) << 8; 296 case 9: k2 |= (key[roundedEnd+ 8] & 0xffL); 297 k2 *= c2; k2 = Long.rotateLeft(k2, 33); k2 *= c1; h2 ^= k2; 298 case 8: k1 = ((long)key[roundedEnd+7]) << 56; 299 case 7: k1 |= (key[roundedEnd+6] & 0xffL) << 48; 300 case 6: k1 |= (key[roundedEnd+5] & 0xffL) << 40; 301 case 5: k1 |= (key[roundedEnd+4] & 0xffL) << 32; 302 case 4: k1 |= (key[roundedEnd+3] & 0xffL) << 24; 303 case 3: k1 |= (key[roundedEnd+2] & 0xffL) << 16; 304 case 2: k1 |= (key[roundedEnd+1] & 0xffL) << 8; 305 case 1: k1 |= (key[roundedEnd ] & 0xffL); 306 k1 *= c1; k1 = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1; 307 } 308 309 //---------- 310 // finalization 311 312 h1 ^= len; h2 ^= len; 313 314 h1 += h2; 315 h2 += h1; 316 317 h1 = fmix64(h1); 318 h2 = fmix64(h2); 319 320 h1 += h2; 321 h2 += h1; 322 323 out.val1 = h1; 324 out.val2 = h2; 325 } 326 327}