001/*
002 * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain.
003 * This java port was authored by Yonik Seeley and also placed into the public domain.
004 * The author hereby disclaims copyright to this source code.
005 */
006
007package org.tribuo.util;
008
009/**
010 *  The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain.
011 *  This java port was authored by Yonik Seeley and also placed into the public domain.
012 *  The author hereby disclaims copyright to this source code.
013 *  <p>
014 *  This produces exactly the same hash values as the final C++
015 *  version of MurmurHash3 and is thus suitable for producing the same hash values across
016 *  platforms.
017 *  <p>
018 *  The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids.
019 *  murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash.
020 *  <p>
021 *  Note - The x86 and x64 versions do _not_ produce the same results, as the
022 *  algorithms are optimized for their respective platforms.
023 *  <p>
024 *  See http://github.com/yonik/java_util for future updates to this file.
025 */
026public final class MurmurHash3 {
027
028    /** 128 bits of state */
029    public static final class LongPair {
030        public long val1;
031        public long val2;
032    }
033
034    public static final int fmix32(int h) {
035        h ^= h >>> 16;
036        h *= 0x85ebca6b;
037        h ^= h >>> 13;
038        h *= 0xc2b2ae35;
039        h ^= h >>> 16;
040        return h;
041    }
042
043    public static final long fmix64(long k) {
044        k ^= k >>> 33;
045        k *= 0xff51afd7ed558ccdL;
046        k ^= k >>> 33;
047        k *= 0xc4ceb9fe1a85ec53L;
048        k ^= k >>> 33;
049        return k;
050    }
051
052    /**
053     * Gets a long from a byte buffer in little endian byte order.
054     * @param buf The buffer to operate on.
055     * @param offset The current offset into the buffer.
056     * @return A long.
057     */
058    public static final long getLongLittleEndian(byte[] buf, int offset) {
059        return     ((long)buf[offset+7]    << 56)   // no mask needed
060                | ((buf[offset+6] & 0xffL) << 48)
061                | ((buf[offset+5] & 0xffL) << 40)
062                | ((buf[offset+4] & 0xffL) << 32)
063                | ((buf[offset+3] & 0xffL) << 24)
064                | ((buf[offset+2] & 0xffL) << 16)
065                | ((buf[offset+1] & 0xffL) << 8)
066                | ((buf[offset  ] & 0xffL));        // no shift needed
067    }
068
069
070    /**
071     * Returns the MurmurHash3_x86_32 hash.
072     * @param data The data to hash.
073     * @param offset The offset into the data.
074     * @param len The length of the data to hash.
075     * @param seed The initial seed of the hash.
076     * @return The murmurhash3_x86_32 hash.
077     */
078    public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {
079
080        final int c1 = 0xcc9e2d51;
081        final int c2 = 0x1b873593;
082
083        int h1 = seed;
084        int roundedEnd = offset + (len & 0xfffffffc);  // round down to 4 byte block
085
086        for (int i=offset; i<roundedEnd; i+=4) {
087            // little endian load order
088            int k1 = (data[i] & 0xff) | ((data[i+1] & 0xff) << 8) | ((data[i+2] & 0xff) << 16) | (data[i+3] << 24);
089            k1 *= c1;
090            k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
091            k1 *= c2;
092
093            h1 ^= k1;
094            h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
095            h1 = h1*5+0xe6546b64;
096        }
097
098        // tail
099        int k1 = 0;
100
101        switch(len & 0x03) {
102            case 3:
103                k1 = (data[roundedEnd + 2] & 0xff) << 16;
104                // fallthrough
105            case 2:
106                k1 |= (data[roundedEnd + 1] & 0xff) << 8;
107                // fallthrough
108            case 1:
109                k1 |= (data[roundedEnd] & 0xff);
110                k1 *= c1;
111                k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
112                k1 *= c2;
113                h1 ^= k1;
114        }
115
116        // finalization
117        h1 ^= len;
118
119        // fmix(h1);
120        h1 ^= h1 >>> 16;
121        h1 *= 0x85ebca6b;
122        h1 ^= h1 >>> 13;
123        h1 *= 0xc2b2ae35;
124        h1 ^= h1 >>> 16;
125
126        return h1;
127    }
128
129
130    /**
131     * Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding
132     * the string to a temporary buffer.  This is more than 2x faster than hashing the result
133     * of String.getBytes().
134     * @param data The data to hash.
135     * @param offset The offset into the data.
136     * @param len The length of the data to hash.
137     * @param seed The initial seed of the hash.
138     * @return The murmurhash3_x86_32 hash.
139     */
140    public static int murmurhash3_x86_32(CharSequence data, int offset, int len, int seed) {
141
142        final int c1 = 0xcc9e2d51;
143        final int c2 = 0x1b873593;
144
145        int h1 = seed;
146
147        int pos = offset;
148        int end = offset + len;
149        int k1 = 0;
150        int k2 = 0;
151        int shift = 0;
152        int bits = 0;
153        int nBytes = 0;   // length in UTF8 bytes
154
155
156        while (pos < end) {
157            int code = data.charAt(pos++);
158            if (code < 0x80) {
159                k2 = code;
160                bits = 8;
161
162                /***
163                 // optimized ascii implementation (currently slower!!! code size?)
164                 if (shift == 24) {
165                 k1 = k1 | (code << 24);
166                 k1 *= c1;
167                 k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
168                 k1 *= c2;
169                 h1 ^= k1;
170                 h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
171                 h1 = h1*5+0xe6546b64;
172                 shift = 0;
173                 nBytes += 4;
174                 k1 = 0;
175                 } else {
176                 k1 |= code << shift;
177                 shift += 8;
178                 }
179                 continue;
180                 ***/
181
182            }
183            else if (code < 0x800) {
184                k2 = (0xC0 | (code >> 6))
185                        | ((0x80 | (code & 0x3F)) << 8);
186                bits = 16;
187            }
188            else if (code < 0xD800 || code > 0xDFFF || pos>=end) {
189                // we check for pos>=end to encode an unpaired surrogate as 3 bytes.
190                k2 = (0xE0 | (code >> 12))
191                        | ((0x80 | ((code >> 6) & 0x3F)) << 8)
192                        | ((0x80 | (code & 0x3F)) << 16);
193                bits = 24;
194            } else {
195                // surrogate pair
196                // int utf32 = pos < end ? (int) data.charAt(pos++) : 0;
197                int utf32 = (int) data.charAt(pos++);
198                utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF);
199                k2 = (0xff & (0xF0 | (utf32 >> 18)))
200                        | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8
201                        | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16
202                        |  (0x80 | (utf32 & 0x3F)) << 24;
203                bits = 32;
204            }
205
206
207            k1 |= k2 << shift;
208
209            // int used_bits = 32 - shift;  // how many bits of k2 were used in k1.
210            // int unused_bits = bits - used_bits; //  (bits-(32-shift)) == bits+shift-32  == bits-newshift
211
212            shift += bits;
213            if (shift >= 32) {
214                // mix after we have a complete word
215
216                k1 *= c1;
217                k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
218                k1 *= c2;
219
220                h1 ^= k1;
221                h1 = (h1 << 13) | (h1 >>> 19);  // ROTL32(h1,13);
222                h1 = h1*5+0xe6546b64;
223
224                shift -= 32;
225                // unfortunately, java won't let you shift 32 bits off, so we need to check for 0
226                if (shift != 0) {
227                    k1 = k2 >>> (bits-shift);   // bits used == bits - newshift
228                } else {
229                    k1 = 0;
230                }
231                nBytes += 4;
232            }
233
234        } // inner
235
236        // handle tail
237        if (shift > 0) {
238            nBytes += shift >> 3;
239            k1 *= c1;
240            k1 = (k1 << 15) | (k1 >>> 17);  // ROTL32(k1,15);
241            k1 *= c2;
242            h1 ^= k1;
243        }
244
245        // finalization
246        h1 ^= nBytes;
247
248        // fmix(h1);
249        h1 ^= h1 >>> 16;
250        h1 *= 0x85ebca6b;
251        h1 ^= h1 >>> 13;
252        h1 *= 0xc2b2ae35;
253        h1 ^= h1 >>> 16;
254
255        return h1;
256    }
257
258
259    /**
260     * Returns the MurmurHash3_x64_128 hash, placing the result in "out".
261     * @param key The data to hash.
262     * @param offset The offset into the data.
263     * @param len The length of the data to hash.
264     * @param seed The initial state of the hash.
265     * @param out The output value (as it's 128 bits).
266     */
267    public static void murmurhash3_x64_128(byte[] key, int offset, int len, int seed, LongPair out) {
268        // The original algorithm does have a 32 bit unsigned seed.
269        // We have to mask to match the behavior of the unsigned types and prevent sign extension.
270        long h1 = seed & 0x00000000FFFFFFFFL;
271        long h2 = seed & 0x00000000FFFFFFFFL;
272
273        final long c1 = 0x87c37b91114253d5L;
274        final long c2 = 0x4cf5ad432745937fL;
275
276        int roundedEnd = offset + (len & 0xFFFFFFF0);  // round down to 16 byte block
277        for (int i=offset; i<roundedEnd; i+=16) {
278            long k1 = getLongLittleEndian(key, i);
279            long k2 = getLongLittleEndian(key, i+8);
280            k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
281            h1 = Long.rotateLeft(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
282            k2 *= c2; k2  = Long.rotateLeft(k2,33); k2 *= c1; h2 ^= k2;
283            h2 = Long.rotateLeft(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
284        }
285
286        long k1 = 0;
287        long k2 = 0;
288
289        switch (len & 15) {
290            case 15: k2  = (key[roundedEnd+14] & 0xffL) << 48;
291            case 14: k2 |= (key[roundedEnd+13] & 0xffL) << 40;
292            case 13: k2 |= (key[roundedEnd+12] & 0xffL) << 32;
293            case 12: k2 |= (key[roundedEnd+11] & 0xffL) << 24;
294            case 11: k2 |= (key[roundedEnd+10] & 0xffL) << 16;
295            case 10: k2 |= (key[roundedEnd+ 9] & 0xffL) << 8;
296            case  9: k2 |= (key[roundedEnd+ 8] & 0xffL);
297                k2 *= c2; k2  = Long.rotateLeft(k2, 33); k2 *= c1; h2 ^= k2;
298            case  8: k1  = ((long)key[roundedEnd+7]) << 56;
299            case  7: k1 |= (key[roundedEnd+6] & 0xffL) << 48;
300            case  6: k1 |= (key[roundedEnd+5] & 0xffL) << 40;
301            case  5: k1 |= (key[roundedEnd+4] & 0xffL) << 32;
302            case  4: k1 |= (key[roundedEnd+3] & 0xffL) << 24;
303            case  3: k1 |= (key[roundedEnd+2] & 0xffL) << 16;
304            case  2: k1 |= (key[roundedEnd+1] & 0xffL) << 8;
305            case  1: k1 |= (key[roundedEnd  ] & 0xffL);
306                k1 *= c1; k1  = Long.rotateLeft(k1,31); k1 *= c2; h1 ^= k1;
307        }
308
309        //----------
310        // finalization
311
312        h1 ^= len; h2 ^= len;
313
314        h1 += h2;
315        h2 += h1;
316
317        h1 = fmix64(h1);
318        h2 = fmix64(h2);
319
320        h1 += h2;
321        h2 += h1;
322
323        out.val1 = h1;
324        out.val2 = h2;
325    }
326
327}