Last active
April 21, 2025 23:48
-
-
Save DanielThomas/84ad3db09eef27af85e6bc95653e553f to your computer and use it in GitHub Desktop.
A hash index for primitive numeric values in Java using a twin prime, double hashed hash table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright 2024 Netflix Inc. | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
*/ | |
package com.netflix.index; | |
import java.math.BigInteger; | |
import java.util.Objects; | |
import java.util.Spliterator; | |
import java.util.function.LongFunction; | |
import java.util.function.LongPredicate; | |
import java.util.function.ToLongFunction; | |
/** | |
* An immutable index of numeric keys to numeric values that are externally mappable to objects of type `T`. | |
* | |
* <p> Unlike a {@link java.util.Map} multiple values for a given key are supported. They are collisions however, so have | |
* {@code O(n)} complexity for each additional duplicate, without considering collisions with other keys. The index does | |
* not store keys, the caller decides if the expected value was found for a given key with a predicate. | |
* | |
* <p> The number of bytes to be used to store the value can be specified, and can be any numeric value able to be widened | |
* to {@code long} supported. The largest positive value for the given byte length is reserved for internal use. | |
* | |
* <p> The index is backed by an open addressing, double hashed, twin prime sized hash table. Generics for the value type | |
* and specialized overloads for smaller value types are avoided, so in some cases the caller may have to narrow an | |
* argument, In those cases there is no risk of data loss as the length invariant is guaranteed at insertion time. This | |
* allows boxing to be avoided, while we wait out Project Valhalla. | |
* | |
* @param <T> the external type being indexed | |
* @see <a href="https://stackoverflow.com/a/62335671/364206">Data Structures and Other Objects Using C++ (4th Edition), Page 614</a> | |
*/ | |
public final class HashIndex<T> { | |
private static final float DEFAULT_LOAD_FACTOR = 0.5f; | |
private static final long UNUSED = 0; | |
private final Elements elements; | |
public static <T> HashIndex<T> ofByte(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) { | |
return of(source, keyMapper, valueMapper, Byte.BYTES, DEFAULT_LOAD_FACTOR); | |
} | |
public static <T> HashIndex<T> ofShort(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) { | |
return of(source, keyMapper, valueMapper, Short.BYTES, DEFAULT_LOAD_FACTOR); | |
} | |
public static <T> HashIndex<T> ofInt(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) { | |
return of(source, keyMapper, valueMapper, Integer.BYTES, DEFAULT_LOAD_FACTOR); | |
} | |
public static <T> HashIndex<T> ofLong(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) { | |
return of(source, keyMapper, valueMapper, Long.BYTES, DEFAULT_LOAD_FACTOR); | |
} | |
public static <T> HashIndex<T> of(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper, int valueSize, float loadFactor) { | |
return new HashIndex<>(source, keyMapper, valueMapper, valueSize, loadFactor); | |
} | |
private HashIndex(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper, int valueSize, float loadFactor) { | |
Objects.requireNonNull(source); | |
if (!source.hasCharacteristics(Spliterator.SIZED)) { | |
throw new IllegalArgumentException("source spliterator must be SIZED"); | |
} | |
Objects.requireNonNull(keyMapper); | |
Objects.requireNonNull(valueMapper); | |
if (valueSize <= 0) { | |
throw new IllegalArgumentException("valueSize must be positive"); | |
} | |
// plus one makes this a power of two, see https://bugs.openjdk.org/browse/JDK-8003585 | |
int length = twinnedPrimeTableSizeFor(source.getExactSizeIfKnown(), loadFactor) + 1; | |
if (valueSize == Byte.BYTES) { | |
elements = new ByteElements(length); | |
} else if (valueSize == Short.BYTES) { | |
elements = new ShortElements(length); | |
} else if (valueSize <= Integer.BYTES) { | |
elements = new IntElements(length); | |
} else if (valueSize <= Long.BYTES) { | |
elements = new LongElements(length); | |
} else { | |
throw new IllegalArgumentException("valueSize maximum is " + Long.BYTES + " bytes"); | |
} | |
source.forEachRemaining(input -> { | |
Objects.requireNonNull(input, "input element was null"); | |
long key = keyMapper.applyAsLong(input); | |
long value = valueMapper.applyAsLong(input); | |
if (!elements.supports(value)) { | |
throw new IllegalArgumentException("value out of range"); | |
} | |
if (elements.isMaxValue(value)) { | |
throw new IllegalArgumentException("max value is reserved"); | |
} | |
int idx = probe(key, existing -> existing == value); | |
if (idx == -1) { | |
throw new IndexOutOfBoundsException("the elements is full"); | |
} else if (idx < 0) { | |
int dst = -(idx + 2); | |
elements.put(dst, value >= 0 ? value + 1 : value); | |
} | |
// else it's >= 0 because we have a duplicate key and value | |
}); | |
} | |
/** | |
* Returns the value that a given key maps to, using a {@link LongFunction} to map the return type to an | |
* {@link T}. A null return value from the value mapper indicates the value was not a match, and the operation | |
* should continue until known to be absent. | |
* | |
* @param key the key to find a value for | |
* @param valueMapper a mapper from a value to a return type of {@link T} | |
* @return the result, null is the value does not exist | |
*/ | |
public T map(long key, LongFunction<T> valueMapper) { | |
int length = elements.length() - 1; | |
int idx = index(key, length); | |
int attempts = 0; | |
while (attempts < length) { | |
long value = elements.get(idx); | |
if (value == UNUSED) { | |
return null; | |
} else { | |
T mappedValue = valueMapper.apply(value > 0 ? value - 1 : value); | |
if (mappedValue != null) { | |
return mappedValue; | |
} | |
} | |
idx = Math.floorMod(idx + offset(key, length), length); | |
attempts++; | |
} | |
return null; | |
} | |
private int probe(long key, LongPredicate valueEquals) { | |
int length = elements.length() - 1; | |
int idx = index(key, length); | |
int attempts = 0; | |
while(attempts < length) { | |
long value = elements.get(idx); | |
if (value == UNUSED) { | |
return -idx - 2; | |
} else if (valueEquals.test(value > 0 ? value - 1 : value)) { | |
return idx; | |
} | |
idx = Math.floorMod(idx + offset(key, length), length); | |
attempts++; | |
} | |
return -1; | |
} | |
private static int index(long key, int length) { | |
return Math.floorMod(Long.hashCode(key), length); | |
} | |
private static int offset(long key, int length) { | |
return 1 + (Math.floorMod(Long.hashCode(key), (length - 2))); | |
} | |
private static int twinnedPrimeTableSizeFor(long numEntries, float loadFactor) { | |
long size = (long)Math.ceil(numEntries / (double) loadFactor); | |
BigInteger twin = BigInteger.valueOf(size).nextProbablePrime(); | |
BigInteger candidate = twin.nextProbablePrime(); | |
while (candidate.subtract(twin).intValue() != 2 | |
|| !(isPrime(twin.intValueExact()) && isPrime(candidate.intValueExact()))) { | |
twin = candidate; | |
candidate = twin.nextProbablePrime(); | |
} | |
return candidate.intValueExact(); | |
} | |
private static boolean isPrime(int candidate) { | |
if ((candidate & 1) != 0) { | |
int limit = (int) Math.sqrt(candidate); | |
for (int divisor = 3; divisor <= limit; divisor += 2) { | |
if ((candidate % divisor) == 0) { | |
return false; | |
} | |
} | |
return true; | |
} | |
return candidate == 2; | |
} | |
private interface Elements { | |
int length(); | |
long get(int i); | |
void put(int i, long value); | |
boolean supports(long value); | |
boolean isMaxValue(long value); | |
} | |
private static final class ByteElements implements Elements { | |
private final byte[] elements; | |
private ByteElements(int size) { | |
this.elements = new byte[size]; | |
} | |
@Override | |
public int length() { | |
return elements.length; | |
} | |
@Override | |
public long get(int i) { | |
return elements[i]; | |
} | |
@Override | |
public void put(int i, long value) { | |
elements[i] = (byte) value; | |
} | |
@Override | |
public boolean supports(long value) { | |
return value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE; | |
} | |
@Override | |
public boolean isMaxValue(long value) { | |
return value == Byte.MAX_VALUE; | |
} | |
} | |
private static final class ShortElements implements Elements { | |
private final short[] elements; | |
private ShortElements(int size) { | |
this.elements = new short[size]; | |
} | |
@Override | |
public int length() { | |
return elements.length; | |
} | |
@Override | |
public long get(int i) { | |
return elements[i]; | |
} | |
@Override | |
public void put(int i, long value) { | |
assert value == (short) value; | |
elements[i] = (short) value; | |
} | |
@Override | |
public boolean supports(long value) { | |
return value >= Short.MIN_VALUE && value <= Short.MAX_VALUE; | |
} | |
@Override | |
public boolean isMaxValue(long value) { | |
return value == Short.MAX_VALUE; | |
} | |
} | |
private static final class IntElements implements Elements { | |
private final int[] elements; | |
private IntElements(int size) { | |
this.elements = new int[size]; | |
} | |
@Override | |
public int length() { | |
return elements.length; | |
} | |
@Override | |
public long get(int i) { | |
return elements[i]; | |
} | |
@Override | |
public void put(int i, long value) { | |
assert value == (int) value; | |
elements[i] = (int) value; | |
} | |
@Override | |
public boolean supports(long value) { | |
return value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE; | |
} | |
@Override | |
public boolean isMaxValue(long value) { | |
return value == Integer.MAX_VALUE; | |
} | |
} | |
private static final class LongElements implements Elements { | |
private final long[] elements; | |
private LongElements(int size) { | |
this.elements = new long[size]; | |
} | |
@Override | |
public int length() { | |
return elements.length; | |
} | |
@Override | |
public long get(int i) { | |
return elements[i]; | |
} | |
@Override | |
public void put(int i, long value) { | |
elements[i] = value; | |
} | |
@Override | |
public boolean supports(long value) { | |
return true; | |
} | |
@Override | |
public boolean isMaxValue(long value) { | |
return value == Long.MAX_VALUE; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I couldn't find any prior art for a hash index that only stores values and used external equality. The default load factor of
0.5
provides excellent performance, but0.75
is a fine tradeoff for space, but will degrade significantly as you approach1.0
. If you're only storing positive values, you can double your value space by subtractingMIN_VALUE
for the type from your value.Tests for the class are available in a separate Gist.