Skip to content

Instantly share code, notes, and snippets.

@DanielThomas
Last active April 21, 2025 23:48
Show Gist options
  • Save DanielThomas/84ad3db09eef27af85e6bc95653e553f to your computer and use it in GitHub Desktop.
Save DanielThomas/84ad3db09eef27af85e6bc95653e553f to your computer and use it in GitHub Desktop.
A hash index for primitive numeric values in Java using a twin prime, double hashed hash table
/*
* Copyright 2024 Netflix Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netflix.index;
import java.math.BigInteger;
import java.util.Objects;
import java.util.Spliterator;
import java.util.function.LongFunction;
import java.util.function.LongPredicate;
import java.util.function.ToLongFunction;
/**
* An immutable index of numeric keys to numeric values that are externally mappable to objects of type `T`.
*
* <p> Unlike a {@link java.util.Map} multiple values for a given key are supported. They are collisions however, so have
* {@code O(n)} complexity for each additional duplicate, without considering collisions with other keys. The index does
* not store keys, the caller decides if the expected value was found for a given key with a predicate.
*
* <p> The number of bytes to be used to store the value can be specified, and can be any numeric value able to be widened
* to {@code long} supported. The largest positive value for the given byte length is reserved for internal use.
*
* <p> The index is backed by an open addressing, double hashed, twin prime sized hash table. Generics for the value type
* and specialized overloads for smaller value types are avoided, so in some cases the caller may have to narrow an
* argument, In those cases there is no risk of data loss as the length invariant is guaranteed at insertion time. This
* allows boxing to be avoided, while we wait out Project Valhalla.
*
* @param <T> the external type being indexed
* @see <a href="https://stackoverflow.com/a/62335671/364206">Data Structures and Other Objects Using C++ (4th Edition), Page 614</a>
*/
public final class HashIndex<T> {
private static final float DEFAULT_LOAD_FACTOR = 0.5f;
private static final long UNUSED = 0;
private final Elements elements;
public static <T> HashIndex<T> ofByte(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) {
return of(source, keyMapper, valueMapper, Byte.BYTES, DEFAULT_LOAD_FACTOR);
}
public static <T> HashIndex<T> ofShort(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) {
return of(source, keyMapper, valueMapper, Short.BYTES, DEFAULT_LOAD_FACTOR);
}
public static <T> HashIndex<T> ofInt(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) {
return of(source, keyMapper, valueMapper, Integer.BYTES, DEFAULT_LOAD_FACTOR);
}
public static <T> HashIndex<T> ofLong(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper) {
return of(source, keyMapper, valueMapper, Long.BYTES, DEFAULT_LOAD_FACTOR);
}
public static <T> HashIndex<T> of(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper, int valueSize, float loadFactor) {
return new HashIndex<>(source, keyMapper, valueMapper, valueSize, loadFactor);
}
private HashIndex(Spliterator<T> source, ToLongFunction<T> keyMapper, ToLongFunction<T> valueMapper, int valueSize, float loadFactor) {
Objects.requireNonNull(source);
if (!source.hasCharacteristics(Spliterator.SIZED)) {
throw new IllegalArgumentException("source spliterator must be SIZED");
}
Objects.requireNonNull(keyMapper);
Objects.requireNonNull(valueMapper);
if (valueSize <= 0) {
throw new IllegalArgumentException("valueSize must be positive");
}
// plus one makes this a power of two, see https://bugs.openjdk.org/browse/JDK-8003585
int length = twinnedPrimeTableSizeFor(source.getExactSizeIfKnown(), loadFactor) + 1;
if (valueSize == Byte.BYTES) {
elements = new ByteElements(length);
} else if (valueSize == Short.BYTES) {
elements = new ShortElements(length);
} else if (valueSize <= Integer.BYTES) {
elements = new IntElements(length);
} else if (valueSize <= Long.BYTES) {
elements = new LongElements(length);
} else {
throw new IllegalArgumentException("valueSize maximum is " + Long.BYTES + " bytes");
}
source.forEachRemaining(input -> {
Objects.requireNonNull(input, "input element was null");
long key = keyMapper.applyAsLong(input);
long value = valueMapper.applyAsLong(input);
if (!elements.supports(value)) {
throw new IllegalArgumentException("value out of range");
}
if (elements.isMaxValue(value)) {
throw new IllegalArgumentException("max value is reserved");
}
int idx = probe(key, existing -> existing == value);
if (idx == -1) {
throw new IndexOutOfBoundsException("the elements is full");
} else if (idx < 0) {
int dst = -(idx + 2);
elements.put(dst, value >= 0 ? value + 1 : value);
}
// else it's >= 0 because we have a duplicate key and value
});
}
/**
* Returns the value that a given key maps to, using a {@link LongFunction} to map the return type to an
* {@link T}. A null return value from the value mapper indicates the value was not a match, and the operation
* should continue until known to be absent.
*
* @param key the key to find a value for
* @param valueMapper a mapper from a value to a return type of {@link T}
* @return the result, null is the value does not exist
*/
public T map(long key, LongFunction<T> valueMapper) {
int length = elements.length() - 1;
int idx = index(key, length);
int attempts = 0;
while (attempts < length) {
long value = elements.get(idx);
if (value == UNUSED) {
return null;
} else {
T mappedValue = valueMapper.apply(value > 0 ? value - 1 : value);
if (mappedValue != null) {
return mappedValue;
}
}
idx = Math.floorMod(idx + offset(key, length), length);
attempts++;
}
return null;
}
private int probe(long key, LongPredicate valueEquals) {
int length = elements.length() - 1;
int idx = index(key, length);
int attempts = 0;
while(attempts < length) {
long value = elements.get(idx);
if (value == UNUSED) {
return -idx - 2;
} else if (valueEquals.test(value > 0 ? value - 1 : value)) {
return idx;
}
idx = Math.floorMod(idx + offset(key, length), length);
attempts++;
}
return -1;
}
private static int index(long key, int length) {
return Math.floorMod(Long.hashCode(key), length);
}
private static int offset(long key, int length) {
return 1 + (Math.floorMod(Long.hashCode(key), (length - 2)));
}
private static int twinnedPrimeTableSizeFor(long numEntries, float loadFactor) {
long size = (long)Math.ceil(numEntries / (double) loadFactor);
BigInteger twin = BigInteger.valueOf(size).nextProbablePrime();
BigInteger candidate = twin.nextProbablePrime();
while (candidate.subtract(twin).intValue() != 2
|| !(isPrime(twin.intValueExact()) && isPrime(candidate.intValueExact()))) {
twin = candidate;
candidate = twin.nextProbablePrime();
}
return candidate.intValueExact();
}
private static boolean isPrime(int candidate) {
if ((candidate & 1) != 0) {
int limit = (int) Math.sqrt(candidate);
for (int divisor = 3; divisor <= limit; divisor += 2) {
if ((candidate % divisor) == 0) {
return false;
}
}
return true;
}
return candidate == 2;
}
private interface Elements {
int length();
long get(int i);
void put(int i, long value);
boolean supports(long value);
boolean isMaxValue(long value);
}
private static final class ByteElements implements Elements {
private final byte[] elements;
private ByteElements(int size) {
this.elements = new byte[size];
}
@Override
public int length() {
return elements.length;
}
@Override
public long get(int i) {
return elements[i];
}
@Override
public void put(int i, long value) {
elements[i] = (byte) value;
}
@Override
public boolean supports(long value) {
return value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE;
}
@Override
public boolean isMaxValue(long value) {
return value == Byte.MAX_VALUE;
}
}
private static final class ShortElements implements Elements {
private final short[] elements;
private ShortElements(int size) {
this.elements = new short[size];
}
@Override
public int length() {
return elements.length;
}
@Override
public long get(int i) {
return elements[i];
}
@Override
public void put(int i, long value) {
assert value == (short) value;
elements[i] = (short) value;
}
@Override
public boolean supports(long value) {
return value >= Short.MIN_VALUE && value <= Short.MAX_VALUE;
}
@Override
public boolean isMaxValue(long value) {
return value == Short.MAX_VALUE;
}
}
private static final class IntElements implements Elements {
private final int[] elements;
private IntElements(int size) {
this.elements = new int[size];
}
@Override
public int length() {
return elements.length;
}
@Override
public long get(int i) {
return elements[i];
}
@Override
public void put(int i, long value) {
assert value == (int) value;
elements[i] = (int) value;
}
@Override
public boolean supports(long value) {
return value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE;
}
@Override
public boolean isMaxValue(long value) {
return value == Integer.MAX_VALUE;
}
}
private static final class LongElements implements Elements {
private final long[] elements;
private LongElements(int size) {
this.elements = new long[size];
}
@Override
public int length() {
return elements.length;
}
@Override
public long get(int i) {
return elements[i];
}
@Override
public void put(int i, long value) {
elements[i] = value;
}
@Override
public boolean supports(long value) {
return true;
}
@Override
public boolean isMaxValue(long value) {
return value == Long.MAX_VALUE;
}
}
}
@DanielThomas
Copy link
Author

DanielThomas commented Mar 25, 2024

I couldn't find any prior art for a hash index that only stores values and used external equality. The default load factor of 0.5 provides excellent performance, but 0.75 is a fine tradeoff for space, but will degrade significantly as you approach 1.0. If you're only storing positive values, you can double your value space by subtracting MIN_VALUE for the type from your value.

Tests for the class are available in a separate Gist.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment