Skip to content

Instantly share code, notes, and snippets.

@oneryalcin
Created November 15, 2024 21:14
Show Gist options
  • Save oneryalcin/55f3994cbdb066ed393abda42b3c75ee to your computer and use it in GitHub Desktop.
Save oneryalcin/55f3994cbdb066ed393abda42b3c75ee to your computer and use it in GitHub Desktop.
Fast Adaptive Filtering for scores
"""
Below is an adaptive thresholding algorithm that considers the distribution of scores,
not just a fixed top-k or fixed threshold approach. This is useful for determining cutoff thresholds
based on the distribution of data (with knobs)
Hard lower bound: Filter out scores < 0.1 (configurable)
Hard upper bound: Keep scores > 0.7 (configurable)
Consider "drops" in scores to make smart cutoffs
Wider selection when scores are closely clustered
Narrower selection when there are clear score gaps
Important:
Assumes scores are given pre-sorted in descending order.
"""
import numpy as np
from typing import Tuple, List, Optional
import time
def adaptive_filter(
    scores: np.ndarray,
    min_threshold: float = 0.1,
    max_threshold: float = 0.7,
    min_gap_percentile: float = 90,
) -> Tuple[np.ndarray, float]:
    """
    Adaptively filter pre-sorted document scores (descending order).

    The cut point is chosen from the largest "drops" between adjacent
    scores; when no significant drop exists, the cut falls back to a
    mean-minus-std statistical threshold. Scores below ``min_threshold``
    are always discarded, and scores at or above ``max_threshold`` are
    always kept.

    Args:
        scores: Document scores in [0, 1], pre-sorted in descending order.
        min_threshold: Minimum score threshold (default: 0.1).
        max_threshold: Score threshold above which documents are
            automatically kept (default: 0.7).
        min_gap_percentile: Percentile of the gap distribution at or above
            which a gap counts as significant (default: 90).

    Returns:
        Tuple of (selected indices up to the cut-off point, computed threshold).
    """
    # Convert to numpy array if not already
    scores = np.asarray(scores)
    # Quick return: empty input, or even the best score is below the floor.
    if len(scores) == 0 or scores[0] < min_threshold:
        return np.array([], dtype=int), min_threshold
    # valid_length = number of leading scores >= min_threshold
    # (input is descending, so the first below-floor index bounds the prefix).
    below_min = np.where(scores < min_threshold)[0]
    valid_length = len(scores) if len(below_min) == 0 else below_min[0]
    # Too few candidates to reason about gap structure: keep them all.
    if valid_length <= 3:
        return np.arange(valid_length), min_threshold
    # Drops between adjacent scores (negated diff so drops are positive).
    score_gaps = -np.diff(scores[:valid_length])
    # A gap is "significant" if it lies in the top tail of the gap distribution.
    gap_threshold = np.percentile(score_gaps, min_gap_percentile)
    significant_gaps = np.where(score_gaps >= gap_threshold)[0]
    if len(significant_gaps) > 0:
        # Start with the first significant gap.
        cut_idx = significant_gaps[0]
        # If that cut lands in the always-keep region (>= max_threshold),
        # look for the next significant gap below max_threshold.
        if scores[cut_idx] >= max_threshold:
            for gap_idx in significant_gaps:
                if scores[gap_idx] < max_threshold:
                    cut_idx = gap_idx
                    break
            else:  # No significant gap below max_threshold
                # Fall back to the first score below max_threshold.
                below_max = np.where(scores < max_threshold)[0]
                cut_idx = below_max[0] if len(below_max) > 0 else valid_length
        # BUGFIX: when every score is >= max_threshold, the fallback above
        # sets cut_idx == valid_length == len(scores), and indexing
        # scores[cut_idx] raised IndexError. Use max_threshold instead.
        computed_threshold = (
            scores[cut_idx] if cut_idx < len(scores) else max_threshold
        )
    else:
        # No clear gap structure: use statistical measures instead.
        score_mean = np.mean(scores[:valid_length])
        score_std = np.std(scores[:valid_length])
        computed_threshold = max(min_threshold, score_mean - score_std)
        # Cut at the first score below the statistical threshold.
        cut_idx = np.where(scores < computed_threshold)[0]
        cut_idx = cut_idx[0] if len(cut_idx) > 0 else valid_length
        # Never let the statistical threshold exclude always-keep scores.
        if computed_threshold > max_threshold:
            below_max = np.where(scores < max_threshold)[0]
            cut_idx = below_max[0] if len(below_max) > 0 else valid_length
            computed_threshold = max_threshold
    return np.arange(cut_idx), computed_threshold
def test_performance():
    """Benchmark adaptive_filter on three synthetic 200-document score sets."""
    np.random.seed(42)
    n_docs = 200
    # Case 1: smoothly decaying scores.
    gradual = np.linspace(0.95, 0.05, n_docs)
    # Case 2: three clusters, clipped to [0, 1] then sorted descending.
    clustered = np.concatenate([
        np.random.normal(0.9, 0.6, 5),     # High cluster
        np.random.normal(0.3, 0.05, 100),  # Medium cluster
        np.random.normal(0.2, 0.05, 50),   # Low cluster
    ])
    clustered = np.clip(clustered, 0, 1)
    clustered = -np.sort(-clustered)  # Sort descending
    # Case 3: power-law-shaped scores, sorted descending.
    power_law = -np.sort(-(1 - np.random.power(4, n_docs)))
    cases = [
        (gradual, "Gradual"),
        (clustered, "Clustered"),
        (power_law, "Power Law"),
    ]
    for case_scores, label in cases:
        started = time.time()
        kept, thr = adaptive_filter(case_scores)
        elapsed_ms = (time.time() - started) * 1000  # Convert to milliseconds
        print(f"\n{label} Distribution:")
        print(f"Time taken: {elapsed_ms:.2f}ms")
        print(f"Documents selected: {len(kept)}")
        print(f"Computed threshold: {thr:.3f}")
        print(f"Score range: {case_scores[kept[0]]:.3f} - {case_scores[kept[-1]]:.3f}")
        print(f"Total doc count: {len(case_scores)}")
        print("-" * 50)
if __name__ == "__main__":
    # Run performance tests
    test_performance()
    # Demonstration on a tiny score set with obvious cluster boundaries.
    demo_scores = np.array([
        0.95, 0.92, 0.88,   # High cluster
        0.65, 0.62,         # Medium cluster
        0.35, 0.32, 0.31,   # Low cluster
        0.05, 0.02,         # Very low cluster
    ])
    kept, threshold = adaptive_filter(demo_scores)
    print("\nExample with clear gaps:")
    print(f"Input scores: {demo_scores}")
    print(f"Selected indices: {kept}")
    print(f"Selected scores: {demo_scores[kept]}")
    print(f"Computed threshold: {threshold:.3f}")
    print(f"Total doc count: {len(demo_scores)}")
"""
Gradual Distribution:
Time taken: 0.34ms
Documents selected: 56
Computed threshold: 0.697
Score range: 0.950 - 0.701
Total doc count: 200
--------------------------------------------------
Clustered Distribution:
Time taken: 0.90ms
Documents selected: 5
Computed threshold: 0.393
Score range: 1.000 - 0.760
Total doc count: 155
--------------------------------------------------
Power Law Distribution:
Time taken: 0.49ms
Documents selected: 1
Computed threshold: 0.677
Score range: 0.733 - 0.733
Total doc count: 200
--------------------------------------------------
Example with clear gaps:
Input scores: [0.95 0.92 0.88 0.65 0.62 0.35 0.32 0.31 0.05 0.02]
Selected indices: [0 1 2 3]
Selected scores: [0.95 0.92 0.88 0.65]
Computed threshold: 0.620
Total doc count: 10
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment