Fast Adaptive Filtering for scores
""" | |
Below is an adaptive thresholding algorithm that considers the distribution of scores, | |
not just a fixed top-k or fixed threshold approach. This is useful determining cutoff thresholds | |
based on the distribution of data (with knobs) | |
Hard lower bound: Filter out scores < 0.1 (configurable) | |
Hard upper bound: Keep scores > 0.7 (configurable) | |
Consider "drops" in scores to make smart cutoffs | |
Wider selection when scores are closely clustered | |
Narrower selection when there are clear score gaps | |
Important: | |
Assumes scores are given ordered. | |
""" | |

import numpy as np
from typing import Tuple
import time


def adaptive_filter(
    scores: np.ndarray,
    min_threshold: float = 0.1,
    max_threshold: float = 0.7,
    min_gap_percentile: float = 90,
) -> Tuple[np.ndarray, float]:
""" | |
Adaptively filter pre-sorted document scores (in descending order). | |
Args: | |
scores: Array of document scores between 0 and 1, pre-sorted in descending order | |
min_threshold: Minimum score threshold (default: 0.1) | |
max_threshold: Score threshold above which documents are automatically kept (default: 0.7) | |
min_gap_percentile: Percentile to consider for gap significance (default: 90) | |
Returns: | |
Tuple of (selected indices up to cut-off point, computed threshold) | |
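
    Example (illustrative; the only significant drop here is 0.85 -> 0.4, and 0.85
    clears max_threshold, so the cut lands just before 0.4):
        >>> idx, thr = adaptive_filter(np.array([0.9, 0.85, 0.4, 0.2]))
        >>> idx.tolist(), float(thr)
        ([0, 1], 0.4)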
""" | |
    # Convert to numpy array if not already
    scores = np.asarray(scores)

    # Quick return if no valid scores
    if len(scores) == 0 or scores[0] < min_threshold:
        return np.array([], dtype=int), min_threshold

    # Find where scores drop below min_threshold
    below_min = np.where(scores < min_threshold)[0]
    valid_length = len(scores) if len(below_min) == 0 else below_min[0]

    # If we have very few valid documents, keep them all above min_threshold
    if valid_length <= 3:
        return np.arange(valid_length), min_threshold

    # Calculate gaps between adjacent scores
    score_gaps = -np.diff(scores[:valid_length])  # Negative diff since we want drops

    # Find significant gaps
    gap_threshold = np.percentile(score_gaps, min_gap_percentile)
    significant_gaps = np.where(score_gaps >= gap_threshold)[0]
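    # With the default 90th percentile only the very largest adjacent drops count
    # as significant; for the scores_with_gaps example at the bottom of this file,
    # only the 0.62 -> 0.35 drop qualifies.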

    # If we found significant gaps, use the first appropriate one as cut point
    if len(significant_gaps) > 0:
        # Start with first gap
        cut_idx = significant_gaps[0]

        # If the score at cut point is above max_threshold,
        # look for next significant gap below max_threshold
        if scores[cut_idx] >= max_threshold:
            for gap_idx in significant_gaps:
                if scores[gap_idx] < max_threshold:
                    cut_idx = gap_idx
                    break
            else:  # No gap found below max_threshold
                # Find first score below max_threshold
                below_max = np.where(scores < max_threshold)[0]
                cut_idx = below_max[0] if len(below_max) > 0 else valid_length

        # Guard against indexing past the end when every score is above max_threshold
        computed_threshold = scores[cut_idx] if cut_idx < len(scores) else max_threshold
    else:
        # If no significant gaps, use statistical measures
        score_mean = np.mean(scores[:valid_length])
        score_std = np.std(scores[:valid_length])
        computed_threshold = max(min_threshold, score_mean - score_std)
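        # i.e. keep documents scoring within one standard deviation below the mean,
        # but never cut below the hard min_threshold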

        # Find cut point based on computed threshold
        cut_idx = np.where(scores < computed_threshold)[0]
        cut_idx = cut_idx[0] if len(cut_idx) > 0 else valid_length

    # Ensure we include all scores above max_threshold
    if computed_threshold > max_threshold:
        below_max = np.where(scores < max_threshold)[0]
        cut_idx = below_max[0] if len(below_max) > 0 else valid_length
        computed_threshold = max_threshold

    return np.arange(cut_idx), computed_threshold
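

# A minimal sketch of a convenience wrapper for inputs that are not pre-sorted:
# sort descending, run adaptive_filter, then map the kept positions back to the
# caller's original ordering. The name adaptive_filter_unsorted is a hypothetical
# addition, not something adaptive_filter itself requires.
def adaptive_filter_unsorted(
    scores: np.ndarray,
    **kwargs,
) -> Tuple[np.ndarray, float]:
    """Sort scores descending, filter adaptively, and return original indices."""
    scores = np.asarray(scores)
    order = np.argsort(-scores)    # Indices that sort the scores descending
    kept, threshold = adaptive_filter(scores[order], **kwargs)
    return order[kept], threshold  # Map sorted positions back to the original ones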


def test_performance():
    """Test the performance with 200 documents."""
    np.random.seed(42)
    n_docs = 200

    # Test case 1: Gradually decreasing scores
    scores_gradual = np.linspace(0.95, 0.05, n_docs)

    # Test case 2: Clustered distribution (pre-sorted)
    scores_clustered = np.concatenate([
        np.random.normal(0.9, 0.6, 5),     # High cluster
        np.random.normal(0.3, 0.05, 100),  # Medium cluster
        np.random.normal(0.2, 0.05, 50)    # Low cluster
    ])
    scores_clustered = np.clip(scores_clustered, 0, 1)
    scores_clustered = -np.sort(-scores_clustered)  # Sort descending

    # Test case 3: Power law distribution (pre-sorted)
    scores_power = -np.sort(-(1 - np.random.power(4, n_docs)))
    # print(scores_power)

    # Time each case
    for scores, name in [
        (scores_gradual, "Gradual"),
        (scores_clustered, "Clustered"),
        (scores_power, "Power Law")
    ]:
        start_time = time.time()
        indices, threshold = adaptive_filter(scores)
        elapsed = (time.time() - start_time) * 1000  # Convert to milliseconds

        print(f"\n{name} Distribution:")
        print(f"Time taken: {elapsed:.2f}ms")
        print(f"Documents selected: {len(indices)}")
        print(f"Computed threshold: {threshold:.3f}")
        print(f"Score range: {scores[indices[0]]:.3f} - {scores[indices[-1]]:.3f}")
        print(f"Total doc count: {len(scores)}")
        print("-" * 50)


if __name__ == "__main__":
    # Run performance tests
    test_performance()

    # Example with clear gaps
    scores_with_gaps = np.array([
        0.95, 0.92, 0.88,  # High cluster
        0.65, 0.62,        # Medium cluster
        0.35, 0.32, 0.31,  # Low cluster
        0.05, 0.02         # Very low cluster
    ])
    indices, threshold = adaptive_filter(scores_with_gaps)

    print("\nExample with clear gaps:")
    print(f"Input scores: {scores_with_gaps}")
    print(f"Selected indices: {indices}")
    print(f"Selected scores: {scores_with_gaps[indices]}")
    print(f"Computed threshold: {threshold:.3f}")
    print(f"Total doc count: {len(scores_with_gaps)}")
""" | |
Gradual Distribution: | |
Time taken: 0.34ms | |
Documents selected: 56 | |
Computed threshold: 0.697 | |
Score range: 0.950 - 0.701 | |
Total doc count: 200 | |
-------------------------------------------------- | |
Clustered Distribution: | |
Time taken: 0.90ms | |
Documents selected: 5 | |
Computed threshold: 0.393 | |
Score range: 1.000 - 0.760 | |
Total doc count: 155 | |
-------------------------------------------------- | |
Power Law Distribution: | |
Time taken: 0.49ms | |
Documents selected: 1 | |
Computed threshold: 0.677 | |
Score range: 0.733 - 0.733 | |
Total doc count: 200 | |
-------------------------------------------------- | |
Example with clear gaps: | |
Input scores: [0.95 0.92 0.88 0.65 0.62 0.35 0.32 0.31 0.05 0.02] | |
Selected indices: [0 1 2 3] | |
Selected scores: [0.95 0.92 0.88 0.65] | |
Computed threshold: 0.620 | |
Total doc count: 10 | |
""" |