Fast Adaptive Filtering for scores
""" | |
Below is an adaptive thresholding algorithm that considers the distribution of scores, | |
not just a fixed top-k or fixed threshold approach. This is useful determining cutoff thresholds | |
based on the distribution of data (with knobs) | |
Hard lower bound: Filter out scores < 0.1 (configurable) | |
Hard upper bound: Keep scores > 0.7 (configurable) | |
Consider "drops" in scores to make smart cutoffs | |
Wider selection when scores are closely clustered | |
Narrower selection when there are clear score gaps | |
Important: | |
Assumes scores are given ordered. | |
""" | |

import numpy as np
from typing import Tuple
import time


def adaptive_filter(
    scores: np.ndarray,
    min_threshold: float = 0.1,
    max_threshold: float = 0.7,
    min_gap_percentile: float = 90,
) -> Tuple[np.ndarray, float]:
""" | |
Adaptively filter pre-sorted document scores (in descending order). | |
Args: | |
scores: Array of document scores between 0 and 1, pre-sorted in descending order | |
min_threshold: Minimum score threshold (default: 0.1) | |
max_threshold: Score threshold above which documents are automatically kept (default: 0.7) | |
min_gap_percentile: Percentile to consider for gap significance (default: 90) | |
Returns: | |
Tuple of (selected indices up to cut-off point, computed threshold) | |
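
    Example (illustrative; the only significant drop here is 0.85 -> 0.4, and 0.85
    clears max_threshold, so the cut lands just before 0.4):
        >>> idx, thr = adaptive_filter(np.array([0.9, 0.85, 0.4, 0.2]))
        >>> idx.tolist(), float(thr)
        ([0, 1], 0.4)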
""" | |
    # Convert to numpy array if not already
    scores = np.asarray(scores)

    # Quick return if no valid scores
    if len(scores) == 0 or scores[0] < min_threshold:
        return np.array([], dtype=int), min_threshold

    # Find where scores drop below min_threshold
    below_min = np.where(scores < min_threshold)[0]
    valid_length = len(scores) if len(below_min) == 0 else below_min[0]

    # If we have very few valid documents, keep them all above min_threshold
    if valid_length <= 3:
        return np.arange(valid_length), min_threshold

    # Calculate gaps between adjacent scores
    score_gaps = -np.diff(scores[:valid_length])  # Negative diff since we want drops

    # Find significant gaps
    gap_threshold = np.percentile(score_gaps, min_gap_percentile)
    significant_gaps = np.where(score_gaps >= gap_threshold)[0]
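    # With the default 90th percentile only the very largest adjacent drops count
    # as significant; for the scores_with_gaps example at the bottom of this file,
    # only the 0.62 -> 0.35 drop qualifies.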

    # If we found significant gaps, use the first appropriate one as cut point
    if len(significant_gaps) > 0:
        # Start with first gap
        cut_idx = significant_gaps[0]

        # If the score at cut point is above max_threshold,
        # look for next significant gap below max_threshold
        if scores[cut_idx] >= max_threshold:
            for gap_idx in significant_gaps:
                if scores[gap_idx] < max_threshold:
                    cut_idx = gap_idx
                    break
            else:  # No gap found below max_threshold
                # Find first score below max_threshold
                below_max = np.where(scores < max_threshold)[0]
                cut_idx = below_max[0] if len(below_max) > 0 else valid_length

        # Guard against indexing past the end when every score is above max_threshold
        computed_threshold = scores[cut_idx] if cut_idx < len(scores) else max_threshold
    else:
        # If no significant gaps, use statistical measures
        score_mean = np.mean(scores[:valid_length])
        score_std = np.std(scores[:valid_length])
        computed_threshold = max(min_threshold, score_mean - score_std)
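        # i.e. keep documents scoring within one standard deviation below the mean,
        # but never cut below the hard min_threshold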

        # Find cut point based on computed threshold
        cut_idx = np.where(scores < computed_threshold)[0]
        cut_idx = cut_idx[0] if len(cut_idx) > 0 else valid_length

    # Ensure we include all scores above max_threshold
    if computed_threshold > max_threshold:
        below_max = np.where(scores < max_threshold)[0]
        cut_idx = below_max[0] if len(below_max) > 0 else valid_length
        computed_threshold = max_threshold

    return np.arange(cut_idx), computed_threshold
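

# A minimal sketch of a convenience wrapper for inputs that are not pre-sorted:
# sort descending, run adaptive_filter, then map the kept positions back to the
# caller's original ordering. The name adaptive_filter_unsorted is a hypothetical
# addition, not something adaptive_filter itself requires.
def adaptive_filter_unsorted(
    scores: np.ndarray,
    **kwargs,
) -> Tuple[np.ndarray, float]:
    """Sort scores descending, filter adaptively, and return original indices."""
    scores = np.asarray(scores)
    order = np.argsort(-scores)    # Indices that sort the scores descending
    kept, threshold = adaptive_filter(scores[order], **kwargs)
    return order[kept], threshold  # Map sorted positions back to the original ones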


def test_performance():
    """Test the performance with 200 documents."""
    np.random.seed(42)
    n_docs = 200

    # Test case 1: Gradually decreasing scores
    scores_gradual = np.linspace(0.95, 0.05, n_docs)

    # Test case 2: Clustered distribution (pre-sorted)
    scores_clustered = np.concatenate([
        np.random.normal(0.9, 0.6, 5),     # High cluster
        np.random.normal(0.3, 0.05, 100),  # Medium cluster
        np.random.normal(0.2, 0.05, 50)    # Low cluster
    ])
    scores_clustered = np.clip(scores_clustered, 0, 1)
    scores_clustered = -np.sort(-scores_clustered)  # Sort descending

    # Test case 3: Power law distribution (pre-sorted)
    scores_power = -np.sort(-(1 - np.random.power(4, n_docs)))
    # print(scores_power)

    # Time each case
    for scores, name in [
        (scores_gradual, "Gradual"),
        (scores_clustered, "Clustered"),
        (scores_power, "Power Law")
    ]:
        start_time = time.time()
        indices, threshold = adaptive_filter(scores)
        elapsed = (time.time() - start_time) * 1000  # Convert to milliseconds

        print(f"\n{name} Distribution:")
        print(f"Time taken: {elapsed:.2f}ms")
        print(f"Documents selected: {len(indices)}")
        print(f"Computed threshold: {threshold:.3f}")
        print(f"Score range: {scores[indices[0]]:.3f} - {scores[indices[-1]]:.3f}")
        print(f"Total doc count: {len(scores)}")
        print("-" * 50)


if __name__ == "__main__":
    # Run performance tests
    test_performance()

    # Example with clear gaps
    scores_with_gaps = np.array([
        0.95, 0.92, 0.88,  # High cluster
        0.65, 0.62,        # Medium cluster
        0.35, 0.32, 0.31,  # Low cluster
        0.05, 0.02         # Very low cluster
    ])
    indices, threshold = adaptive_filter(scores_with_gaps)

    print("\nExample with clear gaps:")
    print(f"Input scores: {scores_with_gaps}")
    print(f"Selected indices: {indices}")
    print(f"Selected scores: {scores_with_gaps[indices]}")
    print(f"Computed threshold: {threshold:.3f}")
    print(f"Total doc count: {len(scores_with_gaps)}")
""" | |
Gradual Distribution: | |
Time taken: 0.34ms | |
Documents selected: 56 | |
Computed threshold: 0.697 | |
Score range: 0.950 - 0.701 | |
Total doc count: 200 | |
-------------------------------------------------- | |
Clustered Distribution: | |
Time taken: 0.90ms | |
Documents selected: 5 | |
Computed threshold: 0.393 | |
Score range: 1.000 - 0.760 | |
Total doc count: 155 | |
-------------------------------------------------- | |
Power Law Distribution: | |
Time taken: 0.49ms | |
Documents selected: 1 | |
Computed threshold: 0.677 | |
Score range: 0.733 - 0.733 | |
Total doc count: 200 | |
-------------------------------------------------- | |
Example with clear gaps: | |
Input scores: [0.95 0.92 0.88 0.65 0.62 0.35 0.32 0.31 0.05 0.02] | |
Selected indices: [0 1 2 3] | |
Selected scores: [0.95 0.92 0.88 0.65] | |
Computed threshold: 0.620 | |
Total doc count: 10 | |
""" |