Skip to content

Instantly share code, notes, and snippets.

@PaulCapestany
Created April 8, 2025 02:13
Show Gist options
  • Save PaulCapestany/6d2c6f0d9bb7261ceedb736268ad6377 to your computer and use it in GitHub Desktop.
Save PaulCapestany/6d2c6f0d9bb7261ceedb736268ad6377 to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# Benchmarks an Ollama embedding endpoint: the script sends NUM_REQUESTS
# embedding requests one after another, then the same number concurrently,
# and compares the total wall-clock time of the two runs.
#
# Example usage: test with 50 requests (first sequentially, then in parallel)
# ./test-embed.sh 50
#
# Environment overrides: MODEL, HOST, PORT (defaults below).

# Configuration
MODEL=${MODEL:-"nomic-embed-text"} # Default model
NUM_REQUESTS=${1:-20} # Number of requests
HOST=${HOST:-"localhost"} # Ollama host
PORT=${PORT:-"11434"} # Ollama port

# Reject anything that is not a positive integer before any request is sent;
# otherwise the request loops fail (or silently do nothing) and the reported
# timings are meaningless.
if ! [[ "$NUM_REQUESTS" =~ ^[0-9]+$ ]] || (( 10#$NUM_REQUESTS < 1 )); then
  echo "ERROR: number of requests must be a positive integer, got '$NUM_REQUESTS'" >&2
  echo "Usage: $0 [NUM_REQUESTS]" >&2
  exit 1
fi
# Normalize to plain decimal so a value such as "050" is never read as octal.
NUM_REQUESTS=$(( 10#$NUM_REQUESTS ))

# Create log directory
LOG_DIR="./embed_test_logs"
if ! mkdir -p "$LOG_DIR"; then
  echo "ERROR: could not create log directory '$LOG_DIR'" >&2
  exit 1
fi

# Create two distinct test texts
TEXT1="Longer text (that changes due to including appended test number): "
TEXT2="a" # short text to test if < 4 tokens is still an issue (it appears it might no longer be an issue): https://github.com/ggml-org/llama.cpp/issues/6722#issuecomment-2785033321
# High-precision time measurement function.
# Outputs: the current Unix time on stdout as "<seconds>.<fraction>".
# NOTE: despite the name, the value is in seconds (with a fractional part),
# not milliseconds; calc_duration_ms converts a pair of these to milliseconds.
get_time_ms() {
  local now
  now=$(date +%s.%N)
  # Some date implementations do not expand %N and leave a non-numeric
  # fraction behind (reportedly BSD/macOS date -- TODO confirm). Fall back to
  # whole-second precision so later arithmetic still receives a valid number.
  if [[ "$now" != *.[0-9]* ]]; then
    now="${now%%.*}.000000000"
  fi
  printf '%s\n' "$now"
}
# Calculate duration in milliseconds.
# Arguments: $1 - start timestamp, $2 - end timestamp, both as non-negative
#            "<seconds>[.<fraction>]" values (the format get_time_ms prints).
# Outputs:   the elapsed time in whole milliseconds (truncated) on stdout.
#
# The computation is done in integer nanoseconds with shell arithmetic so a
# sub-millisecond duration prints "0" instead of an empty string, and no
# external calculator is needed.
calc_duration_ms() {
  local start=$1
  local end=$2
  local stamp secs frac
  local -a nanos=()
  for stamp in "$start" "$end"; do
    secs=${stamp%%.*}
    if [[ "$stamp" == *.* ]]; then
      frac=${stamp#*.}
    else
      frac=""
    fi
    # Keep digits only so a malformed timestamp cannot abort the arithmetic.
    secs=${secs//[!0-9]/}
    frac=${frac//[!0-9]/}
    # Right-pad (or truncate) the fraction to exactly nine digits = nanoseconds.
    frac="${frac}000000000"
    frac=${frac:0:9}
    # 10# forces base 10; leading zeros would otherwise be parsed as octal.
    nanos+=( $(( 10#${secs:-0} * 1000000000 + 10#$frac )) )
  done
  echo $(( (nanos[1] - nanos[0]) / 1000000 ))
}
# Print the run header: title, model under test, request count and start time.
# A single printf emits one line per argument.
printf '%s\n' \
  "===== Ollama Embedding Parallelism Test (Multiple Inputs) =====" \
  "Model: $MODEL" \
  "Requests: $NUM_REQUESTS (each with 2 texts)" \
  "Timestamp: $(date)" \
  "============================================================="
# Run sequential requests as baseline: each request must finish before the
# next one starts. Every response body is saved to $LOG_DIR/seq_multi_<i>.json.
echo ""
echo "Running $NUM_REQUESTS sequential embedding requests (multiple inputs)..."
SEQ_FAILED=0
SEQ_START=$(get_time_ms)
for i in $(seq 1 "$NUM_REQUESTS"); do
  echo "Starting sequential request $i..."
  REQ_START=$(get_time_ms)
  # -sS: hide the progress meter but still print transport errors to stderr.
  # --fail is deliberately not used so that HTTP error bodies are still
  # written to the log file for later inspection.
  if ! curl -sS -X POST "http://$HOST:$PORT/api/embed" \
    -H "Content-Type: application/json" \
    -d "{\"model\": \"$MODEL\", \"input\": [\"$TEXT1 $i\", \"$TEXT2\"]}" \
    > "$LOG_DIR/seq_multi_$i.json"; then
    SEQ_FAILED=$((SEQ_FAILED + 1))
    echo "WARNING: sequential request $i failed (curl could not complete the request)." >&2
  fi
  REQ_END=$(get_time_ms)
  DURATION_MS=$(calc_duration_ms "$REQ_START" "$REQ_END")
  echo "Request $i completed in ${DURATION_MS}ms"
done
SEQ_END=$(get_time_ms)
SEQ_TOTAL_MS=$(calc_duration_ms "$SEQ_START" "$SEQ_END")
echo "Sequential test completed in ${SEQ_TOTAL_MS}ms"
# A run with failed requests finishes quickly for the wrong reason, so make
# it obvious that the timing above should not be trusted.
if [ "$SEQ_FAILED" -gt 0 ]; then
  echo "WARNING: $SEQ_FAILED of $NUM_REQUESTS sequential requests failed; the timing above is not meaningful." >&2
fi
# Run parallel requests: all requests are launched as background jobs and the
# total time is measured from the first launch until the last one finishes.
# Every response body is saved to $LOG_DIR/par_multi_<i>.json.
echo ""
echo "Running $NUM_REQUESTS parallel embedding requests (multiple inputs)..."
PAR_PIDS=()
PAR_FAILED=0
PAR_START=$(get_time_ms)
# Start all requests in parallel
for i in $(seq 1 "$NUM_REQUESTS"); do
  echo "Starting parallel request $i..."
  # -sS: hide the progress meter but still print transport errors to stderr.
  curl -sS -X POST "http://$HOST:$PORT/api/embed" \
    -H "Content-Type: application/json" \
    -d "{\"model\": \"$MODEL\", \"input\": [\"$TEXT1 $i\", \"$TEXT2\"]}" \
    > "$LOG_DIR/par_multi_$i.json" &
  PAR_PIDS+=("$!")
done
# Wait for all background processes to complete. Waiting on each PID (instead
# of a bare `wait`) exposes every curl's exit status so failures are counted.
for pid in "${PAR_PIDS[@]}"; do
  wait "$pid" || PAR_FAILED=$((PAR_FAILED + 1))
done
PAR_END=$(get_time_ms)
PAR_TOTAL_MS=$(calc_duration_ms "$PAR_START" "$PAR_END")
echo "Parallel test completed in ${PAR_TOTAL_MS}ms"
if [ "$PAR_FAILED" -gt 0 ]; then
  echo "WARNING: $PAR_FAILED of $NUM_REQUESTS parallel requests failed; the timing above is not meaningful." >&2
fi
# Calculate speedup
# Prints sequential/parallel as a factor truncated to two decimals, or "N/A"
# when it cannot be computed.
# Arguments: $1 - sequential total in ms, $2 - parallel total in ms.
compute_speedup() {
  local seq_ms=$1
  local par_ms=$2
  local int_re='^[0-9]+$'
  local hundredths
  # Guard against empty/non-numeric totals and division by zero.
  if ! [[ "$seq_ms" =~ $int_re && "$par_ms" =~ $int_re ]] || (( 10#$par_ms == 0 )); then
    echo "N/A"
    return 0
  fi
  # Work in hundredths with integer arithmetic; printf then restores the
  # decimal point and always emits a leading zero (e.g. "0.50").
  hundredths=$(( 10#$seq_ms * 100 / 10#$par_ms ))
  printf '%d.%02d\n' $(( hundredths / 100 )) $(( hundredths % 100 ))
}
SPEEDUP=$(compute_speedup "$SEQ_TOTAL_MS" "$PAR_TOTAL_MS")
# Report results
# Prints the one-line verdict for a speedup factor.
# Arguments: $1 - speedup factor (e.g. "3.25" or ".50"), or a non-numeric
#            placeholder such as "N/A" when no factor could be computed.
speedup_verdict() {
  local speedup=$1
  local numeric_re='^([0-9]+\.?[0-9]*|\.[0-9]+)$'
  # Without a numeric factor there is nothing to compare; say so explicitly
  # rather than falling through to the FAILED branch.
  if ! [[ "$speedup" =~ $numeric_re ]]; then
    echo "UNKNOWN: Speedup could not be computed, so parallelism cannot be assessed."
    return 0
  fi
  # bc prints 1 for a true comparison and 0 for a false one.
  if (( $(echo "$speedup > 1.2" | bc -l 2>/dev/null) )); then
    echo "SUCCESS: Parallelism is working! Significant speedup detected."
  elif (( $(echo "$speedup >= 0.9" | bc -l 2>/dev/null) )); then
    echo "PARTIAL: Limited parallelism detected. Some speedup but not optimal."
  else
    echo "FAILED: No parallelism detected. Parallel execution was slower than sequential."
  fi
}
echo ""
echo "===== RESULTS SUMMARY ====="
echo "Sequential execution time: ${SEQ_TOTAL_MS}ms"
echo "Parallel execution time: ${PAR_TOTAL_MS}ms"
echo "Speedup: ${SPEEDUP}x"
echo "============================="
speedup_verdict "$SPEEDUP"
echo ""
echo "Log files saved to: $LOG_DIR"
# Validate results by checking response format
echo ""
echo "Validating responses..."

# Prints how many embedding vectors the response in file $1 contains.
# A response with N vectors has the shape "embeddings":[[...],[...],...], so
# the opening "[[" occurs only once regardless of N; the vectors are counted
# via the "],[" separators between them instead.
# Assumes the embeddings array is the only nested array in the response.
count_embedding_vectors() {
  local file=$1
  local compact separators
  # Strip all whitespace so pretty-printed and compact JSON look the same.
  compact=$(tr -d '[:space:]' < "$file")
  if [[ "$compact" != *'"embeddings":[['* ]]; then
    echo 0
    return 0
  fi
  separators=$(grep -o '\],\[' <<< "$compact" | wc -l)
  echo $(( separators + 1 ))
}

# Use the first request's response as the sample, preferring the parallel run.
SAMPLE_FILE=""
for candidate in "$LOG_DIR/par_multi_1.json" "$LOG_DIR/seq_multi_1.json"; do
  if [ -f "$candidate" ]; then
    SAMPLE_FILE=$candidate
    break
  fi
done
if [ -n "$SAMPLE_FILE" ]; then
  # Check if we have embeddings array in the response
  EMBEDDING_COUNT=$(grep -c "\"embeddings\":" "$SAMPLE_FILE")
  if [ "$EMBEDDING_COUNT" -gt 0 ]; then
    echo "Response format looks good, found embeddings array."
    # Check if we have 2 embeddings per request
    ARRAYS=$(count_embedding_vectors "$SAMPLE_FILE")
    if [ "$ARRAYS" -ge 2 ]; then
      echo "SUCCESS: Found multiple embeddings in the response (one for each input text)."
    else
      echo "WARNING: Expected multiple embeddings arrays but found fewer than expected."
    fi
  else
    echo "ERROR: Response format issue - no embeddings found in response."
  fi
else
  echo "WARNING: Could not find any response files to validate."
fi
# Check if any responses contain errors.
# Every *.json file in the log directory is scanned once. A file counts as
# bad when it mentions "error" or when it is empty (nothing was written to
# it, which the plain text search would otherwise report as error-free).
ERROR_FILES=()
EMPTY_FILES=()
for file in "$LOG_DIR"/*.json; do
  # An unmatched glob stays literal; skip it instead of treating it as a file.
  [ -f "$file" ] || continue
  if [ ! -s "$file" ]; then
    EMPTY_FILES+=("$file")
  elif grep -q "error" "$file"; then
    ERROR_FILES+=("$file")
  fi
done
if [ "${#ERROR_FILES[@]}" -gt 0 ] || [ "${#EMPTY_FILES[@]}" -gt 0 ]; then
  echo "WARNING: Some responses contain errors! Check log files."
  for file in "${ERROR_FILES[@]}"; do
    echo "Error in $file:"
    grep -A 2 "error" "$file"
  done
  for file in "${EMPTY_FILES[@]}"; do
    echo "Empty response in $file (the request probably did not complete)."
  done
else
  echo "No errors found in responses."
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment